import pandas as pd
import numpy as np
from smolagents import HfApiModel, tool, CodeAgent


@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the DataFrame by stripping whitespace from column names and dropping
    rows that are completely empty.

    Args:
        df (pd.DataFrame): The input DataFrame containing the raw data.

    Returns:
        pd.DataFrame: A cleaned DataFrame with stripped column names and without
            completely empty rows.
    """
    df.columns = df.columns.str.strip()
    df = df.dropna(how="all")
    return df


@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extract features from the DataFrame.

    For numeric columns:
        - If all values are non-negative, a log-transformed version is created.
    For columns that appear to be dates:
        - Year, month, and day are extracted.
    For non-numeric, non-date columns:
        - They are encoded as categorical numeric codes.

    Args:
        df (pd.DataFrame): The input DataFrame containing the raw data.

    Returns:
        pd.DataFrame: The DataFrame updated with new dynamically engineered features.
    """
    # Numeric columns: log transformation for columns whose values are all non-negative.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    for col in numeric_cols:
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log(df[col] + 1)

    # Date-like columns: parse and extract year/month/day components.
    for col in df.columns:
        if "date" in col.lower() or "time" in col.lower():
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
                df[f"{col}_year"] = df[col].dt.year
                df[f"{col}_month"] = df[col].dt.month
                df[f"{col}_day"] = df[col].dt.day
            except Exception:
                pass

    # Non-numeric processing: encode remaining non-date object columns as categorical numeric codes.
    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
    valid_cat = []
    for col in non_numeric:
        try:
            pd.to_datetime(df[col], errors='raise')
        except Exception:
            valid_cat.append(col)
    for col in valid_cat:
        df[f"{col}_cat"] = df[col].astype("category").cat.codes
    return df


@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Save the DataFrame to a CSV file and return the file path.

    Args:
        df (pd.DataFrame): The DataFrame to save.
        filename (str): The name of the output CSV file.

    Returns:
        str: The file path of the saved CSV.
    """
    df.to_csv(filename, index=False)
    return filename


class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        # CodeAgent expects a smolagents model wrapper rather than a raw
        # transformers model, so the LLM is provided through HfApiModel.
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
            ],
            model=HfApiModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct"),
            additional_authorized_imports=["pandas", "numpy"],
        )

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        # Run the cleaning and feature-extraction tools directly on the stored DataFrame.
        # The prompt argument is currently not used; the tool pipeline below is fixed.
        clean_output = self.tools["clean_data"](df=self.df)
        self.df = clean_output.result if hasattr(clean_output, "result") else clean_output

        features_output = self.tools["extract_features"](df=self.df)
        self.df = features_output.result if hasattr(features_output, "result") else features_output

        if output_csv:
            csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
            print(f"CSV saved at: {csv_output}")

        return self.df
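

# Minimal usage sketch (not part of the original agent): the DataFrame below is
# illustrative only -- its column names and values are assumptions chosen so that
# each tool has something to do (whitespace in a column name, a date-like column,
# and a text column that becomes categorical codes).
if __name__ == "__main__":
    sample = pd.DataFrame(
        {
            " amount ": [10.0, 25.5, 40.0],          # clean_data strips the surrounding spaces
            "order_date": ["2024-01-05", "2024-02-10", "2024-03-15"],
            "region": ["north", "south", "north"],
        }
    )
    # extract_features should add log_amount, order_date_year/month/day, and region_cat.
    agent = DataSmolAgent(sample)
    processed = agent.run(prompt="Clean the data and engineer features", output_csv=False)
    # Passing output_csv=True would also write the result to processed_output.csv.
    print(processed.head())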