import numpy as np
import pandas as pd
from smolagents import CodeAgent, tool
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint loaded by DataSmolAgent for both the tokenizer and the model.
_MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"


def _unwrap_tool_output(output):
    """Return the payload of a tool call.

    A DataFrame is returned as-is. The explicit type check matters because
    ``hasattr(df, "result")`` is true for any DataFrame that has a column
    named ``result``, which would otherwise be mistaken for a wrapper object
    and replaced by that single column.
    """
    if isinstance(output, pd.DataFrame):
        return output
    return getattr(output, "result", output)


@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the input DataFrame by stripping whitespace from column names and dropping rows that are completely empty.
    The input DataFrame itself is left unmodified.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        A cleaned DataFrame with stripped column names and without completely empty rows.
    """
    # rename() builds a new frame, so the caller's column labels are not
    # touched. Only string labels are stripped; other labels (ints, tuples,
    # ...) pass through unchanged instead of failing in the `.str` accessor.
    renamed = df.rename(
        columns=lambda label: label.strip() if isinstance(label, str) else label
    )
    return renamed.dropna(how="all")


@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extracts features from the input DataFrame.
    Adds log features for non-negative numeric columns, year, month and day parts for
    date-like columns, and integer category codes for text columns.
    The input DataFrame itself is left unmodified.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        The DataFrame updated with new dynamically engineered features.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    # Snapshot the labels before any derived column exists: a derived name
    # such as "log_time_spent" must not be picked up by the date pass below.
    source_cols = list(df.columns)

    # Numeric columns: log transformation. NaN compares False against 0, so
    # a column containing missing values fails the check and is skipped.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    for col in numeric_cols:
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log1p(df[col])

    # Date-like columns, detected by name. Non-string labels cannot match.
    for col in source_cols:
        if not isinstance(col, str):
            continue
        lowered = col.lower()
        if "date" not in lowered and "time" not in lowered:
            continue
        try:
            parsed = pd.to_datetime(df[col], errors="coerce")
        except (ValueError, TypeError):
            # Best effort: leave a column that cannot be parsed as it is.
            continue
        df[col] = parsed
        df[f"{col}_year"] = parsed.dt.year
        df[f"{col}_month"] = parsed.dt.month
        df[f"{col}_day"] = parsed.dt.day

    # Non-numeric processing: encode as categorical numeric codes. Object
    # columns that parse cleanly as dates are deliberately left unencoded.
    text_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for col in text_cols:
        try:
            pd.to_datetime(df[col], errors="raise")
        except (ValueError, TypeError, OverflowError):
            df[f"{col}_cat"] = df[col].astype("category").cat.codes

    return df


@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Saves the input DataFrame to a CSV file and returns the file path.

    Args:
        df: The DataFrame to save.
        filename: The name of the output CSV file (default is "output.csv").

    Returns:
        The file path of the saved CSV.
    """
    df.to_csv(filename, index=False)
    return filename


class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.
    """

    def __init__(self, df: pd.DataFrame):
        """Store the DataFrame, load the tokenizer and model, and register the tools.

        Args:
            df: The raw DataFrame that ``run`` will process.
        """
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
        self.model = AutoModelForCausalLM.from_pretrained(_MODEL_ID)
        # NOTE(review): a raw transformers model is handed to CodeAgent here.
        # run() below never calls the parent's run(), so the model is not
        # exercised by this class; confirm it is an accepted `model` value
        # before relying on the inherited LLM loop.
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"],
        )

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        """Run the fixed clean -> extract-features pipeline on ``self.df``.

        Args:
            prompt: Accepted for interface compatibility; not used by the pipeline.
            output_csv: When true, also write the result to "processed_output.csv".

        Returns:
            The processed DataFrame, which is also stored back on ``self.df``.
        """
        self.df = _unwrap_tool_output(self.tools["clean_data"](df=self.df))
        self.df = _unwrap_tool_output(self.tools["extract_features"](df=self.df))

        if output_csv:
            csv_output = self.tools["save_to_csv"](
                df=self.df, filename="processed_output.csv"
            )
            print(f"CSV saved at: {csv_output}")

        return self.df