Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
from smolagents import tool, CodeAgent | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the input DataFrame by stripping whitespace from column names and dropping rows that are completely empty.

    The input DataFrame is left untouched; a new DataFrame is returned.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        A cleaned DataFrame with stripped column names and without completely empty rows.
    """
    # rename() builds a new frame, so the caller's column labels are not modified.
    # Only string labels are stripped: Index.str.strip() raises on all-numeric
    # labels and would replace non-string labels with NaN in a mixed Index.
    renamed = df.rename(
        columns=lambda label: label.strip() if isinstance(label, str) else label
    )
    # how="all" removes a row only when every cell in it is missing.
    return renamed.dropna(how="all")
def _is_text_like(values: pd.Series) -> bool:
    """Return True when the column has object or string dtype."""
    return bool(
        pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)
    )


def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extracts features from the input DataFrame.

    Three kinds of features are added, in this order:
    log(1 + x) columns ("log_<col>") for numeric columns without negative values,
    year/month/day columns for columns whose name contains "date" or "time" and
    whose values parse as datetimes, and integer category codes ("<col>_cat") for
    text columns that cannot be parsed as datetimes.

    The input DataFrame is left untouched; a new DataFrame is returned.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        The DataFrame updated with new dynamically engineered features.
    """
    # Work on a copy so the caller's frame never gains columns or changes dtypes.
    features = df.copy()

    # Numeric columns: log transformation.
    for col in features.select_dtypes(include=[np.number]).columns.to_list():
        # Missing values are ignored for the sign check; a single NaN would
        # otherwise make the comparison False and silently skip the column.
        observed = features[col].dropna()
        if (observed >= 0).all():
            features[f"log_{col}"] = np.log1p(features[col])

    # Date-like columns extraction. Iterate over a snapshot because columns are
    # added inside the loop.
    for col in list(features.columns):
        label = str(col).lower()  # column labels are not guaranteed to be strings
        if "date" not in label and "time" not in label:
            continue
        values = features[col]
        if pd.api.types.is_datetime64_any_dtype(values):
            parsed = values
        elif _is_text_like(values):
            try:
                parsed = pd.to_datetime(values, errors="coerce")
            except (ValueError, TypeError, OverflowError):
                continue
            # Keep the original column when nothing parsed: overwriting it
            # would replace real text with an all-NaT column.
            if not pd.api.types.is_datetime64_any_dtype(parsed):
                continue
            if not parsed.notna().any():
                continue
        else:
            # Numeric columns such as "time_spent" are measurements, not dates;
            # to_datetime would reinterpret them as offsets from the epoch.
            continue
        features[col] = parsed
        features[f"{col}_year"] = parsed.dt.year
        features[f"{col}_month"] = parsed.dt.month
        features[f"{col}_day"] = parsed.dt.day

    # Non-numeric processing: encode as categorical numeric codes.
    for col in list(features.columns):
        values = features[col]
        if not _is_text_like(values):
            continue
        try:
            # Columns that parse cleanly as datetimes are not treated as categories.
            pd.to_datetime(values, errors="raise")
        except (ValueError, TypeError, OverflowError):
            # Missing values receive the code -1.
            features[f"{col}_cat"] = values.astype("category").cat.codes
    return features
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Writes the given DataFrame to disk as CSV and hands back the path it was written to.

    Args:
        df: The DataFrame to save.
        filename: The name of the output CSV file (default is "output.csv").

    Returns:
        The file path of the saved CSV.
    """
    destination = filename
    # index=False keeps the row index out of the written file.
    df.to_csv(destination, index=False)
    return destination
class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.

    The agent registers the clean_data, extract_features and save_to_csv tools and
    applies them in a fixed order in run().
    """

    def __init__(self, df: pd.DataFrame):
        """
        Stores the DataFrame, loads the language model and registers the tools.

        Args:
            df: The DataFrame the agent will process.
        """
        model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(model_id)
        # NOTE(review): the raw transformers model is handed to CodeAgent as-is;
        # confirm whether smolagents expects one of its own model wrappers here.
        # NOTE(review): tools are passed as the module-level callables; confirm
        # they are registered as smolagents tools where they are defined.
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"],
        )

    @staticmethod
    def _unwrap_tool_output(output):
        """Return the payload of a tool call, unwrapping a `.result` attribute if present."""
        # A DataFrame exposes its columns as attributes, so a frame with a column
        # named "result" would pass hasattr() and be replaced by that column.
        if isinstance(output, pd.DataFrame):
            return output
        return output.result if hasattr(output, "result") else output

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        """
        Cleans the stored DataFrame, extracts features and optionally saves the result.

        Args:
            prompt: Accepted for interface compatibility; not used by this pipeline.
            output_csv: When True, the processed DataFrame is also written to
                "processed_output.csv" and the saved path is printed.

        Returns:
            The processed DataFrame, which is also stored on self.df.
        """
        # Run the agent with the provided DataFrame
        clean_output = self.tools["clean_data"](df=self.df)
        self.df = self._unwrap_tool_output(clean_output)
        features_output = self.tools["extract_features"](df=self.df)
        self.df = self._unwrap_tool_output(features_output)
        if output_csv:
            csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
            print(f"CSV saved at: {csv_output}")
        return self.df