import pandas as pd
import numpy as np
from smolagents import CodeAgent, TransformersModel, tool

@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the DataFrame by stripping whitespace from column names and dropping rows that are completely empty.

    Args:
        df (pd.DataFrame): The input DataFrame containing the raw data.

    Returns:
        pd.DataFrame: A cleaned DataFrame with stripped column names and without completely empty rows.
    """
    df.columns = df.columns.str.strip()
    df = df.dropna(how="all")
    return df
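
# Example effect (hypothetical column names): a frame with columns [" id ", "name "] and one
# fully empty row comes back with columns ["id", "name"] and the empty row dropped.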

@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extract features from the DataFrame.
    
    For numeric columns:
      - If all values are non-negative, a log-transformed version is created.
      
    For columns that appear to be dates:
      - Year, month, and day are extracted.
      
    For non-numeric, non-date columns:
      - They are encoded as categorical numeric codes.
    
    Args:
        df (pd.DataFrame): The input DataFrame containing the raw data.
        
    Returns:
        pd.DataFrame: The DataFrame updated with new dynamically engineered features.
    """
    # Numeric columns: log transformation
    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    for col in numeric_cols:
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log(df[col] + 1)

    # Date-like columns extraction
    for col in df.columns:
        if "date" in col.lower() or "time" in col.lower():
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
                df[f"{col}_year"] = df[col].dt.year
                df[f"{col}_month"] = df[col].dt.month
                df[f"{col}_day"] = df[col].dt.day
            except Exception:
                pass

    # Non-numeric processing: encode as categorical numeric codes.
    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
    valid_cat = []
    for col in non_numeric:
        try:
            pd.to_datetime(df[col], errors='raise')
        except Exception:
            valid_cat.append(col)
    for col in valid_cat:
        df[f"{col}_cat"] = df[col].astype("category").cat.codes             
    
    return df   
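
# Illustrative mapping of what extract_features adds (column names are hypothetical):
#   amount       (non-negative numeric)         -> log_amount
#   signup_date  ("date" in the column name)    -> signup_date_year / _month / _day
#   city         (object dtype, not date-like)  -> city_cat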

@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Save the DataFrame to a CSV file and return the file path.

    Args:
        df (pd.DataFrame): The DataFrame to save.
        filename (str): The name of the output CSV file.

    Returns:
        str: The file path of the saved CSV.
    """
    df.to_csv(filename, index=False)
    return filename

class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.
    """
    def __init__(self, df: pd.DataFrame):
        self.df = df
        # CodeAgent expects a chat-capable model wrapper; TransformersModel runs the
        # SmolLM2 checkpoint locally and manages the tokenizer internally.
        self.model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct")
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"],
        )

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        # Deterministic pipeline: the registered tools are called directly instead of
        # letting the LLM plan the steps, so results are reproducible. The prompt
        # argument is accepted for interface compatibility but is not used here.
        self.df = self.tools["clean_data"](df=self.df)
        self.df = self.tools["extract_features"](df=self.df)

        if output_csv:
            csv_path = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
            print(f"CSV saved at: {csv_path}")

        return self.df
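

if __name__ == "__main__":
    # Minimal usage sketch. The sample frame and its column names are invented for
    # illustration, and constructing DataSmolAgent loads the SmolLM2-1.7B-Instruct
    # checkpoint, so treat this as a smoke test rather than a definitive example.
    raw = pd.DataFrame(
        {
            " amount ": [10.0, 250.5, None],
            "signup_date": ["2024-01-05", "2024-02-10", None],
            "city": ["Berlin", "Paris", None],
        }
    )
    agent = DataSmolAgent(raw)
    processed = agent.run(prompt="Clean the data and engineer features.", output_csv=True)
    print(processed.head())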