import pandas as pd
import numpy as np
from smolagents import tool, CodeAgent, TransformersModel

@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the input DataFrame by stripping whitespace from column names and dropping rows that are completely empty.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
         A cleaned DataFrame with stripped column names and without completely empty rows.
    """
    df.columns = df.columns.str.strip()
    df = df.dropna(how="all")
    return df

@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extracts features from the input DataFrame.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        The DataFrame updated with new dynamically engineered features.
    """
    # Numeric columns: log transformation
    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    for col in numeric_cols:
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log(df[col] + 1)

    # Date-like columns extraction
    for col in df.columns:
        if "date" in col.lower() or "time" in col.lower():
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
                df[f"{col}_year"] = df[col].dt.year
                df[f"{col}_month"] = df[col].dt.month
                df[f"{col}_day"] = df[col].dt.day
            except (ValueError, TypeError):
                pass

    # Non-numeric processing: encode as categorical numeric codes.
    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
    valid_cat = []
    for col in non_numeric:
        # Keep only columns that cannot be parsed as dates; date-like object
        # columns are excluded from categorical encoding.
        try:
            pd.to_datetime(df[col], errors='raise')
        except (ValueError, TypeError):
            valid_cat.append(col)
    for col in valid_cat:
        df[f"{col}_cat"] = df[col].astype("category").cat.codes

    return df

@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Saves the input DataFrame to a CSV file and returns the file path.

    Args:
        df: The DataFrame to save.
        filename: The name of the output CSV file (default is "output.csv").

    Returns:
        The file path of the saved CSV.
    """
    df.to_csv(filename, index=False)
    return filename

class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.
    """
    def __init__(self, df: pd.DataFrame):
        self.df = df
        # Wrap the local SmolLM2 checkpoint in smolagents' TransformersModel so
        # that CodeAgent receives a chat-capable model object rather than a raw
        # transformers model.
        self.model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct")
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"],
        )

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        # Deterministic pipeline: call the registered tools directly on the
        # stored DataFrame rather than routing the prompt through the LLM loop.
        clean_output = self.tools["clean_data"](df=self.df)
        self.df = clean_output.result if hasattr(clean_output, "result") else clean_output

        features_output = self.tools["extract_features"](df=self.df)
        self.df = features_output.result if hasattr(features_output, "result") else features_output

        if output_csv:
            csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
            print(f"CSV saved at: {csv_output}")

        return self.df
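
# Usage sketch (illustrative only): a minimal example, assuming the module above
# is importable as-is. The toy column names and values below are made up purely
# for demonstration and are not part of the original code.
if __name__ == "__main__":
    raw = pd.DataFrame(
        {
            " name ": ["alice", "bob", None],                    # header whitespace stripped by clean_data
            "signup_date": ["2024-01-05", "2024-02-10", None],   # parsed to datetime; year/month/day extracted
            "amount": [10.0, 25.5, None],                        # non-negative numeric -> log feature added
        }
    )
    agent = DataSmolAgent(raw)
    processed = agent.run("clean the data and engineer features", output_csv=True)
    print(processed.head())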