DeepFundingOracle / Oracle /DataSmolAgent.py
FelixPhilip's picture
updated the smolagents
51b48a8
raw
history blame
3.69 kB
import pandas as pd
import numpy as np
from smolagents import tool, CodeAgent
from transformers import AutoTokenizer, AutoModelForCausalLM
@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the input DataFrame by stripping whitespace from column names and dropping rows that are completely empty.

    The input DataFrame is never modified; a new DataFrame is returned.
    Column labels that are not strings (for example integer labels) are kept as they are.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        A cleaned DataFrame with stripped column names and without completely empty rows.
    """
    # dropna() returns a new frame, so the rename below cannot leak back into
    # the caller's object (assigning to df.columns directly would).
    without_empty_rows = df.dropna(how="all")
    # Only string labels can carry stray whitespace; the .str accessor would
    # raise on any other label type, so strip label by label instead.
    return without_empty_rows.rename(
        columns=lambda label: label.strip() if isinstance(label, str) else label
    )
@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extracts features from the input DataFrame.

    Three kinds of columns are appended to a copy of the input:
    "log_<col>" holds log(1 + x) for every numeric column without negative values;
    "<col>_year", "<col>_month" and "<col>_day" are added for non-numeric columns whose
    name contains "date" or "time" and whose values parse as dates (the column itself is
    converted to datetime); "<col>_cat" holds integer category codes for text columns
    that cannot be parsed as dates (missing values get the code -1).

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        The DataFrame updated with new dynamically engineered features.
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    features = df.copy()

    # Numeric columns: log transformation
    for col in features.select_dtypes(include=[np.number]).columns.to_list():
        values = features[col]
        # NaN compares False against 0, so missing values do not veto the
        # transform; they simply propagate as NaN into the new column.
        if not (values < 0).any():
            # log1p is the numerically accurate form of log(x + 1).
            features[f"log_{col}"] = np.log1p(values)

    # Date-like columns extraction
    for col in list(features.columns):
        # Non-string labels have no .lower() and cannot match the name heuristic.
        if not isinstance(col, str):
            continue
        lowered = col.lower()
        if "date" not in lowered and "time" not in lowered:
            continue
        column = features[col]
        # Numeric columns such as "runtime" (and the "log_..." columns created
        # above) only match by name; to_datetime would reinterpret their numbers
        # as epoch offsets and destroy the data.
        if pd.api.types.is_numeric_dtype(column.dtype):
            continue
        try:
            parsed = pd.to_datetime(column, errors="coerce")
        except (ValueError, TypeError):
            continue
        # Keep the original column when nothing parsed (e.g. a "sentiment"
        # column, whose name merely contains "time") or when pandas could not
        # produce a datetime dtype at all.
        if not pd.api.types.is_datetime64_any_dtype(parsed.dtype):
            continue
        if not parsed.notna().any():
            continue
        features[col] = parsed
        features[f"{col}_year"] = parsed.dt.year
        features[f"{col}_month"] = parsed.dt.month
        features[f"{col}_day"] = parsed.dt.day

    # Non-numeric processing: encode as categorical numeric codes.
    for col in list(features.columns):
        column = features[col]
        is_text = pd.api.types.is_object_dtype(column.dtype) or pd.api.types.is_string_dtype(column.dtype)
        if not is_text:
            continue
        try:
            # Columns that parse cleanly as dates are not treated as categories.
            pd.to_datetime(column, errors="raise")
        except (ValueError, TypeError):
            features[f"{col}_cat"] = column.astype("category").cat.codes

    return features
@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Writes the given DataFrame to disk as a CSV file and hands back the destination path.

    Args:
        df: The DataFrame to save.
        filename: Destination path of the CSV file; "output.csv" when omitted.

    Returns:
        The same path that was passed as filename, once the file has been written.
    """
    destination = filename
    # The row index is left out of the file on purpose.
    df.to_csv(path_or_buf=destination, index=False)
    return destination
class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.

    The agent registers the clean_data, extract_features and save_to_csv tools.
    Its run() method applies them in that fixed order directly; it does not ask
    the language model to plan anything.
    """

    # Checkpoint used for both the tokenizer and the model.
    MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

    def __init__(self, df: pd.DataFrame):
        """
        Store the DataFrame, load the tokenizer/model and register the tools.

        Args:
            df: The raw DataFrame that run() will process.
        """
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
        self.model = AutoModelForCausalLM.from_pretrained(self.MODEL_ID)
        # NOTE(review): the raw transformers model is handed to CodeAgent as-is.
        # run() below never invokes it, but confirm whether smolagents expects
        # one of its own model wrappers here before using the base-class API.
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"],
        )

    @staticmethod
    def _unwrap_tool_output(output):
        """Return the payload of a tool call, unwrapping a `.result` holder if there is one."""
        # A DataFrame exposes its columns as attributes, so a column literally
        # named "result" would satisfy hasattr() and the frame would be replaced
        # by that single column. Frames are therefore returned untouched.
        if isinstance(output, pd.DataFrame):
            return output
        return output.result if hasattr(output, "result") else output

    def run(
        self,
        prompt: str,
        output_csv: bool = False,
        csv_filename: str = "processed_output.csv",
    ) -> pd.DataFrame:
        """
        Clean the stored DataFrame, extract features from it and optionally save it.

        Args:
            prompt: Accepted for interface compatibility; this method does not read it.
            output_csv: When True, the processed DataFrame is also written to a CSV file.
            csv_filename: Path of the CSV file written when output_csv is True.

        Returns:
            The processed DataFrame, which is also stored back on self.df.
        """
        # Run the agent with the provided DataFrame
        for tool_name in ("clean_data", "extract_features"):
            tool_output = self.tools[tool_name](df=self.df)
            self.df = self._unwrap_tool_output(tool_output)
        if output_csv:
            csv_output = self.tools["save_to_csv"](df=self.df, filename=csv_filename)
            print(f"CSV saved at: {csv_output}")
        return self.df