# DeepFundingOracle/Oracle/DataSmolAgent.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from smolagents import tool, CodeAgent, TransformersModel
from sklearn.preprocessing import StandardScaler
@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
"""
Cleans the input DataFrame by stripping whitespace from column names and dropping rows that are completely empty.
Args:
df: The input DataFrame containing the raw data.
Returns:
A cleaned DataFrame with stripped column names and without completely empty rows.
"""
df.columns = df.columns.str.strip()
df = df.dropna(how="all")
return df
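# Usage sketch (hypothetical data, kept in comments so nothing runs at import
# time): padded column names are stripped and the all-NaN row is dropped.
#
#   raw = pd.DataFrame({" stars ": [10, None], " forks ": [2, None]})
#   cleaned = clean_data(df=raw)
#   # cleaned.columns -> Index(['stars', 'forks']); one row remains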
@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Dynamically extracts features from the input DataFrame.
Args:
df: The input DataFrame containing the raw data.
Returns:
The DataFrame updated with new dynamically engineered features.
"""
# Numeric columns: log transformation for skewed features
numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
for col in numeric_cols:
        # Only transform columns that are entirely non-negative (a NaN fails
        # the check, so columns with missing values are skipped).
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log1p(df[col])
# Date-like columns extraction
for col in df.columns:
if "date" in col.lower() or "time" in col.lower() or col == "activity":
try:
df[col] = pd.to_datetime(df[col], errors='coerce')
if not df[col].isna().all(): # Only create features if we have valid dates
df[f"{col}_year"] = df[col].dt.year
df[f"{col}_month"] = df[col].dt.month
df[f"{col}_day"] = df[col].dt.day
# Calculate age (days since date)
df[f"{col}_age_days"] = (pd.Timestamp.now() - df[col]).dt.days
            except Exception:
                # Column did not parse as dates; leave it unchanged.
                pass
# Repository age (days since creation)
if "created_at" in df.columns:
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
    # Recent activity flags (1 if last activity was within 30/90 days, else 0)
if "activity" in df.columns and pd.api.types.is_datetime64_any_dtype(df["activity"]):
now = pd.Timestamp.now()
df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)
    # Open-to-closed PR ratio (+1 in the denominator avoids division by zero)
if {"open_prs", "closed_prs"}.issubset(df.columns):
df["pr_ratio"] = df["open_prs"] / (df["closed_prs"] + 1)
    # Issue resolution rate: closed issues per opened issue (smoothed by +1)
if {"issues_closed", "issues_opened"}.issubset(df.columns):
df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)
# Feature ratios
if {"stars", "forks"}.issubset(df.columns):
df["stars_to_forks_ratio"] = df["stars"] / (df["forks"] + 1)
if {"open_issues", "closed_issues"}.issubset(df.columns):
df["issues_ratio"] = df["closed_issues"] / (df["open_issues"] + df["closed_issues"] + 1)
    # Non-numeric processing: encode categorical features. Object columns that
    # parse cleanly as datetimes are treated as dates, not categories.
non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
valid_cat = []
for col in non_numeric:
try:
pd.to_datetime(df[col], errors='raise')
except Exception:
valid_cat.append(col)
for col in valid_cat:
if col not in ["repo", "parent"]: # Skip identifier columns
df[f"{col}_cat"] = df[col].astype("category").cat.codes
    # Standardize selected count features (zero mean, unit variance); the
    # scaler is refit independently for each column.
    scaler = StandardScaler()
    scaled_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "contributors"]
    for col in scaled_cols:
        if col in df.columns:
            # .ravel() flattens the (n, 1) fit_transform output to a 1-D column
            df[f"scaled_{col}"] = scaler.fit_transform(df[[col]]).ravel()
return df
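# Usage sketch (made-up repo metrics; the column names match what the tool
# looks for, but the values are illustrative):
#
#   repos = pd.DataFrame({"stars": [120, 15], "forks": [30, 2],
#                         "created_at": ["2020-01-01", "2023-06-15"]})
#   enriched = extract_features(df=repos)
#   # adds: log_stars, log_forks, stars_to_forks_ratio, repo_age_days,
#   #       scaled_stars, scaled_forks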
@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
"""
Saves the input DataFrame to a CSV file and returns the file path.
Args:
df: The DataFrame to save.
filename: The name of the output CSV file (default is "output.csv").
Returns:
The file path of the saved CSV.
"""
df.to_csv(filename, index=False)
return filename
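# Usage sketch: save_to_csv(df=enriched, filename="repos.csv") writes the
# frame without its index and returns "repos.csv" (filename is illustrative).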
@tool
def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
"""
    Predicts funding shares for child repositories by normalizing 'final_weight' within each parent group.
    Args:
        df: The input DataFrame containing 'repo', 'parent', and 'final_weight' columns.
    Returns:
        A DataFrame whose 'final_weight' values sum to 1 within each parent group.
"""
# Ensure required columns exist
if not {"repo", "parent", "final_weight"}.issubset(df.columns):
raise ValueError("Input DataFrame must contain 'repo', 'parent', and 'final_weight' columns.")
    # Normalize weights within each parent group; fall back to a uniform split when a group's weights sum to 0.
df["final_weight"] = df.groupby("parent")["final_weight"].transform(
lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
)
return df
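# Worked example (hypothetical weights): two children of parent "p" with
# weights 2 and 3 are normalized to 0.4 and 0.6; a group whose weights sum
# to 0 falls back to a uniform 1/len(group) share.
#
#   funding = pd.DataFrame({"repo": ["a", "b"], "parent": ["p", "p"],
#                           "final_weight": [2.0, 3.0]})
#   predict_funding(df=funding)["final_weight"].tolist()  # [0.4, 0.6]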
@tool
def analyze_feature_importance(feature_importances: list, feature_cols: list) -> list:
    """
    Visualizes feature importance and identifies irrelevant features.
    Args:
        feature_importances: Importance scores, ordered to match feature_cols.
        feature_cols: List of feature column names.
    Returns:
        A list of feature names whose importance falls below the 0.01 threshold.
    """
    importance_df = pd.DataFrame(
        {"Feature": feature_cols, "Importance": feature_importances}
    ).sort_values(by="Importance", ascending=False)
print("[INFO] Feature importances:")
print(importance_df)
# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Feature Importance")
plt.gca().invert_yaxis()
plt.show()
# Drop irrelevant features (importance < threshold)
threshold = 0.01
irrelevant_features = importance_df[importance_df["Importance"] < threshold]["Feature"].tolist()
print(f"[INFO] Irrelevant features (importance < {threshold}): {irrelevant_features}")
return irrelevant_features
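# Usage sketch (made-up scores): with importances [0.7, 0.295, 0.005] for
# ["stars", "forks", "noise"], the bar chart is rendered and ["noise"] is
# returned, since 0.005 is below the 0.01 threshold.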
class DataSmolAgent(CodeAgent):
"""
A data processing agent that cleans and extracts features from the provided DataFrame.
"""
    def __init__(self, df: pd.DataFrame):
        self.df = df
        # CodeAgent expects a smolagents model wrapper rather than a raw
        # transformers model, so wrap SmolLM2 in TransformersModel.
        self.model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct")
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
                predict_funding,
                analyze_feature_importance,
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"],
        )
    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        # Deterministic pipeline: clean -> extract features -> normalize funding.
        # The prompt is accepted for interface compatibility but is not used here.
        clean_output = self.tools["clean_data"](df=self.df)
self.df = clean_output.result if hasattr(clean_output, "result") else clean_output
features_output = self.tools["extract_features"](df=self.df)
self.df = features_output.result if hasattr(features_output, "result") else features_output
funding_output = self.tools["predict_funding"](df=self.df)
self.df = funding_output.result if hasattr(funding_output, "result") else funding_output
if output_csv:
csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
print(f"CSV saved at: {csv_output}")
return self.df
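

if __name__ == "__main__":
    # Minimal smoke test with made-up data; a real run feeds in scraped
    # repository metrics. Column names mirror those the tools look for,
    # but every value here is hypothetical.
    sample = pd.DataFrame({
        "repo": ["a", "b"],
        "parent": ["p", "p"],
        "stars": [120, 15],
        "forks": [30, 2],
        "final_weight": [2.0, 3.0],
    })
    agent = DataSmolAgent(sample)  # downloads SmolLM2 on first use
    processed = agent.run(prompt="clean and featurize", output_csv=True)
    print(processed.head())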