import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from smolagents import tool, CodeAgent, TransformersModel
from sklearn.preprocessing import StandardScaler

@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the input DataFrame by stripping whitespace from column names and dropping rows that are completely empty.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        A cleaned DataFrame with stripped column names and without completely empty rows.
    """
    df.columns = df.columns.str.strip()
    df = df.dropna(how="all")
    return df
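
# A minimal usage sketch for clean_data, assuming a toy DataFrame with padded
# column names and one all-empty row; the values below are illustrative only.
def _demo_clean_data() -> pd.DataFrame:
    raw = pd.DataFrame({" stars ": [10, None], " forks ": [3, None]})
    # Columns become "stars"/"forks"; the all-NaN row is dropped.
    return clean_data(df=raw)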

@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extracts features from the input DataFrame.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        The DataFrame updated with new dynamically engineered features.
    """
    # Numeric columns: log transformation for skewed features
    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    for col in numeric_cols:
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log(df[col] + 1)

    # Date-like columns: parse and expand into year/month/day/age features
    for col in df.columns:
        if "date" in col.lower() or "time" in col.lower() or col == "activity":
            try:
                df[col] = pd.to_datetime(df[col], errors="coerce")
                if not df[col].isna().all():  # Only create features if we have valid dates
                    df[f"{col}_year"] = df[col].dt.year
                    df[f"{col}_month"] = df[col].dt.month
                    df[f"{col}_day"] = df[col].dt.day
                    # Age in days since the date
                    df[f"{col}_age_days"] = (pd.Timestamp.now() - df[col]).dt.days
            except Exception:
                pass

    # Repository age (days since creation)
    if "created_at" in df.columns:
        df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
        df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days

    # Recent activity flags (1 if the last activity falls within 30/90 days)
    if "activity" in df.columns and pd.api.types.is_datetime64_any_dtype(df["activity"]):
        now = pd.Timestamp.now()
        df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
        df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)

    # Open/closed PR ratio
    if {"open_prs", "closed_prs"}.issubset(df.columns):
        df["pr_ratio"] = df["open_prs"] / (df["closed_prs"] + 1)

    # Issue resolution speed
    if {"issues_closed", "issues_opened"}.issubset(df.columns):
        df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)

    # Feature ratios
    if {"stars", "forks"}.issubset(df.columns):
        df["stars_to_forks_ratio"] = df["stars"] / (df["forks"] + 1)
    if {"open_issues", "closed_issues"}.issubset(df.columns):
        df["issues_ratio"] = df["closed_issues"] / (df["open_issues"] + df["closed_issues"] + 1)

    # Non-numeric processing: encode categorical features, skipping object
    # columns that parse cleanly as dates
    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
    valid_cat = []
    for col in non_numeric:
        try:
            pd.to_datetime(df[col], errors="raise")
        except Exception:
            valid_cat.append(col)
    for col in valid_cat:
        if col not in ["repo", "parent"]:  # Skip identifier columns
            df[f"{col}_cat"] = df[col].astype("category").cat.codes

    # Standardize selected count features (a fresh fit per column)
    scaler = StandardScaler()
    scaled_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "contributors"]
    for col in scaled_cols:
        if col in df.columns:
            df[f"scaled_{col}"] = scaler.fit_transform(df[[col]]).ravel()
    return df
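
# A hedged sketch of extract_features on toy repository data; the column names
# (repo, stars, forks, created_at) follow fields this module already expects,
# but the rows themselves are invented for illustration.
def _demo_extract_features() -> pd.DataFrame:
    toy = pd.DataFrame(
        {
            "repo": ["a/x", "b/y"],
            "stars": [120, 40],
            "forks": [10, 4],
            "created_at": ["2021-01-01", "2023-06-15"],
        }
    )
    out = extract_features(df=toy)
    # Expect log_stars/log_forks, repo_age_days, stars_to_forks_ratio, and
    # scaled_stars/scaled_forks among the new columns.
    return out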

@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Saves the input DataFrame to a CSV file and returns the file path.

    Args:
        df: The DataFrame to save.
        filename: The name of the output CSV file (default is "output.csv").

    Returns:
        The file path of the saved CSV.
    """
    df.to_csv(filename, index=False)
    return filename
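
# Usage sketch: persist a processed frame to disk. The name "processed_df" and
# the filename below are arbitrary examples, not values the pipeline requires.
#   processed_path = save_to_csv(df=processed_df, filename="repos_processed.csv")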

@tool
def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
    """
    Predicts funding for child repositories based on the parent-child relationship.

    Args:
        df: The input DataFrame containing 'repo', 'parent', and other features.

    Returns:
        A DataFrame with an updated 'final_weight' column for child repositories.
    """
    # Ensure required columns exist
    if not {"repo", "parent", "final_weight"}.issubset(df.columns):
        raise ValueError("Input DataFrame must contain 'repo', 'parent', and 'final_weight' columns.")
    # Normalize funding weights for child repositories grouped by parent;
    # if a parent group's weights sum to zero, split the weight evenly.
    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
    )
    return df
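
# A minimal sketch of predict_funding's per-parent normalization, using
# invented repos and weights: each parent's children end up summing to 1.
def _demo_predict_funding() -> pd.DataFrame:
    toy = pd.DataFrame(
        {
            "repo": ["a/x", "a/y", "b/z"],
            "parent": ["a", "a", "b"],
            "final_weight": [3.0, 1.0, 2.0],
        }
    )
    return predict_funding(df=toy)  # weights become 0.75, 0.25, 1.0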

@tool
def analyze_feature_importance(feature_importances: list, feature_cols: list) -> list:
    """
    Visualizes feature importance and identifies irrelevant features.

    Args:
        feature_importances: Importance scores, aligned index-for-index with feature_cols.
        feature_cols: List of feature column names.

    Returns:
        The list of features whose importance falls below the drop threshold.
    """
    importance_df = pd.DataFrame(
        {"Feature": feature_cols, "Importance": feature_importances}
    ).sort_values(by="Importance", ascending=False)
    print("[INFO] Feature importances:")
    print(importance_df)
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.title("Feature Importance")
    plt.gca().invert_yaxis()
    plt.show()
    # Flag irrelevant features (importance < threshold) for the caller to drop
    threshold = 0.01
    irrelevant_features = importance_df[importance_df["Importance"] < threshold]["Feature"].tolist()
    print(f"[INFO] Irrelevant features (importance < {threshold}): {irrelevant_features}")
    return irrelevant_features
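
# Usage sketch with invented importance scores (e.g. a fitted tree model's
# feature_importances_); here "noise" falls below the 0.01 threshold.
def _demo_analyze_feature_importance() -> list:
    cols = ["stars", "forks", "noise"]
    scores = [0.7, 0.295, 0.005]
    return analyze_feature_importance(feature_importances=scores, feature_cols=cols)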

class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        # CodeAgent expects a smolagents model wrapper rather than a raw
        # transformers model, so the checkpoint is loaded via TransformersModel.
        self.model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct")
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
                predict_funding,
                analyze_feature_importance,
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"],
        )

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        # Run a fixed clean -> featurize -> fund pipeline through the registered
        # tools; the prompt is currently unused. Some tool wrappers hand back a
        # result object rather than the value itself, hence the hasattr checks.
        clean_output = self.tools["clean_data"](df=self.df)
        self.df = clean_output.result if hasattr(clean_output, "result") else clean_output
        features_output = self.tools["extract_features"](df=self.df)
        self.df = features_output.result if hasattr(features_output, "result") else features_output
        funding_output = self.tools["predict_funding"](df=self.df)
        self.df = funding_output.result if hasattr(funding_output, "result") else funding_output
        if output_csv:
            csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
            print(f"CSV saved at: {csv_output}")
        return self.df
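
# End-to-end usage sketch, assuming a small hand-made repository frame; the
# column values are illustrative and the prompt string is arbitrary. Note that
# constructing the agent downloads the SmolLM2-1.7B-Instruct checkpoint.
if __name__ == "__main__":
    sample = pd.DataFrame(
        {
            "repo": ["a/x", "a/y"],
            "parent": ["a", "a"],
            "stars": [120, 40],
            "forks": [10, 4],
            "final_weight": [3.0, 1.0],
        }
    )
    agent = DataSmolAgent(sample)
    processed = agent.run("clean and featurize the repo data", output_csv=True)
    print(processed.head())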