import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from smolagents import tool, CodeAgent, TransformersModel
from sklearn.preprocessing import StandardScaler


@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the input DataFrame by stripping whitespace from column names and
    dropping rows that are completely empty.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        A cleaned DataFrame with stripped column names and without completely
        empty rows.
    """
    df.columns = df.columns.str.strip()
    df = df.dropna(how="all")
    return df


@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extracts features from the input DataFrame.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        The DataFrame updated with new dynamically engineered features.
    """
    # Numeric columns: log transformation for skewed features
    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    for col in numeric_cols:
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log1p(df[col])  # log(1 + x), safe at zero

    # Date-like columns: extract calendar parts and age in days
    for col in df.columns:
        if "date" in col.lower() or "time" in col.lower() or col == "activity":
            try:
                df[col] = pd.to_datetime(df[col], errors="coerce")
                if not df[col].isna().all():  # Only create features if we have valid dates
                    df[f"{col}_year"] = df[col].dt.year
                    df[f"{col}_month"] = df[col].dt.month
                    df[f"{col}_day"] = df[col].dt.day
                    # Age in days since the date
                    df[f"{col}_age_days"] = (pd.Timestamp.now() - df[col]).dt.days
            except Exception:
                pass

    # Repository age (days since creation)
    if "created_at" in df.columns:
        df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
        df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days

    # Recent activity flags (commits/issues within the last 30/90 days)
    if "activity" in df.columns and pd.api.types.is_datetime64_any_dtype(df["activity"]):
        now = pd.Timestamp.now()
        df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
        df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)

    # Open/closed PR ratio (the +1 avoids division by zero)
    if {"open_prs", "closed_prs"}.issubset(df.columns):
        df["pr_ratio"] = df["open_prs"] / (df["closed_prs"] + 1)

    # Issue resolution speed
    if {"issues_closed", "issues_opened"}.issubset(df.columns):
        df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)

    # Feature ratios
    if {"stars", "forks"}.issubset(df.columns):
        df["stars_to_forks_ratio"] = df["stars"] / (df["forks"] + 1)
    if {"open_issues", "closed_issues"}.issubset(df.columns):
        df["issues_ratio"] = df["closed_issues"] / (df["open_issues"] + df["closed_issues"] + 1)

    # Non-numeric processing: encode categorical features, skipping columns
    # that parse cleanly as dates (those were handled above)
    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
    valid_cat = []
    for col in non_numeric:
        try:
            pd.to_datetime(df[col], errors="raise")
        except Exception:
            valid_cat.append(col)
    for col in valid_cat:
        if col not in ["repo", "parent"]:  # Skip identifier columns
            df[f"{col}_cat"] = df[col].astype("category").cat.codes

    # Standardize selected numeric features (zero mean, unit variance)
    scaler = StandardScaler()
    scaled_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "contributors"]
    for col in scaled_cols:
        if col in df.columns:
            df[f"scaled_{col}"] = scaler.fit_transform(df[[col]])

    return df


@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Saves the input DataFrame to a CSV file and returns the file path.

    Args:
        df: The DataFrame to save.
        filename: The name of the output CSV file (default is "output.csv").

    Returns:
        The file path of the saved CSV.
    """
    df.to_csv(filename, index=False)
    return filename


@tool
def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
    """
    Predicts funding for child repositories based on the parent-child relationship.

    Args:
        df: The input DataFrame containing 'repo', 'parent', and other features.

    Returns:
        A DataFrame with an updated 'final_weight' column for child repositories.
    """
    # Ensure required columns exist
    if not {"repo", "parent", "final_weight"}.issubset(df.columns):
        raise ValueError("Input DataFrame must contain 'repo', 'parent', and 'final_weight' columns.")

    # Normalize funding weights of child repositories within each parent group;
    # fall back to a uniform split when a group's weights sum to zero
    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
    )
    return df


@tool
def analyze_feature_importance(feature_importances: dict, feature_cols: list) -> list:
    """
    Visualizes feature importance and identifies irrelevant features.

    Args:
        feature_importances: A dictionary mapping feature names to importance scores.
        feature_cols: List of feature column names.

    Returns:
        The list of features whose importance falls below the threshold.
    """
    # Align the dict's scores with the feature column order; passing the dict
    # directly to the DataFrame constructor alongside a list would fail, since
    # pandas treats dict keys as an index and cannot align them with the list.
    importances = [feature_importances.get(col, 0.0) for col in feature_cols]
    importance_df = pd.DataFrame(
        {"Feature": feature_cols, "Importance": importances}
    ).sort_values(by="Importance", ascending=False)
    print("[INFO] Feature importances:")
    print(importance_df)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.title("Feature Importance")
    plt.gca().invert_yaxis()
    plt.show()

    # Identify irrelevant features (importance < threshold)
    threshold = 0.01
    irrelevant_features = importance_df[importance_df["Importance"] < threshold]["Feature"].tolist()
    print(f"[INFO] Irrelevant features (importance < {threshold}): {irrelevant_features}")
    return irrelevant_features


class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        # CodeAgent expects a smolagents model wrapper rather than a raw
        # transformers model, so SmolLM2 is loaded through TransformersModel
        # (which also handles tokenization internally).
        self.model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct")
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
                predict_funding,
                analyze_feature_importance,
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"],
        )

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        # Run the processing tools in sequence on the stored DataFrame
        clean_output = self.tools["clean_data"](df=self.df)
        self.df = clean_output.result if hasattr(clean_output, "result") else clean_output

        features_output = self.tools["extract_features"](df=self.df)
        self.df = features_output.result if hasattr(features_output, "result") else features_output

        funding_output = self.tools["predict_funding"](df=self.df)
        self.df = funding_output.result if hasattr(funding_output, "result") else funding_output

        if output_csv:
            csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
            print(f"CSV saved at: {csv_output}")

        return self.df
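

# --- Example usage (illustrative sketch) ---
# A minimal, hypothetical driver showing how DataSmolAgent might be invoked.
# The synthetic columns below ("repo", "parent", "stars", "forks",
# "created_at", "final_weight") are assumptions chosen to exercise the tools
# above, not a required schema. Note that constructing the agent downloads
# and loads the SmolLM2-1.7B-Instruct model.
if __name__ == "__main__":
    sample = pd.DataFrame(
        {
            "repo": ["org/app", "org/lib", "org/docs"],
            "parent": ["org/root", "org/root", "org/root"],
            "stars": [120, 45, 10],
            "forks": [30, 5, 1],
            "created_at": ["2021-06-01", "2022-01-15", "2023-03-20"],
            "final_weight": [0.5, 0.3, 0.2],
        }
    )

    agent = DataSmolAgent(sample)
    processed = agent.run("Clean the data and engineer features", output_csv=True)
    print(processed.head())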