import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from smolagents import CodeAgent, TransformersModel, tool
from sklearn.preprocessing import StandardScaler


@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the input DataFrame by stripping whitespace from column names and
    dropping rows that are completely empty.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        A cleaned DataFrame with stripped column names and without completely empty rows.
    """
    df.columns = df.columns.str.strip()
    df = df.dropna(how="all")
    return df


@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extracts features from the input DataFrame.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        The DataFrame updated with new dynamically engineered features.
    """
    # Numeric columns: log transformation for skewed features
    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    for col in numeric_cols:
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log1p(df[col])  # equivalent to log(col + 1)

    # Date-like columns extraction
    for col in df.columns:
        if "date" in col.lower() or "time" in col.lower() or col == "activity":
            try:
                df[col] = pd.to_datetime(df[col], errors="coerce")
                if not df[col].isna().all():  # Only create features if we have valid dates
                    df[f"{col}_year"] = df[col].dt.year
                    df[f"{col}_month"] = df[col].dt.month
                    df[f"{col}_day"] = df[col].dt.day
                    # Calculate age (days since date)
                    df[f"{col}_age_days"] = (pd.Timestamp.now() - df[col]).dt.days
            except Exception:
                pass

    # Repository age (days since creation)
    if "created_at" in df.columns:
        df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
        df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days

    # Recent activity flags: 1 if the last activity falls within 30/90 days
    if "activity" in df.columns and pd.api.types.is_datetime64_any_dtype(df["activity"]):
        now = pd.Timestamp.now()
        df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
        df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)

    # Open/closed PR ratio
    if {"open_prs", "closed_prs"}.issubset(df.columns):
        df["pr_ratio"] = df["open_prs"] / (df["closed_prs"] + 1)

    # Issue resolution speed
    if {"issues_closed", "issues_opened"}.issubset(df.columns):
        df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)

    # Feature ratios
    if {"stars", "forks"}.issubset(df.columns):
        df["stars_to_forks_ratio"] = df["stars"] / (df["forks"] + 1)
    if {"open_issues", "closed_issues"}.issubset(df.columns):
        df["issues_ratio"] = df["closed_issues"] / (df["open_issues"] + df["closed_issues"] + 1)

    # Non-numeric processing: encode categorical features, skipping object
    # columns that parse cleanly as dates
    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
    valid_cat = []
    for col in non_numeric:
        try:
            pd.to_datetime(df[col], errors="raise")
        except Exception:
            valid_cat.append(col)
    for col in valid_cat:
        if col not in ["repo", "parent"]:  # Skip identifier columns
            df[f"{col}_cat"] = df[col].astype("category").cat.codes

    # Normalize or standardize features
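    # Note: the scaler is refit for every column below, so each column is
    # simply z-scored independently; a shared fit is not needed here.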
    scaler = StandardScaler()
    scaled_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "contributors"]
    for col in scaled_cols:
        if col in df.columns:
            df[f"scaled_{col}"] = scaler.fit_transform(df[[col]])
    return df


@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Saves the input DataFrame to a CSV file and returns the file path.

    Args:
        df: The DataFrame to save.
        filename: The name of the output CSV file (default is "output.csv").

    Returns:
        The file path of the saved CSV.
    """
    df.to_csv(filename, index=False)
    return filename


@tool
def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
    """
    Predicts funding for child repositories based on the parent-child relationship.

    Args:
        df: The input DataFrame containing 'repo', 'parent', and other features.

    Returns:
        A DataFrame with an updated 'final_weight' column for child repositories.
    """
    # Ensure required columns exist
    if not {"repo", "parent", "final_weight"}.issubset(df.columns):
        raise ValueError("Input DataFrame must contain 'repo', 'parent', and 'final_weight' columns.")
    # Normalize funding weights for child repositories grouped by parent
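    # e.g. sibling weights [2.0, 3.0] under one parent become [0.4, 0.6];
    # a group whose weights sum to zero falls back to a uniform 1 / len(group)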
df["final_weight"] = df.groupby("parent")["final_weight"].transform(
lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
)
return df


@tool
def analyze_feature_importance(feature_importances: dict, feature_cols: list) -> list:
    """
    Visualizes feature importance and identifies irrelevant features.

    Args:
        feature_importances: A dictionary mapping feature names to their importance scores.
        feature_cols: List of feature column names.

    Returns:
        A list of feature names whose importance falls below the threshold.
    """
    # Align scores with feature_cols; features missing from the dict score 0.0
    importance_df = pd.DataFrame(
        {
            "Feature": feature_cols,
            "Importance": [feature_importances.get(col, 0.0) for col in feature_cols],
        }
    ).sort_values(by="Importance", ascending=False)
    print("[INFO] Feature importances:")
    print(importance_df)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.title("Feature Importance")
    plt.gca().invert_yaxis()
    plt.show()

    # Drop irrelevant features (importance < threshold)
    threshold = 0.01
    irrelevant_features = importance_df[importance_df["Importance"] < threshold]["Feature"].tolist()
    print(f"[INFO] Irrelevant features (importance < {threshold}): {irrelevant_features}")
    return irrelevant_features
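

# Illustrative call (assumed toy numbers): with importances
# {"stars": 0.42, "forks": 0.003} and feature_cols ["stars", "forks"],
# the function plots the bar chart and returns ["forks"] (importance < 0.01).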


class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        # CodeAgent expects a smolagents model wrapper (callable on chat
        # messages), not a raw transformers model, so wrap the checkpoint
        # in TransformersModel.
        model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct")
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
                predict_funding,
                analyze_feature_importance,
            ],
            model=model,
            additional_authorized_imports=["pandas", "numpy"],
        )

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        # Run the scripted pipeline on the stored DataFrame; `prompt` is kept
        # for interface compatibility, but the tool order below is fixed.
        clean_output = self.tools["clean_data"](df=self.df)
        self.df = clean_output.result if hasattr(clean_output, "result") else clean_output

        features_output = self.tools["extract_features"](df=self.df)
        self.df = features_output.result if hasattr(features_output, "result") else features_output

        funding_output = self.tools["predict_funding"](df=self.df)
        self.df = funding_output.result if hasattr(funding_output, "result") else funding_output

        if output_csv:
            csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
            print(f"CSV saved at: {csv_output}")

        return self.df
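

if __name__ == "__main__":
    # Minimal smoke test of the scripted pipeline. The column names below are
    # illustrative sample data, not a required schema. Note that constructing
    # the agent loads the SmolLM2 checkpoint even though run() only invokes
    # the tools directly.
    sample = pd.DataFrame(
        {
            "repo": ["org/app", "org/lib"],
            "parent": ["org/core", "org/core"],
            "stars": [120, 45],
            "forks": [30, 5],
            "created_at": ["2021-03-01", "2022-07-15"],
            "final_weight": [2.0, 3.0],
        }
    )
    agent = DataSmolAgent(sample)
    processed = agent.run(prompt="clean and featurize", output_csv=True)
    print(processed.head())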