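"""Data-processing tools and a smolagents CodeAgent for cleaning a GitHub
repository DataFrame, engineering features, and normalizing funding weights
across parent/child repositories."""
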
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from smolagents import tool, CodeAgent, TransformersModel
from sklearn.preprocessing import StandardScaler

@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the input DataFrame by stripping whitespace from column names and dropping rows that are completely empty.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
         A cleaned DataFrame with stripped column names and without completely empty rows.
    """
    df = df.copy()  # work on a copy so the caller's DataFrame is not mutated
    df.columns = df.columns.str.strip()
    df = df.dropna(how="all")
    return df

@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extracts features from the input DataFrame.

    Args:
        df: The input DataFrame containing the raw data.

    Returns:
        The DataFrame updated with new dynamically engineered features.
    """
    # Numeric columns: log transformation for skewed features
    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    for col in numeric_cols:
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log(df[col] + 1)

    # Date-like columns extraction
    for col in df.columns:
        if "date" in col.lower() or "time" in col.lower() or col == "activity":
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
                if not df[col].isna().all():  # Only create features if we have valid dates
                    df[f"{col}_year"] = df[col].dt.year
                    df[f"{col}_month"] = df[col].dt.month
                    df[f"{col}_day"] = df[col].dt.day
                    # Calculate age (days since date)
                    df[f"{col}_age_days"] = (pd.Timestamp.now() - df[col]).dt.days
            except Exception:
                pass

    # Repository age (days since creation)
    if "created_at" in df.columns:
        df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
        df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days

    # Recent activity count (commits/issues in last 30/90 days)
    if "activity" in df.columns and pd.api.types.is_datetime64_any_dtype(df["activity"]):
        now = pd.Timestamp.now()
        df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
        df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)

    # Open/closed PR ratio
    if {"open_prs", "closed_prs"}.issubset(df.columns):
        df["pr_ratio"] = df["open_prs"] / (df["closed_prs"] + 1)

    # Issue resolution speed
    if {"issues_closed", "issues_opened"}.issubset(df.columns):
        df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)

    # Feature ratios
    if {"stars", "forks"}.issubset(df.columns):
        df["stars_to_forks_ratio"] = df["stars"] / (df["forks"] + 1)

    if {"open_issues", "closed_issues"}.issubset(df.columns):
        df["issues_ratio"] = df["closed_issues"] / (df["open_issues"] + df["closed_issues"] + 1)

    # Non-numeric processing: encode categorical features
    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
    valid_cat = []
    for col in non_numeric:
        try:
            pd.to_datetime(df[col], errors='raise')
        except Exception:
            valid_cat.append(col)

    for col in valid_cat:
        if col not in ["repo", "parent"]:  # Skip identifier columns
            df[f"{col}_cat"] = df[col].astype("category").cat.codes

    # Normalize or standardize features
    scaler = StandardScaler()
    scaled_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "contributors"]
    for col in scaled_cols:
        if col in df.columns:
            df[f"scaled_{col}"] = scaler.fit_transform(df[[col]])

    return df

@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Saves the input DataFrame to a CSV file and returns the file path.

    Args:
        df: The DataFrame to save.
        filename: The name of the output CSV file (default is "output.csv").

    Returns:
        The file path of the saved CSV.
    """
    df.to_csv(filename, index=False)
    return filename

@tool
def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalizes funding weights for child repositories so that the weights within each parent group sum to 1.

    Args:
        df: The input DataFrame containing 'repo', 'parent', 'final_weight', and other features.

    Returns:
        A DataFrame with an updated 'final_weight' column for child repositories.
    """
    # Ensure required columns exist
    if not {"repo", "parent", "final_weight"}.issubset(df.columns):
        raise ValueError("Input DataFrame must contain 'repo', 'parent', and 'final_weight' columns.")

    # Normalize funding weights for child repositories grouped by parent
    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
    )
    return df

@tool
def analyze_feature_importance(feature_importances: list, feature_cols: list) -> list:
    """
    Visualizes feature importance and identifies irrelevant features.

    Args:
        feature_importances: Importance scores aligned with feature_cols (e.g. a fitted model's feature_importances_ as a list).
        feature_cols: List of feature column names.

    Returns:
        A list of feature names whose importance falls below the pruning threshold.
    """
    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
    print("[INFO] Feature importances:")
    print(importance_df)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.title("Feature Importance")
    plt.gca().invert_yaxis()
    plt.show()

    # Drop irrelevant features (importance < threshold)
    threshold = 0.01
    irrelevant_features = importance_df[importance_df["Importance"] < threshold]["Feature"].tolist()
    print(f"[INFO] Irrelevant features (importance < {threshold}): {irrelevant_features}")
    return irrelevant_features
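
# A minimal sketch (not part of the pipeline) of how the importance scores fed
# to analyze_feature_importance might be produced, e.g. from a fitted
# scikit-learn RandomForestRegressor; the feature and target columns below are
# illustrative assumptions:
#
#     from sklearn.ensemble import RandomForestRegressor
#
#     feature_cols = ["scaled_stars", "scaled_forks", "pr_ratio"]
#     rf = RandomForestRegressor(n_estimators=100, random_state=0)
#     rf.fit(df[feature_cols], df["final_weight"])
#     irrelevant = analyze_feature_importance(
#         feature_importances=rf.feature_importances_.tolist(),
#         feature_cols=feature_cols,
#     )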

class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.
    """
    def __init__(self, df: pd.DataFrame):
        self.df = df
        # CodeAgent expects a smolagents model wrapper rather than a raw
        # transformers model; TransformersModel loads the tokenizer and
        # weights for the given model id internally.
        self.model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct")
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,
                predict_funding,
                analyze_feature_importance,
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"],
        )

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        # Deterministic pipeline: the tools run in a fixed order on self.df;
        # the prompt is currently unused and kept for interface compatibility.
        clean_output = self.tools["clean_data"](df=self.df)
        self.df = clean_output.result if hasattr(clean_output, "result") else clean_output

        features_output = self.tools["extract_features"](df=self.df)
        self.df = features_output.result if hasattr(features_output, "result") else features_output

        funding_output = self.tools["predict_funding"](df=self.df)
        self.df = funding_output.result if hasattr(funding_output, "result") else funding_output

        if output_csv:
            csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
            print(f"CSV saved at: {csv_output}")

        return self.df
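
if __name__ == "__main__":
    # Minimal smoke test with made-up data (a sketch: the column names mirror
    # the ones the tools above look for). The tools are invoked directly so
    # the demo does not download the SmolLM2 model that DataSmolAgent loads
    # in its constructor.
    toy = pd.DataFrame(
        {
            "repo ": ["a/x", "a/y", "b/z"],  # padded name; clean_data strips it
            "parent": ["a", "a", "b"],
            "stars": [120, 30, 7],
            "forks": [40, 5, 1],
            "final_weight": [0.6, 0.2, 0.2],
        }
    )
    toy = clean_data(df=toy)
    toy = extract_features(df=toy)
    toy = predict_funding(df=toy)  # weights now sum to 1 within each parent
    print(toy[["repo", "parent", "final_weight"]])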