FelixPhilip committed
Commit 3388ab8 · 1 Parent(s): db24239
Files changed (3)
  1. Oracle/DataSmolAgent.py +73 -25
  2. Oracle/deepfundingoracle.py +101 -60
  3. app.py +31 -19
Oracle/DataSmolAgent.py CHANGED
@@ -1,7 +1,9 @@
 import pandas as pd
 import numpy as np
+import matplotlib.pyplot as plt
 from smolagents import tool, CodeAgent
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from sklearn.preprocessing import StandardScaler
 
 @tool
 def clean_data(df: pd.DataFrame) -> pd.DataFrame:
@@ -29,35 +31,52 @@ def extract_features(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
         The DataFrame updated with new dynamically engineered features.
     """
-    # Numeric columns: log transformation
+    # Numeric columns: log transformation for skewed features
     numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
     for col in numeric_cols:
-        if (df[col] >= 0).all():
+        if (df[col] >= 0).all() and df[col].skew() > 1:
             df[f"log_{col}"] = np.log(df[col] + 1)
 
-    # Date-like columns extraction
-    for col in df.columns:
-        if "date" in col.lower() or "time" in col.lower():
-            try:
-                df[col] = pd.to_datetime(df[col], errors='coerce')
-                df[f"{col}_year"] = df[col].dt.year
-                df[f"{col}_month"] = df[col].dt.month
-                df[f"{col}_day"] = df[col].dt.day
-            except (ValueError, TypeError):
-                pass
-
-    # Non-numeric processing: encode as categorical numeric codes.
-    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
-    valid_cat = []
-    for col in non_numeric:
-        try:
-            pd.to_datetime(df[col], errors='raise')
-        except ValueError:
-            valid_cat.append(col)
-    for col in valid_cat:
-        df[f"{col}_cat"] = df[col].astype("category").cat.codes
-
-    return df
+    # Repository age (days since creation)
+    if "created_at" in df.columns:
+        df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
+        df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
+
+    # Recent activity count (commits/issues in last 30/90 days)
+    if "activity" in df.columns:
+        df["activity"] = pd.to_datetime(df["activity"], errors="coerce")
+        now = pd.Timestamp.now()
+        df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
+        df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)
+
+    # Open/closed PR ratio
+    if {"open_prs", "closed_prs"}.issubset(df.columns):
+        df["pr_ratio"] = df["open_prs"] / (df["closed_prs"] + 1)
+
+    # Issue resolution speed
+    if {"issues_closed", "issues_opened"}.issubset(df.columns):
+        df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)
+
+    # Is the repo archived?
+    if "archived" in df.columns:
+        df["is_archived"] = df["archived"].astype(int)
+
+    # Description length
+    if "description" in df.columns:
+        df["description_length"] = df["description"].fillna("").apply(len)
+
+    # Topics count
+    if "topics" in df.columns:
+        df["topics_count"] = df["topics"].fillna("").apply(lambda x: len(x.split(",")))
+
+    # Normalize or standardize features
+    scaler = StandardScaler()
+    scaled_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "contributors"]
+    for col in scaled_cols:
+        if col in df.columns:
+            df[f"scaled_{col}"] = scaler.fit_transform(df[[col]])
+
+    return df
 
 @tool
 def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
@@ -95,6 +114,34 @@ def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
     )
     return df
 
+@tool
+def analyze_feature_importance(feature_importances: dict, feature_cols: list):
+    """
+    Visualizes feature importance and identifies irrelevant features.
+
+    Args:
+        feature_importances: A dictionary of feature names and their importance scores.
+        feature_cols: List of feature column names.
+    """
+    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
+    print("[INFO] Feature importances:")
+    print(importance_df)
+
+    # Plot feature importance
+    plt.figure(figsize=(10, 6))
+    plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
+    plt.xlabel("Importance")
+    plt.ylabel("Feature")
+    plt.title("Feature Importance")
+    plt.gca().invert_yaxis()
+    plt.show()
+
+    # Drop irrelevant features (importance < threshold)
+    threshold = 0.01
+    irrelevant_features = importance_df[importance_df["Importance"] < threshold]["Feature"].tolist()
+    print(f"[INFO] Irrelevant features (importance < {threshold}): {irrelevant_features}")
+    return irrelevant_features
+
 class DataSmolAgent(CodeAgent):
     """
     A data processing agent that cleans and extracts features from the provided DataFrame.
@@ -109,6 +156,7 @@ class DataSmolAgent(CodeAgent):
                 extract_features,
                 save_to_csv,  # Added save_to_csv tool
                 predict_funding,  # Added predict_funding tool
+                analyze_feature_importance,  # Added analyze_feature_importance tool
             ],
             model=self.model,
             additional_authorized_imports=["pandas", "numpy"]
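
The new extract_features branches can be exercised outside the agent; below is a minimal, hypothetical sketch on a toy DataFrame (column values are illustrative, not from the dataset), mirroring the committed rules for skewed numerics, repository age, topic counts, and scaling:

    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    # Toy data with the GitHub-style columns the diff expects (values are made up).
    df = pd.DataFrame({
        "stars": [10, 2500, 40],
        "forks": [1, 300, 5],
        "created_at": ["2020-01-01", "2015-06-15", "2023-03-10"],
        "topics": ["ml,oracle", "", "data"],
    })

    # Log-transform non-negative, strongly skewed numerics (same condition as the commit).
    for col in df.select_dtypes(include=[np.number]).columns:
        if (df[col] >= 0).all() and df[col].skew() > 1:
            df[f"log_{col}"] = np.log(df[col] + 1)

    # Repository age and topics count, as in the diff (an empty topics string still counts as 1 after split).
    df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
    df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
    df["topics_count"] = df["topics"].fillna("").apply(lambda x: len(x.split(",")))

    # Standardize selected columns when present.
    scaler = StandardScaler()
    for col in ["stars", "forks"]:
        df[f"scaled_{col}"] = scaler.fit_transform(df[[col]])

    print(df.head())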
Oracle/deepfundingoracle.py CHANGED
@@ -28,9 +28,11 @@ import re
 import json
 import time
 
-from sklearn.model_selection import train_test_split, RandomizedSearchCV
+from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
+import matplotlib.pyplot as plt
+import seaborn as sns
 
 from Oracle.SmolLM import SmolLM
 
@@ -261,6 +263,34 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
 
+def sanity_check_weights(df):
+    """
+    Sanity-checks LLM weights by comparing them with other metrics.
+    """
+    print("[INFO] Performing sanity check on LLM weights...")
+    df["sanity_check_weight"] = (df["stars"] + df["forks"] + df["watchers"]) / 3
+    df["ensemble_weight"] = (df["base_weight"] + df["sanity_check_weight"]) / 2
+    print("[INFO] Sanity check and ensemble weights added.")
+    return df
+
+def visualize_feature_distributions(df):
+    """
+    Visualizes feature distributions and correlations.
+    """
+    print("[INFO] Visualizing feature distributions and correlations...")
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+    # Plot feature distributions
+    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
+    plt.suptitle("Feature Distributions", fontsize=16)
+    plt.show()
+
+    # Plot feature correlations
+    correlation_matrix = df[numeric_cols].corr()
+    plt.figure(figsize=(12, 8))
+    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+    plt.title("Feature Correlation Matrix", fontsize=16)
+    plt.show()
 
 def normalize_funding(df):
     """
@@ -284,91 +314,101 @@ def prepare_dataset(file):
     print("[INFO] Fetching GitHub features...")
     df = fetch_github_features(df)
     print("[INFO] GitHub features fetched successfully.")
+    print("[INFO] Cleaning data...")
+    df = clean_data(df)
+    print("[INFO] Data cleaned successfully.")
     print("[INFO] Assigning base weights using LLama model...")
     df = assign_base_weight(df)
+    df = sanity_check_weights(df)  # Add sanity-check and ensemble weights
     df = train_predict_weight(df)
+    visualize_feature_distributions(df)  # Add feature visualization
    df = normalize_funding(df)
     end_time = time.time()
     print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
     return df
 
 
+##############################
+# Data Cleaning
+##############################
+def clean_data(df):
+    """
+    Cleans the input DataFrame by handling missing values and removing outliers.
+    """
+    # Impute missing values
+    df.fillna(df.median(numeric_only=True), inplace=True)
+
+    # Remove extreme outliers using quantiles
+    for col in df.select_dtypes(include=[np.number]).columns:
+        q1 = df[col].quantile(0.25)
+        q3 = df[col].quantile(0.75)
+        iqr = q3 - q1
+        lower_bound = q1 - 1.5 * iqr
+        upper_bound = q3 + 1.5 * iqr
+        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
+
+    return df
+
+
 ##############################
 # RandomForest Regression
 ##############################
-def train_predict_weight(df,
-                         criterion='gini',
-                         max_features='sqrt',
-                         max_depth=12,
-                         min_samples_split=2,
-                         min_samples_leaf=1):
+def train_predict_weight(df):
     """
-    Uses a RandomForestRegressor to predict a repository weight based on GitHub features.
-    The regressor is tuned with provided hyperparameters.
-    A flag column 'is_source' is used to indicate if a repository is the primary source.
-    If none is flagged, the repo with the highest prediction is set as the parent.
+    Trains a RandomForestRegressor with hyperparameter tuning and evaluates the model.
     """
-    print("[INFO] Starting weight prediction...", flush=True)
+    print("[INFO] Starting weight prediction with hyperparameter tuning...", flush=True)
     start_time = time.time()
     target = "base_weight"
-    feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity", "contributors"]
-
-    if "activity" in df.columns:
-        df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
-        now = pd.Timestamp.now(tz="UTC")
-        df["activity"] = (now - df["activity"]).dt.days.fillna(-1)
-
-    if target not in df.columns:
-        raise ValueError("Base weight column missing.")
+    feature_cols = [col for col in df.columns if col not in ["repo", "parent", "base_weight", "final_weight"]]
 
     X = df[feature_cols]
     y = df[target]
 
-    # For regression, if a classification criterion is given, switch to 'mse'
-    reg_criterion = "squared_error" if criterion in ["gini", "entropy"] else criterion
-
-    rf_model = RandomForestRegressor(random_state=42,
-                                     criterion=reg_criterion,
-                                     max_features=max_features,
-                                     max_depth=max_depth,
-                                     min_samples_split=min_samples_split,
-                                     min_samples_leaf=min_samples_leaf,
-                                     n_estimators=200)
-    rf_model.fit(X, y)
-    df["rf_pred"] = rf_model.predict(X)
-
-    # Provide feedback about one of the trees in the RF
-    try:
-        depth = rf_model.estimators_[0].get_depth()
-        leaves = rf_model.estimators_[0].get_n_leaves()
-        print(f"[INFO] RF tree depth: {depth}, number of leaves: {leaves}", flush=True)
-    except Exception:
-        pass
-
-    parent_map = df.groupby("parent")["repo"].apply(list).to_dict()
-    final_weights = {}
-
-    for parent, children in parent_map.items():
-        group_idxs = df[df["parent"] == parent].index
-        preds = df.loc[group_idxs, "rf_pred"]
-        total = preds.sum()
-        if total > 0:
-            normed = preds / total
-        else:
-            # If sum is zero, assign equal weights.
-            normed = pd.Series([1/len(preds)] * len(preds), index=preds.index)
-        for idx, weight in normed.items():
-            final_weights[idx] = weight
-
-    df["final_weight"] = df.index.map(final_weights).fillna(0.0)
+    # Split data into train/test sets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    # Hyperparameter tuning using GridSearchCV
+    param_grid = {
+        "n_estimators": [100, 200, 300],
+        "max_depth": [10, 15, 20],
+        "min_samples_split": [2, 5, 10],
+        "min_samples_leaf": [1, 2, 4]
+    }
+    rf = RandomForestRegressor(random_state=42)
+    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring="neg_mean_squared_error", verbose=2)
+    grid_search.fit(X_train, y_train)
+
+    # Best model
+    best_rf = grid_search.best_estimator_
+    print(f"[INFO] Best parameters: {grid_search.best_params_}")
+
+    # Evaluate on test set
+    y_pred = best_rf.predict(X_test)
+    mse = mean_squared_error(y_test, y_pred)
+    print(f"[INFO] Test MSE: {mse}")
+
+    # Feature importance analysis
+    feature_importances = best_rf.feature_importances_
+    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
+    print("[INFO] Feature importances:")
+    print(importance_df)
+
+    # Plot predictions vs. actual values
+    plt.scatter(y_test, y_pred, alpha=0.5)
+    plt.xlabel("Actual Base Weight")
+    plt.ylabel("Predicted Base Weight")
+    plt.title("Predictions vs. Actual")
+    plt.show()
+
+    # Assign predictions to DataFrame
+    df["final_weight"] = best_rf.predict(X)
 
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
     return df
 
 
-
-
 ##############################
 # CSV Output
 ##############################
@@ -393,3 +433,4 @@ if __name__ == "__main__":
     print("[INFO] Creating submission CSV...")
     create_submission_csv(df, output_file)
     print("[INFO] Process completed successfully.")
+
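
The retuned train_predict_weight can be approximated standalone; a minimal sketch on synthetic data follows. Column names track the diff, the grid is deliberately smaller than the committed one, and the committed feature_cols list keeps every column outside its exclusion list, so non-numeric columns are assumed to have been encoded or dropped upstream:

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.metrics import mean_squared_error

    # Synthetic stand-in for the fetched GitHub features (values are made up).
    rng = np.random.default_rng(42)
    df = pd.DataFrame({
        "stars": rng.integers(0, 5000, 200),
        "forks": rng.integers(0, 800, 200),
        "watchers": rng.integers(0, 5000, 200),
    })
    df["base_weight"] = (df["stars"] + df["forks"]) / (df["stars"] + df["forks"]).max()

    # As in the committed function: everything outside the exclusion list is a feature.
    feature_cols = [c for c in df.columns if c not in ["repo", "parent", "base_weight", "final_weight"]]
    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_cols], df["base_weight"], test_size=0.2, random_state=42)

    # Smaller grid than the commit's, to keep the sketch fast.
    param_grid = {"n_estimators": [100], "max_depth": [10, 20]}
    grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid,
                               cv=3, scoring="neg_mean_squared_error")
    grid_search.fit(X_train, y_train)

    best_rf = grid_search.best_estimator_
    print("Best params:", grid_search.best_params_)
    print("Test MSE:", mean_squared_error(y_test, best_rf.predict(X_test)))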
app.py CHANGED
@@ -3,6 +3,8 @@ import gradio as gr
 from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv
 import pandas as pd
 import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
 import time
 import io
 from PIL import Image
@@ -15,25 +17,34 @@ def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
     df = train_predict_weight(df)
     progress(0.6, desc="Saving results to CSV...")
     csv_path = create_submission_csv(df, "submission.csv")
-    progress(0.8, desc="Generating graph...")
-    # Example: plot histogram of a column if exists
-    fig, ax = plt.subplots()
-    if 'final_weight' in df.columns:
-        df['final_weight'].hist(ax=ax)
-        ax.set_title('Distribution of Final Weights')
-        ax.set_xlabel('Final Weight')
-        ax.set_ylabel('Count')
-    else:
-        ax.text(0.5, 0.5, 'No final_weight column to plot', ha='center')
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png')
-    buf.seek(0)
-    plt.close(fig)
-    img = Image.open(buf)
+    progress(0.8, desc="Generating graphs...")
+
+    # Feature distribution plot
+    dist_fig = plt.figure(figsize=(15, 10))
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
+    plt.suptitle("Feature Distributions", fontsize=16)
+    dist_buf = io.BytesIO()
+    plt.savefig(dist_buf, format='png')
+    dist_buf.seek(0)
+    plt.close(dist_fig)
+    dist_img = Image.open(dist_buf)
+
+    # Correlation matrix plot
+    corr_fig = plt.figure(figsize=(12, 8))
+    correlation_matrix = df[numeric_cols].corr()
+    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+    plt.title("Feature Correlation Matrix", fontsize=16)
+    corr_buf = io.BytesIO()
+    plt.savefig(corr_buf, format='png')
+    corr_buf.seek(0)
+    plt.close(corr_fig)
+    corr_img = Image.open(corr_buf)
+
     progress(1, desc="Done!")
     elapsed = time.time() - start_time
     preview = df.head().to_csv(index=False)
-    return preview, csv_path, img, f"Analysis completed in {elapsed:.2f} seconds."
+    return preview, csv_path, dist_img, corr_img, f"Analysis completed in {elapsed:.2f} seconds."
 
 iface = gr.Interface(
     fn=analyze_file,
@@ -41,14 +52,15 @@ iface = gr.Interface(
     outputs=[
         gr.Textbox(label="Preview of Results"),
         gr.File(label="Download CSV"),
-        gr.Image(label="Analysis Graph"),
+        gr.Image(label="Feature Distributions"),
+        gr.Image(label="Feature Correlation Matrix"),
         gr.Textbox(label="Status/Timing Info")
     ],
     title="DeepFunding Oracle",
-    description="Upload a CSV of repo-parent relationships; see analysis progress, get a graph, and download results as CSV.",
+    description="Upload a CSV of repo-parent relationships; see analysis progress, get graphs, and download results as CSV.",
     allow_flagging="never"
 )
 
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
-    iface.launch(server_name="0.0.0.0", server_port=port)
+    iface.launch(server_name="0.0.0.0", server_port=port)
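
For readers adapting the Gradio outputs, here is a minimal, self-contained sketch of the figure-to-PIL round trip that analyze_file now performs for each graph (dummy data; the Agg backend is an assumption for a headless server):

    import io
    import matplotlib
    matplotlib.use("Agg")  # headless backend, assumed for server deployments
    import matplotlib.pyplot as plt
    import pandas as pd
    from PIL import Image

    # Dummy data standing in for the prepared DataFrame.
    df = pd.DataFrame({"final_weight": [0.1, 0.2, 0.3, 0.4]})

    fig = plt.figure(figsize=(6, 4))
    df["final_weight"].hist(bins=10)
    plt.title("Feature Distributions (dummy data)")

    # Render the current figure to an in-memory PNG and wrap it as a PIL image,
    # which is what gets returned to a gr.Image output.
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    buf.seek(0)
    plt.close(fig)
    img = Image.open(buf)
    print(img.size)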