Commit 6282a14 (parent 0861b62): Oracle

Oracle/deepfundingoracle.py  CHANGED  (+74 −19)
@@ -7,7 +7,7 @@ This script dynamically loads dependency data and for each repository URL:
 • Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight.
 The output submission CSV has three columns: repo, parent, and final_weight.
 """
-
+import base64
 from io import StringIO
 import os
 import warnings
@@ -123,6 +123,7 @@ def fetch_github_features(df):
     pulls_list = []
     activity_list = []
     contributors_list = []
+    dependencies_list = []
 
     cache = {}
 
@@ -130,6 +131,28 @@ def fetch_github_features(df):
         if repo_url in cache:
             return cache[repo_url]
         val = fetch_repo_metrics(repo_url)
+        try:
+            m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
+            if m:
+                owner, repo_name = m.group(1), m.group(2)
+                pkg_url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/package.json"
+                headers = {}
+                token = os.environ.get("GITHUB_API_TOKEN", "")
+                if token:
+                    headers["Authorization"] = f"token {token}"
+                pkg_resp = requests.get(pkg_url, headers=headers)
+                if pkg_resp.status_code == 200:
+                    pkg_data = pkg_resp.json()
+                    content = base64.b64decode(pkg_data.get("content", "")).decode("utf-8")
+                    pkg_json = json.loads(content)
+                    dependencies = pkg_json.get("dependencies", {})
+                    val["dependencies_count"] = len(dependencies)
+                else:
+                    val["dependencies_count"] = 0
+            else:
+                val["dependencies_count"] = 0
+        except Exception:
+            val["dependencies_count"] = 0
         cache[repo_url] = val
         return val
 
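For reference, a self-contained sketch of the same lookup: the GitHub contents API (GET /repos/{owner}/{repo}/contents/package.json) returns the file wrapped in JSON with a base64-encoded "content" field, so the dependency count falls out of a single request. The helper name and the timeout are illustrative, not part of the commit:

    import base64, json, os, re
    import requests

    def count_npm_dependencies(repo_url: str) -> int:
        # Hypothetical helper: counts entries under "dependencies" in package.json.
        m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
        if not m:
            return 0
        owner, repo = m.group(1), m.group(2)
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/package.json"
        headers = {}
        token = os.environ.get("GITHUB_API_TOKEN", "")
        if token:
            headers["Authorization"] = f"token {token}"
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code != 200:
            return 0
        # The contents API base64-encodes the file body (with embedded newlines,
        # which b64decode discards by default).
        raw = base64.b64decode(resp.json().get("content", "")).decode("utf-8")
        return len(json.loads(raw).get("dependencies", {}))

For example, count_npm_dependencies("https://github.com/expressjs/express") would return the number of runtime dependencies declared in that repo's package.json, or 0 when the file is absent.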
@@ -137,13 +160,13 @@ def fetch_github_features(df):
         futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
         for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
             res = fut.result()
-            stars_list.append(res["stargazers_count"])
-            forks_list.append(res["forks_count"])
-            watchers_list.append(res["watchers_count"])
-            issues_list.append(res["open_issues_count"])
-            pulls_list.append(res["pulls_count"])
-            activity_list.append(res["activity"])
-
+            stars_list.append(res.get("stargazers_count", 0))
+            forks_list.append(res.get("forks_count", 0))
+            watchers_list.append(res.get("watchers_count", 0))
+            issues_list.append(res.get("open_issues_count", 0))
+            pulls_list.append(res.get("pulls_count", 0))
+            activity_list.append(res.get("activity", 0))
+            dependencies_list.append(res.get("dependencies_count", 0))
             # Fetch contributors count
             try:
                 contributors_url = f"https://api.github.com/repos/{res['owner']}/{res['repo_name']}/contributors"
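The switch to res.get(key, 0) matters for robustness: a repo whose metrics call failed can come back as a partial dict, and direct indexing would raise KeyError in the collection loop and abort the whole fetch, while the default records a zero and moves on. A two-line illustration:

    # Illustrative: a partial metrics dict from a failed API call.
    res = {"stargazers_count": 10}
    print(res.get("forks_count", 0))  # 0 instead of a KeyError from res["forks_count"]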
@@ -163,6 +186,7 @@ def fetch_github_features(df):
     df["pulls"] = pulls_list
     df["activity"] = activity_list
     df["contributors"] = contributors_list
+    df["dependencies_count"] = dependencies_list
 
     end_time = time.time()
     print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
@@ -321,11 +345,23 @@ def prepare_dataset(file):
 ##############################
 # RandomForest Regression
 ##############################
-def train_predict_weight(df):
+def train_predict_weight(df,
+                         criterion='gini',
+                         max_features=1.0,
+                         max_depth=12,
+                         min_samples_split=2,
+                         min_samples_leaf=1):
+    """
+    Uses a RandomForestRegressor to predict a repository weight based on GitHub features.
+    The regressor is tuned with the provided hyperparameters.
+    A flag column 'is_source' indicates whether a repository is the primary source.
+    If none is flagged, the repo with the highest prediction is set as the parent.
+    """
     print("[INFO] Starting weight prediction...", flush=True)
     start_time = time.time()
     target = "base_weight"
     feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity", "contributors"]
+
     if "activity" in df.columns:
         df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
         now = pd.Timestamp.now(tz="UTC")
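"activity" is parsed to a timezone-aware datetime here, but the regressor needs a number, so the timestamp presumably gets reduced to a recency value in the lines between these hunks. A sketch of one such conversion (days since last activity), assuming "activity" holds the last-push timestamp:

    import pandas as pd

    df = pd.DataFrame({"activity": ["2024-01-15T12:00:00Z", None]})
    df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
    now = pd.Timestamp.now(tz="UTC")
    # Days since last activity; NaT (unparseable dates) becomes NaN, filled with a large value.
    df["activity"] = (now - df["activity"]).dt.days.fillna(9999)
    print(df)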
@@ -333,25 +369,44 @@ def train_predict_weight(df):
 
     if target not in df.columns:
         raise ValueError("Base weight column missing.")
+
     X = df[feature_cols]
     y = df[target]
-    rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
+
+    # For regression, if a classification criterion is given, switch to 'squared_error'
+    reg_criterion = "squared_error" if criterion in ["gini", "entropy"] else criterion
+
+    rf_model = RandomForestRegressor(random_state=42,
+                                     criterion=reg_criterion,
+                                     max_features=max_features,
+                                     max_depth=max_depth,
+                                     min_samples_split=min_samples_split,
+                                     min_samples_leaf=min_samples_leaf,
+                                     n_estimators=200)
     rf_model.fit(X, y)
     df["rf_pred"] = rf_model.predict(X)
 
+    # Provide feedback about one of the trees in the RF
+    try:
+        depth = rf_model.estimators_[0].get_depth()
+        leaves = rf_model.estimators_[0].get_n_leaves()
+        print(f"[INFO] RF tree depth: {depth}, number of leaves: {leaves}", flush=True)
+    except Exception:
+        pass
+
     parent_map = df.groupby("parent")["repo"].apply(list).to_dict()
     final_weights = {}
 
     for parent, children in parent_map.items():
-        parent_idx = df[df["repo"] == parent].index
         group_idxs = df[df["parent"] == parent].index
-        if
-
-
-
-
-
-
+        # Check if a repo in the group is flagged as is_source
+        source_idxs = df.loc[group_idxs][df.loc[group_idxs, "is_source"] == True].index.tolist() if "is_source" in df.columns else []
+        if source_idxs:
+            parent_idx = source_idxs[0]
+        else:
+            # Fallback: choose the repo with the maximum prediction as the parent
+            preds = df.loc[group_idxs, "rf_pred"]
+            parent_idx = preds.idxmax()
         child_idxs = [idx for idx in group_idxs if idx != parent_idx]
         if child_idxs:
             child_preds = df.loc[child_idxs, "rf_pred"]
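Two version notes on the regressor setup: 'gini' and 'entropy' are classification criteria, and the regressor's squared-error criterion is spelled "squared_error" from scikit-learn 1.0 on (the old "mse" alias was removed in 1.2); likewise max_features='auto' was removed for regressors, with 1.0 (all features) as the equivalent. A minimal sketch of the mapping, under those assumptions:

    from sklearn.ensemble import RandomForestRegressor

    def make_regressor(criterion="gini", max_features=1.0):
        # Map classifier-style criteria onto the regressor equivalent.
        reg_criterion = "squared_error" if criterion in ("gini", "entropy") else criterion
        return RandomForestRegressor(n_estimators=200, random_state=42,
                                     criterion=reg_criterion, max_features=max_features)

    model = make_regressor()
    print(model.criterion)  # squared_error

In the fallback branch, preds.idxmax() returns the index label of the highest prediction, so parent_idx compares directly against the labels in group_idxs when the children are filtered.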
@@ -366,7 +421,7 @@ def train_predict_weight(df):
 
     df["final_weight"] = df.index.map(final_weights).fillna(0.0)
 
-    # Enforce monotonicity within each group
+    # Enforce monotonicity within each group so weights are descending
    for parent, children in parent_map.items():
         group_idxs = df[df["parent"] == parent].index
         group_weights = df.loc[group_idxs, "final_weight"].sort_values(ascending=False)
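The loop body that applies the re-sorted weights falls outside this hunk; one plausible reading of the step, as a toy sketch (the reassignment rule here is an assumption, not the commit's code), is to hand the sorted weights back out in rf_pred order so the group's weights descend with the predictions:

    import pandas as pd

    # Toy group: give the largest weight to the highest-predicted row, and so on down.
    df = pd.DataFrame({"rf_pred": [0.2, 0.9, 0.5], "final_weight": [0.5, 0.3, 0.2]})
    order = df["rf_pred"].sort_values(ascending=False).index
    sorted_weights = df["final_weight"].sort_values(ascending=False).to_numpy()
    df.loc[order, "final_weight"] = sorted_weights
    print(df)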
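Finally, a hypothetical toy invocation of the changed function, assuming it mutates the frame in place as the hunks above show (the column values are made up, and with no 'is_source' column the idxmax fallback picks each group's parent):

    import pandas as pd

    df = pd.DataFrame({
        "repo": ["org/a", "org/b", "org/c"],
        "parent": ["root", "root", "root"],
        "stars": [100, 10, 5], "forks": [20, 2, 1], "watchers": [100, 10, 5],
        "open_issues": [5, 1, 0], "pulls": [3, 1, 0],
        "activity": ["2024-05-01", "2024-01-01", "2020-01-01"],
        "contributors": [12, 2, 1],
        "base_weight": [0.6, 0.3, 0.1],
    })
    train_predict_weight(df, criterion="gini", max_depth=12)
    print(df[["repo", "parent", "final_weight"]])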