Spaces:

FelixPhilip
/

DeepFundingOracle

Sleeping

App Files Files Community

FelixPhilip committed on Apr 26

Commit

3868d8d

1 Parent(s): 6a89c42

Oracle

Browse files

Files changed (1) hide show

Oracle/deepfundingoracle.py +66 -46

Oracle/deepfundingoracle.py CHANGED Viewed

@@ -25,7 +25,7 @@ from tqdm import tqdm
 import sys
 import re
-from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
@@ -121,28 +121,37 @@ def fetch_github_features(df):
     activity_list = []
     contributors_list = []
-    for idx, row in df.iterrows():
-        repo_url = row.get("repo", "")
-        print(f"[INFO] Processing repository {idx + 1}/{len(df)}: {repo_url}")
-        features = fetch_repo_metrics(repo_url)
-        stars_list.append(features["stargazers_count"])
-        forks_list.append(features["forks_count"])
-        watchers_list.append(features["watchers_count"])
-        issues_list.append(features["open_issues_count"])
-        pulls_list.append(features["pulls_count"])
-        activity_list.append(features["activity"])
-        # Fetch contributors count
-        try:
-            contributors_url = f"https://api.github.com/repos/{features['owner']}/{features['repo_name']}/contributors"
-            headers = {"Authorization": f"token {features['token']}"}
-            contributors_response = requests.get(contributors_url, headers=headers)
-            if contributors_response.status_code == 200:
-                contributors_list.append(len(contributors_response.json()))
-            else:
                 contributors_list.append(0)
-        except Exception:
-            contributors_list.append(0)
     df["stars"] = stars_list
     df["forks"] = forks_list
@@ -165,6 +174,7 @@ def assign_base_weight(df):
     start_time = time.time()
     llama = SmolLM()
     base_weights = []
     for idx, row in tqdm(df.iterrows(), total=len(df), desc="Assigning weights"):
         repo = row.get("repo", "")
@@ -186,19 +196,23 @@ def assign_base_weight(df):
             "Only output the numeric value."
         )
         try:
-            print(f"[INFO] Sending prompt to LLama model for repo: {repo}", flush=True)
-            start_llama_time = time.time()
-            response = llama.predict(prompt)
-            # Use regex to extract the first valid float from the response
-            match = re.search(r"[-+]?\d*\.\d+|\d+", response)
-            if match:
-                weight = float(match.group())
-                weight = min(max(weight, 0), 1)
             else:
-                raise ValueError(f"No valid float found in response: {response}")
-            end_llama_time = time.time()
-            print(f"[INFO] Received weight {weight} for {repo} in {end_llama_time - start_llama_time:.2f} seconds.", flush=True)
-            logging.info(f"[INFO] Processed repository {repo} in {end_llama_time - start_llama_time:.2f} seconds. Weight: {weight}")
         except Exception as e:
             print(f"[ERROR] Failed to process repository {repo}: {e}", flush=True)
             logging.error(f"[ERROR] Failed to process repository {repo}: {e}")
@@ -250,28 +264,34 @@ def train_predict_weight(df):
     print("[INFO] Splitting data into training and testing sets...", flush=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
     rf_model = RandomForestRegressor(random_state=42, max_depth=None)
-    param_grid = {
         "n_estimators": [100, 200, 300],
-        "max_depth": [None],  # Only allow unlimited depth
         "min_samples_split": [2, 5, 10],
         "min_samples_leaf": [1, 2, 4]
     }
-    print("[INFO] Performing grid search for hyperparameter tuning...", flush=True)
-    gridSearch = GridSearchCV(
         estimator=rf_model,
-        param_grid=param_grid,
-        cv=5,
-        scoring="neg_mean_squared_error"
     )
-    gridSearch.fit(X_train, y_train)
-    print("[INFO] Grid search completed.", flush=True)
-    print("Best Parameters:", gridSearch.best_params_, flush=True)
-    print("Best MSE:", -gridSearch.best_score_, flush=True)
-    y_pred = gridSearch.best_estimator_.predict(X_test)
     mse = mean_squared_error(y_test, y_pred)
     print("Final RF Test MSE:", mse, flush=True)
     print("[INFO] Predicting final weights for all rows...")
-    df["final_weight"] = gridSearch.best_estimator_.predict(X)
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
     return df

 import sys
 import re
+from sklearn.model_selection import train_test_split, RandomizedSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
     activity_list = []
     contributors_list = []
+    cache = {}
+    def get_metrics(repo_url):
+        if repo_url in cache:
+            return cache[repo_url]
+        val = fetch_repo_metrics(repo_url)
+        cache[repo_url] = val
+        return val
+    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+        futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
+        for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
+            res = fut.result()
+            stars_list.append(res["stargazers_count"])
+            forks_list.append(res["forks_count"])
+            watchers_list.append(res["watchers_count"])
+            issues_list.append(res["open_issues_count"])
+            pulls_list.append(res["pulls_count"])
+            activity_list.append(res["activity"])
+            # Fetch contributors count
+            try:
+                contributors_url = f"https://api.github.com/repos/{res['owner']}/{res['repo_name']}/contributors"
+                headers = {"Authorization": f"token {res['token']}"}
+                contributors_response = requests.get(contributors_url, headers=headers)
+                if contributors_response.status_code == 200:
+                    contributors_list.append(len(contributors_response.json()))
+                else:
+                    contributors_list.append(0)
+            except Exception:
                 contributors_list.append(0)
     df["stars"] = stars_list
     df["forks"] = forks_list
     start_time = time.time()
     llama = SmolLM()
     base_weights = []
+    llm_cache = {}
     for idx, row in tqdm(df.iterrows(), total=len(df), desc="Assigning weights"):
         repo = row.get("repo", "")
             "Only output the numeric value."
         )
         try:
+            if repo in llm_cache:
+                weight = llm_cache[repo]
             else:
+                print(f"[INFO] Sending prompt to LLama model for repo: {repo}", flush=True)
+                start_llama_time = time.time()
+                response = llama.predict(prompt)
+                # Use regex to extract the first valid float from the response
+                match = re.search(r"[-+]?\d*\.\d+|\d+", response)
+                if match:
+                    weight = float(match.group())
+                    weight = min(max(weight, 0), 1)
+                else:
+                    raise ValueError(f"No valid float found in response: {response}")
+                end_llama_time = time.time()
+                print(f"[INFO] Received weight {weight} for {repo} in {end_llama_time - start_llama_time:.2f} seconds.", flush=True)
+                logging.info(f"[INFO] Processed repository {repo} in {end_llama_time - start_llama_time:.2f} seconds. Weight: {weight}")
+                llm_cache[repo] = weight
         except Exception as e:
             print(f"[ERROR] Failed to process repository {repo}: {e}", flush=True)
             logging.error(f"[ERROR] Failed to process repository {repo}: {e}")
     print("[INFO] Splitting data into training and testing sets...", flush=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
     rf_model = RandomForestRegressor(random_state=42, max_depth=None)
+    param_dist = {
         "n_estimators": [100, 200, 300],
         "min_samples_split": [2, 5, 10],
         "min_samples_leaf": [1, 2, 4]
     }
+    print("[INFO] Performing randomized search for hyperparameter tuning...", flush=True)
+    rand_search = RandomizedSearchCV(
         estimator=rf_model,
+        param_distributions=param_dist,
+        n_iter=20,
+        cv=3,
+        scoring="neg_mean_squared_error",
+        random_state=42,
+        error_score="raise"
     )
+    rand_search.fit(X_train, y_train)
+    print("[INFO] Randomized search completed.", flush=True)
+    print("Best Parameters:", rand_search.best_params_, flush=True)
+    print("Best MSE:", -rand_search.best_score_, flush=True)
+    y_pred = rand_search.best_estimator_.predict(X_test)
     mse = mean_squared_error(y_test, y_pred)
     print("Final RF Test MSE:", mse, flush=True)
     print("[INFO] Predicting final weights for all rows...")
+    df["final_weight_raw"] = rand_search.best_estimator_.predict(X)
+    # Normalize weights per parent for meaningful spread
+    df["final_weight"] = df.groupby("parent")["final_weight_raw"].transform(
+        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
+    )
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
     return df