FelixPhilip committed on
Commit
3868d8d
·
1 Parent(s): 6a89c42
Files changed (1) hide show
  1. Oracle/deepfundingoracle.py +66 -46
Oracle/deepfundingoracle.py CHANGED
@@ -25,7 +25,7 @@ from tqdm import tqdm
25
  import sys
26
  import re
27
 
28
- from sklearn.model_selection import train_test_split, GridSearchCV
29
  from sklearn.ensemble import RandomForestRegressor
30
  from sklearn.metrics import mean_squared_error
31
 
@@ -121,28 +121,37 @@ def fetch_github_features(df):
121
  activity_list = []
122
  contributors_list = []
123
 
124
- for idx, row in df.iterrows():
125
- repo_url = row.get("repo", "")
126
- print(f"[INFO] Processing repository {idx + 1}/{len(df)}: {repo_url}")
127
- features = fetch_repo_metrics(repo_url)
128
- stars_list.append(features["stargazers_count"])
129
- forks_list.append(features["forks_count"])
130
- watchers_list.append(features["watchers_count"])
131
- issues_list.append(features["open_issues_count"])
132
- pulls_list.append(features["pulls_count"])
133
- activity_list.append(features["activity"])
134
 
135
- # Fetch contributors count
136
- try:
137
- contributors_url = f"https://api.github.com/repos/{features['owner']}/{features['repo_name']}/contributors"
138
- headers = {"Authorization": f"token {features['token']}"}
139
- contributors_response = requests.get(contributors_url, headers=headers)
140
- if contributors_response.status_code == 200:
141
- contributors_list.append(len(contributors_response.json()))
142
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  contributors_list.append(0)
144
- except Exception:
145
- contributors_list.append(0)
146
 
147
  df["stars"] = stars_list
148
  df["forks"] = forks_list
@@ -165,6 +174,7 @@ def assign_base_weight(df):
165
  start_time = time.time()
166
  llama = SmolLM()
167
  base_weights = []
 
168
 
169
  for idx, row in tqdm(df.iterrows(), total=len(df), desc="Assigning weights"):
170
  repo = row.get("repo", "")
@@ -186,19 +196,23 @@ def assign_base_weight(df):
186
  "Only output the numeric value."
187
  )
188
  try:
189
- print(f"[INFO] Sending prompt to LLama model for repo: {repo}", flush=True)
190
- start_llama_time = time.time()
191
- response = llama.predict(prompt)
192
- # Use regex to extract the first valid float from the response
193
- match = re.search(r"[-+]?\d*\.\d+|\d+", response)
194
- if match:
195
- weight = float(match.group())
196
- weight = min(max(weight, 0), 1)
197
  else:
198
- raise ValueError(f"No valid float found in response: {response}")
199
- end_llama_time = time.time()
200
- print(f"[INFO] Received weight {weight} for {repo} in {end_llama_time - start_llama_time:.2f} seconds.", flush=True)
201
- logging.info(f"[INFO] Processed repository {repo} in {end_llama_time - start_llama_time:.2f} seconds. Weight: {weight}")
 
 
 
 
 
 
 
 
 
 
202
  except Exception as e:
203
  print(f"[ERROR] Failed to process repository {repo}: {e}", flush=True)
204
  logging.error(f"[ERROR] Failed to process repository {repo}: {e}")
@@ -250,28 +264,34 @@ def train_predict_weight(df):
250
  print("[INFO] Splitting data into training and testing sets...", flush=True)
251
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
252
  rf_model = RandomForestRegressor(random_state=42, max_depth=None)
253
- param_grid = {
254
  "n_estimators": [100, 200, 300],
255
- "max_depth": [None], # Only allow unlimited depth
256
  "min_samples_split": [2, 5, 10],
257
  "min_samples_leaf": [1, 2, 4]
258
  }
259
- print("[INFO] Performing grid search for hyperparameter tuning...", flush=True)
260
- gridSearch = GridSearchCV(
261
  estimator=rf_model,
262
- param_grid=param_grid,
263
- cv=5,
264
- scoring="neg_mean_squared_error"
 
 
 
265
  )
266
- gridSearch.fit(X_train, y_train)
267
- print("[INFO] Grid search completed.", flush=True)
268
- print("Best Parameters:", gridSearch.best_params_, flush=True)
269
- print("Best MSE:", -gridSearch.best_score_, flush=True)
270
- y_pred = gridSearch.best_estimator_.predict(X_test)
271
  mse = mean_squared_error(y_test, y_pred)
272
  print("Final RF Test MSE:", mse, flush=True)
273
  print("[INFO] Predicting final weights for all rows...")
274
- df["final_weight"] = gridSearch.best_estimator_.predict(X)
 
 
 
 
275
  end_time = time.time()
276
  print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
277
  return df
 
25
  import sys
26
  import re
27
 
28
+ from sklearn.model_selection import train_test_split, RandomizedSearchCV
29
  from sklearn.ensemble import RandomForestRegressor
30
  from sklearn.metrics import mean_squared_error
31
 
 
121
  activity_list = []
122
  contributors_list = []
123
 
124
+ cache = {}
 
 
 
 
 
 
 
 
 
125
 
126
+ def get_metrics(repo_url):
127
+ if repo_url in cache:
128
+ return cache[repo_url]
129
+ val = fetch_repo_metrics(repo_url)
130
+ cache[repo_url] = val
131
+ return val
132
+
133
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
134
+ futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
135
+ for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
136
+ res = fut.result()
137
+ stars_list.append(res["stargazers_count"])
138
+ forks_list.append(res["forks_count"])
139
+ watchers_list.append(res["watchers_count"])
140
+ issues_list.append(res["open_issues_count"])
141
+ pulls_list.append(res["pulls_count"])
142
+ activity_list.append(res["activity"])
143
+
144
+ # Fetch contributors count
145
+ try:
146
+ contributors_url = f"https://api.github.com/repos/{res['owner']}/{res['repo_name']}/contributors"
147
+ headers = {"Authorization": f"token {res['token']}"}
148
+ contributors_response = requests.get(contributors_url, headers=headers)
149
+ if contributors_response.status_code == 200:
150
+ contributors_list.append(len(contributors_response.json()))
151
+ else:
152
+ contributors_list.append(0)
153
+ except Exception:
154
  contributors_list.append(0)
 
 
155
 
156
  df["stars"] = stars_list
157
  df["forks"] = forks_list
 
174
  start_time = time.time()
175
  llama = SmolLM()
176
  base_weights = []
177
+ llm_cache = {}
178
 
179
  for idx, row in tqdm(df.iterrows(), total=len(df), desc="Assigning weights"):
180
  repo = row.get("repo", "")
 
196
  "Only output the numeric value."
197
  )
198
  try:
199
+ if repo in llm_cache:
200
+ weight = llm_cache[repo]
 
 
 
 
 
 
201
  else:
202
+ print(f"[INFO] Sending prompt to LLama model for repo: {repo}", flush=True)
203
+ start_llama_time = time.time()
204
+ response = llama.predict(prompt)
205
+ # Use regex to extract the first valid float from the response
206
+ match = re.search(r"[-+]?\d*\.\d+|\d+", response)
207
+ if match:
208
+ weight = float(match.group())
209
+ weight = min(max(weight, 0), 1)
210
+ else:
211
+ raise ValueError(f"No valid float found in response: {response}")
212
+ end_llama_time = time.time()
213
+ print(f"[INFO] Received weight {weight} for {repo} in {end_llama_time - start_llama_time:.2f} seconds.", flush=True)
214
+ logging.info(f"[INFO] Processed repository {repo} in {end_llama_time - start_llama_time:.2f} seconds. Weight: {weight}")
215
+ llm_cache[repo] = weight
216
  except Exception as e:
217
  print(f"[ERROR] Failed to process repository {repo}: {e}", flush=True)
218
  logging.error(f"[ERROR] Failed to process repository {repo}: {e}")
 
264
  print("[INFO] Splitting data into training and testing sets...", flush=True)
265
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
266
  rf_model = RandomForestRegressor(random_state=42, max_depth=None)
267
+ param_dist = {
268
  "n_estimators": [100, 200, 300],
 
269
  "min_samples_split": [2, 5, 10],
270
  "min_samples_leaf": [1, 2, 4]
271
  }
272
+ print("[INFO] Performing randomized search for hyperparameter tuning...", flush=True)
273
+ rand_search = RandomizedSearchCV(
274
  estimator=rf_model,
275
+ param_distributions=param_dist,
276
+ n_iter=20,
277
+ cv=3,
278
+ scoring="neg_mean_squared_error",
279
+ random_state=42,
280
+ error_score="raise"
281
  )
282
+ rand_search.fit(X_train, y_train)
283
+ print("[INFO] Randomized search completed.", flush=True)
284
+ print("Best Parameters:", rand_search.best_params_, flush=True)
285
+ print("Best MSE:", -rand_search.best_score_, flush=True)
286
+ y_pred = rand_search.best_estimator_.predict(X_test)
287
  mse = mean_squared_error(y_test, y_pred)
288
  print("Final RF Test MSE:", mse, flush=True)
289
  print("[INFO] Predicting final weights for all rows...")
290
+ df["final_weight_raw"] = rand_search.best_estimator_.predict(X)
291
+ # Normalize weights per parent for meaningful spread
292
+ df["final_weight"] = df.groupby("parent")["final_weight_raw"].transform(
293
+ lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
294
+ )
295
  end_time = time.time()
296
  print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
297
  return df