FelixPhilip committed on
Commit
6282a14
·
1 Parent(s): 0861b62
Files changed (1) hide show
  1. Oracle/deepfundingoracle.py +74 -19
Oracle/deepfundingoracle.py CHANGED
@@ -7,7 +7,7 @@ This script dynamically loads dependency data and for each repository URL:
7
  • Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight.
8
  The output submission CSV has three columns: repo, parent, and final_weight.
9
  """
10
-
11
  from io import StringIO
12
  import os
13
  import warnings
@@ -123,6 +123,7 @@ def fetch_github_features(df):
123
  pulls_list = []
124
  activity_list = []
125
  contributors_list = []
 
126
 
127
  cache = {}
128
 
@@ -130,6 +131,28 @@ def fetch_github_features(df):
130
  if repo_url in cache:
131
  return cache[repo_url]
132
  val = fetch_repo_metrics(repo_url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  cache[repo_url] = val
134
  return val
135
 
@@ -137,13 +160,13 @@ def fetch_github_features(df):
137
  futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
138
  for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
139
  res = fut.result()
140
- stars_list.append(res["stargazers_count"])
141
- forks_list.append(res["forks_count"])
142
- watchers_list.append(res["watchers_count"])
143
- issues_list.append(res["open_issues_count"])
144
- pulls_list.append(res["pulls_count"])
145
- activity_list.append(res["activity"])
146
-
147
  # Fetch contributors count
148
  try:
149
  contributors_url = f"https://api.github.com/repos/{res['owner']}/{res['repo_name']}/contributors"
@@ -163,6 +186,7 @@ def fetch_github_features(df):
163
  df["pulls"] = pulls_list
164
  df["activity"] = activity_list
165
  df["contributors"] = contributors_list
 
166
 
167
  end_time = time.time()
168
  print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
@@ -321,11 +345,23 @@ def prepare_dataset(file):
321
  ##############################
322
  # RandomForest Regression
323
  ##############################
324
- def train_predict_weight(df):
 
 
 
 
 
 
 
 
 
 
 
325
  print("[INFO] Starting weight prediction...", flush=True)
326
  start_time = time.time()
327
  target = "base_weight"
328
  feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity", "contributors"]
 
329
  if "activity" in df.columns:
330
  df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
331
  now = pd.Timestamp.now(tz="UTC")
@@ -333,25 +369,44 @@ def train_predict_weight(df):
333
 
334
  if target not in df.columns:
335
  raise ValueError("Base weight column missing.")
 
336
  X = df[feature_cols]
337
  y = df[target]
338
- rf_model = RandomForestRegressor(random_state=42, max_depth=12, n_estimators=200)
 
 
 
 
 
 
 
 
 
 
339
  rf_model.fit(X, y)
340
  df["rf_pred"] = rf_model.predict(X)
341
 
 
 
 
 
 
 
 
 
342
  parent_map = df.groupby("parent")["repo"].apply(list).to_dict()
343
  final_weights = {}
344
 
345
  for parent, children in parent_map.items():
346
- parent_idx = df[df["repo"] == parent].index
347
  group_idxs = df[df["parent"] == parent].index
348
- if len(parent_idx) == 0:
349
- group_preds = df.loc[group_idxs, "rf_pred"]
350
- normed = (group_preds - group_preds.min()) / (group_preds.max() - group_preds.min() + 1e-8)
351
- for idx, val in zip(group_idxs, normed):
352
- final_weights[idx] = val
353
- continue
354
- parent_idx = parent_idx[0]
 
355
  child_idxs = [idx for idx in group_idxs if idx != parent_idx]
356
  if child_idxs:
357
  child_preds = df.loc[child_idxs, "rf_pred"]
@@ -366,7 +421,7 @@ def train_predict_weight(df):
366
 
367
  df["final_weight"] = df.index.map(final_weights).fillna(0.0)
368
 
369
- # Enforce monotonicity within each group
370
  for parent, children in parent_map.items():
371
  group_idxs = df[df["parent"] == parent].index
372
  group_weights = df.loc[group_idxs, "final_weight"].sort_values(ascending=False)
 
7
  • Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight.
8
  The output submission CSV has three columns: repo, parent, and final_weight.
9
  """
10
+ import base64
11
  from io import StringIO
12
  import os
13
  import warnings
 
123
  pulls_list = []
124
  activity_list = []
125
  contributors_list = []
126
+ dependencies_list =[]
127
 
128
  cache = {}
129
 
 
131
  if repo_url in cache:
132
  return cache[repo_url]
133
  val = fetch_repo_metrics(repo_url)
134
+ try:
135
+ m = re.search(r"github\.com/([^/]+)/([^/]+)",repo_url)
136
+ if m:
137
+ owner, repo_name = m.group(1), m.group(2)
138
+ pkg_url = f"https://api.github.com/repos/{owner}/{repo_name}/packages.json"
139
+ headers = {}
140
+ token = os.environ.get("GITHUB_API_TOKEN", "")
141
+ if token:
142
+ headers["Authorization"] = f"token {token}"
143
+ pkg_resp = requests.get(pkg_url, headers=headers)
144
+ if pkg_resp.status_code ==200:
145
+ pkg_data = pkg_resp.json()
146
+ content = base64.b64decode(pkg_data["content",""]),decode("utf-8")
147
+ pkg_json = json.loads(content)
148
+ dependencies = pkg_json.get("dependencies", {})
149
+ val["dependencies_count"] = len(dependencies)
150
+ else:
151
+ val["dependencies_count"] = 0
152
+ else:
153
+ val["dependencies_count"] = 0
154
+ except Exception:
155
+ val["dependencies_count"] = 0
156
  cache[repo_url] = val
157
  return val
158
 
 
160
  futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
161
  for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
162
  res = fut.result()
163
+ stars_list.append(res.get("stargazers_count", 0))
164
+ forks_list.append(res.get("forks_count", 0))
165
+ watchers_list.append(res.get("watchers_count", 0))
166
+ issues_list.append(res.get("open_issues_count", 0))
167
+ pulls_list.append(res.get("pulls_count", 0))
168
+ activity_list.append(res.get("activity", 0))
169
+ dependencies_list.append(res.get("dependencies_count", 0))
170
  # Fetch contributors count
171
  try:
172
  contributors_url = f"https://api.github.com/repos/{res['owner']}/{res['repo_name']}/contributors"
 
186
  df["pulls"] = pulls_list
187
  df["activity"] = activity_list
188
  df["contributors"] = contributors_list
189
+ df["dependencies_count"] = dependencies_list
190
 
191
  end_time = time.time()
192
  print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
 
345
  ##############################
346
  # RandomForest Regression
347
  ##############################
348
+ def train_predict_weight(df,
349
+ criterion='gini',
350
+ max_features='auto',
351
+ max_depth=12,
352
+ min_samples_split=2,
353
+ min_samples_leaf=1):
354
+ """
355
+ Uses a RandomForestRegressor to predict a repository weight based on GitHub features.
356
+ The regressor is tuned with provided hyperparameters.
357
+ A flag column 'is_source' is used to indicate if a repository is the primary source.
358
+ If none is flagged, the repo with the highest prediction is set as the parent.
359
+ """
360
  print("[INFO] Starting weight prediction...", flush=True)
361
  start_time = time.time()
362
  target = "base_weight"
363
  feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity", "contributors"]
364
+
365
  if "activity" in df.columns:
366
  df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
367
  now = pd.Timestamp.now(tz="UTC")
 
369
 
370
  if target not in df.columns:
371
  raise ValueError("Base weight column missing.")
372
+
373
  X = df[feature_cols]
374
  y = df[target]
375
+
376
+ # For regression, if a classification criterion is given, switch to 'mse'
377
+ reg_criterion = "mse" if criterion in ["gini", "entropy"] else criterion
378
+
379
+ rf_model = RandomForestRegressor(random_state=42,
380
+ criterion=reg_criterion,
381
+ max_features=max_features,
382
+ max_depth=max_depth,
383
+ min_samples_split=min_samples_split,
384
+ min_samples_leaf=min_samples_leaf,
385
+ n_estimators=200)
386
  rf_model.fit(X, y)
387
  df["rf_pred"] = rf_model.predict(X)
388
 
389
+ # Provide feedback about one of the trees in the RF
390
+ try:
391
+ depth = rf_model.estimators_[0].get_depth()
392
+ leaves = rf_model.estimators_[0].get_n_leaves()
393
+ print(f"[INFO] RF tree depth: {depth}, number of leaves: {leaves}", flush=True)
394
+ except Exception:
395
+ pass
396
+
397
  parent_map = df.groupby("parent")["repo"].apply(list).to_dict()
398
  final_weights = {}
399
 
400
  for parent, children in parent_map.items():
 
401
  group_idxs = df[df["parent"] == parent].index
402
+ # Check if a repo in the group is flagged as is_source
403
+ source_idxs = df.loc[group_idxs][df["is_source"] == True].index.tolist() if "is_source" in df.columns else []
404
+ if source_idxs:
405
+ parent_idx = source_idxs[0]
406
+ else:
407
+ # Fallback: choose the repo with the maximum prediction as the parent
408
+ preds = df.loc[group_idxs, "rf_pred"]
409
+ parent_idx = preds.idxmax()
410
  child_idxs = [idx for idx in group_idxs if idx != parent_idx]
411
  if child_idxs:
412
  child_preds = df.loc[child_idxs, "rf_pred"]
 
421
 
422
  df["final_weight"] = df.index.map(final_weights).fillna(0.0)
423
 
424
+ # Enforce monotonicity within each group so weights are descending
425
  for parent, children in parent_map.items():
426
  group_idxs = df[df["parent"] == parent].index
427
  group_weights = df.loc[group_idxs, "final_weight"].sort_values(ascending=False)