Commit 6282a14 (parent 0861b62): Oracle

Oracle/deepfundingoracle.py  CHANGED  (+74 −19)
@@ -7,7 +7,7 @@ This script dynamically loads dependency data and for each repository URL:
 • Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight.
 The output submission CSV has three columns: repo, parent, and final_weight.
 """
-
+import base64
 from io import StringIO
 import os
 import warnings
@@ -123,6 +123,7 @@ def fetch_github_features(df):
     pulls_list = []
     activity_list = []
     contributors_list = []
+    dependencies_list = []
 
     cache = {}
 
@@ -130,6 +131,28 @@ def fetch_github_features(df):
         if repo_url in cache:
             return cache[repo_url]
         val = fetch_repo_metrics(repo_url)
+        try:
+            m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
+            if m:
+                owner, repo_name = m.group(1), m.group(2)
+                pkg_url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/package.json"
+                headers = {}
+                token = os.environ.get("GITHUB_API_TOKEN", "")
+                if token:
+                    headers["Authorization"] = f"token {token}"
+                pkg_resp = requests.get(pkg_url, headers=headers)
+                if pkg_resp.status_code == 200:
+                    pkg_data = pkg_resp.json()
+                    content = base64.b64decode(pkg_data.get("content", "")).decode("utf-8")
+                    pkg_json = json.loads(content)
+                    dependencies = pkg_json.get("dependencies", {})
+                    val["dependencies_count"] = len(dependencies)
+                else:
+                    val["dependencies_count"] = 0
+            else:
+                val["dependencies_count"] = 0
+        except Exception:
+            val["dependencies_count"] = 0
         cache[repo_url] = val
         return val
 
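For reference, a self-contained sketch of the same lookup: the GitHub contents API (GET /repos/{owner}/{repo}/contents/package.json) returns the file wrapped in JSON with a base64-encoded "content" field, so the dependency count falls out of a single request. The helper name and the timeout are illustrative, not part of the commit:

    import base64, json, os, re
    import requests

    def count_npm_dependencies(repo_url: str) -> int:
        # Hypothetical helper: counts entries under "dependencies" in package.json.
        m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
        if not m:
            return 0
        owner, repo = m.group(1), m.group(2)
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/package.json"
        headers = {}
        token = os.environ.get("GITHUB_API_TOKEN", "")
        if token:
            headers["Authorization"] = f"token {token}"
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code != 200:
            return 0
        # The contents API base64-encodes the file body (with embedded newlines,
        # which b64decode discards by default).
        raw = base64.b64decode(resp.json().get("content", "")).decode("utf-8")
        return len(json.loads(raw).get("dependencies", {}))

For example, count_npm_dependencies("https://github.com/expressjs/express") would return the number of runtime dependencies declared in that repo's package.json, or 0 when the file is absent.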
@@ -137,13 +160,13 @@ def fetch_github_features(df):
         futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
         for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
             res = fut.result()
-            stars_list.append(res["stargazers_count"])
-            forks_list.append(res["forks_count"])
-            watchers_list.append(res["watchers_count"])
-            issues_list.append(res["open_issues_count"])
-            pulls_list.append(res["pulls_count"])
-            activity_list.append(res["activity"])
-
+            stars_list.append(res.get("stargazers_count", 0))
+            forks_list.append(res.get("forks_count", 0))
+            watchers_list.append(res.get("watchers_count", 0))
+            issues_list.append(res.get("open_issues_count", 0))
+            pulls_list.append(res.get("pulls_count", 0))
+            activity_list.append(res.get("activity", 0))
+            dependencies_list.append(res.get("dependencies_count", 0))
             # Fetch contributors count
             try:
                 contributors_url = f"https://api.github.com/repos/{res['owner']}/{res['repo_name']}/contributors"
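The switch to res.get(key, 0) matters for robustness: a repo whose metrics call failed can come back as a partial dict, and direct indexing would raise KeyError in the collection loop and abort the whole fetch, while the default records a zero and moves on. A two-line illustration:

    # Illustrative: a partial metrics dict from a failed API call.
    res = {"stargazers_count": 10}
    print(res.get("forks_count", 0))  # 0 instead of a KeyError from res["forks_count"]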
@@ -163,6 +186,7 @@ def fetch_github_features(df):
     df["pulls"] = pulls_list
     df["activity"] = activity_list
     df["contributors"] = contributors_list
+    df["dependencies_count"] = dependencies_list
 
     end_time = time.time()
     print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
@@ -321,11 +345,23 @@ def prepare_dataset(file):
 ##############################
 # RandomForest Regression
 ##############################
-def train_predict_weight(df):
+def train_predict_weight(df,
+                         criterion='gini',
+                         max_features=1.0,
+                         max_depth=12,
+                         min_samples_split=2,
+                         min_samples_leaf=1):
+    """
+    Uses a RandomForestRegressor to predict a repository weight based on GitHub features.
+    The regressor is tuned with the provided hyperparameters.
+    A flag column 'is_source' indicates whether a repository is the primary source.
+    If none is flagged, the repo with the highest prediction is set as the parent.
+    """
     print("[INFO] Starting weight prediction...", flush=True)
     start_time = time.time()
     target = "base_weight"
     feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity", "contributors"]
+
     if "activity" in df.columns:
         df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
         now = pd.Timestamp.now(tz="UTC")
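"activity" is parsed to a timezone-aware datetime here, but the regressor needs a number, so the timestamp presumably gets reduced to a recency value in the lines between these hunks. A sketch of one such conversion (days since last activity), assuming "activity" holds the last-push timestamp:

    import pandas as pd

    df = pd.DataFrame({"activity": ["2024-01-15T12:00:00Z", None]})
    df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
    now = pd.Timestamp.now(tz="UTC")
    # Days since last activity; NaT (unparseable dates) becomes NaN, filled with a large value.
    df["activity"] = (now - df["activity"]).dt.days.fillna(9999)
    print(df)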
@@ -333,25 +369,44 @@ def train_predict_weight(df):
 
     if target not in df.columns:
         raise ValueError("Base weight column missing.")
+
     X = df[feature_cols]
     y = df[target]
-    rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
+
+    # For regression, if a classification criterion is given, switch to 'squared_error'
+    reg_criterion = "squared_error" if criterion in ["gini", "entropy"] else criterion
+
+    rf_model = RandomForestRegressor(random_state=42,
+                                     criterion=reg_criterion,
+                                     max_features=max_features,
+                                     max_depth=max_depth,
+                                     min_samples_split=min_samples_split,
+                                     min_samples_leaf=min_samples_leaf,
+                                     n_estimators=200)
     rf_model.fit(X, y)
     df["rf_pred"] = rf_model.predict(X)
 
+    # Provide feedback about one of the trees in the RF
+    try:
+        depth = rf_model.estimators_[0].get_depth()
+        leaves = rf_model.estimators_[0].get_n_leaves()
+        print(f"[INFO] RF tree depth: {depth}, number of leaves: {leaves}", flush=True)
+    except Exception:
+        pass
+
     parent_map = df.groupby("parent")["repo"].apply(list).to_dict()
     final_weights = {}
 
     for parent, children in parent_map.items():
-        parent_idx = df[df["repo"] == parent].index
         group_idxs = df[df["parent"] == parent].index
-        if
-
-
-
-
-
-
+        # Check if a repo in the group is flagged as is_source
+        source_idxs = df.loc[group_idxs][df.loc[group_idxs, "is_source"] == True].index.tolist() if "is_source" in df.columns else []
+        if source_idxs:
+            parent_idx = source_idxs[0]
+        else:
+            # Fallback: choose the repo with the maximum prediction as the parent
+            preds = df.loc[group_idxs, "rf_pred"]
+            parent_idx = preds.idxmax()
         child_idxs = [idx for idx in group_idxs if idx != parent_idx]
         if child_idxs:
             child_preds = df.loc[child_idxs, "rf_pred"]
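Two version notes on the regressor setup: 'gini' and 'entropy' are classification criteria, and the regressor's squared-error criterion is spelled "squared_error" from scikit-learn 1.0 on (the old "mse" alias was removed in 1.2); likewise max_features='auto' was removed for regressors, with 1.0 (all features) as the equivalent. A minimal sketch of the mapping, under those assumptions:

    from sklearn.ensemble import RandomForestRegressor

    def make_regressor(criterion="gini", max_features=1.0):
        # Map classifier-style criteria onto the regressor equivalent.
        reg_criterion = "squared_error" if criterion in ("gini", "entropy") else criterion
        return RandomForestRegressor(n_estimators=200, random_state=42,
                                     criterion=reg_criterion, max_features=max_features)

    model = make_regressor()
    print(model.criterion)  # squared_error

In the fallback branch, preds.idxmax() returns the index label of the highest prediction, so parent_idx compares directly against the labels in group_idxs when the children are filtered.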
@@ -366,7 +421,7 @@ def train_predict_weight(df):
 
     df["final_weight"] = df.index.map(final_weights).fillna(0.0)
 
-    # Enforce monotonicity within each group
+    # Enforce monotonicity within each group so weights are descending
    for parent, children in parent_map.items():
         group_idxs = df[df["parent"] == parent].index
         group_weights = df.loc[group_idxs, "final_weight"].sort_values(ascending=False)
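The loop body that applies the re-sorted weights falls outside this hunk; one plausible reading of the step, as a toy sketch (the reassignment rule here is an assumption, not the commit's code), is to hand the sorted weights back out in rf_pred order so the group's weights descend with the predictions:

    import pandas as pd

    # Toy group: give the largest weight to the highest-predicted row, and so on down.
    df = pd.DataFrame({"rf_pred": [0.2, 0.9, 0.5], "final_weight": [0.5, 0.3, 0.2]})
    order = df["rf_pred"].sort_values(ascending=False).index
    sorted_weights = df["final_weight"].sort_values(ascending=False).to_numpy()
    df.loc[order, "final_weight"] = sorted_weights
    print(df)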
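Finally, a hypothetical toy invocation of the changed function, assuming it mutates the frame in place as the hunks above show (the column values are made up, and with no 'is_source' column the idxmax fallback picks each group's parent):

    import pandas as pd

    df = pd.DataFrame({
        "repo": ["org/a", "org/b", "org/c"],
        "parent": ["root", "root", "root"],
        "stars": [100, 10, 5], "forks": [20, 2, 1], "watchers": [100, 10, 5],
        "open_issues": [5, 1, 0], "pulls": [3, 1, 0],
        "activity": ["2024-05-01", "2024-01-01", "2020-01-01"],
        "contributors": [12, 2, 1],
        "base_weight": [0.6, 0.3, 0.1],
    })
    train_predict_weight(df, criterion="gini", max_depth=12)
    print(df[["repo", "parent", "final_weight"]])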