Spaces:
Sleeping
Sleeping
Commit
·
722cfc4
1
Parent(s):
e574555
Oracle
Browse files- Oracle/SmolLM.py +4 -4
- Oracle/deepfundingoracle.py +201 -256
Oracle/SmolLM.py
CHANGED
@@ -15,10 +15,10 @@ class SmolLM:
|
|
15 |
print(f"[ERROR] Failed to load model '{model_path}': {e}")
|
16 |
self.available = False
|
17 |
|
18 |
-
def predict(self, prompt,
|
19 |
if not self.available:
|
20 |
print("[WARN] Oracle unavailable, returning default weight 0.5")
|
21 |
-
return "
|
22 |
try:
|
23 |
# Use chat template as per documentation
|
24 |
messages = [{"role": "user", "content": prompt}]
|
@@ -26,13 +26,13 @@ class SmolLM:
|
|
26 |
outputs = self.model.generate(
|
27 |
inputs,
|
28 |
max_new_tokens=max_new_tokens,
|
29 |
-
temperature=0.
|
30 |
top_p=0.9,
|
31 |
do_sample=True
|
32 |
)
|
33 |
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
34 |
print(f"[INFO] Generated response: {response[:100]}...", flush=True)
|
35 |
-
return response
|
36 |
except Exception as e:
|
37 |
print(f"[ERROR] Oracle has failed: {e}")
|
38 |
return "0.5"
|
|
|
15 |
print(f"[ERROR] Failed to load model '{model_path}': {e}")
|
16 |
self.available = False
|
17 |
|
18 |
+
def predict(self, prompt, max_new_tokens=200):
|
19 |
if not self.available:
|
20 |
print("[WARN] Oracle unavailable, returning default weight 0.5")
|
21 |
+
return ""
|
22 |
try:
|
23 |
# Use chat template as per documentation
|
24 |
messages = [{"role": "user", "content": prompt}]
|
|
|
26 |
outputs = self.model.generate(
|
27 |
inputs,
|
28 |
max_new_tokens=max_new_tokens,
|
29 |
+
temperature=0.2,
|
30 |
top_p=0.9,
|
31 |
do_sample=True
|
32 |
)
|
33 |
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
34 |
print(f"[INFO] Generated response: {response[:100]}...", flush=True)
|
35 |
+
return response.split("<|assistant|>")[-1].strip()
|
36 |
except Exception as e:
|
37 |
print(f"[ERROR] Oracle has failed: {e}")
|
38 |
return "0.5"
|
Oracle/deepfundingoracle.py
CHANGED
@@ -29,17 +29,33 @@ import sys
|
|
29 |
import re
|
30 |
import json
|
31 |
import time
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
from sklearn.ensemble import RandomForestRegressor
|
35 |
-
from sklearn.metrics import mean_squared_error
|
36 |
from sklearn.preprocessing import StandardScaler
|
37 |
import matplotlib.pyplot as plt
|
38 |
import seaborn as sns
|
39 |
from scipy.special import log1p, expm1
|
40 |
-
|
|
|
|
|
|
|
41 |
from Oracle.SmolLM import SmolLM
|
42 |
-
|
43 |
warnings.filterwarnings("ignore")
|
44 |
|
45 |
# Configure logging to file and console
|
@@ -52,155 +68,104 @@ logging.basicConfig(
|
|
52 |
format="%(asctime)s - %(levelname)s - %(message)s"
|
53 |
)
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
##############################
|
56 |
# GitHub API helper: Fetch repository metrics
|
57 |
##############################
|
58 |
def fetch_repo_metrics(repo_url):
|
59 |
"""
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
"""
|
64 |
-
|
65 |
-
|
66 |
-
"stargazers_count": 0,
|
67 |
-
"forks_count": 0,
|
68 |
-
"watchers_count": 0,
|
69 |
-
"open_issues_count": 0,
|
70 |
-
"pulls_count": 0,
|
71 |
-
"activity": "",
|
72 |
-
"contributors": 0,
|
73 |
-
"dependencies_count": 0
|
74 |
-
}
|
75 |
-
|
76 |
try:
|
77 |
-
# Extract owner and repo name
|
78 |
m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
|
79 |
if not m:
|
80 |
-
|
81 |
return default_metrics
|
82 |
-
|
83 |
owner, repo_name = m.group(1), m.group(2)
|
84 |
api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
|
85 |
headers = {}
|
86 |
-
|
87 |
-
token = os.environ.get("GITHUB_API_TOKEN", "")
|
88 |
if token:
|
89 |
headers["Authorization"] = f"token {token}"
|
90 |
-
|
91 |
-
|
92 |
-
r
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
"stargazers_count": data.get("stargazers_count", 0),
|
97 |
-
"forks_count": data.get("forks_count", 0),
|
98 |
-
"watchers_count": data.get("watchers_count", 0),
|
99 |
-
"open_issues_count": data.get("open_issues_count", 0),
|
100 |
-
"activity": data.get("updated_at", ""),
|
101 |
-
"owner": owner,
|
102 |
-
"repo_name": repo_name,
|
103 |
-
"dependencies_count": 0
|
104 |
-
}
|
105 |
-
|
106 |
-
# Try to fetch pull requests count
|
107 |
-
try:
|
108 |
-
pulls_url = f"{api_url}/pulls"
|
109 |
-
pulls_resp = requests.get(pulls_url, headers=headers, timeout=5)
|
110 |
-
metrics["pulls_count"] = len(pulls_resp.json()) if pulls_resp.status_code == 200 else 0
|
111 |
-
except Exception as e:
|
112 |
-
print(f"[WARN] Failed to fetch pulls for {repo_url}: {e}")
|
113 |
-
metrics["pulls_count"] = 0
|
114 |
-
|
115 |
-
# Try to fetch contributors count
|
116 |
-
try:
|
117 |
-
contributors_url = f"{api_url}/contributors"
|
118 |
-
contributors_resp = requests.get(contributors_url, headers=headers, timeout=5)
|
119 |
-
metrics["contributors"] = len(contributors_resp.json()) if contributors_resp.status_code == 200 else 0
|
120 |
-
except Exception as e:
|
121 |
-
print(f"[WARN] Failed to fetch contributors for {repo_url}: {e}")
|
122 |
-
metrics["contributors"] = 0
|
123 |
-
|
124 |
-
# Try to estimate dependencies from package files
|
125 |
try:
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
else:
|
147 |
-
print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
|
148 |
-
return default_metrics
|
149 |
-
except Exception as e:
|
150 |
-
print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
|
151 |
return default_metrics
|
152 |
|
153 |
def fetch_github_features(df):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
"""
|
155 |
-
|
156 |
-
|
157 |
-
Adds these as new columns to the DataFrame.
|
158 |
"""
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
metrics_lists = {
|
164 |
-
"stars": [],
|
165 |
-
"forks": [],
|
166 |
-
"watchers": [],
|
167 |
-
"open_issues": [],
|
168 |
-
"pulls": [],
|
169 |
-
"activity": [],
|
170 |
-
"contributors": [],
|
171 |
-
"dependencies_count": []
|
172 |
-
}
|
173 |
|
174 |
-
|
175 |
-
|
176 |
-
def get_metrics(repo_url):
|
177 |
-
if repo_url in cache:
|
178 |
-
print(f"[DEBUG] Cached GitHub data for {repo_url}: {cache[repo_url]}")
|
179 |
-
return cache[repo_url]
|
180 |
-
val = fetch_repo_metrics(repo_url)
|
181 |
-
print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}")
|
182 |
-
cache[repo_url] = val
|
183 |
-
return val
|
184 |
-
|
185 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
|
186 |
-
futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
|
187 |
-
for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
|
188 |
-
res = fut.result()
|
189 |
-
metrics_lists["stars"].append(res.get("stargazers_count", 0))
|
190 |
-
metrics_lists["forks"].append(res.get("forks_count", 0))
|
191 |
-
metrics_lists["watchers"].append(res.get("watchers_count", 0))
|
192 |
-
metrics_lists["open_issues"].append(res.get("open_issues_count", 0))
|
193 |
-
metrics_lists["pulls"].append(res.get("pulls_count", 0))
|
194 |
-
metrics_lists["activity"].append(res.get("activity", ""))
|
195 |
-
metrics_lists["contributors"].append(res.get("contributors", 0))
|
196 |
-
metrics_lists["dependencies_count"].append(res.get("dependencies_count", 0))
|
197 |
-
|
198 |
-
# Add the fetched data to the DataFrame
|
199 |
-
for key, values in metrics_lists.items():
|
200 |
-
df[key] = values
|
201 |
|
202 |
-
|
203 |
-
|
204 |
return df
|
205 |
|
206 |
def calculate_fallback_weights(df):
|
@@ -265,66 +230,52 @@ def load_data(file):
|
|
265 |
def timeout_handler(signum, frame):
|
266 |
raise TimeoutError("LLama model prediction timed out.")
|
267 |
|
268 |
-
|
|
|
269 |
"""
|
270 |
-
|
271 |
-
|
272 |
"""
|
273 |
-
|
274 |
-
logging.info("[INFO] Assigning base weights using optimized approach...")
|
275 |
-
start_time = time.time()
|
276 |
oracle = SmolLM()
|
277 |
|
|
|
|
|
|
|
278 |
prompt = (
|
279 |
-
"
|
280 |
-
"
|
281 |
-
"
|
282 |
-
'
|
|
|
|
|
|
|
|
|
|
|
283 |
)
|
|
|
284 |
feature_weights = None
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
print("[WARN] LLM failed to provide feature weights. Calculating fallback weights dynamically.")
|
306 |
-
feature_weights = calculate_fallback_weights(df)
|
307 |
-
print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
|
308 |
-
|
309 |
-
for feature in feature_weights.keys():
|
310 |
-
if feature in df.columns:
|
311 |
-
df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
|
312 |
-
|
313 |
-
def calculate_weight(row):
|
314 |
-
weight = 0
|
315 |
-
for feature, feature_weight in feature_weights.items():
|
316 |
-
if feature in row:
|
317 |
-
weight += row[feature] * feature_weight
|
318 |
-
return weight
|
319 |
-
|
320 |
-
df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
|
321 |
-
df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
|
322 |
-
lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
|
323 |
-
)
|
324 |
|
325 |
-
end_time = time.time()
|
326 |
-
print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
|
327 |
-
logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
|
328 |
return df
|
329 |
|
330 |
def sanity_check_weights(df):
|
@@ -481,72 +432,58 @@ def validate_target(df):
|
|
481 |
|
482 |
|
483 |
##############################
|
484 |
-
#
|
485 |
##############################
|
486 |
def train_predict_weight(df):
|
487 |
"""
|
488 |
-
Trains
|
489 |
"""
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
#
|
519 |
-
|
520 |
-
"rf__n_estimators": [100, 300, 500, 800, 1000],
|
521 |
-
"rf__max_depth": [None, 20, 30, 40],
|
522 |
-
"rf__min_samples_split": [2, 5, 10],
|
523 |
-
"rf__min_samples_leaf": [1, 2, 4],
|
524 |
-
"rf__max_features": ["auto", "sqrt"],
|
525 |
-
}
|
526 |
search = RandomizedSearchCV(
|
527 |
-
pipeline,
|
528 |
-
|
529 |
-
n_iter=50,
|
530 |
-
cv=10,
|
531 |
-
scoring="neg_root_mean_squared_error",
|
532 |
-
verbose=2,
|
533 |
-
n_jobs=-1,
|
534 |
-
random_state=42
|
535 |
)
|
536 |
-
search.fit(
|
|
|
537 |
best_model = search.best_estimator_
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
538 |
|
539 |
-
#Predict on test, invert transform
|
540 |
-
y_pred_test_log = best_model.predict(X_test)
|
541 |
-
y_pred_test = expm1(y_pred_test_log)
|
542 |
-
y_true_test = expm1(y_test_log)
|
543 |
-
mse = mean_squared_error(y_true_test, y_pred_test)
|
544 |
-
print(f"[INFO] Test MSE after RandomizedSearch: {mse:.4f}", flush=True)
|
545 |
-
# Predict on full dataset and invert
|
546 |
-
df["final_weight"] = expm1(best_model.predict(df[feature_cols]))
|
547 |
-
df = normalize_and_clip_weights(df)
|
548 |
-
end_time = time.time()
|
549 |
-
print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
|
550 |
return df
|
551 |
|
552 |
|
@@ -554,23 +491,31 @@ def train_predict_weight(df):
|
|
554 |
# CSV Output
|
555 |
##############################
|
556 |
def create_submission_csv(df, output_filename="submission.csv"):
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
print(f"[INFO] Results written to {output_filename}.", flush=True)
|
562 |
-
return output_filename
|
563 |
|
564 |
-
# Removed Gradio UI code from this file to ensure modular workflow.
|
565 |
-
# This file now focuses solely on data processing and prediction.
|
566 |
|
567 |
if __name__ == "__main__":
|
568 |
-
|
569 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
570 |
|
571 |
-
|
572 |
-
|
|
|
|
|
|
|
|
|
|
|
573 |
|
574 |
-
|
575 |
-
create_submission_csv(df, output_file)
|
576 |
-
print("[INFO] Process completed successfully.")
|
|
|
29 |
import re
|
30 |
import json
|
31 |
import time
|
32 |
+
import json
|
33 |
+
import time
|
34 |
+
import logging
|
35 |
+
import sys
|
36 |
+
import warnings
|
37 |
+
import concurrent.futures
|
38 |
+
from concurrent.futures import ThreadPoolExecutor
|
39 |
+
import numpy as np
|
40 |
+
import pandas as pd
|
41 |
+
import requests
|
42 |
+
from tqdm import tqdm
|
43 |
+
from scipy.special import log1p, expm1
|
44 |
+
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
|
45 |
+
from sklearn.pipeline import Pipeline
|
46 |
+
from sklearn.preprocessing import RobustScaler
|
47 |
+
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV,KFold
|
48 |
from sklearn.ensemble import RandomForestRegressor
|
|
|
49 |
from sklearn.preprocessing import StandardScaler
|
50 |
import matplotlib.pyplot as plt
|
51 |
import seaborn as sns
|
52 |
from scipy.special import log1p, expm1
|
53 |
+
from sklearn.preprocessing import RobustScaler
|
54 |
+
from sklearn.metrics import mean_squared_error
|
55 |
+
from xgboost import XGBRegressor
|
56 |
+
from scipy.special import log1p, expm1
|
57 |
from Oracle.SmolLM import SmolLM
|
58 |
+
import os
|
59 |
warnings.filterwarnings("ignore")
|
60 |
|
61 |
# Configure logging to file and console
|
|
|
68 |
format="%(asctime)s - %(levelname)s - %(message)s"
|
69 |
)
|
70 |
|
71 |
+
|
72 |
+
def add_temporal_and_ratio_features(df):
    """
    Add temporal and ratio features derived from the raw GitHub metrics.

    Adds:
      - days_since_update: whole days between the last GitHub update and now (UTC).
        Rows whose timestamp is missing or unparseable receive the median of the
        known values.
      - closed_issue_ratio: estimated ratio of closed to total issues (see note below).
      - merged_pull_ratio: merged / total pulls, only when both a 'merged_pulls'
        and a 'pulls' column are present.

    The DataFrame is modified in place and also returned.
    """
    # Parse everything as UTC so that tz-aware GitHub timestamps ("...Z") and
    # tz-naive values can both be subtracted from a tz-aware "now" without a
    # "cannot subtract tz-naive and tz-aware" TypeError.
    df['activity'] = pd.to_datetime(df['activity'], errors='coerce', utc=True)
    now = pd.Timestamp.now(tz='UTC')
    days = (now - df['activity']).dt.days
    median_days = days.median()
    if pd.notna(median_days):
        days = days.fillna(median_days)
    df['days_since_update'] = days

    # NOTE(review): only the open-issue count is available here, so the total is
    # estimated by assuming half of all issues are open. The ratio is therefore a
    # constant 0.5 for any repo with open issues -- a placeholder until a real
    # closed-issue count is fetched. Repos with no open issues get 0 rather than
    # the NaN that 0/0 would otherwise produce.
    open_issues = pd.to_numeric(df['open_issues'], errors='coerce')
    total_issues = open_issues / (1 - 0.5)
    ratio = (total_issues - open_issues) / total_issues.where(total_issues > 0)
    df['closed_issue_ratio'] = ratio.fillna(0).clip(0, 1)

    # merged_pull_ratio is optional: compute it only when the inputs exist.
    if 'merged_pulls' in df.columns and 'pulls' in df.columns:
        df['merged_pull_ratio'] = df['merged_pulls'].clip(lower=0) / df['pulls'].clip(lower=1)
    return df
|
90 |
+
|
91 |
+
|
92 |
##############################
|
93 |
# GitHub API helper: Fetch repository metrics
|
94 |
##############################
|
95 |
def _count_via_pagination(url, headers):
    """Count the items behind a paginated GitHub list endpoint with a single request.

    Requests one item per page and reads the page number of the rel="last" link
    from the Link header, which then equals the item count. When there is no
    "last" link, the length of the returned JSON list is used. Returns 0 on any
    request failure, non-200 status, or undecodable body.
    """
    try:
        resp = requests.get(url, params={"per_page": 1}, headers=headers, timeout=10)
        if resp.status_code != 200:
            return 0
        # Anchor on [?&] so that "per_page=" can never be mistaken for "page=".
        match = re.search(r'[?&]page=(\d+)>; rel="last"', resp.headers.get('Link', ''))
        if match:
            return int(match.group(1))
        return len(resp.json())
    except (requests.exceptions.RequestException, ValueError):
        return 0


def fetch_repo_metrics(repo_url):
    """
    RATIONALE (Recommendation 2): Fetches GitHub metrics, handling API pagination to get accurate
    contributor and pull request counts instead of the default cap of 30. This provides much
    more accurate features for popular repositories.

    Args:
        repo_url: URL containing "github.com/<owner>/<repo>".

    Returns:
        dict with keys 'stars', 'forks', 'watchers', 'open_issues', 'activity',
        'contributors' and 'pulls'. On a malformed URL or any fetch/parse failure
        a dict of zero counts with 'activity' set to pd.NaT is returned, so one
        bad repository never aborts a batch fetch.
    """
    default_metrics = {"stars": 0, "forks": 0, "watchers": 0, "open_issues": 0, "pulls": 0, "activity": pd.NaT,
                       "contributors": 0}

    # str() guards against non-string cells (e.g. NaN from a CSV); the repo group
    # stops at '?' / '#' so query strings and fragments are not sent to the API.
    m = re.search(r"github\.com/([^/]+)/([^/?#]+)", str(repo_url))
    if not m:
        logging.warning(f"Malformed GitHub URL: {repo_url}")
        return default_metrics
    owner, repo_name = m.group(1), m.group(2)
    if repo_name.endswith(".git"):
        # Clone URLs end in ".git", which is not part of the API repo name.
        repo_name = repo_name[:-4]

    api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
    headers = {}
    token = os.environ.get("GITHUB_API_TOKEN")
    if token:
        headers["Authorization"] = f"token {token}"

    try:
        r = requests.get(api_url, headers=headers, timeout=15)
        r.raise_for_status()
        data = r.json()
        updated_at = data.get("updated_at")
        return {
            "stars": data.get("stargazers_count", 0),
            "forks": data.get("forks_count", 0),
            "watchers": data.get("subscribers_count", 0),  # subscribers_count is a better 'watch' metric
            "open_issues": data.get("open_issues_count", 0),
            "activity": pd.to_datetime(updated_at, utc=True) if updated_at else pd.NaT,
            "contributors": _count_via_pagination(data['contributors_url'], headers),
            # pulls_url is a URI template ending in "{/number}"; strip it to get the list endpoint.
            "pulls": _count_via_pagination(data['pulls_url'].replace('{/number}', ''), headers)
        }
    except (requests.exceptions.RequestException, KeyError, ValueError, AttributeError) as e:
        # KeyError/AttributeError/ValueError cover unexpected or undecodable payloads.
        logging.error(f"Failed to fetch data for {repo_url}: {e}")
        return default_metrics
|
142 |
|
143 |
def fetch_github_features(df):
    """Concurrently fetch GitHub features for every repository in the DataFrame.

    Calls fetch_repo_metrics for each value in df['repo'] on a thread pool and
    returns a new DataFrame with the metric columns appended to the original
    columns (index reset to 0..n-1).

    Results are gathered with executor.map, which yields them in input order.
    Collecting in completion order instead would attach metrics to the wrong
    rows, because the metrics frame is concatenated positionally.
    """
    logging.info("Fetching GitHub features for repositories...")
    repo_urls = df['repo'].tolist()
    with ThreadPoolExecutor(max_workers=20) as executor:
        metrics_data = list(
            tqdm(executor.map(fetch_repo_metrics, repo_urls), total=len(repo_urls), desc="Fetching GitHub Metrics")
        )
    return pd.concat([df.reset_index(drop=True), pd.DataFrame(metrics_data)], axis=1)
|
152 |
+
|
153 |
+
|
154 |
+
def add_derived_features(df):
    """
    RATIONALE (Recommendation 2): Adds derived temporal and interaction features like 'days_since_update'
    and 'stars_per_contributor' to give the model more powerful signals to learn from.

    Adds 'days_since_update', 'stars_per_contributor' and 'forks_per_star', then
    replaces any remaining NaN in numeric columns with 0. The DataFrame is
    modified in place and also returned.
    """
    logging.info("Engineering derived features...")
    # utc=True makes the column tz-aware even when every value is NaT or tz-naive;
    # otherwise subtracting it from a tz-aware "now" raises TypeError.
    df['activity'] = pd.to_datetime(df['activity'], errors='coerce', utc=True)
    days = (pd.Timestamp.now(tz='UTC') - df['activity']).dt.days
    median_days = days.median()
    if pd.notna(median_days):
        # Assign the result instead of a chained fillna(inplace=True), which is
        # deprecated and does nothing under pandas copy-on-write.
        days = days.fillna(median_days)
    df['days_since_update'] = days

    # clip(lower=1) avoids division by zero for repos without contributors/stars.
    df['stars_per_contributor'] = df['stars'] / df['contributors'].clip(lower=1)
    df['forks_per_star'] = df['forks'] / df['stars'].clip(lower=1)

    numeric_cols = df.select_dtypes(include=np.number).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)
    return df
|
170 |
|
171 |
def calculate_fallback_weights(df):
|
|
|
230 |
def timeout_handler(signum, frame):
    """Handler with the (signum, frame) signal-handler signature; always raises TimeoutError."""
    # Neither argument is inspected; the handler exists only to raise.
    message = "LLama model prediction timed out."
    raise TimeoutError(message)
|
232 |
|
233 |
+
|
234 |
+
def assign_base_weight(df):
    """
    Assigns a robust `base_weight` using an LLM with a specific persona and JSON output,
    then applies log transformation before normalization.

    Steps:
      1. Ask the SmolLM oracle for per-metric importance weights as a JSON object.
      2. Validate the reply (must be a JSON object of finite numbers with at least
         one key matching a DataFrame column); otherwise use fixed fallback weights.
      3. Build 'base_weight_raw' as the weighted sum of the available metric columns.
      4. Log-transform into 'base_weight_log' and min-max scale within each 'parent'
         group into 'base_weight'.

    The DataFrame is modified in place and also returned.
    """
    logging.info("Assigning robust base weights using LLM...")
    oracle = SmolLM()

    # RATIONALE (Recommendation 1): This prompt is highly specific. It sets a persona (VC), defines
    # the goal (assess health), prioritizes metrics, and demands a strict JSON output. This
    # leads to a much higher quality and more reliable response from the LLM.
    prompt = (
        "As an expert venture capitalist specializing in open-source software, your goal is to assess a project's "
        "overall health, community engagement, and development velocity. Based on this, assign a numeric importance "
        "weight to each of the following GitHub metrics: 'stars', 'forks', 'watchers', 'open_issues', 'pulls', "
        "'contributors', and 'days_since_update'. "
        "Prioritize metrics indicating active, collaborative development (like contributors, pulls, recent updates) "
        "over simple popularity metrics (like stars). The 'days_since_update' metric is inverse; lower is better, so it should have a negative weight. "
        "The absolute values of the weights should sum to approximately 1. "
        "Provide your answer ONLY in a strict JSON format. Example: "
        '{"stars": 0.2, "forks": 0.1, "watchers": 0.05, "pulls": 0.2, "open_issues": 0.1, "contributors": 0.25, "days_since_update": -0.1}'
    )

    feature_weights = None
    try:
        response_text = oracle.predict(prompt)
        json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
        if not json_match:
            raise ValueError("No JSON object found in the LLM response.")
        parsed = json.loads(json_match.group(0))
        if not isinstance(parsed, dict):
            raise ValueError("LLM response JSON is not an object.")
        # Coerce inside the try so non-numeric values trigger the fallback instead
        # of crashing later in the weighted sum.
        feature_weights = {str(name): float(value) for name, value in parsed.items()}
        if not all(np.isfinite(value) for value in feature_weights.values()):
            raise ValueError("LLM response contains non-finite weights.")
        if not any(name in df.columns for name in feature_weights):
            raise ValueError("LLM response names no known feature column.")
        logging.info(f"Successfully parsed feature weights from LLM: {feature_weights}")
    except Exception as e:
        logging.error(f"Failed to parse LLM response, using fallback weights. Error: {e}")
        feature_weights = {'stars': 0.15, 'forks': 0.1, 'watchers': 0.05, 'pulls': 0.25, 'open_issues': 0.1,
                           'contributors': 0.25, 'days_since_update': -0.1}

    # Weights for features missing from the DataFrame are ignored.
    df["base_weight_raw"] = sum(df[feature] * weight for feature, weight in feature_weights.items() if feature in df)

    # RATIONALE (Recommendation 1): Log-transforming the raw score before scaling prevents extreme
    # outliers from dominating the normalization process, creating a more stable target variable.
    # Shifting by the minimum keeps the log1p argument non-negative.
    df['base_weight_log'] = np.log1p(df['base_weight_raw'] - df['base_weight_raw'].min())

    # Min-max scale within each parent; a constant group divides by 1 and becomes all zeros.
    df['base_weight'] = df.groupby("parent")["base_weight_log"].transform(
        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() > s.min() else 1)
    ).fillna(0.5)

    return df
|
280 |
|
281 |
def sanity_check_weights(df):
|
|
|
432 |
|
433 |
|
434 |
##############################
|
435 |
+
# Model Training and Prediction
|
436 |
##############################
|
437 |
def train_predict_weight(df):
    """
    Trains an XGBoost Regressor with GroupKFold cross-validation and extensive hyperparameter tuning.

    Uses every numeric column except the target and bookkeeping columns as a
    feature, fits a RobustScaler + XGBRegressor pipeline through
    RandomizedSearchCV, and writes the predictions to 'final_weight', clipped at 0
    and normalized to sum to 1 within each 'parent' group.

    Raises:
        ValueError: if df has fewer than 2 distinct 'parent' values, since grouped
            cross-validation is then impossible.

    The DataFrame is modified in place and also returned.
    """
    logging.info("Starting model training with GroupKFold validation...")

    target_col = 'base_weight'
    drop_cols = ["repo", "parent", "activity", "base_weight_raw", "base_weight_log", target_col]
    feature_cols = [col for col in df.select_dtypes(include=np.number).columns if col not in drop_cols]

    X = df[feature_cols].copy()
    y = df[target_col]
    groups = df['parent']

    # RATIONALE (Recommendation 2): Log-transforming skewed input features helps the model by
    # making their distributions more normal, improving the performance of the regressor.
    skewed_features = ['stars', 'forks', 'watchers', 'open_issues', 'pulls', 'contributors', 'stars_per_contributor']
    for col in skewed_features:
        if col in X.columns:
            X[col] = np.log1p(X[col])

    pipeline = Pipeline([("scaler", RobustScaler()),
                         ("xgb", XGBRegressor(objective="reg:squarederror", n_jobs=-1, random_state=42, verbosity=0))])

    param_dist = {'xgb__n_estimators': [100, 300, 500, 700], 'xgb__max_depth': [3, 5, 7, 9],
                  'xgb__learning_rate': [0.01, 0.02, 0.05, 0.1], 'xgb__subsample': [0.6, 0.7, 0.8, 0.9],
                  'xgb__colsample_bytree': [0.6, 0.7, 0.8, 0.9]}

    # RATIONALE (Recommendation 3): GroupKFold ensures that all repos from the same parent are in the
    # same fold. This prevents data leakage and gives a realistic measure of true performance.
    # GroupKFold cannot make more folds than there are groups, so clamp n_splits
    # and fail early with a clear message when grouping is impossible.
    n_groups = groups.nunique()
    if n_groups < 2:
        raise ValueError("GroupKFold cross-validation requires at least 2 distinct 'parent' groups.")
    cv = GroupKFold(n_splits=min(5, n_groups))

    # RATIONALE (Recommendation 4): Increasing n_iter explores more hyperparameter combinations,
    # increasing the chance of finding a better-performing model.
    search = RandomizedSearchCV(
        pipeline, param_distributions=param_dist, n_iter=50, cv=list(cv.split(X, y, groups)),
        scoring="neg_root_mean_squared_error", verbose=1, n_jobs=-1, random_state=42
    )
    search.fit(X, y)

    best_model = search.best_estimator_
    logging.info(f"Best CV score (neg RMSE): {search.best_score_:.4f}")
    logging.info(f"Best parameters found: {search.best_params_}")

    df['final_weight'] = best_model.predict(X)

    # Negative predictions are clipped to 0; each parent's weights are then scaled
    # to sum to 1, falling back to a uniform split when the group total is 0.
    df['final_weight'] = df['final_weight'].clip(lower=0)
    df['final_weight'] = df.groupby("parent")['final_weight'].transform(
        lambda w: w / w.sum() if w.sum() > 0 else np.ones_like(w) / len(w))

    return df
|
488 |
|
489 |
|
|
|
491 |
# CSV Output
|
492 |
##############################
|
493 |
def create_submission_csv(df, output_filename="submission.csv"):
    """Saves the final predictions to a CSV file.

    Writes only the 'repo', 'parent' and 'final_weight' columns, without the index.

    Returns:
        The path that was written (output_filename), so callers can pass the
        result on directly, as the previous version of this function allowed.
    """
    logging.info(f"Writing final results to {output_filename}...")
    df[["repo", "parent", "final_weight"]].to_csv(output_filename, index=False)
    logging.info(f"Successfully created {output_filename}.")
    return output_filename
|
|
|
|
|
498 |
|
|
|
|
|
499 |
|
500 |
if __name__ == "__main__":
    # Script entry point: read input.csv, run the pipeline stages in order and
    # write the resulting weights to submission_enhanced.csv.
    if os.environ.get('GITHUB_API_TOKEN') is None:
        logging.warning("GITHUB_API_TOKEN environment variable not set. API rate limits will be low.")

    input_file, output_file = "input.csv", "submission_enhanced.csv"

    # Abort with a non-zero exit status when there is nothing to process.
    if not os.path.exists(input_file):
        logging.error(f"Input file not found: {input_file}. Please create it with 'repo' and 'parent' columns.")
        sys.exit(1)

    logging.info("--- Starting DeepFunding Oracle - Enhanced Process ---")

    # Each stage takes the DataFrame and returns it with extra columns.
    pipeline_stages = (
        fetch_github_features,
        add_derived_features,
        assign_base_weight,
        train_predict_weight,
    )
    main_df = pd.read_csv(input_file)
    for stage in pipeline_stages:
        main_df = stage(main_df)
    create_submission_csv(main_df, output_file)

    logging.info("--- Process Completed Successfully ---")
|
|
|
|