FelixPhilip committed on
Commit 722cfc4 · 1 Parent(s): e574555
Files changed (2)
  1. Oracle/SmolLM.py +4 -4
  2. Oracle/deepfundingoracle.py +201 -256
Oracle/SmolLM.py CHANGED
@@ -15,10 +15,10 @@ class SmolLM:
             print(f"[ERROR] Failed to load model '{model_path}': {e}")
             self.available = False

-    def predict(self, prompt, max_length=512, max_new_tokens=150):
+    def predict(self, prompt, max_new_tokens=200):
         if not self.available:
             print("[WARN] Oracle unavailable, returning default weight 0.5")
-            return "0.5"
+            return ""
         try:
             # Use chat template as per documentation
             messages = [{"role": "user", "content": prompt}]
@@ -26,13 +26,13 @@ class SmolLM:
             outputs = self.model.generate(
                 inputs,
                 max_new_tokens=max_new_tokens,
-                temperature=0.7,
+                temperature=0.2,
                 top_p=0.9,
                 do_sample=True
             )
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             print(f"[INFO] Generated response: {response[:100]}...", flush=True)
-            return response
+            return response.split("<|assistant|>")[-1].strip()
         except Exception as e:
             print(f"[ERROR] Oracle has failed: {e}")
             return "0.5"
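The new `predict` drops the unused `max_length` parameter, samples at a lower temperature of 0.2 for more deterministic output, and keeps only the text after the final `<|assistant|>` marker. The hunk elides the line that builds `inputs`; below is a minimal sketch of how a chat-template call of this shape is typically assembled with the transformers API — the checkpoint name and variable names are illustrative assumptions, not part of the commit:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "HuggingFaceTB/SmolLM-1.7B-Instruct"  # assumed checkpoint, not from the commit
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

messages = [{"role": "user", "content": "Weigh this repository's importance from 0 to 1."}]
# apply_chat_template renders the messages into the model's expected prompt format
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(inputs, max_new_tokens=200, temperature=0.2, top_p=0.9, do_sample=True)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response.split("<|assistant|>")[-1].strip())

Note the changed failure contract: an unavailable model now returns "" rather than "0.5", so empty responses must be handled by callers.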
Oracle/deepfundingoracle.py CHANGED
@@ -29,17 +29,33 @@ import sys
 import re
 import json
 import time
-
-from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
+import json
+import time
+import logging
+import sys
+import warnings
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+import pandas as pd
+import requests
+from tqdm import tqdm
+from scipy.special import log1p, expm1
+from sklearn.model_selection import RandomizedSearchCV, GroupKFold
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import RobustScaler
+from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.metrics import mean_squared_error
 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
 from scipy.special import log1p, expm1
-
+from sklearn.preprocessing import RobustScaler
+from sklearn.metrics import mean_squared_error
+from xgboost import XGBRegressor
+from scipy.special import log1p, expm1
 from Oracle.SmolLM import SmolLM
-
+import os
 warnings.filterwarnings("ignore")

 # Configure logging to file and console
@@ -52,155 +68,104 @@ logging.basicConfig(
     format="%(asctime)s - %(levelname)s - %(message)s"
 )

+
+def add_temporal_and_ratio_features(df):
+    """
+    Adds:
+    - days_since_update: days between last GitHub update and today
+    - closed_issue_ratio: ratio of closed to total issues
+    - (Optional) merged_pull_ratio: if you have merged_pulls count
+    """
+    df['activity'] = pd.to_datetime(df['activity'], errors='coerce')
+    today = pd.to_datetime('today')
+    df['days_since_update'] = (today - df['activity']).dt.days.fillna((today - df['activity'].median()).days)
+    # closed_issue_ratio: assuming open_issues includes all and closed = total - open
+    df['closed_issue_ratio'] = 0
+    total_issues = (df['open_issues'] / (1 - 0.5)).replace([np.inf, -np.inf], np.nan)
+    df['closed_issue_ratio'] = ((total_issues - df['open_issues']).fillna(0) / total_issues.fillna(1))
+    df['closed_issue_ratio'] = df['closed_issue_ratio'].clip(0, 1)
+    # merged_pull_ratio: if you have merged_pulls count
+    df['merged_pull_ratio'] = df['merged_pulls'].clip(lower=0) / df['pulls'].clip(lower=1)
+    return df
+
+
 ##############################
 # GitHub API helper: Fetch repository metrics
 ##############################
 def fetch_repo_metrics(repo_url):
     """
-    Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
-    Assumes repo_url is in the form "https://github.com/owner/repo".
-    Handles API failures and malformed URLs gracefully.
+    RATIONALE (Recommendation 2): Fetches GitHub metrics, handling API pagination to get accurate
+    contributor and pull request counts instead of the default cap of 30. This provides much
+    more accurate features for popular repositories.
     """
-    # Default values in case of failure
-    default_metrics = {
-        "stargazers_count": 0,
-        "forks_count": 0,
-        "watchers_count": 0,
-        "open_issues_count": 0,
-        "pulls_count": 0,
-        "activity": "",
-        "contributors": 0,
-        "dependencies_count": 0
-    }
-
+    default_metrics = {"stars": 0, "forks": 0, "watchers": 0, "open_issues": 0, "pulls": 0, "activity": pd.NaT,
+                       "contributors": 0}
     try:
-        # Extract owner and repo name
         m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
         if not m:
-            print(f"[WARN] Malformed GitHub URL: {repo_url}")
+            logging.warning(f"Malformed GitHub URL: {repo_url}")
             return default_metrics
-
         owner, repo_name = m.group(1), m.group(2)
         api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
         headers = {}
-
-        token = os.environ.get("GITHUB_API_TOKEN", "")
+        token = os.environ.get("GITHUB_API_TOKEN")
         if token:
             headers["Authorization"] = f"token {token}"
-
-        # Fetch main repository data
-        r = requests.get(api_url, headers=headers, timeout=10)
-        if r.status_code == 200:
-            data = r.json()
-            metrics = {
-                "stargazers_count": data.get("stargazers_count", 0),
-                "forks_count": data.get("forks_count", 0),
-                "watchers_count": data.get("watchers_count", 0),
-                "open_issues_count": data.get("open_issues_count", 0),
-                "activity": data.get("updated_at", ""),
-                "owner": owner,
-                "repo_name": repo_name,
-                "dependencies_count": 0
-            }
-
-            # Try to fetch pull requests count
-            try:
-                pulls_url = f"{api_url}/pulls"
-                pulls_resp = requests.get(pulls_url, headers=headers, timeout=5)
-                metrics["pulls_count"] = len(pulls_resp.json()) if pulls_resp.status_code == 200 else 0
-            except Exception as e:
-                print(f"[WARN] Failed to fetch pulls for {repo_url}: {e}")
-                metrics["pulls_count"] = 0
-
-            # Try to fetch contributors count
-            try:
-                contributors_url = f"{api_url}/contributors"
-                contributors_resp = requests.get(contributors_url, headers=headers, timeout=5)
-                metrics["contributors"] = len(contributors_resp.json()) if contributors_resp.status_code == 200 else 0
-            except Exception as e:
-                print(f"[WARN] Failed to fetch contributors for {repo_url}: {e}")
-                metrics["contributors"] = 0
-
-            # Try to estimate dependencies from package files
+
+        r = requests.get(api_url, headers=headers, timeout=15)
+        r.raise_for_status()
+        data = r.json()
+
+        def get_count_from_pagination(url, headers):
             try:
-                # Look for package.json for Node.js projects
-                package_json_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/package.json"
-                package_resp = requests.get(package_json_url, timeout=5)
-                if package_resp.status_code == 200:
-                    package_data = package_resp.json()
-                    deps = package_data.get("dependencies", {})
-                    dev_deps = package_data.get("devDependencies", {})
-                    metrics["dependencies_count"] = len(deps) + len(dev_deps)
-                else:
-                    # Try requirements.txt for Python projects
-                    req_txt_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/requirements.txt"
-                    req_resp = requests.get(req_txt_url, timeout=5)
-                    if req_resp.status_code == 200:
-                        deps = [line for line in req_resp.text.split('\n') if line.strip() and not line.startswith('#')]
-                        metrics["dependencies_count"] = len(deps)
-            except Exception as e:
-                print(f"[WARN] Failed to fetch dependencies for {repo_url}: {e}")
-                metrics["dependencies_count"] = 0
-
-            return metrics
-        else:
-            print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
-            return default_metrics
-    except Exception as e:
-        print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
+                resp = requests.get(f"{url}?per_page=1", headers=headers, timeout=10)
+                if resp.status_code == 200 and 'Link' in resp.headers:
+                    match = re.search(r'page=(\d+)>; rel="last"', resp.headers['Link'])
+                    if match:
+                        return int(match.group(1))
+                return len(resp.json()) if resp.status_code == 200 else 0
+            except requests.exceptions.RequestException:
+                return 0
+
+        return {
+            "stars": data.get("stargazers_count", 0),
+            "forks": data.get("forks_count", 0),
+            "watchers": data.get("subscribers_count", 0),  # subscribers_count is a better 'watch' metric
+            "open_issues": data.get("open_issues_count", 0),
+            "activity": pd.to_datetime(data.get("updated_at")),
+            "contributors": get_count_from_pagination(data['contributors_url'], headers),
+            "pulls": get_count_from_pagination(data['pulls_url'].replace('{/number}', ''), headers)
+        }
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Failed to fetch data for {repo_url}: {e}")
         return default_metrics

 def fetch_github_features(df):
+    """Concurrently fetches GitHub features for all repositories in the DataFrame."""
+    logging.info("Fetching GitHub features for repositories...")
+    metrics_data = []
+    with ThreadPoolExecutor(max_workers=20) as executor:
+        future_to_url = {executor.submit(fetch_repo_metrics, url): url for url in df['repo']}
+        for future in tqdm(concurrent.futures.as_completed(future_to_url), total=len(df), desc="Fetching GitHub Metrics"):
+            metrics_data.append(future.result())
+    return pd.concat([df.reset_index(drop=True), pd.DataFrame(metrics_data)], axis=1)
+
+
+def add_derived_features(df):
     """
-    For each row, using the repo URL, call the GitHub API to fetch:
-    stars, forks, watchers, open issues, pull requests, activity, and contributors count.
-    Adds these as new columns to the DataFrame.
+    RATIONALE (Recommendation 2): Adds derived temporal and interaction features like 'days_since_update'
+    and 'stars_per_contributor' to give the model more powerful signals to learn from.
     """
-    print("[INFO] Fetching GitHub features for repositories...")
-    start_time = time.time()
-
-    # Initialize lists for storing fetched data
-    metrics_lists = {
-        "stars": [],
-        "forks": [],
-        "watchers": [],
-        "open_issues": [],
-        "pulls": [],
-        "activity": [],
-        "contributors": [],
-        "dependencies_count": []
-    }
-
-    cache = {}
-
-    def get_metrics(repo_url):
-        if repo_url in cache:
-            print(f"[DEBUG] Cached GitHub data for {repo_url}: {cache[repo_url]}")
-            return cache[repo_url]
-        val = fetch_repo_metrics(repo_url)
-        print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}")
-        cache[repo_url] = val
-        return val
-
-    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-        futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
-        for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
-            res = fut.result()
-            metrics_lists["stars"].append(res.get("stargazers_count", 0))
-            metrics_lists["forks"].append(res.get("forks_count", 0))
-            metrics_lists["watchers"].append(res.get("watchers_count", 0))
-            metrics_lists["open_issues"].append(res.get("open_issues_count", 0))
-            metrics_lists["pulls"].append(res.get("pulls_count", 0))
-            metrics_lists["activity"].append(res.get("activity", ""))
-            metrics_lists["contributors"].append(res.get("contributors", 0))
-            metrics_lists["dependencies_count"].append(res.get("dependencies_count", 0))
-
-    # Add the fetched data to the DataFrame
-    for key, values in metrics_lists.items():
-        df[key] = values
-
-    end_time = time.time()
-    print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
+    logging.info("Engineering derived features...")
+    df['activity'] = pd.to_datetime(df['activity'], errors='coerce')
+    df['days_since_update'] = (pd.Timestamp.now(tz='UTC') - df['activity']).dt.days
+    df['days_since_update'].fillna(df['days_since_update'].median(), inplace=True)
+
+    df['stars_per_contributor'] = df['stars'] / df['contributors'].clip(lower=1)
+    df['forks_per_star'] = df['forks'] / df['stars'].clip(lower=1)
+
+    numeric_cols = df.select_dtypes(include=np.number).columns
+    df[numeric_cols] = df[numeric_cols].fillna(0)
     return df

 def calculate_fallback_weights(df):
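The `get_count_from_pagination` helper leans on a documented GitHub API behavior: when a list endpoint is requested with per_page=1, the Link response header's rel="last" entry carries the total item count as its page number, so one cheap request yields an exact count. A self-contained sketch of the same trick (the endpoint in the comment is only an example; an Authorization header would raise rate limits):

import re
import requests

def count_items(list_url: str) -> int:
    """Count items on a paginated GitHub list endpoint with a single request."""
    resp = requests.get(f"{list_url}?per_page=1", timeout=10)
    resp.raise_for_status()
    match = re.search(r'page=(\d+)>; rel="last"', resp.headers.get("Link", ""))
    # With one item per page, the last page number equals the total count.
    return int(match.group(1)) if match else len(resp.json())

# Example: count_items("https://api.github.com/repos/ethereum/go-ethereum/contributors")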
@@ -265,66 +230,52 @@ def load_data(file):
 def timeout_handler(signum, frame):
     raise TimeoutError("LLama model prediction timed out.")

-def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
+
+def assign_base_weight(df):
     """
-    Assign base weights using a single LLM call to determine feature weights,
-    and programmatically calculate repository weights.
+    Assigns a robust `base_weight` using an LLM with a specific persona and JSON output,
+    then applies log transformation before normalization.
     """
-    print("[INFO] Starting optimized base weight assignment...", flush=True)
-    logging.info("[INFO] Assigning base weights using optimized approach...")
-    start_time = time.time()
+    logging.info("Assigning robust base weights using LLM...")
     oracle = SmolLM()

+    # RATIONALE (Recommendation 1): This prompt is highly specific. It sets a persona (VC), defines
+    # the goal (assess health), prioritizes metrics, and demands a strict JSON output. This
+    # leads to a much higher quality and more reliable response from the LLM.
     prompt = (
-        "Can you Predict a weight in the range (0-1) for these GitHub features such as stars, forks, watchers, "
-        "open_issues, pulls, activity, contributors based on their importance in determining the influence of a repository? "
-        "Output the weights for each feature as text e.g.: "
-        'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
+        "As an expert venture capitalist specializing in open-source software, your goal is to assess a project's "
+        "overall health, community engagement, and development velocity. Based on this, assign a numeric importance "
+        "weight to each of the following GitHub metrics: 'stars', 'forks', 'watchers', 'open_issues', 'pulls', "
+        "'contributors', and 'days_since_update'. "
+        "Prioritize metrics indicating active, collaborative development (like contributors, pulls, recent updates) "
+        "over simple popularity metrics (like stars). The 'days_since_update' metric is inverse; lower is better, so it should have a negative weight. "
+        "The absolute values of the weights should sum to approximately 1. "
+        "Provide your answer ONLY in a strict JSON format. Example: "
+        '{"stars": 0.2, "forks": 0.1, "watchers": 0.05, "pulls": 0.2, "open_issues": 0.1, "contributors": 0.25, "days_since_update": -0.1}'
     )
+
     feature_weights = None
-    for attempt in range(llm_retries):
-        try:
-            response = oracle.predict(prompt, max_length=512, max_new_tokens=150)
-            if not response or not response.strip():
-                raise ValueError("Empty response from Oracle.")
-            matches = re.findall(
-                r'(stars|forks|watchers|open_issues|pulls|activity|contributors)\s*[:=]\s*([0-9]*\.?[0-9]+)',
-                response, re.IGNORECASE)
-            feature_weights = {k.lower(): float(v) for k, v in matches}
-            if not feature_weights or len(feature_weights) < 7:
-                raise ValueError("Could not extract all feature weights from response.")
-            print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
-            break
-        except Exception as e:
-            print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
-            logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
-            time.sleep(llm_delay)
-
-    # Fallback mechanism: Calculate feature weights dynamically if LLM fails
-    if feature_weights is None:
-        print("[WARN] LLM failed to provide feature weights. Calculating fallback weights dynamically.")
-        feature_weights = calculate_fallback_weights(df)
-        print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
-
-    for feature in feature_weights.keys():
-        if feature in df.columns:
-            df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
-
-    def calculate_weight(row):
-        weight = 0
-        for feature, feature_weight in feature_weights.items():
-            if feature in row:
-                weight += row[feature] * feature_weight
-        return weight
-
-    df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
-    df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
-        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
-    )
-
-    end_time = time.time()
-    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
-    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
+    try:
+        response_text = oracle.predict(prompt)
+        json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+        if not json_match: raise ValueError("No JSON object found in the LLM response.")
+        feature_weights = json.loads(json_match.group(0))
+        logging.info(f"Successfully parsed feature weights from LLM: {feature_weights}")
+    except Exception as e:
+        logging.error(f"Failed to parse LLM response, using fallback weights. Error: {e}")
+        feature_weights = {'stars': 0.15, 'forks': 0.1, 'watchers': 0.05, 'pulls': 0.25, 'open_issues': 0.1,
+                           'contributors': 0.25, 'days_since_update': -0.1}
+
+    df["base_weight_raw"] = sum(df[feature] * weight for feature, weight in feature_weights.items() if feature in df)
+
+    # RATIONALE (Recommendation 1): Log-transforming the raw score before scaling prevents extreme
+    # outliers from dominating the normalization process, creating a more stable target variable.
+    df['base_weight_log'] = np.log1p(df['base_weight_raw'] - df['base_weight_raw'].min())
+
+    df['base_weight'] = df.groupby("parent")["base_weight_log"].transform(
+        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() > s.min() else 1)
    ).fillna(0.5)
+
     return df

 def sanity_check_weights(df):
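The reliability of the new `assign_base_weight` hinges on pulling a single JSON object out of free-form LLM text. A small standalone sketch of that extract-and-validate step (the sample response string is fabricated for illustration):

import json
import re

response_text = (
    'Sure, here are my weights: {"stars": 0.2, "forks": 0.1, "watchers": 0.05, '
    '"pulls": 0.2, "open_issues": 0.1, "contributors": 0.25, "days_since_update": -0.1}'
)

match = re.search(r'\{.*\}', response_text, re.DOTALL)
if not match:
    raise ValueError("No JSON object found in the LLM response.")
feature_weights = json.loads(match.group(0))

# The prompt asks for |weights| summing to ~1; verify before trusting the parse.
total = sum(abs(v) for v in feature_weights.values())
assert 0.9 <= total <= 1.1, f"Expected |weights| to sum to ~1, got {total}"

Because the hard-coded fallback dictionary satisfies the same contract, downstream code can treat both paths identically.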
@@ -481,72 +432,58 @@ def validate_target(df):


 ##############################
-# RandomForest Regression
+# Model Training and Prediction
 ##############################
 def train_predict_weight(df):
     """
-    Trains a RandomForestRegressor with hyperparameter tuning and evaluates the model.
+    Trains an XGBoost Regressor with GroupKFold cross-validation and extensive hyperparameter tuning.
     """
-    print("[INFO] Starting weight prediction with hyperparameter tuning...", flush=True)
-    start_time = time.time()
-    target = "base_weight"
-    feature_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col not in ["base_weight", "final_weight", "base_weight_raw"]]
-
-    X = df[feature_cols].fillna(0)
-    y = df[target]
-
-    # Remove rows with NaN values
-    mask = X.notna().all(axis=1) & y.notna()
-    X, y = X[mask], y[mask]
-
-    # Check for sufficient data and variance
-    if X.shape[0] < 5 or y.nunique() <= 1:
-        print("[WARN] Not enough data or variance for model training. Using base weights directly.")
-        df["final_weight"] = df[target]
-        return normalize_and_clip_weights(df)
-
-    # log1p transform target
-    y_log = log1p(y)
-
-    # Split data into train/test sets
-    X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)
-
-    pipeline = Pipeline([
-        ("rf", RandomForestRegressor(random_state=42))
-    ])
-    # Hyperparameter tuning using RandomizedSearchCV
-    param_dist = {
-        "rf__n_estimators": [100, 300, 500, 800, 1000],
-        "rf__max_depth": [None, 20, 30, 40],
-        "rf__min_samples_split": [2, 5, 10],
-        "rf__min_samples_leaf": [1, 2, 4],
-        "rf__max_features": ["auto", "sqrt"],
-    }
+    logging.info("Starting model training with GroupKFold validation...")
+
+    target_col = 'base_weight'
+    drop_cols = ["repo", "parent", "activity", "base_weight_raw", "base_weight_log", target_col]
+    feature_cols = [col for col in df.select_dtypes(include=np.number).columns if col not in drop_cols]
+
+    X = df[feature_cols].copy()
+    y = df[target_col]
+    groups = df['parent']
+
+    # RATIONALE (Recommendation 2): Log-transforming skewed input features helps the model by
+    # making their distributions more normal, improving the performance of the regressor.
+    skewed_features = ['stars', 'forks', 'watchers', 'open_issues', 'pulls', 'contributors', 'stars_per_contributor']
+    for col in skewed_features:
+        if col in X.columns:
+            X[col] = np.log1p(X[col])
+
+    pipeline = Pipeline([("scaler", RobustScaler()),
+                         ("xgb", XGBRegressor(objective="reg:squarederror", n_jobs=-1, random_state=42, verbosity=0))])
+
+    param_dist = {'xgb__n_estimators': [100, 300, 500, 700], 'xgb__max_depth': [3, 5, 7, 9],
+                  'xgb__learning_rate': [0.01, 0.02, 0.05, 0.1], 'xgb__subsample': [0.6, 0.7, 0.8, 0.9],
+                  'xgb__colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
+
+    # RATIONALE (Recommendation 3): GroupKFold ensures that all repos from the same parent are in the
+    # same fold. This prevents data leakage and gives a realistic measure of true performance.
+    cv = GroupKFold(n_splits=5)
+
+    # RATIONALE (Recommendation 4): Increasing n_iter explores more hyperparameter combinations,
+    # increasing the chance of finding a better-performing model.
     search = RandomizedSearchCV(
-        pipeline,
-        param_distributions=param_dist,
-        n_iter=50,
-        cv=10,
-        scoring="neg_root_mean_squared_error",
-        verbose=2,
-        n_jobs=-1,
-        random_state=42
+        pipeline, param_distributions=param_dist, n_iter=50, cv=cv.split(X, y, groups),
+        scoring="neg_root_mean_squared_error", verbose=1, n_jobs=-1, random_state=42
     )
-    search.fit(X_train, y_train_log)
+    search.fit(X, y)
+
     best_model = search.best_estimator_
+    logging.info(f"Best CV score (neg RMSE): {search.best_score_:.4f}")
+    logging.info(f"Best parameters found: {search.best_params_}")
+
+    df['final_weight'] = best_model.predict(X)

-    # Predict on test, invert transform
-    y_pred_test_log = best_model.predict(X_test)
-    y_pred_test = expm1(y_pred_test_log)
-    y_true_test = expm1(y_test_log)
-    mse = mean_squared_error(y_true_test, y_pred_test)
-    print(f"[INFO] Test MSE after RandomizedSearch: {mse:.4f}", flush=True)
-    # Predict on full dataset and invert
-    df["final_weight"] = expm1(best_model.predict(df[feature_cols]))
-    df = normalize_and_clip_weights(df)
-    end_time = time.time()
-    print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
+    df['final_weight'] = df['final_weight'].clip(lower=0)
+    df['final_weight'] = df.groupby("parent")['final_weight'].transform(
+        lambda w: w / w.sum() if w.sum() > 0 else np.ones_like(w) / len(w))
+
     return df

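The switch to GroupKFold is what makes the reported CV scores trustworthy: every row sharing a `parent` stays in one fold, so validation repos never share a parent with training repos. A toy demonstration with fabricated groups:

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(16).reshape(8, 2)
y = np.arange(8, dtype=float)
groups = np.array(["eth", "eth", "eth", "op", "op", "arb", "arb", "arb"])

for fold, (train_idx, val_idx) in enumerate(GroupKFold(n_splits=3).split(X, y, groups)):
    # No parent ever appears on both sides of a split.
    assert set(groups[train_idx]).isdisjoint(groups[val_idx])
    print(f"fold {fold}: validates on {sorted(set(groups[val_idx]))}")

This is exactly the property RandomizedSearchCV inherits above when it is handed the precomputed cv.split(X, y, groups) splits.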
@@ -554,23 +491,31 @@ def train_predict_weight(df):
 # CSV Output
 ##############################
 def create_submission_csv(df, output_filename="submission.csv"):
-    print(f"[INFO] Writing results to {output_filename}...", flush=True)
-    required_cols = ["repo", "parent", "final_weight"]
-    submission_df = df[required_cols]
-    submission_df.to_csv(output_filename, index=False)
-    print(f"[INFO] Results written to {output_filename}.", flush=True)
-    return output_filename
+    """Saves the final predictions to a CSV file."""
+    logging.info(f"Writing final results to {output_filename}...")
+    df[["repo", "parent", "final_weight"]].to_csv(output_filename, index=False)
+    logging.info(f"Successfully created {output_filename}.")

-# Removed Gradio UI code from this file to ensure modular workflow.
-# This file now focuses solely on data processing and prediction.

 if __name__ == "__main__":
-    input_file = "input.csv"  # Replace with the actual input file path
-    output_file = "submission.csv"
+    if 'GITHUB_API_TOKEN' not in os.environ:
+        logging.warning("GITHUB_API_TOKEN environment variable not set. API rate limits will be low.")
+
+    input_file = "input.csv"
+    output_file = "submission_enhanced.csv"
+
+    if not os.path.exists(input_file):
+        logging.error(f"Input file not found: {input_file}. Please create it with 'repo' and 'parent' columns.")
+        sys.exit(1)
+
+    logging.info("--- Starting DeepFunding Oracle - Enhanced Process ---")

-    print("[INFO] Preparing dataset...")
-    df = prepare_dataset(input_file)
+    # Execute the full pipeline
+    main_df = pd.read_csv(input_file)
+    main_df = fetch_github_features(main_df)
+    main_df = add_derived_features(main_df)
+    main_df = assign_base_weight(main_df)
+    main_df = train_predict_weight(main_df)
+    create_submission_csv(main_df, output_file)

-    print("[INFO] Creating submission CSV...")
-    create_submission_csv(df, output_file)
-    print("[INFO] Process completed successfully.")
+    logging.info("--- Process Completed Successfully ---")
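The rewritten entry point expects input.csv to hold just `repo` and `parent` columns and layers features, base weights, and model predictions onto that frame. A minimal sketch of the input contract and the invariant the per-parent normalization in train_predict_weight establishes (rows invented for illustration):

import io
import pandas as pd

sample_csv = io.StringIO(
    "repo,parent\n"
    "https://github.com/owner-a/lib-x,https://github.com/big/project\n"
    "https://github.com/owner-b/lib-y,https://github.com/big/project\n"
)
df = pd.read_csv(sample_csv)

# Stand-in predictions; in the pipeline these come from the tuned XGBoost model.
df["final_weight"] = [3.0, 1.0]
df["final_weight"] = df.groupby("parent")["final_weight"].transform(lambda w: w / w.sum())

# Weights within each parent now sum to 1, matching the submission's contract.
assert (df.groupby("parent")["final_weight"].sum().round(9) == 1.0).all()
print(df)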