FelixPhilip committed on
Commit
386c440
·
1 Parent(s): 17c5050
Files changed (1) hide show
  1. Oracle/deepfundingoracle.py +31 -16
Oracle/deepfundingoracle.py CHANGED
@@ -55,7 +55,6 @@ logging.basicConfig(
55
  def fetch_repo_metrics(repo_url):
56
  """
57
  Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
58
- Assumes repo_url is in the form "https://github.com/owner/repo".
59
  """
60
  try:
61
  # Extract owner and repo name
@@ -71,7 +70,9 @@ def fetch_repo_metrics(repo_url):
71
  r = requests.get(api_url, headers=headers)
72
  if r.status_code == 200:
73
  data = r.json()
74
- pulls_url = data.get("pulls_url", "").replace("{\/*state}", "")
 
 
75
  pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
76
  activity = data.get("updated_at", "")
77
  return {
@@ -86,8 +87,10 @@ def fetch_repo_metrics(repo_url):
86
  "token": token
87
  }
88
  else:
 
89
  return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
90
- except Exception:
 
91
  return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
92
 
93
 
@@ -132,8 +135,10 @@ def fetch_github_features(df):
132
 
133
  def get_metrics(repo_url):
134
  if repo_url in cache:
 
135
  return cache[repo_url]
136
  val = fetch_repo_metrics(repo_url)
 
137
  try:
138
  m = re.search(r"github\.com/([^/]+)/([^/]+)",repo_url)
139
  if m:
@@ -209,8 +214,9 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
209
  oracle = SmolLM()
210
 
211
  prompt = (
212
- "Can you Predict a weight in the range (0-1) for these github features such as stars,forks,watchers,open_issues,pulls,activity,contributors based on its importance in determining "
213
- "the influence of a repository. Output the weights for each feature as text e.g.: "
 
214
  'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
215
  )
216
  feature_weights = None
@@ -231,18 +237,14 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
231
  print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
232
  logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
233
  time.sleep(llm_delay)
 
 
234
  if feature_weights is None:
235
- feature_weights = {
236
- "stars": 0.3,
237
- "forks": 0.2,
238
- "watchers": 0.2,
239
- "open_issues": 0.1,
240
- "pulls": 0.1,
241
- "activity": 0.05,
242
- "contributors": 0.05
243
- }
244
- print(f"[INFO] Using default feature weights: {feature_weights}", flush=True)
245
 
 
246
  for feature in feature_weights.keys():
247
  if feature in df.columns:
248
  df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
@@ -264,6 +266,19 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
264
  logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
265
  return df
266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  def sanity_check_weights(df):
268
  """
269
  Sanity-checks LLM weights by comparing them with other metrics.
@@ -383,7 +398,7 @@ def validate_target(df):
383
  variance = df[target].var()
384
  print(f"[DEBUG] Target variable variance: {variance}")
385
  if variance < 1e-6:
386
- raise ValueError(f"Target variable '{target}' has insufficient variance.")
387
  return df
388
 
389
 
 
55
  def fetch_repo_metrics(repo_url):
56
  """
57
  Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
 
58
  """
59
  try:
60
  # Extract owner and repo name
 
70
  r = requests.get(api_url, headers=headers)
71
  if r.status_code == 200:
72
  data = r.json()
73
+ # Log fetched data for debugging
74
+ print(f"[DEBUG] Fetched data for {repo_url}: {data}")
75
+ pulls_url = data.get("pulls_url", "").replace("{/state}", "")
76
  pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
77
  activity = data.get("updated_at", "")
78
  return {
 
87
  "token": token
88
  }
89
  else:
90
+ print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
91
  return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
92
+ except Exception as e:
93
+ print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
94
  return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
95
 
96
 
 
135
 
136
  def get_metrics(repo_url):
137
  if repo_url in cache:
138
+ print(f"[DEBUG] Cached data for {repo_url}: {cache[repo_url]}")
139
  return cache[repo_url]
140
  val = fetch_repo_metrics(repo_url)
141
+ print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}") # <-- Add this line
142
  try:
143
  m = re.search(r"github\.com/([^/]+)/([^/]+)",repo_url)
144
  if m:
 
214
  oracle = SmolLM()
215
 
216
  prompt = (
217
+ "Can you Predict a weight in the range (0-1) for these GitHub features such as stars, forks, watchers, "
218
+ "open_issues, pulls, activity, contributors based on their importance in determining the influence of a repository? "
219
+ "Output the weights for each feature as text e.g.: "
220
  'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
221
  )
222
  feature_weights = None
 
237
  print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
238
  logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
239
  time.sleep(llm_delay)
240
+
241
+ # Fallback mechanism: Calculate feature weights dynamically if LLM fails
242
  if feature_weights is None:
243
+ print("[WARN] LLM failed to provide feature weights. Calculating fallback weights dynamically.")
244
+ feature_weights = calculate_fallback_weights(df)
245
+ print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
 
 
 
 
 
 
 
246
 
247
+ # Ensure numeric columns are properly formatted
248
  for feature in feature_weights.keys():
249
  if feature in df.columns:
250
  df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
 
266
  logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
267
  return df
268
 
269
def calculate_fallback_weights(df):
    """
    Calculate fallback feature weights proportional to each numeric column's variance.

    Only variance is used; no correlation with a target column is computed.
    Every numeric column of ``df`` receives a weight, so callers that want to
    exclude non-feature numeric columns must drop them before calling.

    Args:
        df: pandas DataFrame holding the candidate feature columns.

    Returns:
        dict mapping each numeric column name to its weight. The weights sum
        to 1. When no column has usable variance (constant columns, a single
        row, or all-NaN data) every numeric column gets an equal weight
        instead of the result being empty. An empty dict is returned only
        when ``df`` has no numeric columns at all.
    """
    print("[INFO] Calculating fallback feature weights...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return {}

    # var() yields NaN for columns with fewer than two valid values; treat
    # those as contributing no variance rather than propagating NaN weights.
    feature_variances = df[numeric_cols].var().fillna(0)
    total_variance = feature_variances.sum()

    # Without any (finite) variance a proportional split is undefined, so fall
    # back to uniform weights rather than returning an empty mapping.
    if not np.isfinite(total_variance) or total_variance <= 0:
        uniform_weight = 1.0 / len(numeric_cols)
        return {col: uniform_weight for col in numeric_cols}

    # Assign weights proportional to feature variance
    return {col: var / total_variance for col, var in feature_variances.items()}
281
+
282
  def sanity_check_weights(df):
283
  """
284
  Sanity-checks LLM weights by comparing them with other metrics.
 
398
  variance = df[target].var()
399
  print(f"[DEBUG] Target variable variance: {variance}")
400
  if variance < 1e-6:
401
+ raise ValueError(f"Target variable '{target}' has insufficient variance. Please check feature values.")
402
  return df
403
 
404