FelixPhilip committed on
Commit
2424d59
·
1 Parent(s): ba26d2b

Oracle weight assigning update

Browse files
Files changed (1) hide show
  1. Oracle/deepfundingoracle.py +22 -33
Oracle/deepfundingoracle.py CHANGED
@@ -233,9 +233,7 @@ def timeout_handler(signum, frame):
233
  # logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
234
  # return df
235
 
236
-
237
-
238
- def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
239
  """
240
  Assign base weights using a single LLM call to determine feature weights,
241
  and programmatically calculate repository weights.
@@ -245,53 +243,45 @@ def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
245
  start_time = time.time()
246
  oracle = SmolLM()
247
 
248
- # Step 1: Call LLM once to determine weights for each feature
249
  prompt = (
250
  "Can you Predict a weight in the range (0-1) for these github features such as stars,forks,watchers,open_issues,pulls,activity,contributors based on its importance in determining "
251
- "the influence of a repository. Output ONLY a valid JSON object with keys as feature names and values as the predicted weights. "
252
- "Do not include any explanation or extra text. here is an output example: \n"
253
- '{\n'
254
- ' "stars": 0.3,\n'
255
- ' "forks": 0.2,\n'
256
- ' "watchers": 0.2,\n'
257
- ' "open_issues": 0.1,\n'
258
- ' "pulls": 0.1,\n'
259
- ' "activity": 0.05,\n'
260
- ' "contributors": 0.05\n'
261
- '}\n'
262
  )
263
- feature_weights= None
264
  for attempt in range(llm_retries):
265
  try:
266
- response = oracle.predict(prompt,max_length=512, max_new_tokens=150)
267
  if not response or not response.strip():
268
  raise ValueError("Empty response from Oracle.")
269
- feature_weights = json.loads(response) # Safely parse JSON
 
 
 
 
 
270
  print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
271
  break
272
  except Exception as e:
273
  print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
274
  logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
275
  time.sleep(llm_delay)
276
- # Fallback to default weights
277
  if feature_weights is None:
278
- feature_weights = {
279
- "stars": 0.3,
280
- "forks": 0.2,
281
- "watchers": 0.2,
282
- "open_issues": 0.1,
283
- "pulls": 0.1,
284
- "activity": 0.05,
285
- "contributors": 0.05
286
- }
287
- print(f"[INFO] Using default feature weights: {feature_weights}", flush=True)
288
 
289
- # Step 2: Ensure all feature columns are numeric
290
  for feature in feature_weights.keys():
291
  if feature in df.columns:
292
  df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
293
 
294
- # Step 3: Programmatically calculate weights for each repository
295
  def calculate_weight(row):
296
  weight = 0
297
  for feature, feature_weight in feature_weights.items():
@@ -300,8 +290,6 @@ def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
300
  return weight
301
 
302
  df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
303
-
304
- # Step 4: Normalize weights per parent
305
  df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
306
  lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
307
  )
@@ -311,6 +299,7 @@ def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
311
  logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
312
  return df
313
 
 
314
  def prepare_dataset(file):
315
  print("[INFO] Starting dataset preparation...")
316
  start_time = time.time()
 
233
  # logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
234
  # return df
235
 
236
+ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
 
 
237
  """
238
  Assign base weights using a single LLM call to determine feature weights,
239
  and programmatically calculate repository weights.
 
243
  start_time = time.time()
244
  oracle = SmolLM()
245
 
 
246
  prompt = (
247
  "Can you Predict a weight in the range (0-1) for these github features such as stars,forks,watchers,open_issues,pulls,activity,contributors based on its importance in determining "
248
+ "the influence of a repository. Output the weights for each feature as text e.g.: "
249
+ 'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
 
 
 
 
 
 
 
 
 
250
  )
251
+ feature_weights = None
252
  for attempt in range(llm_retries):
253
  try:
254
+ response = oracle.predict(prompt, max_length=512, max_new_tokens=150)
255
  if not response or not response.strip():
256
  raise ValueError("Empty response from Oracle.")
257
+ matches = re.findall(
258
+ r'(stars|forks|watchers|open_issues|pulls|activity|contributors)\s*[:=]\s*([0-9]*\.?[0-9]+)',
259
+ response, re.IGNORECASE)
260
+ feature_weights = {k.lower(): float(v) for k, v in matches}
261
+ if not feature_weights or len(feature_weights) < 7:
262
+ raise ValueError("Could not extract all feature weights from response.")
263
  print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
264
  break
265
  except Exception as e:
266
  print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
267
  logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
268
  time.sleep(llm_delay)
 
269
  if feature_weights is None:
270
+ feature_weights = {
271
+ "stars": 0.3,
272
+ "forks": 0.2,
273
+ "watchers": 0.2,
274
+ "open_issues": 0.1,
275
+ "pulls": 0.1,
276
+ "activity": 0.05,
277
+ "contributors": 0.05
278
+ }
279
+ print(f"[INFO] Using default feature weights: {feature_weights}", flush=True)
280
 
 
281
  for feature in feature_weights.keys():
282
  if feature in df.columns:
283
  df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
284
 
 
285
  def calculate_weight(row):
286
  weight = 0
287
  for feature, feature_weight in feature_weights.items():
 
290
  return weight
291
 
292
  df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
 
 
293
  df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
294
  lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
295
  )
 
299
  logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
300
  return df
301
 
302
+
303
  def prepare_dataset(file):
304
  print("[INFO] Starting dataset preparation...")
305
  start_time = time.time()