FelixPhilip committed
Commit 955c99b · 1 Parent(s): 6282a14
Oracle/DataSmolAgent.py CHANGED
@@ -74,6 +74,27 @@ def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
     df.to_csv(filename, index=False)
     return filename

+@tool
+def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Predicts funding for child repositories based on the parent-child relationship.
+
+    Args:
+        df: The input DataFrame containing 'repo', 'parent', and other features.
+
+    Returns:
+        A DataFrame with an updated 'final_weight' column for child repositories.
+    """
+    # Ensure required columns exist
+    if not {"repo", "parent", "final_weight"}.issubset(df.columns):
+        raise ValueError("Input DataFrame must contain 'repo', 'parent', and 'final_weight' columns.")
+
+    # Normalize funding weights for child repositories grouped by parent
+    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
+        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
+    )
+    return df
+
 class DataSmolAgent(CodeAgent):
     """
     A data processing agent that cleans and extracts features from the provided DataFrame.
@@ -87,6 +108,7 @@ class DataSmolAgent(CodeAgent):
                 clean_data,
                 extract_features,
                 save_to_csv,  # Added save_to_csv tool
+                predict_funding,  # Added predict_funding tool
             ],
             model=self.model,
             additional_authorized_imports=["pandas", "numpy"]
@@ -100,8 +122,11 @@ class DataSmolAgent(CodeAgent):
         features_output = self.tools["extract_features"](df=self.df)
         self.df = features_output.result if hasattr(features_output, "result") else features_output

+        funding_output = self.tools["predict_funding"](df=self.df)
+        self.df = funding_output.result if hasattr(funding_output, "result") else funding_output
+
         if output_csv:
             csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
             print(f"CSV saved at: {csv_output}")

-        return self.df
+        return self.df
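The predict_funding tool added above redistributes each parent's funding across its children: within every "parent" group, final_weight is divided by the group sum, and a group whose weights sum to zero falls back to equal shares of 1/len(group). A minimal sketch of the transform on made-up data (values are illustrative only, not from the project):

import pandas as pd

# Toy frame: "p1" has real weights; "p2" exercises the zero-sum fallback.
df = pd.DataFrame({
    "repo": ["a", "b", "c", "d"],
    "parent": ["p1", "p1", "p2", "p2"],
    "final_weight": [2.0, 6.0, 0.0, 0.0],
})

# The same transform predict_funding applies: proportional shares per parent,
# or equal shares when a group's weights sum to zero.
df["final_weight"] = df.groupby("parent")["final_weight"].transform(
    lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
)
print(df["final_weight"].tolist())  # [0.25, 0.75, 0.5, 0.5]

After the transform each parent's children carry weights that sum to 1 (or equal shares), so repeating the step is harmless.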
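With predict_funding registered as a tool, run() now chains cleaning, feature extraction, funding prediction, and an optional CSV export before returning the processed DataFrame. A hypothetical call, assuming the constructor takes the DataFrame and model implied by self.df and self.model (the exact signature is not shown in this diff):

# Hypothetical usage; constructor arguments are assumed, not confirmed by the commit.
agent = DataSmolAgent(df=raw_df, model=some_model)
processed = agent.run(output_csv=True)  # also writes processed_output.csv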
Oracle/deepfundingoracle.py CHANGED
@@ -195,68 +195,6 @@ def fetch_github_features(df):
     def timeout_handler(signum, frame):
         raise TimeoutError("LLama model prediction timed out.")

-# def assign_base_weight(df, max_workers=32):
-#     """
-#     Assign base weights using LLama model in parallel.
-#     """
-#     print("[INFO] Starting base weight assignment using LLama model...", flush=True)
-#     logging.info("[INFO] Assigning base weights using LLama model...")
-#     start_time = time.time()
-#     llama = SmolLM()
-#     base_weights = []
-#     llm_cache = {}
-#
-#     # Prepare prompts for all repositories
-#     prompts = {}
-#     for idx, row in df.iterrows():
-#         repo = row.get("repo", "")
-#         parent = row.get("parent", "")
-#         stars = row.get("stars", 0)
-#         forks = row.get("forks", 0)
-#         watchers = row.get("watchers", 0)
-#         issues = row.get("open_issues", 0)
-#         pulls = row.get("pulls", 0)
-#         activity = row.get("activity", "")
-#         prompts[idx] = (
-#             f"Repository: {repo}\n"
-#             f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
-#             f"Parent or dependency: {parent}\n\n"
-#             "Based on these features, assign a dependency weight between 0 and 1 for the repository "
-#             "that reflects how influential the repository is as a source relative to its parent. "
-#             "Only output the numeric value."
-#         )
-#
-#     # Define the prediction function
-#     def _predict(idx, prompt):
-#         if idx in llm_cache:
-#             return idx, llm_cache[idx]
-#         try:
-#             resp = llama.predict(prompt)
-#             match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
-#             weight = min(max(float(match.group()), 0), 1) if match else 0.0
-#             llm_cache[idx] = weight
-#             return idx, weight
-#         except Exception as e:
-#             print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
-#             logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
-#             return idx, 0.0  # Default weight in case of failure
-#
-#     # Run predictions in parallel
-#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-#         futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
-#         for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
-#             idx, weight = fut.result()
-#             base_weights.append((idx, weight))
-#
-#     # Sort weights by index and assign to DataFrame
-#     base_weights.sort(key=lambda x: x[0])
-#     df["base_weight"] = [weight for _, weight in base_weights]
-#
-#     end_time = time.time()
-#     print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
-#     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
-#     return df
-
 def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     """
     Assign base weights using a single LLM call to determine feature weights,
@@ -324,6 +262,17 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     return df


+def normalize_funding(df):
+    """
+    Normalize funding weights for child repositories grouped by parent.
+    """
+    print("[INFO] Normalizing funding weights...", flush=True)
+    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
+        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
+    )
+    print("[INFO] Funding weights normalized successfully.", flush=True)
+    return df
+
 def prepare_dataset(file):
     print("[INFO] Starting dataset preparation...")
     start_time = time.time()
@@ -337,6 +286,8 @@ def prepare_dataset(file):
     print("[INFO] GitHub features fetched successfully.")
     print("[INFO] Assigning base weights using LLama model...")
     df = assign_base_weight(df)
+    df = train_predict_weight(df)
+    df = normalize_funding(df)
     end_time = time.time()
     print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
     return df
@@ -374,7 +325,7 @@ def train_predict_weight(df,
     y = df[target]

     # For regression, if a classification criterion is given, switch to 'mse'
-    reg_criterion = "mse" if criterion in ["gini", "entropy"] else criterion
+    reg_criterion = "squared_error" if criterion in ["gini", "entropy"] else criterion

     rf_model = RandomForestRegressor(random_state=42,
                                      criterion=reg_criterion,
@@ -399,38 +350,18 @@ def train_predict_weight(df,

     for parent, children in parent_map.items():
         group_idxs = df[df["parent"] == parent].index
-        # Check if a repo in the group is flagged as is_source
-        source_idxs = df.loc[group_idxs][df["is_source"] == True].index.tolist() if "is_source" in df.columns else []
-        if source_idxs:
-            parent_idx = source_idxs[0]
+        preds = df.loc[group_idxs, "rf_pred"]
+        total = preds.sum()
+        if total > 0:
+            normed = preds / total
         else:
-            # Fallback: choose the repo with the maximum prediction as the parent
-            preds = df.loc[group_idxs, "rf_pred"]
-            parent_idx = preds.idxmax()
-        child_idxs = [idx for idx in group_idxs if idx != parent_idx]
-        if child_idxs:
-            child_preds = df.loc[child_idxs, "rf_pred"]
-            if child_preds.max() > child_preds.min():
-                normed = (child_preds - child_preds.min()) / (child_preds.max() - child_preds.min() + 1e-8)
-            else:
-                normed = pd.Series([0.0] * len(child_idxs), index=child_idxs)
-            normed = normed * 0.99
-            for idx, val in zip(child_idxs, normed):
-                final_weights[idx] = val
-        final_weights[parent_idx] = 1.0
+            # If sum is zero, assign equal weights.
+            normed = pd.Series([1/len(preds)] * len(preds), index=preds.index)
+        for idx, weight in normed.items():
+            final_weights[idx] = weight

     df["final_weight"] = df.index.map(final_weights).fillna(0.0)

-    # Enforce monotonicity within each group so weights are descending
-    for parent, children in parent_map.items():
-        group_idxs = df[df["parent"] == parent].index
-        group_weights = df.loc[group_idxs, "final_weight"].sort_values(ascending=False)
-        prev = 1.0
-        for idx in group_weights.index:
-            if df.at[idx, "final_weight"] > prev:
-                df.at[idx, "final_weight"] = prev
-            prev = df.at[idx, "final_weight"]
-
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
     return df
@@ -453,4 +384,12 @@ def create_submission_csv(df, output_filename="submission.csv"):
 # This file now focuses solely on data processing and prediction.

 if __name__ == "__main__":
-    print("DeepFunding Oracle is now ready for backend processing.", flush=True)
+    input_file = "input.csv"  # Replace with the actual input file path
+    output_file = "submission.csv"
+
+    print("[INFO] Preparing dataset...")
+    df = prepare_dataset(input_file)
+
+    print("[INFO] Creating submission CSV...")
+    create_submission_csv(df, output_file)
+    print("[INFO] Process completed successfully.")
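normalize_funding applies the same per-parent transform that the rewritten group loop in train_predict_weight already performs, and shares that sum to 1 within a group pass through it unchanged; the extra pass mainly catches groups whose predictions were all zero (including rows defaulted by fillna(0.0)), converting them to equal shares. A quick check of the pass-through property on toy numbers:

import pandas as pd

# Weights already summing to 1 within a parent group survive a second pass intact.
df = pd.DataFrame({"parent": ["p", "p", "p"], "final_weight": [0.2, 0.3, 0.5]})
df["final_weight"] = df.groupby("parent")["final_weight"].transform(
    lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
)
print(df["final_weight"].tolist())  # [0.2, 0.3, 0.5] -- unchanged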
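Two changes here are worth flagging. The criterion fix tracks the scikit-learn API: "mse" was deprecated for RandomForestRegressor in scikit-learn 1.0 and removed in 1.2, so "squared_error" is the spelling current releases accept. A standalone check with synthetic stand-in data (not the project's GitHub features):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# "squared_error" is the supported criterion on scikit-learn >= 1.0;
# the removed alias "mse" raises a ValueError on 1.2+.
X = np.random.rand(50, 3)  # stand-in for the repository feature matrix
y = np.random.rand(50)     # stand-in for base weights
rf = RandomForestRegressor(random_state=42, criterion="squared_error")
rf.fit(X, y)
print(rf.predict(X[:2]))

The new __main__ block also makes the module runnable end to end: prepare_dataset now carries rows through feature fetching, LLM base weights, random-forest prediction, and per-parent normalization, and create_submission_csv writes the result; the input path remains a placeholder, as the in-diff comment notes.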