Commit · 955c99b
Parent(s): 6282a14

Oracle

Files changed:
- Oracle/DataSmolAgent.py +26 -1
- Oracle/deepfundingoracle.py +31 -92
Oracle/DataSmolAgent.py
CHANGED

@@ -74,6 +74,27 @@ def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
     df.to_csv(filename, index=False)
     return filename
 
+@tool
+def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Predicts funding for child repositories based on the parent-child relationship.
+
+    Args:
+        df: The input DataFrame containing 'repo', 'parent', and other features.
+
+    Returns:
+        A DataFrame with an updated 'final_weight' column for child repositories.
+    """
+    # Ensure required columns exist
+    if not {"repo", "parent", "final_weight"}.issubset(df.columns):
+        raise ValueError("Input DataFrame must contain 'repo', 'parent', and 'final_weight' columns.")
+
+    # Normalize funding weights for child repositories grouped by parent
+    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
+        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
+    )
+    return df
+
 class DataSmolAgent(CodeAgent):
     """
     A data processing agent that cleans and extracts features from the provided DataFrame.
@@ -87,6 +108,7 @@ class DataSmolAgent(CodeAgent):
                 clean_data,
                 extract_features,
                 save_to_csv,  # Added save_to_csv tool
+                predict_funding,  # Added predict_funding tool
             ],
             model=self.model,
             additional_authorized_imports=["pandas", "numpy"]
@@ -100,8 +122,11 @@ class DataSmolAgent(CodeAgent):
         features_output = self.tools["extract_features"](df=self.df)
         self.df = features_output.result if hasattr(features_output, "result") else features_output
 
+        funding_output = self.tools["predict_funding"](df=self.df)
+        self.df = funding_output.result if hasattr(funding_output, "result") else funding_output
+
         if output_csv:
             csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
             print(f"CSV saved at: {csv_output}")
 
-        return self.df
+        return self.df
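The core of the new predict_funding tool is a single groupby/transform over final_weight. A minimal, self-contained sketch of that transform (the repo names and raw weights here are invented for illustration):

import pandas as pd

# Two children under parent-a, plus a parent-b group whose weights sum to zero.
df = pd.DataFrame({
    "repo": ["child-1", "child-2", "child-3", "child-4"],
    "parent": ["parent-a", "parent-a", "parent-b", "parent-b"],
    "final_weight": [3.0, 1.0, 0.0, 0.0],
})

# The same lambda predict_funding uses: divide each weight by its parent
# group's sum, or fall back to an equal split when the group sums to zero.
df["final_weight"] = df.groupby("parent")["final_weight"].transform(
    lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
)

print(df["final_weight"].tolist())  # [0.75, 0.25, 0.5, 0.5]

Pandas broadcasts the scalar 1 / len(x) across the whole group, so the zero-sum fallback assigns equal weights rather than a single value.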
Oracle/deepfundingoracle.py
CHANGED

@@ -195,68 +195,6 @@ def fetch_github_features(df):
 def timeout_handler(signum, frame):
     raise TimeoutError("LLama model prediction timed out.")
 
-# def assign_base_weight(df, max_workers=32):
-#     """
-#     Assign base weights using LLama model in parallel.
-#     """
-#     print("[INFO] Starting base weight assignment using LLama model...", flush=True)
-#     logging.info("[INFO] Assigning base weights using LLama model...")
-#     start_time = time.time()
-#     llama = SmolLM()
-#     base_weights = []
-#     llm_cache = {}
-#
-#     # Prepare prompts for all repositories
-#     prompts = {}
-#     for idx, row in df.iterrows():
-#         repo = row.get("repo", "")
-#         parent = row.get("parent", "")
-#         stars = row.get("stars", 0)
-#         forks = row.get("forks", 0)
-#         watchers = row.get("watchers", 0)
-#         issues = row.get("open_issues", 0)
-#         pulls = row.get("pulls", 0)
-#         activity = row.get("activity", "")
-#         prompts[idx] = (
-#             f"Repository: {repo}\n"
-#             f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
-#             f"Parent or dependency: {parent}\n\n"
-#             "Based on these features, assign a dependency weight between 0 and 1 for the repository "
-#             "that reflects how influential the repository is as a source relative to its parent. "
-#             "Only output the numeric value."
-#         )
-#
-#     # Define the prediction function
-#     def _predict(idx, prompt):
-#         if idx in llm_cache:
-#             return idx, llm_cache[idx]
-#         try:
-#             resp = llama.predict(prompt)
-#             match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
-#             weight = min(max(float(match.group()), 0), 1) if match else 0.0
-#             llm_cache[idx] = weight
-#             return idx, weight
-#         except Exception as e:
-#             print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
-#             logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
-#             return idx, 0.0  # Default weight in case of failure
-#
-#     # Run predictions in parallel
-#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-#         futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
-#         for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
-#             idx, weight = fut.result()
-#             base_weights.append((idx, weight))
-#
-#     # Sort weights by index and assign to DataFrame
-#     base_weights.sort(key=lambda x: x[0])
-#     df["base_weight"] = [weight for _, weight in base_weights]
-#
-#     end_time = time.time()
-#     print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
-#     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
-#     return df
-
 def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     """
     Assign base weights using a single LLM call to determine feature weights,
@@ -324,6 +262,17 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     return df
 
 
+def normalize_funding(df):
+    """
+    Normalize funding weights for child repositories grouped by parent.
+    """
+    print("[INFO] Normalizing funding weights...", flush=True)
+    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
+        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
+    )
+    print("[INFO] Funding weights normalized successfully.", flush=True)
+    return df
+
 def prepare_dataset(file):
     print("[INFO] Starting dataset preparation...")
     start_time = time.time()
@@ -337,6 +286,8 @@ def prepare_dataset(file):
     print("[INFO] GitHub features fetched successfully.")
     print("[INFO] Assigning base weights using LLama model...")
     df = assign_base_weight(df)
+    df = train_predict_weight(df)
+    df = normalize_funding(df)
     end_time = time.time()
     print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
     return df
@@ -374,7 +325,7 @@ def train_predict_weight(df,
     y = df[target]
 
     # For regression, if a classification criterion is given, switch to 'mse'
-    reg_criterion = "mse" if criterion in ["gini", "entropy"] else criterion
+    reg_criterion = "squared_error" if criterion in ["gini", "entropy"] else criterion
 
     rf_model = RandomForestRegressor(random_state=42,
                                      criterion=reg_criterion,
@@ -399,38 +350,18 @@ def train_predict_weight(df,
 
     for parent, children in parent_map.items():
         group_idxs = df[df["parent"] == parent].index
-
-
-        if
-
+        preds = df.loc[group_idxs, "rf_pred"]
+        total = preds.sum()
+        if total > 0:
+            normed = preds / total
         else:
-            #
-
-
-
-            if child_idxs:
-                child_preds = df.loc[child_idxs, "rf_pred"]
-                if child_preds.max() > child_preds.min():
-                    normed = (child_preds - child_preds.min()) / (child_preds.max() - child_preds.min() + 1e-8)
-                else:
-                    normed = pd.Series([0.0] * len(child_idxs), index=child_idxs)
-                normed = normed * 0.99
-                for idx, val in zip(child_idxs, normed):
-                    final_weights[idx] = val
-            final_weights[parent_idx] = 1.0
+            # If sum is zero, assign equal weights.
+            normed = pd.Series([1/len(preds)] * len(preds), index=preds.index)
+        for idx, weight in normed.items():
+            final_weights[idx] = weight
 
     df["final_weight"] = df.index.map(final_weights).fillna(0.0)
 
-    # Enforce monotonicity within each group so weights are descending
-    for parent, children in parent_map.items():
-        group_idxs = df[df["parent"] == parent].index
-        group_weights = df.loc[group_idxs, "final_weight"].sort_values(ascending=False)
-        prev = 1.0
-        for idx in group_weights.index:
-            if df.at[idx, "final_weight"] > prev:
-                df.at[idx, "final_weight"] = prev
-            prev = df.at[idx, "final_weight"]
-
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
     return df
@@ -453,4 +384,12 @@ def create_submission_csv(df, output_filename="submission.csv"):
 # This file now focuses solely on data processing and prediction.
 
 if __name__ == "__main__":
-
+    input_file = "input.csv"  # Replace with the actual input file path
+    output_file = "submission.csv"
+
+    print("[INFO] Preparing dataset...")
+    df = prepare_dataset(input_file)
+
+    print("[INFO] Creating submission CSV...")
+    create_submission_csv(df, output_file)
+    print("[INFO] Process completed successfully.")