FelixPhilip committed
Commit 955c99b · 1 Parent(s): 6282a14
Oracle/DataSmolAgent.py CHANGED
@@ -74,6 +74,27 @@ def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
     df.to_csv(filename, index=False)
     return filename

+@tool
+def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Predicts funding for child repositories based on the parent-child relationship.
+
+    Args:
+        df: The input DataFrame containing 'repo', 'parent', and other features.
+
+    Returns:
+        A DataFrame with an updated 'final_weight' column for child repositories.
+    """
+    # Ensure required columns exist
+    if not {"repo", "parent", "final_weight"}.issubset(df.columns):
+        raise ValueError("Input DataFrame must contain 'repo', 'parent', and 'final_weight' columns.")
+
+    # Normalize funding weights for child repositories grouped by parent
+    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
+        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
+    )
+    return df
+
 class DataSmolAgent(CodeAgent):
     """
     A data processing agent that cleans and extracts features from the provided DataFrame.
@@ -87,6 +108,7 @@ class DataSmolAgent(CodeAgent):
                 clean_data,
                 extract_features,
                 save_to_csv,  # Added save_to_csv tool
+                predict_funding,  # Added predict_funding tool
             ],
             model=self.model,
             additional_authorized_imports=["pandas", "numpy"]
@@ -100,8 +122,11 @@ class DataSmolAgent(CodeAgent):
         features_output = self.tools["extract_features"](df=self.df)
         self.df = features_output.result if hasattr(features_output, "result") else features_output

+        funding_output = self.tools["predict_funding"](df=self.df)
+        self.df = funding_output.result if hasattr(funding_output, "result") else funding_output
+
         if output_csv:
             csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
             print(f"CSV saved at: {csv_output}")

-        return self.df
+        return self.df
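The predict_funding tool added above redistributes each parent's funding across its children: within every "parent" group, final_weight is divided by the group sum, and a group whose weights sum to zero falls back to equal shares of 1/len(group). A minimal sketch of the transform on made-up data (values are illustrative only, not from the project):

import pandas as pd

# Toy frame: "p1" has real weights; "p2" exercises the zero-sum fallback.
df = pd.DataFrame({
    "repo": ["a", "b", "c", "d"],
    "parent": ["p1", "p1", "p2", "p2"],
    "final_weight": [2.0, 6.0, 0.0, 0.0],
})

# The same transform predict_funding applies: proportional shares per parent,
# or equal shares when a group's weights sum to zero.
df["final_weight"] = df.groupby("parent")["final_weight"].transform(
    lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
)
print(df["final_weight"].tolist())  # [0.25, 0.75, 0.5, 0.5]

After the transform each parent's children carry weights that sum to 1 (or equal shares), so repeating the step is harmless.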
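With predict_funding registered as a tool, run() now chains cleaning, feature extraction, funding prediction, and an optional CSV export before returning the processed DataFrame. A hypothetical call, assuming the constructor takes the DataFrame and model implied by self.df and self.model (the exact signature is not shown in this diff):

# Hypothetical usage; constructor arguments are assumed, not confirmed by the commit.
agent = DataSmolAgent(df=raw_df, model=some_model)
processed = agent.run(output_csv=True)  # also writes processed_output.csv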
Oracle/deepfundingoracle.py CHANGED
@@ -195,68 +195,6 @@ def fetch_github_features(df):
     def timeout_handler(signum, frame):
         raise TimeoutError("LLama model prediction timed out.")

-# def assign_base_weight(df, max_workers=32):
-#     """
-#     Assign base weights using LLama model in parallel.
-#     """
-#     print("[INFO] Starting base weight assignment using LLama model...", flush=True)
-#     logging.info("[INFO] Assigning base weights using LLama model...")
-#     start_time = time.time()
-#     llama = SmolLM()
-#     base_weights = []
-#     llm_cache = {}
-#
-#     # Prepare prompts for all repositories
-#     prompts = {}
-#     for idx, row in df.iterrows():
-#         repo = row.get("repo", "")
-#         parent = row.get("parent", "")
-#         stars = row.get("stars", 0)
-#         forks = row.get("forks", 0)
-#         watchers = row.get("watchers", 0)
-#         issues = row.get("open_issues", 0)
-#         pulls = row.get("pulls", 0)
-#         activity = row.get("activity", "")
-#         prompts[idx] = (
-#             f"Repository: {repo}\n"
-#             f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
-#             f"Parent or dependency: {parent}\n\n"
-#             "Based on these features, assign a dependency weight between 0 and 1 for the repository "
-#             "that reflects how influential the repository is as a source relative to its parent. "
-#             "Only output the numeric value."
-#         )
-#
-#     # Define the prediction function
-#     def _predict(idx, prompt):
-#         if idx in llm_cache:
-#             return idx, llm_cache[idx]
-#         try:
-#             resp = llama.predict(prompt)
-#             match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
-#             weight = min(max(float(match.group()), 0), 1) if match else 0.0
-#             llm_cache[idx] = weight
-#             return idx, weight
-#         except Exception as e:
-#             print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
-#             logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
-#             return idx, 0.0  # Default weight in case of failure
-#
-#     # Run predictions in parallel
-#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-#         futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
-#         for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
-#             idx, weight = fut.result()
-#             base_weights.append((idx, weight))
-#
-#     # Sort weights by index and assign to DataFrame
-#     base_weights.sort(key=lambda x: x[0])
-#     df["base_weight"] = [weight for _, weight in base_weights]
-#
-#     end_time = time.time()
-#     print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
-#     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
-#     return df
-
 def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     """
     Assign base weights using a single LLM call to determine feature weights,
@@ -324,6 +262,17 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     return df


+def normalize_funding(df):
+    """
+    Normalize funding weights for child repositories grouped by parent.
+    """
+    print("[INFO] Normalizing funding weights...", flush=True)
+    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
+        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
+    )
+    print("[INFO] Funding weights normalized successfully.", flush=True)
+    return df
+
 def prepare_dataset(file):
     print("[INFO] Starting dataset preparation...")
     start_time = time.time()
@@ -337,6 +286,8 @@ def prepare_dataset(file):
     print("[INFO] GitHub features fetched successfully.")
     print("[INFO] Assigning base weights using LLama model...")
     df = assign_base_weight(df)
+    df = train_predict_weight(df)
+    df = normalize_funding(df)
     end_time = time.time()
     print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
     return df
@@ -374,7 +325,7 @@ def train_predict_weight(df,
     y = df[target]

     # For regression, if a classification criterion is given, switch to 'mse'
-    reg_criterion = "mse" if criterion in ["gini", "entropy"] else criterion
+    reg_criterion = "squared_error" if criterion in ["gini", "entropy"] else criterion

     rf_model = RandomForestRegressor(random_state=42,
                                      criterion=reg_criterion,
@@ -399,38 +350,18 @@ def train_predict_weight(df,

     for parent, children in parent_map.items():
         group_idxs = df[df["parent"] == parent].index
-        # Check if a repo in the group is flagged as is_source
-        source_idxs = df.loc[group_idxs][df["is_source"] == True].index.tolist() if "is_source" in df.columns else []
-        if source_idxs:
-            parent_idx = source_idxs[0]
+        preds = df.loc[group_idxs, "rf_pred"]
+        total = preds.sum()
+        if total > 0:
+            normed = preds / total
         else:
-            # Fallback: choose the repo with the maximum prediction as the parent
-            preds = df.loc[group_idxs, "rf_pred"]
-            parent_idx = preds.idxmax()
-        child_idxs = [idx for idx in group_idxs if idx != parent_idx]
-        if child_idxs:
-            child_preds = df.loc[child_idxs, "rf_pred"]
-            if child_preds.max() > child_preds.min():
-                normed = (child_preds - child_preds.min()) / (child_preds.max() - child_preds.min() + 1e-8)
-            else:
-                normed = pd.Series([0.0] * len(child_idxs), index=child_idxs)
-            normed = normed * 0.99
-            for idx, val in zip(child_idxs, normed):
-                final_weights[idx] = val
-        final_weights[parent_idx] = 1.0
+            # If sum is zero, assign equal weights.
+            normed = pd.Series([1/len(preds)] * len(preds), index=preds.index)
+        for idx, weight in normed.items():
+            final_weights[idx] = weight

     df["final_weight"] = df.index.map(final_weights).fillna(0.0)

-    # Enforce monotonicity within each group so weights are descending
-    for parent, children in parent_map.items():
-        group_idxs = df[df["parent"] == parent].index
-        group_weights = df.loc[group_idxs, "final_weight"].sort_values(ascending=False)
-        prev = 1.0
-        for idx in group_weights.index:
-            if df.at[idx, "final_weight"] > prev:
-                df.at[idx, "final_weight"] = prev
-            prev = df.at[idx, "final_weight"]
-
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
     return df
@@ -453,4 +384,12 @@ def create_submission_csv(df, output_filename="submission.csv"):
 # This file now focuses solely on data processing and prediction.

 if __name__ == "__main__":
-    print("DeepFunding Oracle is now ready for backend processing.", flush=True)
+    input_file = "input.csv"  # Replace with the actual input file path
+    output_file = "submission.csv"
+
+    print("[INFO] Preparing dataset...")
+    df = prepare_dataset(input_file)
+
+    print("[INFO] Creating submission CSV...")
+    create_submission_csv(df, output_file)
+    print("[INFO] Process completed successfully.")
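normalize_funding applies the same per-parent transform that the rewritten group loop in train_predict_weight already performs, and shares that sum to 1 within a group pass through it unchanged; the extra pass mainly catches groups whose predictions were all zero (including rows defaulted by fillna(0.0)), converting them to equal shares. A quick check of the pass-through property on toy numbers:

import pandas as pd

# Weights already summing to 1 within a parent group survive a second pass intact.
df = pd.DataFrame({"parent": ["p", "p", "p"], "final_weight": [0.2, 0.3, 0.5]})
df["final_weight"] = df.groupby("parent")["final_weight"].transform(
    lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
)
print(df["final_weight"].tolist())  # [0.2, 0.3, 0.5] -- unchanged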
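Two changes here are worth flagging. The criterion fix tracks the scikit-learn API: "mse" was deprecated for RandomForestRegressor in scikit-learn 1.0 and removed in 1.2, so "squared_error" is the spelling current releases accept. A standalone check with synthetic stand-in data (not the project's GitHub features):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# "squared_error" is the supported criterion on scikit-learn >= 1.0;
# the removed alias "mse" raises a ValueError on 1.2+.
X = np.random.rand(50, 3)  # stand-in for the repository feature matrix
y = np.random.rand(50)     # stand-in for base weights
rf = RandomForestRegressor(random_state=42, criterion="squared_error")
rf.fit(X, y)
print(rf.predict(X[:2]))

The new __main__ block also makes the module runnable end to end: prepare_dataset now carries rows through feature fetching, LLM base weights, random-forest prediction, and per-parent normalization, and create_submission_csv writes the result; the input path remains a placeholder, as the in-diff comment notes.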