Spaces:

FelixPhilip
/

DeepFundingOracle

Sleeping

App Files Files Community

FelixPhilip committed on Apr 27

Commit

2424d59

1 Parent(s): ba26d2b

Oracle weight assigning update

Browse files

Files changed (1) hide show

Oracle/deepfundingoracle.py +22 -33

Oracle/deepfundingoracle.py CHANGED Viewed

@@ -233,9 +233,7 @@ def timeout_handler(signum, frame):
 #     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
 #     return df
-def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
     """
     Assign base weights using a single LLM call to determine feature weights,
     and programmatically calculate repository weights.
@@ -245,53 +243,45 @@ def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
     start_time = time.time()
     oracle = SmolLM()
-    # Step 1: Call LLM once to determine weights for each feature
     prompt = (
         "Can you Predict a weight in the range (0-1) for these github features such as stars,forks,watchers,open_issues,pulls,activity,contributors based on its importance in determining "
-        "the influence of a repository. Output ONLY a valid JSON object with keys as feature names and values as the predicted weights. "
-        "Do not include any explanation or extra text. here is an output example: \n"
-        '{\n'
-        '  "stars": 0.3,\n'
-        '  "forks": 0.2,\n'
-        '  "watchers": 0.2,\n'
-        '  "open_issues": 0.1,\n'
-        '  "pulls": 0.1,\n'
-        '  "activity": 0.05,\n'
-        '  "contributors": 0.05\n'
-        '}\n'
     )
-    feature_weights= None
     for attempt in range(llm_retries):
         try:
-            response = oracle.predict(prompt,max_length=512, max_new_tokens=150)
             if not response or not response.strip():
                 raise ValueError("Empty response from Oracle.")
-            feature_weights = json.loads(response)  # Safely parse JSON
             print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
             break
         except Exception as e:
             print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
             logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
             time.sleep(llm_delay)
-            # Fallback to default weights
     if feature_weights is None:
-            feature_weights = {
-                "stars": 0.3,
-                "forks": 0.2,
-                "watchers": 0.2,
-                "open_issues": 0.1,
-                "pulls": 0.1,
-                "activity": 0.05,
-                "contributors": 0.05
-            }
-            print(f"[INFO] Using default feature weights: {feature_weights}", flush=True)
-    # Step 2: Ensure all feature columns are numeric
     for feature in feature_weights.keys():
         if feature in df.columns:
             df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
-    # Step 3: Programmatically calculate weights for each repository
     def calculate_weight(row):
         weight = 0
         for feature, feature_weight in feature_weights.items():
@@ -300,8 +290,6 @@ def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
         return weight
     df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
-    # Step 4: Normalize weights per parent
     df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
         lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
     )
@@ -311,6 +299,7 @@ def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
 def prepare_dataset(file):
     print("[INFO] Starting dataset preparation...")
     start_time = time.time()

 #     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
 #     return df
+def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     """
     Assign base weights using a single LLM call to determine feature weights,
     and programmatically calculate repository weights.
     start_time = time.time()
     oracle = SmolLM()
     prompt = (
         "Can you Predict a weight in the range (0-1) for these github features such as stars,forks,watchers,open_issues,pulls,activity,contributors based on its importance in determining "
+        "the influence of a repository. Output the weights for each feature as text e.g.: "
+        'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
     )
+    feature_weights = None
     for attempt in range(llm_retries):
         try:
+            response = oracle.predict(prompt, max_length=512, max_new_tokens=150)
             if not response or not response.strip():
                 raise ValueError("Empty response from Oracle.")
+            matches = re.findall(
+                r'(stars|forks|watchers|open_issues|pulls|activity|contributors)\s*[:=]\s*([0-9]*\.?[0-9]+)',
+                response, re.IGNORECASE)
+            feature_weights = {k.lower(): float(v) for k, v in matches}
+            if not feature_weights or len(feature_weights) < 7:
+                raise ValueError("Could not extract all feature weights from response.")
             print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
             break
         except Exception as e:
             print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
             logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
             time.sleep(llm_delay)
     if feature_weights is None:
+        feature_weights = {
+            "stars": 0.3,
+            "forks": 0.2,
+            "watchers": 0.2,
+            "open_issues": 0.1,
+            "pulls": 0.1,
+            "activity": 0.05,
+            "contributors": 0.05
+        }
+        print(f"[INFO] Using default feature weights: {feature_weights}", flush=True)
     for feature in feature_weights.keys():
         if feature in df.columns:
             df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
     def calculate_weight(row):
         weight = 0
         for feature, feature_weight in feature_weights.items():
         return weight
     df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
     df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
         lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
     )
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
 def prepare_dataset(file):
     print("[INFO] Starting dataset preparation...")
     start_time = time.time()