FelixPhilip committed on
Commit
dc9d402
·
1 Parent(s): 3868d8d
Files changed (1) hide show
  1. Oracle/deepfundingoracle.py +33 -28
Oracle/deepfundingoracle.py CHANGED
@@ -20,6 +20,7 @@ import time
20
  import threading
21
  import logging
22
  import concurrent.futures
 
23
  import signal
24
  from tqdm import tqdm
25
  import sys
@@ -168,7 +169,10 @@ def fetch_github_features(df):
168
  def timeout_handler(signum, frame):
169
  raise TimeoutError("LLama model prediction timed out.")
170
 
171
- def assign_base_weight(df):
 
 
 
172
  print("[INFO] Starting base weight assignment using LLama model...", flush=True)
173
  logging.info("[INFO] Assigning base weights using LLama model...")
174
  start_time = time.time()
@@ -176,10 +180,10 @@ def assign_base_weight(df):
176
  base_weights = []
177
  llm_cache = {}
178
 
179
- for idx, row in tqdm(df.iterrows(), total=len(df), desc="Assigning weights"):
 
 
180
  repo = row.get("repo", "")
181
- print(f"[INFO] Assigning weight for repository {idx + 1}/{len(df)}: {repo}", flush=True)
182
- logging.info(f"[INFO] Processing repository {idx + 1}/{len(df)}: {repo}")
183
  parent = row.get("parent", "")
184
  stars = row.get("stars", 0)
185
  forks = row.get("forks", 0)
@@ -187,7 +191,7 @@ def assign_base_weight(df):
187
  issues = row.get("open_issues", 0)
188
  pulls = row.get("pulls", 0)
189
  activity = row.get("activity", "")
190
- prompt = (
191
  f"Repository: {repo}\n"
192
  f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
193
  f"Parent or dependency: {parent}\n\n"
@@ -195,32 +199,33 @@ def assign_base_weight(df):
195
  "that reflects how influential the repository is as a source relative to its parent. "
196
  "Only output the numeric value."
197
  )
 
 
 
 
 
198
  try:
199
- if repo in llm_cache:
200
- weight = llm_cache[repo]
201
- else:
202
- print(f"[INFO] Sending prompt to LLama model for repo: {repo}", flush=True)
203
- start_llama_time = time.time()
204
- response = llama.predict(prompt)
205
- # Use regex to extract the first valid float from the response
206
- match = re.search(r"[-+]?\d*\.\d+|\d+", response)
207
- if match:
208
- weight = float(match.group())
209
- weight = min(max(weight, 0), 1)
210
- else:
211
- raise ValueError(f"No valid float found in response: {response}")
212
- end_llama_time = time.time()
213
- print(f"[INFO] Received weight {weight} for {repo} in {end_llama_time - start_llama_time:.2f} seconds.", flush=True)
214
- logging.info(f"[INFO] Processed repository {repo} in {end_llama_time - start_llama_time:.2f} seconds. Weight: {weight}")
215
- llm_cache[repo] = weight
216
  except Exception as e:
217
- print(f"[ERROR] Failed to process repository {repo}: {e}", flush=True)
218
- logging.error(f"[ERROR] Failed to process repository {repo}: {e}")
219
- weight = 0.0 # Default weight in case of failure (set to 0 for no work)
220
- base_weights.append(weight)
221
- print(f"[PROGRESS] Finished {idx + 1}/{len(df)} repositories.", flush=True)
 
 
 
 
 
 
 
 
 
222
 
223
- df["base_weight"] = base_weights
224
  end_time = time.time()
225
  print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
226
  logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
 
20
  import threading
21
  import logging
22
  import concurrent.futures
23
+ from concurrent.futures import ThreadPoolExecutor
24
  import signal
25
  from tqdm import tqdm
26
  import sys
 
169
  def timeout_handler(signum, frame):
170
  raise TimeoutError("LLama model prediction timed out.")
171
 
172
+ def assign_base_weight(df, max_workers=8):
173
+ """
174
+ Assign base weights using LLama model in parallel.
175
+ """
176
  print("[INFO] Starting base weight assignment using LLama model...", flush=True)
177
  logging.info("[INFO] Assigning base weights using LLama model...")
178
  start_time = time.time()
 
180
  base_weights = []
181
  llm_cache = {}
182
 
183
+ # Prepare prompts for all repositories
184
+ prompts = {}
185
+ for idx, row in df.iterrows():
186
  repo = row.get("repo", "")
 
 
187
  parent = row.get("parent", "")
188
  stars = row.get("stars", 0)
189
  forks = row.get("forks", 0)
 
191
  issues = row.get("open_issues", 0)
192
  pulls = row.get("pulls", 0)
193
  activity = row.get("activity", "")
194
+ prompts[idx] = (
195
  f"Repository: {repo}\n"
196
  f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
197
  f"Parent or dependency: {parent}\n\n"
 
199
  "that reflects how influential the repository is as a source relative to its parent. "
200
  "Only output the numeric value."
201
  )
202
+
203
+ # Define the prediction function
204
+ def _predict(idx, prompt):
205
+ if idx in llm_cache:
206
+ return idx, llm_cache[idx]
207
  try:
208
+ resp = llama.predict(prompt)
209
+ match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
210
+ weight = min(max(float(match.group()), 0), 1) if match else 0.0
211
+ llm_cache[idx] = weight
212
+ return idx, weight
 
 
 
 
 
 
 
 
 
 
 
 
213
  except Exception as e:
214
+ print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
215
+ logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
216
+ return idx, 0.0 # Default weight in case of failure
217
+
218
+ # Run predictions in parallel
219
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
220
+ futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
221
+ for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
222
+ idx, weight = fut.result()
223
+ base_weights.append((idx, weight))
224
+
225
+ # Sort weights by index and assign to DataFrame
226
+ base_weights.sort(key=lambda x: x[0])
227
+ df["base_weight"] = [weight for _, weight in base_weights]
228
 
 
229
  end_time = time.time()
230
  print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
231
  logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")