FelixPhilip committed on
Commit
57ca96a
·
1 Parent(s): 52a14c1

Oracle weight assigning update

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
.idea/DeepFundingOracle.iml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/.venv" />
6
+ </content>
7
+ <orderEntry type="jdk" jdkName="Python 3.11 (DeepFundingOracle)" jdkType="Python SDK" />
8
+ <orderEntry type="sourceFolder" forTests="false" />
9
+ </component>
10
+ <component name="PyDocumentationSettings">
11
+ <option name="format" value="GOOGLE" />
12
+ <option name="myDocStringFormat" value="Google" />
13
+ </component>
14
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Python 3.11 (DeepFundingOracle)" />
5
+ </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (DeepFundingOracle)" project-jdk-type="Python SDK" />
7
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/DeepFundingOracle.iml" filepath="$PROJECT_DIR$/.idea/DeepFundingOracle.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
Oracle/deepfundingoracle.py CHANGED
@@ -169,62 +169,115 @@ def fetch_github_features(df):
169
  def timeout_handler(signum, frame):
170
  raise TimeoutError("LLama model prediction timed out.")
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  def assign_base_weight(df, max_workers=32):
173
  """
174
- Assign base weights using LLama model in parallel.
 
175
  """
176
- print("[INFO] Starting base weight assignment using LLama model...", flush=True)
177
- logging.info("[INFO] Assigning base weights using LLama model...")
178
  start_time = time.time()
179
  llama = SmolLM()
180
- base_weights = []
181
- llm_cache = {}
182
 
183
- # Prepare prompts for all repositories
184
- prompts = {}
185
- for idx, row in df.iterrows():
186
- repo = row.get("repo", "")
187
- parent = row.get("parent", "")
188
- stars = row.get("stars", 0)
189
- forks = row.get("forks", 0)
190
- watchers = row.get("watchers", 0)
191
- issues = row.get("open_issues", 0)
192
- pulls = row.get("pulls", 0)
193
- activity = row.get("activity", "")
194
- prompts[idx] = (
195
- f"Repository: {repo}\n"
196
- f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
197
- f"Parent or dependency: {parent}\n\n"
198
- "Based on these features, assign a dependency weight between 0 and 1 for the repository "
199
- "that reflects how influential the repository is as a source relative to its parent. "
200
- "Only output the numeric value."
201
- )
 
 
 
202
 
203
- # Define the prediction function
204
- def _predict(idx, prompt):
205
- if idx in llm_cache:
206
- return idx, llm_cache[idx]
207
- try:
208
- resp = llama.predict(prompt)
209
- match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
210
- weight = min(max(float(match.group()), 0), 1) if match else 0.0
211
- llm_cache[idx] = weight
212
- return idx, weight
213
- except Exception as e:
214
- print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
215
- logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
216
- return idx, 0.0 # Default weight in case of failure
217
 
218
- # Run predictions in parallel
219
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
220
- futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
221
- for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
222
- idx, weight = fut.result()
223
- base_weights.append((idx, weight))
224
 
225
- # Sort weights by index and assign to DataFrame
226
- base_weights.sort(key=lambda x: x[0])
227
- df["base_weight"] = [weight for _, weight in base_weights]
 
228
 
229
  end_time = time.time()
230
  print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
 
169
  def timeout_handler(signum, frame):
170
  raise TimeoutError("LLama model prediction timed out.")
171
 
172
+ # def assign_base_weight(df, max_workers=32):
173
+ # """
174
+ # Assign base weights using LLama model in parallel.
175
+ # """
176
+ # print("[INFO] Starting base weight assignment using LLama model...", flush=True)
177
+ # logging.info("[INFO] Assigning base weights using LLama model...")
178
+ # start_time = time.time()
179
+ # llama = SmolLM()
180
+ # base_weights = []
181
+ # llm_cache = {}
182
+ #
183
+ # # Prepare prompts for all repositories
184
+ # prompts = {}
185
+ # for idx, row in df.iterrows():
186
+ # repo = row.get("repo", "")
187
+ # parent = row.get("parent", "")
188
+ # stars = row.get("stars", 0)
189
+ # forks = row.get("forks", 0)
190
+ # watchers = row.get("watchers", 0)
191
+ # issues = row.get("open_issues", 0)
192
+ # pulls = row.get("pulls", 0)
193
+ # activity = row.get("activity", "")
194
+ # prompts[idx] = (
195
+ # f"Repository: {repo}\n"
196
+ # f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
197
+ # f"Parent or dependency: {parent}\n\n"
198
+ # "Based on these features, assign a dependency weight between 0 and 1 for the repository "
199
+ # "that reflects how influential the repository is as a source relative to its parent. "
200
+ # "Only output the numeric value."
201
+ # )
202
+ #
203
+ # # Define the prediction function
204
+ # def _predict(idx, prompt):
205
+ # if idx in llm_cache:
206
+ # return idx, llm_cache[idx]
207
+ # try:
208
+ # resp = llama.predict(prompt)
209
+ # match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
210
+ # weight = min(max(float(match.group()), 0), 1) if match else 0.0
211
+ # llm_cache[idx] = weight
212
+ # return idx, weight
213
+ # except Exception as e:
214
+ # print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
215
+ # logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
216
+ # return idx, 0.0 # Default weight in case of failure
217
+ #
218
+ # # Run predictions in parallel
219
+ # with ThreadPoolExecutor(max_workers=max_workers) as executor:
220
+ # futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
221
+ # for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
222
+ # idx, weight = fut.result()
223
+ # base_weights.append((idx, weight))
224
+ #
225
+ # # Sort weights by index and assign to DataFrame
226
+ # base_weights.sort(key=lambda x: x[0])
227
+ # df["base_weight"] = [weight for _, weight in base_weights]
228
+ #
229
+ # end_time = time.time()
230
+ # print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
231
+ # logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
232
+ # return df
233
+
234
def assign_base_weight(df, max_workers=32):
    """
    Assign base weights to repositories using a single LLM call.

    Asks the LLM once for per-feature importance weights, then computes each
    repository's raw weight programmatically as a weighted sum of its GitHub
    metrics, and finally min-max normalizes the result within each parent
    group.

    Args:
        df: DataFrame with one row per repository. Expected columns include
            the GitHub metric features (stars, forks, watchers, open_issues,
            pulls, activity, contributors) and "parent".
        max_workers: Unused in this single-call approach; kept for backward
            compatibility with the previous parallel implementation.

    Returns:
        The same DataFrame with "base_weight_raw" and "base_weight" columns
        added, or unchanged if the LLM call fails.
    """
    import json  # local import: parse the LLM response safely instead of eval()

    print("[INFO] Starting optimized base weight assignment...", flush=True)
    logging.info("[INFO] Assigning base weights using optimized approach...")
    start_time = time.time()
    llama = SmolLM()

    # Step 1: Call LLM once to determine weights for each feature
    prompt = (
        "The following are GitHub repository features:\n"
        "- Stars\n"
        "- Forks\n"
        "- Watchers\n"
        "- Open Issues\n"
        "- Pull Requests\n"
        "- Activity (days since last update)\n"
        "- Contributors\n\n"
        "Assign a weight (0-1) to each feature based on its importance in determining "
        "the influence of a repository. Provide the weights as a JSON object with "
        "keys as feature names and values as their weights."
    )
    try:
        response = llama.predict(prompt)
        # SECURITY: never eval() model output. Extract the first {...} span
        # (models often wrap JSON in prose or code fences) and parse it.
        match = re.search(r"\{.*\}", response, re.DOTALL)
        if not match:
            raise ValueError(f"No JSON object found in LLM response: {response!r}")
        raw_weights = json.loads(match.group())
        # Canonicalize LLM keys ("Pull Requests", "Activity (days since last
        # update)") to the DataFrame's snake_case column names so the lookup
        # in calculate_weight() actually matches.
        def _canonical(name):
            key = re.sub(r"\(.*?\)", "", str(name)).strip().lower().replace(" ", "_")
            return {"pull_requests": "pulls"}.get(key, key)
        feature_weights = {_canonical(k): v for k, v in raw_weights.items()}
        print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
    except Exception as e:
        # Best-effort: on any LLM/parsing failure, leave df unmodified.
        print(f"[ERROR] Failed to fetch feature weights from LLM: {e}", flush=True)
        logging.error(f"[ERROR] Failed to fetch feature weights from LLM: {e}")
        return df

    # Step 2: Programmatically calculate weights for each repository
    def calculate_weight(row):
        # Weighted sum over whichever weighted features are present and non-NaN.
        weight = 0
        for feature, feature_weight in feature_weights.items():
            if feature in row and pd.notna(row[feature]):
                weight += row[feature] * feature_weight
        return weight

    df["base_weight_raw"] = df.apply(calculate_weight, axis=1)

    # Step 3: Normalize weights per parent (min-max within each parent group).
    # NOTE(review): if all siblings share the same raw weight they all
    # normalize to 0 — preserved from the original formula.
    df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
    )

    end_time = time.time()
    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
    return df