Spaces:

FelixPhilip
/

DeepFundingOracle

Sleeping

App Files Files Community

FelixPhilip committed on May 4

Commit

386c440

1 Parent(s): 17c5050

Oracle

Browse files

Files changed (1) hide show

Oracle/deepfundingoracle.py +31 -16

Oracle/deepfundingoracle.py CHANGED Viewed

@@ -55,7 +55,6 @@ logging.basicConfig(
 def fetch_repo_metrics(repo_url):
     """
     Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
-    Assumes repo_url is in the form "https://github.com/owner/repo".
     """
     try:
         # Extract owner and repo name
@@ -71,7 +70,9 @@ def fetch_repo_metrics(repo_url):
         r = requests.get(api_url, headers=headers)
         if r.status_code == 200:
             data = r.json()
-            pulls_url = data.get("pulls_url", "").replace("{\/*state}", "")
             pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
             activity = data.get("updated_at", "")
             return {
@@ -86,8 +87,10 @@ def fetch_repo_metrics(repo_url):
                 "token": token
             }
         else:
             return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
-    except Exception:
         return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
@@ -132,8 +135,10 @@ def fetch_github_features(df):
     def get_metrics(repo_url):
         if repo_url in cache:
             return cache[repo_url]
         val = fetch_repo_metrics(repo_url)
         try:
             m = re.search(r"github\.com/([^/]+)/([^/]+)",repo_url)
             if m:
@@ -209,8 +214,9 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     oracle = SmolLM()
     prompt = (
-        "Can you Predict a weight in the range (0-1) for these github features such as stars,forks,watchers,open_issues,pulls,activity,contributors based on its importance in determining "
-        "the influence of a repository. Output the weights for each feature as text e.g.: "
         'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
     )
     feature_weights = None
@@ -231,18 +237,14 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
             print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
             logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
             time.sleep(llm_delay)
     if feature_weights is None:
-        feature_weights = {
-            "stars": 0.3,
-            "forks": 0.2,
-            "watchers": 0.2,
-            "open_issues": 0.1,
-            "pulls": 0.1,
-            "activity": 0.05,
-            "contributors": 0.05
-        }
-        print(f"[INFO] Using default feature weights: {feature_weights}", flush=True)
     for feature in feature_weights.keys():
         if feature in df.columns:
             df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
@@ -264,6 +266,19 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
 def sanity_check_weights(df):
     """
     Sanity-checks LLM weights by comparing them with other metrics.
@@ -383,7 +398,7 @@ def validate_target(df):
     variance = df[target].var()
     print(f"[DEBUG] Target variable variance: {variance}")
     if variance < 1e-6:
-        raise ValueError(f"Target variable '{target}' has insufficient variance.")
     return df

 def fetch_repo_metrics(repo_url):
     """
     Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
     """
     try:
         # Extract owner and repo name
         r = requests.get(api_url, headers=headers)
         if r.status_code == 200:
             data = r.json()
+            # Log fetched data for debugging
+            print(f"[DEBUG] Fetched data for {repo_url}: {data}")
+            pulls_url = data.get("pulls_url", "").replace("{/state}", "")
             pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
             activity = data.get("updated_at", "")
             return {
                 "token": token
             }
         else:
+            print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
             return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
+    except Exception as e:
+        print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
         return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
     def get_metrics(repo_url):
         if repo_url in cache:
+            print(f"[DEBUG] Cached data for {repo_url}: {cache[repo_url]}")
             return cache[repo_url]
         val = fetch_repo_metrics(repo_url)
+        print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}")  # <-- Add this line
         try:
             m = re.search(r"github\.com/([^/]+)/([^/]+)",repo_url)
             if m:
     oracle = SmolLM()
     prompt = (
+        "Can you Predict a weight in the range (0-1) for these GitHub features such as stars, forks, watchers, "
+        "open_issues, pulls, activity, contributors based on their importance in determining the influence of a repository? "
+        "Output the weights for each feature as text e.g.: "
         'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
     )
     feature_weights = None
             print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
             logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
             time.sleep(llm_delay)
+    # Fallback mechanism: Calculate feature weights dynamically if LLM fails
     if feature_weights is None:
+        print("[WARN] LLM failed to provide feature weights. Calculating fallback weights dynamically.")
+        feature_weights = calculate_fallback_weights(df)
+        print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
+    # Ensure numeric columns are properly formatted
     for feature in feature_weights.keys():
         if feature in df.columns:
             df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
def calculate_fallback_weights(df):
    """
    Compute fallback feature weights from the variance of each numeric column.

    Every numeric column of ``df`` receives a weight equal to its share of the
    total sample variance, so the returned weights sum to 1. Only variance is
    used; no correlation with any target column is computed.

    Degenerate inputs are handled explicitly:
    - A column whose variance is NaN (e.g. a single-row frame or an all-NaN
      column) is treated as having zero variance instead of yielding a NaN
      weight.
    - When the total variance is zero or not finite, every numeric column gets
      the same weight instead of the function returning an empty mapping.
    - When ``df`` has no numeric columns, an empty dict is returned.

    Args:
        df: pandas DataFrame holding the candidate feature columns.

    Returns:
        dict mapping numeric column name -> float weight.
    """
    print("[INFO] Calculating fallback feature weights...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return {}

    # NaN variance carries no ranking information; count it as zero so it
    # cannot poison the total or produce NaN weights.
    feature_variances = df[numeric_cols].var().fillna(0.0)
    total_variance = feature_variances.sum()

    # Nothing to rank by: fall back to equal weights rather than an empty
    # mapping, which would leave the caller without any features.
    if not np.isfinite(total_variance) or total_variance <= 0:
        uniform = 1.0 / len(numeric_cols)
        return {col: uniform for col in numeric_cols}

    # Assign weights proportional to feature variance
    return {
        col: float(var / total_variance)
        for col, var in feature_variances.items()
    }
 def sanity_check_weights(df):
     """
     Sanity-checks LLM weights by comparing them with other metrics.
     variance = df[target].var()
     print(f"[DEBUG] Target variable variance: {variance}")
     if variance < 1e-6:
+        raise ValueError(f"Target variable '{target}' has insufficient variance. Please check feature values.")
     return df