Spaces:
Running
Running
Commit
·
386c440
1
Parent(s):
17c5050
Oracle
Browse files- Oracle/deepfundingoracle.py +31 -16
Oracle/deepfundingoracle.py
CHANGED
@@ -55,7 +55,6 @@ logging.basicConfig(
|
|
55 |
def fetch_repo_metrics(repo_url):
|
56 |
"""
|
57 |
Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
|
58 |
-
Assumes repo_url is in the form "https://github.com/owner/repo".
|
59 |
"""
|
60 |
try:
|
61 |
# Extract owner and repo name
|
@@ -71,7 +70,9 @@ def fetch_repo_metrics(repo_url):
|
|
71 |
r = requests.get(api_url, headers=headers)
|
72 |
if r.status_code == 200:
|
73 |
data = r.json()
|
74 |
-
|
|
|
|
|
75 |
pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
|
76 |
activity = data.get("updated_at", "")
|
77 |
return {
|
@@ -86,8 +87,10 @@ def fetch_repo_metrics(repo_url):
|
|
86 |
"token": token
|
87 |
}
|
88 |
else:
|
|
|
89 |
return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
|
90 |
-
except Exception:
|
|
|
91 |
return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
|
92 |
|
93 |
|
@@ -132,8 +135,10 @@ def fetch_github_features(df):
|
|
132 |
|
133 |
def get_metrics(repo_url):
|
134 |
if repo_url in cache:
|
|
|
135 |
return cache[repo_url]
|
136 |
val = fetch_repo_metrics(repo_url)
|
|
|
137 |
try:
|
138 |
m = re.search(r"github\.com/([^/]+)/([^/]+)",repo_url)
|
139 |
if m:
|
@@ -209,8 +214,9 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
|
|
209 |
oracle = SmolLM()
|
210 |
|
211 |
prompt = (
|
212 |
-
"Can you Predict a weight in the range (0-1) for these
|
213 |
-
"
|
|
|
214 |
'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
|
215 |
)
|
216 |
feature_weights = None
|
@@ -231,18 +237,14 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
|
|
231 |
print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
|
232 |
logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
|
233 |
time.sleep(llm_delay)
|
|
|
|
|
234 |
if feature_weights is None:
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
"watchers": 0.2,
|
239 |
-
"open_issues": 0.1,
|
240 |
-
"pulls": 0.1,
|
241 |
-
"activity": 0.05,
|
242 |
-
"contributors": 0.05
|
243 |
-
}
|
244 |
-
print(f"[INFO] Using default feature weights: {feature_weights}", flush=True)
|
245 |
|
|
|
246 |
for feature in feature_weights.keys():
|
247 |
if feature in df.columns:
|
248 |
df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
|
@@ -264,6 +266,19 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
|
|
264 |
logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
|
265 |
return df
|
266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
def sanity_check_weights(df):
|
268 |
"""
|
269 |
Sanity-checks LLM weights by comparing them with other metrics.
|
@@ -383,7 +398,7 @@ def validate_target(df):
|
|
383 |
variance = df[target].var()
|
384 |
print(f"[DEBUG] Target variable variance: {variance}")
|
385 |
if variance < 1e-6:
|
386 |
-
raise ValueError(f"Target variable '{target}' has insufficient variance.")
|
387 |
return df
|
388 |
|
389 |
|
|
|
55 |
def fetch_repo_metrics(repo_url):
|
56 |
"""
|
57 |
Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
|
|
|
58 |
"""
|
59 |
try:
|
60 |
# Extract owner and repo name
|
|
|
70 |
r = requests.get(api_url, headers=headers)
|
71 |
if r.status_code == 200:
|
72 |
data = r.json()
|
73 |
+
# Log fetched data for debugging
|
74 |
+
print(f"[DEBUG] Fetched data for {repo_url}: {data}")
|
75 |
+
pulls_url = data.get("pulls_url", "").replace("{/state}", "")
|
76 |
pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
|
77 |
activity = data.get("updated_at", "")
|
78 |
return {
|
|
|
87 |
"token": token
|
88 |
}
|
89 |
else:
|
90 |
+
print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
|
91 |
return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
|
92 |
+
except Exception as e:
|
93 |
+
print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
|
94 |
return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
|
95 |
|
96 |
|
|
|
135 |
|
136 |
def get_metrics(repo_url):
|
137 |
if repo_url in cache:
|
138 |
+
print(f"[DEBUG] Cached data for {repo_url}: {cache[repo_url]}")
|
139 |
return cache[repo_url]
|
140 |
val = fetch_repo_metrics(repo_url)
|
141 |
+
print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}") # <-- Add this line
|
142 |
try:
|
143 |
m = re.search(r"github\.com/([^/]+)/([^/]+)",repo_url)
|
144 |
if m:
|
|
|
214 |
oracle = SmolLM()
|
215 |
|
216 |
prompt = (
|
217 |
+
"Can you Predict a weight in the range (0-1) for these GitHub features such as stars, forks, watchers, "
|
218 |
+
"open_issues, pulls, activity, contributors based on their importance in determining the influence of a repository? "
|
219 |
+
"Output the weights for each feature as text e.g.: "
|
220 |
'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
|
221 |
)
|
222 |
feature_weights = None
|
|
|
237 |
print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
|
238 |
logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
|
239 |
time.sleep(llm_delay)
|
240 |
+
|
241 |
+
# Fallback mechanism: Calculate feature weights dynamically if LLM fails
|
242 |
if feature_weights is None:
|
243 |
+
print("[WARN] LLM failed to provide feature weights. Calculating fallback weights dynamically.")
|
244 |
+
feature_weights = calculate_fallback_weights(df)
|
245 |
+
print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
|
247 |
+
# Ensure numeric columns are properly formatted
|
248 |
for feature in feature_weights.keys():
|
249 |
if feature in df.columns:
|
250 |
df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
|
|
|
266 |
logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
|
267 |
return df
|
268 |
|
269 |
+
def calculate_fallback_weights(df):
    """
    Calculate fallback feature weights proportional to each numeric column's variance.

    Called when the LLM does not supply feature weights. Only variance is used;
    no correlation with any target column is computed here.

    Args:
        df: pandas DataFrame holding the candidate feature columns. Every column
            with a numeric dtype is treated as a feature.

    Returns:
        dict mapping each numeric column name to a float weight. When the total
        variance is positive the weights are variance / total_variance and sum
        to 1. When there is no usable variance (constant columns, or too few
        rows for a variance to be defined) every numeric column receives the
        same weight 1 / n_columns. An empty dict is returned only when the
        frame has no numeric columns at all.
    """
    print("[INFO] Calculating fallback feature weights...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return {}

    # var() is NaN for a column with fewer than two non-null values; count that
    # as "no spread" so a single NaN cannot poison the resulting weights.
    feature_variances = df[numeric_cols].var().fillna(0.0)
    total_variance = feature_variances.sum()

    # No usable variance signal: fall back to uniform weights instead of
    # returning an empty mapping, which would leave the caller with no features.
    if not np.isfinite(total_variance) or total_variance <= 0:
        uniform_weight = 1.0 / len(numeric_cols)
        return {col: uniform_weight for col in numeric_cols}

    # Assign weights proportional to feature variance
    return {col: float(var / total_variance) for col, var in feature_variances.items()}
|
281 |
+
|
282 |
def sanity_check_weights(df):
|
283 |
"""
|
284 |
Sanity-checks LLM weights by comparing them with other metrics.
|
|
|
398 |
variance = df[target].var()
|
399 |
print(f"[DEBUG] Target variable variance: {variance}")
|
400 |
if variance < 1e-6:
|
401 |
+
raise ValueError(f"Target variable '{target}' has insufficient variance. Please check feature values.")
|
402 |
return df
|
403 |
|
404 |
|