Spaces:
Sleeping
Sleeping
Commit
·
2424d59
1
Parent(s):
ba26d2b
Oracle weight assigning update
Browse files
- Oracle/deepfundingoracle.py +22 -33
Oracle/deepfundingoracle.py
CHANGED
@@ -233,9 +233,7 @@ def timeout_handler(signum, frame):
|
|
233 |
# logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
|
234 |
# return df
|
235 |
|
236 |
-
|
237 |
-
|
238 |
-
def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
|
239 |
"""
|
240 |
Assign base weights using a single LLM call to determine feature weights,
|
241 |
and programmatically calculate repository weights.
|
@@ -245,53 +243,45 @@ def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
|
|
245 |
start_time = time.time()
|
246 |
oracle = SmolLM()
|
247 |
|
248 |
-
# Step 1: Call LLM once to determine weights for each feature
|
249 |
prompt = (
|
250 |
"Can you Predict a weight in the range (0-1) for these github features such as stars,forks,watchers,open_issues,pulls,activity,contributors based on its importance in determining "
|
251 |
-
"the influence of a repository. Output
|
252 |
-
|
253 |
-
'{\n'
|
254 |
-
' "stars": 0.3,\n'
|
255 |
-
' "forks": 0.2,\n'
|
256 |
-
' "watchers": 0.2,\n'
|
257 |
-
' "open_issues": 0.1,\n'
|
258 |
-
' "pulls": 0.1,\n'
|
259 |
-
' "activity": 0.05,\n'
|
260 |
-
' "contributors": 0.05\n'
|
261 |
-
'}\n'
|
262 |
)
|
263 |
-
feature_weights= None
|
264 |
for attempt in range(llm_retries):
|
265 |
try:
|
266 |
-
response = oracle.predict(prompt,max_length=512, max_new_tokens=150)
|
267 |
if not response or not response.strip():
|
268 |
raise ValueError("Empty response from Oracle.")
|
269 |
-
|
|
|
|
|
|
|
|
|
|
|
270 |
print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
|
271 |
break
|
272 |
except Exception as e:
|
273 |
print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
|
274 |
logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
|
275 |
time.sleep(llm_delay)
|
276 |
-
# Fallback to default weights
|
277 |
if feature_weights is None:
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
|
289 |
-
# Step 2: Ensure all feature columns are numeric
|
290 |
for feature in feature_weights.keys():
|
291 |
if feature in df.columns:
|
292 |
df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
|
293 |
|
294 |
-
# Step 3: Programmatically calculate weights for each repository
|
295 |
def calculate_weight(row):
|
296 |
weight = 0
|
297 |
for feature, feature_weight in feature_weights.items():
|
@@ -300,8 +290,6 @@ def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
|
|
300 |
return weight
|
301 |
|
302 |
df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
|
303 |
-
|
304 |
-
# Step 4: Normalize weights per parent
|
305 |
df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
|
306 |
lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
|
307 |
)
|
@@ -311,6 +299,7 @@ def assign_base_weight(df, max_workers=32, llm_retries=2,llm_delay=0):
|
|
311 |
logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
|
312 |
return df
|
313 |
|
|
|
314 |
def prepare_dataset(file):
|
315 |
print("[INFO] Starting dataset preparation...")
|
316 |
start_time = time.time()
|
|
|
233 |
# logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
|
234 |
# return df
|
235 |
|
236 |
+
def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
|
|
|
|
|
237 |
"""
|
238 |
Assign base weights using a single LLM call to determine feature weights,
|
239 |
and programmatically calculate repository weights.
|
|
|
243 |
start_time = time.time()
|
244 |
oracle = SmolLM()
|
245 |
|
|
|
246 |
prompt = (
|
247 |
"Can you Predict a weight in the range (0-1) for these github features such as stars,forks,watchers,open_issues,pulls,activity,contributors based on its importance in determining "
|
248 |
+
"the influence of a repository. Output the weights for each feature as text e.g.: "
|
249 |
+
'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
)
|
251 |
+
feature_weights = None
|
252 |
for attempt in range(llm_retries):
|
253 |
try:
|
254 |
+
response = oracle.predict(prompt, max_length=512, max_new_tokens=150)
|
255 |
if not response or not response.strip():
|
256 |
raise ValueError("Empty response from Oracle.")
|
257 |
+
matches = re.findall(
|
258 |
+
r'(stars|forks|watchers|open_issues|pulls|activity|contributors)\s*[:=]\s*([0-9]*\.?[0-9]+)',
|
259 |
+
response, re.IGNORECASE)
|
260 |
+
feature_weights = {k.lower(): float(v) for k, v in matches}
|
261 |
+
if not feature_weights or len(feature_weights) < 7:
|
262 |
+
raise ValueError("Could not extract all feature weights from response.")
|
263 |
print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
|
264 |
break
|
265 |
except Exception as e:
|
266 |
print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
|
267 |
logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
|
268 |
time.sleep(llm_delay)
|
|
|
269 |
if feature_weights is None:
|
270 |
+
feature_weights = {
|
271 |
+
"stars": 0.3,
|
272 |
+
"forks": 0.2,
|
273 |
+
"watchers": 0.2,
|
274 |
+
"open_issues": 0.1,
|
275 |
+
"pulls": 0.1,
|
276 |
+
"activity": 0.05,
|
277 |
+
"contributors": 0.05
|
278 |
+
}
|
279 |
+
print(f"[INFO] Using default feature weights: {feature_weights}", flush=True)
|
280 |
|
|
|
281 |
for feature in feature_weights.keys():
|
282 |
if feature in df.columns:
|
283 |
df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
|
284 |
|
|
|
285 |
def calculate_weight(row):
|
286 |
weight = 0
|
287 |
for feature, feature_weight in feature_weights.items():
|
|
|
290 |
return weight
|
291 |
|
292 |
df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
|
|
|
|
|
293 |
df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
|
294 |
lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
|
295 |
)
|
|
|
299 |
logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
|
300 |
return df
|
301 |
|
302 |
+
|
303 |
def prepare_dataset(file):
|
304 |
print("[INFO] Starting dataset preparation...")
|
305 |
start_time = time.time()
|