""" | |
DeepFunding Oracle: | |
This script dynamically loads dependency data and for each repository URL: | |
• Fetches GitHub features (stars, forks, watchers, open issues, pull requests, activity) using the GitHub API. | |
• Uses the LLama model to analyze parent-child behavior (based on the fetched features and parent info) | |
and returns a base weight (0-1) for the repository. | |
• Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight. | |
The output submission CSV has three columns: repo, parent, and final_weight. | |
""" | |
from io import StringIO
import os
import warnings
import csv
import re
import json
import requests
import numpy as np
import pandas as pd
import time
import threading
import logging
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import signal
from tqdm import tqdm
import sys

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from Oracle.SmolLM import SmolLM

warnings.filterwarnings("ignore")
# Configure logging to file and console
logging.basicConfig(
    handlers=[
        logging.FileHandler("deepfundingoracle.log"),
        logging.StreamHandler(sys.stdout)
    ],
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
##############################
# Enhanced GitHub API helper: Fetch repository metrics
##############################
def fetch_repo_metrics(repo_url):
    """
    Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
    Assumes repo_url is in the form "https://github.com/owner/repo".
    """
    # Default result returned when the URL cannot be parsed or the API call fails.
    default = {
        "stargazers_count": 0, "forks_count": 0, "watchers_count": 0,
        "open_issues_count": 0, "pulls_count": 0, "activity": 0,
        "owner": "", "repo_name": "", "token": ""
    }
    try:
        # Extract owner and repo name
        m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
        if not m:
            return default
        owner, repo_name = m.group(1), m.group(2)
        api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
        headers = {}
        token = os.environ.get("GITHUB_API_TOKEN", "")
        if token:
            headers["Authorization"] = f"token {token}"
        r = requests.get(api_url, headers=headers)
        if r.status_code == 200:
            data = r.json()
            # pulls_url is a URI template such as ".../pulls{/number}"; strip the template suffix.
            pulls_url = data.get("pulls_url", "").replace("{/number}", "")
            # Note: this counts only the first page of pull requests returned by the API.
            pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
            activity = data.get("updated_at", "")
            return {
                "stargazers_count": data.get("stargazers_count", 0),
                "forks_count": data.get("forks_count", 0),
                "watchers_count": data.get("watchers_count", 0),
                "open_issues_count": data.get("open_issues_count", 0),
                "pulls_count": pulls_count,
                "activity": activity,
                "owner": owner,
                "repo_name": repo_name,
                "token": token
            }
        else:
            return default
    except Exception:
        return default
##############################
# Enhanced Feature Extraction
##############################
def load_data(file):
    """
    Dynamically load the dependency data CSV from the uploaded file.
    Expects at least "repo" and "parent" columns.
    """
    try:
        print("[INFO] Loading data from uploaded file...")
        start_time = time.time()
        # Read the uploaded file directly into a DataFrame
        df = pd.read_csv(file)
        end_time = time.time()
        print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
        return df
    except Exception as e:
        print("[ERROR] Error loading data:", e)
        return None
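
# Illustrative input shape only (the URLs below are placeholders, not data from this project):
# the uploaded CSV is expected to look roughly like
#
#     repo,parent
#     https://github.com/some-owner/some-repo,https://github.com/some-owner/parent-repo
#
# Only the "repo" and "parent" columns are required; any extra columns are carried through.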
def fetch_github_features(df):
    """
    For each row, using the repo URL, call the GitHub API to fetch:
    stars, forks, watchers, open issues, pull requests, activity, and contributors count.
    Adds these as new columns to the DataFrame.
    """
    print("[INFO] Fetching GitHub features for repositories...")
    start_time = time.time()
    cache = {}

    def get_metrics(repo_url):
        if repo_url in cache:
            return cache[repo_url]
        val = fetch_repo_metrics(repo_url)
        cache[repo_url] = val
        return val

    def get_contributors_count(res):
        # Count contributors; without pagination this only counts the first page (up to 100).
        if not res.get("owner") or not res.get("repo_name"):
            return 0
        try:
            contributors_url = f"https://api.github.com/repos/{res['owner']}/{res['repo_name']}/contributors?per_page=100"
            headers = {"Authorization": f"token {res['token']}"} if res.get("token") else {}
            contributors_response = requests.get(contributors_url, headers=headers)
            if contributors_response.status_code == 200:
                return len(contributors_response.json())
        except Exception:
            pass
        return 0

    # Collect results keyed by row index so the new columns stay aligned with the DataFrame
    # even though futures complete out of order.
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
        for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
            i = futures[fut]
            res = fut.result()
            res["contributors"] = get_contributors_count(res)
            results[i] = res

    df["stars"] = [results[i]["stargazers_count"] for i in df.index]
    df["forks"] = [results[i]["forks_count"] for i in df.index]
    df["watchers"] = [results[i]["watchers_count"] for i in df.index]
    df["open_issues"] = [results[i]["open_issues_count"] for i in df.index]
    df["pulls"] = [results[i]["pulls_count"] for i in df.index]
    df["activity"] = [results[i]["activity"] for i in df.index]
    df["contributors"] = [results[i]["contributors"] for i in df.index]

    end_time = time.time()
    print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
    return df
def timeout_handler(signum, frame):
    raise TimeoutError("LLama model prediction timed out.")

# def assign_base_weight(df, max_workers=32):
#     """
#     Assign base weights using LLama model in parallel.
#     """
#     print("[INFO] Starting base weight assignment using LLama model...", flush=True)
#     logging.info("[INFO] Assigning base weights using LLama model...")
#     start_time = time.time()
#     llama = SmolLM()
#     base_weights = []
#     llm_cache = {}
#
#     # Prepare prompts for all repositories
#     prompts = {}
#     for idx, row in df.iterrows():
#         repo = row.get("repo", "")
#         parent = row.get("parent", "")
#         stars = row.get("stars", 0)
#         forks = row.get("forks", 0)
#         watchers = row.get("watchers", 0)
#         issues = row.get("open_issues", 0)
#         pulls = row.get("pulls", 0)
#         activity = row.get("activity", "")
#         prompts[idx] = (
#             f"Repository: {repo}\n"
#             f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
#             f"Parent or dependency: {parent}\n\n"
#             "Based on these features, assign a dependency weight between 0 and 1 for the repository "
#             "that reflects how influential the repository is as a source relative to its parent. "
#             "Only output the numeric value."
#         )
#
#     # Define the prediction function
#     def _predict(idx, prompt):
#         if idx in llm_cache:
#             return idx, llm_cache[idx]
#         try:
#             resp = llama.predict(prompt)
#             match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
#             weight = min(max(float(match.group()), 0), 1) if match else 0.0
#             llm_cache[idx] = weight
#             return idx, weight
#         except Exception as e:
#             print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
#             logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
#             return idx, 0.0  # Default weight in case of failure
#
#     # Run predictions in parallel
#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
#         futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
#         for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
#             idx, weight = fut.result()
#             base_weights.append((idx, weight))
#
#     # Sort weights by index and assign to DataFrame
#     base_weights.sort(key=lambda x: x[0])
#     df["base_weight"] = [weight for _, weight in base_weights]
#
#     end_time = time.time()
#     print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
#     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
#     return df
def assign_base_weight(df, max_workers=32):
    """
    Assign base weights using a single LLM call to determine feature weights,
    and programmatically calculate repository weights.
    """
    print("[INFO] Starting optimized base weight assignment...", flush=True)
    logging.info("[INFO] Assigning base weights using optimized approach...")
    start_time = time.time()
    llama = SmolLM()

    # Step 1: Call LLM once to determine weights for each feature.
    # The prompt asks for keys that match the DataFrame column names exactly,
    # so the returned weights can be applied directly in calculate_weight below.
    prompt = (
        "The following are GitHub repository features:\n"
        "- stars\n"
        "- forks\n"
        "- watchers\n"
        "- open_issues\n"
        "- pulls\n"
        "- activity (days since last update)\n"
        "- contributors\n\n"
        "Assign a weight (0-1) to each feature based on its importance in determining "
        "the influence of a repository. Respond with only a JSON object whose keys are "
        "exactly the feature names listed above and whose values are their weights."
    )
    try:
        response = llama.predict(prompt)
        # Parse the JSON object safely instead of using eval(); tolerate extra text around
        # the JSON by extracting the first {...} block from the response.
        match = re.search(r"\{.*\}", response, re.DOTALL)
        if not match:
            raise ValueError(f"No JSON object found in LLM response: {response!r}")
        feature_weights = json.loads(match.group())
        print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
    except Exception as e:
        print(f"[ERROR] Failed to fetch feature weights from LLM: {e}", flush=True)
        logging.error(f"[ERROR] Failed to fetch feature weights from LLM: {e}")
        return df

    # Step 2: Programmatically calculate weights for each repository
    def calculate_weight(row):
        weight = 0.0
        for feature, feature_weight in feature_weights.items():
            value = row.get(feature)
            # Only numeric features contribute here; the raw "activity" timestamp string
            # is converted to days later, in train_predict_weight.
            if pd.notna(value) and isinstance(value, (int, float, np.integer, np.floating)):
                weight += value * feature_weight
        return weight

    df["base_weight_raw"] = df.apply(calculate_weight, axis=1)

    # Step 3: Normalize weights per parent
    df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
    )

    end_time = time.time()
    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
    return df
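
# A minimal optional sketch (not called anywhere in this pipeline): if the LLM returns
# feature weights that fall outside [0, 1] or do not sum to 1, a hypothetical helper like
# this could clamp and renormalize them before calculate_weight uses them.
def sanitize_feature_weights(feature_weights):
    """Clamp each weight to [0, 1] and rescale so all weights sum to 1 (illustrative sketch)."""
    clamped = {k: min(max(float(v), 0.0), 1.0) for k, v in feature_weights.items()}
    total = sum(clamped.values())
    if total == 0:
        # Fall back to uniform weights if everything clamps to zero.
        return {k: 1.0 / len(clamped) for k in clamped} if clamped else {}
    return {k: v / total for k, v in clamped.items()}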
def prepare_dataset(file):
    print("[INFO] Starting dataset preparation...")
    start_time = time.time()
    df = load_data(file)
    if df is None:
        raise ValueError("Failed to load data.")
    if not {"repo", "parent"}.issubset(df.columns):
        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")
    print("[INFO] Fetching GitHub features...")
    df = fetch_github_features(df)
    print("[INFO] GitHub features fetched successfully.")
    print("[INFO] Assigning base weights using LLama model...")
    df = assign_base_weight(df)
    end_time = time.time()
    print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
    return df
##############################
# Enhanced RandomForest Regression
##############################
def train_predict_weight(df):
    print("[INFO] Starting weight prediction...", flush=True)
    start_time = time.time()
    target = "base_weight"
    if "activity" in df.columns:
        # Parse ISO timestamps as UTC and subtract with a UTC timestamp
        df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
        now = pd.Timestamp.now(tz="UTC")
        df["activity"] = (now - df["activity"]).dt.days.fillna(-1)
    feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity", "contributors"]
    if target not in df.columns:
        raise ValueError("Base weight column missing.")
    X = df[feature_cols]
    y = df[target]
    print("[INFO] Splitting data into training and testing sets...", flush=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf_model = RandomForestRegressor(random_state=42, max_depth=None)
    param_dist = {
        "n_estimators": [100, 200, 300],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
    print("[INFO] Performing randomized search for hyperparameter tuning...", flush=True)
    rand_search = RandomizedSearchCV(
        estimator=rf_model,
        param_distributions=param_dist,
        n_iter=20,
        cv=3,
        scoring="neg_mean_squared_error",
        random_state=42,
        error_score="raise"
    )
    rand_search.fit(X_train, y_train)
    print("[INFO] Randomized search completed.", flush=True)
    print("Best Parameters:", rand_search.best_params_, flush=True)
    print("Best MSE:", -rand_search.best_score_, flush=True)
    y_pred = rand_search.best_estimator_.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Final RF Test MSE:", mse, flush=True)
    print("[INFO] Predicting final weights for all rows...")
    df["final_weight_raw"] = rand_search.best_estimator_.predict(X)
    # Normalize weights per parent for meaningful spread
    df["final_weight"] = df.groupby("parent")["final_weight_raw"].transform(
        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
    )
    end_time = time.time()
    print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
    return df
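
# Optional sketch (not called in the pipeline): a fitted RandomForestRegressor such as
# rand_search.best_estimator_ exposes feature_importances_, which can be logged to
# sanity-check which GitHub metrics dominate the predicted weights.
def log_feature_importances(model, feature_cols):
    """Log each feature's importance from a fitted RandomForestRegressor (illustrative sketch)."""
    for name, importance in sorted(
        zip(feature_cols, model.feature_importances_), key=lambda x: x[1], reverse=True
    ):
        logging.info("Feature importance: %s = %.4f", name, importance)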
##############################
# CSV Output
##############################
def create_submission_csv(df, output_filename="submission.csv"):
    print(f"[INFO] Writing results to {output_filename}...", flush=True)
    required_cols = ["repo", "parent", "final_weight"]
    submission_df = df[required_cols]
    submission_df.to_csv(output_filename, index=False)
    print(f"[INFO] Results written to {output_filename}.", flush=True)
    return output_filename
# Removed Gradio UI code from this file to ensure modular workflow.
# This file now focuses solely on data processing and prediction.
if __name__ == "__main__":
    print("DeepFunding Oracle is now ready for backend processing.", flush=True)