""" | |
DeepFunding Oracle: | |
This script dynamically loads dependency data and for each repository URL: | |
• Fetches GitHub features (stars, forks, watchers, open issues, pull requests, activity) using the GitHub API. | |
• Uses the LLama model to analyze parent-child behavior (based on the fetched features and parent info) | |
and returns a base weight (0-1) for the repository. | |
• Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight. | |
The output submission CSV has three columns: repo, parent, and final_weight. | |
""" | |
from io import StringIO
import os
import warnings
import csv
import re
import requests
import numpy as np
import pandas as pd
import time
import threading
import logging
import concurrent.futures
import signal
import sys

from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from Oracle.SmolLM import SmolLM

warnings.filterwarnings("ignore")
# Configure logging to file and console
logging.basicConfig(
    handlers=[
        logging.FileHandler("deepfundingoracle.log"),
        logging.StreamHandler(sys.stdout)
    ],
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
##############################
# Enhanced GitHub API helper: Fetch repository metrics
##############################
def fetch_repo_metrics(repo_url):
    """
    Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity)
    given a repository URL. Assumes repo_url is in the form "https://github.com/owner/repo".
    """
    default = {
        "stargazers_count": 0, "forks_count": 0, "watchers_count": 0,
        "open_issues_count": 0, "pulls_count": 0, "activity": 0,
        "owner": "", "repo_name": "", "token": ""
    }
    try:
        # Extract the owner and repository name from the URL.
        m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
        if not m:
            return default
        owner, repo_name = m.group(1), m.group(2)
        api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
        headers = {}
        token = os.environ.get("GITHUB_API_TOKEN", "")
        if token:
            headers["Authorization"] = f"token {token}"
        r = requests.get(api_url, headers=headers)
        if r.status_code != 200:
            return default
        data = r.json()
        # "pulls_url" is a URL template such as ".../pulls{/number}"; strip the
        # placeholder before requesting it. The listing returns at most one page,
        # so this count is a lower bound for busy repositories (see the sketch below).
        pulls_url = data.get("pulls_url", "").replace("{/number}", "")
        pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
        # "updated_at" is an ISO-8601 timestamp of the repository's last update.
        activity = data.get("updated_at", "")
        return {
            "stargazers_count": data.get("stargazers_count", 0),
            "forks_count": data.get("forks_count", 0),
            "watchers_count": data.get("watchers_count", 0),
            "open_issues_count": data.get("open_issues_count", 0),
            "pulls_count": pulls_count,
            "activity": activity,
            "owner": owner,
            "repo_name": repo_name,
            "token": token
        }
    except Exception:
        return default
##############################
# Enhanced Feature Extraction
##############################
def load_data(file):
    """
    Dynamically load the dependency data CSV from the uploaded file.
    Expects at least "repo" and "parent" columns.
    """
    try:
        print("[INFO] Loading data from uploaded file...")
        start_time = time.time()
        # Read the uploaded file directly into a DataFrame
        df = pd.read_csv(file)
        end_time = time.time()
        print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
        return df
    except Exception as e:
        print("[ERROR] Error loading data:", e)
        return None
def fetch_github_features(df):
    """
    For each row, use the repo URL to call the GitHub API and fetch:
    stars, forks, watchers, open issues, pull requests, activity, and contributor count.
    Adds these as new columns to the DataFrame.
    """
    print("[INFO] Fetching GitHub features for repositories...")
    start_time = time.time()
    stars_list = []
    forks_list = []
    watchers_list = []
    issues_list = []
    pulls_list = []
    activity_list = []
    contributors_list = []
    for idx, row in df.iterrows():
        repo_url = row.get("repo", "")
        print(f"[INFO] Processing repository {idx + 1}/{len(df)}: {repo_url}")
        features = fetch_repo_metrics(repo_url)
        stars_list.append(features["stargazers_count"])
        forks_list.append(features["forks_count"])
        watchers_list.append(features["watchers_count"])
        issues_list.append(features["open_issues_count"])
        pulls_list.append(features["pulls_count"])
        activity_list.append(features["activity"])
        # Fetch the contributor count (a single page of the listing, so this is a
        # lower bound for large projects).
        try:
            contributors_url = f"https://api.github.com/repos/{features['owner']}/{features['repo_name']}/contributors"
            headers = {"Authorization": f"token {features['token']}"} if features.get("token") else {}
            contributors_response = requests.get(contributors_url, headers=headers)
            if contributors_response.status_code == 200:
                contributors_list.append(len(contributors_response.json()))
            else:
                contributors_list.append(0)
        except Exception:
            contributors_list.append(0)
    df["stars"] = stars_list
    df["forks"] = forks_list
    df["watchers"] = watchers_list
    df["open_issues"] = issues_list
    df["pulls"] = pulls_list
    df["activity"] = activity_list
    df["contributors"] = contributors_list
    end_time = time.time()
    print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
    return df
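# The loop above fetches repositories one at a time, so GitHub round-trips
# dominate the runtime on large dependency lists. A minimal sketch (not wired
# into the pipeline) of the same fetch fanned out over a thread pool, using the
# concurrent.futures import already present; the helper name
# fetch_github_features_parallel and the worker count are our choices, and
# GitHub's rate limits still apply regardless of concurrency.
def fetch_github_features_parallel(df, max_workers=8):
    """Fetch GitHub metrics for every repo concurrently and attach them to df."""
    urls = df["repo"].fillna("").tolist()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # executor.map preserves input order, so results line up with df rows.
        results = list(pool.map(fetch_repo_metrics, urls))
    df["stars"] = [r["stargazers_count"] for r in results]
    df["forks"] = [r["forks_count"] for r in results]
    df["watchers"] = [r["watchers_count"] for r in results]
    df["open_issues"] = [r["open_issues_count"] for r in results]
    df["pulls"] = [r["pulls_count"] for r in results]
    df["activity"] = [r["activity"] for r in results]
    return df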
def timeout_handler(signum, frame):
    # Intended for use with signal.signal/signal.alarm to bound a LLama call;
    # it is not registered anywhere in this module.
    raise TimeoutError("LLama model prediction timed out.")
def assign_base_weight(df):
    print("[INFO] Starting base weight assignment using LLama model...", flush=True)
    logging.info("[INFO] Assigning base weights using LLama model...")
    start_time = time.time()
    llama = SmolLM()
    base_weights = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Assigning weights"):
        repo = row.get("repo", "")
        print(f"[INFO] Assigning weight for repository {idx + 1}/{len(df)}: {repo}", flush=True)
        logging.info(f"[INFO] Processing repository {idx + 1}/{len(df)}: {repo}")
        parent = row.get("parent", "")
        stars = row.get("stars", 0)
        forks = row.get("forks", 0)
        watchers = row.get("watchers", 0)
        issues = row.get("open_issues", 0)
        pulls = row.get("pulls", 0)
        activity = row.get("activity", "")
        prompt = (
            f"Repository: {repo}\n"
            f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
            f"Parent or dependency: {parent}\n\n"
            "Based on these features, assign a dependency weight between 0 and 1 for the repository "
            "that reflects how influential the repository is as a source relative to its parent. "
            "Only output the numeric value."
        )
        try:
            print(f"[INFO] Sending prompt to LLama model for repo: {repo}", flush=True)
            start_llama_time = time.time()
            response = llama.predict(prompt)
            # Extract the first numeric token from the response and clamp it to [0, 1].
            match = re.search(r"\d+(?:\.\d+)?", str(response))
            if match is None:
                raise ValueError(f"No numeric value in model response: {response!r}")
            weight = min(max(float(match.group()), 0), 1)
            end_llama_time = time.time()
            print(f"[INFO] Received weight {weight} for {repo} in {end_llama_time - start_llama_time:.2f} seconds.", flush=True)
            logging.info(f"[INFO] Processed repository {repo} in {end_llama_time - start_llama_time:.2f} seconds. Weight: {weight}")
        except Exception as e:
            print(f"[ERROR] Failed to process repository {repo}: {e}", flush=True)
            logging.error(f"[ERROR] Failed to process repository {repo}: {e}")
            weight = 0.5  # Default weight in case of failure
        base_weights.append(weight)
        print(f"[PROGRESS] Finished {idx + 1}/{len(df)} repositories.", flush=True)
    df["base_weight"] = base_weights
    end_time = time.time()
    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
    return df
def prepare_dataset(file):
    print("[INFO] Starting dataset preparation...")
    start_time = time.time()
    df = load_data(file)
    if df is None:
        raise ValueError("Failed to load data.")
    if not {"repo", "parent"}.issubset(df.columns):
        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")
    print("[INFO] Fetching GitHub features...")
    df = fetch_github_features(df)
    print("[INFO] GitHub features fetched successfully.")
    print("[INFO] Assigning base weights using LLama model...")
    df = assign_base_weight(df)
    end_time = time.time()
    print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
    return df
##############################
# Enhanced RandomForest Regression
##############################
def train_predict_weight(df):
    print("[INFO] Starting weight prediction...", flush=True)
    start_time = time.time()
    target = "base_weight"
    if target not in df.columns:
        raise ValueError("Base weight column missing.")
    # "activity" is an ISO-8601 timestamp string; convert it to days since the
    # last update so the regressor receives a numeric feature instead of text.
    activity_dt = pd.to_datetime(df["activity"].astype(str), errors="coerce", utc=True)
    df["activity_days"] = (pd.Timestamp.now(tz="UTC") - activity_dt).dt.days.fillna(-1)
    feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity_days", "contributors"]
    X = df[feature_cols]
    y = df[target]
    print("[INFO] Splitting data into training and testing sets...", flush=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf_model = RandomForestRegressor(random_state=42)
    param_grid = {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
    print("[INFO] Performing grid search for hyperparameter tuning...", flush=True)
    grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_squared_error"
    )
    grid_search.fit(X_train, y_train)
    print("[INFO] Grid search completed.", flush=True)
    print("Best Parameters:", grid_search.best_params_, flush=True)
    print("Best MSE:", -grid_search.best_score_, flush=True)
    y_pred = grid_search.best_estimator_.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Final RF Test MSE:", mse, flush=True)
    print("[INFO] Predicting final weights for all rows...")
    df["final_weight"] = grid_search.best_estimator_.predict(X)
    end_time = time.time()
    print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
    return df
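# A minimal diagnostic sketch (not called by the pipeline) for seeing which
# GitHub metrics the tuned forest actually leans on; the helper name
# print_feature_importances is ours. It could be invoked as, for example,
# print_feature_importances(grid_search.best_estimator_, feature_cols)
# right after the grid search inside train_predict_weight.
def print_feature_importances(model, feature_names):
    """Print each feature's impurity-based importance from a fitted RandomForestRegressor."""
    ranked = sorted(zip(feature_names, model.feature_importances_), key=lambda pair: -pair[1])
    for name, importance in ranked:
        print(f"{name}: {importance:.3f}")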
##############################
# CSV Output
##############################
def create_submission_csv(df, output_filename="submission.csv"):
    print(f"[INFO] Writing results to {output_filename}...", flush=True)
    required_cols = ["repo", "parent", "final_weight"]
    submission_df = df[required_cols]
    submission_df.to_csv(output_filename, index=False)
    print(f"[INFO] Results written to {output_filename}.", flush=True)
    return output_filename
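# A minimal end-to-end sketch of how a caller (for example the UI that now lives
# in a separate file) is expected to chain the steps above. The helper name
# run_pipeline and the default file names are illustrative only.
def run_pipeline(input_csv, output_csv="submission.csv"):
    """Prepare the dataset, predict final weights, and write the submission CSV."""
    df = prepare_dataset(input_csv)
    df = train_predict_weight(df)
    return create_submission_csv(df, output_csv)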
# Removed the Gradio UI code from this file to ensure a modular workflow.
# This file now focuses solely on data processing and prediction.

if __name__ == "__main__":
    print("DeepFunding Oracle is now ready for backend processing.", flush=True)