""" | |
DeepFunding Oracle: | |
This script dynamically loads dependency data and for each repository URL: | |
• Fetches GitHub features (stars, forks, watchers, open issues, pull requests, activity) using the GitHub API. | |
• Uses the LLama model to analyze parent-child behavior (based on the fetched features and parent info) | |
and returns a base weight (0-1) for the repository. | |
• Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight. | |
The output submission CSV has three columns: repo, parent, and final_weight. | |
""" | |
import base64
import os
import sys
import re
import json
import time
import signal
import logging
import warnings
import concurrent.futures
from datetime import datetime, timezone

import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

from Oracle.SmolLM import SmolLM

warnings.filterwarnings("ignore")
# Configure logging to file and console
logging.basicConfig(
    handlers=[
        logging.FileHandler("deepfundingoracle.log"),
        logging.StreamHandler(sys.stdout)
    ],
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
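
# Set GITHUB_API_TOKEN to authenticate GitHub API calls: unauthenticated
# requests are limited to 60 per hour, token-authenticated ones to 5,000.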
##############################
# GitHub API helper: Fetch repository metrics
##############################
def _empty_metrics():
    """Default metrics returned when a repository cannot be fetched."""
    return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0,
            "open_issues_count": 0, "pulls_count": 0, "activity": 0,
            "owner": "", "repo_name": "", "token": ""}

def fetch_repo_metrics(repo_url):
    """
    Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests,
    and activity) given a repository URL.
    """
    try:
        # Extract owner and repo name from the URL
        m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
        if not m:
            return _empty_metrics()
        owner, repo_name = m.group(1), m.group(2)
        api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
        headers = {}
        token = os.environ.get("GITHUB_API_TOKEN", "")
        if token:
            headers["Authorization"] = f"token {token}"
        r = requests.get(api_url, headers=headers, timeout=30)
        if r.status_code == 200:
            data = r.json()
            # Log fetched data for debugging
            print(f"[DEBUG] Fetched data for {repo_url}: {data}")
            # GitHub's pulls_url template ends in "{/number}"; strip it to list PRs.
            pulls_url = data.get("pulls_url", "").replace("{/number}", "")
            # Note: without pagination this counts only the first page of open PRs.
            pulls_count = len(requests.get(pulls_url, headers=headers, timeout=30).json()) if pulls_url else 0
            # Convert the last-update timestamp to "days since last update" so the
            # activity feature is numeric and usable by the downstream model.
            updated_at = data.get("updated_at", "")
            if updated_at:
                last_update = datetime.strptime(updated_at, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
                activity = (datetime.now(timezone.utc) - last_update).days
            else:
                activity = 0
            return {
                "stargazers_count": data.get("stargazers_count", 0),
                "forks_count": data.get("forks_count", 0),
                "watchers_count": data.get("watchers_count", 0),
                "open_issues_count": data.get("open_issues_count", 0),
                "pulls_count": pulls_count,
                "activity": activity,
                "owner": owner,
                "repo_name": repo_name,
                "token": token
            }
        else:
            print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
            return _empty_metrics()
    except Exception as e:
        print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
        return _empty_metrics()
##############################
# Feature Extraction
##############################

def load_data(file):
    """
    Dynamically load the dependency data CSV from the uploaded file.
    Expects at least "repo" and "parent" columns.
    """
    try:
        print("[INFO] Loading data from uploaded file...")
        start_time = time.time()
        # Read the uploaded file directly into a DataFrame
        df = pd.read_csv(file)
        end_time = time.time()
        print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
        return df
    except Exception as e:
        print("[ERROR] Error loading data:", e)
        return None
def fetch_github_features(df):
    """
    For each row, use the repo URL to call the GitHub API and fetch:
    stars, forks, watchers, open issues, pull requests, activity, contributors
    count, and (for Node-style projects) a declared-dependency count.
    Adds these as new columns to the DataFrame.
    """
    print("[INFO] Fetching GitHub features for repositories...")
    start_time = time.time()
    cache = {}

    def get_metrics(repo_url):
        if repo_url in cache:
            print(f"[DEBUG] Cached data for {repo_url}: {cache[repo_url]}")
            return cache[repo_url]
        val = fetch_repo_metrics(repo_url)
        print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}")
        try:
            m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
            if m:
                owner, repo_name = m.group(1), m.group(2)
                # Read package.json via the contents API; only Node-style
                # projects yield a dependency count, others fall back to 0.
                pkg_url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/package.json"
                headers = {}
                token = os.environ.get("GITHUB_API_TOKEN", "")
                if token:
                    headers["Authorization"] = f"token {token}"
                pkg_resp = requests.get(pkg_url, headers=headers, timeout=30)
                if pkg_resp.status_code == 200:
                    pkg_data = pkg_resp.json()
                    content = base64.b64decode(pkg_data.get("content", "")).decode("utf-8")
                    pkg_json = json.loads(content)
                    val["dependencies_count"] = len(pkg_json.get("dependencies", {}))
                else:
                    val["dependencies_count"] = 0
            else:
                val["dependencies_count"] = 0
        except Exception:
            val["dependencies_count"] = 0
        # Fetch contributors count (first page only: at most 100 without pagination)
        try:
            contributors_url = f"https://api.github.com/repos/{val['owner']}/{val['repo_name']}/contributors"
            headers = {}
            if val.get("token"):
                headers["Authorization"] = f"token {val['token']}"
            contributors_response = requests.get(contributors_url, headers=headers,
                                                 params={"per_page": 100}, timeout=30)
            val["contributors"] = len(contributors_response.json()) if contributors_response.status_code == 200 else 0
        except Exception:
            val["contributors"] = 0
        cache[repo_url] = val
        return val

    # Collect results keyed by row index so the new columns stay aligned with
    # their rows, even though futures complete in arbitrary order.
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(get_metrics, row["repo"]): i for i, row in df.iterrows()}
        for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
            results[futures[fut]] = fut.result()

    ordered = [results[i] for i in df.index]
    df["stars"] = [r.get("stargazers_count", 0) for r in ordered]
    df["forks"] = [r.get("forks_count", 0) for r in ordered]
    df["watchers"] = [r.get("watchers_count", 0) for r in ordered]
    df["open_issues"] = [r.get("open_issues_count", 0) for r in ordered]
    df["pulls"] = [r.get("pulls_count", 0) for r in ordered]
    df["activity"] = [r.get("activity", 0) for r in ordered]
    df["contributors"] = [r.get("contributors", 0) for r in ordered]
    df["dependencies_count"] = [r.get("dependencies_count", 0) for r in ordered]

    end_time = time.time()
    print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
    return df
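
# Usage sketch (illustrative URLs; column names follow the code above):
#   df = pd.DataFrame({"repo": ["https://github.com/org/lib"],
#                      "parent": ["https://github.com/org/app"]})
#   df = fetch_github_features(df)
#   # df gains numeric columns: stars, forks, watchers, open_issues, pulls,
#   # activity, contributors, dependencies_count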
def timeout_handler(signum, frame):
    # Only takes effect if registered first, e.g. signal.signal(signal.SIGALRM, timeout_handler).
    raise TimeoutError("LLM prediction timed out.")
def assign_base_weight(df, llm_retries=2, llm_delay=0):
    """
    Assign base weights using a single LLM call to determine per-feature weights,
    then compute each repository's weight programmatically.
    """
    print("[INFO] Starting optimized base weight assignment...", flush=True)
    logging.info("[INFO] Assigning base weights using optimized approach...")
    start_time = time.time()
    oracle = SmolLM()
    prompt = (
        "Predict a weight in the range (0-1) for each of these GitHub features: stars, forks, watchers, "
        "open_issues, pulls, activity, contributors, based on its importance in determining the influence of a repository. "
        "Output the weights for each feature as text, e.g.: "
        'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
    )
    feature_weights = None
    for attempt in range(llm_retries):
        try:
            response = oracle.predict(prompt, max_length=512, max_new_tokens=150)
            if not response or not response.strip():
                raise ValueError("Empty response from Oracle.")
            matches = re.findall(
                r'(stars|forks|watchers|open_issues|pulls|activity|contributors)\s*[:=]\s*([0-9]*\.?[0-9]+)',
                response, re.IGNORECASE)
            feature_weights = {k.lower(): float(v) for k, v in matches}
            if not feature_weights or len(feature_weights) < 7:
                raise ValueError("Could not extract all feature weights from response.")
            print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
            break
        except Exception as e:
            print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
            logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
            time.sleep(llm_delay)
    # Fallback: derive feature weights from the data itself if the LLM fails
    if feature_weights is None:
        print("[WARN] LLM failed to provide feature weights. Calculating fallback weights dynamically.")
        feature_weights = calculate_fallback_weights(df)
        print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
    # Ensure the weighted columns are numeric
    for feature in feature_weights:
        if feature in df.columns:
            df[feature] = pd.to_numeric(df[feature], errors="coerce").fillna(0)

    def calculate_weight(row):
        # Linear blend of features: raw weight = sum over features of w_f * x_f
        return sum(row[feature] * w for feature, w in feature_weights.items() if feature in row)

    df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
    # Min-max scale the raw weights within each parent group to [0, 1]
    df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
    )
    end_time = time.time()
    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
    return df
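
# In symbols: for repo i with feature vector x_i and LLM-provided weights w,
#   raw_i  = sum_f w_f * x_{i,f}
#   base_i = (raw_i - min_p) / (max_p - min_p)
# where min_p/max_p are taken over repos sharing parent p (a constant group
# collapses to 0 via the guard above).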
def calculate_fallback_weights(df):
    """
    Dynamically calculate fallback feature weights, proportional to each
    numeric feature's variance (higher-variance features get larger weights).
    """
    print("[INFO] Calculating fallback feature weights...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    feature_variances = df[numeric_cols].var()
    total_variance = feature_variances.sum()
    if total_variance <= 0:
        # Degenerate case: no variance anywhere, so weight all features equally
        return {col: 1 / len(numeric_cols) for col in numeric_cols}
    # Assign weights proportional to feature variance
    return {col: var / total_variance for col, var in feature_variances.items()}
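
# Illustrative example (made-up numbers): variances {stars: 8.0, forks: 2.0}
# yield weights {stars: 0.8, forks: 0.2}, since 8/(8+2) = 0.8 and 2/(8+2) = 0.2.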
def sanity_check_weights(df):
    """
    Sanity-checks the LLM-derived weights by blending them with a simple
    popularity heuristic (mean of stars, forks, and watchers).
    """
    print("[INFO] Performing sanity check on LLM weights...")
    # Note: this heuristic is on a raw-count scale, not the [0, 1] scale of base_weight.
    df["sanity_check_weight"] = (df["stars"] + df["forks"] + df["watchers"]) / 3
    df["ensemble_weight"] = (df["base_weight"] + df["sanity_check_weight"]) / 2
    print("[INFO] Sanity check and ensemble weights added.")
    return df
def visualize_feature_distributions(df):
    """
    Visualizes feature distributions and correlations.
    """
    print("[INFO] Visualizing feature distributions and correlations...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # Plot feature distributions
    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
    plt.suptitle("Feature Distributions", fontsize=16)
    plt.show()
    # Plot feature correlations
    correlation_matrix = df[numeric_cols].corr()
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title("Feature Correlation Matrix", fontsize=16)
    plt.show()
def normalize_funding(df):
    """
    Normalize funding weights so that each parent's children sum to 1.
    """
    print("[INFO] Normalizing funding weights...", flush=True)
    # Regression output can be slightly negative; clip so weights stay non-negative
    df["final_weight"] = df["final_weight"].clip(lower=0)
    df["final_weight"] = df.groupby("parent")["final_weight"].transform(
        lambda x: x / x.sum() if x.sum() > 0 else 1 / len(x)
    )
    print("[INFO] Funding weights normalized successfully.", flush=True)
    return df
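
# Example: three siblings under one parent with final_weight [2, 3, 5]
# normalize to [0.2, 0.3, 0.5]; an all-zero group gets equal shares of 1/3.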
def prepare_dataset(file):
    print("[INFO] Starting dataset preparation...")
    start_time = time.time()
    df = load_data(file)
    if df is None:
        raise ValueError("Failed to load data.")
    if not {"repo", "parent"}.issubset(df.columns):
        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")
    print("[INFO] Fetching GitHub features...")
    df = fetch_github_features(df)
    print("[INFO] GitHub features fetched successfully.")
    print("[INFO] Cleaning data...")
    df = clean_data(df)
    print("[INFO] Data cleaned successfully.")
    print("[INFO] Assigning base weights using the SmolLM model...")
    df = assign_base_weight(df)
    df = sanity_check_weights(df)  # Add sanity-check and ensemble weights
    df = train_predict_weight(df)
    visualize_feature_distributions(df)  # Plot feature distributions and correlations
    df = normalize_funding(df)
    end_time = time.time()
    print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
    return df
##############################
# Data Cleaning
##############################

def clean_data(df):
    """
    Cleans the input DataFrame by imputing missing values and capping outliers.
    Outliers are clipped rather than dropped, so every repository keeps a row
    in the final submission.
    """
    # Impute missing numeric values with the column median
    df.fillna(df.median(numeric_only=True), inplace=True)
    # Cap extreme outliers at the standard 1.5 * IQR fences
    for col in df.select_dtypes(include=[np.number]).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    return df
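
# Worked IQR example: q1 = 10 and q3 = 30 give iqr = 20, so the fences are
# [10 - 30, 30 + 30] = [-20, 60]; a raw value of 500 is capped at 60.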
##############################
# Feature Validation and Scaling
##############################

def validate_features(df):
    """
    Validates and scales feature columns so they are comparable for model
    training. Target and derived-weight columns are excluded from scaling.
    """
    print("[INFO] Validating and scaling features...")
    exclude = {"base_weight", "base_weight_raw", "sanity_check_weight", "ensemble_weight", "final_weight"}
    numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in exclude]
    scaler = StandardScaler()
    # Log feature distributions before scaling
    for col in numeric_cols:
        print(f"[DEBUG] Feature '{col}' - Mean: {df[col].mean()}, Std: {df[col].std()}, Min: {df[col].min()}, Max: {df[col].max()}")
    # Standardize features to zero mean and unit variance
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    print("[INFO] Features scaled successfully.")
    return df
def validate_target(df):
    """
    Validates the target variable to ensure it has sufficient variance.
    """
    print("[INFO] Validating target variable 'base_weight'...")
    target = "base_weight"
    if target not in df.columns:
        raise ValueError(f"Target variable '{target}' not found in DataFrame.")
    variance = df[target].var()
    print(f"[DEBUG] Target variable variance: {variance}")
    if variance < 1e-6:
        raise ValueError(f"Target variable '{target}' has insufficient variance. Please check feature values.")
    return df
##############################
# RandomForest Regression
##############################

def train_predict_weight(df):
    """
    Trains a RandomForestRegressor with hyperparameter tuning and evaluates the model.
    """
    print("[INFO] Starting weight prediction with hyperparameter tuning...", flush=True)
    start_time = time.time()
    target = "base_weight"
    # Exclude identifier columns plus anything derived from the target
    # (base_weight_raw, sanity_check_weight, ensemble_weight) to avoid leakage
    leakage_cols = ["repo", "parent", "base_weight", "base_weight_raw",
                    "sanity_check_weight", "ensemble_weight", "final_weight"]
    feature_cols = [col for col in df.columns if col not in leakage_cols]
    # Validate and scale features
    df = validate_features(df)
    # Validate target variable
    df = validate_target(df)
    X = df[feature_cols]
    y = df[target]
    # Split data into train/test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        "n_estimators": [100, 200, 300],
        "max_depth": [10, 15, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
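    # The grid above spans 3 * 3 * 3 * 3 = 81 parameter combinations; with
    # cv=3 that means 243 model fits, so tuning dominates the runtime here.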
    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring="neg_mean_squared_error", verbose=2)
    grid_search.fit(X_train, y_train)
    # Best model
    best_rf = grid_search.best_estimator_
    print(f"[INFO] Best parameters: {grid_search.best_params_}")
    # Evaluate on the held-out test set
    y_pred = best_rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"[INFO] Test MSE: {mse}")
    # Feature importance analysis
    feature_importances = best_rf.feature_importances_
    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
    print("[INFO] Feature importances:")
    print(importance_df)
    # Drop near-zero-importance features from the DataFrame (X is unaffected,
    # since it was built before the drop)
    irrelevant_features = importance_df[importance_df["Importance"] < 0.01]["Feature"].tolist()
    print(f"[INFO] Dropping irrelevant features: {irrelevant_features}")
    df.drop(columns=irrelevant_features, inplace=True)
    # Plot predictions vs. actual values
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.xlabel("Actual Base Weight")
    plt.ylabel("Predicted Base Weight")
    plt.title("Predictions vs. Actual")
    plt.show()
    # Predict a final weight for every row (train and test alike)
    df["final_weight"] = best_rf.predict(X)
    end_time = time.time()
    print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
    return df
##############################
# CSV Output
##############################

def create_submission_csv(df, output_filename="submission.csv"):
    print(f"[INFO] Writing results to {output_filename}...", flush=True)
    required_cols = ["repo", "parent", "final_weight"]
    submission_df = df[required_cols]
    submission_df.to_csv(output_filename, index=False)
    print(f"[INFO] Results written to {output_filename}.", flush=True)
    return output_filename
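
# Example output (illustrative URLs and weight):
#   repo,parent,final_weight
#   https://github.com/org/lib,https://github.com/org/app,0.42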
# Removed Gradio UI code from this file to ensure a modular workflow.
# This file now focuses solely on data processing and prediction.

if __name__ == "__main__":
    input_file = "input.csv"  # Replace with the actual input file path
    output_file = "submission.csv"
    print("[INFO] Preparing dataset...")
    df = prepare_dataset(input_file)
    print("[INFO] Creating submission CSV...")
    create_submission_csv(df, output_file)
    print("[INFO] Process completed successfully.")