""" | |
DeepFunding Oracle: | |
This script dynamically loads dependency data and for each repository URL: | |
• Fetches GitHub features (stars, forks, watchers, open issues, pull requests, activity) using the GitHub API. | |
• Uses the LLama model to analyze parent-child behavior (based on the fetched features and parent info) | |
and returns a base weight (0-1) for the repository. | |
• Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight. | |
The output submission CSV has three columns: repo, parent, and final_weight. | |
""" | |
import os
import re
import sys
import time
import logging
import warnings
import concurrent.futures

import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# NumPy's log1p/expm1 (np.log1p, np.expm1) are used below; scipy.special
# is not needed for these transforms.
from Oracle.SmolLM import SmolLM

warnings.filterwarnings("ignore")
# Configure logging to file and console
logging.basicConfig(
    handlers=[
        logging.FileHandler("deepfundingoracle.log"),
        logging.StreamHandler(sys.stdout)
    ],
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
##############################
# GitHub API helper: Fetch repository metrics
##############################
def fetch_repo_metrics(repo_url):
    """
    Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
    Assumes repo_url is in the form "https://github.com/owner/repo".
    Handles API failures and malformed URLs gracefully.
    """
    # Default values in case of failure
    default_metrics = {
        "stargazers_count": 0,
        "forks_count": 0,
        "watchers_count": 0,
        "open_issues_count": 0,
        "pulls_count": 0,
        "activity": "",
        "contributors": 0,
        "dependencies_count": 0
    }
    try:
        # Extract owner and repo name
        m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
        if not m:
            print(f"[WARN] Malformed GitHub URL: {repo_url}")
            return default_metrics
        owner, repo_name = m.group(1), m.group(2)
        # Strip a trailing ".git" so URLs like ".../repo.git" resolve correctly
        repo_name = repo_name.removesuffix(".git")
        api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
        headers = {}
        token = os.environ.get("GITHUB_API_TOKEN", "")
        if token:
            headers["Authorization"] = f"token {token}"
        # Fetch main repository data
        r = requests.get(api_url, headers=headers, timeout=10)
        if r.status_code == 200:
            data = r.json()
            metrics = {
                "stargazers_count": data.get("stargazers_count", 0),
                "forks_count": data.get("forks_count", 0),
                # Note: the API's "watchers_count" mirrors the star count;
                # kept as-is for compatibility with the original feature set.
                "watchers_count": data.get("watchers_count", 0),
                "open_issues_count": data.get("open_issues_count", 0),
                "activity": data.get("updated_at", ""),
                "owner": owner,
                "repo_name": repo_name,
                "dependencies_count": 0
            }
            # Use the repository's actual default branch (e.g. "main") instead
            # of assuming "master" when fetching raw files below.
            default_branch = data.get("default_branch", "master")
            # Try to fetch pull requests count.
            # Caveat: this counts only the first page of open PRs (30 by default).
            try:
                pulls_url = f"{api_url}/pulls"
                pulls_resp = requests.get(pulls_url, headers=headers, timeout=5)
                metrics["pulls_count"] = len(pulls_resp.json()) if pulls_resp.status_code == 200 else 0
            except Exception as e:
                print(f"[WARN] Failed to fetch pulls for {repo_url}: {e}")
                metrics["pulls_count"] = 0
            # Try to fetch contributors count (same first-page caveat as above)
            try:
                contributors_url = f"{api_url}/contributors"
                contributors_resp = requests.get(contributors_url, headers=headers, timeout=5)
                metrics["contributors"] = len(contributors_resp.json()) if contributors_resp.status_code == 200 else 0
            except Exception as e:
                print(f"[WARN] Failed to fetch contributors for {repo_url}: {e}")
                metrics["contributors"] = 0
            # Try to estimate dependencies from package files
            try:
                # Look for package.json for Node.js projects
                package_json_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/{default_branch}/package.json"
                package_resp = requests.get(package_json_url, timeout=5)
                if package_resp.status_code == 200:
                    package_data = package_resp.json()
                    deps = package_data.get("dependencies", {})
                    dev_deps = package_data.get("devDependencies", {})
                    metrics["dependencies_count"] = len(deps) + len(dev_deps)
                else:
                    # Try requirements.txt for Python projects
                    req_txt_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/{default_branch}/requirements.txt"
                    req_resp = requests.get(req_txt_url, timeout=5)
                    if req_resp.status_code == 200:
                        deps = [line for line in req_resp.text.split('\n') if line.strip() and not line.startswith('#')]
                        metrics["dependencies_count"] = len(deps)
            except Exception as e:
                print(f"[WARN] Failed to fetch dependencies for {repo_url}: {e}")
                metrics["dependencies_count"] = 0
            return metrics
        else:
            print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
            return default_metrics
    except Exception as e:
        print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
        return default_metrics
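# Illustrative usage (not executed on import); the repository URL below is an
# arbitrary public repo chosen only for the example:
#
#   metrics = fetch_repo_metrics("https://github.com/pandas-dev/pandas")
#   print(metrics["stargazers_count"], metrics["contributors"])
#
# Without a GITHUB_API_TOKEN, unauthenticated requests are limited to 60/hour.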
def fetch_github_features(df):
    """
    For each row, using the repo URL, call the GitHub API to fetch:
    stars, forks, watchers, open issues, pull requests, activity, and contributors count.
    Adds these as new columns to the DataFrame.
    """
    print("[INFO] Fetching GitHub features for repositories...")
    start_time = time.time()
    cache = {}

    def get_metrics(repo_url):
        if repo_url in cache:
            print(f"[DEBUG] Cached GitHub data for {repo_url}: {cache[repo_url]}")
            return cache[repo_url]
        val = fetch_repo_metrics(repo_url)
        print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}")
        cache[repo_url] = val
        return val

    # Collect results keyed by row position: as_completed yields futures in
    # completion order, so appending results directly would misalign metrics
    # with their rows.
    results = [None] * len(df)
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(get_metrics, row['repo']): i for i, (_, row) in enumerate(df.iterrows())}
        for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
            results[futures[fut]] = fut.result()

    # Add the fetched data to the DataFrame, in row order
    column_map = {
        "stars": "stargazers_count",
        "forks": "forks_count",
        "watchers": "watchers_count",
        "open_issues": "open_issues_count",
        "pulls": "pulls_count",
        "activity": "activity",
        "contributors": "contributors",
        "dependencies_count": "dependencies_count",
    }
    for col, key in column_map.items():
        default = "" if col == "activity" else 0
        df[col] = [(res or {}).get(key, default) for res in results]
    end_time = time.time()
    print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
    return df
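# Sketch of the expected shape, assuming a minimal two-row input frame:
#
#   df = pd.DataFrame({
#       "repo": ["https://github.com/a/x", "https://github.com/b/y"],
#       "parent": ["https://github.com/a/root", "https://github.com/a/root"],
#   })
#   df = fetch_github_features(df)
#   # df now also has: stars, forks, watchers, open_issues, pulls,
#   # activity, contributors, dependencies_count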
def calculate_fallback_weights(df):
    """
    Dynamically calculate fallback feature weights based on feature variance.
    """
    print("[INFO] Calculating fallback feature weights...")
    numeric_cols = ['stars', 'forks', 'watchers', 'open_issues', 'pulls', 'contributors', 'dependencies_count']
    # Filter to only include columns that exist in the DataFrame
    valid_cols = [col for col in numeric_cols if col in df.columns]
    # Default weights, used when variance-based weights cannot be computed
    default_weights = {
        'stars': 0.3,
        'forks': 0.2,
        'watchers': 0.2,
        'open_issues': 0.1,
        'pulls': 0.1,
        'contributors': 0.05,
        'dependencies_count': 0.05
    }
    # If any data exists, calculate variance-based weights
    if len(valid_cols) > 0 and df[valid_cols].sum().sum() > 0:
        # Calculate variance for each feature
        feature_variances = df[valid_cols].var()
        total_variance = feature_variances.sum()
        # If meaningful variance exists, use it for weights
        if total_variance > 0:
            weights = {col: var / total_variance for col, var in feature_variances.items()}
            # Re-normalize defensively so the weights sum to exactly 1.0
            sum_weights = sum(weights.values())
            if sum_weights > 0:
                weights = {k: v / sum_weights for k, v in weights.items()}
            return weights
    # Return default weights if we couldn't calculate meaningful ones
    print("[INFO] Using default fallback weights")
    return default_weights
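# Worked example (hypothetical variances): if only "stars" and "forks" are
# present, with variances 8.0 and 2.0, the fallback weights come out as
# stars: 8/10 = 0.8 and forks: 2/10 = 0.2, which already sum to 1.0.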
##############################
# Feature Extraction
##############################
def load_data(file):
    """
    Dynamically load the dependency data CSV from the uploaded file.
    Expects at least "repo" and "parent" columns.
    """
    try:
        print("[INFO] Loading data from uploaded file...")
        start_time = time.time()
        # Read the uploaded file directly into a DataFrame
        df = pd.read_csv(file)
        end_time = time.time()
        print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
        return df
    except Exception as e:
        print("[ERROR] Error loading data:", e)
        return None
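# The uploaded CSV is expected to look like this (header row required):
#
#   repo,parent
#   https://github.com/owner/child-repo,https://github.com/owner/parent-repo
#
# Any additional columns are carried through untouched.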
def timeout_handler(signum, frame):
    # Intended for use with signal.signal(signal.SIGALRM, timeout_handler);
    # currently not registered anywhere in this module.
    raise TimeoutError("Language model prediction timed out.")
def assign_base_weight(df, llm_retries=2, llm_delay=0):
    """
    Assign base weights using a single LLM call to determine feature weights,
    then programmatically calculate per-repository weights from those features.
    """
    print("[INFO] Starting optimized base weight assignment...", flush=True)
    logging.info("[INFO] Assigning base weights using optimized approach...")
    start_time = time.time()
    oracle = SmolLM()
    prompt = (
        "Can you predict a weight in the range (0-1) for these GitHub features: stars, forks, watchers, "
        "open_issues, pulls, activity, contributors, based on their importance in determining the influence of a repository? "
        "Output the weights for each feature as text, e.g.: "
        'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
    )
    feature_weights = None
    for attempt in range(llm_retries):
        try:
            response = oracle.predict(prompt, max_length=512, max_new_tokens=150)
            if not response or not response.strip():
                raise ValueError("Empty response from Oracle.")
            matches = re.findall(
                r'(stars|forks|watchers|open_issues|pulls|activity|contributors)\s*[:=]\s*([0-9]*\.?[0-9]+)',
                response, re.IGNORECASE)
            feature_weights = {k.lower(): float(v) for k, v in matches}
            if not feature_weights or len(feature_weights) < 7:
                raise ValueError("Could not extract all feature weights from response.")
            print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
            break
        except Exception as e:
            print(f"[ERROR] Oracle attempt {attempt + 1} failed: {e}", flush=True)
            logging.error(f"[ERROR] Oracle attempt {attempt + 1} failed: {e}")
            time.sleep(llm_delay)
    # Fallback mechanism: calculate feature weights dynamically if the LLM fails
    if feature_weights is None:
        print("[WARN] LLM failed to provide feature weights. Calculating fallback weights dynamically.")
        feature_weights = calculate_fallback_weights(df)
        print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
    for feature in feature_weights.keys():
        if feature in df.columns:
            df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)

    def calculate_weight(row):
        # Weighted sum of the (raw-scale) feature values
        weight = 0
        for feature, feature_weight in feature_weights.items():
            if feature in row:
                weight += row[feature] * feature_weight
        return weight

    df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
    # Min-max normalize within each parent group so base weights land in [0, 1]
    df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
    )
    end_time = time.time()
    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
    return df
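# Worked example (hypothetical numbers): with weights {stars: 0.5, forks: 0.5}
# and two children of one parent having (stars, forks) = (100, 20) and (20, 0),
# the raw weights are 60.0 and 10.0; per-parent min-max normalization then
# maps them to base_weight 1.0 and 0.0 respectively.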
def sanity_check_weights(df):
    """
    Sanity-checks LLM weights by comparing them with other metrics.
    """
    print("[INFO] Performing sanity check on LLM weights...")
    # Average raw popularity metrics, then min-max normalize per parent so the
    # sanity-check weight is on the same 0-1 scale as base_weight before ensembling
    raw = (df["stars"] + df["forks"] + df["watchers"]) / 3
    df["sanity_check_weight"] = raw.groupby(df["parent"]).transform(
        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
    )
    df["ensemble_weight"] = (df["base_weight"] + df["sanity_check_weight"]) / 2
    print("[INFO] Sanity check and ensemble weights added.")
    return df
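# Worked example: a child with base_weight 0.8 and sanity_check_weight 0.4
# receives ensemble_weight (0.8 + 0.4) / 2 = 0.6.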
def visualize_feature_distributions(df):
    """
    Visualizes feature distributions and correlations.
    """
    # matplotlib and seaborn are imported unconditionally at module load, so
    # no availability check is needed here.
    print("[INFO] Visualizing feature distributions and correlations...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # Plot feature distributions
    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
    plt.suptitle("Feature Distributions", fontsize=16)
    plt.show()
    # Plot feature correlations
    correlation_matrix = df[numeric_cols].corr()
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title("Feature Correlation Matrix", fontsize=16)
    plt.show()
def normalize_and_clip_weights(df, group_col="parent", weight_col="final_weight"):
    """
    Ensures weights are non-negative and sum to 1 per group.
    """
    if df is None:
        raise ValueError("DataFrame is None, cannot normalize weights.")
    if weight_col not in df.columns:
        raise KeyError(f"`{weight_col}` column not found in DataFrame.")
    # Clip negatives
    df[weight_col] = df[weight_col].clip(lower=0)

    # Normalize within each group; fall back to a uniform split for all-zero groups
    def normalize_group(x):
        total = x.sum()
        if total > 0:
            return x / total
        return np.ones_like(x) / len(x)

    df[weight_col] = df.groupby(group_col)[weight_col].transform(normalize_group)
    return df
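# Worked example: for one parent group with raw weights [2.0, -1.0, 3.0],
# clipping gives [2.0, 0.0, 3.0] and normalization yields [0.4, 0.0, 0.6];
# an all-zero group falls back to a uniform 1/3 split.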
def normalize_funding(df):
    """
    Normalize funding weights for child repositories grouped by parent.
    """
    print("[INFO] Normalizing funding weights...", flush=True)
    if df is None or df.empty:
        print("[WARN] Skipping normalization: DataFrame is None or empty.", flush=True)
        return df
    df = normalize_and_clip_weights(df)
    print("[INFO] Funding weights normalized successfully.", flush=True)
    return df
def prepare_dataset(file):
    print("[INFO] Starting dataset preparation...")
    start_time = time.time()
    df = load_data(file)
    if df is None:
        raise ValueError("Failed to load data.")
    if not {"repo", "parent"}.issubset(df.columns):
        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")
    print("[INFO] Fetching GitHub features...")
    df = fetch_github_features(df)
    print("[INFO] GitHub features fetched successfully.")
    print("[INFO] Cleaning data...")
    df = clean_data(df)
    print("[INFO] Data cleaned successfully.")
    print("[INFO] Assigning base weights using the language model...")
    df = assign_base_weight(df)
    df = sanity_check_weights(df)  # Add sanity-check and ensemble weights
    df = train_predict_weight(df)
    if df is not None and not df.empty:
        visualize_feature_distributions(df)
    else:
        print("[WARN] DataFrame is empty after processing. Skipping visualization.")
    df = normalize_funding(df)
    end_time = time.time()
    print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
    return df
##############################
# Data Cleaning
##############################
def clean_data(df):
    """
    Cleans the input DataFrame by handling missing values and removing outliers.
    """
    # Impute missing values with the column median
    df.fillna(df.median(numeric_only=True), inplace=True)
    # Clip extreme outliers using the 1.5 * IQR rule
    for col in df.select_dtypes(include=[np.number]).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    return df
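# Worked example: with q1 = 2 and q3 = 6 the IQR is 4, so values are clipped
# to the range [2 - 1.5*4, 6 + 1.5*4] = [-4, 12].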
##############################
# Feature Validation and Scaling
##############################
def validate_features(df):
    """
    Validates and scales features to ensure they are meaningful for model training.
    """
    print("[INFO] Validating and scaling features...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # Compress heavy-tailed counts with log1p before standardizing
    for col in numeric_cols:
        df[col] = np.log1p(df[col].clip(lower=0))
    scaler = StandardScaler()
    # Scale numeric features to zero mean and unit variance
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    print("[INFO] Features scaled successfully.")
    return df
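# Worked example: log1p maps counts 0, 9, 99, 999 to 0.0, ~2.30, ~4.61, ~6.91,
# taming the heavy tail of popularity metrics before standardization.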
def validate_target(df):
    """
    Validates the target variable to ensure it has sufficient variance.
    If variance is insufficient, adds small random noise to create variance.
    """
    print("[INFO] Validating target variable 'base_weight'...")
    target = "base_weight"
    if target not in df.columns:
        raise ValueError(f"Target variable '{target}' not found in DataFrame.")
    variance = df[target].var()
    print(f"[DEBUG] Target variable variance: {variance}")
    if variance < 1e-6:
        print("[WARN] Target variable has insufficient variance. Adding small random noise...")
        # Add small random noise to introduce variance
        np.random.seed(42)  # For reproducibility
        noise = np.random.normal(0.5, 0.1, size=len(df))
        df[target] = noise
        print(f"[INFO] New target variable variance: {df[target].var()}")
    return df
##############################
# RandomForest Regression
##############################
def train_predict_weight(df):
    """
    Trains a RandomForestRegressor with hyperparameter tuning and evaluates the model.
    """
    print("[INFO] Starting weight prediction with hyperparameter tuning...", flush=True)
    start_time = time.time()
    target = "base_weight"
    # Exclude the target and any columns derived from it, to avoid target leakage
    exclude = {"base_weight", "base_weight_raw", "final_weight", "sanity_check_weight", "ensemble_weight"}
    feature_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col not in exclude]
    X = df[feature_cols].fillna(0)
    y = df[target]
    # Remove rows with NaN values
    mask = X.notna().all(axis=1) & y.notna()
    X, y = X[mask], y[mask]
    # Check for sufficient data and variance
    if X.shape[0] < 5 or y.nunique() <= 1:
        print("[WARN] Not enough data or variance for model training. Using base weights directly.")
        df["final_weight"] = df[target]
        return normalize_and_clip_weights(df)
    # log1p-transform the target
    y_log = np.log1p(y)
    # Split data into train/test sets
    X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)
    pipeline = Pipeline([
        ("rf", RandomForestRegressor(random_state=42))
    ])
    # Hyperparameter tuning via randomized search
    param_dist = {
        "rf__n_estimators": [100, 300, 500, 800, 1000],
        "rf__max_depth": [None, 20, 30, 40],
        "rf__min_samples_split": [2, 5, 10],
        "rf__min_samples_leaf": [1, 2, 4],
        # "auto" was removed from RandomForestRegressor in recent scikit-learn
        # releases; 1.0 (use all features) is its regression equivalent.
        "rf__max_features": [1.0, "sqrt"],
    }
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_dist,
        n_iter=50,
        cv=10,
        scoring="neg_root_mean_squared_error",
        verbose=2,
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train_log)
    best_model = search.best_estimator_
    # Predict on the test set and invert the log1p transform
    y_pred_test = np.expm1(best_model.predict(X_test))
    y_true_test = np.expm1(y_test_log)
    mse = mean_squared_error(y_true_test, y_pred_test)
    print(f"[INFO] Test MSE after RandomizedSearch: {mse:.4f}", flush=True)
    # Predict on the full dataset and invert the transform
    df["final_weight"] = np.expm1(best_model.predict(df[feature_cols].fillna(0)))
    df = normalize_and_clip_weights(df)
    end_time = time.time()
    print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
    return df
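# Optional inspection sketch (assumes best_model and feature_cols from
# train_predict_weight are in scope; shown as a hypothetical snippet):
#
#   rf = best_model.named_steps["rf"]
#   for name, imp in sorted(zip(feature_cols, rf.feature_importances_),
#                           key=lambda t: -t[1]):
#       print(f"{name}: {imp:.3f}")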
##############################
# CSV Output
##############################
def create_submission_csv(df, output_filename="submission.csv"):
    print(f"[INFO] Writing results to {output_filename}...", flush=True)
    required_cols = ["repo", "parent", "final_weight"]
    submission_df = df[required_cols]
    submission_df.to_csv(output_filename, index=False)
    print(f"[INFO] Results written to {output_filename}.", flush=True)
    return output_filename

# Gradio UI code was removed from this file to keep the workflow modular;
# this module now focuses solely on data processing and prediction.
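# Example of the resulting submission file contents (values are illustrative):
#
#   repo,parent,final_weight
#   https://github.com/owner/child-a,https://github.com/owner/parent,0.62
#   https://github.com/owner/child-b,https://github.com/owner/parent,0.38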
if __name__ == "__main__":
    input_file = "input.csv"  # Replace with the actual input file path
    output_file = "submission.csv"
    print("[INFO] Preparing dataset...")
    df = prepare_dataset(input_file)
    print("[INFO] Creating submission CSV...")
    create_submission_csv(df, output_file)
    print("[INFO] Process completed successfully.")