""" | |
DeepFunding Oracle: | |
This script dynamically loads dependency data and for each repository URL: | |
• Fetches GitHub features (stars, forks, watchers, open issues, pull requests, activity) using the GitHub API. | |
• Uses the LLama model to analyze parent-child behavior (based on the fetched features and parent info) | |
and returns a base weight (0-1) for the repository. | |
• Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight. | |
The output submission CSV has three columns: repo, parent, and final_weight. | |
""" | |
import os
import re
import sys
import logging
import warnings
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor

from Oracle.SmolLM import SmolLM
warnings.filterwarnings("ignore")

# Configure logging to file and console
logging.basicConfig(
    handlers=[
        logging.FileHandler("deepfundingoracle.log", mode='w'),
        logging.StreamHandler(sys.stdout)
    ],
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s")

# Register tqdm's progress_apply on pandas; generate_all_base_weights relies on it.
tqdm.pandas()
# These functions make the pipeline importable by app.py
def prepare_dataset(file_path):
    """
    Wrapper function that prepares the dataset by:
      1. Loading the CSV
      2. Fetching GitHub features
      3. Adding derived features
      4. Cleaning data
      5. Generating base weights using the LLM

    Args:
        file_path: Path to the input CSV file, or a file-like object (e.g. from Gradio)

    Returns:
        DataFrame with all features and base_weight prepared
    """
    logging.info(f"Preparing dataset from {file_path}")

    # pd.read_csv accepts both a path string and a file object (e.g. from Gradio).
    df = pd.read_csv(file_path)

    # Check required columns
    if not {"repo", "parent"}.issubset(df.columns):
        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")

    # Run the pipeline steps
    df = fetch_github_features(df)
    df = add_derived_features(df)
    df = clean_data(df)
    df = generate_all_base_weights(df)

    return df
def run_full_pipeline(input_file, output_file="submission_enhanced.csv"):
    """
    Runs the complete DeepFunding Oracle pipeline.

    Args:
        input_file: Path to input CSV file
        output_file: Path for output CSV file

    Returns:
        The processed DataFrame with final_weight column
    """
    logging.info("--- Starting DeepFunding Oracle Pipeline ---")

    # Prepare dataset
    df = prepare_dataset(input_file)

    # Train model and predict weights
    df = train_predict_weight(df)

    # Normalize weights
    df = normalize_and_clip_weights(df)

    # Save results
    create_submission_csv(df, output_file)

    logging.info("--- Pipeline Completed Successfully ---")
    return df
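# Typical programmatic usage (hedged sketch; the module name below is an assumption,
# adjust the import to wherever this script lives in your project):
#   from DeepFundingOracle import run_full_pipeline
#   df = run_full_pipeline("input.csv", "submission_enhanced.csv")
#   print(df[["repo", "parent", "final_weight"]].head())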
##############################
# GitHub API helper: Fetch repository metrics
##############################
def fetch_repo_metrics(repo_url):
    """
    Fetches GitHub metrics, handling API pagination to get accurate
    contributor and pull request counts.
    """
    default_metrics = {
        "stars": 0,
        "forks": 0,
        "watchers": 0,
        "open_issues": 0,
        "pulls": 0,
        "activity": pd.NaT,
        "created_at": pd.NaT,
        "contributors": 0
    }
    try:
        m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
        if not m:
            logging.warning(f"Malformed GitHub URL: {repo_url}")
            return default_metrics

        owner, repo_name = m.group(1), re.sub(r"\.git$", "", m.group(2))
        api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
        headers = {}
        token = os.environ.get("GITHUB_API_TOKEN")
        if token:
            headers["Authorization"] = f"token {token}"

        r = requests.get(api_url, headers=headers, timeout=15)
        r.raise_for_status()
        data = r.json()

        def get_count_from_pagination(url, headers):
            """Reads the total item count from the 'last' page number in the Link header."""
            try:
                resp = requests.get(f"{url}?per_page=1", headers=headers, timeout=10)
                if resp.status_code == 200 and 'Link' in resp.headers:
                    match = re.search(r'page=(\d+)>; rel="last"', resp.headers['Link'])
                    if match:
                        return int(match.group(1))
                return len(resp.json()) if resp.status_code == 200 else 0
            except requests.exceptions.RequestException:
                return 0

        return {
            "stars": data.get("stargazers_count", 0),
            "forks": data.get("forks_count", 0),
            "watchers": data.get("subscribers_count", 0),  # subscribers_count is the true 'watch' metric
            "open_issues": data.get("open_issues_count", 0),
            "activity": pd.to_datetime(data.get("updated_at")),
            "created_at": pd.to_datetime(data.get("created_at")),
            "contributors": get_count_from_pagination(data['contributors_url'], headers),
            "pulls": get_count_from_pagination(data['pulls_url'].replace('{/number}', ''), headers)
        }
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch data for {repo_url}: {e}")
        return default_metrics
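# How the pagination trick above works (illustrative header value):
#   GET .../contributors?per_page=1  ->  Link: <...&page=57>; rel="last"
# With one item per page, the "last" page number equals the total count (57 contributors here).
# Note: counting via pulls_url only reflects open pull requests (GitHub's default state filter).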
def fetch_github_features(df):
    """Concurrently fetches GitHub features for all repositories in the DataFrame."""
    logging.info("Fetching GitHub features for repositories...")
    metrics_data = [None] * len(df)
    with ThreadPoolExecutor(max_workers=20) as executor:
        # Map each future back to its row position so results stay aligned with df
        # (as_completed yields futures in completion order, not submission order).
        future_to_idx = {executor.submit(fetch_repo_metrics, url): i for i, url in enumerate(df['repo'])}
        for future in tqdm(concurrent.futures.as_completed(future_to_idx), total=len(df), desc="Fetching GitHub Metrics"):
            metrics_data[future_to_idx[future]] = future.result()
    return pd.concat([df.reset_index(drop=True), pd.DataFrame(metrics_data)], axis=1)
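# Quick sanity check (hedged; needs network access and ideally GITHUB_API_TOKEN set):
#   >>> fetch_repo_metrics("https://github.com/pandas-dev/pandas")["stars"] > 0
#   True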
def add_derived_features(df):
    """
    RATIONALE (Recommendation 2): Adds derived temporal and interaction features like 'days_since_update'
    and 'stars_per_contributor' to give the model more powerful signals to learn from.
    """
    logging.info("Engineering derived features...")

    # Handle timestamps
    df['activity'] = pd.to_datetime(df['activity'], errors='coerce', utc=True)
    df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True)

    # Temporal features
    now = pd.Timestamp.now(tz='UTC')
    df['days_since_update'] = (now - df['activity']).dt.days
    df['repo_age_days'] = (now - df['created_at']).dt.days

    # Interaction and ratio features
    df['stars_per_contributor'] = df['stars'] / df['contributors'].clip(lower=1)
    df['forks_per_star'] = df['forks'] / df['stars'].clip(lower=1)
    df['pulls_per_contributor'] = df['pulls'] / df['contributors'].clip(lower=1)
    df['stars_per_day'] = df['stars'] / df['repo_age_days'].clip(lower=1)

    # Dependency features
    df['dependencies_count'] = df.groupby('parent')['repo'].transform('count')

    numeric_cols = df.select_dtypes(include=np.number).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    return df
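# Worked example (illustrative numbers): a repo with stars=1200, contributors=30 and
# repo_age_days=600 yields stars_per_contributor = 1200 / 30 = 40.0 and
# stars_per_day = 1200 / 600 = 2.0; clip(lower=1) only guards against division by zero.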
##############################
# Base Weight Generation (LLM)
##############################
def assign_base_weight_per_row(row, oracle):
    """
    RATIONALE: Asks the LLM to score each repo individually on a 0-1 scale based on its specific stats.
    This creates a much more accurate and nuanced target variable `base_weight` for the XGBoost model.
    """
    stats = (f"Stars: {int(row.get('stars', 0))}, "
             f"Contributors: {int(row.get('contributors', 0))}, "
             f"Pull Requests: {int(row.get('pulls', 0))}, "
             f"Days Since Last Update: {int(row.get('days_since_update', 0))}, "
             f"Repo Age (Days): {int(row.get('repo_age_days', 0))}")

    # The prompt asks for a score on a 0.0 to 1.0 scale.
    prompt = (
        "You are a venture capitalist analyzing open-source projects for the Ethereum ecosystem. "
        f"A project named '{row['repo'].split('/')[-1]}' is a dependency of '{row['parent'].split('/')[-1]}'. "
        "Here are its key metrics: "
        f"[{stats}]. "
        "Based on these metrics, evaluate its standalone importance, community health, and development velocity. "
        "On a scale of 0.0 (minor, easily replaceable utility) to 1.0 (critical, foundational dependency), how would you score this project's value to its parent? "
        "Provide ONLY the numeric score in your answer. Example: 0.8"
    )
    try:
        response = oracle.predict(prompt, max_new_tokens=10)
        match = re.search(r"(\d+(\.\d+)?)", response)
        if match:
            return max(0.0, min(1.0, float(match.group(1))))
        logging.warning(f"Could not parse score from LLM for {row['repo']}. Defaulting.")
        return 0.4
    except Exception as e:
        logging.error(f"LLM prediction failed for {row['repo']}: {e}. Defaulting.")
        return 0.4
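# Parsing example (illustrative LLM reply): for a response like "Score: 0.8",
# re.search(r"(\d+(\.\d+)?)", response) captures "0.8", which is then clamped to [0.0, 1.0].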
def generate_all_base_weights(df):
    """Applies the per-row LLM evaluation to the entire DataFrame."""
    logging.info("Assigning robust base weights using per-row LLM evaluation...")
    oracle = SmolLM()
    if not oracle.available:
        logging.error("Oracle (LLM) is not available. Falling back to composite score.")
        df['base_weight'] = (
            np.log1p(df['stars']) * 0.4 +
            np.log1p(df['contributors']) * 0.4 +
            np.log1p(df['pulls']) * 0.2
        )
        return df

    df['base_weight'] = df.progress_apply(lambda row: assign_base_weight_per_row(row, oracle), axis=1)

    # This normalization step is kept as a crucial safeguard. It ensures the final `base_weight`
    # is scaled relative to its siblings, even if the LLM isn't perfectly calibrated.
    # Groups where all scores are identical (max == min) fall back to a neutral 0.5 via fillna.
    df['base_weight'] = df.groupby("parent")["base_weight"].transform(
        lambda s: (s - s.min()) / (s.max() - s.min()) if s.max() > s.min() else pd.Series(np.nan, index=s.index)
    ).fillna(0.5)
    return df
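# Normalization example (illustrative): raw LLM scores [0.2, 0.6, 0.8] under one parent
# become [(0.2-0.2)/0.6, (0.6-0.2)/0.6, (0.8-0.2)/0.6] = [0.0, 0.667, 1.0].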
def normalize_and_clip_weights(df, group_col="parent", weight_col="final_weight"):
    """Ensures final weights are non-negative and sum to 1 per group."""
    logging.info("Normalizing final weights...")
    df[weight_col] = df[weight_col].clip(lower=0)
    group_sums = df.groupby(group_col)[weight_col].transform('sum')

    # Normalize where sum > 0
    df[weight_col] = np.where(group_sums > 0, df[weight_col] / group_sums, 0)

    # Handle groups where the sum was 0 by distributing weight equally
    zero_sum_parents = df[group_sums == 0][group_col].unique()
    for parent in zero_sum_parents:
        mask = df[group_col] == parent
        count = mask.sum()
        if count > 0:
            df.loc[mask, weight_col] = 1 / count
    return df
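# Example (illustrative): raw weights [2.0, 1.0, 1.0] under one parent normalize to
# [0.5, 0.25, 0.25]; a parent whose children all have weight 0 gets an equal 1/count split.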
##############################
# Data Cleaning
##############################
def clean_data(df):
    """
    INTEGRATED: Cleans the DataFrame by imputing missing values and clipping extreme
    outliers, which helps stabilize the model.
    """
    logging.info("Cleaning data and handling outliers...")
    numeric_cols = df.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        # Assign back rather than calling inplace fillna on a column slice,
        # which is unreliable under pandas copy-on-write.
        df[col] = df[col].fillna(df[col].median())

    # REFINED: Clip outliers using a wider percentile range (5% and 95%), which is often
    # more suitable for heavily skewed data like GitHub stats.
    for col in numeric_cols:
        if col not in ['repo_age_days', 'days_since_update']:  # Don't clip age features
            q_low = df[col].quantile(0.05)
            q_high = df[col].quantile(0.95)
            df[col] = df[col].clip(q_low, q_high)
    return df
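# Clipping example (illustrative): if the 5th/95th percentiles of 'stars' across the
# dataset are 3 and 12000, a repo with 250000 stars is capped at 12000 before modelling.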
##############################
# Model Training and Prediction
##############################
def train_predict_weight(df):
    """
    Trains an XGBoost Regressor with GroupKFold cross-validation and extensive hyperparameter tuning.
    """
    logging.info("Starting model training...")
    target_col = 'base_weight'
    if target_col not in df.columns or df[target_col].isnull().all():
        logging.error("Target column 'base_weight' is missing or all null. Aborting training.")
        df['final_weight'] = df['stars']
        return df

    drop_cols = ["repo", "parent", "activity", "created_at", target_col]
    feature_cols = [col for col in df.select_dtypes(include=np.number).columns if col not in drop_cols]
    X = df[feature_cols].copy().fillna(0)
    y = df[target_col]
    groups = df['parent']

    # RATIONALE (Recommendation 2): Log-transforming skewed input features helps the model by
    # making their distributions more normal, improving the performance of the regressor.
    for col in X.columns:
        if 'ratio' not in col and 'per' not in col and 'day' not in col:
            X[col] = np.log1p(X[col])

    pipeline = Pipeline([("scaler", RobustScaler()),
                         ("xgb", XGBRegressor(objective="reg:squarederror", n_jobs=-1, random_state=42, base_score=y.mean()))])

    param_dist = {
        'xgb__n_estimators': [100, 200, 300, 500],
        'xgb__max_depth': [3, 5, 7],
        'xgb__learning_rate': [0.01, 0.05, 0.1],
        'xgb__subsample': [0.7, 0.8, 0.9],
        'xgb__colsample_bytree': [0.7, 0.8, 0.9]
    }

    # RATIONALE (Recommendation 3): GroupKFold ensures that all repos from the same parent are in the
    # same fold. This prevents data leakage and gives a realistic measure of true performance.
    cv = GroupKFold(n_splits=5)

    # RATIONALE (Recommendation 4): Increasing n_iter explores more hyperparameter combinations,
    # increasing the chance of finding a better-performing model.
    search = RandomizedSearchCV(
        pipeline, param_distributions=param_dist, n_iter=25, cv=cv.split(X, y, groups),
        scoring="neg_root_mean_squared_error", verbose=1, n_jobs=-1, random_state=42
    )
    search.fit(X, y)
    best_model = search.best_estimator_
    logging.info(f"Best CV score (neg RMSE): {search.best_score_:.4f}")
    logging.info(f"Best parameters found: {search.best_params_}")

    raw_predictions = best_model.predict(X)
    # Apply a log transformation to raw predictions to stabilize variance before normalization.
    df['final_weight'] = np.log1p(raw_predictions - raw_predictions.min())
    return df
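# Optional inspection (hedged sketch, not part of the pipeline): the fitted XGBoost step
# can be pulled out of the pipeline to look at feature importances, e.g.
#   xgb = best_model.named_steps["xgb"]
#   top = sorted(zip(feature_cols, xgb.feature_importances_), key=lambda t: -t[1])[:5]
#   logging.info(f"Top features: {top}")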
##############################
# CSV Output
##############################
def create_submission_csv(df, output_filename="submission.csv"):
    """Saves the final predictions to a CSV file."""
    # The output column is "final_weight", matching the column computed in train_predict_weight.
    submission_df = df[["repo", "parent", "final_weight"]]
    logging.info(f"Writing final results to {output_filename}...")
    submission_df.to_csv(output_filename, index=False)
    logging.info(f"Successfully created {output_filename}.")
if __name__ == "__main__":
    if 'GITHUB_API_TOKEN' not in os.environ:
        logging.warning("GITHUB_API_TOKEN environment variable not set. API rate limits will be low.")

    input_file = "input.csv"
    output_file = "submission_enhanced.csv"

    if not os.path.exists(input_file):
        logging.error(f"Input file not found: {input_file}. Please create it with 'repo' and 'parent' columns.")
        sys.exit(1)

    logging.info("--- Starting DeepFunding Oracle - Enhanced Process ---")
    main_df = pd.read_csv(input_file)
    main_df = fetch_github_features(main_df)
    main_df = add_derived_features(main_df)
    main_df = generate_all_base_weights(main_df)  # LLM-based base weight step
    main_df = train_predict_weight(main_df)
    main_df = normalize_and_clip_weights(main_df)  # Final normalization step
    create_submission_csv(main_df, output_file)
    logging.info("--- Process Completed Successfully ---")