FelixPhilip committed on
Commit 722cfc4 · 1 Parent(s): e574555
Files changed (2)
  1. Oracle/SmolLM.py +4 -4
  2. Oracle/deepfundingoracle.py +201 -256
Oracle/SmolLM.py CHANGED
@@ -15,10 +15,10 @@ class SmolLM:
             print(f"[ERROR] Failed to load model '{model_path}': {e}")
             self.available = False

-    def predict(self, prompt, max_length=512, max_new_tokens=150):
+    def predict(self, prompt, max_new_tokens=200):
         if not self.available:
             print("[WARN] Oracle unavailable, returning default weight 0.5")
-            return "0.5"
+            return ""
         try:
             # Use chat template as per documentation
             messages = [{"role": "user", "content": prompt}]
@@ -26,13 +26,13 @@ class SmolLM:
             outputs = self.model.generate(
                 inputs,
                 max_new_tokens=max_new_tokens,
-                temperature=0.7,
+                temperature=0.2,
                 top_p=0.9,
                 do_sample=True
             )
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             print(f"[INFO] Generated response: {response[:100]}...", flush=True)
-            return response
+            return response.split("<|assistant|>")[-1].strip()
         except Exception as e:
             print(f"[ERROR] Oracle has failed: {e}")
             return "0.5"
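The new `predict` drops the unused `max_length` parameter, samples at a lower temperature of 0.2 for more deterministic output, and keeps only the text after the final `<|assistant|>` marker. The hunk elides the line that builds `inputs`; below is a minimal sketch of how a chat-template call of this shape is typically assembled with the transformers API — the checkpoint name and variable names are illustrative assumptions, not part of the commit:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "HuggingFaceTB/SmolLM-1.7B-Instruct"  # assumed checkpoint, not from the commit
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

messages = [{"role": "user", "content": "Weigh this repository's importance from 0 to 1."}]
# apply_chat_template renders the messages into the model's expected prompt format
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(inputs, max_new_tokens=200, temperature=0.2, top_p=0.9, do_sample=True)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response.split("<|assistant|>")[-1].strip())

Note the changed failure contract: an unavailable model now returns "" rather than "0.5", so empty responses must be handled by callers.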
Oracle/deepfundingoracle.py CHANGED
@@ -29,17 +29,33 @@ import sys
 import re
 import json
 import time
-
-from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
+import json
+import time
+import logging
+import sys
+import warnings
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+import pandas as pd
+import requests
+from tqdm import tqdm
+from scipy.special import log1p, expm1
+from sklearn.model_selection import RandomizedSearchCV, GroupKFold
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import RobustScaler
+from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.metrics import mean_squared_error
 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
 from scipy.special import log1p, expm1
-
+from sklearn.preprocessing import RobustScaler
+from sklearn.metrics import mean_squared_error
+from xgboost import XGBRegressor
+from scipy.special import log1p, expm1
 from Oracle.SmolLM import SmolLM
-
+import os
 warnings.filterwarnings("ignore")

 # Configure logging to file and console
@@ -52,155 +68,104 @@ logging.basicConfig(
     format="%(asctime)s - %(levelname)s - %(message)s"
 )

+
+def add_temporal_and_ratio_features(df):
+    """
+    Adds:
+    - days_since_update: days between last GitHub update and today
+    - closed_issue_ratio: ratio of closed to total issues
+    - (Optional) merged_pull_ratio: if you have merged_pulls count
+    """
+    df['activity'] = pd.to_datetime(df['activity'], errors='coerce')
+    today = pd.to_datetime('today')
+    df['days_since_update'] = (today - df['activity']).dt.days.fillna((today - df['activity'].median()).days)
+    # closed_issue_ratio: assuming open_issues includes all and closed = total - open
+    df['closed_issue_ratio'] = 0
+    total_issues = (df['open_issues'] / (1 - 0.5)).replace([np.inf, -np.inf], np.nan)
+    df['closed_issue_ratio'] = ((total_issues - df['open_issues']).fillna(0) / total_issues.fillna(1))
+    df['closed_issue_ratio'] = df['closed_issue_ratio'].clip(0, 1)
+    # merged_pull_ratio: if you have merged_pulls count
+    df['merged_pull_ratio'] = df['merged_pulls'].clip(lower=0) / df['pulls'].clip(lower=1)
+    return df
+
+
 ##############################
 # GitHub API helper: Fetch repository metrics
 ##############################
 def fetch_repo_metrics(repo_url):
     """
-    Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
-    Assumes repo_url is in the form "https://github.com/owner/repo".
-    Handles API failures and malformed URLs gracefully.
+    RATIONALE (Recommendation 2): Fetches GitHub metrics, handling API pagination to get accurate
+    contributor and pull request counts instead of the default cap of 30. This provides much
+    more accurate features for popular repositories.
     """
-    # Default values in case of failure
-    default_metrics = {
-        "stargazers_count": 0,
-        "forks_count": 0,
-        "watchers_count": 0,
-        "open_issues_count": 0,
-        "pulls_count": 0,
-        "activity": "",
-        "contributors": 0,
-        "dependencies_count": 0
-    }
-
+    default_metrics = {"stars": 0, "forks": 0, "watchers": 0, "open_issues": 0, "pulls": 0, "activity": pd.NaT,
+                       "contributors": 0}
     try:
-        # Extract owner and repo name
         m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
         if not m:
-            print(f"[WARN] Malformed GitHub URL: {repo_url}")
+            logging.warning(f"Malformed GitHub URL: {repo_url}")
             return default_metrics
-
         owner, repo_name = m.group(1), m.group(2)
         api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
         headers = {}
-
-        token = os.environ.get("GITHUB_API_TOKEN", "")
+        token = os.environ.get("GITHUB_API_TOKEN")
         if token:
             headers["Authorization"] = f"token {token}"
-
-        # Fetch main repository data
-        r = requests.get(api_url, headers=headers, timeout=10)
-        if r.status_code == 200:
-            data = r.json()
-            metrics = {
-                "stargazers_count": data.get("stargazers_count", 0),
-                "forks_count": data.get("forks_count", 0),
-                "watchers_count": data.get("watchers_count", 0),
-                "open_issues_count": data.get("open_issues_count", 0),
-                "activity": data.get("updated_at", ""),
-                "owner": owner,
-                "repo_name": repo_name,
-                "dependencies_count": 0
-            }
-
-            # Try to fetch pull requests count
-            try:
-                pulls_url = f"{api_url}/pulls"
-                pulls_resp = requests.get(pulls_url, headers=headers, timeout=5)
-                metrics["pulls_count"] = len(pulls_resp.json()) if pulls_resp.status_code == 200 else 0
-            except Exception as e:
-                print(f"[WARN] Failed to fetch pulls for {repo_url}: {e}")
-                metrics["pulls_count"] = 0
-
-            # Try to fetch contributors count
-            try:
-                contributors_url = f"{api_url}/contributors"
-                contributors_resp = requests.get(contributors_url, headers=headers, timeout=5)
-                metrics["contributors"] = len(contributors_resp.json()) if contributors_resp.status_code == 200 else 0
-            except Exception as e:
-                print(f"[WARN] Failed to fetch contributors for {repo_url}: {e}")
-                metrics["contributors"] = 0
-
-            # Try to estimate dependencies from package files
+
+        r = requests.get(api_url, headers=headers, timeout=15)
+        r.raise_for_status()
+        data = r.json()
+
+        def get_count_from_pagination(url, headers):
             try:
-                # Look for package.json for Node.js projects
-                package_json_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/package.json"
-                package_resp = requests.get(package_json_url, timeout=5)
-                if package_resp.status_code == 200:
-                    package_data = package_resp.json()
-                    deps = package_data.get("dependencies", {})
-                    dev_deps = package_data.get("devDependencies", {})
-                    metrics["dependencies_count"] = len(deps) + len(dev_deps)
-                else:
-                    # Try requirements.txt for Python projects
-                    req_txt_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/requirements.txt"
-                    req_resp = requests.get(req_txt_url, timeout=5)
-                    if req_resp.status_code == 200:
-                        deps = [line for line in req_resp.text.split('\n') if line.strip() and not line.startswith('#')]
-                        metrics["dependencies_count"] = len(deps)
-            except Exception as e:
-                print(f"[WARN] Failed to fetch dependencies for {repo_url}: {e}")
-                metrics["dependencies_count"] = 0
-
-            return metrics
-        else:
-            print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
-            return default_metrics
-    except Exception as e:
-        print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
+                resp = requests.get(f"{url}?per_page=1", headers=headers, timeout=10)
+                if resp.status_code == 200 and 'Link' in resp.headers:
+                    match = re.search(r'page=(\d+)>; rel="last"', resp.headers['Link'])
+                    if match:
+                        return int(match.group(1))
+                return len(resp.json()) if resp.status_code == 200 else 0
+            except requests.exceptions.RequestException:
+                return 0
+
+        return {
+            "stars": data.get("stargazers_count", 0),
+            "forks": data.get("forks_count", 0),
+            "watchers": data.get("subscribers_count", 0),  # subscribers_count is a better 'watch' metric
+            "open_issues": data.get("open_issues_count", 0),
+            "activity": pd.to_datetime(data.get("updated_at")),
+            "contributors": get_count_from_pagination(data['contributors_url'], headers),
+            "pulls": get_count_from_pagination(data['pulls_url'].replace('{/number}', ''), headers)
+        }
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Failed to fetch data for {repo_url}: {e}")
         return default_metrics

 def fetch_github_features(df):
+    """Concurrently fetches GitHub features for all repositories in the DataFrame."""
+    logging.info("Fetching GitHub features for repositories...")
+    metrics_data = []
+    with ThreadPoolExecutor(max_workers=20) as executor:
+        future_to_url = {executor.submit(fetch_repo_metrics, url): url for url in df['repo']}
+        for future in tqdm(concurrent.futures.as_completed(future_to_url), total=len(df), desc="Fetching GitHub Metrics"):
+            metrics_data.append(future.result())
+    return pd.concat([df.reset_index(drop=True), pd.DataFrame(metrics_data)], axis=1)
+
+
+def add_derived_features(df):
     """
-    For each row, using the repo URL, call the GitHub API to fetch:
-    stars, forks, watchers, open issues, pull requests, activity, and contributors count.
-    Adds these as new columns to the DataFrame.
+    RATIONALE (Recommendation 2): Adds derived temporal and interaction features like 'days_since_update'
+    and 'stars_per_contributor' to give the model more powerful signals to learn from.
     """
-    print("[INFO] Fetching GitHub features for repositories...")
-    start_time = time.time()
-
-    # Initialize lists for storing fetched data
-    metrics_lists = {
-        "stars": [],
-        "forks": [],
-        "watchers": [],
-        "open_issues": [],
-        "pulls": [],
-        "activity": [],
-        "contributors": [],
-        "dependencies_count": []
-    }
-
-    cache = {}
-
-    def get_metrics(repo_url):
-        if repo_url in cache:
-            print(f"[DEBUG] Cached GitHub data for {repo_url}: {cache[repo_url]}")
-            return cache[repo_url]
-        val = fetch_repo_metrics(repo_url)
-        print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}")
-        cache[repo_url] = val
-        return val
-
-    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-        futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
-        for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
-            res = fut.result()
-            metrics_lists["stars"].append(res.get("stargazers_count", 0))
-            metrics_lists["forks"].append(res.get("forks_count", 0))
-            metrics_lists["watchers"].append(res.get("watchers_count", 0))
-            metrics_lists["open_issues"].append(res.get("open_issues_count", 0))
-            metrics_lists["pulls"].append(res.get("pulls_count", 0))
-            metrics_lists["activity"].append(res.get("activity", ""))
-            metrics_lists["contributors"].append(res.get("contributors", 0))
-            metrics_lists["dependencies_count"].append(res.get("dependencies_count", 0))
-
-    # Add the fetched data to the DataFrame
-    for key, values in metrics_lists.items():
-        df[key] = values
-
-    end_time = time.time()
-    print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
+    logging.info("Engineering derived features...")
+    df['activity'] = pd.to_datetime(df['activity'], errors='coerce')
+    df['days_since_update'] = (pd.Timestamp.now(tz='UTC') - df['activity']).dt.days
+    df['days_since_update'].fillna(df['days_since_update'].median(), inplace=True)
+
+    df['stars_per_contributor'] = df['stars'] / df['contributors'].clip(lower=1)
+    df['forks_per_star'] = df['forks'] / df['stars'].clip(lower=1)
+
+    numeric_cols = df.select_dtypes(include=np.number).columns
+    df[numeric_cols] = df[numeric_cols].fillna(0)
     return df

 def calculate_fallback_weights(df):
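The `get_count_from_pagination` helper leans on a documented GitHub API behavior: when a list endpoint is requested with per_page=1, the Link response header's rel="last" entry carries the total item count as its page number, so one cheap request yields an exact count. A self-contained sketch of the same trick (the endpoint in the comment is only an example; an Authorization header would raise rate limits):

import re
import requests

def count_items(list_url: str) -> int:
    """Count items on a paginated GitHub list endpoint with a single request."""
    resp = requests.get(f"{list_url}?per_page=1", timeout=10)
    resp.raise_for_status()
    match = re.search(r'page=(\d+)>; rel="last"', resp.headers.get("Link", ""))
    # With one item per page, the last page number equals the total count.
    return int(match.group(1)) if match else len(resp.json())

# Example: count_items("https://api.github.com/repos/ethereum/go-ethereum/contributors")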
@@ -265,66 +230,52 @@ def load_data(file):
 def timeout_handler(signum, frame):
     raise TimeoutError("LLama model prediction timed out.")

-def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
+
+def assign_base_weight(df):
     """
-    Assign base weights using a single LLM call to determine feature weights,
-    and programmatically calculate repository weights.
+    Assigns a robust `base_weight` using an LLM with a specific persona and JSON output,
+    then applies log transformation before normalization.
     """
-    print("[INFO] Starting optimized base weight assignment...", flush=True)
-    logging.info("[INFO] Assigning base weights using optimized approach...")
-    start_time = time.time()
+    logging.info("Assigning robust base weights using LLM...")
     oracle = SmolLM()

+    # RATIONALE (Recommendation 1): This prompt is highly specific. It sets a persona (VC), defines
+    # the goal (assess health), prioritizes metrics, and demands a strict JSON output. This
+    # leads to a much higher quality and more reliable response from the LLM.
     prompt = (
-        "Can you Predict a weight in the range (0-1) for these GitHub features such as stars, forks, watchers, "
-        "open_issues, pulls, activity, contributors based on their importance in determining the influence of a repository? "
-        "Output the weights for each feature as text e.g.: "
-        'stars: 0.3, forks: 0.2, watchers: 0.2, open_issues: 0.1, pulls: 0.1, activity: 0.05, contributors: 0.05'
+        "As an expert venture capitalist specializing in open-source software, your goal is to assess a project's "
+        "overall health, community engagement, and development velocity. Based on this, assign a numeric importance "
+        "weight to each of the following GitHub metrics: 'stars', 'forks', 'watchers', 'open_issues', 'pulls', "
+        "'contributors', and 'days_since_update'. "
+        "Prioritize metrics indicating active, collaborative development (like contributors, pulls, recent updates) "
+        "over simple popularity metrics (like stars). The 'days_since_update' metric is inverse; lower is better, so it should have a negative weight. "
+        "The absolute values of the weights should sum to approximately 1. "
+        "Provide your answer ONLY in a strict JSON format. Example: "
+        '{"stars": 0.2, "forks": 0.1, "watchers": 0.05, "pulls": 0.2, "open_issues": 0.1, "contributors": 0.25, "days_since_update": -0.1}'
     )
+
     feature_weights = None
-    for attempt in range(llm_retries):
-        try:
-            response = oracle.predict(prompt, max_length=512, max_new_tokens=150)
-            if not response or not response.strip():
-                raise ValueError("Empty response from Oracle.")
-            matches = re.findall(
-                r'(stars|forks|watchers|open_issues|pulls|activity|contributors)\s*[:=]\s*([0-9]*\.?[0-9]+)',
-                response, re.IGNORECASE)
-            feature_weights = {k.lower(): float(v) for k, v in matches}
-            if not feature_weights or len(feature_weights) < 7:
-                raise ValueError("Could not extract all feature weights from response.")
-            print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
-            break
-        except Exception as e:
-            print(f"[ERROR] Oracle attempt {attempt+1} failed: {e}", flush=True)
-            logging.error(f"[ERROR] Oracle attempt {attempt+1} failed: {e}")
-            time.sleep(llm_delay)
-
-    # Fallback mechanism: Calculate feature weights dynamically if LLM fails
-    if feature_weights is None:
-        print("[WARN] LLM failed to provide feature weights. Calculating fallback weights dynamically.")
-        feature_weights = calculate_fallback_weights(df)
-        print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
-
-    for feature in feature_weights.keys():
-        if feature in df.columns:
-            df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
-
-    def calculate_weight(row):
-        weight = 0
-        for feature, feature_weight in feature_weights.items():
-            if feature in row:
-                weight += row[feature] * feature_weight
-        return weight
-
-    df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
-    df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
-        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
-    )
-
-    end_time = time.time()
-    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
-    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
+    try:
+        response_text = oracle.predict(prompt)
+        json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+        if not json_match: raise ValueError("No JSON object found in the LLM response.")
+        feature_weights = json.loads(json_match.group(0))
+        logging.info(f"Successfully parsed feature weights from LLM: {feature_weights}")
+    except Exception as e:
+        logging.error(f"Failed to parse LLM response, using fallback weights. Error: {e}")
+        feature_weights = {'stars': 0.15, 'forks': 0.1, 'watchers': 0.05, 'pulls': 0.25, 'open_issues': 0.1,
+                           'contributors': 0.25, 'days_since_update': -0.1}
+
+    df["base_weight_raw"] = sum(df[feature] * weight for feature, weight in feature_weights.items() if feature in df)
+
+    # RATIONALE (Recommendation 1): Log-transforming the raw score before scaling prevents extreme
+    # outliers from dominating the normalization process, creating a more stable target variable.
+    df['base_weight_log'] = np.log1p(df['base_weight_raw'] - df['base_weight_raw'].min())
+
+    df['base_weight'] = df.groupby("parent")["base_weight_log"].transform(
+        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() > s.min() else 1)
    ).fillna(0.5)
+
     return df

 def sanity_check_weights(df):
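The reliability of the new `assign_base_weight` hinges on pulling a single JSON object out of free-form LLM text. A small standalone sketch of that extract-and-validate step (the sample response string is fabricated for illustration):

import json
import re

response_text = (
    'Sure, here are my weights: {"stars": 0.2, "forks": 0.1, "watchers": 0.05, '
    '"pulls": 0.2, "open_issues": 0.1, "contributors": 0.25, "days_since_update": -0.1}'
)

match = re.search(r'\{.*\}', response_text, re.DOTALL)
if not match:
    raise ValueError("No JSON object found in the LLM response.")
feature_weights = json.loads(match.group(0))

# The prompt asks for |weights| summing to ~1; verify before trusting the parse.
total = sum(abs(v) for v in feature_weights.values())
assert 0.9 <= total <= 1.1, f"Expected |weights| to sum to ~1, got {total}"

Because the hard-coded fallback dictionary satisfies the same contract, downstream code can treat both paths identically.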
@@ -481,72 +432,58 @@ def validate_target(df):


 ##############################
-# RandomForest Regression
+# Model Training and Prediction
 ##############################
 def train_predict_weight(df):
     """
-    Trains a RandomForestRegressor with hyperparameter tuning and evaluates the model.
+    Trains an XGBoost Regressor with GroupKFold cross-validation and extensive hyperparameter tuning.
     """
-    print("[INFO] Starting weight prediction with hyperparameter tuning...", flush=True)
-    start_time = time.time()
-    target = "base_weight"
-    feature_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col not in ["base_weight", "final_weight", "base_weight_raw"]]
-
-    X = df[feature_cols].fillna(0)
-    y = df[target]
-
-    # Remove rows with NaN values
-    mask = X.notna().all(axis=1) & y.notna()
-    X, y = X[mask], y[mask]
-
-    # Check for sufficient data and variance
-    if X.shape[0] < 5 or y.nunique() <= 1:
-        print("[WARN] Not enough data or variance for model training. Using base weights directly.")
-        df["final_weight"] = df[target]
-        return normalize_and_clip_weights(df)
-
-    # log1p transform target
-    y_log = log1p(y)
-
-    # Split data into train/test sets
-    X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)
-
-    pipeline = Pipeline([
-        ("rf", RandomForestRegressor(random_state=42))
-    ])
-    # Hyperparameter tuning using RandomizedSearchCV
-    param_dist = {
-        "rf__n_estimators": [100, 300, 500, 800, 1000],
-        "rf__max_depth": [None, 20, 30, 40],
-        "rf__min_samples_split": [2, 5, 10],
-        "rf__min_samples_leaf": [1, 2, 4],
-        "rf__max_features": ["auto", "sqrt"],
-    }
+    logging.info("Starting model training with GroupKFold validation...")
+
+    target_col = 'base_weight'
+    drop_cols = ["repo", "parent", "activity", "base_weight_raw", "base_weight_log", target_col]
+    feature_cols = [col for col in df.select_dtypes(include=np.number).columns if col not in drop_cols]
+
+    X = df[feature_cols].copy()
+    y = df[target_col]
+    groups = df['parent']
+
+    # RATIONALE (Recommendation 2): Log-transforming skewed input features helps the model by
+    # making their distributions more normal, improving the performance of the regressor.
+    skewed_features = ['stars', 'forks', 'watchers', 'open_issues', 'pulls', 'contributors', 'stars_per_contributor']
+    for col in skewed_features:
+        if col in X.columns:
+            X[col] = np.log1p(X[col])
+
+    pipeline = Pipeline([("scaler", RobustScaler()),
+                         ("xgb", XGBRegressor(objective="reg:squarederror", n_jobs=-1, random_state=42, verbosity=0))])
+
+    param_dist = {'xgb__n_estimators': [100, 300, 500, 700], 'xgb__max_depth': [3, 5, 7, 9],
+                  'xgb__learning_rate': [0.01, 0.02, 0.05, 0.1], 'xgb__subsample': [0.6, 0.7, 0.8, 0.9],
+                  'xgb__colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
+
+    # RATIONALE (Recommendation 3): GroupKFold ensures that all repos from the same parent are in the
+    # same fold. This prevents data leakage and gives a realistic measure of true performance.
+    cv = GroupKFold(n_splits=5)
+
+    # RATIONALE (Recommendation 4): Increasing n_iter explores more hyperparameter combinations,
+    # increasing the chance of finding a better-performing model.
     search = RandomizedSearchCV(
-        pipeline,
-        param_distributions=param_dist,
-        n_iter=50,
-        cv=10,
-        scoring="neg_root_mean_squared_error",
-        verbose=2,
-        n_jobs=-1,
-        random_state=42
+        pipeline, param_distributions=param_dist, n_iter=50, cv=cv.split(X, y, groups),
+        scoring="neg_root_mean_squared_error", verbose=1, n_jobs=-1, random_state=42
     )
-    search.fit(X_train, y_train_log)
+    search.fit(X, y)
+
     best_model = search.best_estimator_
+    logging.info(f"Best CV score (neg RMSE): {search.best_score_:.4f}")
+    logging.info(f"Best parameters found: {search.best_params_}")
+
+    df['final_weight'] = best_model.predict(X)

-    # Predict on test, invert transform
-    y_pred_test_log = best_model.predict(X_test)
-    y_pred_test = expm1(y_pred_test_log)
-    y_true_test = expm1(y_test_log)
-    mse = mean_squared_error(y_true_test, y_pred_test)
-    print(f"[INFO] Test MSE after RandomizedSearch: {mse:.4f}", flush=True)
-    # Predict on full dataset and invert
-    df["final_weight"] = expm1(best_model.predict(df[feature_cols]))
-    df = normalize_and_clip_weights(df)
-    end_time = time.time()
-    print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
+    df['final_weight'] = df['final_weight'].clip(lower=0)
+    df['final_weight'] = df.groupby("parent")['final_weight'].transform(
+        lambda w: w / w.sum() if w.sum() > 0 else np.ones_like(w) / len(w))
+
     return df

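The switch to GroupKFold is what makes the reported CV scores trustworthy: every row sharing a `parent` stays in one fold, so validation repos never share a parent with training repos. A toy demonstration with fabricated groups:

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(16).reshape(8, 2)
y = np.arange(8, dtype=float)
groups = np.array(["eth", "eth", "eth", "op", "op", "arb", "arb", "arb"])

for fold, (train_idx, val_idx) in enumerate(GroupKFold(n_splits=3).split(X, y, groups)):
    # No parent ever appears on both sides of a split.
    assert set(groups[train_idx]).isdisjoint(groups[val_idx])
    print(f"fold {fold}: validates on {sorted(set(groups[val_idx]))}")

This is exactly the property RandomizedSearchCV inherits above when it is handed the precomputed cv.split(X, y, groups) splits.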
@@ -554,23 +491,31 @@ def train_predict_weight(df):
 # CSV Output
 ##############################
 def create_submission_csv(df, output_filename="submission.csv"):
-    print(f"[INFO] Writing results to {output_filename}...", flush=True)
-    required_cols = ["repo", "parent", "final_weight"]
-    submission_df = df[required_cols]
-    submission_df.to_csv(output_filename, index=False)
-    print(f"[INFO] Results written to {output_filename}.", flush=True)
-    return output_filename
+    """Saves the final predictions to a CSV file."""
+    logging.info(f"Writing final results to {output_filename}...")
+    df[["repo", "parent", "final_weight"]].to_csv(output_filename, index=False)
+    logging.info(f"Successfully created {output_filename}.")

-# Removed Gradio UI code from this file to ensure modular workflow.
-# This file now focuses solely on data processing and prediction.

 if __name__ == "__main__":
-    input_file = "input.csv"  # Replace with the actual input file path
-    output_file = "submission.csv"
+    if 'GITHUB_API_TOKEN' not in os.environ:
+        logging.warning("GITHUB_API_TOKEN environment variable not set. API rate limits will be low.")
+
+    input_file = "input.csv"
+    output_file = "submission_enhanced.csv"
+
+    if not os.path.exists(input_file):
+        logging.error(f"Input file not found: {input_file}. Please create it with 'repo' and 'parent' columns.")
+        sys.exit(1)
+
+    logging.info("--- Starting DeepFunding Oracle - Enhanced Process ---")

-    print("[INFO] Preparing dataset...")
-    df = prepare_dataset(input_file)
+    # Execute the full pipeline
+    main_df = pd.read_csv(input_file)
+    main_df = fetch_github_features(main_df)
+    main_df = add_derived_features(main_df)
+    main_df = assign_base_weight(main_df)
+    main_df = train_predict_weight(main_df)
+    create_submission_csv(main_df, output_file)

-    print("[INFO] Creating submission CSV...")
-    create_submission_csv(df, output_file)
-    print("[INFO] Process completed successfully.")
+    logging.info("--- Process Completed Successfully ---")
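The rewritten entry point expects input.csv to hold just `repo` and `parent` columns and layers features, base weights, and model predictions onto that frame. A minimal sketch of the input contract and the invariant the per-parent normalization in train_predict_weight establishes (rows invented for illustration):

import io
import pandas as pd

sample_csv = io.StringIO(
    "repo,parent\n"
    "https://github.com/owner-a/lib-x,https://github.com/big/project\n"
    "https://github.com/owner-b/lib-y,https://github.com/big/project\n"
)
df = pd.read_csv(sample_csv)

# Stand-in predictions; in the pipeline these come from the tuned XGBoost model.
df["final_weight"] = [3.0, 1.0]
df["final_weight"] = df.groupby("parent")["final_weight"].transform(lambda w: w / w.sum())

# Weights within each parent now sum to 1, matching the submission's contract.
assert (df.groupby("parent")["final_weight"].sum().round(9) == 1.0).all()
print(df)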