FelixPhilip committed on
Commit 607473c · 1 Parent(s): 911b780
Files changed (1)
  1. Oracle/deepfundingoracle.py +43 -58
Oracle/deepfundingoracle.py CHANGED
@@ -22,18 +22,21 @@ import logging
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 import signal
+
+from sklearn.pipeline import Pipeline
 from tqdm import tqdm
 import sys
 import re
 import json
 import time
 
-from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
+from scipy.special import log1p, expm1
 
 from Oracle.SmolLM import SmolLM
 
@@ -447,12 +450,9 @@ def validate_features(df):
     """
     print("[INFO] Validating and scaling features...")
     numeric_cols = df.select_dtypes(include=[np.number]).columns
-    scaler = StandardScaler()
-
-    # Log feature distributions
     for col in numeric_cols:
-        print(f"[DEBUG] Feature '{col}' - Mean: {df[col].mean()}, Std: {df[col].std()}, Min: {df[col].min()}, Max: {df[col].max()}")
-
+        df[col] = log1p(df[col].clip(lower=0))
+    scaler = StandardScaler()
     # Scale numeric features
     df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
     print("[INFO] Features scaled successfully.")
@@ -492,71 +492,56 @@ def train_predict_weight(df):
     target = "base_weight"
     feature_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col not in ["base_weight", "final_weight","base_weight_raw"]]
 
-    # Validate and scale features
-    df = validate_features(df)
-
-    # Validate target variable
-    df = validate_target(df)
-
-    X = df[feature_cols]
+    X = df[feature_cols].fillna(0)
     y = df[target]
 
-    # Remove columns with all NaN values
-    X = X.loc[:, X.notna().any()]
-    X = X.loc[:, X.nunique() > 1]
 
     # Remove rows with NaN values
     mask = X.notna().all(axis=1) & y.notna()
-    X= X[mask]
-    y = y[mask]
+    X, y = X[mask], y[mask]
 
     # Check for sufficient data and variance
-    if X.shape[0] < 5 or X.shape[1] == 0 or y.nunique() <=1:
+    if X.shape[0] < 5 or y.nunique() <= 1:
         print("[WARN] Not enough data or variance for model training. Using base weights directly.")
-        df["final_weight"] = df["base_weight"]
-        df = normalize_and_clip_weights(df)
-        return df
+        df["final_weight"] = df[target]
+        return normalize_and_clip_weights(df)
 
-    # Split data into train/test sets
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    # log1p-transform the target
+    y_log = log1p(y)
 
-    # Check again after split
-    if X_train.shape[0] < 2 or X_train.shape[1] == 0 or y_train.nunique() <= 1:
-        print("[WARN] Not enough data or variance for model training. Using base weights directly.")
-        df["final_weight"] = df["base_weight"]
-        return df
+    # Split data into train/test sets
+    X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)
 
+    pipeline = Pipeline([("rf", RandomForestRegressor(random_state=42))])
     # Hyperparameter tuning using GridSearchCV
-    param_grid = {
-        "n_estimators": [100, 200],
-        "max_depth": [10, 15],
-        "min_samples_split": [2, 5],
-        "min_samples_leaf": [1, 2]
+    param_dist = {
+        "rf__n_estimators": [100, 300, 500, 800, 1000],
+        "rf__max_depth": [None, 20, 30, 40],
+        "rf__min_samples_split": [2, 5, 10],
+        "rf__min_samples_leaf": [1, 2, 4],
+        "rf__max_features": [1.0, "sqrt"],
     }
-    rf = RandomForestRegressor(random_state=42)
-    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring="neg_mean_squared_error", verbose=2)
-
-    try:
-        grid_search.fit(X_train, y_train)
-        best_rf = grid_search.best_estimator_
-        print(f"[INFO] Best parameters: {grid_search.best_params_}")
-
-        # Evaluate on test set
-        y_pred = best_rf.predict(X_test)
-        mse = mean_squared_error(y_test, y_pred)
-        print(f"[INFO] Test MSE: {mse}")
-
-        # Feature importance analysis
-        feature_importances = best_rf.feature_importances_
-        importance_df = pd.DataFrame({"Feature": X_train.columns, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
-        print("[INFO] Feature importances:")
-        print(importance_df)
-
-        # Assign predictions to DataFrame
-        df["final_weight"] = best_rf.predict(df[X_train.columns].fillna(0))
-    except Exception as e:
-        print(f"[ERROR] Model training failed: {e}")
-        df["final_weight"] = df["base_weight"]
+    search = RandomizedSearchCV(
+        pipeline,
+        param_distributions=param_dist,
+        n_iter=50,
+        cv=10,
+        scoring="neg_root_mean_squared_error",
+        verbose=2,
+        n_jobs=-1,
+        random_state=42
+    )
+    search.fit(X_train, y_train_log)
+    best_model = search.best_estimator_
+
+    # Predict on test set, invert the transform
+    y_pred_test_log = best_model.predict(X_test)
+    y_pred_test = expm1(y_pred_test_log)
+    y_true_test = expm1(y_test_log)
+    mse = mean_squared_error(y_true_test, y_pred_test)
+    print(f"[INFO] Test MSE after RandomizedSearch: {mse:.4f}", flush=True)
+    # Predict on full dataset and invert
+    df["final_weight"] = expm1(best_model.predict(df[feature_cols].fillna(0)))
     df = normalize_and_clip_weights(df)
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
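A note on the `validate_features` change: heavy-tailed count features are compressed with `log1p` (clipped at zero so the log stays defined) before standard scaling. A minimal self-contained sketch of that transform, using `numpy.log1p` (equivalent to the `scipy.special.log1p` imported above) and a made-up `stars` column:

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Hypothetical heavy-tailed feature spanning several orders of magnitude
df = pd.DataFrame({"stars": [0, 3, 12, 150, 4800, 95000]})

# clip(lower=0) guards against negatives; log1p(0) == 0, so zeros survive
df["stars"] = np.log1p(df["stars"].clip(lower=0))
# then scale to zero mean / unit variance, as validate_features does
df[["stars"]] = StandardScaler().fit_transform(df[["stars"]])
print(df["stars"].round(2).tolist())
```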
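The new `train_predict_weight` inverts the log-target transform by hand, calling `expm1` after every `predict`. An alternative worth noting is scikit-learn's `TransformedTargetRegressor`, which wraps the same round trip; a minimal sketch assuming the same `log1p`/`expm1` pair and random forest as the commit (not what the commit actually uses):

```python
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor

# func is applied to y before fitting, inverse_func to every prediction,
# so predict() already returns values on the original weight scale.
model = TransformedTargetRegressor(
    regressor=RandomForestRegressor(random_state=42),
    func=np.log1p,
    inverse_func=np.expm1,
)
# model.fit(X_train, y_train); model.predict(X_test)  # no expm1 needed by the caller
```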
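On the pipeline construction: `Pipeline` takes a list of `(name, estimator)` tuples, and the step name becomes the prefix that search parameters are routed by, which is why the `param_dist` keys carry `rf__`. A short sketch with a hypothetical extra scaling step:

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The step name "rf" is the prefix RandomizedSearchCV uses to route
# "rf__*" parameters to the RandomForestRegressor.
pipeline = Pipeline([
    ("scale", StandardScaler()),                     # hypothetical extra step
    ("rf", RandomForestRegressor(random_state=42)),
])

# Every tunable parameter is exposed under its step prefix:
print([p for p in pipeline.get_params() if p.startswith("rf__")][:3])
```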