FelixPhilip committed on
Commit 607473c · 1 Parent(s): 911b780
Files changed (1)
  1. Oracle/deepfundingoracle.py +43 -58
Oracle/deepfundingoracle.py CHANGED
@@ -22,18 +22,21 @@ import logging
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 import signal
+
+from sklearn.pipeline import Pipeline
 from tqdm import tqdm
 import sys
 import re
 import json
 import time
 
-from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
+from scipy.special import log1p, expm1
 
 from Oracle.SmolLM import SmolLM
 
@@ -447,12 +450,9 @@ def validate_features(df):
     """
     print("[INFO] Validating and scaling features...")
     numeric_cols = df.select_dtypes(include=[np.number]).columns
-    scaler = StandardScaler()
-
-    # Log feature distributions
     for col in numeric_cols:
-        print(f"[DEBUG] Feature '{col}' - Mean: {df[col].mean()}, Std: {df[col].std()}, Min: {df[col].min()}, Max: {df[col].max()}")
-
+        df[col] = log1p(df[col].clip(lower=0))
+    scaler = StandardScaler()
     # Scale numeric features
     df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
     print("[INFO] Features scaled successfully.")
@@ -492,71 +492,56 @@ def train_predict_weight(df):
     target = "base_weight"
     feature_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col not in ["base_weight", "final_weight","base_weight_raw"]]
 
-    # Validate and scale features
-    df = validate_features(df)
-
-    # Validate target variable
-    df = validate_target(df)
-
-    X = df[feature_cols]
+    X = df[feature_cols].fillna(0)
     y = df[target]
 
-    # Remove columns with all NaN values
-    X = X.loc[:, X.notna().any()]
-    X = X.loc[:, X.nunique() > 1]
 
     # Remove rows with NaN values
     mask = X.notna().all(axis=1) & y.notna()
-    X= X[mask]
-    y = y[mask]
+    X, y = X[mask], y[mask]
 
     # Check for sufficient data and variance
-    if X.shape[0] < 5 or X.shape[1] == 0 or y.nunique() <=1:
+    if X.shape[0] < 5 or y.nunique() <= 1:
         print("[WARN] Not enough data or variance for model training. Using base weights directly.")
-        df["final_weight"] = df["base_weight"]
-        df = normalize_and_clip_weights(df)
-        return df
+        df["final_weight"] = df[target]
+        return normalize_and_clip_weights(df)
 
-    # Split data into train/test sets
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    # log1p-transform the target
+    y_log = log1p(y)
 
-    # Check again after split
-    if X_train.shape[0] < 2 or X_train.shape[1] == 0 or y_train.nunique() <= 1:
-        print("[WARN] Not enough data or variance for model training. Using base weights directly.")
-        df["final_weight"] = df["base_weight"]
-        return df
+    # Split data into train/test sets
+    X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)
 
+    pipeline = Pipeline([("rf", RandomForestRegressor(random_state=42))])
     # Hyperparameter tuning using GridSearchCV
-    param_grid = {
-        "n_estimators": [100, 200],
-        "max_depth": [10, 15],
-        "min_samples_split": [2, 5],
-        "min_samples_leaf": [1, 2]
+    param_dist = {
+        "rf__n_estimators": [100, 300, 500, 800, 1000],
+        "rf__max_depth": [None, 20, 30, 40],
+        "rf__min_samples_split": [2, 5, 10],
+        "rf__min_samples_leaf": [1, 2, 4],
+        "rf__max_features": [1.0, "sqrt"],
     }
-    rf = RandomForestRegressor(random_state=42)
-    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring="neg_mean_squared_error", verbose=2)
-
-    try:
-        grid_search.fit(X_train, y_train)
-        best_rf = grid_search.best_estimator_
-        print(f"[INFO] Best parameters: {grid_search.best_params_}")
-
-        # Evaluate on test set
-        y_pred = best_rf.predict(X_test)
-        mse = mean_squared_error(y_test, y_pred)
-        print(f"[INFO] Test MSE: {mse}")
-
-        # Feature importance analysis
-        feature_importances = best_rf.feature_importances_
-        importance_df = pd.DataFrame({"Feature": X_train.columns, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
-        print("[INFO] Feature importances:")
-        print(importance_df)
-
-        # Assign predictions to DataFrame
-        df["final_weight"] = best_rf.predict(df[X_train.columns].fillna(0))
-    except Exception as e:
-        print(f"[ERROR] Model training failed: {e}")
-        df["final_weight"] = df["base_weight"]
+    search = RandomizedSearchCV(
+        pipeline,
+        param_distributions=param_dist,
+        n_iter=50,
+        cv=10,
+        scoring="neg_root_mean_squared_error",
+        verbose=2,
+        n_jobs=-1,
+        random_state=42
+    )
+    search.fit(X_train, y_train_log)
+    best_model = search.best_estimator_
+
+    # Predict on test set, invert the transform
+    y_pred_test_log = best_model.predict(X_test)
+    y_pred_test = expm1(y_pred_test_log)
+    y_true_test = expm1(y_test_log)
+    mse = mean_squared_error(y_true_test, y_pred_test)
+    print(f"[INFO] Test MSE after RandomizedSearch: {mse:.4f}", flush=True)
+    # Predict on full dataset and invert
+    df["final_weight"] = expm1(best_model.predict(df[feature_cols].fillna(0)))
     df = normalize_and_clip_weights(df)
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
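A note on the `validate_features` change: heavy-tailed count features are compressed with `log1p` (clipped at zero so the log stays defined) before standard scaling. A minimal self-contained sketch of that transform, using `numpy.log1p` (equivalent to the `scipy.special.log1p` imported above) and a made-up `stars` column:

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Hypothetical heavy-tailed feature spanning several orders of magnitude
df = pd.DataFrame({"stars": [0, 3, 12, 150, 4800, 95000]})

# clip(lower=0) guards against negatives; log1p(0) == 0, so zeros survive
df["stars"] = np.log1p(df["stars"].clip(lower=0))
# then scale to zero mean / unit variance, as validate_features does
df[["stars"]] = StandardScaler().fit_transform(df[["stars"]])
print(df["stars"].round(2).tolist())
```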
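The new `train_predict_weight` inverts the log-target transform by hand, calling `expm1` after every `predict`. An alternative worth noting is scikit-learn's `TransformedTargetRegressor`, which wraps the same round trip; a minimal sketch assuming the same `log1p`/`expm1` pair and random forest as the commit (not what the commit actually uses):

```python
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor

# func is applied to y before fitting, inverse_func to every prediction,
# so predict() already returns values on the original weight scale.
model = TransformedTargetRegressor(
    regressor=RandomForestRegressor(random_state=42),
    func=np.log1p,
    inverse_func=np.expm1,
)
# model.fit(X_train, y_train); model.predict(X_test)  # no expm1 needed by the caller
```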
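On the pipeline construction: `Pipeline` takes a list of `(name, estimator)` tuples, and the step name becomes the prefix that search parameters are routed by, which is why the `param_dist` keys carry `rf__`. A short sketch with a hypothetical extra scaling step:

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The step name "rf" is the prefix RandomizedSearchCV uses to route
# "rf__*" parameters to the RandomForestRegressor.
pipeline = Pipeline([
    ("scale", StandardScaler()),                     # hypothetical extra step
    ("rf", RandomForestRegressor(random_state=42)),
])

# Every tunable parameter is exposed under its step prefix:
print([p for p in pipeline.get_params() if p.startswith("rf__")][:3])
```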