FelixPhilip committed
Commit cb06856 · 1 Parent(s): ea68d4a
Files changed (1)
  1. Oracle/deepfundingoracle.py +37 -33
Oracle/deepfundingoracle.py CHANGED
@@ -474,49 +474,53 @@ def train_predict_weight(df):
     X = df[feature_cols]
     y = df[target]
 
+    # Check for sufficient data and variance
+    if X.shape[0] < 5 or X.nunique().sum() <=1 or y.nunique() <=1:
+        print("[WARN] Not enough data or variance for model training. Using base weights directly.")
+        df["final_weight"] = df["base_weight"]
+        return df
+
     # Split data into train/test sets
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
+    # Check again after split
+    if X_train.shape[0] < 2 or y_train.nunique()<=1 or X_train.nunique().sum() <=1:
+        print("[WARN] Not enough data or variance for model training. Using base weights directly.")
+        df["final_weight"] = df["base_weight"]
+        return df
+
     # Hyperparameter tuning using GridSearchCV
     param_grid = {
-        "n_estimators": [100, 200, 300],
-        "max_depth": [10, 15, 20],
-        "min_samples_split": [2, 5, 10],
-        "min_samples_leaf": [1, 2, 4]
+        "n_estimators": [100, 200],
+        "max_depth": [10, 15],
+        "min_samples_split": [2, 5],
+        "min_samples_leaf": [1, 2]
     }
     rf = RandomForestRegressor(random_state=42)
     grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring="neg_mean_squared_error", verbose=2)
     grid_search.fit(X_train, y_train)
 
-    # Best model
-    best_rf = grid_search.best_estimator_
-    print(f"[INFO] Best parameters: {grid_search.best_params_}")
-
-    # Evaluate on test set
-    y_pred = best_rf.predict(X_test)
-    mse = mean_squared_error(y_test, y_pred)
-    print(f"[INFO] Test MSE: {mse}")
-
-    # Feature importance analysis
-    feature_importances = best_rf.feature_importances_
-    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
-    print("[INFO] Feature importances:")
-    print(importance_df)
-
-    # Drop irrelevant features
-    irrelevant_features = importance_df[importance_df["Importance"] < 0.01]["Feature"].tolist()
-    print(f"[INFO] Dropping irrelevant features: {irrelevant_features}")
-    df.drop(columns=irrelevant_features, inplace=True)
-
-    # Plot predictions vs. actual values
-    plt.scatter(y_test, y_pred, alpha=0.5)
-    plt.xlabel("Actual Base Weight")
-    plt.ylabel("Predicted Base Weight")
-    plt.title("Predictions vs. Actual")
-    plt.show()
-
-    # Assign predictions to DataFrame
-    df["final_weight"] = best_rf.predict(X)
+    try:
+        grid_search.fit(X_train, y_train)
+        best_rf = grid_search.best_estimator_
+        print(f"[INFO] Best parameters: {grid_search.best_params_}")
+
+        # Evaluate on test set
+        y_pred = best_rf.predict(X_test)
+        mse = mean_squared_error(y_test, y_pred)
+        print(f"[INFO] Test MSE: {mse}")
+
+        # Feature importance analysis
+        feature_importances = best_rf.feature_importances_
+        importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
+        print("[INFO] Feature importances:")
+        print(importance_df)
+
+        # Assign predictions to DataFrame
+        df["final_weight"] = best_rf.predict(X)
+    except Exception as e:
+        print(f"[ERROR] Model training failed: {e}")
+        df["final_weight"] = df["base_weight"]
 
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
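For context, here is a minimal, self-contained sketch of the pattern this commit introduces: guard clauses that fall back to a precomputed base_weight column when the data is too small or has no variance, a reduced hyperparameter grid, and a try/except around the grid search so any training failure also falls back instead of crashing. The function name, the synthetic DataFrame, and the choice of "base_weight" as the prediction target are assumptions made for illustration; only the guards, grid, and fallback logic mirror the diff above.

# Sketch of the guarded-training pattern from this commit (not the repo's actual code).
# Assumes numeric feature columns and a precomputed "base_weight" column to fall back on.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split


def train_predict_weight_sketch(df, feature_cols, target="base_weight"):
    X, y = df[feature_cols], df[target]

    # Guard: too few rows or no variance -> use base weights directly.
    if X.shape[0] < 5 or X.nunique().sum() <= 1 or y.nunique() <= 1:
        df["final_weight"] = df["base_weight"]
        return df

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Reduced grid keeps the search cheap on small datasets.
    param_grid = {
        "n_estimators": [100, 200],
        "max_depth": [10, 15],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
    }
    grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        cv=3,
        scoring="neg_mean_squared_error",
    )

    try:
        grid_search.fit(X_train, y_train)
        best_rf = grid_search.best_estimator_
        print(f"[INFO] Test MSE: {mean_squared_error(y_test, best_rf.predict(X_test))}")
        df["final_weight"] = best_rf.predict(X)
    except Exception as e:
        # Any training failure falls back to the base weights.
        print(f"[ERROR] Model training failed: {e}")
        df["final_weight"] = df["base_weight"]
    return df


if __name__ == "__main__":
    # Synthetic demo data, purely illustrative.
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        "stars": rng.integers(0, 500, 50),
        "forks": rng.integers(0, 100, 50),
    })
    demo["base_weight"] = (demo["stars"] + demo["forks"]) / 600
    out = train_predict_weight_sketch(demo, ["stars", "forks"])
    print(out[["base_weight", "final_weight"]].head())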