Spaces:

FelixPhilip
/

DeepFundingOracle

Running

App Files Files Community

FelixPhilip committed on May 4

Commit

17c5050

1 Parent(s): 3388ab8

Oracle

Browse files

Files changed (1) hide show

Oracle/deepfundingoracle.py +48 -1

Oracle/deepfundingoracle.py CHANGED Viewed

@@ -31,6 +31,7 @@ import time
 from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -350,6 +351,42 @@ def clean_data(df):
     return df
 ##############################
 #  RandomForest Regression
 ##############################
@@ -362,6 +399,12 @@ def train_predict_weight(df):
     target = "base_weight"
     feature_cols = [col for col in df.columns if col not in ["repo", "parent", "base_weight", "final_weight"]]
     X = df[feature_cols]
     y = df[target]
@@ -394,6 +437,11 @@ def train_predict_weight(df):
     print("[INFO] Feature importances:")
     print(importance_df)
     # Plot predictions vs. actual values
     plt.scatter(y_test, y_pred, alpha=0.5)
     plt.xlabel("Actual Base Weight")
@@ -433,4 +481,3 @@ if __name__ == "__main__":
     print("[INFO] Creating submission CSV...")
     create_submission_csv(df, output_file)
     print("[INFO] Process completed successfully.")

 from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
+from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
     return df
+##############################
+#  Feature Validation and Scaling
+##############################
def validate_features(df, exclude=("base_weight", "final_weight")):
    """
    Logs the distribution of every numeric column and standardizes the
    numeric feature columns in place.

    Parameters:
        df: pandas DataFrame holding features and (optionally) the target
            columns. It is modified in place and also returned.
        exclude: names of numeric columns that must NOT be scaled. Defaults to
            the target columns, which the training code also excludes from its
            feature list; scaling them would distort the target and defeat any
            later variance check on it.

    Returns:
        The same DataFrame object, with the selected columns standardized.
    """
    print("[INFO] Validating and scaling features...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # Log feature distributions (all numeric columns, including excluded ones,
    # so the raw target statistics remain visible in the debug output).
    for col in numeric_cols:
        print(f"[DEBUG] Feature '{col}' - Mean: {df[col].mean()}, Std: {df[col].std()}, Min: {df[col].min()}, Max: {df[col].max()}")
    excluded = set(exclude)
    scale_cols = [col for col in numeric_cols if col not in excluded]
    # The scaler raises on zero rows or zero columns; there is nothing
    # meaningful to scale in that case, so return the frame untouched.
    if not scale_cols or df.empty:
        print("[INFO] No numeric feature columns to scale; skipping scaling.")
        return df
    # Scale numeric features
    scaler = StandardScaler()
    df[scale_cols] = scaler.fit_transform(df[scale_cols])
    print("[INFO] Features scaled successfully.")
    return df
def validate_target(df, min_variance=1e-6):
    """
    Validates the target variable to ensure it has sufficient variance.

    Parameters:
        df: pandas DataFrame expected to contain a 'base_weight' column.
        min_variance: smallest acceptable sample variance of the target.

    Returns:
        The DataFrame unchanged, so the call can be chained.

    Raises:
        ValueError: if 'base_weight' is missing, or if its variance is below
            ``min_variance`` or undefined (NaN, e.g. fewer than two rows).
    """
    print("[INFO] Validating target variable 'base_weight'...")
    target = "base_weight"
    if target not in df.columns:
        raise ValueError(f"Target variable '{target}' not found in DataFrame.")
    variance = df[target].var()
    print(f"[DEBUG] Target variable variance: {variance}")
    # Written as "not >=" rather than "<" on purpose: every comparison with NaN
    # is False, so a NaN variance (empty or single-row target) is rejected
    # here instead of slipping through.
    if not variance >= min_variance:
        raise ValueError(f"Target variable '{target}' has insufficient variance.")
    return df
 ##############################
 #  RandomForest Regression
 ##############################
     target = "base_weight"
     feature_cols = [col for col in df.columns if col not in ["repo", "parent", "base_weight", "final_weight"]]
+    # Validate and scale features
+    df = validate_features(df)
+    # Validate target variable
+    df = validate_target(df)
     X = df[feature_cols]
     y = df[target]
     print("[INFO] Feature importances:")
     print(importance_df)
+    # Drop irrelevant features
+    irrelevant_features = importance_df[importance_df["Importance"] < 0.01]["Feature"].tolist()
+    print(f"[INFO] Dropping irrelevant features: {irrelevant_features}")
+    df.drop(columns=irrelevant_features, inplace=True)
     # Plot predictions vs. actual values
     plt.scatter(y_test, y_pred, alpha=0.5)
     plt.xlabel("Actual Base Weight")
     print("[INFO] Creating submission CSV...")
     create_submission_csv(df, output_file)
     print("[INFO] Process completed successfully.")