Commit 607473c · Parent: 911b780 · Oracle

Oracle/deepfundingoracle.py CHANGED (+43 -58)
@@ -22,18 +22,21 @@ import logging
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 import signal
+
+from sklearn.pipeline import Pipeline
 from tqdm import tqdm
 import sys
 import re
 import json
 import time

-from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns
+from scipy.special import log1p, expm1

 from Oracle.SmolLM import SmolLM
@@ -447,12 +450,9 @@ def validate_features(df):
     """
     print("[INFO] Validating and scaling features...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
-    scaler = StandardScaler()
-
-    # Log feature distributions
     for col in numeric_cols:
-        …
-
+        df[col] = log1p(df[col].clip(lower=0))
+    scaler = StandardScaler()
     # Scale numeric features
     df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
     print("[INFO] Features scaled successfully.")
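For reference, the transform this hunk introduces compresses heavy-tailed counts before standardizing them. A minimal standalone sketch (the `stars` column is hypothetical, and numpy's log1p behaves the same as the scipy.special ufunc the commit imports):

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    # Toy frame with a heavy right tail, as repo metrics typically have.
    df = pd.DataFrame({"stars": [0, 3, 10, 250, 12000]})
    # clip(lower=0) guards log1p: for x <= -1, log1p(x) is -inf or NaN.
    df["stars"] = np.log1p(df["stars"].clip(lower=0))
    df[["stars"]] = StandardScaler().fit_transform(df[["stars"]])
    print(df["stars"].round(2).tolist())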
@@ -492,71 +492,56 @@ def train_predict_weight(df):
     target = "base_weight"
     feature_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col not in ["base_weight", "final_weight", "base_weight_raw"]]

-
-    df = validate_features(df)
-
-    # Validate target variable
-    df = validate_target(df)
-
-    X = df[feature_cols]
+    X = df[feature_cols].fillna(0)
     y = df[target]

-    # Remove columns with all NaN values
-    X = X.loc[:, X.notna().any()]
-    X = X.loc[:, X.nunique() > 1]

     # Remove rows with NaN values
     mask = X.notna().all(axis=1) & y.notna()
-    X = X[mask]
-    y = y[mask]
+    X, y = X[mask], y[mask]

     # Check for sufficient data and variance
-    if X.shape[0] < 5 or …
+    if X.shape[0] < 5 or y.nunique() <= 1:
         print("[WARN] Not enough data or variance for model training. Using base weights directly.")
-        df["final_weight"] = df["base_weight"]
-
-        return df
+        df["final_weight"] = df[target]
+        return normalize_and_clip_weights(df)

-    # …
-
-    # …
-
-        print("[WARN] Not enough data or variance for model training. Using base weights directly.")
-        df["final_weight"] = df["base_weight"]
-        return df
+    # log1p transform target
+    y_log = log1p(y)
+
+    # Split data into train/test sets
+    X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

+    pipeline = Pipeline([("rf", RandomForestRegressor(random_state=42))])
     # Hyperparameter tuning using GridSearchCV
-    param_grid = {
-        "…
-        "…
-        "…
-        "…
+    param_dist = {
+        "rf__n_estimators": [100, 300, 500, 800, 1000],
+        "rf__max_depth": [None, 20, 30, 40],
+        "rf__min_samples_split": [2, 5, 10],
+        "rf__min_samples_leaf": [1, 2, 4],
+        "rf__max_features": ["auto", "sqrt"],
     }
-    …
-    except Exception as e:
-        print(f"[ERROR] Model training failed: {e}")
-        df["final_weight"] = df["base_weight"]
+    search = RandomizedSearchCV(
+        pipeline,
+        param_distributions=param_dist,
+        n_iter=50,
+        cv=10,
+        scoring="neg_root_mean_squared_error",
+        verbose=2,
+        n_jobs=-1,
+        random_state=42
+    )
+    search.fit(X_train, y_train_log)
+    best_model = search.best_estimator_
+
+    # Predict on test, invert transform
+    y_pred_test_log = best_model.predict(X_test)
+    y_pred_test = expm1(y_pred_test_log)
+    y_true_test = expm1(y_test_log)
+    mse = mean_squared_error(y_true_test, y_pred_test)
+    print(f"[INFO] Test MSE after RandomizedSearch: {mse:.4f}", flush=True)
+    # Predict on full dataset and invert
+    df["final_weight"] = expm1(best_model.predict(df[feature_cols]))
     df = normalize_and_clip_weights(df)
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
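Note on the Pipeline line above: sklearn's Pipeline takes a list of (name, estimator) tuples, and a search over a pipeline addresses each step's parameters as "<step>__<param>", which is why the distributions carry the rf__ prefix. A reduced sketch of the same pattern (X_train and y_train_log as defined in the hunk):

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.pipeline import Pipeline

    # The step name "rf" is what the rf__* keys refer to.
    pipe = Pipeline([("rf", RandomForestRegressor(random_state=42))])
    search = RandomizedSearchCV(
        pipe,
        param_distributions={"rf__n_estimators": [100, 300]},
        n_iter=2,
        cv=3,
        random_state=42,
    )
    # search.fit(X_train, y_train_log) then tunes n_estimators on the "rf" step.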
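The manual log1p/expm1 round trip on the target could also be delegated to scikit-learn itself. A sketch, assuming the same RandomForest setup, using TransformedTargetRegressor so predictions come back on the original scale automatically:

    import numpy as np
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.ensemble import RandomForestRegressor

    # func is applied to y before fitting, inverse_func to every prediction,
    # replacing the explicit log1p(y)/expm1(...) bookkeeping in the commit.
    model = TransformedTargetRegressor(
        regressor=RandomForestRegressor(random_state=42),
        func=np.log1p,
        inverse_func=np.expm1,
    )
    # model.fit(X_train, y_train); model.predict(X_test) already returns
    # values on the original scale.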