import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Separate features and target
X = train_data.drop(["id", "target"], axis=1)
y = train_data["target"]

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical features
cat_features = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Define hyperparameter space
learning_rates = [0.03, 0.1]
depths = [4, 6, 8]
n_estimators = [100, 500, 1000]

best_rmse = float("inf")
best_params = {}

# Grid search: score each hyperparameter combination by validation RMSE
for learning_rate in learning_rates:
    for depth in depths:
        for n_estimator in n_estimators:
            model = CatBoostRegressor(
                loss_function="RMSE",
                cat_features=cat_features,
                verbose=200,
                random_seed=42,
                learning_rate=learning_rate,
                depth=depth,
                n_estimators=n_estimator,
            )
            model.fit(
                X_train,
                y_train,
                eval_set=(X_val, y_val),
                early_stopping_rounds=50,
                use_best_model=True,
            )
            y_pred = model.predict(X_val)
            rmse = np.sqrt(mean_squared_error(y_val, y_pred))
            if rmse < best_rmse:
                best_rmse = rmse
                best_params = {
                    "learning_rate": learning_rate,
                    "depth": depth,
                    "n_estimators": n_estimator,
                }

# Train the model on the full training data with the best parameters
model = CatBoostRegressor(
    loss_function="RMSE",
    cat_features=cat_features,
    verbose=200,
    random_seed=42,
    **best_params,
)
model.fit(X, y)

# Predict on test data
test_predictions = model.predict(test_data.drop(["id"], axis=1))

# Save test predictions to file
submission = pd.DataFrame({"id": test_data["id"], "target": test_predictions})
submission.to_csv("./working/submission.csv", index=False)

print(f"Best Validation RMSE: {best_rmse}")
print(f"Best Parameters: {best_params}")