|
import pandas as pd |
|
from sklearn.ensemble import RandomForestRegressor |
|
from sklearn.metrics import mean_squared_error |
|
from sklearn.model_selection import train_test_split |
|
import numpy as np |
|
|
|
|
|
# Read only the first 500,000 rows of the training file.
train_df = pd.read_csv("./input/train.csv", nrows=500000)

# Discard every row that has at least one missing value.
train_df = train_df.dropna(axis="rows", how="any")

# Row-wise sanity masks; all are evaluated on the NaN-free frame and
# combined into a single selection.
fare_in_range = train_df["fare_amount"].between(2.5, 500)
passengers_in_range = (train_df["passenger_count"] > 0) & (
    train_df["passenger_count"] <= 6
)
# A point is rejected only when BOTH of its coordinates are exactly zero.
pickup_is_origin = (train_df["pickup_latitude"] == 0) & (
    train_df["pickup_longitude"] == 0
)
dropoff_is_origin = (train_df["dropoff_latitude"] == 0) & (
    train_df["dropoff_longitude"] == 0
)

train_df = train_df[
    fare_in_range & passengers_in_range & ~pickup_is_origin & ~dropoff_is_origin
]
|
|
|
|
|
|
|
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """Return the great-circle (haversine) distance between two points.

    All coordinate arguments are interpreted as degrees and may be scalars,
    NumPy arrays, or pandas Series (anything NumPy ufuncs accept); array
    inputs are processed element-wise.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.
        radius_km: Sphere radius. Defaults to 6371, the Earth's mean radius
            in kilometres; the result is expressed in the same unit.

    Returns:
        The distance along the sphere's surface, with the same shape/type
        as the broadcast inputs. NaN inputs yield NaN outputs.
    """
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = (
        np.sin(delta_phi / 2) ** 2
        + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    )
    # Mathematically `a` lies in [0, 1], but floating-point rounding can push
    # it marginally outside that range, which would make one of the square
    # roots below return NaN. Clamp it first (NaN inputs still propagate).
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius_km * c
|
|
|
|
|
# Parse the pickup timestamp, then derive one calendar column per component.
train_df["pickup_datetime"] = pd.to_datetime(train_df["pickup_datetime"])
pickup_time = train_df["pickup_datetime"].dt
for part in ("year", "month", "day", "hour", "weekday"):
    train_df[part] = getattr(pickup_time, part)

# Great-circle distance between the pickup and drop-off coordinates.
train_df["distance"] = haversine_distance(
    lat1=train_df["pickup_latitude"],
    lon1=train_df["pickup_longitude"],
    lat2=train_df["dropoff_latitude"],
    lon2=train_df["dropoff_longitude"],
)
|
|
|
|
|
# Model inputs, grouped by origin; the final column order is
# calendar parts, passenger count, raw coordinates, then distance.
calendar_columns = ["year", "month", "day", "hour", "weekday"]
coordinate_columns = [
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
]
features = [*calendar_columns, "passenger_count", *coordinate_columns, "distance"]

# Column the model is trained to predict.
target = "fare_amount"

X = train_df.loc[:, features]
y = train_df[target]
|
|
|
|
|
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the random forest on the training portion (`fit` returns the estimator).
forest_params = {"n_estimators": 50, "max_depth": 25, "random_state": 42}
rf = RandomForestRegressor(**forest_params).fit(X_train, y_train)

# Score the held-out rows and report the root-mean-squared error.
y_pred = rf.predict(X_val)
validation_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(validation_mse)
print(f"Validation RMSE: {rmse}")
|
|
|
|
|
# Load the test set and derive the same calendar features used for training.
test_df = pd.read_csv("./input/test.csv")
test_df["pickup_datetime"] = pd.to_datetime(test_df["pickup_datetime"])
test_df["year"] = test_df["pickup_datetime"].dt.year
test_df["month"] = test_df["pickup_datetime"].dt.month
test_df["day"] = test_df["pickup_datetime"].dt.day
test_df["hour"] = test_df["pickup_datetime"].dt.hour
test_df["weekday"] = test_df["pickup_datetime"].dt.weekday

# Fill missing test coordinates with the median of the (cleaned) training data.
for feature in [
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
]:
    median_value = train_df[feature].median()
    # Assign the filled column back explicitly. Calling
    # `test_df[feature].fillna(..., inplace=True)` is chained assignment: it
    # acts on a temporary Series, emits a FutureWarning in pandas 2.x and,
    # with Copy-on-Write enabled, leaves `test_df` unchanged.
    test_df[feature] = test_df[feature].fillna(median_value)

# Distance is computed after imputation so it never sees missing coordinates.
test_df["distance"] = haversine_distance(
    test_df["pickup_latitude"],
    test_df["pickup_longitude"],
    test_df["dropoff_latitude"],
    test_df["dropoff_longitude"],
)
|
|
|
|
|
# Predict fares for the test rows using the same feature columns as training.
X_test = test_df.loc[:, features]
test_predictions = rf.predict(X_test)
test_df["fare_amount"] = test_predictions

# The submission file holds only the row key and the predicted fare.
submission_columns = ["key", "fare_amount"]
submission = test_df[submission_columns]
submission.to_csv("./working/submission.csv", index=False)
|
|