# Synthack-SyntaxSquad / src/data/preprocessing.py
# Bibek Mukherjee
# Upload 77 files
# 3efedb0 verified
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
def load_data(filepath):
    """Read the CSV at *filepath* and return it as a pandas DataFrame.

    *filepath* is handed directly to ``pandas.read_csv`` with default
    parsing options.
    """
    frame = pd.read_csv(filepath)
    return frame
def preprocess_data(df, target_column='loan_approved', test_size=0.2, random_state=42):
    """Split *df* into train/test sets and build an unfitted preprocessor.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_column: Name of the column to predict.
        test_size: Passed to ``train_test_split``; defaults to 0.2.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible; defaults to 42.

    Returns:
        A tuple ``(preprocessor, X_train, X_test, y_train, y_test)``.
        ``preprocessor`` is a ``ColumnTransformer`` that has NOT been fitted;
        the caller is expected to fit it (typically on ``X_train`` only).

    Raises:
        KeyError: If *target_column* is not a column of *df*.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in DataFrame")

    # Split features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Identify numeric and categorical columns.
    # Select every numeric width (int32, float32, uint8, ...) rather than only
    # int64/float64: a column matched by neither selector is handed to no
    # transformer, and ColumnTransformer's default remainder='drop' would
    # silently discard it. 'number' also matches timedelta columns, which
    # cannot be scaled, so those are excluded explicitly.
    numeric_features = X.select_dtypes(
        include=['number'], exclude=['timedelta']
    ).columns
    # Treat pandas 'category' columns as categorical alongside object columns.
    categorical_features = X.select_dtypes(include=['object', 'category']).columns

    # Create preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        # handle_unknown='ignore': categories not seen during fit do not
        # raise at transform time.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ]
    )

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return preprocessor, X_train, X_test, y_train, y_test