File size: 4,596 Bytes
9d99cff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

class LoanDataProcessor:
    """
    Class for preprocessing loan data for machine learning models.

    Numerical columns are median-imputed and standardized; categorical
    columns are imputed with the most frequent value and one-hot encoded.
    Both branches are combined in a single scikit-learn ``ColumnTransformer``
    stored on ``self.preprocessor`` once ``fit`` has succeeded.
    """

    # Default column sets, used when the constructor is given no overrides.
    DEFAULT_CATEGORICAL_FEATURES = ('gender', 'employment_status', 'payment_history')
    DEFAULT_NUMERICAL_FEATURES = ('age', 'annual_income', 'credit_score', 'loan_amount',
                                  'interest_rate', 'loan_term', 'days_past_due',
                                  'previous_defaults', 'monthly_payment', 'debt_to_income')

    def __init__(self, categorical_features=None, numerical_features=None):
        """
        Initialize the data processor.

        Parameters:
        -----------
        categorical_features : iterable of str, optional
            Columns to one-hot encode. Defaults to
            ``DEFAULT_CATEGORICAL_FEATURES``.
        numerical_features : iterable of str, optional
            Columns to impute and scale. Defaults to
            ``DEFAULT_NUMERICAL_FEATURES``.
        """
        # None until fit() succeeds; transform()/get_feature_names() rely on this.
        self.preprocessor = None
        # Copy into fresh lists so instances never share (or alias) caller state.
        self.categorical_features = list(
            self.DEFAULT_CATEGORICAL_FEATURES if categorical_features is None
            else categorical_features)
        self.numerical_features = list(
            self.DEFAULT_NUMERICAL_FEATURES if numerical_features is None
            else numerical_features)

    def _require_fitted(self):
        """Raise ValueError if fit() has not completed successfully yet."""
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

    def _validate_columns(self, X):
        """Raise ValueError naming every configured column absent from X."""
        columns = getattr(X, 'columns', None)
        if columns is None:
            # Not a DataFrame-like object; let scikit-learn report the problem.
            return
        present = set(columns)
        missing = [name for name in self.numerical_features + self.categorical_features
                   if name not in present]
        if missing:
            raise ValueError(f"Input data is missing required columns: {missing}")

    def fit(self, X, y=None):
        """
        Fit the preprocessor on the training data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The training data
        y : ignored
            Accepted only for compatibility with the scikit-learn
            ``fit(X, y)`` calling convention.

        Returns:
        --------
        self : LoanDataProcessor
            The fitted processor

        Raises:
        -------
        ValueError
            If X lacks any of the configured feature columns.
        """
        self._validate_columns(X)

        # Define preprocessing for numerical features
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Define preprocessing for categorical features
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine preprocessing steps. sparse_threshold=0 forces a dense
        # result, so transform() returns a numpy.ndarray as documented even
        # when the one-hot block is wide and mostly zeros.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            sparse_threshold=0)

        # Fit first, publish second: if fitting raises, self.preprocessor keeps
        # its previous value instead of pointing at an unfitted transformer
        # that would slip past the "not fitted" guard.
        preprocessor.fit(X)
        self.preprocessor = preprocessor

        return self

    def transform(self, X):
        """
        Transform the data using the fitted preprocessor.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to transform

        Returns:
        --------
        numpy.ndarray
            The transformed data

        Raises:
        -------
        ValueError
            If fit() has not been called.
        """
        self._require_fitted()
        return self.preprocessor.transform(X)

    def fit_transform(self, X, y=None):
        """
        Fit the preprocessor and transform the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to fit and transform
        y : ignored
            Accepted only for compatibility with the scikit-learn
            ``fit_transform(X, y)`` calling convention.

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        return self.fit(X, y).transform(X)

    def get_feature_names(self):
        """
        Get the names of the transformed features.

        Returns:
        --------
        list
            List of feature names after transformation: the numerical
            feature names followed by the one-hot encoded category names.

        Raises:
        -------
        ValueError
            If fit() has not been called.
        """
        self._require_fitted()

        # Numerical feature names stay the same after imputing and scaling.
        feature_names = list(self.numerical_features)

        # Categorical feature names are expanded by one-hot encoding. Look the
        # branch up by name rather than by position in transformers_.
        if self.categorical_features:
            encoder = self.preprocessor.named_transformers_['cat'].named_steps['onehot']
            feature_names.extend(
                encoder.get_feature_names_out(self.categorical_features))

        return feature_names

    def prepare_data(self, data, target_column='recovery_status'):
        """
        Prepare data for model training or prediction.

        The input frame is not modified; columns are dropped on copies.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to prepare
        target_column : str, optional
            The name of the target column, by default 'recovery_status'

        Returns:
        --------
        tuple or pandas.DataFrame
            (X, y) if target_column is in data, otherwise just X
        """
        # Drop customer_id as it's not a feature
        if 'customer_id' in data.columns:
            data = data.drop(columns='customer_id')

        # Prediction-time data has no target: return the features alone.
        if target_column not in data.columns:
            return data

        features = data.drop(columns=target_column)
        target = data[target_column]
        return features, target