### TU257 - AutoML Demo

#### Run this notebook and examine what is produced for each cell.

#### Add your own comments to enrich the information given to make it more meaningful to you.

#### In this notebook we will have a look at two different AutoML libraries.  See the notes/website for links to posts illustrating other AutoML libraries

In [1]:
import pandas as pd

#Load in the dataset
colnames=['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target'] 
df = pd.read_csv('/Users/brendan.tierney/Dropbox/4-Datasets/adult.csv', names=colnames, header=None)
df.head(10)

Unnamed: 0,Age,WorkClass,Fnlwgt,Education,Edu_Num,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HrPerWk,Native,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [2]:
df.isnull().values.any()

False

In [3]:
print ("Rows     : " ,df.shape[0])
print ("Columns  : " ,df.shape[1])
print ("\nFeatures : \n" ,df.columns.tolist())
print ("\nMissing values :  ", df.isnull().sum().values.sum())
print ("\nUnique values :  \n",df.nunique())

Rows     :  32561
Columns  :  15

Features : 
 ['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target']

Missing values :   0

Unique values :  
 Age                 73
WorkClass            9
Fnlwgt           21648
Education           16
Edu_Num             16
MaritalStatus        7
Occupation          15
Relationship         6
Race                 5
Sex                  2
CapitalGain        119
CapitalLoss         92
HrPerWk             94
Native              42
Target               2
dtype: int64


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            32561 non-null  int64 
 1   WorkClass      32561 non-null  object
 2   Fnlwgt         32561 non-null  int64 
 3   Education      32561 non-null  object
 4   Edu_Num        32561 non-null  int64 
 5   MaritalStatus  32561 non-null  object
 6   Occupation     32561 non-null  object
 7   Relationship   32561 non-null  object
 8   Race           32561 non-null  object
 9   Sex            32561 non-null  object
 10  CapitalGain    32561 non-null  int64 
 11  CapitalLoss    32561 non-null  int64 
 12  HrPerWk        32561 non-null  int64 
 13  Native         32561 non-null  object
 14  Target         32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
# Numerical feature of summary/description 
df.describe()

Unnamed: 0,Age,Fnlwgt,Edu_Num,CapitalGain,CapitalLoss,HrPerWk
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
# checking "?" values, how many are there in the whole dataset
df_missing = (df=='?').sum()
df_missing

Age              0
WorkClass        0
Fnlwgt           0
Education        0
Edu_Num          0
MaritalStatus    0
Occupation       0
Relationship     0
Race             0
Sex              0
CapitalGain      0
CapitalLoss      0
HrPerWk          0
Native           0
Target           0
dtype: int64

In [7]:
df['Target'].value_counts()/len(df)   #calculate percentages

 <=50K    0.75919
 >50K     0.24081
Name: Target, dtype: float64

In [8]:
from sklearn import preprocessing

# encode categorical variables using label Encoder

# select all categorical variables
df_categorical = df.select_dtypes(include=['object'])
df_categorical.head()

Unnamed: 0,WorkClass,Education,MaritalStatus,Occupation,Relationship,Race,Sex,Native,Target
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [9]:
# apply label encoder to df_categorical
le = preprocessing.LabelEncoder()
df_categorical = df_categorical.apply(le.fit_transform)
df_categorical.head()

Unnamed: 0,WorkClass,Education,MaritalStatus,Occupation,Relationship,Race,Sex,Native,Target
0,7,9,4,1,1,4,1,39,0
1,6,9,2,4,0,4,1,39,0
2,4,11,0,6,1,4,1,39,0
3,4,1,2,6,0,2,1,39,0
4,4,9,2,10,5,2,0,5,0


In [10]:
# Next, Concatenate df_categorical dataframe with original df (dataframe)

# first, Drop earlier duplicate columns which had categorical values
df = df.drop(df_categorical.columns,axis=1)
df = pd.concat([df,df_categorical],axis=1)
df.head()

Unnamed: 0,Age,Fnlwgt,Edu_Num,CapitalGain,CapitalLoss,HrPerWk,WorkClass,Education,MaritalStatus,Occupation,Relationship,Race,Sex,Native,Target
0,39,77516,13,2174,0,40,7,9,4,1,1,4,1,39,0
1,50,83311,13,0,0,13,6,9,2,4,0,4,1,39,0
2,38,215646,9,0,0,40,4,11,0,6,1,4,1,39,0
3,53,234721,7,0,0,40,4,1,2,6,0,2,1,39,0
4,28,338409,13,0,0,40,4,9,2,10,5,2,0,5,0


In [11]:
corr_matrix=df.corr()
corr_matrix

Unnamed: 0,Age,Fnlwgt,Edu_Num,CapitalGain,CapitalLoss,HrPerWk,WorkClass,Education,MaritalStatus,Occupation,Relationship,Race,Sex,Native,Target
Age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756,0.003787,-0.010508,-0.266288,-0.020947,-0.263698,0.028718,0.088832,-0.001151,0.234037
Fnlwgt,-0.076646,1.0,-0.043195,0.000432,-0.010252,-0.018768,-0.016656,-0.028145,0.028153,0.001597,0.008931,-0.021291,0.026858,-0.051966,-0.009463
Edu_Num,0.036527,-0.043195,1.0,0.12263,0.079923,0.148123,0.052085,0.359153,-0.069304,0.109697,-0.094153,0.031838,0.01228,0.05084,0.335154
CapitalGain,0.077674,0.000432,0.12263,1.0,-0.031615,0.078409,0.033835,0.030046,-0.043393,0.025505,-0.057919,0.011145,0.04848,-0.001982,0.223329
CapitalLoss,0.057775,-0.010252,0.079923,-0.031615,1.0,0.054256,0.012216,0.016746,-0.034187,0.017987,-0.061062,0.018899,0.045567,0.000419,0.150526
HrPerWk,0.068756,-0.018768,0.148123,0.078409,0.054256,1.0,0.138962,0.05551,-0.190519,0.080383,-0.248974,0.04191,0.229309,-0.002671,0.229689
WorkClass,0.003787,-0.016656,0.052085,0.033835,0.012216,0.138962,1.0,0.023513,-0.064731,0.254892,-0.090461,0.049742,0.095981,-0.00769,0.051604
Education,-0.010508,-0.028145,0.359153,0.030046,0.016746,0.05551,0.023513,1.0,-0.038407,-0.02126,-0.010876,0.014131,-0.027356,0.064288,0.079317
MaritalStatus,-0.266288,0.028153,-0.069304,-0.043393,-0.034187,-0.190519,-0.064731,-0.038407,1.0,-0.009654,0.185451,-0.068013,-0.129314,-0.023819,-0.199307
Occupation,-0.020947,0.001597,0.109697,0.025505,0.017987,0.080383,0.254892,-0.02126,-0.009654,1.0,-0.075607,0.006763,0.080296,-0.012543,0.075468


In [12]:
import seaborn as sn
import matplotlib.pyplot as plt

fig = plt.subplots(figsize=(17,14))
sn.heatmap(corr_matrix, annot=True)

<AxesSubplot:>

In [13]:
df['Target'].value_counts()

0    24720
1     7841
Name: Target, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

# Putting independent variables/features to X
X = df.drop('Target',axis=1)

# Putting response/dependent variable/feature to y
y = df['Target']

In [15]:
# Splitting the data into train and test
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=99)

In [16]:
y_train.dtypes
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))
print(len(X_train)/(len(X_train)+len(X_test)))
print(len(X_test)/(len(X_train)+len(X_test)))

22792
22792
9769
9769
0.6999785018887626
0.30002149811123735


### Now use AutoML
#### example using tpot library

In [22]:
import tpot



In [23]:
from tpot import TPOTClassifier
from tpot import TPOTRegressor

tpot = TPOTClassifier(generations=5, population_size=5, verbosity=2)

tpot.fit(X_train, y_train)


Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8726748059978281

Generation 2 - Current best internal CV score: 0.8726748059978281

Generation 3 - Current best internal CV score: 0.8726748059978281

Generation 4 - Current best internal CV score: 0.8726748059978281

Generation 5 - Current best internal CV score: 0.8726748059978281

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)


TPOTClassifier(generations=5, population_size=5, verbosity=2)

In [24]:
#Change Verboity = 3
from tpot import TPOTClassifier
from tpot import TPOTRegressor

tpot = TPOTClassifier(generations=5, population_size=5, verbosity=3)

tpot.fit(X_train, y_train)

32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]

Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.

Generation 1 - Current Pareto front scores:

-1	0.846393379147429	ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.7500000000000001, ExtraTreesClassifier__min_samples_leaf=20, ExtraTreesClassifier__min_samples_split=18, ExtraTreesClassifier__n_estimators=100)

Generation 2 - Current Pareto front scores:

-1	0.8489381528958578	ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.7500000000000001, ExtraTreesClassifier__min_samples_leaf=15, ExtraTreesClassifier__min_samples_split=18, ExtraTreesClassifier__n_estimators=100)
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required.

TPOTClassifier(generations=5, population_size=5, verbosity=3)

In [25]:
#Change Verboity = 4
from tpot import TPOTClassifier
from tpot import TPOTRegressor

tpot = TPOTClassifier(generations=5, population_size=5, verbosity=3, n_jobs=4) #, max_time_mins=3)

tpot.fit(X_train, y_train)

32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]


Generation 1 - Current Pareto front scores:

-1	0.8431029144382738	DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=4, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)
_pre_test decorator: _random_mutation_operator: num_test=0 cosine was provided as affinity. Ward can only work with euclidean distances..
Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.

Generation 2 - Current Pareto front scores:

-1	0.8431029144382738	DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=4, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)
Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.

Generation 3 - Current Pareto front score

TPOTClassifier(generations=5, n_jobs=4, population_size=5, verbosity=3)

In [26]:
#Change Verboity = 4  & scoring = accuracy (which is default value)
from tpot import TPOTClassifier
from tpot import TPOTRegressor

tpot = TPOTClassifier(generations=5, population_size=5, verbosity=3, n_jobs=4, scoring='accuracy')

tpot.fit(X_train, y_train)

32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]


Generation 1 - Current Pareto front scores:

-1	0.8579326621149012	RandomForestClassifier(input_matrix, RandomForestClassifier__bootstrap=True, RandomForestClassifier__criterion=entropy, RandomForestClassifier__max_features=0.9500000000000001, RandomForestClassifier__min_samples_leaf=4, RandomForestClassifier__min_samples_split=4, RandomForestClassifier__n_estimators=100)

-2	0.8691648409459862	XGBClassifier(MaxAbsScaler(input_matrix), XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)

Generation 2 - Current Pareto front scores:

-1	0.8690771312808586	XGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)

-2	0.8691648409459862	XGBClassifie

TPOTClassifier(generations=5, n_jobs=4, population_size=5, scoring='accuracy',
               verbosity=3)

In [None]:
#Add the parameter  random_state  and give it a values
#Copy the code and add this parameter.
#Q: Are the results different to what is shown above?
#Q: Rerun this code and see if the outputs change?  If they remain the same, why?

In [27]:
tpot.score(X_test, y_test)

0.8670283550005118

In [28]:
#export the model
tpot.export('/Users/brendan.tierney/Dropbox/4-Datasets/tpot_Adult_pipeline.py')

In [29]:
tpot.fitted_pipeline_

Pipeline(steps=[('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='', learning_rate=0.1,
                               max_delta_step=0, max_depth=9,
                               min_child_weight=11, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=1, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1.0, tree_method='exact',
                               validate_parameters=1, verbosity=0))])