Lesson 23 - Grid Search

The following topis are discussed in this notebook:

  • Using GridSearchCV for hyperparameter selection.

Additional Resources

  • Hands-On Machine Learning, pages 72 - 74
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
In [2]:
from sklearn.model_selection import GridSearchCV

Generate Data

In [3]:
np.random.seed(1)
X, y = make_classification(n_samples=400, n_features=6, n_informative=6, n_redundant=0, n_classes=7, class_sep=2)

np.set_printoptions(suppress=True, precision=2)
print('Distribution of Features:')
print('Min: ', np.min(X, axis=0))
print('Max: ', np.max(X, axis=0))
print('Mean:', np.mean(X, axis=0))
print('SDev:', np.std(X, axis=0))
np.set_printoptions(suppress=True, precision=4)
Distribution of Features:
Min:  [-5.84 -6.25 -6.16 -5.73 -4.93 -5.76]
Max:  [6.33 5.66 5.61 7.56 4.95 7.44]
Mean: [ 0.36 -0.27  0.54  0.19  0.53 -0.6 ]
SDev: [2.51 2.32 2.39 2.51 2.32 2.4 ]

Grid Search with Logistic Regression

In [4]:
param_grid = [
    {'C': 10**np.linspace(-3,3,10)}
]

lin_reg = LogisticRegression(solver='lbfgs', multi_class='ovr')

gscv_01 = GridSearchCV(lin_reg, param_grid, cv=5, scoring='accuracy', 
                              refit=True, iid=False)
gscv_01.fit(X, y)


res_01 = gscv_01.cv_results_

Exploring Grid Search Results

In [5]:
print(res_01.keys())
dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])
In [6]:
print(res_01['mean_test_score'])
[0.5752 0.6175 0.6305 0.6302 0.6277 0.6276 0.6276 0.6302 0.6302 0.6302]
In [7]:
#for i in range(0,10):
#    print(cv_res_01['mean_test_score'][i], cv_res_01['params'][i])

for score, params in zip(res_01['mean_test_score'], res_01['params']):
    print(score, params)
0.5751558136740329 {'C': 0.001}
0.6175366911959623 {'C': 0.004641588833612777}
0.630453977162029 {'C': 0.021544346900318832}
0.6302052375090871 {'C': 0.1}
0.6276744349782846 {'C': 0.46415888336127775}
0.627610332414182 {'C': 2.154434690031882}
0.627610332414182 {'C': 10.0}
0.6301744349782845 {'C': 46.41588833612773}
0.6301744349782845 {'C': 215.44346900318823}
0.6301744349782845 {'C': 1000.0}
In [8]:
print(gscv_01.best_score_)
print(gscv_01.best_params_)
0.630453977162029
{'C': 0.021544346900318832}

Obtaining Best Model

In [9]:
lin_reg = gscv_01.best_estimator_
print('Training Score:', lin_reg.score(X, y))
Training Score: 0.655

Grid Search with K-Nearest Neighbors

In [10]:
param_grid = [
    {'n_neighbors': range(1,20), 'p': [1,2]}
]

knn = KNeighborsClassifier()

gscv_02 = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', 
                              refit=True, iid=False)

gscv_02.fit(X, y)

res_02 = gscv_02.cv_results_
In [11]:
print(gscv_02.best_score_)
print(gscv_02.best_params_)
0.9059215779323038
{'n_neighbors': 5, 'p': 2}
In [12]:
knn = gscv_02.best_estimator_
print('Training Score:', knn.score(X, y))
Training Score: 0.9325

Grid Search with SVC

In [13]:
param_grid = [
    {'kernel':['poly'], 'degree': [1,2,3], 'C':10**np.linspace(-3,3,10), 'gamma':['auto']},
    {'kernel':['rbf'], 'C':10**np.linspace(-3,3,10), 'gamma':10**np.linspace(-3,3,10)}
]

svm = SVC()

gscv_03 = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', 
                       refit=True, iid=False)

gscv_03.fit(X, y)

res_03 = gscv_03.cv_results_
In [14]:
print(gscv_03.best_score_)
print(gscv_03.best_params_)
0.9056171935616544
{'C': 10.0, 'gamma': 0.021544346900318832, 'kernel': 'rbf'}
In [15]:
svm = gscv_03.best_estimator_
print('Training Score:', svm.score(X, y))
Training Score: 0.9625

Grid Search with Random Forests

In [16]:
param_grid = [
    {'n_estimators':np.arange(100,500,100), 'max_depth':range(2,6), 'bootstrap':['True','False']}
]

forest = RandomForestClassifier()

gscv_04 = GridSearchCV(forest, param_grid, cv=5, scoring='accuracy', 
                       refit=True, iid=False)

gscv_04.fit(X, y)

res_04 = gscv_04.cv_results_
In [17]:
print(gscv_04.best_score_)
print(gscv_04.best_params_)
0.8407154861496619
{'bootstrap': 'False', 'max_depth': 5, 'n_estimators': 300}
In [18]:
forest = gscv_04.best_estimator_
print('Training Score:', forest.score(X, y))
Training Score: 0.9425
In [ ]:
 
In [ ]: