Lesson 15 - Voting Classifiers

Additional Resources

  • Hands-On Machine Learning (Aurélien Géron), Ch 7 - Ensemble Learning and Random Forests
In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from ClassificationPlotter import plot_regions

Generate the Data

In [2]:
from sklearn.datasets import make_circles, make_classification
In [3]:
# Seed NumPy's global RNG. make_classification is called without a
# random_state below, so it draws from this global stream.
np.random.seed(4643)

# Four classes, one cluster each, in a purely two-dimensional feature space.
X, y = make_classification(
    n_samples=500,
    n_features=2,
    n_redundant=0,
    n_classes=4,
    n_clusters_per_class=1,
    class_sep=0.8,
)

# Scatter the two features against each other, coloured by class label.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='rainbow', edgecolor='k')
plt.show()
In [4]:
# Hold out half of the samples for validation; the split itself is seeded.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.5, random_state=1
)

# Report the shape of each feature matrix, training first.
for features in (X_train, X_val):
    print(features.shape)
(250, 2)
(250, 2)

Model 01 - Logistic Regression

In [5]:
# Sweep the inverse regularisation strength C over 100 log-spaced values
# (10**-3 .. 10**3) and record the accuracy on both splits for each one.
tr_acc = []
va_acc = []
exp_list = np.linspace(-3, 3, 100)

for k in exp_list:
    # `multi_class` is deliberately not passed: the parameter is deprecated
    # since scikit-learn 1.5 and emits a FutureWarning on every iteration.
    # With the lbfgs solver on this 4-class problem, 'auto' already resolved
    # to the multinomial formulation, which is what the estimator uses when
    # the parameter is left at its default (scikit-learn >= 0.22).
    temp_mod = LogisticRegression(solver='lbfgs', C=10**k)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))

# Accuracy as a function of log10(C) for both splits.
plt.figure(figsize=(6, 4))
plt.plot(exp_list, tr_acc, label='Training Accuracy')
plt.plot(exp_list, va_acc, label='Validation Accuracy')
plt.xlabel('log(C)')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
In [6]:
# Position of the highest validation accuracy (argmax keeps the first one
# on ties), mapped back to the corresponding exponent of C.
idx = int(np.argmax(va_acc))
best_logC = exp_list[idx]
print(best_logC)
0.5757575757575757
In [7]:
# Refit logistic regression at the selected regularisation strength.
# `multi_class` is deliberately not passed: it is deprecated since
# scikit-learn 1.5, and 'auto' with the lbfgs solver on more than two classes
# already meant the multinomial formulation that the estimator uses by
# default (scikit-learn >= 0.22).
mod_01 = LogisticRegression(solver='lbfgs', C=10**best_logC)
mod_01.fit(X_train, y_train)

print('Training Accuracy:  ', mod_01.score(X_train, y_train))
print('Validation Accuracy:', mod_01.score(X_val, y_val))
Training Accuracy:   0.792
Validation Accuracy: 0.756
In [8]:
# plot_regions is imported from the local ClassificationPlotter module;
# presumably it draws the fitted model's decision regions together with the
# supplied points -- confirm against that module.
plot_regions(mod_01, X_train, y_train)

Model 02 - K-Nearest Neighbors

In [9]:
# Sweep the neighbourhood size K from 1 to 39 and record the accuracy on
# both splits for each value.
K_list = range(1, 40)
tr_acc, va_acc = [], []

for n_neighbors in K_list:
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    tr_acc.append(knn.score(X_train, y_train))
    va_acc.append(knn.score(X_val, y_val))

# Accuracy as a function of K for both splits.
plt.figure(figsize=(6, 4))
for curve, curve_label in (
    (tr_acc, 'Training Accuracy'),
    (va_acc, 'Validation Accuracy'),
):
    plt.plot(K_list, curve, label=curve_label)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
In [10]:
# Position of the highest validation accuracy (argmax keeps the first one
# on ties), mapped back to the corresponding K.
idx = int(np.argmax(va_acc))
best_K = K_list[idx]
print(best_K)
1
In [11]:
# Refit K-nearest neighbours at the selected K and report both accuracies.
mod_02 = KNeighborsClassifier(n_neighbors=best_K)
mod_02.fit(X_train, y_train)

train_score = mod_02.score(X_train, y_train)
val_score = mod_02.score(X_val, y_val)
print('Training Accuracy:  ', train_score)
print('Validation Accuracy:', val_score)
Training Accuracy:   1.0
Validation Accuracy: 0.844
In [12]:
# plot_regions is imported from the local ClassificationPlotter module;
# presumably it draws the fitted model's decision regions together with the
# supplied points -- confirm against that module.
plot_regions(mod_02, X_train, y_train)

Model 03 - Decision Tree

In [13]:
# Sweep the depth limit of the tree from 1 to 9 and record the accuracy on
# both splits for each value.
tr_acc = []
va_acc = []
depth_list = range(1,10)

for d in depth_list:
    # A fixed random_state makes every fit deterministic on its own. Without
    # it the tree draws from NumPy's global RNG, so the outcome depends on
    # how many random draws earlier cells have consumed and can change when
    # this cell is simply re-run.
    temp_mod = DecisionTreeClassifier(max_depth=d, random_state=1)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))

# Accuracy as a function of the depth limit for both splits.
plt.figure(figsize=(6, 4))
plt.plot(depth_list, tr_acc, label='Training Accuracy')
plt.plot(depth_list, va_acc, label='Validation Accuracy')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
In [14]:
# Position of the highest validation accuracy (argmax keeps the first one
# on ties), mapped back to the corresponding depth limit.
idx = int(np.argmax(va_acc))
best_d = depth_list[idx]
print(best_d)
6
In [15]:
# Refit the decision tree at the selected depth limit.
# The fixed random_state makes the fit deterministic instead of depending on
# the current state of NumPy's global RNG; any later clone of this estimator
# inherits the same seed and therefore rebuilds the same tree on the same data.
mod_03 = DecisionTreeClassifier(max_depth=best_d, random_state=1)
mod_03.fit(X_train, y_train)

print('Training Accuracy:  ', mod_03.score(X_train, y_train))
print('Validation Accuracy:', mod_03.score(X_val, y_val))
Training Accuracy:   0.936
Validation Accuracy: 0.812
In [16]:
# plot_regions is imported from the local ClassificationPlotter module;
# presumably it draws the fitted model's decision regions together with the
# supplied points -- confirm against that module.
plot_regions(mod_03, X_train, y_train)

Voting Classifier

In [17]:
from sklearn.ensemble import VotingClassifier
In [18]:
# Combine the three tuned models into a single soft-voting ensemble.
base_models = [('lr', mod_01), ('knn', mod_02), ('tree', mod_03)]
voting_clf = VotingClassifier(estimators=base_models, voting='soft')
voting_clf.fit(X_train, y_train)

# Accuracy of the ensemble on both splits.
train_score = voting_clf.score(X_train, y_train)
val_score = voting_clf.score(X_val, y_val)
print('Training Accuracy:  ', train_score)
print('Validation Accuracy:', val_score)
Training Accuracy:   0.976
Validation Accuracy: 0.868
In [19]:
# plot_regions is imported from the local ClassificationPlotter module;
# presumably it draws the fitted ensemble's decision regions together with
# the supplied points -- confirm against that module.
plot_regions(voting_clf, X_train, y_train)
In [20]:
# Validation accuracy of each base model next to that of the ensemble.
for label, model in [
    ('Model 01 Validation Accuracy:', mod_01),
    ('Model 02 Validation Accuracy:', mod_02),
    ('Model 03 Validation Accuracy:', mod_03),
    ('Ensemble Validation Accuracy:', voting_clf),
]:
    print(label, model.score(X_val, y_val))
Model 01 Validation Accuracy: 0.756
Model 02 Validation Accuracy: 0.844
Model 03 Validation Accuracy: 0.812
Ensemble Validation Accuracy: 0.868