Lesson 17 - Bagging and Random Forests

Additional Resources

  • Hands-On Machine Learning, Ch 7
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from ClassificationPlotter import plot_regions

Generate the Data

In [2]:
np.random.seed(41)
X, y = make_classification(n_samples=500, n_classes=4, n_clusters_per_class=1, 
                           n_features=2, n_redundant=0, class_sep=0.8)

plt.figure(figsize=[8,6])
plt.scatter(X[:,0], X[:,1], c=y, cmap='rainbow', edgecolor='k')
plt.show()
In [3]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape)
print(X_val.shape)
(400, 2)
(100, 2)

Model 1: K-Nearest Neighbors

In [4]:
tr_acc = []
va_acc = []
K_list = range(1,100)

for k in K_list:
    temp_mod = KNeighborsClassifier(k)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))
    
plt.figure(figsize=[6,4])
plt.plot(K_list, tr_acc, label='Training Accuracy')
plt.plot(K_list, va_acc, label='Validation Accuracy')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

idx = np.argmax(va_acc)
best_K = K_list[idx]
print(best_K)
1
In [5]:
knn_mod = KNeighborsClassifier(best_K)
knn_mod.fit(X_train, y_train)

print('Training Accuracy:  ', knn_mod.score(X_train, y_train))
print('Validation Accuracy:', knn_mod.score(X_val, y_val))
Training Accuracy:   1.0
Validation Accuracy: 0.75
In [6]:
plot_regions(knn_mod, X_train, y_train)

Model 2: Decision Tree Model

In [7]:
tr_acc = []
va_acc = []
max_nodes_list = range(2,64)

np.random.seed(1)
for d in max_nodes_list:
    #temp_mod = DecisionTreeClassifier(max_depth=d)
    temp_mod = DecisionTreeClassifier(max_leaf_nodes=d)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))
    
plt.figure(figsize=[6,4])
plt.plot(max_nodes_list, tr_acc, label='Training Accuracy')
plt.plot(max_nodes_list, va_acc, label='Validation Accuracy')
plt.xlabel('Max Leaf Nodes')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

idx = np.argmax(va_acc)
best_max = max_nodes_list[idx]
print(best_max)
5
In [8]:
dt_mod = DecisionTreeClassifier(max_leaf_nodes=best_max)
dt_mod.fit(X_train, y_train)

print('Training Accuracy:  ', dt_mod.score(X_train, y_train))
print('Validation Accuracy:', dt_mod.score(X_val, y_val))
Training Accuracy:   0.7575
Validation Accuracy: 0.74
In [9]:
plot_regions(dt_mod, X_train, y_train)

Bagging and Pasting

Bagging (short for bootstrap aggregating) and pasting are two ensemble methods in which several classifiers of the same type, with the same hyperparameters, are trained on different subsets randomly sampled from the training data. With bagging, the sampling is performed with replacement; with pasting, the sampling is performed without replacement.

In practice, bagging tends to perform somewhat better than pasting, and is generally preferred.
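
To make the sampling difference concrete, here is a small illustrative sketch (using hypothetical indices, not part of the lesson's data) that draws a bagging-style sample with replacement and a pasting-style sample without replacement:

import numpy as np

np.random.seed(0)
indices = np.arange(10)   # indices of a small hypothetical training set

# Bagging: draw WITH replacement -- some indices repeat, others never appear
bag_sample = np.random.choice(indices, size=10, replace=True)

# Pasting: draw WITHOUT replacement -- every sampled index is distinct
paste_sample = np.random.choice(indices, size=8, replace=False)

print('Bagging sample:', np.sort(bag_sample))
print('Pasting sample:', np.sort(paste_sample))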

A bagging classifier can be created by using the BaggingClassifier class from sklearn. Some parameters of interest for the bagging classifier are:

  • base_estimator - The type of base classifier to use in the ensemble (this parameter is named estimator in newer versions of Scikit-Learn).
  • n_estimators - The number of base classifiers to use.
  • max_samples - This is the number of training samples to use when training the base classifiers. If provided as a float, this parameter will be interpreted as a proportion.
  • bootstrap - If True, then sampling is performed with replacement (Bagging). If False, then sampling is performed without replacement (Pasting).
  • oob_score - If True, the performance of each base classifier will be evaluated on the "out-of-bag" training samples that were not used to construct that model.

Model 3: KNN Bagging Classifier

In [10]:
from sklearn.ensemble import BaggingClassifier
In [11]:
np.random.seed(1)
knn_bag_mod = BaggingClassifier(
    KNeighborsClassifier(best_K),
    n_estimators=500, max_samples=0.5, 
    bootstrap=True, oob_score=True
)

knn_bag_mod.fit(X_train, y_train)

print('Out of bag score:', knn_bag_mod.oob_score_)

print('Training Accuracy:  ', knn_bag_mod.score(X_train, y_train))
print('Validation Accuracy:', knn_bag_mod.score(X_val, y_val))
Out of bag score: 0.735
Training Accuracy:   0.89
Validation Accuracy: 0.73
In [12]:
plot_regions(knn_bag_mod, X_train, y_train)

Model 4: Decision Tree Bagging Classifier (Random Forest)

In [13]:
np.random.seed(1)
tree_bag_mod = BaggingClassifier(
    DecisionTreeClassifier(splitter='random', max_leaf_nodes=32),
    n_estimators=500, max_samples=0.5, bootstrap=True, oob_score = True
)

tree_bag_mod.fit(X_train, y_train)

print('Out of bag score:', tree_bag_mod.oob_score_)

print('Training Accuracy:  ', tree_bag_mod.score(X_train, y_train))
print('Validation Accuracy:', tree_bag_mod.score(X_val, y_val))
Out of bag score: 0.76
Training Accuracy:   0.8375
Validation Accuracy: 0.77
In [14]:
plot_regions(tree_bag_mod, X_train, y_train)

Model 5: Random Forest Classifier

Scikit-Learn provides a RandomForestClassifier class. The model created using this class is similar to what we would obtain using BaggingClassifier with DecisionTreeClassifier, but some of the settings are fixed. For example, the base estimator is always a decision tree, each split considers only a random subset of the features (controlled by max_features), and by default each bootstrap sample is the same size as the training set.
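
As a rough sketch of this correspondence (an approximation, not the exact internals of RandomForestClassifier), we could hand BaggingClassifier a decision tree that also restricts each split to a random subset of the features via max_features; the particular settings below are illustrative:

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

np.random.seed(1)
rf_like_mod = BaggingClassifier(
    DecisionTreeClassifier(max_features='sqrt', max_leaf_nodes=32),  # random feature subset at each split
    n_estimators=500, max_samples=1.0, bootstrap=True, oob_score=True
)
rf_like_mod.fit(X_train, y_train)

print('Out of bag score:', rf_like_mod.oob_score_)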

In [15]:
from sklearn.ensemble import RandomForestClassifier
In [16]:
np.random.seed(1)
rf_mod = RandomForestClassifier(n_estimators=500, max_leaf_nodes=32, 
                                bootstrap=True, oob_score=True)
rf_mod.fit(X_train, y_train)

print('Out of bag score:', rf_mod.oob_score_)

print('Training Accuracy:  ', rf_mod.score(X_train, y_train))
print('Validation Accuracy:', rf_mod.score(X_val, y_val))
Out of bag score: 0.7325
Training Accuracy:   0.895
Validation Accuracy: 0.74
In [17]:
plot_regions(rf_mod, X_train, y_train)

Expected Proportion of In-Bag Samples

In [18]:
in_bag_prop = []

sizes = np.arange(start=10, stop=500, step=10)

for sz in sizes:
    
    my_set = range(sz)
    
    total_of_props = 0
    for i in range(1000):
        sample = np.random.choice(my_set, size=sz, replace=True)
        n_unique = len(np.unique(sample))
        total_of_props += n_unique / sz
    
    avg_prop = total_of_props / 1000
    in_bag_prop.append(avg_prop)
        
plt.figure(figsize=[8,6])
plt.plot(sizes, in_bag_prop)
plt.xlabel('Training Set Size')
plt.ylabel('In-Bag Proportion')
plt.show()
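
The simulated curve levels off near a well-known limit: when drawing n samples with replacement from a set of n items, the expected proportion of distinct (in-bag) items is 1 - (1 - 1/n)^n, which approaches 1 - 1/e ≈ 0.632 as n grows. A quick check of the formula:

for n in [10, 100, 500]:
    expected_prop = 1 - (1 - 1/n)**n    # expected fraction of distinct in-bag samples
    print(n, round(expected_prop, 4))

print('Limiting value:', 1 - 1/np.e)    # approximately 0.632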

Feature Importance

In [19]:
iris = pd.read_csv(filepath_or_buffer='Data/iris_mod.txt', sep='\t')
iris.head(n=10)
Out[19]:
sepal_length sepal_width petal_length petal_width species
0 6.3 3.2 5.0 2.0 virginica
1 5.3 3.8 1.9 0.4 setosa
2 7.5 2.9 5.8 1.5 virginica
3 6.5 3.0 4.8 1.6 versicolor
4 6.8 3.1 4.9 1.5 versicolor
5 6.1 2.3 4.4 1.3 versicolor
6 4.9 3.5 1.6 0.4 setosa
7 6.3 3.1 5.7 1.7 virginica
8 4.9 3.5 1.5 0.2 setosa
9 5.5 3.9 1.3 0.4 setosa
In [20]:
import seaborn as sns

g = sns.pairplot(iris, hue="species")
plt.show()
In [21]:
X_iris = iris.iloc[:,:4]
y_iris = iris.iloc[:,4]
In [22]:
X_iris_train, X_iris_val, y_iris_train, y_iris_val = train_test_split(
    X_iris, y_iris, test_size=0.2, random_state=1)
In [23]:
iris_forest = RandomForestClassifier(n_estimators=500, max_leaf_nodes=32, 
                                bootstrap=True, oob_score=True)

iris_forest.fit(X_iris_train, y_iris_train)

print('Out of bag score:', iris_forest.oob_score_)

print('Training Accuracy:  ', iris_forest.score(X_iris_train, y_iris_train))
print('Validation Accuracy:', iris_forest.score(X_iris_val, y_iris_val))
Out of bag score: 0.98125
Training Accuracy:   1.0
Validation Accuracy: 1.0
In [24]:
print(iris_forest.feature_importances_)
[0.08091799 0.0261854  0.42763723 0.46525938]
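
The importances are listed in the same order as the columns of X_iris, so the two petal measurements account for most of the importance. A small convenience sketch (assuming the column order shown above) pairs each importance with its feature name:

importances = pd.Series(iris_forest.feature_importances_, index=X_iris.columns)
print(importances.sort_values(ascending=False))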