import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from ClassificationPlotter import plot_regions
np.random.seed(41)
X, y = make_classification(n_samples=500, n_classes=4, n_clusters_per_class=1,
                           n_features=2, n_redundant=0, class_sep=0.8)
plt.figure(figsize=[8,6])
plt.scatter(X[:,0], X[:,1], c=y, cmap='rainbow', edgecolor='k')
plt.show()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape)
print(X_val.shape)
tr_acc = []
va_acc = []
K_list = range(1,100)
for k in K_list:
    # Fit a KNN model for each value of K, recording training and
    # validation accuracy.
    temp_mod = KNeighborsClassifier(k)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))
plt.figure(figsize=[6,4])
plt.plot(K_list, tr_acc, label='Training Accuracy')
plt.plot(K_list, va_acc, label='Validation Accuracy')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
idx = np.argmax(va_acc)
best_K = K_list[idx]
print(best_K)
knn_mod = KNeighborsClassifier(best_K)
knn_mod.fit(X_train, y_train)
print('Training Accuracy: ', knn_mod.score(X_train, y_train))
print('Validation Accuracy:', knn_mod.score(X_val, y_val))
plot_regions(knn_mod, X_train, y_train)
tr_acc = []
va_acc = []
max_nodes_list = range(2,64)
np.random.seed(1)
for d in max_nodes_list:
    # Tune the number of leaf nodes (an alternative would be to tune max_depth).
    temp_mod = DecisionTreeClassifier(max_leaf_nodes=d)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))
plt.figure(figsize=[6,4])
plt.plot(max_nodes_list, tr_acc, label='Training Accuracy')
plt.plot(max_nodes_list, va_acc, label='Validation Accuracy')
plt.xlabel('Max Leaf Nodes')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
idx = np.argmax(va_acc)
best_max = max_nodes_list[idx]
print(best_max)
dt_mod = DecisionTreeClassifier(max_leaf_nodes=best_max)
dt_mod.fit(X_train, y_train)
print('Training Accuracy: ', dt_mod.score(X_train, y_train))
print('Validation Accuracy:', dt_mod.score(X_val, y_val))
plot_regions(dt_mod, X_train, y_train)
Bagging (short for bootstrap aggregating) and pasting are two ensemble methods in which several classifiers of the same model type and specification are trained on different subsets randomly sampled from the training data. When using bagging, the sampling is performed with replacement, whereas with pasting the sampling is performed without replacement.
In practice, bagging tends to perform somewhat better than pasting, and is generally preferred.
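As a quick toy illustration of the distinction (separate from the models below), drawing ten items from a set of ten with replacement typically produces duplicates, while drawing without replacement simply shuffles the set:
np.random.seed(1)
items = np.arange(10)
# With replacement (bagging-style): duplicates are likely, some items omitted.
print('With replacement:   ', np.random.choice(items, size=10, replace=True))
# Without replacement (pasting-style): every item appears exactly once.
print('Without replacement:', np.random.choice(items, size=10, replace=False))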
A bagging classifier can be created using the BaggingClassifier class from sklearn. Some parameters of interest for the bagging classifier are:
base_estimator - The type of base classifier being used in the model.
n_estimators - The number of base classifiers to use.
max_samples - The number of training samples to use when training each base classifier. If provided as a float, this parameter is interpreted as a proportion of the training set.
bootstrap - If True, then sampling is performed with replacement (bagging). If False, then sampling is performed without replacement (pasting).
oob_score - If True, then the performance of each instance of the base classifier is evaluated on the "out of bag" training samples that were not used to construct that model.
from sklearn.ensemble import BaggingClassifier
np.random.seed(1)
knn_bag_mod = BaggingClassifier(
    KNeighborsClassifier(best_K),
    n_estimators=500, max_samples=0.5,
    bootstrap=True, oob_score=True
)
knn_bag_mod.fit(X_train, y_train)
print('Out of bag score:', knn_bag_mod.oob_score_)
print('Training Accuracy: ', knn_bag_mod.score(X_train, y_train))
print('Validation Accuracy:', knn_bag_mod.score(X_val, y_val))
plot_regions(knn_bag_mod, X_train, y_train)
np.random.seed(1)
tree_bag_mod = BaggingClassifier(
    DecisionTreeClassifier(splitter='random', max_leaf_nodes=32),
    n_estimators=500, max_samples=0.5, bootstrap=True, oob_score=True
)
tree_bag_mod.fit(X_train, y_train)
print('Out of bag score:', tree_bag_mod.oob_score_)
print('Training Accuracy: ', tree_bag_mod.score(X_train, y_train))
print('Validation Accuracy:', tree_bag_mod.score(X_val, y_val))
plot_regions(tree_bag_mod, X_train, y_train)
Scikit-Learn provides a RandomForestClassifier class. The model created using this class is similar to what we would obtain using BaggingClassifier with DecisionTreeClassifier, but some of the parameters are effectively fixed: a random forest behaves roughly as if splitter='random' and max_samples=1.0.
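One way to informally sanity-check that rough equivalence is to rebuild the bagged-tree ensemble above with max_samples=1.0 and compare its validation accuracy against the random forest fit below. This is a sketch for comparison, not the library's actual internals:
np.random.seed(1)
# A hand-built "forest": bagged randomized trees on full-size bootstrap samples.
manual_forest = BaggingClassifier(
    DecisionTreeClassifier(splitter='random', max_leaf_nodes=32),
    n_estimators=500, max_samples=1.0, bootstrap=True
)
manual_forest.fit(X_train, y_train)
print('Manual forest validation accuracy:', manual_forest.score(X_val, y_val))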
from sklearn.ensemble import RandomForestClassifier
np.random.seed(1)
rf_mod = RandomForestClassifier(n_estimators=500, max_leaf_nodes=32,
                                bootstrap=True, oob_score=True)
rf_mod.fit(X_train, y_train)
print('Out of bag score:', rf_mod.oob_score_)
print('Training Accuracy: ', rf_mod.score(X_train, y_train))
print('Validation Accuracy:', rf_mod.score(X_val, y_val))
plot_regions(rf_mod, X_train, y_train)
in_bag_prop = []
sizes = np.arange(start=10, stop=500, step=10)
for sz in sizes:
    my_set = range(sz)
    total_of_props = 0
    for i in range(1000):
        # Draw a bootstrap sample of size sz and record the proportion of
        # distinct ("in-bag") elements it contains.
        sample = np.random.choice(my_set, size=sz, replace=True)
        n_unique = len(np.unique(sample))
        total_of_props += n_unique / sz
    # Average the in-bag proportion over the 1000 simulated samples.
    avg_prop = total_of_props / 1000
    in_bag_prop.append(avg_prop)
plt.figure(figsize=[8,6])
plt.plot(sizes, in_bag_prop)
plt.xlabel('Training Set Size')
plt.ylabel('In-Bag Proportion')
plt.show()
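The curve levels off near a well-known limit: a bootstrap sample of size n drawn from n items contains, on average, a fraction 1 - (1 - 1/n)^n of the distinct items, and this quantity approaches 1 - 1/e, or about 0.632, as n grows. A quick numerical check against the simulation:
# Compare the theoretical limit with the simulated value for the largest size.
print('Theoretical limit:', 1 - np.exp(-1))
print('Simulated (n=490):', in_bag_prop[-1])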
iris = pd.read_csv(filepath_or_buffer='Data/iris_mod.txt', sep='\t')
iris.head(n=10)
import seaborn as sns
g = sns.pairplot(iris, hue="species")
plt.show()
X_iris = iris.iloc[:,:4]
y_iris = iris.iloc[:,4]
X_iris_train, X_iris_val, y_iris_train, y_iris_val = train_test_split(
    X_iris, y_iris, test_size=0.2, random_state=1)
iris_forest = RandomForestClassifier(n_estimators=500, max_leaf_nodes=32,
                                     bootstrap=True, oob_score=True)
iris_forest.fit(X_iris_train, y_iris_train)
print('Out of bag score:', iris_forest.oob_score_)
print('Training Accuracy: ', iris_forest.score(X_iris_train, y_iris_train))
print('Validation Accuracy:', iris_forest.score(X_iris_val, y_iris_val))
print(iris_forest.feature_importances_)
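The raw importance array is easier to read when paired with the feature names. A small sketch, assuming X_iris keeps its original column order:
# Label each importance with its feature name and sort, largest first.
importances = pd.Series(iris_forest.feature_importances_, index=X_iris.columns)
print(importances.sort_values(ascending=False))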