import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from ClassificationPlotter import plot_regions
import ipywidgets as widgets
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.datasets import make_circles, make_blobs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
Decision tree algorithms apply a divide-and-conquer strategy to split the feature space into small rectangular regions. Each region is then assigned a single label value, which is used as the prediction for any point that falls in that region.
Decision trees can be used for either classification or regression tasks. In this lecture, we will focus on using decision trees for classification.
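The trees below measure the quality of a candidate split with Gini impurity. As a refresher, here is a minimal sketch of that computation; the helper function gini is our own illustration, not part of scikit-learn.
def gini(labels):
    # Gini impurity: 1 minus the sum of squared class proportions.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1 - np.sum(p**2)
print(gini(np.array([0, 0, 0, 0])))  # pure region -> 0.0
print(gini(np.array([0, 0, 1, 1])))  # 50/50 region -> 0.5
A good split is one that produces child regions with lower (weighted average) impurity than the parent region.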
X1 = np.array([[1,1],[1,2],[1,3],[2,1],[2,2],[2,3],[3,1],[3,2],[3,3],[4,1],[4,2],[4,3]])
y1 = np.array([0,0,0, 0,0,2, 1,2,2, 2,1,1])
tree_mod_01 = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=2)
tree_mod_01.fit(X1,y1)
plot_regions(tree_mod_01, X1, y1, fig_size=[8,6])
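Once the tree is fit, prediction amounts to looking up the region a new point falls into. As a quick check, we can classify a couple of points; the coordinates below are arbitrary examples.
print(tree_mod_01.predict(np.array([[1.5, 1.5], [3.5, 2.5]])))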
One advantage of decision trees is that they generate easy-to-understand, rule-based heuristics for making predictions. The rules learned by a decision tree are often represented as a flowchart, such as the one below.
#export_graphviz(tree_mod_01, out_file='images/tree_01.dot', filled=True, rounded=True)
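The commented-out line above writes a Graphviz dot file for rendering the tree as a flowchart. If Graphviz is not available, scikit-learn can also print the same rules as indented text via export_text; the feature names 'x1' and 'x2' below are placeholders we supply for readability.
from sklearn.tree import export_text
print(export_text(tree_mod_01, feature_names=['x1', 'x2']))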
X2, y2 = make_circles(n_samples=100, noise=0.25, random_state=9662, factor=0.4)
plt.close()
plt.rcParams["figure.figsize"] = [8,6]
plt.scatter(X2[y2==0,0],X2[y2==0,1],c='b', edgecolor='k')
plt.scatter(X2[y2==1,0],X2[y2==1,1],c='r', edgecolor='k')
plt.show()
def tree_example_2(max_depth):
    # Fit a depth-limited tree to the circles data and plot its decision regions.
    tree_mod_02 = DecisionTreeClassifier(criterion='gini', max_depth=max_depth, random_state=1)
    tree_mod_02.fit(X2, y2)
    plot_regions(tree_mod_02, X2, y2)
    print('Training Accuracy:', tree_mod_02.score(X2, y2))
_ = widgets.interact(tree_example_2,
                     max_depth=widgets.IntSlider(min=1, max=15, step=1, value=1, continuous_update=False))
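Increasing max_depth drives the training accuracy reported above toward 1, but that score is computed on the same points the tree was fit to. As a sanity check against overfitting, we can also score a deep tree on a held-out split; the 70/30 split below is an arbitrary choice for illustration.
X2_tr, X2_te, y2_tr, y2_te = train_test_split(X2, y2, test_size=0.3, random_state=1)
deep_tree = DecisionTreeClassifier(criterion='gini', max_depth=15, random_state=1)
deep_tree.fit(X2_tr, y2_tr)
print('Training accuracy:', deep_tree.score(X2_tr, y2_tr))
print('Holdout accuracy: ', deep_tree.score(X2_te, y2_te))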
X3, y3 = make_blobs(n_samples=300, centers=4, random_state=2997, n_features=2, cluster_std=2)
plt.close()
plt.scatter(X3[y3==0,0],X3[y3==0,1],c='purple', edgecolor='k')
plt.scatter(X3[y3==1,0],X3[y3==1,1],c='blue', edgecolor='k')
plt.scatter(X3[y3==2,0],X3[y3==2,1],c='yellow', edgecolor='k')
plt.scatter(X3[y3==3,0],X3[y3==3,1],c='red', edgecolor='k')
plt.show()
def tree_example_3(max_depth):
    # Fit a depth-limited tree to the blobs data and plot its decision regions.
    tree_mod_03 = DecisionTreeClassifier(criterion='gini', max_depth=max_depth, random_state=1)
    tree_mod_03.fit(X3, y3)
    plot_regions(tree_mod_03, X3, y3)
_ = widgets.interact(tree_example_3,
                     max_depth=widgets.IntSlider(min=1, max=15, step=1, value=1, continuous_update=False))
iris = pd.read_table(filepath_or_buffer='Data/iris_mod.txt', sep='\t')
iris.head(n=10)
import seaborn as sns
plt.close()
g = sns.pairplot(iris, hue="species")
plt.show()
X_iris = iris.iloc[:,:4]
y_iris = iris.iloc[:,4]
X_train, X_holdout, y_train, y_holdout = train_test_split(X_iris, y_iris, test_size=0.2, random_state=1)
# Split the 20% holdout evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=1)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)
tr_acc = []
va_acc = []
rng = range(1,11)
for d in rng:
    temp_mod = DecisionTreeClassifier(max_depth=d, criterion='gini', random_state=1)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))
plt.figure(figsize=(9, 6))
plt.plot(rng, tr_acc, label='Training Accuracy')
plt.plot(rng, va_acc, label='Validation Accuracy')
plt.xlabel('Maximum Depth')
plt.ylabel('Accuracy')
plt.xticks(rng)
plt.legend()
plt.show()
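The depth used below is read off the validation curve by eye. One way to pick it programmatically is to take the first depth that maximizes validation accuracy (np.argmax returns the first occurrence of the maximum); a sketch:
best_d = rng[int(np.argmax(va_acc))]
print('Depth with highest validation accuracy:', best_d)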
iris_tree = DecisionTreeClassifier(max_depth=7, criterion='gini', random_state=1)
iris_tree.fit(X_train, y_train)
print(iris_tree.score(X_test, y_test))
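Beyond a single test accuracy, a fitted tree exposes feature_importances_, which summarizes how much each feature contributed to the splits; a quick sketch of inspecting it:
for name, imp in zip(X_iris.columns, iris_tree.feature_importances_):
    print(name, round(imp, 3))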