Lesson 14 - Decision Tree Classifier for Titanic Dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

Load the Data

In [2]:
# Read the tab-delimited passenger file into a DataFrame and preview the
# first ten rows.
DATA_PATH = 'data/titanic.txt'
df = pd.read_csv(DATA_PATH, delimiter='\t')
df.head(10)
Out[2]:
Survived Pclass Name Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare
0 0 3 Mr. Owen Harris Braund male 22.0 1 0 7.2500
1 1 1 Mrs. John Bradley (Florence Briggs Thayer) Cum... female 38.0 1 0 71.2833
2 1 3 Miss. Laina Heikkinen female 26.0 0 0 7.9250
3 1 1 Mrs. Jacques Heath (Lily May Peel) Futrelle female 35.0 1 0 53.1000
4 0 3 Mr. William Henry Allen male 35.0 0 0 8.0500
5 0 3 Mr. James Moran male 27.0 0 0 8.4583
6 0 1 Mr. Timothy J McCarthy male 54.0 0 0 51.8625
7 0 3 Master. Gosta Leonard Palsson male 2.0 3 1 21.0750
8 1 3 Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson female 27.0 0 2 11.1333
9 1 2 Mrs. Nicholas (Adele Achem) Nasser female 14.0 1 0 30.0708

Separate Numerical and Categorical Columns

In [3]:
# Column positions, following the column order displayed by df.head() above.
TARGET_COL = 0             # Survived
CATEGORICAL_COLS = [1, 3]  # Pclass, Sex
NUMERICAL_COLS = [4]       # Age

# The list-style selections keep the feature arrays 2-D; the scalar selection
# for the label yields a 1-D array. Categorical values are cast to strings.
Xnum = df.iloc[:, NUMERICAL_COLS].values
Xcat = df.iloc[:, CATEGORICAL_COLS].values.astype(str)
y = df.iloc[:, TARGET_COL].values

Create Train Test Split

In [4]:
# Split the numerical features, categorical features, and labels together in
# a single call per stage. train_test_split applies the same row partition to
# every array passed to it, so the three arrays stay aligned by construction
# rather than by relying on two separate calls sharing a random_state (which
# also assigned the label splits twice).
#
# Stage 1: 70% training / 30% holdout.
(Xnum_train, Xnum_holdout,
 Xcat_train, Xcat_holdout,
 y_train, y_holdout) = train_test_split(
    Xnum, Xcat, y, test_size=0.3, random_state=1
)

# Stage 2: the holdout is halved into validation and test sets.
(Xnum_val, Xnum_test,
 Xcat_val, Xcat_test,
 y_val, y_test) = train_test_split(
    Xnum_holdout, Xcat_holdout, y_holdout, test_size=0.5, random_state=1
)

print(Xnum_train.shape)
print(Xnum_val.shape)
print(Xnum_test.shape)
print()
print(Xcat_train.shape)
print(Xcat_val.shape)
print(Xcat_test.shape)
(620, 1)
(133, 1)
(134, 1)

(620, 2)
(133, 2)
(134, 2)

Encode Categorical Features

In [5]:
# One-hot encode the categorical columns. The encoder is fitted on the
# training split only and then applied unchanged to all three splits.
#
# The encoder is left at its default sparse output and densified with
# .toarray(). The former `sparse=False` keyword was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, so passing it fails with a TypeError
# on current releases; this form runs on both old and new versions.
encoder = OneHotEncoder()
encoder.fit(Xcat_train)

Xenc_train = encoder.transform(Xcat_train).toarray()
Xenc_val   = encoder.transform(Xcat_val).toarray()
Xenc_test  = encoder.transform(Xcat_test).toarray()

print(Xenc_train.shape)
print(Xenc_val.shape)
print(Xenc_test.shape)
(620, 5)
(133, 5)
(134, 5)

Merge Feature Arrays

In [6]:
# Join the feature arrays column-wise for each split: the numerical
# column(s) come first, followed by the one-hot encoded columns.
X_train = np.concatenate((Xnum_train, Xenc_train), axis=1)
X_val = np.concatenate((Xnum_val, Xenc_val), axis=1)
X_test = np.concatenate((Xnum_test, Xenc_test), axis=1)

# Report the merged shape of each split.
for merged in (X_train, X_val, X_test):
    print(merged.shape)
(620, 6)
(133, 6)
(134, 6)
In [41]:
# Sweep the maximum tree depth from 1 to 20, recording the accuracy on the
# training and validation splits for each depth.
rng = range(1,21)
tr_acc, va_acc = [], []

for d in rng:
    # fit() returns the fitted estimator, so construction and fitting chain.
    temp_mod = DecisionTreeClassifier(
        max_depth=d, min_samples_leaf=10, criterion='gini', random_state=1
    ).fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))

# Plot both accuracy curves against depth on a single set of axes.
fig, ax = plt.subplots(figsize=(9, 6))
ax.plot(rng, tr_acc, label='Training Accuracy')
ax.plot(rng, va_acc, label='Validation Accuracy')
ax.set_xlabel('Maximum Depth')
ax.set_ylabel('Accuracy')
ax.set_xticks(rng)
ax.legend()
plt.show()
In [43]:
# Fit the tree with max_depth=5 on the training split, then report accuracy
# on the training and validation splits.
titanic_mod = DecisionTreeClassifier(
    criterion='gini', max_depth=5, min_samples_leaf=10, random_state=1
).fit(X_train, y_train)

train_accuracy = titanic_mod.score(X_train, y_train)
val_accuracy = titanic_mod.score(X_val, y_val)

print('Training Accuracy:  ', train_accuracy)
print('Validation Accuracy:', val_accuracy)
Training Accuracy:   0.8387096774193549
Validation Accuracy: 0.8045112781954887

Titanic