Lesson 14 - Decision Tree Classifier for Titanic Dataset

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

Load the Data

# Read the tab-delimited Titanic file into a DataFrame.
df = pd.read_csv('data/titanic.txt', delimiter='\t')

# Preview the first ten rows (only rendered when this is the last
# expression of a notebook cell; a no-op when run as a plain script).
df.head(n=10)

Separate Numerical and Categorical Columns

# Positional column indices into `df`. Going by the df.head(10) preview in
# this notebook these are 0=Survived, 1=Pclass, 3=Sex, 4=Age -- confirm
# against the actual file if its column order ever changes.
target_col = 0
numeric_cols = [4]
categorical_cols = [1, 3]

# A list indexer keeps the numeric block 2-D (n_rows, 1) so it can be
# stacked column-wise with the encoded categorical features later on.
Xnum = df.iloc[:, numeric_cols].values

# Cast every categorical value to a string so the columns share one dtype.
Xcat = df.iloc[:, categorical_cols].values.astype(str)

# 1-D label vector.
y = df.iloc[:, target_col].values

Create Train Test Split

# Split the numeric features, categorical features and labels together.
# train_test_split applies the same shuffled row indices to every array it
# is given, so a single call keeps Xnum, Xcat and y row-aligned by
# construction -- instead of relying on two separate calls that happen to
# repeat the same test_size/random_state (and recompute y_* twice).

# Stage 1: 70% training, 30% holdout.
(Xnum_train, Xnum_holdout,
 Xcat_train, Xcat_holdout,
 y_train, y_holdout) = train_test_split(
    Xnum, Xcat, y, test_size=0.3, random_state=1)

# Stage 2: divide the holdout evenly into validation and test sets.
(Xnum_val, Xnum_test,
 Xcat_val, Xcat_test,
 y_val, y_test) = train_test_split(
    Xnum_holdout, Xcat_holdout, y_holdout, test_size=0.5, random_state=1)

# Sanity check: numeric and categorical partitions must have matching row counts.
print(Xnum_train.shape)
print(Xnum_val.shape)
print(Xnum_test.shape)
print()
print(Xcat_train.shape)
print(Xcat_val.shape)
print(Xcat_test.shape)

(620, 1)
(133, 1)
(134, 1)

(620, 2)
(133, 2)
(134, 2)

Encode Categorical Features

# One-hot encode the categorical columns.
#
# The `sparse=False` keyword was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output`) and removed in 1.4, where passing it raises a TypeError.
# To stay compatible with both old and new releases, keep the encoder's
# default sparse output and densify explicitly with `.toarray()`.
encoder = OneHotEncoder()

# Learn the category vocabulary from the training split only, so nothing
# from the validation/test splits leaks into the encoding.
encoder.fit(Xcat_train)

# Apply the fitted encoding to every split and convert to dense ndarrays
# (needed for np.hstack with the numeric features).
Xenc_train = encoder.transform(Xcat_train).toarray()
Xenc_val   = encoder.transform(Xcat_val).toarray()
Xenc_test  = encoder.transform(Xcat_test).toarray()

print(Xenc_train.shape)
print(Xenc_val.shape)
print(Xenc_test.shape)

(620, 5)
(133, 5)
(134, 5)

Merge Feature Arrays

# Build the final design matrices by placing the numeric column(s) first,
# followed by the one-hot encoded categorical columns (joined along axis 1).
X_train = np.concatenate((Xnum_train, Xenc_train), axis=1)
X_val = np.concatenate((Xnum_val, Xenc_val), axis=1)
X_test = np.concatenate((Xnum_test, Xenc_test), axis=1)

# Report the shape of each merged split.
for merged in (X_train, X_val, X_test):
    print(merged.shape)

(620, 6)
(133, 6)
(134, 6)

# Sweep the tree's maximum depth and record accuracy on the training and
# validation splits for each setting.
rng = range(1, 21)
tr_acc, va_acc = [], []

for depth in rng:
    # Fixed random_state keeps each fitted tree reproducible.
    candidate = DecisionTreeClassifier(
        criterion='gini',
        max_depth=depth,
        min_samples_leaf=10,
        random_state=1,
    )
    candidate.fit(X_train, y_train)
    tr_acc.append(candidate.score(X_train, y_train))
    va_acc.append(candidate.score(X_val, y_val))

# Plot both accuracy curves against depth to pick a depth by eye.
fig, ax = plt.subplots(figsize=(9, 6))
ax.plot(rng, tr_acc, label='Training Accuracy')
ax.plot(rng, va_acc, label='Validation Accuracy')
ax.set_xlabel('Maximum Depth')
ax.set_ylabel('Accuracy')
ax.set_xticks(rng)
ax.legend()
plt.show()

# Final model: a depth-5 tree with the same leaf-size, criterion and seed
# settings used in the depth sweep.
titanic_mod = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=10,
    random_state=1,
)
titanic_mod.fit(X_train, y_train)

# Mean accuracy on the training and validation splits.
train_accuracy = titanic_mod.score(X_train, y_train)
val_accuracy = titanic_mod.score(X_val, y_val)

print('Training Accuracy:  ', train_accuracy)
print('Validation Accuracy:', val_accuracy)

Training Accuracy:   0.8387096774193549
Validation Accuracy: 0.8045112781954887

Titanic

	Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
0	0	3	Mr. Owen Harris Braund	male	22.0	1	0	7.2500
1	1	1	Mrs. John Bradley (Florence Briggs Thayer) Cum...	female	38.0	1	0	71.2833
2	1	3	Miss. Laina Heikkinen	female	26.0	0	0	7.9250
3	1	1	Mrs. Jacques Heath (Lily May Peel) Futrelle	female	35.0	1	0	53.1000
4	0	3	Mr. William Henry Allen	male	35.0	0	0	8.0500
5	0	3	Mr. James Moran	male	27.0	0	0	8.4583
6	0	1	Mr. Timothy J McCarthy	male	54.0	0	0	51.8625
7	0	3	Master. Gosta Leonard Palsson	male	2.0	3	1	21.0750
8	1	3	Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson	female	27.0	0	2	11.1333
9	1	2	Mrs. Nicholas (Adele Achem) Nasser	female	14.0	1	0	30.0708