import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
# Load the tab-separated Titanic data set.
df = pd.read_csv('data/titanic.txt', sep='\t')
# Preview of the first ten rows; the result is only displayed when this runs
# as the last expression of an interactive/notebook cell.
df.head(10)

# Columns are picked by position.
# NOTE(review): presumably column 0 is the survival label, column 4 a numeric
# feature and columns 1 and 3 categorical features -- confirm against the
# file's header.
y = df.iloc[:, 0].values
# A list indexer keeps the selection two-dimensional, shape (n_samples, 1).
Xnum = df.iloc[:, [4]].to_numpy()
# Cast to strings so every categorical value is treated as a label.
Xcat = df.iloc[:, [1, 3]].to_numpy().astype(str)
# Split into 70% training and 30% holdout, then halve the holdout into
# validation and test sets (15% of the data each).
# The numeric features, categorical features and labels all go through a
# single train_test_split call per stage, so their rows are guaranteed to stay
# aligned instead of relying on separate calls sharing the same random_state.
(Xnum_train, Xnum_holdout,
 Xcat_train, Xcat_holdout,
 y_train, y_holdout) = train_test_split(
    Xnum, Xcat, y, test_size=0.3, random_state=1)
(Xnum_val, Xnum_test,
 Xcat_val, Xcat_test,
 y_val, y_test) = train_test_split(
    Xnum_holdout, Xcat_holdout, y_holdout, test_size=0.5, random_state=1)

# Sanity check: report the size of each partition for both feature groups.
print(Xnum_train.shape)
print(Xnum_val.shape)
print(Xnum_test.shape)
print()
print(Xcat_train.shape)
print(Xcat_val.shape)
print(Xcat_test.shape)
# One-hot encode the categorical features into a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Learn the categories from the training set only, then apply the same
# encoding to the validation and test sets.
Xenc_train = encoder.fit_transform(Xcat_train)
Xenc_val = encoder.transform(Xcat_val)
Xenc_test = encoder.transform(Xcat_test)

# Sanity check: all three partitions must have the same number of columns.
print(Xenc_train.shape)
print(Xenc_val.shape)
print(Xenc_test.shape)
# Build the final feature matrices: numeric column(s) first, followed by the
# one-hot encoded categorical columns, joined side by side.
X_train = np.concatenate((Xnum_train, Xenc_train), axis=1)
X_val = np.concatenate((Xnum_val, Xenc_val), axis=1)
X_test = np.concatenate((Xnum_test, Xenc_test), axis=1)

# Report the shape of each assembled matrix.
for _features in (X_train, X_val, X_test):
    print(_features.shape)
# Sweep the tree's maximum depth from 1 to 20 and record the training and
# validation accuracy at each depth.
tr_acc = []
va_acc = []
rng = range(1, 21)

for d in rng:
    # The loop body must be indented; written flush-left it is an
    # IndentationError and the sweep never runs.
    temp_mod = DecisionTreeClassifier(max_depth=d, min_samples_leaf=10, criterion='gini', random_state=1)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))

# Plot both accuracy curves against depth so the depth at which validation
# accuracy stops improving can be read off the chart.
plt.figure(figsize=(9, 6))
plt.plot(rng, tr_acc, label='Training Accuracy')
plt.plot(rng, va_acc, label='Validation Accuracy')
plt.xlabel('Maximum Depth')
plt.ylabel('Accuracy')
plt.xticks(rng)
plt.legend()
plt.show()
# Fit the final decision tree with a fixed maximum depth of 5.
# NOTE(review): the depth is hard-coded, presumably chosen by reading the
# validation curve plotted by the depth sweep -- confirm.
titanic_mod = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=10,
    random_state=1,
).fit(X_train, y_train)

# Report accuracy on the data the model was trained on and on the validation
# set.
_train_accuracy = titanic_mod.score(X_train, y_train)
_val_accuracy = titanic_mod.score(X_val, y_val)
print('Training Accuracy: ', _train_accuracy)
print('Validation Accuracy:', _val_accuracy)