import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Load the tab-separated Titanic dataset.
df = pd.read_csv('data/titanic.txt', sep='\t')
# Quick sanity check of the first rows. A bare `df.head(10)` expression only
# renders in a notebook; print it so the preview also appears when run as a script.
print(df.head(10))
# Feature/target selection by column position:
#   column 4        -> the single numeric feature
#   columns 1 and 3 -> categorical features (cast to str for OneHotEncoder)
#   column 0        -> target label
# NOTE(review): assumes this column layout — confirm against data/titanic.txt.
Xnum = df.iloc[:, [4]].values
Xcat = df.iloc[:, [1, 3]].values.astype('str')
y = df.iloc[:, 0].values
# Split numeric features, raw categorical features, and labels in ONE call so
# all three stay row-aligned. (The original used two separate splits that only
# lined up because they shared random_state=1 — fragile.)
Xnum_train, Xnum_val, Xcat_train, Xcat_val, y_train, y_val = train_test_split(
    Xnum, Xcat, y, test_size=0.2, random_state=1)
print(Xnum_train.shape)
print(Xnum_val.shape)
# Standardize the numeric column. Fit on the training split only so the
# validation set never influences the scaling parameters.
scaler = StandardScaler()
scaler.fit(Xnum_train)
Xsca_train = scaler.transform(Xnum_train)
Xsca_val = scaler.transform(Xnum_val)
print(Xsca_train.shape)
print(Xsca_val.shape)
# One-hot encode the categorical columns, fitting on the training split only
# (the original fit on the full dataset — validation leakage).
# `sparse` was renamed `sparse_output` in sklearn 1.2 and removed in 1.4.
# handle_unknown='ignore' guards against categories absent from the train split.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(Xcat_train)
Xenc_train = encoder.transform(Xcat_train)
Xenc_val = encoder.transform(Xcat_val)
print(Xenc_train.shape)
print(Xenc_val.shape)
# Final design matrices: [scaled numeric | one-hot categorical].
X_train = np.hstack([Xsca_train, Xenc_train])
X_val = np.hstack([Xsca_val, Xenc_val])
print(X_train.shape)
print(X_val.shape)
# --- Sweep the logistic-regression regularization strength C = 10**k. ---
tr_acc = []
va_acc = []
exp_list = np.linspace(-3, 3, 100)
for k in exp_list:
    # `multi_class` is deprecated (sklearn 1.5+) and 'auto' was already the
    # default, so it is omitted — behavior is unchanged. max_iter is raised
    # because lbfgs often fails to converge within the default 100 iterations
    # at weak regularization (large C).
    temp_mod = LogisticRegression(solver='lbfgs', C=10**k, max_iter=1000)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))
# Visualize the train/validation accuracy curves over log10(C).
plt.figure(figsize=([6,4]))
plt.plot(exp_list, tr_acc, label='Training Accuracy')
plt.plot(exp_list, va_acc, label='Validation Accuracy')
plt.xlabel('log(C)')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Select the exponent with the best validation accuracy and refit.
idx = np.argmax(va_acc)
best_logC = exp_list[idx]
print(best_logC)
mod_01 = LogisticRegression(solver='lbfgs', C=10**best_logC, max_iter=1000)
mod_01.fit(X_train, y_train)
print('Training Accuracy: ', mod_01.score(X_train, y_train))
print('Validation Accuracy:', mod_01.score(X_val, y_val))
# --- Sweep the number of neighbors K for a KNN classifier. ---
tr_acc, va_acc = [], []
K_list = range(1, 40)
for n_neighbors in K_list:
    knn = KNeighborsClassifier(n_neighbors)
    knn.fit(X_train, y_train)
    tr_acc.append(knn.score(X_train, y_train))
    va_acc.append(knn.score(X_val, y_val))
# Plot both accuracy curves against K.
plt.figure(figsize=([6,4]))
plt.plot(K_list, tr_acc, label='Training Accuracy')
plt.plot(K_list, va_acc, label='Validation Accuracy')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Pick the K that maximizes validation accuracy, then refit and report.
best_idx = int(np.argmax(va_acc))
best_K = K_list[best_idx]
print(best_K)
mod_02 = KNeighborsClassifier(best_K)
mod_02.fit(X_train, y_train)
print('Training Accuracy: ', mod_02.score(X_train, y_train))
print('Validation Accuracy:', mod_02.score(X_val, y_val))
# --- Sweep max_depth for a decision tree. ---
tr_acc = []
va_acc = []
depth_list = range(1, 16)
# Fix each tree's RNG with random_state=1 rather than a single global
# np.random.seed(1) before the loop: with the global seed, every tree in the
# sweep consumed a different RNG state, so the re-fit model below (after
# reseeding) only reproduced the winning tree when best_d == 1, and its
# printed scores could disagree with the sweep.
for d in depth_list:
    temp_mod = DecisionTreeClassifier(max_depth=d, random_state=1)
    temp_mod.fit(X_train, y_train)
    tr_acc.append(temp_mod.score(X_train, y_train))
    va_acc.append(temp_mod.score(X_val, y_val))
# Plot the accuracy curves against tree depth.
plt.figure(figsize=([6,4]))
plt.plot(depth_list, tr_acc, label='Training Accuracy')
plt.plot(depth_list, va_acc, label='Validation Accuracy')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Select the depth with the best validation accuracy; refitting with the same
# random_state now reproduces exactly the tree that won the sweep.
idx = np.argmax(va_acc)
best_d = depth_list[idx]
print(best_d)
mod_03 = DecisionTreeClassifier(max_depth=best_d, random_state=1)
mod_03.fit(X_train, y_train)
print('Training Accuracy: ', mod_03.score(X_train, y_train))
print('Validation Accuracy:', mod_03.score(X_val, y_val))
# Soft-voting ensemble of the three tuned models (probabilities are averaged;
# note fit() re-trains clones of each estimator on X_train).
base_estimators = [('lr', mod_01), ('knn', mod_02), ('tree', mod_03)]
mod_04 = VotingClassifier(estimators=base_estimators, voting='soft')
mod_04.fit(X_train, y_train)
print('Training Accuracy: ', mod_04.score(X_train, y_train))
print('Validation Accuracy:', mod_04.score(X_val, y_val))
# Side-by-side validation accuracy of each individual model and the ensemble.
for tag, candidate in [('01', mod_01), ('02', mod_02), ('03', mod_03), ('04', mod_04)]:
    print(f'Model {tag} Validation Accuracy:', candidate.score(X_val, y_val))
# Second soft-voting ensemble: drop the logistic regression and keep only the
# KNN and tree models.
mod_05 = VotingClassifier(
    estimators=[('knn', mod_02), ('tree', mod_03)],
    voting='soft',
)
mod_05.fit(X_train, y_train)
print('Training Accuracy: ', mod_05.score(X_train, y_train))
print('Validation Accuracy:', mod_05.score(X_val, y_val))
# Compare every model built so far on the validation set.
comparison = [('01', mod_01), ('02', mod_02), ('03', mod_03),
              ('04', mod_04), ('05', mod_05)]
for tag, candidate in comparison:
    print(f'Model {tag} Validation Accuracy:', candidate.score(X_val, y_val))
# Larger soft-voting ensemble built from fresh (untuned) base learners:
# three KNN models (K = 1, 2, 3) and three shallow trees (depths 3, 3, 4).
members = [KNeighborsClassifier(n) for n in (1, 2, 3)]
members += [DecisionTreeClassifier(max_depth=d) for d in (3, 3, 4)]
mod_06 = VotingClassifier(
    estimators=[(f'clf_0{i}', est) for i, est in enumerate(members, start=1)],
    voting='soft',
)
mod_06.fit(X_train, y_train)
print('Training Accuracy: ', mod_06.score(X_train, y_train))
print('Validation Accuracy:', mod_06.score(X_val, y_val))