import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# This dataset contains information about the 887 passengers on the voyage of the Titanic. Of the columns available, we will be particularly interested in the following:
# Survived
# - Binary variable indicating whether or not the passenger survived the voyage.
# Pclass
# - Categorical variable indicating the passenger class.
# Sex
# - Categorical variable indicating the gender of the passenger.
# Age
# - Age of the passenger.

# Load the tab-separated passenger table into a DataFrame.
df = pd.read_csv('data/titanic.txt', sep='\t')

# Outside a notebook a bare expression displays nothing, so the preview, the
# table dimensions and the per-column missing-value counts are printed
# explicitly.
print(df.head(10))
print(df.shape)
print(df.isna().sum(axis=0))
def _survival_rate(group_mask):
    """Return the fraction of the rows selected by *group_mask* with Survived == 1."""
    return np.sum(group_mask & (df.Survived == 1)) / np.sum(group_mask)


def _plot_survival_rates(survival_rates, death_rates, tick_labels, title):
    """Draw one stacked bar per group: survived proportion below, died proportion on top."""
    positions = range(len(tick_labels))
    plt.figure(figsize=[6,4])
    plt.bar(positions, survival_rates, label='Survived', color='cornflowerblue')
    # Stack the "Died" segment on top of the "Survived" segment.
    plt.bar(positions, death_rates, label='Died', bottom=survival_rates, color='Salmon')
    # Place the legend just outside the right edge of the axes.
    plt.legend(loc="center left", bbox_to_anchor=(1.03,0.5))
    plt.xticks(positions, tick_labels)
    plt.ylabel('Proportion')
    plt.title(title)
    plt.show()


# Survival proportion by sex.
female_survival_rate = _survival_rate(df.Sex == 'female')
male_survival_rate = _survival_rate(df.Sex == 'male')
sex_survival_rates = np.array([female_survival_rate, male_survival_rate])
sex_death_rates = 1 - sex_survival_rates
_plot_survival_rates(sex_survival_rates, sex_death_rates,
                     ['Female', 'Male'], 'Survival Rate by Sex')

# Survival proportion by passenger class.
class1_survival_rate = _survival_rate(df.Pclass == 1)
class2_survival_rate = _survival_rate(df.Pclass == 2)
class3_survival_rate = _survival_rate(df.Pclass == 3)
class_survival_rates = np.array([class1_survival_rate, class2_survival_rate, class3_survival_rate])
class_death_rates = 1 - class_survival_rates
_plot_survival_rates(class_survival_rates, class_death_rates,
                     ['Class 1', 'Class 2', 'Class 3'], 'Survival Rate by Class')
# Compare the age distributions of passengers who died and who survived.
_ages = df.Age.values
_ages_died = _ages[df.Survived == 0]
_ages_survived = _ages[df.Survived == 1]
plt.violinplot([_ages_died, _ages_survived], showmeans=True)
# The two violins are drawn at x = 1 and x = 2; the tick at 0 gets an empty
# label and falls outside the x-limits set below.
plt.xticks([0, 1, 2], ['', 'Died', 'Survived'])
plt.xlim(0.5, 2.5)
plt.show()
# Select features and label by column position.
# NOTE(review): presumably column 4 is Age, columns 1 and 3 are Pclass and Sex,
# and column 0 is Survived -- confirm against the file's column order.
Xnum = df.iloc[:, [4]].values
Xcat = df.iloc[:, [1, 3]].values.astype('str')
y = df.iloc[:, 0].values

# Split all three arrays in a single call per stage so the numeric features,
# categorical features and labels are guaranteed to share the same row
# partition (instead of relying on separate calls with a matching random_state).
# Stage 1: 70% training / 30% holdout.
(Xnum_train, Xnum_holdout,
 Xcat_train, Xcat_holdout,
 y_train, y_holdout) = train_test_split(Xnum, Xcat, y, test_size=0.3, random_state=1)

# Stage 2: split the holdout in half into validation and test sets.
(Xnum_val, Xnum_test,
 Xcat_val, Xcat_test,
 y_val, y_test) = train_test_split(Xnum_holdout, Xcat_holdout, y_holdout,
                                   test_size=0.5, random_state=1)

print(Xnum_train.shape)
print(Xnum_val.shape)
print(Xnum_test.shape)
print()
print(Xcat_train.shape)
print(Xcat_val.shape)
print(Xcat_test.shape)
# One-hot encode the categorical features, learning the categories from the
# training split only. The dense-output keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was later removed, so
# try the new spelling first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoder.fit(Xcat_train)

Xenc_train_ = encoder.transform(Xcat_train)
Xenc_val_ = encoder.transform(Xcat_val)
Xenc_test_ = encoder.transform(Xcat_test)
print(Xenc_train_.shape)
print(Xenc_val_.shape)
print(Xenc_test_.shape)

# Drop encoded columns 0 and 3 to avoid redundant indicator columns.
# NOTE(review): presumably these are the first level of each categorical
# feature (three class levels followed by two sex levels) -- confirm.
Xenc_train = np.delete(Xenc_train_, [0, 3], axis=1)
Xenc_val = np.delete(Xenc_val_, [0, 3], axis=1)
Xenc_test = np.delete(Xenc_test_, [0, 3], axis=1)
print(Xenc_train.shape)
print(Xenc_val.shape)
print(Xenc_test.shape)
# Standardize the numeric feature; the scaler is fitted on the training split
# only and then applied unchanged to the validation and test splits.
scaler = StandardScaler().fit(Xnum_train)
Xsca_train, Xsca_val, Xsca_test = (
    scaler.transform(_split) for _split in (Xnum_train, Xnum_val, Xnum_test)
)
for _scaled in (Xsca_train, Xsca_val, Xsca_test):
    print(_scaled.shape)
# Assemble the final design matrices: scaled numeric column(s) first, followed
# by the reduced one-hot columns.
X_train = np.concatenate([Xsca_train, Xenc_train], axis=1)
X_val = np.concatenate([Xsca_val, Xenc_val], axis=1)
X_test = np.concatenate([Xsca_test, Xenc_test], axis=1)
for _matrix in (X_train, X_val, X_test):
    print(_matrix.shape)
# Baseline logistic regression with the default regularization strength.
logreg_model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
for _label, _features, _labels in (
    ('Training Accuracy: ', X_train, y_train),
    ('Validation Accuracy:', X_val, y_val),
):
    print(_label, logreg_model.score(_features, _labels))
# Sweep the logistic-regression parameter C over 20 values from 10**-3 to
# 10**3 (evenly spaced exponents) and record accuracy on the training and
# validation splits for each value.
exp_list = np.linspace(-3,3, 20)
tr_acc = []
va_acc = []
for k in exp_list:
    temp_model = LogisticRegression(solver='lbfgs', C=10**k)
    temp_model.fit(X_train, y_train)
    tr_acc.append(temp_model.score(X_train, y_train))
    va_acc.append(temp_model.score(X_val, y_val))

plt.figure(figsize=([6,4]))
plt.plot(exp_list, tr_acc, label='Training Accuracy')
plt.plot(exp_list, va_acc, label='Validation Accuracy')
# The x-axis holds the base-10 exponent of C (scikit-learn's inverse
# regularization strength), so it is labelled as such rather than "alpha".
plt.xlabel('log10(C)')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Refit logistic regression with C fixed at 100 and report its accuracy on the
# training and validation splits.
logreg_model = LogisticRegression(solver='lbfgs', C=100).fit(X_train, y_train)
for _label, _features, _labels in (
    ('Training Accuracy: ', X_train, y_train),
    ('Validation Accuracy:', X_val, y_val),
):
    print(_label, logreg_model.score(_features, _labels))
# Sweep the number of neighbours K from 1 to 59 and record accuracy on the
# training and validation splits for each value.
tr_acc = []
va_acc = []
k_range = range(1, 60)
for k in k_range:
    temp_model = KNeighborsClassifier(k)
    temp_model.fit(X_train, y_train)
    tr_acc.append(temp_model.score(X_train, y_train))
    va_acc.append(temp_model.score(X_val, y_val))

plt.figure(figsize=([6,4]))
plt.plot(k_range, tr_acc, label='Training Accuracy')
plt.plot(k_range, va_acc, label='Validation Accuracy')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# np.argmax returns a 0-based position in va_acc, which is one less than the
# corresponding K because k_range starts at 1; map it back to the K value so
# the printed number can be used directly as n_neighbors.
best_k = k_range[int(np.argmax(va_acc))]
print('Best K:', best_k)
# Final K-nearest-neighbours model with K = 4, scored on all three splits.
knn_model = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
for _label, _features, _labels in (
    ('Training Accuracy: ', X_train, y_train),
    ('Validation Accuracy:', X_val, y_val),
    ('Testing Accuracy: ', X_test, y_test),
):
    print(_label, knn_model.score(_features, _labels))
# We will use our models to generate predictions for 18 individuals representing all possible combinations of the following feature values:
# Sex
# - Female or Male
# Class
# - 1, 2, or 3
# Age
# - 10, 20, or 40

def _new_passenger_grid(age):
    """Return a (6, 3) array of [age, class, sex] rows for one fixed age.

    Rows are ordered male classes 1-3, then female classes 1-3. Because each
    row mixes numbers and strings, NumPy stores every entry as a string.
    """
    return np.array([[age, pclass, sex]
                     for sex in ('male', 'female')
                     for pclass in (1, 2, 3)])


# One grid of hypothetical passengers per age of interest.
X_new_10 = _new_passenger_grid(10)
X_new_20 = _new_passenger_grid(20)
X_new_40 = _new_passenger_grid(40)
def _preprocess_new_passengers(raw):
    """Apply the fitted scaler and encoder to an array of [age, class, sex] rows.

    Returns a tuple ``(scaled_age, encoded_categories, combined)`` where
    ``combined`` has the scaled age column first, followed by the reduced
    one-hot columns.
    """
    # Column 0 is stored as text, so convert it back to float before scaling.
    scaled_age = scaler.transform(raw[:,[0]].astype('float64'))
    # Encode columns 1 and 2, then drop encoded columns 0 and 3.
    encoded = np.delete(encoder.transform(raw[:,[1,2]]), [0, 3], axis=1)
    return scaled_age, encoded, np.hstack([scaled_age, encoded])


X_new_sca_10, X_new_enc_10, X_new_pp_10 = _preprocess_new_passengers(X_new_10)
print(X_new_pp_10)
X_new_sca_20, X_new_enc_20, X_new_pp_20 = _preprocess_new_passengers(X_new_20)
print(X_new_pp_20)
X_new_sca_40, X_new_enc_40, X_new_pp_40 = _preprocess_new_passengers(X_new_40)
print(X_new_pp_40)
# Predicted labels for every hypothetical passenger: first all three age
# groups with the KNN model, then all three with logistic regression.
for _model in (knn_model, logreg_model):
    for _batch in (X_new_pp_10, X_new_pp_20, X_new_pp_40):
        print(_model.predict(_batch))

# Class probabilities for the 40-year-old group from both models.
for _model in (logreg_model, knn_model):
    print(_model.predict_proba(X_new_pp_40))