Lesson 25 - Facial Recognition with PCA

In this lesson we build facial-recognition classifiers on the Labeled Faces in the Wild (LFW) dataset. We first train logistic regression, KNN, and SVM models on raw pixel features, then repeat the exercise on a 150-component PCA representation and compare accuracy and training time.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_people
In [2]:
# keep only people who appear in at least 60 images
faces = fetch_lfw_people(min_faces_per_person=60)
print(type(faces))
<class 'sklearn.utils.Bunch'>
In [3]:
y = faces.target
X = faces.data

print(y.shape)
print(X.shape)
(964,)
(964, 2914)
In [4]:
names = faces.target_names
print(names)
['Donald Rumsfeld' 'George W Bush' 'Gerhard Schroeder' 'Junichiro Koizumi'
 'Tony Blair']
In [5]:
# each image is 62 x 47 pixels; flattened, that is 62 * 47 = 2914 features per row of X
images = faces.images
print(images.shape)
(964, 62, 47)
In [6]:
# pick a random face and display it
i = np.random.choice(range(images.shape[0]))

plt.close()
plt.rcParams["figure.figsize"] = [4,4]
plt.imshow(images[i], cmap='bone')
plt.axis('off')
plt.show()
In [7]:
# count how many images each person has
_, name_dist = np.unique(y, return_counts=True)

plt.close()
plt.bar(range(len(names)), name_dist)
plt.xticks(range(len(names)), names, rotation='vertical')
plt.show()
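
The exact counts behind the bar chart can also be printed directly; a quick sketch:

# pair each person's name with their image count
for name, count in zip(names, name_dist):
    print(name, count)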

Train-Test Splits and Rescaling

We hold out 20% of the images for validation and rescale the 0-255 pixel intensities to [0, 1].

In [8]:
from sklearn.model_selection import train_test_split
In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=1)

print(y_train.size)
print(y_val.size)
771
193
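
The classes are imbalanced, so a stratified split would keep each person's share of images the same in both sets; a minimal alternative sketch (not run here):

# hypothetical alternative: stratify on y to preserve class proportions
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=1, stratify=y)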
In [10]:
# rescale pixel intensities from 0-255 down to [0, 1]
Xs = X / 255
Xs_train = X_train / 255
Xs_val = X_val / 255

Import Algorithms

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

Logistic Regression

In [12]:
%%time
lr_mod = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=2000)
lr_mod.fit(Xs_train, y_train)
print(lr_mod.score(Xs_train, y_train))
print(lr_mod.score(Xs_val, y_val))
1.0
0.8756476683937824
Wall time: 14.9 s
In [13]:
# classify a random validation face: true name in blue, prediction in red
i = np.random.choice(range(X_val.shape[0]))
pred = lr_mod.predict([Xs_val[i,:]])
plt.imshow(X_val[i].reshape(62,47), cmap='bone')
plt.axis('off')
plt.text(50, 5, s = names[y_val[i]], fontsize=16, color='b')
plt.text(50, 15, s = names[pred[0]], fontsize=16, color='r')
plt.show()
In [14]:
pred_val = lr_mod.predict(Xs_val)
In [15]:
print( classification_report(y_val, pred_val, target_names=names))
                   precision    recall  f1-score   support

  Donald Rumsfeld       0.85      0.77      0.81        30
    George W Bush       0.88      0.95      0.91       100
Gerhard Schroeder       0.75      0.60      0.67        15
Junichiro Koizumi       0.83      0.94      0.88        16
       Tony Blair       0.96      0.84      0.90        32

        micro avg       0.88      0.88      0.88       193
        macro avg       0.86      0.82      0.83       193
     weighted avg       0.88      0.88      0.87       193

In [16]:
print(confusion_matrix(y_val, pred_val))
[[23  5  2  0  0]
 [ 3 95  0  1  1]
 [ 0  4  9  2  0]
 [ 0  1  0 15  0]
 [ 1  3  1  0 27]]
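
Rows are true labels and columns are predictions; most of the off-diagonal mass lands in the George W Bush column, the majority class. A labeled view is easier to read (pandas is an assumption, not used elsewhere in this lesson):

import pandas as pd

# label rows (true) and columns (predicted) with the target names
cm = confusion_matrix(y_val, pred_val)
print(pd.DataFrame(cm, index=names, columns=names))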

KNN

Nearest neighbors does much worse here: with 2914 raw pixel dimensions, Euclidean distances between faces carry little discriminative information.

In [17]:
%%time
KNN_mod = KNeighborsClassifier(n_neighbors=12)
KNN_mod.fit(Xs_train, y_train)
print(KNN_mod.score(Xs_train, y_train))
print(KNN_mod.score(Xs_val, y_val))
0.6977950713359273
0.6062176165803109
Wall time: 5.53 s

SVM

In [18]:
%%time
svm_mod = SVC(kernel='rbf', gamma=0.001, C=10)
svm_mod.fit(Xs_train, y_train)
print(svm_mod.score(Xs_train, y_train))
print(svm_mod.score(Xs_val, y_val))
0.9429312581063554
0.8808290155440415
Wall time: 5.14 s

PCA

Each principal component is itself a length-2914 vector that can be reshaped into a 62 x 47 "eigenface"; every face can then be approximated as a weighted sum of eigenfaces.

In [19]:
from sklearn.decomposition import PCA
In [20]:
# project the 2914-dimensional faces onto the top 150 components;
# whiten=True rescales each component to unit variance
pca = PCA(n_components=150, whiten=True)
Z = pca.fit_transform(X)
In [21]:
print(np.cumsum(pca.explained_variance_ratio_))
[0.1816338  0.3364153  0.41166073 0.47067642 0.5227791  0.55331653
 0.57884026 0.6009663  0.62075657 0.6392411  0.65541136 0.67019105
 0.68319    0.6941593  0.7047003  0.7145431  0.723653   0.7321862
 0.73992765 0.7470082  0.75388503 0.7604203  0.7666335  0.772263
 0.7775028  0.78250843 0.78734744 0.79188144 0.79626787 0.8005816
 0.8046137  0.80836153 0.81201607 0.8155992  0.8190737  0.8223044
 0.82551885 0.82869947 0.8317205  0.8346612  0.8375452  0.84031194
 0.84300965 0.84557354 0.8480582  0.8504399  0.85277313 0.8550852
 0.8573101  0.859521   0.8616967  0.86369383 0.86568403 0.8676316
 0.8695059  0.87136334 0.8731691  0.8749307  0.8766196  0.87827647
 0.87990505 0.88151217 0.8830891  0.8846481  0.8861708  0.8876455
 0.8891152  0.8905677  0.89199173 0.89333713 0.8946594  0.8959665
 0.8972639  0.89853543 0.8997869  0.90102345 0.90222174 0.90337235
 0.90450156 0.9056183  0.90671676 0.9077809  0.90884197 0.9099
 0.91092694 0.9119377  0.91293526 0.9139156  0.9148934  0.91583896
 0.9167742  0.9176962  0.91859406 0.91947556 0.92035174 0.9212172
 0.9220518  0.9228759  0.9236937  0.92450166 0.925305   0.92609763
 0.926862   0.9276127  0.9283528  0.92908245 0.92980295 0.9305171
 0.93122447 0.9319076  0.9325745  0.93324006 0.9338987  0.9345487
 0.93519133 0.9358185  0.9364378  0.93704873 0.93765604 0.93824637
 0.93883175 0.9394158  0.93999213 0.94056064 0.9411189  0.9416696
 0.94221693 0.94275934 0.943287   0.94380796 0.9443242  0.9448329
 0.945329   0.9458196  0.9463045  0.9467833  0.94725734 0.9477259
 0.94819236 0.9486538  0.94910634 0.9495432  0.949978   0.9504101
 0.95083565 0.95125616 0.9516682  0.95207804 0.9524814  0.9528795 ]
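
The 150 retained components explain roughly 95% of the pixel-level variance while shrinking each sample from 2914 features to 150. Plotting the cumulative curve is a handy way to choose n_components; a minimal sketch:

# cumulative explained variance vs. number of retained components
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()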
In [22]:
# pca.components_ has shape (150, 2914): one eigenface per row
pc = pca.components_
In [23]:
plt.close()
plt.rcParams["figure.figsize"] = [12,12]
# plot the first 25 eigenfaces, ordered by explained variance
for i in range(25):
    plt.subplot(5,5,i+1)
    plt.imshow(pc[i].reshape(62,47), cmap='bone')
    plt.axis('off')
plt.show()
In [24]:
plt.rcParams["figure.figsize"] = [4,4]
plt.imshow(images[0], cmap='bone')
plt.axis('off')
plt.show()

print(Z[0,:25])
[-0.7898546   0.8197325  -0.40130562  1.2586493  -1.2593197  -0.31830105
 -0.7259493   0.06564734  0.6927802  -0.09108707 -0.56477296  1.1630869
  0.05774781  1.3372539  -0.04533439 -0.54714173  0.07360779  0.63917345
  0.31831864  0.4101087  -0.77057254 -0.6850689   1.6429013  -0.6700993
 -0.32018936]
In [25]:
# build up image 0 one weighted eigenface at a time
cp = np.zeros(len(pc[0])).reshape(62,47)

plt.close()
plt.rcParams["figure.figsize"] = [12,12]
for i in range(25):

    # add the i-th eigenface, weighted by image 0's i-th PCA coordinate
    cp += Z[0,i] * pc[i].reshape(62,47)
    
    plt.subplot(5,5,i+1)
    plt.imshow(cp, cmap='bone')
    plt.axis('off')
plt.show()
In [26]:
# full reconstruction: weight all 150 eigenfaces by image 0's coordinates and sum
cp = Z[0,:].reshape(150,1) * pc[:,:]
cp = np.sum(cp, axis=0)

plt.rcParams["figure.figsize"] = [4,4]
plt.imshow(cp.reshape(62,47), cmap='bone')
plt.axis('off')
plt.show()
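
The manual reconstructions above leave out the mean face (pca.mean_) and the rescaling that whiten=True introduces, so they are approximations for visualization. PCA.inverse_transform undoes both; a minimal sketch:

# exact 150-component reconstruction: unwhitens and adds back the mean face
recon = pca.inverse_transform(Z[0].reshape(1, -1))
plt.imshow(recon.reshape(62, 47), cmap='bone')
plt.axis('off')
plt.show()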
In [27]:
# project the train and validation splits into the 150-dimensional PCA space
Z_train = pca.transform(X_train)
Z_val = pca.transform(X_val)
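
Note that pca was fit on all of X above, so the validation images influence the learned components. Refitting the PCA on the training split only, e.g. inside a Pipeline, avoids that leakage; a minimal sketch:

from sklearn.pipeline import make_pipeline

# the PCA step is fit on X_train only when the pipeline is fit
pipe = make_pipeline(PCA(n_components=150, whiten=True),
                     LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=6000))
pipe.fit(X_train, y_train)
print(pipe.score(X_val, y_val))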

Logistic Regression (PCA)

Training on the 150 PCA features instead of 2914 raw pixels cuts the fit time from about 15 s to well under a second and slightly improves validation accuracy.

In [28]:
%%time
pca_lr_mod = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=6000)
pca_lr_mod.fit(Z_train, y_train)
print(pca_lr_mod.score(Z_train, y_train))
print(pca_lr_mod.score(Z_val, y_val))
0.9961089494163424
0.8911917098445595
Wall time: 62.5 ms
In [29]:
# predict a random validation face and compare it with its PCA reconstruction
i = np.random.choice(range(X_val.shape[0]))
pred = pca_lr_mod.predict([Z_val[i,:]])

plt.rcParams["figure.figsize"] = [8,8]

plt.subplot(1,2,1)
plt.imshow(X_val[i].reshape(62,47), cmap='bone')
plt.axis('off')

plt.subplot(1,2,2)
# rebuild the face from its 150 PCA coordinates for display
pca_rep = np.sum(Z_val[i,:].reshape(150,1) * pc[:,:], axis=0).reshape(62,47)
plt.imshow(pca_rep, cmap='bone')
plt.axis('off')

plt.text(50, 5, s = names[y_val[i]], fontsize=16, color='b')
plt.text(50, 15, s = names[pred[0]], fontsize=16, color='r')

plt.show()
In [30]:
print( classification_report(y_val, pca_lr_mod.predict(Z_val), target_names=names))
                   precision    recall  f1-score   support

  Donald Rumsfeld       0.92      0.80      0.86        30
    George W Bush       0.90      0.94      0.92       100
Gerhard Schroeder       0.80      0.80      0.80        15
Junichiro Koizumi       1.00      0.88      0.93        16
       Tony Blair       0.85      0.88      0.86        32

        micro avg       0.89      0.89      0.89       193
        macro avg       0.89      0.86      0.87       193
     weighted avg       0.89      0.89      0.89       193

KNN (PCA)

In [31]:
%%time
pca_KNN_mod = KNeighborsClassifier(n_neighbors=12)
pca_KNN_mod.fit(Z_train, y_train)
print(pca_KNN_mod.score(Z_train, y_train))
print(pca_KNN_mod.score(Z_val, y_val))
0.6718547341115434
0.5906735751295337
Wall time: 281 ms

SVM (PCA)

In [32]:
%%time
pca_svm_mod = SVC(kernel='rbf', gamma=0.001, C=10)
pca_svm_mod.fit(Z_train, y_train)
print(pca_svm_mod.score(Z_train, y_train))
print(pca_svm_mod.score(Z_val, y_val))
0.9909208819714657
0.8756476683937824
Wall time: 328 ms
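
The values gamma=0.001 and C=10 were chosen by hand; a small grid search over the PCA features is the natural next step. A sketch, with an assumed parameter grid:

from sklearn.model_selection import GridSearchCV

# assumed grid; widen or refine as needed
param_grid = {'C': [1, 10, 100], 'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(Z_train, y_train)
print(grid.best_params_)
print(grid.score(Z_val, y_val))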