import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ClassificationPlotter import plot_regions
The K-Nearest Neighbors (KNN) classification algorithm classifies a new observation by allowing the $K$ observations in the training set that are nearest to the new observation to "vote" on its class. More specifically, the algorithm works as follows:

1. Find the $K$ observations in the training set that are nearest to the new observation.
2. Assign the new observation to the class that is most common among these $K$ nearest neighbors.

Consider the dataset shown below. Attempt to determine the class of the new observation, indicated by an X, as predicted by a KNN classifier with $K=1,2,3,4,5,6$.
np.random.seed(391)
X = np.random.uniform(0,10,24).reshape(12,2)
y = np.array(['red']*6 + ['blue']*6)
xticks = np.linspace(0, 10, 100)
yticks = np.linspace(0, 10, 100)
xgrid, ygrid = np.meshgrid(xticks, yticks)
# Distance from each grid point to the new observation at (6, 4); the contours
# drawn below are circles centered at the new point, to help judge which
# training points are nearest.
Z = ((xgrid - 6)**2 + (ygrid - 4)**2)**0.5
plt.close()
plt.rcParams["figure.figsize"] = [8,6]
plt.scatter(X[:6,0],X[:6,1], c='r')
plt.scatter(X[6:,0],X[6:,1], c='b')
plt.scatter([6],[4], c='k', marker='X', s=200)
plt.contour(xgrid, ygrid, Z)
plt.show()
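After you have attempted the exercise, one way to check your answers (a quick sketch, not part of the original exercise) is to list the training points from nearest to farthest from the new observation and count the labels among the first $K$ rows for each value of $K$.
# Distances from each training point to the new observation at (6, 4), listed from
# nearest to farthest along with the point's class. Counting the red and blue labels
# among the first K rows gives the KNN vote for each K.
dist_to_new = np.sqrt((X[:, 0] - 6)**2 + (X[:, 1] - 4)**2)
for dist, label in sorted(zip(dist_to_new, y)):
    print(round(dist, 2), label)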
We will now sketch out how to implement the KNN classification algorithm ourselves. First, we provide an example illustrating NumPy's argsort function, which returns the indices that would sort an array and will be useful in our implementation of the KNN algorithm.
my_array = [7.8, 5.7, 4.8, 1.3, 6.4]
print(np.sort(my_array))     # the sorted values:  [1.3 4.8 5.7 6.4 7.8]
print(np.argsort(my_array))  # the indices that would sort the array: [3 2 4 1 0]
We will now illustrate how to use base Python and NumPy to perform KNN classification.
P = [6,4]
K = 5
# A loop-based version of the distance calculation, kept for reference:
# distances = []
# n = len(y)
# for i in range(n):
#     d = (P[0] - X[i,0])**2 + (P[1] - X[i,1])**2
#     distances.append(d)
# distances = np.array(distances)

# Vectorized version: squared Euclidean distance from P to each training point.
distances = np.sum((X - P)**2, axis=1)
idx = np.argsort(distances)[:K]      # indices of the K nearest neighbors
knn_y = y[idx]                       # classes of the K nearest neighbors
knn_d = distances[idx]               # their (squared) distances

# Tally the votes, and total the distances for each class in case of a tie.
n_blue = np.sum(knn_y == 'blue')
n_red = np.sum(knn_y == 'red')
dist_blue = np.sum(knn_d[knn_y == 'blue'])
dist_red = np.sum(knn_d[knn_y == 'red'])
if n_red > n_blue:
    print('The predicted class for the point', P, 'is \'red\'.')
elif n_blue > n_red:
    print('The predicted class for the point', P, 'is \'blue\'.')
elif dist_red < dist_blue:
    print('The predicted class for the point', P, 'is \'red\'.')
else:
    print('The predicted class for the point', P, 'is \'blue\'.')
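If we wanted to reuse this procedure for other query points, it could be wrapped in a small function. The sketch below (the name knn_predict and its signature are our own, not part of the lesson) generalizes the steps above to an arbitrary query point and value of K, breaking ties in favor of the class whose neighbors are closer in total distance, just as above.
def knn_predict(X_train, y_train, point, K):
    # Squared Euclidean distances from the query point to every training point.
    d2 = np.sum((X_train - np.array(point))**2, axis=1)
    nn = np.argsort(d2)[:K]                      # indices of the K nearest neighbors
    classes, votes = np.unique(y_train[nn], return_counts=True)
    winners = classes[votes == votes.max()]
    if len(winners) == 1:                        # clear majority
        return winners[0]
    # Tie: favor the class whose neighbors are closer in total (squared) distance.
    totals = [np.sum(d2[nn][y_train[nn] == c]) for c in winners]
    return winners[int(np.argmin(totals))]

print(knn_predict(X, y, P, K))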
We will now see several examples of using scikit-learn to construct KNN classifiers.
from sklearn.neighbors import KNeighborsClassifier
# Experiment with different values of K and p
# (with the default Minkowski metric, p=2 gives Euclidean distance and p=1 gives Manhattan distance)
mod = KNeighborsClassifier(n_neighbors=3, p=2)
mod.fit(X,y)
plot_regions(mod, X, y, num_ticks=500)
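As a quick sanity check (a sketch using the model fit above), we can ask the fitted scikit-learn model to classify the same point we classified by hand, and to report which training points it considers the nearest neighbors:
# Predict the class of the point (6, 4) and inspect its nearest neighbors;
# kneighbors returns the distances to, and indices of, the closest training points.
print(mod.predict(np.array([[6, 4]])))
print(mod.kneighbors(np.array([[6, 4]])))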
The example below explores how the choice of $K$ affects the KNN algorithm. Notice that for small values of $K$, the decision boundary is jagged and conforms to the training data in very specific ways. When $K$ is large, the decision boundary is smoother and adapts more to the "global" structure of the training data than to local eccentricities. As a consequence, small values of $K$ are more likely to result in overfitting, while large values of $K$ are more prone to producing models that underfit.
import ipywidgets
df = pd.read_table('Data/knn_example.txt', sep='\t')
X = df.iloc[:,:2].values
y = df.iloc[:,2].values
def knn_example(K):
    mod = KNeighborsClassifier(K)
    mod.fit(X, y)
    plot_regions(mod, X, y)
slider = ipywidgets.IntSlider(min=1,max=400,step=1,value=1,continuous_update=False)
_ = ipywidgets.interact(knn_example, K=slider)
As with all model hyperparameters, we use a validation set to select the value of $K$ to be used in a KNN model.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 1)
train_acc = []
val_acc = []
for i in range(1, 101):
    knn = KNeighborsClassifier(i)
    knn.fit(X_train, y_train)
    train_acc.append(knn.score(X_train, y_train))
    val_acc.append(knn.score(X_val, y_val))
plt.close()
plt.rcParams["figure.figsize"] = [8,6]
plt.plot(range(1, 101), train_acc, label="Training Accuracy")
plt.plot(range(1, 101), val_acc, label="Validation Accuracy")
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
print('\n')
print('Maximum validation accuracy:', np.max(val_acc))
print('Maximum validation accuracy occurs at K =', np.argmax(val_acc) + 1)
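A single train/validation split can be somewhat noisy. One common alternative (sketched below using scikit-learn's cross_val_score; this is not part of the original walkthrough) is to average the accuracy over several folds for each candidate value of K:
from sklearn.model_selection import cross_val_score

# Average 5-fold cross-validated accuracy for K = 1, ..., 100.
cv_acc = []
for i in range(1, 101):
    scores = cross_val_score(KNeighborsClassifier(i), X, y, cv=5)
    cv_acc.append(scores.mean())

print('Best K by 5-fold cross-validation:', np.argmax(cv_acc) + 1)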
np.random.seed(874)
%run -i snippets/snippet07.py
%run -i snippets/snippet08.py
%run -i snippets/snippet09.py
KNN is easily adapted to multiclass classification problems.
import sklearn.datasets as skds

np.random.seed(9)
X, y = skds.make_classification(n_samples=1000, n_classes=4,
                                n_features=2, n_redundant=0,
                                n_informative=2, n_clusters_per_class=1)
# Experiment with different values of K
knn = KNeighborsClassifier(3)
knn.fit(X, y)
plot_regions(knn, X, y, 500, fig_size=[12,8])
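For a fitted KNN classifier, the estimated class probabilities are simply the fractions of the K neighbors that belong to each class. A brief sketch using the model fit above:
# Class predictions and neighbor-vote proportions for the first five points;
# each row of predict_proba sums to 1 and has one column per class.
print(knn.predict(X[:5]))
print(knn.predict_proba(X[:5]))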
It is generally necessary to scale (standardize or normalize) our features before applying the KNN algorithm, since the distance calculations are otherwise dominated by the features with the largest ranges. The importance of this is illustrated in the next example.
x1 = np.array([1, 2, 3, 4, 5, 6, 7])
x2 = np.array([0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1])
X = np.hstack([x1.reshape(7,1), x2.reshape(7,1)])
y = np.array([0,1,0,1,0,1,0])
plt.close()
plt.rcParams["figure.figsize"] = [8,6]
plt.scatter(x1[y==0], x2[y==0], c='b', edgecolors='k', s=80)
plt.scatter(x1[y==1], x2[y==1], c='r', edgecolors='k', s=80)
plt.show()
knn = KNeighborsClassifier(3)
knn.fit(X,y)
plot_regions(knn, X, y)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
Xs = scaler.fit_transform(X)
knn = KNeighborsClassifier(3)
knn.fit(Xs,y)
plot_regions(knn, Xs, y)
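To avoid accidentally fitting a model on unscaled features, one convenient pattern (a sketch, not part of the original lesson) is to bundle the scaler and the classifier into a scikit-learn Pipeline, so that scaling is always applied before the distance calculations, both when fitting and when predicting:
from sklearn.pipeline import Pipeline

# The pipeline applies MinMaxScaler to the features before handing them to the
# KNN classifier, during both fit and predict.
pipe = Pipeline([('scale', MinMaxScaler()), ('knn', KNeighborsClassifier(3))])
pipe.fit(X, y)
print(pipe.predict(X))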
Pros
Cons
def curse_of_dim(d):
    # Draw 10,000 random pairs of points from the d-dimensional unit hypercube and
    # plot the distribution of the distances between paired points, scaled by the
    # largest possible distance (the diagonal length, sqrt(d)). As d grows, the
    # scaled distances concentrate around a common value, so the "nearest" neighbors
    # of a point are barely nearer than typical points -- one aspect of the curse of
    # dimensionality that degrades KNN in high dimensions.
    n_pairs = 10000
    P = np.random.uniform(0, 1, d*n_pairs).reshape(n_pairs, d)
    Q = np.random.uniform(0, 1, d*n_pairs).reshape(n_pairs, d)
    dist = np.sum((P - Q)**2, axis=1)**0.5 / d**0.5

    plt.figure(figsize=[8,6])
    plt.hist(dist, bins=20)
    plt.xlim([0,1])
    plt.show()
_ = ipywidgets.interact(curse_of_dim,
d=ipywidgets.IntSlider(min=1,max=1000,step=1,value=1,continuous_update=False))