Lesson 05 - Feature Engineering: Polynomial Regression¶

The following topics are discussed in this notebook:¶

Using Scikit-Learn to add polynomial features into our model.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

Generate Data¶

np.random.seed(1)
n = 40
x = np.random.uniform(-4, 4, n)
X = x.reshape(-1,1)
y =  0.3 + 0.05 * x + 0.001 * x**7 + np.random.normal(0, 2, n)

plt.close()
plt.scatter(x,y)
plt.show()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=1)
print(X_train.shape)
print(X_val.shape)

(24, 1)
(16, 1)

Linear Model¶

mod1 = LinearRegression()
mod1.fit(X_train, y_train)

print('Training r-Squared:', mod1.score(X_train, y_train))
print('Testing r-Squared: ', mod1.score(X_val, y_val))

Training r-Squared: 0.6382751443295913
Testing r-Squared:  0.5334020784605253

x_curve = np.linspace(-4, 4, 100)
y_curve = mod1.predict(x_curve.reshape(-1,1))

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.plot(x_curve, y_curve, c='darkorange')
plt.show()

Degree 3 Polynomial Model¶

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(3)
Xp3_train = poly.fit_transform(X_train)
Xp3_val = poly.fit_transform(X_val)

mod3 = LinearRegression()
mod3.fit(Xp3_train, y_train)

print('Training r-Squared:', mod3.score(Xp3_train, y_train))
print('Testing r-Squared: ', mod3.score(Xp3_val, y_val))

Training r-Squared: 0.8228042419821454
Testing r-Squared:  0.8224250413792193

x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.fit_transform(x_curve.reshape(-1,1))
y_curve = mod3.predict(xp_curve)

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.plot(x_curve, y_curve, c='darkorange')
plt.show()

Degree 5 Polynomial Model¶

poly = PolynomialFeatures(5)
Xp5_train = poly.fit_transform(X_train)
Xp5_val = poly.fit_transform(X_val)

mod5 = LinearRegression()
mod5.fit(Xp5_train, y_train)

print('Training r-Squared:', mod5.score(Xp5_train, y_train))
print('Testing r-Squared: ', mod5.score(Xp5_val, y_val))

Training r-Squared: 0.8508252764216662
Testing r-Squared:  0.8788413864576503

x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.fit_transform(x_curve.reshape(-1,1))
y_curve = mod5.predict(xp_curve)

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.plot(x_curve, y_curve, c='darkorange')
plt.show()

Degree 7 Polynomial Model¶

poly = PolynomialFeatures(7)
Xp7_train = poly.fit_transform(X_train)
Xp7_val = poly.fit_transform(X_val)

mod7 = LinearRegression()
mod7.fit(Xp7_train, y_train)

print('Training r-Squared:', mod7.score(Xp7_train, y_train))
print('Testing r-Squared: ', mod7.score(Xp7_val, y_val))

Training r-Squared: 0.8728242410836946
Testing r-Squared:  0.12019555086909085

x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.fit_transform(x_curve.reshape(-1,1))
y_curve = mod7.predict(xp_curve)

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.plot(x_curve, y_curve, c='darkorange')
plt.show()

Degree 9 Polynomial Model¶

poly = PolynomialFeatures(9)
Xp9_train = poly.fit_transform(X_train)
Xp9_val = poly.fit_transform(X_val)

mod9 = LinearRegression()
mod9.fit(Xp9_train, y_train)

print('Training r-Squared:', mod9.score(Xp9_train, y_train))
print('Testing r-Squared: ', mod9.score(Xp9_val, y_val))

Training r-Squared: 0.8899490344773457
Testing r-Squared:  -0.10629843367512493

x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.fit_transform(x_curve.reshape(-1,1))
y_curve = mod9.predict(xp_curve)

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.plot(x_curve, y_curve, c='darkorange')
plt.show()

Polynomial Regression on Boston Housing Data¶

df = pd.read_csv('data/BostonHousingV2.txt', sep='\t')
df.head(n=10)

#Xb = df.iloc[:,6:].values
Xb = df.iloc[:,[11,16,18]].values
yb = df.iloc[:,5].values

print(Xb.shape)
print(yb.shape)

(506, 3)
(506,)

Xb_train, Xb_holdout, yb_train, yb_holdout = train_test_split(Xb, yb, test_size=0.2, random_state=1)
Xb_val, Xb_test, yb_val, yb_test = train_test_split(Xb_holdout, yb_holdout, test_size=0.5, random_state=1)

print(Xb_train.shape)
print(Xb_val.shape)
print(Xb_test.shape)

(404, 3)
(51, 3)
(51, 3)

tr_r2 = []
va_r2 = []

for d in range(1, 9):
    poly = PolynomialFeatures(d)
    Xp_train = poly.fit_transform(Xb_train)
    Xp_val = poly.fit_transform(Xb_val)
    
    temp_mod = LinearRegression()
    temp_mod.fit(Xp_train, yb_train)

    print('--- Degree', d, '------------' )
    print('Training r-Squared:  ', temp_mod.score(Xp_train, yb_train))
    print('Validation r-Squared:', temp_mod.score(Xp_val, yb_val))
    print()
    
    tr_r2.append(temp_mod.score(Xp_train, yb_train))
    va_r2.append(temp_mod.score(Xp_val, yb_val))

--- Degree 1 ------------
Training r-Squared:   0.6739085856095928
Validation r-Squared: 0.6411429375679718

--- Degree 2 ------------
Training r-Squared:   0.791004772377863
Validation r-Squared: 0.7717960434468277

--- Degree 3 ------------
Training r-Squared:   0.8062014308813019
Validation r-Squared: 0.7889563056381033

--- Degree 4 ------------
Training r-Squared:   0.8250294785058452
Validation r-Squared: 0.8151681064379985

--- Degree 5 ------------
Training r-Squared:   0.800313674903169
Validation r-Squared: 0.8063440702139141

--- Degree 6 ------------
Training r-Squared:   0.7183285206393024
Validation r-Squared: 0.6940996243983029

--- Degree 7 ------------
Training r-Squared:   0.85573390624061
Validation r-Squared: 0.48665501203587663

--- Degree 8 ------------
Training r-Squared:   0.8798923905981678
Validation r-Squared: 0.45722355558913286

plt.figure(figsize=(8,4))
plt.plot(range(1,9), tr_r2, label='Training')
plt.plot(range(1,9), va_r2, label='Validation')
plt.legend()
plt.show()

Create Degree 4 Model¶

poly = PolynomialFeatures(4)
Xb4_train = poly.fit_transform(Xb_train)
Xb4_val = poly.fit_transform(Xb_val)
Xb4_test = poly.fit_transform(Xb_test)
    
b4_mod = LinearRegression()
b4_mod.fit(Xb4_train, yb_train)

print('Training r-Squared:  ', b4_mod.score(Xb4_train, yb_train))
print('Validation r-Squared:', b4_mod.score(Xb4_val, yb_val))
print('Testing r-Squared:   ', b4_mod.score(Xb4_test, yb_test))

Training r-Squared:   0.8250294785058452
Validation r-Squared: 0.8151681064379985
Testing r-Squared:    0.8529511257737558

	town	tract	lon	lat	medv	cmedv	crim	zn	indus	nox	rm	age	dis	rad	tax	ptratio	b	lstat
0	Nahant	2011	-70.9550	42.2550	24.0	24.0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98
1	Swampscott	2021	-70.9500	42.2875	21.6	21.6	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14
2	Swampscott	2022	-70.9360	42.2830	34.7	34.7	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03
3	Marblehead	2031	-70.9280	42.2930	33.4	33.4	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94
4	Marblehead	2032	-70.9220	42.2980	36.2	36.2	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33
5	Marblehead	2033	-70.9165	42.3040	28.7	28.7	0.02985	0.0	2.18	0.458	6.430	58.7	6.0622	3	222	18.7	394.12	5.21
6	Salem	2041	-70.9360	42.2970	22.9	22.9	0.08829	12.5	7.87	0.524	6.012	66.6	5.5605	5	311	15.2	395.60	12.43
7	Salem	2042	-70.9375	42.3100	27.1	22.1	0.14455	12.5	7.87	0.524	6.172	96.1	5.9505	5	311	15.2	396.90	19.15
8	Salem	2043	-70.9330	42.3120	16.5	16.5	0.21124	12.5	7.87	0.524	5.631	100.0	6.0821	5	311	15.2	386.63	29.93
9	Salem	2044	-70.9290	42.3160	18.9	18.9	0.17004	12.5	7.87	0.524	6.004	85.9	6.5921	5	311	15.2	386.71	17.10