Lesson 05 - Feature Engineering: Polynomial Regression

The following topics are discussed in this notebook:

  • Using scikit-learn to add polynomial features to our model.
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

Generate Data

In [2]:
np.random.seed(1)
n = 40
x = np.random.uniform(-4, 4, n)
X = x.reshape(-1,1)
# True signal is a degree-7 polynomial; the noise is Gaussian with SD 2
y = 0.3 + 0.05 * x + 0.001 * x**7 + np.random.normal(0, 2, n)

plt.close()
plt.scatter(x,y)
plt.show()
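
Note that x is reshaped into a two-dimensional array before modeling: scikit-learn estimators expect a feature matrix with shape (n_samples, n_features) and will reject a 1-D array. A quick check of the shapes created above:

print(x.shape)  # (40,)   -- 1-D array of raw values
print(X.shape)  # (40, 1) -- 40 samples, 1 feature, as scikit-learn expects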
In [3]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=1)
print(X_train.shape)
print(X_val.shape)
(24, 1)
(16, 1)

Linear Model

In [4]:
mod1 = LinearRegression()
mod1.fit(X_train, y_train)

print('Training r-Squared:  ', mod1.score(X_train, y_train))
print('Validation r-Squared:', mod1.score(X_val, y_val))
Training r-Squared:   0.6382751443295913
Validation r-Squared: 0.5334020784605253
In [5]:
x_curve = np.linspace(-4, 4, 100)
y_curve = mod1.predict(x_curve.reshape(-1,1))

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)     # training points (blue)
plt.scatter(X_val.reshape(-1,), y_val, c='r')  # validation points (red)
plt.plot(x_curve, y_curve, c='darkorange')     # fitted line
plt.show()

Degree 3 Polynomial Model

In [6]:
from sklearn.preprocessing import PolynomialFeatures
In [7]:
poly = PolynomialFeatures(3)
Xp3_train = poly.fit_transform(X_train)
Xp3_val = poly.transform(X_val)  # fit on the training data only, then transform the rest
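To see what the transformation produces, note that for a single input column x, PolynomialFeatures(3) generates the columns [1, x, x^2, x^3], where the leading column of ones is the bias term included by default. A quick illustration:

demo = np.array([[2.0], [3.0]])
print(PolynomialFeatures(3).fit_transform(demo))
# [[ 1.  2.  4.  8.]
#  [ 1.  3.  9. 27.]]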
In [8]:
mod3 = LinearRegression()
mod3.fit(Xp3_train, y_train)

print('Training r-Squared:  ', mod3.score(Xp3_train, y_train))
print('Validation r-Squared:', mod3.score(Xp3_val, y_val))
Training r-Squared:   0.8228042419821454
Validation r-Squared: 0.8224250413792193
In [9]:
x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.transform(x_curve.reshape(-1,1))
y_curve = mod3.predict(xp_curve)

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.plot(x_curve, y_curve, c='darkorange')
plt.show()

Degree 5 Polynomial Model

In [10]:
poly = PolynomialFeatures(5)
Xp5_train = poly.fit_transform(X_train)
Xp5_val = poly.transform(X_val)
In [11]:
mod5 = LinearRegression()
mod5.fit(Xp5_train, y_train)

print('Training r-Squared:  ', mod5.score(Xp5_train, y_train))
print('Validation r-Squared:', mod5.score(Xp5_val, y_val))
Training r-Squared:   0.8508252764216662
Validation r-Squared: 0.8788413864576503
In [12]:
x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.transform(x_curve.reshape(-1,1))
y_curve = mod5.predict(xp_curve)

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.plot(x_curve, y_curve, c='darkorange')
plt.show()

Degree 7 Polynomial Model

In [13]:
poly = PolynomialFeatures(7)
Xp7_train = poly.fit_transform(X_train)
Xp7_val = poly.transform(X_val)
In [14]:
mod7 = LinearRegression()
mod7.fit(Xp7_train, y_train)

print('Training r-Squared:  ', mod7.score(Xp7_train, y_train))
print('Validation r-Squared:', mod7.score(Xp7_val, y_val))
Training r-Squared:   0.8728242410836946
Validation r-Squared: 0.12019555086909085
In [15]:
x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.transform(x_curve.reshape(-1,1))
y_curve = mod7.predict(xp_curve)

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.plot(x_curve, y_curve, c='darkorange')
plt.show()

Degree 9 Polynomial Model

In [16]:
poly = PolynomialFeatures(9)
Xp9_train = poly.fit_transform(X_train)
Xp9_val = poly.transform(X_val)
In [17]:
mod9 = LinearRegression()
mod9.fit(Xp9_train, y_train)

print('Training r-Squared:  ', mod9.score(Xp9_train, y_train))
print('Validation r-Squared:', mod9.score(Xp9_val, y_val))
Training r-Squared:   0.8899490344773457
Validation r-Squared: -0.10629843367512493
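Note that the validation r-squared of the degree-9 model is negative. This can happen because r-squared is computed as 1 - SS_res/SS_tot: a model whose squared prediction errors on the validation set exceed those of simply predicting the mean of y_val scores below zero. A minimal sketch of the calculation, using the arrays defined above:

y_pred = mod9.predict(Xp9_val)
ss_res = np.sum((y_val - y_pred)**2)        # residual sum of squares
ss_tot = np.sum((y_val - y_val.mean())**2)  # total sum of squares
print(1 - ss_res / ss_tot)                  # matches mod9.score(Xp9_val, y_val)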
In [18]:
x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.transform(x_curve.reshape(-1,1))
y_curve = mod9.predict(xp_curve)

plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.plot(x_curve, y_curve, c='darkorange')
plt.show()
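
The degree 3, 5, 7, and 9 cells above all repeat the same expand-fit-score pattern. As a sketch, that pattern could be wrapped in a small helper function (the name fit_poly_model is hypothetical, not part of the original lesson):

def fit_poly_model(degree, X_train, y_train, X_val, y_val):
    # Expand the features, fit on the training set, and return both scores.
    poly = PolynomialFeatures(degree)
    Xp_train = poly.fit_transform(X_train)
    Xp_val = poly.transform(X_val)
    mod = LinearRegression().fit(Xp_train, y_train)
    return mod.score(Xp_train, y_train), mod.score(Xp_val, y_val)

For example, fit_poly_model(3, X_train, y_train, X_val, y_val) should reproduce the degree-3 scores reported above. The loop in the Boston housing section below follows this same pattern.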

Polynomial Regression on Boston Housing Data

In [19]:
df = pd.read_csv('data/BostonHousingV2.txt', sep='\t')
df.head(n=10)
Out[19]:
town tract lon lat medv cmedv crim zn indus chas nox rm age dis rad tax ptratio b lstat
0 Nahant 2011 -70.9550 42.2550 24.0 24.0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
1 Swampscott 2021 -70.9500 42.2875 21.6 21.6 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
2 Swampscott 2022 -70.9360 42.2830 34.7 34.7 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
3 Marblehead 2031 -70.9280 42.2930 33.4 33.4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
4 Marblehead 2032 -70.9220 42.2980 36.2 36.2 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
5 Marblehead 2033 -70.9165 42.3040 28.7 28.7 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
6 Salem 2041 -70.9360 42.2970 22.9 22.9 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 395.60 12.43
7 Salem 2042 -70.9375 42.3100 27.1 22.1 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 396.90 19.15
8 Salem 2043 -70.9330 42.3120 16.5 16.5 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 386.63 29.93
9 Salem 2044 -70.9290 42.3160 18.9 18.9 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 386.71 17.10
In [20]:
# Features: rm (avg rooms per dwelling), ptratio (pupil-teacher ratio),
# and lstat (% lower-status population); target: cmedv (corrected median
# home value, in $1000s).
#Xb = df.iloc[:,6:].values
Xb = df.iloc[:,[11,16,18]].values
yb = df.iloc[:,5].values

print(Xb.shape)
print(yb.shape)
(506, 3)
(506,)
In [21]:
Xb_train, Xb_holdout, yb_train, yb_holdout = train_test_split(Xb, yb, test_size=0.2, random_state=1)
Xb_val, Xb_test, yb_val, yb_test = train_test_split(Xb_holdout, yb_holdout, test_size=0.5, random_state=1)

print(Xb_train.shape)
print(Xb_val.shape)
print(Xb_test.shape)
(404, 3)
(51, 3)
(51, 3)
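
This two-stage split sets aside 20% of the rows as a holdout set, then divides the holdout evenly between validation and test sets, giving roughly an 80/10/10 split. A quick sanity check:

# 404 training + 51 validation + 51 test rows account for all 506 observations
print(len(yb_train) + len(yb_val) + len(yb_test))  # 506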
In [22]:
tr_r2 = []
va_r2 = []

for d in range(1, 9):
    poly = PolynomialFeatures(d)
    Xp_train = poly.fit_transform(Xb_train)
    Xp_val = poly.transform(Xb_val)
    
    temp_mod = LinearRegression()
    temp_mod.fit(Xp_train, yb_train)

    print('--- Degree', d, '------------' )
    print('Training r-Squared:  ', temp_mod.score(Xp_train, yb_train))
    print('Validation r-Squared:', temp_mod.score(Xp_val, yb_val))
    print()
    
    tr_r2.append(temp_mod.score(Xp_train, yb_train))
    va_r2.append(temp_mod.score(Xp_val, yb_val))    
--- Degree 1 ------------
Training r-Squared:   0.6739085856095928
Validation r-Squared: 0.6411429375679718

--- Degree 2 ------------
Training r-Squared:   0.791004772377863
Validation r-Squared: 0.7717960434468277

--- Degree 3 ------------
Training r-Squared:   0.8062014308813019
Validation r-Squared: 0.7889563056381033

--- Degree 4 ------------
Training r-Squared:   0.8250294785058452
Validation r-Squared: 0.8151681064379985

--- Degree 5 ------------
Training r-Squared:   0.800313674903169
Validation r-Squared: 0.8063440702139141

--- Degree 6 ------------
Training r-Squared:   0.7183285206393024
Validation r-Squared: 0.6940996243983029

--- Degree 7 ------------
Training r-Squared:   0.85573390624061
Validation r-Squared: 0.48665501203587663

--- Degree 8 ------------
Training r-Squared:   0.8798923905981678
Validation r-Squared: 0.45722355558913286

In [23]:
plt.figure(figsize=(8,4))
plt.plot(range(1,9), tr_r2, label='Training')
plt.plot(range(1,9), va_r2, label='Validation')
plt.legend()
plt.show()
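
The validation curve peaks at degree 4 and falls off as higher-degree models overfit. The best degree can also be read off programmatically from the lists built above:

best_degree = np.argmax(va_r2) + 1  # +1 because the degrees start at 1
print(best_degree)                  # 4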

Create Degree 4 Model

In [24]:
poly = PolynomialFeatures(4)
Xb4_train = poly.fit_transform(Xb_train)
Xb4_val = poly.transform(Xb_val)
Xb4_test = poly.transform(Xb_test)

b4_mod = LinearRegression()
b4_mod.fit(Xb4_train, yb_train)

print('Training r-Squared:  ', b4_mod.score(Xb4_train, yb_train))
print('Validation r-Squared:', b4_mod.score(Xb4_val, yb_val))
print('Testing r-Squared:   ', b4_mod.score(Xb4_test, yb_test))
Training r-Squared:   0.8250294785058452
Validation r-Squared: 0.8151681064379985
Testing r-Squared:    0.8529511257737558