import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
Regularization refers to a set of techniques used in supervised learning to reduce the likelihood of model overfitting. In this lesson, we will consider L1 and L2 regularization, which penalize a model for being overly complex.
The following cell generates a dataset to be used in a regression task. The dataset contains a single feature and exhibits a nonlinear relationship between the feature and the target.
np.random.seed(1)
n = 40
x = np.random.uniform(-4, 4, n)
X = x.reshape(-1,1)
y = 0.3 + 0.05 * x + 0.001 * x**7 + np.random.normal(0, 2, n)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=1)
print('X_train shape:', X_train.shape)
print('X_val shape:  ', X_val.shape)
plt.close()
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.show()
We will fit polynomial models of degrees 1, 3, 5, 7, 9, and 11 to this dataset.
from sklearn.preprocessing import PolynomialFeatures
degree = [1, 3, 5, 7, 9, 11]
x_curve = np.linspace(-4, 4, 100)
tr_score = []
va_score = []
plt.figure(figsize=[12,8])
for i in range(len(degree)):
    # Expand the single feature into polynomial features of the current degree.
    poly = PolynomialFeatures(degree[i])
    Xp_train = poly.fit_transform(X_train)
    Xp_val = poly.transform(X_val)

    mod = LinearRegression()
    mod.fit(Xp_train, y_train)
    tr_score.append(mod.score(Xp_train, y_train))
    va_score.append(mod.score(Xp_val, y_val))

    # Plot the fitted curve over the training and validation points.
    xp_curve = poly.fit_transform(x_curve.reshape(-1,1))
    y_curve = mod.predict(xp_curve)
    plt.subplot(2,3,i+1)
    plt.scatter(X_train.reshape(-1,), y_train)
    plt.scatter(X_val.reshape(-1,), y_val, c='r')
    plt.ylim([-20,20])
    plt.plot(x_curve, y_curve, c='darkgreen')
plt.show()
for i in range(len(degree)):
    print('-- Degree', degree[i], '--')
    print('Training r2: ', tr_score[i])
    print('Validation r2:', va_score[i], '\n')
L2 regularization is performed by adding an additional term to the loss function. This new term applies a penalty to the loss for having larger coefficients in the model. As such, it can be thought of as penalizing model complexity.
The loss function used in L2 regularization is:
$$\Large Loss = SSE + \alpha \sum_{i=1}^n \hat\beta_{i} ^2$$
Note that the penalty term does not apply to the intercept (bias) term in the model. The constant $\alpha$ is a model hyperparameter that we are free to choose.
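To make the penalty term concrete, here is a minimal sketch (not part of this lesson's pipeline) that computes the penalized loss by hand for a ridge model fit to a small synthetic dataset; the demo_* names and the generated data are illustrative assumptions. sklearn's Ridge minimizes this same objective and leaves the intercept unpenalized.
# A minimal sketch computing the L2-penalized loss by hand.
# The demo_* names and the data below are illustrative assumptions.
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
demo_X = rng.normal(size=(50, 3))
demo_y = demo_X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.5, size=50)

demo_alpha = 0.1
demo_mod = Ridge(alpha=demo_alpha).fit(demo_X, demo_y)

sse = np.sum((demo_y - demo_mod.predict(demo_X)) ** 2)    # sum of squared errors
penalty = demo_alpha * np.sum(demo_mod.coef_ ** 2)        # excludes the intercept
print('SSE:           ', sse)
print('L2 penalty:    ', penalty)
print('Penalized loss:', sse + penalty)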
It is important to scale the features prior to applying L2 regularization. Because the penalty depends on the sizes of the coefficients, features measured on very different scales would otherwise be penalized unevenly.
from sklearn.preprocessing import StandardScaler
poly = PolynomialFeatures(11)
Xp_train = poly.fit_transform(X_train)
Xp_val = poly.transform(X_val)
scaler = StandardScaler()
scaler.fit(Xp_train)
Xps_train = scaler.transform(Xp_train)
Xps_val = scaler.transform(Xp_val)
print(Xps_train.shape)
print(Xps_val.shape)
We can use tools from sklearn to implement an L2-regularized regression (or Ridge) model.
When training a ridge model, we must select a value for $\alpha$. We will arbitrarily set that to 0.01 in our example.
from sklearn.linear_model import Ridge
L2_mod = Ridge(alpha=0.01)
L2_mod.fit(Xps_train, y_train)
print('Training r2: ', L2_mod.score(Xps_train, y_train))
print('Validation r2:', L2_mod.score(Xps_val, y_val))
The plot below shows the fit resulting from our Ridge model.
plt.figure(figsize=[6,4])
x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.fit_transform(x_curve.reshape(-1,1))
xps_curve = scaler.transform(xp_curve)
y_curve = L2_mod.predict(xps_curve)
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.ylim([-20,20])
plt.plot(x_curve, y_curve, c='darkgreen')
plt.show()
We got decent (but not great) results for our choice of $\alpha$. Perhaps we could do better with a different value of this hyperparameter. We will write a loop to consider several values of $\alpha$, comparing the resulting models based on their validation scores.
tr_score = []
va_score = []
exp_list = np.linspace(-3,10, 100)
for k in exp_list:
    temp_mod = Ridge(alpha=10**k)
    temp_mod.fit(Xps_train, y_train)
    tr_score.append(temp_mod.score(Xps_train, y_train))
    va_score.append(temp_mod.score(Xps_val, y_val))
plt.figure(figsize=([6,4]))
plt.plot(exp_list, tr_score, label='Training Score')
plt.plot(exp_list, va_score, label='Validation Score')
plt.legend()
plt.show()
idx = np.argmax(va_score)
print(exp_list[idx])
# Refit using (approximately) the best exponent for alpha found above.
L2_mod_2 = Ridge(alpha=10**(-0.9))
L2_mod_2.fit(Xps_train, y_train)
print('Training r2: ', L2_mod_2.score(Xps_train, y_train))
print('Validation r2:', L2_mod_2.score(Xps_val, y_val))
plt.figure(figsize=[6,4])
x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.fit_transform(x_curve.reshape(-1,1))
xps_curve = scaler.transform(xp_curve)
y_curve = L2_mod_2.predict(xps_curve)
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.ylim([-20,20])
plt.plot(x_curve, y_curve, c='darkgreen')
plt.show()
Since L2 regularization penalizes models for having large coefficients, selecting larger values of $\alpha$ will generally result in models with smaller coefficients. The plot below visualizes the effects on the coefficient estimates as we increase the value of alpha.
coef_list = []
exp_list = np.linspace(-3,0, 100)
for k in exp_list:
    temp_mod = Ridge(alpha=10**k)
    temp_mod.fit(Xps_train, y_train)
    coef_list.append(temp_mod.coef_)
coef_array = np.array(coef_list).transpose()
plt.figure(figsize=([12,8]))
for i in range(coef_array.shape[0]):
    # Row i of coef_array holds the coefficient on the degree-i polynomial feature.
    plt.plot(exp_list, coef_array[i, :], label='Beta' + str(i))
plt.legend()
plt.title('Effect of Alpha on Model Coefficients')
plt.xlabel('log(alpha)')
plt.ylabel('Coefficient Estimate')
plt.show()
L1 regularization is conceptually similar to L2 regularization, but instead of squaring the model coefficients in the penalty term, we take their absolute values.
The loss function used in L1 regularization is:
$$\Large Loss = SSE + \alpha \sum_{i=1}^n \left|\hat\beta_{i} \right|$$
As before, the penalty term does not apply to the intercept (bias) term in the model.
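As with the L2 case, here is a minimal sketch computing the L1 penalty term by hand for a lasso model fit to a small synthetic dataset; the demo_* names and data are illustrative assumptions. One detail worth noting: sklearn's Lasso divides the error term by 2 * n_samples in its objective, so its alpha is on a slightly different scale than in the formula above.
# A minimal sketch computing the L1 penalty term by hand.
# The demo_* names and the data below are illustrative assumptions.
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
demo_X = rng.normal(size=(50, 3))
demo_y = demo_X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.5, size=50)

demo_alpha = 0.1
demo_mod = Lasso(alpha=demo_alpha).fit(demo_X, demo_y)

penalty = demo_alpha * np.sum(np.abs(demo_mod.coef_))   # excludes the intercept
print('Coefficients:', demo_mod.coef_)
print('L1 penalty:  ', penalty)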
We can use tools from sklearn to implement an L1-regularized regression (or Lasso) model.
As with ridge regression, when training a lasso model, we must select a value for $\alpha$. We will arbitrarily set that to 1.5 in our example.
from sklearn.linear_model import Lasso
L1_mod = Lasso(alpha=1.5)
L1_mod.fit(Xps_train, y_train)
print('Training r2: ', L1_mod.score(Xps_train, y_train))
print('Validation r2:', L1_mod.score(Xps_val, y_val))
plt.figure(figsize=[6,4])
x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.fit_transform(x_curve.reshape(-1,1))
xps_curve = scaler.transform(xp_curve)
y_curve = L1_mod.predict(xps_curve)
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.ylim([-20,20])
plt.plot(x_curve, y_curve, c='darkgreen')
plt.show()
We will write a loop to consider several values of $\alpha$, comparing the resulting models based on their validation scores.
tr_score = []
va_score = []
exp_list = np.linspace(-3,1, 100)
for k in exp_list:
    # Use an integer for max_iter; recent versions of sklearn expect an int here.
    temp_mod = Lasso(alpha=10**k, max_iter=int(1e8))
    temp_mod.fit(Xps_train, y_train)
    tr_score.append(temp_mod.score(Xps_train, y_train))
    va_score.append(temp_mod.score(Xps_val, y_val))
plt.figure(figsize=([6,4]))
plt.plot(exp_list, tr_score, label='Training Score')
plt.plot(exp_list, va_score, label='Validation Score')
plt.legend()
plt.show()
idx = np.argmax(va_score)
print(exp_list[idx])
# Refit using (approximately) the best exponent for alpha found above.
L1_mod_2 = Lasso(alpha=10**(-2.1))
L1_mod_2.fit(Xps_train, y_train)
print('Training r2: ', L1_mod_2.score(Xps_train, y_train))
print('Validation r2:', L1_mod_2.score(Xps_val, y_val))
plt.figure(figsize=[6,4])
x_curve = np.linspace(-4, 4, 100)
xp_curve = poly.fit_transform(x_curve.reshape(-1,1))
xps_curve = scaler.transform(xp_curve)
y_curve = L1_mod_2.predict(xps_curve)
plt.scatter(X_train.reshape(-1,), y_train)
plt.scatter(X_val.reshape(-1,), y_val, c='r')
plt.ylim([-20,20])
plt.plot(x_curve, y_curve, c='darkgreen')
plt.show()
In L1 regularization, as in L2 regularization, selecting larger values of $\alpha$ applies a larger penalty to large coefficients and thus pushes the coefficient estimates toward 0. L1 regularization is somewhat more aggressive in this respect, however: for sufficiently large values of $\alpha$, some model coefficients will actually be set to zero, effectively removing the corresponding features from the model. As a result, L1 regularization can be thought of as performing a type of automated variable selection. The plot below visualizes the coefficient estimates as $\alpha$ increases, and a quick numeric check follows the plot.
coef_list = []
exp_list = np.linspace(-3,-1, 100)
for k in exp_list:
    # Use an integer for max_iter; recent versions of sklearn expect an int here.
    temp_mod = Lasso(alpha=10**k, max_iter=int(1e8))
    temp_mod.fit(Xps_train, y_train)
    coef_list.append(temp_mod.coef_)
coef_array = np.array(coef_list).transpose()
plt.figure(figsize=([12,8]))
for i in range(coef_array.shape[0]):
    # Row i of coef_array holds the coefficient on the degree-i polynomial feature.
    plt.plot(exp_list, coef_array[i, :], label='Beta' + str(i))
plt.legend()
plt.title('Effect of Alpha on Model Coefficients')
plt.xlabel('log(alpha)')
plt.ylabel('Coefficient Estimate')
plt.show()
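As a quick numeric check of the variable-selection behavior described above, we can count how many coefficients each of our two lasso models set exactly to zero. The exact counts will depend on the data and the values of $\alpha$ used above.
# Count the polynomial-feature coefficients that each lasso model zeroed out.
# Larger values of alpha should zero out more of them.
for name, mod in [('L1_mod (alpha=1.5)', L1_mod), ('L1_mod_2 (alpha=10**-2.1)', L1_mod_2)]:
    n_zero = np.sum(mod.coef_ == 0)
    print(name, '- coefficients set to zero:', n_zero, 'of', mod.coef_.size)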