library(glmnet)    # glmnet() and cv.glmnet() for ridge and LASSO
library(reshape2)  # melt() for reshaping coefficient paths
library(ggplot2)   # coefficient-path plots

set.seed(125)
n <- 20
x <- runif(n, -4, 4)
y <- 3 + 0.5 * x + 0.01 * x^7 + rnorm(n, 0, 10)
plot(y ~ x)
sel <- sample(1:length(x), 0.7*length(x))
x_train <- x[sel]
x_test <- x[-sel]
y_train <- y[sel]
y_test <- y[-sel]
c(length(x_train), length(x_test))
[1] 14 6
X <- x_train  # rename so predict() can be handed new data through a column called X
poly_mod <- lm(y_train ~ X + I(X^2) + I(X^3) + I(X^4) + I(X^5) + I(X^6) + I(X^7) + I(X^8) + I(X^9))
summary(poly_mod)
Call:
lm(formula = y_train ~ X + I(X^2) + I(X^3) + I(X^4) + I(X^5) +
I(X^6) + I(X^7) + I(X^8) + I(X^9))
Residuals:
1 2 3 4 5 6 7 8 9 10 11
0.001675 12.072649 -0.044082 -0.936949 0.281054 -0.012450 -9.652762 1.134774 -10.166205 0.021866 -2.723524
12 13 14
4.438837 8.860344 -3.275227
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 13.94676 9.33858 1.493 0.2096
X 41.38780 18.90707 2.189 0.0938 .
I(X^2) -14.18574 19.03735 -0.745 0.4976
I(X^3) -52.71864 26.24965 -2.008 0.1150
I(X^4) 1.60472 9.88913 0.162 0.8790
I(X^5) 17.33773 9.63701 1.799 0.1464
I(X^6) 0.16687 1.35439 0.123 0.9079
I(X^7) -1.84734 1.10405 -1.673 0.1696
I(X^8) -0.01302 0.05115 -0.255 0.8116
I(X^9) 0.06175 0.03826 1.614 0.1818
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.73 on 4 degrees of freedom
Multiple R-squared: 0.9855, Adjusted R-squared: 0.9529
F-statistic: 30.21 on 9 and 4 DF, p-value: 0.002514
xg <- seq(-4, 4, length=100)  # x grid for drawing fitted curves
yg_poly <- predict(poly_mod, data.frame(X=xg))
y_train_pred_poly <- predict(poly_mod, data.frame(X = x_train))
y_test_pred_poly <- predict(poly_mod, data.frame(X = x_test))
plot(y_train ~ x_train, ylim=c(-80, 140), xlab="x", ylab="y", main="Polynomial Fit")
lines(xg, yg_poly)
segments(x_train, y_train, x_train, y_train_pred_poly)
points(y_test ~ x_test, col="red")
segments(x_test, y_test, x_test, y_test_pred_poly, col="red")
sse <- sum((y_train - y_train_pred_poly)^2)
sst <- sum((y_train - mean(y_train))^2)
training_rsq <- 1 - sse / sst
sse <- sum((y_test - y_test_pred_poly)^2)
sst <- sum((y_test - mean(y_test))^2)
testing_rsq <- 1 - sse / sst
poly_rsq <- c(training_rsq, testing_rsq)
names(poly_rsq) <- c('Training', 'Testing')
poly_rsq
Training Testing
0.9855016 -0.1894984
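A negative testing R-squared means the degree-9 polynomial predicts the held-out points worse than simply using their mean: textbook overfitting. Since the same SSE/SST arithmetic repeats for every model below, it could be condensed into a small helper; the function name rsq is an illustrative choice, not part of the original code.

# Hypothetical helper (`rsq` is an illustrative name): R-squared of
# predictions against observations, identical to the SSE/SST steps above.
rsq <- function(obs, pred) 1 - sum((obs - pred)^2) / sum((obs - mean(obs))^2)

# Reproduces the values above:
c(Training = rsq(y_train, y_train_pred_poly), Testing = rsq(y_test, y_test_pred_poly))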
Ridge regression. Goal: minimize \(\mathrm{Loss}\left(\hat\beta_0, \dots, \hat\beta_p\right) = \frac{1}{n}\,\mathrm{SSE} + \lambda \sum_{i=1}^{p} \hat\beta_i^2\)
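For concreteness, this textbook objective can be evaluated by hand. The sketch below (the name ridge_loss is mine, not from the original) implements the formula exactly as written; note that glmnet minimizes a slightly different scaling of the same objective and standardizes the predictors by default, so its internal values will not match.

# Minimal sketch of the ridge objective above; `ridge_loss` is an
# illustrative name. X is a predictor matrix without an intercept column.
ridge_loss <- function(beta, X, y, lambda) {
  yhat <- cbind(1, X) %*% beta      # beta[1] is the intercept
  sum((y - yhat)^2) / length(y) +   # (1/n) * SSE
    lambda * sum(beta[-1]^2)        # penalty excludes the intercept
}

Once Xmat_train is constructed below, the loss of any candidate coefficient vector can be compared across values of lambda.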
Xmat <- model.matrix(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9))[,-1]  # [,-1] drops the intercept column
colnames(Xmat) <- c('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9')
Xmat_train <- Xmat[sel,]
Xmat_test <- Xmat[-sel,]
ridge_grid <- 10^(seq(6, -2, length=100))  # lambda values from 1e6 down to 0.01
ridge_models <- glmnet(Xmat_train, y_train, alpha = 0, lambda = ridge_grid)  # alpha = 0 selects the ridge penalty
rcf <- coef(ridge_models)
rcf <- as.matrix(rcf)
colnames(rcf) <- as.character(round(ridge_grid,3))
round(rcf[,c(100:97,4:1)],8)
0.01 0.012 0.015 0.017 572236.766 689261.21 830217.568 1e+06
(Intercept) 4.57653075 4.51553835 4.44337860 4.35915386 4.44681139 4.44667746 4.44656624 4.44647399
x1 3.12784539 3.03769286 2.94841494 2.86006226 0.00148821 0.00123561 0.00102588 0.00085187
x2 -1.77544283 -1.70997150 -1.63332628 -1.54438618 -0.00002874 -0.00002387 -0.00001982 -0.00001647
x3 -0.48269574 -0.42921637 -0.37813056 -0.32953346 0.00015033 0.00012481 0.00010363 0.00008605
x4 0.47887585 0.46253979 0.44539099 0.42722849 -0.00000511 -0.00000424 -0.00000352 -0.00000293
x5 0.10548593 0.09870831 0.09231422 0.08630598 0.00001134 0.00000942 0.00000782 0.00000649
x6 -0.01084175 -0.00959778 -0.00847765 -0.00747294 -0.00000058 -0.00000048 -0.00000040 -0.00000033
x7 0.00209152 0.00224199 0.00237304 0.00248595 0.00000081 0.00000067 0.00000056 0.00000046
x8 -0.00111615 -0.00114182 -0.00115836 -0.00116574 -0.00000006 -0.00000005 -0.00000004 -0.00000003
x9 0.00006628 0.00007295 0.00007999 0.00008735 0.00000006 0.00000005 0.00000004 0.00000003
ridge_cfdf <- data.frame(grid = ridge_grid, x1 = rcf[2,], x2 = rcf[3,], x3 = rcf[4,], x4 = rcf[5,],
x5 = rcf[6,], x6 = rcf[7,], x7 = rcf[8,], x8 = rcf[9,], x9 = rcf[10,])
ridge_melted <- melt(ridge_cfdf, id.vars="grid")
ggplot(ridge_melted, aes(log(grid), value, col=variable)) + geom_line(linewidth=1) +
  xlab('log(lambda)') + ylab('Coefficients')
set.seed(1)
ridge_cv_out <- cv.glmnet(Xmat_train, y_train, alpha = 0, grouped = FALSE,  # grouped = FALSE since the CV folds are tiny here
                          lambda = exp(seq(from = -4, to = 5, length = 100)))
plot(ridge_cv_out, xlim=c(-4,5))
ridge_best_lambda <- ridge_cv_out$lambda.min
ridge_best_lambda
[1] 0.03790291
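An aside: instead of refitting at lambda.min as done next, glmnet also supports predicting directly from the cross-validation object via the s argument. The two approaches can differ very slightly, since the stored path fit and a fresh single-lambda fit are not numerically identical.

# Predict straight from the CV object at the selected lambda (no refit):
head(predict(ridge_cv_out, newx = Xmat_test, s = "lambda.min"))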
ridge_model <- glmnet(Xmat_train, y_train, alpha = 0, lambda = ridge_best_lambda)
coef(ridge_model)
10 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 3.932470e+00
x1 2.815298e+00
x2 -1.101402e+00
x3 -3.380609e-01
x4 3.802801e-01
x5 8.320549e-02
x6 -9.916439e-03
x7 3.044001e-03
x8 -9.036347e-04
x9 7.095034e-05
y_train_pred_ridge <- predict(ridge_model, Xmat_train)
sse <- sum((y_train - y_train_pred_ridge)^2)
sst <- sum((y_train - mean(y_train))^2)
training_rsq <- 1 - sse / sst
y_test_pred_ridge <- predict(ridge_model, Xmat_test)
sse <- sum((y_test - y_test_pred_ridge)^2)
sst <- sum((y_test - mean(y_test))^2)
testing_rsq <- 1 - sse / sst
ridge_rsq <- c(training_rsq, testing_rsq)
names(ridge_rsq) <- c('Training', 'Testing')
ridge_rsq
Training Testing
0.9616233 0.9295274
X_grid <- cbind(xg, xg^2, xg^3, xg^4, xg^5, xg^6, xg^7, xg^8, xg^9)
colnames(X_grid) <- c('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9')
yg_ridge <- predict(ridge_model, X_grid)
plot(y_train ~ x_train, ylim=c(-80, 140), xlab="x", ylab="y", main="Ridge Regression")
lines(xg, yg_ridge)
segments(x_train, y_train, x_train, y_train_pred_ridge)
points(y_test ~ x_test, col="red")
segments(x_test, y_test, x_test, y_test_pred_ridge, col="red")
LASSO regression. Goal: minimize \(\mathrm{Loss}\left(\hat\beta_0, \dots, \hat\beta_p\right) = \frac{1}{n}\,\mathrm{SSE} + \lambda \sum_{i=1}^{p} \left|\hat\beta_i\right|\)
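As with ridge, this objective can be written out directly; the sketch below (lasso_loss is my name for it, not from the original) mirrors ridge_loss above, swapping the squared penalty for absolute values.

# Minimal sketch of the L1-penalized objective above; `lasso_loss` is an
# illustrative name. X is a predictor matrix without an intercept column.
lasso_loss <- function(beta, X, y, lambda) {
  yhat <- cbind(1, X) %*% beta
  sum((y - yhat)^2) / length(y) + lambda * sum(abs(beta[-1]))  # intercept unpenalized
}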
lasso_grid <- 10^(seq(2, -2, length=100))  # lambda values from 100 down to 0.01
lasso_models <- glmnet(Xmat_train, y_train, alpha = 1, lambda = lasso_grid)  # alpha = 1 selects the LASSO penalty
lcf <- coef(lasso_models)
lcf <- as.matrix(lcf)
colnames(lcf) <- as.character(round(lasso_grid,3))
round(lcf[,c(100:97,4:1)],8)
0.01 0.011 0.012 0.013 75.646 83.022 91.116 100
(Intercept) 4.12853918 4.07711662 4.02493649 3.96171216 4.446022 4.446022 4.446022 4.446022
x1 2.46054372 2.42761968 2.39595038 2.36083518 0.000000 0.000000 0.000000 0.000000
x2 -1.27930830 -1.22735910 -1.17492552 -1.11172822 0.000000 0.000000 0.000000 0.000000
x3 -0.06356629 -0.05206069 -0.04163760 -0.03092317 0.000000 0.000000 0.000000 0.000000
x4 0.35116138 0.34349641 0.33602884 0.32762155 0.000000 0.000000 0.000000 0.000000
x5 0.03633810 0.03498810 0.03378586 0.03258892 0.000000 0.000000 0.000000 0.000000
x6 -0.00022746 -0.00009151 -0.00000046 0.00000000 0.000000 0.000000 0.000000 0.000000
x7 0.00606544 0.00611837 0.00616600 0.00621414 0.000000 0.000000 0.000000 0.000000
x8 -0.00138371 -0.00137189 -0.00135805 -0.00133667 0.000000 0.000000 0.000000 0.000000
x9 0.00000000 0.00000000 0.00000000 0.00000000 0.000000 0.000000 0.000000 0.000000
lasso_cfdf <- data.frame(grid = lasso_grid, x1 = lcf[2,], x2 = lcf[3,], x3 = lcf[4,], x4 = lcf[5,],
                         x5 = lcf[6,], x6 = lcf[7,], x7 = lcf[8,], x8 = lcf[9,], x9 = lcf[10,])
lasso_melted <- melt(lasso_cfdf, id.vars="grid")
ggplot(lasso_melted, aes(log(grid), value, col=variable)) + geom_line(linewidth=1) +
  xlab('log(lambda)') + ylab('Coefficients')
set.seed(1)
lasso_cv_out <- cv.glmnet(Xmat_train, y_train, alpha = 1, grouped = FALSE)
plot(lasso_cv_out)
lasso_best_lambda <- lasso_cv_out$lambda.min
lasso_best_lambda
[1] 6.00723
lasso_model <- glmnet(Xmat_train, y_train, alpha = 1, lambda = lasso_best_lambda)
coef(lasso_model)
10 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 4.056519834
x1 .
x2 .
x3 .
x4 .
x5 0.050889129
x6 .
x7 0.003181004
x8 .
x9 0.000118107
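The dots in the sparse matrix are exact zeros: the LASSO has dropped six of the nine polynomial terms. To pull out just the surviving terms programmatically, something like the following works (a small sketch, not part of the original; cf_lasso is an illustrative name).

# Keep only the nonzero rows of the sparse coefficient matrix.
cf_lasso <- as.matrix(coef(lasso_model))
cf_lasso[cf_lasso[, 1] != 0, , drop = FALSE]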
y_train_pred_lasso <- predict(lasso_model, Xmat_train)
sse <- sum((y_train - y_train_pred_lasso)^2)
sst <- sum((y_train - mean(y_train))^2)
training_rsq <- 1 - sse / sst
y_test_pred_lasso <- predict(lasso_model, Xmat_test)
sse <- sum((y_test - y_test_pred_lasso)^2)
sst <- sum((y_test - mean(y_test))^2)
testing_rsq <- 1 - sse / sst
lasso_rsq <- c(training_rsq, testing_rsq)
names(lasso_rsq) <- c('Training', 'Testing')
lasso_rsq
Training Testing
0.9378925 0.9322479
yg_lasso <- predict(lasso_model, X_grid)
plot(y_train ~ x_train, ylim=c(-80, 140), xlab="x", ylab="y", main="LASSO Regression")
lines(xg, yg_lasso)
segments(x_train, y_train, x_train, y_train_pred_lasso)
points(y_test ~ x_test, col="red")
segments(x_test, y_test, x_test, y_test_pred_lasso, col="red")
poly <- coef(poly_mod)  # note: this name masks stats::poly() for the rest of the session
ridge <- as.matrix(t(coef(ridge_model)))
lasso <- as.matrix(t(coef(lasso_model)))
names(poly) <- c('Intercept', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9' )
cf <- rbind(poly, ridge, lasso)
rownames(cf) <- c("Poly", "Ridge", "Lasso")
round(cf, 6)
Intercept x1 x2 x3 x4 x5 x6 x7 x8 x9
Poly 13.94676 41.387804 -14.185737 -52.718643 1.604721 17.337728 0.166865 -1.847336 -0.013019 0.061749
Ridge 3.93247 2.815298 -1.101402 -0.338061 0.380280 0.083205 -0.009916 0.003044 -0.000904 0.000071
Lasso 4.05652 0.000000 0.000000 0.000000 0.000000 0.050889 0.000000 0.003181 0.000000 0.000118
rbind(poly_rsq, ridge_rsq, lasso_rsq)
Training Testing
poly_rsq 0.9855016 -0.1894984
ridge_rsq 0.9616233 0.9295274
lasso_rsq 0.9378925 0.9322479
par(mfrow=c(1,3))
plot(y_train ~ x_train, ylim=c(-80, 140), xlab="x", ylab="y", main="Poly Regression")
lines(xg, yg_poly)
segments(x_train, y_train, x_train, y_train_pred_poly)
points(y_test ~ x_test, col="red")
segments(x_test, y_test, x_test, y_test_pred_poly, col="red")
plot(y_train ~ x_train, ylim=c(-80, 140), xlab="x", ylab="y", main="Ridge Regression")
lines(xg, yg_ridge)
segments(x_train, y_train, x_train, y_train_pred_ridge)
points(y_test ~ x_test, col="red")
segments(x_test, y_test, x_test, y_test_pred_ridge, col="red")
plot(y_train ~ x_train, ylim=c(-80, 140), xlab="x", ylab="y", main="LASSO Regression")
lines(xg, yg_lasso)
segments(x_train, y_train, x_train, y_train_pred_lasso)
points(y_test ~ x_test, col="red")
segments(x_test, y_test, x_test, y_test_pred_lasso, col="red")
par(mfrow=c(1,1))