require(leaps)
require(openintro)
require(ggplot2)
require(GGally)
We will be working with the bdims
dataset, which is included in the openintro
package. A description of the dataset can be found here: [https://www.openintro.org/stat/data/bdims.php].
data(bdims)
#summary(bdims)
We will use the function regsubsets()
from the leaps
package to perform best subset selection.
subset <- regsubsets(wgt ~ ., bdims, nvmax = 24)
reg_summary <- summary(subset)
names(reg_summary)
[1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
which.min(reg_summary$bic)
[1] 12
which.min(reg_summary$cp)
[1] 16
which.max(reg_summary$adjr2)
[1] 18
par(mfrow=c(2,2))
# r-Squared
plot(reg_summary$rsq, xlab="Number of Predictors", ylab="r-Squared", type="l")
# Adjusted r-Squared
plot(reg_summary$adjr2, xlab="Number of Predictors", ylab="Adjusted r-Squared", type="l")
k <- which.max(reg_summary$adjr2)
points(k, reg_summary$adjr2[k], col="red", cex=2, pch=20)
lines(c(k,k), c(0,reg_summary$adjr2[k]), col="red", lty="dashed")
# C_p
plot(reg_summary$cp, xlab="Number of Predictors", ylab="Cp", type="l")
k <- which.min(reg_summary$cp)
points(k, reg_summary$cp[k], col="red", cex=2, pch=20)
lines(c(k,k), c(0,reg_summary$cp[k]), col="red", lty="dashed")
# BIC
plot(reg_summary$bic, xlab="Number of Predictors", ylab="BIC", type="l")
k <- which.min(reg_summary$bic)
points(k, reg_summary$bic[k], col="red", cex=2, pch=20)
lines(c(k,k), c(0,reg_summary$bic[k]), col="red", lty="dashed")
par(mfrow=c(2,2))
coef(subset, 12)
(Intercept) che.de kne.di sho.gi che.gi wai.gi hip.gi thi.gi
-122.26112532 0.26583511 0.64052751 0.08654813 0.16187767 0.38579933 0.23327814 0.25781813
for.gi cal.gi age hgt sex
0.59434308 0.40568160 -0.05330840 0.32246990 -1.57949799
names(coef(subset, 12))
[1] "(Intercept)" "che.de" "kne.di" "sho.gi" "che.gi" "wai.gi" "hip.gi"
[8] "thi.gi" "for.gi" "cal.gi" "age" "hgt" "sex"
mod_bs_12 <- lm(wgt ~ che.de + kne.di + sho.gi + che.gi + wai.gi + hip.gi + thi.gi + for.gi + cal.gi + age + hgt + sex, bdims)
summary(mod_bs_12)
Call:
lm(formula = wgt ~ che.de + kne.di + sho.gi + che.gi + wai.gi +
hip.gi + thi.gi + for.gi + cal.gi + age + hgt + sex, data = bdims)
Residuals:
Min 1Q Median 3Q Max
-7.3146 -1.3279 0.0409 1.3412 8.7983
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -122.26113 2.52608 -48.400 < 2e-16 ***
che.de 0.26584 0.06879 3.864 0.000126 ***
kne.di 0.64053 0.11727 5.462 7.47e-08 ***
sho.gi 0.08655 0.02825 3.063 0.002308 **
che.gi 0.16188 0.03385 4.782 2.30e-06 ***
wai.gi 0.38580 0.02499 15.440 < 2e-16 ***
hip.gi 0.23328 0.03843 6.070 2.55e-09 ***
thi.gi 0.25782 0.04873 5.290 1.84e-07 ***
for.gi 0.59434 0.09648 6.160 1.51e-09 ***
cal.gi 0.40568 0.05797 6.998 8.49e-12 ***
age -0.05331 0.01181 -4.515 7.93e-06 ***
hgt 0.32247 0.01553 20.769 < 2e-16 ***
sex -1.57950 0.48321 -3.269 0.001155 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.114 on 494 degrees of freedom
Multiple R-squared: 0.9755, Adjusted R-squared: 0.9749
F-statistic: 1639 on 12 and 494 DF, p-value: < 2.2e-16
We can als use regsubsets()
to perform forward stepwise selection.
fwd <- regsubsets(wgt ~ ., bdims, nvmax = 24, method = "forward")
fwd_summary <- summary(fwd)
which.min(fwd_summary$bic)
[1] 13
which.min(fwd_summary$cp)
[1] 16
which.max(fwd_summary$adjr2)
[1] 18
coef(fwd, 13)
(Intercept) che.de kne.di sho.gi che.gi wai.gi hip.gi thi.gi
-121.70803751 0.26496834 0.55717873 0.08506882 0.17090451 0.37698430 0.22402952 0.24640509
for.gi kne.gi cal.gi age hgt sex
0.56661919 0.17977125 0.35343020 -0.05207904 0.31403104 -1.42781749
mod_fs_13 <- lm(wgt ~ che.de + kne.di + sho.gi + che.gi + wai.gi + hip.gi + thi.gi + for.gi + kne.gi + cal.gi + age + hgt + sex, bdims)
summary(mod_fs_13)
Call:
lm(formula = wgt ~ che.de + kne.di + sho.gi + che.gi + wai.gi +
hip.gi + thi.gi + for.gi + kne.gi + cal.gi + age + hgt +
sex, data = bdims)
Residuals:
Min 1Q Median 3Q Max
-7.6401 -1.2840 -0.0028 1.2981 8.9204
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -121.70804 2.52411 -48.218 < 2e-16 ***
che.de 0.26497 0.06846 3.870 0.000123 ***
kne.di 0.55718 0.12167 4.579 5.91e-06 ***
sho.gi 0.08507 0.02812 3.025 0.002615 **
che.gi 0.17090 0.03389 5.042 6.48e-07 ***
wai.gi 0.37698 0.02513 15.002 < 2e-16 ***
hip.gi 0.22403 0.03844 5.829 1.01e-08 ***
thi.gi 0.24641 0.04872 5.057 6.01e-07 ***
for.gi 0.56662 0.09669 5.860 8.47e-09 ***
kne.gi 0.17977 0.07427 2.420 0.015861 *
cal.gi 0.35343 0.06159 5.738 1.67e-08 ***
age -0.05208 0.01176 -4.428 1.17e-05 ***
hgt 0.31403 0.01584 19.827 < 2e-16 ***
sex -1.42782 0.48492 -2.944 0.003388 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.104 on 493 degrees of freedom
Multiple R-squared: 0.9758, Adjusted R-squared: 0.9752
F-statistic: 1528 on 13 and 493 DF, p-value: < 2.2e-16
We can als use regsubsets()
to perform forward stepwise selection.
bwd <- regsubsets(wgt ~ ., bdims, nvmax = 24, method = "backward")
bwd_summary <- summary(bwd)
which.min(bwd_summary$bic)
[1] 12
which.min(bwd_summary$cp)
[1] 16
which.max(bwd_summary$adjr2)
[1] 20
coef(bwd, 12)
(Intercept) che.de kne.di sho.gi che.gi wai.gi hip.gi thi.gi
-122.26112532 0.26583511 0.64052751 0.08654813 0.16187767 0.38579933 0.23327814 0.25781813
for.gi cal.gi age hgt sex
0.59434308 0.40568160 -0.05330840 0.32246990 -1.57949799
mod_bs_12 <- lm(wgt ~ che.de + kne.di + sho.gi + che.gi + wai.gi + hip.gi + thi.gi + for.gi + cal.gi + age + hgt + sex, bdims)
summary(mod_bs_12)
Call:
lm(formula = wgt ~ che.de + kne.di + sho.gi + che.gi + wai.gi +
hip.gi + thi.gi + for.gi + cal.gi + age + hgt + sex, data = bdims)
Residuals:
Min 1Q Median 3Q Max
-7.3146 -1.3279 0.0409 1.3412 8.7983
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -122.26113 2.52608 -48.400 < 2e-16 ***
che.de 0.26584 0.06879 3.864 0.000126 ***
kne.di 0.64053 0.11727 5.462 7.47e-08 ***
sho.gi 0.08655 0.02825 3.063 0.002308 **
che.gi 0.16188 0.03385 4.782 2.30e-06 ***
wai.gi 0.38580 0.02499 15.440 < 2e-16 ***
hip.gi 0.23328 0.03843 6.070 2.55e-09 ***
thi.gi 0.25782 0.04873 5.290 1.84e-07 ***
for.gi 0.59434 0.09648 6.160 1.51e-09 ***
cal.gi 0.40568 0.05797 6.998 8.49e-12 ***
age -0.05331 0.01181 -4.515 7.93e-06 ***
hgt 0.32247 0.01553 20.769 < 2e-16 ***
sex -1.57950 0.48321 -3.269 0.001155 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.114 on 494 degrees of freedom
Multiple R-squared: 0.9755, Adjusted R-squared: 0.9749
F-statistic: 1639 on 12 and 494 DF, p-value: < 2.2e-16