Load and Explore the Dataset
require(ggplot2)
df <- read.table("Galton.txt", sep="\t", header=TRUE)
summary(df)
Family Father Mother Gender Height Kids
185 : 15 Min. :62.00 Min. :58.00 F:433 Min. :56.00 Min. : 1.000
166 : 11 1st Qu.:68.00 1st Qu.:63.00 M:465 1st Qu.:64.00 1st Qu.: 4.000
66 : 11 Median :69.00 Median :64.00 Median :66.50 Median : 6.000
130 : 10 Mean :69.23 Mean :64.08 Mean :66.76 Mean : 6.136
136 : 10 3rd Qu.:71.00 3rd Qu.:65.50 3rd Qu.:69.70 3rd Qu.: 8.000
140 : 10 Max. :78.50 Max. :70.50 Max. :79.00 Max. :15.000
(Other):831
df <- df[c(5,2,3,6,4)]
summary(df)
Height Father Mother Kids Gender
Min. :56.00 Min. :62.00 Min. :58.00 Min. : 1.000 F:433
1st Qu.:64.00 1st Qu.:68.00 1st Qu.:63.00 1st Qu.: 4.000 M:465
Median :66.50 Median :69.00 Median :64.00 Median : 6.000
Mean :66.76 Mean :69.23 Mean :64.08 Mean : 6.136
3rd Qu.:69.70 3rd Qu.:71.00 3rd Qu.:65.50 3rd Qu.: 8.000
Max. :79.00 Max. :78.50 Max. :70.50 Max. :15.000
pairs(df[1:4])
cor(df[1:4])
Height Father Mother Kids
Height 1.0000000 0.27535483 0.20165489 -0.12691012
Father 0.2753548 1.00000000 0.07366461 -0.16002294
Mother 0.2016549 0.07366461 1.00000000 -0.02002964
Kids -0.1269101 -0.16002294 -0.02002964 1.00000000
ggplot(df, aes(x = Father, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2)
ggplot(df, aes(x = Father, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2) + facet_wrap(~Gender)
ggplot(df, aes(x = Mother, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2)
ggplot(df, aes(x = Mother, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2) + facet_wrap(~Gender)
ggplot(df, aes(x = Kids, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2)
ggplot(df, aes(x = Kids, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2) + facet_wrap(~Gender)
ggplot(df, aes(x = Gender, y = Height, fill = Gender)) + geom_boxplot()
Model Height Against Mother, Father, and Kids
mod1 <- lm(Height ~ Mother + Father + Kids, df)
summary(mod1)
Call:
lm(formula = Height ~ Mother + Father + Kids, data = df)
Residuals:
Min 1Q Median 3Q Max
-9.0675 -2.6552 -0.1819 2.7806 11.8006
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 24.39855 4.36625 5.588 3.05e-08 ***
Mother 0.28214 0.04898 5.760 1.15e-08 ***
Father 0.36059 0.04633 7.783 1.95e-14 ***
Kids -0.11140 0.04252 -2.620 0.00894 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.375 on 894 degrees of freedom
Multiple R-squared: 0.1157, Adjusted R-squared: 0.1127
F-statistic: 38.98 on 3 and 894 DF, p-value: < 2.2e-16
Residual Analysis
plot(mod1$residuals ~ mod1$fitted.values, pch=21, col="black", bg="salmon",
xlab="Fitted Value", ylab="Residuals", main="Residual Plot")
abline(h=0, col="darkred", lwd=2)
res <- mod1$residuals
par(mfrow=c(1,2))
hist(res, col='orchid', breaks=10)
qqnorm(res)
qqline(res)
par(mfrow=c(1,1))
Consider Full Model
mod2 <- lm(Height ~ Mother + Father + Kids + Gender, df)
summary(mod2)
Call:
lm(formula = Height ~ Mother + Father + Kids + Gender, data = df)
Residuals:
Min 1Q Median 3Q Max
-9.4748 -1.4500 0.0889 1.4716 9.1656
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.18771 2.79387 5.794 9.52e-09 ***
Mother 0.32096 0.03126 10.269 < 2e-16 ***
Father 0.39831 0.02957 13.472 < 2e-16 ***
Kids -0.04382 0.02718 -1.612 0.107
GenderM 5.20995 0.14422 36.125 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.152 on 893 degrees of freedom
Multiple R-squared: 0.6407, Adjusted R-squared: 0.6391
F-statistic: 398.1 on 4 and 893 DF, p-value: < 2.2e-16
Model Height Against Mother, Father, and Gender
mod3 <- lm(Height ~ Mother + Father + Gender, df)
summary(mod3)
Call:
lm(formula = Height ~ Mother + Father + Gender, data = df)
Residuals:
Min 1Q Median 3Q Max
-9.523 -1.440 0.117 1.473 9.114
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 15.34476 2.74696 5.586 3.08e-08 ***
Mother 0.32150 0.03128 10.277 < 2e-16 ***
Father 0.40598 0.02921 13.900 < 2e-16 ***
GenderM 5.22595 0.14401 36.289 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.154 on 894 degrees of freedom
Multiple R-squared: 0.6397, Adjusted R-squared: 0.6385
F-statistic: 529 on 3 and 894 DF, p-value: < 2.2e-16
Residual Analysis
plot(mod3$residuals ~ mod1$fitted.values, pch=21, col="black", bg="salmon",
xlab="Fitted Value", ylab="Residuals", main="Residual Plot")
abline(h=0, col="darkred", lwd=2)
res <- mod3$residuals
par(mfrow=c(1,2))
hist(res, col='orchid', breaks=10)
qqnorm(res)
qqline(res)
par(mfrow=c(1,1))
Generate Predictions
nd <- data.frame(Mother = c(62, 62, 70, 72), Father = c(68, 68, 76, 64), Gender = c('F', 'M', 'M', 'F'))
predict(mod3, newdata=nd, interval = "prediction", level = 0.95)
fit lwr upr
1 62.88396 58.64838 67.11955
2 68.10992 63.87507 72.34476
3 73.92970 69.66551 78.19389
4 64.47500 60.20147 68.74854
LS0tDQp0aXRsZTogIkxlc3NvbiAxOCAtIEdhbHRvbiBEYXRhIg0KYXV0aG9yOiAiUm9iYmllIEJlYW5lIg0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOg0KICAgIHRoZW1lOiBmbGF0bHkNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogMg0KLS0tDQoNCiMgTG9hZCBhbmQgRXhwbG9yZSB0aGUgRGF0YXNldA0KDQpgYGB7ciwgbWVzc2FnZT1GQUxTRX0NCnJlcXVpcmUoZ2dwbG90MikNCmBgYA0KDQoNCmBgYHtyfQ0KZGYgPC0gcmVhZC50YWJsZSgiR2FsdG9uLnR4dCIsIHNlcD0iXHQiLCBoZWFkZXI9VFJVRSkNCnN1bW1hcnkoZGYpDQpgYGANCg0KYGBge3J9DQpkZiA8LSBkZltjKDUsMiwzLDYsNCldDQpzdW1tYXJ5KGRmKQ0KYGBgDQoNCg0KYGBge3J9DQpwYWlycyhkZlsxOjRdKQ0KYGBgDQoNCmBgYHtyfQ0KY29yKGRmWzE6NF0pDQpgYGANCg0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBGYXRoZXIsIHkgPSBIZWlnaHQsIGNvbCA9IEdlbmRlcikpICsgZ2VvbV9wb2ludChhbHBoYT0wLjYsIHNpemU9MikNCmBgYA0KDQoNCmBgYHtyfQ0KZ2dwbG90KGRmLCBhZXMoeCA9IEZhdGhlciwgeSA9IEhlaWdodCwgY29sID0gR2VuZGVyKSkgKyBnZW9tX3BvaW50KGFscGhhPTAuNiwgc2l6ZT0yKSArIGZhY2V0X3dyYXAofkdlbmRlcikNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBNb3RoZXIsIHkgPSBIZWlnaHQsIGNvbCA9IEdlbmRlcikpICsgZ2VvbV9wb2ludChhbHBoYT0wLjYsIHNpemU9MikNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBNb3RoZXIsIHkgPSBIZWlnaHQsIGNvbCA9IEdlbmRlcikpICsgZ2VvbV9wb2ludChhbHBoYT0wLjYsIHNpemU9MikgKyBmYWNldF93cmFwKH5HZW5kZXIpDQpgYGANCg0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBLaWRzLCB5ID0gSGVpZ2h0LCBjb2wgPSBHZW5kZXIpKSArIGdlb21fcG9pbnQoYWxwaGE9MC42LCBzaXplPTIpDQpgYGANCg0KYGBge3J9DQpnZ3Bsb3QoZGYsIGFlcyh4ID0gS2lkcywgeSA9IEhlaWdodCwgY29sID0gR2VuZGVyKSkgKyBnZW9tX3BvaW50KGFscGhhPTAuNiwgc2l6ZT0yKSArIGZhY2V0X3dyYXAofkdlbmRlcikNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBHZW5kZXIsIHkgPSBIZWlnaHQsIGZpbGwgPSBHZW5kZXIpKSArIGdlb21fYm94cGxvdCgpDQpgYGANCg0KIyBNb2RlbCBIZWlnaHQgQWdhaW5zdCBNb3RoZXIsIEZhdGhlciwgYW5kIEtpZHMNCg0KYGBge3J9DQptb2QxIDwtIGxtKEhlaWdodCB+IE1vdGhlciArIEZhdGhlciArIEtpZHMsIGRmKQ0Kc3VtbWFyeShtb2QxKQ0KYGBgDQoNCg0KIyMgUmVzaWR1YWwgQW5hbHlzaXMNCg0KYGBge3J9DQpwbG90KG1vZDEkcmVzaWR1YWxzIH4gbW9kMSRmaXR0ZWQudmFsdWVzLCBwY2g9MjEsIGNvbD0iYmxhY2siLCBiZz0ic2FsbW9uIiwNCiAgICAgeGxhYj0iRml0dGVkIFZhbHVlIiwgeWxhYj0iUmVzaWR1YWxzIiwgbWFpbj0iUmVzaWR1YWwgUGxvdCIpDQphYmxpbmUoaD0wLCBjb2w9ImRhcmtyZWQiLCBsd2Q9MikNCmBgYA0KDQoNCmBgYHtyfQ0KcmVzIDwtIG1vZDEkcmVzaWR1YWxzDQpwYXIobWZyb3c9YygxLDIpKQ0KaGlzdChyZXMsIGNvbD0nb3JjaGlkJywgYnJlYWtzPTEwKQ0KcXFub3JtKHJlcykNCnFxbGluZShyZXMpDQpwYXIobWZyb3c9YygxLDEpKQ0KYGBgDQoNCiMgQ29uc2lkZXIgRnVsbCBNb2RlbCANCg0KYGBge3J9DQptb2QyIDwtIGxtKEhlaWdodCB+IE1vdGhlciArIEZhdGhlciArIEtpZHMgKyBHZW5kZXIsIGRmKQ0Kc3VtbWFyeShtb2QyKQ0KYGBgDQoNCiMgTW9kZWwgSGVpZ2h0IEFnYWluc3QgTW90aGVyLCBGYXRoZXIsIGFuZCBHZW5kZXINCg0KYGBge3J9DQptb2QzIDwtIGxtKEhlaWdodCB+IE1vdGhlciArIEZhdGhlciArIEdlbmRlciwgZGYpDQpzdW1tYXJ5KG1vZDMpDQpgYGANCg0KIyMgUmVzaWR1YWwgQW5hbHlzaXMNCg0KYGBge3J9DQpwbG90KG1vZDMkcmVzaWR1YWxzIH4gbW9kMSRmaXR0ZWQudmFsdWVzLCBwY2g9MjEsIGNvbD0iYmxhY2siLCBiZz0ic2FsbW9uIiwNCiAgICAgeGxhYj0iRml0dGVkIFZhbHVlIiwgeWxhYj0iUmVzaWR1YWxzIiwgbWFpbj0iUmVzaWR1YWwgUGxvdCIpDQphYmxpbmUoaD0wLCBjb2w9ImRhcmtyZWQiLCBsd2Q9MikNCmBgYA0KDQpgYGB7cn0NCnJlcyA8LSBtb2QzJHJlc2lkdWFscw0KcGFyKG1mcm93PWMoMSwyKSkNCmhpc3QocmVzLCBjb2w9J29yY2hpZCcsIGJyZWFrcz0xMCkNCnFxbm9ybShyZXMpDQpxcWxpbmUocmVzKQ0KcGFyKG1mcm93PWMoMSwxKSkNCmBgYA0KDQojIEdlbmVyYXRlIFByZWRpY3Rpb25zDQoNCmBgYHtyfQ0KbmQgPC0gZGF0YS5mcmFtZShNb3RoZXIgPSBjKDYyLCA2MiwgNzAsIDcyKSwgRmF0aGVyID0gYyg2OCwgNjgsIDc2LCA2NCksIEdlbmRlciA9IGMoJ0YnLCAnTScsICdNJywgJ0YnKSkNCnByZWRpY3QobW9kMywgbmV3ZGF0YT1uZCwgaW50ZXJ2YWwgPSAicHJlZGljdGlvbiIsIGxldmVsID0gMC45NSkNCmBgYA0KDQoNCg0KDQoNCg==