Load and Explore the Dataset

require(ggplot2)
df <- read.table("Galton.txt", sep="\t", header=TRUE)
summary(df)
     Family        Father          Mother      Gender      Height           Kids       
 185    : 15   Min.   :62.00   Min.   :58.00   F:433   Min.   :56.00   Min.   : 1.000  
 166    : 11   1st Qu.:68.00   1st Qu.:63.00   M:465   1st Qu.:64.00   1st Qu.: 4.000  
 66     : 11   Median :69.00   Median :64.00           Median :66.50   Median : 6.000  
 130    : 10   Mean   :69.23   Mean   :64.08           Mean   :66.76   Mean   : 6.136  
 136    : 10   3rd Qu.:71.00   3rd Qu.:65.50           3rd Qu.:69.70   3rd Qu.: 8.000  
 140    : 10   Max.   :78.50   Max.   :70.50           Max.   :79.00   Max.   :15.000  
 (Other):831                                                                           
df <- df[c(5,2,3,6,4)]
summary(df)
     Height          Father          Mother           Kids        Gender 
 Min.   :56.00   Min.   :62.00   Min.   :58.00   Min.   : 1.000   F:433  
 1st Qu.:64.00   1st Qu.:68.00   1st Qu.:63.00   1st Qu.: 4.000   M:465  
 Median :66.50   Median :69.00   Median :64.00   Median : 6.000          
 Mean   :66.76   Mean   :69.23   Mean   :64.08   Mean   : 6.136          
 3rd Qu.:69.70   3rd Qu.:71.00   3rd Qu.:65.50   3rd Qu.: 8.000          
 Max.   :79.00   Max.   :78.50   Max.   :70.50   Max.   :15.000          
pairs(df[1:4])

cor(df[1:4])
           Height      Father      Mother        Kids
Height  1.0000000  0.27535483  0.20165489 -0.12691012
Father  0.2753548  1.00000000  0.07366461 -0.16002294
Mother  0.2016549  0.07366461  1.00000000 -0.02002964
Kids   -0.1269101 -0.16002294 -0.02002964  1.00000000
ggplot(df, aes(x = Father, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2)

ggplot(df, aes(x = Father, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2) + facet_wrap(~Gender)

ggplot(df, aes(x = Mother, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2)

ggplot(df, aes(x = Mother, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2) + facet_wrap(~Gender)

ggplot(df, aes(x = Kids, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2)

ggplot(df, aes(x = Kids, y = Height, col = Gender)) + geom_point(alpha=0.6, size=2) + facet_wrap(~Gender)

ggplot(df, aes(x = Gender, y = Height, fill = Gender)) + geom_boxplot()

Model Height Against Mother, Father, and Kids

mod1 <- lm(Height ~ Mother + Father + Kids, df)
summary(mod1)

Call:
lm(formula = Height ~ Mother + Father + Kids, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.0675 -2.6552 -0.1819  2.7806 11.8006 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 24.39855    4.36625   5.588 3.05e-08 ***
Mother       0.28214    0.04898   5.760 1.15e-08 ***
Father       0.36059    0.04633   7.783 1.95e-14 ***
Kids        -0.11140    0.04252  -2.620  0.00894 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.375 on 894 degrees of freedom
Multiple R-squared:  0.1157,    Adjusted R-squared:  0.1127 
F-statistic: 38.98 on 3 and 894 DF,  p-value: < 2.2e-16

Residual Analysis

plot(mod1$residuals ~ mod1$fitted.values, pch=21, col="black", bg="salmon",
     xlab="Fitted Value", ylab="Residuals", main="Residual Plot")
abline(h=0, col="darkred", lwd=2)

res <- mod1$residuals
par(mfrow=c(1,2))
hist(res, col='orchid', breaks=10)
qqnorm(res)
qqline(res)
par(mfrow=c(1,1))

Consider Full Model

mod2 <- lm(Height ~ Mother + Father + Kids + Gender, df)
summary(mod2)

Call:
lm(formula = Height ~ Mother + Father + Kids + Gender, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.4748 -1.4500  0.0889  1.4716  9.1656 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 16.18771    2.79387   5.794 9.52e-09 ***
Mother       0.32096    0.03126  10.269  < 2e-16 ***
Father       0.39831    0.02957  13.472  < 2e-16 ***
Kids        -0.04382    0.02718  -1.612    0.107    
GenderM      5.20995    0.14422  36.125  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.152 on 893 degrees of freedom
Multiple R-squared:  0.6407,    Adjusted R-squared:  0.6391 
F-statistic: 398.1 on 4 and 893 DF,  p-value: < 2.2e-16

Model Height Against Mother, Father, and Gender

mod3 <- lm(Height ~ Mother + Father + Gender, df)
summary(mod3)

Call:
lm(formula = Height ~ Mother + Father + Gender, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-9.523 -1.440  0.117  1.473  9.114 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 15.34476    2.74696   5.586 3.08e-08 ***
Mother       0.32150    0.03128  10.277  < 2e-16 ***
Father       0.40598    0.02921  13.900  < 2e-16 ***
GenderM      5.22595    0.14401  36.289  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.154 on 894 degrees of freedom
Multiple R-squared:  0.6397,    Adjusted R-squared:  0.6385 
F-statistic:   529 on 3 and 894 DF,  p-value: < 2.2e-16

Residual Analysis

plot(mod3$residuals ~ mod1$fitted.values, pch=21, col="black", bg="salmon",
     xlab="Fitted Value", ylab="Residuals", main="Residual Plot")
abline(h=0, col="darkred", lwd=2)

res <- mod3$residuals
par(mfrow=c(1,2))
hist(res, col='orchid', breaks=10)
qqnorm(res)
qqline(res)
par(mfrow=c(1,1))

Generate Predictions

nd <- data.frame(Mother = c(62, 62, 70, 72), Father = c(68, 68, 76, 64), Gender = c('F', 'M', 'M', 'F'))
predict(mod3, newdata=nd, interval = "prediction", level = 0.95)
       fit      lwr      upr
1 62.88396 58.64838 67.11955
2 68.10992 63.87507 72.34476
3 73.92970 69.66551 78.19389
4 64.47500 60.20147 68.74854
LS0tDQp0aXRsZTogIkxlc3NvbiAxOCAtIEdhbHRvbiBEYXRhIg0KYXV0aG9yOiAiUm9iYmllIEJlYW5lIg0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOg0KICAgIHRoZW1lOiBmbGF0bHkNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogMg0KLS0tDQoNCiMgTG9hZCBhbmQgRXhwbG9yZSB0aGUgRGF0YXNldA0KDQpgYGB7ciwgbWVzc2FnZT1GQUxTRX0NCnJlcXVpcmUoZ2dwbG90MikNCmBgYA0KDQoNCmBgYHtyfQ0KZGYgPC0gcmVhZC50YWJsZSgiR2FsdG9uLnR4dCIsIHNlcD0iXHQiLCBoZWFkZXI9VFJVRSkNCnN1bW1hcnkoZGYpDQpgYGANCg0KYGBge3J9DQpkZiA8LSBkZltjKDUsMiwzLDYsNCldDQpzdW1tYXJ5KGRmKQ0KYGBgDQoNCg0KYGBge3J9DQpwYWlycyhkZlsxOjRdKQ0KYGBgDQoNCmBgYHtyfQ0KY29yKGRmWzE6NF0pDQpgYGANCg0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBGYXRoZXIsIHkgPSBIZWlnaHQsIGNvbCA9IEdlbmRlcikpICsgZ2VvbV9wb2ludChhbHBoYT0wLjYsIHNpemU9MikNCmBgYA0KDQoNCmBgYHtyfQ0KZ2dwbG90KGRmLCBhZXMoeCA9IEZhdGhlciwgeSA9IEhlaWdodCwgY29sID0gR2VuZGVyKSkgKyBnZW9tX3BvaW50KGFscGhhPTAuNiwgc2l6ZT0yKSArIGZhY2V0X3dyYXAofkdlbmRlcikNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBNb3RoZXIsIHkgPSBIZWlnaHQsIGNvbCA9IEdlbmRlcikpICsgZ2VvbV9wb2ludChhbHBoYT0wLjYsIHNpemU9MikNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBNb3RoZXIsIHkgPSBIZWlnaHQsIGNvbCA9IEdlbmRlcikpICsgZ2VvbV9wb2ludChhbHBoYT0wLjYsIHNpemU9MikgKyBmYWNldF93cmFwKH5HZW5kZXIpDQpgYGANCg0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBLaWRzLCB5ID0gSGVpZ2h0LCBjb2wgPSBHZW5kZXIpKSArIGdlb21fcG9pbnQoYWxwaGE9MC42LCBzaXplPTIpDQpgYGANCg0KYGBge3J9DQpnZ3Bsb3QoZGYsIGFlcyh4ID0gS2lkcywgeSA9IEhlaWdodCwgY29sID0gR2VuZGVyKSkgKyBnZW9tX3BvaW50KGFscGhhPTAuNiwgc2l6ZT0yKSArIGZhY2V0X3dyYXAofkdlbmRlcikNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChkZiwgYWVzKHggPSBHZW5kZXIsIHkgPSBIZWlnaHQsIGZpbGwgPSBHZW5kZXIpKSArIGdlb21fYm94cGxvdCgpDQpgYGANCg0KIyBNb2RlbCBIZWlnaHQgQWdhaW5zdCBNb3RoZXIsIEZhdGhlciwgYW5kIEtpZHMNCg0KYGBge3J9DQptb2QxIDwtIGxtKEhlaWdodCB+IE1vdGhlciArIEZhdGhlciArIEtpZHMsIGRmKQ0Kc3VtbWFyeShtb2QxKQ0KYGBgDQoNCg0KIyMgUmVzaWR1YWwgQW5hbHlzaXMNCg0KYGBge3J9DQpwbG90KG1vZDEkcmVzaWR1YWxzIH4gbW9kMSRmaXR0ZWQudmFsdWVzLCBwY2g9MjEsIGNvbD0iYmxhY2siLCBiZz0ic2FsbW9uIiwNCiAgICAgeGxhYj0iRml0dGVkIFZhbHVlIiwgeWxhYj0iUmVzaWR1YWxzIiwgbWFpbj0iUmVzaWR1YWwgUGxvdCIpDQphYmxpbmUoaD0wLCBjb2w9ImRhcmtyZWQiLCBsd2Q9MikNCmBgYA0KDQoNCmBgYHtyfQ0KcmVzIDwtIG1vZDEkcmVzaWR1YWxzDQpwYXIobWZyb3c9YygxLDIpKQ0KaGlzdChyZXMsIGNvbD0nb3JjaGlkJywgYnJlYWtzPTEwKQ0KcXFub3JtKHJlcykNCnFxbGluZShyZXMpDQpwYXIobWZyb3c9YygxLDEpKQ0KYGBgDQoNCiMgQ29uc2lkZXIgRnVsbCBNb2RlbCANCg0KYGBge3J9DQptb2QyIDwtIGxtKEhlaWdodCB+IE1vdGhlciArIEZhdGhlciArIEtpZHMgKyBHZW5kZXIsIGRmKQ0Kc3VtbWFyeShtb2QyKQ0KYGBgDQoNCiMgTW9kZWwgSGVpZ2h0IEFnYWluc3QgTW90aGVyLCBGYXRoZXIsIGFuZCBHZW5kZXINCg0KYGBge3J9DQptb2QzIDwtIGxtKEhlaWdodCB+IE1vdGhlciArIEZhdGhlciArIEdlbmRlciwgZGYpDQpzdW1tYXJ5KG1vZDMpDQpgYGANCg0KIyMgUmVzaWR1YWwgQW5hbHlzaXMNCg0KYGBge3J9DQpwbG90KG1vZDMkcmVzaWR1YWxzIH4gbW9kMSRmaXR0ZWQudmFsdWVzLCBwY2g9MjEsIGNvbD0iYmxhY2siLCBiZz0ic2FsbW9uIiwNCiAgICAgeGxhYj0iRml0dGVkIFZhbHVlIiwgeWxhYj0iUmVzaWR1YWxzIiwgbWFpbj0iUmVzaWR1YWwgUGxvdCIpDQphYmxpbmUoaD0wLCBjb2w9ImRhcmtyZWQiLCBsd2Q9MikNCmBgYA0KDQpgYGB7cn0NCnJlcyA8LSBtb2QzJHJlc2lkdWFscw0KcGFyKG1mcm93PWMoMSwyKSkNCmhpc3QocmVzLCBjb2w9J29yY2hpZCcsIGJyZWFrcz0xMCkNCnFxbm9ybShyZXMpDQpxcWxpbmUocmVzKQ0KcGFyKG1mcm93PWMoMSwxKSkNCmBgYA0KDQojIEdlbmVyYXRlIFByZWRpY3Rpb25zDQoNCmBgYHtyfQ0KbmQgPC0gZGF0YS5mcmFtZShNb3RoZXIgPSBjKDYyLCA2MiwgNzAsIDcyKSwgRmF0aGVyID0gYyg2OCwgNjgsIDc2LCA2NCksIEdlbmRlciA9IGMoJ0YnLCAnTScsICdNJywgJ0YnKSkNCnByZWRpY3QobW9kMywgbmV3ZGF0YT1uZCwgaW50ZXJ2YWwgPSAicHJlZGljdGlvbiIsIGxldmVsID0gMC45NSkNCmBgYA0KDQoNCg0KDQoNCg==