# Load the tab-separated study-time data (the first row holds the column
# names) and print a per-column summary.
df <- read.table(file = "studytime.txt", header = TRUE, sep = "\t")
summary(object = df)
Passed Hours
Min. :0.00 Min. : 50.0
1st Qu.:0.00 1st Qu.:116.2
Median :1.00 Median :189.0
Mean :0.56 Mean :178.2
3rd Qu.:1.00 3rd Qu.:241.5
Max. :1.00 Max. :299.0
# Plot the outcome (Passed) against study time (Hours).
plot(Passed ~ Hours, data = df)
We will convert Passed to a factor variable and then plot the data again.
# Recode the outcome column as a factor, then summarise the data frame again.
df[["Passed"]] <- factor(df[["Passed"]])
summary(object = df)
Passed Hours
0:44 Min. : 50.0
1:56 1st Qu.:116.2
Median :189.0
Mean :178.2
3rd Qu.:241.5
Max. :299.0
# Redraw the plot now that Passed is a factor, then again with the roles of
# the two variables swapped.
plot(Passed ~ Hours, data = df)
plot(Hours ~ Passed, data = df)
# Fit a logistic regression of Passed on Hours (binomial family, logit link).
m1 <- glm(
  formula = Passed ~ Hours,
  family = binomial(link = "logit"),
  data = df
)
# Study times at which to predict: 50 to 300 hours in steps of 50.
# The column must be named Hours so it matches the predictor used by the model.
newdata <- data.frame(Hours = seq(50, 300, by = 50))
# Predict on the link scale, i.e. log(odds), for each study time in newdata,
# then display the result.
logodds <- predict(m1, newdata = newdata, type = "link")
logodds
1 2 3 4 5 6
-2.2020131 -1.2189354 -0.2358576 0.7472202 1.7302980 2.7133758
# Exponentiate the log(odds) predictions to obtain the predicted odds,
# then display them.
odds <- exp(logodds) # Prediction of odds
odds
1 2 3 4 5 6
0.1105803 0.2955447 0.7898932 2.1111234 5.6423351 15.0800968
# Convert odds to probabilities: p = odds / (1 + odds).
odds / (1 + odds) # Prediction of probabilities
1 2 3 4 5 6
0.09956985 0.22812386 0.44130744 0.67857270 0.84945053 0.93781132
# Ask predict() for the response scale directly instead of converting
# log(odds) by hand.
predict(m1, newdata = newdata, type = "response")
1 2 3 4 5 6
0.09956985 0.22812386 0.44130744 0.67857270 0.84945053 0.93781132
# Load the tab-separated admissions data (the first row holds the column
# names), replacing the previous contents of df, and print a summary.
df <- read.table(file = "admissions.txt", header = TRUE, sep = "\t")
summary(object = df)
admit gre gpa rank
Min. :0.0000 Min. :220.0 Min. :2.260 Min. :1.000
1st Qu.:0.0000 1st Qu.:520.0 1st Qu.:3.130 1st Qu.:2.000
Median :0.0000 Median :580.0 Median :3.395 Median :2.000
Mean :0.3175 Mean :587.7 Mean :3.390 Mean :2.485
3rd Qu.:1.0000 3rd Qu.:660.0 3rd Qu.:3.670 3rd Qu.:3.000
Max. :1.0000 Max. :800.0 Max. :4.000 Max. :4.000
# Treat the admission outcome and the rank as categorical variables, then
# summarise again so both columns are reported as level counts.
df$admit <- factor(df$admit)
df$rank <- factor(df$rank)
summary(df)
admit gre gpa rank
0:273 Min. :220.0 Min. :2.260 1: 61
1:127 1st Qu.:520.0 1st Qu.:3.130 2:151
Median :580.0 Median :3.395 3:121
Mean :587.7 Mean :3.390 4: 67
3rd Qu.:660.0 3rd Qu.:3.670
Max. :800.0 Max. :4.000
# Plot each candidate predictor against the admission outcome.
plot(gre ~ admit, data = df)
plot(gpa ~ admit, data = df)
plot(rank ~ admit, data = df)
# Fit a logistic regression of admit on gre, gpa and rank (binomial family,
# logit link), then print the model summary.
m2 <- glm(
  formula = admit ~ gre + gpa + rank,
  family = binomial(link = "logit"),
  data = df
)
summary(object = m2)
Call:
glm(formula = admit ~ gre + gpa + rank, family = binomial(link = "logit"),
data = df)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6268 -0.8662 -0.6388 1.1490 2.0790
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.989979 1.139951 -3.500 0.000465 ***
gre 0.002264 0.001094 2.070 0.038465 *
gpa 0.804038 0.331819 2.423 0.015388 *
rank2 -0.675443 0.316490 -2.134 0.032829 *
rank3 -1.340204 0.345306 -3.881 0.000104 ***
rank4 -1.551464 0.417832 -3.713 0.000205 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 499.98 on 399 degrees of freedom
Residual deviance: 458.52 on 394 degrees of freedom
AIC: 470.52
Number of Fisher Scoring iterations: 4
# Prediction grid: two applicant profiles (gre 720 / gpa 3.2 and
# gre 550 / gpa 3.8), each evaluated at every rank "1" to "4".
# rank is given as character labels that match the levels of the rank factor.
nd <- data.frame(
  gre = rep(c(720, 550), each = 4),
  gpa = rep(c(3.2, 3.8), each = 4),
  rank = rep(c("1", "2", "3", "4"), times = 2)
)
# Predicted admission probabilities (response scale) for each row of nd.
predict(m2, newdata = nd, type = "response")
1 2 3 4 5 6 7 8
0.5531306 0.3864841 0.2447380 0.2078168 0.5770800 0.4098356 0.2631993 0.2243201
# Reproduce the response-scale predictions by hand:
# log(odds) -> odds -> probability.
logodds <- predict(m2, newdata = nd, type = "link")
odds <- exp(logodds)
prob <- odds / (1 + odds)
prob
1 2 3 4 5 6 7 8
0.5531306 0.3864841 0.2447380 0.2078168 0.5770800 0.4098356 0.2631993 0.2243201