Load and Explore the Data

We begin by reading in the data and looking at a summary of it.

df <- read.table("diamonds.txt", sep="\t", header=TRUE)
summary(df)
     carat               cut        color        clarity          depth           table           price
 Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00   Min.   :43.00   Min.   :  326
 1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950
 Median :0.7000   Ideal    :21551   F: 9542   SI2    : 9194   Median :61.80   Median :57.00   Median : 2401
 Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75   Mean   :57.46   Mean   : 3933
 3rd Qu.:1.0400   Very Good:12082   H: 8304   VVS2   : 5066   3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324
 Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00   Max.   :95.00   Max.   :18823
                                    J: 2808   (Other): 2531
       x                y                z
 Min.   : 0.000   Min.   : 0.000   Min.   : 0.000
 1st Qu.: 4.710   1st Qu.: 4.720   1st Qu.: 2.910
 Median : 5.700   Median : 5.710   Median : 3.530
 Mean   : 5.731   Mean   : 5.735   Mean   : 3.539
 3rd Qu.: 6.540   3rd Qu.: 6.540   3rd Qu.: 4.040
 Max.   :10.740   Max.   :58.900   Max.   :31.800
                                                   

Scatterplots

plot(price ~ carat, df, col=rgb(0.1,0.6,0.6,0.6), cex=0.5)

plot(log(price) ~ carat, df, col=rgb(0.1,0.6,0.6,0.6), cex=0.5)

plot(log(price) ~ log(carat), df, col=rgb(0.1,0.6,0.6,0.6), cex=0.5)

Log-Log Model

model <- lm(log(price) ~ log(carat), df)
summary(model)

Call:
lm(formula = log(price) ~ log(carat), data = df)

Residuals:
     Min       1Q   Median       3Q      Max
-1.50833 -0.16951 -0.00591  0.16637  1.33793

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.448661   0.001365  6190.9   <2e-16 ***
log(carat)  1.675817   0.001934   866.6   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2627 on 53938 degrees of freedom
Multiple R-squared:  0.933, Adjusted R-squared:  0.933
F-statistic: 7.51e+05 on 1 and 53938 DF,  p-value: < 2.2e-16

Residual Analysis

plot(model$residuals ~ df$carat, col=rgb(0.1,0.6,0.6,0.6), cex=0.5,
     xlab="Carat Size", ylab="Residuals", main="Residual Plot")
abline(h=0, col="darkred", lwd=2)

res <- model$residuals
par(mfrow=c(1,2))
hist(res, col='orchid')
qqnorm(res)
qqline(res)
par(mfrow=c(1,1))

Generating Predictions with the Transformed Model

nd <- data.frame(carat = c(0.5, 1, 1.5, 2))
ln_yhat <- predict(model, newdata=nd, interval="prediction", level=0.95)
yhat <- exp(ln_yhat)
yhat
        fit       lwr       upr
1  1461.287  873.2777  2445.225
2  4668.816 2790.1212  7812.507
3  9210.929 5504.4865 15413.101
4 14916.875 8914.3144 24961.330
LS0tDQp0aXRsZTogIkxlc3NvbiAxNSAtIExvZ2FyaXRobWljIFRyYW5zZm9ybWF0aW9ucyAoRGlhbW9uZCBEYXRhKSINCmF1dGhvcjogIlJvYmJpZSBCZWFuZSINCm91dHB1dDoNCiAgaHRtbF9ub3RlYm9vazoNCiAgICB0aGVtZTogZmxhdGx5DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZGVwdGg6IDINCi0tLQ0KDQojIExvYWQgYW5kIEV4cGxvcmUgdGhlIERhdGENCg0KV2UgYmVnaW4gYnkgcmVhZGluZyBpbiB0aGUgZGF0YSBhbmQgbG9va2luZyBhdCBhIHN1bW1hcnkgb2YgaXQuIA0KDQpgYGB7cn0NCmRmIDwtIHJlYWQudGFibGUoImRpYW1vbmRzLnR4dCIsIHNlcD0iXHQiLCBoZWFkZXI9VFJVRSkNCnN1bW1hcnkoZGYpDQpgYGANCg0KIyBTY2F0dGVycGxvdHMNCg0KYGBge3J9DQpwbG90KHByaWNlIH4gY2FyYXQsIGRmLCBjb2w9cmdiKDAuMSwwLjYsMC42LDAuNiksIGNleD0wLjUpDQpgYGANCg0KYGBge3J9DQpwbG90KGxvZyhwcmljZSkgfiBjYXJhdCwgZGYsIGNvbD1yZ2IoMC4xLDAuNiwwLjYsMC42KSwgY2V4PTAuNSkNCmBgYA0KDQpgYGB7cn0NCnBsb3QobG9nKHByaWNlKSB+IGxvZyhjYXJhdCksIGRmLCBjb2w9cmdiKDAuMSwwLjYsMC42LDAuNiksIGNleD0wLjUpDQpgYGANCg0KIyBMb2ctTG9nIE1vZGVsDQoNCmBgYHtyfQ0KbW9kZWwgPC0gbG0obG9nKHByaWNlKSB+IGxvZyhjYXJhdCksIGRmKQ0Kc3VtbWFyeShtb2RlbCkNCmBgYA0KDQojIFJlc2lkdWFsIEFuYWx5c2lzDQoNCmBgYHtyfQ0KcGxvdChtb2RlbCRyZXNpZHVhbHMgfiBkZiRjYXJhdCwgY29sPXJnYigwLjEsMC42LDAuNiwwLjYpLCBjZXg9MC41LCANCiAgICAgeGxhYj0iQ2FyYXQgU2l6ZSIsIHlsYWI9IlJlc2lkdWFscyIsIG1haW49IlJlc2lkdWFsIFBsb3QiKQ0KDQphYmxpbmUoaD0wLCBjb2w9ImRhcmtyZWQiLCBsd2Q9MikNCmBgYA0KDQoNCmBgYHtyfQ0KcmVzIDwtIG1vZGVsJHJlc2lkdWFscw0KcGFyKG1mcm93PWMoMSwyKSkNCmhpc3QocmVzLCBjb2w9J29yY2hpZCcpDQpxcW5vcm0ocmVzKQ0KcXFsaW5lKHJlcykNCnBhcihtZnJvdz1jKDEsMSkpDQpgYGANCg0KDQojIEdlbmVyYXRpbmcgUHJlZGljdGlvbnMgd2l0aCB0aGUgVHJhbnNmb3JtZWQgTW9kZWwgDQoNCmBgYHtyfQ0KbmQgPC0gZGF0YS5mcmFtZShjYXJhdCA9IGMoMC41LCAxLCAxLjUsIDIpKQ0KbG5feWhhdCA8LSBwcmVkaWN0KG1vZGVsLCBuZXdkYXRhPW5kLCBpbnRlcnZhbD0icHJlZGljdGlvbiIsIGxldmVsPTAuOTUpDQp5aGF0IDwtIGV4cChsbl95aGF0KQ0KeWhhdA0KYGBgDQoNCg0KDQoNCg0KDQoNCg==