Resampling-lab
Published:
This lesson is from An Introduction to Statistical Learning
Validation Set
set.seed(1); sample(x=1:5, size=3) # draw 3 values from 1:5 without replacement
set.seed(1); sample(x=10, size=3)  # a single number x is treated as 1:x, so this samples from 1:10
library(ISLR2)
Auto = ISLR2::Auto
attach(Auto)
print(dim(Auto)) # 392, 9
# Validation Approach
set.seed(1); train <- sample(x=392, size=196) # 392/2
# Fit model on train
lm.fit = lm(mpg ~ horsepower, data=Auto, subset=train)
# Predict on all
mpg_predict = predict(lm.fit, Auto)
# Valid MSE
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 23.27
# Polynomial Regression
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data=Auto, subset=train)
mpg_predict = predict(lm.fit2, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 18.72
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data=Auto, subset=train)
mpg_predict = predict(lm.fit3, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 18.79
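The fit / predict / validation-MSE steps above repeat for every polynomial degree. As a rough convenience sketch (validation_mse is a hypothetical helper name, not from the lab), the pattern can be wrapped in a function:
# Hypothetical helper; assumes Auto from ISLR2 is loaded as above
validation_mse <- function(degree, train) {
  fit <- lm(mpg ~ poly(horsepower, degree), data=Auto, subset=train)
  pred <- predict(fit, Auto)
  mean((Auto$mpg - pred)[-train]^2)
}
set.seed(1); train <- sample(x=392, size=196)
sapply(1:3, validation_mse, train=train) # should reproduce the three validation MSEs above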
# Different Seed
# Validation Approach
set.seed(2); train <- sample(x=392, size=196) # 392/2
# Fit model on train
lm.fit = lm(mpg ~ horsepower, data=Auto, subset=train)
# Linear
mpg_predict = predict(lm.fit, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 25.73
# Degree 2
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data=Auto, subset=train)
mpg_predict = predict(lm.fit2, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 20.43
# Degree 3
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data=Auto, subset=train)
mpg_predict = predict(lm.fit3, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 20.39
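Running the same models with seed 1 and seed 2 already shows that the validation estimate depends on which observations end up in the training set. A small sketch (not part of the original lab) makes that spread visible by repeating the quadratic fit over several random splits:
# Illustrative only: validation MSE of the quadratic fit over 10 random splits
set.seed(3)
mse_by_split <- sapply(1:10, function(k) {
  train <- sample(x=392, size=196)
  fit <- lm(mpg ~ poly(horsepower, 2), data=Auto, subset=train)
  pred <- predict(fit, Auto)
  mean((Auto$mpg - pred)[-train]^2)
})
print(mse_by_split) # the estimates vary noticeably from split to split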
Leave-One-Out Cross-Validation
The LOOCV estimate can be automatically computed for any generalized linear model using the glm() and cv.glm() functions.
cv.glm() is part of the boot library, so load it first with library(boot).
cv.glm() calculates the estimated K-fold cross-validation prediction error for generalized linear models.
In the lab for Chapter 4, we used the glm() function to perform logistic regression by passing in the family = "binomial" argument.
But if we use glm() to fit a model without passing in the family argument, then it performs linear regression, just like the lm() function.
cv.err = cv.glm(Auto, glm.fit)$delta
- cv.glm() results are in delta, a vector of length two.
- The first component is the raw cross-validation estimate of prediction error. The second component is the adjusted cross-validation estimate. The adjustment is designed to compensate for the bias introduced by not using leave-one-out cross-validation. For LOOCV, the two components are essentially identical.
library(ISLR2)
Auto = ISLR2::Auto
attach(Auto)
print(dim(Auto)) # 392, 9
# glm(family=binomial) performs logistic regression
# glm() without a family argument performs linear regression, just like lm()
glm.fit = glm(mpg ~ horsepower, data=Auto)
print(coef(glm.fit)) # 39.9358610 -0.1578447
lm.fit = lm(mpg ~ horsepower, data=Auto)
print(coef(lm.fit)) # 39.9358610 -0.1578447
# We use glm() instead of lm() because glm() works with cv.glm() from the boot library
library(boot)
glm.fit = glm(mpg ~ horsepower, data=Auto)
cv.err = cv.glm(Auto, glm.fit)
print(cv.err$delta) # LOOCV results are in the delta vector
# Repeat the experiment for polynomial degrees 1 through 10
cv.error = rep(0, 10)
for (i in 1:10){
  glm.fit = glm(mpg ~ poly(horsepower, i), data=Auto)
  cv.error[i] = cv.glm(Auto, glm.fit)$delta[1]
}
print(cv.error) # MSE drops sharply from linear to quadratic, with no significant improvement after
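To make explicit what cv.glm() is computing here, the following is a by-hand LOOCV sketch (my own, not from the book): fit the model n times, each time leaving one observation out, and average the squared prediction errors.
# Manual LOOCV sketch; assumes it mirrors the raw estimate in delta[1]
library(ISLR2)
Auto <- ISLR2::Auto
n <- nrow(Auto)
loo_err <- rep(0, n)
for (i in 1:n) {
  fit <- lm(mpg ~ horsepower, data=Auto[-i, ])   # fit on all observations except i
  pred <- predict(fit, newdata=Auto[i, ])        # predict the held-out observation
  loo_err[i] <- (Auto$mpg[i] - pred)^2
}
mean(loo_err) # should closely match cv.glm(Auto, glm.fit)$delta[1]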
k-Fold Cross-Validation
library(ISLR2)
library(boot)
Auto = ISLR2::Auto
attach(Auto)
print(dim(Auto)) # 392, 9
# CV with K=10
cv.error.10 <- rep(0, 10)
for (i in 1:10){
  glm.fit <- glm(mpg ~ poly(horsepower, i), data=Auto)
  set.seed(1); cv.err = cv.glm(Auto, glm.fit, K=10) # same seed each iteration, so every degree uses the same 10 folds
  cv.error.10[i] = cv.err$delta[1]
}
print(cv.error.10)
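For intuition, here is a rough by-hand version of 10-fold CV for the quadratic fit (my own sketch; the fold assignment below is an assumption, cv.glm() handles this bookkeeping internally):
# Manual 10-fold CV sketch, illustrative only
library(ISLR2)
Auto <- ISLR2::Auto
set.seed(1)
K <- 10
folds <- sample(rep(1:K, length.out=nrow(Auto))) # randomly assign each row to one of K folds
fold_mse <- rep(0, K)
for (k in 1:K) {
  fit <- lm(mpg ~ poly(horsepower, 2), data=Auto[folds != k, ]) # train on the other K-1 folds
  pred <- predict(fit, newdata=Auto[folds == k, ])              # predict the held-out fold
  fold_mse[k] <- mean((Auto$mpg[folds == k] - pred)^2)
}
mean(fold_mse) # comparable to cv.error.10[2] above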