Resampling Lab

3 minute read

This lesson is from the Chapter 5 (Resampling Methods) lab in An Introduction to Statistical Learning.

Validation Set

# sample() demo: a scalar x is shorthand for the vector 1:x
set.seed(1); sample(x=1:5, size=3) # 3 values drawn from 1:5
set.seed(1); sample(x=10, size=3)  # 3 values drawn from 1:10
library(ISLR2)
Auto = ISLR2::Auto
attach(Auto)

print(dim(Auto)) # 392, 9

# Validation Approach
set.seed(1); train <- sample(x=392, size=196) # 392/2

# Fit model on train
lm.fit = lm(mpg ~ horsepower, data=Auto, subset=train)

# Predict on all observations
mpg_predict = predict(lm.fit, Auto)

# Validation MSE: mean squared error on the held-out half
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 23.27

# Polynomial Regression
# Quadratic (degree 2)
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data=Auto, subset=train)
mpg_predict = predict(lm.fit2, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 18.72

# Cubic (degree 3)
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data=Auto, subset=train)
mpg_predict = predict(lm.fit3, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 18.79
# Different Seed
# A new random split of the same data gives different error estimates,
# illustrating the variability of the validation-set approach
set.seed(2); train <- sample(x=392, size=196) # 392/2

# Fit model on train
lm.fit = lm(mpg ~ horsepower, data=Auto, subset=train)

# Linear
mpg_predict = predict(lm.fit, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 25.73

# Degree 2
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data=Auto, subset=train)
mpg_predict = predict(lm.fit2, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 20.43

# Degree 3
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data=Auto, subset=train)
mpg_predict = predict(lm.fit3, Auto)
error_valid = (mpg - mpg_predict)[-train]
mse_valid = mean(error_valid^2)
print(mse_valid) # 20.39
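
The two seeds above give noticeably different MSE estimates for the same models. A short sketch (my addition, not from the book's lab) that repeats the split for several seeds makes this variability visible; the helper name val_mse is hypothetical:

# Validation-set MSE for polynomial degrees 1-3 across several random splits
val_mse <- function(seed, degree) {
  set.seed(seed); train <- sample(392, 196)
  fit <- lm(mpg ~ poly(horsepower, degree), data = Auto, subset = train)
  mean((Auto$mpg - predict(fit, Auto))[-train]^2)
}
for (s in 1:5) print(sapply(1:3, function(d) val_mse(s, d)))

Each printed row is one random split; the estimates move around from split to split, which is exactly the drawback of the validation-set approach that cross-validation addresses.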

Leave-One-Out Cross-Validation

  • The LOOCV estimate can be automatically computed for any generalized linear model using the glm() and cv.glm() functions.

    • cv.glm() comes from the boot package: library(boot)

    • cv.glm() calculates the estimated K-fold cross-validation prediction error for generalized linear models; by default K equals the number of observations, which gives LOOCV

  • In the lab for Chapter 4, we used the glm() function to perform logistic regression by passing in the family = "binomial" argument.

  • But if we use glm() to fit a model without passing in the family argument, then it performs linear regression, just like the lm() function.

  • cv.err = cv.glm(Auto, glm.fit)$delta

    • cv.glm() returns its results in delta, a vector of length two.
    • The first component is the raw cross-validation estimate of prediction error; the second is an adjusted estimate, designed to compensate for the bias introduced by not using leave-one-out cross-validation. (A manual LOOCV sketch follows the code below.)
      library(ISLR2)
      Auto = ISLR2::Auto
      attach(Auto)
      print(dim(Auto)) # 392, 9
        
      # glm(..., family=binomial) performs logistic regression;
      # glm() without a family argument performs linear regression, like lm()
      glm.fit = glm(mpg ~ horsepower, data=Auto)
      print(coef(glm.fit)) # 39.9358610 -0.1578447
        
      lm.fit = lm(mpg ~ horsepower, data=Auto)
      print(coef(lm.fit)) # 39.9358610 -0.1578447
        
      # Use glm() rather than lm() so the fit can be passed to boot::cv.glm()
      library(boot)
      glm.fit = glm(mpg ~ horsepower, data=Auto) # same fit as lm() above
      cv.err = cv.glm(Auto, glm.fit) # default K = n, i.e. LOOCV
      print(cv.err$delta) # both components nearly equal for LOOCV
        
      # Fit polynomials of degree 1 through 10, computing the LOOCV error for each
      cv.error = rep(0, 10)
      for (i in 1:10){
        glm.fit = glm(mpg ~ poly(horsepower, i), data=Auto)
        cv.error[i] = cv.glm(Auto, glm.fit)$delta[1]
      }
      print(cv.error) # sharp drop in MSE from linear to quadratic, little improvement after
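
To see what cv.glm() is doing, here is a minimal manual LOOCV sketch (my addition, not part of the book's lab): hold out one observation at a time, refit, and average the squared prediction errors. For least squares the same quantity can also be computed from a single fit using the leverages, so no refitting is needed.

# Manual LOOCV: hold out each observation in turn, refit, average squared errors
n = nrow(Auto)
loocv_err = rep(0, n)
for (i in 1:n) {
  fit = lm(mpg ~ horsepower, data = Auto[-i, ])
  loocv_err[i] = (Auto$mpg[i] - predict(fit, Auto[i, ]))^2
}
print(mean(loocv_err)) # should match the first component of cv.err$delta

# Least-squares shortcut: the same LOOCV error from one fit, via leverages
fit = lm(mpg ~ horsepower, data = Auto)
print(mean(((Auto$mpg - fitted(fit)) / (1 - hatvalues(fit)))^2))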

k-Fold Cross-Validation

library(ISLR2)
library(boot) # for cv.glm()
Auto = ISLR2::Auto
attach(Auto)
print(dim(Auto)) # 392, 9

# CV with K=10, for polynomial degrees 1 to 10
cv.error.10 <- rep(0, 10)
for (i in 1:10){
  glm.fit <- glm(mpg ~ poly(horsepower, i), data=Auto)
  # resetting the seed each iteration gives every degree the same fold split
  set.seed(1); cv.err = cv.glm(Auto, glm.fit, K=10)
  cv.error.10[i] = cv.err$delta[1]
}
print(cv.error.10)
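
As a quick follow-up (my addition, not part of the original lab output), plotting the ten errors makes the elbow at the quadratic term easy to see:

# Plot 10-fold CV error against polynomial degree
plot(1:10, cv.error.10, type = "b",
     xlab = "Degree of polynomial", ylab = "10-fold CV error (MSE)")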