# Resampling-lab

Published:

This lesson is from An Introduction to Statistical Learning

# Validation Set

# set.seed() makes sample() reproducible; sample(x = 1:5) and sample(x = 5)
# are equivalent forms (a single integer n means "sample from 1:n").
set.seed(1); sample(x = 1:5, size = 3)
set.seed(1); sample(x = 10, size = 3)

library(ISLR2)
Auto <- ISLR2::Auto
# NOTE(review): attach() is generally discouraged (risk of masking names);
# kept here because the rest of this script references `mpg` and
# `horsepower` unqualified.
attach(Auto)

print(dim(Auto)) # 392, 9

# Validation-set approach: randomly split the 392 observations in half.
set.seed(1); train <- sample(x = 392, size = 196) # 392/2

# Fit the linear model on the training half only.
lm.fit <- lm(mpg ~ horsepower, data = Auto, subset = train)

# Predict on the full data set ...
mpg_predict <- predict(lm.fit, Auto)

# ... then compute the MSE on the held-out (validation) half only.
error_valid <- (mpg - mpg_predict)[-train]
mse_valid <- mean(error_valid^2)
print(mse_valid) # 23.27

# Polynomial regression: the quadratic term lowers the validation MSE
# substantially; the cubic term adds essentially nothing (18.72 vs 18.79).
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train)
mpg_predict <- predict(lm.fit2, Auto)
error_valid <- (mpg - mpg_predict)[-train]
mse_valid <- mean(error_valid^2)
print(mse_valid) # 18.72

lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train)
mpg_predict <- predict(lm.fit3, Auto)
error_valid <- (mpg - mpg_predict)[-train]
mse_valid <- mean(error_valid^2)
print(mse_valid) # 18.79

# Different seed: repeating the split with another seed gives different
# validation MSEs, illustrating the variability of the validation-set
# approach.
set.seed(2); train <- sample(x = 392, size = 196) # 392/2

# Fit model on the new training half.
lm.fit <- lm(mpg ~ horsepower, data = Auto, subset = train)

# Linear fit: predict on all, score on the validation half.
mpg_predict <- predict(lm.fit, Auto)
error_valid <- (mpg - mpg_predict)[-train]
mse_valid <- mean(error_valid^2)
print(mse_valid) # 25.73

# Degree 2: again a large drop from the linear fit.
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train)
mpg_predict <- predict(lm.fit2, Auto)
error_valid <- (mpg - mpg_predict)[-train]
mse_valid <- mean(error_valid^2)
print(mse_valid) # 20.43

# Degree 3: no meaningful improvement over degree 2.
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train)
mpg_predict <- predict(lm.fit3, Auto)
error_valid <- (mpg - mpg_predict)[-train]
mse_valid <- mean(error_valid^2)
print(mse_valid) # 20.39


# Leave-One-Out Cross-Validation

• The LOOCV estimate can be automatically computed for any generalized linear model using the glm() and cv.glm() functions.

• cv.glm() is provided by the boot package, so load it with library(boot).

• cv.glm() calculates the estimated K-fold cross-validation prediction error for generalized linear models

• In the lab for Chapter 4, we used the glm() function to perform logistic regression by passing in the family = "binomial" argument.

• But if we use glm() to fit a model without passing in the family argument, then it performs linear regression, just like the lm() function.

# cv.glm() returns its results in `delta`, a numeric vector of length two:
#   [1] the raw cross-validation estimate of prediction error,
#   [2] the adjusted estimate, which compensates for the bias introduced
#       by not using leave-one-out cross-validation.
# (For LOOCV the two components are essentially identical.)

library(ISLR2)
Auto <- ISLR2::Auto
attach(Auto)

print(dim(Auto)) # 392, 9

# glm(family = binomial) is logistic regression;
# glm() with no family argument performs linear regression, just like lm():
# the two fits below have identical coefficients.
glm.fit <- glm(mpg ~ horsepower, data = Auto)
print(coef(glm.fit)) # 39.9358610 -0.1578447

lm.fit <- lm(mpg ~ horsepower, data = Auto)
print(coef(lm.fit)) # 39.9358610 -0.1578447

# Use glm() instead of lm() because the fit can then be passed to
# boot::cv.glm().
library(boot)
glm.fit <- glm(mpg ~ horsepower, data = Auto)
cv.err <- cv.glm(Auto, glm.fit) # K defaults to n, i.e. LOOCV
print(cv.err$delta) # CV results are in the delta vector (LOOCV estimate)

# LOOCV error for polynomial degrees 1..10: the MSE drops sharply from
# the linear to the quadratic fit, then shows no significant improvement.
cv.error <- rep(0, 10)
for (i in 1:10) {
  glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error[i] <- cv.glm(Auto, glm.fit)$delta[1]
}
print(cv.error) # drop in MSE but not significant after quadratic

# k-Fold Cross-Validation
library(ISLR2)
Auto <- ISLR2::Auto
attach(Auto)
print(dim(Auto)) # 392, 9

# CV with K = 10 for polynomial degrees 1..10.
cv.error.10 <- rep(0, 10)
for (i in 1:10) {
  glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  # NOTE(review): the seed is reset inside the loop, so every degree is
  # evaluated on the same random fold assignment; kept as in the original.
  set.seed(1); cv.err <- cv.glm(Auto, glm.fit, K = 10)
  cv.error.10[i] <- cv.err$delta[1]
}
print(cv.error.10)


Tags: