ISLR Home

library(ISLR)
library(MASS)

Polynomial Regression and Step Functions

Fitting a linear model using a 4th degree polynomial

fit = lm(wage ~ poly(age,4), data=Wage)
summary(fit)
## 
## Call:
## lm(formula = wage ~ poly(age, 4), data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.707 -24.626  -4.993  15.217 203.693 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    111.7036     0.7287 153.283  < 2e-16 ***
## poly(age, 4)1  447.0679    39.9148  11.201  < 2e-16 ***
## poly(age, 4)2 -478.3158    39.9148 -11.983  < 2e-16 ***
## poly(age, 4)3  125.5217    39.9148   3.145  0.00168 ** 
## poly(age, 4)4  -77.9112    39.9148  -1.952  0.05104 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared:  0.08626,    Adjusted R-squared:  0.08504 
## F-statistic: 70.69 on 4 and 2995 DF,  p-value: < 2.2e-16

COMMENT: raw = false This means that each column of poly() is a linear combination of variables

Coefficients of the 4th degree polynomial

coef(summary(fit))
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02

Using poly(raw=TRUE)

fit2 = lm(wage ~ poly(age, 4, raw=TRUE), data=Wage)
coef(summary(fit2))
##                                Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)               -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = TRUE)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
## poly(age, 4, raw = TRUE)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = TRUE)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
## poly(age, 4, raw = TRUE)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498

Using I() to fit polynomial

fit2a = lm(wage ~ age + I(age^2) + I(age^3) + I(age^4), data=Wage)
coef(fit2a)
##   (Intercept)           age      I(age^2)      I(age^3)      I(age^4) 
## -1.841542e+02  2.124552e+01 -5.638593e-01  6.810688e-03 -3.203830e-05

Using cbind() to fit polynomial

fit2b = lm(wage ~ cbind(age, age^2, age^3, age^4), data=Wage)
coef(fit2b)
##                        (Intercept) cbind(age, age^2, age^3, age^4)age 
##                      -1.841542e+02                       2.124552e+01 
##    cbind(age, age^2, age^3, age^4)    cbind(age, age^2, age^3, age^4) 
##                      -5.638593e-01                       6.810688e-03 
##    cbind(age, age^2, age^3, age^4) 
##                      -3.203830e-05

Getting the range of ages in Wage

agelims = range(Wage$age)
agelims
## [1] 18 80
## Creating a sequence of values in between agelims
age.grid = seq(from=agelims[1], to=agelims[2])
age.grid
##  [1] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## [26] 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
## [51] 68 69 70 71 72 73 74 75 76 77 78 79 80
## Predicting the wages for the ages in age.grid using fit model
preds = predict(fit, newdata=list(age=age.grid), se=TRUE)

# Getting confidence interval for each prediction
se.bands = cbind(preds$fit+2*preds$se.fit, preds$fit-2*preds$se.fit)

# Plotting the data
par(mfrow=c(1,2), mar=c(4.5, 4.5, 1, 1), oma=c(0,0,4,0))
plot(Wage$age, Wage$wage, xlim=agelims, cex=.5, col='darkgrey')
title("Degree-4 Polynomial", outer=TRUE)
## Plotting the best fit line
lines(age.grid, preds$fit, lwd=2, col="blue")

## Plotting the confidence bands
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

Predicting wages using poly(raw=TRUE)

preds2 = predict(fit2, newdata=list(age=age.grid), se=TRUE)
# Checking to see if there is a difference in using poly(raw=TRUE)
max(abs(preds$fit-preds2$fit)) # 7.81597e-11 (no difference)
## [1] 7.81597e-11

Testing to see which degree of polynomial is sufficient using ANOVA

Fitting 5 different poly fits

fit.1 = lm (wage ~ age, data=Wage)
fit.2 = lm (wage ~ poly(age, 2), data=Wage)
fit.3 = lm (wage ~ poly(age, 3), data=Wage)
fit.4 = lm (wage ~ poly(age, 4), data=Wage)
fit.5 = lm (wage ~ poly(age, 5), data=Wage)

# Running ANOVA (pg.116)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

COMMENTS: P-values in ANOVA that are around 0.05 is desired. ???? - Too low p-values indeicate that it is not sufficient - Too high p-values indeicate that it is not necessary/justified Can use ANOVA for both poly(raw=FALSE) or poly(raw=TRUE)

p-values using coefs if poly(raw=FALSE) is used

coef(summary(fit.5))
##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01

Fit model to predict if person receives over 250k or below

fit = glm(I(wage>250) ~ poly(age, 4), data=Wage, family=binomial)

Predicting using Logistic Regression

preds = predict(fit, newdata=list(age=age.grid), se=TRUE)

Inversing the logits to get probabilities

pfit = exp(preds$fit)/(1+exp(preds$fit))

Getting the confidence bands of logits

se.bands.logit = cbind(preds$fit+2*preds$se.fit, preds$fit-2*preds$se.fit)

# Getting the confidence bands of probabilities
se.bands = exp(se.bands.logit)/(1+exp(se.bands.logit))

# Predicting using type='response'
preds = predict(fit, newdata=list(age=age.grid), type='response', se=TRUE)

# Plotting
plot(Wage$age, I(Wage$wage>250), xlim=agelims, type="n", ylim=c(0,.2))
points(jitter(Wage$age), I((Wage$wage>250)/5), cex=0.5, pch="|", col="darkgrey")
lines(age.grid, pfit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)