ISLR Home

``````library(ISLR)
library(MASS)``````

# Polynomial Regression and Step Functions

Fitting a linear model using a 4th degree polynomial

``````fit = lm(wage ~ poly(age,4), data=Wage)
summary(fit)``````
``````##
## Call:
## lm(formula = wage ~ poly(age, 4), data = Wage)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -98.707 -24.626  -4.993  15.217 203.693
##
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)    111.7036     0.7287 153.283  < 2e-16 ***
## poly(age, 4)1  447.0679    39.9148  11.201  < 2e-16 ***
## poly(age, 4)2 -478.3158    39.9148 -11.983  < 2e-16 ***
## poly(age, 4)3  125.5217    39.9148   3.145  0.00168 **
## poly(age, 4)4  -77.9112    39.9148  -1.952  0.05104 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared:  0.08626,    Adjusted R-squared:  0.08504
## F-statistic: 70.69 on 4 and 2995 DF,  p-value: < 2.2e-16``````

COMMENT: raw = false This means that each column of poly() is a linear combination of variables

## Coefficients of the 4th degree polynomial

``coef(summary(fit))``
``````##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02``````

Using poly(raw=TRUE)

``````fit2 = lm(wage ~ poly(age, 4, raw=TRUE), data=Wage)
coef(summary(fit2))``````
``````##                                Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)               -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = TRUE)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
## poly(age, 4, raw = TRUE)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = TRUE)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
## poly(age, 4, raw = TRUE)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498``````

## Using I() to fit polynomial

``````fit2a = lm(wage ~ age + I(age^2) + I(age^3) + I(age^4), data=Wage)
coef(fit2a)``````
``````##   (Intercept)           age      I(age^2)      I(age^3)      I(age^4)
## -1.841542e+02  2.124552e+01 -5.638593e-01  6.810688e-03 -3.203830e-05``````

## Using cbind() to fit polynomial

``````fit2b = lm(wage ~ cbind(age, age^2, age^3, age^4), data=Wage)
coef(fit2b)``````
``````##                        (Intercept) cbind(age, age^2, age^3, age^4)age
##                      -1.841542e+02                       2.124552e+01
##    cbind(age, age^2, age^3, age^4)    cbind(age, age^2, age^3, age^4)
##                      -5.638593e-01                       6.810688e-03
##    cbind(age, age^2, age^3, age^4)
##                      -3.203830e-05``````

## Getting the range of ages in Wage

``````agelims = range(Wage\$age)
agelims``````
``## [1] 18 80``
``````## Creating a sequence of values in between agelims
age.grid = seq(from=agelims[1], to=agelims[2])
age.grid``````
``````##  [1] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## [26] 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
## [51] 68 69 70 71 72 73 74 75 76 77 78 79 80``````
``````## Predicting the wages for the ages in age.grid using fit model
preds = predict(fit, newdata=list(age=age.grid), se=TRUE)

# Getting confidence interval for each prediction
se.bands = cbind(preds\$fit+2*preds\$se.fit, preds\$fit-2*preds\$se.fit)

# Plotting the data
par(mfrow=c(1,2), mar=c(4.5, 4.5, 1, 1), oma=c(0,0,4,0))
plot(Wage\$age, Wage\$wage, xlim=agelims, cex=.5, col='darkgrey')
title("Degree-4 Polynomial", outer=TRUE)
## Plotting the best fit line
lines(age.grid, preds\$fit, lwd=2, col="blue")

## Plotting the confidence bands
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)``````

# Predicting wages using poly(raw=TRUE)

``````preds2 = predict(fit2, newdata=list(age=age.grid), se=TRUE)
# Checking to see if there is a difference in using poly(raw=TRUE)
max(abs(preds\$fit-preds2\$fit)) # 7.81597e-11 (no difference)``````
``## [1] 7.81597e-11``

# Testing to see which degree of polynomial is sufficient using ANOVA

## Fitting 5 different poly fits

``````fit.1 = lm (wage ~ age, data=Wage)
fit.2 = lm (wage ~ poly(age, 2), data=Wage)
fit.3 = lm (wage ~ poly(age, 3), data=Wage)
fit.4 = lm (wage ~ poly(age, 4), data=Wage)
fit.5 = lm (wage ~ poly(age, 5), data=Wage)

# Running ANOVA (pg.116)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)``````
``````## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)
## 1   2998 5022216
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 **
## 4   2995 4771604  1      6070   3.8098  0.051046 .
## 5   2994 4770322  1      1283   0.8050  0.369682
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1``````

COMMENTS: P-values in ANOVA that are around 0.05 is desired. ???? - Too low p-values indeicate that it is not sufficient - Too high p-values indeicate that it is not necessary/justified Can use ANOVA for both poly(raw=FALSE) or poly(raw=TRUE)

# p-values using coefs if poly(raw=FALSE) is used

``coef(summary(fit.5))``
``````##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01``````

# Fit model to predict if person receives over 250k or below

``fit = glm(I(wage>250) ~ poly(age, 4), data=Wage, family=binomial)``

# Predicting using Logistic Regression

``preds = predict(fit, newdata=list(age=age.grid), se=TRUE)``

# Inversing the logits to get probabilities

``pfit = exp(preds\$fit)/(1+exp(preds\$fit))``

# Getting the confidence bands of logits

``````se.bands.logit = cbind(preds\$fit+2*preds\$se.fit, preds\$fit-2*preds\$se.fit)

# Getting the confidence bands of probabilities
se.bands = exp(se.bands.logit)/(1+exp(se.bands.logit))

# Predicting using type='response'
preds = predict(fit, newdata=list(age=age.grid), type='response', se=TRUE)

# Plotting
plot(Wage\$age, I(Wage\$wage>250), xlim=agelims, type="n", ylim=c(0,.2))
points(jitter(Wage\$age), I((Wage\$wage>250)/5), cex=0.5, pch="|", col="darkgrey")
lines(age.grid, pfit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)``````