Polynomial Regression and Step Functions
Predicting wages using poly(raw=TRUE)
Testing to see which degree of polynomial is sufficient using ANOVA
- Fitting 5 different poly fits
p-values using coefs if poly(raw=FALSE) is used
Fit model to predict if person receives over 250k or below
Predicting using Logistic Regression
Inversing the logits to get probabilities
Getting the confidence bands of logits
Using cut function to apply step function
Splines
Natural Spline
Local Regression
GAMs
References

library(ISLR)
library(MASS)

Polynomial Regression and Step Functions

Fitting a linear model using a 4th degree polynomial

fit = lm(wage ~ poly(age,4), data=Wage)
summary(fit)

## 
## Call:
## lm(formula = wage ~ poly(age, 4), data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.707 -24.626  -4.993  15.217 203.693 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    111.7036     0.7287 153.283  < 2e-16 ***
## poly(age, 4)1  447.0679    39.9148  11.201  < 2e-16 ***
## poly(age, 4)2 -478.3158    39.9148 -11.983  < 2e-16 ***
## poly(age, 4)3  125.5217    39.9148   3.145  0.00168 ** 
## poly(age, 4)4  -77.9112    39.9148  -1.952  0.05104 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared:  0.08626,    Adjusted R-squared:  0.08504 
## F-statistic: 70.69 on 4 and 2995 DF,  p-value: < 2.2e-16

COMMENT: raw = false This means that each column of poly() is a linear combination of variables

Coefficients of the 4th degree polynomial

coef(summary(fit))

##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02

Using poly(raw=TRUE)

fit2 = lm(wage ~ poly(age, 4, raw=TRUE), data=Wage)
coef(summary(fit2))

##                                Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)               -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = TRUE)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
## poly(age, 4, raw = TRUE)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = TRUE)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
## poly(age, 4, raw = TRUE)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498

Using I() to fit polynomial

fit2a = lm(wage ~ age + I(age^2) + I(age^3) + I(age^4), data=Wage)
coef(fit2a)

##   (Intercept)           age      I(age^2)      I(age^3)      I(age^4) 
## -1.841542e+02  2.124552e+01 -5.638593e-01  6.810688e-03 -3.203830e-05

Using cbind() to fit polynomial

fit2b = lm(wage ~ cbind(age, age^2, age^3, age^4), data=Wage)
coef(fit2b)

##                        (Intercept) cbind(age, age^2, age^3, age^4)age 
##                      -1.841542e+02                       2.124552e+01 
##    cbind(age, age^2, age^3, age^4)    cbind(age, age^2, age^3, age^4) 
##                      -5.638593e-01                       6.810688e-03 
##    cbind(age, age^2, age^3, age^4) 
##                      -3.203830e-05

Getting the range of ages in Wage

agelims = range(Wage$age)
agelims

## [1] 18 80

## Creating a sequence of values in between agelims
age.grid = seq(from=agelims[1], to=agelims[2])
age.grid

##  [1] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## [26] 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
## [51] 68 69 70 71 72 73 74 75 76 77 78 79 80

## Predicting the wages for the ages in age.grid using fit model
preds = predict(fit, newdata=list(age=age.grid), se=TRUE)

# Getting confidence interval for each prediction
se.bands = cbind(preds$fit+2*preds$se.fit, preds$fit-2*preds$se.fit)

# Plotting the data
par(mfrow=c(1,2), mar=c(4.5, 4.5, 1, 1), oma=c(0,0,4,0))
plot(Wage$age, Wage$wage, xlim=agelims, cex=.5, col='darkgrey')
title("Degree-4 Polynomial", outer=TRUE)
## Plotting the best fit line
lines(age.grid, preds$fit, lwd=2, col="blue")

## Plotting the confidence bands
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

Predicting wages using poly(raw=TRUE)

preds2 = predict(fit2, newdata=list(age=age.grid), se=TRUE)
# Checking to see if there is a difference in using poly(raw=TRUE)
max(abs(preds$fit-preds2$fit)) # 7.81597e-11 (no difference)

## [1] 7.81597e-11

Testing to see which degree of polynomial is sufficient using ANOVA

Fitting 5 different poly fits

fit.1 = lm (wage ~ age, data=Wage)
fit.2 = lm (wage ~ poly(age, 2), data=Wage)
fit.3 = lm (wage ~ poly(age, 3), data=Wage)
fit.4 = lm (wage ~ poly(age, 4), data=Wage)
fit.5 = lm (wage ~ poly(age, 5), data=Wage)

# Running ANOVA (pg.116)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)

## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

COMMENTS: P-values in ANOVA that are around 0.05 is desired. ???? - Too low p-values indeicate that it is not sufficient - Too high p-values indeicate that it is not necessary/justified Can use ANOVA for both poly(raw=FALSE) or poly(raw=TRUE)

p-values using coefs if poly(raw=FALSE) is used

coef(summary(fit.5))

##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01

Fit model to predict if person receives over 250k or below

fit = glm(I(wage>250) ~ poly(age, 4), data=Wage, family=binomial)

Predicting using Logistic Regression

preds = predict(fit, newdata=list(age=age.grid), se=TRUE)

Inversing the logits to get probabilities

pfit = exp(preds$fit)/(1+exp(preds$fit))

Getting the confidence bands of logits

se.bands.logit = cbind(preds$fit+2*preds$se.fit, preds$fit-2*preds$se.fit)

# Getting the confidence bands of probabilities
se.bands = exp(se.bands.logit)/(1+exp(se.bands.logit))

# Predicting using type='response'
preds = predict(fit, newdata=list(age=age.grid), type='response', se=TRUE)

# Plotting
plot(Wage$age, I(Wage$wage>250), xlim=agelims, type="n", ylim=c(0,.2))
points(jitter(Wage$age), I((Wage$wage>250)/5), cex=0.5, pch="|", col="darkgrey")
lines(age.grid, pfit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

Using cut function to apply step function

## Cutting age into 4 bins
table(cut(Wage$age,4))

## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72

Fitting a linear model on each bin - Step function

fit = lm(wage ~ cut(age,4), data=Wage)
coef(summary(fit))

##                         Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)            94.158392   1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49]   24.053491   1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5]   23.664559   2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1]  7.640592   4.987424  1.531972 1.256350e-01

Splines

p293

library(splines)

Basic functions B-spline or basis spline

fit = lm(wage ~ bs(age, knots=c(25,40,60)), data = Wage)
pred = predict(fit, newdata=list(age=age.grid), se=T)

plot(Wage$age,Wage$wage, col="gray")

lines(age.grid, pred$fit, lwd=2)
lines(age.grid, pred$fit + 2 * pred$se , lty="dashed")
lines(age.grid, pred$fit - 2 * pred$se , lty="dashed")

dim(bs(Wage$age, knots = c(25,40,60)))

## [1] 3000    6

#bs(Wage$age, knots = c(25,40,60))
dim(bs(Wage$age, df=6)) # 7 Degrees of freedom, 6 basis functions

## [1] 3000    6

attr(bs(Wage$age, df=6), "knots")

##   25%   50%   75% 
## 33.75 42.00 51.00

Natural Spline

fit2=lm(wage~ns(age,df=4), data = Wage)

pred2=predict(fit2, newdata=list(age=age.grid), se=T)

#lines(age.grid, pred2$fit, col="red", lwd=2) # FIXME

plot(Wage$age, Wage$wage, xlim = agelims, cex=.5, col="darkgrey")
title("Smoothing Spline")
fit= smooth.spline(Wage$age, Wage$wage, df=16)
fit2=smooth.spline(Wage$age, Wage$wage, cv = TRUE)

## Warning in smooth.spline(Wage$age, Wage$wage, cv = TRUE): cross-validation with
## non-unique 'x' values seems doubtful

#fit2$df

lines(fit, col = "red", lwd=2)
lines(fit2, col = "blue", lwd=2)
legend("topright", legend=c("16 DF", "6.8 DF"),
       col=c("red", "blue"), lty=1, lwd = 2, cex=.8)

Local Regression

loess() - local regression fitting

attach(Wage)
plot(age, wage, xlim = agelims, cex=.5, col="darkgrey")
title("Local Regression")

fit=loess(wage~age, span=.2,data=Wage)
fit2=loess(wage~age, span=.5,data=Wage)

lines(age.grid, predict(fit,data.frame(age=age.grid)),
      col="red", lwd=2)
lines(age.grid, predict(fit2,data.frame(age=age.grid)),
      col="blue", lwd=2)
legend("topright", legend=c("Span=0.2", "Span=0.5"),
       col=c("red", "blue"), lty=1, lwd = 2, cex=.8)

GAMs

p294

gam1=lm(wage~ns(year,4)+ns(age,5) + education, data=Wage)
#plot.gam(gam1, se=TRUE, col="red")

The s() function, which is part of the gam library, is used to indicate that s() we would like to use a smoothing spline.

library(gam)

## Loading required package: foreach

## Loaded gam 1.20

gam.m3=gam(wage~s(year,4) + s(age,5) + education, data=Wage)
par(mfrow=c(1,3))
plot(gam.m3, se=TRUE,col="blue")

library(gam)
plot.Gam(gam1, se=TRUE, col="red")

gam.m1=gam(wage~s(age,5) + education, data=Wage)
gam.m2=gam(wage~year+s(age,5) + education, data=Wage)
anova(gam.m1,gam.m2,gam.m3,test="F")

## Analysis of Deviance Table
## 
## Model 1: wage ~ s(age, 5) + education
## Model 2: wage ~ year + s(age, 5) + education
## Model 3: wage ~ s(year, 4) + s(age, 5) + education
##   Resid. Df Resid. Dev Df Deviance       F    Pr(>F)    
## 1      2990    3711731                                  
## 2      2989    3693842  1  17889.2 14.4771 0.0001447 ***
## 3      2986    3689770  3   4071.1  1.0982 0.3485661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(gam.m3)

## 
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -119.43  -19.70   -3.33   14.17  213.48 
## 
## (Dispersion Parameter for gaussian family taken to be 1235.69)
## 
##     Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##              Df  Sum Sq Mean Sq F value    Pr(>F)    
## s(year, 4)    1   27162   27162  21.981 2.877e-06 ***
## s(age, 5)     1  195338  195338 158.081 < 2.2e-16 ***
## education     4 1069726  267432 216.423 < 2.2e-16 ***
## Residuals  2986 3689770    1236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df Npar F  Pr(F)    
## (Intercept)                          
## s(year, 4)        3  1.086 0.3537    
## s(age, 5)         4 32.380 <2e-16 ***
## education                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The large p-value for year reinforces our conclusion from the ANOVA test that a linear function is adequate for this term. However, there is very clear evidence that a non-linear term is required for age.

preds=predict(gam.m2,newdata=Wage)

gam.lo=gam(wage~s(year,df=4)+lo(age,span=0.7)+education, data=Wage)
#plot.Gam(gam.lo, se=TRUE, col="green")

gam.lo.i=gam(wage~lo(year,age,span=0.5)+education,data=Wage)

## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : liv
## too small. (Discovered by lowesd)

## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)

## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : liv
## too small. (Discovered by lowesd)

## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)

library(akima)
plot(gam.lo.i)

In order to fit a logistic regression GAM, we once again use the I() function in constructing the binary response variable, and set family=binomial

gam.lr=gam(I(wage>250)~year+s(age,df=5)+education, family=binomial,data=Wage)
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green")

p297

table(education, I(wage >250))

##                     
## education            FALSE TRUE
##   1. < HS Grad         268    0
##   2. HS Grad           966    5
##   3. Some College      643    7
##   4. College Grad      663   22
##   5. Advanced Degree   381   45

gam.lr.s=gam(I(wage>250) ~ year+s(age,df=5)+education, family=binomial, data=Wage, subset=(education!="1. < HS Grad"))
plot(gam.lr.s,se=T,col="green")

ISLR Chapter 7 Lab - Non-linear Modeling