ISLR Home

p155

library(ISLR)

Stock Market Data

names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
dim(Smarket)
## [1] 1250    9
summary(Smarket)
##       Year           Lag1                Lag2                Lag3          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000  
##  Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000  
##       Lag4                Lag5              Volume           Today          
##  Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000  
##  1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500  
##  Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500  
##  Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138  
##  3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750  
##  Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000  
##  Direction 
##  Down:602  
##  Up  :648  
##            
##            
##            
## 
cor(Smarket[,-9]) # Skip col 9 because it's qualitative
##              Year         Lag1         Lag2         Lag3         Lag4
## Year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718
## Lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533
## Lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036
## Lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000
## Lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
##                Lag5      Volume        Today
## Year    0.029787995  0.53900647  0.030095229
## Lag1   -0.005674606  0.04090991 -0.026155045
## Lag2   -0.003557949 -0.04338321 -0.010250033
## Lag3   -0.018808338 -0.04182369 -0.002447647
## Lag4   -0.027083641 -0.04841425 -0.006899527
## Lag5    1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315  1.00000000  0.014591823
## Today  -0.034860083  0.01459182  1.000000000
attach(Smarket)
#detach(Smarket)
plot(Smarket$Volume)

Logistic Regression

p156

glm.fit1 = glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
               data = Smarket,
               family = binomial)  # binomial for logistic regression
summary(glm.fit1)              
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.446  -1.203   1.065   1.145   1.326  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
coef(glm.fit1)
##  (Intercept)         Lag1         Lag2         Lag3         Lag4         Lag5 
## -0.126000257 -0.073073746 -0.042301344  0.011085108  0.009358938  0.010313068 
##       Volume 
##  0.135440659
summary(glm.fit1)$coef
##                 Estimate Std. Error    z value  Pr(>|z|)
## (Intercept) -0.126000257 0.24073574 -0.5233966 0.6006983
## Lag1        -0.073073746 0.05016739 -1.4565986 0.1452272
## Lag2        -0.042301344 0.05008605 -0.8445733 0.3983491
## Lag3         0.011085108 0.04993854  0.2219750 0.8243333
## Lag4         0.009358938 0.04997413  0.1872757 0.8514445
## Lag5         0.010313068 0.04951146  0.2082966 0.8349974
## Volume       0.135440659 0.15835970  0.8552723 0.3924004
summary(glm.fit1)$coef[,4] # Column 4
## (Intercept)        Lag1        Lag2        Lag3        Lag4        Lag5 
##   0.6006983   0.1452272   0.3983491   0.8243333   0.8514445   0.8349974 
##      Volume 
##   0.3924004

p157

P(Y=1|X)

If no data set is supplied to the predict() function, then the probabilities are computed for the training data that was used to fit the logistic regression mode

glm.probs = predict(glm.fit1, type = "response")
glm.probs[1:6]
##         1         2         3         4         5         6 
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565
#View(glm.probs)

contrasts(Smarket$Direction)
##      Up
## Down  0
## Up    1

Confusion Matrix

#glm.probs
glm.pred = rep("Down", 1250) 
glm.pred[glm.probs > .5] = "Up"
summary(glm.pred)
##    Length     Class      Mode 
##      1250 character character
# Produce a confusion matrix
# Diagonals indicate correct predictions. Off-diagonals indicate incorrect predictions.
table(glm.pred, Smarket$Direction)
##         
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507

Determine how many correctly and incorrectly classified

(507 + 145) / 1250 # Correct 52.16%
## [1] 0.5216
mean(glm.pred == Smarket$Direction)
## [1] 0.5216
head(glm.pred, 5)
## [1] "Up"   "Down" "Down" "Up"   "Up"
head(Smarket$Direction, 5)
## [1] Up   Up   Down Up   Up  
## Levels: Down Up

Now Predict

p159

train = (Smarket$Year<2005)

Smarket.2005 = Smarket[!train,]
dim(Smarket.2005)
## [1] 252   9
Direction.2005 = Smarket$Direction[!train] # Vector

Smarket.2005[1:6,]
##      Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 999  2005 -0.134  0.008 -0.007  0.715 -0.431 0.7869 -0.812      Down
## 1000 2005 -0.812 -0.134  0.008 -0.007  0.715 1.5108 -1.167      Down
## 1001 2005 -1.167 -0.812 -0.134  0.008 -0.007 1.7210 -0.363      Down
## 1002 2005 -0.363 -1.167 -0.812 -0.134  0.008 1.7389  0.351        Up
## 1003 2005  0.351 -0.363 -1.167 -0.812 -0.134 1.5691 -0.143      Down
## 1004 2005 -0.143  0.351 -0.363 -1.167 -0.812 1.4779  0.342        Up
head(Smarket.2005, 4)
##      Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 999  2005 -0.134  0.008 -0.007  0.715 -0.431 0.7869 -0.812      Down
## 1000 2005 -0.812 -0.134  0.008 -0.007  0.715 1.5108 -1.167      Down
## 1001 2005 -1.167 -0.812 -0.134  0.008 -0.007 1.7210 -0.363      Down
## 1002 2005 -0.363 -1.167 -0.812 -0.134  0.008 1.7389  0.351        Up

Logistic Regression Model

glm.fit2 = glm(Direction ~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data = Smarket,
family = binomial, subset = train)
glm.probs2 = predict(glm.fit2, Smarket.2005, type = "response")

Now check prediction

glm.pred = rep("Down", 252) # nrow(Smarket.2005)
glm.pred[glm.probs2 > .5] = "Up"
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44
mean(glm.pred == Direction.2005)
## [1] 0.4801587
mean(glm.pred != Direction.2005) # Compute and test error rate
## [1] 0.5198413

Remove Predictors

remove +Lag3+Lag4+Lag5+Volume

glm.fit1 = glm(Direction ~Lag1+Lag2, data = Smarket, family = binomial, subset = train) # Train Data
glm.probs2 = predict(glm.fit1, Smarket.2005, type = "response")
head(Smarket.2005, 5)
##      Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 999  2005 -0.134  0.008 -0.007  0.715 -0.431 0.7869 -0.812      Down
## 1000 2005 -0.812 -0.134  0.008 -0.007  0.715 1.5108 -1.167      Down
## 1001 2005 -1.167 -0.812 -0.134  0.008 -0.007 1.7210 -0.363      Down
## 1002 2005 -0.363 -1.167 -0.812 -0.134  0.008 1.7389  0.351        Up
## 1003 2005  0.351 -0.363 -1.167 -0.812 -0.134 1.5691 -0.143      Down

Again check

glm.pred = rep("Down", 252)
glm.pred[glm.probs2 > .5] = "Up"
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down  Up
##     Down   35  35
##     Up     76 106
mean(glm.pred == Direction.2005)
## [1] 0.5595238
106 / (106 + 76)
## [1] 0.5824176
predict(glm.fit1, newdata = data.frame(Lag1=c(1.2,1.5), Lag2=c(1.1,-0.8)), type = "response")
##         1         2 
## 0.4791462 0.4960939

Linear Discriminant Analysis

p161

library(MASS)

Same format as glm: glm(Direction ~Lag1+Lag2, data = Smarket, family = binomial, subset = train) - family

lda.fit = lda(Direction ~Lag1+Lag2, data = Smarket, subset = train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
lda.pred = predict(lda.fit, Smarket.2005)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class = lda.pred$class
table(lda.class, Direction.2005)
##          Direction.2005
## lda.class Down  Up
##      Down   35  35
##      Up     76 106
mean(lda.class == Direction.2005)
## [1] 0.5595238
sum(lda.pred$posterior[,1] >=.5)
## [1] 70
sum(lda.pred$posterior[,1] <.5)
## [1] 182
sum(lda.pred$posterior[,1] >.9) # Want only over 90% posterior probability
## [1] 0
# 0 !!!

Quadratic Discriminant Analysis

p163

Same format as lda()

qda.fit = qda(Direction ~Lag1+Lag2, data = Smarket, subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
qda.pred = predict(qda.fit, Smarket.2005)
names(qda.pred)
## [1] "class"     "posterior"
qda.class = qda.pred$class
table(qda.class,Direction.2005)
##          Direction.2005
## qda.class Down  Up
##      Down   30  20
##      Up     81 121
mean(qda.class == Direction.2005) # Accurate 60% of the time
## [1] 0.5992063

K-Nearest Neighbors

p163

data(package=“ISLR”)

knn() requires 4 inputs

library(class) # knn()
train.X = cbind(Smarket$Lag1, Smarket$Lag2)[train,]

#View(train.X)
test.X = cbind(Smarket$Lag1, Smarket$Lag2)[!train,] # ! wrong??
train.Direction = Smarket$Direction[train]
#train.Direction
#Direction[train]
dim(train.X)
## [1] 998   2
summary(Smarket$Direction)
## Down   Up 
##  602  648

Make predictions for dates in 2005

k=1

set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k=1)
table(knn.pred,Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   43 58
##     Up     68 83
(83+43)/252
## [1] 0.5

k=3

knn.pred = knn(train.X, test.X, train.Direction, k=3)
table(knn.pred, Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   48 54
##     Up     63 87
mean(knn.pred == Direction.2005)
## [1] 0.5357143

detach(Smarket) # Cleanup so we don’t have bad references to wrong table

QDA provides best results so far with an accuracy of 60%