# ISLR Home

# p165

# Caravan insurance data: load, explore, standardize and split.
# Objects created here and used by the sections that follow:
#   standardized.X   scaled predictor matrix (every column except Purchase)
#   test             row indices held out as the test set
#   train.X, test.X  predictor rows for training / testing
#   train.Y, test.Y  Purchase labels for training / testing

library(ISLR)
dim(Caravan)
## [1] 5822   86
# View(Caravan)
# NOTE(review): attach() is discouraged; it is kept only so that any code
# outside this section that refers to bare column names keeps working. This
# section always addresses columns through `Caravan` explicitly.
attach(Caravan)

summary(Caravan$Purchase)
##   No  Yes
## 5474  348
# Share of customers who purchased Caravan insurance (348/5822, about 6%).
mean(Caravan$Purchase == "Yes")
## [1] 0.05977327

# Salary and age are on different scales
summary(Caravan[, 1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    1.00   10.00   30.00   24.25   35.00   41.00

## Scale Data
# Purchase is the (factor) response, so it is excluded from scaling by name
# rather than by its column position (86).
predictor.names <- setdiff(names(Caravan), "Purchase")
standardized.X <- scale(Caravan[, predictor.names]) # Produces matrix for all columns
# Now every column of standardized.X has a standard deviation of one and a
# mean of zero.
summary(standardized.X[, 1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## -1.8101 -1.1095  0.4473  0.0000  0.8365  1.3036
var(Caravan[, 1])
## [1] 165.0378
var(Caravan[, 2])
## [1] 0.1647078
var(standardized.X[, 1])
## [1] 1
var(standardized.X[, 2])
## [1] 1
mean(standardized.X[, 1])
## [1] -7.025576e-17

## Split Data into Train/Test
test <- 1:1000
train.X <- standardized.X[-test, ] # Exclude first 1000 rows
test.X <- standardized.X[test, ]
train.Y <- Caravan$Purchase[-test]
test.Y <- Caravan$Purchase[test]

# KNN Model

library(class) # provides knn()
set.seed(1) # knn() breaks ties at random, so fix the seed for repeatability
knn.pred <- knn(train.X, test.X, train.Y, k = 1)

## Evaluate

mean(test.Y != knn.pred) # overall test error rate: 0.118 (11.8%)
## [1] 0.118
mean(test.Y != "No") # share of test customers who actually purchased: 0.059 (6%)
## [1] 0.059
knn.table <- table(knn.pred, test.Y) # Confusion Matrix
knn.table
##         test.Y
## knn.pred  No Yes
##      No  873  50
##      Yes  68   9
# Success rate among customers predicted "Yes" = 11.7%. Computed from the
# table itself so it cannot drift from the counts printed above.
knn.table["Yes", "Yes"] / sum(knn.table["Yes", ])
## [1] 0.1168831

## K=3

# Same train/test split as before, now voting over the 3 nearest neighbours.
knn.pred <- knn(train.X, test.X, train.Y, k = 3)
knn.table <- table(knn.pred, test.Y) # Confusion Matrix
knn.table
##         test.Y
## knn.pred  No Yes
##      No  920  54
##      Yes  21   5
# Success rate among customers predicted "Yes" = 19.2%. Computed from the
# table itself so it cannot drift from the counts printed above.
knn.table["Yes", "Yes"] / sum(knn.table["Yes", ])
## [1] 0.1923077

## K=5

# Same train/test split as before, now voting over the 5 nearest neighbours.
knn.pred <- knn(train.X, test.X, train.Y, k = 5)
knn.table <- table(knn.pred, test.Y) # Confusion Matrix
knn.table
##         test.Y
## knn.pred  No Yes
##      No  930  55
##      Yes  11   4
# Success rate among customers predicted "Yes" = 26.7%. Computed from the
# table itself so it cannot drift from the counts printed above.
knn.table["Yes", "Yes"] / sum(knn.table["Yes", ])
## [1] 0.2666667

# Logistic Regression Model

### 1. Fit

# Regress Purchase on every other column of Caravan, using only the rows
# that are not in the held-out `test` index set.
glm.fits <- glm(
  Purchase ~ .,
  data = Caravan,
  family = binomial,
  subset = -test
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 2. Predict

# type = "response" asks for predictions on the probability scale; one value
# is produced per held-out row.
glm.probs <- predict(glm.fits, newdata = Caravan[test, ], type = "response")

## 3. Evaluate

# Label a test observation "Yes" when its predicted probability exceeds 0.5.
glm.pred <- rep("No", length(glm.probs)) # one label per test observation
glm.pred[glm.probs > .5] <- "Yes"
table(glm.pred, test.Y)
##         test.Y
## glm.pred  No Yes
##      No  934  59
##      Yes   7   0
# Only 7 test observations are predicted "Yes" at this cut-off, and none of
# them actually purchased.

# Lower the cut-off: label a test observation "Yes" when its predicted
# probability exceeds 0.25.
glm.pred <- rep("No", length(glm.probs)) # one label per test observation
glm.pred[glm.probs > .25] <- "Yes"
# Fixing the factor levels keeps both rows in the table even if no
# observation is predicted "Yes"; the printed layout is unchanged.
glm.table <- table(
  glm.pred = factor(glm.pred, levels = c("No", "Yes")),
  test.Y
)
glm.table
##         test.Y
## glm.pred  No Yes
##      No  919  48
##      Yes  22  11
# Success rate among customers predicted "Yes" = 33.3%. Computed from the
# table itself so it cannot drift from the counts printed above.
glm.table["Yes", "Yes"] / sum(glm.table["Yes", ])
## [1] 0.3333333