Question (ISLR, p. 369)

Generate a simulated two-class data set with 100 observations and two features in which there is a visible but non-linear separation between the two classes.


Simulate a data matrix with two feature columns and a class vector y. (The code below uses 50 observations, 25 per class, rather than the 100 the question asks for; the approach is identical.)

set.seed(1)
x = matrix(rnorm(50*2), ncol=2)       # 50 observations, 2 features
y = c(rep(-1,25), rep(1,25))          # the class vector
x[y==1,] = x[y==1,] + 1               # shift class 1 up and right to separate the classes
rnorm(2)                              # stray draw; it advances the RNG state consumed by sample() below
## [1] -0.62036668  0.04211587
library(e1071)
dat = data.frame(x=x, y=as.factor(y)) # columns x.1, x.2, y
train = sample(50, 25)                # random half of the rows for training
x
##              [,1]        [,2]
##  [1,] -0.62645381  0.39810588
##  [2,]  0.18364332 -0.61202639
##  [3,] -0.83562861  0.34111969
##  [4,]  1.59528080 -1.12936310
##  [5,]  0.32950777  1.43302370
##  [6,] -0.82046838  1.98039990
##  [7,]  0.48742905 -0.36722148
##  [8,]  0.73832471 -1.04413463
##  [9,]  0.57578135  0.56971963
## [10,] -0.30538839 -0.13505460
## [11,]  1.51178117  2.40161776
## [12,]  0.38984324 -0.03924000
## [13,] -0.62124058  0.68973936
## [14,] -2.21469989  0.02800216
## [15,]  1.12493092 -0.74327321
## [16,] -0.04493361  0.18879230
## [17,] -0.01619026 -1.80495863
## [18,]  0.94383621  1.46555486
## [19,]  0.82122120  0.15325334
## [20,]  0.59390132  2.17261167
## [21,]  0.91897737  0.47550953
## [22,]  0.78213630 -0.70994643
## [23,]  0.07456498  0.61072635
## [24,] -1.98935170 -0.93409763
## [25,]  0.61982575 -1.25363340
## [26,]  0.94387126  1.29144624
## [27,]  0.84420449  0.55670813
## [28,] -0.47075238  1.00110535
## [29,]  0.52184994  1.07434132
## [30,]  1.41794156  0.41047905
## [31,]  2.35867955  0.43133127
## [32,]  0.89721227  0.86482138
## [33,]  1.38767161  2.17808700
## [34,]  0.94619496 -0.52356680
## [35,] -0.37705956  1.59394619
## [36,]  0.58500544  1.33295037
## [37,]  0.60571005  2.06309984
## [38,]  0.94068660  0.69581608
## [39,]  2.10002537  1.37001881
## [40,]  1.76317575  1.26709879
## [41,]  0.83547640  0.45747997
## [42,]  0.74663832  2.20786781
## [43,]  1.69696338  2.16040262
## [44,]  1.55666320  1.70021365
## [45,]  0.31124431  2.58683345
## [46,]  0.29250484  1.55848643
## [47,]  1.36458196 -0.27659221
## [48,]  1.76853292  0.42673459
## [49,]  0.88765379 -0.22461261
## [50,]  1.88110773  0.52659936
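
The +1 mean shift above gives a visible but roughly linear separation. Since the question asks for a non-linear one, here is a sketch of a variant with a circular class boundary (the variable names xc/yc and the radius threshold 1.5 are illustrative choices, not part of the original):

set.seed(1)
xc = matrix(rnorm(100*2), ncol=2)            # 100 observations, as the question asks
yc = ifelse(xc[,1]^2 + xc[,2]^2 > 1.5, 1, -1) # class by squared distance from the origin
plot(xc, col = (3 - yc), pch = 19)            # inner disc vs. outer ring: non-linear split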

Scatter Plot

library(ggplot2)
df = data.frame(x1 = x[,1], x2 = x[,2], y = y)  # both feature columns, x[,1] and x[,2]
ggplot(df, aes(x = x1, y = x2, color = factor(y))) +
  geom_point(size = 2, shape = 23) +
  xlim(-3, 3) +   # limits wide enough that no points are clipped
  ylim(-3, 3)
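
For comparison, the same picture in base R (a one-liner in the style of the ISLR labs):

plot(x, col = (3 - y), pch = 19)  # class -1 in blue (col 4), class 1 in red (col 2)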

dat
##            x.1         x.2  y
## 1  -0.62645381  0.39810588 -1
## 2   0.18364332 -0.61202639 -1
## 3  -0.83562861  0.34111969 -1
## 4   1.59528080 -1.12936310 -1
## 5   0.32950777  1.43302370 -1
## 6  -0.82046838  1.98039990 -1
## 7   0.48742905 -0.36722148 -1
## 8   0.73832471 -1.04413463 -1
## 9   0.57578135  0.56971963 -1
## 10 -0.30538839 -0.13505460 -1
## 11  1.51178117  2.40161776 -1
## 12  0.38984324 -0.03924000 -1
## 13 -0.62124058  0.68973936 -1
## 14 -2.21469989  0.02800216 -1
## 15  1.12493092 -0.74327321 -1
## 16 -0.04493361  0.18879230 -1
## 17 -0.01619026 -1.80495863 -1
## 18  0.94383621  1.46555486 -1
## 19  0.82122120  0.15325334 -1
## 20  0.59390132  2.17261167 -1
## 21  0.91897737  0.47550953 -1
## 22  0.78213630 -0.70994643 -1
## 23  0.07456498  0.61072635 -1
## 24 -1.98935170 -0.93409763 -1
## 25  0.61982575 -1.25363340 -1
## 26  0.94387126  1.29144624  1
## 27  0.84420449  0.55670813  1
## 28 -0.47075238  1.00110535  1
## 29  0.52184994  1.07434132  1
## 30  1.41794156  0.41047905  1
## 31  2.35867955  0.43133127  1
## 32  0.89721227  0.86482138  1
## 33  1.38767161  2.17808700  1
## 34  0.94619496 -0.52356680  1
## 35 -0.37705956  1.59394619  1
## 36  0.58500544  1.33295037  1
## 37  0.60571005  2.06309984  1
## 38  0.94068660  0.69581608  1
## 39  2.10002537  1.37001881  1
## 40  1.76317575  1.26709879  1
## 41  0.83547640  0.45747997  1
## 42  0.74663832  2.20786781  1
## 43  1.69696338  2.16040262  1
## 44  1.55666320  1.70021365  1
## 45  0.31124431  2.58683345  1
## 46  0.29250484  1.55848643  1
## 47  1.36458196 -0.27659221  1
## 48  1.76853292  0.42673459  1
## 49  0.88765379 -0.22461261  1
## 50  1.88110773  0.52659936  1

Linear Kernel, cost=1

svmfit.linear=svm(y~., data=dat[train,], kernel="linear", cost=1, scale = FALSE)
plot(svmfit.linear, dat[train,])

summary(svmfit.linear)
## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "linear", cost = 1, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
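
The summary reports 16 support vectors (8 in each class); e1071 exposes their positions through the fitted object's index field:

svmfit.linear$index  # row positions of the support vectors within dat[train,]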

Confusion Matrix

svm.predict = predict(svmfit.linear, newdata = dat[train,])
table(true=dat[train,"y"], pred=svm.predict)
##     pred
## true -1  1
##   -1 12  2
##   1   3  8
(12+8)/25
## [1] 0.8
length(svm.predict)
## [1] 25
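
Rather than summing table cells by hand, the training accuracy can be computed directly from the same objects:

mean(svm.predict == dat[train, "y"])  # 20/25 = 0.8, agreeing with the table above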

Radial Kernel, gamma=1, cost=1

svmfit.radial = svm(y~., data=dat[train,], kernel="radial", gamma=1, cost=1)
plot(svmfit.radial, dat[train,])

summary(svmfit.radial)
## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", gamma = 1, 
##     cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  20
## 
##  ( 10 10 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
svm.predict.radial = predict(svmfit.radial, newdata = dat[train,])
table(true=dat[train,"y"], pred=svm.predict.radial)
##     pred
## true -1  1
##   -1 11  3
##   1   1 10
(11+10)/25
## [1] 0.84
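
To see what the radial fit is doing, one can sketch its decision regions over a lattice of points (make.grid and the 75-point resolution are illustrative helpers, not part of the original):

make.grid = function(x, n = 75) {
  # n x n lattice spanning the feature ranges; column names must match the
  # training columns (x.1, x.2) for predict() to accept the grid
  r1 = range(x[,1]); r2 = range(x[,2])
  expand.grid(x.1 = seq(r1[1], r1[2], length = n),
              x.2 = seq(r2[1], r2[2], length = n))
}
xgrid = make.grid(x)
ygrid = predict(svmfit.radial, xgrid)
plot(xgrid, col = c("red","blue")[as.numeric(ygrid)], pch = 20, cex = 0.2)
points(x[train,], col = c("red","blue")[as.numeric(dat$y[train])], pch = 19)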

Predictions on the Test Data

Linear Kernel

svm.predict.test = predict(svmfit.linear, newdata = dat[-train,])
table(true=dat[-train,"y"], pred=svm.predict.test)
##     pred
## true -1  1
##   -1  8  3
##   1   3 11
(8+11)/25
## [1] 0.76
plot(svmfit.linear, dat[-train,])

Radial Kernel

svm.predict.radial.test = predict(svmfit.radial, newdata = dat[-train,])
table(true=dat[-train,"y"], pred=svm.predict.radial.test)
##     pred
## true -1  1
##   -1  8  3
##   1   4 10
(8+10)/25
## [1] 0.72
plot(svmfit.radial, dat[-train,])

The radial kernel fit the training data better (0.84 vs. 0.80 accuracy) but did worse on the test data (0.72 vs. 0.76): it overfit the training set. On this split, the linear kernel generalized better.
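
The cost and gamma values above were fixed arbitrarily. A natural next step (a sketch, with an illustrative parameter grid) is to cross-validate them with e1071's tune():

set.seed(1)
tune.out = tune(svm, y~., data = dat[train,], kernel = "radial",
                ranges = list(cost = c(0.1, 1, 10, 100),
                              gamma = c(0.5, 1, 2, 4)))
summary(tune.out)                # 10-fold CV error for each (cost, gamma) pair
bestmod = tune.out$best.model    # refit at the best parameters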