- We use the R package `e1071` for fitting an SVM to the data.
- There are other implementations, as the SVM is very popular. You should also try to implement an SVM on your own; a minimal sketch follows this list.
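As a starting point for an own implementation, here is a minimal sketch of a linear SVM trained by stochastic subgradient descent on the regularized hinge loss (Pegasos-style). The function name, `lambda`, and the iteration count are illustrative choices and not part of these notes; labels are assumed to be coded as -1/+1.

```r
linear_svm_sgd <- function(X, y, lambda = 0.01, n_iter = 10000) {
  n <- nrow(X); p <- ncol(X)
  w <- rep(0, p); b <- 0
  for (t in 1:n_iter) {
    i <- sample(n, 1)            # pick one training example at random
    eta <- 1 / (lambda * t)      # decaying step size
    if (y[i] * (sum(w * X[i, ]) + b) < 1) {
      # hinge loss is active: step towards classifying x_i correctly
      w <- (1 - eta * lambda) * w + eta * y[i] * X[i, ]
      b <- b + eta * y[i]
    } else {
      # only the L2 regularizer contributes to the subgradient
      w <- (1 - eta * lambda) * w
    }
  }
  list(w = w, b = b)
}
# usage sketch: fit  <- linear_svm_sgd(x, ifelse(y == 1, 1, -1))
#               pred <- sign(x %*% fit$w + fit$b)
```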
- The `svm` function requires a few parameters apart from the model and data:
  - `kernel`: can be any of `linear`, `polynomial`, or `radial`.
  - `cost`: different from the budget; it is the cost of a violation of the margin. When the `cost` argument is small, the margins will be wide and many support vectors will be on the margin or will violate the margin. When the `cost` argument is large, the margins will be narrow and there will be few support vectors on the margin or violating the margin.

```r
library(e1071)
set.seed(1)
x=matrix(rnorm(200*2), ncol=2)
x[1:100,]=x[1:100,]+2
x[101:150,]=x[101:150,]-2
y=c(rep(1,150),rep(2,50))
dat=data.frame(x=x, y=as.factor(y))
```
```r
plot(x, col=y)
```
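To see the effect of `cost` described above, a quick comparison (not from the original notes; `tot.nSV` is the total support-vector count stored on a fitted `svm` object, and the two cost values are arbitrary choices for contrast):

```r
# Same radial SVM fit with a small and a large cost; a small cost gives a
# wide margin and therefore many more support vectors.
svm_small <- svm(y~., data=dat, kernel="radial", gamma=1, cost=0.01)
svm_large <- svm(y~., data=dat, kernel="radial", gamma=1, cost=100)
c(small_cost=svm_small$tot.nSV, large_cost=svm_large$tot.nSV)
```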
- The `gamma` argument takes a value for the \(\gamma\) parameter of the radial kernel.

```r
train=sample(200,100)
svmfit=svm(y~., data=dat[train,], kernel="radial", gamma=1, cost=1)
```
- For a polynomial kernel, the degree is set with the `degree` parameter (a sketch follows the summary output below).

```r
plot(svmfit, dat[train,])
summary(svmfit)
```
```
## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", 
##     gamma = 1, cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  1 
## 
## Number of Support Vectors:  37
## 
##  ( 17 20 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  1 2
```
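As referenced above, a hedged sketch of fitting a polynomial kernel instead; `degree=2` is an illustrative choice, not a tuned value:

```r
# Polynomial kernel: flexibility is controlled by `degree` rather than gamma.
svmfit.poly <- svm(y~., data=dat[train,], kernel="polynomial", degree=2, cost=1)
plot(svmfit.poly, dat[train,])
```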
- Increasing `cost` reduces the number of training errors, at the price of a more irregular decision boundary.

```r
svmfit=svm(y~., data=dat[train,], kernel="radial", gamma=1, cost=1e5)
plot(svmfit, dat[train,])
```
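The \(\gamma\) parameter has a similar effect on the flexibility of the radial kernel; a quick illustration (`gamma=50` is an arbitrary large value chosen for contrast):

```r
# A large gamma makes the decision boundary far more irregular than gamma=1.
svmfit.g50 <- svm(y~., data=dat[train,], kernel="radial", gamma=50, cost=1)
plot(svmfit.g50, dat[train,])
```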
- We can use the built-in function `tune()` to perform cross-validation. By default, `tune()` performs ten-fold cross-validation on a set of models of interest.

```r
set.seed(1)
tune.out=tune(svm, y~., data=dat[train,], kernel="radial",
              ranges=list(cost=c(0.1,1,10,100,1000), gamma=c(0.5,1,2,3,4)))
summary(tune.out)
```
```
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1     2
## 
## - best performance: 0.12 
## 
## - Detailed performance results:
##     cost gamma error dispersion
## 1  1e-01   0.5  0.27 0.11595018
## 2  1e+00   0.5  0.13 0.08232726
## 3  1e+01   0.5  0.15 0.07071068
## 4  1e+02   0.5  0.17 0.08232726
## 5  1e+03   0.5  0.21 0.09944289
## 6  1e-01   1.0  0.25 0.13540064
## 7  1e+00   1.0  0.13 0.08232726
## 8  1e+01   1.0  0.16 0.06992059
## 9  1e+02   1.0  0.20 0.09428090
## 10 1e+03   1.0  0.20 0.08164966
## 11 1e-01   2.0  0.25 0.12692955
## 12 1e+00   2.0  0.12 0.09189366
## 13 1e+01   2.0  0.17 0.09486833
## 14 1e+02   2.0  0.19 0.09944289
## 15 1e+03   2.0  0.20 0.09428090
## 16 1e-01   3.0  0.27 0.11595018
## 17 1e+00   3.0  0.13 0.09486833
## 18 1e+01   3.0  0.18 0.10327956
## 19 1e+02   3.0  0.21 0.08755950
## 20 1e+03   3.0  0.22 0.10327956
## 21 1e-01   4.0  0.27 0.11595018
## 22 1e+00   4.0  0.15 0.10801234
## 23 1e+01   4.0  0.18 0.11352924
## 24 1e+02   4.0  0.21 0.08755950
## 25 1e+03   4.0  0.24 0.10749677
```
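The tuning object stores the winning configuration directly. `best.parameters` and `best.model` are standard components of an e1071 `tune` object; `bestmod` is just an illustrative name:

```r
tune.out$best.parameters    # cost/gamma pair with the lowest CV error
bestmod <- tune.out$best.model
summary(bestmod)
```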
- We can view the test-set predictions for the best model obtained through cross-validation:

```r
table(true=dat[-train,"y"],
      pred=predict(tune.out$best.model, newdata=dat[-train,]))
```
```
##     pred
## true  1  2
##    1 74  3
##    2  7 16
```
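From the table above, 10 of the 100 test observations are misclassified. Equivalently, as a small follow-up not in the original notes:

```r
# Overall test error rate of the CV-selected model: (3+7)/100 = 0.10
mean(predict(tune.out$best.model, newdata=dat[-train,]) != dat[-train,"y"])
```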
- If the response is a factor with more than two levels, the `svm()` function will perform multi-class classification using the one-versus-one approach.

```r
set.seed(1)
x=rbind(x, matrix(rnorm(50*2), ncol=2))
y=c(y, rep(0,50))
x[y==0,2]=x[y==0,2]+2
dat=data.frame(x=x, y=as.factor(y))
```
```r
par(mfrow=c(1,1))
plot(x, col=(y+1))
```
```r
svmfit=svm(y~., data=dat, kernel="radial", cost=10, gamma=1)
plot(svmfit, dat)
```
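Under one-versus-one, `svm()` fits \(\binom{K}{2}\) pairwise classifiers and classifies each point by majority vote, so with our three classes it trains three classifiers behind the scenes. An illustrative check of the fitted classes (not part of the original notes):

```r
# Training confusion matrix for the three-class fit; choose(3,2) = 3
# pairwise classifiers are combined internally by voting.
table(predict(svmfit, dat), dat$y)
```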
- We now examine the `Khan` data set, which consists of a number of tissue samples corresponding to four distinct types of small round blue cell tumors.
```r
library(ISLR)
names(Khan)
```

```
## [1] "xtrain" "xtest"  "ytrain" "ytest"
```

```r
dim(Khan$xtrain)
```

```
## [1]   63 2308
```

```r
dim(Khan$xtest)
```

```
## [1]   20 2308
```

```r
table(Khan$ytrain)
```

```
## 
##  1  2  3  4 
##  8 23 12 20
```

```r
table(Khan$ytest)
```

```
## 
## 1 2 3 4 
## 3 6 6 5
```
- We fit a support vector classifier with a linear kernel: with far more features (2308) than observations (63), the extra flexibility of a polynomial or radial kernel is unnecessary.

```r
dat=data.frame(x=Khan$xtrain, y=as.factor(Khan$ytrain))
out=svm(y~., data=dat, kernel="linear", cost=10)
# summary(out)
table(out$fitted, dat$y)
```
```
##    
##      1  2  3  4
##   1  8  0  0  0
##   2  0 23  0  0
##   3  0  0 12  0
##   4  0  0  0 20
```
```r
dat.te=data.frame(x=Khan$xtest, y=as.factor(Khan$ytest))
pred.te=predict(out, newdata=dat.te)
table(pred.te, dat.te$y)
```
```
##        
## pred.te 1 2 3 4
##       1 3 0 0 0
##       2 0 6 2 0
##       3 0 0 4 0
##       4 0 0 0 5
```
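The table above shows two errors on the 20 test observations; the corresponding error rate, as a small follow-up not in the original notes:

```r
# Test error rate of the linear-kernel fit: 2/20 = 0.10
mean(pred.te != dat.te$y)
```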