Professional Documents
Culture Documents
Bagging
mtry=13 indicates that all 13 predictors should be considered for each split of the tree — in
other words, that bagging should be done.
library(MASS)          # provides the Boston housing data
library(randomForest)  # bagging / random forests
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# NOTE(review): attach() is discouraged (masking surprises); all model calls
# below pass data = Boston explicitly, so it is kept only for compatibility
# with any unseen code that relies on the attached columns.
attach(Boston)
set.seed(1)
# Split the data: half of the row indices form the training set,
# the held-out rows supply the test responses (medv).
train <- sample(seq_len(nrow(Boston)), nrow(Boston) / 2)
boston.test <- Boston[-train, "medv"]
# mtry = 13 considers all 13 predictors at every split, i.e. bagging.
bag.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                           mtry = 13, importance = TRUE)
bag.boston
##
## Call:
## randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE,
##
Type of random forest: regression
##
Number of trees: 500
## No. of variables tried at each split: 13
##
##
Mean of squared residuals: 11.08966
##
% Var explained: 86.57
# Out-of-bag error (MSE) versus the number of trees grown.
plot(bag.boston)
subset = train)
10
15
Error
20
25
bag.boston
100
200
300
trees
# Predict on the held-out half and plot predictions against observed medv;
# abline(0, 1) marks the line of perfect prediction.
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
plot(yhat.bag, boston.test)
abline(0, 1)
400
500
50
40
30
20
10
boston.test
10
20
30
40
50
yhat.bag
# Test-set MSE of the bagged model.
mean((yhat.bag-boston.test)^2)
## [1] 13.33831
# Refit with only 25 trees. Spell out TRUE: T is an ordinary (reassignable)
# variable, not a reserved word.
bag.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                           mtry = 13, importance = TRUE, ntree = 25)
bag.boston
##
## Call:
## randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = T,
##
Type of random forest: regression
##
Number of trees: 25
## No. of variables tried at each split: 13
##
##
Mean of squared residuals: 12.92873
##
% Var explained: 84.35
# Test-set MSE with the smaller (ntree = 25) ensemble.
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
## [1] 14.41793
16
12
14
MSE
18
20
set.seed(2)
# Search mtry = 1..13 (13 = all predictors, i.e. bagging) for the lowest
# test-set MSE; MSE is preallocated so the loop does not grow a vector.
MSE <- rep(0, 13)
for (i in 1:13) { # find best mtry
  bag.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                             mtry = i, importance = TRUE)
  yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
  MSE[i] <- mean((yhat.bag - boston.test)^2)
}
plot(1:13, MSE, type = "b")
10
12
1:13
%IncMSE IncNodePurity
12.421853
1126.72987
3.435084
45.48735
9.875513
1057.45440
4
##
##
##
##
##
##
##
##
##
##
chas
nox
rm
age
dis
rad
tax
ptratio
black
lstat
1.760632
13.420937
31.822013
10.611374
13.838517
3.148078
8.404814
11.932105
7.215369
29.282301
62.54896
1029.92004
6463.62153
532.12941
1268.48617
88.94067
483.25198
863.51232
386.88513
7140.69398
# Variable-importance plot (%IncMSE and IncNodePurity panels).
# NOTE(review): rf.boston is not defined in this excerpt — presumably a
# random-forest fit from an earlier section; confirm against the full document.
varImpPlot(rf.boston )
rf.boston
rm
lstat
dis
nox
crim
ptratio
age
indus
tax
black
zn
rad
chas
lstat
rm
dis
crim
indus
nox
ptratio
age
tax
black
rad
chas
zn
5
10
15 20 25
%IncMSE
30
Boosting
For a classification problem, use distribution="bernoulli" instead of "gaussian".
We can also produce partial dependence plots for these two variables. These plots illustrate
the marginal effect of the selected variables on the response after integrating out the other
variables. In this case, as we might expect, median house prices are increasing with rm and
decreasing with lstat.
library(gbm)
##
##
##
##
##
Loading required
Loading required
Loading required
Loading required
Loaded gbm 2.1.1
package:
package:
package:
package:
survival
lattice
splines
parallel
zn
indus
black
crim
lstat
set.seed(1)
# Boosted regression trees: 5000 trees of depth 4 with squared-error
# ("gaussian") loss.
# NOTE(review): the gbm() line was truncated by PDF extraction (unbalanced
# parentheses); the closing ")" is reconstructed from ISLR (James et al.,
# 2013), which uses interaction.depth = 4 with the default shrinkage.
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian", n.trees = 5000,
                    interaction.depth = 4)
summary(boost.boston)
10
20
Relative influence
##
##
##
##
##
##
##
##
##
##
##
##
var
rel.inf
lstat
lstat 45.9627334
rm
rm 31.2238187
dis
dis 6.8087398
crim
crim 4.0743784
nox
nox 2.5605001
ptratio ptratio 2.2748652
black
black 1.7971159
age
age 1.6488532
tax
tax 1.3595005
indus
indus 1.2705924
chas
chas 0.8014323
6
30
40
## rad
## zn
rad
zn
0.2026619
0.0148083
28
26
22
24
f(rm)
30
32
6
rm
# Partial-dependence plot: marginal effect of lstat on medv after
# integrating out the other predictors.
plot(boost.boston ,i="lstat")
30
25
20
f(lstat)
10
15
20
25
30
35
lstat
## [1] 11.84434
# Tune the shrinkage (learning-rate) parameter over seven candidate values.
# NOTE(review): the gbm() line was truncated mid-argument by extraction;
# "interaction.depth = 4, shrinkage = lambda[i])" is reconstructed — the
# loop is clearly meant to vary shrinkage, since lambda is otherwise unused.
MSE <- rep(0, 7)
lambda <- c(0.00001, 0.0001, 0.001, 0.01, 0.1, 0.15, 0.2)
for (i in 1:7) {
  boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                      distribution = "gaussian", n.trees = 5000,
                      interaction.depth = 4, shrinkage = lambda[i])
  yhat.boost <- predict(boost.boston, newdata = Boston[-train, ],
                        n.trees = 5000)
  MSE[i] <- mean((yhat.boost - boston.test)^2)
}
plot(1:7, MSE, type = "b")
10 20 30 40 50 60 70 80
MSE
1:7
# Tune the number of boosting iterations (trees) over eight ensemble sizes;
# predict() must be told the same n.trees used for the fit.
# NOTE(review): the gbm() call was truncated mid-word ("interaction.dept")
# by extraction; "interaction.depth = 4)" is reconstructed to match the
# earlier boosting fits.
MSE <- rep(0, 8)
size <- c(500, 1000, 2000, 4000, 5000, 7000, 8000, 10000)
for (i in 1:8) {
  boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                      distribution = "gaussian", n.trees = size[i],
                      interaction.depth = 4)
  yhat.boost <- predict(boost.boston, newdata = Boston[-train, ],
                        n.trees = size[i])
  MSE[i] <- mean((yhat.boost - boston.test)^2)
}
plot(1:8, MSE, type = "b")
12.0
11.5
11.0
10.5
MSE
1:8
## [1] 10.39679
Reference:
James, Gareth, et al. An Introduction to Statistical Learning. New
York: Springer, 2013.
10