# ISLR Home

library(MASS)
library(randomForest)
# Fix the RNG seed so the train/test split below is reproducible
set.seed(1)

# Forest sizes (number of trees) to evaluate
ntrees <- c(25, 50, 100, 200, 300, 400, 500, 1000)

# Number of predictors: every column of Boston except one (the response)
num.predictors <- ncol(Boston) - 1

# Row indices (not a logical mask) of the training set: a 70% random sample
train <- sample(seq_len(nrow(Boston)), floor(0.7 * nrow(Boston)))

# Observed medv values for the held-out (non-training) rows
boston.test <- Boston$medv[-train]

# Creating the mtry values to try (number of predictors sampled as candidates at each split)

# Candidate mtry values: all p predictors, p/2 and sqrt(p).
# The last two are not rounded here, so they may be fractional.
m.tries <- c(
  num.predictors,
  num.predictors / 2,
  sqrt(num.predictors)
)

# Zero-filled placeholders, one per mtry setting, each with one slot per
# forest size in `ntrees`
mse.p <- mse.half <- mse.sqp <- numeric(length(ntrees))

# Creating a data frame to store the MSE for each mtry setting

# Gather the three placeholder vectors as the columns of one data frame
df <- data.frame(mse.p = mse.p, mse.half = mse.half, mse.sqp = mse.sqp)

# Fit one random forest per (mtry, number-of-trees) combination on the
# training rows and record its MSE on the held-out rows.
# Column n of `df` receives the results for m.tries[n]; row i corresponds to
# ntrees[i].
for (n in seq_along(m.tries)) {

  # Held-out MSE for each forest size at the current mtry value
  mse <- numeric(length(ntrees))

  for (i in seq_along(ntrees)) {
    # Fitting a random forest with ntrees[i] trees.
    # The argument must be spelled `ntree`: the previous `ntrees=` did not
    # match any formal argument, was absorbed by `...`, and left every forest
    # at the default number of trees.
    rf.boston <- randomForest(medv ~ .,
                              data = Boston,
                              subset = train,
                              ntree = ntrees[i],
                              mtry = m.tries[n],
                              importance = TRUE)

    # Predicting on the held-out rows
    yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])

    # Mean squared error against the observed held-out response
    mse[i] <- mean((yhat.rf - boston.test)^2)
  }

  # Overwrite the n-th placeholder column with the MSEs for this mtry
  df[[n]] <- mse
}

# Attach the forest sizes so each row can be plotted against its tree count
df$trees <- ntrees

# Plotting the MSE

## Plotting the MSE against the number of trees, one curve per mtry setting

## First curve (mtry = p) sets up the axes
plot(df$trees, df$mse.p, type = "o", col = "blue", pch = "o", lty = 1,
     ylim = c(10, 50), xlab = "Number of trees", ylab = "Test MSE")

## Second curve (mtry = p/2): points and connecting line drawn together
lines(df$trees, df$mse.half, type = "o", col = "red", pch = "*", lty = 2)

## Third curve (mtry = sqrt(p))
lines(df$trees, df$mse.sqp, type = "o", col = "green", pch = "+", lty = 3)

## Legend colours must match the curves above (the third curve is green,
## not black)
legend(500, 50,
       legend = c("mtry=p", "mtry=p/2", "mtry=sqrt(p)"),
       col = c("blue", "red", "green"),
       pch = c("o", "*", "+"),
       lty = c(1, 2, 3),
       ncol = 1)