1 Dataset

Packages used in this section: sp for the sample dataset.

library(sp)

Sample point dataset, comes with sp; make a log10-transformed copy of the metal concentrations.

Also load the prediction grid: locations where we do not know the value of the target variable, at which we want to make prediction of that variable, from the model built (trained, calibrated) from the known points, where we know both the target variable and the predictors.

# Load the Meuse bank topsoil point dataset (155 sampled locations) shipped with sp
data(meuse)
str(meuse)
## 'data.frame':    155 obs. of  14 variables:
##  $ x      : num  181072 181025 181165 181298 181307 ...
##  $ y      : num  333611 333558 333537 333484 333330 ...
##  $ cadmium: num  11.7 8.6 6.5 2.6 2.8 3 3.2 2.8 2.4 1.6 ...
##  $ copper : num  85 81 68 81 48 61 31 29 37 24 ...
##  $ lead   : num  299 277 199 116 117 137 132 150 133 80 ...
##  $ zinc   : num  1022 1141 640 257 269 ...
##  $ elev   : num  7.91 6.98 7.8 7.66 7.48 ...
##  $ dist   : num  0.00136 0.01222 0.10303 0.19009 0.27709 ...
##  $ om     : num  13.6 14 13 8 8.7 7.8 9.2 9.5 10.6 6.3 ...
##  $ ffreq  : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ soil   : Factor w/ 3 levels "1","2","3": 1 1 1 2 2 2 2 1 1 2 ...
##  $ lime   : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
##  $ landuse: Factor w/ 15 levels "Aa","Ab","Ag",..: 4 4 4 11 4 11 4 2 2 15 ...
##  $ dist.m : num  50 30 150 270 380 470 240 120 240 420 ...
# log10-transformed copies of the four metal concentrations (the target variables)
meuse$logZn <- log10(meuse$zinc)    # zinc
meuse$logCu <- log10(meuse$copper)  # copper
meuse$logPb <- log10(meuse$lead)    # lead
meuse$logCd <- log10(meuse$cadmium) # cadmium
# Prediction grid: locations with predictors only, at which we will predict
data(meuse.grid)
str(meuse.grid)
## 'data.frame':    3103 obs. of  7 variables:
##  $ x     : num  181180 181140 181180 181220 181100 ...
##  $ y     : num  333740 333700 333700 333700 333660 ...
##  $ part.a: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ part.b: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dist  : num  0 0 0.0122 0.0435 0 ...
##  $ soil  : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ffreq : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## 'data.frame':    3103 obs. of  7 variables:
##  $ x     : num  181180 181140 181180 181220 181100 ...
##  $ y     : num  333740 333700 333700 333700 333660 ...
##  $ part.a: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ part.b: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dist  : num  0 0 0.0122 0.0435 0 ...
##  $ soil  : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ffreq : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...

2 Random forest with randomForest

Packages used in this section: randomForest for random forests, randomForestExplainer for some diagnostics.

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(randomForestExplainer)

2.1 Build the forest

First build the forest, using the randomForest function:

# Fit a random forest regression of log10(Zn) on flood frequency, coordinates,
# distance to river, elevation, soil type and lime application.
# importance=T records permutation importance; mtry=5 tries 5 of the 7
# predictors at each split (the default would be floor(7/3) = 2).
m.lzn.rf <- randomForest(logZn ~ ffreq + x + y + dist.m + elev + soil + lime, data=meuse, importance=T, na.action=na.omit, mtry=5)
print(m.lzn.rf)
## 
## Call:
##  randomForest(formula = logZn ~ ffreq + x + y + dist.m + elev +      soil + lime, data = meuse, importance = T, mtry = 5, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 0.01847609
##                     % Var explained: 81.08

The mtry optional argument gives the number of variables randomly sampled as candidates at each split. By default this is \(\lfloor{p/3}\rfloor\) where \(p\) is the number of predictors, here \(7\), so \(\lfloor{7/3}\rfloor = 2\). We set mtry=5 to try most of the variables at each split for more accurate splitting.

Display the cross-validation error rate against the number of trees:

plot(m.lzn.rf)

2.2 Variable importance

Examine the variable importance. Type 1 is the increase in MSE if that predictor is randomly permuted among the points, rather than kept at its actual value at each point. It is computed as follows (from the Help): “For each tree, the prediction error on the OOB portion of the data is recorded (…MSE for regression). Then the same is done after permuting each predictor variable. The difference between the two are then averaged over all trees, and normalized by the standard deviation of the differences”.

Type 2 is the increase in node purity. It is computed as follows: “The total decrease in node impurities from splitting on the [given] variable, averaged over all trees… measured by residual sum of squares.”

# Type 1: permutation importance — mean increase in OOB MSE when the
# predictor is permuted, normalized by the SD of the differences
importance(m.lzn.rf, type=1)
##          %IncMSE
## ffreq  13.899792
## x      27.884703
## y      17.903159
## dist.m 54.401199
## elev   36.291613
## soil    7.075930
## lime    8.782412
# Type 2: total decrease in node impurity (residual sum of squares)
# from splits on the predictor, averaged over all trees
importance(m.lzn.rf, type=2)
##        IncNodePurity
## ffreq      0.4409211
## x          1.1092703
## y          0.7235428
## dist.m     7.9824271
## elev       3.6566473
## soil       0.1498958
## lime       0.6286723
# Dot chart of the permutation (type 1) importances
varImpPlot(m.lzn.rf, type=1)

2.3 Goodness-of-fit and out-of-bag errors

Predict back onto the known points, and evaluate the goodness-of-fit:

# Predict back at the training points; with newdata= all trees are used
# (in-bag included), so this over-states true predictive skill
p.rf <- predict(m.lzn.rf, newdata=meuse)
length(unique(p.rf))  # all 155 fitted values are distinct
## [1] 155
# Residuals: actual minus fitted
summary(r.rpp <- meuse$logZn - p.rf)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.1693091 -0.0386182 -0.0035208 -0.0006388  0.0339918  0.1584753
# RMSE of the fit at the training points
(rmse.rf <- sqrt(sum(r.rpp^2)/length(r.rpp)))
## [1] 0.06053521
# 1:1 plot of actual vs. fitted log10(Zn)
plot(meuse$logZn ~ p.rf, asp=1, pch=20, xlab="fitted", ylab="actual", xlim=c(2,3.3),          ylim=c(2,3.3), main="log10(Zn), Meuse topsoils, Random Forest")
grid()
abline(0,1)

A more realistic view of the predictive power of the model is the out-of-bag validation.

# Without newdata=, predict() returns the out-of-bag predictions:
# each point predicted only by trees that did not see it during fitting
p.rf.oob <- predict(m.lzn.rf)
# OOB residuals — wider spread than the fitted residuals above
summary(r.rpp.oob <- meuse$logZn - p.rf.oob)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.415293 -0.080181 -0.012552 -0.001904  0.076448  0.319725
# OOB RMSE — the honest estimate of prediction error
(rmse.oob <- sqrt(sum(r.rpp.oob^2)/length(r.rpp.oob)))
## [1] 0.1359268
plot(meuse$logZn ~ p.rf.oob, asp=1, pch=20,
     xlab="Out-of-bag cross-validation estimates",
     ylab="actual", xlim=c(2,3.3), ylim=c(2,3.3),
     main="log10(Zn), Meuse topsoils, Random Forest")
grid()
abline(0,1)

2.4 Sensitivity

Compute the RF several times and collect statistics:

# Refit the forest n times to see how much the fit statistics vary
# between runs (each run uses different bootstrap samples).
n <- 24
# Pre-allocate one row per replicate. NOTE: rep must be 1:n, not a
# hard-coded 1:10 — otherwise rows 11..n get an NA replicate number.
rf.stats <- data.frame(rep=1:n, rsq=as.numeric(NA), mse=as.numeric(NA))
for (i in seq_len(n)) {
  model.rf <- randomForest(logZn ~ ffreq + x + y + dist.m + elev + soil + lime,
                           data=meuse, importance=T, na.action=na.omit, mtry=5)
  # $mse and $rsq are per-ntree traces; each run is summarized by the
  # median of the five-number summary of its trace (as in the original
  # analysis — median(model.rf$mse) would be the more direct statistic)
  rf.stats[i, "mse"] <- median(summary(model.rf$mse))
  rf.stats[i, "rsq"] <- median(summary(model.rf$rsq))
}
summary(rf.stats[,2:3])
##       rsq              mse         
##  Min.   :0.7949   Min.   :0.01861  
##  1st Qu.:0.7990   1st Qu.:0.01920  
##  Median :0.8018   Median :0.01936  
##  Mean   :0.8016   Mean   :0.01938  
##  3rd Qu.:0.8033   3rd Qu.:0.01963  
##  Max.   :0.8094   Max.   :0.02003
hist(rf.stats[,"rsq"], xlab="RandomForest R^2")
rug(rf.stats[,"rsq"])

# the column holds MSE, not RMSE — label it accordingly
hist(rf.stats[,"mse"], xlab="RandomForest MSE")
rug(rf.stats[,"mse"])

3 Random forest with ranger

require(ranger)
## Loading required package: ranger
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
# Same model with ranger: permutation importance requested at fit time,
# mtry=5 to match the randomForest run above
m.lzn.ra <- ranger(logZn ~ ffreq + x + y + dist.m + elev + soil + lime, data=meuse, importance='permutation', mtry=5)
print(m.lzn.ra)
## Ranger result
## 
## Call:
##  ranger(logZn ~ ffreq + x + y + dist.m + elev + soil + lime, data = meuse,      importance = "permutation", mtry = 5) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      155 
## Number of independent variables:  7 
## Mtry:                             5 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.0186529 
## R squared (OOB):                  0.8102218

Goodness-of-fit:

# Predict at the training points; ranger's predict() uses the argument
# name `data` (not `newdata`) and returns a ranger.prediction object
p.ra <- predict(m.lzn.ra, data=meuse)
str(p.ra)
## List of 5
##  $ predictions              : num [1:155] 3 3.03 2.71 2.48 2.44 ...
##  $ num.trees                : num 500
##  $ num.independent.variables: num 7
##  $ num.samples              : int 155
##  $ treetype                 : chr "Regression"
##  - attr(*, "class")= chr "ranger.prediction"
length(unique(p.ra$predictions))  # all 155 fitted values are distinct
## [1] 155
# Residuals: actual minus fitted
summary(r.rap <- meuse$logZn - p.ra$predictions)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.171827 -0.038095 -0.004803 -0.001670  0.032449  0.152370
# RMSE of the fit at the training points (compare rmse.rf above)
(rmse.ra <- sqrt(sum(r.rap^2)/length(r.rap)))
## [1] 0.06126902
# 1:1 plot of actual vs. fitted log10(Zn)
plot(meuse$logZn ~ p.ra$predictions, asp=1, pch=20, xlab="fitted", ylab="actual", xlim=c(2,3.3),          ylim=c(2,3.3), main="log10(Zn), Meuse topsoils, Ranger")
grid()
abline(0,1)

Variable importance:

# Permutation importances; this importance() is ranger's, which masked
# randomForest's version when ranger was attached
importance(m.lzn.ra)
##       ffreq           x           y      dist.m        elev        soil 
## 0.008329948 0.011433750 0.007454751 0.081427247 0.033895884 0.001347622 
##        lime 
## 0.003676220

3.1 Sensitivity

Compute the RF several times and collect statistics:

# Refit with ranger n times to see the between-run variability of the
# OOB fit statistics.
n <- 24
# Pre-allocate one row per replicate. NOTE: rep must be 1:n, not a
# hard-coded 1:10 — otherwise rows 11..n get an NA replicate number.
ra.stats <- data.frame(rep=1:n, rsq=as.numeric(NA), mse=as.numeric(NA))
for (i in seq_len(n)) {
  model.ra <- ranger(logZn ~ ffreq + x + y + dist.m + elev + soil + lime,
                           data=meuse, importance="none", mtry=5,
                     write.forest = FALSE)
  # ranger reports the OOB MSE and OOB R^2 directly on the fitted object
  ra.stats[i, "mse"] <- model.ra$prediction.error
  ra.stats[i, "rsq"] <- model.ra$r.squared
}
summary(ra.stats[,2:3])
##       rsq              mse         
##  Min.   :0.8034   Min.   :0.01821  
##  1st Qu.:0.8056   1st Qu.:0.01872  
##  Median :0.8080   Median :0.01888  
##  Mean   :0.8080   Mean   :0.01887  
##  3rd Qu.:0.8095   3rd Qu.:0.01911  
##  Max.   :0.8147   Max.   :0.01933
# label the correct model (ranger, not randomForest)
hist(ra.stats[,"rsq"], xlab="Ranger R^2")
rug(ra.stats[,"rsq"])

# the column holds the OOB MSE, not RMSE — label it accordingly
hist(ra.stats[,"mse"], xlab="Ranger MSE")
rug(ra.stats[,"mse"])