
How can I export the r2 values of multiple Random Forest regressions to a csv?


I have a csv which contains the response variable (called ntl) and the predictors. I am running a Random Forest (RF) regression to predict the ntl values. Because I execute the RF algorithm multiple times, I added a number (030, 040) after each predictor's name to make my life easier when I run the for loop (see code below). The first six rows of the csv look like this:

head(block.data)
                 ntl  agbh030  agbh040   blue030   blue040    nir030    nir040   pop030   pop040  road030  road040  tirs030
1 52.94 1.773147 1.778323 0.7179366 0.7178904 0.9911458 0.9909238 33.98462 33.99507 152.4204 153.0130 25.39851
2 49.45 2.895677 2.893922 0.9956004 0.9937335 1.3766788 1.3741488 62.45861 62.34375 435.8899 434.7412 35.85143
3 49.12 3.996930 3.992193 1.0742660 1.0725741 1.4975094 1.4952157 78.47204 78.32093 412.6563 411.8447 37.86271
4 44.37 4.146727 4.142556 1.1628517 1.1606724 1.5424725 1.5395863 64.81855 64.78107 295.8405 295.5381 38.16120
5 39.97 4.324530 4.319309 1.1125427 1.1113752 1.4355043 1.4340464 69.18198 68.98098 289.2147 288.5684 36.89496
6 38.44 3.946339 3.943006 1.0540453 1.0531118 1.4150538 1.4135966 47.66867 47.65901 194.1575 193.8339 34.88610
   tirs040
1 25.39134
2 35.77946
3 37.79913
4 38.09968
5 36.84203
6 34.84804 

As you can see, in the first iteration of the loop I use the predictors that have 030 in their names, in the second iteration the ones that have 040 in their names, and so on.

library("randomForest")
library("doFuture") 

wd <- "test/"

# Load the data
block.data <- read.csv(paste0(wd, "block.data.psf.csv"))
eq1 <- ntl ~ .

# for reproducibility
set.seed(123)

foreach (i = seq(030, 040, by = 10)) %do% {
  std <- sprintf("%03.0f", i)
  
  popCol <- paste0("pop", std)
  tirsCol <- paste0("tirs", std)
  agbhCol <- paste0("agbh", std)
  roadCol <- paste0("road", std)
  blueCol <- paste0("blue", std)
  nirCol <- paste0("nir", std)

  testVect = c("ntl", 
               popCol,
               tirsCol,
               agbhCol,
               roadCol,
               blueCol,
               nirCol)
  
  subBlockData <- subset(block.data, select = testVect)
  
  # default RF model
  m1 <- randomForest(
    formula = eq1,
    data    = subBlockData)
  
  # number of trees with highest r2
  btree = which.max(m1$rsq)
  
  features <- subBlockData[, 2:7]
  
  m1 <- randomForest(
    formula = ntl ~ .,
    data    = subBlockData,
    ntree = btree,
    mtry = ncol(features)/3,
    importance = FALSE)
  
  m1 # "% Var explained" is the number I want in the csv
}

What I want is to create a csv (called r2.csv) with two columns (called std and r2). The first column will contain the number after the predictors' names (030, 040 etc.) and the second the r-squared of the RF model (called m1 in my code). How can I do that?

Here is my data (output of dput):

block.data = structure(list(ntl = c(52.93999863, 49.45000076, 49.11999893, 
44.36999893, 39.97000122, 38.43999863, 38.99000168, 40.88000107, 
37.47999954, 40.65000153, 44.79999924, 50.38000107, 49.38000107, 
68.80000305, 59.88000107), agbh030 = c(1.773146749, 2.89567709, 
3.996930361, 4.146727085, 4.324529648, 3.946338654, 3.430744886, 
2.554599762, 2.052541256, 1.614771366, 1.991117001, 2.35665369, 
0.801872492, 3.310310841, 5.005721092), agbh040 = c(1.778323054, 
2.893922091, 3.992192984, 4.14255619, 4.319309235, 3.943005562, 
3.425927639, 2.553015947, 2.049137354, 1.619293809, 1.997152209, 
2.3637712, 0.81288743, 3.306827068, 4.997349739), blue030 = c(0.717936635, 
0.995600402, 1.074265957, 1.162851691, 1.112542748, 1.05404532, 
0.972470403, 0.864918828, 0.797388136, 0.780449748, 0.717492938, 
0.659686863, 0.544718802, 0.954298615, 1.098036289), blue040 = c(0.717890441, 
0.993733525, 1.072574139, 1.160672426, 1.111375213, 1.053111792, 
0.971249402, 0.863858879, 0.796197593, 0.780405819, 0.717574954, 
0.661465645, 0.546935678, 0.95236063, 1.097277403), nir030 = c(0.99114579, 
1.376678824, 1.49750936, 1.542472482, 1.435504317, 1.415053844, 
1.312831402, 1.290184379, 1.151046991, 1.062780142, 0.961566031, 
0.851697564, 0.746279657, 1.258480549, 1.483649731), nir040 = c(0.990923822, 
1.374148846, 1.495215654, 1.539586306, 1.434046388, 1.41359663, 
1.31127429, 1.288556933, 1.149601102, 1.062715054, 0.961549222, 
0.854078174, 0.749200106, 1.256223083, 1.48290813), pop030 = c(33.98461914, 
62.45861053, 78.47203827, 64.81855011, 69.18198395, 47.66866684, 
85.6471405, 63.21319962, 17.80360603, 57.13486862, 88.91275024, 
58.64352036, 34.46327209, 54.0263176, 61.17818451), pop040 = c(33.99507141, 
62.34374619, 78.32093048, 64.78107452, 68.98097992, 47.65901184, 
85.38433838, 63.14036942, 17.84538078, 57.06893158, 88.97719574, 
58.98400879, 34.64493942, 53.96364212, 61.17742538), road030 = c(152.4203644, 
435.8898926, 412.6563416, 295.8404541, 289.2147217, 194.1574554, 
136.9489441, 213.915741, 195.634903, 275.4249573, 129.0567017, 
110.0097885, 35.32381439, 138.662796, 365.485321), road040 = c(153.013031, 
434.7412109, 411.8447266, 295.538147, 288.5683899, 193.8339233, 
137.1492004, 213.4812012, 196.2715302, 276.1116638, 129.8981934, 
110.5063858, 35.86360931, 139.2144165, 364.8994446), tirs030 = c(25.39851189, 
35.85142899, 37.86270905, 38.16120148, 36.8949585, 34.88610458, 
33.1179924, 29.98550034, 26.51055908, 26.58303452, 25.10464287, 
22.69356346, 18.32171249, 30.8216877, 36.87643814), tirs040 = c(25.39133835, 
35.77946091, 37.79912567, 38.09967804, 36.84203339, 34.84803772, 
33.07061386, 29.94636917, 26.48450851, 26.57940674, 25.10551834, 
22.75337219, 18.39633369, 30.77775002, 36.85960007)), class = "data.frame", row.names = c(NA, 
-15L))

Solution

  • You don't need to fit the randomForest model twice inside the loop: the model fitted the first time already contains, in m1$rsq, the (pseudo) r-squared of the forest as each additional tree is added. We just need to index that vector to extract the percentage of variance explained at the number of trees that explains the most variance (see the note after the code below).

    set.seed(123)
    r2.df <- NULL
    
    foreach (i = seq(030, 040, by = 10)) %do% {
      std <- sprintf("%03.0f", i)
      
      popCol <- paste0("pop", std)
      tirsCol <- paste0("tirs", std)
      agbhCol <- paste0("agbh", std)
      roadCol <- paste0("road", std)
      blueCol <- paste0("blue", std)
      nirCol <- paste0("nir", std)
      
      testVect = c("ntl", 
                   popCol,
                   tirsCol,
                   agbhCol,
                   roadCol,
                   blueCol,
                   nirCol)
      
      subBlockData <- subset(block.data, select = testVect)
      
      # default RF model
      m1 <- randomForest(
        formula = eq1,
        data    = subBlockData)
      
      # number of trees at which the OOB pseudo r-squared peaks
      btree = which.max(m1$rsq)
      
      # "% Var explained" in print(m1) is 100 * this pseudo r-squared
      r2.df <- rbind(r2.df, data.frame(column = i, r2 = m1$rsq[btree]))
    }
    
    head(r2.df)
    #  column         r2
    #1     30 -0.1492738
    #2     40 -0.4921124
    write.csv(r2.df, 'result.csv', row.names = FALSE)
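
    A note on the extracted value: m1$rsq holds the out-of-bag pseudo r-squared of the forest as trees are added, and the "% Var explained" that print(m1) reports is simply 100 times its last element, so multiply by 100 if you want the csv to hold percentages. If you also want the zero-padded labels (030, 040) and the column names std/r2 asked for in the question, a minimal variation of the collection step (the file name r2.csv follows the question; adapt as needed) could be:

    # inside the loop, after btree has been computed:
    r2.df <- rbind(r2.df,
                   data.frame(std = std,                   # "030", "040", ...
                              r2  = 100 * m1$rsq[btree]))  # as a percentage, like print(m1)

    # after the loop:
    write.csv(r2.df, paste0(wd, "r2.csv"), row.names = FALSE)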
    

    The r2 is negative here because the model fits the data poorly (worse than simply predicting the mean of the response variable), the data set being very small (only 15 observations).
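
    To see where the negative value comes from: the pseudo r-squared reported by randomForest is (roughly) one minus the out-of-bag mean squared error divided by the variance of the response, so it drops below zero as soon as the OOB predictions are worse than always predicting the mean. A minimal check, using the last model fitted by the loop (m1 here is whichever model was fitted last):

    # out-of-bag predictions and the corresponding pseudo r-squared
    y   <- block.data$ntl
    p   <- predict(m1)            # no newdata -> OOB predictions
    mse <- mean((y - p)^2)        # OOB mean squared error
    1 - mse / var(y)              # negative whenever mse > var(y), i.e. worse than the mean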

    # compare the observed ntl values, their mean, and the RF predictions
    y <- block.data$ntl
    n <- length(y)
    m <- mean(y)
    p <- predict(m1)
    plot(y, pch = 19, col = 'black')
    lines(y, col = 'black')
    lines(rep(m, n), col = 'red', lty = 2)
    points(p, col = 'green', pch = 19)
    lines(p, col = 'green')
    legend('left', legend = c("ntl", "mean", "RF-pred"), fill = c("black", "red", "green"))
    

    [Plot: observed ntl values (black), their mean (red, dashed) and the RF predictions (green)]