I have a csv which contains the response variable (called ntl) and the predictors. I am running a Random Forest (RF) regression to predict the ntl values. Because I execute multiple times the RF algorithm, I added a number (030, 040) after the predictors name in order to make my life easier when I execute the for loop (see code below). The first six rows of the csv look like this:
head(block.data)
ntl agbh030 agbh040 blue030 blue040 nir030 nir040 pop030 pop040 road030 road040 tirs030
1 52.94 1.773147 1.778323 0.7179366 0.7178904 0.9911458 0.9909238 33.98462 33.99507 152.4204 153.0130 25.39851
2 49.45 2.895677 2.893922 0.9956004 0.9937335 1.3766788 1.3741488 62.45861 62.34375 435.8899 434.7412 35.85143
3 49.12 3.996930 3.992193 1.0742660 1.0725741 1.4975094 1.4952157 78.47204 78.32093 412.6563 411.8447 37.86271
4 44.37 4.146727 4.142556 1.1628517 1.1606724 1.5424725 1.5395863 64.81855 64.78107 295.8405 295.5381 38.16120
5 39.97 4.324530 4.319309 1.1125427 1.1113752 1.4355043 1.4340464 69.18198 68.98098 289.2147 288.5684 36.89496
6 38.44 3.946339 3.943006 1.0540453 1.0531118 1.4150538 1.4135966 47.66867 47.65901 194.1575 193.8339 34.88610
tirs040
1 25.39134
2 35.77946
3 37.79913
4 38.09968
5 36.84203
6 34.84804
As you can see, when I execute the first loop I am using the predictors which have the 030 in their names, and in the second loop the ones that have the 040 in their names etc.
library("randomForest")
library("doFuture")
wd <- "test/"
# Load the data
block.data <- read.csv(paste0(wd, "block.data.psf.csv"))
eq1 <- ntl ~ .
# for reproduciblity
set.seed(123)
foreach (i = seq(030, 040, by = 10)) %do% {
std <- sprintf("%03.0f", i)
popCol <- paste0("pop", std)
tirsCol <- paste0("tirs", std)
agbhCol <- paste0("agbh", std)
roadCol <- paste0("road", std)
blueCol <- paste0("blue", std)
nirCol <- paste0("nir", std)
testVect = c("ntl",
popCol,
tirsCol,
agbhCol,
roadCol,
blueCol,
nirCol)
subBlockData <- subset(block.data, select = testVect)
# default RF model
m1 <- randomForest(
formula = eq1,
data = subBlockData)
# number of trees with highest r2
btree = which.max(m1$rsq)
features <- subBlockData[, 2:7]
m1 <- randomForest(
formula = ntl ~ .,
data = subBlockData,
ntree = btree,
mtry = ncol(features)/3,
importance = FALSE)
m1 # "% Var explained" is the number I want in the csv
}
What I want is to create a csv (called r2.csv) which will have two columns (called std and r2). The first column will contain the number (030, 040 etc) after the predictors name and the second the r-squared of the RF (called m1 in my code). How can I do that?
Here is my csv:
block.data = structure(list(ntl = c(52.93999863, 49.45000076, 49.11999893,
44.36999893, 39.97000122, 38.43999863, 38.99000168, 40.88000107,
37.47999954, 40.65000153, 44.79999924, 50.38000107, 49.38000107,
68.80000305, 59.88000107), agbh030 = c(1.773146749, 2.89567709,
3.996930361, 4.146727085, 4.324529648, 3.946338654, 3.430744886,
2.554599762, 2.052541256, 1.614771366, 1.991117001, 2.35665369,
0.801872492, 3.310310841, 5.005721092), agbh040 = c(1.778323054,
2.893922091, 3.992192984, 4.14255619, 4.319309235, 3.943005562,
3.425927639, 2.553015947, 2.049137354, 1.619293809, 1.997152209,
2.3637712, 0.81288743, 3.306827068, 4.997349739), blue030 = c(0.717936635,
0.995600402, 1.074265957, 1.162851691, 1.112542748, 1.05404532,
0.972470403, 0.864918828, 0.797388136, 0.780449748, 0.717492938,
0.659686863, 0.544718802, 0.954298615, 1.098036289), blue040 = c(0.717890441,
0.993733525, 1.072574139, 1.160672426, 1.111375213, 1.053111792,
0.971249402, 0.863858879, 0.796197593, 0.780405819, 0.717574954,
0.661465645, 0.546935678, 0.95236063, 1.097277403), nir030 = c(0.99114579,
1.376678824, 1.49750936, 1.542472482, 1.435504317, 1.415053844,
1.312831402, 1.290184379, 1.151046991, 1.062780142, 0.961566031,
0.851697564, 0.746279657, 1.258480549, 1.483649731), nir040 = c(0.990923822,
1.374148846, 1.495215654, 1.539586306, 1.434046388, 1.41359663,
1.31127429, 1.288556933, 1.149601102, 1.062715054, 0.961549222,
0.854078174, 0.749200106, 1.256223083, 1.48290813), pop030 = c(33.98461914,
62.45861053, 78.47203827, 64.81855011, 69.18198395, 47.66866684,
85.6471405, 63.21319962, 17.80360603, 57.13486862, 88.91275024,
58.64352036, 34.46327209, 54.0263176, 61.17818451), pop040 = c(33.99507141,
62.34374619, 78.32093048, 64.78107452, 68.98097992, 47.65901184,
85.38433838, 63.14036942, 17.84538078, 57.06893158, 88.97719574,
58.98400879, 34.64493942, 53.96364212, 61.17742538), road030 = c(152.4203644,
435.8898926, 412.6563416, 295.8404541, 289.2147217, 194.1574554,
136.9489441, 213.915741, 195.634903, 275.4249573, 129.0567017,
110.0097885, 35.32381439, 138.662796, 365.485321), road040 = c(153.013031,
434.7412109, 411.8447266, 295.538147, 288.5683899, 193.8339233,
137.1492004, 213.4812012, 196.2715302, 276.1116638, 129.8981934,
110.5063858, 35.86360931, 139.2144165, 364.8994446), tirs030 = c(25.39851189,
35.85142899, 37.86270905, 38.16120148, 36.8949585, 34.88610458,
33.1179924, 29.98550034, 26.51055908, 26.58303452, 25.10464287,
22.69356346, 18.32171249, 30.8216877, 36.87643814), tirs040 = c(25.39133835,
35.77946091, 37.79912567, 38.09967804, 36.84203339, 34.84803772,
33.07061386, 29.94636917, 26.48450851, 26.57940674, 25.10551834,
22.75337219, 18.39633369, 30.77775002, 36.85960007)), class = "data.frame", row.names = c(NA,
-15L))
You don't need to fit the randomForest
model twice inside the loop, since the model fitted first time already contains the (pseudo) r-square values for each individual regression tree constructing the randomForest model, we just need to index the values returned, in order to extract the precentage variance explained by the tree that explains maximum variance.
set.seed(123)
r2.df <- NULL
foreach (i = seq(030, 040, by = 10)) %do% {
std <- sprintf("%03.0f", i)
popCol <- paste0("pop", std)
tirsCol <- paste0("tirs", std)
agbhCol <- paste0("agbh", std)
roadCol <- paste0("road", std)
blueCol <- paste0("blue", std)
nirCol <- paste0("nir", std)
testVect = c("ntl",
popCol,
tirsCol,
agbhCol,
roadCol,
blueCol,
nirCol)
subBlockData <- subset(block.data, select = testVect)
# default RF model
m1 <- randomForest(
formula = eq1,
data = subBlockData)
# index of the tree with the highest r2
btree = which.max(m1$rsq)
features <- subBlockData[, 2:7]
#m1$rsq[btree] # "% Var explained" is the number I want in the csv
r2.df <- rbind(r2.df, data.frame(column=i, r2=m1$rsq[btree]))
}
head(r2.df)
# column r2
#1 30 -0.1492738
#2 40 -0.4921124
write.csv(r2.df, 'result.csv', row.names = FALSE)
The r2 is negative here since the model fits the data poorly (worse than the mean value of the response variable explains the variancw), the data size being very small (only 15 observations).
y <- block.data$ntl
n <- length(y)
m <- mean(y)
p <- predict(m1)
plot(y, pch='19', col='black')
lines(y, col='black')
lines(rep(m, n), col='red', lty=2)
points(p, col='green', pch=19)
lines(p, col='green', pch=19)
legend('left', legend=c("ntl", "mean", "RF-pred"), fill = c("black", "red","green"))