Search code examples
rr-rasterparty

Inaccurate, Random Probabilities in Raster::Predict Cforest calculation with factor variables using type="prob"


I am having an issue with the raster::predict function related generating probability predictions of Presence with a cforest object.

When I generate a classification output, I obtain a correct output - however, when adding type="prob" to obtain probabilities, the output is a strange banding that doesn't correspond to the classification output. I have attached two photos here: Correct output based on classification predict and Banded, incorrect output based on type="prob" predict that details these issues.

As you will notice, the checkerboard pattern on the output places random areas of high and low probability randomly, whereas the classification output creates blocks of presence (1) and absence (0) where they would be expected.

Below is a snippit of code, with training data and a segment of a raster brick used for prediction.

I would greatly appreciate if anyone has insight into how I can obtain valid probabilities using this function.

Am I missing something important in the predfun function?

Thank you in advance!

Here is the example data:

library(raster)
training = structure(list(
 Presence = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"), 
 Elevation = c(3937.63541666689, 384.003472222217, 401.357638888879, 226.43749999999, 21.3055555555572, 305.399305555546, 38.3402777777742, 347.302083333335, 168.156250000001, 700.708333333328, 1034.2013888889, 1033.78125, 1426.99305555577, 912.874999999952, 665.854166666672, 657.187499999983, 1181.97916666667, 696.062499999948, 976.812500000002, 1017.98263888889), 
 Region = structure(c(3L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 4L, 4L, 4L, 3L, 3L, 1L, 4L, 2L, 4L, 4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
 Protected = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("0","1"), class = "factor"), 
 Population = structure(c(17L, 3L, 4L, 1L, 13L, 2L, 5L, 2L, 9L, 13L, 6L, 5L, 5L, 7L, 8L, 14L, 1L, 13L, 7L, 1L), .Label = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20"), class = "factor"), 
 Mean_Diunal_Range = c(126,151, 152, 153, 138, 125, 137, 158, 129, 137, 170, 172, 115,151, 150, 178, 149, 146, 158, 165), 
 Isothermality = c(31,49, 46, 49, 54, 51, 59, 48, 47, 62, 54, 54, 61, 37, 47, 63,61, 75, 55, 57), 
 Mean_Temp_Wettest_Q = c(-87, 338, 348, 135,236, 193, 305, 322, 247, 253, 249, 247, 210, 105, 313, 252,238, 267, 250, 255), 
 Mean_Temp_Driest_Q = c(64, 231, 253,230, 324, 194, 263, 261, 328, 236, 133, 130, 158, 315, 188,232, 173, 248, 166, 151), 
 Precip_Wettest_M = c(119, 30, 18,3, 16, 8, 22, 12, 11, 165, 62, 59, 71, 13, 11, 201, 85, 110,121,91), 
 Precip_Dryest_m = c(2, 0, 0, 0, 0, 0, 0, 0, 0,0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0), 
 Precip_Seasonality = c(81,133, 136, 65, 106, 65, 126, 85, 101, 110, 77, 76, 111, 85,111, 125, 116, 123, 106, 93), 
 Precip_Warmest_Q = c(19, 55,29, 0, 0, 6, 49, 23, 1, 152, 148, 146, 56, 0, 15, 143, 63,140, 174, 195), 
 Precip_Coldest_Q = c(201, 0, 1, 8, 14, 14,5, 6, 17, 0, 9, 10, 0, 29, 0, 0, 2, 0, 1, 2), 
 Agriculture_Date = c(9000,3000, 3000, 7200, 7600, 3500, 3500, 3500, 7600, 5000, 1000,1700, 1250, 9500, 4000, 2700, 1250, 4000, 1000, 1000), 
 distance_to_highways = c(0.697535828668298,3.21145962385929, 4.70553674201054, 1.26025341029057, 0.0306302671744576,1.29795751420045, 0.341335382947427, 2.01515485126633, 1.90935593837289,1.00778111672525, 0.0841242925270876, 0.0135315745860043,2.39038097221274, 1.49284290327056, 1.21019159581485, 5.71817373942967,1.64045219527117, 0.121375728043842, 0.535675418474612, 1.08581690073317)), 
row.names = c(NA, -20L), class = c("data.table", "data.frame"))

##Create Prediction Raster Brick

## Create Data Frame for Raster Brick
data_segment <- structure(list(
 Elevation = c(1187.9, 1173.5, 1158.2, 1143.3, 1125.6, 1112.4, 1232.7, 1203.3, 1176.7, 1156.8, 1138.9, 1140.8, 1249.9, 1216.9, 1193.2, 1171.4, 1153.5, 1157.6, 1261.5, 1233.2, 1208.2, 1185.6, 1175.5, 1184.2, 1289.1, 1256.4, 1240.7, 1212.2, 1208.1, 1220.9, 1304.6, 1297.2, 1286.6, 1249.2, 1231, 1256.1, 1341.5, 1329.3, 1328.5, 1291.8, 1250.2, 1288.6, 1379.2, 1322.8, 1305.6, 1293.7, 1234.9, 1260.5, 1354.5, 1274.5, 1257.7, 1281.1, 1230.8, 1196.3, 1288.5, 1237.4, 1221.5, 1241.6, 1212.7, 1162.3, 1199.4, 1185.4, 1194.5, 1216.1, 1207.4, 1149.8, 1150.6, 1157, 1176, 1176.1, 1155.6, 1138, 1145, 1131.9, 1121.1, 1116.3, 1109.4, 1116.2), 
 Region = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), 
 Protected = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
 Population = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 11, 11, 12, 12, 12, 15, 11, 11, 12, 12, 12, 15, 11, 11, 12, 12, 12, 16, 11, 11, 12, 12, 12, 16, 11, 11, 12, 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12), 
 Mean_Diunal_Range = c(10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 10.4, 10.5, 10.5, 10.5, 10.6, 10.6, 10.3, 10.4, 10.5, 10.5, 10.5, 10.4, 10.2, 10.3, 10.3, 10.4, 10.3, 10.4, 10.2, 10.2, 10.2, 10.2, 10.3, 10.3, 10.3, 10.2, 10.1, 10.2, 10.3, 10.3, 10.3, 10.2, 10.2, 10.2, 10.2, 10.1, 10.2, 10.3, 10.2, 10.1, 10.1, 10.1, 10.1, 10.2, 10.2, 10, 10, 10.1, 10.1, 10.1, 10, 9.9, 9.9, 10, 10, 10, 9.9, 9.9, 9.9, 9.9, 10, 9.9, 9.8, 9.9, 9.8, 9.8, 10.1, 10.1, 10, 9.8, 9.8, 9.7), 
 Isothermality = c(68.5, 68.7, 69.3, 69.4, 69.6, 69.7, 68.4, 68.5, 68.8, 68.7, 69.2, 69.2, 67.8, 68.1, 68.4, 68.6, 68.6, 68.3, 67.4, 67.4, 67.6, 67.9, 68, 68.1, 67.2, 66.8, 66.9, 67.3, 67.5, 67.3, 67.2, 66.4, 66.6, 66.9, 66.7, 67, 66.7, 66.7, 66.5, 66.4, 66.5, 66.2, 65.7, 66.3, 66, 65.7, 65.5, 65.7, 65.4, 65.1, 65.1, 65.2, 65.2, 65.3, 64.2, 64.5, 64.5, 64.1, 64.6, 64.7, 63.8, 64, 64.1, 63.7, 63.8, 64.2, 63.4, 63.7, 63.4, 63.6, 63.5, 63.7, 63.8, 64.1, 63.5, 63, 63.2, 63.1), 
 Mean_Temp_Wettest_Q = c(24.5, 24.5, 24.7, 24.8, 25, 25, 24.2, 24.4, 24.5, 24.7, 24.8, 24.8, 24.1, 24.3, 24.5, 24.6, 24.7, 24.8, 24, 24.2, 24.4, 24.5, 24.6, 24.7, 24, 24.1, 24.2, 24.4, 24.5, 24.5, 23.9, 24, 24, 24.3, 24.5, 24.4, 23.8, 23.8, 23.9, 24, 24.3, 24.1, 23.5, 23.9, 24, 24, 24.3, 24.3, 23.7, 24.1, 24.2, 24.1, 24.4, 24.6, 24, 24.4, 24.4, 24.3, 24.5, 24.8, 24.5, 24.6, 24.5, 24.5, 24.5, 24.9, 24.8, 24.8, 24.6, 24.7, 24.8, 24.9, 24.8, 25, 25, 25, 25, 25), 
 Mean_Temp_Driest_Q = c(22.4, 22.4, 22.5, 22.7, 22.8, 22.9, 22.1, 22.3, 22.4, 22.5, 22.7, 22.7, 22, 22.2, 22.4, 22.5, 22.6, 22.6, 21.9, 22.1, 22.3, 22.4, 22.5, 22.5, 21.8, 22, 22, 22.2, 22.3, 22.3, 21.7, 21.7, 21.8, 22.1, 22.2, 22.1, 21.5, 21.5, 21.6, 21.8, 22, 21.8, 21.2, 21.6, 21.7, 21.7, 22, 21.9, 21.3, 21.8, 21.9, 21.8, 22, 22.2, 21.6, 21.9, 22, 21.9, 22, 22.3, 22, 22.1, 22, 22, 22, 22.4, 22.2, 22.2, 22.1, 22.2, 22.2, 22.4, 22.3, 22.8, 22.4, 22.5, 22.5, 22.5),
 Precip_Wettest_M = c(161, 160, 161, 161, 159, 158, 163, 162, 163, 163, 162, 161, 166, 165, 164, 164, 163, 163, 169, 167, 166, 166, 166, 166, 172, 169, 169, 168, 169, 169, 173, 172, 173, 171, 170, 170, 174, 176, 176, 174, 171, 173, 179, 175, 175, 175, 171, 173, 180, 176, 174, 176, 172, 171, 181, 177, 176, 177, 174, 170, 177, 176, 176, 177, 175, 170, 175, 175, 177, 178, 175, 174, 176, 175, 175, 175, 173, 174), 
 Precip_Dryest_m = c(6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 7, 6, 7, 6, 6, 6, 7, 7, 7, 6, 6, 6, 8, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), 
 Precip_Seasonality = c(100.8, 101.7, 103, 105.2, 105.9, 107.2, 99.8, 101.8, 103.4, 105.5, 106.3, 106.8, 101.1, 101.8, 103.4, 105.1, 106.2, 106.8, 100.8, 102, 103.1, 105.3, 106.6, 107.6, 101.4, 102.5, 103.8, 105.3, 106.9, 107.2, 100.8, 102.2, 102.7, 104.9, 106.3, 106.6, 100.1, 101.3, 102.6, 104.1, 105.7, 106.3, 98.8, 101.3, 101.9, 103.6, 104.8, 105.8, 99.2, 102, 102.6, 103.5, 104.5, 106.4, 100.1, 102.4, 103.8, 104.2, 105, 106.8, 100.7, 103, 103.6, 104.1, 104.7, 105.8, 101.2, 102.2, 103.9, 104.6, 105.6, 106.7, 101.3, 102.5, 103.4, 104.2, 105.5, 106.7), 
 Precip_Warmest_Q = c(97, 96, 94, 89, 86, 84, 100, 97, 94, 90, 87, 86, 100, 99, 95, 91, 88, 87, 102, 99, 96, 92, 89, 87, 102, 98, 96, 92, 91, 90, 105, 100, 98, 95, 92, 92, 106, 104, 101, 97, 93, 93, 109, 101, 101, 97, 92, 93, 108, 100, 97, 97, 93, 89, 105, 99, 96, 96, 93, 89, 101, 98, 96, 95, 94, 89, 98, 97, 96, 94, 91, 89, 99, 96, 94, 93, 90, 88), 
 Precip_Coldest_Q = c(23, 22, 21, 21, 21, 19, 25, 22, 22, 21, 21, 20, 24, 23, 21, 21, 21, 20, 25, 23, 22, 21, 21, 21, 25, 23, 22, 21, 21, 21, 25, 24, 25, 22, 21, 21, 27, 27, 25, 23, 22, 22, 30, 27, 26, 25, 24, 23, 30, 27, 26, 26, 24, 23, 30, 27, 26, 26, 25, 21, 28, 26, 26, 26, 25, 24, 28, 26, 26, 26, 25, 24, 28, 27, 26, 26, 24, 24), 
 Agriculture_Date = c(4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000), 
 distance_to_highways = c(0.1, 0, 0, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.2, 0.1, 0, 0, 0.1, 0.1, 0.1, 0.1, 0, 0, 0.1, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0, 0, 0, 0, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.1, 0, 0, 0.1, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.2, 0, 0, 0, 0.1, 0.1, 0.2, 0, 0, 0, 0.1, 0.1, 0.2)), 
class = "data.frame", row.names = c(NA, -78L))

## Create RasterBrick and assign values from dataframe
segment_brick <- brick(nrow=13, ncol=6, xmn=39, xmx=39.25, ymn=3.833333, ymx=4.375,crs="+proj=longlat +datum=WGS84 +no_defs", nl=15)
values(segment_brick) <-  as.matrix(data_segment)

Now the modeling:

library(party)
##Fit Cforest Model
cforest_example = party::cforest(Presence ~ ., data = training,
            control = cforest_unbiased(ntree=10,mtry=3,trace=TRUE))


##Create factor object for raster predict
f = list(levels(training$Protected),levels(training$Region),levels(training$Population))

##Rename factor lists to match original data
names(f) <- c("Protected", "Region", "Population")

##create a wrapper function for Cforest predict in raster package
predfun <- function(m, d, ...) predict(m, newdata=d, ...)

##Predict Raster with no Probabilities - results in correct prediction
Attempt_Processed = raster::predict(segment_brick, cforest_example, OOB=TRUE, factors=f, fun=predfun,progress="text",na.rm=TRUE)
plot(Attempt_Processed)

##Predict Raster with Probabilities - results in odd banded image
Attempt_Processed_prob = raster::predict(segment_brick, cforest_example, OOB=TRUE, factors=f, fun=predfun,progress="text",na.rm=TRUE,type="prob")
plot(Attempt_Processed_prob)

Correct output based on classification predict

Banded, incorrect output based on type="prob" predict


Solution

  • The way to go about this is to have a look what the model's predict function returns

    p <- predict(cforest_example, newdata=training[1:3,], OOB=TRUE, type="prob")
    p
    #$`1`
    #     Presence.0 Presence.1
    #[1,]  0.5461538  0.4538462
    #
    #$`2`
    #     Presence.0 Presence.1
    #[1,]  0.5461538  0.4538462
    #
    #$`3`
    #     Presence.0 Presence.1
    #[1,]  0.5461538  0.4538462
    

    It returns a list, which gets unlisted by raster::predict

    unlist(p)
    #       11        12        21        22        31        32 
    #0.5461538 0.4538462 0.5461538 0.4538462 0.5461538 0.4538462 
    

    and that explains the banding pattern

    You can fix this with a prediction function like this

    pfun <- function(m, d, ...) {
        p <- predict(m, newdata=d, ...)
        matrix(unlist(p), ncol=2, byrow=TRUE)
    }
        
    pfun(cforest_example, training[1:3,], OOB=TRUE, type="prob")
    #          [,1]      [,2]
    #[1,] 0.5461538 0.4538462
    #[2,] 0.5461538 0.4538462
    #[3,] 0.5461538 0.4538462
    

    Which you can now use with the RasterBrick

    prob = raster::predict(segment_brick, cforest_example, OOB=TRUE, factors=f, fun=pfun, index=1:2, na.rm=TRUE,type="prob")
    plot(prob)
    prob
    #class      : RasterBrick 
    #dimensions : 13, 6, 78, 2  (nrow, ncol, ncell, nlayers)
    #resolution : 0.04166667, 0.04166669  (x, y)
    #extent     : 39, 39.25, 3.833333, 4.375  (xmin, xmax, ymin, ymax)
    #crs        : +proj=longlat +datum=WGS84 +no_defs 
    #source     : memory
    #names      :   layer.1,   layer.2 
    #min values : 0.5461538, 0.4538462 
    #max values : 0.5461538, 0.4538462 
    

    As you only have 2 probabilities, you could simplify the function to return only the probability for one class (absence in this case):

    pfun <- function(m, d, ...) {
        p <- predict(m, newdata=d, ...)
        matrix(unlist(p), ncol=2, byrow=TRUE)[,1]
    }
    prob = raster::predict(segment_brick, cforest_example, OOB=TRUE, factors=f, fun=pfun, na.rm=TRUE,type="prob")