I have to classify a lot of crops based on three conditions calculated in a grid of 1e6 points. I'm trying to optimize the function below (hopefully without moving to C or Rust). Any ideas?
Iit's possible to reformat the input data if necessary. I already tried with data.table
but the performance was worse.
This is my best shot:
condtion1 <- letters[1:8]
condtion2 <- letters[9:15]
condtion3 <- letters[16:24]
crop <- sample(0:1, 24, replace = T)
names(crop) <- letters[1:24]
n <- 1e6
condtions1 <- sample(condtion1, n, replace = T)
condtions2 <- sample(condtion2, n, replace = T)
condtions3 <- sample(condtion3, n, replace = T)
get_suitability <- function(){
result <- character(n)
for (i in seq_along(result)) {
if (crop[[condtions1[[i]]]] == 0 | crop[[condtions2[[i]]]] == 0) result[[i]] <- "not suitable"
else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 1) result[[i]] <- "suitable"
else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 0) result[[i]] <- "suitable with irrigation"
}
result
}
microbenchmark::microbenchmark(
get_suitability(),
times = 5
)
#> Unit: seconds
#> expr min lq mean median uq max neval
#> get_suitability() 2.402434 2.408322 2.568981 2.641211 2.667943 2.724993 5
Created on 2024-03-24 with reprex v2.1.0
Vectorise over the condtions
getting rid of for/if
. The logical indices take care of both for
and if
.
In a comment to the question I write:
You can initialize
result <- rep("not suitable", n)
and remove one of the if's from the loop.
Notes:
get_suitability2
is my idea in comment to the question, a bad idea as it turned out;get_suitability3b
is a simplified version of get_suitability3
and the fastest of all;get_suitability4
is user2554330´s last function and faster than the original question code.condtion1 <- letters[1:8]
condtion2 <- letters[9:15]
condtion3 <- letters[16:24]
crop <- sample(0:1, 24, replace = T)
names(crop) <- letters[1:24]
n <- 1e6
condtions1 <- sample(condtion1, n, replace = T)
condtions2 <- sample(condtion2, n, replace = T)
condtions3 <- sample(condtion3, n, replace = T)
get_suitability <- function(){
result <- character(n)
for (i in seq_along(result)) {
if (crop[[condtions1[[i]]]] == 0 | crop[[condtions2[[i]]]] == 0) result[[i]] <- "not suitable"
else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 1) result[[i]] <- "suitable"
else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 0) result[[i]] <- "suitable with irrigation"
}
result
}
get_suitability2 <- function(){
result <- rep("not suitable", n)
for (i in seq_along(result)) {
if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 1) result[[i]] <- "suitable"
else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 0) result[[i]] <- "suitable with irrigation"
}
result
}
get_suitability3 <- function(){
result <- rep("not suitable", n)
i1 <- crop[ condtions1 ] == 1
i2 <- crop[ condtions2 ] == 1
i3 <- crop[ condtions3 ] == 1
result[i1 & i2 & i3] <- "suitable"
result[i1 & i2 & !i3] <- "suitable with irrigation"
result
}
get_suitability3b <- function(){
result <- rep("not suitable", n)
i1 <- crop[ condtions1 ] == 1 & crop[ condtions2 ] == 1
i3 <- crop[ condtions3 ] == 1
result[i1 & i3] <- "suitable"
result[i1 & !i3] <- "suitable with irrigation"
result
}
get_suitability4 <- function(){
result <- ifelse(crop[condtions1] == 0 |
crop[condtions2] == 0, "not suitable",
ifelse(crop[condtions3] == 1, "suitable",
"suitable with irrigation"))
names(result) <- NULL
result
}
library(microbenchmark)
res <- get_suitability()
res2 <- get_suitability2()
res3 <- get_suitability3()
res3b <- get_suitability3b()
res4 <- get_suitability4()
identical(res, res2)
#> [1] TRUE
identical(res, res3)
#> [1] TRUE
identical(res, res3b)
#> [1] TRUE
identical(res, res4)
#> [1] TRUE
mb <- microbenchmark(
get_suitability(),
get_suitability2(),
get_suitability3(),
get_suitability3b(),
get_suitability4(),
times = 5L
)
print(mb, order = "median")
#> Unit: milliseconds
#> expr min lq mean median uq
#> get_suitability3b() 120.5004 123.8272 144.3827 137.7121 158.9400
#> get_suitability3() 130.9886 141.4570 158.9099 154.2719 179.9035
#> get_suitability4() 630.0646 651.2294 677.3693 687.7445 703.8762
#> get_suitability() 1496.4989 1522.9126 1540.5882 1535.8001 1566.6336
#> get_suitability2() 2999.3825 3008.2696 3064.8530 3083.5560 3102.7165
#> max neval cld
#> 180.9339 5 c
#> 187.9287 5 c
#> 713.9316 5 d
#> 1581.0956 5 a
#> 3130.3405 5 b
Created on 2024-03-24 with reprex v2.1.0