Search code examples
ragent-based-modeling

How to manage 3D arrays in agent-based models with R?


I am building an agent-based model with R but I have memory issues by trying to work with large 3D arrays. In the 3D arrays, the first dimension corresponds to the time (from 1 to 3650 days), the second dimension defines the properties of individuals or landscape cells and the third dimension represents each individual or landscape cell. At each time step (1 day), each 3D array is filled using several functions. Ideally, I would like to run my ABM over large landscapes (for example, 90000 cells) containing a large number of individuals (for example, 720000). Actually, this is impossible because of memory issues.

Currently, the 3D arrays are defined at initialization so that data are stored in the array at each time step. However, to fill one 3D array at t from the model, I need to only keep data at t – 1 and t – tf – 1, where tf is a duration parameter that is fixed (e.g., tf = 320 days). Here an example of one model function that is used to fill one 3D array (there are 3 duration parameters):

    library(ff)
    library(magrittr)
    library(dplyr)
    library(tidyr)
    library(gtools)

    ## Define parameters
    tf1 <- 288
    tf2 <- 292
    tf3 <- 150

    ## Define 3D array
    col_array <- c(letters[seq( from = 1, to = 9 )])
    s_array <- ff(-999, dim=c(3650, 9, 2500), dimnames=list(NULL, col_array, as.character(seq(1, 2500, 1))), 
                              filename="s_array.ffd", vmode="double", overwrite = t) ## 3th dimension = patch ID

    ## Define initial array
    initial_s_array <- matrix(sample.int(100, size = 2500*9, replace = TRUE), nrow = 2500, ncol = 9, dimnames=list(NULL, col_array))

   ## Loop over time 
    line <- 1
    for(t in 1:3650){
      print(t)
      s_array[t,c("a", "b", "c", "d", "e", "f", "g", "h", "i"),] <- func(v_t_1 = ifelse((t - 1) >= 1, list(s_array[(t - 1),,]), NA)[[1]], 
                                                                         v_t_tf1_1 = ifelse((t - tf1 - 1) >= 1, list(s_array[(t - tf1 - 1),,]), NA)[[1]], 
                                                                         v_t_tf2_1 = ifelse((t - tf2 - 1) >= 1, list(s_array[(t - tf2 - 1),,]), NA)[[1]], 
                                                                         v_t_tf3_1 = ifelse((t - tf3 - 1) >= 1, list(s_array[(t - tf3 - 1),,]), NA)[[1]], 
                                                                         v_t_0 = initial_s_array, columns_names = col_array)
      line <- line + 1
    }


    func <- function(v_t_1, v_t_tf1_1, v_t_tf2_1, v_t_tf3_1, v_t_0, columns_names){

      ## Data at t-1
      dt_t_1 <- (ifelse(!(all(is.na(v_t_1))), 
                        list(v_t_1 %>% 
                               as.data.frame.table(stringsAsFactors = FALSE) %>% 
                               dplyr::mutate_all(as.character)), NA))[[1]]

      ## Data at t-tf1-1
      dt_t_tf1_1 <- (ifelse(!(all(is.na(v_t_tf1_1))), 
                             list(v_t_tf1_1 %>% 
                                    as.data.frame.table(stringsAsFactors = FALSE) %>% 
                                    dplyr::mutate_all(as.character)), NA))[[1]]

      ## Data at t-tf2-1
      dt_t_tf2_1 <- (ifelse(!(all(is.na(v_t_tf2_1))), 
                             list(v_t_tf2_1 %>% 
                                    as.data.frame.table(stringsAsFactors = FALSE) %>% 
                                    dplyr::mutate_all(as.character)), NA))[[1]]

      ## Data at t-tf3-1
      dt_t_tf3_1 <- (ifelse(!(all(is.na(v_t_tf3_1))), 
                             list(v_t_tf3_1 %>% 
                                    as.data.frame.table(stringsAsFactors = FALSE) %>% 
                                    dplyr::mutate_all(as.character)), NA))[[1]]

      ## Format data at t-1
      dt_t_1_reshape <- (ifelse(!(all(is.na(dt_t_1))), 
                                list(dt_t_1 %>%
                                       dplyr::rename(ID = Var2) %>%
                                       tidyr::spread(Var1, Freq) %>%
                                       dplyr::select(ID, columns_names) %>%
                                       dplyr::arrange(match(ID, mixedsort(colnames(v_t_1))))), NA))[[1]]

      ## Format data at t-tf1-1
      dt_t_tf1_1_reshape <- (ifelse(!(all(is.na(dt_t_tf1_1))), 
                                     list(dt_t_tf1_1 %>%
                                            dplyr::rename(ID = Var2) %>%
                                            tidyr::spread(Var1, Freq) %>%
                                            dplyr::select(ID, columns_names) %>%
                                            dplyr::arrange(match(ID, mixedsort(colnames(v_t_tf1_1))))), NA))[[1]]

      ## Format data at t-tf2-1
      dt_t_tf2_1_reshape <- (ifelse(!(all(is.na(dt_t_tf2_1))), 
                                     list(dt_t_tf2_1 %>%
                                            dplyr::rename(ID = Var2) %>%
                                            tidyr::spread(Var1, Freq) %>%
                                            dplyr::select(ID, columns_names) %>%
                                            dplyr::arrange(match(ID, mixedsort(colnames(v_t_tf2_1))))), NA))[[1]]

      ## Format data at t-tf3-1
      dt_t_tf3_1_reshape <- (ifelse(!(all(is.na(dt_t_tf3_1))), 
                                     list(dt_t_tf3_1 %>%
                                            dplyr::rename(ID = Var2) %>%
                                            tidyr::spread(Var1, Freq) %>%
                                            dplyr::select(ID, columns_names) %>%
                                            dplyr::arrange(match(ID, mixedsort(colnames(v_t_tf3_1))))), NA))[[1]]

      ## Retrieve data
      a_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("a")])), list(v_t_0[,c("a")])))[[1]] 
      d_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("d")])), list(v_t_0[,c("d")])))[[1]]   
      g_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("f")])), list(v_t_0[,c("f")])))[[1]] 

      a_t_tf1_1 <- (ifelse(!(all(is.na(dt_t_tf1_1_reshape))), list(as.numeric(dt_t_tf1_1_reshape[,c("a")])), 0))[[1]] 
      d_t_tf2_1 <- (ifelse(!(all(is.na(dt_t_tf2_1_reshape))), list(as.numeric(dt_t_tf2_1_reshape[,c("d")])), 0))[[1]] 
      g_t_tf3_1 <- (ifelse(!(all(is.na(dt_t_tf3_1_reshape))), list(as.numeric(dt_t_tf3_1_reshape[,c("f")])), 0))[[1]] 

      b_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("b")])), list(v_t_0[,c("b")])))[[1]] 
      e_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("e")])), list(v_t_0[,c("e")])))[[1]] 
      h_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("h")])), list(v_t_0[,c("h")])))[[1]] 

      b_t_tf1_1 <- (ifelse(!(all(is.na(dt_t_tf1_1_reshape))), list(as.numeric(dt_t_tf1_1_reshape[,c("b")])), 0))[[1]] 
      e_t_tf2_1 <- (ifelse(!(all(is.na(dt_t_tf2_1_reshape))), list(as.numeric(dt_t_tf2_1_reshape[,c("e")])), 0))[[1]] 
      h_t_tf3_1 <- (ifelse(!(all(is.na(dt_t_tf3_1_reshape))), list(as.numeric(dt_t_tf3_1_reshape[,c("h")])), 0))[[1]] 

      ## Define discrete equations
      a_t <- round(0.4*a_t_1 + 0.5*a_t_tf1_1)
      b_t <- round(0.5*b_t_1 + 0.6*b_t_tf1_1)
      c_t <- a_t + b_t
      d_t <- round(0.7*d_t_1 + 0.7*d_t_tf2_1)
      e_t <- round(0.9*e_t_1 + 0.4*e_t_tf2_1)
      f_t <- d_t + e_t
      g_t <- round(0.3*g_t_1 + 0.2*g_t_tf3_1)
      h_t <- round(0.5*h_t_1 + 0.1*h_t_tf3_1)
      i_t <- g_t + h_t

      ## Update the values
      dt_array <- as.matrix(cbind(a_t, b_t, c_t, d_t, e_t, f_t, g_t, h_t, i_t))
      ## print(dt_array)

      ## Build the output matrix         
      dt_array <- t(dt_array)

      return(dt_array)

    }

The function “func” takes as argument data at t – 1 and t – tf – 1 providing one 3D array “s_array”. The function returns a data frame that is used to fill the 3D array.

I think that I could reduce the first dimension of my arrays by keeping only data at t – 1 and t – tf – 1 (rather than keep data at each time step from 1 to 3650 days). However, I don’t know how to manage these new 3D arrays in the ABM at each time step (i.e., how to initialize the 3D arrays and to only store data at t – 1 and t – tf – 1)?

Edit: I have tested the example with 90000 observations for the 3rd dimension. The number of rows (i.e., 3650) in each array is too large.

> s_array <- ff(-999, dim=c(3650, 9, 90000), dimnames=list(NULL, col_array, as.character(seq(1, 90000, 1))), 
+               filename="s_array.ffd", vmode="double", overwrite = TRUE) 
Error in if (length < 0 || length > .Machine$integer.max) stop("length must be between 1 and .Machine$integer.max") : 
  missing value where TRUE/FALSE needed
In addition: Warning message:
In ff(-999, dim = c(3650, 9, 90000), dimnames = list(NULL, col_array,  :
  NAs introduced by coercion to integer range

Is there a way to reduce the number of rows and apply the function that is used to fill the arrays ?


Solution

  • The reason I said that R probably isn't ideal is because of its copy-on-modify semantics, so every time you change something in any array/matrix/data frame, a copy must be made. I think R can do some clever things with its memory management, but still.

    Using ff to avoid having all time slices in RAM at the same time is probably indeed advantageous, but your code is probably changing storage formats far too often, juggling data structures back and forth. I think I packed the logic in a couple R6 class (along with some improvements), maybe it can get you started with improving your code's memory usage:

    suppressPackageStartupMessages({
      library(R6)
      library(ff)
    })
    
    SArray <- R6::R6Class(
      "SArray",
      public = list(
        time_slices = NULL,
    
        initialize = function(sdim, sdimnames) {
          self$time_slices <- lapply(1L:sdim[1L], function(ignored) {
            ff(NA_real_, vmode="double", dim=sdim[-1L], dimnames=sdimnames[-1L])#, FF_RETURN=FALSE)
          })
    
          names(self$time_slices) <- sdimnames[[1L]]
        }
      )
    )
    
    `[.SArray` <- function(s_array, i, j, ...) {
      s_array$time_slices[[i]][j,]
    }
    
    `[<-.SArray` <- function(s_array, i, j, ..., value) {
      s_array$time_slices[[i]][j,] <- value
      s_array
    }
    
    dim.SArray <- function(x) {
      c(length(x$time_slices), dim(x$time_slices[[1L]]))
    }
    
    ABM <- R6::R6Class(
      "ABM",
      public = list(
        s_array = NULL,
        tf1 = NULL,
        tf2 = NULL,
        tf3 = NULL,
    
        initialize = function(sdim, sdimnames, tfs) {
          self$tf1 <- tfs[1L]
          self$tf2 <- tfs[2L]
          self$tf3 <- tfs[3L]
    
          self$s_array <- SArray$new(sdim, sdimnames)
        },
    
        init_abm = function(seed = NULL) {
          set.seed(seed)
          sdim <- dim(self$s_array)
          s_init <- matrix(sample.int(100L, size = 6L * sdim[3L], replace=TRUE),
                           nrow=6L, ncol=sdim[3L],
                           dimnames=list(c("a", "b", "d", "e", "g", "h"),
                                         as.character(1:sdim[3L])))
    
          self$a(1L, s_init["a", ])
          self$b(1L, s_init["b", ])
          self$c(1L)
          self$d(1L, s_init["d", ])
          self$e(1L, s_init["e", ])
          self$f(1L)
          self$g(1L, s_init["g", ])
          self$h(1L, s_init["h", ])
          self$i(1L)
    
          private$t <- 1L
    
          invisible()
        },
    
        can_advance = function() {
          private$t < dim(self$s_array)[1L]
        },
    
        advance = function(verbose = FALSE) {
          t <- private$t + 1L
          if (verbose) print(t)
    
          self$a(t)
          self$b(t)
          self$c(t)
          self$d(t)
          self$e(t)
          self$f(t)
          self$g(t)
          self$h(t)
          self$i(t)
    
          private$t <- t
    
          invisible()
        },
    
        # get time slice at t - tf - 1 for given letter
        s_tf = function(t, tf, letter) {
          t_tf_1 <- t - tf - 1L
    
          if (t_tf_1 > 0L)
            self$s_array[t_tf_1, letter, ]
          else
            0
        },
    
        # discrete equations
    
        a = function(t, t_0) {
          if (t < 2L) {
            t <- 1L
            t_1 <- t_0
            t_tf_1 <- 0
          }
          else {
            t_1 <- self$s_array[t - 1L, "a", ]
            t_tf_1 <- self$s_tf(t, self$tf1, "a")
          }
    
          self$s_array[t, "a", ] <- round(0.4 * t_1 + 0.5 * t_tf_1)
          invisible()
        },
    
        b = function(t, t_0) {
          if (t < 2L) {
            t <- 1L
            t_1 <- t_0
            t_tf_1 <- 0
          }
          else {
            t_1 <- self$s_array[t - 1L, "b", ]
            t_tf_1 <- self$s_tf(t, self$tf1, "b")
          }
    
          self$s_array[t, "b", ] <- round(0.5 * t_1 + 0.6 * t_tf_1)
          invisible()
        },
    
        c = function(t) {
          if (t < 1L) stop("t must be positive")
          a_t <- self$s_array[t, "a", ]
          b_t <- self$s_array[t, "b", ]
          self$s_array[t, "c", ] <- a_t + b_t
          invisible()
        },
    
        d = function(t, t_0) {
          if (t < 2L) {
            t <- 1L
            t_1 <- t_0
            t_tf_1 <- 0
          }
          else {
            t_1 <- self$s_array[t - 1L, "d", ]
            t_tf_1 <- self$s_tf(t, self$tf2, "d")
          }
    
          self$s_array[t, "d", ] <- round(0.7 * t_1 + 0.7 * t_tf_1)
          invisible()
        },
    
        e = function(t, t_0) {
          if (t < 2L) {
            t <- 1L
            t_1 <- t_0
            t_tf_1 <- 0
          }
          else {
            t_1 <- self$s_array[t - 1L, "e", ]
            t_tf_1 <- self$s_tf(t, self$tf2, "e")
          }
    
          self$s_array[t, "e", ] <- round(0.9 * t_1 + 0.4 * t_tf_1)
          invisible()
        },
    
        f = function(t) {
          if (t < 1L) stop("t must be positive")
          d_t <- self$s_array[t, "d", ]
          e_t <- self$s_array[t, "e", ]
          self$s_array[t, "f", ] <- d_t + e_t
          invisible()
        },
    
        g = function(t, t_0) {
          if (t < 2L) {
            t <- 1L
            t_1 <- t_0
            t_tf_1 <- 0
          }
          else {
            t_1 <- self$s_array[t - 1L, "g", ]
            t_tf_1 <- self$s_tf(t, self$tf3, "g")
          }
    
          self$s_array[t, "g", ] <- round(0.3 * t_1 + 0.2 * t_tf_1)
          invisible()
        },
    
        h = function(t, t_0) {
          if (t < 2L) {
            t <- 1L
            t_1 <- t_0
            t_tf_1 <- 0
          }
          else {
            t_1 <- self$s_array[t - 1L, "h", ]
            t_tf_1 <- self$s_tf(t, self$tf3, "h")
          }
    
          self$s_array[t, "h", ] <- round(0.5 * t_1 + 0.1 * t_tf_1)
          invisible()
        },
    
        i = function(t) {
          if (t < 1L) stop("t must be positive")
          g_t <- self$s_array[t, "g", ]
          h_t <- self$s_array[t, "h", ]
          self$s_array[t, "i", ] <- g_t + h_t
          invisible()
        }
      ),
      private = list(
        t = NULL
      )
    )
    
    max_t <- 10
    abm <- ABM$new(c(max_t, 9, 2500),
                   list(NULL, letters[1:9], as.character(1:2500)),
                   c(288L, 292L, 150L))
    
    abm$init_abm()
    
    while (abm$can_advance()) {
      abm$advance(TRUE)
    }
    
    anyNA(abm$s_array[])
    # FALSE
    

    Some of the functions under discrete equations encapsulate the logic for initialization when t < 2L. The SArray class separates the 3D array into a list of 2D arrays to work around the .Machine$integer.max limit.