Search code examples
rlistmutate

Replace NA in list of dfs in certain columns and under certain conditions


I have a list of dfs:

my_list <- list(structure(list(col1 = c("v1", "v2", "v3", "V2", "V1"), col2 = c("wood", NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA, -5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood", NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA, -2L)), structure(list(col1 = c("v1", "v2", "v3", "V3"), col3 = c("cup", NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA, -4L)))
  
my_list

[[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA> <NA>   pear
3   v3 water fork banana
4   v2  <NA> <NA>   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2 <NA>   pear
3   v3 <NA> banana
4   v3 <NA>   <NA>

I want to replace NA with "VAL" in col3 only, and only if col1 is v2 or v3.

I found solutions to replace NA in certain columns, but not in certain columns and other conditions (or only for a single df, not for a list of dfs.)

Note that col2 or col3 do not necessarily exist in all dfs.

I need a solution with lapply(list, function), ideally.

Desired output:

  [[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA>  VAL   pear 
3   v3 water fork banana
4   v2  <NA>  VAL   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2  VAL   pear
3   v3  VAL banana
4   v3  VAL   <NA>

Solution

  • In such cases for loops can be much faster.

    > for (s in seq_along(my_list)) {
    +   x <- my_list[[s]]
    +   if ('col3' %in% names(x)) {
    +     x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
    +     my_list[[s]] <- x
    +   }
    + }
    > my_list
    [[1]]
      col1  col2 col3   col4
    1   v1  wood  cup   <NA>
    2   v2  <NA>  VAL   pear
    3   v3 water fork banana
    4   v2  <NA>  VAL   <NA>
    5   v1 water <NA>  apple
    
    [[2]]
      col1 col2 col4
    1   v1 wood <NA>
    2   v2 <NA> pear
    
    [[3]]
      col1 col3   col4
    1   v1  cup   <NA>
    2   v2  VAL   pear
    3   v3  VAL banana
    4   v3  VAL   <NA>
    

    Benchmark

    Runs 80% faster, which is quite significant. Demonstrated on a list with just 1,000 elements.

    $ Rscript --vanilla foo.R
    Unit: milliseconds
       expr       min        lq      mean    median        uq       max neval cld
      floop  18.84617  19.95898  22.17803  22.08178  24.21662  27.02679   100  a 
     lapply 100.05645 106.24458 111.66269 111.17931 116.06089 150.08886   100   b
    

    Benchmark code

    set.seed(42)
    big_list <- my_list[sample(1:3, 1e3, replace=TRUE)]
    
    microbenchmark::microbenchmark(
      floop={
        for (s in seq_along(big_list)) {
          x <- big_list[[s]]
          if ('col3' %in% names(x)) {
            x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
            big_list[[s]] <- x
          }
        }
        big_list
      },
      lapply=lapply(
        big_list,
        \(x) if ('col3' %in% names(x)) {
          transform(x, col3=replace(col3, 
                                    is.na(col3) & col1 %in% c('v2', 'v3'), 
                                    'VAL'))
        } else {
          x
        }),
      check='identical')
    

    Edit

    Add a "col3" that is filled with "VAL" if none exists yet:

    > for (s in seq_along(my_list)) {
    +   x <- my_list[[s]]
    +   if ('col3' %in% names(x)) {
    +     x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
    +   } else {
    +     x$col3 <- 'VAL'
    +   }
    +   my_list[[s]] <- x
    + }
    > my_list
    [[1]]
      col1  col2 col3   col4
    1   v1  wood  cup   <NA>
    2   v2  <NA>  VAL   pear
    3   v3 water fork banana
    4   v2  <NA>  VAL   <NA>
    5   v1 water <NA>  apple
    
    [[2]]
      col1 col2 col4 col3
    1   v1 wood <NA>  VAL
    2   v2 <NA> pear  VAL
    
    [[3]]
      col1 col3   col4
    1   v1  cup   <NA>
    2   v2  VAL   pear
    3   v3  VAL banana
    4   v3  VAL   <NA>
    

    Data:

    > dput(my_list)
    list(structure(list(col1 = c("v1", "v2", "v3", "v2", "v1"), col2 = c("wood", 
    NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA
    ), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA, 
    -5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood", 
    NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA, 
    -2L)), structure(list(col1 = c("v1", "v2", "v3", "v3"), col3 = c("cup", 
    NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA, 
    -4L)))