I have a list of dfs:
my_list <- list(structure(list(col1 = c("v1", "v2", "v3", "V2", "V1"), col2 = c("wood", NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA, -5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood", NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA, -2L)), structure(list(col1 = c("v1", "v2", "v3", "V3"), col3 = c("cup", NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA, -4L)))
my_list
[[1]]
col1 col2 col3 col4
1 v1 wood cup <NA>
2 v2 <NA> <NA> pear
3 v3 water fork banana
4 v2 <NA> <NA> <NA>
5 v1 water <NA> apple
[[2]]
col1 col2 col4
1 v1 wood <NA>
2 v2 <NA> pear
[[3]]
col1 col3 col4
1 v1 cup <NA>
2 v2 <NA> pear
3 v3 <NA> banana
4 v3 <NA> <NA>
I want to replace NA with "VAL" in col3 only, and only if col1 is v2 or v3.
I found solutions to replace NA in certain columns, but not in certain columns and other conditions (or only for a single df, not for a list of dfs.)
Note that col2 or col3 do not necessarily exist in all dfs.
I need a solution with lapply(list, function)
, ideally.
Desired output:
[[1]]
col1 col2 col3 col4
1 v1 wood cup <NA>
2 v2 <NA> VAL pear
3 v3 water fork banana
4 v2 <NA> VAL <NA>
5 v1 water <NA> apple
[[2]]
col1 col2 col4
1 v1 wood <NA>
2 v2 <NA> pear
[[3]]
col1 col3 col4
1 v1 cup <NA>
2 v2 VAL pear
3 v3 VAL banana
4 v3 VAL <NA>
In such cases for
loops can be much faster.
> for (s in seq_along(my_list)) {
+ x <- my_list[[s]]
+ if ('col3' %in% names(x)) {
+ x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
+ my_list[[s]] <- x
+ }
+ }
> my_list
[[1]]
col1 col2 col3 col4
1 v1 wood cup <NA>
2 v2 <NA> VAL pear
3 v3 water fork banana
4 v2 <NA> VAL <NA>
5 v1 water <NA> apple
[[2]]
col1 col2 col4
1 v1 wood <NA>
2 v2 <NA> pear
[[3]]
col1 col3 col4
1 v1 cup <NA>
2 v2 VAL pear
3 v3 VAL banana
4 v3 VAL <NA>
Runs 80% faster, which is quite significant. Demonstrated on a list with just 1,000 elements.
$ Rscript --vanilla foo.R
Unit: milliseconds
expr min lq mean median uq max neval cld
floop 18.84617 19.95898 22.17803 22.08178 24.21662 27.02679 100 a
lapply 100.05645 106.24458 111.66269 111.17931 116.06089 150.08886 100 b
Benchmark code
set.seed(42)
big_list <- my_list[sample(1:3, 1e3, replace=TRUE)]
microbenchmark::microbenchmark(
floop={
for (s in seq_along(big_list)) {
x <- big_list[[s]]
if ('col3' %in% names(x)) {
x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
big_list[[s]] <- x
}
}
big_list
},
lapply=lapply(
big_list,
\(x) if ('col3' %in% names(x)) {
transform(x, col3=replace(col3,
is.na(col3) & col1 %in% c('v2', 'v3'),
'VAL'))
} else {
x
}),
check='identical')
Add a "col3"
that is filled with "VAL"
if none exists yet:
> for (s in seq_along(my_list)) {
+ x <- my_list[[s]]
+ if ('col3' %in% names(x)) {
+ x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
+ } else {
+ x$col3 <- 'VAL'
+ }
+ my_list[[s]] <- x
+ }
> my_list
[[1]]
col1 col2 col3 col4
1 v1 wood cup <NA>
2 v2 <NA> VAL pear
3 v3 water fork banana
4 v2 <NA> VAL <NA>
5 v1 water <NA> apple
[[2]]
col1 col2 col4 col3
1 v1 wood <NA> VAL
2 v2 <NA> pear VAL
[[3]]
col1 col3 col4
1 v1 cup <NA>
2 v2 VAL pear
3 v3 VAL banana
4 v3 VAL <NA>
Data:
> dput(my_list)
list(structure(list(col1 = c("v1", "v2", "v3", "v2", "v1"), col2 = c("wood",
NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA
), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA,
-5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood",
NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA,
-2L)), structure(list(col1 = c("v1", "v2", "v3", "v3"), col3 = c("cup",
NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA,
-4L)))