I would like to unnest()
a data frame where NULL
values are stored as list()
. How can I do it?
library(dplyr)
library(tidyr)
df <-
tibble::tribble(
~date, ~id,
'2023-07-21', 1L,
list(), 140L
)
df
#> # A tibble: 2 x 2
#> date id
#> <list> <int>
#> 1 <chr [1]> 1
#> 2 <list [0]> 140
unnest(df, date)
#> Error in `list_unchop()`:
#> ! Can't combine `x[[1]]` <character> and `x[[2]]` <list>.
#> Backtrace:
#> x
#> 1. +-tidyr::unnest(df, date)
#> 2. +-tidyr:::unnest.data.frame(df, date)
#> 3. | \-tidyr::unchop(...)
#> 4. | \-tidyr:::df_unchop(...)
#> 5. | \-vctrs::list_unchop(col, ptype = col_ptype)
#> 6. \-vctrs (local) `<fn>`()
#> 7. \-vctrs::vec_default_ptype2(...)
#> 8. +-base::withRestarts(...)
#> 9. | \-base (local) withOneRestart(expr, restarts[[1L]])
#> 10. | \-base (local) doWithOneRestart(return(expr), restart)
#> 11. \-vctrs::stop_incompatible_type(...)
#> 12. \-vctrs:::stop_incompatible(...)
#> 13. \-vctrs:::stop_vctrs(...)
#> 14. \-rlang::abort(message, class = c(class, "vctrs_error"), ..., call = call)
Created on 2023-10-18 with reprex v2.0.2
We can replace an empty element with NA
:
library(dplyr)
library(tidyr)
df %>%
mutate(date = lapply(date, function(z) if (!length(z)) NA else z)) %>%
unnest(date)
# # A tibble: 2 × 2
# date id
# <chr> <int>
# 1 2023-07-21 1
# 2 NA 140
Technically this introduces the logical variant of NA
(vice what should be a NA_character_
), but fortunately unnest(.)
is good enough to cast it correctly when unlisting.
Borrowing from @ThomasIsCoding's excellent answer, we can also use replace(.)
here:
df %>%
mutate(date = replace(date, lengths(date) == 0, NA)) %>%
unnest(date)
for the same effect. The benefit is purely speed:
dfbig <- replicate(10000, df, simplify = FALSE) %>%
bind_rows()
bench::mark(
lapply = mutate(dfbig, date = lapply(date, function(z) if (!length(z)) NA else z)) %>% unnest(date),
replace_NA = mutate(dfbig, date = replace(date, lengths(date) == 0, NA)) %>% unnest(date),
replace_NULL = mutate(dfbig, date = replace(date, lengths(date) == 0, list(NULL))) %>% unnest(date, keep_empty = TRUE),
within = within(dfbig, date <- unlist(replace(date, !lengths(date), NA_character_), use.names = FALSE))
)
# # A tibble: 4 × 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 lapply 36.8ms 37.22ms 26.8 NA 8.03 10 3 374ms <tibble> <NULL> <bench_tm> <tibble>
# 2 replace_NA 17.96ms 18.42ms 54.3 NA 7.09 23 3 423ms <tibble> <NULL> <bench_tm> <tibble>
# 3 replace_N… 50.75ms 51.47ms 19.4 NA 4.84 8 2 413ms <tibble> <NULL> <bench_tm> <tibble>
# 4 within 1.09ms 1.14ms 867. NA 11.0 393 5 453ms <tibble> <NULL> <bench_tm> <tibble>