Search code examples
rsubsetnested-lists

Subset sublists of nested list by vector condition


I have a nested list list1:

list1 <- list(Alpha = structure(list(sample_0 = c(3, NA, 7, 9, 2),
                                     sample_1 = c(NA, 3, 5, 4, NA),
                                     sample_2 = c(7, 3, 5, NA, NA)),
                                row.names = c(NA, -5L),
                                class = c("tbl_df", "tbl", "data.frame")),
              Beta = structure (list(sample_0 = c(2, 9, NA, 3, 7),
                                     sample_1 = c(3, 7, 9, 3, NA),
                                     sample_2 = c(4, 2, 6, 4, 6)),
                                row.names = c(NA, -5L),
                                class = c("tbl_df", "tbl", "data.frame")),
              Gamma = structure(list(sample_0 = c(NA, NA, 4, 6, 3),
                                     sample_1 = c(3, 5, 3, NA, 2)),
                                row.names = c(NA, -5L),
                                class = c("tbl_df", "tbl", "data.frame")),
              Delta = structure(list(sample_0 = c(3, NA, 7, 9, 2),
                                     sample_1 = c(3, 8, 5, 4, 9)),
                                     row.names = c(NA, -5L),
                                     class = c("tbl_df", "tbl", "data.frame")))

I needed to change one Column to a factor level, as follows (is there a more elegant way to do that than using a for loop?):

for (i in names(list1)) {
  list1[[i]]$sample_1 <- list1[[i]]$sample_1 %>% as.numeric() %>% 
    cut(breaks=c(0, 2, 4, 5, 8, 10),
        labels=c("-", "+", "++", "+++", "++++"), right = TRUE, na.rm = TRUE)
 }

I now would like to keep only the sublists that contain at least level "+++" (or "++++") in sample_1...

sapply(sapply(list1, `[`, "sample_1"), unique)
$Alpha.sample_1
[1] <NA> +    ++  
Levels: - + ++ +++ ++++

$Beta.sample_1
[1] +    +++  ++++ <NA>
Levels: - + ++ +++ ++++

$Gamma.sample_1
[1] +    ++   <NA> -   
Levels: - + ++ +++ ++++

$Delta.sample_1
[1] +    +++  ++   ++++
Levels: - + ++ +++ ++++

... and all the other elements of the same sublist in a new list2. The desired result would look as follows:

list1
$Beta
# A tibble: 5 x 3
  sample_0 sample_1 sample_2
     <dbl> <fct>       <dbl>
1        2 +               4
2        9 +++             2
3       NA ++++            6
4        3 +               4
5        7 NA              6
    
$Delta
# A tibble: 5 x 2
  sample_0 sample_1
     <dbl> <fct>   
1        3 +       
2       NA +++     
3        7 ++      
4        9 +       
5        2 ++++  

I believe I would have to convert the vector to as.integer, but I can't come up with the solution.


Solution

  • You could define what you want to do in a function myFun, use it in lapply and subset the result for nrow > 0.

    myFun <- \(x) {
      x$sample_1  <- with(x, cut(sample_1, breaks=c(0, 2, 4, 5, 8, 10),
                                 labels=c("-", "+", "++", "+++", "++++"), 
                                 right=TRUE, na.rm=TRUE))
      subset(x, sample_1 %in% c("+++", "++++"))
    }
        
    res <- lapply(list1, myFun)
    res[sapply(res, nrow) > 0]
    # $Beta
    #   sample_0 sample_1 sample_2
    # 2        9      +++        2
    # 3       NA     ++++        6
    # 
    # $Delta
    #   sample_0 sample_1
    # 2       NA      +++
    # 5        2     ++++
    #   
    

    Edit

    To split the list elements into sublists at the threshold, you may use split in the function. We may also split at the underlying integer, i.e. as.numeric(x$sample_1).

    myFun2 <- \(x) {
      x$sample_1 <- with(x, cut(sample_1, breaks=c(0, 2, 4, 5, 8, 10), 
                                labels=c("-", "+", "++", "+++", "++++"), 
                                right=TRUE, na.rm=TRUE))
      split(x, as.numeric(x$sample_1) > 4)
    }
    
    lapply(list1, myFun2)
    # $Alpha
    # $Alpha$`FALSE`
    # sample_0 sample_1 sample_2
    # 2       NA        +        3
    # 3        7       ++        5
    # 4        9        +       NA
    # 
    # 
    # $Beta
    # $Beta$`FALSE`
    # sample_0 sample_1 sample_2
    # 1        2        +        4
    # 2        9      +++        2
    # 4        3        +        4
    # 
    # $Beta$`TRUE`
    # sample_0 sample_1 sample_2
    # 3       NA     ++++        6
    # 
    # 
    # $Gamma
    # $Gamma$`FALSE`
    # sample_0 sample_1
    # 1       NA        +
    #   2       NA       ++
    #   3        4        +
    #   5        3        -
    #   
    #   
    #   $Delta
    # $Delta$`FALSE`
    # sample_0 sample_1
    # 1        3        +
    #   2       NA      +++
    #   3        7       ++
    #   4        9        +
    #   
    #   $Delta$`TRUE`
    # sample_0 sample_1
    # 5        2     ++++