Search code examples
rdplyrmutatevctrs

Optimize repeated fisher test over dataframe


I have a code segment that I am trying to optimize to run a bit faster.

df1 <- df %>%
rowwise() %>%
    mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
                                         nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value) %>% 
    filter(oddsRatio > 1 & fisher < pval) %>% 
    mutate(test_direction = binom.test(x = forward, n = counts, p = 0.5)$p.value) %>%
    filter(test_direction < 0.05)

I've been trying to replace the first mutate/filter pair with

df1 <- df %>%
{.$fisher = fisher.test(matrix(c(.$counts, .$nt1_not_t2,
                                     .$nt2_not_t1, .$not_t1_or_t2), nrow = 2, ncol = 2))$p.value; .} %>% 
    vctrs::vec_slice(., .$oddsRatio > 1 & .$fisher < pval)

However the matrix, instead of having 4 values, is trying to use the entire dataset.

Warning message:
In matrix(c(.$counts, .$nt1_not_t2, .$nt2_not_t1, .$not_t1_or_t2),  :
  data length differs from size of matrix: [12184 != 2 x 2]

Not sure if there is some syntax I can add to avoid that, or another more performant way to achieve this.

Any advice would be great

EDIT:: using profvis()

[![profiling][1]][1]

some sample data

df <- structure(list(counts = c(114L, 55L, 57L, 95L, 514L, 65L, 694L, 
28L, 148L, 122L, 240L, 38L, 260L, 65L, 40L, 12L, 32L, 134L, 42L, 
16L, 37L, 33L, 63L, 16L, 20L, 13L, 12L, 17L, 15L, 26L, 548L, 
31L, 467L, 202L, 1696L, 1422L, 219L, 362L, 417L, 1449L, 2142L, 
241L, 128L, 1812L, 3281L, 677L, 1006L, 137L, 67L, 161L), forward = c(61L, 
37L, 32L, 47L, 233L, 43L, 568L, 15L, 75L, 76L, 149L, 27L, 177L, 
34L, 33L, 7L, 22L, 81L, 24L, 8L, 19L, 22L, 55L, 8L, 12L, 8L, 
7L, 10L, 12L, 13L, 407L, 17L, 260L, 119L, 861L, 906L, 144L, 199L, 
195L, 645L, 1223L, 166L, 73L, 844L, 2727L, 341L, 529L, 86L, 36L, 
87L), oddsRatio = c(2.81719639416155, 2.56249627110554, 3.0012284711951, 
3.2379086619481, 3.28262910798122, 1.78506192857701, 1.23683314379245, 
1.47200829293576, 1.60268857356236, 2.54327837666455, 2.65317932754055, 
2.7443152244971, 2.41170031230883, 1.39183344640434, 1.67403290633562, 
2.45502917152859, 1.52227974689146, 1.67590893004601, 1.5285951005419, 
1.88542317834637, 1.64599293496765, 1.23362495081645, 0.884202671972712, 
1.15874072081511, 1.23428571428571, 0.918351141954909, 0.942141312184571, 
1.63698770491803, 1.92011125705014, 1.67230461700034, 6.63377209451277, 
1.98838889786539, 3.21071805754822, 2.3432554142601, 3.03013725938027, 
11.6868669158062, 3.25457032130808, 3.5341881506799, 2.37982532860213, 
11.383770310192, 2.98290705092543, 1.43986166989779, 1.32458721885379, 
1.70316762338472, 1.20465352503506, 1.32048669611991, 1.38391148677588, 
1.70567801561587, 1.11338513259357, 1.18942080378251), not_t1_or_t2 = c(16736L, 
17180L, 17230L, 16989L, 12236L, 16869L, 4192L, 17243L, 15631L, 
16569L, 15416L, 17344L, 14981L, 16665L, 17139L, 17533L, 17201L, 
15903L, 17066L, 16400L, 12543L, 12161L, 4345L, 15346L, 14742L, 
15391L, 15681L, 15977L, 16568L, 14576L, 14024L, 14291L, 13801L, 
14039L, 11687L, 13734L, 14104L, 13968L, 13703L, 13701L, 10576L, 
13757L, 14009L, 9868L, 3491L, 12503L, 11656L, 14064L, 14137L, 
13875L), nt1_not_t2 = c(755L, 814L, 812L, 774L, 355L, 804L, 175L, 
841L, 721L, 747L, 629L, 831L, 609L, 804L, 829L, 857L, 837L, 735L, 
827L, 69L, 48L, 52L, 22L, 69L, 65L, 72L, 73L, 68L, 70L, 59L, 
3609L, 4126L, 3690L, 3955L, 2461L, 2735L, 3938L, 3795L, 3740L, 
2708L, 2015L, 3916L, 4029L, 2345L, 876L, 3480L, 3151L, 4020L, 
4090L, 3996L), nt2_not_t1 = c(897L, 453L, 403L, 644L, 5397L, 
764L, 13441L, 390L, 2002L, 1064L, 2217L, 289L, 2652L, 968L, 494L, 
100L, 432L, 1730L, 567L, 2017L, 5874L, 6256L, 14072L, 3071L, 
3675L, 3026L, 2736L, 2440L, 1849L, 3841L, 321L, 54L, 544L, 306L, 
2658L, 611L, 241L, 377L, 642L, 644L, 3769L, 588L, 336L, 4477L, 
10854L, 1842L, 2689L, 281L, 208L, 470L)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame")) ```


  [1]: https://i.sstatic.net/a1ebG.png

Solution

  • Based on Axeman's comment here a small benchmark using different approaches:

    library(dplyr)
    library(purrr)
    
    my_fisher <- function(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2) {
      fisher.test(
        matrix(
          c(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), 
          nrow = 2, 
          ncol = 2
          )
      )$p.value
    }
    

    We apply this function using purrrs pmap-function and using furrrs future_pmap, a parallel version of pmap:

    # purrr
    df %>% 
      mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
    
    # furrr, choose workers based on actual hardware
    library(furrr)
    plan("future::multisession", workers = 4)
    
    df %>% 
      mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
    

    Benchmarking:

    library(microbenchmark)
    
    microbenchmark(
      orig = df %>%
        rowwise() %>%
        mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
                                             nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value),
      purrr = df %>% 
        mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher)),
      furrr = df %>% 
        mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
    )
    

    returns

    Unit: milliseconds
      expr      min       lq      mean    median       uq      max neval
      orig 108.7756 110.3752 116.08041 113.84570 118.3328 224.2154   100
     purrr 105.9916 108.0498 114.80299 113.86020 116.7421 224.0461   100
     furrr  86.1203  89.5472  93.77818  92.63145  96.1540 122.7134   100
    

    So using furrr seems to give a small advantage regarding time / speed.