Search code examples
rdplyrsumfrequency

Frequency of Occurrence of values above a certain value across many columns?


I have a dataset with many columns and thousands of rows. I am trying to get another column, foo which has the frequency that a value above 100 occurs in a row.

structure(list(S026401.R1 = c(0L, 0L, 0L, 0L, 0L), S026404.R1 = c(0L, 
0L, 0L, 0L, 0L), S026406.R1 = c(0L, 0L, 0L, 0L, 0L), S026409.R1 = c(0L, 
0L, 0L, 0L, 0L), S026412.R1 = c(0L, 0L, 0L, 0L, 0L), S026413.R1 = c(0L, 
0L, 0L, 0L, 0L), S026414.R1 = c(47L, 0L, 0L, 0L, 0L), S026415.R1 = c(0L, 
0L, 0L, 0L, 0L), S026416.R1 = c(31L, 0L, 0L, 0L, 0L), S026419.R1 = c(0L, 
0L, 0L, 0L, 0L), S026421.R1 = c(0L, 0L, 0L, 0L, 34L), S026422.R1 = c(0L, 
0L, 0L, 0L, 0L), S026423.R1 = c(0L, 0L, 0L, 0L, 0L), S026427.R1 = c(0L, 
0L, 0L, 0L, 0L), S026428.R1 = c(0L, 0L, 0L, 0L, 1049L), S026429.R1 = c(0L, 
0L, 0L, 0L, 0L), S026430.R1 = c(0L, 0L, 0L, 0L, 0L), S026431.R1 = c(0L, 
10L, 0L, 0L, 0L), S026432.R1 = c(0L, 0L, 0L, 0L, 0L), S026433.R1 = c(0L, 
0L, 0L, 0L, 0L), S026434.R1 = c(0L, 0L, 0L, 0L, 0L), S026435.R1 = c(0L, 
0L, 0L, 0L, 0L), S026438.R1 = c(0L, 0L, 0L, 0L, 0L), S026440.R1 = c(0L, 
0L, 0L, 0L, 0L), S026444.R1 = c(0L, 0L, 0L, 0L, 0L), S026447.R1 = c(0L, 
0L, 0L, 0L, 0L), S026450.R1 = c(0L, 0L, 0L, 0L, 0L), S026451.R1 = c(0L, 
0L, 0L, 0L, 0L), S026453.R1 = c(0L, 0L, 53L, 0L, 0L), S026456.R1 = c(0L, 
0L, 0L, 0L, 0L), S026457.R1 = c(0L, 0L, 0L, 0L, 0L), S026458.R1 = c(0L, 
0L, 0L, 0L, 0L), S026461.R1 = c(0L, 0L, 0L, 0L, 0L), S026462.R1 = c(0L, 
0L, 0L, 0L, 18L), S026463.R1 = c(153L, 0L, 0L, 0L, 0L), S026464.R1 = c(0L, 
0L, 0L, 0L, 0L), S026466.R1 = c(0L, 0L, 0L, 0L, 0L), S026467.R1 = c(32L, 
0L, 0L, 0L, 0L), S026469.R1 = c(0L, 0L, 0L, 0L, 0L), S026470.R1 = c(0L, 
0L, 0L, 0L, 0L), S026471.R1 = c(0L, 0L, 0L, 0L, 0L), S026473.R1 = c(0L, 
0L, 0L, 0L, 0L), S026474.R1 = c(0L, 0L, 0L, 0L, 0L), S026476.R1 = c(0L, 
0L, 0L, 0L, 0L), S026477.R1 = c(780L, 0L, 0L, 0L, 0L), S026483.R1 = c(21L, 
0L, 0L, 0L, 0L), S026484.R1 = c(0L, 0L, 0L, 0L, 0L), S026485.R1 = c(0L, 
0L, 0L, 13L, 0L), S026488.R1 = c(0L, 0L, 0L, 0L, 0L), S026489.R1 = c(0L, 
0L, 0L, 0L, 0L), S026490.R1 = c(60L, 0L, 0L, 0L, 0L), S026493.R1 = c(0L, 
0L, 103L, 0L, 0L)), class = c("rowwise_df", "tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -5L), groups = structure(list(
    .rows = structure(list(1L, 2L, 3L, 4L, 5L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df", 
"tbl", "data.frame")))

what i've been trying is:

 df %>%
 rowwise() %>%
  mutate(foo = sum(c(8:52>100), na.rm = TRUE))

but this returns all 0s in the new column foo when i change >100 to a lower number, it does provide a frequency. however, there are many values above 100.


Solution

  • As suggested, use c_across with a range (or collection) of columns.

    library(dplyr)
    df %>%
      rowwise() %>%
      mutate(foo = sum(c_across(X1:X8) > 15)) %>%
      ungroup()
    # # A tibble: 3 x 9
    #      X1    X2    X3    X4    X5    X6    X7    X8   foo
    #   <int> <int> <int> <int> <int> <int> <int> <int> <int>
    # 1    17    10    24     7    23     2    22    12     4
    # 2     5     4    15    20    14    19     6    11     2
    # 3     1    18     8     9    21     3    16    13     3
    

    Sample data:

    set.seed(42)
    df <- data.frame(matrix(sample(24), nrow=3))
    df
    #   X1 X2 X3 X4 X5 X6 X7 X8
    # 1 17 10 24  7 23  2 22 12
    # 2  5  4 15 20 14 19  6 11
    # 3  1 18  8  9 21  3 16 13