I have a dataset with many columns and thousands of rows. I am trying to get another column, foo
which has the frequency that a value above 100 occurs in a row.
structure(list(S026401.R1 = c(0L, 0L, 0L, 0L, 0L), S026404.R1 = c(0L,
0L, 0L, 0L, 0L), S026406.R1 = c(0L, 0L, 0L, 0L, 0L), S026409.R1 = c(0L,
0L, 0L, 0L, 0L), S026412.R1 = c(0L, 0L, 0L, 0L, 0L), S026413.R1 = c(0L,
0L, 0L, 0L, 0L), S026414.R1 = c(47L, 0L, 0L, 0L, 0L), S026415.R1 = c(0L,
0L, 0L, 0L, 0L), S026416.R1 = c(31L, 0L, 0L, 0L, 0L), S026419.R1 = c(0L,
0L, 0L, 0L, 0L), S026421.R1 = c(0L, 0L, 0L, 0L, 34L), S026422.R1 = c(0L,
0L, 0L, 0L, 0L), S026423.R1 = c(0L, 0L, 0L, 0L, 0L), S026427.R1 = c(0L,
0L, 0L, 0L, 0L), S026428.R1 = c(0L, 0L, 0L, 0L, 1049L), S026429.R1 = c(0L,
0L, 0L, 0L, 0L), S026430.R1 = c(0L, 0L, 0L, 0L, 0L), S026431.R1 = c(0L,
10L, 0L, 0L, 0L), S026432.R1 = c(0L, 0L, 0L, 0L, 0L), S026433.R1 = c(0L,
0L, 0L, 0L, 0L), S026434.R1 = c(0L, 0L, 0L, 0L, 0L), S026435.R1 = c(0L,
0L, 0L, 0L, 0L), S026438.R1 = c(0L, 0L, 0L, 0L, 0L), S026440.R1 = c(0L,
0L, 0L, 0L, 0L), S026444.R1 = c(0L, 0L, 0L, 0L, 0L), S026447.R1 = c(0L,
0L, 0L, 0L, 0L), S026450.R1 = c(0L, 0L, 0L, 0L, 0L), S026451.R1 = c(0L,
0L, 0L, 0L, 0L), S026453.R1 = c(0L, 0L, 53L, 0L, 0L), S026456.R1 = c(0L,
0L, 0L, 0L, 0L), S026457.R1 = c(0L, 0L, 0L, 0L, 0L), S026458.R1 = c(0L,
0L, 0L, 0L, 0L), S026461.R1 = c(0L, 0L, 0L, 0L, 0L), S026462.R1 = c(0L,
0L, 0L, 0L, 18L), S026463.R1 = c(153L, 0L, 0L, 0L, 0L), S026464.R1 = c(0L,
0L, 0L, 0L, 0L), S026466.R1 = c(0L, 0L, 0L, 0L, 0L), S026467.R1 = c(32L,
0L, 0L, 0L, 0L), S026469.R1 = c(0L, 0L, 0L, 0L, 0L), S026470.R1 = c(0L,
0L, 0L, 0L, 0L), S026471.R1 = c(0L, 0L, 0L, 0L, 0L), S026473.R1 = c(0L,
0L, 0L, 0L, 0L), S026474.R1 = c(0L, 0L, 0L, 0L, 0L), S026476.R1 = c(0L,
0L, 0L, 0L, 0L), S026477.R1 = c(780L, 0L, 0L, 0L, 0L), S026483.R1 = c(21L,
0L, 0L, 0L, 0L), S026484.R1 = c(0L, 0L, 0L, 0L, 0L), S026485.R1 = c(0L,
0L, 0L, 13L, 0L), S026488.R1 = c(0L, 0L, 0L, 0L, 0L), S026489.R1 = c(0L,
0L, 0L, 0L, 0L), S026490.R1 = c(60L, 0L, 0L, 0L, 0L), S026493.R1 = c(0L,
0L, 103L, 0L, 0L)), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -5L), groups = structure(list(
.rows = structure(list(1L, 2L, 3L, 4L, 5L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame")))
what i've been trying is:
df %>%
rowwise() %>%
mutate(foo = sum(c(8:52>100), na.rm = TRUE))
but this returns all 0s in the new column foo
when i change >100
to a lower number, it does provide a frequency. however, there are many values above 100.
As suggested, use c_across
with a range (or collection) of columns.
library(dplyr)
df %>%
rowwise() %>%
mutate(foo = sum(c_across(X1:X8) > 15)) %>%
ungroup()
# # A tibble: 3 x 9
# X1 X2 X3 X4 X5 X6 X7 X8 foo
# <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 17 10 24 7 23 2 22 12 4
# 2 5 4 15 20 14 19 6 11 2
# 3 1 18 8 9 21 3 16 13 3
Sample data:
set.seed(42)
df <- data.frame(matrix(sample(24), nrow=3))
df
# X1 X2 X3 X4 X5 X6 X7 X8
# 1 17 10 24 7 23 2 22 12
# 2 5 4 15 20 14 19 6 11
# 3 1 18 8 9 21 3 16 13