I have a dataframe similar to this dummy:
dframe <- structure(list(id = c("294361-7349174-75411122", "294365-7645230-95464222",
"291915-7345264-75464222", "291365-7345074-75164202", "594165-7345274-78444212",
"234385-7335274-75464229", "734515-1345274-95464892", "201365-8345274-78464232",
"294365-7315971-75464120", "591365-7345374-75464222", "394365-7345204-75411022",
"494305-7345273-75464222", "291161-7345271-75461210", "294035-7345201-75464292",
"298365-7345279-78864223", "294365-7345274-15964293", "294395-7345274-69464299",
"899965-1345294-95464222", "194365-7145274-75464222", "194361-7349231-75464222",
"294365-7345274-75464122", "191315-1345274-13464322", "794365-7349274-75464292",
"214365-8318274-75464222", "394363-8341274-39494929"), gene = structure(c(3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ABC_1", "C_1", "XYZ_123"
), class = "factor"), group = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("KO", "WT"), class = "factor"), class_A = c(0,
1, 0, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 1, 1,
1, 0, 3), class_B = c(0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1)), row.names = c(NA, -25L), class = "data.frame")
Based on this, I would like to produce the summary table (for data grouped per "group" and "gene"). For the purpose of the summary I refer to the non-zero occurrences as "counts" and to the sum of non-zero values as "hits". I do not feel very fluent in English, so without further ado - below is the desired output:
desired_dframe <- structure(list(group = structure(c(1L, 1L, 2L, 2L, 2L), .Label = c("KO",
"WT"), class = "factor"), gene = structure(c(1L, 3L, 1L, 2L,
3L), .Label = c("ABC_1", "C_1", "XYZ_123"), class = "factor"),
counts_total = c(8L, 7L, 5L, 1L, 4L), counts_AB = c(6L, 3L,
3L, 1L, 3L), hits_AB = c(9, 6, 3, 1, 5), counts_A = c(4L,
3L, 2L, 1L, 2L), hits_A = c(6, 5, 2, 1, 3), counts_B = c(3L,
1L, 1L, 0L, 2L), hits_B = c(3, 1, 1, 0, 2)), row.names = c(NA,
-5L), class = "data.frame")
I can produce this with the following function:
summarize_mygene <- function(dframe){
# count all ids per group and gene (counts)
dframe1 <- dframe %>%
dplyr::group_by(group, gene) %>%
dplyr::summarise(counts_total = dplyr::n())
# count all ocurrences of non-zero instances in both class_ columns (counts)
dframe2 <- dframe %>%
rowwise() %>%
dplyr::mutate(counts_AB = sum(dplyr::c_across(class_A:class_B))) %>%
dplyr::group_by(group, gene) %>%
dplyr::summarise(dplyr::across(counts_AB, ~ sum(.x != 0)))
# sum up all non-zero instances in both class columns (hits)
dframe3 <- dframe %>%
rowwise() %>%
dplyr::mutate(hits_AB = sum(dplyr::c_across(class_A:class_B))) %>%
dplyr::group_by(group, gene) %>%
dplyr::summarise(dplyr::across(hits_AB, ~ sum(.x)))
# count non-zero ocurrences per group and gene in class_A column (counts)
dframe4 <- dframe %>%
dplyr::group_by(group, gene) %>%
dplyr::summarise(dplyr::across(class_A, ~ sum(.x != 0))) %>%
dplyr::rename(counts_A = class_A)
# sum up all non-zero instances in class_A column (hits)
dframe5 <- dframe %>%
rowwise() %>%
dplyr::mutate(hits_A = class_A) %>%
dplyr::group_by(group, gene) %>%
dplyr::summarise(dplyr::across(hits_A, ~ sum(.x)))
# count non-zero ocurrences per group and gene in class_A column (counts)
dframe6 <- dframe %>%
dplyr::group_by(group, gene) %>%
dplyr::summarise(dplyr::across(class_B, ~ sum(.x != 0))) %>%
dplyr::rename(counts_B=class_B)
# sum up all non-zero instances in class_B column (hits)
dframe7 <- dframe %>%
rowwise() %>%
dplyr::mutate(hits_B = class_B) %>%
dplyr::group_by(group, gene) %>%
dplyr::summarise(dplyr::across(hits_B, ~ sum(.x)))
# merge the outputs
dframe_list <- list(dframe1, dframe2, dframe3, dframe4, dframe5, dframe6, dframe7)
merged_dframe <- Reduce(function(x, y) merge(x, y, all=TRUE), dframe_list)
return(merged_dframe)
}
However, as you can see, this is not the most elegant example of using dplyr. I wrote this function in a step-wise manner, because this was the only way I could avoid the errors. Initially, I tried to handle all operations at once, within single dplyr::mutate() call or within single dplyr::summarize() call. However, It threw multiple errors or resulted in a column(s) containing a tibble in each row. After a few attempts I gave up on writing concise code and landed with this summarize_mygene() abomination. However, I would appreciate assistance with improving this nightmare fuel.
We could do this in a single group by operation
library(dplyr)
library(stringr)
dframe %>%
mutate(Sum_AB = rowSums(across(starts_with('class_')))) %>%
group_by(group, gene) %>%
summarise(counts_total = n(),
across(c(Sum_AB, class_A, class_B),
list(counts = ~ sum(.x != 0), hits = ~ sum(.x))), .groups= 'drop')%>%
rename_with(~str_replace(.x, '^\\w+_(\\w+)_(\\w+)', '\\2_\\1'), 3:last_col() )
-output
# A tibble: 5 × 9
group gene counts_total counts_AB hits_AB counts_A hits_A counts_B hits_B
<fct> <fct> <int> <int> <dbl> <int> <dbl> <int> <dbl>
1 KO ABC_1 8 6 9 4 6 3 3
2 KO XYZ_123 7 3 6 3 5 1 1
3 WT ABC_1 5 3 3 2 2 1 1
4 WT C_1 1 1 1 1 1 0 0
5 WT XYZ_123 4 3 5 2 3 2 2
-OP's desired
> desired_dframe
group gene counts_total counts_AB hits_AB counts_A hits_A counts_B hits_B
1 KO ABC_1 8 6 9 4 6 3 3
2 KO XYZ_123 7 3 6 3 5 1 1
3 WT ABC_1 5 3 3 2 2 1 1
4 WT C_1 1 1 1 1 1 0 0
5 WT XYZ_123 4 3 5 2 3 2 2