Combine apply function with lapply: calculate mean of groups in df

From two dataframes with single expression values (rows) per sample (cols) of different groups, I want to calculate the mean and median per group. My solution seems a bit verbose and I wonder if there is a more elegant solution.

Data

# expression values
genes <- paste("gene",1:1000,sep="")
x <- list(
  A = sample(genes,300), 
  B = sample(genes,525), 
  C = sample(genes,440),
  D = sample(genes,350)
)

# expression dataframe
crete_exp_df <- function(gene_nr, sample_nr){
  df <- replicate(sample_nr, rnorm(gene_nr))
  rownames(df) <- paste("Gene", c(1:nrow(df)))
  colnames(df) <- paste("Sample", c(1:ncol(df)))
  return(df)
}

exp1 <- crete_exp_df(50, 20)
exp2 <- crete_exp_df(50, 20)

# sample annotation
san <- data.frame(
  id = colnames(exp1),
  group = sample(1:4, 20, replace = TRUE))

Solution

# get ids of samples per group
ids_1 <- san %>% filter(group == 1) %>% pull(id)
ids_2 <- san %>% filter(group == 2) %>% pull(id)
ids_3 <- san %>% filter(group == 3) %>% pull(id)
ids_4 <- san %>% filter(group == 4) %>% pull(id)
id_list <- list(group1 = ids_1, group2 = ids_2, group3 = ids_3, group4 = ids_4)

# fct means df1
get_means_exp1 <- function(id){
  apply(exp1[, id], 1, mean, na.rm = T)
} 
# fct means df2
get_means_exp2 <- function(id){
  apply(exp2[, id], 1, mean, na.rm = T)
} 
# lapply on df1
list_means_exp1 <- lapply(id_list, get_means_exp1)
means_exp1 <- as.data.frame(list_means_exp1)
# lapply on df2
list_means_exp2 <- lapply(id_list, get_means_exp2)
means_exp2 <- as.data.frame(list_means_exp2)

I suppose this can be solved much more elegant. Specifically, how to get the ids per group and write a function that works for both df. Looking forwards to learning from your solutions, thanks a lot!

Solution

So, I worked with the data generation process you provided and came up with a more simple solution. I changed exp1 into a dataframe, brought it in tidy format (pivot_longer()), added the groups from the san dataframe and finally applied the simple dplyr syntax to summarise your data.

library(tidyverse)

as.data.frame(exp1) %>%
  rownames_to_column("Gene") %>%
  pivot_longer(cols= 2:21, names_to = "id", values_to = "Values") %>%
  left_join(., san) %>%
  group_by(group) %>%
  summarise(mean= mean(Values),
            median= median(Values))
#> Joining with `by = join_by(id)`
#> # A tibble: 4 × 3
#>   group     mean  median
#>   <int>    <dbl>   <dbl>
#> 1     1  0.0803   0.0568
#> 2     2 -0.0383  -0.0387
#> 3     3 -0.00929  0.0356
#> 4     4 -0.0840  -0.0306

Considering your comment, simply also group by gene and that gets you the expected output.

library(tidyverse)

as.data.frame(exp1) %>%
  rownames_to_column("Gene") %>%
  pivot_longer(cols= 2:21, names_to = "id", values_to = "Values") %>%
  left_join(., san) %>%
  group_by(group, Gene) %>%
  summarise(mean= mean(Values),
            median= median(Values))
#> Joining with `by = join_by(id)`
#> `summarise()` has grouped output by 'group'. You can override using the
#> `.groups` argument.
#> # A tibble: 200 × 4
#> # Groups:   group [4]
#>    group Gene       mean  median
#>    <int> <chr>     <dbl>   <dbl>
#>  1     1 Gene 1  -0.0642 -0.122 
#>  2     1 Gene 10  0.0151  0.563 
#>  3     1 Gene 11 -0.0585 -0.0367
#>  4     1 Gene 12 -0.978  -0.917 
#>  5     1 Gene 13 -1.01   -1.37  
#>  6     1 Gene 14  0.160  -0.394 
#>  7     1 Gene 15 -0.295  -0.689 
#>  8     1 Gene 16  0.774   0.729 
#>  9     1 Gene 17 -0.356  -0.336 
#> 10     1 Gene 18 -0.741  -0.103 
#> # … with 190 more rows

^{Created on 2023-04-13 with reprex v2.0.2}