Search code examples
rdataframelarge-data

How to create a df by extracting a column from each df of a list of dfs in R?


I've many list of dataframes like split_plot_data2, and I want to make a separate single dataframe in which columns 3 and 2 will be there. Column 3 is common for all and col 2 will be separately added with the name of col 1.

head(split_plot_data2)
$EE87786ln1
# A tibble: 1,584 × 4
   EE_variant    lambda variable                     value
   <chr>          <dbl> <chr>                        <dbl>
 1 EE87786ln1 0.0000228 blood_vessel_h16                 0
 2 EE87786ln1 0.0000228 adrenal_gland_h16                0
 3 EE87786ln1 0.0000228 bone_element_h16                 0
 4 EE87786ln1 0.0000228 Bronchus_h16                     0
 5 EE87786ln1 0.0000228 esophagus_h16                    0
 6 EE87786ln1 0.0000228 extraembryonic_structure_h16     0
 7 EE87786ln1 0.0000228 eye_h16                          0
 8 EE87786ln1 0.0000228 gonad_h16                        0
 9 EE87786ln1 0.0000228 heart_h16                        0
10 EE87786ln1 0.0000228 kidney_h16                       0
# ℹ 1,574 more rows
# ℹ Use `print(n = ...)` to see more rows

$EE87787ln1
# A tibble: 1,560 × 4
   EE_variant    lambda variable                     value
   <chr>          <dbl> <chr>                        <dbl>
 1 EE87787ln1 0.0000751 blood_vessel_h16                 0
 2 EE87787ln1 0.0000751 adrenal_gland_h16                0
 3 EE87787ln1 0.0000751 bone_element_h16                 0
 4 EE87787ln1 0.0000751 Bronchus_h16                     0
 5 EE87787ln1 0.0000751 esophagus_h16                    0
 6 EE87787ln1 0.0000751 extraembryonic_structure_h16     0
 7 EE87787ln1 0.0000751 eye_h16                          0
 8 EE87787ln1 0.0000751 gonad_h16                        0
 9 EE87787ln1 0.0000751 heart_h16                        0
10 EE87787ln1 0.0000751 kidney_h16                       0
# ℹ 1,550 more rows
# ℹ Use `print(n = ...)` to see more rows

$EE87788ln1
# A tibble: 1,608 × 4
   EE_variant    lambda variable                     value
   <chr>          <dbl> <chr>                        <dbl>
 1 EE87788ln1 0.0000761 blood_vessel_h16                 0
 2 EE87788ln1 0.0000761 adrenal_gland_h16                0
 3 EE87788ln1 0.0000761 bone_element_h16                 0
 4 EE87788ln1 0.0000761 Bronchus_h16                     0
 5 EE87788ln1 0.0000761 esophagus_h16                    0
 6 EE87788ln1 0.0000761 extraembryonic_structure_h16     0
 7 EE87788ln1 0.0000761 eye_h16                          0
 8 EE87788ln1 0.0000761 gonad_h16                        0
 9 EE87788ln1 0.0000761 heart_h16                        0
10 EE87788ln1 0.0000761 kidney_h16                       0
# ℹ 1,598 more rows
# ℹ Use `print(n = ...)` to see more rows

$EE87789ln1
# A tibble: 1,608 × 4
   EE_variant    lambda variable                     value
   <chr>          <dbl> <chr>                        <dbl>
 1 EE87789ln1 0.0000732 blood_vessel_h16                 0
 2 EE87789ln1 0.0000732 adrenal_gland_h16                0
 3 EE87789ln1 0.0000732 bone_element_h16                 0
 4 EE87789ln1 0.0000732 Bronchus_h16                     0
 5 EE87789ln1 0.0000732 esophagus_h16                    0
 6 EE87789ln1 0.0000732 extraembryonic_structure_h16     0
 7 EE87789ln1 0.0000732 eye_h16                          0
 8 EE87789ln1 0.0000732 gonad_h16                        0
 9 EE87789ln1 0.0000732 heart_h16                        0
10 EE87789ln1 0.0000732 kidney_h16                       0
# ℹ 1,598 more rows
# ℹ Use `print(n = ...)` to see more rows

$EE87790ln1
# A tibble: 1,536 × 4
   EE_variant    lambda variable                     value
   <chr>          <dbl> <chr>                        <dbl>
 1 EE87790ln1 0.0000328 blood_vessel_h16                 0
 2 EE87790ln1 0.0000328 adrenal_gland_h16                0
 3 EE87790ln1 0.0000328 bone_element_h16                 0
 4 EE87790ln1 0.0000328 Bronchus_h16                     0
 5 EE87790ln1 0.0000328 esophagus_h16                    0
 6 EE87790ln1 0.0000328 extraembryonic_structure_h16     0
 7 EE87790ln1 0.0000328 eye_h16                          0
 8 EE87790ln1 0.0000328 gonad_h16                        0
 9 EE87790ln1 0.0000328 heart_h16                        0
10 EE87790ln1 0.0000328 kidney_h16                       0
# ℹ 1,526 more rows
# ℹ Use `print(n = ...)` to see more rows

$EE87791ln1
# A tibble: 1,536 × 4
   EE_variant    lambda variable                     value
   <chr>          <dbl> <chr>                        <dbl>
 1 EE87791ln1 0.0000444 blood_vessel_h16                 0
 2 EE87791ln1 0.0000444 adrenal_gland_h16                0
 3 EE87791ln1 0.0000444 bone_element_h16                 0
 4 EE87791ln1 0.0000444 Bronchus_h16                     0
 5 EE87791ln1 0.0000444 esophagus_h16                    0
 6 EE87791ln1 0.0000444 extraembryonic_structure_h16     0
 7 EE87791ln1 0.0000444 eye_h16                          0
 8 EE87791ln1 0.0000444 gonad_h16                        0
 9 EE87791ln1 0.0000444 heart_h16                        0
10 EE87791ln1 0.0000444 kidney_h16                       0
# ℹ 1,526 more rows
# ℹ Use `print(n = ...)` to see more rows

The expected dataframe would look like this,

head(comb_plot_lamb)
# A tibble: 6 × 8
  Variable               Lambda_EE87802 Lambda_EE87786 Lambda_EE87787 Lambda_EE87788 Lambda_EE87789 Lambda_EE87790 Lambda_EE87791
  <chr>                       <dbl>          <dbl>          <dbl>          <dbl>          <dbl>          <dbl>          <dbl>
1 blood_vessel_h16        0.0000348      0.0000228      0.0000751      0.0000761      0.0000732      0.0000328      0.0000444
2 adrenal_gland_h16       0.0000348      0.0000228      0.0000751      0.0000761      0.0000732      0.0000328      0.0000444
3 bone_element_h16        0.0000348      0.0000228      0.0000751      0.0000761      0.0000732      0.0000328      0.0000444
4 Bronchus_h16            0.0000348      0.0000228      0.0000751      0.0000761      0.0000732      0.0000328      0.0000444
5 esophagus_h16           0.0000348      0.0000228      0.0000751      0.0000761      0.0000732      0.0000328      0.0000444
6 extraembryonic_struct…  0.0000348      0.0000228      0.0000751      0.0000761      0.0000732      0.0000328      0.0000444

What I did, but I wonder how can I make this thing at a go rather than writing multiple lines to get every column of lambda values.

n1 <- sapply(split_plot_data2, nrow)
n2 <- max(n1)
n4 <- which.max(n1)
comb_plot_data <- split_plot_data2[[n4]][1:4]
comb_plot_data2 <- comb_plot_data[, c(3,2)]

colnames(comb_plot_data2) <- c('Variable','Lambda_EE87802)

comb_plot_lamb <- comb_plot_data2

comb_plot_lamb$Lambda_EE87786 <- c(split_plot_data2[[1]]$lambda, rep(NA, nrow(comb_plot_lamb)-length(split_plot_data2[[1]]$lambda)))
comb_plot_lamb$Lambda_EE87787 <- c(split_plot_data2[[2]]$lambda, rep(NA, nrow(comb_plot_lamb)-length(split_plot_data2[[2]]$lambda)))
comb_plot_lamb$Lambda_EE87788 <- c(split_plot_data2[[3]]$lambda, rep(NA, nrow(comb_plot_lamb)-length(split_plot_data2[[3]]$lambda)))
comb_plot_lamb$Lambda_EE87789 <- c(split_plot_data2[[4]]$lambda, rep(NA, nrow(comb_plot_lamb)-length(split_plot_data2[[4]]$lambda)))
comb_plot_lamb$Lambda_EE87790 <- c(split_plot_data2[[5]]$lambda, rep(NA, nrow(comb_plot_lamb)-length(split_plot_data2[[5]]$lambda)))
comb_plot_lamb$Lambda_EE87791 <- c(split_plot_data2[[6]]$lambda, rep(NA, nrow(comb_plot_lamb)-length(split_plot_data2[[6]]$lambda)))

Solution

  • Here is a tidyverse solution.

    suppressPackageStartupMessages({
      library(dplyr)
      library(tidyr)
      library(purrr)
    })
    
    df1 %>%
      group_split(EE_variant) %>%
      map(\(x) {
        newname <- first(x$EE_variant) %>% as.character()
        newname <- paste0("Lambda_", newname)
        x %>%
          select(-EE_variant) %>%
          rename(!!sym(newname) := lambda) %>% 
          select(2, 1)
      }) %>%
      reduce(left_join, by = "variable")
    #> # A tibble: 50 × 4
    #>    variable Lambda_EE87786 Lambda_EE87787 Lambda_EE87802
    #>    <chr>             <dbl>          <dbl>          <dbl>
    #>  1 var 1               1.4            2.5            0.2
    #>  2 var 2               1.5            1.9            0.2
    #>  3 var 3               1.5            2.1            0.2
    #>  4 var 4               1.3            1.8            0.2
    #>  5 var 5               1.5            2.2            0.2
    #>  6 var 6               1.3            2.1            0.4
    #>  7 var 7               1.6            1.7            0.3
    #>  8 var 8               1              1.8            0.2
    #>  9 var 9               1.3            1.8            0.2
    #> 10 var 10              1.4            2.5            0.1
    #> # ℹ 40 more rows
    

    Created on 2023-09-03 with reprex v2.0.2


    Edit

    Inspired in Limey's answer, here is a solution that binds the rows and then reshapes the data to wide format.

    suppressPackageStartupMessages({
      library(dplyr)
      library(tidyr)
      library(purrr)
    })
    
    df1 %>%
      group_by(variable, EE_variant) %>%
      mutate(id = row_number()) %>%
      ungroup() %>%
      group_split(EE_variant) %>%
      map(select, -value) %>%
      bind_rows() %>% 
      pivot_wider(
        id_cols = c(id, variable),
        names_from = EE_variant,
        names_glue = "Lambda_{EE_variant}",
        values_from = lambda
      ) %>%
      select(-id) %>% 
      arrange(variable)
    #> # A tibble: 50 × 4
    #>    variable Lambda_EE87786 Lambda_EE87787 Lambda_EE87802
    #>    <chr>             <dbl>          <dbl>          <dbl>
    #>  1 var 01              1.6            1.7            0.3
    #>  2 var 01              1              1.8            0.2
    #>  3 var 01              1.3            1.8            0.2
    #>  4 var 01              1.4            2.5            0.1
    #>  5 var 01              1.7            1.8            0.2
    #>  6 var 01              1              1.6            0.2
    #>  7 var 01              1.4            2.3            0.3
    #>  8 var 01              1.3            2.5            0.4
    #>  9 var 01              1.1            2.3            0.2
    #> 10 var 02              1.3            2.1            0.4
    #> # ℹ 40 more rows
    

    Created on 2023-09-03 with reprex v2.0.2


    Test data

    # use a built.in data set to make the example reproducible
    set.seed(2023)
    df1 <- iris[c(5:3)]
    df1$variable <- sprintf("var %02d", sample(10, 50, TRUE))
    df1 <- df1[c(1, 2, 4, 3)]
    names(df1) <- c("EE_variant", "lambda", "variable", "value")
    df1$EE_variant <- as.character(df1$EE_variant)
    df1$EE_variant <- rep(c("EE87802", "EE87786", "EE87787"), each = 50)
    head(df1)
    #>   EE_variant lambda variable value
    #> 1    EE87802    0.2   var 05   1.4
    #> 2    EE87802    0.2   var 09   1.4
    #> 3    EE87802    0.2   var 08   1.3
    #> 4    EE87802    0.2   var 03   1.5
    #> 5    EE87802    0.2   var 10   1.4
    #> 6    EE87802    0.4   var 02   1.7
    

    Created on 2023-09-03 with reprex v2.0.2