Search code examples
r dplyr split tidyverse tidytable

How to name the list of the group_split output in dplyr


I have the following process which uses group_split of dplyr:

library(tidyverse)
set.seed(1)
iris %>% sample_n(size = 5) %>% 
    group_by(Species) %>% 
    group_split()

The result is:

[[1]]
# A tibble: 2 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
         <dbl>       <dbl>        <dbl>       <dbl> <fct>  
1          5           3.5          1.6         0.6 setosa 
2          5.1         3.8          1.5         0.3 setosa 

[[2]]
# A tibble: 2 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
         <dbl>       <dbl>        <dbl>       <dbl> <fct>     
1          5.9         3            4.2         1.5 versicolor
2          6.2         2.2          4.5         1.5 versicolor

[[3]]
# A tibble: 1 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
         <dbl>       <dbl>        <dbl>       <dbl> <fct>    
1          6.2         3.4          5.4         2.3 virginica

What I want to achieve is to name each element of this list after its group (i.e. its Species value), yielding this (done by hand):

$setosa
# A tibble: 2 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
         <dbl>       <dbl>        <dbl>       <dbl> <fct>  
1          5           3.5          1.6         0.6 setosa 
2          5.1         3.8          1.5         0.3 setosa 

$versicolor
# A tibble: 2 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
         <dbl>       <dbl>        <dbl>       <dbl> <fct>     
1          5.9         3            4.2         1.5 versicolor
2          6.2         2.2          4.5         1.5 versicolor

$virginica
# A tibble: 1 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
         <dbl>       <dbl>        <dbl>       <dbl> <fct>    
1          6.2         3.4          5.4         2.3 virginica

How can I achieve that?

Update

I tried this with new data, where the grouping column is now called Cluster:

df <- structure(list(Cluster = c("Cluster9", "Cluster11", "Cluster1", 
"Cluster9", "Cluster6", "Cluster12", "Cluster9", "Cluster11", 
"Cluster8", "Cluster8"), gene_name = c("Tbc1d8", "Vimp", "Grhpr", 
"H1f0", "Zfp398", "Pikfyve", "Ankrd13a", "Fgfr1op2", "Golga7", 
"Lars2"), p_value = c(3.46629097620496e-47, 3.16837338947245e-62, 
1.55108439059684e-06, 9.46078511685542e-131, 0.000354049720507017, 
0.0146807415917158, 1.42799750295289e-38, 2.0697825959399e-08, 
4.13777221466668e-06, 3.92889640704683e-184), morans_test_statistic = c(14.3797687352223, 
16.6057085487911, 4.66393667525872, 24.301453902967, 3.38642377758137, 
2.17859882998961, 12.9350063459509, 5.48479186018979, 4.4579286289179, 
28.9144540271157), morans_I = c(0.0814728893885783, 0.0947505609609695, 
0.0260671534007409, 0.138921824574569, 0.018764800166045, 0.0119813199210325, 
0.0736554862590782, 0.0309849638728409, 0.0250591347318986, 0.165310420808725
), q_value = c(1.57917584337356e-46, 1.62106594498462e-61, 3.43312171446844e-06, 
6.99503520654745e-130, 0.000683559649593623, 0.0245476826213791, 
5.96116678335584e-38, 4.97603701391971e-08, 8.9649490080526e-06, 
3.48152096326702e-183)), row.names = c(NA, -10L), class = c("tbl_df", 
"tbl", "data.frame"))

With Ronak Shah's approach I get an inconsistent result:

df %>% group_split(Cluster) %>% setNames(unique(df$Cluster))
$Cluster9
# A tibble: 1 x 6
  Cluster  gene_name    p_value morans_test_statistic morans_I    q_value
  <chr>    <chr>          <dbl>                 <dbl>    <dbl>      <dbl>
1 Cluster1 Grhpr     0.00000155                  4.66   0.0261 0.00000343

$Cluster11
# A tibble: 2 x 6
  Cluster   gene_name  p_value morans_test_statistic morans_I  q_value
  <chr>     <chr>        <dbl>                 <dbl>    <dbl>    <dbl>
1 Cluster11 Vimp      3.17e-62                 16.6    0.0948 1.62e-61
2 Cluster11 Fgfr1op2  2.07e- 8                  5.48   0.0310 4.98e- 8

$Cluster1
# A tibble: 1 x 6
  Cluster   gene_name p_value morans_test_statistic morans_I q_value
  <chr>     <chr>       <dbl>                 <dbl>    <dbl>   <dbl>
1 Cluster12 Pikfyve    0.0147                  2.18   0.0120  0.0245

$Cluster6
# A tibble: 1 x 6
  Cluster  gene_name  p_value morans_test_statistic morans_I  q_value
  <chr>    <chr>        <dbl>                 <dbl>    <dbl>    <dbl>
1 Cluster6 Zfp398    0.000354                  3.39   0.0188 0.000684

$Cluster12
# A tibble: 2 x 6
  Cluster  gene_name   p_value morans_test_statistic morans_I   q_value
  <chr>    <chr>         <dbl>                 <dbl>    <dbl>     <dbl>
1 Cluster8 Golga7    4.14e-  6                  4.46   0.0251 8.96e-  6
2 Cluster8 Lars2     3.93e-184                 28.9    0.165  3.48e-183

$Cluster8
# A tibble: 3 x 6
  Cluster  gene_name   p_value morans_test_statistic morans_I   q_value
  <chr>    <chr>         <dbl>                 <dbl>    <dbl>     <dbl>
1 Cluster9 Tbc1d8    3.47e- 47                  14.4   0.0815 1.58e- 46
2 Cluster9 H1f0      9.46e-131                  24.3   0.139  7.00e-130
3 Cluster9 Ankrd13a  1.43e- 38                  12.9   0.0737 5.96e- 38

Note that the element named $Cluster9 contains the rows for Cluster1, i.e. the names do not match the contents.

Please advise how to go about this.


Solution

  • Lots of good answers. You can also just do:

    iris %>% sample_n(size = 5) %>% 
      split(f = as.factor(.$Species))
    

    Which will give you:

    $setosa
      Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    4          5.5         3.5          1.3         0.2  setosa
    5          5.3         3.7          1.5         0.2  setosa
    
    $versicolor
      Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
    3            5         2.3          3.3           1 versicolor
    
    $virginica
      Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
    1          7.7         2.6          6.9         2.3 virginica
    2          7.2         3.0          5.8         1.6 virginica
    

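    This also works with your data frame above. split() takes the list names from the factor levels themselves, so every name is guaranteed to match its contents. By contrast, unique(df$Cluster) returns the values in order of first appearance, which need not match the sorted group order that group_split() returns: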

    df %>% 
      split(f = as.factor(.$Cluster))
    

    Gives you:

    $Cluster1
    # A tibble: 1 x 6
      Cluster  gene_name    p_value morans_test_statistic morans_I    q_value
      <chr>    <chr>          <dbl>                 <dbl>    <dbl>      <dbl>
    1 Cluster1 Grhpr     0.00000155                  4.66   0.0261 0.00000343
    
    $Cluster11
    # A tibble: 2 x 6
      Cluster   gene_name  p_value morans_test_statistic morans_I  q_value
      <chr>     <chr>        <dbl>                 <dbl>    <dbl>    <dbl>
    1 Cluster11 Vimp      3.17e-62                 16.6    0.0948 1.62e-61
    2 Cluster11 Fgfr1op2  2.07e- 8                  5.48   0.0310 4.98e- 8
    
    $Cluster12
    # A tibble: 1 x 6
      Cluster   gene_name p_value morans_test_statistic morans_I q_value
      <chr>     <chr>       <dbl>                 <dbl>    <dbl>   <dbl>
    1 Cluster12 Pikfyve    0.0147                  2.18   0.0120  0.0245
    
    $Cluster6
    # A tibble: 1 x 6
      Cluster  gene_name  p_value morans_test_statistic morans_I  q_value
      <chr>    <chr>        <dbl>                 <dbl>    <dbl>    <dbl>
    1 Cluster6 Zfp398    0.000354                  3.39   0.0188 0.000684
    
    $Cluster8
    # A tibble: 2 x 6
      Cluster  gene_name   p_value morans_test_statistic morans_I   q_value
      <chr>    <chr>         <dbl>                 <dbl>    <dbl>     <dbl>
    1 Cluster8 Golga7    4.14e-  6                  4.46   0.0251 8.96e-  6
    2 Cluster8 Lars2     3.93e-184                 28.9    0.165  3.48e-183
    
    $Cluster9
    # A tibble: 3 x 6
      Cluster  gene_name   p_value morans_test_statistic morans_I   q_value
      <chr>    <chr>         <dbl>                 <dbl>    <dbl>     <dbl>
    1 Cluster9 Tbc1d8    3.47e- 47                  14.4   0.0815 1.58e- 46
    2 Cluster9 H1f0      9.46e-131                  24.3   0.139  7.00e-130
    3 Cluster9 Ankrd13a  1.43e- 38                  12.9   0.0737 5.96e- 38