Search code examples
rdplyrtidyr

pivot longer by group prefix


I need to pivot longer grouped by column string prefix. the toy example below has two groups "A" and "B" but I need a general tidyverse solution for any number of groups by prefix.

#toy df
set.seed(1)
df <- data.table(
  date = rep(seq(as.Date("2020-01-01"),as.Date("2020-01-05"),by="day"),each=6),
  k = rep(c("A.mean","A.median","A.min","B.mean","B.median","B.min"),5),
  v = runif(30,0,50)
  ) %>%
  pivot_wider(names_from = k, values_from = v)

df %>% head

  date       A.mean A.median  A.min B.mean B.median B.min
  <date>      <dbl>    <dbl>  <dbl>  <dbl>    <dbl> <dbl>
1 2020-01-01   13.3     18.6 28.6    45.4      10.1 44.9 
2 2020-01-02   47.2     33.0 31.5     3.09     10.3  8.83
3 2020-01-03   34.4     19.2 38.5    24.9      35.9 49.6 
4 2020-01-04   19.0     38.9 46.7    10.6      32.6  6.28
5 2020-01-05   13.4     19.3  0.670  19.1      43.5 17.0 

#pivot longer by group prefix
df %>%
  select(date,matches("A\\.")) %>%
  rename_with(~str_replace(.x,"A\\.","")) %>%
  mutate( k = "A") %>%
  bind_rows(
    df %>%
      select(date,matches("B\\.")) %>%
      rename_with(~str_replace(.x,"B\\.","")) %>%
      mutate( k = "B")
  )

   date        mean median    min k    
   <date>     <dbl>  <dbl>  <dbl> <chr>
 1 2020-01-01 13.3    18.6 28.6   A    
 2 2020-01-02 47.2    33.0 31.5   A    
 3 2020-01-03 34.4    19.2 38.5   A    
 4 2020-01-04 19.0    38.9 46.7   A    
 5 2020-01-05 13.4    19.3  0.670 A    
 6 2020-01-01 45.4    10.1 44.9   B    
 7 2020-01-02  3.09   10.3  8.83  B    
 8 2020-01-03 24.9    35.9 49.6   B    
 9 2020-01-04 10.6    32.6  6.28  B    
10 2020-01-05 19.1    43.5 17.0   B 

Solution

  • Here is a two step process (shown in two lines for demonstration purposes). First pivot longer to create columns for k, stat name and value, then pivot wider to create the desired result.

    The edited code below, one can obtain the answer is one step, by using the ".value" wild card option in the "names_to" specifier.

    library(tidyr)
    set.seed(1)
    df <- data.frame(
       date = rep(seq(as.Date("2020-01-01"),as.Date("2020-01-05"),by="day"),each=6),
       k = rep(c("A.mean","A.median","A.min","B.mean","B.median","B.min"),5),
       v = runif(30,0,50)
    ) %>%
       pivot_wider(names_from = k, values_from = v)
    
    
    #temp <- pivot_longer(df, -date, names_sep = "\\.", names_to = c("k", "stat"))
    #answer <- pivot_wider(temp, id_cols = c("date", "k"), names_from= "stat", values_from="value")
    
    #updated answer simplified down to just the pivot longer function
    answer <- pivot_longer(df, -date, names_sep = "\\.", names_to = c("k", ".value"))
    
    print(head(answer))
    # A tibble: 6 x 5
    date       k      mean median   min
    <date>     <chr> <dbl>  <dbl> <dbl>
    1 2020-01-01 A     13.3    18.6 28.6 
    2 2020-01-01 B     45.4    10.1 44.9 
    3 2020-01-02 A     47.2    33.0 31.5 
    4 2020-01-02 B      3.09   10.3  8.83
    5 2020-01-03 A     34.4    19.2 38.5 
    6 2020-01-03 B     24.9    35.9 49.6