Search code examples
rdataframereshapelong-format-datawide-format-data

Converting dataframe from "wide" to "long" format with pairs of ID variables


Here's an example of what I am trying to do. I am starting with a dataframe in "wide" format, like below.

#sample dataframe
id_1 <- c(260, 500, 640, 720)
id_2 <- c(261, 501, 641, 721)
sleep_1 <- c(7, 3, 10, 6)
sleep_2 <- c(8, 9, 1, 4)
eat_1 <- c(6,8,4,2)
eat_2 <- c(8,1,3,8)
df <- data.frame(id_1, id_2, sleep_1, sleep_2, eat_1, eat_2)

> df
  id_1 id_2 sleep_1 sleep_2 eat_1 eat_2
1  260  261       7       8     6     8
2  500  501       3       9     8     1
3  640  641      10       1     4     3
4  720  721       6       4     2     8
5  801   NA       8      NA     5    NA
6  440  441       4       9     3     6

We can think of id_1 and id_2 as denoting pairs of siblings, in which 260, 261 is a pair, and 500, 501 is a pair etc. I would like to convert this dataframe to one in "long" format like below. In doing so, I would also like to be able to handle cases in which only one member of the pair is present (like 801) and corresponding sibling is NA (as shown above).

    id sleep eat
1  260     7   6
2  261     8   8
3  500     3   8
4  501     9   1
5  640    10   4
6  641     1   3
7  720     6   2
8  721     4   8
9  801     8   5
10 440     4   3
11 441     9   6

Solution

  • One potential option is to use .value in the pivot_longer() function to get the corresponding component of the column name (per https://tidyr.tidyverse.org/reference/pivot_longer.html), e.g.

    library(tidyverse)
    
    id_1 <- c(260, 500, 640, 720)
    id_2 <- c(261, 501, 641, 721)
    sleep_1 <- c(7, 3, 10, 6)
    sleep_2 <- c(8, 9, 1, 4)
    eat_1 <- c(6,8,4,2)
    eat_2 <- c(8,1,3,8)
    df <- data.frame(id_1, id_2, sleep_1, sleep_2, eat_1, eat_2)
    
    df %>%
      pivot_longer(everything(),
                   names_pattern = "(\\w+)_\\d+",
                   names_to = ".value")
    #> # A tibble: 8 × 3
    #>      id sleep   eat
    #>   <dbl> <dbl> <dbl>
    #> 1   260     7     6
    #> 2   261     8     8
    #> 3   500     3     8
    #> 4   501     9     1
    #> 5   640    10     4
    #> 6   641     1     3
    #> 7   720     6     2
    #> 8   721     4     8
    

    Created on 2024-05-23 with reprex v2.1.0


    Regarding NAs, you could filter out the 'missing' patients using filter():

    library(tidyverse)
    
    id_1 <- c(260, 500, 640, 720, 801, 901, 902)
    id_2 <- c(261, 501, 641, 721, NA, 444, 555)
    sleep_1 <- c(7, 3, 10, 6, 8, 10, 12)
    sleep_2 <- c(8, 9, 1, 4, NA, 6, 7)
    eat_1 <- c(6,8,4,2,5,6,7)
    eat_2 <- c(8,1,3,8,NA,5,6)
    df <- data.frame(id_1, id_2, sleep_1, sleep_2, eat_1, eat_2)
    
    df %>%
      pivot_longer(everything(),
                   names_pattern = "(\\w+)_\\d+",
                   names_to = ".value")
    #> # A tibble: 14 × 3
    #>       id sleep   eat
    #>    <dbl> <dbl> <dbl>
    #>  1   260     7     6
    #>  2   261     8     8
    #>  3   500     3     8
    #>  4   501     9     1
    #>  5   640    10     4
    #>  6   641     1     3
    #>  7   720     6     2
    #>  8   721     4     8
    #>  9   801     8     5
    #> 10    NA    NA    NA
    #> 11   901    10     6
    #> 12   444     6     5
    #> 13   902    12     7
    #> 14   555     7     6
    
    df %>%
      pivot_longer(everything(),
                   names_pattern = "(\\w+)_\\d+",
                   names_to = ".value") %>%
      filter(!if_all(everything(), is.na)) # filter out '802' (all NAs)
    #> # A tibble: 13 × 3
    #>       id sleep   eat
    #>    <dbl> <dbl> <dbl>
    #>  1   260     7     6
    #>  2   261     8     8
    #>  3   500     3     8
    #>  4   501     9     1
    #>  5   640    10     4
    #>  6   641     1     3
    #>  7   720     6     2
    #>  8   721     4     8
    #>  9   801     8     5
    #> 10   901    10     6
    #> 11   444     6     5
    #> 12   902    12     7
    #> 13   555     7     6
    
    # or use na.omit()
    df %>%
      pivot_longer(everything(),
                   names_pattern = "(\\w+)_\\d+",
                   names_to = ".value") %>%
      na.omit()
    #> # A tibble: 13 × 3
    #>       id sleep   eat
    #>    <dbl> <dbl> <dbl>
    #>  1   260     7     6
    #>  2   261     8     8
    #>  3   500     3     8
    #>  4   501     9     1
    #>  5   640    10     4
    #>  6   641     1     3
    #>  7   720     6     2
    #>  8   721     4     8
    #>  9   801     8     5
    #> 10   901    10     6
    #> 11   444     6     5
    #> 12   902    12     7
    #> 13   555     7     6
    

    Created on 2024-05-23 with reprex v2.1.0