Search code examples
rdplyrsplittidyrdata-wrangling

Subset dataframe based on a specific number of unique values in a column in R


I have a dataframe:

group_eg <- data.frame(
  CID = c(091, 091, 091, 091,
          101, 101, 101,
          102, 102, 102, 
          103, 103,
          104, 104, 104),
  PID_A = c(1091, 1091, 1091, 1091,
            1101, 1101, 1101,
            1102, 1102, 1102,
            1103, 1103,
            1104, 1104, 1104),
  PID_B = c(2091, 2091, 2091, 2091,
            2101, 2101, 2101, 
            2102, 2102, 2102,
            2103, 2103,
            2104, 2104, 2104),
  text = c("eg1", "eg2", "eg3", "eg3", 
           "eg4","eg5", "eg6", 
           "eg7", "eg8","eg9", 
           "eg10", "eg11", 
           "eg12", "eg13", "eg14")
)

I want to divide this dataframe into a list of smaller dataframes. Each of the smaller dataframes should only contain rows that belong to 2 unique CIDs in ascending order. If the number of CIDs is not divisible by 2, the last dataframe can contain the rows that belong to just 1 CID. Notice that there are different number of observations for each unique CID, so I'm a bit stuck.

Here are the example outputs:

output1 <- data.frame(
  CID = c(091, 091, 091, 091,
          101, 101, 101),
  PID_A = c(1091, 1091, 1091, 1091,
            1101, 1101, 1101),
  PID_B = c(2091, 2091, 2091, 2091,
            2101, 2101, 2101),
  text = c("eg1", "eg2", "eg3", "eg3", 
           "eg4","eg5", "eg6")
)

output2 <- data.frame(
  CID = c(102, 102, 102, 
          103, 103),
  PID_A = c(1102, 1102, 1102,
            1103, 1103),
  PID_B = c(2102, 2102, 2102,
            2103, 2103),
  text = c( "eg7", "eg8","eg9", 
           "eg10", "eg11")
)

output3 <- data.frame(
  CID = c(104, 104, 104),
  PID_A = c(1104, 1104, 1104),
  PID_B = c(2104, 2104, 2104),
  text = c("eg12", "eg13", "eg14")
)

Does anyone know how to do this? Thank you!


Solution

  • Using dplyr::consecutive_id and integer division %/% you could do:

    library(dplyr, warn = FALSE)
    
    group_eg |>
      group_by(group = (consecutive_id(CID) + 1) %/% 2) |>
      group_split()
    #> <list_of<
    #>   tbl_df<
    #>     CID  : double
    #>     PID_A: double
    #>     PID_B: double
    #>     text : character
    #>     group: double
    #>   >
    #> >[3]>
    #> [[1]]
    #> # A tibble: 7 × 5
    #>     CID PID_A PID_B text  group
    #>   <dbl> <dbl> <dbl> <chr> <dbl>
    #> 1    91  1091  2091 eg1       1
    #> 2    91  1091  2091 eg2       1
    #> 3    91  1091  2091 eg3       1
    #> 4    91  1091  2091 eg3       1
    #> 5   101  1101  2101 eg4       1
    #> 6   101  1101  2101 eg5       1
    #> 7   101  1101  2101 eg6       1
    #> 
    #> [[2]]
    #> # A tibble: 5 × 5
    #>     CID PID_A PID_B text  group
    #>   <dbl> <dbl> <dbl> <chr> <dbl>
    #> 1   102  1102  2102 eg7       2
    #> 2   102  1102  2102 eg8       2
    #> 3   102  1102  2102 eg9       2
    #> 4   103  1103  2103 eg10      2
    #> 5   103  1103  2103 eg11      2
    #> 
    #> [[3]]
    #> # A tibble: 3 × 5
    #>     CID PID_A PID_B text  group
    #>   <dbl> <dbl> <dbl> <chr> <dbl>
    #> 1   104  1104  2104 eg12      3
    #> 2   104  1104  2104 eg13      3
    #> 3   104  1104  2104 eg14      3