Search code examples
rdataframedplyrrep

Duplicate clustered observations and create a unique identifiers for the duplicated clusters


Consider the small dataset df1. There are 5 clusters identified by ID, row_numbers contains a unique value for each observation and weights identifies how many copies we want to each cluster.

df1 <-data.frame(ID=c("10","20","30","30","30", "40", "40","50","50","50","50"), row_numbers = c(1,2,3,4,5,6,7,8,9,10,11),weights=c(4,3,2,2,2,3,3,2,2,2,2))
df1
#>    ID row_numbers weights
#> 1  10           1       4
#> 2  20           2       3
#> 3  30           3       2
#> 4  30           4       2
#> 5  30           5       2
#> 6  40           6       3
#> 7  40           7       3
#> 8  50           8       2
#> 9  50           9       2
#> 10 50          10       2
#> 11 50          11       2

The expected output is df2 The most important part of df2 is the new variable "newID". The unique identifiers for the duplicated clusters are stored in newID where newID will identify the clusters by using consecutive integers starting from 1.

df2 <-data.frame(ID=c("10","10","10","10","20","20","20","30","30","30","30","30","30", "40", "40","40", "40","40", "40","50","50","50","50","50","50","50","50"), row_numbers = c(1,1,1,1,2,2,2,3,3,4,4,5,5,6,6,6,7,7,7,8,8,9,9,10,10,11,11),weights=c(4,4,4,4,3,3,3,2,2,2,2,2,2,3,3,3,3,3,3,2,2,2,2,2,2,2,2), newID= c(1,2,3,4,5,6,7,8,8,8,9,9,9,10,10,11,11,12,12,13,13,13,13,14,14,14,14))
df2
#>    ID row_numbers weights newID
#> 1  10           1       4     1
#> 2  10           1       4     2
#> 3  10           1       4     3
#> 4  10           1       4     4
#> 5  20           2       3     5
#> 6  20           2       3     6
#> 7  20           2       3     7
#> 8  30           3       2     8
#> 9  30           3       2     8
#> 10 30           4       2     8
#> 11 30           4       2     9
#> 12 30           5       2     9
#> 13 30           5       2     9
#> 14 40           6       3    10
#> 15 40           6       3    10
#> 16 40           6       3    11
#> 17 40           7       3    11
#> 18 40           7       3    12
#> 19 40           7       3    12
#> 20 50           8       2    13
#> 21 50           8       2    13
#> 22 50           9       2    13
#> 23 50           9       2    13
#> 24 50          10       2    14
#> 25 50          10       2    14
#> 26 50          11       2    14
#> 27 50          11       2    14

Solution

  • Here's a solution using a split-apply-bind approach:

    df3 <- do.call(rbind, lapply(split(df1, df1$ID), function(x) 
    {
      group_size   <- nrow(x)
      n_groups     <- x$weights[1]
      if(is.na(n_groups)) n_groups <- 1
      if (n_groups < 1)   n_groups <- 1
      
      group_labels <- rep(paste(x$ID[1], seq(n_groups)), each = group_size)
    
      x <- x[rep(seq(group_size), each = n_groups), ]
      x$newID <- group_labels
      x
    }))
    
    df3$newID <- as.numeric(as.factor(df3$newID))
    df3 <- `rownames<-`(df3, seq(nrow(df3)))
    

    Which matches your expected output:

    df3
    #>    ID row_numbers weights newID
    #> 1  10           1       4     1
    #> 2  10           1       4     2
    #> 3  10           1       4     3
    #> 4  10           1       4     4
    #> 5  20           2       3     5
    #> 6  20           2       3     6
    #> 7  20           2       3     7
    #> 8  30           3       2     8
    #> 9  30           3       2     8
    #> 10 30           4       2     8
    #> 11 30           4       2     9
    #> 12 30           5       2     9
    #> 13 30           5       2     9
    #> 14 40           6       3    10
    #> 15 40           6       3    10
    #> 16 40           6       3    11
    #> 17 40           7       3    11
    #> 18 40           7       3    12
    #> 19 40           7       3    12
    #> 20 50           8       2    13
    #> 21 50           8       2    13
    #> 22 50           9       2    13
    #> 23 50           9       2    13
    #> 24 50          10       2    14
    #> 25 50          10       2    14
    #> 26 50          11       2    14
    #> 27 50          11       2    14
    

    And we can show this is identical to your desired result:

    identical(df2, df3)
    #> [1] TRUE