Duplicate clustered observations and create a unique identifiers for the duplicated clusters

Consider the small dataset df1. There are 5 clusters identified by ID, row_numbers contains a unique value for each observation and weights identifies how many copies we want to each cluster.

df1 <-data.frame(ID=c("10","20","30","30","30", "40", "40","50","50","50","50"), row_numbers = c(1,2,3,4,5,6,7,8,9,10,11),weights=c(4,3,2,2,2,3,3,2,2,2,2))
df1
#>    ID row_numbers weights
#> 1  10           1       4
#> 2  20           2       3
#> 3  30           3       2
#> 4  30           4       2
#> 5  30           5       2
#> 6  40           6       3
#> 7  40           7       3
#> 8  50           8       2
#> 9  50           9       2
#> 10 50          10       2
#> 11 50          11       2

The expected output is df2 The most important part of df2 is the new variable "newID". The unique identifiers for the duplicated clusters are stored in newID where newID will identify the clusters by using consecutive integers starting from 1.

df2 <-data.frame(ID=c("10","10","10","10","20","20","20","30","30","30","30","30","30", "40", "40","40", "40","40", "40","50","50","50","50","50","50","50","50"), row_numbers = c(1,1,1,1,2,2,2,3,3,4,4,5,5,6,6,6,7,7,7,8,8,9,9,10,10,11,11),weights=c(4,4,4,4,3,3,3,2,2,2,2,2,2,3,3,3,3,3,3,2,2,2,2,2,2,2,2), newID= c(1,2,3,4,5,6,7,8,8,8,9,9,9,10,10,11,11,12,12,13,13,13,13,14,14,14,14))
df2
#>    ID row_numbers weights newID
#> 1  10           1       4     1
#> 2  10           1       4     2
#> 3  10           1       4     3
#> 4  10           1       4     4
#> 5  20           2       3     5
#> 6  20           2       3     6
#> 7  20           2       3     7
#> 8  30           3       2     8
#> 9  30           3       2     8
#> 10 30           4       2     8
#> 11 30           4       2     9
#> 12 30           5       2     9
#> 13 30           5       2     9
#> 14 40           6       3    10
#> 15 40           6       3    10
#> 16 40           6       3    11
#> 17 40           7       3    11
#> 18 40           7       3    12
#> 19 40           7       3    12
#> 20 50           8       2    13
#> 21 50           8       2    13
#> 22 50           9       2    13
#> 23 50           9       2    13
#> 24 50          10       2    14
#> 25 50          10       2    14
#> 26 50          11       2    14
#> 27 50          11       2    14

Solution

Here's a solution using a split-apply-bind approach:

df3 <- do.call(rbind, lapply(split(df1, df1$ID), function(x) 
{
  group_size   <- nrow(x)
  n_groups     <- x$weights[1]
  if(is.na(n_groups)) n_groups <- 1
  if (n_groups < 1)   n_groups <- 1
  
  group_labels <- rep(paste(x$ID[1], seq(n_groups)), each = group_size)

  x <- x[rep(seq(group_size), each = n_groups), ]
  x$newID <- group_labels
  x
}))

df3$newID <- as.numeric(as.factor(df3$newID))
df3 <- `rownames<-`(df3, seq(nrow(df3)))

Which matches your expected output:

df3
#>    ID row_numbers weights newID
#> 1  10           1       4     1
#> 2  10           1       4     2
#> 3  10           1       4     3
#> 4  10           1       4     4
#> 5  20           2       3     5
#> 6  20           2       3     6
#> 7  20           2       3     7
#> 8  30           3       2     8
#> 9  30           3       2     8
#> 10 30           4       2     8
#> 11 30           4       2     9
#> 12 30           5       2     9
#> 13 30           5       2     9
#> 14 40           6       3    10
#> 15 40           6       3    10
#> 16 40           6       3    11
#> 17 40           7       3    11
#> 18 40           7       3    12
#> 19 40           7       3    12
#> 20 50           8       2    13
#> 21 50           8       2    13
#> 22 50           9       2    13
#> 23 50           9       2    13
#> 24 50          10       2    14
#> 25 50          10       2    14
#> 26 50          11       2    14
#> 27 50          11       2    14

And we can show this is identical to your desired result:

identical(df2, df3)
#> [1] TRUE