I have a simple data frame:
df <- data.frame(X = LETTERS[1:20],
Y = paste0("abc_", 1:20))
# X Y
#1 A abc_1
#2 B abc_2
#3 C abc_3
#4 D abc_4
#5 E abc_5
#6 F abc_6
# ...
I need to assign group ID based on two vectors of integers. One indicates where the group starts, the other indicates where the group ends:
start_ix <- c(2, 5, 8, 10, 15, 18)
end_ix <- c(4, 7, 9, 13, 17, 19)
i.e. the first group is rows 2 through 4, the second is row 5 through 7, and so on. Any row not contained in these indexes (or the span between the start and stop values) should be NA
.
The desired outcome would be:
df_want <- structure(list(X = c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T"),
Y = c("abc_1", "abc_2", "abc_3", "abc_4", "abc_5", "abc_6",
"abc_7", "abc_8", "abc_9", "abc_10", "abc_11", "abc_12",
"abc_13", "abc_14", "abc_15", "abc_16", "abc_17", "abc_18",
"abc_19", "abc_20"), grp = c(NA, 1, 1, 1, 2, 2, 2, 3, 3,
4, 4, 4, 4, NA, 5, 5, 5, 6, 6, NA)), row.names = c(NA, -20L
), class = "data.frame")
# X Y grp
# 1 A abc_1 NA
# 2 B abc_2 1
# 3 C abc_3 1
# 4 D abc_4 1
# 5 E abc_5 2
# 6 F abc_6 2
# 7 G abc_7 2
# 8 H abc_8 3
# 9 I abc_9 3
# 10 J abc_10 4
# 11 K abc_11 4
# 12 L abc_12 4
# 13 M abc_13 4
# 14 N abc_14 NA
# 15 O abc_15 5
# 16 P abc_16 5
# 17 Q abc_17 5
# 18 R abc_18 6
# 19 S abc_19 6
# 20 T abc_20 NA
The solution in my specific case would need to be done in base R, but for the sake of others who may have the same issue feel free to post solutions from external packages.
I have tried a combination of indexing, sorting, and seq
but can't seem to come up with a solution.
A base method using sequence
.
. <- 1 + end_ix - start_ix
df[sequence(., start_ix), "grp"] <- rep(seq_along(.), .)
Another base methode using Map
.
. <- Map(seq, start_ix, end_ix)
df[unlist(.), "grp"] <- rep(seq_along(.), lengths(.))
Benchmark
bench::mark(
jpsmith = local({seqV <- Vectorize(seq.default, vectorize.args = c("to", "from"))
ix <- unlist(seqV(start_ix, end_ix))
df[ix, "grp"] <- rep(1:length(start_ix), (end_ix - start_ix) + 1)
df}),
zx8754 = local({for(i in seq_along(start_ix)){
df[ start_ix[ i ]:end_ix[ i ], "grp"] <- i}
df}),
GKiSeq = local({. <- 1 + end_ix - start_ix
df[sequence(., start_ix), "grp"] <- rep(seq_along(.), .)
df}),
GKiMap = local({. <- Map(seq, start_ix, end_ix)
df[unlist(.), "grp"] <- rep(seq_along(.), lengths(.))
df})
)
# expression min median itr/s…¹ mem_a…² gc/se…³ n_itr n_gc total…⁴ result
# <bch:expr> <bch:tm> <bch:t> <dbl> <bch:b> <dbl> <int> <dbl> <bch:t> <list>
#1 jpsmith 186.1µs 198.4µs 4944. 280B 12.3 2408 6 487ms <df>
#2 zx8754 212.4µs 226.6µs 4339. 280B 12.4 2095 6 483ms <df>
#3 GKiSeq 60.6µs 67.1µs 14679. 280B 12.3 7160 6 488ms <df>
#4 GKiMap 103µs 111.6µs 8748. 280B 12.3 4265 6 488ms <df>