I have a dataset that looks like the following:
id <- c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2)
date <- c("2022-11-01 22:22:01","2022-11-01 22:22:01","2022-11-18 12:48:16","2022-11-19 20:57:44","2022-11-19 20:57:44","2022-11-28 13:33:28","2022-11-29 19:24:28",
"2022-11-29 19:24:28","2022-11-01 10:02:47","2022-11-01 10:02:47","2022-11-08 02:48:37","2022-11-08 02:48:37","2022-11-17 17:35:17","2022-11-17 17:35:17",
"2022-11-22 12:30:20","2022-11-22 12:30:20","2022-11-30 09:47:45")
type <- c("aaa", "aaa", "bbb", "ccc", "aaa", "ccc", "aaa", "bbb", "bbb", "aaa", "bbb", "ccc", "bbb", "aaa", "ccc", "bbb", "ddd")
o_number <- c(NA, NA, NA, NA, 11, NA, NA, 12, NA, NA, NA, NA, NA, 13, NA, NA, 14)
total <- c(0, 0, 0, 0, 100, 0, 0, 200, 0, 0, 0, 0, 0, 300, 0, 0, 400)
library(data.table)
df <- data.table(id, date, type, o_number, total)
I would like to concatenate all the text elements of the column "type" from the last 10 or 30 days, per "id", in every row where there is an "o_number". The solution should take the "date" column into account, so that a time range can be selected (e.g. 10 or 30 days).
The result for a period of 30 days should look like this:
id date type o_number total type_over_last_30days_per_id
1: 1 2022-11-01 22:22:01 aaa NA 0
2: 1 2022-11-01 22:22:01 aaa NA 0
3: 1 2022-11-18 12:48:16 bbb NA 0
4: 1 2022-11-19 20:57:44 ccc NA 0
5: 1 2022-11-19 20:57:44 aaa 11 100 aaa > aaa > bbb > ccc > aaa
6: 1 2022-11-28 13:33:28 ccc NA 0
7: 1 2022-11-29 19:24:28 aaa NA 0
8: 1 2022-11-29 19:24:28 bbb 12 200 aaa > aaa > bbb > ccc > aaa > ccc > aaa > bbb
9: 2 2022-11-01 10:02:47 bbb NA 0
10: 2 2022-11-01 10:02:47 aaa NA 0
11: 2 2022-11-08 02:48:37 bbb NA 0
12: 2 2022-11-08 02:48:37 ccc NA 0
13: 2 2022-11-17 17:35:17 bbb NA 0
14: 2 2022-11-17 17:35:17 aaa 13 300 bbb > aaa > bbb > ccc > bbb > aaa
15: 2 2022-11-22 12:30:20 ccc NA 0
16: 2 2022-11-22 12:30:20 bbb NA 0
17: 2 2022-11-30 09:47:45 ddd 14 400 bbb > aaa > bbb > ccc > bbb > aaa > ccc > bbb > ddd
I tried a lot of variations of the following code (in this case I used 30 days, but it should also work with 10 days):
df %>%
filter(date >= (date - days(30)) & (date - days(30)) <= date) %>%
dplyr::group_by(id, o_number) %>%
dplyr::summarise(type_over_last_30days_per_id = paste(type, collapse = ">"))
Can you please help me?
I'm sure there's a better way to do this but I gave this a shot.
I added a grp_id column that gives a row index within each group, then used an ifelse statement to build, for every row where o_number is not NA, the full string of that id's type values concatenated by ">".
Next, that string is cut down to the first grp_id elements, so that only the type changes up to that row remain.
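For example, using id 1 from your data: its full concatenated string is "aaa>aaa>bbb>ccc>aaa>ccc>aaa>bbb", and the row with o_number 11 is the 5th row of that id, so only the first 5 pieces are kept:
full_str <- "aaa>aaa>bbb>ccc>aaa>ccc>aaa>bbb"  # all type values of id 1, in order
grp_id   <- 5                                  # the o_number == 11 row is the 5th row of id 1
paste(strsplit(full_str, ">")[[1]][1:grp_id], collapse = " > ")
#> [1] "aaa > aaa > bbb > ccc > aaa"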
I made some updates to handle any date range that you want, with the caveat that the range starts at the minimum date per id. If you want to start from a different date, subset df first.
Additionally, the data you provided won't work as is if the date range is 10 days, since for both ids o_number is NA within the 10-day period starting from the minimum date.
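You can check this directly (once date has been converted to POSIXct, as done below): a 10-day window starting at each id's first date contains no o_number values at all.
library(lubridate)
df[, .SD[date <= min(date) + days(10)], by = .(id)][!is.na(o_number)]
# 0-row data.table for this data, which is why the function below stops when given 10 days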
For this I built a little function that can handle your df and includes an option to set your date range.
Hope this helps!
library(data.table)
library(lubridate)
df <- data.table(id,date,type, o_number, total)
df[, date := as.POSIXct(date)] # convert to datetime; defaults to the local timezone
# Generate type changes per id over a specified number of days.
type_change_summary = function(d, n_days) {
  # Subset each id to the rows within n_days of that id's first date
  df_sub = d[, .SD[date <= min(date) %m+% days(n_days)], by = .(id)]
  # Err if there are only NAs within the selected range
  if (all(is.na(df_sub$o_number))) {
    stop("o_number has no non-NA values within the selected date range")
  }
  # Row index within each id
  df_sub[, grp_id := rowid(id)]
  # Full string of all type values per id, kept only where o_number is not NA
  df_sub[, rng := ifelse(!is.na(o_number), paste(type, collapse = ">"), NA_character_), by = .(id)]
  # Generate col name
  col = paste0("type_over_last_", n_days, "days_per_id")
  # Row by row, keep only the first grp_id elements of the concatenated string
  df_sub[!is.na(rng),
         (col) := apply(.SD, 1, function(x) {
           # x is a named character vector holding rng and grp_id
           paste(strsplit(x["rng"], ">")[[1]][1:as.integer(x["grp_id"])], collapse = " > ")
         }),
         .SDcols = c("rng", "grp_id")][, `:=`(grp_id = NULL, rng = NULL)]
  return(df_sub)
}
type_change_summary(df, 20) # set to 20 days.
id date type o_number total type_over_last_20days_per_id
1: 1 2022-11-01 22:22:01 aaa NA 0 <NA>
2: 1 2022-11-01 22:22:01 aaa NA 0 <NA>
3: 1 2022-11-18 12:48:16 bbb NA 0 <NA>
4: 1 2022-11-19 20:57:44 ccc NA 0 <NA>
5: 1 2022-11-19 20:57:44 aaa 11 100 aaa > aaa > bbb > ccc > aaa
6: 2 2022-11-01 10:02:47 bbb NA 0 <NA>
7: 2 2022-11-01 10:02:47 aaa NA 0 <NA>
8: 2 2022-11-08 02:48:37 bbb NA 0 <NA>
9: 2 2022-11-08 02:48:37 ccc NA 0 <NA>
10: 2 2022-11-17 17:35:17 bbb NA 0 <NA>
11: 2 2022-11-17 17:35:17 aaa 13 300 bbb > aaa > bbb > ccc > bbb > aaa
Example: subsetting the data to dates from "2022-11-17" onward and using 10 days.
df_2 = df[date >= as_date("2022-11-17")]
type_change_summary(df_2, 10)
id date type o_number total type_over_last_10days_per_id
1: 1 2022-11-18 12:48:16 bbb NA 0 <NA>
2: 1 2022-11-19 20:57:44 ccc NA 0 <NA>
3: 1 2022-11-19 20:57:44 aaa 11 100 bbb > ccc > aaa
4: 2 2022-11-17 17:35:17 bbb NA 0 <NA>
5: 2 2022-11-17 17:35:17 aaa 13 300 bbb > aaa
6: 2 2022-11-22 12:30:20 ccc NA 0 <NA>
7: 2 2022-11-22 12:30:20 bbb NA 0 <NA>
The original version below does the same thing on the full df, without the date-range handling:
library(data.table)
# Get row id per group
df[, grp_id := rowid(id)]
# Full string of all type values per id, kept only where o_number is not NA
df[, last_30 := ifelse(!is.na(o_number), paste(type, collapse = ">"), NA_character_), by = .(id)][]
# Row by row, keep only the first grp_id elements of the concatenated string
df[!is.na(last_30),
   type_over_last_30days_per_id := apply(.SD, 1, function(x) {
     # x is a named character vector holding last_30 and grp_id
     paste(strsplit(x["last_30"], ">")[[1]][1:as.integer(x["grp_id"])], collapse = " > ")
   }),
   .SDcols = c("last_30", "grp_id")][, `:=`(grp_id = NULL, last_30 = NULL)]
> df
id date type o_number total type_over_last_30days_per_id
1: 1 2022-11-01 22:22:01 aaa NA 0 <NA>
2: 1 2022-11-01 22:22:01 aaa NA 0 <NA>
3: 1 2022-11-18 12:48:16 bbb NA 0 <NA>
4: 1 2022-11-19 20:57:44 ccc NA 0 <NA>
5: 1 2022-11-19 20:57:44 aaa 11 100 aaa > aaa > bbb > ccc > aaa
6: 1 2022-11-28 13:33:28 ccc NA 0 <NA>
7: 1 2022-11-29 19:24:28 aaa NA 0 <NA>
8: 1 2022-11-29 19:24:28 bbb 12 200 aaa > aaa > bbb > ccc > aaa > ccc > aaa > bbb
9: 2 2022-11-01 10:02:47 bbb NA 0 <NA>
10: 2 2022-11-01 10:02:47 aaa NA 0 <NA>
11: 2 2022-11-08 02:48:37 bbb NA 0 <NA>
12: 2 2022-11-08 02:48:37 ccc NA 0 <NA>
13: 2 2022-11-17 17:35:17 bbb NA 0 <NA>
14: 2 2022-11-17 17:35:17 aaa 13 300 bbb > aaa > bbb > ccc > bbb > aaa
15: 2 2022-11-22 12:30:20 ccc NA 0 <NA>
16: 2 2022-11-22 12:30:20 bbb NA 0 <NA>
17: 2 2022-11-30 09:47:45 ddd 14 400 bbb > aaa > bbb > ccc > bbb > aaa > ccc > bbb > ddd
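If what you actually need is a true rolling window, i.e. all type values within the N days before each row that has an o_number (rather than a window anchored at each id's first date), a non-equi self-join is one way to sketch it. The name rolling_type_summary and the helper column row are just illustrative; this assumes date is already POSIXct and that df is ordered by date within id, as in your example.
library(data.table)
library(lubridate)

rolling_type_summary <- function(d, n_days) {
  d <- copy(d)                      # don't modify the input by reference
  d[, row := .I]                    # temporary row identifier for joining back
  # One window per row that has an o_number: [date - n_days, date]
  targets <- d[!is.na(o_number),
               .(row, id, win_start = date - days(n_days), win_end = date)]
  # Non-equi self-join: for each window, collect the types of the same id
  # whose date falls inside the window
  joined <- d[targets,
              on = .(id, date >= win_start, date <= win_end),
              .(row = i.row, types = paste(x.type, collapse = " > ")),
              by = .EACHI][, .(row, types)]
  col <- paste0("type_over_last_", n_days, "days_per_id")
  d[joined, (col) := i.types, on = "row"][, row := NULL][]
}

rolling_type_summary(df, 30)
rolling_type_summary(df, 10)
For your example data and 30 days this should reproduce the type_over_last_30days_per_id column you expect, and with 10 days it should give e.g. "bbb > ccc > aaa" for o_number 11, without having to subset df first.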