I have a dataset containing purchases made by different households across different retailers. For eg
Using dput()
structure(list(household_code = c(76, 76, 76, 76, 76, 76, 76,
76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 126,
126, 126, 126, 126, 126, 126, 126, 126), trip_code_uc = c(1032497498L,
1025776063L, 1029419047L, 1030418100L, 1029502602L, 1034153056L,
1027035051L, 1027533991L, 1033515804L, 1032998207L, 1032066227L,
1028192785L, 1033419039L, 1028730296L, 1027388499L, 1030652869L,
1025638394L, 1034032718L, 1034032718L, 1025678520L, 1029490031L,
1029898838L, 1028024134L, 1030324171L, 1031983761L, 1031983761L,
1033767148L, 1023953965L, 1030954113L, 1030954113L, 1027392968L
), purchase_date = structure(c(1L, 2L, 23L, 50L, 52L, 74L, 77L,
94L, 148L, 158L, 176L, 179L, 196L, 211L, 224L, 246L, 271L, 286L,
286L, 309L, 329L, 346L, 2L, 9L, 46L, 46L, 50L, 58L, 66L, 66L,
68L), .Label = c("2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04",
"2012-01-05", "2012-01-06", "2012-01-07", "2012-01-08", "2012-01-09",
"2012-01-10", "2012-01-11", "2012-01-12", "2012-01-13", "2012-01-14",
"2012-01-15", "2012-01-16", "2012-01-17", "2012-01-18", "2012-01-19",
"2012-01-20", "2012-01-21", "2012-01-22", "2012-01-23", "2012-01-24",
"2012-01-25", "2012-01-26", "2012-01-27", "2012-01-28", "2012-01-29",
"2012-01-30", "2012-01-31", "2012-02-01", "2012-02-02", "2012-02-03",
"2012-02-04", "2012-02-05", "2012-02-06", "2012-02-07", "2012-02-08",
"2012-02-09", "2012-02-10", "2012-02-11", "2012-02-12", "2012-02-13",
"2012-02-14", "2012-02-15", "2012-02-16", "2012-02-17", "2012-02-18",
"2012-02-19", "2012-02-20", "2012-02-21", "2012-02-22", "2012-02-23",
"2012-02-24", "2012-02-25", "2012-02-26", "2012-02-27", "2012-02-28",
"2012-02-29", "2012-03-01", "2012-03-02", "2012-03-03", "2012-03-04",
"2012-03-05", "2012-03-06", "2012-03-07", "2012-03-08", "2012-03-09",
"2012-03-10", "2012-03-11", "2012-03-12", "2012-03-13", "2012-03-14",
"2012-03-15", "2012-03-16", "2012-03-17", "2012-03-18", "2012-03-19",
"2012-03-20", "2012-03-21", "2012-03-22", "2012-03-23", "2012-03-24",
"2012-03-25", "2012-03-26", "2012-03-27", "2012-03-28", "2012-03-29",
"2012-03-30", "2012-03-31", "2012-04-01", "2012-04-02", "2012-04-03",
"2012-04-04", "2012-04-05", "2012-04-06", "2012-04-07", "2012-04-08",
"2012-04-09", "2012-04-10", "2012-04-11", "2012-04-12", "2012-04-13",
"2012-04-14", "2012-04-15", "2012-04-16", "2012-04-17", "2012-04-18",
"2012-04-19", "2012-04-20", "2012-04-21", "2012-04-22", "2012-04-23",
"2012-04-24", "2012-04-25", "2012-04-26", "2012-04-27", "2012-04-28",
"2012-04-29", "2012-04-30", "2012-05-01", "2012-05-02", "2012-05-03",
"2012-05-04", "2012-05-05", "2012-05-06", "2012-05-07", "2012-05-08",
"2012-05-09", "2012-05-10", "2012-05-11", "2012-05-12", "2012-05-13",
"2012-05-14", "2012-05-15", "2012-05-16", "2012-05-17", "2012-05-18",
"2012-05-19", "2012-05-20", "2012-05-21", "2012-05-22", "2012-05-23",
"2012-05-24", "2012-05-25", "2012-05-26", "2012-05-27", "2012-05-28",
"2012-05-29", "2012-05-30", "2012-05-31", "2012-06-01", "2012-06-02",
"2012-06-03", "2012-06-04", "2012-06-05", "2012-06-06", "2012-06-07",
"2012-06-08", "2012-06-09", "2012-06-10", "2012-06-11", "2012-06-12",
"2012-06-13", "2012-06-14", "2012-06-15", "2012-06-16", "2012-06-17",
"2012-06-18", "2012-06-19", "2012-06-20", "2012-06-21", "2012-06-22",
"2012-06-23", "2012-06-24", "2012-06-25", "2012-06-26", "2012-06-27",
"2012-06-28", "2012-06-29", "2012-06-30", "2012-07-01", "2012-07-02",
"2012-07-03", "2012-07-04", "2012-07-05", "2012-07-06", "2012-07-07",
"2012-07-08", "2012-07-09", "2012-07-10", "2012-07-11", "2012-07-12",
"2012-07-13", "2012-07-14", "2012-07-15", "2012-07-16", "2012-07-17",
"2012-07-18", "2012-07-19", "2012-07-20", "2012-07-21", "2012-07-22",
"2012-07-23", "2012-07-24", "2012-07-25", "2012-07-26", "2012-07-27",
"2012-07-28", "2012-07-29", "2012-07-30", "2012-07-31", "2012-08-01",
"2012-08-02", "2012-08-03", "2012-08-04", "2012-08-05", "2012-08-06",
"2012-08-07", "2012-08-08", "2012-08-09", "2012-08-10", "2012-08-11",
"2012-08-12", "2012-08-13", "2012-08-14", "2012-08-15", "2012-08-16",
"2012-08-17", "2012-08-18", "2012-08-19", "2012-08-20", "2012-08-21",
"2012-08-22", "2012-08-23", "2012-08-24", "2012-08-25", "2012-08-26",
"2012-08-27", "2012-08-28", "2012-08-29", "2012-08-30", "2012-08-31",
"2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04", "2012-09-05",
"2012-09-06", "2012-09-07", "2012-09-08", "2012-09-09", "2012-09-10",
"2012-09-11", "2012-09-12", "2012-09-13", "2012-09-14", "2012-09-15",
"2012-09-16", "2012-09-17", "2012-09-18", "2012-09-19", "2012-09-20",
"2012-09-21", "2012-09-22", "2012-09-23", "2012-09-24", "2012-09-25",
"2012-09-26", "2012-09-27", "2012-09-28", "2012-09-29", "2012-09-30",
"2012-10-01", "2012-10-02", "2012-10-03", "2012-10-04", "2012-10-05",
"2012-10-06", "2012-10-07", "2012-10-08", "2012-10-09", "2012-10-10",
"2012-10-11", "2012-10-12", "2012-10-13", "2012-10-14", "2012-10-15",
"2012-10-16", "2012-10-17", "2012-10-18", "2012-10-19", "2012-10-20",
"2012-10-21", "2012-10-22", "2012-10-23", "2012-10-24", "2012-10-25",
"2012-10-26", "2012-10-27", "2012-10-28", "2012-10-29", "2012-10-30",
"2012-10-31", "2012-11-01", "2012-11-02", "2012-11-03", "2012-11-04",
"2012-11-05", "2012-11-06", "2012-11-07", "2012-11-08", "2012-11-09",
"2012-11-10", "2012-11-11", "2012-11-12", "2012-11-13", "2012-11-14",
"2012-11-15", "2012-11-16", "2012-11-17", "2012-11-18", "2012-11-19",
"2012-11-20", "2012-11-21", "2012-11-22", "2012-11-23", "2012-11-24",
"2012-11-25", "2012-11-26", "2012-11-27", "2012-11-28", "2012-11-29",
"2012-11-30", "2012-12-01", "2012-12-02", "2012-12-03", "2012-12-04",
"2012-12-05", "2012-12-06", "2012-12-07", "2012-12-08", "2012-12-09",
"2012-12-10", "2012-12-11", "2012-12-12", "2012-12-13", "2012-12-14",
"2012-12-15", "2012-12-16", "2012-12-17", "2012-12-18", "2012-12-19",
"2012-12-20", "2012-12-21", "2012-12-22", "2012-12-23", "2012-12-24",
"2012-12-25", "2012-12-26", "2012-12-27", "2012-12-28", "2012-12-29"
), class = "factor"), retailer_code = c(11024, 11024, 11024,
11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024,
11024, 11024, 11024, 11024, 2353, 11024, 11024, 2353, 11024,
11024, 63882, 650, 89960, 89960, 650, 89960, 89960, 89960, 650
), Overall_Brand = structure(c(19L, 74L, 19L, 48L, 19L, 48L,
19L, 19L, 19L, 48L, 48L, 31L, 46L, 31L, 31L, 48L, 74L, 31L, 74L,
19L, 31L, 19L, 48L, 48L, 31L, 31L, 48L, 31L, 31L, 48L, 48L), .Label = c("ABUNDANCE",
"ALPEN", "AMERICAN BREAKFAST ", "ANNIE'S HOMEGROWN", "ARWHD MLS",
"BARBARA'S", "BEAR NAKED", "BEAR RIVER", "BOB'S RED MILL", "BOKOMO COUNTRY",
"BREAKFAST CHOICE", "BREAKFAST ZONE", "BROOKFARM MACADAMIA ",
"BRUGGEN", "BUCKEYE HEROES", "CADIA", "CASCADIAN FARM ", "CHOCOLATE SPOONERS",
"CTL BR", "DORSET", "ENJOY LIFE PERKY'S CRUNCHY FLX", "EREWHON",
"F-FACTOR", "FAMILIA", "FIELD DAY", "FINAX", "FLEURY FLAKES MARC ANDRE",
"FOOD FOR LIFE EZEKIEL 50", "FORRELLI ", "GEFEN KING", "GENERAL MILLS",
"GERONIMO PEYTON HILLIS REDZONE", "GLUCERNA", "GLUTINO", "GOLDEN FOODS",
"GRANVITA PUFFY'S", "GREENBRIER INT INC-NBL CRN FLK", "HEALTH VALLEY",
"HODGSON MILL", "HOME FAVORITE", "HOSTESS TOASTED OATS", "HSP",
"ISABEL'S WAY ", "JASPER", "JUSTIN VERLANDER'S FASTBLL FLK",
"KASHI", "KAY'S NATURALS BETTER BALANCE", "Kellogg", "KIND",
"KOZY SHACK READY GRAINS", "KRETSCHMER", "LADY LIBERTY", "LIEBER'S",
"LIVING INTENTIONS SPRFD CRL", "LOVE GROWN FOODS", "MAIZORO",
"MANISCHEWITZ", "MILL SELECT ", "MOTHER'S", "MULTIGRAIN SPNRS",
"NASH BROTHERS", "NATURE'S PATH", "NESTLE", "NEW ENGLAND NATURALS",
"NEWMAN'S OWN SWEET ENGH WHT PF", "NUTRISYSTEM NOURISH", "NUTRITIOUS LIVING",
"PAMPA ", "PASKESZ CHOCO RIOS", "PBLC LB BR-NBL HY NT TSD OT DM",
"PEACE", "POST", "PROTEIN PLUS CORN FLAKE", "QUAKER", "RALSTON",
"RLSTN", "SALLY'S ", "SCRUMPTIOUS SPELNDID CRLS CBB", "SEITENBACHER MUESLI",
"SIMPLY FIBER", "SKINNER'S RAISIN BRAN", "SORIANA WHOLE BRAN",
"STREIT'S FRUIT & NUT MUESLI", "SUNBELT", "SWAD", "T. ABRAHAM'S",
"TAANUG CORN FLAKES", "TASTY", "TEMMY'S", "THE", "THREE SISTERS",
"TIKISS SWEETENED PUFFD WHL WHT", "TRU ROOTS", "VOTTO'S", "WEETABIX",
"WHOLESOME GOODNESS", "WILD ROOTS ANCIENT ORIGINS", "WONDER CORN FLAKES",
"YOG ACTIVE"), class = "factor")), row.names = c(90609L, 222436L,
90606L, 688592L, 90607L, 688593L, 90605L, 90604L, 90608L, 668330L,
321377L, 567447L, 945385L, 567445L, 567443L, 892854L, 583186L,
567446L, 583185L, 168138L, 567444L, 60086L, 698120L, 698127L,
3933L, 809409L, 698129L, 15286L, 15284L, 698116L, 319455L), class = "data.frame")
I have a separate variable for Household, trip ID for a particular purchase instance, retailer, and Brand they purchased. One household may purchase more than 1 brand in 1 trip. I wanted to calculate the repeat purchases in 2 consecutive trips. For example, if a household purchase General Mills and Kellogg both in trip 1 and only General mill in 2nd trip, for general mills, it will be a repeat.
Example output:
Now, it is easy when I have only 1 purchase per trip. I do it by the following code using rle()
e1 = transform(e1, brand_last_dum = ave(as.character(Brand), rleid(Household, Brand), FUN = seq_along))
However, when there is more than 1 purchase, it doesn't work. Data is arranged by the purchase date and more than 1 purchase of the same brand in a trip maybe considered as a different purchase for calculating repeat in the next row. Please Help
2020-11-29 UPDATE: Adjusted code to count second+ purchases of same brand in same shopping trip as repeat purchases. Given the additional requirement I was able to eliminate one of the joins which had been previously added to treat all purchases of a given brand within a trip consistently.
library(dplyr)
data %>%
arrange(.,household_code,purchase_date,trip_code_uc) %>%
group_by(household_code,purchase_date) %>%
distinct(trip_code_uc) %>%
group_by(household_code) %>%
mutate(trip_seq = seq_along(household_code)) %>%
ungroup() %>%
left_join(data,.) %>%
arrange(household_code,Overall_Brand,trip_seq) %>%
mutate(repeat_purchase = if_else(household_code == lag(household_code) &
Overall_Brand == lag(Overall_Brand) &
(trip_seq == lag(trip_seq) + 1 | trip_seq == lag(trip_seq)),
TRUE,FALSE,missing=FALSE)) %>%
arrange(.,household_code,purchase_date,Overall_Brand) -> result
print(as.data.frame(result[,c(1,3,5,6,7)]))
...and the output:
> print(as.data.frame(result[,c(1,3,5,6,7)]))
household_code purchase_date Overall_Brand trip_seq repeat_purchase
1 76 2012-01-01 CTL BR 1 FALSE
2 76 2012-01-02 QUAKER 2 FALSE
3 76 2012-01-23 CTL BR 3 FALSE
4 76 2012-02-19 Kellogg 4 FALSE
5 76 2012-02-21 CTL BR 5 FALSE
6 76 2012-03-14 Kellogg 6 FALSE
7 76 2012-03-17 CTL BR 7 FALSE
8 76 2012-04-03 CTL BR 8 TRUE
9 76 2012-05-27 CTL BR 9 TRUE
10 76 2012-06-06 Kellogg 10 FALSE
11 76 2012-06-24 Kellogg 11 TRUE
12 76 2012-06-27 GENERAL MILLS 12 FALSE
13 76 2012-07-14 KASHI 13 FALSE
14 76 2012-07-29 GENERAL MILLS 14 FALSE
15 76 2012-08-11 GENERAL MILLS 15 TRUE
16 76 2012-09-02 Kellogg 16 FALSE
17 76 2012-09-27 QUAKER 17 FALSE
18 76 2012-10-12 GENERAL MILLS 18 FALSE
19 76 2012-10-12 QUAKER 18 TRUE
20 76 2012-11-04 CTL BR 19 FALSE
21 76 2012-11-24 GENERAL MILLS 20 FALSE
22 76 2012-12-11 CTL BR 21 FALSE
23 126 2012-01-02 Kellogg 1 FALSE
24 126 2012-01-09 Kellogg 2 TRUE
25 126 2012-02-15 GENERAL MILLS 3 FALSE
26 126 2012-02-15 GENERAL MILLS 3 TRUE
27 126 2012-02-19 Kellogg 4 FALSE
28 126 2012-02-27 GENERAL MILLS 5 FALSE
29 126 2012-03-06 GENERAL MILLS 6 TRUE
30 126 2012-03-06 Kellogg 6 FALSE
31 126 2012-03-08 Kellogg 7 TRUE
>
2020-11-28 UPDATE: After receiving the updated data that includes purchase_date
, we altered our solution to use this information to generate trip_seq
. We assume that if there are multiple trip_code_uc
values in a single day, the shopping experiences occur in ascending order of trip_code_uc
.
We use dplyr
to define distinct trip identifiers for each household and assign a trip sequence. We then use the trip sequence to evaluate whether the same brand was purchased in two consecutive shopping trips after merging it with the original data.
library(dplyr)
data %>%
arrange(.,household_code,purchase_date,trip_code_uc) %>%
group_by(household_code,purchase_date) %>%
distinct(trip_code_uc) %>%
group_by(household_code) %>%
mutate(trip_seq = seq_along(household_code)) %>%
ungroup() %>%
left_join(data,.) %>%
group_by(household_code,purchase_date,trip_seq) %>%
distinct(Overall_Brand) %>%
ungroup() %>%
arrange(household_code,Overall_Brand,purchase_date,trip_seq) %>%
mutate(repeat_purchase = if_else(household_code == lag(household_code) &
Overall_Brand == lag(Overall_Brand) &
trip_seq == lag(trip_seq) + 1,
TRUE,FALSE,missing=FALSE)) %>%
left_join(data,.) -> result
result <- arrange(result,household_code,purchase_date,Overall_Brand)
print(as.data.frame(result[,c(1,3,5,6,7)]))
...and the output:
> print(as.data.frame(result[,c(1,3,5,6,7)]))
household_code purchase_date Overall_Brand trip_seq repeat_purchase
1 76 2012-01-01 CTL BR 1 FALSE
2 76 2012-01-02 QUAKER 2 FALSE
3 76 2012-01-23 CTL BR 3 FALSE
4 76 2012-02-19 Kellogg 4 FALSE
5 76 2012-02-21 CTL BR 5 FALSE
6 76 2012-03-14 Kellogg 6 FALSE
7 76 2012-03-17 CTL BR 7 FALSE
8 76 2012-04-03 CTL BR 8 TRUE
9 76 2012-05-27 CTL BR 9 TRUE
10 76 2012-06-06 Kellogg 10 FALSE
11 76 2012-06-24 Kellogg 11 TRUE
12 76 2012-06-27 GENERAL MILLS 12 FALSE
13 76 2012-07-14 KASHI 13 FALSE
14 76 2012-07-29 GENERAL MILLS 14 FALSE
15 76 2012-08-11 GENERAL MILLS 15 TRUE
16 76 2012-09-02 Kellogg 16 FALSE
17 76 2012-09-27 QUAKER 17 FALSE
18 76 2012-10-12 GENERAL MILLS 18 FALSE
19 76 2012-10-12 QUAKER 18 TRUE
20 76 2012-11-04 CTL BR 19 FALSE
21 76 2012-11-24 GENERAL MILLS 20 FALSE
22 76 2012-12-11 CTL BR 21 FALSE
23 126 2012-01-02 Kellogg 1 FALSE
24 126 2012-01-09 Kellogg 2 TRUE
25 126 2012-02-15 GENERAL MILLS 3 FALSE
26 126 2012-02-15 GENERAL MILLS 3 FALSE
27 126 2012-02-19 Kellogg 4 FALSE
28 126 2012-02-27 GENERAL MILLS 5 FALSE
29 126 2012-03-06 GENERAL MILLS 6 TRUE
30 126 2012-03-06 Kellogg 6 FALSE
31 126 2012-03-08 Kellogg 7 TRUE
The sequence number is important because if we simply sort by Household
, Brand
and Trip ID
, we can't tell whether the next trip number is truly the "next" purchase, as illustrated by Household 126 purchases of the Kellog Brand, where it is purchased on trip sequences 1, 2, 4, 6, and 7. Only purchases on trips 2 and 7 should be counted as repeat purchases in consecutive trips, per the request in the OP.