Search code examples
rrepeatrle

To check if a value in a row is repeated between groups in R


I have a dataset containing purchases made by different households across different retailers. For eg

Example Dataset

Using dput()

structure(list(household_code = c(76, 76, 76, 76, 76, 76, 76, 
76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 126, 
126, 126, 126, 126, 126, 126, 126, 126), trip_code_uc = c(1032497498L, 
1025776063L, 1029419047L, 1030418100L, 1029502602L, 1034153056L, 
1027035051L, 1027533991L, 1033515804L, 1032998207L, 1032066227L, 
1028192785L, 1033419039L, 1028730296L, 1027388499L, 1030652869L, 
1025638394L, 1034032718L, 1034032718L, 1025678520L, 1029490031L, 
1029898838L, 1028024134L, 1030324171L, 1031983761L, 1031983761L, 
1033767148L, 1023953965L, 1030954113L, 1030954113L, 1027392968L
), purchase_date = structure(c(1L, 2L, 23L, 50L, 52L, 74L, 77L, 
94L, 148L, 158L, 176L, 179L, 196L, 211L, 224L, 246L, 271L, 286L, 
286L, 309L, 329L, 346L, 2L, 9L, 46L, 46L, 50L, 58L, 66L, 66L, 
68L), .Label = c("2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04", 
"2012-01-05", "2012-01-06", "2012-01-07", "2012-01-08", "2012-01-09", 
"2012-01-10", "2012-01-11", "2012-01-12", "2012-01-13", "2012-01-14", 
"2012-01-15", "2012-01-16", "2012-01-17", "2012-01-18", "2012-01-19", 
"2012-01-20", "2012-01-21", "2012-01-22", "2012-01-23", "2012-01-24", 
"2012-01-25", "2012-01-26", "2012-01-27", "2012-01-28", "2012-01-29", 
"2012-01-30", "2012-01-31", "2012-02-01", "2012-02-02", "2012-02-03", 
"2012-02-04", "2012-02-05", "2012-02-06", "2012-02-07", "2012-02-08", 
"2012-02-09", "2012-02-10", "2012-02-11", "2012-02-12", "2012-02-13", 
"2012-02-14", "2012-02-15", "2012-02-16", "2012-02-17", "2012-02-18", 
"2012-02-19", "2012-02-20", "2012-02-21", "2012-02-22", "2012-02-23", 
"2012-02-24", "2012-02-25", "2012-02-26", "2012-02-27", "2012-02-28", 
"2012-02-29", "2012-03-01", "2012-03-02", "2012-03-03", "2012-03-04", 
"2012-03-05", "2012-03-06", "2012-03-07", "2012-03-08", "2012-03-09", 
"2012-03-10", "2012-03-11", "2012-03-12", "2012-03-13", "2012-03-14", 
"2012-03-15", "2012-03-16", "2012-03-17", "2012-03-18", "2012-03-19", 
"2012-03-20", "2012-03-21", "2012-03-22", "2012-03-23", "2012-03-24", 
"2012-03-25", "2012-03-26", "2012-03-27", "2012-03-28", "2012-03-29", 
"2012-03-30", "2012-03-31", "2012-04-01", "2012-04-02", "2012-04-03", 
"2012-04-04", "2012-04-05", "2012-04-06", "2012-04-07", "2012-04-08", 
"2012-04-09", "2012-04-10", "2012-04-11", "2012-04-12", "2012-04-13", 
"2012-04-14", "2012-04-15", "2012-04-16", "2012-04-17", "2012-04-18", 
"2012-04-19", "2012-04-20", "2012-04-21", "2012-04-22", "2012-04-23", 
"2012-04-24", "2012-04-25", "2012-04-26", "2012-04-27", "2012-04-28", 
"2012-04-29", "2012-04-30", "2012-05-01", "2012-05-02", "2012-05-03", 
"2012-05-04", "2012-05-05", "2012-05-06", "2012-05-07", "2012-05-08", 
"2012-05-09", "2012-05-10", "2012-05-11", "2012-05-12", "2012-05-13", 
"2012-05-14", "2012-05-15", "2012-05-16", "2012-05-17", "2012-05-18", 
"2012-05-19", "2012-05-20", "2012-05-21", "2012-05-22", "2012-05-23", 
"2012-05-24", "2012-05-25", "2012-05-26", "2012-05-27", "2012-05-28", 
"2012-05-29", "2012-05-30", "2012-05-31", "2012-06-01", "2012-06-02", 
"2012-06-03", "2012-06-04", "2012-06-05", "2012-06-06", "2012-06-07", 
"2012-06-08", "2012-06-09", "2012-06-10", "2012-06-11", "2012-06-12", 
"2012-06-13", "2012-06-14", "2012-06-15", "2012-06-16", "2012-06-17", 
"2012-06-18", "2012-06-19", "2012-06-20", "2012-06-21", "2012-06-22", 
"2012-06-23", "2012-06-24", "2012-06-25", "2012-06-26", "2012-06-27", 
"2012-06-28", "2012-06-29", "2012-06-30", "2012-07-01", "2012-07-02", 
"2012-07-03", "2012-07-04", "2012-07-05", "2012-07-06", "2012-07-07", 
"2012-07-08", "2012-07-09", "2012-07-10", "2012-07-11", "2012-07-12", 
"2012-07-13", "2012-07-14", "2012-07-15", "2012-07-16", "2012-07-17", 
"2012-07-18", "2012-07-19", "2012-07-20", "2012-07-21", "2012-07-22", 
"2012-07-23", "2012-07-24", "2012-07-25", "2012-07-26", "2012-07-27", 
"2012-07-28", "2012-07-29", "2012-07-30", "2012-07-31", "2012-08-01", 
"2012-08-02", "2012-08-03", "2012-08-04", "2012-08-05", "2012-08-06", 
"2012-08-07", "2012-08-08", "2012-08-09", "2012-08-10", "2012-08-11", 
"2012-08-12", "2012-08-13", "2012-08-14", "2012-08-15", "2012-08-16", 
"2012-08-17", "2012-08-18", "2012-08-19", "2012-08-20", "2012-08-21", 
"2012-08-22", "2012-08-23", "2012-08-24", "2012-08-25", "2012-08-26", 
"2012-08-27", "2012-08-28", "2012-08-29", "2012-08-30", "2012-08-31", 
"2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04", "2012-09-05", 
"2012-09-06", "2012-09-07", "2012-09-08", "2012-09-09", "2012-09-10", 
"2012-09-11", "2012-09-12", "2012-09-13", "2012-09-14", "2012-09-15", 
"2012-09-16", "2012-09-17", "2012-09-18", "2012-09-19", "2012-09-20", 
"2012-09-21", "2012-09-22", "2012-09-23", "2012-09-24", "2012-09-25", 
"2012-09-26", "2012-09-27", "2012-09-28", "2012-09-29", "2012-09-30", 
"2012-10-01", "2012-10-02", "2012-10-03", "2012-10-04", "2012-10-05", 
"2012-10-06", "2012-10-07", "2012-10-08", "2012-10-09", "2012-10-10", 
"2012-10-11", "2012-10-12", "2012-10-13", "2012-10-14", "2012-10-15", 
"2012-10-16", "2012-10-17", "2012-10-18", "2012-10-19", "2012-10-20", 
"2012-10-21", "2012-10-22", "2012-10-23", "2012-10-24", "2012-10-25", 
"2012-10-26", "2012-10-27", "2012-10-28", "2012-10-29", "2012-10-30", 
"2012-10-31", "2012-11-01", "2012-11-02", "2012-11-03", "2012-11-04", 
"2012-11-05", "2012-11-06", "2012-11-07", "2012-11-08", "2012-11-09", 
"2012-11-10", "2012-11-11", "2012-11-12", "2012-11-13", "2012-11-14", 
"2012-11-15", "2012-11-16", "2012-11-17", "2012-11-18", "2012-11-19", 
"2012-11-20", "2012-11-21", "2012-11-22", "2012-11-23", "2012-11-24", 
"2012-11-25", "2012-11-26", "2012-11-27", "2012-11-28", "2012-11-29", 
"2012-11-30", "2012-12-01", "2012-12-02", "2012-12-03", "2012-12-04", 
"2012-12-05", "2012-12-06", "2012-12-07", "2012-12-08", "2012-12-09", 
"2012-12-10", "2012-12-11", "2012-12-12", "2012-12-13", "2012-12-14", 
"2012-12-15", "2012-12-16", "2012-12-17", "2012-12-18", "2012-12-19", 
"2012-12-20", "2012-12-21", "2012-12-22", "2012-12-23", "2012-12-24", 
"2012-12-25", "2012-12-26", "2012-12-27", "2012-12-28", "2012-12-29"
), class = "factor"), retailer_code = c(11024, 11024, 11024, 
11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 
11024, 11024, 11024, 11024, 2353, 11024, 11024, 2353, 11024, 
11024, 63882, 650, 89960, 89960, 650, 89960, 89960, 89960, 650
), Overall_Brand = structure(c(19L, 74L, 19L, 48L, 19L, 48L, 
19L, 19L, 19L, 48L, 48L, 31L, 46L, 31L, 31L, 48L, 74L, 31L, 74L, 
19L, 31L, 19L, 48L, 48L, 31L, 31L, 48L, 31L, 31L, 48L, 48L), .Label = c("ABUNDANCE", 
"ALPEN", "AMERICAN BREAKFAST ", "ANNIE'S HOMEGROWN", "ARWHD MLS", 
"BARBARA'S", "BEAR NAKED", "BEAR RIVER", "BOB'S RED MILL", "BOKOMO COUNTRY", 
"BREAKFAST CHOICE", "BREAKFAST ZONE", "BROOKFARM MACADAMIA ", 
"BRUGGEN", "BUCKEYE HEROES", "CADIA", "CASCADIAN FARM ", "CHOCOLATE SPOONERS", 
"CTL BR", "DORSET", "ENJOY LIFE PERKY'S CRUNCHY FLX", "EREWHON", 
"F-FACTOR", "FAMILIA", "FIELD DAY", "FINAX", "FLEURY FLAKES MARC ANDRE", 
"FOOD FOR LIFE EZEKIEL 50", "FORRELLI ", "GEFEN KING", "GENERAL MILLS", 
"GERONIMO PEYTON HILLIS REDZONE", "GLUCERNA", "GLUTINO", "GOLDEN FOODS", 
"GRANVITA PUFFY'S", "GREENBRIER INT INC-NBL CRN FLK", "HEALTH VALLEY", 
"HODGSON MILL", "HOME FAVORITE", "HOSTESS TOASTED OATS", "HSP", 
"ISABEL'S WAY ", "JASPER", "JUSTIN VERLANDER'S FASTBLL FLK", 
"KASHI", "KAY'S NATURALS BETTER BALANCE", "Kellogg", "KIND", 
"KOZY SHACK READY GRAINS", "KRETSCHMER", "LADY LIBERTY", "LIEBER'S", 
"LIVING INTENTIONS SPRFD CRL", "LOVE GROWN FOODS", "MAIZORO", 
"MANISCHEWITZ", "MILL SELECT ", "MOTHER'S", "MULTIGRAIN SPNRS", 
"NASH BROTHERS", "NATURE'S PATH", "NESTLE", "NEW ENGLAND NATURALS", 
"NEWMAN'S OWN SWEET ENGH WHT PF", "NUTRISYSTEM NOURISH", "NUTRITIOUS LIVING", 
"PAMPA ", "PASKESZ CHOCO RIOS", "PBLC LB BR-NBL HY NT TSD OT DM", 
"PEACE", "POST", "PROTEIN PLUS CORN FLAKE", "QUAKER", "RALSTON", 
"RLSTN", "SALLY'S ", "SCRUMPTIOUS SPELNDID CRLS CBB", "SEITENBACHER MUESLI", 
"SIMPLY FIBER", "SKINNER'S RAISIN BRAN", "SORIANA WHOLE BRAN", 
"STREIT'S FRUIT & NUT MUESLI", "SUNBELT", "SWAD", "T. ABRAHAM'S", 
"TAANUG CORN FLAKES", "TASTY", "TEMMY'S", "THE", "THREE SISTERS", 
"TIKISS SWEETENED PUFFD WHL WHT", "TRU ROOTS", "VOTTO'S", "WEETABIX", 
"WHOLESOME GOODNESS", "WILD ROOTS ANCIENT ORIGINS", "WONDER CORN FLAKES", 
"YOG ACTIVE"), class = "factor")), row.names = c(90609L, 222436L, 
90606L, 688592L, 90607L, 688593L, 90605L, 90604L, 90608L, 668330L, 
321377L, 567447L, 945385L, 567445L, 567443L, 892854L, 583186L, 
567446L, 583185L, 168138L, 567444L, 60086L, 698120L, 698127L, 
3933L, 809409L, 698129L, 15286L, 15284L, 698116L, 319455L), class = "data.frame")

I have a separate variable for Household, trip ID for a particular purchase instance, retailer, and Brand they purchased. One household may purchase more than 1 brand in 1 trip. I wanted to calculate the repeat purchases in 2 consecutive trips. For example, if a household purchase General Mills and Kellogg both in trip 1 and only General mill in 2nd trip, for general mills, it will be a repeat.

Example output:

Example Output

Now, it is easy when I have only 1 purchase per trip. I do it by the following code using rle()

e1 = transform(e1, brand_last_dum = ave(as.character(Brand), rleid(Household, Brand), FUN = seq_along))

However, when there is more than 1 purchase, it doesn't work. Data is arranged by the purchase date and more than 1 purchase of the same brand in a trip maybe considered as a different purchase for calculating repeat in the next row. Please Help


Solution

  • 2020-11-29 UPDATE: Adjusted code to count second+ purchases of same brand in same shopping trip as repeat purchases. Given the additional requirement I was able to eliminate one of the joins which had been previously added to treat all purchases of a given brand within a trip consistently.

    library(dplyr)
    data %>%
            arrange(.,household_code,purchase_date,trip_code_uc) %>% 
            group_by(household_code,purchase_date) %>%
            distinct(trip_code_uc) %>% 
            group_by(household_code) %>% 
            mutate(trip_seq = seq_along(household_code)) %>% 
            ungroup() %>% 
            left_join(data,.) %>%
            arrange(household_code,Overall_Brand,trip_seq) %>%
            mutate(repeat_purchase = if_else(household_code == lag(household_code) &
                                                     Overall_Brand == lag(Overall_Brand) & 
                                                     (trip_seq == lag(trip_seq) + 1 | trip_seq == lag(trip_seq)),
                                             TRUE,FALSE,missing=FALSE)) %>%
            arrange(.,household_code,purchase_date,Overall_Brand) -> result
    
    print(as.data.frame(result[,c(1,3,5,6,7)]))
    

    ...and the output:

    > print(as.data.frame(result[,c(1,3,5,6,7)]))
       household_code purchase_date Overall_Brand trip_seq repeat_purchase
    1              76    2012-01-01        CTL BR        1           FALSE
    2              76    2012-01-02        QUAKER        2           FALSE
    3              76    2012-01-23        CTL BR        3           FALSE
    4              76    2012-02-19       Kellogg        4           FALSE
    5              76    2012-02-21        CTL BR        5           FALSE
    6              76    2012-03-14       Kellogg        6           FALSE
    7              76    2012-03-17        CTL BR        7           FALSE
    8              76    2012-04-03        CTL BR        8            TRUE
    9              76    2012-05-27        CTL BR        9            TRUE
    10             76    2012-06-06       Kellogg       10           FALSE
    11             76    2012-06-24       Kellogg       11            TRUE
    12             76    2012-06-27 GENERAL MILLS       12           FALSE
    13             76    2012-07-14         KASHI       13           FALSE
    14             76    2012-07-29 GENERAL MILLS       14           FALSE
    15             76    2012-08-11 GENERAL MILLS       15            TRUE
    16             76    2012-09-02       Kellogg       16           FALSE
    17             76    2012-09-27        QUAKER       17           FALSE
    18             76    2012-10-12 GENERAL MILLS       18           FALSE
    19             76    2012-10-12        QUAKER       18            TRUE
    20             76    2012-11-04        CTL BR       19           FALSE
    21             76    2012-11-24 GENERAL MILLS       20           FALSE
    22             76    2012-12-11        CTL BR       21           FALSE
    23            126    2012-01-02       Kellogg        1           FALSE
    24            126    2012-01-09       Kellogg        2            TRUE
    25            126    2012-02-15 GENERAL MILLS        3           FALSE
    26            126    2012-02-15 GENERAL MILLS        3            TRUE
    27            126    2012-02-19       Kellogg        4           FALSE
    28            126    2012-02-27 GENERAL MILLS        5           FALSE
    29            126    2012-03-06 GENERAL MILLS        6            TRUE
    30            126    2012-03-06       Kellogg        6           FALSE
    31            126    2012-03-08       Kellogg        7            TRUE
    > 
    

    Prior version included below to maintain relevance of comments

    2020-11-28 UPDATE: After receiving the updated data that includes purchase_date, we altered our solution to use this information to generate trip_seq. We assume that if there are multiple trip_code_uc values in a single day, the shopping experiences occur in ascending order of trip_code_uc.

    We use dplyr to define distinct trip identifiers for each household and assign a trip sequence. We then use the trip sequence to evaluate whether the same brand was purchased in two consecutive shopping trips after merging it with the original data.

    library(dplyr)
    data %>%
         arrange(.,household_code,purchase_date,trip_code_uc) %>% 
         group_by(household_code,purchase_date) %>%
         distinct(trip_code_uc) %>% 
         group_by(household_code) %>% 
         mutate(trip_seq = seq_along(household_code)) %>% 
         ungroup() %>% 
         left_join(data,.) %>% 
         group_by(household_code,purchase_date,trip_seq) %>%
         distinct(Overall_Brand) %>%
         ungroup() %>%
         arrange(household_code,Overall_Brand,purchase_date,trip_seq) %>%
         mutate(repeat_purchase = if_else(household_code == lag(household_code) &
                                               Overall_Brand == lag(Overall_Brand) & 
                                               trip_seq == lag(trip_seq) + 1,
                                          TRUE,FALSE,missing=FALSE)) %>%
         left_join(data,.) -> result
    result <- arrange(result,household_code,purchase_date,Overall_Brand)
    print(as.data.frame(result[,c(1,3,5,6,7)]))
    

    ...and the output:

    > print(as.data.frame(result[,c(1,3,5,6,7)]))
       household_code purchase_date Overall_Brand trip_seq repeat_purchase
    1              76    2012-01-01        CTL BR        1           FALSE
    2              76    2012-01-02        QUAKER        2           FALSE
    3              76    2012-01-23        CTL BR        3           FALSE
    4              76    2012-02-19       Kellogg        4           FALSE
    5              76    2012-02-21        CTL BR        5           FALSE
    6              76    2012-03-14       Kellogg        6           FALSE
    7              76    2012-03-17        CTL BR        7           FALSE
    8              76    2012-04-03        CTL BR        8            TRUE
    9              76    2012-05-27        CTL BR        9            TRUE
    10             76    2012-06-06       Kellogg       10           FALSE
    11             76    2012-06-24       Kellogg       11            TRUE
    12             76    2012-06-27 GENERAL MILLS       12           FALSE
    13             76    2012-07-14         KASHI       13           FALSE
    14             76    2012-07-29 GENERAL MILLS       14           FALSE
    15             76    2012-08-11 GENERAL MILLS       15            TRUE
    16             76    2012-09-02       Kellogg       16           FALSE
    17             76    2012-09-27        QUAKER       17           FALSE
    18             76    2012-10-12 GENERAL MILLS       18           FALSE
    19             76    2012-10-12        QUAKER       18            TRUE
    20             76    2012-11-04        CTL BR       19           FALSE
    21             76    2012-11-24 GENERAL MILLS       20           FALSE
    22             76    2012-12-11        CTL BR       21           FALSE
    23            126    2012-01-02       Kellogg        1           FALSE
    24            126    2012-01-09       Kellogg        2            TRUE
    25            126    2012-02-15 GENERAL MILLS        3           FALSE
    26            126    2012-02-15 GENERAL MILLS        3           FALSE
    27            126    2012-02-19       Kellogg        4           FALSE
    28            126    2012-02-27 GENERAL MILLS        5           FALSE
    29            126    2012-03-06 GENERAL MILLS        6            TRUE
    30            126    2012-03-06       Kellogg        6           FALSE
    31            126    2012-03-08       Kellogg        7            TRUE
    

    The sequence number is important because if we simply sort by Household, Brand and Trip ID, we can't tell whether the next trip number is truly the "next" purchase, as illustrated by Household 126 purchases of the Kellog Brand, where it is purchased on trip sequences 1, 2, 4, 6, and 7. Only purchases on trips 2 and 7 should be counted as repeat purchases in consecutive trips, per the request in the OP.