Search code examples
rdatatabledplyrmanipulate

How to get the most common combination across multiple columns


I have the following dataset, where I have a 1 to indicate a food type was put out and a 0 to indicate it wasn't. I know that the most common number of food types put out is 2, but I'd love to know what combination is the most common.

Here is a sample of the dataset

structure(list(Type_SunflowerSeeds = c(1L, 1L, 1L, 0L, 0L), Type_SafflowerSeeds = c(0L, 
0L, 0L, 0L, 0L), Type_Nyjer = c(0L, 0L, 0L, 0L, 0L), Type_EconMix = c(1L, 
1L, 0L, 1L, 1L), Type_PremMix = c(0L, 0L, 0L, 0L, 0L), Type_Grains = c(0L, 
0L, 0L, 0L, 0L), Type_Nuts = c(0L, 0L, 1L, 1L, 0L), Type_Suet = c(1L, 
0L, 0L, 0L, 0L), Type_SugarWater = c(1L, 0L, 0L, 0L, 1L), Type_FruitOrJams = c(0L, 
0L, 0L, 1L, 0L), Type_Mealworms = c(0L, 0L, 0L, 0L, 0L), Type_Corn = c(0L, 
0L, 0L, 0L, 0L), Type_BarkOrPeanutButter = c(0L, 0L, 0L, 0L, 
0L), Type_Scraps = c(1L, 1L, 1L, 1L, 0L), Type_Bread = c(0L, 
0L, 0L, 0L, 0L), Type_Other = c(0L, 0L, 0L, 0L, 0L), total = c(5, 
3, 3, 4, 2)), row.names = c(NA, 5L), class = "data.frame")

I would like to know the most common pairwise combination as well as the most common three-way combination.

So the output would look like this for pairwise and similar for the three-way combo:

                 type1           type2 number_of_times
1     Type_SugarWater    Type_EconMix             351
2 Type_SunflowerSeeds Type_SugarWater             335

Solution

  • If I'm understanding your question correctly and you're looking for how frequent two of the same categories are 1 in the same row (e.g. pairwise like @M-- asked), here's how I've done it in the past. I'm sure there's a more graceful way of going about it though :D

    library(dplyr)
    library(tidyr)
    
    test.df <- structure(list(Type_SunflowerSeeds = c(1L, 1L, 1L, 0L, 0L, 1L, 
    1L, 0L, 0L, 0L), Type_SafflowerSeeds = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L), Type_Nyjer = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), Type_EconMix = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 
    0L, 0L), Type_PremMix = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    1L), Type_Grains = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), 
        Type_Nuts = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Suet = c(1L, 
        0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Type_SugarWater = c(1L, 
        0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L), Type_FruitOrJams = c(0L, 
        0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Mealworms = c(0L, 
        0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Corn = c(0L, 0L, 
        0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Type_BarkOrPeanutButter = c(0L, 
        0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Scraps = c(1L, 
        1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Bread = c(0L, 0L, 
        0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Other = c(0L, 0L, 0L, 
        0L, 0L, 0L, 0L, 0L, 0L, 0L), total = c(5, 3, 3, 4, 2, 3, 
        3, 1, 1, 2)), row.names = c(NA, 10L), class = "data.frame")
    
    test.df %>%
      mutate(food.id = 1:n()) %>%
      gather(key = "type1", value = "val", -food.id, -total) %>% #create an ID column for each row
      filter(val==1) %>% 
      select(food.id, type1) %>% #now we have a data.frame with one column for food.id and 
    # one column for every food.type it is associated with
      left_join( # this left join is essentially doing the same thing we did before
        test.df %>%
          mutate(food.id = 1:n()) %>%
          gather(key = "type2", value = "val", -food.id, -total) %>%
          filter(val==1) %>%
          select(food.id, type2),
        by = c("food.id") # now we're matching each food with all of its associated types
      ) %>%
      mutate(type1.n = as.numeric(factor(type1)), # quick way of making sure we're not counting duplicates 
    # (e.g. if type1 = Type_SunflowerSeeds and type2 = Type_SafflowerSeeds, that's the same if they were switched)
             type2.n = as.numeric(factor(type2))) %>%
      filter(type1 > type2) %>% # this filter step takes care of the flip flopping issue
      group_by(type1, type2) %>%
      summarise( #finally, count the combinations/pairwise values
        n.times = n()
      ) %>%
      ungroup() %>%
      arrange(desc(n.times), type1, type2)
    

    With the output of:

       type1               type2            n.times
       <chr>               <chr>              <int>
     1 Type_Scraps         Type_EconMix           3
     2 Type_SugarWater     Type_EconMix           3
     3 Type_SunflowerSeeds Type_EconMix           3
     4 Type_SunflowerSeeds Type_Scraps            3
     5 Type_SunflowerSeeds Type_SugarWater        3
     6 Type_Scraps         Type_Nuts              2
     7 Type_SugarWater     Type_Suet              2
     8 Type_SunflowerSeeds Type_Suet              2
     9 Type_FruitOrJams    Type_EconMix           1
    10 Type_Nuts           Type_EconMix           1
    11 Type_Nuts           Type_FruitOrJams       1
    12 Type_Scraps         Type_FruitOrJams       1
    13 Type_Suet           Type_EconMix           1
    14 Type_Suet           Type_Scraps            1
    15 Type_SugarWater     Type_PremMix           1
    16 Type_SugarWater     Type_Scraps            1
    17 Type_SunflowerSeeds Type_Nuts              1
    

    To extend this and do a three-way combination count, you can follow this code. I've also added some additional comments to walk-through what's going on:

    # create a baseline data.frame with food.id and every food type that it matches
    food.type.long.df <- test.df %>%
      mutate(food.id = 1:n()) %>%
      gather(key = "type1", value = "val", -food.id, -total) %>%
      filter(val==1) %>%
      select(food.id, type1) %>%
      arrange(food.id)
    
    # join the baseline data.frame to itself to see all possible combinations of food types
    # note: this includes repeated types like type1=Type_Corn and type2=Type_Corn
    # this also includes rows where the types are simply flip-flopped types 
    # ex. Row 2 is type1=Type_SunflowerSeeds    and type2 = Type_EconMix 
    # but Row 6 is type1=Type_EconMix   and type2 = Type_SunflowerSeeds - we don't want to count this combinations twice
    food.2types.df <- food.type.long.df %>%
      left_join(
        select(food.type.long.df, food.id, type2 = type1),
        by = "food.id"
      ) %>%
      arrange(food.id)
    
    # let's add the third type as well; as with before, the same issues are in this df but we'll fix the duplicates
    # and flip flops later
    food.3types.df <- food.2types.df %>%
      left_join(
        select(food.type.long.df, food.id, type3 = type1),
        by = "food.id"
      ) %>%
      arrange(food.id)
    
    food.3types.df.fixed <- food.3types.df %>%
      distinct() %>%
      mutate(type1.n = as.numeric(factor(type1)), # assign each type1 a number (in alphabetical order)
             type2.n = as.numeric(factor(type2)), # assign each type2 a number (in alphabetical order)
             type3.n = as.numeric(factor(type3))) %>%  # assign each type3 a number (in alphabetical order)
      filter(type1 > type2) %>% # to remove duplicates and flip-flopped rows for types 1 and 2, use a strict inequality
      filter(type2 > type3) # to remove duplicates and flip-flopped rows for types 2 and 3, use a strict inequality
    
    food.3type.combination.count <- food.3types.df.fixed %>%
      group_by(type1, type2, type3) %>% # group by all three types you want to count
      summarise(
        n.times = n()
      ) %>%
      ungroup() %>%
      arrange(desc(n.times), type1, type2, type3) 
    
    

    With the output:

       type1               type2            type3            n.times
       <chr>               <chr>            <chr>              <int>
     1 Type_SunflowerSeeds Type_Scraps      Type_EconMix           2
     2 Type_SunflowerSeeds Type_SugarWater  Type_EconMix           2
     3 Type_SunflowerSeeds Type_SugarWater  Type_Suet              2
     4 Type_Nuts           Type_FruitOrJams Type_EconMix           1
     5 Type_Scraps         Type_FruitOrJams Type_EconMix           1
     6 Type_Scraps         Type_Nuts        Type_EconMix           1
     7 Type_Scraps         Type_Nuts        Type_FruitOrJams       1
     8 Type_Suet           Type_Scraps      Type_EconMix           1
     9 Type_SugarWater     Type_Scraps      Type_EconMix           1
    10 Type_SugarWater     Type_Suet        Type_EconMix           1
    11 Type_SugarWater     Type_Suet        Type_Scraps            1
    12 Type_SunflowerSeeds Type_Scraps      Type_Nuts              1
    13 Type_SunflowerSeeds Type_Suet        Type_EconMix           1
    14 Type_SunflowerSeeds Type_Suet        Type_Scraps            1
    15 Type_SunflowerSeeds Type_SugarWater  Type_Scraps            1