Search code examples
associationsrulesinverse

Inverse Association Rules


Association rules are a very common technique when you want to figure out which events happens together (like burger and bread mostly sell together). In marketing this technique is used to find out the complimentary products.

I am looking for a technique to extract the "Substitute Products" and to be it is like Inverse Association rules to find out which events are less likely happens together. Is there any algorithm or technique available in Spark, R, Python, etc. for this?

Thanks, Amir


Solution

  • I've done an a very practical implementation for Substitution Rule Mining using Teng, Hsieh and Chen (2002) for R. Maybe it can help you:

    # Used packages:
    library(arules)
    
    
    SRM <- function(TransData, MinSup, MinConf, pMin, pChi, itemLabel, nTID){
    
    # Packages ----------------------------------------------------------------
    
    if (sum(search() %in% "package:arules") == 0) {
    stop("Please load package arules")
    }  
    
    # Checking Input data -----------------------------------------------------
     if (missing(TransData)) {
      stop("Transaction data is missing")
    }
    
    if (is.numeric(nTID) == FALSE) {
      stop("nTID has to be one numeric number for the count of      
     Transactions")
     }
    
      if (length(nTID) > 1) {
       stop("nTID has to be one number for the count of Transactions")
      }
    
      if (is.character(itemLabel) == FALSE) {
       stop("itemLabel has to be a character")
      }
      # Concrete Item sets  ---------------------------------------------------
    
      # adding complements to transaction data
      compl_trans <- addComplement(TransData,labels = itemLabel)
      compl_tab <- crossTable(compl_trans,"support")
      compl_tab_D <- as.data.frame(compl_tab)
      # ordering matrix
      compl_tab_D <-           compl_tab_D[order(rownames((compl_tab))),order(colnames((compl_tab)))]
    
    
      # Chi Value ---------------------------------------------------------------
    
    
      # empty data frame for loop
    
      complement_data <- data.frame(Chi = as.numeric(),
                               Sup_X.Y = as.numeric(),
                               X = as.character(),
                               Sup_X = as.numeric(),
                               Y = as.character(),
                               Sup_Y = as.numeric(),
                               CX = as.character(),
                               SupCX = as.numeric(),
                               CY = as.character(),
                               Sup_CY = as.numeric(),
                               Conf_X.CY = as.numeric(),
                               Sup_X.CY = as.numeric(),
                               Conf_Y.CX = as.numeric(),
                               SupY_CX = as.numeric())
    
    
    
      # first loop for one item
      for ( i in 1 : (length(itemLabel) - 1)) {
       # second loop combines it with all other items
       for (u in (i + 1) : length(itemLabel)) {
    
    
        # getting chi value from Teng
        a <-  itemLabel[i]
        b <-  itemLabel[u]
        ca <- paste0("!", itemLabel[i])
        cb <- paste0("!", itemLabel[u])
    
        chiValue <- nTID * (
         compl_tab[ca, cb] ^ 2 / (compl_tab[ca, ca] * compl_tab[cb, cb]) +
          compl_tab[ca, b] ^ 2 / (compl_tab[ca, ca] * compl_tab[b, b]) +
          compl_tab[a, cb] ^ 2 / (compl_tab[a, a] * compl_tab[cb, cb]) +
          compl_tab[a, b] ^ 2 / (compl_tab[a, a] * compl_tab[b, b]) - 1)
    
    
    
        # condition to be dependent
        if (compl_tab[a, b] > compl_tab[a, a] * compl_tab[b, b] &&      chiValue >= qchisq(pChi, 1) && 
            compl_tab[a, a] >= MinSup && compl_tab[b, b] >= MinSup ) {
    
    
    
         chi_sup <- data.frame(Chi = chiValue,
                          Sup_X.Y = compl_tab[a, b],
                          X = a,
                          Sup_X = compl_tab[a, a],
                          Y = b,
                          Sup_Y = compl_tab[b, b],
                          CX = ca,
                          SupCX = compl_tab[ca, ca],
                          CY = cb,
                          Sup_CY = compl_tab[cb, cb],
                          Conf_X.CY = compl_tab[a, cb] / compl_tab[a, a],
                          Sup_X.CY = compl_tab[a, cb],
                          Conf_Y.CX = compl_tab[ca, b] / compl_tab[b, b],
                          SupY_CX = compl_tab[ca, b])
    
    
         try(complement_data <- rbind(complement_data, chi_sup))
    
        }
    
    
       }
      }
      if (nrow(complement_data) == 0) {
       stop("No complement item sets could have been found")
      }
    
    
      #  changing mode of 
      complement_data$X <- as.character(complement_data$X)
      complement_data$Y <- as.character(complement_data$Y)
    
    
      # calculating support for concrete itemsets with all others and their complements -------------------
    
    
      ## with complements
      matrix_trans <- as.data.frame(as(compl_trans, "matrix"))
    
      sup_three <- data.frame(Items = as.character(),
                         Support = as.numeric()) 
    
    
      setCompl <- names(matrix_trans)
      # 1. extracts all other values than that are not in the itemset
      for (i in 1 : nrow(complement_data)) {
       value <- setCompl[ !setCompl %in% c(complement_data$X[i], 
                                      complement_data$Y[i], 
                                      paste0("!", complement_data$X[i]), 
                                      paste0("!",complement_data$Y[i]))]
    
    
       # 2. calculation of support
       for (u in value) {
        count <- sum(rowSums(matrix_trans[, c(complement_data$X[i],      complement_data$Y[i], u )]) == 3)
        sup <- count / nTID  
        sup_three_items <- data.frame(Items =      paste0(complement_data$X[i], complement_data$Y[i], u),
                                 Support=sup) 
        sup_three <- rbind(sup_three, sup_three_items)
       }
    
      }
    
      # Correlation of single items-------------------------------------------------------------
    
    
      # all items of concrete itemsets should be mixed for correlation
      combis <- unique(c(complement_data$X, complement_data$Y))
    
      # empty object
      rules<- data.frame(
       Substitute = as.character(),
       Product = as.character(),
       Support = as.numeric(),
       Confidence = as.numeric(),
       Correlation = as.numeric())
    
      # first loop for one item
      for (i in 1 : (length(combis) - 1)) {
       # second loop combines it with all other items
       for (u in (i + 1) : length(combis)) {
    
        first <- combis[i]
        second <- combis[u]
    
        corXY <- (compl_tab[first, second] - (compl_tab[first, first] *      compl_tab[second, second])) /
    (sqrt((compl_tab[first, first] * (1 - compl_tab[first,first])) *
           (compl_tab[second, second] * (1 - compl_tab[second, second]))))
    
    
        # confidence
        conf1 <- compl_tab[first, paste0("!", second)] / compl_tab[first, first]
        conf2 <- compl_tab[second, paste0("!", first)] / compl_tab[second, second]
    
        two_rules <- data.frame(
         Substitute = c(paste("{", first, "}"), 
                   paste("{", second, "}")),
         Product = c(paste("=>", "{", second, "}"),
                paste("=>", "{", first, "}")),
         Support = c(compl_tab[first, paste0("!", second)], compl_tab[second, paste0("!", first)]),
         Confidence = c(conf1, conf2),
         Correlation = c(corXY, corXY)
        )
    
        # conditions
        try({
         if (two_rules$Correlation[1] < pMin) {
          if (two_rules$Support[1] >= MinSup && two_rules$Confidence[1] >= MinConf) {
           rules <- rbind(rules, two_rules[1, ])
     }
          if (two_rules$Support[2] >= MinSup && two_rules$Confidence[2] >= MinConf) {
           rules <- rbind(rules, two_rules[2, ])
          }
    
         } })
    
       }
      }
    
    
      # Correlation of concrete item pairs with single items --------------------
      # adding variable for loop
      complement_data$XY <- paste0(complement_data$X, complement_data$Y)
    
      # combination of items
      for (i in 1 : nrow(complement_data)){
    
       # set of combinations from dependent items with single items
       univector <- c(as.vector(unique(complement_data$X)),      as.vector(unique(complement_data$Y)))
       univector <- univector[!univector %in% c(complement_data$X[i], complement_data$Y[i])]
    
       combis <- c(complement_data[i,"XY"], univector)
    
    
    
       for (u in 2 : length(combis)) {
        corXYZ <-(sup_three[sup_three$Items == paste0(combis[1], combis[u]),2] - 
                   complement_data[complement_data$XY == combis[1],"Sup_X.Y"] *
              compl_tab[combis[u],combis[u]]) /
    (sqrt((complement_data[complement_data$XY == combis[1],"Sup_X.Y"] * 
             (1 - complement_data[complement_data$XY == combis[1],"Sup_X.Y"]) *
            compl_tab[combis[u],combis[u]] * (1 - compl_tab[combis[u],combis[u]]))))
    
        dataXYZ <- data.frame(
    Substitute = paste("{", combis[1], "}"), 
    Product = paste("=>", "{", combis[u], "}"),
    Support = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2],
    Confidence = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2] /
     complement_data[complement_data$XY == combis[1],"Sup_X.Y"],
    Correlation = corXYZ)
    
    
        # conditions
        if (dataXYZ$Correlation < pMin && dataXYZ$Support >= MinSup && dataXYZ$Confidence >= MinConf) {
    
         try(rules <- rbind(rules, dataXYZ))
        }
       }
      }
      if (nrow(rules) == 0) {
       message("Sorry no rules could have been calculated. Maybe change input conditions.")
      }      else {
       return(rules)
      }
    
      # end
     }
    

    I think a better explanation is in my blog: http://mattimeyer.github.io/2016-12-21-Substitution-Rule-Mining/