Association rules are a very common technique when you want to figure out which events happens together (like burger and bread mostly sell together). In marketing this technique is used to find out the complimentary products.
I am looking for a technique to extract the "Substitute Products" and to be it is like Inverse Association rules to find out which events are less likely happens together. Is there any algorithm or technique available in Spark, R, Python, etc. for this?
Thanks, Amir
I've done an a very practical implementation for Substitution Rule Mining using Teng, Hsieh and Chen (2002) for R. Maybe it can help you:
# Used packages:
library(arules)
SRM <- function(TransData, MinSup, MinConf, pMin, pChi, itemLabel, nTID){
# Packages ----------------------------------------------------------------
if (sum(search() %in% "package:arules") == 0) {
stop("Please load package arules")
}
# Checking Input data -----------------------------------------------------
if (missing(TransData)) {
stop("Transaction data is missing")
}
if (is.numeric(nTID) == FALSE) {
stop("nTID has to be one numeric number for the count of
Transactions")
}
if (length(nTID) > 1) {
stop("nTID has to be one number for the count of Transactions")
}
if (is.character(itemLabel) == FALSE) {
stop("itemLabel has to be a character")
}
# Concrete Item sets ---------------------------------------------------
# adding complements to transaction data
compl_trans <- addComplement(TransData,labels = itemLabel)
compl_tab <- crossTable(compl_trans,"support")
compl_tab_D <- as.data.frame(compl_tab)
# ordering matrix
compl_tab_D <- compl_tab_D[order(rownames((compl_tab))),order(colnames((compl_tab)))]
# Chi Value ---------------------------------------------------------------
# empty data frame for loop
complement_data <- data.frame(Chi = as.numeric(),
Sup_X.Y = as.numeric(),
X = as.character(),
Sup_X = as.numeric(),
Y = as.character(),
Sup_Y = as.numeric(),
CX = as.character(),
SupCX = as.numeric(),
CY = as.character(),
Sup_CY = as.numeric(),
Conf_X.CY = as.numeric(),
Sup_X.CY = as.numeric(),
Conf_Y.CX = as.numeric(),
SupY_CX = as.numeric())
# first loop for one item
for ( i in 1 : (length(itemLabel) - 1)) {
# second loop combines it with all other items
for (u in (i + 1) : length(itemLabel)) {
# getting chi value from Teng
a <- itemLabel[i]
b <- itemLabel[u]
ca <- paste0("!", itemLabel[i])
cb <- paste0("!", itemLabel[u])
chiValue <- nTID * (
compl_tab[ca, cb] ^ 2 / (compl_tab[ca, ca] * compl_tab[cb, cb]) +
compl_tab[ca, b] ^ 2 / (compl_tab[ca, ca] * compl_tab[b, b]) +
compl_tab[a, cb] ^ 2 / (compl_tab[a, a] * compl_tab[cb, cb]) +
compl_tab[a, b] ^ 2 / (compl_tab[a, a] * compl_tab[b, b]) - 1)
# condition to be dependent
if (compl_tab[a, b] > compl_tab[a, a] * compl_tab[b, b] && chiValue >= qchisq(pChi, 1) &&
compl_tab[a, a] >= MinSup && compl_tab[b, b] >= MinSup ) {
chi_sup <- data.frame(Chi = chiValue,
Sup_X.Y = compl_tab[a, b],
X = a,
Sup_X = compl_tab[a, a],
Y = b,
Sup_Y = compl_tab[b, b],
CX = ca,
SupCX = compl_tab[ca, ca],
CY = cb,
Sup_CY = compl_tab[cb, cb],
Conf_X.CY = compl_tab[a, cb] / compl_tab[a, a],
Sup_X.CY = compl_tab[a, cb],
Conf_Y.CX = compl_tab[ca, b] / compl_tab[b, b],
SupY_CX = compl_tab[ca, b])
try(complement_data <- rbind(complement_data, chi_sup))
}
}
}
if (nrow(complement_data) == 0) {
stop("No complement item sets could have been found")
}
# changing mode of
complement_data$X <- as.character(complement_data$X)
complement_data$Y <- as.character(complement_data$Y)
# calculating support for concrete itemsets with all others and their complements -------------------
## with complements
matrix_trans <- as.data.frame(as(compl_trans, "matrix"))
sup_three <- data.frame(Items = as.character(),
Support = as.numeric())
setCompl <- names(matrix_trans)
# 1. extracts all other values than that are not in the itemset
for (i in 1 : nrow(complement_data)) {
value <- setCompl[ !setCompl %in% c(complement_data$X[i],
complement_data$Y[i],
paste0("!", complement_data$X[i]),
paste0("!",complement_data$Y[i]))]
# 2. calculation of support
for (u in value) {
count <- sum(rowSums(matrix_trans[, c(complement_data$X[i], complement_data$Y[i], u )]) == 3)
sup <- count / nTID
sup_three_items <- data.frame(Items = paste0(complement_data$X[i], complement_data$Y[i], u),
Support=sup)
sup_three <- rbind(sup_three, sup_three_items)
}
}
# Correlation of single items-------------------------------------------------------------
# all items of concrete itemsets should be mixed for correlation
combis <- unique(c(complement_data$X, complement_data$Y))
# empty object
rules<- data.frame(
Substitute = as.character(),
Product = as.character(),
Support = as.numeric(),
Confidence = as.numeric(),
Correlation = as.numeric())
# first loop for one item
for (i in 1 : (length(combis) - 1)) {
# second loop combines it with all other items
for (u in (i + 1) : length(combis)) {
first <- combis[i]
second <- combis[u]
corXY <- (compl_tab[first, second] - (compl_tab[first, first] * compl_tab[second, second])) /
(sqrt((compl_tab[first, first] * (1 - compl_tab[first,first])) *
(compl_tab[second, second] * (1 - compl_tab[second, second]))))
# confidence
conf1 <- compl_tab[first, paste0("!", second)] / compl_tab[first, first]
conf2 <- compl_tab[second, paste0("!", first)] / compl_tab[second, second]
two_rules <- data.frame(
Substitute = c(paste("{", first, "}"),
paste("{", second, "}")),
Product = c(paste("=>", "{", second, "}"),
paste("=>", "{", first, "}")),
Support = c(compl_tab[first, paste0("!", second)], compl_tab[second, paste0("!", first)]),
Confidence = c(conf1, conf2),
Correlation = c(corXY, corXY)
)
# conditions
try({
if (two_rules$Correlation[1] < pMin) {
if (two_rules$Support[1] >= MinSup && two_rules$Confidence[1] >= MinConf) {
rules <- rbind(rules, two_rules[1, ])
}
if (two_rules$Support[2] >= MinSup && two_rules$Confidence[2] >= MinConf) {
rules <- rbind(rules, two_rules[2, ])
}
} })
}
}
# Correlation of concrete item pairs with single items --------------------
# adding variable for loop
complement_data$XY <- paste0(complement_data$X, complement_data$Y)
# combination of items
for (i in 1 : nrow(complement_data)){
# set of combinations from dependent items with single items
univector <- c(as.vector(unique(complement_data$X)), as.vector(unique(complement_data$Y)))
univector <- univector[!univector %in% c(complement_data$X[i], complement_data$Y[i])]
combis <- c(complement_data[i,"XY"], univector)
for (u in 2 : length(combis)) {
corXYZ <-(sup_three[sup_three$Items == paste0(combis[1], combis[u]),2] -
complement_data[complement_data$XY == combis[1],"Sup_X.Y"] *
compl_tab[combis[u],combis[u]]) /
(sqrt((complement_data[complement_data$XY == combis[1],"Sup_X.Y"] *
(1 - complement_data[complement_data$XY == combis[1],"Sup_X.Y"]) *
compl_tab[combis[u],combis[u]] * (1 - compl_tab[combis[u],combis[u]]))))
dataXYZ <- data.frame(
Substitute = paste("{", combis[1], "}"),
Product = paste("=>", "{", combis[u], "}"),
Support = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2],
Confidence = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2] /
complement_data[complement_data$XY == combis[1],"Sup_X.Y"],
Correlation = corXYZ)
# conditions
if (dataXYZ$Correlation < pMin && dataXYZ$Support >= MinSup && dataXYZ$Confidence >= MinConf) {
try(rules <- rbind(rules, dataXYZ))
}
}
}
if (nrow(rules) == 0) {
message("Sorry no rules could have been calculated. Maybe change input conditions.")
} else {
return(rules)
}
# end
}
I think a better explanation is in my blog: http://mattimeyer.github.io/2016-12-21-Substitution-Rule-Mining/