Consider this code:
# Load libraries
library(RCurl)
library(TraMineR)
library(PST)
# Get data ----
# Download the example CSV as one character string.
csv_url <- paste0(
  "https://gist.githubusercontent.com/aronlindberg/",
  "08228977353bf6dc2edb3ec121f54a29/raw/",
  "c2539d06771317c5f4c8d3a2052a73fc485a09c6/challenge_level.csv"
)
x <- getURL(csv_url)

# Load and transform data ----
# Parse the downloaded text directly so the example is reproducible. The
# previous version parsed the download and then immediately overwrote the
# result with a read of a local file ("thread_level.csv").
# NOTE(review): this assumes the gist is the intended data set; to use a
# local copy instead, replace `text = x` with the file path.
# header = FALSE keeps the header line as row 1; it is dropped below.
data <- read.table(
  text = x,
  sep = ",",
  header = FALSE,
  stringsAsFactors = FALSE
)

# Create sequence object ----
# Drop the first row and the first column before building the state
# sequence object; "*" is the label used for missing states.
data.seq <- seqdef(
  data[-1, -1, drop = FALSE],
  missing = NA,
  right = NA,
  nr = "*"
)

# Make a tree ----
S1 <- pstree(data.seq, ymin = 0.05, L = 6, lik = FALSE, with.missing = TRUE)

# Look at contexts ----
cmine(S1, pmin = 0, state = "N3", l = 3)
I can then calculate the significance thresholds for lift values for two particular "association rules" in the following manner:
# Lift significance thresholds for the rules N2-QU -> N3 and N2-QU -> N1.
#
# For a rule context -> outcome the threshold computed here is
#   1 + 1.645 * sqrt((1 / n) *
#     ((1 - p_context) * (1 - p_outcome)) / (p_context * p_outcome))
# where 1.645 is approximately the 95th percentile of the standard normal.

# Quantities shared by both rules ----
# Number of cells in the sequence object (rows x columns).
nn <- nrow(data.seq) * ncol(data.seq)
# Probability of the context sequence N2-QU as predicted by the tree S1.
x <- seqdef("N2-QU")
p_context <- predict(S1, x, decomp = FALSE, output = "prob")

# Calculate lift threshold for N2-QU->N3 ----
ngood_idea <- sum(data.seq == "N3")
p_good_idea <- ngood_idea / nn
p_not_context_good_idea <- (1 - p_context) * (1 - p_good_idea)
p_context_good_idea <- p_context * p_good_idea
N2_QU_N3_threshold <- 1 + 1.645 *
  sqrt((1 / nn) * (p_not_context_good_idea / p_context_good_idea))

# Calculate lift threshold for N2-QU->N1 ----
nbad_idea <- sum(data.seq == "N1")
p_bad_idea <- nbad_idea / nn
p_not_context_bad_idea <- (1 - p_context) * (1 - p_bad_idea)
p_context_bad_idea <- p_context * p_bad_idea
N2_QU_N1_threshold <- 1 + 1.645 *
  sqrt((1 / nn) * (p_not_context_bad_idea / p_context_bad_idea))

# Print lift thresholds ----
N2_QU_N3_threshold
N2_QU_N1_threshold
However, what if I want to compare two lift values with each other, to test whether they differ significantly (similar to how two regression coefficients can be tested for a significant difference)? How can I accomplish this?
Utilizing this equation:
$Z = \frac{\beta_1 - \beta_2}{\sqrt{(SE_{\beta_1})^2 + (SE_{\beta_2})^2}}$
where $SE_{\beta}$ is the standard error of $\beta$.
This equation is provided by Clogg et al (1995)
By analogy, we can treat the two lifts as the coefficients and compute the variance of each lift following Lenca et al. (2008, p. 619):
# z-score for the difference between the lifts of N2-QU -> I3 and
# N2-QU -> I1: (lift_1 - lift_2) / sqrt(var_1 + var_2).

# Conditional probabilities given the context ----
# Query the tree once and reuse the result for both states.
context_probs <- query(S1, context = "N2-QU", output = "prob")@.Data
state_names <- attr(context_probs, "dimnames")[[2]]
# The logical mask over the state names indexes the probability matrix
# linearly; presumably the result has a single row for a single context,
# so this picks the matching state's probability -- TODO confirm.
# Calculate conditional probability for I3
cp_good <- unlist(context_probs[state_names == "I3"])
# Calculate conditional probability for I1
cp_bad <- unlist(context_probs[state_names == "I1"])

# Lifts ----
# Number of cells in the sequence object (rows x columns).
nn <- nrow(data.seq) * ncol(data.seq)
# Calculate lift for I3
ngood_idea <- sum(data.seq == "I3")
p_good_idea <- ngood_idea / nn
good_lift <- cp_good / p_good_idea
# Calculate lift for I1
nbad_idea <- sum(data.seq == "I1")
p_bad_idea <- nbad_idea / nn
bad_lift <- cp_bad / p_bad_idea

# Calculate z_diff ----
# Build the context sequence here rather than relying on an `x` left over
# from earlier code (where `x` also held the raw downloaded CSV text).
context_seq <- seqdef("N2-QU")
p_context <- predict(S1, context_seq, decomp = FALSE, output = "prob")
p_not_context_good_idea <- (1 - p_context) * (1 - p_good_idea)
p_context_good_idea <- p_context * p_good_idea
p_not_context_bad_idea <- (1 - p_context) * (1 - p_bad_idea)
p_context_bad_idea <- p_context * p_bad_idea
var_good_idea <- (1 / nn) * (p_not_context_good_idea / p_context_good_idea)
var_bad_idea <- (1 / nn) * (p_not_context_bad_idea / p_context_bad_idea)
z_diff <- (good_lift - bad_lift) / sqrt(var_good_idea + var_bad_idea)
z_diff
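The z-value of the difference is 0.2556881. Since this is well below the two-sided 5% critical value of 1.96, the two lifts are not significantly different from each other at that level.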
Clogg, C. C., Petkova, E., & Haritou, A. (1995). Statistical methods for comparing regression coefficients between models. American Journal of Sociology, 100(5), 1261-1293.
Lenca, P., Meyer, P., Vaillant, B., and Lallich, S. 2008. “On selecting interestingness measures for association rules: User oriented description and multiple criteria decision aid,” European Journal of Operational Research (184:2), pp. 610–626 (doi: 10.1016/j.ejor.2006.10.059).