I have a data frame which has 20 columns/items in it, and 593 rows (number of rows doesn't matter though) as shown below:
Using this the reliability of test is obtained as 0.94, with the help of alpha from psych package psych::alpha
. The output also gives me the the new value of cronbach's alpha if I drop one of the items. However, I want to know how many items can I drop to retain an alpha of at least 0.8 I used a brute force approach for the purpose where I am creating the combination of all the items that exists in my data frame and check if their alpha is in the range (0.7,0.9). Is there a better way of doing this, as this is taking forever to run because number of items is too large to check for all the combination of items. Below is my current piece of code:
numberOfItems <- 20
for(i in 2:(2^numberOfItems)-1){
# ignoring the first case i.e. i=1, as it doesn't represent any model
# convert the value of i to binary, e.g. i=5 will give combination = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
# using the binaryLogic package
combination <- as.binary(i, n=numberOfItems)
model <- c()
for(j in 1:length(combination)){
# choose which columns to consider depending on the combination
if(combination[j])
model <- c(model, j)
}
itemsToUse <- itemResponses[, c(model)]
#cat(model)
if(length(model) > 13){
alphaVal <- psych::alpha(itemsToUse)$total$raw_alpha
if(alphaVal > 0.7 && alphaVal < 0.9){
cat(alphaVal)
print(model)
}
}
}
A sample output from this code is as follows:
0.8989831 1 4 5 7 8 9 10 11 13 14 15 16 17 19 20
0.899768 1 4 5 7 8 9 10 11 12 13 15 17 18 19 20
0.899937 1 4 5 7 8 9 10 11 12 13 15 16 17 19 20
0.8980605 1 4 5 7 8 9 10 11 12 13 14 15 17 19 20
Here are the first 10 rows of data:
dput(itemResponses) structure(list(CESD1 = c(1, 2, 2, 0, 1, 0, 0, 0, 0, 1), CESD2 = c(2, 3, 1, 0, 0, 1, 1, 1, 0, 1), CESD3 = c(0, 3, 0, 1, 1, 0, 0, 0, 0, 0), CESD4 = c(1, 2, 0, 1, 0, 1, 1, 1, 0, 0), CESD5 = c(0, 1, 0, 2, 1, 2, 2, 0, 0, 0), CESD6 = c(0, 3, 0, 1, 0, 0, 2, 0, 0, 0), CESD7 = c(1, 2, 1, 1, 2, 0, 1, 0, 1, 0), CESD8 = c(1, 3, 1, 1, 0, 1, 0, 0, 1, 0), CESD9 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD10 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD11 = c(0, 2, 1, 1, 1, 1, 2, 3, 0, 0), CESD12 = c(0, 3, 1, 1, 1, 0, 2, 0, 0, 0), CESD13 = c(0, 3, 0, 2, 1, 2, 1, 0, 1, 0), CESD14 = c(0, 3, 1, 2, 1, 1, 1, 0, 1, 1), CESD15 = c(0, 2, 0, 1, 0, 1, 0, 1, 1, 0), CESD16 = c(0, 2, 2, 0, 0, 1, 1, 0, 0, 0), CESD17 = c(0, 0, 0, 0, 0, 1, 1, 0, 0, 0), CESD18 = c(0, 2, 0, 0, 0, 0, 0, 0, 0, 1), CESD19 = c(0, 3, 0, 0, 0, 0, 0, 1, 1, 0), CESD20 = c(0, 3, 0, 1, 0, 0, 0, 0, 0, 0)), .Names = c("CESD1", "CESD2", "CESD3", "CESD4", "CESD5", "CESD6", "CESD7", "CESD8", "CESD9", "CESD10", "CESD11", "CESD12", "CESD13", "CESD14", "CESD15", "CESD16", "CESD17", "CESD18", "CESD19", "CESD20"), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
The idea is to replace the computation of alpha
with the so-called discrimination for each item from classical test theory (CTT). The discrimination is the correlation of the item score with a "true score" (which we would assume to be the row sum).
Let the data be
dat <- structure(list(CESD1 = c(1, 2, 2, 0, 1, 0, 0, 0, 0, 1), CESD2 = c(2, 3, 1, 0, 0, 1, 1, 1, 0, 1),
CESD3 = c(0, 3, 0, 1, 1, 0, 0, 0, 0, 0), CESD4 = c(1, 2, 0, 1, 0, 1, 1, 1, 0, 0),
CESD5 = c(0, 1, 0, 2, 1, 2, 2, 0, 0, 0), CESD6 = c(0, 3, 0, 1, 0, 0, 2, 0, 0, 0),
CESD7 = c(1, 2, 1, 1, 2, 0, 1, 0, 1, 0), CESD8 = c(1, 3, 1, 1, 0, 1, 0, 0, 1, 0),
CESD9 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD10 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1),
CESD11 = c(0, 2, 1, 1, 1, 1, 2, 3, 0, 0), CESD12 = c(0, 3, 1, 1, 1, 0, 2, 0, 0, 0),
CESD13 = c(0, 3, 0, 2, 1, 2, 1, 0, 1, 0), CESD14 = c(0, 3, 1, 2, 1, 1, 1, 0, 1, 1),
CESD15 = c(0, 2, 0, 1, 0, 1, 0, 1, 1, 0), CESD16 = c(0, 2, 2, 0, 0, 1, 1, 0, 0, 0),
CESD17 = c(0, 0, 0, 0, 0, 1, 1, 0, 0, 0), CESD18 = c(0, 2, 0, 0, 0, 0, 0, 0, 0, 1),
CESD19 = c(0, 3, 0, 0, 0, 0, 0, 1, 1, 0), CESD20 = c(0, 3, 0, 1, 0, 0, 0, 0, 0, 0)),
.Names = c("CESD1", "CESD2", "CESD3", "CESD4", "CESD5", "CESD6", "CESD7", "CESD8", "CESD9",
"CESD10", "CESD11", "CESD12", "CESD13", "CESD14", "CESD15", "CESD16", "CESD17",
"CESD18", "CESD19", "CESD20"), row.names = c(NA, -10L),
class = c("tbl_df", "tbl", "data.frame"))
We compute (1) the discrimination and (2) the alpha coefficient.
stat <- t(sapply(1:ncol(dat), function(ii){
dd <- dat[, ii]
# discrimination is the correlation of the item to the rowsum
disc <- if(var(dd, na.rm = TRUE) > 0) cor(dd, rowSums(dat[, -ii]), use = "pairwise")
# alpha that would be obtained when we skip this item
alpha <- psych::alpha(dat[, -ii])$total$raw_alpha
c(disc, alpha)
}))
dimnames(stat) <- list(colnames(dat), c("disc", "alpha^I"))
stat <- data.frame(stat)
Observe that the discrimination (which is more efficient to compute) is inversely proportional to alpha
that is obtained when deleting this item. In other words, alpha
is highest when there are many high "discriminating" items (that correlate with each other).
plot(stat, pch = 19)
Use this information to select the sequence with which the items should be deleted to fall below a benchmark (say .9, since the toy data doesn't allow for a lower mark):
1) delete as many items as possible to stay above the benchmark; that is, start with the least discriminating items.
stat <- stat[order(stat$disc), ]
this <- sapply(1:(nrow(stat)-2), function(ii){
ind <- match(rownames(stat)[1:ii], colnames(dat))
alpha <- psych::alpha(dat[, -ind, drop = FALSE])$total$raw_alpha
})
delete_these <- rownames(stat)[which(this > .9)]
psych::alpha(dat[, -match(delete_these, colnames(dat)), drop = FALSE])$total$raw_alpha
length(delete_these)
2) delete as few items as possible to stay above the benchmark; that is, start with the highest discriminating items.
stat <- stat[order(stat$disc, decreasing = TRUE), ]
this <- sapply(1:(nrow(stat)-2), function(ii){
ind <- match(rownames(stat)[1:ii], colnames(dat))
alpha <- psych::alpha(dat[, -ind, drop = FALSE])$total$raw_alpha
})
delete_these <- rownames(stat)[which(this > .9)]
psych::alpha(dat[, -match(delete_these, colnames(dat)), drop = FALSE])$total$raw_alpha
length(delete_these)
Note, that 1) is coherent with classical item selection procedures in (psychological/educational) diagnostic/assessments: remove items from the assessment, that fall below a benchmark in terms of discriminatory power.