
Creating a Similarity Matrix from Raw Card-Sort Data


I have a data set from an online card sorting activity. Participants were presented with a random subset of Cards (from a larger set) and asked to create Groups of Cards they felt were similar to one another. Participants were able to create as many Groups as they liked and name the Groups whatever they wanted.

An example data set is something like this:

Data <- structure(list(Subject = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L), Card = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 2L, 3L, 5L, 7L, 9L, 10L, 11L, 12L, 13L, 14L, 
1L, 3L, 4L, 5L, 6L, 7L, 8L, 12L, 13L, 14L), .Label = c("A", "B", 
"C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N"), class = "factor"), 
    Group = structure(c(1L, 2L, 3L, 4L, 1L, 3L, 3L, 5L, 2L, 5L, 
    1L, 2L, 1L, 3L, 1L, 4L, 4L, 2L, 3L, 1L, 1L, 2L, 1L, 2L, 3L, 
    2L, 1L, 2L, 2L, 3L), .Label = c("Cat1", "Cat2", "Cat3", "Cat4", 
    "Cat5"), class = "factor")), .Names = c("Subject", "Card", 
"Group"), class = "data.frame", row.names = c(NA, -30L))

From these data I'd like to create a similarity matrix, ideally showing the proportion or percentage of times each pair of items was grouped together.

Something like this:

Count:

    A   B   C   D   E   F   G   H   I   J   K   L   M   N
A       0   0   1   1   0   0   1   0   0   0   0   0   0
B   0       0   0   1   0   0   0   2   0   0   0   0   1
C   0   0       0   1   1   2   0   0   0   0   2   1   0
D   1   0   0       0   0   0   1   0   0   0   0   0   0
E   1   1   1   0       0   1   0   1   0   0   1   1   1
F   0   0   1   0   0       1   0   0   0   0   0   0   1
G   0   0   2   0   1   1       0   0   0   0   1   2   0
H   1   0   0   1   0   0   0       0   1   0   0   0   0
I   0   2   0   0   1   0   0   0       0   0   0   0   1
J   0   0   0   0   0   0   0   1   0       1   0   0   0
K   0   0   0   0   0   0   0   0   0   1       0   0   0
L   0   0   2   0   1   0   1   0   0   0   0       1   0
M   0   0   1   0   1   0   2   0   0   0   0   1       0
N   0   1   0   0   1   1   0   0   1   0   0   0   0   

Every subject named their Groups differently, so it's not possible to index by Group.

In addition to counts, I'd also like to generate a similarity matrix that reports the percentage of participants who, when presented with a particular pair of Cards, grouped those two Cards together.
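
For example, Cards C and G were presented together to all three participants, but only Participants 1 and 3 grouped them together, so the C-G cell should be 2/3 ≈ 67%.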

From the example data set, the result would be:

    A   B   C   D   E   F   G   H   I   J   K   L   M   N
A       0   0   50  50  0   0   50  0   0   0   0   0   0
B   0       0   0   50  0   0   0   100 0   0   0   0   100
C   0   0       0   33  50  67  0   0   0   0   100 50  0
D   50  0   0       0   0   0   50  0   0   0   0   0   0
E   50  50  33  0       0   33  0   50  0   0   50  50  50
F   0   0   50  0   0       50  0   0   0   0   0   0   100
G   0   0   67  0   33  50      0   0   0   0   50  100 0
H   50  0   0   50  0   0   0       0   100 0   0   0   0
I   0   100 0   0   50  0   0   0       0   0   0   0   100
J   0   0   0   0   0   0   0   100 0       100 0   0   0
K   0   0   0   0   0   0   0   0   0   100     0   0   0
L   0   0   100 0   50  0   50  0   0   0   0       50  0
M   0   0   50  0   50  0   100 0   0   0   0   50      0
N   0   100 0   0   50  100 0   0   100 0   0   0   0   

Any suggestions would be greatly appreciated!

Edit: While the answer below works for the example data, it doesn't seem to work for my actual data, posted here: https://www.dropbox.com/s/mhqwyok0nmvt3g9/Sim_Example.csv?dl=0

For example, in those data I manually count 22 pairings of "Aircraft" and "Airport", which would be ~55%, but the answer below yields a count of 12 and 60%.


Solution

  • Edited solution based on OP's requirement clarification

Step 1. Process the data to create card pairs & record whether each pair was grouped together by each subject:

    library(tidyverse); library(data.table)
    
    Data.matrix <- Data %>%
    
      # convert data into a list of data frames, one per subject
      split(Data$Subject) %>%
    
      # for each subject, create all pair combinations from the subset of cards they 
      # received, & note down whether they placed the pair in the same group 
      # (assume INTERNAL group naming consistency, i.e. if subject 1 uses group names 
      # such as "cat", "dog", "rat", they are all spelled exactly so, & we don't worry 
      # about variations / typos such as "cat1.5", "dgo", etc.)
      lapply(function(x){
        # combn() on a factor returns the underlying integer codes, so convert the
        # cards to character first; compute the pairs once rather than four times
        pairs <- t(combn(as.character(x$Card), 2))
        data.frame(V1 = pairs[, 1],
                   V2 = pairs[, 2],
                   G1 = x$Group[match(pairs[, 1], x$Card)],
                   G2 = x$Group[match(pairs[, 2], x$Card)],
                   stringsAsFactors = FALSE) %>%
          mutate(co.occurrence = 1,
                 same.group = G1 == G2) %>%
          select(-G1, -G2)}) %>%
    
      # combine the list of data frames back into one, now that group names no longer 
      # matter, & calculate the proportion of times each pair was assigned the same 
      # group, based on the total number of times the pair occurred together in any 
      # subject's subset
      rbindlist() %>%
      rowwise() %>%
      mutate(V1.sorted = min(V1, V2),
             V2.sorted = max(V1, V2)) %>%
      ungroup() %>%
      group_by(V1.sorted, V2.sorted) %>%
      summarise(co.occurrence = sum(co.occurrence),
                same.group = sum(same.group)) %>%
      ungroup() %>%
      rename(V1 = V1.sorted, V2 = V2.sorted) %>%
      mutate(same.group.perc = same.group/co.occurrence * 100) %>%
    
      # V1 currently only ranges over A:M & V2 over B:N, so complete all combinations
      mutate(V1 = factor(V1, levels = sort(unique(Data$Card))),
             V2 = factor(V2, levels = sort(unique(Data$Card)))) %>%
      complete(V1, V2)
    
    > Data.matrix
    # A tibble: 196 x 5
           V1     V2 co.occurrence same.group same.group.perc
       <fctr> <fctr>         <dbl>      <int>           <dbl>
     1      A      A            NA         NA              NA
     2      A      B             1          0               0
     3      A      C             2          0               0
     4      A      D             2          1              50
     5      A      E             2          1              50
     6      A      F             2          0               0
     7      A      G             2          0               0
     8      A      H             2          1              50
     9      A      I             1          0               0
    10      A      J             1          0               0
    # ... with 186 more rows
    
    # same.group is the number of times a card pair has been grouped together.
    # same.group.perc is the percentage of users who grouped the card pair together.
    
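    As a quick sanity check on the long-form result: Cards B & I appear together in 
    Subjects 1 & 2's subsets and are grouped together both times, so their row should 
    show co.occurrence 2, same.group 2, & same.group.perc 100:
    
    Data.matrix %>% filter(V1 == "B", V2 == "I")
    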

    Step 2. Create separate matrices for count & percentage:

    # spread count / percentage respectively into wide form
    
    Data.count <- Data.matrix %>%
      select(V1, V2, same.group) %>%
      spread(V2, same.group, fill = 0) %>%
      remove_rownames() %>%
      column_to_rownames("V1") %>%
      as.matrix()
    
    Data.perc <- Data.matrix %>%
      select(V1, V2, same.group.perc) %>%
      spread(V2, same.group.perc, fill = 0) %>%
      remove_rownames() %>%
      column_to_rownames("V1") %>%
      as.matrix()
    
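    At this point only the upper triangle of each matrix is populated, because every 
    pair was sorted alphabetically in Step 1; the mirrored cells still hold the 
    spread() fill value of 0. For example:
    
    Data.count["A", "E"]   # 1
    Data.count["E", "A"]   # 0, until the matrix is made symmetric in Step 3
    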

    Step 3. Convert the upper triangular matrices into symmetric matrices (note: a shorter & neater alternative is included as the second option below):

    # fill up the lower triangle to create symmetric matrices
    Data.count[lower.tri(Data.count)] <- t(Data.count)[lower.tri(t(Data.count))]
    Data.perc[lower.tri(Data.perc)] <- t(Data.perc)[lower.tri(t(Data.perc))]
    
    # ALTERNATE to the previous step: the lower triangle is all zeros after 
    # spread(), so an elementwise max against the transpose mirrors the upper 
    # triangle across the diagonal
    Data.count <- pmax(Data.count, t(Data.count))
    Data.perc <- pmax(Data.perc, t(Data.perc))
    
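    Either way, both matrices should now be symmetric:
    
    isSymmetric(Data.count)   # TRUE
    isSymmetric(Data.perc)    # TRUE
    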

    Step 4. Get rid of the diagonals since there's no point pairing a card with itself:

    # convert diagonals to NA since you don't really need them
    diag(Data.count) <- NA
    diag(Data.perc) <- NA
    

    Step 5. Verify the results:

    > Data.count
       A  B  C  D  E  F  G  H  I  J  K  L  M  N
    A NA  0  0  1  1  0  0  1  0  0  0  0  0  0
    B  0 NA  0  0  1  0  0  0  2  0  0  0  0  1
    C  0  0 NA  0  1  1  2  0  0  0  0  2  1  0
    D  1  0  0 NA  0  0  0  1  0  0  0  0  0  0
    E  1  1  1  0 NA  0  1  0  1  0  0  1  1  1
    F  0  0  1  0  0 NA  1  0  0  0  0  0  0  1
    G  0  0  2  0  1  1 NA  0  0  0  0  1  2  0
    H  1  0  0  1  0  0  0 NA  0  1  0  0  0  0
    I  0  2  0  0  1  0  0  0 NA  0  0  0  0  1
    J  0  0  0  0  0  0  0  1  0 NA  1  0  0  0
    K  0  0  0  0  0  0  0  0  0  1 NA  0  0  0
    L  0  0  2  0  1  0  1  0  0  0  0 NA  1  0
    M  0  0  1  0  1  0  2  0  0  0  0  1 NA  0
    N  0  1  0  0  1  1  0  0  1  0  0  0  0 NA
    
    > Data.perc
       A   B   C  D  E   F   G   H   I   J   K   L   M   N
    A NA   0   0 50 50   0   0  50   0   0   0   0   0   0
    B  0  NA   0  0 50   0   0   0 100   0   0   0   0 100
    C  0   0  NA  0 33  50  67   0   0   0   0 100  50   0
    D 50   0   0 NA  0   0   0  50   0   0   0   0   0   0
    E 50  50  33  0 NA   0  33   0  50   0   0  50  50  50
    F  0   0  50  0  0  NA  50   0   0   0   0   0   0 100
    G  0   0  67  0 33  50  NA   0   0   0   0  50 100   0
    H 50   0   0 50  0   0   0  NA   0 100   0   0   0   0
    I  0 100   0  0 50   0   0   0  NA   0   0   0   0 100
    J  0   0   0  0  0   0   0 100   0  NA 100   0   0   0
    K  0   0   0  0  0   0   0   0   0 100  NA   0   0   0
    L  0   0 100  0 50   0  50   0   0   0   0  NA  50   0
    M  0   0  50  0 50   0 100   0   0   0   0  50  NA   0
    N  0 100   0  0 50 100   0   0 100   0   0   0   0  NA
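    
    As a final cross-check, here's a compact base-R sketch (using xtabs() & 
    crossprod(), so it's an alternative to, not part of, the steps above): 
    crossprod() on 0/1 incidence matrices reproduces the same pair counts directly, 
    since each off-diagonal entry counts the rows in which both cards appear.
    
    # 0/1 incidence matrices
    occ <- xtabs(~ Subject + Card, Data)                 # subject x card
    grp <- xtabs(~ paste(Subject, Group) + Card, Data)   # (subject, group) x card
    
    co.occurrence <- crossprod(occ)   # times each card pair appeared in the same subset
    same.group    <- crossprod(grp)   # times each card pair was placed in the same group
    
    perc <- same.group / co.occurrence * 100   # NaN (rather than 0) where a pair never co-occurred
    diag(perc) <- NA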