Search code examples
ralgorithmmatrixigraphsubmatrix

From a pairwise matrix, find the largest group of individuals that equal a certain value


I have a pairwise relatedness 39x39 matrix containing the relatedness values for all pairwise combinations of 39 individuals. I would like to find the largest group of individuals that are completely unrelated, i.e. where all pairwise relatedness values i the group equal 0.

Is there an easy way to do this in R?

A simpler example:

set.seed(420)

#Create the matrix
relatedness.matrix <- matrix(data = sample(x = c(0.5, 1, 0,0), size = 25, replace = TRUE), nrow = 5, ncol = 5)

# Matrix has the same upper and lower triangles
relatedness.matrix[upper.tri(relatedness.matrix)] <- relatedness.matrix[lower.tri(relatedness.matrix)]

# Add names for simplicity of reference
colnames(relatedness.matrix) <- letters[1:5]
rownames(relatedness.matrix) <- letters[1:5]

# Relatedness between the same individual does not count
diag(relatedness.matrix) <- NA

In this case there are three possible solutions: a 2x2 matrix with only e and b, a 2x2 matrix with only c and d, and a 2x2 matrix with only a and e. Adding any other individuals to any of those matrices would be adding a related individual.

EDIT: added that upper and lower triangles are the same, and that there's actually multiple 2x2 solutions in this example.


Solution

  • The set of unrelated individuals in a symmetric adjacency matrix describes an independent set. We can use igraph::largest_ivs to find the largest such sets.

    I'll use a larger matrix that is actually symmetric.

    set.seed(420)
    
    m <- matrix(sample(0:2, 26^2, 1), 26, 26, 0, rep(list(letters), 2))
    m[lower.tri(m)] <- t(m)[lower.tri(m)]
    diag(m) <- NA
    

    Check that the matrix is symmetric

    isSymmetric(m)
    #> [1] TRUE
    
    m
    #>    a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z
    #> a NA  0  1  2  1  2  0  2  2  0  1  2  0  1  2  2  0  0  0  0  0  2  2  2  1  1
    #> b  0 NA  0  1  2  2  0  2  0  2  0  2  2  2  1  2  2  1  1  0  1  2  1  2  0  1
    #> c  1  0 NA  0  1  0  2  1  2  1  0  1  0  1  2  2  2  2  1  2  2  0  2  0  1  0
    #> d  2  1  0 NA  2  2  2  2  2  2  1  1  0  1  2  1  2  2  1  2  1  0  1  0  2  1
    #> e  1  2  1  2 NA  2  1  0  1  0  1  0  0  0  1  2  0  2  0  2  2  1  2  1  2  2
    #> f  2  2  0  2  2 NA  2  2  2  1  1  2  1  2  0  2  0  2  2  0  1  1  0  2  2  2
    #> g  0  0  2  2  1  2 NA  0  2  1  2  2  2  2  0  1  2  0  2  1  0  0  1  1  2  1
    #> h  2  2  1  2  0  2  0 NA  2  2  1  0  2  2  1  0  1  1  1  1  2  1  1  1  1  2
    #> i  2  0  2  2  1  2  2  2 NA  1  2  1  0  2  2  0  2  2  2  0  2  0  0  0  0  2
    #> j  0  2  1  2  0  1  1  2  1 NA  1  1  2  2  0  0  1  1  2  2  2  1  0  0  2  2
    #> k  1  0  0  1  1  1  2  1  2  1 NA  2  2  1  0  0  2  0  2  0  0  1  1  1  1  2
    #> l  2  2  1  1  0  2  2  0  1  1  2 NA  1  1  2  0  2  2  1  2  1  0  0  2  1  1
    #> m  0  2  0  0  0  1  2  2  0  2  2  1 NA  0  2  2  0  2  1  1  1  1  0  2  1  1
    #> n  1  2  1  1  0  2  2  2  2  2  1  1  0 NA  1  0  1  2  1  2  0  1  0  1  1  2
    #> o  2  1  2  2  1  0  0  1  2  0  0  2  2  1 NA  2  2  0  1  2  1  2  2  1  1  0
    #> p  2  2  2  1  2  2  1  0  0  0  0  0  2  0  2 NA  2  2  2  1  0  2  0  0  1  2
    #> q  0  2  2  2  0  0  2  1  2  1  2  2  0  1  2  2 NA  1  0  1  2  2  1  0  1  1
    #> r  0  1  2  2  2  2  0  1  2  1  0  2  2  2  0  2  1 NA  1  1  2  1  2  2  2  1
    #> s  0  1  1  1  0  2  2  1  2  2  2  1  1  1  1  2  0  1 NA  2  1  1  2  1  1  1
    #> t  0  0  2  2  2  0  1  1  0  2  0  2  1  2  2  1  1  1  2 NA  0  0  1  2  2  0
    #> u  0  1  2  1  2  1  0  2  2  2  0  1  1  0  1  0  2  2  1  0 NA  2  2  0  2  0
    #> v  2  2  0  0  1  1  0  1  0  1  1  0  1  1  2  2  2  1  1  0  2 NA  2  0  1  1
    #> w  2  1  2  1  2  0  1  1  0  0  1  0  0  0  2  0  1  2  2  1  2  2 NA  0  2  0
    #> x  2  2  0  0  1  2  1  1  0  0  1  2  2  1  1  0  0  2  1  2  0  0  0 NA  1  2
    #> y  1  0  1  2  2  2  2  1  0  2  1  1  1  1  1  1  1  2  1  2  2  1  2  1 NA  0
    #> z  1  1  0  1  2  2  1  2  2  2  2  1  1  2  0  2  1  1  1  0  0  1  0  2  0 NA
    

    Get the largest independent sets:

    library(igraph)
    
    is <- largest_ivs(graph_from_adjacency_matrix(m, "undirected"))
    is
    #> [[1]]
    #> + 4/26 vertices, named, from 272900e:
    #> [1] i p w x
    #> 
    #> [[2]]
    #> + 4/26 vertices, named, from 272900e:
    #> [1] c d v x
    #> 
    #> [[3]]
    #> + 4/26 vertices, named, from 272900e:
    #> [1] j p w x
    

    Verify that no edges exist among the independent sets.

    lapply(is, \(i) m[i, i])
    #> [[1]]
    #>    i  p  w  x
    #> i NA  0  0  0
    #> p  0 NA  0  0
    #> w  0  0 NA  0
    #> x  0  0  0 NA
    #> 
    #> [[2]]
    #>    c  d  v  x
    #> c NA  0  0  0
    #> d  0 NA  0  0
    #> v  0  0 NA  0
    #> x  0  0  0 NA
    #> 
    #> [[3]]
    #>    j  p  w  x
    #> j NA  0  0  0
    #> p  0 NA  0  0
    #> w  0  0 NA  0
    #> x  0  0  0 NA
    

    As a side note, the independent sets in a graph formed from an adjacency matrix, m, will be the same as the cliques in the graph formed by !m. Interestingly, for my small example, largest_cliques is more performant than largest_ivs for finding the desired answer:

    microbenchmark::microbenchmark(
      cliques = largest_cliques(graph_from_adjacency_matrix(!m, "undirected")),
      ivs = largest_ivs(graph_from_adjacency_matrix(m, "undirected"))
    )
    #> Unit: microseconds
    #>     expr   min    lq    mean median     uq    max neval
    #>  cliques 319.7 348.6 372.581 368.90 388.55  555.0   100
    #>      ivs 560.8 589.6 629.992 616.55 654.35 1187.6   100
    

    And the performance difference grows as the matrix gets larger:

    m <- matrix(sample(0:2, 1e4, 1), 100, 100, 0)
    m[lower.tri(m)] <- t(m)[lower.tri(m)]
    diag(m) <- NA
    
    microbenchmark::microbenchmark(
      cliques = largest_cliques(graph_from_adjacency_matrix(!m, "undirected")),
      ivs = largest_ivs(graph_from_adjacency_matrix(m, "undirected"))
    )
    #> Unit: milliseconds
    #>     expr      min       lq       mean   median       uq      max neval
    #>  cliques   2.5735   2.7651   3.275977   2.9013   3.3138   7.9742   100
    #>      ivs 161.9572 182.3812 191.595736 191.2344 202.1377 243.5654   100
    
    m <- matrix(sample(0:2, 4e4, 1), 200, 200, 0)
    m[lower.tri(m)] <- t(m)[lower.tri(m)]
    diag(m) <- NA
    
    system.time(cl <- largest_cliques(graph_from_adjacency_matrix(!m, "undirected")))
    #>    user  system elapsed 
    #>    0.05    0.00    0.05
    system.time(is <- largest_ivs(graph_from_adjacency_matrix(m, "undirected")))
    #>    user  system elapsed 
    #>   10.14    0.00   10.15
    

    Check that the answers are the same.

    library(data.table)
    
    identical(
      setorder(as.data.table(t(mapply(sort, cl)))),
      setorder(as.data.table(t(mapply(sort, is))))
    )
    #> [1] TRUE