bipartite projection without the intermediate bipartite graph

I have a data.frame which describes a bipartite graph with a very large (millions) and a reasonably small (hundreds) independent sets.

I want to get the bipartite projection of the graph on the smaller independent set, but without first creating the large bipartite graph and especially the huge bipartite projection to the larger independent set. The reason for this limitation is an igraph segfault and a RAM limitation (I have only 8GB RAM).

E.g., given

data.frame(beg=c("a","a","b","b","c","c"),
           end=c("1","2","1","2","1","2"),
           weight=1:6)

I want data frame

data.frame(beg=c("a","a","b"),
           end=c("b","c","c"),
           weight=c(1+3+2+4,1+5+2+6,3+5+4+6))

where weights of edges add up.

(in this example, abc is the "smaller" set and 12 is the "larger" one).

Solution

Here is something which appears to do what I need (the key is to use data.table for fast join):

> library(igraph)
> library(data.table)
data.table 1.8.8  For help type: help("data.table")
> f <- data.frame(beg=c("a","a","b","b","c","c"),
                  end=c("1","2","1","2","1","2"),
                  count=1:6)
> f
   beg end count
1:   a   1     1
2:   b   1     3
3:   c   1     5
4:   a   2     2
5:   b   2     4
6:   c   2     6
> m <- f[f,allow.cartesian=TRUE]

> m
    end beg weight beg.1 weight.1
 1:   1   a      1     a        1
 2:   1   b      3     a        1
 3:   1   c      5     a        1
 4:   1   a      1     b        3
 5:   1   b      3     b        3
 6:   1   c      5     b        3
 7:   1   a      1     c        5
 8:   1   b      3     c        5
 9:   1   c      5     c        5
10:   2   a      2     a        2
11:   2   b      4     a        2
12:   2   c      6     a        2
13:   2   a      2     b        4
14:   2   b      4     b        4
15:   2   c      6     b        4
16:   2   a      2     c        6
17:   2   b      4     c        6
18:   2   c      6     c        6
> v <- m$beg == m$beg.1
> m <- f[f,allow.cartesian=TRUE]
> v <- m$beg == m$beg.1
> m$end <- NULL
> m$weight <- (m$count + m$count.1)/2
> m$count <- NULL
> m$count.1 <- NULL
> m
    beg beg.1 weight
 1:   a     a      1
 2:   b     a      2
 3:   c     a      3
 4:   a     b      2
 5:   b     b      3
 6:   c     b      4
 7:   a     c      3
 8:   b     c      4
 9:   c     c      5
10:   a     a      2
11:   b     a      3
12:   c     a      4
13:   a     b      3
14:   b     b      4
15:   c     b      5
16:   a     c      4
17:   b     c      5
18:   c     c      6
> ve <- data.table(vertex=m$beg[v], weight=m$weight[v], key="vertex")
> ve <- ve[, list(count = .N, weight = sum(weight)), by = "vertex"]
> ve
   vertex count weight
1:      a     2      3
2:      b     2      7
3:      c     2     11
> g1 <- graph.data.frame(m[!v,], vertices=ve, directed=FALSE)
> g1 <- simplify(g1, edge.attr.comb="sum")
> V(g1)$weight
[1]  3  7 11
> E(g1)$weight
[1] 10 14 18