r data.table subset benchmarking adjacency-matrix

R data.table: Run functions across custom - non-distinct subsets

I am looking for a more efficient data.table-oriented method for achieving what I am currently doing with for-loops.

I have a data.table that contains an edge list consisting of a sender, receiver, tie indicator, and a variable of interest:

library(data.table)
#Create data
  set.seed(1)
  dt<-data.table(
    id1=rep(letters,each=length(letters)),
    id2=rep(letters,length(letters)),
    tie=rbinom(length(letters)^2,1,.1),
    interest=abs(rnorm(n=length(letters)^2))
  )
  dt$tie[dt$id1==dt$id2]<-1

I want to derive some summary statistics based on each of id1's alters (id2 where tie==1) and relationships among these alters. That is, I derive summary statistics based on adjacency matrices among each id's alters. I place these statistics in the following vectors.

#Initialize summary statistics
  sum.interest.lessthan1<-vector()
  sum.interest.lessthan1.weighted<-vector()
  rank.sum.interest.lessthan1<-vector()
  rank.mean<-vector()

To get these summary statistics, I currently run the following loop:

for(i in 1:length(unique(dt$id1))){
#1) Produce vector of alters
  alters<-dt$id2[dt$id1==dt$id2[i] & dt$tie==1]
#2) Create a datatable containing all info among alters
  tempdt<-dt[dt$id1%in%alters & dt$id2%in%alters,]
#3) Skip if no ties other than to self
   if(nrow(tempdt)==1){
     next()
   }
#4) Get summary statistics
  #Number of alters with interest <1
      sum.interest.lessthan1[i]<-sum(tempdt[tempdt$id1==dt$id2[i] & tempdt$id2!=dt$id2[i] ]$interest<1)
  #Number of alters with interest <1, weighted by mean interest
      sum.interest.lessthan1.weighted[i]<-sum(tempdt[tempdt$id1==dt$id2[i] & tempdt$id2!=dt$id2[i]]$interest/mean(tempdt$interest)<1)
  #Ego rank number of alters with interest <1 among all alters
      tempstat<-tempdt[tempdt$id1!=tempdt$id2,.(suminterest=sum(interest<1)),by="id1"]
      rank.sum.interest.lessthan1[i]<-((rank(tempstat$suminterest)-1)/(length(tempstat$suminterest)-1))[which(tempstat$id1==dt$id2[i])]
  #Ego rank mean interest among all alters
      tempstat<-tempdt[tempdt$id1!=tempdt$id2,.(meaninterest=mean(interest)),by="id1"]
      rank.mean[i]<-((rank(tempstat$meaninterest)-1)/(length(tempstat$meaninterest)-1))[which(tempstat$id1==dt$id2[i])]
}

Is there any way to derive these statistics more efficiently without relying on loops? My actual dataset consists of thousands of distinct IDs and multiple types of ties, therefore it usually takes hours to run through. Thanks in advance for any advice! My instinct is to use data.table's "by" syntax, but I cannot think of how to construct a "by group" that represents my subsets.

Brian

Solution

Per chinsoon12's suggestion, below is how to answer the above question:

#create data table of all edges
    altersDT <- dt[tie==1, .(alter=id2), by=.(grp=id1)]
#create data table listing edges among all ties among ego and ego's alters
    altersDT<-altersDT[, CJ(alter, alter), by=.(grp)]
#Merge the tie data with the edge attribute data 
    altersDT <- dt[altersDT, on=.(id1=V1, id2=V2)]
#produce summary statistics
    #sumint and sumw
        altersDT[, c("sumint", "sumw") := {list(
            sum(interest[id1==grp & id1!=id2]<1),
            sum(interest[id1==grp & id1!=id2]/mean(interest)<1)
        )}, by=.(grp)]
    #ranksum and rankmean
        #create tempstats
            altersDT[, c("ranksum", "rankmean") := {list(
              as.numeric(sum(interest[id1!=id2]<1)),
              as.numeric(mean(interest[id1!=id2]))
            )}, by=.(grp,id1)]
        #derive stats of interest
            altersDT[, c("ranksum", "rankmean") := {list(
              (rank(ranksum)-1)/(length(ranksum)-1),
              (rank(rankmean)-1)/(length(rankmean)-1)
            )}, by=.(grp,id2)]
#Put results into key variables
    subcrit<-altersDT$id1==altersDT$id2 & altersDT$id1==altersDT$grp
    new.sum.interest.lessthan1<-altersDT$sumint[subcrit]
    new.sum.interest.lessthan1.weighted<-altersDT$sumw[subcrit]
    new.rank.sum.interest.lessthan1<-altersDT$ranksum[subcrit]
    new.rank.mean<-altersDT$rankmean[subcrit]

#check that results are the same as old
    table(new.sum.interest.lessthan1==sum.interest.lessthan1)
    table(new.sum.interest.lessthan1.weighted==sum.interest.lessthan1.weighted)
    table(new.rank.sum.interest.lessthan1==rank.sum.interest.lessthan1)
    table(new.rank.mean==rank.mean)