Search code examples
rdata.tablesubsetbenchmarkingadjacency-matrix

R data.table: Run functions across custom - non-distinct subsets


I am looking for a more efficient data.table-oriented method for achieving what I am currently doing with for-loops.

I have a data.table that contains an edge list consisting of a sender, receiver, tie indicator, and a variable of interest:

library(data.table)
#Create data
  set.seed(1)
  dt<-data.table(
    id1=rep(letters,each=length(letters)),
    id2=rep(letters,length(letters)),
    tie=rbinom(length(letters)^2,1,.1),
    interest=abs(rnorm(n=length(letters)^2))
  )
  dt$tie[dt$id1==dt$id2]<-1

I want to derive some summary statistics based on each of id1's alters (id2 where tie==1) and relationships among these alters. That is, I derive summary statistics based on adjacency matrices among each id's alters. I place these statistics in the following vectors.

#Initialize summary statistics
  sum.interest.lessthan1<-vector()
  sum.interest.lessthan1.weighted<-vector()
  rank.sum.interest.lessthan1<-vector()
  rank.mean<-vector()

To get these summary statistics, I currently run the following loop:

for(i in 1:length(unique(dt$id1))){
#1) Produce vector of alters
  alters<-dt$id2[dt$id1==dt$id2[i] & dt$tie==1]
#2) Create a datatable containing all info among alters
  tempdt<-dt[dt$id1%in%alters & dt$id2%in%alters,]
#3) Skip if no ties other than to self
   if(nrow(tempdt)==1){
     next()
   }
#4) Get summary statistics
  #Number of alters with interest <1
      sum.interest.lessthan1[i]<-sum(tempdt[tempdt$id1==dt$id2[i] & tempdt$id2!=dt$id2[i] ]$interest<1)
  #Number of alters with interest <1, weighted by mean interest
      sum.interest.lessthan1.weighted[i]<-sum(tempdt[tempdt$id1==dt$id2[i] & tempdt$id2!=dt$id2[i]]$interest/mean(tempdt$interest)<1)
  #Ego rank number of alters with interest <1 among all alters
      tempstat<-tempdt[tempdt$id1!=tempdt$id2,.(suminterest=sum(interest<1)),by="id1"]
      rank.sum.interest.lessthan1[i]<-((rank(tempstat$suminterest)-1)/(length(tempstat$suminterest)-1))[which(tempstat$id1==dt$id2[i])]
  #Ego rank mean interest among all alters
      tempstat<-tempdt[tempdt$id1!=tempdt$id2,.(meaninterest=mean(interest)),by="id1"]
      rank.mean[i]<-((rank(tempstat$meaninterest)-1)/(length(tempstat$meaninterest)-1))[which(tempstat$id1==dt$id2[i])]
}

Is there any way to derive these statistics more efficiently without relying on loops? My actual dataset consists of thousands of distinct IDs and multiple types of ties, therefore it usually takes hours to run through. Thanks in advance for any advice! My instinct is to use data.table's "by" syntax, but I cannot think of how to construct a "by group" that represents my subsets.

Brian


Solution

  • Per chinsoon12's suggestion, below is how to answer the above question:

    #create data table of all edges
        altersDT <- dt[tie==1, .(alter=id2), by=.(grp=id1)]
    #create data table listing edges among all ties among ego and ego's alters
        altersDT<-altersDT[, CJ(alter, alter), by=.(grp)]
    #Merge the tie data with the edge attribute data 
        altersDT <- dt[altersDT, on=.(id1=V1, id2=V2)]
    #produce summary statistics
        #sumint and sumw
            altersDT[, c("sumint", "sumw") := {list(
                sum(interest[id1==grp & id1!=id2]<1),
                sum(interest[id1==grp & id1!=id2]/mean(interest)<1)
            )}, by=.(grp)]
        #ranksum and rankmean
            #create tempstats
                altersDT[, c("ranksum", "rankmean") := {list(
                  as.numeric(sum(interest[id1!=id2]<1)),
                  as.numeric(mean(interest[id1!=id2]))
                )}, by=.(grp,id1)]
            #derive stats of interest
                altersDT[, c("ranksum", "rankmean") := {list(
                  (rank(ranksum)-1)/(length(ranksum)-1),
                  (rank(rankmean)-1)/(length(rankmean)-1)
                )}, by=.(grp,id2)]
    #Put results into key variables
        subcrit<-altersDT$id1==altersDT$id2 & altersDT$id1==altersDT$grp
        new.sum.interest.lessthan1<-altersDT$sumint[subcrit]
        new.sum.interest.lessthan1.weighted<-altersDT$sumw[subcrit]
        new.rank.sum.interest.lessthan1<-altersDT$ranksum[subcrit]
        new.rank.mean<-altersDT$rankmean[subcrit]
    
    #check that results are the same as old
        table(new.sum.interest.lessthan1==sum.interest.lessthan1)
        table(new.sum.interest.lessthan1.weighted==sum.interest.lessthan1.weighted)
        table(new.rank.sum.interest.lessthan1==rank.sum.interest.lessthan1)
        table(new.rank.mean==rank.mean)