Search code examples
rperformancesumdata.tablematch

R data.table: Count number of match for multiple strings for two group between two DT


I am trying to do a rolling sum of match by working with two tables:
DT1:

M A1 A2
M01 A G
M02 G A
M03 T C
Mnn A G

DT2:

IND Group M01 M02 Mnn
I1 1 A G G
I2 1 A G G
I3 1 G A A
I4 2 G A G
In 2 G A G

I being the n individual of the group 1 or 2 and with its information about n Markers.

The output is the sum of both Alleles for both group and for every n Markers.

##Code for replicability
#DT1
DT1<-data.table(M=c("M01","M02","M03","Mnn"),
                A1= c("A","G","T","A"),
                A2=c("G","A","C","G"))
#DT2
DT2<-data.table(IND=c("I1","I2","I3","I4","In"),
                Group=c(1,1,1,2,2),
                M01=c("A","A","A","G","G"),
                M02=c("G","G","A","G","G"),
                M03=c("C","C","C","T","C"),
                Mnn=c("G","A","A","G","A"))
#M being the nn marker with its Allele1 and Allele2  

#What I did found so far: 

for (i in colnames(DT2)){
  print(i)
  DT1$A1G1[DT1$M==i]<-  sum(DT2[[i]][DT2$Group==1] == DT1$A1[DT1$M==i])    
  DT1$A2G1[DT1$M==i]<-  sum(DT2[[i]][DT2$Group==1] == DT1$A2[DT1$M==i])    
  DT1$A1G2[DT1$M==i]<-  sum(DT2[[i]][DT2$Group==2] == DT1$A1[DT1$M==i])    
  DT1$A2G2[DT1$M==i]<-  sum(DT2[[i]][DT2$Group==2] == DT1$A2[DT1$M==i]) 
}

#The output I want would be the sum of both A for the two group and for every Mnn.

#     M A1 A2 A1G1 A2G1 A1G2 A2G2
#1: M01  A  G    3    0    0    2
#2: M02  G  A    2    1    2    0
#3: M03  T  C    0    3    1    1
#4: Mnn  A  G    2    1    1    1

It does the job but I feel like data.table could do it in one line and with less computation time by avoiding looping as Mnn is up to 50k and In is up to 15k it takes a long time.

Anyone with solution would greatly help me as I have trouble working with data.table logic of key and indexes when working with two different tables.


Solution

  • We could make the loop a bit more efficient by using colSums. Also, reduce the number of == by splitting the 'DT2' by 'Group'

    mcols <- grep("^M", names(DT2), value = TRUE)
    lst1 <- split(DT2[, ..mcols], DT2$Group)
    for(i in seq_along(lst1)) {
          tmp <- lst1[[i]]
          DT1[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)],  na.rm = TRUE)]
           DT1[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)],  na.rm = TRUE)][]
    
    }
    

    -output

    > DT1
            M     A1     A2  A1G1  A2G1  A1G2  A2G2
       <char> <char> <char> <num> <num> <num> <num>
    1:    M01      A      G     3     0     0     2
    2:    M02      G      A     2     1     2     0
    3:    M03      T      C     0     3     1     1
    4:    Mnn      A      G     2     1     1     1
    

    Benchmarks

    On a slightly bigger dataset, checked the timings with OP's method and this

    # data
    set.seed(24)
    DT1test<-data.table(M=sprintf('M%02d', 1:5000),
                    A1= sample(c("A","G","T","C"), 5000, replace = TRUE),
                    A2=sample(c("G","A","T","C"), 5000, replace = TRUE))
    
    DT1testold <- copy(DT1test)
    set.seed(42)
    m1 <- matrix(sample(c("A", "G", "T", "C"), 5000 * 15000, 
        replace = TRUE), ncol = 5000, dimnames = list(NULL, DT1test$M))
    DT2test<-data.table(IND=paste0("I", 1:15000),
                                Group=rep(1:300, each = 50))    
    DT2test <- cbind(DT2test, m1)
    

    timings - old method

    system.time({
    for (i in colnames(DT2test)){
       for(j in unique(DT2test$Group)) {
       
         DT1testold[[paste0("A1G", j)]][DT1testold$M==i] <- 
         sum(DT2testold[[i]][DT2test$Group==j] == DT1testold$A1[DT1test$M==i])
          DT1testold[[paste0("A2G", j)]][DT1testold$M==i] <- 
        sum(DT2test[[i]][DT2test$Group==j] == DT1testold$A1[DT1test$M==i])  
       
      }
      }
      
      
      
      })
       user  system elapsed 
    502.603 106.631 610.908 
    

    timings-new method

    system.time({
     mcols <- grep("^M", names(DT2test), value = TRUE)
     lst1 <- split(DT2test[, ..mcols], DT2test$Group)
     for(i in seq_along(lst1)) {
           tmp <- lst1[[i]]
           DT1test[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)], 
            na.rm = TRUE)]
            DT1test[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)], 
           na.rm = TRUE)][]
    
     }
     
     
     
     })
     #user  system elapsed 
     #36.079   0.968  36.934