Search code examples
rsparse-matrixsubmatrix

How can I create submatrices


I understand that I can extract submatrices from an already created matrix but I want to be able to create submatrices first then combine the created submatrices to form a bigger matrix to save space and time. For example in my example, I want to be able to create a submatrix for IDs with NAs (1-10) and IDs without NAs(11-20) then combine the two matrices together to form a bigger matrix but I am not getting it, would like if someone can suggest what should be in my codes given that I will make same calculations for both with NAs and without NAs.

P.S: I also want to be able to save these submatrices separately before merging them together to a singular matrix (20x20)

dorm<-function(data)
{ 
  library(Matrix)
  n<-max(as.numeric(fam[,"ID"])) 
  t<-min(as.numeric(fam[,"ID"])) 
  A <- sparseMatrix(i = n, j=n, x=n)
  while(t <=n) {

    for( t in t:n ){

      s <- max(fam[t,"dad"],fam[t,"mum"]) 
      d <- min(fam[t,"dad"],fam[t,"mum"])

      if( !is.na(s) ){ 
        if( !is.na(d) ){
          A[t,t] = 2-0.5^(fam[t,"GEN"]-1)+0.5^(fam[t,"GEN"])*A[fam[t,"dad"],fam[t,"mum"]]
          tmp = 0.5 * (A[1:(t-1),s] + A[1:(t-1),d])
          A[t, 1:(t-1)] = tmp
          A[1:(t-1), t] = tmp
        } else {
          A[t,t] = 2-0.5^(fam[t,"GEN"]-1)
          tmp = 0.5 * A[1:(t-1),s]
          A[t, 1:(t-1)] = tmp
          A[1:(t-1), t] = tmp
        }
      } else {
        A[t,t] = 2-0.5^(fam[t,"GEN"]-1)
      }
      message(" MatbyGEN: ", t)
    }

    return(A)
  }
}

fam <- structure(list(ID = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
11L, 12L, 13L, 14L, 18L, 15L, 16L, 17L, 20L, 19L), dad = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 1L, 4L, 6L, 4L, 10L, 
12L, 13L, 13L, 14L), mum = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 2L, 3L, 2L, 5L, 11L, 11L, 5L, 3L, 7L, 2L), GEN = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 3L, 3L)), class = "data.frame", row.names = c(NA, -20L))

A <- dorm(fam)

Solution

  • Here is an solution. It is ~50x faster on the large dataset (1 second vs. 50 seconds):

    #include <RcppArmadillo.h>
    // [[Rcpp::depends(RcppArmadillo)]]
    using namespace Rcpp;
    using namespace arma;
    
    // [[Rcpp::export]]
    sp_mat rcpp_dorm_sp(IntegerVector ID, IntegerVector dad, IntegerVector mum, IntegerVector gen){
      int n; 
      int s; int d;
    
      double tmp;
    
      sp_mat A(dad.size(), dad.size());
    
      A.diag().ones();
      n = max(ID); 
    
      for(int t = 0; t < n; t++){
        s = std::max(dad[t], mum[t]); 
        d = std::min(dad[t], mum[t]);
    
        A(t,t) = 2-pow(0.5, gen[t] - 1);
    
        if ((s>0) & (d>0) ) { 
          A(t,t) +=  pow(0.5, gen[t])*A(dad[t]-1,mum[t]-1);
          for(int j = 0; j < t; j++){
    
            tmp = 0.5 * (A(j, dad[t]-1) + A(j, mum[t]-1));
            if (tmp > 0){
              A(t,j) = tmp;
              A(j,t) = tmp;
            }
          }
        } else if ((s>0) & (d==0)) {
    
          for(int j = 0; j < t; j++){
            tmp = 0.5 * A(j, s-1);
            if (tmp > 0){
              A(t,j) = tmp;
              A(j,t) = tmp;
            }
          }
        }
      }
    
      return(A);
    }
    

    And the R portion:

    fam_mid <- structure(list(ID = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
                                             11L, 12L, 13L, 14L, 18L, 15L, 16L, 17L, 20L, 19L),
                                      dad = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 1L, 4L, 6L, 4L, 10L, 
                                              12L, 13L, 13L, 14L),
                                      mum = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, 3L, 2L, 5L, 11L, 11L, 5L, 3L, 7L, 2L)
                                      , GEN = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
                                                3L, 3L, 3L)), class = "data.frame", row.names = c(NA, -20L))
    
    rcpp_dorm_sp(fam_cpp$ID, fam_cpp$dad, fam_cpp$mum, fam_cpp$GEN)