Search code examples
rbioinformaticsgenetics

R code to work on genotype data


I have this data called mydf.

I need to match the letters (DNA letters) in column REF and ALT with the colnames(x) ("A","T","G","C") and get the corresponding numerical values pasted together as "REF,ALT".

However, there are some lines where I have "snp:+[0-9]" and "flat$" in TYPE column.

Now for "flat$" lines I want to:

  1. sum the ALT values from as many as "snp:+[0-9]" of the corresponding "start" ids, including the flat line itself, if the ALT letters are unique (please see the script enclosed in curly brackets for one flat line)
  2. paste that ALT value again as "REF,ALT" (REF value will be same for both "snp:+[0-9]" and "flat$" with same start ID)
  3. get the output as shown in result.

I have done this for one flat line,but I need help making the function for flatcase so that it will do the same for all flat lines.

How can I make a function to do this for flatcase?

Code

normalCase <- function(x, ns) {
      ref.idx <- which(ns == "REF")
      ref.allele <- x[ref.idx]
      ref.count <- x[which(ns == ref.allele)]

      alt.idx <- which(ns == "ALT")
      alt.allele <- x[alt.idx]
      alt.count <- x[which(ns == alt.allele)]

      paste(ref.count, alt.count, sep=",")
    }



    flatcase??{

     g<-x[,"start"]=="chr16:2530921"& grepl("snp:+[0-9]",x[,"TYPE"])
     myt<-x[g,]
     x[g,"ALT"]
     unique(x[g,"ALT"])
     c<-unique(x[g,"ALT"])
     flat<-myt[grepl("flat$",myt[,"TYPE"]),]
     c<-unique(x[g,"ALT"])
    alt.count<- sum(as.numeric(flat[c]))
    }

    calculateAD <- function(x, mat, ns) {
      if (grepl("flat$", x[which(ns == 'TYPE')])) {
        flatCase(x, mat, ns)
      } else {
        normalCase(x, ns)
      }
    }


    bamAD <- function(x) {
       new.x <- cbind(x, apply(x, 1, calculateAD, x, colnames(x)))
      colnames(new.x)[ncol(new.x)] <- "bam.AD"
      new.x      
    }

The function I have tried for flatCase is:

flatCase <- function(x, mat, ns) {
  id.idx <- which(ns == 'start')
  type.idx <- which(ns == 'TYPE')
  ref.idx <- which(ns == 'REF')
  alt.idx <- which(ns == 'ALT')


  id <- x[id.idx]
  #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
  #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
  m<-mat[grepl(id,mat[, id.idx]) & grepl("snp:+[0-9]",mat[, type.idx]),]
  #flat<-mat[grepl("flat$",mat[, type.idx]),]
  ref.allele <- x[ref.idx]
  ref.count<-x[which(ns == ref.allele)]


  alt.count <- sum(apply(m, 1, function(x) as.numeric(x[which(ns == x[alt.idx])])))
  paste(ref.count, alt.count, sep=",") 
}

mydf

x <- as.matrix(read.csv(text="start,A,T,G,C,REF,ALT,TYPE 
chr20:5363934,95,29,14,59,C,T,snp
chr5:8529759,24,1,28,41,G,C,snp
chr14:9620689,65,49,41,96,T,G,snp
chr18:547375,94,1,51,67,G,C,snp
chr8:5952145,27,80,25,96,T,T,snp
chr14:8694382,68,94,26,30,A,A,snp
chr16:2530921,49,15,79,72,A,T,snp:2530921
chr16:2530921,49,15,79,72,A,G,snp:2530921
chr16:2530921,49,15,79,72,A,T,snp:2530921flat
chr16:2533924,42,13,19,52,G,T,snp:2533924flat
chr16:2543344,4,13,13,42,G,T,snp:2543344flat
chr16:2543344,4,23,13,42,G,A,snp:2543344
chr14:4214117,73,49,18,77,G,A,snp
chr4:7799768,36,28,1,16,C,A,snp
chr3:9141263,27,41,93,90,A,A,snp", stringsAsFactors=FALSE))

Result:

       start           A    T    G    C    REF ALT TYPE              bam.AD  
 [1,] "chr20:5363934" "95" "29" "14" "59" "C" "T" "snp"             "59,29" 
 [2,] "chr5:8529759"  "24" " 1" "28" "41" "G" "C" "snp"             "28,41" 
 [3,] "chr14:9620689" "65" "49" "41" "96" "T" "G" "snp"             "49,41" 
 [4,] "chr18:547375"  "94" " 1" "51" "67" "G" "C" "snp"             "51,67" 
 [5,] "chr8:5952145"  "27" "80" "25" "96" "T" "T" "snp"             "80,80" 
 [6,] "chr14:8694382" "68" "94" "26" "30" "A" "A" "snp"             "68,68" 
 [7,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921"     "49,15" 
 [8,] "chr16:2530921" "49" "15" "79" "72" "A" "G" "snp:2530921"     "49,79" 
 [9,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921flat" "49,94"
[10,] "chr16:2533924" "42" "13" "19" "52" "G" "T" "snp:2533924flat" "19,13"  
[11,] "chr16:2543344" "42" "13" "13" "42" "G" "T" "snp:2543344flat" "13,55" 
[12,] "chr16:2543344" "42" "23" "13" "42" "G" "A" "snp:2543344"     "13,42" 
[13,] "chr14:4214117" "73" "49" "18" "77" "G" "A" "snp"             "18,73" 
[14,] "chr4:7799768"  "36" "28" " 1" "16" "C" "A" "snp"             "16,36" 
[15,] "chr3:9141263"  "27" "41" "93" "90" "A" "A" "snp"             "27,27" 

Solution

  • Here's a way to do it all, vectorised.

    First, note that REF is the same regardless of type. We can look it up quickly by using REF as a coordinate into the matrix, so e.g. row 1 has REF C, so if we look up coordinates (1, "C") we get the REF value for that row.

    # the REFs are the same regardless of TYPE
    rownames(x) <- 1:nrow(x)
    ref <- x[cbind(1:nrow(x), x[, 'REF'])]
    

    Have a look at cbind(1:nrow(x), x[, 'REF']): this is just a list of coordinates (row number, REF) and we use it to look up the REF number.

    Then we do the same for ALT:

    alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
    

    However we have to make sure that if the type is 'flat', we add all the other ALTs to the 'flat' row's ALT (only the unique ones, as you say).

    First, work out which rows are flat:

    which.flat <- grep('flat$', x[, 'TYPE'])
    

    Next, for each flat row, look up ALT of the other rows with the same 'start' (that's the x[, 'start'] == x[i, 'start'] bit), and exclude rows that have duplicate ALT (that's the x[, 'ALT'] != x[i, 'ALT'] bit). Here i is the index of the current flat line. Add them all to the flat line's ALT. The sapply just vectorises this all for each flat line.

    # add the other alts to the alt of the 'flat' line.
    alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
        function (i) {
            sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
                 x[, 'ALT'] != x[i, 'ALT'] ]))
        })
    

    now we just paste together:

    x <- cbind(x, bam.AD=paste(ref, alt, sep=','))
    

    The result is the same as yours except for row 10 where I believe you have made a mistake - there's only one row with "chr16:2533924" and its ALT is "T" (value 13), so bam.AD is "19,13" (you have "19,42" as if the ALT was "A", but it isn't).


    If you must stick with the function form in your question (quite slow & inefficient!), it's essentially the same as what I have done (hence why you can do it without the apply call and skip the loop entirely):

    flatCase <- function(x, mat, ns) { # get alt of the flat row alt <- as.numeric(x[x['ALT']])

    # get the other rows with the same 'start' and different 'ALT'
    xx <- mat[mat[, 'start'] == x['start'] & mat[, 'ALT'] != x['ALT'], ,drop=F]
    if (nrow(xx) > 0) {
      # grab all the alts as done before
      rownames(xx) <- 1:nrow(xx)
      alt <- alt + sum(as.numeric(xx[cbind(1:nrow(xx), xx[, 'ALT'])]))
     }
    
    ref <- x[x['REF']]
    return(paste(ref, alt, sep=','))
    }
    

    However as mentioned before, if you vectorise it the entire code you have above reduces to just a few lines, and is faster:

    newBamAD <- function (x) {
        # the version above
        rownames(x) <- 1:nrow(x)
        ref <- x[cbind(1:nrow(x), x[, 'REF'])]
        alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
        which.flat <- grep('flat$', x[, 'TYPE'])
        alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
            function (i) {
                sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
                     x[, 'ALT'] != x[i, 'ALT'] ]))
            })
        cbind(x, bam.AD=paste(ref, alt, sep=','))
    }
    
    library(rbenchmark)
    benchmark(
      bamAD=bamAD(x),
      newBamAD=newBamAD(x)
    )
    #       test replications elapsed relative user.self sys.self user.child sys.child
    # 1    bamAD          100   0.082    3.905     0.072    0.004          0         0
    # 2 newBamAD          100   0.021    1.000     0.020    0.000          0         0
    

    the vectorised version is almost 4x faster.