I have this data called mydf
.
I need to match the letters (DNA letters) in column REF
and ALT
with the colnames(x)
("A","T","G","C"
) and get the corresponding numerical values pasted together as "REF,ALT"
.
However, there are some lines where I have "snp:+[0-9]"
and "flat$"
in TYPE
column.
Now for "flat$"
lines I want to:
ALT
values from as many as "snp:+[0-9]"
of the corresponding "start"
ids, including the flat line itself, if the
ALT
letters are unique (please see the script enclosed in curly
brackets for one flat line)ALT
value again as "REF,ALT"
(REF
value will be
same for both "snp:+[0-9]"
and "flat$"
with same start ID)I have done this for one flat line,but I need help making the function for flatcase
so that it will do the same for all flat lines.
How can I make a function to do this for flatcase
?
Code
normalCase <- function(x, ns) {
ref.idx <- which(ns == "REF")
ref.allele <- x[ref.idx]
ref.count <- x[which(ns == ref.allele)]
alt.idx <- which(ns == "ALT")
alt.allele <- x[alt.idx]
alt.count <- x[which(ns == alt.allele)]
paste(ref.count, alt.count, sep=",")
}
flatcase??{
g<-x[,"start"]=="chr16:2530921"& grepl("snp:+[0-9]",x[,"TYPE"])
myt<-x[g,]
x[g,"ALT"]
unique(x[g,"ALT"])
c<-unique(x[g,"ALT"])
flat<-myt[grepl("flat$",myt[,"TYPE"]),]
c<-unique(x[g,"ALT"])
alt.count<- sum(as.numeric(flat[c]))
}
calculateAD <- function(x, mat, ns) {
if (grepl("flat$", x[which(ns == 'TYPE')])) {
flatCase(x, mat, ns)
} else {
normalCase(x, ns)
}
}
bamAD <- function(x) {
new.x <- cbind(x, apply(x, 1, calculateAD, x, colnames(x)))
colnames(new.x)[ncol(new.x)] <- "bam.AD"
new.x
}
The function I have tried for flatCase is:
flatCase <- function(x, mat, ns) {
id.idx <- which(ns == 'start')
type.idx <- which(ns == 'TYPE')
ref.idx <- which(ns == 'REF')
alt.idx <- which(ns == 'ALT')
id <- x[id.idx]
#m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
#m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
m<-mat[grepl(id,mat[, id.idx]) & grepl("snp:+[0-9]",mat[, type.idx]),]
#flat<-mat[grepl("flat$",mat[, type.idx]),]
ref.allele <- x[ref.idx]
ref.count<-x[which(ns == ref.allele)]
alt.count <- sum(apply(m, 1, function(x) as.numeric(x[which(ns == x[alt.idx])])))
paste(ref.count, alt.count, sep=",")
}
mydf
x <- as.matrix(read.csv(text="start,A,T,G,C,REF,ALT,TYPE
chr20:5363934,95,29,14,59,C,T,snp
chr5:8529759,24,1,28,41,G,C,snp
chr14:9620689,65,49,41,96,T,G,snp
chr18:547375,94,1,51,67,G,C,snp
chr8:5952145,27,80,25,96,T,T,snp
chr14:8694382,68,94,26,30,A,A,snp
chr16:2530921,49,15,79,72,A,T,snp:2530921
chr16:2530921,49,15,79,72,A,G,snp:2530921
chr16:2530921,49,15,79,72,A,T,snp:2530921flat
chr16:2533924,42,13,19,52,G,T,snp:2533924flat
chr16:2543344,4,13,13,42,G,T,snp:2543344flat
chr16:2543344,4,23,13,42,G,A,snp:2543344
chr14:4214117,73,49,18,77,G,A,snp
chr4:7799768,36,28,1,16,C,A,snp
chr3:9141263,27,41,93,90,A,A,snp", stringsAsFactors=FALSE))
Result:
start A T G C REF ALT TYPE bam.AD
[1,] "chr20:5363934" "95" "29" "14" "59" "C" "T" "snp" "59,29"
[2,] "chr5:8529759" "24" " 1" "28" "41" "G" "C" "snp" "28,41"
[3,] "chr14:9620689" "65" "49" "41" "96" "T" "G" "snp" "49,41"
[4,] "chr18:547375" "94" " 1" "51" "67" "G" "C" "snp" "51,67"
[5,] "chr8:5952145" "27" "80" "25" "96" "T" "T" "snp" "80,80"
[6,] "chr14:8694382" "68" "94" "26" "30" "A" "A" "snp" "68,68"
[7,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921" "49,15"
[8,] "chr16:2530921" "49" "15" "79" "72" "A" "G" "snp:2530921" "49,79"
[9,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921flat" "49,94"
[10,] "chr16:2533924" "42" "13" "19" "52" "G" "T" "snp:2533924flat" "19,13"
[11,] "chr16:2543344" "42" "13" "13" "42" "G" "T" "snp:2543344flat" "13,55"
[12,] "chr16:2543344" "42" "23" "13" "42" "G" "A" "snp:2543344" "13,42"
[13,] "chr14:4214117" "73" "49" "18" "77" "G" "A" "snp" "18,73"
[14,] "chr4:7799768" "36" "28" " 1" "16" "C" "A" "snp" "16,36"
[15,] "chr3:9141263" "27" "41" "93" "90" "A" "A" "snp" "27,27"
Here's a way to do it all, vectorised.
First, note that REF is the same regardless of type. We can look it up quickly by using REF as a coordinate into the matrix, so e.g. row 1 has REF C, so if we look up coordinates (1, "C") we get the REF value for that row.
# the REFs are the same regardless of TYPE
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]
Have a look at cbind(1:nrow(x), x[, 'REF'])
: this is just a list of coordinates (row number, REF)
and we use it to look up the REF number.
Then we do the same for ALT:
alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
However we have to make sure that if the type is 'flat', we add all the other ALTs to the 'flat' row's ALT (only the unique ones, as you say).
First, work out which rows are flat:
which.flat <- grep('flat$', x[, 'TYPE'])
Next, for each flat row, look up ALT of the other rows with the same 'start' (that's the x[, 'start'] == x[i, 'start']
bit), and exclude rows that have duplicate ALT (that's the x[, 'ALT'] != x[i, 'ALT']
bit). Here i
is the index of the current flat line. Add them all to the flat line's ALT. The sapply
just vectorises this all for each flat line.
# add the other alts to the alt of the 'flat' line.
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
function (i) {
sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
x[, 'ALT'] != x[i, 'ALT'] ]))
})
now we just paste together:
x <- cbind(x, bam.AD=paste(ref, alt, sep=','))
The result is the same as yours except for row 10 where I believe you have made a mistake - there's only one row with "chr16:2533924" and its ALT is "T" (value 13), so bam.AD
is "19,13" (you have "19,42" as if the ALT was "A", but it isn't).
If you must stick with the function form in your question (quite slow & inefficient!), it's essentially the same as what I have done (hence why you can do it without the apply
call and skip the loop entirely):
flatCase <- function(x, mat, ns) { # get alt of the flat row alt <- as.numeric(x[x['ALT']])
# get the other rows with the same 'start' and different 'ALT'
xx <- mat[mat[, 'start'] == x['start'] & mat[, 'ALT'] != x['ALT'], ,drop=F]
if (nrow(xx) > 0) {
# grab all the alts as done before
rownames(xx) <- 1:nrow(xx)
alt <- alt + sum(as.numeric(xx[cbind(1:nrow(xx), xx[, 'ALT'])]))
}
ref <- x[x['REF']]
return(paste(ref, alt, sep=','))
}
However as mentioned before, if you vectorise it the entire code you have above reduces to just a few lines, and is faster:
newBamAD <- function (x) {
# the version above
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]
alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
which.flat <- grep('flat$', x[, 'TYPE'])
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
function (i) {
sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
x[, 'ALT'] != x[i, 'ALT'] ]))
})
cbind(x, bam.AD=paste(ref, alt, sep=','))
}
library(rbenchmark)
benchmark(
bamAD=bamAD(x),
newBamAD=newBamAD(x)
)
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 bamAD 100 0.082 3.905 0.072 0.004 0 0
# 2 newBamAD 100 0.021 1.000 0.020 0.000 0 0
the vectorised version is almost 4x faster.