Although this is a genomics question, since it deals with splicing (taking subsets) of strings, I think it's relevant to this audience rather than to Bioconductor alone.
Pretty simple: I have a list of long strings (the chromosomes of a genome). For example, I create and store 10 chromosomes using the Bioconductor Biostrings package:
set.seed(1)
set <- NULL
for (i in 1:10) set <- c(set,paste(sample(Biostrings::DNA_ALPHABET[1:4],10000,replace=T),collapse=""))
genome.set <- Biostrings::DNAStringSet(set)
names(genome.set) <- paste0("chr",1:10)
And then I have a data.frame of transcript coordinates (from a GTF file), where each transcript can have multiple lines:
library(dplyr)
gtf.df <- data.frame(seqnames = sample(names(genome.set),100,replace=T),
strand = sample(c("+","-"),100,replace=T),
start = sample(1:9000,100,replace=F)) %>%
dplyr::mutate(end = start+sample(1:1000,100,replace = F))
gtf.df <- gtf.df %>% dplyr::group_by(seqnames) %>%
dplyr::arrange(start,end) %>%
dplyr::mutate(transcript_id = paste0(seqnames,"_",sample(1:8,length(seqnames),replace=T))) %>%
dplyr::ungroup()
What I want to do, for each transcript, is to join its sequences by splicing them out of genome.set. Using Biostrings again, I achieve this as follows:
transcript_ids <- unique(gtf.df$transcript_id)
transcript.seqs <- sapply(1:length(transcript_ids), function(t){
  transcript.gtf.df <- gtf.df %>% dplyr::filter(gtf.df$transcript_id == transcript_ids[t])
  transcript.seq <- paste(sapply(1:nrow(transcript.gtf.df), function(e)
    unname(as.character(Biostrings::subseq(genome.set[which(names(genome.set) == transcript.gtf.df$seqnames[1])],
                                           start = transcript.gtf.df$start[e], end = transcript.gtf.df$end[e])))
  ), collapse = "")
  if(transcript.gtf.df$strand[1] == "-") transcript.seq <- unname(as.character(Biostrings::reverseComplement(Biostrings::DNAString(transcript.seq))))
  return(transcript.seq)
})
My problem is that I have 4,520,919 transcripts in my real data, and this last part takes a long while. So my question is whether and how this can be done faster, either using Biostrings or any other way.
I have rewritten your sapply approach with two major changes:

- vapply instead of sapply, which in general is quite a bit faster
- .subset2 to subset the data frame columns directly (a short illustration follows below)

Edit: two further changes, shown as vapply_new in the benchmark below:

- a single vectorised substring call instead of the inner vapply
- a manual reverse complement (a plain character swap) instead of Biostrings::reverseComplement
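For context, .subset2(df, i) is essentially df[[i]] without the S3 dispatch overhead, so .subset2(gtf.df, 1L) returns the first column (seqnames) as a plain vector. A minimal check, assuming the gtf.df from the question:

# .subset2(df, i) behaves like df[[i]] but skips S3 method dispatch
identical(.subset2(gtf.df, 1L), gtf.df[["seqnames"]])
# should be TRUE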
Here is the code:
names_genome.set <- names(genome.set)
transcript_ids <- unique(gtf.df$transcript_id)
transcript_seqs <- vapply(seq_along(transcript_ids), function (t) {
  ind_id <- which(.subset2(gtf.df, 5L) == transcript_ids[t])
  x <- unname(as.character(genome.set[names_genome.set == .subset2(gtf.df, 1L)[ind_id[1L]]]))
  out <- paste0(substring(text = x, first = .subset2(gtf.df, 3L)[ind_id], last = .subset2(gtf.df, 4L)[ind_id]), collapse = '')
  if (.subset2(gtf.df, 2L)[ind_id[1L]] == '-') {
    # reverse complement by hand: reverse the bases, then swap A<->T and C<->G
    out <- rev(unlist(strsplit(out, '')))
    ind_A <- out == 'A'
    ind_T <- out == 'T'
    ind_C <- out == 'C'
    ind_G <- out == 'G'
    out[ind_A] <- 'T'
    out[ind_T] <- 'A'
    out[ind_G] <- 'C'
    out[ind_C] <- 'G'
    out <- paste(out, collapse = '')
  }
  out
}, character(1))
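As a quick sanity check, assuming both your original sapply version and the rewrite above have been run on the same sample data, the two result vectors should agree:

# both results are unnamed character vectors, so identical() compares them element-wise
identical(transcript.seqs, transcript_seqs)
# should be TRUE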
Here are some benchmarking figures with the sample data provided:
# Unit: milliseconds
#       expr       min        lq      mean    median        uq      max neval cld
#     sapply 160.94296 169.97698 180.13836 175.20474 182.58224 400.3273   100   c
# vapply_old  66.20113  69.59185  72.96804  71.45861  74.56051 120.0023   100   b
# vapply_new  47.45224  49.51573  52.87001  50.97023  54.52104 109.3320   100   a
microbenchmark::microbenchmark(
  'sapply' = {
    transcript.seqs <- sapply(1:length(transcript_ids), function(t){
      transcript.gtf.df <- gtf.df %>% dplyr::filter(gtf.df$transcript_id == transcript_ids[t])
      transcript.seq <- paste(sapply(1:nrow(transcript.gtf.df), function(e)
        unname(as.character(Biostrings::subseq(genome.set[which(names(genome.set) == transcript.gtf.df$seqnames[1])],
                                               start = transcript.gtf.df$start[e], end = transcript.gtf.df$end[e])))
      ), collapse = "")
      if(transcript.gtf.df$strand[1] == "-") transcript.seq <- unname(as.character(Biostrings::reverseComplement(Biostrings::DNAString(transcript.seq))))
      return(transcript.seq)
    })
  },
  'vapply_old' = {
    transcript_seqs <- vapply(seq_along(transcript_ids), function (t) {
      ind_id <- which(.subset2(gtf.df, 5L) == transcript_ids[t])
      x <- unname(as.character(genome.set[names_genome.set == .subset2(gtf.df, 1L)[ind_id[1L]]]))
      out <- vapply(ind_id,
                    function (e) substr(x = x, start = .subset2(gtf.df, 3L)[e], stop = .subset2(gtf.df, 4L)[e]),
                    character(1))
      out <- paste(out, collapse = '')
      if (.subset2(gtf.df, 2L)[ind_id[1L]] == '-') {
        out <- unname(as.character(Biostrings::reverseComplement(Biostrings::DNAString(out))))
      }
      out
    }, character(1))
  },
  'vapply_new' = {
    transcript_seqs <- vapply(seq_along(transcript_ids), function (t) {
      ind_id <- which(.subset2(gtf.df, 5L) == transcript_ids[t])
      x <- unname(as.character(genome.set[names_genome.set == .subset2(gtf.df, 1L)[ind_id[1L]]]))
      out <- paste0(substring(text = x, first = .subset2(gtf.df, 3L)[ind_id], last = .subset2(gtf.df, 4L)[ind_id]), collapse = '')
      if (.subset2(gtf.df, 2L)[ind_id[1L]] == '-') {
        # reverse complement by hand: reverse the bases, then swap A<->T and C<->G
        out <- rev(unlist(strsplit(out, '')))
        ind_A <- out == 'A'
        ind_T <- out == 'T'
        ind_C <- out == 'C'
        ind_G <- out == 'G'
        out[ind_A] <- 'T'
        out[ind_T] <- 'A'
        out[ind_G] <- 'C'
        out[ind_C] <- 'G'
        out <- paste(out, collapse = '')
      }
      out
    }, character(1))
  }
)
I will try to find ways to enhance it further (there may be more vectorisation possible). For instance, I haven't looked into how reverseComplement works internally - maybe that step can be performed more efficiently.
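As one possible direction for that step, here is a rough base-R sketch (the helper name revcomp is made up; chartr handles the complement in a single vectorised call, and the strsplit/rev/paste round trip does the reversal):

# hypothetical helper: complement via one chartr() call, then reverse the base order
revcomp <- function(s) {
  comp <- chartr("ACGT", "TGCA", s)                   # A<->T, C<->G for every base
  vapply(strsplit(comp, "", fixed = TRUE),
         function(ch) paste(rev(ch), collapse = ""),  # reverse each sequence
         character(1))
}
revcomp("ATGC")  # should give "GCAT"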
You can try it on larger data sets and see whether the improvement holds up. Also, Rcpp might be an option here if efficiency really is at stake.