Search code examples
rbioinformaticsstring-search

Efficiently match multiple strings/keywords to multiple texts in R


I am trying to efficiently map exact peptides (short sequences of amino acids in the 26 character alphabet A-Z1) to proteins (longer sequences of the same alphabet). The most efficient way to do this I'm aware of is an Aho-Corasick trie (where peptides are the keywords). Unfortunately I can't find a version of AC in R that will work with a non-nucleotide alphabet (Biostrings' PDict and Starr's match_ac are both hard-coded for DNA).

As a crutch I've been trying to parallelize a basic grep approach. But I'm having trouble figuring out a way to do so without incurring significant IO overhead. Here is a brief example:

peptides = c("FSSSGGGGGGGR","GAHLQGGAK","GGSGGSYGGGGSGGGYGGGSGSR","IISNASCTTNCLAPLAK")
if (!exists("proteins"))
{
  biocLite("biomaRt", ask=F, suppressUpdates=T, suppressAutoUpdate=T)
  library(biomaRt)
  ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
  proteins = getBM(attributes=c('peptide', 'refseq_peptide'), filters='refseq_peptide', values=c("NP_000217", "NP_001276675"), mart=ensembl)
  row.names(proteins) = proteins$refseq_peptide
}

library(snowfall)
library(Biostrings)
library(plyr)
sfInit(parallel=T, cpus=detectCores()-1)

allPeptideInstances = NULL
i=1
increment=100
count=nrow(proteins)
while(T)
{
  print(paste(i, min(count, i+increment), sep=":"))
  text_source = proteins[i:min(count, i+increment),]
  text = text_source$peptide

  #peptideInstances = sapply(peptides, regexpr, text, fixed=T, useBytes=T)
  peptideInstances = sfSapply(peptides, regexpr, text, fixed=T, useBytes=T)
  dimnames(peptideInstances) = list(text_source$refseq_peptide, colnames(peptideInstances))

  sparsePeptideInstances = alply(peptideInstances, 2, .fun = function(x) {x[x > 0]}, .dims = T)

  allPeptideInstances = c(allPeptideInstances, sparsePeptideInstances, recursive=T)
  if (i==count | nrow(text_source) < increment)
    break
  i = i+increment
}

sfStop()

There are a few issues here:

  • peptideInstances here is a dense matrix, so returning it from each worker is very verbose. I have broken it up into blocks so that I'm not dealing with a 40,000 (proteins) x 60,000 (peptides) matrix.
  • Parallelizing over peptides, when it would make more sense to parallelize over the proteins because they're bigger. But I got frustrated with trying to do it by protein because:
  • This code breaks if there is only one protein in text_source.

Alternatively, if anyone is aware of a better solution in R, I'm happy to use that. I've spent enough time on this I probably would have been better served implementing Aho-Corasick.

1 Some of those are ambiguity codes, but for simplicity, ignore that.


Solution

  • I learned Rcpp and implemented an Aho-Corasick myself. Now CRAN has a good general purpose multiple-keyword search package.

    Here are some usage examples:

    listEquals = function(a, b) { is.null(unlist(a)) && is.null(unlist(b)) || !is.null(a) && !is.null(b) && all(unlist(a) == unlist(b)) }
    
    # simple search of multiple keywords in a single text
    keywords = c("Abra", "cadabra", "is", "the", "Magic", "Word")
    oneSearch = AhoCorasickSearch(keywords, "Is Abracadabra the Magic Word?")
    stopifnot(listEquals(oneSearch[[1]][[1]], list(keyword="Abra", offset=4)))
    stopifnot(listEquals(oneSearch[[1]][[2]], list(keyword="cadabra", offset=8)))
    stopifnot(listEquals(oneSearch[[1]][[3]], list(keyword="the", offset=16)))
    stopifnot(listEquals(oneSearch[[1]][[4]], list(keyword="Magic", offset=20)))
    stopifnot(listEquals(oneSearch[[1]][[5]], list(keyword="Word", offset=26)))
    
    # search a list of lists
    # * sublists are accessed by index
    # * texts are accessed by index
    # * non-matched texts are kept (to preserve index order)
    listSearch = AhoCorasickSearchList(keywords, list(c("What in", "the world"), c("is"), "secret about", "the Magic Word?"))
    stopifnot(listEquals(listSearch[[1]][[1]], list()))
    stopifnot(listEquals(listSearch[[1]][[2]][[1]], list(keyword="the", offset=1)))
    stopifnot(listEquals(listSearch[[2]][[1]][[1]], list(keyword="is", offset=1)))
    stopifnot(listEquals(listSearch[[3]], list()))
    stopifnot(listEquals(listSearch[[4]][[1]][[1]], list(keyword="the", offset=1)))
    stopifnot(listEquals(listSearch[[4]][[1]][[2]], list(keyword="Magic", offset=5)))
    stopifnot(listEquals(listSearch[[4]][[1]][[3]], list(keyword="Word", offset=11)))
    
    # named search of a list of lists
    # * sublists are accessed by name
    # * matched texts are accessed by name
    # * non-matched texts are dropped
    namedSearch = AhoCorasickSearchList(keywords, list(subject=c(phrase1="What in", phrase2="the world"),
                                                       verb=c(phrase1="is"),
                                                       predicate1=c(phrase1="secret about"),
                                                       predicate2=c(phrase1="the Magic Word?")))
    stopifnot(listEquals(namedSearch$subject$phrase2[[1]], list(keyword="the", offset=1)))
    stopifnot(listEquals(namedSearch$verb$phrase1[[1]], list(keyword="is", offset=1)))
    stopifnot(listEquals(namedSearch$predicate1, list()))
    stopifnot(listEquals(namedSearch$predicate2$phrase1[[1]], list(keyword="the", offset=1)))
    stopifnot(listEquals(namedSearch$predicate2$phrase1[[2]], list(keyword="Magic", offset=5)))
    stopifnot(listEquals(namedSearch$predicate2$phrase1[[3]], list(keyword="Word", offset=11)))
    
    # named search of multiple texts in a single list with keyword grouping and aminoacid alphabet
    # * all matches to a keyword are accessed by name
    # * non-matched keywords are dropped
    proteins = c(protein1="PEPTIDEPEPTIDEDADADARARARARAKEKEKEKEPEPTIDE",
                 protein2="DERPADERPAPEWPEWPEEPEERAWRAWWARRAGTAGPEPTIDEKESEQUENCE")
    peptides = c("PEPTIDE", "DERPA", "SEQUENCE", "KEKE", "PEPPIE")
    peptideSearch = AhoCorasickSearch(peptides, proteins, alphabet="aminoacid", groupByKeyword=T)
    stopifnot(listEquals(peptideSearch$PEPTIDE, list(list(keyword="protein1", offset=1),
                                                     list(keyword="protein1", offset=8),
                                                     list(keyword="protein1", offset=37),
                                                     list(keyword="protein2", offset=38))))
    stopifnot(listEquals(peptideSearch$DERPA, list(list(keyword="protein2", offset=1),
                                                   list(keyword="protein2", offset=6))))
    stopifnot(listEquals(peptideSearch$SEQUENCE, list(list(keyword="protein2", offset=47))))
    stopifnot(listEquals(peptideSearch$KEKE, list(list(keyword="protein1", offset=29),
                                                  list(keyword="protein1", offset=31),
                                                  list(keyword="protein1", offset=33))))
    stopifnot(listEquals(peptideSearch$PEPPIE, NULL))
    
    # grouping by keyword without text names: offsets are given without reference to the text
    names(proteins) = NULL
    peptideSearch = AhoCorasickSearch(peptides, proteins, groupByKeyword=T)
    stopifnot(listEquals(peptideSearch$PEPTIDE, list(1, 8, 37, 38)))
    stopifnot(listEquals(peptideSearch$DERPA, list(1, 6)))
    stopifnot(listEquals(peptideSearch$SEQUENCE, list(47)))
    stopifnot(listEquals(peptideSearch$KEKE, list(29, 31, 33)))