Issues parsing GenBank file info into R

Bioconductor package functions for parsing GenBank files into R never seem to work for bacterial genomes (nobody cares about us microbiologists, I guess), so I've written the following code to read in a GBFF file line-by-line and start extracting FEATURE header information from https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/008/685/GCA_000008685.2_ASM868v2/GCA_000008685.2_ASM868v2_genomic.gbff.gz

library(tidyverse)

# read gbff file with readLines and split into a list of separate genbank records 
readGBFF <- function (file, text = readLines(file), verbose = FALSE){
  if (is(text, "character")) {
    split_indices <- cumsum(grepl("^LOCUS", text))
    text <- base::split(text, split_indices)
  }
  return(text)
}

GBFFraw <- readGBFF("GCA_000008685.2_ASM868v2_genomic.gbff.gz")

# assign accession numbers to the names of each genbank record 
names(GBFFraw) <- sapply(GBFFraw, function(x) {
  version_line <- x[grep("^VERSION", x)]
  gsub("VERSION\\s+", "", version_line)
})

# capture the features for each genbank record 
GBFFfeat <- lapply(GBFFraw, function(x) {
  i_start <- grep("^FEATURES", x)
  i_end <- grep("^ORIGIN", x)
  paste0(x[(i_start[1]+1):(i_end[1]-1)])
})

From the GBFFfeat list (example of list element below), I would like to make a dataframe like this:

> GBFFfeat$AE000790.2
  [1] "     source          1..53657"                                                   
  [2] "                     /organism=\"Borreliella burgdorferi B31\""                  
  [3] "                     /mol_type=\"genomic DNA\""                                  
  [4] "                     /strain=\"B31\""                                            
  [5] "                     /type_material=\"type strain of Borrelia burgdorferi\""     
  [6] "                     /db_xref=\"taxon:224326\""                                  
  [7] "                     /plasmid=\"lp54\""                                          
  [8] "     gene            652..1137"                                                  
  [9] "                     /locus_tag=\"BB_A01\""                                      
 [10] "                     /note=\"outer membrane protein P13; this gene contains a"   
 [11] "                     frame shift which may be the result of a sequencing error;" 
 [12] "                     identified by match to protein family HMM PF05628\""        
 [13] "                     /pseudo"                                                    
 [14] "     gene            1461..1970"                                                 
 [15] "                     /locus_tag=\"BB_A03\""                                      
 [16] "     CDS             1461..1970"                                                 
 [17] "                     /locus_tag=\"BB_A03\""                                      
 [18] "                     /codon_start=1"                                             
 [19] "                     /transl_table=11"                                           
 [20] "                     /product=\"outer membrane protein\""                        
 [21] "                     /protein_id=\"AAC66227.1\""                                 
 [22] "                     /translation=\"MKKTIIVFIILAFMLNCKNKSNDAEPNNDLDEKSQAKSNLVDED"
 [23] "                     RIEFSKATPLEKLVSRLNLNNTEKETLTFLTNLLKEKLVDPNIGLHFKNSGGDESKIE" 
 [24] "                     ESVQKFLSELKEDEIKDLLAKIKENKDKKEKDPEELNTYKSILASGFDGIFNQADSKT" 
 [25] "                     TLNKLKDTI\""                                                
 [26] "     gene            complement(2045..2893)"                                     
 [27] "                     /locus_tag=\"BB_A04\""                                      
 [28] "     CDS             complement(2045..2893)"                                     
 [29] "                     /locus_tag=\"BB_A04\""                                      
 [30] "                     /codon_start=1"                                             
 [31] "                     /transl_table=11"                                           
 [32] "                     /product=\"S2 antigen\""                                    
 [33] "                     /protein_id=\"AAC66228.1\""                                 
 [34] "                     /translation=\"MKRVIVSFVVLILGCNLDDNSKMERKGSNKLIRESGSDRRGQEN"
 [35] "                     RALGAMNFGLFSGDSGVVYDLQNYETLKALENKNKFIDYSKIEFLEGTKSINAFIWAV" 
 [36] "                     SVRWIKIKARDLFGECGDFIKELKGIKYSYLVSPVDGSYISYAMPIIVFETTRESDPL" 
 [37] "                     YSVSGFKLISKGNDINFNENKSGFWGRLPMSEKSVESGLVTAYPFGSSDAKKVIEAFA" 
 [38] "                     SLYNNGTWSDMIAEITIKSKQYPKNEKVYRITLDSQLFNVAMKKIIEKYPKIKSASFA" 
 [39] "                     FNSLIN\""                                                   
 [40] "     gene            complement(3003..4256)"                                     
 [41] "                     /locus_tag=\"BB_A05\""                                      
 [42] "     CDS             complement(3003..4256)"                                     
 [43] "                     /locus_tag=\"BB_A05\""                                      
 [44] "                     /codon_start=1"                                             
 [45] "                     /transl_table=11"                                           
 [46] "                     /product=\"S1 antigen\""                                    
 [47] "                     /protein_id=\"AAC66229.1\""                                 
 [48] "                     /translation=\"MNKIGIAFIISFLLFVNCKGKSLEEDLKSTTSNNKQNLISNEKK"
 [49] "                     SLNSKNNRLKDSRLSNFESKKNDQTLKKSKDFKKDLQTLRNSKNLMPKDLDQSSNDFE" 
 [50] "                     NLDNSESLQEASSKHNIGKSRYGKALLKNDHDEIWIPHLNLEEDKNFEFFKKSLQNDE" 
 [51] "                     NRYALGGWLLNNDEVLVKYRYSEKDVNQFLIDIGKKRWGDLSSKMSTLVRLIGNYSDK" 
 [52] "                     SDREDEISLLDMNLCQQFYLTKINAGGSSADILVALEKTIDQQISGVSKELLELKNFS" 
 [53] "                     LTTKSELDWYLNWKRNLTDEEEETLQCCRVLLGGELDFENLDDLFKRLGKEYSRLILR" 
 [54] "                     KLEEITLNYDVNRFLKEMEKSRKSFKQALGSIRNKSKRVVIFKVRNSLLEIFKLYYNN" 
 [55] "                     IGRNKKLYDYINRMLNSLIKEISRR\""                                
 [56] "     gene            complement(4667..5140)"                                     
 [57] "                     /locus_tag=\"BB_A07\""                                      
 [58] "     CDS             complement(4667..5140)"                                     
 [59] "                     /locus_tag=\"BB_A07\""                                      
 [60] "                     /codon_start=1"                                             
 [61] "                     /transl_table=11"                                           
 [62] "                     /product=\"chpAI protein, putative\""                       
 [63] "                     /protein_id=\"AAC66230.2\""                                 
 [64] "                     /translation=\"MKNILLFVILLFFSCKEFNYSDLRRRPSKVLNASNGASNKELKI"
 [65] "                     SFVDSLNDDQKEALFFLEQVVLDSNPDKFNQIFNLNEEKVKEMLVTVVKCLKAKRKAK" 
 [66] "                     MALESSNVANVANAKQQLLQVEKTYIDNLRQSFMTTKNIEEACNLVKNYDASASF\""  
 [67] "     gene            5314..5649"                                                 
 [68] "                     /locus_tag=\"BB_A08\""                                      
 [69] "     CDS             5314..5649"                                                 
 [70] "                     /locus_tag=\"BB_A08\""                                      
 [71] "                     /note=\"identified by match to protein family HMM PF07115\""
 [72] "                     /codon_start=1"                                             
 [73] "                     /transl_table=11"                                           
 [74] "                     /product=\"conserved hypothetical protein\""                
 [75] "                     /protein_id=\"AAC66231.1\""                                 
 [76] "                     /translation=\"MEIKIDEKFNIVFNNDLKLIENIEEQKQRLFFYLKTPKGSLKEN"
 [77] "                     PNYGLDYSFYFKLCKANKLESIKNFFLNLSKELKIDLINVRPSIKNKTITINFFFLGK" 
 [78] "                     DNLKMDFKI\""

I have no idea how to do this from the current list formatting. I can use regex expressions to find specific identifiers like "source", "gene", and "CDS", but I'm lost on how to make it into a dataframe. My dataframe example only shows the information I need, but maybe it would be easier to just capture everything based on white spacing and characters like "/" and "="? Any help on this would be greatly appreciated.

Solution

Pretty big problem. My attempt here. Some comments: In your example you fill the "plasmid" column down, i don't known the criteria used there, but you can "fill" the column using the tidyr::fill function (see coments) I include all tags as columns, binding them with bind_rows. Please filter what you need.

hope may be helpful.

library(tidyverse)


url <- "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/008/685/GCA_000008685.2_ASM868v2/GCA_000008685.2_ASM868v2_genomic.gbff.gz"
file <- basename(url)

download.file(url, destfile = file)

# read gbff file with readLines and split into a list of separate genbank records 
readGBFF <- function (file, text = readLines(file), verbose = FALSE){
  if (is(text, "character")) {
    split_indices <- cumsum(grepl("^LOCUS", text))
    text <- base::split(text, split_indices)
  }
  return(text)
}

GBFFraw <- readGBFF("GCA_000008685.2_ASM868v2_genomic.gbff.gz")

# assign accession numbers to the names of each genbank record 
names(GBFFraw) <- sapply(GBFFraw, function(x) {
  version_line <- x[grep("^VERSION", x)]
  gsub("VERSION\\s+", "", version_line)
})

# capture the features for each genbank record 
GBFFfeat <- lapply(GBFFraw, function(x) {
  i_start <- grep("^FEATURES", x)
  i_end <- grep("^ORIGIN", x)
  paste0(x[(i_start[1]+1):(i_end[1]-1)])
})

# check length of fixet space being consistent among list of lists
range(sapply(GBFFfeat, function(x) {
  max(sapply(gregexpr("^ +", GBFFfeat$AE000790.2), attr, "match.length")) }))
#> [1] 21 21
# 21 21  it is consistently 21

GBFF_df <- mapply(GBFFfeat, names(GBFFfeat), FUN = function(elem, seqid){

  #check first row is not empty
  stopifnot(!grepl("^ {21}" , elem[[1]]))  
  # identify rows 
  rows <- cumsum(!grepl(elem, pattern ="^ {21}"))
  elem_list <- split(elem, rows)


  row_list <- lapply(elem_list, function(text){
    #remove first line of each block and make one single line
    block <- paste(substring(text[-1], 22, nchar(text[-1])), collapse = " ")    
    #identify sequences /tagname="content"
    matches = gregexec("/([a-zA-Z0-9_]+)(?:=(\"[^\"]+\"|[a-zA-Z0-9_]+))?", block)
    
    #separate values and tagnames and remove double quotes 
    values <- gsub("\"([^\"]*)\"","\\1", regmatches(block, matches)[[1]][3,])
    cols <- regmatches(block, matches)[[1]][2,] 
    
    # paste all in a one_line data.frame
    as.data.frame(as.list(
      c(seqid = seqid,
        type = trimws(substr(text[1], 1,21)), 
        region = trimws(substr(text[1], 22,nchar(text[1]))),
        setNames(values, cols)))) 
  })
  # dplyr::bind_rows for binding all dataframes by col name 
  df <- dplyr::bind_rows(row_list)
  # if you want to extend "plasmid" you can use tidyr::fill
  # TODO: tidyr::fill(df, plasmid, .direction = "down")
})

# dplyr::bind_rows for building a one single data.frame
GBFF_df <- dplyr::bind_rows(GBFF_df)

head(GBFF_df)
#>        seqid   type                 region                    organism
#> 1 AE000783.1 source              1..910724 Borreliella burgdorferi B31
#> 2 AE000783.1   gene               105..677                        <NA>
#> 3 AE000783.1    CDS               105..677                        <NA>
#> 4 AE000783.1   gene  complement(768..1796)                        <NA>
#> 5 AE000783.1    CDS  complement(768..1796)                        <NA>
#> 6 AE000783.1   gene complement(1784..3249)                        <NA>
#>      mol_type strain                       type_material      db_xref locus_tag
#> 1 genomic DNA    B31 type strain of Borrelia burgdorferi taxon:224326      <NA>
#> 2        <NA>   <NA>                                <NA>         <NA>   BB_0001
#> 3        <NA>   <NA>                                <NA>         <NA>   BB_0001
#> 4        <NA>   <NA>                                <NA>         <NA>   BB_0002
#> 5        <NA>   <NA>                                <NA>         <NA>   BB_0002
#> 6        <NA>   <NA>                                <NA>         <NA>   BB_0003
#>   codon_start transl_table                                      product
#> 1        <NA>         <NA>                                         <NA>
#> 2        <NA>         <NA>                                         <NA>
#> 3           1           11               conserved hypothetical protein
#> 4        <NA>         <NA>                                         <NA>
#> 5           1           11 glycosyl hydrolase family 3 N domain protein
#> 6        <NA>         <NA>                                         <NA>
#>   protein_id
#> 1       <NA>
#> 2       <NA>
#> 3 AAC66406.1
#> 4       <NA>
#> 5 AAC66400.1
#> 6       <NA>
#>                                                                                                                                                                                                                                                                                                                                                    translation
#> 1                                                                                                                                                                                                                                                                                                                                                         <NA>
#> 2                                                                                                                                                                                                                                                                                                                                                         <NA>
#> 3                                                                                                                                                            MILMKYSAILLICSVNLFCFQNKLTTSRWEFPKEDLIKKKIKIG IIYHNYINSIFYNENYKYIAFIGILTSYNEWIEIQFSPINFFTIPTNKDFISNTYFNL AFTIYITKYSILTDTLAIKFFIGTQIDLTLRTTIFTGKTTHAFLYPILPIITFKFEID FIPNNYSIYYKLSTSFKEFILLDLGISIFI
#> 4                                                                                                                                                                                                                                                                                                                                                         <NA>
#> 5 MDFLKTFSFLFFSFFCLNLIAIESLPEIDYEYFNKDKSDLVDLI KFLGELDFQTILKDRNLFIGIRNLTNFKNVQELNIDDINRIKKINPIGIILFRENLKD AEQTKGLINAIKSHIGHDIFIAIDEEGGIVSRASENKKMGVYNFPAMEHVGGVKDLHL IYKIGEVLAKQLRRLGINLNMAPVADIKFAPHTPLLNRTFGGYSAYNIGLMVEAFIDG MQNNGVFSAIKHFPGLGGTTTDTHKYLAFLPYSKSFLMLNNFVPFVFGRAAKFIMIGH VNVPKISKDITSMSKSIVNIIRENLNITSIMMTDSYDMGAITRSFSNIENAIKKSLSS GVNIVLIP
#> 6                                                                                                                                                                                                                                                                                                                                                         <NA>
#>                                                                                                                                                                                                                                         note
#> 1                                                                                                                                                                                                                                       <NA>
#> 2                                                                                                                                                                                                                                       <NA>
#> 3                                                                                                                                                                                                                                       <NA>
#> 4                                                                                                                                                                                                                                       <NA>
#> 5                                                                                                                                                                                          identified by match to protein family HMM PF00933
#> 6 UTP--glucose-1-phosphate uridylyltransferase subfamily, authentic frameshift; this gene contains a frame shift which is not the result of sequencing error; identified by similarity to GB:AAB93994.1; match to protein family HMM PF01704
#>   pseudo gene EC_number ribosomal_slippage EC_number.1 plasmid
#> 1   <NA> <NA>      <NA>               <NA>        <NA>    <NA>
#> 2   <NA> <NA>      <NA>               <NA>        <NA>    <NA>
#> 3   <NA> <NA>      <NA>               <NA>        <NA>    <NA>
#> 4   <NA> <NA>      <NA>               <NA>        <NA>    <NA>
#> 5   <NA> <NA>      <NA>               <NA>        <NA>    <NA>
#> 6        <NA>      <NA>               <NA>        <NA>    <NA>