Bioconductor package functions for parsing GenBank files into R never seem to work for bacterial genomes (nobody cares about us microbiologists, I guess), so I've written the following code to read in a GBFF file line-by-line and start extracting FEATURE header information from https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/008/685/GCA_000008685.2_ASM868v2/GCA_000008685.2_ASM868v2_genomic.gbff.gz
library(tidyverse)
# read gbff file with readLines and split into a list of separate genbank records
readGBFF <- function (file, text = readLines(file), verbose = FALSE){
if (is(text, "character")) {
split_indices <- cumsum(grepl("^LOCUS", text))
text <- base::split(text, split_indices)
}
return(text)
}
GBFFraw <- readGBFF("GCA_000008685.2_ASM868v2_genomic.gbff.gz")
# assign accession numbers to the names of each genbank record
names(GBFFraw) <- sapply(GBFFraw, function(x) {
version_line <- x[grep("^VERSION", x)]
gsub("VERSION\\s+", "", version_line)
})
# capture the features for each genbank record
GBFFfeat <- lapply(GBFFraw, function(x) {
i_start <- grep("^FEATURES", x)
i_end <- grep("^ORIGIN", x)
paste0(x[(i_start[1]+1):(i_end[1]-1)])
})
From the GBFFfeat list (example of list element below), I would like to make a dataframe like this:
> GBFFfeat$AE000790.2
[1] " source 1..53657"
[2] " /organism=\"Borreliella burgdorferi B31\""
[3] " /mol_type=\"genomic DNA\""
[4] " /strain=\"B31\""
[5] " /type_material=\"type strain of Borrelia burgdorferi\""
[6] " /db_xref=\"taxon:224326\""
[7] " /plasmid=\"lp54\""
[8] " gene 652..1137"
[9] " /locus_tag=\"BB_A01\""
[10] " /note=\"outer membrane protein P13; this gene contains a"
[11] " frame shift which may be the result of a sequencing error;"
[12] " identified by match to protein family HMM PF05628\""
[13] " /pseudo"
[14] " gene 1461..1970"
[15] " /locus_tag=\"BB_A03\""
[16] " CDS 1461..1970"
[17] " /locus_tag=\"BB_A03\""
[18] " /codon_start=1"
[19] " /transl_table=11"
[20] " /product=\"outer membrane protein\""
[21] " /protein_id=\"AAC66227.1\""
[22] " /translation=\"MKKTIIVFIILAFMLNCKNKSNDAEPNNDLDEKSQAKSNLVDED"
[23] " RIEFSKATPLEKLVSRLNLNNTEKETLTFLTNLLKEKLVDPNIGLHFKNSGGDESKIE"
[24] " ESVQKFLSELKEDEIKDLLAKIKENKDKKEKDPEELNTYKSILASGFDGIFNQADSKT"
[25] " TLNKLKDTI\""
[26] " gene complement(2045..2893)"
[27] " /locus_tag=\"BB_A04\""
[28] " CDS complement(2045..2893)"
[29] " /locus_tag=\"BB_A04\""
[30] " /codon_start=1"
[31] " /transl_table=11"
[32] " /product=\"S2 antigen\""
[33] " /protein_id=\"AAC66228.1\""
[34] " /translation=\"MKRVIVSFVVLILGCNLDDNSKMERKGSNKLIRESGSDRRGQEN"
[35] " RALGAMNFGLFSGDSGVVYDLQNYETLKALENKNKFIDYSKIEFLEGTKSINAFIWAV"
[36] " SVRWIKIKARDLFGECGDFIKELKGIKYSYLVSPVDGSYISYAMPIIVFETTRESDPL"
[37] " YSVSGFKLISKGNDINFNENKSGFWGRLPMSEKSVESGLVTAYPFGSSDAKKVIEAFA"
[38] " SLYNNGTWSDMIAEITIKSKQYPKNEKVYRITLDSQLFNVAMKKIIEKYPKIKSASFA"
[39] " FNSLIN\""
[40] " gene complement(3003..4256)"
[41] " /locus_tag=\"BB_A05\""
[42] " CDS complement(3003..4256)"
[43] " /locus_tag=\"BB_A05\""
[44] " /codon_start=1"
[45] " /transl_table=11"
[46] " /product=\"S1 antigen\""
[47] " /protein_id=\"AAC66229.1\""
[48] " /translation=\"MNKIGIAFIISFLLFVNCKGKSLEEDLKSTTSNNKQNLISNEKK"
[49] " SLNSKNNRLKDSRLSNFESKKNDQTLKKSKDFKKDLQTLRNSKNLMPKDLDQSSNDFE"
[50] " NLDNSESLQEASSKHNIGKSRYGKALLKNDHDEIWIPHLNLEEDKNFEFFKKSLQNDE"
[51] " NRYALGGWLLNNDEVLVKYRYSEKDVNQFLIDIGKKRWGDLSSKMSTLVRLIGNYSDK"
[52] " SDREDEISLLDMNLCQQFYLTKINAGGSSADILVALEKTIDQQISGVSKELLELKNFS"
[53] " LTTKSELDWYLNWKRNLTDEEEETLQCCRVLLGGELDFENLDDLFKRLGKEYSRLILR"
[54] " KLEEITLNYDVNRFLKEMEKSRKSFKQALGSIRNKSKRVVIFKVRNSLLEIFKLYYNN"
[55] " IGRNKKLYDYINRMLNSLIKEISRR\""
[56] " gene complement(4667..5140)"
[57] " /locus_tag=\"BB_A07\""
[58] " CDS complement(4667..5140)"
[59] " /locus_tag=\"BB_A07\""
[60] " /codon_start=1"
[61] " /transl_table=11"
[62] " /product=\"chpAI protein, putative\""
[63] " /protein_id=\"AAC66230.2\""
[64] " /translation=\"MKNILLFVILLFFSCKEFNYSDLRRRPSKVLNASNGASNKELKI"
[65] " SFVDSLNDDQKEALFFLEQVVLDSNPDKFNQIFNLNEEKVKEMLVTVVKCLKAKRKAK"
[66] " MALESSNVANVANAKQQLLQVEKTYIDNLRQSFMTTKNIEEACNLVKNYDASASF\""
[67] " gene 5314..5649"
[68] " /locus_tag=\"BB_A08\""
[69] " CDS 5314..5649"
[70] " /locus_tag=\"BB_A08\""
[71] " /note=\"identified by match to protein family HMM PF07115\""
[72] " /codon_start=1"
[73] " /transl_table=11"
[74] " /product=\"conserved hypothetical protein\""
[75] " /protein_id=\"AAC66231.1\""
[76] " /translation=\"MEIKIDEKFNIVFNNDLKLIENIEEQKQRLFFYLKTPKGSLKEN"
[77] " PNYGLDYSFYFKLCKANKLESIKNFFLNLSKELKIDLINVRPSIKNKTITINFFFLGK"
[78] " DNLKMDFKI\""
I have no idea how to do this from the current list formatting. I can use regex expressions to find specific identifiers like "source", "gene", and "CDS", but I'm lost on how to make it into a dataframe. My dataframe example only shows the information I need, but maybe it would be easier to just capture everything based on white spacing and characters like "/" and "="? Any help on this would be greatly appreciated.
Pretty big problem. My attempt here. Some comments: In your example you fill the "plasmid" column down, i don't known the criteria used there, but you can "fill" the column using the tidyr::fill function (see coments) I include all tags as columns, binding them with bind_rows. Please filter what you need.
hope may be helpful.
library(tidyverse)
url <- "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/008/685/GCA_000008685.2_ASM868v2/GCA_000008685.2_ASM868v2_genomic.gbff.gz"
file <- basename(url)
download.file(url, destfile = file)
# read gbff file with readLines and split into a list of separate genbank records
readGBFF <- function (file, text = readLines(file), verbose = FALSE){
if (is(text, "character")) {
split_indices <- cumsum(grepl("^LOCUS", text))
text <- base::split(text, split_indices)
}
return(text)
}
GBFFraw <- readGBFF("GCA_000008685.2_ASM868v2_genomic.gbff.gz")
# assign accession numbers to the names of each genbank record
names(GBFFraw) <- sapply(GBFFraw, function(x) {
version_line <- x[grep("^VERSION", x)]
gsub("VERSION\\s+", "", version_line)
})
# capture the features for each genbank record
GBFFfeat <- lapply(GBFFraw, function(x) {
i_start <- grep("^FEATURES", x)
i_end <- grep("^ORIGIN", x)
paste0(x[(i_start[1]+1):(i_end[1]-1)])
})
# check length of fixet space being consistent among list of lists
range(sapply(GBFFfeat, function(x) {
max(sapply(gregexpr("^ +", GBFFfeat$AE000790.2), attr, "match.length")) }))
#> [1] 21 21
# 21 21 it is consistently 21
GBFF_df <- mapply(GBFFfeat, names(GBFFfeat), FUN = function(elem, seqid){
#check first row is not empty
stopifnot(!grepl("^ {21}" , elem[[1]]))
# identify rows
rows <- cumsum(!grepl(elem, pattern ="^ {21}"))
elem_list <- split(elem, rows)
row_list <- lapply(elem_list, function(text){
#remove first line of each block and make one single line
block <- paste(substring(text[-1], 22, nchar(text[-1])), collapse = " ")
#identify sequences /tagname="content"
matches = gregexec("/([a-zA-Z0-9_]+)(?:=(\"[^\"]+\"|[a-zA-Z0-9_]+))?", block)
#separate values and tagnames and remove double quotes
values <- gsub("\"([^\"]*)\"","\\1", regmatches(block, matches)[[1]][3,])
cols <- regmatches(block, matches)[[1]][2,]
# paste all in a one_line data.frame
as.data.frame(as.list(
c(seqid = seqid,
type = trimws(substr(text[1], 1,21)),
region = trimws(substr(text[1], 22,nchar(text[1]))),
setNames(values, cols))))
})
# dplyr::bind_rows for binding all dataframes by col name
df <- dplyr::bind_rows(row_list)
# if you want to extend "plasmid" you can use tidyr::fill
# TODO: tidyr::fill(df, plasmid, .direction = "down")
})
# dplyr::bind_rows for building a one single data.frame
GBFF_df <- dplyr::bind_rows(GBFF_df)
head(GBFF_df)
#> seqid type region organism
#> 1 AE000783.1 source 1..910724 Borreliella burgdorferi B31
#> 2 AE000783.1 gene 105..677 <NA>
#> 3 AE000783.1 CDS 105..677 <NA>
#> 4 AE000783.1 gene complement(768..1796) <NA>
#> 5 AE000783.1 CDS complement(768..1796) <NA>
#> 6 AE000783.1 gene complement(1784..3249) <NA>
#> mol_type strain type_material db_xref locus_tag
#> 1 genomic DNA B31 type strain of Borrelia burgdorferi taxon:224326 <NA>
#> 2 <NA> <NA> <NA> <NA> BB_0001
#> 3 <NA> <NA> <NA> <NA> BB_0001
#> 4 <NA> <NA> <NA> <NA> BB_0002
#> 5 <NA> <NA> <NA> <NA> BB_0002
#> 6 <NA> <NA> <NA> <NA> BB_0003
#> codon_start transl_table product
#> 1 <NA> <NA> <NA>
#> 2 <NA> <NA> <NA>
#> 3 1 11 conserved hypothetical protein
#> 4 <NA> <NA> <NA>
#> 5 1 11 glycosyl hydrolase family 3 N domain protein
#> 6 <NA> <NA> <NA>
#> protein_id
#> 1 <NA>
#> 2 <NA>
#> 3 AAC66406.1
#> 4 <NA>
#> 5 AAC66400.1
#> 6 <NA>
#> translation
#> 1 <NA>
#> 2 <NA>
#> 3 MILMKYSAILLICSVNLFCFQNKLTTSRWEFPKEDLIKKKIKIG IIYHNYINSIFYNENYKYIAFIGILTSYNEWIEIQFSPINFFTIPTNKDFISNTYFNL AFTIYITKYSILTDTLAIKFFIGTQIDLTLRTTIFTGKTTHAFLYPILPIITFKFEID FIPNNYSIYYKLSTSFKEFILLDLGISIFI
#> 4 <NA>
#> 5 MDFLKTFSFLFFSFFCLNLIAIESLPEIDYEYFNKDKSDLVDLI KFLGELDFQTILKDRNLFIGIRNLTNFKNVQELNIDDINRIKKINPIGIILFRENLKD AEQTKGLINAIKSHIGHDIFIAIDEEGGIVSRASENKKMGVYNFPAMEHVGGVKDLHL IYKIGEVLAKQLRRLGINLNMAPVADIKFAPHTPLLNRTFGGYSAYNIGLMVEAFIDG MQNNGVFSAIKHFPGLGGTTTDTHKYLAFLPYSKSFLMLNNFVPFVFGRAAKFIMIGH VNVPKISKDITSMSKSIVNIIRENLNITSIMMTDSYDMGAITRSFSNIENAIKKSLSS GVNIVLIP
#> 6 <NA>
#> note
#> 1 <NA>
#> 2 <NA>
#> 3 <NA>
#> 4 <NA>
#> 5 identified by match to protein family HMM PF00933
#> 6 UTP--glucose-1-phosphate uridylyltransferase subfamily, authentic frameshift; this gene contains a frame shift which is not the result of sequencing error; identified by similarity to GB:AAB93994.1; match to protein family HMM PF01704
#> pseudo gene EC_number ribosomal_slippage EC_number.1 plasmid
#> 1 <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 <NA> <NA> <NA> <NA> <NA> <NA>
#> 3 <NA> <NA> <NA> <NA> <NA> <NA>
#> 4 <NA> <NA> <NA> <NA> <NA> <NA>
#> 5 <NA> <NA> <NA> <NA> <NA> <NA>
#> 6 <NA> <NA> <NA> <NA> <NA>