Search code examples
pythonbioinformaticsbiopythonncbi

How to use EPOST and than use ESEARCH in biopython?


I have a lit of gene ids:

id_list = ["19304878", "18606172", "16403221", "16377612", "14871861", "14630660"]

how I can take just the nucleotide sequence of this genes using EPOST and ESEARCH in biopython?


Solution

  • You probably want to be using efetch:

    from Bio import SeqIO, Entrez
    
    id_list = ["19304878", "18606172", "16403221", 
               "16377612", "14871861", "14630660"]
    
    # Always tell NCBI who you are!
    Entrez.email = "[email protected]"
    
    # Fetch results from NCBI and write to output
    with open("output.fa", "w") as out_handle:
        for gene_id in id_list:
            fa = Entrez.efetch(db="nucleotide", id=gene_id, rettype="fasta")
            SeqIO.write(SeqIO.parse(fa, "fasta"), out_handle, "fasta")
    

    This is a python script that will do it:

    get_fasta.py

    $ python3 get_fasta.py -h
    usage: get_fasta.py [-h] -e EMAIL -g GENES [-o OUTPUT]
    
    Retrieve FASTA from NCBI using gene IDs
    
    General options:
      -h, --help            Show this help and exit
    
    Inputs:
      -e EMAIL, --email EMAIL
                            Your email address
      -g GENES, --genes GENES
                            A file containing a list of gene IDs, with one ID on
                            each line
    
    Outputs:
      -o OUTPUT, --output OUTPUT
                            The file to write FASTA to. If not provided use STDOUT
    

    #! /usr/bin/env python3
    
    from argparse import ArgumentParser
    from pathlib import Path
    import sys
    
    from Bio import SeqIO, Entrez
    
    
    def main():
        # Get our command-line arguments
        parser, args = get_args()
    
        # Always tell NCBI who you are!
        Entrez.email = args.email
    
        # Check the input file is a file
        if not Path(args.genes).is_file():
            parser.error("--genes parameter must be a file")
    
        # Read in the gene IDs
        with open(args.genes, "r") as file_handle:
            id_list = [line.strip() for line in file_handle]
    
        #  Specify output as a file of STDOUT
        if args.output:
            out_handle = open(args.output, "w")
        else:
            out_handle = sys.stdout
    
        # Fetch results from NCBI and write to output
        for gene_id in id_list:
            fa = Entrez.efetch(db="nucleotide", id=gene_id, rettype="fasta")
            SeqIO.write(SeqIO.parse(fa, "fasta"), out_handle, "fasta")
    
        # Close handle if it's a file
        if out_handle is not sys.stdout:
            out_handle.close()
    
    
    def get_args():
        parser = ArgumentParser(
            description="Retrieve FASTA from NCBI using gene IDs",
            add_help=False,
        )
        general = parser.add_argument_group(
            title='General options'
        )
        general.add_argument(
            "-h",
            "--help",
            action="help",
            help="Show this help and exit",
        )
        in_args = parser.add_argument_group(title='Inputs')
        in_args.add_argument(
            "-e",
            "--email",
            help="Your email address",
            required=True,
        )
        in_args.add_argument(
            "-g",
            "--genes",
            help="A file containing a list of gene IDs, with one ID on each line",
            required=True,
        )
        out_args = parser.add_argument_group(title='Outputs')
        out_args.add_argument(
            "-o",
            "--output",
            help="The file to write FASTA to. If not provided use STDOUT",
        )
        return parser, parser.parse_args()
    
    
    if __name__ == "__main__":
        main()