Search code examples
pythonpandasmatplotlibbiopythonfasta

How to write files without overwriting them with each loop iteration


The following code is to analyze the amino acid composition of FASTA sequences (.faa files)

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import fastaparser
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

filename = input("Please enter the full path of the amino acid sequence file!: ")
pH_input = input("At which pH should the analysis be conducted? ")
flexibility_ynu = input("Do you wish a flexibility analysis?\n   (1) Yes\n   (0) No\n")

if pH_input == "":
    pH= 7.4
elif pH_input != "":
    pH = pH_input

f = open(filename + "_analysis.txt","w+")
for record in SeqIO.parse(filename, 'fasta'):
    X = ProteinAnalysis(str(record.seq))
    print("ANALYSIS OF", record, "\n ----------- \n -----------", file=f)
    #
    pd_count_amino_acids = pd.DataFrame(X.count_amino_acids(), index=[1])
    print("number of amino acids: \n",pd_count_amino_acids , file=f)
    plt_acc = pd_count_amino_acids.plot.bar()
    plt.savefig(filename + "_count_amino_acids_plot.pdf")
    #
    pd_get_amino_acids_percent = pd.DataFrame(X.get_amino_acids_percent(), index=[1])
    print("\n percentage of amino acids: \n", pd_get_amino_acids_percent, file=f)
    plt_acp = pd_get_amino_acids_percent.plot.bar()
    plt.savefig(filename + "amino_acids_percent_plot.pdf")
    #
    print("\n molecular weight: {:.2f}".format(X.molecular_weight()), file=f)
    print("\n aromaticity: {:.2f}".format(X.aromaticity()), file=f)
    print("\n instability index: {:.2f}".format(X.instability_index()), file=f)
    if flexibility_ynu == "1":
        print("\n flexibility: ", X.flexibility(), file=f)
    print("\n IEP: ", X.isoelectric_point(), file=f)
    print("therefore its charge at pH = ",pH," is {:.2f}".format(X.charge_at_pH(pH)), file=f)
    print("secondary structure fraction: (Helix, Turn, Sheet): ", X.secondary_structure_fraction(), "\n\n\n", file=f)
f.close()
print("done")

I now would like bar plots for both the absolute and relative number of amino acids, but create a unique plot for every FASTA ID.

For example, NC_011544 has 5 IDs, so I'd like to generate 10 unique plots (2 per ID, one for absolute, and one for relative number).

Is there any way to do this?

NC_011544.faa

>gi|212671454|ref|YP_002308464.1| replicase [Hosta virus X]
MARLREVFSSFTEPNLKTIVQQETYKLAKAELKTIQTYNPYAQTKDAADLLEDLGINTNPHAVTAHTHAA
AKSIENDLYGITSHYLPKTPITFLFMKRGKLQFFKRGPQHNDLFFYTTHEPKDVIRYQSEDQTADMFRVP
TSTGFIGDTLHFLSLKYLHRLFLKNPNLNTLYATMVLPPEAMYRMASIYPEIYQIQYQEDGFLYIPGGHG
GAAYFHTYDTLTWLRVGQFQAKEFTAHLPKVGDKGANHLFIIQRADLKTPKYRTFVPRRKWVTLPNIFLP
STQANHLFIIQRADLKTPKYRTFVPRRKWVTSNIFLPKHTNARKPILKQTMMQLFLYEKSVKEITFRDVF
AKIRQLIQTKDLEQFDPDELVRLANYVMHTSKLLEKDPYELIEGQGKLQDLVNPIKTWVSEKWQNWFGWK
DYTRLIRALKWVDVDLVLRVMNTRSTPTGIQTSELLPDEAGPPKSKKKRGGKKIPSPEPSRNCRSKSKRT
RGNRAQREKEPHRRKLRWQKENFQRVTVQVHQAPKGDPSPLARFSQSLKELPRRSQPRRLSKFQDFLMSS
TQTRFQIPSSLNRRAGHWRPKQQGTPPTTQEAGTEGPPTTQPGKPTASSPRAAPQPTANAETMEKGSQAS
SATTRGRDPVTDRTREQAPTNLTPEEEALPWKHWLKQLKAVGFKGNETQMDGDGTSISPIEQIKSCPGKP
KSVSKEILETLRSGHAPNFWKPDASRARAYTSDIKNRRTGAAVHMAPQAWKETMDFIAENAERTLHILRH
PWRRRFREEQMSSRDAHKFHFLFDETLVVCPTNELRRDWIDKLPLSEPGSVLTFERALMNPAKGTVIFDD
YTKLPAGFIEAYSICQPNVELVILTGDAKQASHHESNDNAMIAGLDPAAFEFSKFCRYYLNATHRNPRNL
ANALGIYSEKPGNLKVTFTNHLLPEMHILVPSLLKKATLEELGHKCSTYAGCQGVTLSKVQIYLDSNTTL
CSNEVLYTALSRAVEQINFVNSGPFNGPFWAKLEATPYLKTFLRLTREEKINEITPEEPKPKEPEPPKTH
FPVETSAHLYSSITEEMPEKHAREIYNKTHGHTNCVQTDEPLVQMFAHQQAKDEALFWETIEARLRITTS
EANVQELNEKRDIGDLLFHAYHKAMGLPKDPIPFENDLWETCAQEVQQTYLSKPINLIKNGEKRQGPDFD
KNAIMLFLKSQWVKKMEKLGAPTIKPGQTIASFHQITVMLYGTMARYMRRIRDRFCPKHILINCEKTPTQ
ISDFVKAQWDFSDFAYANDFTAFDQSQDGAMLQFEIIKAKFHNIPEDIILGYMDIKTNAKIFLGTLAIMR
LTGEGPTFDANTECNIAYTHLRFNVPENVAQVYAGDDSALSKVCPEKDSFKQFADRLTLKSKPQVFPQTQ
GAWAEFCGLLITPRGIIKDPVKLHASWVLATKLGTLQQIKCVNSYGEDLKLSYDLGDHLQELLSESQCRT
HQVTVRELVKFAGKVEKHQAEIRSVANGNIRQLPFFY
>gi|212671455|ref|YP_002308465.1| 26 kDa protein [Hosta virus X]
MATFASFLSSTRPDFERTNTPLTKPLVIHAVAGAGKTTLLRDFLRANPLTNAQTLGTPDCPTLDGAYIRP
FSGPVANLVNILDEYTAHRHGSWDVLIADPLQHYERAKLPHYICKRSHRLCPATARLLRKLGLDIHSYRE
DESEISFSDIFSGQLEGTVLPLTPLCKDLLERHSCPFKCPSEFIGEQDDIITVVSEIPLSKHPDKTALYR
ALTRHTRRLNVLAPPPYPTP
>gi|212671456|ref|YP_002308466.1| 13 kDa protein [Hosta virus X]
MSSPHRLTPPPNYTPVLLAVVIGVGLAVVTNQLTRSTLPHVGDNIHSLPHGGNYKDGTKSVIYRGPAPFQ
RSHSTAPPFNAVLLLTFAIWFLSCRTRRAAIGIHVCHTCSQTREQQ
>gi|212671457|ref|YP_002308467.1| 8 kDa protein [Hosta virus X]
MQSFCSHLRSGSFPVVLGALLLAFTCATLVLRLGNNNSNNCLIYVDGARAFLEGNCAGISAEVVAALRPH
SHAG
>gi|212671458|ref|YP_002308468.1| coat protein [Hosta virus X]
MASDAPTPPAAPSPVTFTAPTQEQLTSLALPIISTRLPSPDVLNQISVKWQELGVPTASISSTAIALCMA
CYHSGSSGSTLIPGLAPGTTVNYTSLAAAVKSLATLREFARYFAPIIWNYAIEHKIPPANWAAMGYKENT
KYAAFDTFDSILNPAALQPTGGLIRQPTEEELLAHQANSALHIFDSLRNDFASTDGRVTRGHITSNVNSL
NYLPAPEGSS

Solution

    • The code was already creating a plot for each record, however, the same file name was being used for everything, so each new plot and analysis.txt was overwriting the previous one.
    • Extract the id from record, as record_id, and use that to create unique file names with each loop.
    • The following code will now generate a percent_plot.pdf, count_plot.pdf, and an analysis.txt for each id
    • Use with open, which will automatically close the file f
    • .T is used to transpose the dataframe when plotting, which places the amino acid names on the axis, thereby making the plot easier to read.
    import pandas
    import matplotlib.pyplot as plt
    from Bio import SeqIO
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    
    # set pandas display option
    pd.set_option('display.max_columns', None)
    
    # input selections were removed for testing
    filename = 'NC_011544.faa'
    flexibility_ynu =  '1'
    pH = 7.4
    
    for record in SeqIO.parse(filename, 'fasta'):
        # get each record id to be used for unique file names
        record_id = record.id.split('|')[1]
        print(record)
        # open the file and use the record id as part of the name, so there will be a unique file for each id
        with open(f"{filename}_{record_id}_analysis.txt","w+") as f:
    
            X = ProteinAnalysis(str(record.seq))
            print("ANALYSIS OF", record, "\n ----------- \n -----------", file=f)
            #
            pd_count_amino_acids = pd.DataFrame(X.count_amino_acids(), index=[1])
            display(pd_count_amino_acids)  # display is for jupyter notebook, otherwise use print
            print("number of amino acids: \n",pd_count_amino_acids , file=f)
            plt_acc = pd_count_amino_acids.T.plot.bar(legend=False, figsize=(7, 5))
            plt.title(f'Number of Amino Acids for {record_id} in {filename}')
            plt.xticks(rotation=0)
            # add the record id as part of the file name
            plt.savefig(f'{filename}_{record_id}_count_amino_acids_plot.pdf')
            plt.show()
            #
            pd_get_amino_acids_percent = pd.DataFrame(X.get_amino_acids_percent(), index=[1])
            display(pd_get_amino_acids_percent.round(2))  # display is for jupyter notebook, otherwise use print
            print("\n percentage of amino acids: \n", pd_get_amino_acids_percent, file=f)
            plt_acp = pd_get_amino_acids_percent.T.plot.bar(legend=False, figsize=(7, 5))
            plt.title(f'Percentage of Amino Acids for {record_id} in {filename}')
            plt.xticks(rotation=0)
            # add the record id as part of the file name
            plt.savefig(f'{filename}_{record_id}_amino_acids_percent_plot.pdf')
            plt.show()
            #
            print("\n molecular weight: {:.2f}".format(X.molecular_weight()), file=f)
            print("\n aromaticity: {:.2f}".format(X.aromaticity()), file=f)
            print("\n instability index: {:.2f}".format(X.instability_index()), file=f)
    
            if flexibility_ynu == "1":
                print("\n flexibility: ", X.flexibility(), file=f)
    
            print("\n IEP: ", X.isoelectric_point(), file=f)
            print("therefore its charge at pH = ",pH," is {:.2f}".format(X.charge_at_pH(pH)), file=f)
            print("secondary structure fraction: (Helix, Turn, Sheet): ", X.secondary_structure_fraction(), "\n\n\n", file=f)
            print('\n')
    

    First ID

    enter image description here

    Second ID

    enter image description here