The following code is to analyze the amino acid composition of FASTA sequences (.faa files)
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import fastaparser
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
filename = input("Please enter the full path of the amino acid sequence file!: ")
pH_input = input("At which pH should the analysis be conducted? ")
flexibility_ynu = input("Do you wish a flexibility analysis?\n (1) Yes\n (0) No\n")
if pH_input == "":
pH= 7.4
elif pH_input != "":
pH = pH_input
f = open(filename + "_analysis.txt","w+")
for record in SeqIO.parse(filename, 'fasta'):
X = ProteinAnalysis(str(record.seq))
print("ANALYSIS OF", record, "\n ----------- \n -----------", file=f)
#
pd_count_amino_acids = pd.DataFrame(X.count_amino_acids(), index=[1])
print("number of amino acids: \n",pd_count_amino_acids , file=f)
plt_acc = pd_count_amino_acids.plot.bar()
plt.savefig(filename + "_count_amino_acids_plot.pdf")
#
pd_get_amino_acids_percent = pd.DataFrame(X.get_amino_acids_percent(), index=[1])
print("\n percentage of amino acids: \n", pd_get_amino_acids_percent, file=f)
plt_acp = pd_get_amino_acids_percent.plot.bar()
plt.savefig(filename + "amino_acids_percent_plot.pdf")
#
print("\n molecular weight: {:.2f}".format(X.molecular_weight()), file=f)
print("\n aromaticity: {:.2f}".format(X.aromaticity()), file=f)
print("\n instability index: {:.2f}".format(X.instability_index()), file=f)
if flexibility_ynu == "1":
print("\n flexibility: ", X.flexibility(), file=f)
print("\n IEP: ", X.isoelectric_point(), file=f)
print("therefore its charge at pH = ",pH," is {:.2f}".format(X.charge_at_pH(pH)), file=f)
print("secondary structure fraction: (Helix, Turn, Sheet): ", X.secondary_structure_fraction(), "\n\n\n", file=f)
f.close()
print("done")
I now would like bar plots for both the absolute and relative number of amino acids, but create a unique plot for every FASTA ID.
For example, NC_011544 has 5 IDs, so I'd like to generate 10 unique plots (2 per ID, one for absolute, and one for relative number).
Is there any way to do this?
NC_011544.faa
>gi|212671454|ref|YP_002308464.1| replicase [Hosta virus X]
MARLREVFSSFTEPNLKTIVQQETYKLAKAELKTIQTYNPYAQTKDAADLLEDLGINTNPHAVTAHTHAA
AKSIENDLYGITSHYLPKTPITFLFMKRGKLQFFKRGPQHNDLFFYTTHEPKDVIRYQSEDQTADMFRVP
TSTGFIGDTLHFLSLKYLHRLFLKNPNLNTLYATMVLPPEAMYRMASIYPEIYQIQYQEDGFLYIPGGHG
GAAYFHTYDTLTWLRVGQFQAKEFTAHLPKVGDKGANHLFIIQRADLKTPKYRTFVPRRKWVTLPNIFLP
STQANHLFIIQRADLKTPKYRTFVPRRKWVTSNIFLPKHTNARKPILKQTMMQLFLYEKSVKEITFRDVF
AKIRQLIQTKDLEQFDPDELVRLANYVMHTSKLLEKDPYELIEGQGKLQDLVNPIKTWVSEKWQNWFGWK
DYTRLIRALKWVDVDLVLRVMNTRSTPTGIQTSELLPDEAGPPKSKKKRGGKKIPSPEPSRNCRSKSKRT
RGNRAQREKEPHRRKLRWQKENFQRVTVQVHQAPKGDPSPLARFSQSLKELPRRSQPRRLSKFQDFLMSS
TQTRFQIPSSLNRRAGHWRPKQQGTPPTTQEAGTEGPPTTQPGKPTASSPRAAPQPTANAETMEKGSQAS
SATTRGRDPVTDRTREQAPTNLTPEEEALPWKHWLKQLKAVGFKGNETQMDGDGTSISPIEQIKSCPGKP
KSVSKEILETLRSGHAPNFWKPDASRARAYTSDIKNRRTGAAVHMAPQAWKETMDFIAENAERTLHILRH
PWRRRFREEQMSSRDAHKFHFLFDETLVVCPTNELRRDWIDKLPLSEPGSVLTFERALMNPAKGTVIFDD
YTKLPAGFIEAYSICQPNVELVILTGDAKQASHHESNDNAMIAGLDPAAFEFSKFCRYYLNATHRNPRNL
ANALGIYSEKPGNLKVTFTNHLLPEMHILVPSLLKKATLEELGHKCSTYAGCQGVTLSKVQIYLDSNTTL
CSNEVLYTALSRAVEQINFVNSGPFNGPFWAKLEATPYLKTFLRLTREEKINEITPEEPKPKEPEPPKTH
FPVETSAHLYSSITEEMPEKHAREIYNKTHGHTNCVQTDEPLVQMFAHQQAKDEALFWETIEARLRITTS
EANVQELNEKRDIGDLLFHAYHKAMGLPKDPIPFENDLWETCAQEVQQTYLSKPINLIKNGEKRQGPDFD
KNAIMLFLKSQWVKKMEKLGAPTIKPGQTIASFHQITVMLYGTMARYMRRIRDRFCPKHILINCEKTPTQ
ISDFVKAQWDFSDFAYANDFTAFDQSQDGAMLQFEIIKAKFHNIPEDIILGYMDIKTNAKIFLGTLAIMR
LTGEGPTFDANTECNIAYTHLRFNVPENVAQVYAGDDSALSKVCPEKDSFKQFADRLTLKSKPQVFPQTQ
GAWAEFCGLLITPRGIIKDPVKLHASWVLATKLGTLQQIKCVNSYGEDLKLSYDLGDHLQELLSESQCRT
HQVTVRELVKFAGKVEKHQAEIRSVANGNIRQLPFFY
>gi|212671455|ref|YP_002308465.1| 26 kDa protein [Hosta virus X]
MATFASFLSSTRPDFERTNTPLTKPLVIHAVAGAGKTTLLRDFLRANPLTNAQTLGTPDCPTLDGAYIRP
FSGPVANLVNILDEYTAHRHGSWDVLIADPLQHYERAKLPHYICKRSHRLCPATARLLRKLGLDIHSYRE
DESEISFSDIFSGQLEGTVLPLTPLCKDLLERHSCPFKCPSEFIGEQDDIITVVSEIPLSKHPDKTALYR
ALTRHTRRLNVLAPPPYPTP
>gi|212671456|ref|YP_002308466.1| 13 kDa protein [Hosta virus X]
MSSPHRLTPPPNYTPVLLAVVIGVGLAVVTNQLTRSTLPHVGDNIHSLPHGGNYKDGTKSVIYRGPAPFQ
RSHSTAPPFNAVLLLTFAIWFLSCRTRRAAIGIHVCHTCSQTREQQ
>gi|212671457|ref|YP_002308467.1| 8 kDa protein [Hosta virus X]
MQSFCSHLRSGSFPVVLGALLLAFTCATLVLRLGNNNSNNCLIYVDGARAFLEGNCAGISAEVVAALRPH
SHAG
>gi|212671458|ref|YP_002308468.1| coat protein [Hosta virus X]
MASDAPTPPAAPSPVTFTAPTQEQLTSLALPIISTRLPSPDVLNQISVKWQELGVPTASISSTAIALCMA
CYHSGSSGSTLIPGLAPGTTVNYTSLAAAVKSLATLREFARYFAPIIWNYAIEHKIPPANWAAMGYKENT
KYAAFDTFDSILNPAALQPTGGLIRQPTEEELLAHQANSALHIFDSLRNDFASTDGRVTRGHITSNVNSL
NYLPAPEGSS
analysis.txt
was overwriting the previous one.id
from record
, as record_id
, and use that to create unique file names with each loop.percent_plot.pdf
, count_plot.pdf
, and an analysis.txt
for each id
with open
, which will automatically close the file f
.T
is used to transpose the dataframe when plotting, which places the amino acid names on the axis, thereby making the plot easier to read.import pandas
import matplotlib.pyplot as plt
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
# set pandas display option
pd.set_option('display.max_columns', None)
# input selections were removed for testing
filename = 'NC_011544.faa'
flexibility_ynu = '1'
pH = 7.4
for record in SeqIO.parse(filename, 'fasta'):
# get each record id to be used for unique file names
record_id = record.id.split('|')[1]
print(record)
# open the file and use the record id as part of the name, so there will be a unique file for each id
with open(f"{filename}_{record_id}_analysis.txt","w+") as f:
X = ProteinAnalysis(str(record.seq))
print("ANALYSIS OF", record, "\n ----------- \n -----------", file=f)
#
pd_count_amino_acids = pd.DataFrame(X.count_amino_acids(), index=[1])
display(pd_count_amino_acids) # display is for jupyter notebook, otherwise use print
print("number of amino acids: \n",pd_count_amino_acids , file=f)
plt_acc = pd_count_amino_acids.T.plot.bar(legend=False, figsize=(7, 5))
plt.title(f'Number of Amino Acids for {record_id} in {filename}')
plt.xticks(rotation=0)
# add the record id as part of the file name
plt.savefig(f'{filename}_{record_id}_count_amino_acids_plot.pdf')
plt.show()
#
pd_get_amino_acids_percent = pd.DataFrame(X.get_amino_acids_percent(), index=[1])
display(pd_get_amino_acids_percent.round(2)) # display is for jupyter notebook, otherwise use print
print("\n percentage of amino acids: \n", pd_get_amino_acids_percent, file=f)
plt_acp = pd_get_amino_acids_percent.T.plot.bar(legend=False, figsize=(7, 5))
plt.title(f'Percentage of Amino Acids for {record_id} in {filename}')
plt.xticks(rotation=0)
# add the record id as part of the file name
plt.savefig(f'{filename}_{record_id}_amino_acids_percent_plot.pdf')
plt.show()
#
print("\n molecular weight: {:.2f}".format(X.molecular_weight()), file=f)
print("\n aromaticity: {:.2f}".format(X.aromaticity()), file=f)
print("\n instability index: {:.2f}".format(X.instability_index()), file=f)
if flexibility_ynu == "1":
print("\n flexibility: ", X.flexibility(), file=f)
print("\n IEP: ", X.isoelectric_point(), file=f)
print("therefore its charge at pH = ",pH," is {:.2f}".format(X.charge_at_pH(pH)), file=f)
print("secondary structure fraction: (Helix, Turn, Sheet): ", X.secondary_structure_fraction(), "\n\n\n", file=f)
print('\n')