I'm trying to convert a .fasta file into an .xls file so that I can conveniently color my phylogenetic tree.
import pandas as pd
import re
from Bio import SeqIO
# Build two lookup tables keyed by FASTA record id:
#   s1 -> the record's sequence
#   s2 -> the text found inside the first [...] of the record's description
s1 = {}
s2 = {}
with open('/Users/xxx.fasta') as seqF:
    for seqFP in SeqIO.parse(seqF, "fasta"):
        seq_id = seqFP.description
        seq_an = re.search(r"\[([^]]*)\]", seq_id)
        # NOTE: the sequence is stored exactly as SeqIO hands it over (a
        # sequence object, not converted to a plain str).
        s1[seqFP.id] = seqFP.seq
        # Keep the captured text instead of the re.Match object itself;
        # headers without a [...] part are recorded as None.
        s2[seqFP.id] = seq_an.group(1) if seq_an else None
# dict.values is a method: it must be called to display the stored values.
print(s2.values())
However, when I tried to use s1 to create a pd.Series, the sequence column was displayed like "(M, S, A, C, C, N, K, L, A, V, L, G, L, T, F, ...".
Using this FASTA file as an example:
import pandas as pd
import re
from Bio import SeqIO
from collections import defaultdict
# Collect the data column-wise: one list of organism labels and one list of
# sequence strings, filled in record order, then turned into a DataFrame.
data = defaultdict(list)
with open('nucleotide-sample.txt') as fp:
    for record in SeqIO.parse(fp, "fasta"):
        # The organism label is whatever sits inside the first [...] of the
        # description line. re.search returns None when there is no such
        # part, so guard before calling .group() instead of crashing with an
        # AttributeError on the first header that lacks brackets.
        match = re.search(r"\[([^]]*)\]", record.description)
        organism = match.group(1) if match else None
        # str() turns the sequence object into a plain string so that each
        # DataFrame cell holds the sequence text.
        sequence = str(record.seq)
        data['organism'].append(organism)
        data['sequence'].append(sequence)
df = pd.DataFrame.from_dict(data)
Or, shorter but maybe less readable:
# Row-wise variant: build one {'organism', 'sequence'} dict per FASTA record
# and let pandas assemble the DataFrame from the list of records.
with open('nucleotide-sample.txt') as fp:
    records = [
        {
            # None when the description has no [...] part, rather than an
            # AttributeError from calling .group() on a failed search.
            'organism': match.group(1) if match else None,
            'sequence': str(record.seq),
        }
        for record in SeqIO.parse(fp, "fasta")
        # Single-element loop: binds the search result to a name so it can be
        # tested and used above without searching twice.
        for match in [re.search(r"\[([^]]*)\]", record.description)]
    ]
df = pd.DataFrame.from_records(records)
Both return the following DataFrame:
| | organism | sequence |
---|---|---|
0 | organism=Carpodacus mexicanus | CCTTTATCTAATCTTTGGAGCATGAGCTGGCATAGTTGGAACCGCCCTCAGCCTCCTCATCCGTGCAGAACTTGGACAACCTGGAACTCTTCTAGGAGACGACCAAATTTACAATGTAATCGTCACTGCCCACGCCTTCGTAATAATTTTCTTTATAGTAATACCAATCATGATCGGTGGTTTCGGAAACTGACTAGTCCCACTCATAATCGGCGCCCCCGACATAGCATTCCCCCGTATAAACAACATAAGCTTCTGACTACTTCCCCCATCATTTCTTTTACTTCTAGCATCCTCCACAGTAGAAGCTGGAGCAGGAACAGGGTGAACAGTATATCCCCCTCTCGCTGGTAACCTAGCCCATGCCGGTGCTTCAGTAGACCTAGCCATCTTCTCCCTCCACTTAGCAGGTGTTTCCTCTATCCTAGGTGCTATTAACTTTATTACAACCGCCATCAACATAAAACCCCCAACCCTCTCCCAATACCAAACCCCCCTATTCGTATGATCAGTCCTTATTACCGCCGTCCTTCTCCTACTCTCTCTCCCAGTCCTCGCTGCTGGCATTACTATACTACTAACAGACCGAAACCTAAACACTACGTTCTTTGACCCAGCTGGAGGAGGAGACCCAGTCCTGTACCAACACCTCTTCTGATTCTTCGGCCATCCAGAAGTCTATATCCTCATTTTAC |
1 | organism=uncultured bacillus sp. | GGTAGGTACCGCCCTAAGNCTCCTAATCCGAGCAGAACTANGCCAACCCGGAGCCCTTCTGGGAGACGACCAAATCTACAACGTAGTCGTTACGGCCCACGCCTTCGTAATAATCTTTTTCATAGTAATGCCAATCATAATCGGAGGATTCGGGAACTGACTAGTTCCTCTAATGATTGGGGCCCCAGACATAGCATTCCCTCGAATAAACAACATAAGCTTTTGACTACTACCACCATCATTCCTACTCCTAATAGCCTCCTCAACAGTAGAAGCAGGAGCCGGAACCGGATGAACCGTGTACCCACCACTAGCTGGAAACCTGGCCCACGCCGGAGCCTCAGTAGACCTAGCTATCTTCTCCCTACACCTAGCAGGTATCTCATCCATCCTGGGGGCAATTAACTTCATTACAACAGCAATCAACATAAAACCACCCGCCCTCTCACAATACCAAACACCACTATTCGTGTGATCCGTCCTAATTACGGCCGTACTACTCCTACTATCTCTCCCAGTACTAGCCGCCGGTATCACCATGCTACTCACAGACCGCAACCTCAACACCACCTTCTTTGACCCAGCAGGAGGAGGAGACCCAGTACTATACCAGCACCTATTCTGATTCTTCGGACACCCAGAAGTCTACATCCTAATTCTC |
2 | organism=Phalaenopsis equestris var. leucaspis | CCTATACCTAATTTTCGGCGCATGAGCCGGAATGGTGGGTACCGCTCTAAGCCTCCTCATTCGAGCAGAACTAGGCCAACCCGGAGCCCTTCTGGGAGACGACCAAGTCTACAACGTGGTTGTCACGGCCCATGCCTTCGTAATAATCTTCTTTATAGTTATGCCGATTATAATCGGAGGATTCGGAAACTGACTAGTCCCCCTAATAATCGGAGCCCCAGACATAGCATTTCCGCGAATAAACAACATAAGCTTCTGACTACTCCCACCATCATTCCTCCTCCTCTTAGCATCCTCCACAGTGGAAGCAGGCGTAGGTACAGGCTGAACAGTGTATCCCCCACTAGCTGGCAACCTAGCTCATGCCGGGGCCTCAGTCGACCTCGCAATCTTCTCCTTACACCTAGCTGGTATTTCCTCAATCCTCGGAGCAATTAACTTCATTACAACAGCAATTAACATGAAACCTCCTGCCCTCTCACAATACCAAACCCCACTATTCGTCTGATCAGTGTTAATTACTGCAGTCCTCCTTCTCCTTTCCCTTCCAGTTCTAGCTGCAGGAATCACAATGCTCCTCACAGACCGCAACCTCAACACCACATTCTTCGACCCTGCCGGAGGAGGAGATCCCGTCCTATATCAACATCTCTTCTGATTCTTCGGCCACCCAGAAGTCTACATCCTAATCCTC |
3 | organism=uncultured archaeon | CATGAGCTGGAATAGTAGGTACCGCCCTAAGCCTCCTAATTCGAGCAGAGCTAGGCCAACCCGGAGCCCTACTGGGAGACGACCAAATCTACAACGTAGTCGNCACGGCCCATGCTTTTGTAATAATCTTCTTCATAGTAATGCCAATCATAATCGGAGGGTTTGGAAACTGACTGGTCCCCCTAATAATTGGAGCTCCAGACATAGCATTCCCCCGAATAAACAACATGAGTTTCTGACTACTTCCCCCATCATTCCTACTACTAATAGCCTCCTCAACAGTAGAAGCAGGCGTTGGAACAGGATGAACCGTATATCCACCACTAGCCGGAAACCTAGCCCATGCAGGAGCCTCAGTAGACCTAGCTATCTTCTCCCTACACCTAGCAGGTATCTCCTCCATCCTAGGGGCAATCAACTTCATTACAACAGCAATCAACATAAAACCACCCGCCCTATCACAATACCAAACACCACTATTCGTATGATCCGTCCTAATCACAGCCGTACTACTCCTCCTATCACTCCCAGTGCTAGCTGCTGGAATTACCATGCTACTTACAGACCGCAACCTCAACACTACCTTCTTTGACCCAGCAGGGGGAGGAGACCCAGTGCTATACCAACATCTATTCTGATTCTTCGGACACCCAGAAGTTTACATCCTAATTCTC |
4 | organism=Influenza A virus | CCTATACCTAATTTTCGGCGCATGAGCCGGAATAGTGGGTACCGCCCTAAGCCTCCTCATTCGAGCAGAACTAGGCCAACCCGGAGCCCTTCTGGGAGACGACCAAGTCTATAACGTAGTTGTCACGGCCCATGCCTTCGTAATAATTTTCTTTATAGTTATGCCGATTATAATCGGAGGATTCGGAAACTGACTAGTCCCCCTAATAATCGGAGCCCCAGACATAGCATTCCCACGAATAAACAACATAAGCTTCTGACTACTCCCACCATCATTCCTTCTCCTCCTAGCATCCTCCACAGTCGAAGCAGGCGTAGGTACAGGCTGAACAGTATACCCCCCACTAGCTGGCAACCTAGCTCACGCCGGAGCCTCAGTCGACCTCGCAATCTTCTCTCTACACCTAGCTGGTATTTCCTCAATCCTCGGAGCAATCAACTTCATTACAACAGCAATTAACATAAAACCTCCTGCCCTCTCACAATACCAAACCCCACTGTTCGTCTGATCCGTCCTAATCACTGCAGTCCTCCTGCTCCTTTCCCTTCCAGTTCTAGCTGCAGGAATCACAATACTCCTCACAGACCGCAACCTAAACACCACATTCTTCGACCCTGCTGGAGGAGGAGATCCCGTCCTATATCAACACCTTTTCTGATTCTTCGGCCACCCAGAAGTCTACATCCTAATCNTC |
5 | organism=Vireo gilvus | CATGAGCTGGAATAGTAGGTACCGCCCTAAGCCTCCTAATTCGAGCAGAGCTAGGCCAACCCGGAGCCCTACTGGGAGACGACCAAATCTACAACGTAGTCGTCACGGCCCATGCTTTTGTAATAATCTTCTTCATAGTAATGCCAATCATAATCGGAGGGTTTGGAAACTGACTGGTCCCCCTAATAATTGGAGCTCCAGACATAGCATTCCCCCGAATAAACAACATGAGTTTCTGACTACTTCCCCCATCATTCCTACTACTAATAGCCTCCTCAACAGTAGAAGCAGGCGTTGGAACAGGATGAACTGTATACCCGCCACTAGCCGGTAACCTAGCCCATGCAGGAGCCTCAGTAGACCTAGCTATCTTCTCCCTACACCTAGCAGGTATCTCCTCCATCCTAGGGGCAATCAACTTCATTACAACAGCAATCAACATAAAACCACCCGCCCTATCACAATACCAAACACCACTATTCGTATGATCCGTCCTAATCACAGCCGTACTACTCCTCCTATCACTCCCAGTGCTAGCTGCTGGAATTACCATGCTACTTACAGACCGCAACCTCAACACTACCTTCTTTGACCCAGCAGGGGGAGGAGACCCAGTGCTATACCAACATCTATTCTGATTCTTCGGACACCCAGAAGTTTACATCCTAATTCTC |
6 | organism=Pelecanus erythrorhynchos | TAGTTGGAACAGCCCTCAGCCTACTCATCCGAGCAGAACTAGGCCAACCCGGAACCCTCCTGGGAGATGACCAAATCTACAATGTAATCGTCACTGCCCATGCCTTCGTAATAATCTTCTTCATAGTAATACCAGTCATAATTGGAGGCTTCGGAAACTGACTAGTCCCCCTCATAATCGGCGCTCCAGACATAGCATTCCCACGTATAAACAACATAAGCTTCTGACTCCTACCCCCATCCTTCCTACTCCTCCTAGCCTCATCCACAGTGGAAGCAGGCGCAGGAACAGGATGAACGGTGTACCCCCCACTAGCTGGCAACCTAGCCCATGCCGGAGCCTCAGTAGATCTAGCTATTTTCTCACTCCACTTAGCAGGGGTATCCTCTATTCTAGGTGCAATCAATTTCATCACAACCGCCATCAACATAAAACCACCAGCCCTATCACAATATCAAACTCCATTATTCGTGTGATCCGTACTCATCACTGCCGTCCTACTACTATTATCCCTCCCAGTCCTAGCCGCCGGCATCACTATGCTCCTCACAGACCGAAATCTGAACACTACATTCTTCGACCCCGCTGGAGGAGGAGACCCAGTCCTATACCAACACTTATTCTGGTTTTTCGGCCACCCAGAAGTTTACATCCTAATTCTC |
7 | organism=Hippodamia tredecimpunctata tibialis | TAGTTGGAACAGCCCTCAGCCTACTCATCCGAGCAGAACTAGGCCAACCCGGAACCCTCCTGGGAGATGACCAAATCTACAATGTAATCGTCACTGCCCATGCCTTCGTAATAATCTTCTTCATAGTAATACCAGTCATAATTGGAGGCTTCGGAAACTGACTAGTCCCCCTCATAATCGGCGCTCCAGACATAGCATTCACAACATAAGCTTCTGACTCCTACCCCCATCCTTCCTACTCCTCCTAGCCTCATCCACAGTGGAAGCAGGCGCAGGAACAGGATGAACGGTGTACCCCCCACTAGCTGGCAACCTAGCCCATGCCGGAGCCTCAGTAGATCTAGCTATTTTCTCACTCCACTTAGCAGGGGTATCCTCTATTCTAGGTGCAATCAATTTCATCACAACCGCCATCAACATAAAACCACCAGCCCTATCACAATATCAAACTCCATTATTCGTGTGATCCGTACTCATCACTGCCGTCCTACTACTATTATCCCTCCCAGTCCTAGCCGCCGGCATCACTATGCTCCTCACAGACCGAAATCTGAACACTACATTCTTCGACCCCGCTGGAGGAGGAGACCCAGTCCTATACCAACACTTATTCTGGTTTTTCGGCCACCCAGAAGTTTACATCCTAATTCTC |
8 | organism=Petunia integrifolia subsp. inflata | TAGTTGGAACAGCCCTCAGCCTACTCATCCGAGCAGAACTAGGCCAACCCGGAACCCTCCTGGGAGATGACCAAATCTACAATGTAATCGTCACTGCCCATGCCTTCGTAATAATCTTCTTCATAGTAATACCAGTCATAATTGGAGGCTTCGGAAACTGACTAGTCCCCCTCATAATCGGCGCTCCAGACATAGCATTCCCACGTATAAACAACATAAGCTTCTGACTCCTACCCCCATCCTTCCTACTCCTCCTAGCCTCATCCACAGTGGAAGCAGGCGCAGGAACAGGATGAACGGTGTACCCCCCACTAGCTGGCAACCTAGCCCATGCCGGAGCCTCAGTAGATCCATCAACATAAAACCACCAGCCCTATCACAATATCAAACTCCATTATTCGTGTGATCCGTACTCATCACTGCCGTCCTACTACTATTATCCCTCCCAGTCCTAGCCGCCGGCATCACTATGCTCCTCACAGACCGAAATCTGAACACTACATTCTTCGACCCCGCTGGAGGAGGAGACCCAGTCCTATACCAACACTTATTCTGGTTTTTCGGCCACCCAGAAGTTTACATCCTAATTCTC |
9 | organism=Fusarium oxysporum f. tuberosi | TAGTTGGAACAGCCCTCAGCCTACTCATCCGAGCAGAACTAGGCCAACCCGGAACCCTCCTGGGAGATGAATTGGAGGCTTCGGAAACTGACTAGTCCCCCTCATAATCGGCGCTCCAGACATAGCATTCCCACGTATAAACAACATAAGCTTCTGACTCCTACCCCCATCCTTCCTACTCCTCCTAGCCTCATCCACAGTGGAAGCAGGCGCAGGAACAGGATGAACGGTGTACCCCCCACTAGCTGGCAACCTAGCCCATGCCGGAGCCTCAGTAGATCTAGCTATTTTCTCACTCCACTTAGCAGGGGTATCCTCTATTCTAGGTGCAATCAATTTCATCACAACCGCCATCAACATAAAACCACCAGCCCTATCACAATATCAAACTCCATTATTCGTGTGATCCGTACTCATCACTGCCGTCCTACTACTATTATCCCTCCCAGTCCTAGCCGCCGGCATCACTATGCTCCTCACAGACCGAAATCTGAACACTACATTCTTCGACCCCGCTGGAGGAGGAGACCCAGTCCTATACCAACACTTATTCTGGTTTTTCGGCCACCCAGAAGTTTACATCCTAATTCTC |
10 | organism=Dendroica tigrina | TAGTTGGAACAGCCCTCAGCCTACTCATCCGAGCAGAACTAGGCCAACCCGGAACCCTCCTGGGAGATGACCAAATCTACAATGTAATCGTCACTGCCCATGCCTTCGTAATAATCTTCTTCATAGTAATACCAGTCATAATTGGAGGCTTCGGAAACTGACTAGTCCCCCTCATAATCGGCGCTCCAGACATAGCATTCCCACGTATAAACAACATAAGCTTCTGACTCCTACCCCCATCCTTCCTACTCCTCCTAGCCTCATCCACAGTGGAAGCAGGCGCAGGAACAGGATGAACGGTGTACCCCCCACTAGCTGGCAACCTAGCCCATGCCGGAGCCTCAGTAGATCTAGCTATTTTCTCACTCCACTTAGCAGGGGTATCCTCTATTCTAGGTGCAATCAATTTCATCACAACCGCCATCAACATAAAACCACCAGCCCTATCACAATATCAAACTCCATTATTCGTGTGATCCGTACTCATCAC |