I'm trying to create a pandas dataframe whose rows come
from one dictionary and whose columns
come from another, where the value at row i, column j is defined by an operation on the key-value pairs of both dictionaries (i.e. the value at cell (i, j)
can be calculated by a function that accepts the two values dict1[key_i] and dict2[key_j]).
So far my code looks like this:
# -*- coding: utf-8 -*-
import sys
import os
import pandas as pd
from optparse import OptionParser
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
import pybedtools
from subprocess import call
from collections import defaultdict
import numpy as np
from skbio.sequence import DNA
from skbio.alignment import local_pairwise_align_ssw
class sequenceCompare:
    '''Common class for comparing multifasta files.'''

    def __init__(self, fasta1, fasta2):
        # Paths to the two multi-FASTA files to compare.
        self.fasta1 = fasta1
        self.fasta2 = fasta2

    def computeScore(self):
        """Compute local-alignment scores for every sequence pair.

        Returns a DataFrame with fasta1 headers as columns and fasta2
        headers as rows, so cell (B_j, A_i) holds the SSW score of
        aligning fasta1 sequence A_i against fasta2 sequence B_j.

        NOTE(review): the parser assumes a strict two-line FASTA layout
        (each header line is followed by exactly one sequence line) —
        confirm this holds for the input files.
        """
        # {header: sequence}. next(file) consumes the sequence line that
        # follows each header, so the comprehension only sees headers.
        with open(self.fasta1) as file_one:
            sequenceList1 = {line.strip(">\n"): next(file_one).rstrip() for line in file_one}
        with open(self.fasta2) as file_two:
            sequenceList2 = {line.strip(">\n"): next(file_two).rstrip() for line in file_two}
        # Pre-label the matrix: rows = fasta2 headers, columns = fasta1 headers.
        score_matrix = pd.DataFrame(index=list(sequenceList2.keys()),
                                    columns=list(sequenceList1.keys()))
        for key1, value1 in sequenceList1.items():
            for key2, value2 in sequenceList2.items():
                alignment, score, start_end_positions = local_pairwise_align_ssw(DNA(value1), DNA(value2))
                # Fix: the original computed `score` but never stored it.
                score_matrix.at[key2, key1] = score
        return score_matrix
E.g., given:
Sequence list 1:
>A1
AAACCTTGGG
>A2
CCCAAAATTT
>A3
CCTTAAGGG
Sequence list 2:
>B1
GGTTAACC
>B2
GATCATCCA
>B3
CCAAAATTC
And the resulting dataframe after doing operations on the two dictionaries should look like this:
Dataframe:
A1 A2 A3
B1 dist(A1,B1) dist(A2,B1) dist(A3,B1)
B2 dist(A1,B2) dist(A2,B2) dist(A3,B2)
B3 dist(A1,B3) dist(A2,B3) dist(A3,B3)
What would be the most efficient (and hopefully parallel) way to do this?
Checking the documentation, it seems that it is more efficient to build a StripedSmithWaterman
object and use it multiple times instead of using local_pairwise_align_ssw
every time. However, it does not seem to provide parallelism on its own (which is strange, because the library on which it is based claims to implement SIMD parallelism, so I may be wrong), but you can use regular Python multiprocessing to parallelize things:
# -*- coding: utf-8 -*-
import sys
import os
import pandas as pd
from optparse import OptionParser
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
import pybedtools
from subprocess import call
from multiprocessing import Pool
from itertools import repeat
from collections import defaultdict
import numpy as np
from skbio.sequence import DNA
from skbio.alignment import StripedSmithWaterman
def compute_scores(dna1, dnas2):
    """Score *dna1* against every sequence in *dnas2*.

    Builds a single StripedSmithWaterman query object for *dna1* and reuses
    it for each target, which is cheaper than re-aligning from scratch.
    Returns a list of optimal alignment scores, one per target sequence.

    StripedSmithWaterman docs:
    http://scikit-bio.org/docs/0.4.2/generated/skbio.alignment.StripedSmithWaterman.html
    AlignmentStructure docs:
    http://scikit-bio.org/docs/0.4.2/generated/skbio.alignment.AlignmentStructure.html
    """
    query = StripedSmithWaterman(dna1)
    scores = []
    for target in dnas2:
        scores.append(query(target).optimal_alignment_score)
    return scores
class sequenceCompare:
    '''Common class for comparing multifasta files.'''

    def __init__(self, fasta1, fasta2):
        # Paths to the two multi-FASTA files to compare.
        self.fasta1 = fasta1
        self.fasta2 = fasta2

    def computeScore(self):
        """Compute all pairwise SSW scores in parallel.

        Fans out one task per fasta1 sequence across a process pool; each
        task scores that sequence against every fasta2 sequence.

        Returns a DataFrame with fasta1 headers as columns and fasta2
        headers as rows (the layout requested in the question).

        NOTE(review): the parser assumes a strict two-line FASTA layout
        (each header line is followed by exactly one sequence line) —
        confirm this holds for the input files.
        """
        # {header: sequence}. next(file) consumes the sequence line that
        # follows each header, so the comprehension only sees headers.
        with open(self.fasta1) as file_one:
            sequenceList1 = {line.strip(">\n"): next(file_one).rstrip() for line in file_one}
        with open(self.fasta2) as file_two:
            sequenceList2 = {line.strip(">\n"): next(file_two).rstrip() for line in file_two}
        with Pool(os.cpu_count()) as p:
            values2 = list(sequenceList2.values())
            # Row i of `data` = fasta1 sequence i scored against every
            # fasta2 sequence, so `data` has shape (len(fasta1), len(fasta2)).
            data = p.starmap(compute_scores, zip(sequenceList1.values(), repeat(values2)))
        # Fix: the original passed fasta1 keys as columns and fasta2 keys as
        # index, which mislabels the (len(fasta1), len(fasta2))-shaped data
        # (and fails outright when the files hold different sequence counts).
        # Label along the matching axes, then transpose to the requested
        # layout: fasta2 headers as rows, fasta1 headers as columns.
        df = pd.DataFrame(data,
                          index=list(sequenceList1.keys()),
                          columns=list(sequenceList2.keys())).T
        # Fix: the original computed `df` but never returned it.
        return df