Search code examples
Tags: python, pandas, dictionary, joblib

Create pandas dataframe by operating on two dictionaries


I'm trying to create a pandas DataFrame whose row labels come from the keys of one dictionary and whose column labels come from the keys of another. The value in each cell is computed by a function that takes the two corresponding dictionary values, i.e. the cell for a key of dict1 and a key of dict2 is f(dict1[key1], dict2[key2]).

So far my code looks like this:

# -*- coding: utf-8 -*-
import sys
import os
import pandas as pd
from optparse import OptionParser
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
import pybedtools
from subprocess import call
from collections import defaultdict
import numpy as np
from skbio.sequence import DNA
from skbio.alignment import local_pairwise_align_ssw
class sequenceCompare:

    '''Common class for comparing multifasta files.

    Holds the paths of two FASTA files whose sequences are aligned pairwise:
    every record of the first file against every record of the second.
    '''

    def __init__(
        self,
        fasta1,
        fasta2
        ):
        '''Store the paths of the two FASTA files to compare.'''
        self.fasta1 = fasta1
        self.fasta2 = fasta2

    def computeScore(self):
        '''Locally align each sequence of fasta1 with each sequence of fasta2.

        Both files are read as strict two-line records: a header line
        followed directly by a single sequence line. The alignment score of
        each pair is computed but not stored yet (see the comments in the
        loop), so the method currently returns None.
        '''
        score_matrix = pd.DataFrame([])
        with open(self.fasta1) as file_one:
            # Iterating yields the header line; next() then consumes the
            # sequence line that belongs to it.
            sequenceList1 = {line.strip(">\n"):next(file_one).rstrip() for line in file_one}
        with open(self.fasta2) as file_two:
            sequenceList2 = {line.strip(">\n"):next(file_two).rstrip() for line in file_two}
        #Is there any way to make following step parallel
        for key1, value1 in sequenceList1.items():
            for key2, value2 in sequenceList2.items():
                alignment, score, start_end_positions = local_pairwise_align_ssw(DNA(value1), DNA(value2))
                #Store value of score in dataframe column key1 and row key2

Eg.

Sequence list 1: 
>A1
AAACCTTGGG
>A2
CCCAAAATTT
>A3
CCTTAAGGG

Sequence list 2:
>B1
GGTTAACC
>B2
GATCATCCA
>B3
CCAAAATTC

And the resulting dataframe after doing operations on the two dictionaries should look like this:

Dataframe: 
       A1          A2          A3
B1 dist(A1,B1) dist(A2,B1) dist(A3,B1)
B2 dist(A1,B2) dist(A2,B2) dist(A3,B2)
B3 dist(A1,B3) dist(A2,B3) dist(A3,B3)

What would be the most efficient (and hopefully parallel) way to do this?


Solution

  • According to the documentation, it is more efficient to build a StripedSmithWaterman object once and reuse it than to call local_pairwise_align_ssw every time. However, it does not seem to provide parallelism on its own (which is strange, because the library on which it is based claims to implement SIMD parallelism, so I may be wrong), but you can use regular Python multiprocessing to parallelize the work:

    # -*- coding: utf-8 -*-
    import sys
    import os
    import pandas as pd
    from optparse import OptionParser
    from sklearn.preprocessing import MinMaxScaler
    from joblib import Parallel, delayed
    import pybedtools
    from subprocess import call
    from multiprocessing import Pool
    from itertools import repeat
    from collections import defaultdict
    import numpy as np
    from skbio.sequence import DNA
    from skbio.alignment import StripedSmithWaterman
    
    
    def compute_scores(dna1, dnas2):
        """Score ``dna1`` against every sequence in ``dnas2``.

        One StripedSmithWaterman query object is built for ``dna1`` and
        reused for each target. The result is a list holding the
        ``optimal_alignment_score`` of each alignment, in the order of
        ``dnas2``.
        """
        # StripedSmithWaterman docs:
        # http://scikit-bio.org/docs/0.4.2/generated/skbio.alignment.StripedSmithWaterman.html
        query = StripedSmithWaterman(dna1)
        scores = []
        for target in dnas2:
            # Calling the query object yields an AlignmentStructure; docs:
            # http://scikit-bio.org/docs/0.4.2/generated/skbio.alignment.AlignmentStructure.html
            alignment = query(target)
            scores.append(alignment.optimal_alignment_score)
        return scores
    
    class sequenceCompare:

        '''Common class for comparing multifasta files.

        Every sequence of the first FASTA file is aligned against every
        sequence of the second one, and the alignment scores are collected in
        a pandas DataFrame.
        '''

        def __init__(
            self,
            fasta1,
            fasta2
            ):
            '''Store the paths of the two FASTA files to compare.'''
            self.fasta1 = fasta1
            self.fasta2 = fasta2

        @staticmethod
        def _read_fasta(path):
            '''Return a {header: sequence} dict for the FASTA file at path.

            Blank lines are skipped and a sequence wrapped over several lines
            is joined into one string. Lines that appear before the first
            header are ignored. If a header occurs twice, the later record
            replaces the earlier one.
            '''
            records = {}
            current = None
            with open(path) as handle:
                for raw_line in handle:
                    line = raw_line.strip()
                    if not line:
                        continue
                    if line.startswith(">"):
                        current = line[1:].strip()
                        records[current] = []
                    elif current is not None:
                        records[current].append(line)
            return {name: "".join(parts) for name, parts in records.items()}

        def computeScore(self):
            '''Compute the score of every fasta1 sequence against every fasta2 sequence.

            The work is spread over a process pool with one task per fasta1
            sequence; each task scores that sequence against all fasta2
            sequences via compute_scores.

            Returns:
                A pandas DataFrame with one column per fasta1 record and one
                row per fasta2 record. The cell at (row b, column a) is the
                score of sequence a against sequence b. If either file
                holds no records, an empty frame with the available labels
                is returned.
            '''
            sequenceList1 = self._read_fasta(self.fasta1)
            sequenceList2 = self._read_fasta(self.fasta2)
            names1 = list(sequenceList1)
            names2 = list(sequenceList2)
            if not names1 or not names2:
                # Nothing to align, so do not start any worker processes.
                return pd.DataFrame(index=names2, columns=names1)
            values2 = list(sequenceList2.values())
            with Pool(os.cpu_count()) as p:
                data = p.starmap(compute_scores, zip(sequenceList1.values(), repeat(values2)))
            # starmap returns one list per fasta1 sequence, so data is laid
            # out as (fasta1 x fasta2). Label it that way first, then
            # transpose to get fasta1 names as columns and fasta2 names as rows.
            df = pd.DataFrame(data, index=names1, columns=names2).T
            return df