Tags: python, numpy, dictionary, joblib

Python, add key:value to dictionary in parallelised loop


I have written some code that performs calculations in parallel (using joblib) and updates a dictionary with the results. The code consists of a main function, which calls a generator function and a calculation function that is run in parallel. Each instance of the calculation function adds its result (a key:value pair) to a dictionary that is created in the main function and marked as global.

Below is a simplified version of my code, illustrating the procedure described above.

When the code runs, the result dictionary (d_result) ends up empty, even though it should have been populated with the results generated by the calculation function. Why is that?

import numpy as np
from joblib import Parallel, delayed


def do_calc(d, r, pair_index): # function to be run in parallel

    data_1 = d[str(r)][pair_index, 1]
    data_2 = d[str(r)][pair_index, 2]
    result_name = str(data_1) + " ^ " + str(data_2)
    result = data_1 ** data_2
    d_result[result_name] = result
    # d_result.setdefault(result_name, []).append(result)  ## same result as above


def compute_indices(d): # generator function

    for r in d:
        num_pairs = d[str(r)].shape[0]
        for pair_index in range(num_pairs):
            yield r, pair_index


def process(): # main function

    global d_result
    d_result = {}
    r1 = np.array([['ab', 1, 2], ['vw', 10, 12]], dtype=object)
    r2 = np.array([['ac', 1, 3], ['vx', 10, 13]], dtype=object)
    r3 = np.array([['ad', 1, 4], ['vy', 10, 14]], dtype=object)
    r4 = np.array([['ae', 1, 5], ['vz', 10, 15]], dtype=object)
    d = {'r1': r1, 'r2': r2, 'r3': r3, 'r4': r4}
    Parallel(n_jobs=4)(delayed(do_calc)(d, r, pair_index) for r, pair_index in compute_indices(d))
    print(d_result)


process()

Solution

  • OK, I've figured it out. Answer and new code below:

    The original code printed an empty dict because joblib runs its workers in separate processes by default, so each call to do_calc() updated its own copy of the global d_result; the dict in the main process was never touched. The fix is to have do_calc() return its result instead.

    The do_calc() function now creates an empty dict, populates it with a single key:value pair, and returns the dict.

    The parallel call in process() collects whatever do_calc() returns into a list. So what I end up with after the parallelised do_calc() is a list of dicts.

    What I really want is a single dict, so using a dict comprehension I convert the list of dicts into one dict, and voilà, she's all good!

    This helped: python convert list of single key dictionaries into a single dictionary
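    In isolation, the merge step looks like this (a minimal standalone sketch; the sample values are made up, not output from the code below):

    list_of_dicts = [{'1 ^ 2': 1}, {'10 ^ 12': 10 ** 12}]
    merged = {k: v for x in list_of_dicts for k, v in x.items()}
    print(merged)  # {'1 ^ 2': 1, '10 ^ 12': 1000000000000}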

    import numpy as np
    from joblib import Parallel, delayed
    
    
    def do_calc(d, r, pair_index):  # calculation function to be run in parallel
    
        data_1 = d[str(r)][pair_index, 1]
        data_2 = d[str(r)][pair_index, 2]
        result_name = str(data_1) + " ^ " + str(data_2)
        result = data_1 ** data_2
        d_result = {}  # create empty dict
        d_result[result_name] = result  # add key:value pair to dict
        return d_result  # return dict
    
    
    def compute_indices(d):  # generator function
    
        for r in d:
            num_pairs = d[str(r)].shape[0]
            for pair_index in range(num_pairs):
                yield r, pair_index
    
    
    def process():  # main function
    
        r1 = np.array([['ab', 1, 2], ['vw', 10, 12]], dtype=object)
        r2 = np.array([['ac', 1, 3], ['vx', 10, 13]], dtype=object)
        r3 = np.array([['ad', 1, 4], ['vy', 10, 14]], dtype=object)
        r4 = np.array([['ae', 1, 5], ['vz', 10, 15]], dtype=object)
        d = {'r1': r1, 'r2': r2, 'r3': r3, 'r4': r4}
        # parallelised calculation: each run returns a dict, so the final output is a list of dicts
        d_result = Parallel(n_jobs=4)(delayed(do_calc)(d, r, pair_index) for r, pair_index in compute_indices(d))
        # transform list of dicts to dict
        d_result = {k: v for x in d_result for k, v in x.items()}
        print(d_result)
    
    process()
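
    A variant of the same idea (a sketch, not part of the original answer): have the worker return a (key, value) tuple and let dict() do the merging, which avoids building throwaway single-entry dicts. do_calc_pair() is a hypothetical name for this variant of do_calc():

    import numpy as np
    from joblib import Parallel, delayed


    def do_calc_pair(d, r, pair_index):  # hypothetical variant of do_calc()
        data_1 = d[str(r)][pair_index, 1]
        data_2 = d[str(r)][pair_index, 2]
        # return a (key, value) tuple instead of a single-entry dict
        return str(data_1) + " ^ " + str(data_2), data_1 ** data_2


    def compute_indices(d):  # generator function, as in the answer
        for r in d:
            for pair_index in range(d[str(r)].shape[0]):
                yield r, pair_index


    def process():
        d = {'r1': np.array([['ab', 1, 2], ['vw', 10, 12]], dtype=object)}
        # dict() consumes the (key, value) tuples directly; no comprehension needed
        d_result = dict(Parallel(n_jobs=2)(
            delayed(do_calc_pair)(d, r, i) for r, i in compute_indices(d)))
        print(d_result)  # {'1 ^ 2': 1, '10 ^ 12': 1000000000000}


    process()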