Search code examples
pythonpython-3.xdictionarymultiprocessingglobal-variables

How to use pool processing to update global dictionary


I'm trying to use Python's multiprocessing Pool to update a global dictionary named globalDict. I expect globalDict to be {'0':0,'1':1,'2':2}, but after the code runs the dictionary is still empty. Please help me fix this issue. The code is below:

from multiprocessing import Pool
import time

def f(x):
    """Record ``x`` in the module-level ``globalDict`` under the key ``str(x)``.

    ``setdefault`` is used, so an entry that already exists for that key is
    left untouched. Returns the ``globalDict`` object of the calling process.
    """
    global globalDict  # the dictionary is looked up at module level on every call
    key = str(x)
    globalDict.setdefault(key, x)
    return globalDict

def init_pool(dictX):
    """Pool initializer: bind ``dictX`` to the module-level name ``globalDict``."""
    global globalDict
    globalDict = dictX

if __name__ == '__main__':
    started_at = time.time()
    globalDict = {}
    # NOTE: a plain dict handed over through initargs is copied into each
    # worker process, so updates made by the workers do not show up in this
    # process's globalDict.
    pool = Pool(initializer=init_pool, initargs=(globalDict,))
    pool.map(f, range(3))  # run f(0), f(1), f(2) in the pool
    pool.close()
    pool.join()
    elapsed = time.time() - started_at
    print('Done in {:4f}'.format(elapsed))

Solution

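  • A solution is to use a managed dictionary. A plain dict passed through initargs is copied into each worker process, so updates made in the workers never reach the parent's dictionary; a managed dictionary is a proxy to a single dict held by the manager process, so all workers update the same object. There is no need to return the dictionary from the worker function, f: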

    from multiprocessing import Pool, Manager
    import time
    
    def f(x):
        """Store ``x`` in ``globalDict`` under ``str(x)`` unless that key already exists."""
        key = str(x)
        globalDict.setdefault(key, x)
    
    def init_pool(dictX):
        """Pool initializer: bind ``dictX`` to the module-level name ``globalDict``."""
        global globalDict
        globalDict = dictX
    
    if __name__ == '__main__':
        # Time the whole run, including manager and pool start-up.
        start = time.time()
        with Manager() as manager:
            # manager.dict() is a proxy to a dict kept in the manager process,
            # so every worker updates the same underlying dictionary.
            globalDict = manager.dict()
            # Each worker calls init_pool(globalDict) once when it starts.
            pool = Pool(initializer=init_pool, initargs=(globalDict,))
            try:
                pool.map(f, range(3))  # run f(0), f(1), f(2) in the pool
            finally:
                # Always shut the workers down, even if a task raised, so no
                # pool process outlives the manager it talks to.
                pool.close()
                pool.join()
            stop = time.time()
            print('Done in {:4f}'.format(stop - start))
            # Print while the manager is still alive; the proxy needs it.
            print(globalDict)
    

    Prints:

    Done in 0.606996
    {'0': 0, '2': 2, '1': 1}
    

    If you want to end up with a "regular" dictionary that no longer requires the SyncManager instance returned by the call to Manager(), then after the call to map completes, add the following statement:

    regular_dict = dict(globalDict.items())  # plain-dict snapshot of the managed dictionary
    

    Or, if you want to get clever, you can create your own managed dictionary type (we will call it Dict) that only supports the one method we need, setdefault, and dispatches that method call to an underlying dict that we will be able to retrieve when our call to map completes:

    from multiprocessing import Pool
    from multiprocessing.managers import BaseManager
    import time
    
    class DictManager(BaseManager):
        """Manager subclass with no additions of its own; types are registered on it separately."""
    
    class Dict:
        """Thin wrapper around a private dict exposing only what the workers need."""

        def __init__(self):
            # The real storage; handed out again by get_underlying_dict().
            self._dict = dict()

        def setdefault(self, *args):
            """Forward to ``dict.setdefault`` on the wrapped dict and return its result."""
            storage = self._dict
            return storage.setdefault(*args)

        def get_underlying_dict(self):
            """Return the wrapped dict itself."""
            return self._dict
    
    def f(x):
        """Store ``x`` in ``globalDict`` under ``str(x)`` unless that key already exists."""
        key = str(x)
        globalDict.setdefault(key, x)
    
    def init_pool(dictX):
        """Pool initializer: bind ``dictX`` to the module-level name ``globalDict``."""
        global globalDict
        globalDict = dictX
    
    if __name__ == '__main__':
        # Time the whole run, including manager and pool start-up.
        start = time.time()
        # Expose Dict through the manager so manager.Dict() returns a proxy.
        DictManager.register('Dict', Dict)
        with DictManager() as manager:
            globalDict = manager.Dict()
            # Each worker calls init_pool(globalDict) once when it starts.
            pool = Pool(initializer=init_pool, initargs=(globalDict,))
            try:
                pool.map(f, range(3))  # run f(0), f(1), f(2) in the pool
            finally:
                # Always shut the workers down, even if a task raised, so no
                # pool process outlives the manager it talks to.
                pool.close()
                pool.join()
            stop = time.time()
            print('Done in {:4f}'.format(stop - start))
            # Fetch the plain dict while the manager is still running; the
            # proxy call cannot be made after the with-block exits.
            regular_dict = globalDict.get_underlying_dict()
        print(regular_dict)
    

    Prints:

    Done in 0.460001
    {'0': 0, '1': 1, '2': 2}