Search code examples
python-3.x · numpy · python-multiprocessing

Speeding Up np.sum With multiprocessing


If I have a numpy array of size 10^8 to 10^9, is it possible to compute its sum faster than np.sum?

I've tried using multiprocessing with fork, but it seems to be slower than just calling np.sum, regardless of the number of workers (1-4). I'm using Python 3.8 on a Mac with a 2 GHz Dual-Core Intel Core i5 processor. Not sure whether the results would be different if I had more CPUs.

My code:

import concurrent.futures
import multiprocessing as mp
import time
from concurrent.futures.process import ProcessPoolExecutor

import numpy as np

# based on: https://luis-sena.medium.com/sharing-big-numpy-arrays-across-python-processes-abf0dc2a0ab2


def np_sum_global(arr_start, arr_stop, arr=None):
    """Sum a half-open slice ``[arr_start, arr_stop)`` of an array.

    By default sums the module-global ``data`` (which fork-started worker
    processes inherit without pickling); passing *arr* explicitly makes the
    helper usable and testable without the global. Backward compatible:
    existing two-argument calls behave exactly as before.
    """
    target = data if arr is None else arr
    return np.sum(target[arr_start:arr_stop])


def benchmark():
    """Compare plain np.sum against a multiprocess slice-and-sum.

    Fills the module-global ``data`` with 3e8 random floats (global so
    that fork-started workers can read it without serialization), then
    for each of 5 trials prints the elapsed time and result of a plain
    ``np.sum`` and of a ProcessPoolExecutor fan-out over 1-4 workers,
    each worker summing an equal contiguous slice.
    """
    st = time.time()
    ARRAY_SIZE = int(3e8)
    print("array size =", ARRAY_SIZE)
    # Published as a global so np_sum_global in forked children can see it.
    global data
    data = np.random.random(ARRAY_SIZE)
    print("generated", time.time() - st)
    print("CPU Count =", mp.cpu_count())

    for trial in range(5):
        print("TRIAL =", trial)
        # Method 1: single-process numpy reduction — the baseline.
        st = time.time()
        s = np.sum(data)
        print("method 1", time.time() - st, s)

        for NUM_WORKERS in range(1, 5):
            st = time.time()
            futures = []
            # NOTE(review): the pool is created (and torn down) inside the
            # timed region, so each measurement includes process start-up
            # and shutdown cost, not just the summation.
            with ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
                for i in range(0, NUM_WORKERS):
                    # Integer-divide the index space so the NUM_WORKERS
                    # slices tile [0, ARRAY_SIZE) exactly with no overlap.
                    futures.append(
                        executor.submit(
                            np_sum_global,
                            ARRAY_SIZE * i // NUM_WORKERS,
                            ARRAY_SIZE * (i + 1) // NUM_WORKERS,
                        )
                    )
            # Exiting the ``with`` block already joined the workers, so this
            # wait returns immediately; it just yields the done-future set.
            futures, _ = concurrent.futures.wait(futures)
            # Partial sums are combined in whatever order the set iterates,
            # hence the tiny float differences between worker counts.
            s = sum(future.result() for future in futures)
            print("workers =", NUM_WORKERS, time.time() - st, s)
        print()


if __name__ == "__main__":
    # Force "fork" (macOS defaults to "spawn" since Python 3.8) so the
    # workers inherit the module-global ``data`` array instead of having
    # it pickled and re-sent to every child. Not available on Windows.
    mp.set_start_method("fork")
    benchmark()

Output:

array size = 300000000
generated 5.1455769538879395
CPU Count = 4
TRIAL = 0
method 1 0.29593801498413086 150004049.39847052
workers = 1 1.8904719352722168 150004049.39847052
workers = 2 1.2082111835479736 150004049.39847034
workers = 3 1.2650330066680908 150004049.39847082
workers = 4 1.233708143234253 150004049.39847046

TRIAL = 1
method 1 0.5861320495605469 150004049.39847052
workers = 1 1.801928997039795 150004049.39847052
workers = 2 1.165492057800293 150004049.39847034
workers = 3 1.2669389247894287 150004049.39847082
workers = 4 1.2941789627075195 150004049.39847043

TRIAL = 2
method 1 0.44912219047546387 150004049.39847052
workers = 1 1.8038971424102783 150004049.39847052
workers = 2 1.1491520404815674 150004049.39847034
workers = 3 1.3324410915374756 150004049.39847082
workers = 4 1.4198641777038574 150004049.39847046

TRIAL = 3
method 1 0.5163640975952148 150004049.39847052
workers = 1 3.248213052749634 150004049.39847052
workers = 2 2.5148861408233643 150004049.39847034
workers = 3 1.0224149227142334 150004049.39847082
workers = 4 1.20924711227417 150004049.39847046

TRIAL = 4
method 1 1.2363107204437256 150004049.39847052
workers = 1 1.8627309799194336 150004049.39847052
workers = 2 1.233341932296753 150004049.39847034
workers = 3 1.3235111236572266 150004049.39847082
workers = 4 1.344843864440918 150004049.39847046

Some links I've looked at:


Solution

  • Here's a benchmark with numba. It first has to compile the code, which makes the first run a lot slower. Subsequent runs are about two to three times faster than numpy. So whether numba is worth it for you depends on how often you run the code.

    import numba
    import numpy as np
    import time
    
    # based on: https://luis-sena.medium.com/sharing-big-numpy-arrays-across-python-processes-abf0dc2a0ab2
    
    @numba.jit(nopython=True, parallel=True, cache=True)
    def numba_sum(data):
        """Sum *data* via a numba-compiled reduction.

        ``nopython=True`` compiles to native code, ``parallel=True`` lets
        numba parallelize the reduction across threads, and ``cache=True``
        persists the compilation to disk so later program runs skip it.
        The first call in a fresh process still pays the JIT cost.
        """
        return np.sum(data)
    
    def benchmark():
        """Time plain np.sum against the numba-compiled numba_sum.

        Fills the module-global ``data`` with 3e8 random floats, then over
        5 trials prints the elapsed time and result of each method. The
        first "method 2" measurement includes JIT compilation, so trial 0
        is much slower than the following trials.
        """
        st = time.time()
        ARRAY_SIZE = int(3e8)
        print("array size =", ARRAY_SIZE)
        global data
        data = np.random.random(ARRAY_SIZE)
        print("generated", time.time() - st)

        for trial in range(5):
            # Print the trial header once per trial (the original repeated
            # it before method 2 — a copy-paste duplicate).
            print("TRIAL =", trial)
            st = time.time()
            s = np.sum(data)
            print("method 1", time.time() - st, s)
            st = time.time()
            s = numba_sum(data)
            print("method 2", time.time() - st, s)
    
    
    # Script entry point: run the numpy-vs-numba timing comparison.
    if __name__ == "__main__":
        benchmark()