I have a function that I want to multithread/parallelize in Python3.
The df.myfunc(c1,c2)
function takes a long time to compute, and thus I would like to parallelize it, to speed up the computation for larger datasets.
def multi_thread_func(df):
cols = df.schema.names
length = len(cols)
a = np.zeros((length * length))
with multiprocessing.Pool() as pool:
i = 0
for value in pool.starmap(df.myfunc, itertools.product(cols, repeat=2)):
a[i] = None if value is None else value
i += 1
return a
The specific error I am getting is:
TypeError: cannot pickle '_thread.lock' object
def multi_thread_func(df):
length = len(df.cols)
a = np.zeros((length * length))
with multiprocessing.Pool() as pool:
i = 0
for value in pool.starmap(calculate, itertools.product(range(length), repeat=2)):
a[i] = value
i += 1
return a