I'm trying to use multiprocessing to train multiple models with different algorithms at the same time. For example, a Naive Bayes model and a RandomForest model trained in parallel on the same dataset. I am using concurrent.futures.ProcessPoolExecutor() to accomplish this, but I'm running into an error.
This is my code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import time
import concurrent.futures

df = pd.read_csv(r".\DATA\heart.csv")
X = df.iloc[:, :-1]             # all columns except the output column
y = pd.DataFrame(df["output"])  # the output column

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=53)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
if __name__ == '__main__':
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.linear_model import SGDClassifier

    start = time.perf_counter()

    # list of algorithm classes to train
    algo_name = [BernoulliNB, RandomForestClassifier, SVC, SGDClassifier]

    # function to train one model and print its metrics
    def train(algo_name):
        model = algo_name().fit(x_train, y_train)
        y_pred = model.predict(x_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        print("--------------------------------")
        print("Accuracy: ", round(acc * 100, 2), "%")
        print("F1 score: ", round(f1 * 100, 2), "%")
        print("Precision: ", round(prec * 100, 2), "%")
        print("Recall: ", round(recall * 100, 2), "%")
        print("--------------------------------")

    # run the train function on every item in the algo_name list in parallel
    with concurrent.futures.ProcessPoolExecutor() as executor:
        executor.map(train, algo_name)

    end = time.perf_counter()
    print(f'Program runtime is {round((end - start) * 1000, 2)} ms')
And this is the error I'm getting:
Process SpawnProcess-2:
Process SpawnProcess-4:
Process SpawnProcess-1:
Process SpawnProcess-3:
Worth mentioning: when I change concurrent.futures.ProcessPoolExecutor() to concurrent.futures.ThreadPoolExecutor(), the program runs just fine, but there is no time improvement over running the program sequentially.
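To be concrete, the thread-based variant is the same code with only the executor class swapped; everything else stays the same:

# threads instead of processes; the rest of the script is unchanged
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(train, algo_name)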
I rewrote this slightly as a minimal reproducible example:
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import concurrent.futures
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10_000)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
if __name__ == '__main__':
    start = time.perf_counter()

    algo_name = [BernoulliNB, RandomForestClassifier, SVC, SGDClassifier]

    def train(algo_name):
        model = algo_name().fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(model, accuracy_score(y_test, y_pred))

    with concurrent.futures.ProcessPoolExecutor() as executor:
        executor.map(train, algo_name)

    end = time.perf_counter()
    print(f'Program runtime is {round((end - start) * 1000, 2)} ms')
And got the output:
BernoulliNB() 0.9196
SGDClassifier() 0.94
SVC() 0.9652
RandomForestClassifier() 0.9736
Program runtime is 1978.72 ms
Running Python 3.10.6 with scikit-learn==1.2.0 on an Ubuntu machine.
scikit-learn has its own ways of dealing with parallelism (e.g. see the documentation on Parallelism, resource management, and configuration), which might be interfering here.
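One common interaction is oversubscription: each worker process brings up its own OpenMP/BLAS thread pools (8 threads each on this machine, per the details below), so four processes can end up competing for the same 8 cores. As a rough sketch, you could cap the per-worker thread pools inside the train function from the example above, using threadpoolctl (already installed per the versions below); whether this actually changes anything for these four estimators is an assumption worth testing:

from threadpoolctl import threadpool_limits

def train(algo):
    # Assumption: one OpenMP/BLAS thread per worker process, so the
    # ProcessPoolExecutor workers do not oversubscribe the CPU.
    with threadpool_limits(limits=1):
        model = algo().fit(X_train, y_train)
        y_pred = model.predict(X_test)
    print(model, accuracy_score(y_test, y_pred))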
System details from sklearn.show_versions(), in case it is useful:
System:
    python: 3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]
    executable: /home/hayesall/miniconda3/envs/srlearn/bin/python
    machine: Linux-5.15.0-56-generic-x86_64-with-glibc2.35

Python dependencies:
    sklearn: 1.2.0
    pip: 22.2.2
    setuptools: 65.5.0
    numpy: 1.23.4
    scipy: 1.9.3
    Cython: None
    pandas: 1.5.2
    matplotlib: 3.6.2
    joblib: 1.2.0
    threadpoolctl: 3.1.0

Built with OpenMP: True

threadpoolctl info:
    user_api: openmp
    internal_api: openmp
    prefix: libgomp
    filepath: /home/hayesall/miniconda3/envs/srlearn/lib/python3.10/site-packages/scikit_learn.libs/libgomp-a34b3233.so.1.0.0
    version: None
    num_threads: 8

    user_api: blas
    internal_api: openblas
    prefix: libopenblas
    filepath: /home/hayesall/miniconda3/envs/srlearn/lib/python3.10/site-packages/numpy.libs/libopenblas64_p-r0-742d56dc.3.20.so
    version: 0.3.20
    threading_layer: pthreads
    architecture: Haswell
    num_threads: 8

    user_api: blas
    internal_api: openblas
    prefix: libopenblas
    filepath: /home/hayesall/miniconda3/envs/srlearn/lib/python3.10/site-packages/scipy.libs/libopenblasp-r0-41284840.3.18.so
    version: 0.3.18
    threading_layer: pthreads
    architecture: Haswell
    num_threads: 8