So I have code that spins up 4 selenium chrome drivers and scrapes data from an element on the web pages. The code can be simplified to something like this:
import json
import math
import multiprocessing as mp

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
class scraper():
    """Scrape a fixed batch of URLs with one dedicated Chrome driver.

    Attributes:
        urls: the URLs this worker is responsible for.
        process_num: identifier used only in the progress output.
    """

    def __init__(self, list_of_urls, process_num):
        self.urls = list_of_urls
        self.process_num = process_num

    def scrape_urls(self):
        """Visit every URL and collect the text of its "InterestingData" element.

        Returns:
            A list with one string per URL, in the order of ``self.urls``.
            An empty batch returns ``[]`` without starting a browser.

        Raises:
            selenium.common.exceptions.TimeoutException: if the element does
                not appear within 7 seconds on some page.
        """
        if not self.urls:
            # Nothing to do: avoid launching a Chrome process for no work.
            return []

        # NOTE(review): ``driver_dir`` is expected to be defined at module
        # level (path to chromedriver); it is not visible in this snippet.
        driver = webdriver.Chrome(driver_dir)
        data = []
        try:
            for url in self.urls:
                # Load the page before waiting for the element to show up.
                driver.get(url)
                element = WebDriverWait(driver, timeout=7).until(
                    lambda d: d.find_element(by=By.CLASS_NAME, value="InterestingData")
                )
                data.append(element.text)
                print("Scraper # ", self.process_num," got data from: ",url)
        finally:
            # Always terminate the chromedriver/Chrome processes, even when a
            # page times out; otherwise they are left running.
            driver.quit()
        return data
if __name__ == '__main__':
    # Number of worker processes (one Chrome driver each); the accompanying
    # description uses four.
    number_of_processes = 4

    with open('array_of_urls', 'r') as infile:
        urls = json.load(infile)

    length_of_urls = len(urls)
    # Chunk size via ceiling division; at least 1 so the range() step below
    # is valid even when the URL list is empty.
    partition_into = max(1, math.ceil(length_of_urls / number_of_processes))

    # Slicing clamps at the end of the list, so the final chunk is simply
    # shorter and no URL is dropped. Empty chunks are never created.
    scrapers = [
        scraper(urls[start:start + partition_into], num)
        for num, start in enumerate(range(0, length_of_urls, partition_into))
    ]

    with mp.Pool(processes=number_of_processes) as pool:
        # Dispatch every scraper first, then block on the results, so all
        # workers run concurrently.
        pending = [pool.apply_async(worker.scrape_urls) for worker in scrapers]
        result_array = [job.get() for job in pending]
The problem I am running into is that after 5-10 minutes one of the scrapers just stops; the only thing that wakes it back up is manually refreshing the page in the browser. If I leave it for an hour or so, 3 of the 4 stop and only one keeps running. They don't error out or print anything; they just stop running. I've tried it on 2 different laptops and both have the same issue. I've also tried doing this with 4 different mp.Process() instances running scrape_urls, and that does the same thing. Has anyone else run into this issue, or am I doing something wrong here?
For one thing, Selenium already creates a process for each driver, so it is far better to use multithreading instead of multiprocessing: each thread will be starting a process anyway. Also, in `scrape_urls`, after your `driver = webdriver.Chrome(driver_dir)` statement, the rest of the function should be enclosed in a try/finally statement whose finally block contains `driver.quit()`, to ensure that the driver process is terminated whether or not there is an exception. Right now you are leaving all the driver processes running.
You might also consider using the following technique that creates a thread pool of size 4 (or less depending on how many URLs there are to process), but each thread in the pool automatically reuses the driver that has been allocated to its thread, which is kept in thread local storage. You might wish to change the options used to create the driver (currently "headless" mode):
import json
from selenium import webdriver
from import By
from import WebDriverWait
from multiprocessing.pool import ThreadPool
import threading
import gc
# Per-thread storage: each pool thread keeps its own Driver wrapper here.
threadLocal = threading.local()


class Driver:
    """Owns one Chrome WebDriver and quits it when the wrapper is collected."""

    def __init__(self):
        options = webdriver.ChromeOptions()
        # Run without a visible browser window; remove to watch the browser.
        options.add_argument("--headless")
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        # ``driver`` is missing if __init__ raised before assigning it.
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        driver.quit()  # clean up driver when we are cleaned up
        print('The driver has been "quitted".')

    @classmethod
    def create_driver(cls):
        """Return the calling thread's WebDriver, creating it on first use.

        The wrapper instance is cached in ``threadLocal``, so repeated calls
        from the same thread reuse a single browser.
        """
        the_driver = getattr(threadLocal, 'the_driver', None)
        if the_driver is None:
            print('Creating new driver.')
            the_driver = cls()
            threadLocal.the_driver = the_driver
        return the_driver.driver
def scraper(url):
    """Scrape a single URL using the calling thread's cached driver.

    Args:
        url: address of the page to load.

    Returns:
        The text of the first element with class "InterestingData".

    Raises:
        selenium.common.exceptions.TimeoutException: if the element does not
            appear within 7 seconds.
    """
    driver = Driver.create_driver()
    # Load the page before waiting for the element to show up.
    driver.get(url)
    element = WebDriverWait(driver, timeout=7).until(
        lambda d: d.find_element(by=By.CLASS_NAME, value="InterestingData")
    )
    print("got data from: ", url)
    return element.text
with open('array_of_urls', 'r') as infile:
    urls = json.load(infile)

# Never start more threads (and therefore browsers) than there are URLs.
number_of_processes = min(4, len(urls))
if number_of_processes:
    with ThreadPool(processes=number_of_processes) as pool:
        # One call to scraper() per URL; results keep the order of ``urls``.
        result_array = pool.map(scraper, urls)
else:
    # ThreadPool rejects a size of 0, so an empty URL list is handled here.
    result_array = []

# Must ensure drivers are quitted before threads are destroyed:
del threadLocal
# This should ensure that the __del__ method is run on class Driver:
gc.collect()