My code saves pages from a website and uses bs4 to get the urls of other pages. In my previous question I was advised to use multithreading instead of multiprocessing, and I was given a code that at first has really helped me. However, when I tested it on 100+ operations, it didn't close after completion (waited for 10 minutes), and I noticed that the program didn't saved all pages (and in different runs it doesn't save different pages, about 5-8 from a 120). With 30 operations it saves all consistently and waits for about 10 seconds before stopping. Maybe the code waits for remaining threads? How can I fix non-closing? This is the code I was given:
# libraries
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from multiprocessing.pool import ThreadPool
import threading
# variables
url = ""
directory = os.path.dirname(os.path.realpath(__file__))
env_path = directory + "\chromedriver"
chromedriver_path = env_path + "\chromedriver.exe"
UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 " \
"Safari/537.36 "
dict1 = {"Смартфоны и телефоны": "",
"Телевизоры и аудиотехника": "",
"Ноутбуки, ПК и Планшеты": "",
"Техника для кухни": "",
"Техника для дома": "",
"Игровая зона": "",
"Гаджеты и аксесуары": "",
"Посуда": "",
"Фото и видео": "",
"Красота и здоровье": "",
"Авто и инструменты": "",
"Спорт и туризм": "",
"Товары для дома и сада": "",
"Товары для детей": ""}
count = 0
threaded_data = threading.local() # Создаёт хранилище (класс) для потоков, чтобы не вызывать webdriver при каждой итерации
os.environ['PATH'] += env_path # Добавляет chromedriver в PATH
class Driver:
def __init__(self):
options = webdriver.ChromeOptions()
options.headless = True
options.add_experimental_option("excludeSwitches", ['enable-automation'])
self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
def __del__(self):
self.driver.quit() # driver.quit(), когда переменная больше не используется (при окончании выполнения потока)
print('The driver has been quited.')
def create_driver():
the_driver = getattr(threaded_data, 'the_driver', None)
if the_driver is None:
the_driver = Driver()
setattr(threaded_data, 'the_driver', the_driver)
return the_driver.driver
def processing_brand_pages(name):
with open(f"{directory}\section_pages\\{name}.html", encoding="utf-8") as file:
soup = BeautifulSoup(, "lxml")
links = soup.find_all("div", class_="title")
driver = create_driver()
for n in links:
ref = url + n.find('a').get('href')
global count
print(n.text, count)
count += 1
with open(f"{directory}\\brand_pages\\{name}\\{n.text}.html", "w", encoding="utf-8") as file:
except Exception as ex:
if __name__ == "__main__":
ThreadPool(processes=6).map(processing_brand_pages, dict1.keys())
del threaded_data # Quit all the Selenium drivers
import gc
I've modified it not to use Class (personal preference):
# libraries
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from multiprocessing.pool import ThreadPool
import threading
# variables
url = ""
directory = os.path.dirname(os.path.realpath(__file__))
env_path = directory + "\chromedriver"
chromedriver_path = env_path + "\chromedriver.exe"
UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 " \
"Safari/537.36 "
dict1 = {"Смартфоны и телефоны": "",
"Телевизоры и аудиотехника": "",
"Ноутбуки, ПК и Планшеты": "",
"Техника для кухни": "",
"Техника для дома": "",
"Игровая зона": "",
"Гаджеты и аксесуары": "",
"Посуда": "",
"Фото и видео": "",
"Красота и здоровье": "",
"Авто и инструменты": "",
"Спорт и туризм": "",
"Товары для дома и сада": "",
"Товары для детей": ""}
count = 0
threaded_data = threading.local() # Создаёт экземпляр класса (типо хранилище) для потоков, чтобы не вызывать webdriver при каждой итерации
os.environ['PATH'] += env_path # Добавляет chromedriver в PATH
def processing_brand_pages(name):
with open(f"{directory}\section_pages\\{name}.html", encoding="utf-8") as file:
soup = BeautifulSoup(, "lxml")
links = soup.find_all("div", class_="title")
driver = put_driver_in_threaded_data()
for n in links:
ref = url + n.find('a').get('href')
global count
print(n.text, count)
count += 1
with open(f"{directory}\\brand_pages\\{name}\\{n.text}.html", "w", encoding="utf-8") as file:
except Exception as ex:
def put_driver_in_threaded_data():
threaded_driver = getattr(threaded_data, 'driver_in_threaded_data', None)
if threaded_driver is None:
options = webdriver.ChromeOptions()
options.headless = True
options.add_experimental_option("excludeSwitches", ['enable-automation'])
threaded_driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
setattr(threaded_data, 'driver_in_threaded_data', threaded_driver)
return threaded_driver
if __name__ == "__main__":
ThreadPool(processes=6).map(processing_brand_pages, dict1.keys())
del threaded_data # Quit all the Selenium drivers
import gc
Both work in the same way and the problem happens also with both of them.
I think you need to write:
with ThreadPool(processes=6) as pool:, dict1.keys())
This forces a pool.join()
to happen before the next statement is executed. As your code is now written, you're just starting the threads, but then charging on ahead.