It's my first Python project after 10 years and my first experience with Python multiprocessing, so there may be some very basic mistakes I haven't noticed.
I'm stuck on a multiprocessing web crawler in Python. The crawler checks a main page for changes and then iterates through subcategories in parallel, adding items to a list. These items are then checked in parallel and extracted via Selenium (I couldn't figure out how to do it otherwise, because the content is loaded dynamically into the page when an item is clicked).
Main loop:
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import time
from bs4 import BeautifulSoup
import pickledb
import random
import multiprocessing
import itertools
import config
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def getAllSubCategories(pageNumber, items):
    # check website and look for subcategories that are "worth" extracting
    url = 'https://www.google.com' + str(pageNumber)
    response = requests.get(url, verify=False, headers=config.headers, cookies=config.cookies)
    pageSoup = BeautifulSoup(response.content, features='html.parser')
    elements = pageSoup.find(...)
    if not elements: # website not loading properly
        return getAllSubCategories(pageNumber, items)
    for element in elements:
        items.append(element)
def checkAndExtract(item, ignoredItems, itemsToIgnore):
    # check if items are already extracted; if not, extract them if they contain a keyword
    import checker
    import extractor

    if item not in ignoredItems:
        if checker.check(item):
            extractor.extract(item, itemsToIgnore)
        else:
            itemsToIgnore.append(item)
if __name__ == '__main__':
    multiprocessing.freeze_support()
    itemsToIgnore = multiprocessing.Manager().list()
    crawlUrl = 'https://www.google.com/'
    db = pickledb.load('myDB.db', False)

    while True:
        try:
            # check main website for changes
            response = requests.get(crawlUrl, verify=False, headers=config.headers, cookies=config.cookies)
            soup = BeautifulSoup(response.content, features='html.parser')
            mainCondition = soup.find(...)

            if mainCondition:
                numberOfPages = soup.find(...)

                ignoredItems = db.get('ignoredItems')
                if not ignoredItems:
                    db.lcreate('ignoredItems')
                    ignoredItems = db.get('ignoredItems')

                items = multiprocessing.Manager().list()
                # get all items from subcategories
                with multiprocessing.Pool(30) as pool:
                    pool.starmap(getAllSubCategories, zip(range(numberOfPages, 0, -1), itertools.repeat(items)))

                itemsToIgnore[:] = []
                # loop through all items
                with multiprocessing.Pool(30) as pool:
                    pool.starmap(checkAndExtract, zip(items, itertools.repeat(ignoredItems), itertools.repeat(itemsToIgnore)))

                for item in itemsToIgnore:
                    if item not in db.get('ignoredItems'):
                        db.ladd('ignoredItems', item)
                db.dump()

            time.sleep(random.randint(10, 20))
        except KeyboardInterrupt:
            break
        except Exception as e:
            print(e)
            continue
Checker:
import config
def check(item):
    title = item...
    try:
        for keyword in config.keywords: # just a string array
            if keyword.lower() in title.lower():
                return True
    except Exception as e:
        print(e)
    return False
Extractor:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
import config
def extract(item, itemsToIgnore):
    driver = webdriver.Chrome('./chromedriver')
    driver.implicitly_wait(3)
    driver.get('https://www.google.com')
    for key in config.cookies:
        driver.add_cookie({'name': key, 'value': config.cookies[key], 'domain': '.google.com'})
    try:
        driver.get('https://www.google.com')
        wait = WebDriverWait(driver, 10)
        if driver.title == 'Page Not Found':
            extract(item, itemsToIgnore)
            return
        driver.find_element_by_xpath('...').click()
        time.sleep(1)
        button = wait.until(EC.element_to_be_clickable((By.XPATH, '...')))
        button.click()
        # and some extraction magic
    except:
        extract(item, itemsToIgnore) # try again
Everything works fine, and some test runs were successful. But sometimes the loop starts its next iteration before the pool has finished its work. In the logs I can see the item checker return true, but the extractor does not even start, and the main process begins the next iteration:
2019-12-23 00:21:16,614 [SpawnPoolWorker-6220] [INFO ] check returns true
2019-12-23 00:21:18,142 [MainProcess ] [DEBUG] starting next iteration
2019-12-23 00:21:39,630 [SpawnPoolWorker-6247] [INFO ] checking subcategory
I also suspect the pool is somehow not cleaning up after itself, as the SpawnPoolWorker-XXXX numbers should probably not be that high. The crawler also freezes after about an hour; this may be connected to this issue.
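For what it's worth, those climbing worker numbers are consistent with a fresh Pool(30) being created, used once, and thrown away on every iteration of the while loop. A minimal sketch of the alternative, creating the pool once and reusing it across iterations (with a hypothetical dummyWorker standing in for getAllSubCategories, and a fixed iteration count standing in for while True):

import multiprocessing
import itertools
import time

def dummyWorker(pageNumber, items):
    # stand-in for getAllSubCategories / checkAndExtract
    items.append(pageNumber)

if __name__ == '__main__':
    multiprocessing.freeze_support()
    manager = multiprocessing.Manager()

    # create the pool once; the same 30 workers are reused every iteration,
    # so the SpawnPoolWorker numbers stay constant instead of growing
    with multiprocessing.Pool(30) as pool:
        for iteration in range(3):  # stands in for the while True loop
            items = manager.list()
            pool.starmap(dummyWorker, zip(range(10, 0, -1), itertools.repeat(items)))
            print('iteration', iteration, 'collected', len(items), 'items')
            time.sleep(1)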
I fixed the loop issue by either switching from Windows 7 to Windows 10 or by switching from starmap to starmap_async and calling get() on the result afterwards.
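For reference, a minimal, self-contained sketch of the starmap_async variant (with a hypothetical dummyTask in place of checkAndExtract): calling get() on the returned AsyncResult blocks until every task has finished and re-raises any worker exception in the main process, so the next iteration cannot start early.

import multiprocessing
import itertools

def dummyTask(item, suffix):
    # stand-in for checkAndExtract
    return str(item) + suffix

if __name__ == '__main__':
    multiprocessing.freeze_support()
    with multiprocessing.Pool(4) as pool:
        result = pool.starmap_async(dummyTask, zip(range(10), itertools.repeat('-done')))
        # get() waits for all tasks to complete and re-raises worker exceptions
        print(result.get())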
The freeze was most probably caused by calling requests.get() without passing a value for timeout.
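A sketch of that change, assuming the same kind of requests.get() call as in the main loop (the timeout values here are arbitrary placeholders, not taken from the original code):

import requests

# Sketch: without a timeout, requests.get() can block indefinitely on a stalled
# connection and hang a worker; with one, it raises a Timeout instead.
try:
    response = requests.get('https://www.google.com', verify=False,
                            timeout=(5, 30))  # (connect, read) timeouts in seconds, placeholder values
except requests.exceptions.Timeout:
    # treat a hung request like any other failed page load: retry or skip
    response = None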