Search code examples
pythonseleniumpython-multiprocessingpython-multithreading

Creating Multiple Instances of a Selenium Scraper Class and running the in Parallel


So I have created a web scraper with selenium that infinitely crawls a web page. I am trying to create two instances of this scraper and run them in parallel so that two different portions of the site (or two different sites entirely) will be scraped at the same time. With my current code, both processes start and two chrome instances launch, but only one actually starts scraping. The other just sits on the landing page and never moves. My current scraper class looks like this

class clBot(Scraper):

def __init__(self, light_or_dark):
    light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
                              '//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
    dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
    if light_or_dark == "light":
        self.xpaths_to_scrape = light_side_xpaths
        self.csv_file = "lightside.csv"
    elif light_or_dark == "dark":
        self.xpaths_to_scrape = dark_side_xpaths
        self.csv_file = "darkside.csv"
    else:
        print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
        quit()
    self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
    self.options = webdriver.ChromeOptions()
    #self.options.add_argument('--headless')
    self.options.add_argument('user-agent={self.user_agent}')
    self.current_region = ''
    self.driver = webdriver.Chrome(chrome_options=self.options)
    self.driver.get('https://craigslist.org')

    def run(self):
        self.navigate_pages()


def identify_phone_number(self, string, phone_number_list):
    reg = re.findall(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
    if len(reg) > 0:
        for r in reg:
            if r.strip() not in phone_number_list:
                with open(self.csv_file, 'a') as csv:
                    csv.write("{}\n".format(r.strip()))
                print("Extracted {} from listing".format(r.strip()))
            else:
                print('Phone number already in list.')


def extract_phone_number(self):
    try:
        with open(self.csv_file, 'r') as csv:
            current_phone_numbers = csv.read()
        posting_body = self.driver.find_element_by_id('postingbody')
        self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
        contact_info = self.driver.find_element_by_class_name('showcontact')
        contact_info.click()
        time.sleep(1)
        self.identify_phone_number(posting_body.text, current_phone_numbers)
    except TimeoutException:
        self.identify_phone_number(posting_body.text, current_phone_numbers)
        print('There is no phone number in this listing.')



def scrape_pages(self):
    i=1
    while True:
        try:
            self.scraper_wait_class_until_all(self.driver, 'result-row')
            results = self.driver.find_elements_by_class_name('result-row')
            print("clicking result {}".format(i))
            results[i].find_element_by_class_name('result-title').click()
            self.extract_phone_number()
            self.driver.back()
            i+=1
        except IndexError:
            self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
            next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
            print('Navigating to next page.')
            next_button.click()
            i=1

def choose_xpath_to_scrape(self, list_of_xpaths):
    xpath_index = randint(0, len(list_of_xpaths)-1)
    xpath = list_of_xpaths[xpath_index]
    return xpath
def navigate_pages(self):
    try:
        while True:
            try:
                self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
                rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
                nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
                child_items = nearby_cl.find_elements_by_class_name('s')
                random = randint(1, len(child_items)-1)
                time.sleep(3)
                print("Clicking {}".format(child_items[random].text))
                child_items[random].click()
                for xpath in self.xpaths_to_scrape:
                    area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
                    area_to_scrape.click()
                    self.scrape_pages()
                    self.driver.back()
                    time.sleep(1)
            except WebDriverException:
                continue
    except Exception as e:
        print(e)
        return
    finally:
        self.driver.quit()

and the main.py file that opens the two processes and initializes them is as follows:

import scraper

from multiprocessing import Process, Manager



if __name__ == "__main__":
    manager = Manager()
    d = manager.dict()
    l = manager.list(range(10))
    darksideScraper = scraper.clBot('light')
    lightsideScraper = scraper.clBot('dark')

    darkside = Process(target=darksideScraper.navigate_pages())
    lightside = Process(target=lightsideScraper.navigate_pages())
    darkside.start()
    lightside.start()
    darkside.join()
    lightside.join()

Any help would be appreciated!


Solution

  • Try passing your target as reference to your function instead of calling it, like this Process(target=darksideScraper.navigate_pages). Also refer to this, for another example of how to use multiprocessing.