Search code examples
pythonselenium-webdriverpython-multithreadingconcurrent.futures

Appending scrapped information to lists with python multi-threading resulting in 0 lists


I am using Selenium headless browsers with concurrent futures threading in order to scrape information from a website and output it to an Excel sheet. However, while Beautiful Soup is able to find the elements fine and is able to print the elements, appending them to the list is not working and ends up with blank lists except for 1 list which appends fine. Here is the code (sorry if it is messy, I'm still learning):

def init_driver():
    PROXY = 'http://p.webshare.io:9999'
    chrome_options = ChromeOptions()
    chrome_options.add_argument('log-level=3')
    chrome_options.add_argument('--proxy-server=%s' % PROXY)
    chrome_options.add_argument("--headless")  # Run headless
    driver_service = ChromeService(executable_path='chromedriver.exe')
    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
    return driver


def Get_info(url2):
    driver = init_driver()
    driver.get(url2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    title = soup.find('h1', class_='DrugHeader__title-content___2ZaPo').text
    images = soup.find_all('img', alt = title)
    store = soup.find_all(class_='saltInfo DrugHeader__meta-value___vqYM0')
    for things in store:
        if 'store' in things.text:
            stor = thing.text
        else:
            pass
    with lock:
        pop = []
        for image in images:
            blah = image['src']
            final = blah.replace('l_watermark_346,w_480,h_480/a_ignore,w_480,h_480,c_fit,q_auto,f_auto/', '')
            print(final)
            pop.append(final)
        thing = ', '.join(pop)
        image_list.append(thing)
        names.append(title)
        storage.append(stor)
        manufacturer.append(soup.find_all(class_='DrugHeader__meta-value___vqYM0')[0].text)
        salt.append(soup.find_all(class_='DrugHeader__meta-value___vqYM0')[1].text)
        if soup.find(class_='DrugPriceBox__best-price___32JXw').text != None:
            price.append(soup.find(class_='DrugPriceBox__best-price___32JXw').text)
        elif soup.find(class_='PriceBoxPlanOption__offer-price___3v9x8 PriceBoxPlanOption__offer-price-cp___2QPU_').text != None:
            price.append(soup.find(class_='PriceBoxPlanOption__offer-price___3v9x8 PriceBoxPlanOption__offer-price-cp___2QPU_').text)
        elif soup.find(class_='DrugPriceBox__price___dj2lv').text != None:
            price.append(soup.find(class_='DrugPriceBox__price___dj2lv').text)
        pack = soup.find(class_='DrugPriceBox__quantity___2LGBX').text
        if 'vial' in pack:
            pack_type.append('vial')
        elif 'strip' in pack:
            pack_type.append('strip')
        elif 'bottle' in pack:
            pack_type.append('bottle')
        elif 'tube' in pack:
            pack_type.append('tube')
        elif 'packet' in pack:
            pack_type.append('packet')
        elif 'box' in pack:
            pack_type.append('box')
        elif 'vial' in pack:
            pack_type.append('vial')
        elif 'cartridge' in pack:
            pack_type.append('cartridge')
        elif 'ampoule' in pack:
            pack_type.append('ampoule')
        elif 'syringe' in pack:
            pack_type.append('syringe')
        else:
            pack_type.append('N/A')
        packaging.append(soup.find(class_='DrugPriceBox__quantity___2LGBX').text)
    driver.quit()


with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor2:
    print('Start!')
    print(len(url_list))
    for url2 in url_list:
        executor2.submit(Get_info, url2)

file2.close()


print('a' + str(len(names)))
print('b' + str(len(price)))
print('d' + str(len(packaging)))
print('salt = ' + str(len(salt)))
print('e'+str(len(storage)))
print('f'+str(len(manufacturer)))



listing_dict = {
    'Drug Name': names,
    'Price': price,
    'Packagaing type': pack_type,
    'Packaging': packaging,
    'Composition': salt,
    'Storage': storage,
    'Manufacterer': manufacturer,
    'Images': image_list
 }

print("______________________Job Finished!______________________")
df = pd.DataFrame(listing_dict)
df.to_excel(f"Test.xlsx")

Solution

  • As you noticed, the threads do not share the lists that you are appending to.

    The usual way to collect results from threads is to save the concurrent.futures.Future object that submit() creates and to call the result() method for each.

    This would be the template:

    with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor2:
        print('Start!')
        print(len(url_list))
        futures = []
        for url2 in url_list:
            futures.append(executor2.submit(Get_Info, url2))
        concurrent.futures.wait(futures)
        results = [f.result() for f in futures]
    

    However, you will need to change your Get_Info() code to return the partial lists (I'd suggest a dictionary, since that is your final goal) and write code to combine the results at the end.