I am using Selenium headless browsers with concurrent.futures threading to scrape information from a website and write it to an Excel sheet. Beautiful Soup finds the elements fine and can print them, but appending them to the lists is not working: every list ends up blank except for one, which appends fine. Here is the code (sorry if it is messy, I'm still learning):
def init_driver():
    PROXY = 'http://p.webshare.io:9999'
    chrome_options = ChromeOptions()
    chrome_options.add_argument('log-level=3')
    chrome_options.add_argument('--proxy-server=%s' % PROXY)
    chrome_options.add_argument("--headless")  # Run headless
    driver_service = ChromeService(executable_path='chromedriver.exe')
    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
    return driver
def Get_info(url2):
    driver = init_driver()
    driver.get(url2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    title = soup.find('h1', class_='DrugHeader__title-content___2ZaPo').text
    images = soup.find_all('img', alt=title)
    store = soup.find_all(class_='saltInfo DrugHeader__meta-value___vqYM0')
    for things in store:
        if 'store' in things.text:
            stor = things.text
        else:
            pass
    with lock:
        pop = []
        for image in images:
            blah = image['src']
            final = blah.replace('l_watermark_346,w_480,h_480/a_ignore,w_480,h_480,c_fit,q_auto,f_auto/', '')
            print(final)
            pop.append(final)
        thing = ', '.join(pop)
        image_list.append(thing)
        names.append(title)
        storage.append(stor)
        manufacturer.append(soup.find_all(class_='DrugHeader__meta-value___vqYM0')[0].text)
        salt.append(soup.find_all(class_='DrugHeader__meta-value___vqYM0')[1].text)
        if soup.find(class_='DrugPriceBox__best-price___32JXw').text != None:
            price.append(soup.find(class_='DrugPriceBox__best-price___32JXw').text)
        elif soup.find(class_='PriceBoxPlanOption__offer-price___3v9x8 PriceBoxPlanOption__offer-price-cp___2QPU_').text != None:
            price.append(soup.find(class_='PriceBoxPlanOption__offer-price___3v9x8 PriceBoxPlanOption__offer-price-cp___2QPU_').text)
        elif soup.find(class_='DrugPriceBox__price___dj2lv').text != None:
            price.append(soup.find(class_='DrugPriceBox__price___dj2lv').text)
        pack = soup.find(class_='DrugPriceBox__quantity___2LGBX').text
        if 'vial' in pack:
            pack_type.append('vial')
        elif 'strip' in pack:
            pack_type.append('strip')
        elif 'bottle' in pack:
            pack_type.append('bottle')
        elif 'tube' in pack:
            pack_type.append('tube')
        elif 'packet' in pack:
            pack_type.append('packet')
        elif 'box' in pack:
            pack_type.append('box')
        elif 'cartridge' in pack:
            pack_type.append('cartridge')
        elif 'ampoule' in pack:
            pack_type.append('ampoule')
        elif 'syringe' in pack:
            pack_type.append('syringe')
        else:
            pack_type.append('N/A')
        packaging.append(soup.find(class_='DrugPriceBox__quantity___2LGBX').text)
    driver.quit()
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor2:
    print('Start!')
    print(len(url_list))
    for url2 in url_list:
        executor2.submit(Get_info, url2)

file2.close()
print('a' + str(len(names)))
print('b' + str(len(price)))
print('d' + str(len(packaging)))
print('salt = ' + str(len(salt)))
print('e' + str(len(storage)))
print('f' + str(len(manufacturer)))
listing_dict = {
    'Drug Name': names,
    'Price': price,
    'Packaging type': pack_type,
    'Packaging': packaging,
    'Composition': salt,
    'Storage': storage,
    'Manufacturer': manufacturer,
    'Images': image_list
}
print("______________________Job Finished!______________________")
df = pd.DataFrame(listing_dict)
df.to_excel("Test.xlsx")
As you noticed, the results are not making it from the worker threads back into your lists reliably. The usual way to collect results from threads is to save the concurrent.futures.Future object that submit() returns and to call its result() method on each one; result() also re-raises any exception that occurred inside the worker, so failures stop being silent. This would be the template:
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor2:
    print('Start!')
    print(len(url_list))
    futures = []
    for url2 in url_list:
        futures.append(executor2.submit(Get_info, url2))
    concurrent.futures.wait(futures)
    results = [f.result() for f in futures]
However, you will need to change your Get_info() code to return the partial results (I'd suggest a dictionary, since a table of columns is your final goal) and write code to combine the results at the end; a rough sketch follows.
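As a minimal sketch of that refactor (assuming your existing init_driver() and url_list, and eliding the price, pack type, storage, and image extraction, which stays exactly as in your current code), each worker builds and returns its own row dictionary, and the main thread assembles the DataFrame once all futures have completed:

import concurrent.futures
import pandas as pd
from bs4 import BeautifulSoup

def Get_info(url2):
    driver = init_driver()
    try:
        driver.get(url2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        title = soup.find('h1', class_='DrugHeader__title-content___2ZaPo').text
        meta = soup.find_all(class_='DrugHeader__meta-value___vqYM0')
        # Build one row per URL; no shared lists, so no lock is needed.
        return {
            'Drug Name': title,
            'Manufacturer': meta[0].text,
            'Composition': meta[1].text,
            # ... price, pack type, storage, and images as in the original code ...
        }
    finally:
        driver.quit()  # always release the browser, even if parsing fails

with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor2:
    futures = [executor2.submit(Get_info, url2) for url2 in url_list]
    # result() re-raises any exception from the worker instead of hiding it
    rows = [f.result() for f in futures]

# pandas builds the columns directly from a list of row dictionaries
df = pd.DataFrame(rows)
df.to_excel("Test.xlsx")

Because every thread writes only to its own dictionary, the lock and all the module-level lists can be deleted, and a row can never end up half-filled.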