Tags: python, python-3.x, web-scraping, openpyxl, concurrent.futures

Unable to figure out how to write the results back to the same worksheet when using concurrent.futures


I've used the openpyxl library to read ticker names from a worksheet, look up each ticker on a website, and finally write the results back to the same worksheet, right next to the tickers in their respective cells.

When I run the script without any concurrency, it works flawlessly.

However, I can't figure out how to write the results back to the correct cells in the worksheet once I switch to concurrent.futures.

My current attempt:

import requests
from openpyxl import load_workbook
import concurrent.futures as futures

wb = load_workbook('Screener.xlsx')
ws = wb['Screener-1']

link = 'https://backend.otcmarkets.com/otcapi/company/profile/full/{}?'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
}
params = {
    'symbol': ''
}

def get_info(ticker):
    target_link = link.format(ticker)
    params['symbol'] = ticker
    r = requests.get(target_link, params=params, headers=headers)
    try:
        address = r.json()['address']
    except (AttributeError,KeyError,IndexError): 
        address = ""
    try:
        website = r.json()['website']
    except (AttributeError,KeyError,IndexError): 
        website = ""
    return address,website

if __name__ == '__main__':
    ticker_list = []
    for row in range(2, ws.max_row + 1):
        ticker = ws.cell(row=row, column=1).value
        if ticker is None:
            break
        ticker_list.append(ticker)

    with futures.ThreadPoolExecutor(max_workers=6) as executor:
        future_to_url = {executor.submit(get_info, ticker): ticker for ticker in ticker_list}
        for future in futures.as_completed(future_to_url):
            address, website = future.result()
            print(address,website)

    #         ws.cell(row=row, column=2).value = '{}'.format(address)
    #         ws.cell(row=row, column=3).value = '{}'.format(website)
    # wb.save('Screener.xlsx')
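The commented-out lines can't work as written because `row` from the earlier read loop is long stale by the time results arrive out of order. One way around this is the same trick the code already uses for tickers: key each future by its worksheet row, so every result can be written to the right cells no matter when it completes. A minimal stdlib-only sketch, with a dummy `fetch_info` standing in for the real network call:

```python
import concurrent.futures as futures

def fetch_info(ticker):
    # Dummy stand-in for the real get_info() network call
    return f"{ticker} street", f"{ticker.lower()}.com"

# row -> ticker, as read from column 1 of the sheet
tickers = {2: "URBT", 3: "TPRP", 4: "CRBO"}

results = {}
with futures.ThreadPoolExecutor(max_workers=3) as executor:
    # Key each future by its worksheet row instead of its ticker
    future_to_row = {
        executor.submit(fetch_info, ticker): row for row, ticker in tickers.items()
    }
    for future in futures.as_completed(future_to_row):
        row = future_to_row[future]
        address, website = future.result()
        results[row] = (address, website)
        # With openpyxl you would write back here instead:
        # ws.cell(row=row, column=2).value = address
        # ws.cell(row=row, column=3).value = website
```

Since only the main thread touches the worksheet, there is no locking to worry about, and `wb.save()` still happens once, after the executor block.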

A few tickers for testing:

tickers = ['URBT','TPRP','CRBO','PVSP','TSPG','VMHG','MRTI','VTMC','TORM','SORT']

How can I write the results back to the same worksheet, in their matching cells, when scraping concurrently with concurrent.futures?

In case you wish to know exactly where I'm trying to write the data, this is how the worksheet looks.


Solution

  • Since you're already using openpyxl, I'd recommend pandas, as you may find it a bit easier to work with workbooks. In fact, openpyxl is the engine pandas' read_excel uses for .xlsx files.

    Assuming you have a file Screener.xlsx that has a Symbol column:


    You can grab the missing data and update the workbook.

    Here's how:

    import concurrent.futures as futures
    
    import pandas as pd
    import requests
    
    link = 'https://backend.otcmarkets.com/otcapi/company/profile/full/{}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    }
    
    
    def get_info(ticker) -> dict:
        print(f"Fetching data for {ticker}...")
        r = requests.get(link.format(ticker), headers=headers)
        try:
            address = r.json()["address1"]
        except (AttributeError, KeyError, IndexError):
            address = "None"
        try:
            website = r.json()["website"]
        except (AttributeError, KeyError, IndexError):
            website = "None"
        return {"ticker": ticker, "address": address, "website": website}
    
    
    if __name__ == "__main__":
        df = pd.read_excel("Screener.xlsx")
        tickers = df["Symbol"].to_list()
        with futures.ThreadPoolExecutor(max_workers=6) as executor:
            future_to_url = {
                executor.submit(get_info, ticker): ticker for ticker in tickers
            }
        tickers_scraped = [
            future.result() for future in futures.as_completed(future_to_url)
        ]
        sorted_tickers = sorted(
            tickers_scraped, key=lambda i: tickers.index(i["ticker"])
        )
        df.loc[:, ["Address", "Website"]] = [
            [i["address"], i["website"]] for i in sorted_tickers
        ]
        df.to_excel("Screener.xlsx", index=False)
    
    

    Running this fills the Address and Website columns next to each symbol in the workbook.
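The `sorted(...)` step is needed because `as_completed` yields futures in completion order, not submission order. A quick stdlib-only illustration, with sleep times contrived so that later submissions finish first:

```python
import concurrent.futures as futures
import time

def work(i):
    time.sleep((3 - i) * 0.1)  # task 0 sleeps longest, task 2 shortest
    return i

with futures.ThreadPoolExecutor(max_workers=3) as executor:
    fs = [executor.submit(work, i) for i in range(3)]
    # as_completed yields each future as soon as it finishes
    completed_order = [f.result() for f in futures.as_completed(fs)]

# completed_order comes out in completion order, here the
# reverse of submission order
```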

    EDIT:

    Here's a pure pandas approach that needs no sorting of the scraped data: pandas aligns the assignment on the index, so the completion order doesn't matter.

    if __name__ == "__main__":
        df = pd.read_excel("Screener.xlsx")
        tickers = df["Symbol"].to_list()
        with futures.ThreadPoolExecutor(max_workers=6) as executor:
            future_to_url = {
                executor.submit(get_info, ticker): ticker for ticker in tickers
            }
        tickers_scraped = [
            future.result() for future in futures.as_completed(future_to_url)
        ]
        df_scraped = pd.DataFrame(tickers_scraped).set_index("ticker")
        df = df.set_index("Symbol")
        df[["Address", "Website"]] = df_scraped[["address", "website"]]
        df = df.reset_index()
        df.to_excel("Screener.xlsx", index=False)
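The reason no sorting is needed is that pandas aligns assignments by index label, not by position. A small self-contained illustration with dummy data (no scraping), where the scraped rows arrive in a different order than the sheet:

```python
import pandas as pd

df = pd.DataFrame({"Symbol": ["URBT", "TPRP", "CRBO"]}).set_index("Symbol")

# Results arrive in completion order, not sheet order
scraped = pd.DataFrame(
    [
        {"ticker": "CRBO", "website": "crbo.example"},
        {"ticker": "URBT", "website": "urbt.example"},
        {"ticker": "TPRP", "website": "tprp.example"},
    ]
).set_index("ticker")

# Assignment matches rows by index label, so order is irrelevant
df["Website"] = scraped["website"]
df = df.reset_index()
```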