Search code examples
python pandas multithreading multiprocessing concurrent.futures

raise ValueError(err) - Implementing multithreading with concurrent.futures in Python


I have written Python code that scrapes information from a website, and I tried to apply multithreading to it. Here is my code before applying multithreading; it runs perfectly on my PC:

import time

import investpy
import pandas as pd
import requests
from bs4 import BeautifulSoup

def getCurrencyHistorical():
    """Download the historical-data tables for three USD pairs into one frame.

    Each page listed in ``links`` is fetched sequentially, the first
    ``<table>`` on the page is parsed, and every data row is tagged with the
    currency-pair label it came from.  The elapsed wall-clock time is printed
    before returning.

    Returns:
        pandas.DataFrame: One row per table data row.  The column names are
        the ``<th>`` texts of the first header row encountered, followed by
        ``'Currency'``.

    Raises:
        IndexError: If a fetched page contains no ``<table>`` element.
    """
    t1 = time.perf_counter()
    headers = {'Accept-Language': 'en-US,en;q=0.9',
               'Upgrade-Insecure-Requests': '1',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.63',
               # The wildcard media range is "*/*"; a bare "/" is not a valid
               # media range.
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'Cache-Control': 'max-age=0',
               'Connection': 'keep-alive'}
    links = {"USD-IDR":"https://www.investing.com/currencies/usd-idr-historical-data",
             "USD-JPY":"https://www.investing.com/currencies/usd-jpy-historical-data",
             "USD-CNY":"https://www.investing.com/currencies/usd-cny-historical-data"}

    column = []  # header names, captured once from the first header row seen
    output = []  # data rows from every page, each ending with its pair label
    for key, value in links.items():
        page = requests.get(value, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        # NOTE(review): this takes the first <table> on the page; confirm that
        # it is the historical-data table for every currency pair.
        table = soup.select('table')[0]
        for row in table.find_all('tr'):
            cells = [item.text.strip() for item in row.find_all('td')]
            if not cells:
                # A row without <td> cells is a header row.  Skipping it here,
                # for every page, replaces a single `del output[0]` on the
                # shared list, which only removed the first page's header and
                # afterwards deleted a real data row instead.
                if not column:
                    column = [item.text.strip() for item in row.find_all('th')]
                continue
            cells.append(key)
            output.append(cells)
    if column:
        column.append('Currency')
    # Fall back to default column labels when no header row was found.
    df = pd.DataFrame(output, columns=column or None)
    t2 = time.perf_counter()
    print(f'Finished in {t2-t1} seconds')
    return df

But when I converted it to the version below, I got an error. Here is the code after applying multithreading:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
from functools import partial
import psutil

def process_data(key, page):
    """Parse one downloaded page and return header and cell texts of one row.

    Args:
        key: Label appended to the end of the returned cell list.
        page: HTML markup handed to BeautifulSoup.

    Returns:
        A ``(cols, outs)`` tuple.  Both names are rebound on every loop
        iteration and nothing is accumulated, so the tuple holds the values
        from the LAST ``<tr>`` only: ``cols`` is that row's ``<th>`` texts and
        ``outs`` is its ``<td>`` texts followed by ``key``.

    Raises:
        IndexError: If the markup contains no ``<table>`` element.
        UnboundLocalError: If the table has no ``<tr>`` rows, because
            ``cols`` and ``outs`` are then never assigned.
    """
    soup = BeautifulSoup(page, 'html.parser')
    # Use the first <table> found in the document.
    table =soup.select('table')[0]
    # Walk every row, extracting header (<th>) and data (<td>) cell texts.
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('th')
        cols = [item.text.strip() for item in cols]
        
        outs = row.find_all('td')
        outs = [item.text.strip() for item in outs]
        outs.append(key) 
        
    # NOTE(review): only the final row's values survive the loop; earlier
    # rows (including the header row) are overwritten rather than collected.
    return cols, outs


def getCurrencyHistorical(session, pool_executor, item):
    """Fetch one page and have it parsed by the given executor.

    Args:
        session: Object whose ``get(url)`` returns a response exposing a
            ``content`` attribute.
        pool_executor: Executor whose ``submit`` schedules ``process_data``.
        item: ``(key, url)`` pair identifying the page to fetch.

    Returns:
        The result of ``process_data(key, content)``; the call blocks until
        that result is available.
    """
    currency, url = item
    response = session.get(url)
    # Hand the parsing off, then wait here for the parsed result.
    future = pool_executor.submit(process_data, currency, response.content)
    return future.result()

def main():
    """Fetch the three currency pages concurrently and print a DataFrame.

    Downloads are issued from a thread pool (one worker per link); each
    worker submits the parsing of its page to a process pool.  The elapsed
    time and the resulting frame are printed.
    """

    t1 = time.perf_counter()

    links = {"USD-IDR":"https://www.investing.com/currencies/usd-idr-historical-data",
             "USD-JPY":"https://www.investing.com/currencies/usd-jpy-historical-data",
             "USD-CNY":"https://www.investing.com/currencies/usd-cny-historical-data"}


    with requests.Session() as session:
        user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"
        # Replaces the session's whole header mapping with just a User-Agent.
        session.headers = {'User-Agent': user_agent}
        column = []
        output = []
        # Process pool for parsing (sized by psutil.cpu_count(logical=False),
        # presumably the number of physical cores) and a thread pool for the
        # downloads, one thread per link.
        with concurrent.futures.ProcessPoolExecutor(psutil.cpu_count(logical=False)) as pool_executor, \
        concurrent.futures.ThreadPoolExecutor(max_workers=len(links)) as executor:
            for return_value in executor.map(partial(getCurrencyHistorical, session, pool_executor), links.items()):
                cols, outs = return_value
                # Each page contributes exactly one `cols` list and one `outs`
                # list, appended as single elements.
                column.append(cols)
                output.append(outs)
            # NOTE(review): this removes the first page's entry from `output`,
            # not a header row.
            del output[0]
        # NOTE(review): column[0] is the <th> list returned for the first
        # page; if that list is empty, 'Currency' becomes the only column
        # name while each remaining `output` entry holds one item per <td>
        # plus the key, so the DataFrame constructor sees mismatched widths.
        column[0].append('Currency')
        df = pd.DataFrame(output, columns = column[0])

    t2 = time.perf_counter()

    print(f'Finished in {t2-t1} seconds')

    print(df)

# Required for Windows: worker processes of the process pool are started by
# importing this module again, so the entry point must sit behind this guard
# to keep the children from calling main() themselves.
if __name__ == '__main__':
    main()

I get the error `raise ValueError(err) from err` with the message `ValueError: 1 columns passed, passed data had 7 columns`, and it comes from the line `df = pd.DataFrame(output, columns = column[0])`. What is wrong? Thank you.


Solution

  • process_data should do the same work as the loop body of the non-multiprocessing version, except that it handles only one key-value pair. Your version does not do that: it returns only the values from the last row instead of accumulating every row. The main process must then use extend (rather than append) to merge the lists returned by each process_data call.

    Update

    You were not retrieving the data items for key "USD-JPY" because you were not looking at the correct table. You should be looking at the table with id 'curr_table'. I have also updated the multiprocessing pool size per my comment to your question.

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import time
    import concurrent.futures
    from functools import partial
    from os import cpu_count
    
    def process_data(key, page):
        """Extract header and data cells from the 'curr_table' table of a page.

        Args:
            key: Label appended to the end of every data row.
            page: HTML markup handed to BeautifulSoup.

        Returns:
            A ``(column, output)`` tuple.  ``column`` holds, for every
            ``<tr>``, the list of its ``<th>`` texts (so ``column[0]`` is the
            first row's header cells).  ``output`` holds, for every ``<tr>``
            except the first, the list of its ``<td>`` texts followed by
            ``key``.
        """
        document = BeautifulSoup(page, 'html.parser')
        table_rows = document.find('table', {'id': 'curr_table'}).find_all('tr')

        # One list of <th> texts per row.
        column = [
            [cell.text.strip() for cell in tr.find_all('th')]
            for tr in table_rows
        ]
        # One list of <td> texts per row, each tagged with the key.
        output = [
            [cell.text.strip() for cell in tr.find_all('td')] + [key]
            for tr in table_rows
        ]
        # Discard the entry produced by the first row of the table.
        del output[0]

        return column, output
    
    
    def getCurrencyHistorical(session, pool_executor, item):
        """Download one page and return its parsed form.

        Args:
            session: Object whose ``get(url)`` returns a response exposing a
                ``content`` attribute.
            pool_executor: Executor used to run ``process_data``.
            item: ``(key, url)`` pair for the page to download.

        Returns:
            The value produced by ``process_data(key, content)``; this call
            blocks until the executor has finished it.
        """
        label, address = item
        content = session.get(address).content
        # Submit the parsing job and wait for its result in this thread.
        return pool_executor.submit(process_data, label, content).result()
    
    def main():
        """Fetch the three currency pages concurrently and print one DataFrame.

        Downloads run in a thread pool (one worker per link); each worker
        hands the parsing of its page to a process pool.  The per-page header
        and row lists are merged with ``extend``, the first header list gets a
        trailing ``'Currency'`` name, and the elapsed time and the full frame
        are printed.
        """

        t1 = time.perf_counter()

        links = {"USD-IDR":"https://www.investing.com/currencies/usd-idr-historical-data",
                 "USD-JPY":"https://www.investing.com/currencies/usd-jpy-historical-data",
                 "USD-CNY":"https://www.investing.com/currencies/usd-cny-historical-data"}

        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to 1 so min() never compares an int with None.
        n_processes = min(len(links), cpu_count() or 1)

        with requests.Session() as session:
            user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"

            # Replaces the session's whole header mapping with just a User-Agent.
            session.headers = {'User-Agent': user_agent}
            column = []
            output = []
            with concurrent.futures.ProcessPoolExecutor(n_processes) as pool_executor, \
            concurrent.futures.ThreadPoolExecutor(max_workers=len(links)) as executor:
                # Results are consumed in the same order as links.items().
                for return_value in executor.map(partial(getCurrencyHistorical, session, pool_executor), links.items()):
                    cols, outs = return_value
                    # Each page returns lists of rows, so merge with extend.
                    column.extend(cols)
                    output.extend(outs)
            # column[0] is the header list of the first page's first row.
            column[0].append('Currency')
            df = pd.DataFrame(output, columns = column[0])

        t2 = time.perf_counter()

        print(f'Finished in {t2-t1} seconds')

        # Show every row and column instead of a truncated preview.
        pd.set_option("display.max_rows", None, "display.max_columns", None)
        print(df)
    
    # Required for Windows: worker processes of the process pool are started
    # by importing this module again, so the entry point must sit behind this
    # guard to keep the children from calling main() themselves.
    if __name__ == '__main__':
        main()
    

    Prints:

    Finished in 2.1944901 seconds
                Date     Price      Open      High       Low Change % Currency
    0   Aug 26, 2021  14,417.5  14,425.0  14,430.0  14,411.0    0.16%  USD-IDR
    1   Aug 25, 2021  14,395.0  14,405.0  14,421.0  14,387.5    0.03%  USD-IDR
    2   Aug 24, 2021  14,390.0  14,395.0  14,407.5  14,377.5   -0.14%  USD-IDR
    3   Aug 23, 2021  14,410.0  14,435.0  14,438.5  14,404.0   -0.28%  USD-IDR
    4   Aug 20, 2021  14,450.0  14,475.0  14,485.0  14,422.5    0.35%  USD-IDR
    5   Aug 19, 2021  14,400.0  14,405.0  14,425.0  14,392.5    0.21%  USD-IDR
    6   Aug 18, 2021  14,370.0  14,387.5  14,400.0  14,372.5    0.00%  USD-IDR
    7   Aug 16, 2021  14,370.0  14,390.0  14,395.0  14,371.5   -0.10%  USD-IDR
    8   Aug 13, 2021  14,385.0  14,382.5  14,395.0  14,366.0    0.03%  USD-IDR
    9   Aug 12, 2021  14,380.0  14,395.0  14,407.5  14,366.0    0.00%  USD-IDR
    10  Aug 10, 2021  14,380.0  14,375.0  14,402.0  14,375.0    0.14%  USD-IDR
    11  Aug 09, 2021  14,360.0  14,370.0  14,387.5  14,357.5    0.07%  USD-IDR
    12  Aug 06, 2021  14,350.0  14,360.0  14,377.5  14,347.5    0.07%  USD-IDR
    13  Aug 05, 2021  14,340.0  14,330.0  14,360.0  14,321.0    0.21%  USD-IDR
    14  Aug 04, 2021  14,310.0  14,325.0  14,347.5  14,304.5   -0.21%  USD-IDR
    15  Aug 03, 2021  14,340.0  14,375.0  14,388.0  14,338.5   -0.55%  USD-IDR
    16  Aug 02, 2021  14,420.0  14,465.0  14,472.5  14,422.5   -0.28%  USD-IDR
    17  Jul 30, 2021  14,460.0  14,435.0  14,477.5  14,434.5   -0.14%  USD-IDR
    18  Jul 29, 2021  14,480.0  14,490.0  14,502.5  14,482.5   -0.03%  USD-IDR
    19  Jul 28, 2021  14,485.0  14,500.0  14,512.5  14,485.0   -0.03%  USD-IDR
    20  Jul 27, 2021  14,490.0  14,473.5  14,497.5  14,465.0    0.07%  USD-IDR
    21  Jul 26, 2021  14,480.0  14,510.0  14,522.5  14,470.0   -0.07%  USD-IDR
    22  Aug 26, 2021    110.10    109.98    110.23    109.93    0.10%  USD-JPY
    23  Aug 25, 2021    109.99    109.64    110.13    109.61    0.34%  USD-JPY
    24  Aug 24, 2021    109.62    109.69    109.89    109.41   -0.05%  USD-JPY
    25  Aug 23, 2021    109.68    109.81    110.15    109.65   -0.11%  USD-JPY
    26  Aug 20, 2021    109.80    109.75    109.89    109.57    0.07%  USD-JPY
    27  Aug 19, 2021    109.72    109.76    110.23    109.49   -0.02%  USD-JPY
    28  Aug 18, 2021    109.74    109.57    110.07    109.47    0.16%  USD-JPY
    29  Aug 17, 2021    109.57    109.22    109.66    109.12    0.31%  USD-JPY
    30  Aug 16, 2021    109.23    109.71    109.76    109.11   -0.31%  USD-JPY
    31  Aug 13, 2021    109.57    110.39    110.46    109.54   -0.73%  USD-JPY
    32  Aug 12, 2021    110.38    110.42    110.55    110.31   -0.02%  USD-JPY
    33  Aug 11, 2021    110.40    110.58    110.81    110.31   -0.14%  USD-JPY
    34  Aug 10, 2021    110.56    110.29    110.60    110.28    0.25%  USD-JPY
    35  Aug 09, 2021    110.28    110.26    110.36    110.02    0.03%  USD-JPY
    36  Aug 06, 2021    110.25    109.77    110.36    109.69    0.46%  USD-JPY
    37  Aug 05, 2021    109.74    109.49    109.79    109.40    0.25%  USD-JPY
    38  Aug 04, 2021    109.47    109.07    109.68    108.72    0.39%  USD-JPY
    39  Aug 03, 2021    109.04    109.32    109.36    108.88   -0.22%  USD-JPY
    40  Aug 02, 2021    109.28    109.69    109.79    109.18   -0.38%  USD-JPY
    41  Jul 30, 2021    109.70    109.49    109.83    109.36    0.22%  USD-JPY
    42  Jul 29, 2021    109.46    109.91    109.96    109.42   -0.40%  USD-JPY
    43  Jul 28, 2021    109.90    109.75    110.29    109.74    0.13%  USD-JPY
    44  Jul 27, 2021    109.76    110.36    110.41    109.58   -0.53%  USD-JPY
    45  Jul 26, 2021    110.34    110.57    110.59    110.11   -0.18%  USD-JPY
    46  Aug 26, 2021    6.4815    6.4725    6.4866    6.4725    0.09%  USD-CNY
    47  Aug 25, 2021    6.4756    6.4714    6.4811    6.4707    0.07%  USD-CNY
    48  Aug 24, 2021    6.4710    6.4790    6.4851    6.4676   -0.15%  USD-CNY
    49  Aug 23, 2021    6.4805    6.4915    6.4973    6.4788   -0.32%  USD-CNY
    50  Aug 20, 2021    6.5012    6.4960    6.5057    6.4935    0.11%  USD-CNY
    51  Aug 19, 2021    6.4942    6.4847    6.4997    6.4840    0.16%  USD-CNY
    52  Aug 18, 2021    6.4841    6.4861    6.4872    6.4776   -0.02%  USD-CNY
    53  Aug 17, 2021    6.4854    6.4787    6.4889    6.4759    0.17%  USD-CNY
    54  Aug 16, 2021    6.4742    6.4774    6.4810    6.4719   -0.04%  USD-CNY
    55  Aug 13, 2021    6.4768    6.4778    6.4854    6.4749   -0.02%  USD-CNY
    56  Aug 12, 2021    6.4782    6.4767    6.4811    6.4719   -0.00%  USD-CNY
    57  Aug 11, 2021    6.4783    6.4846    6.4894    6.4752   -0.11%  USD-CNY
    58  Aug 10, 2021    6.4852    6.4826    6.4875    6.4774   -0.01%  USD-CNY
    59  Aug 09, 2021    6.4857    6.4835    6.4895    6.4731    0.05%  USD-CNY
    60  Aug 06, 2021    6.4825    6.4660    6.4848    6.4622    0.34%  USD-CNY
    61  Aug 05, 2021    6.4608    6.4671    6.4677    6.4595   -0.07%  USD-CNY
    62  Aug 04, 2021    6.4655    6.4662    6.4673    6.4555   -0.07%  USD-CNY
    63  Aug 03, 2021    6.4700    6.4656    6.4710    6.4604    0.12%  USD-CNY
    64  Aug 02, 2021    6.4620    6.4615    6.4693    6.4580    0.02%  USD-CNY
    65  Jul 30, 2021    6.4609    6.4645    6.4693    6.4506    0.07%  USD-CNY
    66  Jul 29, 2021    6.4562    6.4908    6.4908    6.4544   -0.53%  USD-CNY
    67  Jul 28, 2021    6.4905    6.5095    6.5101    6.4891   -0.31%  USD-CNY
    68  Jul 27, 2021    6.5104    6.4760    6.5132    6.4735    0.43%  USD-CNY
    69  Jul 26, 2021    6.4825    6.4790    6.4875    6.4785    0.03%  USD-CNY