I have written Python code that scrapes information from a website, and I tried to apply multithreading to it. Here is my code before applying multithreading; it runs perfectly on my PC:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import investpy
def getCurrencyHistorical():
    """Scrape the historical-data table of each currency pair into one DataFrame.

    Each investing.com page listed in ``links`` is downloaded sequentially,
    the first ``<table>`` on the page is parsed, and every data row is
    tagged with the name of the currency pair it came from.

    Returns:
        pandas.DataFrame: one row per table data row. The columns are the
        header cells of the first header row found, followed by a
        ``'Currency'`` column.

    Raises:
        IndexError: if a downloaded page contains no ``<table>`` element.
    """
    # Imported here because this snippet's import block does not import
    # ``time`` even though the timing code below needs it.
    import time

    t1 = time.perf_counter()
    headers = {
        'Accept-Language': 'en-US,en;q=0.9',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.63',
        # The wildcard media range is "*/*"; a bare "/" is not a valid value.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
    }
    links = {
        "USD-IDR": "https://www.investing.com/currencies/usd-idr-historical-data",
        "USD-JPY": "https://www.investing.com/currencies/usd-jpy-historical-data",
        "USD-CNY": "https://www.investing.com/currencies/usd-cny-historical-data",
    }
    column = []
    output = []
    for key, value in links.items():
        page = requests.get(value, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.select('table')[0]
        for row in table.find_all('tr'):
            header_cells = [item.text.strip() for item in row.find_all('th')]
            # Every table repeats the header; keep only the first one seen.
            if header_cells and not column:
                column = header_cells
            cells = [item.text.strip() for item in row.find_all('td')]
            if not cells:
                # Header rows have no <td> cells. Skipping them per table
                # (rather than deleting output[0] by position) keeps the
                # header rows of later tables out of the data.
                continue
            cells.append(key)
            output.append(cells)
    column.append('Currency')
    df = pd.DataFrame(output, columns=column)
    t2 = time.perf_counter()
    print(f'Finished in {t2-t1} seconds')
    return df
But when I convert it to the version below, I get an error. Here's the code after applying multithreading:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
from functools import partial
import psutil
def process_data(key, page):
    """Parse *page* and return the cells of the last row of its first table.

    Args:
        key: label appended to the end of the returned data cells.
        page: HTML markup handed to BeautifulSoup.

    Returns:
        tuple: ``(cols, outs)`` where ``cols`` holds the stripped ``<th>``
        texts and ``outs`` the stripped ``<td>`` texts followed by ``key``.

    NOTE: both values are overwritten on every iteration, so only the final
    ``<tr>`` of the table reaches the caller. This is the behaviour of the
    code as posted in the question and is kept unchanged here.
    """
    first_table = BeautifulSoup(page, 'html.parser').select('table')[0]
    for tr in first_table.find_all('tr'):
        cols = [th.text.strip() for th in tr.find_all('th')]
        outs = [td.text.strip() for td in tr.find_all('td')]
        outs.append(key)
    return cols, outs
def getCurrencyHistorical(session, pool_executor, item):
    """Download one page and have the process pool parse it.

    Args:
        session: object whose ``get(url)`` returns a response exposing
            ``content``.
        pool_executor: executor whose ``submit`` schedules ``process_data``.
        item: ``(key, url)`` pair.

    Returns:
        Whatever ``process_data(key, content)`` returns.
    """
    currency, url = item
    response = session.get(url)
    future = pool_executor.submit(process_data, currency, response.content)
    # Block the calling thread until the parsing job has finished.
    return future.result()
def main():
    """Fetch the currency pages on a thread pool and print the combined frame.

    Downloads run on a ``ThreadPoolExecutor``; each download hands its HTML
    to a ``ProcessPoolExecutor`` for parsing via ``getCurrencyHistorical``.
    One ``(cols, outs)`` pair is collected per link, the first collected
    data entry is dropped, and a DataFrame is built from what remains.

    NOTE: this is the code as posted in the question; the DataFrame
    construction below is the line the reported ValueError comes from.
    """
    t1 = time.perf_counter()
    links = {
        "USD-IDR": "https://www.investing.com/currencies/usd-idr-historical-data",
        "USD-JPY": "https://www.investing.com/currencies/usd-jpy-historical-data",
        "USD-CNY": "https://www.investing.com/currencies/usd-cny-historical-data",
    }
    with requests.Session() as session:
        user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"
        session.headers = {'User-Agent': user_agent}
        column = []
        output = []
        # Process pool is entered first so it is shut down last.
        with concurrent.futures.ProcessPoolExecutor(psutil.cpu_count(logical=False)) as pool_executor:
            with concurrent.futures.ThreadPoolExecutor(max_workers=len(links)) as executor:
                fetch_and_parse = partial(getCurrencyHistorical, session, pool_executor)
                for cols, outs in executor.map(fetch_and_parse, links.items()):
                    column.append(cols)
                    output.append(outs)
    del output[0]
    column[0].append('Currency')
    df = pd.DataFrame(output, columns=column[0])
    t2 = time.perf_counter()
    print(f'Finished in {t2-t1} seconds')
    print(df)


# Required for Windows:
if __name__ == '__main__':
    main()
I get the error `raise ValueError(err) from err`, with the message `ValueError: 1 columns passed, passed data had 7 columns`, and it comes from the line `df = pd.DataFrame(output, columns = column[0])`. What is wrong? Thank you.
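`process_data` should be just like the non-multiprocessing case, except that it processes only one key-value pair, but that's not what you have done: your version overwrites `cols` and `outs` on every row and returns only the values of the last row instead of accumulating all rows in lists. Once `process_data` returns lists, the main process must do `extend` operations (rather than `append`) on the lists returned by `process_data`.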
Update
You were not retrieving the data items for key "USD-JPY" because you were not looking at the correct table: `soup.select('table')[0]` simply takes the first table on the page. You should be looking at the table with id 'curr_table'. I have also updated the multiprocessing pool size per my comment to your question.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
from functools import partial
from os import cpu_count
def process_data(key, page):
    """Extract the header and data rows of the ``curr_table`` table in *page*.

    Args:
        key: label appended as the last cell of every data row.
        page: HTML markup handed to BeautifulSoup.

    Returns:
        tuple: ``(column, output)``. ``column`` has one list of stripped
        ``<th>`` texts per table row (empty for rows without header cells).
        ``output`` has one list per row that contains ``<td>`` cells: the
        stripped cell texts followed by ``key``.

    Raises:
        ValueError: if the page has no table with id ``curr_table``.
    """
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find('table', {'id': 'curr_table'})
    if table is None:
        # find() yields None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError(f"no table with id 'curr_table' found for {key!r}")
    column = []
    output = []
    for row in table.find_all('tr'):
        column.append([item.text.strip() for item in row.find_all('th')])
        outs = [item.text.strip() for item in row.find_all('td')]
        if not outs:
            # Rows without <td> cells are header rows. Skipping them replaces
            # the positional ``del output[0]`` and is safe on an empty table.
            continue
        outs.append(key)
        output.append(outs)
    return column, output
def getCurrencyHistorical(session, pool_executor, item, timeout=30):
    """Download one page and have the process pool parse it.

    Args:
        session: ``requests.Session``-like object used for the download.
        pool_executor: executor whose ``submit`` schedules ``process_data``.
        item: ``(key, url)`` pair.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the worker thread forever.

    Returns:
        Whatever ``process_data(key, content)`` returns.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    key, value = item
    page = session.get(value, timeout=timeout)
    # Stop here on an error page rather than handing it to the parser.
    page.raise_for_status()
    f = pool_executor.submit(process_data, key, page.content)
    # Block the calling thread until the parsing job has finished.
    return f.result()
def main():
    """Fetch the currency pages on a thread pool and print the combined frame.

    Downloads run on a ``ThreadPoolExecutor``; each download hands its HTML
    to a ``ProcessPoolExecutor`` for parsing via ``getCurrencyHistorical``.
    The per-page header and row lists are concatenated, a ``'Currency'``
    column name is added to the first header, and the resulting DataFrame
    is printed in full together with the elapsed time.
    """
    t1 = time.perf_counter()
    links = {
        "USD-IDR": "https://www.investing.com/currencies/usd-idr-historical-data",
        "USD-JPY": "https://www.investing.com/currencies/usd-jpy-historical-data",
        "USD-CNY": "https://www.investing.com/currencies/usd-cny-historical-data",
    }
    with requests.Session() as session:
        user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"
        session.headers = {'User-Agent': user_agent}
        column = []
        output = []
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to a single process instead of failing inside min().
        n_processes = min(len(links), cpu_count() or 1)
        with concurrent.futures.ProcessPoolExecutor(n_processes) as pool_executor, \
                concurrent.futures.ThreadPoolExecutor(max_workers=len(links)) as executor:
            fetch_and_parse = partial(getCurrencyHistorical, session, pool_executor)
            # executor.map yields results in the order of ``links``.
            for cols, outs in executor.map(fetch_and_parse, links.items()):
                column.extend(cols)
                output.extend(outs)
    # The first collected header row supplies the column names.
    column[0].append('Currency')
    df = pd.DataFrame(output, columns=column[0])
    t2 = time.perf_counter()
    print(f'Finished in {t2-t1} seconds')
    pd.set_option("display.max_rows", None, "display.max_columns", None)
    print(df)


# Required for Windows:
if __name__ == '__main__':
    main()
Prints:
Finished in 2.1944901 seconds
Date Price Open High Low Change % Currency
0 Aug 26, 2021 14,417.5 14,425.0 14,430.0 14,411.0 0.16% USD-IDR
1 Aug 25, 2021 14,395.0 14,405.0 14,421.0 14,387.5 0.03% USD-IDR
2 Aug 24, 2021 14,390.0 14,395.0 14,407.5 14,377.5 -0.14% USD-IDR
3 Aug 23, 2021 14,410.0 14,435.0 14,438.5 14,404.0 -0.28% USD-IDR
4 Aug 20, 2021 14,450.0 14,475.0 14,485.0 14,422.5 0.35% USD-IDR
5 Aug 19, 2021 14,400.0 14,405.0 14,425.0 14,392.5 0.21% USD-IDR
6 Aug 18, 2021 14,370.0 14,387.5 14,400.0 14,372.5 0.00% USD-IDR
7 Aug 16, 2021 14,370.0 14,390.0 14,395.0 14,371.5 -0.10% USD-IDR
8 Aug 13, 2021 14,385.0 14,382.5 14,395.0 14,366.0 0.03% USD-IDR
9 Aug 12, 2021 14,380.0 14,395.0 14,407.5 14,366.0 0.00% USD-IDR
10 Aug 10, 2021 14,380.0 14,375.0 14,402.0 14,375.0 0.14% USD-IDR
11 Aug 09, 2021 14,360.0 14,370.0 14,387.5 14,357.5 0.07% USD-IDR
12 Aug 06, 2021 14,350.0 14,360.0 14,377.5 14,347.5 0.07% USD-IDR
13 Aug 05, 2021 14,340.0 14,330.0 14,360.0 14,321.0 0.21% USD-IDR
14 Aug 04, 2021 14,310.0 14,325.0 14,347.5 14,304.5 -0.21% USD-IDR
15 Aug 03, 2021 14,340.0 14,375.0 14,388.0 14,338.5 -0.55% USD-IDR
16 Aug 02, 2021 14,420.0 14,465.0 14,472.5 14,422.5 -0.28% USD-IDR
17 Jul 30, 2021 14,460.0 14,435.0 14,477.5 14,434.5 -0.14% USD-IDR
18 Jul 29, 2021 14,480.0 14,490.0 14,502.5 14,482.5 -0.03% USD-IDR
19 Jul 28, 2021 14,485.0 14,500.0 14,512.5 14,485.0 -0.03% USD-IDR
20 Jul 27, 2021 14,490.0 14,473.5 14,497.5 14,465.0 0.07% USD-IDR
21 Jul 26, 2021 14,480.0 14,510.0 14,522.5 14,470.0 -0.07% USD-IDR
22 Aug 26, 2021 110.10 109.98 110.23 109.93 0.10% USD-JPY
23 Aug 25, 2021 109.99 109.64 110.13 109.61 0.34% USD-JPY
24 Aug 24, 2021 109.62 109.69 109.89 109.41 -0.05% USD-JPY
25 Aug 23, 2021 109.68 109.81 110.15 109.65 -0.11% USD-JPY
26 Aug 20, 2021 109.80 109.75 109.89 109.57 0.07% USD-JPY
27 Aug 19, 2021 109.72 109.76 110.23 109.49 -0.02% USD-JPY
28 Aug 18, 2021 109.74 109.57 110.07 109.47 0.16% USD-JPY
29 Aug 17, 2021 109.57 109.22 109.66 109.12 0.31% USD-JPY
30 Aug 16, 2021 109.23 109.71 109.76 109.11 -0.31% USD-JPY
31 Aug 13, 2021 109.57 110.39 110.46 109.54 -0.73% USD-JPY
32 Aug 12, 2021 110.38 110.42 110.55 110.31 -0.02% USD-JPY
33 Aug 11, 2021 110.40 110.58 110.81 110.31 -0.14% USD-JPY
34 Aug 10, 2021 110.56 110.29 110.60 110.28 0.25% USD-JPY
35 Aug 09, 2021 110.28 110.26 110.36 110.02 0.03% USD-JPY
36 Aug 06, 2021 110.25 109.77 110.36 109.69 0.46% USD-JPY
37 Aug 05, 2021 109.74 109.49 109.79 109.40 0.25% USD-JPY
38 Aug 04, 2021 109.47 109.07 109.68 108.72 0.39% USD-JPY
39 Aug 03, 2021 109.04 109.32 109.36 108.88 -0.22% USD-JPY
40 Aug 02, 2021 109.28 109.69 109.79 109.18 -0.38% USD-JPY
41 Jul 30, 2021 109.70 109.49 109.83 109.36 0.22% USD-JPY
42 Jul 29, 2021 109.46 109.91 109.96 109.42 -0.40% USD-JPY
43 Jul 28, 2021 109.90 109.75 110.29 109.74 0.13% USD-JPY
44 Jul 27, 2021 109.76 110.36 110.41 109.58 -0.53% USD-JPY
45 Jul 26, 2021 110.34 110.57 110.59 110.11 -0.18% USD-JPY
46 Aug 26, 2021 6.4815 6.4725 6.4866 6.4725 0.09% USD-CNY
47 Aug 25, 2021 6.4756 6.4714 6.4811 6.4707 0.07% USD-CNY
48 Aug 24, 2021 6.4710 6.4790 6.4851 6.4676 -0.15% USD-CNY
49 Aug 23, 2021 6.4805 6.4915 6.4973 6.4788 -0.32% USD-CNY
50 Aug 20, 2021 6.5012 6.4960 6.5057 6.4935 0.11% USD-CNY
51 Aug 19, 2021 6.4942 6.4847 6.4997 6.4840 0.16% USD-CNY
52 Aug 18, 2021 6.4841 6.4861 6.4872 6.4776 -0.02% USD-CNY
53 Aug 17, 2021 6.4854 6.4787 6.4889 6.4759 0.17% USD-CNY
54 Aug 16, 2021 6.4742 6.4774 6.4810 6.4719 -0.04% USD-CNY
55 Aug 13, 2021 6.4768 6.4778 6.4854 6.4749 -0.02% USD-CNY
56 Aug 12, 2021 6.4782 6.4767 6.4811 6.4719 -0.00% USD-CNY
57 Aug 11, 2021 6.4783 6.4846 6.4894 6.4752 -0.11% USD-CNY
58 Aug 10, 2021 6.4852 6.4826 6.4875 6.4774 -0.01% USD-CNY
59 Aug 09, 2021 6.4857 6.4835 6.4895 6.4731 0.05% USD-CNY
60 Aug 06, 2021 6.4825 6.4660 6.4848 6.4622 0.34% USD-CNY
61 Aug 05, 2021 6.4608 6.4671 6.4677 6.4595 -0.07% USD-CNY
62 Aug 04, 2021 6.4655 6.4662 6.4673 6.4555 -0.07% USD-CNY
63 Aug 03, 2021 6.4700 6.4656 6.4710 6.4604 0.12% USD-CNY
64 Aug 02, 2021 6.4620 6.4615 6.4693 6.4580 0.02% USD-CNY
65 Jul 30, 2021 6.4609 6.4645 6.4693 6.4506 0.07% USD-CNY
66 Jul 29, 2021 6.4562 6.4908 6.4908 6.4544 -0.53% USD-CNY
67 Jul 28, 2021 6.4905 6.5095 6.5101 6.4891 -0.31% USD-CNY
68 Jul 27, 2021 6.5104 6.4760 6.5132 6.4735 0.43% USD-CNY
69 Jul 26, 2021 6.4825 6.4790 6.4875 6.4785 0.03% USD-CNY