python, multithreading, asynchronous, threadpool, python-multithreading

How to improve web-scraping code speed with multithreading in Python


Below is my code, which writes the data row by row (there are around 900 pages, with 10 rows and 5 fields per row). Currently it takes 80 minutes to export the data to CSV. Is there a way to make parallel requests to the pages and make this code faster and more efficient?

import requests
from urllib3.exceptions import InsecureRequestWarning
import csv

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs

f = csv.writer(open('GEM.csv', 'w', newline=''))
f.writerow(['Bidnumber', 'Items', 'Quantity', 'Department', 'Enddate'])


def scrap_bid_data():
    page_no = 1
    while page_no < 910:
        print('Hold on creating URL to fetch data...')
        url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)
        print('URL created: ' + url)
        scraped_data = requests.get(url, verify=False)
        soup_data = bs(scraped_data.text, 'lxml')
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})
        if len(extracted_data) == 0:
            break
        else:
            for idx in range(len(extracted_data)):
                if (idx % 2 == 1):
                    bid_data = extracted_data.contents[idx].text.strip().split('\n')

                    bidno = bid_data[0].split(":")[-1]
                    items = bid_data[5].split(":")[-1]
                    qnty = int(bid_data[6].split(':')[1].strip())
                    dept = (bid_data[10] + bid_data[12].strip()).split(":")[-1]
                    edate = bid_data[17].split("End Date:")[-1]
                    f.writerow([bidno, items, qnty, dept, edate])

            page_no = page_no + 1


scrap_bid_data()

Solution

  • I've restructured your code a bit to ensure that your CSV file is closed. When I tried running it, I got the following error message:

    ConnectionError: HTTPSConnectionPool(host='bidplus.gem.gov.in', port=443): Max retries exceeded with url: /bidlists?bidlists&page_no=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000012EB0DF1E80>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
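
    If you run into connection errors like this, it can help to add retries with backoff and a request timeout before tuning the thread count. Here's a minimal sketch using urllib3's Retry (the retry count, status codes, and timeout are just illustrative starting points):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    # Retry failed requests up to 3 times with increasing backoff, and also
    # retry on common transient HTTP status codes.
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    # A timeout stops a stalled connection from hanging a worker thread forever.
    resp = session.get('https://bidplus.gem.gov.in/bidlists?bidlists&page_no=1',
                       timeout=10, verify=False)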

    You should experiment with the NUMBER_THREADS value:

    import requests
    from urllib3.exceptions import InsecureRequestWarning
    import csv
    import concurrent.futures
    import functools
    
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    from bs4 import BeautifulSoup as bs
    
    
    def download_page(session, page_no):
        # Fetch one listing page; the shared Session pools and reuses HTTP
        # connections instead of opening a new one for every request.
        url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)
        print('URL created: ' + url)
        resp = session.get(url, verify=False)
        return resp.text
    
    
    def scrap_bid_data():
        NUMBER_THREADS = 30 # number of concurrent download requests
        with open('GEM.csv', 'w', newline='') as out_file:
            f = csv.writer(out_file)
            f.writerow(['Bidnumber', 'Items', 'Quantity', 'Department', 'Enddate'])
            with requests.Session() as session:
                page_downloader = functools.partial(download_page, session)
                with concurrent.futures.ThreadPoolExecutor(max_workers=NUMBER_THREADS) as executor:
                    # executor.map submits every page number to the pool but yields
                    # the results in page order, so the CSV rows stay in sequence.
                    pages = executor.map(page_downloader, range(1, 910))
                    page_no = 0
                    for page in pages:
                        page_no += 1
                        soup_data = bs(page, 'lxml')
                        extracted_data = soup_data.find('div', {'id': 'pagi_content'})
                        if extracted_data is None or len(extracted_data) == 0:
                            # Stop parsing; pages already submitted to the pool will
                            # still finish downloading before the executor shuts down.
                            print('No data at page number', page_no)
                            print(page)
                            break
                        else:
                            for idx in range(len(extracted_data)):
                                if (idx % 2 == 1):
                                    bid_data = extracted_data.contents[idx].text.strip().split('\n')
    
                                    bidno = bid_data[0].split(":")[-1]
                                    items = bid_data[5].split(":")[-1]
                                    qnty = int(bid_data[6].split(':')[1].strip())
                                    dept = (bid_data[10] + bid_data[12].strip()).split(":")[-1]
                                    edate = bid_data[17].split("End Date:")[-1]
                                    f.writerow([bidno, items, qnty, dept, edate])
    scrap_bid_data()
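
    To pick a good NUMBER_THREADS, you can time a small sample of pages at a few candidate thread counts and compare. A rough sketch reusing download_page from the code above (the 50-page sample size and the candidate counts below are just illustrations):

    import time
    import functools
    import concurrent.futures
    import requests

    def time_download(n_threads, n_pages=50):
        # Download a sample of pages with n_threads workers and measure the elapsed time.
        with requests.Session() as session:
            downloader = functools.partial(download_page, session)
            start = time.perf_counter()
            with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
                # list() forces all downloads to complete before we stop the clock.
                list(executor.map(downloader, range(1, n_pages + 1)))
            return time.perf_counter() - start

    for n in (5, 10, 20, 30):
        print(n, 'threads:', round(time_download(n), 1), 'seconds')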