Search code examples
pythonbeautifulsoupurlliblarge-data

Python, 'IndexError: list index out of range' when parsing large arrays BeautifulSoup


I keep on getting the following error:

Traceback (most recent call last):
  File "C:\Users\User\Documents\Project.py", line 100, in <module>
    parseData(array)
  File "C:\Users\User\Documents\Project.py", line 91, in parseData
    name2 = pageSoup.findAll('div', {'class': 'item-title'})[0].string
IndexError: list index out of range

The array being passed into the function holds a couple thousand URLs. When I tested with a much shorter array, with lengths in the hundreds, it was functional, finishing with no problems. I'm not too sure why it is not functional when a larger array is used as an input.

def parseData(urls):
    # Scrape name/price from each URL and append results to output.txt.
    # NOTE(review): this file handle is never closed — use a `with` block
    # (or call f.close()) so the data is flushed and the handle released.
    f = io.open('output.txt', 'a', encoding='utf-8')
    for url in urls:
        # NOTE(review): no error handling here — a single bad URL
        # (timeout, 404, blocked request) aborts the whole run.
        response = urllib.request.urlopen(url)
        responseContent = response.read()
        pageSoup = BeautifulSoup(responseContent, 'html.parser', from_encoding="utf-8")
        if 'https://example.com' in url:
            # findAll() returns an EMPTY list when the page lacks the
            # expected markup (e.g. an error or rate-limit page), so the
            # [0] index raises IndexError — this is the crash in the
            # question's traceback. Guard with a length check or
            # try/except IndexError.
            name = pageSoup.findAll('h3', {'class': 'tb-main-title'})[0].string
            price = pageSoup.findAll('em', {'class': 'tb-rmb-num'})[0].string
            link = url
            print('Retriving data from ' + str(link) + '...\n' + str(name) + ':' + str(price))
            f.write('\n' + str(link) + '\n' + str(name) + '\n' + str(price) + '\n')

        elif 'https://example2.com' in url:
            # Same IndexError risk as above applies to both lookups here.
            name2 = pageSoup.findAll('div', {'class': 'item-title'})[0].string
            price2 = pageSoup.findAll('span', {'class': 'cur-price'})[0].string
            # NOTE(review): `link` is never assigned in this branch — the
            # first example2.com URL raises NameError (or reuses a stale
            # value from a previous iteration). Assign `link = url` here too.
            print('Retriving data from ' + str(link) + '...\n' + str(name2) + ':' + str(price2))
            f.write('\n' + str(link) + '\n' + str(name2) + '\n' + str(price2) + '\n')

Thank you for taking the time to check this out, any help is much appreciated! :)


Solution

  • This improves on the code above

    import urllib.request
    from bs4 import BeautifulSoup
    from collections import namedtuple
    # Lightweight immutable record for one scraped product listing.
    Data = namedtuple('Data', ['link', 'name', 'price'])
    
    def parseData(url):
        """Fetch *url*, scrape a product name and price, and return a Data tuple.

        Fields that cannot be scraped (unknown host, or the page lacks the
        expected markup) are left as None; callers must check the fields
        before using them.
        """
        link = None
        name = None
        price = None

        # Context manager guarantees the connection is closed, even on error.
        # urlopen() raises on failure rather than returning a falsy object,
        # so no extra `if response:` check is needed.
        with urllib.request.urlopen(url) as response:
            # BeautifulSoup accepts the response object directly; there is
            # no need to call response.read() first.
            pageSoup = BeautifulSoup(response, 'html.parser', from_encoding="utf-8")

            # Each site uses different markup: pick the (tag, class) pairs
            # for this host, or None when the host is unrecognised.
            if 'https://example.com' in url:
                selectors = (('h3', 'tb-main-title'), ('em', 'tb-rmb-num'))
            elif 'https://example2.com' in url:
                selectors = (('div', 'item-title'), ('span', 'cur-price'))
            else:
                selectors = None

            if selectors is not None:
                (name_tag, name_cls), (price_tag, price_cls) = selectors
                try:
                    # findAll() returns an empty list on pages that lack the
                    # expected markup (error/rate-limit pages), making [0]
                    # raise IndexError — swallow it and leave fields as None.
                    name = pageSoup.findAll(name_tag, {'class': name_cls})[0].string
                    price = pageSoup.findAll(price_tag, {'class': price_cls})[0].string
                except IndexError:
                    pass

            link = url
            print('Retriving data from ' + str(link) + '...\n' + str(name) + ':' + str(price))
        return Data(link=link, name=name, price=price)
    
    
    urls = ["https://www.yahoo.com", "https://www.google.com"]


    if __name__ == "__main__":
        for url_ in urls:
            data = parseData(url_)
            # Only record complete results (all three fields scraped).
            if data.link and data.name and data.price:
                with open('output.txt', 'a', encoding='utf-8') as f:
                    # BUG FIX: the original wrote the bare names link/name/price,
                    # which are undefined at module scope (NameError). Use the
                    # namedtuple fields returned by parseData instead.
                    f.write('\n' + str(data.link) + '\n' + str(data.name) + '\n' + str(data.price) + '\n')