python, web-scraping, beautifulsoup, index-error

Python: IndexError: list index out of range after modifying code


My code is supposed to produce output in the format shown below.

I tried modifying the code and I broke it.

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        # options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')


threadLocal = threading.local()


def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver


class GameData:

    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []


def parse_data(url):
    try:
        browser = create_driver()
        browser.get(url)
        df = pd.read_html(browser.page_source)[0]
    except KeyError:
        print('KeyError')
        return None
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data


# URLs go here
urls = {

    "https://www.oddsportal.com/matches/soccer/20210903/",

}

if __name__ == '__main__':
    results = None
    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 5
    pool = ThreadPool(min(MAX_BROWSERS, len(urls)))
    for game_data in pool.imap(parse_data, urls):
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)

    print(results)
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc

    gc.collect()  # a little extra insurance

print(results.head())

I got this error:

Traceback (most recent call last):
  File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_13.py", line 107, in <module>
    for game_data in pool.imap(parse_data, urls):
  File "C:\Program Files\Python39\lib\multiprocessing\pool.py", line 870, in next
    raise value
  File "C:\Program Files\Python39\lib\multiprocessing\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\harsh\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_13.py", line 72, in parse_data
    league = count[2].text
IndexError: list index out of range

The results are usually in the following format:

     date   time                          game score home_odds draw_odds away_odds   country          league
    0          None  15:30       Wolves - Manchester Utd   0:1   393/100     69/25     39/50   England  Premier League
    1          None  13:00               Burnley - Leeds   1:1   231/100     64/25   123/100   England  Premier League
    2          None  13:00           Tottenham - Watford   1:0     23/50     87/25   709/100   England  Premier League
    3   28 Aug 2021  16:30           Liverpool - Chelsea   1:1     29/20     59/25   207/100   England  Premier League
    4   28 Aug 2021  14:00       Aston Villa - Brentford   1:1   109/100     58/25     74/25   England  Premier League
    5   28 Aug 2021  14:00            Brighton - Everton   0:2     33/25    113/50   239/100   England  Premier League
    6   28 Aug 2021  14:00       Newcastle - Southampton   2:2     73/50   257/100   189/100   England  Premier League

How can I get the data?

Verbose:

I have code that runs through URLs cyclically for the next matches, and I want to modify it. The XPath for the 'next matches' element on oddsportal is //*[@id="col-content"]/div[3]/div/div/span (see this image). I want this code to run through all of these pages and get the concatenated dataframe, roughly as in the sketch below.
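Something like this is what I am aiming for (only a rough sketch on my part; the date loop and the pd.concat usage are guesses, not tested code):

from datetime import date, timedelta
import pandas as pd

# build URLs for the next few days of matches, reusing the
# /matches/soccer/YYYYMMDD/ pattern from the URL above
start = date(2021, 9, 3)
urls = [
    f"https://www.oddsportal.com/matches/soccer/{start + timedelta(days=i):%Y%m%d}/"
    for i in range(7)
]

# parse every page and concatenate the per-page dataframes into one
frames = [pd.DataFrame(gd.__dict__) for gd in map(parse_data, urls) if gd is not None]
results = pd.concat(frames, ignore_index=True)
print(results)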

Please help


Solution

  • The only date I see is at the top of the page with the caption "Next Soccer Matches:".

    I don't see any point in your initial creation of a dataframe with df = pd.read_html(browser.page_source)[0] and your subsequent iteration over that dataframe; you should just iterate directly over the tags of the main table. Done correctly, you will end up with the correct values in the country and league columns.

    I have also changed a few variable names to reflect more closely what type of value they hold, and I have simplified your navigation through the HTML hierarchy a bit: an element with an id attribute must be unique within the document, so you can retrieve it directly by that id and do not have to retrieve its parent first.
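    As a minimal illustration of that id lookup and of pulling the date out of the page heading (the HTML string here is just a made-up stand-in for the real page); the full script follows after it:

    from bs4 import BeautifulSoup as bs
    import re

    html = '<div id="wrap"><div id="col-content"><h1>Next Soccer Matches: 03 Sep 2021</h1></div></div>'
    soup = bs(html, 'lxml')

    # no need to go through the 'wrap' parent first -- an id is unique in the document
    col_content = soup.find('div', {'id': 'col-content'})

    # the date is the trailing "dd Mon yyyy" part of the <h1> caption
    print(re.search(r'\d+ \w+ \d{4}$', soup.find('h1').text)[0])  # 03 Sep 2021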

    import pandas as pd
    from bs4 import BeautifulSoup as bs
    from selenium import webdriver
    import threading
    from multiprocessing.pool import ThreadPool
    import re
    
    class Driver:
        def __init__(self):
            options = webdriver.ChromeOptions()
            options.add_argument("--headless")
            # Suppress logging:
            options.add_experimental_option('excludeSwitches', ['enable-logging'])
            self.driver = webdriver.Chrome(options=options)
    
        def __del__(self):
            self.driver.quit()  # clean up driver when we are cleaned up
            # print('The driver has been "quitted".')
    
    
    threadLocal = threading.local()
    
    
    def create_driver():
        the_driver = getattr(threadLocal, 'the_driver', None)
        if the_driver is None:
            the_driver = Driver()
            setattr(threadLocal, 'the_driver', the_driver)
        return the_driver.driver
    
    
    class GameData:
    
        def __init__(self):
            self.date = []
            self.time = []
            self.game = []
            self.score = []
            self.home_odds = []
            self.draw_odds = []
            self.away_odds = []
            self.country = []
            self.league = []
    
    def generate_matches(table):
        tr_tags = table.findAll('tr')
        for tr_tag in tr_tags:
            if 'class' not in tr_tag.attrs:
                continue
            tr_class = tr_tag['class']
            if 'dark' in tr_class:
                # a "dark" row is a section header carrying the country and league
                # for the match rows that follow it
                th_tag = tr_tag.find('th', {'class': 'first2 tl'})
                a_tags = th_tag.findAll('a')
                country = a_tags[0].text
                league = a_tags[1].text
            elif 'deactivate' in tr_class:
                # a "deactivate" row is an actual match: time, game, score and the odds
                td_tags = tr_tag.findAll('td')
                yield td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text, \
                      td_tags[4].text, td_tags[5].text, country, league
    
    def parse_data(url):
        browser = create_driver()
        browser.get(url)
        soup = bs(browser.page_source, "lxml")
        div = soup.find('div', {'id': 'col-content'})
        table = div.find('table', {'class': 'table-main'})
        h1 = soup.find('h1').text
        # the page date is the trailing "dd Mon yyyy" of the heading,
        # e.g. "Next Soccer Matches: 03 Sep 2021"
        m = re.search(r'\d+ \w+ \d{4}$', h1)
        game_date = m[0]
        game_data = GameData()
        for row in generate_matches(table):
            game_data.date.append(game_date)
            game_data.time.append(row[0])
            game_data.game.append(row[1])
            game_data.score.append(row[2])
            game_data.home_odds.append(row[3])
            game_data.draw_odds.append(row[4])
            game_data.away_odds.append(row[5])
            game_data.country.append(row[6])
            game_data.league.append(row[7])
        return game_data
    
    
    # URLs go here
    urls = {
    
        "https://www.oddsportal.com/matches/soccer/20210903/",
    
    }
    
    if __name__ == '__main__':
        results = None
        # To limit the number of browsers we will use
        # (set to a large number if you don't want a limit):
        MAX_BROWSERS = 5
        pool = ThreadPool(min(MAX_BROWSERS, len(urls)))
        for game_data in pool.imap(parse_data, urls):
            result = pd.DataFrame(game_data.__dict__)
            if results is None:
                results = result
            else:
                results = results.append(result, ignore_index=True)
    
        print(results)
        #print(results.head())
        # ensure all the drivers are "quitted":
        del threadLocal
        import gc
        gc.collect()  # a little extra insurance
    

    Prints:

                date   time                                         game score home_odds draw_odds away_odds     country            league
    0    03 Sep 2021  00:00                      Petrolera - Dep. Pasto    2:3      -128      +245      +334    Colombia     Copa Colombia
    1    03 Sep 2021  00:00                       Jalapa - Export Sebaco   0:2      -137      +266      +307   Nicaragua      Liga Primera
    2    03 Sep 2021  00:00                        Venezuela - Argentina   1:3      +799      +376      -270       World    World Cup 2022
    3    03 Sep 2021  00:05                            Canada - Honduras   1:1      -196      +290      +597       World    World Cup 2022
    4    03 Sep 2021  01:00                               Peru - Uruguay   1:1      +231      +204      +140       World    World Cup 2022
    ..           ...    ...                                          ...   ...       ...       ...       ...         ...               ...
    219  03 Sep 2021  23:00                   Greenville - Toronto FC II   3:0      -147      +263      +363         USA    USL League One
    220  03 Sep 2021  23:30                 Nashville SC - New York City   3:1      +166      +235      +166         USA               MLS
    221  03 Sep 2021  23:30  Philadelphia Union - New England Revolution   0:1      +164      +256      +154         USA               MLS
    222  03 Sep 2021  23:30                   Louisville City - FC Tulsa   0:1      -233      +394      +459         USA  USL Championship
    223  03 Sep 2021  23:30                    Tampa Bay - Oakland Roots   3:0      -227      +320      +573         USA  USL Championship
    
    [224 rows x 9 columns]
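
    A note for newer pandas versions: DataFrame.append was deprecated and has since been removed (pandas 2.0+), so the accumulation loop above can be replaced by a pd.concat version, which also gives you the single concatenated dataframe you asked about. A sketch of just that part, assuming the same pool, parse_data and urls as above:

    frames = [pd.DataFrame(game_data.__dict__) for game_data in pool.imap(parse_data, urls)]
    results = pd.concat(frames, ignore_index=True)
    print(results)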