python, web-scraping, beautifulsoup, python-requests

How to use Beautiful Soup to scrape odds from Bet Explorer with Python?


I am trying to scrape betting odds and fixture names from the Bet Explorer website at https://www.betexplorer.com. I tried to use the script below to do it, but it says that no table was found. Please help me make it work.

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Define the URL to scrape odds from
URL = "https://www.betexplorer.com/"

# Send a GET request to the URL
response = requests.get(URL)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table with the soccer odds
    table_matches = soup.find('table', attrs={'class': 'table-main js-tablebanner-t js-tablebanner-ntb'})

    if table_matches:
        data = []
        rows = table_matches.find_all('tr')

        for row in rows:
            utils = []
            cols = row.find_all('td')
            for element in cols:
                # Clean up the text and add it to the utils list
                utils.append(element.get_text(strip=True))
            if utils:  # Only append non-empty rows
                data.append(utils)

        # Create a DataFrame with the scraped data
        df = pd.DataFrame(data, columns=["Match", "Result", "1", "X", "2", "Date"])
        print(df)
    else:
        print("No table found on the page.")
else:
    print(f"Failed to retrieve data. HTTP Status code: {response.status_code}")

I expected the code to generate a pandas DataFrame with the match name, the odds for a home win, a draw, and an away win, and the date and time the event is scheduled to take place, because the website uses a table to present the fixture details. Please help me make it work.


Solution

  • Your web page is highly dynamic and is only fully rendered once it has been scrolled to the end.

    Web pages that rely on JavaScript for their content typically do not deliver a fully rendered page on the first request. That's why issuing a plain GET and passing the resulting HTML to BeautifulSoup often won't work: the table you're looking for simply isn't in the response yet.
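
    You can confirm this yourself: fetch the page with requests and count how many <table> elements the static HTML actually contains. A minimal check, assuming the site answers a plain GET at all (some sites also want a browser-like User-Agent header):

    import requests
    from bs4 import BeautifulSoup

    html = requests.get("https://www.betexplorer.com/", timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    # The odds rows are injected by JavaScript after the page loads, so the
    # static HTML contains few (if any) of the tables you can see in the
    # browser's element inspector
    print(len(soup.find_all("table")))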

    The selenium module is helpful for this but even then you'll often have to jump through some hoops to get the result you want.

    This code uses the Chrome driver, which you'll need to install. Selenium supports drivers for other browsers too, so you're not restricted to Chrome.
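
    If you'd rather not hard-code a driver path like the one below, the third-party webdriver-manager package can download a driver that matches your installed Chrome (its ~/.wdm cache is where a path like the one in the code typically comes from). A minimal sketch, assuming you've run pip install webdriver-manager:

    from selenium import webdriver
    from selenium.webdriver import ChromeService, ChromeOptions
    from webdriver_manager.chrome import ChromeDriverManager

    options = ChromeOptions()
    options.add_argument("--headless")

    # install() downloads (or reuses) a chromedriver binary matching your
    # browser version and returns the path to the executable
    service = ChromeService(ChromeDriverManager().install())
    driver = webdriver.Chrome(options=options, service=service)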

    Be patient if you decide to try running this. It's quite slow...

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver import ChromeService, ChromeOptions
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import pandas as pd
    import time
    
    CHROMEDRIVER = "../.wdm/drivers/chromedriver/mac64/124.0.6367.91/chromedriver-mac-arm64/chromedriver"
    URL = "https://www.betexplorer.com"
    options = ChromeOptions()
    options.add_argument("--headless")
    
    # One list per column of the final DataFrame: the two team names plus
    # the home / draw / away (1 / X / 2) odds
    data = {
        "Home": [],
        "Away": [],
        "1": [],
        "X": [],
        "2": []
    }
    
    def scroll(driver):
        # Scroll to the bottom repeatedly until the page height stops
        # growing, i.e. until all lazy-loaded matches are present
        height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(1) # <- traditional hack: give new content time to load
            hnow = driver.execute_script("return document.body.scrollHeight")
            if hnow == height:
                break
            height = hnow
    
    with webdriver.Chrome(options=options, service=ChromeService(CHROMEDRIVER)) as driver:
        driver.get(URL)
        scroll(driver)
        # Each league is rendered as a ul.leagues-list element...
        uls = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "ul.leagues-list")))
        for ul in uls:
            # ...and each match within a league as a ul.table-main__matchInfo
            suls = WebDriverWait(ul, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "ul.table-main__matchInfo")))
            for _sul in suls:
                home = WebDriverWait(_sul, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.table-main__participantHome")))
                away = WebDriverWait(_sul, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.table-main__participantAway")))
                data["Home"].append(home.text.strip())
                data["Away"].append(away.text.strip())
                # The three odds cells appear in home / draw / away order
                odds = WebDriverWait(_sul, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.table-main__odds")))
                for k, o in zip("1X2", odds):
                    data[k].append(float(o.text))
    
    df = pd.DataFrame(data)
    print(df)
    

    Output:

                          Home           Away      1      X      2
    0               St Etienne          Rodez   1.75   3.84   4.29
    1                    Genoa        Bologna   3.32   3.31   2.26
    2                   Girona     Granada CF   1.23   7.07  11.36
    3               Skenderbeu       Vllaznia   2.75   3.22   2.32
    4                     Arba     Mostaganem   3.85   3.25   1.86
    ..                     ...            ...    ...    ...    ...
    275                Da Nang      Dong Thap   1.20   5.66  11.07
    276          TTBD Phu Dong            Hue   1.76   3.25   4.53
    277    A-Leagues All Stars      Newcastle   2.72   4.39   2.05
    278           Blumenthaler  Werder Bremen  45.50  28.00   1.01
    279  A-Leagues All Stars W      Arsenal W   4.45   4.98   1.51
    
    [280 rows x 5 columns]
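
    If you'd rather keep the BeautifulSoup parsing from your original script, a hybrid approach also works: let Selenium render and scroll the page, then hand driver.page_source to BeautifulSoup. A rough sketch using the same CSS classes the code above waits for (run it inside the with block, after scroll(driver)):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(driver.page_source, "html.parser")

    rows = []
    for match in soup.select("ul.table-main__matchInfo"):
        home = match.select_one("div.table-main__participantHome")
        away = match.select_one("div.table-main__participantAway")
        odds = [o.get_text(strip=True) for o in match.select("div.table-main__odds")]
        # Skip rows where a participant or any of the three odds is missing
        if home and away and len(odds) == 3:
            rows.append([home.get_text(strip=True), away.get_text(strip=True), *odds])

    df = pd.DataFrame(rows, columns=["Home", "Away", "1", "X", "2"])

    Parsing a static snapshot this way is usually quicker than issuing one WebDriverWait per element, since BeautifulSoup works on HTML that has already been downloaded.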