python beautifulsoup python-requests python-requests-html

Python script not returning any results while web scraping

I am looking to scrape a list of URLs -- I want to visit each one & then return all IMG links contained within each HREF on the page (in essence, visit each link and return the image address of the player headshot on each player profile).

I have a successful script for one set of URLs below - this is what I'm trying to achieve:

import requests
from bs4 import BeautifulSoup

import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)
# AddValue = ["Test", 25, "Test2"]
# worksheet.insert_row(AddValue, 3)


def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")

        div_profile_box = soup_player.find('div', {'class': 'profile-box'})

        img_tag = div_profile_box.find('img')
        image_url = img_tag['src']

        row = {"Name": name, "URL": player_url, "Image URL": image_url}

        data.append(row)

    return data


urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
    'https://basketball.realgm.com/dleague/players/2019',
    'https://basketball.realgm.com/dleague/players/2018',
]


res = []
for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    res = [*res, *data]

if res != []:
    header = list(res[0].keys())
    values = [
        header, *[[e[k] if e.get(k) else "" for k in header] for e in res]]
    worksheet.append_rows(values, value_input_option="USER_ENTERED")

This returns an output of: Player Name, Player URL, Player Headshot:

I tweaked the code to tweak for a different set of URLs, but it's not returning any information. No errors are showing, but nothing seems to be happening:

import requests
from bs4 import BeautifulSoup

import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)


def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")

        div_profile_box = soup_player.find('div', {'class': 'profile-box'})

        img_tags = div_profile_box.find_all('img')
        for i, img_tag in enumerate(img_tags):
            image_url = img_tag['src']
            row = {"Name": name, "URL": player_url,
                   f"Image URL {i}": image_url}
            data.append(row)

    return data


urls = [
    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc",
    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2",
    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3"
]

for url in urls:
    data = get_links(url)
    for row in data:
        worksheet.insert_row(list(row.values()))

I also checked a version debugging "soup_player", but I'm still not receiving any results:

import requests
from bs4 import BeautifulSoup

import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)


def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        print(f"soup_player for {name}: {soup_player}")

        div_profile_box = soup_player.find('div', {'class': 'profile-box'})

        img_tags = div_profile_box.find_all('img')
        for i, img_tag in enumerate(img_tags):
            image_url = img_tag['src']
            row = {"Name": name, "URL": player_url, f"Image URL {i}": image_url}
            data.append(row)

    return data


urls = [    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc",    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2",    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3"]

for url in urls:
    data = get_links(url)
    for row in data:
        worksheet.insert_row(list(row.values()))

Any advice as to what I may be doing wrong here? Thank you in advance!

Solution

Always and first of all, take a look at your soup to see if all the expected ingredients are in place.

Main issue is that your selection of initial table elements will not find any, so ResultSet is empty.

Change:

for td in soup.find_all('td', {'data-th': 'Player'}):

to:

for td in soup.select('td:has(>a[href^="/player"])'):