I'm looking to visit each URL and return every player image found within the HREF tags, meaning - visit URL, click each player, store profile image link.
I had the right result printing with the code below, but it was pushing the data one by one & ultimately hitting a 429 G Spread quota issue.
My full code is here:
import requests
from bs4 import BeautifulSoup
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)
# def get_links(url):
# data = []
# req_url = requests.get(url)
# soup = BeautifulSoup(req_url.content, "html.parser")
# for td in soup.select('td:has(>a[href^="/player"])'):
# a_tag = td.a
# name = a_tag.text
# player_url = a_tag['href']
# print(f"Getting {name}")
# req_player_url = requests.get(
# f"https://basketball.realgm.com{player_url}")
# soup_player = BeautifulSoup(req_player_url.content, "html.parser")
# print(f"soup_player for {name}: {soup_player}")
# div_profile_box = soup_player.find('div', {'class': 'profile-box'})
# img_tags = div_profile_box.find_all('img')
# for i, img_tag in enumerate(img_tags):
# image_url = img_tag['src']
# row = {"Name": name, "URL": player_url,
# f"Image URL {i}": image_url}
# data.append(row)
# return data
def get_links2(url):
data = []
req_url = requests.get(url)
soup = BeautifulSoup(req_url.content, "html.parser")
for td in soup.select('td.nowrap'):
a_tag = td.a
if a_tag:
name = a_tag.text
player_url = a_tag['href']
pos = td.find_next_sibling('td').text
print(f"Getting {name}")
req_player_url = requests.get(
f"https://basketball.realgm.com{player_url}")
soup_player = BeautifulSoup(req_player_url.content, "html.parser")
div_profile_box = soup_player.find("div", class_="profile-box")
row = {"Name": name, "URL": player_url, "pos_option1": pos}
row['pos_option2'] = div_profile_box.h2.span.text if div_profile_box.h2.span else None
for p in div_profile_box.find_all("p"):
try:
key, value = p.get_text(strip=True).split(':', 1)
row[key.strip()] = value.strip()
except: # not all entries have values
pass
# Add img tags to row dictionary
img_tags = div_profile_box.find_all('img')
for i, img in enumerate(img_tags):
row[f'img_{i+1}'] = img['src']
data.append(row)
return data
urls = ["https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc",
"https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2", "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3",
"https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/4"]
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/5",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/6",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/7",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/8",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/9",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/10",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/11",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/12",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/13",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/14",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/15",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/16",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/17",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/18",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/19",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/20",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/21",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/22",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/23",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/24",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/25",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/26",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/27",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/28",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/29",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/30",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/31",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/32",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/33",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/34",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/35",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/36",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/37",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/38",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/39",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/40",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/41",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/42",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/43",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/44",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/45",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/46",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/47",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/48",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/49",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/50",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/51",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/52",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/53",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/54",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/55",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/56",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/57",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/58",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/59",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/60",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/61",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/62",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/63",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/64",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/65",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/66",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/67",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/68",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/69",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/70",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/71",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/72",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/73",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/74",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/75",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/76",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/77",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/78",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/79",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/80",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/81",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/82",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/83",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/84",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/85",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/86",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/87",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/88",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/89",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/90",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/91",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/92",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/93",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/94",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/95",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/96"]
for url in urls:
data = get_links2(url)
for row in data:
worksheet.insert_row(list(row.values()))
I tried to switch to "append_rows" instead of "insert_row" in my final statement. This created a very confusing error:
Traceback (most recent call last):
File "c:\Users\AMadle\GLeague Tracking\(A) INTLimgScrape.py", line 175, in <module>
worksheet.append_rows(list(row.values()))
File "C:\Python\python3.10.5\lib\site-packages\gspread\worksheet.py", line 1338, in append_rows
return self.spreadsheet.values_append(range_label, params, body)
File "C:\Python\python3.10.5\lib\site-packages\gspread\spreadsheet.py", line 149, in values_append
r = self.client.request("post", url, params=params, json=body)
File "C:\Python\python3.10.5\lib\site-packages\gspread\client.py", line 86, in request
raise APIError(response)
gspread.exceptions.APIError: {'code': 400, 'message': 'Invalid value at \'data.values[0]\' (type.googleapis.com/google.protobuf.ListValue), "Jaroslaw Zyskowski"\nInvalid value at \'data.values[1]\' (type.googleapis.com/google.protobuf.ListValue), "/player/Jaroslaw-Zyskowski/Summary/32427"\nInvalid value at \'data.values[2]\' (type.googleapis.com/google.protobuf.ListValue), "TRE"\nInvalid value at \'data.values[3]\' (type.googleapis.com/google.protobuf.ListValue), "SF"\nInvalid value at \'data.values[4]\' (type.googleapis.com/google.protobuf.ListValue), "Trefl Sopot"\nInvalid value at \'data.values[5]\' (type.googleapis.com/google.protobuf.ListValue), "Jul 16, 1992(30 years old)"\nInvalid value at \'data.values[6]\' (type.googleapis.com/google.protobuf.ListValue), "Wroclaw, Poland"\nInvalid value at \'data.values[7]\' (type.googleapis.com/google.protobuf.ListValue), "Poland"\nInvalid value at \'data.values[8]\' (type.googleapis.com/google.protobuf.ListValue), "6-7 (201cm)Weight:220 (100kg)"\nInvalid value at \'data.values[9]\' (type.googleapis.com/google.protobuf.ListValue), "Unrestricted Free Agent"\nInvalid value at \'data.values[10]\' (type.googleapis.com/google.protobuf.ListValue), "Manuel Capicchioni"\nInvalid value at \'data.values[11]\' (type.googleapis.com/google.protobuf.ListValue), "2014 NBA Draft"\nInvalid value at \'data.values[12]\' (type.googleapis.com/google.protobuf.ListValue), "Undrafted"\nInvalid value at \'data.values[13]\' (type.googleapis.com/google.protobuf.ListValue), "Kotwica Kolobrzeg (Poland)"\nInvalid value at \'data.values[14]\' (type.googleapis.com/google.protobuf.ListValue), "/images/nba/4.2/profiles/photos/2006/player_photo.jpg"\nInvalid value at \'data.values[15]\' (type.googleapis.com/google.protobuf.ListValue), "/images/basketball/5.0/team_logos/international/polish/trefl.png"', 'status': 'INVALID_ARGUMENT', 'details': [{'@type': 'type.googleapis.com/google.rpc.BadRequest', 'fieldViolations': [{'field': 'data.values[0]', 'description': 'Invalid value at \'data.values[0]\' (type.googleapis.com/google.protobuf.ListValue), "Jaroslaw Zyskowski"'}, {'field': 'data.values[1]', 'description': 'Invalid value at \'data.values[1]\' (type.googleapis.com/google.protobuf.ListValue), "/player/Jaroslaw-Zyskowski/Summary/32427"'}, {'field': 'data.values[2]', 'description': 'Invalid value at \'data.values[2]\' (type.googleapis.com/google.protobuf.ListValue), "TRE"'}, {'field': 'data.values[3]', 'description': 'Invalid value at \'data.values[3]\' (type.googleapis.com/google.protobuf.ListValue), "SF"'}, {'field': 'data.values[4]', 'description': 'Invalid value at \'data.values[4]\' (type.googleapis.com/google.protobuf.ListValue), "Trefl Sopot"'}, {'field': 'data.values[5]', 'description': 'Invalid value at \'data.values[5]\' (type.googleapis.com/google.protobuf.ListValue), "Jul 16, 1992(30 years old)"'}, {'field': 'data.values[6]', 'description': 'Invalid value at \'data.values[6]\' (type.googleapis.com/google.protobuf.ListValue), "Wroclaw, Poland"'}, {'field': 'data.values[7]', 'description': 'Invalid value at \'data.values[7]\' (type.googleapis.com/google.protobuf.ListValue), "Poland"'}, {'field': 'data.values[8]', 'description': 'Invalid value at \'data.values[8]\' (type.googleapis.com/google.protobuf.ListValue), "6-7 (201cm)Weight:220 (100kg)"'}, {'field': 'data.values[9]', 'description': 'Invalid value at \'data.values[9]\' (type.googleapis.com/google.protobuf.ListValue), "Unrestricted Free Agent"'}, {'field': 'data.values[10]', 'description': 'Invalid value at \'data.values[10]\' (type.googleapis.com/google.protobuf.ListValue), "Manuel Capicchioni"'}, {'field': 'data.values[11]', 'description': 'Invalid value at \'data.values[11]\' (type.googleapis.com/google.protobuf.ListValue), "2014 NBA Draft"'}, {'field': 'data.values[12]', 'description': 'Invalid value at \'data.values[12]\' (type.googleapis.com/google.protobuf.ListValue), "Undrafted"'}, {'field': 'data.values[13]', 'description': 'Invalid value at \'data.values[13]\' (type.googleapis.com/google.protobuf.ListValue), "Kotwica Kolobrzeg (Poland)"'}, {'field': 'data.values[14]', 'description': 'Invalid value at \'data.values[14]\' (type.googleapis.com/google.protobuf.ListValue), "/images/nba/4.2/profiles/photos/2006/player_photo.jpg"'}, {'field': 'data.values[15]', 'description': 'Invalid value at \'data.values[15]\'
(type.googleapis.com/google.protobuf.ListValue), "/images/basketball/5.0/team_logos/international/polish/trefl.png"'}]}]}
PS C:\Users\AMadle\GLeague Tracking>
Any thoughts as to how I could push this output to my Google Sheet in one move, rather than inserting rows each time?
In your showing script, worksheet.insert_row(list(row.values()))
is used in a loop. I thought that this might be the reason for your current issue. In this case, how about the following modification?
for url in urls:
data = get_links2(url)
for row in data:
worksheet.insert_row(list(row.values()))
values = []
for url in urls:
values = [*values, *get_links2(url)]
if values != []:
header = list(values[0].keys())
values = [header, *[[e[k] if e.get(k) else "" for k in header] for e in values]]
worksheet.append_rows(values, value_input_option="USER_ENTERED")
By this modification, after all values were retrieved in for url in urls:
, the values are put into the Spreadsheet. This flow can be achieved by one API call. I thought that by this, your current issue might be able to be removed.
If you don't want to include the header row, please modify [header, *[[e[k] if e.get(k) else "" for k in header] for e in values]]
to [[e[k] if e.get(k) else "" for k in header] for e in values]
.
If you want to put the value of every URL, how about the following modification? But, in this case, the Sheets API for the number of URLs is used.
From
for url in urls:
data = get_links2(url)
for row in data:
worksheet.insert_row(list(row.values()))
To
header = None
for url in urls:
values = get_links2(url)
if values != []:
if not header:
header = list(values[0].keys())
values = [[e[k] if e.get(k) else "" for k in header] for e in values]
worksheet.append_rows(values, value_input_option="USER_ENTERED")