Search code examples
pythongoogle-sheetsbeautifulsoupgoogle-sheets-apigspread

Append Rows issue while scraping URLs via Python loop


I'm looking to visit each URL and return every player image found within the HREF tags, meaning - visit URL, click each player, store profile image link.

I had the right result printing with the code below, but it was pushing the data one by one & ultimately hitting a 429 G Spread quota issue.

My full code is here:

import requests
from bs4 import BeautifulSoup

import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)


# def get_links(url):
#     data = []
#     req_url = requests.get(url)
#     soup = BeautifulSoup(req_url.content, "html.parser")

#     for td in soup.select('td:has(>a[href^="/player"])'):
#         a_tag = td.a
#         name = a_tag.text
#         player_url = a_tag['href']
#         print(f"Getting {name}")

#         req_player_url = requests.get(
#             f"https://basketball.realgm.com{player_url}")
#         soup_player = BeautifulSoup(req_player_url.content, "html.parser")
#         print(f"soup_player for {name}: {soup_player}")

#         div_profile_box = soup_player.find('div', {'class': 'profile-box'})

#         img_tags = div_profile_box.find_all('img')
#         for i, img_tag in enumerate(img_tags):
#             image_url = img_tag['src']
#             row = {"Name": name, "URL": player_url,
#                    f"Image URL {i}": image_url}
#             data.append(row)

#     return data


def get_links2(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.select('td.nowrap'):
        a_tag = td.a
        if a_tag:
            name = a_tag.text
            player_url = a_tag['href']
            pos = td.find_next_sibling('td').text
            print(f"Getting {name}")

            req_player_url = requests.get(
                f"https://basketball.realgm.com{player_url}")
            soup_player = BeautifulSoup(req_player_url.content, "html.parser")
            div_profile_box = soup_player.find("div", class_="profile-box")
            row = {"Name": name, "URL": player_url, "pos_option1": pos}
            row['pos_option2'] = div_profile_box.h2.span.text if div_profile_box.h2.span else None

            for p in div_profile_box.find_all("p"):
                try:
                    key, value = p.get_text(strip=True).split(':', 1)
                    row[key.strip()] = value.strip()
                except:     # not all entries have values
                    pass

            # Add img tags to row dictionary
            img_tags = div_profile_box.find_all('img')
            for i, img in enumerate(img_tags):
                row[f'img_{i+1}'] = img['src']

            data.append(row)

    return data


urls = ["https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc",
        "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2", "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3",
        "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/4"]
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/5",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/6",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/7",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/8",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/9",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/10",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/11",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/12",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/13",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/14",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/15",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/16",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/17",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/18",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/19",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/20",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/21",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/22",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/23",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/24",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/25",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/26",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/27",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/28",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/29",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/30",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/31",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/32",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/33",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/34",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/35",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/36",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/37",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/38",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/39",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/40",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/41",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/42",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/43",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/44",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/45",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/46",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/47",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/48",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/49",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/50",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/51",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/52",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/53",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/54",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/55",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/56",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/57",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/58",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/59",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/60",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/61",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/62",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/63",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/64",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/65",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/66",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/67",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/68",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/69",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/70",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/71",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/72",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/73",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/74",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/75",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/76",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/77",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/78",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/79",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/80",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/81",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/82",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/83",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/84",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/85",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/86",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/87",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/88",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/89",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/90",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/91",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/92",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/93",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/94",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/95",
#  "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/96"]

for url in urls:
    data = get_links2(url)
    for row in data:
        worksheet.insert_row(list(row.values()))

I tried to switch to "append_rows" instead of "insert_row" in my final statement. This created a very confusing error:

Traceback (most recent call last):
  File "c:\Users\AMadle\GLeague Tracking\(A) INTLimgScrape.py", line 175, in <module>
    worksheet.append_rows(list(row.values()))
  File "C:\Python\python3.10.5\lib\site-packages\gspread\worksheet.py", line 1338, in append_rows
    return self.spreadsheet.values_append(range_label, params, body)
  File "C:\Python\python3.10.5\lib\site-packages\gspread\spreadsheet.py", line 149, in values_append
    r = self.client.request("post", url, params=params, json=body)
  File "C:\Python\python3.10.5\lib\site-packages\gspread\client.py", line 86, in request
    raise APIError(response)
gspread.exceptions.APIError: {'code': 400, 'message': 'Invalid value at \'data.values[0]\' (type.googleapis.com/google.protobuf.ListValue), "Jaroslaw Zyskowski"\nInvalid value at \'data.values[1]\' (type.googleapis.com/google.protobuf.ListValue), "/player/Jaroslaw-Zyskowski/Summary/32427"\nInvalid value at \'data.values[2]\' (type.googleapis.com/google.protobuf.ListValue), "TRE"\nInvalid value at \'data.values[3]\' (type.googleapis.com/google.protobuf.ListValue), "SF"\nInvalid value at \'data.values[4]\' (type.googleapis.com/google.protobuf.ListValue), "Trefl Sopot"\nInvalid value at \'data.values[5]\' (type.googleapis.com/google.protobuf.ListValue), "Jul 16, 1992(30 years old)"\nInvalid value at \'data.values[6]\' (type.googleapis.com/google.protobuf.ListValue), "Wroclaw, Poland"\nInvalid value at \'data.values[7]\' (type.googleapis.com/google.protobuf.ListValue), "Poland"\nInvalid value at \'data.values[8]\' (type.googleapis.com/google.protobuf.ListValue), "6-7 (201cm)Weight:220 (100kg)"\nInvalid value at \'data.values[9]\' (type.googleapis.com/google.protobuf.ListValue), "Unrestricted Free Agent"\nInvalid value at \'data.values[10]\' (type.googleapis.com/google.protobuf.ListValue), "Manuel Capicchioni"\nInvalid value at \'data.values[11]\' (type.googleapis.com/google.protobuf.ListValue), "2014 NBA Draft"\nInvalid value at \'data.values[12]\' (type.googleapis.com/google.protobuf.ListValue), "Undrafted"\nInvalid value at \'data.values[13]\' (type.googleapis.com/google.protobuf.ListValue), "Kotwica Kolobrzeg (Poland)"\nInvalid value at \'data.values[14]\' (type.googleapis.com/google.protobuf.ListValue), "/images/nba/4.2/profiles/photos/2006/player_photo.jpg"\nInvalid value at \'data.values[15]\' (type.googleapis.com/google.protobuf.ListValue), "/images/basketball/5.0/team_logos/international/polish/trefl.png"', 'status': 'INVALID_ARGUMENT', 'details': [{'@type': 'type.googleapis.com/google.rpc.BadRequest', 'fieldViolations': [{'field': 'data.values[0]', 'description': 'Invalid value at \'data.values[0]\' (type.googleapis.com/google.protobuf.ListValue), "Jaroslaw Zyskowski"'}, {'field': 'data.values[1]', 'description': 'Invalid value at \'data.values[1]\' (type.googleapis.com/google.protobuf.ListValue), "/player/Jaroslaw-Zyskowski/Summary/32427"'}, {'field': 'data.values[2]', 'description': 'Invalid value at \'data.values[2]\' (type.googleapis.com/google.protobuf.ListValue), "TRE"'}, {'field': 'data.values[3]', 'description': 'Invalid value at \'data.values[3]\' (type.googleapis.com/google.protobuf.ListValue), "SF"'}, {'field': 'data.values[4]', 'description': 'Invalid value at \'data.values[4]\' (type.googleapis.com/google.protobuf.ListValue), "Trefl Sopot"'}, {'field': 'data.values[5]', 'description': 'Invalid value at \'data.values[5]\' (type.googleapis.com/google.protobuf.ListValue), "Jul 16, 1992(30 years old)"'}, {'field': 'data.values[6]', 'description': 'Invalid value at \'data.values[6]\' (type.googleapis.com/google.protobuf.ListValue), "Wroclaw, Poland"'}, {'field': 'data.values[7]', 'description': 'Invalid value at \'data.values[7]\' (type.googleapis.com/google.protobuf.ListValue), "Poland"'}, {'field': 'data.values[8]', 'description': 'Invalid value at \'data.values[8]\' (type.googleapis.com/google.protobuf.ListValue), "6-7 (201cm)Weight:220 (100kg)"'}, {'field': 'data.values[9]', 'description': 'Invalid value at \'data.values[9]\' (type.googleapis.com/google.protobuf.ListValue), "Unrestricted Free Agent"'}, {'field': 'data.values[10]', 'description': 'Invalid value at \'data.values[10]\' (type.googleapis.com/google.protobuf.ListValue), "Manuel Capicchioni"'}, {'field': 'data.values[11]', 'description': 'Invalid value at \'data.values[11]\' (type.googleapis.com/google.protobuf.ListValue), "2014 NBA Draft"'}, {'field': 'data.values[12]', 'description': 'Invalid value at \'data.values[12]\' (type.googleapis.com/google.protobuf.ListValue), "Undrafted"'}, {'field': 'data.values[13]', 'description': 'Invalid value at \'data.values[13]\' (type.googleapis.com/google.protobuf.ListValue), "Kotwica Kolobrzeg (Poland)"'}, {'field': 'data.values[14]', 'description': 'Invalid value at \'data.values[14]\' (type.googleapis.com/google.protobuf.ListValue), "/images/nba/4.2/profiles/photos/2006/player_photo.jpg"'}, {'field': 'data.values[15]', 'description': 'Invalid value at \'data.values[15]\' 
(type.googleapis.com/google.protobuf.ListValue), "/images/basketball/5.0/team_logos/international/polish/trefl.png"'}]}]}
PS C:\Users\AMadle\GLeague Tracking> 

Any thoughts as to how I could push this output to my Google Sheet in one move, rather than inserting rows each time?


Solution

  • In your showing script, worksheet.insert_row(list(row.values())) is used in a loop. I thought that this might be the reason for your current issue. In this case, how about the following modification?

    From:

    for url in urls:
        data = get_links2(url)
        for row in data:
            worksheet.insert_row(list(row.values()))
    

    To:

    values = []
    for url in urls:
        values = [*values, *get_links2(url)]
    
    if values != []:
        header = list(values[0].keys())
        values = [header, *[[e[k] if e.get(k) else "" for k in header] for e in values]]
        worksheet.append_rows(values, value_input_option="USER_ENTERED")
    
    • By this modification, after all values were retrieved in for url in urls:, the values are put into the Spreadsheet. This flow can be achieved by one API call. I thought that by this, your current issue might be able to be removed.

    • If you don't want to include the header row, please modify [header, *[[e[k] if e.get(k) else "" for k in header] for e in values]] to [[e[k] if e.get(k) else "" for k in header] for e in values].

    • If you want to put the value of every URL, how about the following modification? But, in this case, the Sheets API for the number of URLs is used.

      • From

        for url in urls:
            data = get_links2(url)
            for row in data:
                worksheet.insert_row(list(row.values()))
        
      • To

        header = None
        for url in urls:
            values = get_links2(url)
            if values != []:
                if not header:
                    header = list(values[0].keys())
                values = [[e[k] if e.get(k) else "" for k in header] for e in values]
                worksheet.append_rows(values, value_input_option="USER_ENTERED")