Search code examples
r web-scraping rvest httr

Scraping tables from stats.nba.com - multiple approaches not working


We are trying to scrape the table from here - https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season - into R. Here's what we've tried so far:

# Attempt 1: GET the JSON endpoint found in the browser's network tab.
# Only the URL (with its full query string) is supplied; no custom request
# headers are added. This doesn't work - the request hangs.
httr::GET(url = 'https://stats.nba.com/stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Advanced&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2020-21&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=')
# Attempt 2: download the page's HTML and select every <table> node.
# rvest returns an empty nodeset when grabbing tables on this page.
# Assumes rvest is attached (it provides read_html(), html_nodes() and %>%).
'https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season' %>%
  read_html() %>%
  html_nodes('table')

Is it possible to scrape the main table from this webpage using R?

Edit:

# Named character vector of request headers; each name is a header name and
# each value is the header value to send. The values (Chrome user-agent,
# sec-ch-ua, sec-fetch-*) look like they were copied from a browser request
# for the www.nba.com page.
headers = c(
    `authority` = 'www.nba.com',
    `cache-control` = 'max-age=0',
    `sec-ch-ua` = '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    `sec-ch-ua-mobile` = '?0',
    `sec-ch-ua-platform` = '"macOS"',
    `upgrade-insecure-requests` = '1',
    `user-agent` = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
    `accept` = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    `sec-fetch-site` = 'same-origin',
    `sec-fetch-mode` = 'navigate',
    `sec-fetch-user` = '?1',
    `sec-fetch-dest` = 'document',
    `accept-language` = 'en-US,en;q=0.9',
    # NOTE(review): the cookie literal below continues onto a second source
    # line, so the header value contains a line break - possibly a paste
    # artifact; confirm before reuse. It also carries browser session/tracking
    # identifiers, which should not normally be shared or hard-coded.
    `cookie` = 'usprivacy=1YNN; AMCVS_248F210755B762187F000101%40AdobeOrg=1; s_ecid=MCMID%7C39761269548384710744541812242089157146; countryCode=US; s_cc=true; ug=61647d1f0252400a3f87470014d69025; nlhidescores=false; _pbjs_userid_consent_data=3524755945110770; qoscid=524912006.1633975588; qossid=1633975588; client_type=html5; client_version=4.4.0; ugs=1; OptanonAlertBoxClosed=2021-10-12T23:20:24.183Z; at_check=true; _parsely_visitor={%22id%22:%22pid=0cb0a9a5854f45ea8a6d48f74f03e800%22%2C%22session_count%22:1%2C%22last_session_ts%22:1634155541257}; ab.storage.deviceId.cf150dab-3153-49b0-b48c-66a7c18688ea=%7B%22g%22%3A%2228d2f640-2ad0-b8e9-b78c-016ba5a85671%22%2C%22c%22%3A1634155541318%2C%22l%22%3A1634155541318%7D; OptanonControl=ccc=US&csc=&cic=0&otvers=6.24.0&pctm=2021-10-12T23%3A20%3A24.183Z&reg=ccpa&ustcs=1YNN&vers=3.1.5; aam_uuid=39724801183369993254542124123886279717; s_ips=796; mbox=session#70d31bd3ea124acc80cb089a5594528e#1634158760|PC#70d31bd3ea124acc80cb089a5594528e.34_0#1697401700; ab.storage.sessionId.cf150dab-3153-49b0-b48c-66a7c18688ea=%7B%22g%22%3A%228dcfd2a2-4419-87f9-7e1c-22cf76830e7e%22%2C%22e%22%3A1634158700129%2C%22c%22%3A1634155541315%2C%22l%22%3A1634156900129%7D; s_tp=2924; s_ppv=nba%253Ateams%253Amain%2C27%2C27%2C796%2C1%2C3; ak_bmsc=2C1E9B2928FD1C90ECFF4A5887776269~000000000000000000000000000000~YAAQrL4cuDCzpVJ8AQAAytvzew1NuriisqR0MtOqexD1CqvqIJKuuhJda9NNGXOBCOjAdMEXnQjL10fYxWYj9HLm2DJdQLQIjLSqvl3faGyPbxWARg6dKwmf4NK/+RENdJTZfsKGTbwUMxTtPRSoR7TmMc3UWE4tAdft14nRiSPZwp/DJjK9NUhLtpTDjCa65HELyeJ7O4M4d98rAu5R7YYZOEVRjz5VRQEGaFBc5u2OlaUpcyFDqUM+j+jII/6xmqgwVRUhX8t8oNmdeiYpfEALo1yewznqZcfOO18htGp4sF3SLPG8bBFvLeGwW118Mu1rVkyeO4PEvC7UFZUc+a7tGNSjGyGe0WSC/0iSjTC+/ikP2BPwMosXe7DxWk/a0vuFtUlw7jArB/YQuYHH61uu8E97UTA=; 
AMCV_248F210755B762187F000101%40AdobeOrg=359503849%7CMCMID%7C39761269548384710744541812242089157146%7CMCAAMLH-1634771953%7C7%7CMCAAMB-1634771953%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1634174353s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C5.0.1%7CMCIDTS%7C18914; s_gpv_pageModal=nba%3Astats%3Ateams%3Aadvanced; s_sq=%5B%5BB%5D%5D; akavpau_allowednbamain=1634169266~id=4fd4cabce5336e66bef275d5dd409a10; bm_sv=467DB2784E3DE76FAA9F4CD21DD7DE3C~8bPs2wRiWvWAD8K8MYos9duNZqYto/EQc8HFibswczdPYqofRTJZOTE4Xy1RsB9fJag8YMdv3OOHkVFDGoh7aG8x4Y8eZepOfBGMFtPmQF0Vgg0XNix35HHU2sk9RKCEQujy2BRS4m269Y6fIapqEQ==; OptanonConsent=isIABGlobal=false&datestamp=Wed+Oct+13+2021+19%3A44%3A27+GMT-0400+(Eastern+Daylight+Time)&version=6.24.0&hosts=&consentId=e8a9be54-a345-44df-90e1-eaaf56d98079&interactionCount=2&landingPath=NotLandingPage&groups=BG30%3A1%2Cven%3A1%2Cpad%3A1%2Cpap%3A1%2Ccad%3A1%2Cmap%3A1%2Cdsa%3A1%2CNBAad%3A1%2Creq%3A1%2Csec%3A1%2Cgld%3A1%2Cpcp%3A1%2Cmcp%3A1%2Cmra%3A1%2Ctdc%3A1%2Ccos%3A1%2Cdid%3A1%2Csid%3A1%2Cpdd%3A1%2Cpcd%3A1%2CNBAmt%3A1&AwaitingReconsent=false&geolocation=US%3B'
  )
  
  # Query parameters for the page URL: sort column, sort direction, season
  # and season type. httr encodes these into the query string.
  params = list(
    `sort` = 'W',
    `dir` = '-1',
    `Season` = '2020-21',
    `SeasonType` = 'Regular Season'
  )
  
# GET the www.nba.com stats page (not the stats.nba.com endpoint tried
# earlier), sending `headers` as request headers and `params` as the query.
# The response object is stored in `res`.
res <- httr::GET(url = 'https://www.nba.com/stats/teams/advanced/', httr::add_headers(.headers=headers), query = params)

The code above returns a `res` object, but we are now struggling to extract the content from `res`.


Solution

  • As discussed in the comments, many of the headers and parameters are not needed, but this works:

    library(data.table)
    library(magrittr)
    
    # Request headers for the stats.nba.com JSON endpoint.
    # The hard-coded `If-Modified-Since` header from the captured browser
    # request is deliberately omitted: it makes the request conditional, so
    # the server may answer 304 Not Modified with an empty body, leaving
    # nothing to parse.
    headers <- c(
      `Connection` = 'keep-alive',
      `Accept` = 'application/json, text/plain, */*',
      `x-nba-stats-token` = 'true',
      `DNT` = '1',
      `User-Agent` = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
      `x-nba-stats-origin` = 'stats',
      `Sec-GPC` = '1',
      `Origin` = 'https://www.nba.com',
      `Sec-Fetch-Site` = 'same-site',
      `Sec-Fetch-Mode` = 'cors',
      `Sec-Fetch-Dest` = 'empty',
      `Referer` = 'https://www.nba.com/',
      `Accept-Language` = 'en-US,en;q=0.9'
    )
    
    # Query parameters for the leaguedashteamstats endpoint. Empty strings
    # are sent as empty query values, as in the captured browser request.
    params <- list(
      `Conference` = '',
      `DateFrom` = '',
      `DateTo` = '',
      `Division` = '',
      `GameScope` = '',
      `GameSegment` = '',
      `LastNGames` = '0',
      `LeagueID` = '00',
      `Location` = '',
      `MeasureType` = 'Advanced',
      `Month` = '0',
      `OpponentTeamID` = '0',
      `Outcome` = '',
      `PORound` = '0',
      `PaceAdjust` = 'N',
      `PerMode` = 'PerGame',
      `Period` = '0',
      `PlayerExperience` = '',
      `PlayerPosition` = '',
      `PlusMinus` = 'N',
      `Rank` = 'N',
      `Season` = '2020-21',
      `SeasonSegment` = '',
      `SeasonType` = 'Regular Season',
      `ShotClockRange` = '',
      `StarterBench` = '',
      `TeamID` = '0',
      `TwoWay` = '0',
      `VsConference` = '',
      `VsDivision` = ''
    )
    
    # A timeout turns a hanging request into an error instead of blocking
    # the session indefinitely.
    res <- httr::GET(
      url = 'https://stats.nba.com/stats/leaguedashteamstats',
      httr::add_headers(.headers = headers),
      httr::timeout(30),
      query = params
    )
    # Fail loudly on a non-success status rather than parsing an error body.
    httr::stop_for_status(res)
    
    # Parse the body explicitly as JSON and take the first result set, which
    # holds the column names (`headers`) and the data rows (`rowSet`).
    data <- httr::content(res, as = 'parsed', type = 'application/json') %>%
      .[['resultSets']] %>%
      .[[1]]
    column_names <- data$headers %>% as.character()
    
    if (length(data$rowSet) == 0) {
      stop('The API returned no rows for the requested parameters.', call. = FALSE)
    }
    
    # JSON null values are parsed as NULL; replace them with NA so that every
    # row has one value per column before the rows are bound together.
    rows <- lapply(data$rowSet, function(row) {
      lapply(row, function(value) if (is.null(value)) NA else value)
    })
    dt <- rbindlist(rows) %>% setnames(column_names)
    

    Gives:

    head(dt, 2)
          TEAM_ID      TEAM_NAME GP  W  L W_PCT  MIN E_OFF_RATING OFF_RATING E_DEF_RATING DEF_RATING E_NET_RATING NET_RATING AST_PCT
    1: 1610612737  Atlanta Hawks 72 41 31 0.569 3481          113      114.3        110.6      112.1          2.5        2.2   0.591
    2: 1610612738 Boston Celtics 72 36 36   0.5 3476          111      113.1          110      111.8          0.9        1.2   0.566
       AST_TO AST_RATIO OREB_PCT DREB_PCT REB_PCT TM_TOV_PCT EFG_PCT TS_PCT E_PACE  PACE PACE_PER40 POSS   PIE GP_RANK W_RANK L_RANK
    1:   1.82      17.6    0.284    0.742   0.516      0.133   0.539  0.581   99.9 98.68      82.23 7160 0.511       1     11     11
    2:   1.67      17.1    0.289    0.737    0.51      0.141   0.543  0.574  100.7 98.94      82.45 7172 0.501       1     16     16
       W_PCT_RANK MIN_RANK OFF_RATING_RANK DEF_RATING_RANK NET_RATING_RANK AST_PCT_RANK AST_TO_RANK AST_RATIO_RANK OREB_PCT_RANK
    1:         11       11               9              18              11           18          14             20             6
    2:         16       16              10              13              13           27          22             26             3
       DREB_PCT_RANK REB_PCT_RANK TM_TOV_PCT_RANK EFG_PCT_RANK TS_PCT_RANK PACE_RANK PIE_RANK CFID       CFPARAMS
    1:             9            7              10           16          10        22       10   10  Atlanta Hawks
    2:            13           10              18           12          16        20       17   10 Boston Celtics