We are trying to scrape the table from here - https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season - into R. Here's what we've tried so far:
# GET request against the API endpoint found in the browser's network tab.
# This does not work: the request hangs.
httr::GET(
  url = paste0(
    "https://stats.nba.com/stats/leaguedashteamstats?",
    paste(
      c(
        "Conference=", "DateFrom=", "DateTo=", "Division=", "GameScope=",
        "GameSegment=", "LastNGames=0", "LeagueID=00", "Location=",
        "MeasureType=Advanced", "Month=0", "OpponentTeamID=0", "Outcome=",
        "PORound=0", "PaceAdjust=N", "PerMode=PerGame", "Period=0",
        "PlayerExperience=", "PlayerPosition=", "PlusMinus=N", "Rank=N",
        "Season=2020-21", "SeasonSegment=", "SeasonType=Regular+Season",
        "ShotClockRange=", "StarterBench=", "TeamID=0", "TwoWay=0",
        "VsConference=", "VsDivision="
      ),
      collapse = "&"
    )
  )
)
# rvest yields an empty nodeset when selecting the tables on the page
html_nodes(
  read_html("https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1&Season=2020-21&SeasonType=Regular%20Season"),
  "table"
)
Is it possible to scrape the main table from this webpage using R?
Edit:
# Request headers copied from a browser session for the www.nba.com stats page.
# The cookie value is kept as a single-line string: a line break inside the
# literal would put a newline into the HTTP header value. The OptanonControl
# cookie uses `&reg=ccpa`; the `&reg` had been rendered as the "®" character.
headers <- c(
  `authority` = 'www.nba.com',
  `cache-control` = 'max-age=0',
  `sec-ch-ua` = '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
  `sec-ch-ua-mobile` = '?0',
  `sec-ch-ua-platform` = '"macOS"',
  `upgrade-insecure-requests` = '1',
  `user-agent` = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
  `accept` = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  `sec-fetch-site` = 'same-origin',
  `sec-fetch-mode` = 'navigate',
  `sec-fetch-user` = '?1',
  `sec-fetch-dest` = 'document',
  `accept-language` = 'en-US,en;q=0.9',
  `cookie` = 'usprivacy=1YNN; AMCVS_248F210755B762187F000101%40AdobeOrg=1; s_ecid=MCMID%7C39761269548384710744541812242089157146; countryCode=US; s_cc=true; ug=61647d1f0252400a3f87470014d69025; nlhidescores=false; _pbjs_userid_consent_data=3524755945110770; qoscid=524912006.1633975588; qossid=1633975588; client_type=html5; client_version=4.4.0; ugs=1; OptanonAlertBoxClosed=2021-10-12T23:20:24.183Z; at_check=true; _parsely_visitor={%22id%22:%22pid=0cb0a9a5854f45ea8a6d48f74f03e800%22%2C%22session_count%22:1%2C%22last_session_ts%22:1634155541257}; ab.storage.deviceId.cf150dab-3153-49b0-b48c-66a7c18688ea=%7B%22g%22%3A%2228d2f640-2ad0-b8e9-b78c-016ba5a85671%22%2C%22c%22%3A1634155541318%2C%22l%22%3A1634155541318%7D; OptanonControl=ccc=US&csc=&cic=0&otvers=6.24.0&pctm=2021-10-12T23%3A20%3A24.183Z&reg=ccpa&ustcs=1YNN&vers=3.1.5; aam_uuid=39724801183369993254542124123886279717; s_ips=796; mbox=session#70d31bd3ea124acc80cb089a5594528e#1634158760|PC#70d31bd3ea124acc80cb089a5594528e.34_0#1697401700; ab.storage.sessionId.cf150dab-3153-49b0-b48c-66a7c18688ea=%7B%22g%22%3A%228dcfd2a2-4419-87f9-7e1c-22cf76830e7e%22%2C%22e%22%3A1634158700129%2C%22c%22%3A1634155541315%2C%22l%22%3A1634156900129%7D; s_tp=2924; s_ppv=nba%253Ateams%253Amain%2C27%2C27%2C796%2C1%2C3; ak_bmsc=2C1E9B2928FD1C90ECFF4A5887776269~000000000000000000000000000000~YAAQrL4cuDCzpVJ8AQAAytvzew1NuriisqR0MtOqexD1CqvqIJKuuhJda9NNGXOBCOjAdMEXnQjL10fYxWYj9HLm2DJdQLQIjLSqvl3faGyPbxWARg6dKwmf4NK/+RENdJTZfsKGTbwUMxTtPRSoR7TmMc3UWE4tAdft14nRiSPZwp/DJjK9NUhLtpTDjCa65HELyeJ7O4M4d98rAu5R7YYZOEVRjz5VRQEGaFBc5u2OlaUpcyFDqUM+j+jII/6xmqgwVRUhX8t8oNmdeiYpfEALo1yewznqZcfOO18htGp4sF3SLPG8bBFvLeGwW118Mu1rVkyeO4PEvC7UFZUc+a7tGNSjGyGe0WSC/0iSjTC+/ikP2BPwMosXe7DxWk/a0vuFtUlw7jArB/YQuYHH61uu8E97UTA=; AMCV_248F210755B762187F000101%40AdobeOrg=359503849%7CMCMID%7C39761269548384710744541812242089157146%7CMCAAMLH-1634771953%7C7%7CMCAAMB-1634771953%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1634174353s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C5.0.1%7CMCIDTS%7C18914; s_gpv_pageModal=nba%3Astats%3Ateams%3Aadvanced; s_sq=%5B%5BB%5D%5D; akavpau_allowednbamain=1634169266~id=4fd4cabce5336e66bef275d5dd409a10; bm_sv=467DB2784E3DE76FAA9F4CD21DD7DE3C~8bPs2wRiWvWAD8K8MYos9duNZqYto/EQc8HFibswczdPYqofRTJZOTE4Xy1RsB9fJag8YMdv3OOHkVFDGoh7aG8x4Y8eZepOfBGMFtPmQF0Vgg0XNix35HHU2sk9RKCEQujy2BRS4m269Y6fIapqEQ==; OptanonConsent=isIABGlobal=false&datestamp=Wed+Oct+13+2021+19%3A44%3A27+GMT-0400+(Eastern+Daylight+Time)&version=6.24.0&hosts=&consentId=e8a9be54-a345-44df-90e1-eaaf56d98079&interactionCount=2&landingPath=NotLandingPage&groups=BG30%3A1%2Cven%3A1%2Cpad%3A1%2Cpap%3A1%2Ccad%3A1%2Cmap%3A1%2Cdsa%3A1%2CNBAad%3A1%2Creq%3A1%2Csec%3A1%2Cgld%3A1%2Cpcp%3A1%2Cmcp%3A1%2Cmra%3A1%2Ctdc%3A1%2Ccos%3A1%2Cdid%3A1%2Csid%3A1%2Cpdd%3A1%2Cpcd%3A1%2CNBAmt%3A1&AwaitingReconsent=false&geolocation=US%3B'
)
# Query-string parameters for the stats page request.
params <- list(
  sort       = "W",
  dir        = "-1",
  Season     = "2020-21",
  SeasonType = "Regular Season"
)
# Request the stats page, sending `headers` as HTTP headers and `params` as
# the query string.
res <- httr::GET(
  url = "https://www.nba.com/stats/teams/advanced/",
  query = params,
  httr::add_headers(.headers = headers)
)
The code above returns a `res` object, but we are now struggling to extract the content from `res`.
As discussed in the comments, many of the headers and parameters are not needed, but this works:
library(data.table)
library(magrittr)
# Request headers for the stats.nba.com API call.
# No `If-Modified-Since` header is sent: it makes the request conditional, so
# the server may answer 304 Not Modified with an empty body, leaving nothing
# to parse.
headers <- c(
  `Connection` = 'keep-alive',
  `Accept` = 'application/json, text/plain, */*',
  `x-nba-stats-token` = 'true',
  `DNT` = '1',
  `User-Agent` = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
  `x-nba-stats-origin` = 'stats',
  `Sec-GPC` = '1',
  `Origin` = 'https://www.nba.com',
  `Sec-Fetch-Site` = 'same-site',
  `Sec-Fetch-Mode` = 'cors',
  `Sec-Fetch-Dest` = 'empty',
  `Referer` = 'https://www.nba.com/',
  `Accept-Language` = 'en-US,en;q=0.9'
)
# Query parameters for the leaguedashteamstats endpoint. Every value is sent
# as a string; filters that are not used are passed as empty strings.
params <- list(
  Conference       = "",
  DateFrom         = "",
  DateTo           = "",
  Division         = "",
  GameScope        = "",
  GameSegment      = "",
  LastNGames       = "0",
  LeagueID         = "00",
  Location         = "",
  MeasureType      = "Advanced",
  Month            = "0",
  OpponentTeamID   = "0",
  Outcome          = "",
  PORound          = "0",
  PaceAdjust       = "N",
  PerMode          = "PerGame",
  Period           = "0",
  PlayerExperience = "",
  PlayerPosition   = "",
  PlusMinus        = "N",
  Rank             = "N",
  Season           = "2020-21",
  SeasonSegment    = "",
  SeasonType       = "Regular Season",
  ShotClockRange   = "",
  StarterBench     = "",
  TeamID           = "0",
  TwoWay           = "0",
  VsConference     = "",
  VsDivision       = ""
)
# Query the stats API using the `headers` and `params` objects. The timeout
# makes a stalled connection fail with an error instead of hanging the session.
res <- httr::GET(
  url = 'https://stats.nba.com/stats/leaguedashteamstats',
  httr::add_headers(.headers = headers),
  httr::timeout(30),
  query = params
)
# Raise an R error carrying the HTTP status when the request did not succeed,
# rather than failing later while indexing into an empty or unexpected body.
httr::stop_for_status(res)

# Take the first element of `resultSets` from the parsed response; its
# `headers` element supplies the column names and `rowSet` holds the rows.
data <- httr::content(res)[['resultSets']][[1]]
column_names <- as.character(data$headers)
# Bind the rows into one data.table, then rename its columns by reference.
dt <- rbindlist(data$rowSet)
setnames(dt, column_names)
Gives:
head(dt, 2)
TEAM_ID TEAM_NAME GP W L W_PCT MIN E_OFF_RATING OFF_RATING E_DEF_RATING DEF_RATING E_NET_RATING NET_RATING AST_PCT
1: 1610612737 Atlanta Hawks 72 41 31 0.569 3481 113 114.3 110.6 112.1 2.5 2.2 0.591
2: 1610612738 Boston Celtics 72 36 36 0.5 3476 111 113.1 110 111.8 0.9 1.2 0.566
AST_TO AST_RATIO OREB_PCT DREB_PCT REB_PCT TM_TOV_PCT EFG_PCT TS_PCT E_PACE PACE PACE_PER40 POSS PIE GP_RANK W_RANK L_RANK
1: 1.82 17.6 0.284 0.742 0.516 0.133 0.539 0.581 99.9 98.68 82.23 7160 0.511 1 11 11
2: 1.67 17.1 0.289 0.737 0.51 0.141 0.543 0.574 100.7 98.94 82.45 7172 0.501 1 16 16
W_PCT_RANK MIN_RANK OFF_RATING_RANK DEF_RATING_RANK NET_RATING_RANK AST_PCT_RANK AST_TO_RANK AST_RATIO_RANK OREB_PCT_RANK
1: 11 11 9 18 11 18 14 20 6
2: 16 16 10 13 13 27 22 26 3
DREB_PCT_RANK REB_PCT_RANK TM_TOV_PCT_RANK EFG_PCT_RANK TS_PCT_RANK PACE_RANK PIE_RANK CFID CFPARAMS
1: 9 7 10 16 10 22 10 10 Atlanta Hawks
2: 13 10 18 12 16 20 17 10 Boston Celtics