Search code examples
r web-scraping rvest

Webscraping a page with a dynamic table using R


I am trying to webscrape the table from the following page using R (rvest)

https://www.nba.com/stats/players/passing?DateFrom=02/07/2024&DateTo=02/07/2024&dir=1

However, I noticed that the table itself has 3 pages, but when I click through to the 2nd and 3rd pages the URL does not change at all.

Does anyone know how to webscrape the 147 rows from the table using R please?

I have tried the following with no luck

# Download the static HTML of the stats page.
stats_url <- "https://www.nba.com/stats/players/passing?DateFrom=02/07/2024&DateTo=02/07/2024&dir=1"
page <- read_html(stats_url)

# Read the "q-data" attribute of the content container and parse it as JSON.
q_data <- html_attr(html_nodes(page, "div.l-content.pre-quench"), "q-data")
contentnodes <- jsonlite::fromJSON(q_data)

Thanks in advance


Solution

  • To round out the answers with an rvest solution (although it is wiser to use the API option suggested by @Kat):

    ### Point chromote (which rvest uses) at a browser; the path to MS Edge
    ### is used here
    Sys.setenv(CHROMOTE_CHROME = "C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe")

    ### Attach rvest
    library(rvest)

    ### Open the page in a live session, use the pagination dropdown to load
    ### all the results, then read the table(s)
    page_dropdown <- 'div[class*="Pagination_pageDropdown"] select'
    live_page <- read_html_live("https://www.nba.com/stats/players/passing?DateFrom=02/07/2024&DateTo=05/21/2024&page=all")
    live_page$click(css = page_dropdown, n_clicks = 1)
    live_page$press(css = page_dropdown, key_code = "ArrowUp")
    live_page$scroll_into_view(css = 'div[class*="FooterLegalSection_legalCopy"]')
    stats_tables <- html_elements(live_page, css = 'table[class*="Crom_table"]')
    html_table(stats_tables)
    

    Output :

    # A tibble: 213 × 16
       PLAYER            TEAM     GP     W     L   MIN PassesMade
       <chr>             <chr> <int> <int> <int> <dbl>      <dbl>
     1 A.J. Lawson       DAL       7     5     2   2.3        1.6
     2 AJ Green          MIL       6     2     4  11.2        9.3
     3 Aaron Gordon      DEN      12     7     5  37.1       38  
     4 Aaron Nesmith     IND      13     8     5  32.7       35.6
     5 Aaron Wiggins     OKC      10     6     4  15.7       12.2
     6 Al Horford        BOS      10     8     2  28.6       28.1
     7 Alec Burks        NYK       6     2     4  20.1       17.5
     8 Amir Coffey       LAC       6     2     4  18.7       13  
     9 Andre Jackson Jr. MIL       5     1     4  11.9       16.6
    10 Andrew Nembhard   IND      13     8     5  32         39  
    # ℹ 203 more rows
    # ℹ 9 more variables: PassesReceived <dbl>, AST <dbl>,
    #   SecondaryAST <dbl>, PotentialAST <dbl>,
    #   `AST PTSCreated` <dbl>, `AST PTSCreated` <dbl>,
    #   ASTAdj <dbl>, `AST ToPass%` <dbl>,
    #   `AST ToPass% Adj` <lgl>
    

    EDIT: To get the data for each day (less reliably than with the API), we can use something like this:

    ### Load the packages
    library(rvest)
    library(purrr)
    library(dplyr)
    library(lubridate)

    ### Setup chromote which is used by rvest (path to MS Edge is used here)
    Sys.setenv(
      CHROMOTE_CHROME = "C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe"
    )
    
    ### Column names for the scraped table, in the order they are applied.
    ### They replace the headers returned by rvest later on; the final
    ### "dummy" entry names a column that is dropped afterwards
    colnames <- c(
      "PLAYER", "TEAM", "GP", "W", "L", "MIN",
      "PassesMade", "PassesReceived",
      "AST", "SecondaryAST", "PotentialAST", "AST_PTSCreated", "ASTAdj",
      "AST_ToPass%", "AST_ToPass%_Adj",
      "dummy"
    )
    
    ### Collect the dates of every regular-season game,
    ### using the schedule pages on basketball-reference.com
    calendrier <- read_html("https://www.basketball-reference.com/leagues/NBA_2024_games.html")

    ### The schedule has one page per month; the regular season runs from
    ### October to April, so keep the first seven month links
    month_links <- html_elements(calendrier, xpath = '//div[@class="filter"]//a')
    urls <- html_attr(month_links, "href")
    season <- paste0("https://www.basketball-reference.com", urls[1:7])
    
    ### Read one monthly schedule page and extract its game dates.
    ###   x: URL of a monthly schedule page
    ### Returns the Date and Notes columns of the first table on the page.
    ext_dates <- function(x) {
      # End on the pipeline itself rather than on an assignment, so the
      # result is returned visibly when the function is called directly
      read_html(x) %>%
        html_element(xpath = "//table") %>%
        html_table() %>%
        select(Date, Notes)
    }
    
    ### Run the extractor over every monthly page
    cal <- map(season, ext_dates, .progress = TRUE)

    ### Stack the monthly tables, keep the rows whose Notes field is empty or
    ### missing, and keep the dates before 16 April (end of the regular season)
    schedule <- bind_rows(cal) %>%
      filter(is.na(Notes) | Notes == "") %>%
      select(-Notes) %>%
      mutate(fix_date = mdy(Date)) %>%
      filter(fix_date < ymd("2024-04-16")) %>%
      distinct()
    dates <- as.character(schedule$fix_date)
    
    ### Scrape the player passing table for a single date.
    ###   x:    date as a character string, inserted as-is into the DateFrom
    ###         and DateTo query parameters
    ###   wait: seconds to pause after opening the page and again before
    ###         reading the table, so the page has time to load completely
    ### Returns the table with the names from `colnames` (minus the trailing
    ### "dummy" column) and a leading `date` column set to `x`.
    ### Works whether the results fit on one page or are split over several.
    nba <- function(x, wait = 3) {
      dropdown <- 'div[class*="Pagination_pageDropdown"] select'
      url <- paste0(
        "https://www.nba.com/stats/players/passing?DateFrom=", x,
        "&DateTo=", x,
        "&page=all&SeasonType=Regular%20Season"
      )
      a <- read_html_live(url)
      # Release the browser session even when a later step fails; without
      # this, every date that errors would leave a session open
      on.exit(
        {
          a$session$close()
          rm(a)
          gc()
        },
        add = TRUE
      )
      Sys.sleep(wait)

      # The pagination dropdown is only handled when it is present. Counting
      # the matches of html_elements() avoids relying on the length of a
      # single-element lookup
      if (length(html_elements(a, css = dropdown)) > 0) {
        a$press(css = dropdown, key_code = "ArrowUp")
        a$scroll_into_view(css = 'div[class*="FooterLegalSection_legalCopy"]')
      }
      Sys.sleep(wait)

      a %>%
        html_element(css = 'table[class*="Crom_table"]') %>%
        html_table() %>%
        setNames(colnames) %>%
        select(-dummy) %>%
        mutate(date = x, .before = 1)
    }
    
    ### Wrap the scraper so that a failing date yields a one-row placeholder
    ### (date = NA) instead of stopping the whole run
    safe_nba <- possibly(nba, tibble(date = NA_character_))

    ### Scrape every date and stack the results
    out <- map(dates, safe_nba, .progress = TRUE)
    fin <- bind_rows(out)

    ### Dates that are absent from the result (shown for inspection)
    check_dates <- setdiff(dates, fin$date)
    check_dates

    ### Second pass: rerun the scraper for the missing dates only
    out_2 <- map(check_dates, safe_nba, .progress = TRUE)
    fix <- bind_rows(out_2)

    ### Label the placeholder rows of the second pass with the dates that are
    ### still missing, then combine both passes into one table sorted by date
    retry_failed <- setdiff(check_dates, fix$date)
    fix$date[is.na(fix$date)] <- retry_failed
    first_pass_ok <- filter(fin, !is.na(PLAYER))
    test <- arrange(bind_rows(first_pass_ok, fix), date)

    ### Rows that still have no player data, if any
    filter(test, is.na(PLAYER))
    

    Output (~24.5K rows):

    # A tibble: 24,617 × 16
       date       PLAYER    TEAM     GP     W     L   MIN PassesMade PassesReceived   AST SecondaryAST PotentialAST AST_PTSCreated ASTAdj `AST_ToPass%`
       <chr>      <chr>     <chr> <int> <int> <int> <dbl>      <dbl>          <dbl> <dbl>        <dbl>        <dbl>          <dbl>  <dbl>         <dbl>
     1 2023-10-24 Aaron Go… DEN       1     1     0  35           32             27     5            0           10             12      5          15.6
     2 2023-10-24 Andrew W… GSW       1     0     1  27.3         15             20     0            1            1              0      1           0  
     3 2023-10-24 Anthony … LAL       1     0     1  34.2         41             39     4            1            6             10      5           9.8
     4 2023-10-24 Austin R… LAL       1     0     1  31.3         39             37     4            0            8              9      4          10.3
     5 2023-10-24 Braxton … DEN       1     1     0   0.7          0              0     0            0            0              0      0           0  
     6 2023-10-24 Cam Redd… LAL       1     0     1  17.6          8              5     0            0            1              0      0           0  
     7 2023-10-24 Chris Pa… GSW       1     0     1  34.2         66             69     9            1           24             24     11          13.6
     8 2023-10-24 Christia… DEN       1     1     0  19.3         25             23     2            0            3              5      2           8  
     9 2023-10-24 Christia… LAL       1     0     1  15.5         10              8     0            0            0              0      0           0  
    10 2023-10-24 Collin G… DEN       1     1     0   0.7          0              0     0            0            0              0      0           0  
    # ℹ 24,607 more rows
    # ℹ 1 more variable: `AST_ToPass%_Adj` <dbl>
    # ℹ Use `print(n = ...)` to see more rows
    

    Note : stats for the 9th March 2024 are missing from the NBA website.