Search code examples
r, selenium-webdriver, web-scraping, rvest

web scrape espn box score data in R


I am very new to web scraping and I am trying to extract all of the information in the NHL box score tables for certain games. For example, for the game with the id of 401459058, it is all of the information in this table (url: https://www.espn.com/nhl/boxscore/_/gameId/401044673):

enter image description here

I tried using the following:

library(RSelenium)
library(netstat)
library(wdman)
library(rvest)
library(xml2)
library(dplyr)

url <- "https://www.espn.com/nhl/boxscore/_/gameId/401044673"

# Start a Selenium server and a Firefox client on a free port.
# chromever = NULL leaves chromedriver out entirely; it is not needed when the
# browser is Firefox, and a pinned chromedriver version can fail to resolve.
rD <- rsDriver(browser = "firefox", chromever = NULL, port = free_port())

# rsDriver() already opens the client session, so calling remDr$open() here
# would only launch a second browser window.
remDr <- rD[["client"]]
remDr$navigate(url)
src <- remDr$getPageSource()[[1]]

# The rendered HTML is captured in `src`; release the browser and the server.
remDr$close()
rD$server$stop()

# Every text node inside the matching table rows, as one flat character vector.
df <- read_html(src) %>%
  html_elements(xpath = "//tr[@class = 'Table__TR Table__TR--sm Table__even']//text()") %>%
  html_text()

The result is a single character vector. If I use:

# One row per scraped text node, so all values end up in a single column.
df %>%
  unlist() %>%
  matrix(nrow = length(df), byrow = TRUE) %>%
  as.data.frame()

I get the values in a column like so:

2                 2
3                 4
4                 2
5                 8
6               STL
7                 2
8                 0
9                 2
10                4
11          Skaters
12        K. Connor
13                 
14               LW
15        N. Ehlers
16   

But I can't work out how to get the values into a df that looks like the table on the website, which is the expected output.


Solution

  • The tables are included in the page content, so for this exact task there's no need for {RSelenium}; {rvest} alone should do just fine. Note, however, that there are 4 tables per team.

    The following code processes the two team sections, binds the two pairs of tables within each section, and returns a list with two tables per team:

    library(rvest)
    library(dplyr, warn.conflicts = FALSE)
    library(purrr)
    library(tidyr)
    
    url_ <- "https://www.espn.com/nhl/boxscore/_/gameId/401459058"

    # Bind a pair of tables (name column + data section) side by side and use
    # the values of the first row as column names.
    bind_table_pair <- function(table_pair) {
      combined <- bind_cols(html_table(table_pair), .name_repair = "minimal")
      set_names(combined, combined[1, ])
    }

    # Turn one team section into its two combined tables.
    parse_team_section <- function(team_section) {
      # each team section holds 4 tables
      team_tables <- html_elements(team_section, "table")

      # tables 1 & 2: skaters / defensemen and their data section
      skaters <- bind_table_pair(team_tables[1:2]) %>%
        rename(player = Skaters) %>%
        # header rows repeat the column labels (G == "G"); their first cell
        # ("Skaters" / "defensemen") is moved into a separate position column
        mutate(position = if_else(G == "G", player, NA), .before = 1) %>%
        fill(position, .direction = "down") %>%
        filter(G != "G")

      # tables 3 & 4: goalies and their data section
      goalies <- bind_table_pair(team_tables[3:4]) %>%
        filter(SA != "SA")

      list(tbl_1 = skaters, tbl_2 = goalies)
    }

    # one section per team (2)
    team_sections <- read_html(url_) %>%
      html_elements("div.Boxscore div.Wrapper")

    # team names are used as list element names
    team_names <- team_sections %>%
      html_elements(".BoxscoreItem__TeamName") %>%
      html_text()

    boxscore <- team_sections %>%
      set_names(team_names) %>%
      map(parse_team_section)
    

    Result:

    glimpse(boxscore)
    #> List of 2
    #>  $ Los Angeles Kings:List of 2
    #>   ..$ tbl_1: tibble [18 × 21] (S3: tbl_df/tbl/data.frame)
    #>   ..$ tbl_2: tibble [1 × 12] (S3: tbl_df/tbl/data.frame)
    #>  $ Boston Bruins    :List of 2
    #>   ..$ tbl_1: tibble [18 × 21] (S3: tbl_df/tbl/data.frame)
    #>   ..$ tbl_2: tibble [1 × 12] (S3: tbl_df/tbl/data.frame)
    boxscore
    #> $`Los Angeles Kings`
    #> $`Los Angeles Kings`$tbl_1
    #> # A tibble: 18 × 21
    #>    position   player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK   
    #>    <chr>      <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    #>  1 Skaters    J. An… 0     0     -1    1     2     0     0     0     2     0    
    #>  2 Skaters    P. Da… 0     0     0     1     1     2     1     2     0     2    
    #>  3 Skaters    K. Fi… 0     2     1     4     1     0     0     0     0     0    
    #>  4 Skaters    C. Gr… 0     0     0     0     0     0     1     2     0     0    
    #>  5 Skaters    A. Ia… 0     0     0     3     0     0     0     0     0     1    
    #>  6 Skaters    A. Ka… 0     0     0     1     1     0     0     0     0     0    
    #>  7 Skaters    A. Ke… 2     0     0     3     2     0     0     0     1     0    
    #>  8 Skaters    A. Ko… 0     1     1     3     2     1     0     0     1     3    
    #>  9 Skaters    R. Ku… 0     0     0     0     0     0     0     0     1     0    
    #> 10 Skaters    B. Li… 0     0     0     3     0     0     1     2     0     1    
    #> 11 Skaters    T. Mo… 0     0     0     6     3     1     1     2     0     1    
    #> 12 Skaters    G. Vi… 0     0     -1    0     1     0     0     0     0     1    
    #> 13 defensemen M. An… 0     0     0     1     0     2     0     0     1     0    
    #> 14 defensemen D. Do… 0     1     0     1     1     1     2     4     0     1    
    #> 15 defensemen S. Du… 0     0     -1    0     1     2     1     2     0     1    
    #> 16 defensemen A. Ed… 0     0     1     2     1     6     0     0     4     0    
    #> 17 defensemen M. Ro… 0     0     -1    1     0     2     0     0     3     1    
    #> 18 defensemen S. Wa… 0     0     1     0     0     3     0     0     0     0    
    #> # ℹ 9 more variables: GV <chr>, SHFT <chr>, TOI <chr>, PPTOI <chr>,
    #> #   SHTOI <chr>, ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
    #> 
    #> $`Los Angeles Kings`$tbl_2
    #> # A tibble: 1 × 12
    #>   goalies     SA    GA    SV    `SV%` ESSV  PPSV  SHSV  SOSA  SOS   TOI   PIM  
    #>   <chr>       <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    #> 1 P. Copley G 35    2     33    .943  17    12    4     0     0     64:48 0    
    #> 
    #> 
    #> $`Boston Bruins`
    #> $`Boston Bruins`$tbl_1
    #> # A tibble: 18 × 21
    #>    position   player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK   
    #>    <chr>      <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    #>  1 Skaters    P. Be… 0     0     1     6     0     2     0     0     0     0    
    #>  2 Skaters    C. Co… 0     1     1     0     0     0     0     0     1     2    
    #>  3 Skaters    J. De… 0     0     0     2     1     1     0     0     0     0    
    #>  4 Skaters    N. Fo… 0     0     0     0     0     2     0     0     1     1    
    #>  5 Skaters    T. Fr… 0     0     1     0     2     0     0     0     4     0    
    #>  6 Skaters    A.J. … 0     0     0     0     0     2     0     0     3     1    
    #>  7 Skaters    T. Ha… 1     0     1     4     2     0     0     0     0     0    
    #>  8 Skaters    D. Kr… 0     0     -1    1     0     0     0     0     1     1    
    #>  9 Skaters    B. Ma… 1     0     0     2     1     0     3     6     2     1    
    #> 10 Skaters    T. No… 0     0     0     2     1     0     0     0     0     0    
    #> 11 Skaters    D. Pa… 0     1     -1    5     4     0     0     0     0     1    
    #> 12 Skaters    P. Za… 0     0     -1    3     0     0     0     0     1     0    
    #> 13 defensemen B. Ca… 0     0     1     0     2     3     1     2     1     0    
    #> 14 defensemen C. Cl… 0     0     0     1     0     1     1     2     5     2    
    #> 15 defensemen D. Fo… 0     0     -1    3     0     1     0     0     2     0    
    #> 16 defensemen M. Gr… 0     1     1     3     0     2     0     0     0     1    
    #> 17 defensemen H. Li… 0     0     0     1     0     1     0     0     0     0    
    #> 18 defensemen C. Mc… 0     1     -1    2     1     1     1     4     1     1    
    #> # ℹ 9 more variables: GV <chr>, SHFT <chr>, TOI <chr>, PPTOI <chr>,
    #> #   SHTOI <chr>, ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
    #> 
    #> $`Boston Bruins`$tbl_2
    #> # A tibble: 1 × 12
    #>   goalies      SA    GA    SV    `SV%` ESSV  PPSV  SHSV  SOSA  SOS   TOI   PIM  
    #>   <chr>        <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    #> 1 L. Ullmark G 30    2     28    .933  23    3     2     0     0     64:43 0
    

    To access a single table from the list:

    boxscore$`Los Angeles Kings`$tbl_1
    #> # A tibble: 18 × 21
    #>    position   player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK   
    #>    <chr>      <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    #>  1 Skaters    J. An… 0     0     -1    1     2     0     0     0     2     0    
    #>  2 Skaters    P. Da… 0     0     0     1     1     2     1     2     0     2    
    #>  3 Skaters    K. Fi… 0     2     1     4     1     0     0     0     0     0    
    #> ...
    #> 18 defensemen S. Wa… 0     0     1     0     0     3     0     0     0     0    
    #> # ℹ 9 more variables: GV <chr>, SHFT <chr>, TOI <chr>, PPTOI <chr>,
    #> #   SHTOI <chr>, ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
    

    As we are dealing with named lists here, it's super-convenient to use purrr::map() and list_rbind() to extract the matching tables, row-bind them, and add a team column taken from the list names:

    # Pick each team's tbl_1 and stack them; list names fill the team column.
    skater_tables <- map(boxscore, "tbl_1")
    list_rbind(skater_tables, names_to = "team")
    #> # A tibble: 36 × 22
    #>    team    position player G     A     `+/-` S     SM    BS    PN    PIM   HT   
    #>    <chr>   <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    #>  1 Los An… Skaters  J. An… 0     0     -1    1     2     0     0     0     2    
    #>  2 Los An… Skaters  P. Da… 0     0     0     1     1     2     1     2     0    
    #>  3 Los An… Skaters  K. Fi… 0     2     1     4     1     0     0     0     0    
    #>  4 Los An… Skaters  C. Gr… 0     0     0     0     0     0     1     2     0    
    #>  5 Los An… Skaters  A. Ia… 0     0     0     3     0     0     0     0     0    
    #>  6 Los An… Skaters  A. Ka… 0     0     0     1     1     0     0     0     0    
    #>  7 Los An… Skaters  A. Ke… 2     0     0     3     2     0     0     0     1    
    #>  8 Los An… Skaters  A. Ko… 0     1     1     3     2     1     0     0     1    
    #>  9 Los An… Skaters  R. Ku… 0     0     0     0     0     0     0     0     1    
    #> 10 Los An… Skaters  B. Li… 0     0     0     3     0     0     1     2     0    
    #> # ℹ 26 more rows
    #> # ℹ 10 more variables: TK <chr>, GV <chr>, SHFT <chr>, TOI <chr>, PPTOI <chr>,
    #> #   SHTOI <chr>, ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
    
    # Pick each team's tbl_2 and stack them; list names fill the team column.
    goalie_tables <- map(boxscore, "tbl_2")
    list_rbind(goalie_tables, names_to = "team")
    #> # A tibble: 2 × 13
    #>   team       goalies SA    GA    SV    `SV%` ESSV  PPSV  SHSV  SOSA  SOS   TOI  
    #>   <chr>      <chr>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    #> 1 Los Angel… P. Cop… 35    2     33    .943  17    12    4     0     0     64:48
    #> 2 Boston Br… L. Ull… 30    2     28    .933  23    3     2     0     0     64:43
    #> # ℹ 1 more variable: PIM <chr>
    

    Created on 2023-10-12 with reprex v2.0.2