Search code examples
r purrr rvest

How to loop through a list and store the results in a data frame?


This is a follow up to my question here.

I want to learn how to loop through a list and then store (or bind) the results together.

The objective is to first create a list of game ids, append these to a generic url, then loop over these to get the boxscore tables and finally store the results (or add them together) in a dataframe.

I defined the game_ids like so:

# Game ids to fetch; each one is appended to the base url to form a boxscore url
game_ids <- list(401559239,401559240)

And the url:

# Base boxscore url; a game id is appended after a "/" for each request
url_ = "https://www.espn.com/nhl/boxscore/_/gameId"

I then added this code for the loop to the beginning of the suggested solution:

for (game_id in game_ids) {
  url2 = paste(url_, game_id, sep = '/')

Complete code:

library(rvest)
library(dplyr)
library(purrr)
library(tidyr)

# Game ids to fetch; each one is appended to the base url below.
game_ids <- list(401559239,401559240)
# Base boxscore url (without the trailing game id).
url_ = "https://www.espn.com/nhl/boxscore/_/gameId"
    
# For every game: download the boxscore page, split it into team sections,
# and build one skater table (tbl_1) and one goalie table (tbl_2) per team.
for (game_id in game_ids) {
  # Full page url: base url and game id joined with "/".
  url2 = paste(url_, game_id, sep = '/')
  
  # `boxscore` is a list named by team; each element is list(tbl_1, tbl_2).
  boxscore <- read_html(url2) %>% 
    # extract team sections (2)
    html_elements("div.Boxscore div.Wrapper") %>% 
    # extract team names, use as list element names
    set_names(html_elements(., ".BoxscoreItem__TeamName") %>% html_text()) %>% 
    # extract table elements, 4 per team
    map(\(team_section) html_elements(team_section, "table")) %>% 
    map(\(team_tables) list(
      # bind tables 1 & 2 (skaters/defensemen and data section)
      tbl_1 = html_table(team_tables[1:2]) %>% 
        bind_cols(.name_repair = "minimal") %>% 
        # column names from the first row
        set_names(.[1,]) %>% 
        rename(player = Skaters) %>% 
        # position to separate column: on header rows (G == "G") the player
        # cell is copied into a new leading `position` column, NA elsewhere
        mutate(position = if_else(G == "G", player, NA), .before = 1) %>% 
        # carry each header label down over the rows that follow it
        fill(position, .direction = "down") %>% 
        # remove rows with header info
        filter(G != "G"),
      # bind tables 3 & 4 (goalies and data section)
      tbl_2 = html_table(team_tables[3:4]) %>% 
        bind_cols(.name_repair = "minimal") %>% 
        # column names from the first row
        set_names(.[1,]) %>% 
        # drop rows whose SA cell is the literal "SA" (presumably header rows)
        filter(SA != "SA")
    ) 
    )
  # Stack the skater tables of all teams, adding the team name as a column.
  # NOTE: `output` is reassigned on every pass of the loop, so once the loop
  # has finished it holds the rows of the last game only.
  output = boxscore %>% 
    map("tbl_1") %>% 
    list_rbind(names_to = "team")   
}

I added this to the end of the loop (inside after list_rbind):

print(res)

I can see results of both games:

[1] "https://www.espn.com/nhl/boxscore/_/gameId/401559239"
# A tibble: 36 × 22
   team  position player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK    GV    SHFT  TOI   PPTOI SHTOI
   <chr> <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Mont… Skaters  J. An… 0     0     -1    3     1     1     1     2     2     0     1     25    17:31 1:46  0:00 
 2 Mont… Skaters  C. Ca… 1     0     0     1     1     1     0     0     2     0     1     23    19:11 3:59  0:00 
 3 Mont… Skaters  K. Da… 0     2     2     3     0     0     0     0     3     1     0     23    21:22 3:59  0:15 
 4 Mont… Skaters  J. Ev… 1     0     1     1     0     0     1     2     3     2     0     19    11:22 0:12  2:27 
 5 Mont… Skaters  B. Ga… 0     0     0     0     0     1     0     0     1     0     0     17    12:34 1:50  0:00 
 6 Mont… Skaters  R. Ha… 0     1     0     0     1     2     0     0     3     0     1     18    13:39 0:11  2:56 
 7 Mont… Skaters  S. Mo… 0     0     -1    3     2     1     0     0     0     1     0     22    18:51 4:00  1:10 
 8 Mont… Skaters  A. Ne… 2     0     1     2     0     0     1     2     2     0     2     22    16:46 1:49  0:00 
 9 Mont… Skaters  T. Pe… 0     0     0     0     1     0     0     0     4     0     0     18    12:10 0:00  1:10 
10 Mont… Skaters  J. Sl… 0     1     2     1     3     0     0     0     3     2     1     20    15:25 1:28  0:00 
# ℹ 26 more rows
# ℹ 4 more variables: ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
# ℹ Use `print(n = ...)` to see more rows
[1] "https://www.espn.com/nhl/boxscore/_/gameId/401559240"
# A tibble: 35 × 22
   team  position player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK    GV    SHFT  TOI   PPTOI SHTOI
   <chr> <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Otta… Skaters  D. Ba… 0     0     -1    5     3     0     0     0     1     0     0     24    18:21 4:44  0:18 
 2 Otta… Skaters  R. Ch… 0     0     1     3     0     0     0     0     4     0     0     24    13:28 0:07  2:51 
 3 Otta… Skaters  C. Gi… 0     0     -1    1     1     1     1     2     1     0     1     29    19:48 4:19  1:45 
 4 Otta… Skaters  R. Gr… 0     0     -1    2     0     0     0     0     0     0     0     26    16:04 3:49  3:15 
 5 Otta… Skaters  M. Jo… 1     1     1     1     2     2     0     0     0     1     0     29    17:52 0:21  6:00 
 6 Otta… Skaters  M. Ka… 0     0     -1    0     0     1     1     2     0     0     0     15    7:33  0:00  0:33 
 7 Otta… Skaters  P. Ke… 1     1     0     2     1     0     0     0     0     0     0     25    12:40 0:14  4:42 
 8 Otta… Skaters  D. Ku… 0     0     0     3     0     0     0     0     0     0     1     21    14:32 2:58  0:00 
 9 Otta… Skaters  T. St… 1     0     -1    3     0     0     2     4     1     0     2     28    21:44 5:25  2:12 
10 Otta… Skaters  V. Ta… 0     0     0     0     1     0     0     0     1     0     0     23    13:12 3:00  0:00 
# ℹ 25 more rows
# ℹ 4 more variables: ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
# ℹ Use `print(n = ...)` to see more rows

But I cannot work out how to bind them together in a data frame.

I tried res = rbind(output) and res = do.call(rbind,output)

But both return results of the last game only:

> head(res,10)
# A tibble: 10 × 22
   team  position player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK    GV    SHFT  TOI   PPTOI SHTOI
   <chr> <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Otta… Skaters  D. Ba… 0     0     -1    5     3     0     0     0     1     0     0     24    18:21 4:44  0:18 
 2 Otta… Skaters  R. Ch… 0     0     1     3     0     0     0     0     4     0     0     24    13:28 0:07  2:51 
 3 Otta… Skaters  C. Gi… 0     0     -1    1     1     1     1     2     1     0     1     29    19:48 4:19  1:45 
 4 Otta… Skaters  R. Gr… 0     0     -1    2     0     0     0     0     0     0     0     26    16:04 3:49  3:15 
 5 Otta… Skaters  M. Jo… 1     1     1     1     2     2     0     0     0     1     0     29    17:52 0:21  6:00 
 6 Otta… Skaters  M. Ka… 0     0     -1    0     0     1     1     2     0     0     0     15    7:33  0:00  0:33 
 7 Otta… Skaters  P. Ke… 1     1     0     2     1     0     0     0     0     0     0     25    12:40 0:14  4:42 
 8 Otta… Skaters  D. Ku… 0     0     0     3     0     0     0     0     0     0     1     21    14:32 2:58  0:00 
 9 Otta… Skaters  T. St… 1     0     -1    3     0     0     2     4     1     0     2     28    21:44 5:25  2:12 
10 Otta… Skaters  V. Ta… 0     0     0     0     1     0     0     0     1     0     0     23    13:12 3:00  0:00 
# ℹ 4 more variables: ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>

How can I store the desired output of each game, and add to a data frame?


Solution

  • With all the code it was difficult to spot the problem. :) However, the solution is kind of easy:

    In each iteration you overwrite the output variable. Hence, in the end you only see the game from the last iteration. To fix this, create output as an empty list before the loop and add one element to it in each iteration. After the loop, you can bind the rows of all elements together. Something like this should work:

    # Keep every game's skater table in a list keyed by game id, so that one
    # iteration never overwrites another; the list is stacked after the loop.
    output <- list()
    for (game_id in game_ids) {
      # Full page url: base url and game id joined with "/".
      url2 <- paste(url_, game_id, sep = "/")

      # Team sections (2), named after the team each one belongs to.
      team_sections <- html_elements(
        read_html(url2),
        "div.Boxscore div.Wrapper"
      )
      team_names <- html_text(
        html_elements(team_sections, ".BoxscoreItem__TeamName")
      )
      team_sections <- set_names(team_sections, team_names)

      # One list(tbl_1, tbl_2) per team, named by team.
      boxscore <- map(team_sections, \(team_section) {
        # Table elements, 4 per team.
        team_tables <- html_elements(team_section, "table")

        # Tables 1 & 2: skaters/defensemen and their data section.
        skaters <- bind_cols(
          html_table(team_tables[1:2]),
          .name_repair = "minimal"
        )
        # Column names come from the first row.
        skaters <- set_names(skaters, skaters[1, ]) %>%
          dplyr::rename(player = Skaters) %>%
          # Header rows (G == "G") supply the label for a leading `position`
          # column, which is then carried down over the rows below them.
          mutate(position = if_else(G == "G", player, NA), .before = 1) %>%
          fill(position, .direction = "down") %>%
          # Drop the rows that only hold header info.
          filter(G != "G")

        # Tables 3 & 4: goalies and their data section.
        goalies <- bind_cols(
          html_table(team_tables[3:4]),
          .name_repair = "minimal"
        )
        goalies <- set_names(goalies, goalies[1, ]) %>%
          filter(SA != "SA")

        list(tbl_1 = skaters, tbl_2 = goalies)
      })

      # Stack the skater tables of both teams, team name as a column, and
      # store the result under the game id (as a character name).
      skaters_by_team <- map(boxscore, "tbl_1")
      output[[as.character(game_id)]] <- list_rbind(
        skaters_by_team,
        names_to = "team"
      )
    }
    # Stack all games; the list names (game ids) become the `game` column.
    output <- bind_rows(output, .id = "game")
    

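    Note: I had to convert game_id to a character in output[[as.character(game_id)]], because `[[` treats a numeric index as a position, not a name: output[[401559239]] would refer to element number 401559239 of the list instead of an element named after the game.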