
Avoid 404 error when scraping from multiple pages using rvest


This is a follow-up to my question here.

The code provided gives the desired output, but it throws an error when a page does not exist. I am trying to use tryCatch to skip these errors and continue.

For example, I am specifying all dates with the following:

month <- c('02')
year <- c('2024')
day <- c('220','270','280')   
team <- c('CHI')

This is fine: Chicago played home games on all these days, so the following URLs all work:

https://www.basketball-reference.com/boxscores/202402220CHI.html

https://www.basketball-reference.com/boxscores/202402270CHI.html

https://www.basketball-reference.com/boxscores/202402280CHI.html

But if I add another day and/or month like so:

month <- c('02')
year <- c('2024')
day <- c('210','220','270','280')
team <- c('CHI')

Chicago did not play a home game on 21 Feb 2024, so this URL doesn't exist:

https://www.basketball-reference.com/boxscores/202402210CHI.html
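One way to confirm this up front (a sketch using the httr package, which the question's code does not load) is to send a HEAD request and inspect the HTTP status code:

library(httr)

url <- "https://www.basketball-reference.com/boxscores/202402210CHI.html"
status_code(HEAD(url))   # 404 for this missing box score, 200 for pages that exist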

I tried adding tryCatch to the code like this:

page <- tryCatch(read_html(url), error = function(err) "error 404")

But then I received this message:

no applicable method for 'xml_find_first' applied to an object of class "character"
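This happens because the error handler returns the plain character string "error 404", and the rest of the function then calls rvest selectors on it; html_element() dispatches to xml_find_first(), which has no method for character vectors. A minimal reproduction:

library(rvest)

page <- "error 404"                          # what the error handler returned
page %>% html_element("div.scorebox_meta")   # Error: no applicable method for 'xml_find_first' ...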

How can I skip the pages that don't exist and just return the values from those that do?

Full code:

library(rvest)
library(dplyr)
library(tidyr)

##sample only - ultimately this will include all teams and all months and days
month <- c('02')
year <- c('2024')
day <- c('220','270','280')   
team <- c('CHI')

make_url <- function(team, year, month, day) {
   paste0('https://www.basketball-reference.com/boxscores/', year, month, day, team, '.html')
}

dates <- expand.grid(team = team, year = year, month = month, day = day)

urls <- dates |>
   mutate( url = make_url(team, year, month, day),
      team = team,
      date = paste(year, month, gsub('.{1}$', '', day), sep = '-'),
      .keep = 'unused'
   )

getPageTable <- function(url) {
   #read the page
   page <- read_html(url)

   #get the game's date
   gamedate <- page %>% html_element("div.scorebox_meta div") %>% html_text2()
   
   #get game title
   gameInfo <- page %>% html_elements("div.box h1") %>% html_text()
   #get the table headings
   headings <- page %>% html_elements("div.section_wrapper") %>% html_element("h2") %>% html_text()
   
   #find the quarter scores
   quarters <- grep("Q[1-4]", headings)
   
   #retrieve the tables from the page
   tables <- page %>% html_elements("div.section_wrapper") %>% html_element("table") 

   #select the desired headings and tables
   headings <- headings[quarters]
   tables <- tables[quarters] %>% html_table(header=FALSE)

   #add game date and team name/quarter to the results
   tables <- lapply(seq_along(tables), function(i) {
      #set column titles to the second row
      names(tables[[i]]) <- tables[[i]][2,]
      tables[[i]] <- tables[[i]][-c(1:2),]
      tables[[i]]$gamedate <- gamedate
      tables[[i]]$team <- headings[i]
      tables[[i]]$title <- gameInfo
      tables[[i]]
   })
   #merge the quarterly stats into one data frame
   df <- bind_rows(tables)
   df <- df %>% filter(!Starters %in% c("Reserves", "Team Totals"))
   df
}


#loop through the URLs
dfs <- lapply(urls$url, getPageTable)
#merge into one big table
finalResult <- bind_rows(dfs)
finalResult <- finalResult %>% separate("team", into=c("team", "quarter"), " \\(")
finalResult$quarter <- sub("\\)", "", finalResult$quarter)

Solution

  • Here is a solution: wrap the call to read_html in tryCatch and return the error condition if anything goes wrong, then test for that condition right after the read. The result is a list containing both data frames (for URLs that work) and error objects (for URLs that don't), which you can separate outside the function.
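
    The key idea in isolation (a toy sketch, not the scraper itself): the handler returns the condition object, inherits() detects it afterwards, and conditionMessage() extracts the message.

    res <- tryCatch(stop("boom"), error = function(e) e)
    inherits(res, "error")    # TRUE -- the handler returned the condition object
    conditionMessage(res)     # "boom"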

    Here is the function, corrected.

    getPageTable <- function(url) {
      # read the page, returning the error condition if error 404 (or other)
      page <- tryCatch(
        read_html(url),
        error = function(e) e
      )
      if(inherits(page, "error")) {
        return(page)
      }
      # then continue as in the question's code 
      #get the game's date
      gamedate <- page %>% html_element("div.scorebox_meta div") %>% html_text2()
      
      #get game title
      gameInfo <- page %>% html_elements("div.box h1") %>% html_text()
      #get the table headings
      headings <- page %>% html_elements("div.section_wrapper") %>% html_element("h2") %>% html_text()
      
      #find the quarter scores
      quarters <- grep("Q[1-4]", headings)
      
      #retrieve the tables from the page
      tables <- page %>% html_elements("div.section_wrapper") %>% html_element("table") 
      
      #select the desired headings and tables
      headings <- headings[quarters]
      tables <- tables[quarters] %>% html_table(header=FALSE)
      
      #add game date and team name/quarter to the results
      tables <- lapply(seq_along(tables), function(i) {
        #set column titles to the second row
        names(tables[[i]]) <- tables[[i]][2,]
        tables[[i]] <- tables[[i]][-c(1:2),]
        tables[[i]]$gamedate <- gamedate
        tables[[i]]$team <- headings[i]
        tables[[i]]$title <- gameInfo
        tables[[i]]
      })
      #merge the quarterly stats into one data frame
      df <- bind_rows(tables)
      df <- df %>% filter(!Starters %in% c("Reserves", "Team Totals"))
      df
    }
    

    Call the function above, check for valid return values, and decide what to do with the errors. In the example below, the bad URLs and their corresponding error messages are printed.

    #loop through the URLs
    dfs <- lapply(urls$url, getPageTable)
    # get which weren't read in
    err <- sapply(dfs, inherits, what = "error")
    # optional, make a list of the bad ones
    dfs_err <- dfs[err]
    # and print the URLs and error messages
    for(i in which(err)) {
      urls$url[i] %>% message()
      dfs[[i]] %>%
        conditionMessage() %>%
        message()
    }
    
    # these are the good ones and the rest of the code is like in the question
    dfs <- dfs[!err]
    #merge into one big table
    finalResult <- bind_rows(dfs)
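
    If keeping the error objects is not needed, purrr::possibly() offers a more compact alternative; applied to the question's original getPageTable (the version that lets errors propagate), it substitutes a default value for failures (a sketch assuming purrr, which is not loaded above):

    library(purrr)

    safe_get <- possibly(getPageTable, otherwise = NULL)
    dfs <- lapply(urls$url, safe_get)
    #merge into one big table; bind_rows() drops the NULL entries
    finalResult <- bind_rows(dfs)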