Search code examples
r web-scraping rselenium

Rselenium scraping table


I want to extract the data in the "Completed Games" table located here "https://www.chess.com/member/magnuscarlsen".

The code below gives me a list of size 0. The Selenium side of things seems to be working: a Firefox browser opens on my desktop and navigates to the page. Any help would be greatly appreciated. I'm at my wits' end!

# Scrape attempt from the question: drive Firefox through Selenium, grab the
# rendered page source, and look for the "Completed Games" table in it.
# NOTE(review): rsDriver(), read_html(), html_nodes() and %>% are presumably
# provided by RSelenium and rvest; the library() calls are not shown here.
rD <- rsDriver(browser="firefox", port=4442L, verbose=F)
# The driver object is indexed by name to get the client used for navigation.
remDr <- rD[["client"]]

remDr$navigate("https://www.chess.com/member/magnuscarlsen")

Sys.sleep(5) # give the page time to fully load
# getPageSource() is indexed with [[1]] to pull the HTML text out of its result.
html <- remDr$getPageSource()[[1]]

# Re-parse the HTML string so it can be queried with CSS selectors.
html <- read_html(html)

# NOTE(review): in CSS a space is a descendant combinator, so this selector asks
# for an <archived-games-table> element inside a <table-hover> element inside
# the table, which would match nothing. If the three names are meant to be
# classes of the same <table>, the compound form would be
# "table.table-component.table-hover.archived-games-table". This is the likely
# cause of the empty result; confirm against the live page markup.
signal <- html %>%
  html_nodes("table.table-component table-hover archived-games-table")  


Solution


  1. If you don't mind not having the accuracy figures (for which I believe there is no published basis for calculation), have a look at the public APIs from Chess.com. You do get all the move information included.

    In particular, look at the implementation in the bigchess package. I have adapted its examples below:

    All games:

    library(rjson)
    library(bigchess)
    
    # Download every archive listed for one player by the public Chess.com API
    # and stack the parsed games into a single data frame.
    user <- "magnuscarlsen"
    json_file <- paste0("https://api.chess.com/pub/player/", user, "/games/archives")
    json_data <- fromJSON(paste(readLines(json_file), collapse = ""))
    # Parse each archive first and bind once at the end: calling rbind() inside
    # the loop re-copies all rows accumulated so far on every iteration.
    monthly <- lapply(json_data$archives, function(archive_url) {
      read.pgn(paste0(archive_url, "/pgn"))
    })
    # do.call(rbind, list()) would give NULL, so keep the empty-data-frame result
    # the loop version produced when no archives are listed.
    result <- if (length(monthly) > 0) do.call(rbind, monthly) else data.frame()
    

    Single month:

    library(bigchess)
    
    # Read one month of games (2020/12) directly from the PGN endpoint.
    df <- read.pgn("https://api.chess.com/pub/player/magnuscarlsen/games/2020/12/pgn")
    # Show only the games of a single day. The trailing comma matters: with a
    # single index a data frame is subset by column, so the logical vector must
    # sit in the row position. %in% (rather than ==) never yields NA, so games
    # with a missing Date are dropped instead of appearing as all-NA rows.
    print(df[df$Date %in% "2020.12.11", ])
    

    2. Adding in the accuracies, as requested. Most of the info on that page is actually available via the APIs:
    
    library(bigchess)
    #> Warning: package 'bigchess' was built under R version 4.0.3
    library(purrr)
    library(jsonlite)
    #> Warning: package 'jsonlite' was built under R version 4.0.3
    #> 
    #> Attaching package: 'jsonlite'
    #> The following object is masked from 'package:purrr':
    #> 
    #>     flatten
    library(stringr)
    
    # Fetch a JSON document, retrying while the response has no `data` element.
    # Pattern adapted from https://blog.r-hub.io/2020/04/07/retry-wheel/
    #
    # link:        URL (or path) handed to read_json().
    # max_retries: number of additional requests after the first one (default 5).
    # base_delay:  seconds; the pause before retry k is k * base_delay.
    #
    # Returns the last response received, which may still lack `data` if every
    # attempt came back empty. Errors raised by read_json() are not caught here.
    try_again <- function(link, max_retries = 5L, base_delay = 0.25) {
      resp <- read_json(link)
      attempt <- 1L
      while (attempt <= max_retries && is.null(resp$data)) {
        # Back off a little longer before each new request.
        Sys.sleep(attempt * base_delay)
        # Retry the same link (the earlier version passed an undefined `.` here).
        resp <- read_json(link)
        attempt <- attempt + 1L
      }
      resp
    }
    
    # Combine one day's games (parsed from the monthly PGN) with the per-game
    # accuracy figures served by the game-analysis callback.
    url <- "https://api.chess.com/pub/player/magnuscarlsen/games/2020/12"
    target_date <- "2020.12.11"
    
    result <- read.pgn(paste0(url, "/pgn"))
    #> Warning in readLines(con): incomplete final line found on 'https://
    #> api.chess.com/pub/player/magnuscarlsen/games/2020/12/pgn'
    #> 2021-02-15 20:29:04, successfully imported 47 games
    #> 2021-02-15 20:29:04, N moves computed
    #> 2021-02-15 20:29:04, extract moves done
    #> 2021-02-15 20:29:04, stat moves computed
    # Base row subsetting on purpose: dplyr is not attached in this script, so a
    # bare filter() call apparently resolves to stats::filter() and returns a
    # time series, which later makes cbind() fail inside .cbind.ts.
    result <- result[result$Date %in% target_date, ]
    
    data <- read_json(url)
    # Keep the API records whose PGN text carries the same date in its UTCDate
    # tag. isTRUE() treats a record without a pgn field as "no match" instead of
    # silently shortening the mask.
    date_tag <- fixed(paste0('UTCDate "', target_date, '"'))
    mask <- map_lgl(data$games, ~ isTRUE(str_detect(.x$pgn, date_tag)))
    games <- data$games[mask]
    # The first run of digits in each game URL is used as the game id. sprintf()
    # returns a zero-length vector when there are no ids, whereas paste0() would
    # still emit one malformed URL.
    game_ids <- map_chr(games, ~ str_extract(.x$url, "\\d+"))
    games <- sprintf("https://www.chess.com/callback/analysis/game/live/%s/all", game_ids)
    
    df <- map_df(games, ~ {
      json_data <- try_again(.x)
      # pluck() falls back to NA when any level of the path is missing, so every
      # row has both accuracy columns. data.frame() silently drops NULL columns
      # rather than raising an error, which bypassed the earlier tryCatch fallback.
      data.frame(
        Url = .x,
        WhiteAccuracy = pluck(json_data, "data", "analysis", "CAPS", "white", "all", .default = NA_real_),
        BlackAccuracy = pluck(json_data, "data", "analysis", "CAPS", "black", "all", .default = NA_real_),
        stringsAsFactors = FALSE
      )
    })
    
    # cbind() pairs rows purely by position.
    # NOTE(review): this assumes the PGN export and the JSON listing return the
    # same games in the same order for the chosen date; confirm before relying
    # on the pairing. The check below only guards the row counts.
    stopifnot(nrow(result) == nrow(df))
    final <- cbind(result, df)
    

    Created on 2021-02-15 by the reprex package (v0.3.0)