Search code examples
r web-scraping rselenium

Creating "for" loop when webscraping with Rselenium and saving as dataframe


I have been playing around with RSelenium and web scraping from a list of URLs. Naturally, I want to combine the data from each URL I scrape into a single dataframe. When I do that, the dataframe that is returned has the data, along with miscellaneous things such as "checkStatus", "statusClass", etc. It's quite difficult to explain, but I hope the code below shows it better.

# Browser session used by everything below: Chrome, driven through the
# remote server listening on localhost:4444.
remDr <- remoteDriver(
  browserName = "chrome",
  remoteServerAddr = "localhost",
  port = 4444
)
remDr$open()

# Player stats pages to scrape.
URL_list <- c(
  "https://www.premierleague.com/players/4183/Ahmed-El-Mohamady/stats?co=1&se=363"
)

# Webscrape function
#
# Reads every stat from the page currently loaded in the global `remDr`
# session and assembles them into a one-row data frame (returned invisibly,
# because the last expression is an assignment).
#
# link_element: never read -- the function only uses the globals `remDr`
#               (browser session) and `Text` (value of the Position column).
ScrapeDF <- function(link_element){
  # CSS selector of each stat, keyed by its output column name and listed in
  # the order in which the page is queried.
  stat_css <- c(
    # General stats
    "Appearance" = ".statappearances",
    "Wins" = ".statwins",
    "Losses" = ".statlosses",
    # Defence stats
    "CleanSheet" = ".statclean_sheet",
    "Conceded" = ".statgoals_conceded",
    "Tackles" = ".stattotal_tackle",
    "SuccessTackle" = ".stattackle_success",
    "LastManTackle" = ".statlast_man_tackle",
    "BlockedShots" = ".statblocked_scoring_att",
    "Interceptions" = ".statinterception",
    "Clearances" = ".stattotal_clearance",
    "HeadedClearance" = ".stateffective_head_clearance",
    "OffLineClearance" = ".statclearance_off_line",
    "Recoveries" = ".statball_recovery",
    "DuelsWon" = ".statduel_won",
    "DuelsLost" = ".statduel_lost",
    "Successful50_50" = ".statwon_contest",
    "AerialWon" = ".stataerial_won",
    "AerialLost" = ".stataerial_lost",
    "OwnGoal" = ".statown_goals",
    "ErrorsToGoal" = ".staterror_lead_to_goal",
    # Team play stats
    "Assist" = ".statgoal_assist",
    "Passes" = ".stattotal_pass",
    "PassperMatch" = ".stattotal_pass_per_game",
    "BigChanceCreated" = ".statbig_chance_created",
    "Crosses" = ".stattotal_cross",
    "CrossAcc" = ".statcross_accuracy",
    "ThroughBall" = ".stattotal_through_ball",
    "AccLongBall" = ".stataccurate_long_balls",
    # Discipline stats
    "YellowCard" = ".statyellow_card",
    "RedCard" = ".statred_card",
    "Fouls" = ".statfouls",
    "Offside" = ".stattotal_offside",
    # Attack stats
    "Goals" = ".statgoals",
    "HeadedGoals" = ".statatt_hd_goal",
    "RightFootGoal" = ".statatt_rf_goal",
    "LeftFootGoal" = ".statatt_lf_goal",
    "Woodwork" = ".stathit_woodwork"
  )

  # Column layout of the result (differs from the query order above).
  column_order <- c(
    "Appearance", "Wins", "Losses",
    "Goals", "HeadedGoals", "RightFootGoal", "LeftFootGoal", "Woodwork",
    "YellowCard", "RedCard", "Fouls", "Offside",
    "Assist", "Passes", "PassperMatch", "BigChanceCreated",
    "Crosses", "CrossAcc", "ThroughBall", "AccLongBall",
    "CleanSheet", "Conceded", "Tackles", "SuccessTackle", "LastManTackle",
    "BlockedShots", "Interceptions", "Clearances", "HeadedClearance",
    "OffLineClearance", "Recoveries", "DuelsWon", "DuelsLost",
    "Successful50_50", "AerialWon", "AerialLost", "OwnGoal", "ErrorsToGoal"
  )

  # Text of the first element matching one selector.
  read_stat <- function(css) {
    node <- remDr$findElement(using = "css selector", css)
    as.character(node$getElementText())
  }

  # lapply() keeps the names of `stat_css`, so the result is a named list.
  scraped <- lapply(stat_css, read_stat)

  DF_Compiled <- do.call(
    data.frame,
    c(list("Position" = Text), scraped[column_order])
  )
}
 

## For loop to webscrape
# Accumulator, starts as an empty data frame.
CompletePlayerData <- data.frame(matrix(nrow = 0,ncol = 0))

# Visit each player page and scrape it when the player is a defender.
for (url in URL_list) {
  remDr$navigate(url)
  Sys.sleep(4)

  # The position label is looked up in the first ".info" element ...
  Position <- remDr$findElement(using = "css selector",".info")
  Text <- as.character(Position$getElementText())

  # ... and, when that is not "Defender", in the ".info" sibling after it.
  if(Text != "Defender"){
    Position <- remDr$findElement(using = "css selector",".info~ .info")
    Text <- as.character(Position$getElementText())
  }

  # NOTE(review): lapply() walks over the members of the `Position` object
  # and calls ScrapeDF once per member, rather than once per page -- this
  # looks like the source of the extra columns; confirm.
  # saved_list is left untouched when the player is not a defender.
  if(Text == "Defender"){
    saved_list <- lapply(Position, ScrapeDF)
  }

  CompletePlayerData <- bind_rows(CompletePlayerData, saved_list)
}

This returns a dataframe with about 900 columns, like this:

checkError.Position ...  checkStatus.Position ... nativeEvents.Appearance ...
 Defender          ...         Defender          ...      14                ...

So my questions here are:

  1. Why does it return so many columns?
  2. Is there a way to bind the data so that the columns are simply "Position", "Appearance", "Goals", etc.?

I apologise in advance for the long code.


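Solution

`lapply(Position, ScrapeDF)` does not call `ScrapeDF` once: `Position` is a webElement object, so `lapply` walks over its individual members (`checkError`, `checkStatus`, `nativeEvents`, …) and runs the scraper for each of them. Every member therefore contributes its own copy of the scraped columns, which is where the `checkError.Position`-style names and the roughly 900 columns come from. The code below stores each page's result in a list-column of a tibble, then unnests it and keeps the distinct rows: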

  • library(RSelenium)
    library(tidyverse)
    
    # Browser session used by everything below: Chrome, driven through the
    # remote server listening on localhost:4444.
    remDr <- remoteDriver(
      browserName = "chrome",
      remoteServerAddr = "localhost",
      port = 4444
    )
    remDr$open()

    # Player stats pages to scrape.
    URL_list <- c(
      "https://www.premierleague.com/players/4183/Ahmed-El-Mohamady/stats?co=1&se=363"
    )
    
    # Webscrape function
    #
    # Reads every stat from the page currently loaded in `driver` and returns
    # them as a one-row data frame: Position first, then the 38 stat columns.
    #
    # Arguments:
    #   link_element  Not used. Kept as the first parameter so existing calls
    #                 such as lapply(x, ScrapeDF) keep working.
    #   driver        Browser session whose findElement() is queried. Defaults
    #                 to the global `remDr`.
    #   position      Value stored in the Position column. Defaults to the
    #                 global `Text`.
    #
    # Errors if a selector matches nothing on the page (findElement() fails)
    # or if an element's text is not a single string.
    ScrapeDF <- function(link_element, driver = remDr, position = Text){
      # CSS selector of each stat, keyed by output column name and listed in
      # the column order of the result.
      stat_css <- c(
        # General stats
        "Appearance" = ".statappearances",
        "Wins" = ".statwins",
        "Losses" = ".statlosses",
        # Attack stats
        "Goals" = ".statgoals",
        "HeadedGoals" = ".statatt_hd_goal",
        "RightFootGoal" = ".statatt_rf_goal",
        "LeftFootGoal" = ".statatt_lf_goal",
        "Woodwork" = ".stathit_woodwork",
        # Discipline stats
        "YellowCard" = ".statyellow_card",
        "RedCard" = ".statred_card",
        "Fouls" = ".statfouls",
        "Offside" = ".stattotal_offside",
        # Team play stats
        "Assist" = ".statgoal_assist",
        "Passes" = ".stattotal_pass",
        "PassperMatch" = ".stattotal_pass_per_game",
        "BigChanceCreated" = ".statbig_chance_created",
        "Crosses" = ".stattotal_cross",
        "CrossAcc" = ".statcross_accuracy",
        "ThroughBall" = ".stattotal_through_ball",
        "AccLongBall" = ".stataccurate_long_balls",
        # Defence stats
        "CleanSheet" = ".statclean_sheet",
        "Conceded" = ".statgoals_conceded",
        "Tackles" = ".stattotal_tackle",
        "SuccessTackle" = ".stattackle_success",
        "LastManTackle" = ".statlast_man_tackle",
        "BlockedShots" = ".statblocked_scoring_att",
        "Interceptions" = ".statinterception",
        "Clearances" = ".stattotal_clearance",
        "HeadedClearance" = ".stateffective_head_clearance",
        "OffLineClearance" = ".statclearance_off_line",
        "Recoveries" = ".statball_recovery",
        "DuelsWon" = ".statduel_won",
        "DuelsLost" = ".statduel_lost",
        "Successful50_50" = ".statwon_contest",
        "AerialWon" = ".stataerial_won",
        "AerialLost" = ".stataerial_lost",
        "OwnGoal" = ".statown_goals",
        "ErrorsToGoal" = ".staterror_lead_to_goal"
      )

      # Text of the first element matching one selector.
      read_stat <- function(css) {
        node <- driver$findElement(using = "css selector", css)
        as.character(node$getElementText())
      }

      # vapply() keeps the names of `stat_css` and insists on exactly one
      # string per stat, so a malformed element fails here with a clear error.
      stats <- vapply(stat_css, read_stat, character(1))

      # data.frame() defaults are left alone so column types match what the
      # original call produced.
      do.call(data.frame, c(list("Position" = position), as.list(stats)))
    }
    
    ## For loop to webscrape
    # Wrapped once up front: a scrape that fails returns an empty list
    # instead of stopping the loop, and the unnest() calls below drop it.
    safe_scrape <- possibly(ScrapeDF, list())

    # One single-row tibble per URL, bound together once after the loop
    # instead of growing CompletePlayerData on every iteration.
    player_rows <- vector("list", length(URL_list))

    # Visit each player page and scrape it when the player is a defender.
    for (i in seq_along(URL_list)) {
      remDr$navigate(URL_list[[i]])
      Sys.sleep(4)  # presumably gives the page time to finish loading

      # NOTE(review): only the first ".info" element is checked here; the
      # question's loop also tried ".info~ .info" -- confirm whether that
      # fallback is still needed.
      Position <- remDr$findElement(using = "css selector", ".info")
      Text <- as.character(Position$getElementText())

      # Start empty for every URL, so a page that is not a defender neither
      # fails on a missing `saved_list` nor reuses the previous player's stats.
      saved_list <- list()

      # identical() is FALSE (rather than an error) when the text is empty.
      if (identical(Text, "Defender")) {
        # Scrape the page once. lapply(Position, ...) would run the scraper
        # again for every member of the webElement object.
        saved_list <- list(safe_scrape(Position))
      }

      player_rows[[i]] <- tibble(
        Position = list(Position),
        Text = Text,
        saved_list = list(saved_list)
      )
    }

    CompletePlayerData <- bind_rows(player_rows)

    # First unnest(): one row per scraped data frame; second unnest(): expand
    # each data frame into its stat columns.
    CompletePlayerData %>%
      select(saved_list) %>%
      unnest(saved_list) %>%
      unnest(saved_list) %>%
      distinct(Position, Goals, Appearance)
    

    Output:

    # A tibble: 1 x 3
      Position Appearance Goals
      <fct>    <fct>      <fct>
    1 Defender 14         0