Search code examples
r, rselenium, doparallel

Run RSelenium in parallel


How would I go about running RSelenium in parallel?

The following is an example using rvest in parallel:

library(RSelenium)
library(rvest)
library(magrittr)
library(foreach)
library(doParallel)

# Fetch several URLs in parallel with rvest and return each page as text.

URLsPar <- c("http://www.example.com/", "http://s5.tinypic.com/n392s6_th.jpg", "http://s5.tinypic.com/jl1jex_th.jpg",
         "http://s6.tinypic.com/16abj1s_th.jpg", "http://s6.tinypic.com/2ymvpqa_th.jpg")

# Leave one core free, but always start at least one worker: on a single-core
# machine (or when detectCores() returns NA) `detectCores() - 1` is not a
# usable cluster size.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)

# Keep a handle on the cluster so it can be shut down explicitly afterwards.
cl <- makeCluster(n_workers)
registerDoParallel(cl)

# seq_along() is safe for an empty vector, unlike 1:length().
ws <- foreach(
  x = seq_along(URLsPar),
  .packages = c("rvest", "magrittr", "RSelenium")
) %dopar% {
  URLsPar[x] %>% read_html %>% as("character")
}

# stopImplicitCluster() only shuts down a cluster that registerDoParallel()
# created on its own; a cluster built with makeCluster() has to be stopped
# with stopCluster(), otherwise the worker processes are left running.
stopCluster(cl)

Solution

  • On each node in the cluster start a remoteDriver:

    library(RSelenium)
    library(rvest)
    library(magrittr)
    library(foreach)
    library(doParallel)
    
    # Collect page titles in parallel: one Selenium-driven browser per worker.

    URLsPar <- c("http://www.bbc.com/", "http://www.cnn.com", "http://www.google.com",
                 "http://www.yahoo.com", "http://www.twitter.com")

    # Start a Selenium Server for the remote drivers to connect to.
    selServ <- startServer()

    # Leave one core free, but always start at least one worker: on a
    # single-core machine (or when detectCores() returns NA)
    # `detectCores() - 1` is not a usable cluster size.
    n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)

    # Keep a handle on the cluster so it can be addressed and stopped later.
    cl <- makeCluster(n_workers)
    registerDoParallel(cl)

    # Open a remoteDriver on each node of the cluster. The `remDr` object is
    # created on the worker itself and is what the foreach body below uses.
    clusterEvalQ(cl, {
      library(RSelenium)
      remDr <- remoteDriver()
      remDr$open()
    })

    # Run the navigation inside tryCatch() so that the browsers, the workers
    # and the Selenium Server are released even if a page fails to load.
    ws <- tryCatch(
      foreach(
        x = seq_along(URLsPar),  # safe for an empty vector, unlike 1:length()
        .packages = c("rvest", "magrittr", "RSelenium")
      ) %dopar% {
        remDr$navigate(URLsPar[x])
        remDr$getTitle()[[1]]
      },
      finally = {
        # Close the browser on each node.
        clusterEvalQ(cl, {
          remDr$close()
        })

        # stopImplicitCluster() only shuts down a cluster that
        # registerDoParallel() created on its own; a cluster built with
        # makeCluster() has to be stopped with stopCluster().
        stopCluster(cl)

        # Stop the Selenium Server.
        selServ$stop()
      }
    )
    
    > ws
    [[1]]
    [1] "BBC - Homepage"
    
    [[2]]
    [1] "CNN - Breaking News, U.S., World, Weather, Entertainment & Video News"
    
    [[3]]
    [1] "Google"
    
    [[4]]
    [1] "Yahoo"
    
    [[5]]
    [1] "Welcome to Twitter - Login or Sign up"