Search code examples
r web-scraping phantomjs

Download file with phantomJS R


I want to download a file using the webdriver package (PhantomJS); however, whenever I click/press Enter on the element, nothing happens. I intend to click on 'Download dos dados' and then click 'Baixar .csv' to download the file.

# Attempt to drive the page with PhantomJS (via the webdriver package):
# open the site, activate the 'Download dos dados' tab, then activate the
# 'Baixar .csv' link.
library(webdriver)

# Target page; the fragment after '#' refers to a tab on that page.
url <-'https://idesevis.dee.rs.gov.br/#tab-9023-6'

#webdriver::install_phantomjs() # If it is not installed
# Start a PhantomJS process; the port it reports is used to connect below.
pjs <- run_phantomjs()

# Open a browser session on that port, load the page and show the current URL.
ses <- Session$new(port = pjs$port)
ses$go(url)
ses$getUrl()

### Click on 'Download dos dados'
# The element is located by its absolute xpath.
# NOTE(review): sendKeys('html', key$enter) appears to type the literal text
# 'html' into the element before pressing Enter -- confirm whether only the
# Enter key (or a click) was intended.
search <- ses$findElement(xpath='/html/body/div[2]/nav/div/ul/li[6]/a')
search$sendKeys('html',key$enter)

### Finds and clicks 'Baixar .csv'
# Same approach for the CSV link: absolute xpath, then text + Enter key.
search <- ses$findElement(xpath='/html/body/div[2]/div[3]/div/div[7]/div/div[1]/form/a')
search$sendKeys('html',key$enter)
# Capture a screenshot to inspect the state of the page after the key presses.
ses$takeScreenshot()

Apparently, I'm not selecting the 'Baixar .csv' element, even when using the full XPath. I'm using webdriver because I can't use RSelenium on my notebook, and static web scraping is not suitable here, as the session code changes every time we access the main URL.


Solution

  • The CSV download, at least in this particular case of a Shiny-driven page, also works outside of the browser session. So instead of triggering the download in PhantomJS, you can extract the href from the CSV link and pass it to download.file(), httr / httr2, or curl, for example.

    # Download the CSV behind the page's download link without clicking it in
    # the browser: PhantomJS is only used to render the page and read the
    # link's session-specific href, which is then fetched with httr2.
    library(webdriver)
    library(httr2)
    library(stringr)
    library(readr)
    
    url_ <- "https://idesevis.dee.rs.gov.br/#tab-9023-6"
    
    #webdriver::install_phantomjs() # If it is not installed
    pjs <- run_phantomjs()
    
    ses <- Session$new(port = pjs$port)
    ses$go(url_)
    ses$getUrl()
    #> [1] "https://idesevis.dee.rs.gov.br/#tab-9023-6"
    
    ### Click on 'Download dos dados'
    ### (located with a CSS selector on the tab link's data-value attribute)
    ses$findElement("a[data-value='Download dos Dados']")$sendKeys('html',key$enter)
    ses$getUrl()
    #> [1] "https://idesevis.dee.rs.gov.br/#tab-9023-6"
    
    ### Wait until downloadData element is available and href is set;
    ### defaults: checkInterval = 100, timeout = 3000;
    ### find a#downloadData and get download link
    (href_ready <- ses$waitFor('document.getElementById("downloadData").getAttribute("href")'))
    #> [1] TRUE
    
    ### waitFor() reports a timeout or a JavaScript error through its return
    ### value rather than by raising an error, so check it explicitly instead
    ### of carrying on with a missing or empty link.
    if (!isTRUE(href_ready)) {
      stop("Download link 'a#downloadData' has no href yet; ",
           "increase the waitFor() timeout and retry.", call. = FALSE)
    }
    
    (csv_url <- ses$findElement("a#downloadData")$getAttribute("href"))
    #> [1] "https://idesevis.dee.rs.gov.br/session/ca0f03facf886621a9ce4a7c8d6520bc/download/downloadData?w="
    
    ### Test url and extract filename from response headers
    (resp_head <- request(csv_url) |> req_method("HEAD") |> req_perform())
    #> <httr2_response>
    #> HEAD
    #> https://idesevis.dee.rs.gov.br/session/ca0f03facf886621a9ce4a7c8d6520bc/download/downloadData?w=
    #> Status: 200 OK
    #> Content-Type: text/csv
    #> Body: Empty
    
    ### Take the part after "=" in the Content-Disposition header and strip the
    ### surrounding double quotes to get the bare file name.
    filename <- resp_header(resp_head, header = "content-disposition") |> 
      str_split_i("=", 2) |>
      str_remove_all('\\"')
    filename
    #> [1] "base_idese.csv"
    
    ### Fetch CSV (the response body is written straight to `filename`)
    request(csv_url) |> req_perform(path = filename)
    #> <httr2_response>
    #> GET
    #> https://idesevis.dee.rs.gov.br/session/ca0f03facf886621a9ce4a7c8d6520bc/download/downloadData?w=
    #> Status: 200 OK
    #> Content-Type: text/csv
    #> Body: On disk 'body'
    fs::file_info(filename)[, 1:3]
    #> # A tibble: 1 × 3
    #>   path           type         size
    #>   <fs::path>     <fct> <fs::bytes>
    #> 1 base_idese.csv file        11.8M
    
    ### Clean up. The download URL embeds the session id, so it is presumably
    ### only valid while the browser session is open; the session and the
    ### PhantomJS process are therefore released only after the file is on disk.
    invisible(ses$delete())
    invisible(pjs$process$kill())
    

    Downloaded dataset:

    # Parse the downloaded file, declaring its ISO-8859-1 encoding so that
    # accented characters are decoded correctly.
    readr::read_csv(
      file = "base_idese.csv",
      locale = readr::locale(encoding = "ISO-8859-1")
    )
    #> New names:
    #> • `` -> `...1`
    #> Rows: 114720 Columns: 7
    #> ── Column specification ────────────────────────────────────────────────────────
    #> Delimiter: ","
    #> chr (4): TIPO_UNID, COD, NOME, CATEGORIA
    #> dbl (3): ...1, ANO, VALOR
    #> 
    #> ℹ Use `spec()` to retrieve the full column specification for this data.
    #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
    #> # A tibble: 114,720 × 7
    #>     ...1 TIPO_UNID  COD     NOME            CATEGORIA                  ANO VALOR
    #>    <dbl> <chr>      <chr>   <chr>           <chr>                    <dbl> <dbl>
    #>  1     1 Municípios 4300059 Água Santa      "Bloco Renda\\Apropriaç…  2013 0.919
    #>  2     2 Municípios 4304804 Carlos Barbosa  "Bloco Renda\\Apropriaç…  2013 0.929
    #>  3     3 Municípios 4300901 Aratiba         "Bloco Renda\\Apropriaç…  2013 0.746
    #>  4     4 Municípios 4310462 Ipiranga do Sul "Bloco Renda\\Apropriaç…  2013 0.924
    #>  5     5 Municípios 4322806 Veranópolis     "Bloco Renda\\Apropriaç…  2013 0.835
    #>  6     6 Municípios 4321634 Três Arroios    "Bloco Renda\\Apropriaç…  2013 1    
    #>  7     7 Municípios 4313334 Nova Ramada     "Bloco Renda\\Apropriaç…  2013 0.741
    #>  8     8 Municípios 4304903 Casca           "Bloco Renda\\Apropriaç…  2013 0.793
    #>  9     9 Municípios 4314001 Paraí           "Bloco Renda\\Apropriaç…  2013 0.849
    #> 10    10 Municípios 4322350 União da Serra  "Bloco Renda\\Apropriaç…  2013 0.843
    #> # ℹ 114,710 more rows
    

    Created on 2023-12-27 with reprex v2.0.2