Search code examples
r, xml, rvest, amcharts, httr

How to scrape Amcharts interactive data using R


I'm trying to retrieve data from the following link, but I've noticed that the data is displayed in an interactive Amcharts 1.1 chart. I can't find the right parameters for a GET or POST request; I think the website uses some other mechanism to load the data.

Would anyone be able to offer some insights? Your assistance would be greatly appreciated.

library(httr)
library(rvest)
library(XML)
# Product page whose price chart data we are trying to scrape
url <- "https://live.euronext.com/en/product/equities/FR0004040608-XPAR"

# Fetch and parse the HTML returned for the product page
page <- read_html(url)

class(page)
# "xml_document" "xml_node" 

# xml_child() is an xml2 function; none of the library() calls above attach
# xml2, so it is called with an explicit namespace.
xml2::xml_child(page, 2)
# {html_node}
# <body class="layout-no-sidebars path-product">
# [1] <a href="#main-content" class="visually-hidden focusable skip-link">\n   ...
# [2] <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-T ...
# ....
# [20] <script src="/sites/default/files/js/optimized/js_gcfptZrWxdKuaT414lrcjk ...


# Look for the chart container by id; the query returns no matches
page %>% html_nodes(xpath = '//*[@id="price-chart"]') %>% html_text()
# character(0)

# Text content of the <script> elements
page %>% html_nodes("script") %>% html_text()

# Text content of the <noscript> elements
page %>% html_nodes("noscript") %>% html_text()
# [1] "form.antibot * :not(.antibot-message) { display: none !important; }"
# [2] ""  

Screenshot of the dev tools after I clicked on "Max"

The response does not appear to contain the data in a readable format

The data is rendered in an SVG


Solution

  • You have successfully identified the data source. It looks a bit cryptic because it actually is encrypted (or "encrypted" — at least it is not just base64-encoded).

    However, ajax_secure_dataFilter(), the function that preprocesses the chart data, is also callable in a browser session. So if we take the lazy route and add chromote to the mix, we can point it at the site so that it first loads all the JavaScript requirements for us. We can then create a small JS helper function that fetches the JSON within a Chrome session and calls ajax_secure_dataFilter() on the fetched chart data.

    Finally, we can call that JS function from R with any chart data endpoint, get the decoded JSON as a list of lists, and wrangle it into a flat data frame.

    library(ggplot2)
    library(dplyr)
    library(chromote)
    library(stringr)
    
    # Product page; its last path segment is reused to build the chart endpoint.
    url_ <- "https://live.euronext.com/en/product/equities/FR0004040608-XPAR"
    # Chart data endpoint for the "max" range of the same product.
    chart_max <- str_glue(
      "https://live.euronext.com/en/intraday_chart/getChartData/{basename(url_)}/max"
    )
    
    # Open the product page in a Chrome session so that the site's JavaScript
    # is loaded before we evaluate anything in the page.
    b <- ChromoteSession$new()
    {
      # Navigate to url_ instead of repeating the literal, so the page and the
      # chart endpoint always refer to the same product.
      b$Page$navigate(url_)
      # Wait for the page's load event before continuing.
      b$Page$loadEventFired()
    } 
    #> $timestamp
    #> [1] 522961.4
    
    # Define a JS helper in the page: it fetches the given URL, parses the
    # response as JSON and passes it through ajax_secure_dataFilter(),
    # returning a promise for the filtered result.
    b$Runtime$evaluate("function fetch_chart_data(url) {return fetch(url).then(response => response.json()).then(json => ajax_secure_dataFilter(json, 'json', false))}")
    #> $result
    #> $result$type
    #> [1] "undefined"
    
    # Run the JS helper on the chart data URL, wait for its promise to settle
    # and pull the resolved value back into R by value.
    chart_data <- b$Runtime$evaluate(
      str_glue('fetch_chart_data("{chart_max}")'),
      awaitPromise = TRUE,
      returnByValue = TRUE
    )$result$value
    # Peek at the structure of the first three records.
    str(head(chart_data, n = 3))
    #> List of 3
    #>  $ :List of 3
    #>   ..$ time  : chr "1999-09-01 02:00"
    #>   ..$ price : num 9.7
    #>   ..$ volume: int 25810
    #>  $ :List of 3
    #>   ..$ time  : chr "1999-09-02 02:00"
    #>   ..$ price : num 9.75
    #>   ..$ volume: int 17430
    #>  $ :List of 3
    #>   ..$ time  : chr "1999-09-03 02:00"
    #>   ..$ price : num 9.85
    #>   ..$ volume: int 28270
    
    # list to frame, parse date/time
    # chart_data is already piped in as the first argument of bind_rows();
    # passing it a second time would stack the list onto itself and
    # duplicate every row.
    chart_data_df <-
      chart_data |>
      bind_rows() |>
      mutate(time = lubridate::ymd_hm(time))
    

    Result:

    chart_data_df
    #> # A tibble: 12,480 × 3
    #>    time                price volume
    #>    <dttm>              <dbl>  <dbl>
    #>  1 1999-09-01 02:00:00  9.7   25810
    #>  2 1999-09-02 02:00:00  9.75  17430
    #>  3 1999-09-03 02:00:00  9.85  28270
    #>  4 1999-09-06 02:00:00 10.1   57750
    #>  5 1999-09-07 02:00:00 10.2   29830
    #>  6 1999-09-08 02:00:00 10.0   13700
    #>  7 1999-09-09 02:00:00 10.1   17560
    #>  8 1999-09-10 02:00:00 10.2    7290
    #>  9 1999-09-13 02:00:00 10.2   10040
    #> 10 1999-09-14 02:00:00 10.2   12220
    #> # ℹ 12,470 more rows
    
    # Area chart of price over time, x-axis breaks every 5 years
    chart_data_df |>
      ggplot(aes(time, price)) +
      geom_area(aes(fill = "price")) +
      scale_x_datetime(breaks = "5 years") +
      theme_minimal()
    

    Created on 2024-05-28 with reprex v2.1.0