Web Scraping from B3/BM&F Bovespa

I am trying to download some data from the BM&FBOVESPA reference rates page.

Their web page is:

http://www.b3.com.br/en_us/market-data-and-indices/data-services/market-data/reports/derivatives-market/reference-prices/bm-fbovespa-reference-rates/

and the frame that holds the data is:

http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-ptBR.asp

Here is my code, which fails with the error `Error in out[j + k, ] : subscript out of bounds`:

# Scrape the first <table> of the BM&F reference-rates frame into a data frame.
# NOTE(review): read_html(), html_nodes(), html_table() and %>% are not
# defined in this snippet -- presumably library(rvest) was loaded beforehand;
# confirm.

# Address of the frame (pt-BR version) that contains the data
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-ptBR.asp'

# Download the page and parse its HTML
site <- read_html(url)

# Select every <table> node and convert each one with html_table();
# the result is a list with one entry per table found
lista_tabela <- site %>%
  html_nodes("table") %>%
  html_table(fill = TRUE) 

# Keep the first entry of that list as the data frame
CurvaDI <- lista_tabela[[1]]

I have not been able to fix this error. All I want is to download the table from their website and save it as a data frame.

I am also trying to download several periods with a single script. I would be happy if somebody could help!

Thanks a lot!

Solution

It seems the HTML is intentionally malformed in the raw source, so you will have to restructure it before parsing the table. The following uses a series of regular expressions to make the table parsable:

library(rvest)
library(httr)
library(stringr)

# Download the reference-rates frame (en-US version), repair its markup and
# print the first table it contains.
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-enUS.asp'

# Fetch the page and fail right away on an HTTP error, instead of trying to
# parse an error page further down.
response <- GET(url)
stop_for_status(response)
html <- content(response, as = "raw") %>% rawToChar(.)

# Repair the markup before parsing. Each pattern matches literal CR LF pairs:
#   1. a "</tr>" followed by a blank line and a second "</tr>" becomes
#      "</tr>" ... "<tr>", i.e. the second closing tag is turned into an
#      opening tag;
#   2. the <thead> / </thead> tags are removed;
#   3. a "</tr>" followed by a blank line and "<th" becomes "</tr><tr>".
#      NOTE(review): this replacement also consumes the "<th" characters
#      themselves -- kept as is; confirm against the live markup.
html <- str_replace_all(html, '(</tr>\r\n\r\n</tr>)', '</tr>\r\n\r\n<tr>')
html <- str_replace_all(html, '(<thead>|</thead>)', '')
html <- str_replace_all(html, '(</tr>\r\n\r\n<th)', '</tr><tr>')

# One list entry per <table> found in the repaired document.
data <- html %>% read_html() %>% html_table(fill = TRUE)
if (length(data) == 0) {
    stop("No <table> found in the response from ", url, call. = FALSE)
}

# Drop the first row of the first table (presumably a leftover header row --
# confirm); the sample output accordingly starts at row label 2.
dataframe <- tail(data[[1]], -1)

print(dataframe)

which gives:

    Calendar Days ID x fixed rate ID x fixed rate
2               1            1.90            0.00
3               7            1.90            1.55
4               8            1.90            1.70
5               9            1.90            1.81
6              13            1.91            1.67
7              14            1.91            1.75
8              21            1.91            1.81
9              23            1.91            1.89
10             24            1.91            1.93
11             28            1.91            1.75
12             30            1.91            1.82
13             34            1.92            1.77
14             41            1.93            1.82
15             43            1.94            1.87
16             52            1.95            1.93
.................................................

To submit the form data, you can build the POST request with the specific option and date formats. The following reads the available options, prompts the user to select one, and then fetches the data:

library(rvest)
library(httr)
library(stringr)

# Reference date that is later sent with the form request.
date <- as.Date("2020-10-07")

url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-enUS.asp'

# Fetch the initial page and fail right away on an HTTP error, instead of
# handing an error page to the parser.
response <- GET(url)
stop_for_status(response)
html <- content(response, as = "raw") %>% rawToChar(.)

# Repair the markup of a downloaded page and extract its first table.
#
# Arguments:
#   html  Character vector holding the raw page source; only its first
#         element is used after the first replacement.
#
# Returns a list with two elements:
#   data  The first table of the page without its first row, or NULL when
#         the page contains no table.
#   body  The parsed document, so callers can query other nodes from it.
getData <- function(html){
    # Turn a "</tr>" + blank line + "</tr>" sequence into "</tr>" ... "<tr>".
    rowsFixed <- str_replace_all(html, '(</tr>\r\n\r\n</tr>)', '</tr>\r\n\r\n<tr>')[[1]]
    # Remove the <thead> / </thead> tags.
    headStripped <- str_replace_all(rowsFixed, '(<thead>|</thead>)', '')
    # Replace "</tr>" + blank line + "<th" by "</tr><tr>".
    repaired <- str_replace_all(headStripped, '(</tr>\r\n\r\n<th)', '</tr><tr>')

    body <- read_html(repaired)
    tables <- html_table(body, fill = TRUE)

    # Guard clause: no table on the page, hand back the parsed body only.
    if (length(tables) == 0){
        return(list(data = NULL, body = body))
    }
    list(data = tail(tables[[1]], -1), body = body)
}

# Parse the initial page: print its table (if any) and keep the parsed body so
# the <option> entries of the form can be read from it.
res <- getData(html)
print(res[[1]])

# Read every <option> node in one pass: "key" is its value attribute, "value"
# its label with the CR LF pairs removed. The menu is printed while the list
# of (key, value) pairs is filled.
options <- res[[2]] %>% html_nodes("option")
optionKeys <- options %>% html_attr("value")
optionLabels <- str_replace_all(options %>% html_text(), '\r\n', '')
optionList <- vector("list", length(optionKeys))
for (i in seq_along(optionList)) {
    optionList[[i]] <- c(key = optionKeys[[i]], value = optionLabels[[i]])
    print(paste0("[", i, "] ", optionLabels[[i]]))
}
if (length(optionList) == 0) {
    stop("No <option> entries found on the page; nothing to select.",
         call. = FALSE)
}

# Ask for a menu index on standard input and validate it before it is used as
# a list subscript; a non-numeric, missing or out-of-range answer would
# otherwise fail with an unhelpful subscript error.
cat("Choose option by index : ")
selected <- readLines("stdin", n = 1)
selectedIndex <- suppressWarnings(as.integer(selected))
if (length(selectedIndex) != 1 || is.na(selectedIndex) ||
    selectedIndex < 1 || selectedIndex > length(optionList)) {
    stop("Invalid selection: expected an integer between 1 and ",
         length(optionList), ".", call. = FALSE)
}
selectedOption <- optionList[[selectedIndex]]
print(paste("you selected :", selectedOption["value"], sep = " "))

# The same three fields are sent both in the query string and in the form
# body, so they are computed once. The date is sent in two formats.
formDate <- format(date, format = "%m/%d/%Y")
formDateCompact <- format(date, format = "%Y%m%d")
rateKey <- selectedOption[["key"]]

postUrl <- modify_url(url,
    query = list(
        Data = formDate,
        Data1 = formDateCompact,
        slcTaxa = rateKey
    )
)

# Submit the form and fail right away on an HTTP error, instead of handing an
# error page to getData().
postResponse <- POST(postUrl, body = list(
    Data = formDate,
    Data1 = formDateCompact,
    slcTaxa = rateKey,
    nomexls = "",
    lQtdTabelas = "",
    IDIOM = 2
), encode = "form")
stop_for_status(postResponse)
html <- content(postResponse, as = "raw") %>% rawToChar(.)

# Parse the response and print the table for the selected option and date.
res <- getData(html)
print(res[[1]])