Search code examples
r rvest xml2

Why is read_xml() not reading the file as expected?


How can I avoid the tag-mismatch error when reading an HTML file? The following code reproduces the error I get from read_xml().

library(dplyr)
library(tidyr)
library(xml2)
library(rvest)
library(tibble)


# Temporary path that will receive the downloaded zip archive.
temp <- tempfile()

"http://www1.caixa.gov.br/loterias/_arquivos/loterias/D_megase.zip" %>%
    download.file(temp)

# Open "D_MEGA.HTM" from inside the archive, parse it, and keep the first
# element of html_table()'s result.
# NOTE: read_xml() is the only parsing step here, so it is the call that
# raises the "Opening and ending tag mismatch" error reported below.
megasena <- unz(temp, "D_MEGA.HTM") %>%
    read_xml() %>%
    html_table() %>%
    .[[1]]

The error returned is:

  Error in doc_parse_raw(x, encoding = encoding, base_url = base_url, as_html = as_html,  : 
      Opening and ending tag mismatch: img line 1 and p [76]

Solution

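  • Since it is indeed an HTML/.HTM file, why not read it using read_html()? read_xml() expects well-formed XML and stops at the first unclosed tag (here an <img> that is never closed), whereas read_html() uses a lenient HTML parser that tolerates such markup.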

    This works:

    library(dplyr)
    #> 
    #> Attaching package: 'dplyr'
    #> The following objects are masked from 'package:stats':
    #> 
    #>     filter, lag
    #> The following objects are masked from 'package:base':
    #> 
    #>     intersect, setdiff, setequal, union
    library(tidyr)
    library(xml2)
    library(rvest)
    library(tibble)
    
    
    # Temporary path that will receive the downloaded zip archive.
    temp <- tempfile()
    
    "http://www1.caixa.gov.br/loterias/_arquivos/loterias/D_megase.zip" %>%
      download.file(temp)
    
    # Open "D_MEGA.HTM" from inside the archive and parse it with read_html()
    # (instead of read_xml()), then keep the first element of html_table()'s
    # result.
    megasena <- unz(temp, "D_MEGA.HTM") %>%
      read_html() %>%
      html_table() %>%
      .[[1]]
    
    # Show the first rows of the extracted table.
    head(megasena)
    #>   Concurso Data Sorteio    1\xaa Dezena 2ª Dezena 3ª Dezena 4ª Dezena
    #> 1        1   11/03/1996           41         5         4        52
    #> 2        2   18/03/1996            9        39        37        49
    #> 3        3   25/03/1996           36        30        10        11
    #> 4        3   25/03/1996           36        30        10        11
    #> 5        4   01/04/1996            6        59        42        27
    #> 6        5   08/04/1996            1        19        46         6
    #>   5ª Dezena 6ª Dezena Arrecadacao_Total Ganhadores_Sena Cidade    UF
    #> 1        30        33              0,00               0  &nbsp &nbsp
    #> 2        43        41              0,00               1           PR
    #> 3        29        47              0,00               2           RN
    #> 4        29        47              0,00               2           SP
    #> 5         1         5              0,00               0  &nbsp &nbsp
    #> 6        16         2              0,00               0  &nbsp &nbsp
    #>    Rateio_Sena Ganhadores_Quina Rateio_Quina Ganhadores_Quadra
    #> 1         0,00               17    39.158,92              2016
    #> 2 2.307.162,23               65    14.424,02              4488
    #> 3   391.192,51               62    10.515,93              4261
    #> 4   391.192,51               62    10.515,93              4261
    #> 5         0,00               39    15.322,24              3311
    #> 6         0,00               98     5.318,10              5399
    #>   Rateio_Quadra Acumulado Valor_Acumulado Estimativa_Prêmio
    #> 1        330,21       SIM    1.714.650,23              0,00
    #> 2        208,91       NÃO            0,00              0,00
    #> 3        153,01       NÃO            0,00              0,00
    #> 4        153,01       NÃO            0,00              0,00
    #> 5        180,48       SIM      717.080,75              0,00
    #> 6         96,53       SIM    1.342.488,85              0,00
    #>   Acumulado_Mega_da_Virada
    #> 1                     0,00
    #> 2                     0,00
    #> 3                     0,00
    #> 4                     0,00
    #> 5                     0,00
    #> 6                     0,00
    

    Created on 2019-01-05 by the reprex package (v0.2.1)