Search code examples
rxml-parsingfinancexml2

read_xml not recognizing the contents inside the nodes and returning charater[0]


I'm trying to read and parse an XML file and turn it into a data frame object, but every attempt returned "character[0]" at the list levels that were expected to return a string or a number. Also, when trying to debug the read_xml function, the console keeps printing (and doesn't stop when halting debug mode) the following error:

INTEGER() can only be applied to a 'integer', not a 'unknown type #29'

What might be causing the problem? Are there any other arguments I could pass to the read_xml function influence how it reads the values and strings?

This is the full script I'm running:

library(dplyr)
library (xml2)

test <- read_xml("path/XMLbulletinsample.xml", encoding = "UTF-8")

ns <- xml_ns(test)

reports <- xml_find_all(test, ".//d3:PricRpt", ns = ns)

# Initializing an empty list to store data
data_list <- list()

# Looping through each PricRpt node and extract information
for (report in reports) {
  
  # Extract trade date, security ID, and various price attributes
  trad_dt <- xml_text(xml_find_first(report, ".//d3:TradDt/Dt"))
  ticker <- xml_text(xml_find_first(report, ".//d3:SctyId/TckrSymb"))
  open_interest <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/OpnIntrst"))
  first_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/FrstPric"))
  min_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/MinPric"))
  max_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/MaxPric"))
  avg_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/TradAvrgPric"))
  last_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/LastPric"))
  
  # Add extracted data to the list
  data_list[[length(data_list) + 1]] <- data.frame(
    trad_dt = trad_dt,
    ticker = ticker,
    open_interest = as.numeric(open_interest),
    first_price = as.numeric(first_price),
    min_price = as.numeric(min_price),
    max_price = as.numeric(max_price),
    avg_price = as.numeric(avg_price),
    last_price = as.numeric(last_price),
    stringsAsFactors = FALSE
  )
}

# Combine the list into a single dataframe
df <- bind_rows(data_list)

This is a sample I took from the full XML file to facilitate handling while creating the parsing script:

<?xml version="1.0" encoding="utf-8"?>
<Document xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:bvmf.052.01.xsd bvmf.052.01.xsd" xmlns="urn:bvmf.052.01.xsd">
  <BizFileHdr>
    <Xchg>
      <BizGrpDesc>
        <Fr>
          <OrgId>
            <Id>
              <OrgId>
                <Othr>
                  <Id>BVMF</Id>
                  <Issr>40</Issr>
                  <SchmeNm>
                    <Prtry>39</Prtry>
                  </SchmeNm>
                </Othr>
              </OrgId>
            </Id>
          </OrgId>
        </Fr>
        <To>
          <OrgId>
            <Id>
              <OrgId>
                <Othr>
                  <Id>PUBLIC</Id>
                  <Issr>40</Issr>
                  <SchmeNm>
                    <Prtry>39</Prtry>
                  </SchmeNm>
                </Othr>
              </OrgId>
            </Id>
          </OrgId>
        </To>
        <BizGrpDtls>
          <BizGrpIdr>BV000471202408300001000071943367860</BizGrpIdr>
          <TtlNbOfMsg>2264</TtlNbOfMsg>
          <BizGrpTp>BVBG.187.01</BizGrpTp>
          <CreDtAndTm>2024-08-30T19:43:36</CreDtAndTm>
        </BizGrpDtls>
        <MsgTpDef>
          <MsgDefIdr>BVMF.217.01</MsgDefIdr>
          <NbOfMsg>2264</NbOfMsg>
        </MsgTpDef>
      </BizGrpDesc>
      <BizGrp>
        <AppHdr xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:iso:std:iso:20022:tech:xsd:head.001.001.01">
          <BizMsgIdr>BV000471202408300001000071956346330</BizMsgIdr>
          <MsgDefIdr>BVMF.217.01</MsgDefIdr>
          <CreDt>2024-08-30T22:56:34Z</CreDt>
          <Fr>
            <OrgId>
              <Id>
                <OrgId>
                  <Othr>
                    <Id>BVMF</Id>
                    <SchmeNm>
                      <Prtry>39</Prtry>
                    </SchmeNm>
                    <Issr>40</Issr>
                  </Othr>
                </OrgId>
              </Id>
            </OrgId>
          </Fr>
          <To>
            <OrgId>
              <Id>
                <OrgId>
                  <Othr>
                    <Id>PUBLIC</Id>
                    <SchmeNm>
                      <Prtry>39</Prtry>
                    </SchmeNm>
                    <Issr>40</Issr>
                  </Othr>
                </OrgId>
              </Id>
            </OrgId>
          </To>
        </AppHdr>
        <Document xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:bvmf.217.01.xsd">
          <PricRpt>
            <TradDt>
              <Dt>2024-08-30</Dt>
            </TradDt>
            <SctyId>
              <TckrSymb>CCMU24</TckrSymb>
            </SctyId>
            <FinInstrmId>
              <OthrId>
                <Id>200001038816</Id>
                <Tp>
                  <Prtry>8</Prtry>
                </Tp>
              </OthrId>
              <PlcOfListg>
                <MktIdrCd>BVMF</MktIdrCd>
              </PlcOfListg>
            </FinInstrmId>
            <TradDtls/>
            <FinInstrmAttrbts>
              <OpnIntrst>35614</OpnIntrst>
              <FrstPric Ccy="BRL">60.99</FrstPric>
              <MinPric Ccy="BRL">60.75</MinPric>
              <MaxPric Ccy="BRL">61.7</MaxPric>
              <TradAvrgPric Ccy="BRL">61.46</TradAvrgPric>
              <LastPric Ccy="BRL">61.7</LastPric>
              <RglrTxsQty>6697</RglrTxsQty>
              <AdjstdQt Ccy="BRL">61.61</AdjstdQt>
              <AdjstdQtStin>F</AdjstdQtStin>
              <PrvsAdjstdQt Ccy="BRL">61.1</PrvsAdjstdQt>
              <PrvsAdjstdQtStin>F</PrvsAdjstdQtStin>
            </FinInstrmAttrbts>
          </PricRpt>
        </Document>
      </BizGrp>
      </Xchg>
  </BizFileHdr>
</Document>

And this is my R, RStudio and package information:

platform       x86_64-w64-mingw32               
version.string R version 4.4.1 (2024-06-14 ucrt)
rstudio 2024.09.0+375 "Cranberry Hibiscus" Release (c8fc7aee6dc218d5687553f9041c6b1e5ea268ff, 2024-09-16)
xml2 version  xml2_1.3.6
dplyr version dplyr_1.1.4

Solution

  • You have 2 options here.
    Either add the name space to all of the subnodes, for example:

    xml_text(xml_find_first(report, ".//d3:TradDt/d3:Dt"))
    

    or use the xml_ns_strip() function and remove the need to use the namespaces in the xml_find functions.

    xml_ns_strip(test)
    reports <- xml_find_all(test, ".//PricRpt")
    xml_text(xml_find_first(reports, ".//TradDt/Dt"))