Search code examples
r, xml, xml2

Convert Dataframe to XML


I'm trying to convert a dataframe to xml. It is about 600K records. I'm using the XML package:

library(XML)

# Build the document one <person> tag at a time, using each row's columns as
# the tag's attributes. This row-by-row approach is the slow part on large data.
con <- xmlOutputDOM("mydata")
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# seq(nrow(mydata)) would yield c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(mydata))) {
  con$addTag("person", attrs = mydata[i, ])
}

The code above is taking too long to run. Is there a way for me to rewrite this code, or use a different package, to improve the performance?


Solution

  • library('XML')
    

    data

    # small example data frame: one integer column and one character column
    df1 <- data.frame(
      a = seq_len(7L),
      b = head(letters, 7L),
      stringsAsFactors = FALSE
    )
    

    code

    # create a new xml doc
    doc_xml <- newXMLDoc(isHTML = FALSE)

    # create the root <table> node and attach it to the document
    table_node <- newXMLNode("table", doc = doc_xml)

    # Build one <row> node per data-frame row, with one child node per column.
    # Iterate over row indices instead of apply(df1, 1, ...): apply() first
    # coerces the data frame with as.matrix(), which format()s numeric columns
    # to a common width (e.g. " 1" next to "10"), and that padding would end
    # up in the XML text.
    col_names <- names(df1)
    row_data <- lapply(seq_len(nrow(df1)), function(i) {
      cells <- lapply(col_names, function(nm) {
        newXMLNode(nm, as.character(df1[[nm]][i]))
      })
      newXMLNode("row", .children = cells)
    })

    # attach all row nodes to the table node in a single call
    addChildren(table_node, kids = row_data)

    # save as xml file
    saveXML(doc_xml, file = "df1.xml")
    

    Output

    # print the document to inspect the generated XML
    doc_xml
    # <?xml version="1.0"?>
    # <table>
    #   <row>
    #     <a>1</a>
    #     <b>a</b>
    #   </row>
    #   <row>
    #     <a>2</a>
    #     <b>b</b>
    #   </row>
    #   <row>
    #     <a>3</a>
    #     <b>c</b>
    #   </row>
    #   <row>
    #     <a>4</a>
    #     <b>d</b>
    #   </row>
    #   <row>
    #     <a>5</a>
    #     <b>e</b>
    #   </row>
    #   <row>
    #     <a>6</a>
    #     <b>f</b>
    #   </row>
    #   <row>
    #     <a>7</a>
    #     <b>g</b>
    #   </row>
    # </table>
    

    verify nodes

    # list every <a> node, then every <b> node, anywhere in the document
    getNodeSet(doc = doc_xml, path = "//a")
    getNodeSet(doc = doc_xml, path = "//b")
    

    Convert xml to dataframe

    # from the in-memory document, selecting the row nodes with an XPath query
    xmlToDataFrame(
      nodes = getNodeSet(doc = doc_xml, path = "//table/*"),
      colClasses = c("integer", "character"),
      stringsAsFactors = FALSE
    )
    # from the in-memory document object itself
    xmlToDataFrame(
      doc = doc_xml,
      colClasses = c("integer", "character"),
      stringsAsFactors = FALSE
    )
    # from the xml file saved on disk
    xmlToDataFrame(
      doc = "df1.xml",
      colClasses = c("integer", "character"),
      stringsAsFactors = FALSE
    )
    #   a b
    # 1 1 a
    # 2 2 b
    # 3 3 c
    # 4 4 d
    # 5 5 e
    # 6 6 f
    # 7 7 g