I am looking to import the following XML document into data frames: http://opensource.adobe.com/Spry/data/donuts.xml
There should be 3 data frames created:
(Data does not need to be 3NF - i.e. each batter can be repeated for each item it is listed against)
Using the XML2 package, I have so far used the following code to import the XML and convert it to a nested list:
library(xml2)
xmlobj <- read_xml("http://opensource.adobe.com/Spry/data/donuts.xml")
ls1 <- as_list(xmlobj) #Converts XML to a nested list
I am now looking to parse / flatten the list into the 3 data frames as described above.
How best to achieve this? Is it through a series of loops (lapply/map), passing objects into vectors an then loading the data frame? Or should I avoid using XML2 / Lists altogether and use the XML package and achieve this using an XPath type syntax?
I tried the following and could pull out the Item attributes and elements for a single item, but when I tried to lapply the function it crashed:
#Function for pulling out item attributes from list
ItemDF <- function(myItem){
#Gather Item data into DF including attributes
itemFrame <- data_frame(
id = attr(myItem$item,'id'),
type = attr(myItem$item,'type'),
name = unlist(myItem$item$name),
ppu = unlist(myItem$item$ppu)
)
return(itemFrame)
}
#Single instance
df1 <- ItemDF(ls1$items[1])
df1
#Lapply across all items throws an error
lapply(ls1$items,ItemDF)
(NB This data set is a proof of concept, so I am looking for a method I can then adapt for other XML files that I expect to be working on).
library(xml2)
library( tidyverse )
xmlobj <- read_xml("http://opensource.adobe.com/Spry/data/donuts.xml")
df_items <- data.frame(
id = xml_find_all( xmlobj, ".//item" ) %>% xml_attr( "id" ),
type = xml_find_all( xmlobj, ".//item" ) %>% xml_attr( "type" ),
name = xml_find_all( xmlobj, ".//item/name" ) %>% xml_text(),
ppu = xml_find_all( xmlobj, ".//item/ppu" ) %>% xml_text(),
stringsAsFactors = FALSE )
# id type name ppu
# 1 0001 donut Cake 0.55
# 2 0002 donut Raised 0.55
# 3 0003 donut Buttermilk 0.55
# 4 0004 bar Bar 0.75
# 5 0005 twist Twist 0.65
# 6 0006 filled Filled 0.75
df_batters <- xml_find_all( xmlobj, ".//item" ) %>%
map_df(~{
set_names(
xml_find_all(.x, ".//batters/batter") %>% xml_attr( "id" ),
xml_find_all(.x, ".//batters/batter") %>% xml_text()
) %>%
as.list() %>%
flatten_df() %>%
mutate(itemID = xml_attr(.x, "id" ) )
}) %>%
type_convert() %>%
gather( batter, batterID, -itemID, na.rm = TRUE) %>%
select( batterID, batter, itemID )
# # A tibble: 10 x 3
# batterID batter itemID
# * <int> <chr> <chr>
# 1 1001 Regular 0001
# 2 1001 Regular 0002
# 3 1001 Regular 0003
# 4 1001 Regular 0004
# 5 1001 Regular 0005
# 6 1001 Regular 0006
# 7 1002 Chocolate 0001
# 8 1002 Chocolate 0003
# 9 1003 Blueberry 0001
# 10 1003 Devil's Food 0001
df_toppings <- xml_find_all( xmlobj, ".//item" ) %>%
map_df(~{
set_names(
xml_find_all(.x, ".//topping") %>% xml_attr( "id" ),
xml_find_all(.x, ".//topping") %>% xml_text()
) %>%
as.list() %>%
flatten_df() %>%
mutate(itemID = xml_attr(.x, "id" ) )
}) %>%
type_convert() %>%
gather( topping, toppingID, -itemID, na.rm = TRUE) %>%
select( toppingID, topping, itemID )
# # A tibble: 20 x 3
# toppingID topping itemID
# * <int> <chr> <chr>
# 1 5001 None 0001
# 2 5001 None 0002
# 3 5002 Glazed 0001
# 4 5002 Glazed 0002
# 5 5002 Glazed 0005
# 6 5002 Glazed 0006
# 7 5005 Sugar 0001
# 8 5005 Sugar 0002
# 9 5005 Sugar 0005
# 10 5007 Powdered Sugar 0001
# 11 5007 Powdered Sugar 0006
# 12 5006 Chocolate with Sprinkles 0001
# 13 5003 Chocolate 0001
# 14 5003 Chocolate 0002
# 15 5003 Chocolate 0004
# 16 5003 Chocolate 0006
# 17 5004 Maple 0001
# 18 5004 Maple 0002
# 19 5004 Maple 0004
# 20 5004 Maple 0006