Search code examples
xml perl perl-module

Listing all the child nodes of document element


I have a very large XML file and I want to list all the child nodes of the document element. I am using the code below, which works, but it takes a very long time to process the file, and it also returns data from the document element itself, which is not required:

use XML::Simple;

# Parse the whole file into a nested hash structure.
my $parser = XML::Simple->new();
my $tree   = $parser->XMLin("sample.xml");

# Print the top-level keys of the parsed hash on one line, separated by spaces.
print join( ' ', keys %{$tree} ), "\n";

Example XML:

<?xml version="1.0" encoding="ISO-8859-15"?>
<document version="1.0" createdAt="2017-03-31T11:41:34">
   <TITLE>Computer Parts</TITLE>
   <PART001>
      <ITEM>Motherboard</ITEM>
      <MANUFACTURER>ASUS</MANUFACTURER>
      <MODEL>P3B-F</MODEL>
      <COST> 123.00</COST>
   </PART001>
   <PART002>
      <ITEM>Video Card</ITEM>
      <MANUFACTURER>ATI</MANUFACTURER>
      <MODEL>All-in-Wonder Pro</MODEL>
      <COST> 160.00</COST>
   </PART002>
   <PART003>
      <ITEM>Sound Card</ITEM>
      <MANUFACTURER>Creative Labs</MANUFACTURER>
      <MODEL>Sound Blaster Live</MODEL>
      <COST> 80.00</COST>
   </PART003>
   <PART004>
      <ITEM>14 inch Monitor</ITEM>
      <MANUFACTURER>LG Electronics</MANUFACTURER>
      <MODEL> 995E</MODEL>
      <COST> 290.00</COST>
   </PART004>
</document>

Expected Output: TITLE, PART001, PART002, PART003, PART004

Can anyone please suggest a faster and better way to get the required output?


Solution

  • By using XML::LibXML and XPath.

    use 5.014;
    use warnings;
    use XML::LibXML;
    
    # Path of the XML file to read.
    my $file = 'xml';
    # Parse the complete file into an in-memory DOM tree.
    my $dom = XML::LibXML->load_xml(location => $file);
    
    # The absolute path /document/* selects only the direct children of the
    # root <document> element. A leading // would instead search every level
    # of the tree for elements named "document", which is slower on a large
    # file and would also pick up children of any nested <document>.
    for my $child ($dom->findnodes( q{/document/*} )) {
        say $child->nodeName();
    }
    

    output

    TITLE
    PART001
    PART002
    PART003
    PART004
    

    or, in case you need only the PARTs:

    # Select only the direct children of the root <document> whose names
    # begin with "PART". Anchoring the path and using starts-with() avoids
    # matching deeper elements, or names that merely contain "PART" somewhere.
    # Relies on $dom from the previous snippet.
    for my $part ($dom->findnodes( q{/document/*[starts-with(name(),'PART')]} )) {
        say $part->nodeName();
    }
    

    output

    PART001
    PART002
    PART003
    PART004
    

    EDIT: Using the pull parser (which doesn't load the whole XML into memory):

    use 5.014;
    use warnings;
    use XML::LibXML::Reader qw(XML_READER_TYPE_ELEMENT);
    
    my $file="xml";    
    my $reader = XML::LibXML::Reader->new(location => $file) or die "problem $!";
    
    # read() and next() both return 1 while a node was read, 0 at the end of
    # the input and -1 on error, so the loop runs only while the status is 1.
    my $status = $reader->read();
    while ($status == 1) {
            if( $reader->depth == 1 && $reader->nodeType == XML_READER_TYPE_ELEMENT ) {
                    say $reader->name;
                    # next() moves past the current element without stopping
                    # on each node of its subtree; only depth-1 elements are
                    # of interest, so there is no need to visit descendants.
                    $status = $reader->next();
            }
            else {
                    $status = $reader->read();
            }
    }
    die "problem reading $file\n" if $status < 0;
    
    TITLE
    PART001
    PART002
    PART003
    PART004
    

    EDIT2: The same pull parser, printing only the level-1 elements found inside the <document> element:

    use 5.014;
    use warnings;
    
    use XML::LibXML::Reader qw(XML_READER_TYPE_ELEMENT);
    my $file="xml";
    
    my $reader = XML::LibXML::Reader->new(location => $file) or die "problem $!";
    # True while the reader is positioned inside the root <document> element.
    my $indoc;
    while($reader->read()) {
            # Set the flag if you're inside the <document>: it becomes true on
            # the start tag and false on the end tag. The depth == 0 check
            # limits this to the root element, so a nested element that is
            # also named "document" cannot switch the flag off early.
            if( $reader->depth == 0 && $reader->name eq 'document' ) {
                    $indoc = $reader->nodeType == XML_READER_TYPE_ELEMENT ? 1 : 0;
            }
            # all elements at level 1 if they're inside of the <document>
            if( $indoc &&  $reader->depth == 1 && $reader->nodeType == XML_READER_TYPE_ELEMENT ) {
                    say $reader->name;
            }
    }