Search code examples
perl xslt xml-libxml

unable to parse xml file using registered namespace


I am using XML::LibXML to parse an XML file. There seems to be some problem with using a registered namespace while accessing the node elements. I am planning to convert this XML data into a CSV file, so I am trying to access each and every element here. To start with, I tried extracting the attribute values of the <country> and <state> tags. Below is the code I have come up with, but I am getting an error saying: XPath error : Undefined namespace prefix.

use strict;
use warnings;
use Data::Dumper;
use XML::LibXML;

my $XML=<<EOF;
<DataSet xmlns="http://www.w3schools.com" xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.w3schools.com note.xsd">
    <exec>
        <survey_region ver="1.1" type="x789" date="20160312"/>
        <survey_loc ver="1.1" type="x789" date="20160312"/>
        <note>Population survey</note>
    </exec>
    <country name="ABC" type="MALE">
        <state name="ABC_state1" result="PASS">
            <info>
                <type>literacy rate comparison</type>
            </info>
            <comment><![CDATA[
Some random text
contained here
]]></comment>
        </state>
    </country>
    <country name="XYZ" type="MALE">
        <state name="XYZ_state2" result="FAIL">
            <info>
                <type>literacy rate comparison</type>
            </info>
            <comment><![CDATA[
any random text data
]]></comment>
        </state>
    </country>
</DataSet>
EOF




# Parse the sample document held in $XML.
my $parser = XML::LibXML->new();
my $doc    = $parser->parse_string($XML);

# XPath 1.0 has no concept of a default namespace: an unprefixed name in an
# expression matches only elements that are in NO namespace. The document
# declares xmlns="http://www.w3schools.com", so all of its elements live in
# that namespace and we bind it to the prefix "x" for use in XPath.
my $xc = XML::LibXML::XPathContext->new($doc);
$xc->registerNs('x', 'http://www.w3schools.com');

# The "x" prefix is known only to $xc, so every namespaced query has to be
# evaluated through $xc, with the context node passed as the second argument.
# Calling $node->findvalue('./x:...') bypasses $xc and dies with
# "XPath error : Undefined namespace prefix".
foreach my $country ($xc->findnodes('/x:DataSet/x:country')) {

    # Unprefixed attributes are in no namespace, so '@name' needs no "x:".
    my $country_name = $xc->findvalue('@name', $country);
    my $country_type = $xc->findvalue('@type', $country);

    # <state> is a child of <country>, not of <DataSet>. Walk the states one
    # at a time: findvalue() on a multi-node result concatenates the string
    # values of all matched nodes into a single string.
    foreach my $state ($xc->findnodes('./x:state', $country)) {
        my $state_name   = $xc->findvalue('@name',   $state);
        my $state_result = $xc->findvalue('@result', $state);

        print "state_name ($state_name)\n";
        print "state_result ($state_result)\n";
        print "country_name ($country_name)\n";
        print "country_type ($country_type)\n";
    }
}

Update: if I remove the namespace declaration from the XML and change my XPath slightly, it seems to work. Can someone help me understand the difference?

# NOTE: unprefixed names such as 'DataSet' match only elements that are in NO
# namespace, so this variant finds nodes only after the xmlns="..." default
# namespace declaration has been removed from the XML.
foreach my $country ($xc->findnodes('/DataSet/country')) {
    my $country_name = $country->findvalue('@name');
    my $country_type = $country->findvalue('@type');

    # Visit each <state> separately: findvalue() on an expression that matches
    # several nodes returns their values concatenated into one string.
    foreach my $state ($country->findnodes('./state')) {
        my $state_name   = $state->findvalue('@name');
        my $state_result = $state->findvalue('@result');

        print "state_name ($state_name)\n";
        print "state_result ($state_result)\n";
        print "country_name ($country_name)\n";
        print "country_type ($country_type)\n";
    }
}

Solution

  • This would be my approach

    #!/usr/bin/perl
    
    use strict;
    use warnings;
    use XML::LibXML;
    
    my $XML=<<EOF;
    <DataSet xmlns="http://www.w3schools.com" xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.w3schools.com note.xsd">
        <exec>
            <survey_region ver="1.1" type="x789" date="20160312"/>
            <survey_loc ver="1.1" type="x789" date="20160312"/>
            <note>Population survey</note>
        </exec>
        <country name="ABC" type="MALE">
            <state name="ABC_state1" result="PASS">
                <info>
                    <type>literacy rate comparison</type>
                </info>
                <comment><![CDATA[
    Some random text
    contained here
    ]]></comment>
            </state>
        </country>
        <country name="XYZ" type="MALE">
            <state name="XYZ_state2" result="FAIL">
                <info>
                    <type>literacy rate comparison</type>
                </info>
                <comment><![CDATA[
    any random text data
    ]]></comment>
            </state>
        </country>
    </DataSet>
    EOF
    
    
    # Parse the XML text in $XML and take the document's root element as the
    # starting point for the traversal.
    my $parser = XML::LibXML->new();
    my $tree   = $parser->parse_string($XML);
    my $root   = $tree->getDocumentElement;

    # Walk country -> state -> info, printing the attributes found at each level.
    for my $country_node ($root->getElementsByTagName('country')) {
        my $country_name = $country_node->getAttribute('name');
        my $country_type = $country_node->getAttribute('type');
        print "Country Name -- $country_name\n";
        print "Country Type -- $country_type\n";

        for my $state_node ($country_node->getElementsByTagName('state')) {
            my $state_name   = $state_node->getAttribute('name');
            my $state_result = $state_node->getAttribute('result');
            print "State Name -- $state_name\n";
            print "State Result -- $state_result\n";

            for my $info_node ($state_node->getElementsByTagName('info')) {
                # Scalar assignment on purpose: the result of the lookup is
                # interpolated as a single value into the line printed below.
                my $type_text = $info_node->getElementsByTagName('type');
                print "Info --- $type_text\n";
            }
        }

        # Blank line between countries.
        print "\n";
    }
    

    Of course, you can manipulate the data any way you'd like. If you are parsing from a file, change parse_string to parse_file.

    For the individual elements in the XML, use getElementsByTagName to get the elements within the tags. This should be enough to get you going.