Search code examples
perlxpathxml-twig

Perl: XML-Twig get_xpath("//table") not able to get all the <table> elements


I am parsing this file with XML-Twig perl module:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE reference
  PUBLIC "-//OASIS//DTD DITA Reference//EN" "reference.dtd">
<reference xmlns:ditaarch="http://dita.oasis-open.org/architecture/2005/" id="dav1395327178563" xml:lang="en-us" ditaarch:DITAArchVersion="1.2">
  <title>Display</title>
  <shortdesc></shortdesc>
  <prolog>


  </prolog>
  <refbody>
    <table>
      <title>LPSV register</title>
      <desc>Address offset: <codeph>0x100</codeph>.</desc>
      <tgroup cols="5">
        <colspec colname="col1" colnum="1"/>
        <colspec colname="col2" colnum="2"/>
        <colspec colname="col3" colnum="3"/>
        <colspec colname="col4" colnum="4"/>
        <colspec colname="col5" colnum="5"/>
        <thead>
          <row>
            <entry colname="col1">Bits</entry>
            <entry colname="col2">Reset value</entry>
            <entry colname="col3">Access type</entry>
            <entry colname="col4">Name</entry>
            <entry colname="col5">Usage</entry>
          </row>
        </thead>
        <tbody>
          <row>
            <entry colname="col1">4:0</entry>
            <entry colname="col2">0</entry>
            <entry colname="col3">tt</entry>
            <entry colname="col4">xx</entry>
            <entry colname="col5">
              <p>Video layer input format.</p>

            </entry>
          </row>
          <row>
            <entry colname="col1">7:5</entry>
            <entry colname="col2">-</entry>
            <entry colname="col3">-</entry>
            <entry colname="col4">-</entry>
            <entry colname="col5">Reserved</entry>
          </row>

        </tbody>
      </tgroup>
    </table>
    <table>
      <title>LV_CONTROL</title>
      <desc>Address offset: <codeph>0x104</codeph>.</desc>
      <tgroup cols="5">
        <colspec colname="col1" colnum="1"/>
        <colspec colname="col2" colnum="2"/>
        <colspec colname="col3" colnum="3"/>
        <colspec colname="col4" colnum="4"/>
        <colspec colname="col5" colnum="5"/>
        <thead>
          <row>
            <entry colname="col1">Bits</entry>
            <entry colname="col2">Reset value</entry>
            <entry colname="col3">Access type</entry>
            <entry colname="col4">Name</entry>
            <entry colname="col5">Usage</entry>
          </row>
        </thead>
        <tbody>
          <row>
            <entry colname="col1">0</entry>
            <entry colname="col2">0</entry>
            <entry colname="col3">RWPU</entry>
            <entry colname="col4">EN</entry>
            <entry colname="col5">layer enable flag</entry>
          </row>
          <row>
            <entry colname="col1">3:1</entry>
            <entry colname="col2">0</entry>
            <entry colname="col3">rr</entry>
            <entry colname="col4">ss</entry>
            <entry colname="col5">layer data flow configuration</entry>
          </row>
          <row>
            <entry colname="col1">4</entry>
            <entry colname="col2">0</entry>
            <entry colname="col3">rr</entry>
            <entry colname="col4">rr</entry>
            <entry colname="col5">layer inverse gamma enable flag</entry>
          </row>
        </tbody>
      </tgroup>
    </table>
  </refbody>
</reference>

But the get_xpath("//table") only picks, first <table> element. Can anyone please help me what's wrong in my script:

use strict;
use open ':std', ':encoding(UTF-8)';    # To remove 'Wide character in print' error

#use warnings;
use XML::Twig;

my @input_xml_files = ( 'dav1395327178563.xml' ); #<input_regs/*.xml>;

our @registers;
our @registers_desc;
our @registers_offset;

my $register_count = 0;

foreach my $input_xml_file ( @input_xml_files ) {

    my $twig = XML::Twig->new(
        twig_handlers => {
            'table/title'       => \&register_name,
            #'section/dl'        => \&register_description,
            'table/desc/codeph' => \&register_offset
        },
    );
    $twig->parsefile( $input_xml_file );

    $input_xml_file =~ s/.xml//;
    $input_xml_file =~ s/input_regs\///;

    my $regs_file_name = $input_xml_file . ".regs";

    #chdir ($curdir);

    open( REGS_PPFE, ">$regs_file_name" ) || die( "Can't open the file." );

    foreach my $table ( $twig->get_xpath( "//table" ) ) { # get each <table>

        # my $header = $table->prev_sibling->text;

        my @headers;
        my $register_id = $registers[$register_count];

        $register_id =~ s/[\(\s+-]/_/g;
        $register_id =~ s/,//g;           # Remove ,

        print $regs_file_name. "\n";
        print $table. "\n\n";

        #print $register_id."\n";
        print REGS_PPFE "name=\"$register_id\" id=\"$register_id\"
\"$registers[$register_count]\"

offset=\"$registers_offset[$register_count]\"
\n\n";

################################## Bit fields ####################################################################

        my $bit_num        = 1;    # bit number
        my $bit_width_temp = 0;
        my $bit_offset     = 0;    # bit offset

        my %entries;

        my $row_count = 0;         # row counter

        my $reserved_bit_offset;
        my $reserved_bit_width;

        my $prev_bit_offset     = 0;
        my $prev_bit_width_temp = 0;
        my $prev_colon          = 0;
        my $prev_single         = 0;

        foreach my $row ( $table->get_xpath( "//row" ) ) { #foreach my $row ( $table->get_xpath("tgroup/tbody/row") ) # get each <row> of one <table>

            # my %entries;
            $row_count = $row_count + 1;

            # print $row_count, "\n";

            my @row_entries = map { $_->text =~ s/\n\s+//rg; } $row->children;  # remove 'linefeed and whitespace' (s/\n\s+//gr) /

            if ( @headers ) {

                # my $bit_width_temp;
                # my $bit_offset;
                my $bit_width;    # bit width

                @entries{@headers} = @row_entries;

                #foreach my $field (@headers)
                #{
                $entries{'Bits'} =~ s/[\[\]]//g;    # remove [] from this text

                #print "$entries{'Bits'}\n";

                if ( $entries{'Bits'} =~ /:/ ) {      # e.g. 3:2

                    $prev_bit_width_temp = $bit_width_temp;
                    $prev_bit_offset     = $bit_offset;

                    # print $reg_name," Prev prev_bit_width_temp: ", $prev_bit_width_temp, " Prev prev_bit_offset: ", $prev_bit_offset;
                    $prev_colon = 1 - $prev_single;

                    #$prev_single = 0;
                    ( $bit_width_temp, $bit_offset ) = split( ':', $entries{'Bits'} );
                    $bit_width = int( $bit_width_temp ) - int( $bit_offset ) + 1;

                    # print " Now bit_width_temp: ",$bit_width_temp, " Now bit_offset: ",$bit_offset, "\n";

                }
                else {    # e.g. 24

                    $prev_bit_offset     = $bit_offset;
                    $prev_bit_width_temp = $bit_width_temp;

                    # print $reg_name, " Prev prev_bit_width_temp: ", $prev_bit_width_temp," Prev prev_bit_offset: ", $prev_bit_offset;
                    $prev_single = 1 - $prev_colon;

                    #$prev_colon = 0;
                    ( $bit_width_temp, $bit_offset ) = ( 0, $entries{'Bits'} );
                    $bit_width = int( $bit_width_temp ) + 1;

                    # print  " Now bit_width_temp: ",$bit_width_temp," Now bit_offset: ",$bit_offset,"\n";
                }

                my $bit_reset = $entries{'Reset value'};

                print "This is binary \n" if -B $bit_reset;

                #my $bit_access = $entries{'Access'};
                #$bit_access =~ s/RAZ\/WI/ROZ/g;
                my $bit_name = $entries{'Name'};

                #$bit_name =~ s/[\(\s+-]/_/g;
                #$entries{'Usage'} =~ s/[‘’]/'/g;
                #print $bit_name;
                if ( $bit_name =~ /-/ ) {
                    print REGS_PPFE "reserved_bit_field position=\"$bit_offset\" width=\"$bit_width\" 
reset_value value=\"0x$bit_reset\" 
\n";
                }
                else {

                    print REGS_PPFE
                            "<bit_field name=\"$bit_name\" id=\"$bit_name\" position=\"$bit_offset\" width=\"$bit_width\" access=\"RW\">
<brief_description>$entries{'Name'}</brief_description>
<long_description>
<p>$entries{'Usage'}</p>
</long_description>
<reset_value value=\"0x$bit_reset\" override=\"true\"/>
</bit_field>\n";

                }

                $bit_num = $bit_num + 1;
            }
            else {

                @headers = @row_entries;
            }
        }
        print REGS_PPFE "</register_def>";
        $register_count++;
    }

    close( REGS_PPFE );

}

#print "\n\n\n\n@registers \n";
#print "\n@registers_desc \n\n\n\n\n";
#================================================== Subroutines =================================================

sub register_name {
    my ( $twig, $text_elt ) = @_;
    push @registers, $text_elt->text;
    print $text_elt ->text . "\n";
    $twig->purge;
}

sub register_description {
    my ( $twig, $text_elt ) = @_;

    #print $text_elt -> text;
    my $temp_reg_desc = $text_elt->text;

    $temp_reg_desc =~ s/\bPurpose/<p>Purpose: /i;
    $temp_reg_desc =~ s/\bUsage constraints/<\/p><p>Usage constraints: /i;
    $temp_reg_desc =~ s/\bConfigurations/<\/p><p>Configurations: /i;
    $temp_reg_desc =~ s/\bAttributes.*//;    #Remove all text after Attributes
    $temp_reg_desc = $temp_reg_desc . "</p>";

    #print $temp_reg_desc;
    push @registers_desc, $temp_reg_desc;

    $twig->purge;
}

sub register_offset {
    my ( $twig, $text_elt ) = @_;

    #print $text_elt -> text;

    push @registers_offset, $text_elt->text;

    $twig->purge;
}

Solution

  • The problem is the calls to purge at the end of each of your handler subroutines. This deletes from memory all fully-process nodes, so you are removing the whole of the first table element before you use get_xpath to search for it. There is no need to purge or flush the nodes accessed by the handlers

    You are using two different ways of processing your data: via handlers and through get_xpath on the complete twig. I suggest that you remove the handlers altogether and just access the XML through the DOM