Search code examples
perlparse-recdescent

Parse::RecDescent performance issue


I'm using Parse::RecDescent to parse lines in a Cisco IOS ACL. The ACL is used on the edge router of a large network, so it contains almost 8k lines which are set by the government. I'm looping through each of those lines and putting the values into a hash. Although it is 8k lines I'm still spending over 14 seconds parsing the lines? Does this sound reasonable? It seems VERY slow to me. Is there some overhead to using a hash verse another data structure?

Sample Input: (about 8k of these or similar)

deny   ip 2.3.4.5 0.0.0.7 any log-input
deny   ip 5.6.7.8 0.0.0.255 any log-input
deny   ip host 9.10.11.12 any log-input
deny   ip 13.14.15.16 0.0.31.255 any log-input
permit tcp host 17.18.19.20 host 21.22.23.24 eq bgp
permit icmp 25.26.0.0 0.0.255.255 27.28.0.0 0.0.255.255

Here is my entire parser:

package AccessList::Parser;

use strict;
use warnings;
use Carp;
use Scalar::Util 'blessed';
use Parse::RecDescent;

our $VERSION = '0.05';

sub new {
    my ($class) = @_;
    my $self = { PARSER => undef, };
    bless $self, $class;
    $self->_init();
    return $self;
}

sub _init {
    my ($self) = @_;
    $self->{PARSER} = Parse::RecDescent->new( $self->_grammar() );
}

sub parse {
    my ( $self, $string ) = @_;
    defined ($string) or confess "blank line received";
    my $tree = $self->{PARSER}->startrule($string);
    defined($tree) or confess "unrecognized line\n";
    return visit($tree);
}

#
# Finished tests
#

sub visit {
    my ($node) = @_;

    my $Rule_To_Key_Map = {
        "acl_action"              => 1,
        "acl_protocol"            => 1,
        "acl_src_ip"              => 1,
        "acl_src_port"            => 1,
        "acl_dst_ip"              => 1,
        "acl_dst_port"            => 1,
        "acl_remark"              => 1
    };

    my $parent_key;
    my $result;

    # set s of explored vertices
    my %seen;

    #stack is all neighbors of s
    my @stack;
    push @stack, [ $node, $parent_key ];

    my $key;

    while (@stack) {

        my $rec = pop @stack;

        $node       = $rec->[0];
        $parent_key = $rec->[1];    #undef for root

        next if ( $seen{$node}++ );

        my $rule_id = ref($node);

        if ( exists( $Rule_To_Key_Map->{$rule_id} ) ) {
            $parent_key = $rule_id;
        }

        foreach my $key ( keys %$node ) {
            next if ( $key eq "EOL" );
            my $next = $node->{$key};
            if ( blessed($next) ) {
                if ( exists( $next->{__VALUE__} ) ) {
                    #print ref($node), " ", ref($next), " ", $next->{__VALUE__},"\n";
                    my $rule  = ref($node);
                    my $token = $next->{__VALUE__};
                    $result->{$parent_key} = $token;
                    #print $rule, " ", $result->{$rule}, "\n";
                }
                push @stack, [ $next, $parent_key ];
                #push @stack, $next;
            }
        }
    }
    return $result;
}

sub _grammar {
    my ($self) = @_;

    my $grammar = q{
<autotree>

startrule :
        access_list EOL
    |   acl_remark EOL
    |   <error>

#
# access-lists
#

access_list : acl_action

acl_remark :
        "remark" REMARKS

acl_action :
        ACTIONS acl_protocol

#
# protocol options
#

acl_protocol :
        PROTOCOL acl_src_ip

#
# access-list source IP addresses
#

acl_src_ip :
        address acl_dst_ip
    |   address acl_src_port

#
# access-list source ports
#

acl_src_port : 
        port acl_dst_ip

#
# access-list destination IP address
#

acl_dst_ip :
        address acl_dst_port
    |   address acl_options
    | address CONNECTION_TYPE
    | address LAYER3_OPTIONS
    | IPRANGE

#
# access-list destination ports
#

acl_dst_port : 
        port acl_options
    |   acl_icmp_type acl_options

#
# icmp_types
#

acl_icmp_type :
       ICMP_TYPE

#
# access-list options
#

acl_options :
      acl_logging LAYER3_OPTIONS
    |   acl_logging
    |   EOL
    |   <error>

acl_logging :
            "log-input"
    |       "log"

#
# IP address types
#
# "object" should be fine here because "object" can not  
# be used to specify ports 

address :
        "host" IPADDRESS
    |   "host" NAME
    |   IPNETWORK
    | WILDCARD_NETWORK
    |   ANY


#
# port types
#

port :
        port_eq
    |   port_range
    |   port_gt
    |   port_lt
    |   port_neq

port_eq :
    "eq" PORT_ID

port_range :
    "range" PORT_RANGE

port_gt :
    "gt" PORT_GT

port_lt :
    "lt" PORT_LT

port_neq :
    "neq" <error: neq is unsupported>

#
# Token Definitions
#

STRING :
        /\S+/

DIGIT :
        /\d+/

NAME :
        /((^|\s[a-zA-Z])(\.|[0-9a-zA-Z_-]+)+)/

RULE_REF :
        /\S+/

ANY:
        "any"

IPADDRESS :
        /((\d{1,3})((\.)(\d{1,3})){3})/

MASK :
        /(((255\.){3}(255|254|252|248|240|224|192|128|0+))|((255\.){2}(255|254|252|248|240|224|192|128|0+)\.0)|((255\.)(255|254|252|248|240|224|192|128|0+)(\.0+){2})|((255|254|252|248|240|224|192|128|0+)(\.0+){3}))/

INVERSE_MASK :
        /(0+|1|3|7|15|31|63|127|255)((\.)(255|127|63|31|15|7|3|1|0)){3}/

WILDCARD_NETWORK :
        /((\d{1,3})((\.)(\d{1,3})){3}) (0+|1|3|7|15|31|63|127|255)((\.)(255|127|63|31|15|7|3|1|0)){3}/

IPNETWORK :
        /((\d{1,3})((\.)(\d{1,3})){3}) (((255\.){3}(255|254|252|248|240|224|192|128|0+))|((255\.){2}(255|254|252|248|240|224|192|128|0+)\.0)|((255\.)(255|254|252|248|240|224|192|128|0+)(\.0+){2})|((255|254|252|248|240|224|192|128|0+)(\.0+){3}))/

IPRANGE :
        /((\d{1,3})((\.)(\d{1,3})){3}) ((\d{1,3})((\.)(\d{1,3})){3})/

PROTOCOL :
        /\d+/ | "ahp" | "eigrp" | "esp" | "gre" | "icmp" | "icmp6" | "igmp" 
    | "igrp" | "ip" | "ipinip" | "ipsec" | "nos" | "ospf" | "pcp" 
    | "pim" | "pptp" | "snp" | "tcp" | "udp" | "41" 

GROUP_PROTOCOL :
        "tcp-udp" | "tcp" | "udp"

ICMP_TYPE : 
        /\d+/ | "alternate-address" | "conversion-error" | "echo-reply" | "echo"
    | "information-reply" | "information-request" | "mask-reply" | "mask-request"
    | "mobile-redirect" | "parameter-problem" | "redirect" | "router-advertisement"
    | "router-solicitation" | "source-quench" | "time-exceeded" | "timestamp-reply"
    | "timestamp-request" | "traceroute" | "unreachable"

CONNECTION_TYPE:
        "established"

LAYER3_OPTIONS:
        "fragments" | "packet-too-big"

PORT_ID :
        /\S+/

PORT_GT :
        /\S+/
{
    bless {__VALUE__=>"$item[1] 65535"}, $item[0]
}

PORT_LT :
        /\S+/
{
    bless {__VALUE__=>"1 $item[1]"}, $item[0]
}

PORT_RANGE :
        /\S+ \S+/

ACTIONS :
        "permit"
    |   "deny"

REMARKS :
        /.*$/

LOG_LEVEL :
        /\d+/ | "emergencies" | "alerts" | "critical" | "errors" 
    | "warnings" | "notifications" | "informational" | "debugging"
    | "disable"

EOL :
        /$/ 
};

    return $grammar;
}

1;

Solution

  • Performance problems:

    • Factor out common prefixes (e.g. address in acl_dst_ip, IPRANGE in acl_dst_ip)
    • Remove unnecessary rules (e.g. access_list)

    Functionality problems:

    • You mistakenly treat remarkfoo as remark. Similar mistakes elsewhere.
    • You allow newlines between tokens, but that doesn't seem to be desired.
    • You only allow single spaces between some tokens when you should have a more permitting definition of whitespace.
    • The same rule treats 0.0.127.4 0.0.127.255 as "from 0.0.127.4 to 0.0.127.255" and as "from 0.0.0.0 to 0.0.127.255". (The first one found win, so its treated as "from 0.0.0.0 to 0.0.127.255".) The distinction shouldn't even be made in the parser.

    I started fixing up your code. (COMPLETELY UNTESTED)

    # make_parser.pl
    
    use strict;
    use warnings;
    
    use Parse::RecDescent qw( );
    
    my $grammar = <<'__EOI__';
    
       {
          use strict;
          use warnings;
    
          use Socket qw( inet_aton );
    
          my %protocol_names = map { $_ => 1 } qw(
             ahp   eigrp  esp     gre    icmp  icmp6  igmp
             igrp  ip     ipinip  ipsec  nos   ospf   pcp
             pim   pptp   snp     tcp    udp
          );
    
          my %protocol_group_names = map { $_ => 1 } qw(
             tcp-udp  tcp  udp
          );
    
          my %icmp_type_names = map { $_ => 1 } qw(
             alternate-address    conversion-error     echo-reply     echo
             information-reply    information-request  mask-reply     mask-request
             mobile-redirect      parameter-problem    redirect       router-advertisement
             router-solicitation  source-quench        time-exceeded  timestamp-reply
             timestamp-request    traceroute           unreachable
          );
    
          sub parse_ipv4_addr {
             my ($addr) = @_;
             return inet_aton($addr);
          }
       }
    
       parse            : <skip: qr/[ \t]*/> line(s) /\Z/ { $item[2] }
    
       line             : line_body /\n|\Z/ { $item[1] }
    
       line_body        : PERMIT <commit> permit_deny_args { [ $item[1], $item[3] ] }
                        | DENY   <commit> permit_deny_args { [ $item[1], $item[3] ] }
                        | REMARK <commit> /[^\n]*/         { 0 }
                        | /[ \t]+/                         { 0 }
    
       permit_deny_args : protocol permit_deny_src permit_deny_dst { [ @item[1,2,3] ] }
    
       permit_deny_src  : addrs ports { [ @item[1, 2] ] }
    
       permit_deny_dst  : ...
    
       addrs            : HOST      <commit> ( IPv4_ADDR | DOMAIN ) { [ host  => $item[3]           ] }
                        | IPv4_ADDR <commit> IPv4_ADDR              { [ range => $item[1], $item[3] ] }
                        | ANY       <commit>                        { [ any   =>                    ] }
    
       ports            : EQ    <commit> IDENT       { [ permit => $item[2], $item[2] ] }
                        | NEQ   <commit> IDENT       { [ deny   => $item[2], $item[2] ] }
                        | GT    <commit> IDENT       { [ deny   => 1,        $item[2] ] }
                        | LT    <commit> IDENT       { [ deny   => $item[2], 65535    ] }
                        | RANGE <commit> IDENT IDENT { [ permit => $item[2], $item[3] ] }
                        |                            { [ permit => 1,        65535    ] }
    
    
       # Rules that match simply return what they match (i.e. no type info is returned).
    
       PROTOCOL_NAME    : IDENT { $protocol_names{$item[1]} ? $item[1] : undef }
    
       DOMAIN           : ...
    
       IPv4_ADDR        : /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/ { parse_ipv4_addr($item[1]) }
    
       # Keywords
       REMARK           : IDENT { $item[1] eq 'remark' ? $item[1] : undef }
       PERMIT           : IDENT { $item[1] eq 'permit' ? $item[1] : undef }
       DENY             : IDENT { $item[1] eq 'deny'   ? $item[1] : undef }
       ANY              : IDENT { $item[1] eq 'any'    ? $item[1] : undef }
       EQ               : IDENT { $item[1] eq 'eq'     ? $item[1] : undef }
       NEQ              : IDENT { $item[1] eq 'neq'    ? $item[1] : undef }
       LT               : IDENT { $item[1] eq 'lt'     ? $item[1] : undef }
       GT               : IDENT { $item[1] eq 'gt'     ? $item[1] : undef }
    
       IDENT            : /[a-zA-Z][a-zA-Z0-9_]*/
    
    __EOI__
    
    Parse::RecDescent->Precompile($grammar, 'Parser')
        or die("Bad grammar\n");
    

    Run the above file, then you can use the parse as follows:

    # test.pl
    
    use strict;
    use warnings;
    
    use Data::Dumper qw( Dumper );
    use Parser       qw( );
    
    my $text = '...';
    
    my $parser = Parser->new();
    
    print(Dumper($parser->parse($text)));