Search code examples
perlparse-recdescent

Collecting data with Parse::RecDescent


I have a list of strings (30,000+) which are a collection of statements. Logically, Parse::RecDescent is the tool to use to parse the string to gather the data, but I just can't get my head round the construction of the grammar specification.

The following code is building a big list of blessed nodes, however I just can't figure out how to actually create a more useful data-structure (All I want are the Year, Vol & Iss values)

#!/usr/bin/perl

use strict;
use warnings;

use Parse::RecDescent;
use Data::Dumper;

my $string1 = '2006 - v. 1 (1-2), 2007 - v. 2 (1-4), 2008 - v. 3 (1-4), 2009 - v. 4 (1-4), 2010 - v. 5 (1-4), 2011 - v. 6 (1-2), 2012 - v. 7 (1, 4), 2013/2014 - v. 8 (1-4), 2014 - v. 9 (1, 3)';
my $string2 = 'v.35(1,2),v.36(1,2),v.33(1,2),v.34(1,2),v.39(1,2),v.37(1,2),v.38(1,2),v.43(1,2),v.42(1,2),v.41(1,2),v.40(1,2),v.22(),v.23(),v.24(),v.25(1),v.26(),v.27(),v.28(1,2),v.29(1,2),v.3(),v.2(1,2),v.1(1,2),v.30(),v.7(),v.6(),v.32(1,2),v.5(),v.4(),v.31()';
my $string3 = '1820/1825 - v. 1 (1-2), 1821/1825 - v. 2 (3-4), 1821/1826 - v. 3 (5-6), 1821 - v. 4 (7-8), 1822 - v. 5 (9-10), 1823 - v. 6 (11-12), 1823 - v. 7 (13-14), 1823 - v. 8 (15-16), 1824 - v. 9 (17-18)';

my $data = {}; # Edit: Added hash-ref to show alternate testing
my $grammar = q {
   <autotree> 
     Holdings  : Node(s /,/) 
     Node      : When(?) Volume Issue { $data->{ $item{when} } = [ $item{Vol}, $item{Iss} ] } # Edit: Action Added - This was one option I tried
     When      : Years | Year { $arg[0] = $item{When} } 
     Years     : Year '/' Year
     Year      : /\\d{4}/  { $item[1] } # Edit: Action Added - This was another option I tried
     Volume    : /v\\.\\s*/ Vol { $arg[1] = $item{Vol} } # Edit: Add commet "This was blindly flailing to work out how to get variable data
     Vol       : /\\d+/
     Issue     : /\\s*\(/ Iss ')' { $arg[2] = $item{Iss} }
     Iss       : /[\\d+\\-\\,]*/ 
     };

my $parser = Parse::RecDescent->new($grammar);

foreach my $string ( ($string1, $string2, $string3) ) {
  $string =~ s/\s+\-\s+//g;
  print "$string\n";
  my $output = $parser->Holdings($string);
  print Dumper $output;
}

On a side-note, how do I write the grammar statement so I don't need the substitution in the loop?


Solution

  • make_parser.pl:

    #!/usr/bin/perl
    
    use strict;
    use warnings;
    
    use Parse::RecDescent qw( );
    
    my $grammar = <<'__END_OF_GRAMMAR__';
       {
          use strict;
          use warnings;
       }
    
       parse    : Holdings /\Z/            { $item[1] }
    
       Holdings : Node(s /,/)             #{ $item[-1] }
    
       Node     : When /-?/ Node_          { [ $item[1], @{$item[3]} ] }
                | Node_                    { [ undef,    @{$item[1]} ] }
       Node_    : Volume Issue             { [ $item[1], $item[2] ] }
    
       When     : Year When_[ $item[1] ]  #{ $item[-1] }
       When_    : '/' Year                 { $arg[0] . '/' . $item[2] }
                |                          { $arg[0] }
       Year     : /\d{4}/                 #{ $item[-1] }
    
       Volume   : /v\.\s*/ /\d+/          #{ $item[-1] }
    
       Issue    : '(' /[^)]*/ ')'          { $item[2] =~ s/\s//gr }
    
    __END_OF_GRAMMAR__
    
    Parse::RecDescent->Precompile($grammar, 'Grammar')
       or die("Bad grammar\n");
    

    a.pl:

    #!/usr/bin/perl
    
    use strict;
    use warnings;
    
    use Grammar qw( );
    
    #$::RD_TRACE = 1;
    
    my $parser = Grammar->new();
    
    while (<DATA>) {
       chomp;
    
       my $recs = $parser->parse($_)
          or do { warn("Bad data at line $.\n");
                  next;
                };
    
       print("For: $_\n");
       for my $rec (@$recs) {
          printf("   %s | %s | %s\n",
             defined($rec->[0]) ? $rec->[0] : '[undef]',
             $rec->[1],
             $rec->[2],
          );
       }
    }
    
    __DATA__
    2006 - v. 1 (1-2), 2007 - v. 2 (1-4), 2008 - v. 3 (1-4), 2009 - v. 4 (1-4), 2010 - v. 5 (1-4), 2011 - v. 6 (1-2), 2012 - v. 7 (1, 4), 2013/2014 - v. 8 (1-4), 2014 - v. 9 (1, 3)
    v.35(1,2),v.36(1,2),v.33(1,2),v.34(1,2),v.39(1,2),v.37(1,2),v.38(1,2),v.43(1,2),v.42(1,2),v.41(1,2),v.40(1,2),v.22(),v.23(),v.24(),v.25(1),v.26(),v.27(),v.28(1,2),v.29(1,2),v.3(),v.2(1,2),v.1(1,2),v.30(),v.7(),v.6(),v.32(1,2),v.5(),v.4(),v.31()
    1820/1825 - v. 1 (1-2), 1821/1825 - v. 2 (3-4), 1821/1826 - v. 3 (5-6), 1821 - v. 4 (7-8), 1822 - v. 5 (9-10), 1823 - v. 6 (11-12), 1823 - v. 7 (13-14), 1823 - v. 8 (15-16), 1824 - v. 9 (17-18)
    

    Output:

    $ perl make_parser.pl
    
    $ perl a.pl
    For: 2006 - v. 1 (1-2), 2007 - v. 2 (1-4), 2008 - v. 3 (1-4), 2009 - v. 4 (1-4), 2010 - v. 5 (1-4), 2011 - v. 6 (1-2), 2012 - v. 7 (1, 4), 2013/2014 - v. 8 (1-4), 2014 - v. 9 (1, 3)
       2006 | 1 | 1-2
       2007 | 2 | 1-4
       2008 | 3 | 1-4
       2009 | 4 | 1-4
       2010 | 5 | 1-4
       2011 | 6 | 1-2
       2012 | 7 | 1,4
       2013/2014 | 8 | 1-4
       2014 | 9 | 1,3
    For: v.35(1,2),v.36(1,2),v.33(1,2),v.34(1,2),v.39(1,2),v.37(1,2),v.38(1,2),v.43(1,2),v.42(1,2),v.41(1,2),v.40(1,2),v.22(),v.23(),v.24(),v.25(1),v.26(),v.27(),v.28(1,2),v.29(1,2),v.3(),v.2(1,2),v.1(1,2),v.30(),v.7(),v.6(),v.32(1,2),v.5(),v.4(),v.31()
       [undef] | 35 | 1,2
       [undef] | 36 | 1,2
       [undef] | 33 | 1,2
       [undef] | 34 | 1,2
       [undef] | 39 | 1,2
       [undef] | 37 | 1,2
       [undef] | 38 | 1,2
       [undef] | 43 | 1,2
       [undef] | 42 | 1,2
       [undef] | 41 | 1,2
       [undef] | 40 | 1,2
       [undef] | 22 |
       [undef] | 23 |
       [undef] | 24 |
       [undef] | 25 | 1
       [undef] | 26 |
       [undef] | 27 |
       [undef] | 28 | 1,2
       [undef] | 29 | 1,2
       [undef] | 3 |
       [undef] | 2 | 1,2
       [undef] | 1 | 1,2
       [undef] | 30 |
       [undef] | 7 |
       [undef] | 6 |
       [undef] | 32 | 1,2
       [undef] | 5 |
       [undef] | 4 |
       [undef] | 31 |
    For: 1820/1825 - v. 1 (1-2), 1821/1825 - v. 2 (3-4), 1821/1826 - v. 3 (5-6), 1821 - v. 4 (7-8), 1822 - v. 5 (9-10), 1823 - v. 6 (11-12), 1823 - v. 7 (13-14), 1823 - v. 8 (15-16), 1824 - v. 9 (17-18)
       1820/1825 | 1 | 1-2
       1821/1825 | 2 | 3-4
       1821/1826 | 3 | 5-6
       1821 | 4 | 7-8
       1822 | 5 | 9-10
       1823 | 6 | 11-12
       1823 | 7 | 13-14
       1823 | 8 | 15-16
       1824 | 9 | 17-18