Search code examples
perlperl-data-structures

Perl Parser using Hash


I am new to Perl and not very good with data structures. I have been working on a text parser that extracts information from a text file and stores it in a database. The regular expressions are now fine, but I just noticed that the key I use for my hash, the time, is not unique, because several updates in the text file can happen at the same time. What the hash does is create duplicates, which is unacceptable for my purpose. So I was thinking of adding another key, maybe a unique counter, but I don't know how to go about it. I tried adding another key, "{$recordcnt}", as a counter; you will see it appended to all the hash statements. I deleted the counter increment statements (maybe I didn't implement them correctly).

Also, if you look at the print block (the last section) of my code, I am trying to take an array (@nodes_and_index) that contains two columns, nodes and index, and print the two columns separately. However, it doesn't print the desired results. I was testing this on the assumption that I want to enter the data into the database separately.

So, did I put the "{$recordcnt}" in the wrong places? If so, how do I make each entry in the hash unique while still using the time? Thanks for reading.

This is an example of my data:

TIME: 11/01/13 14:30:24
FROM: 10.255.9.4 AS172193
TO: 10.255.9.10 AS676767
ASPATH: 172193 19601 14835 1286 577 4097 2841 14735 9486 573 10633 4488
NEXT_HOP: 10.255.9.126
ANNOUNCE
  10.44.193.0/24

TIME: 11/01/13 14:30:24
FROM: 10.255.9.4 AS172193
TO: 10.255.9.10 AS676767
ASPATH: 172193 19601 14835 4758 2379 10721 10787 7830 17777 4875 4488
NEXT_HOP: 10.255.9.126
ANNOUNCE
  10.44.193.0/24

TIME: 11/01/13 14:30:25
FROM: 10.255.9.4 AS172193
TO: 10.255.9.10 AS676767
ASPATH: 172193 19601 14835 4758 2379 10721 10787 7830 17777 16480 9486 573 10633 4488
NEXT_HOP: 10.255.9.126
ANNOUNCE
  10.44.193.0/24

TIME: 11/01/13 14:30:25
FROM: 10.255.9.4 AS172193
TO: 10.255.9.10 AS676767
ASPATH: 172193 19601 19602 3252 3665 2315 2379 10721 7311 12934 4875 4488
NEXT_HOP: 10.255.9.125
ANNOUNCE
  10.44.193.0/24

TIME: 11/01/13 14:30:34
FROM: 10.255.9.4 AS172193
TO: 10.255.9.10 AS676767
ASPATH: 172193 19601 19602 3252 3665 2315 2379 3725
NEXT_HOP: 10.255.9.125
ANNOUNCE
  10.44.193.0/24

This is my full code :

#!/usr/bin/perl -w
use strict;
use warnings;

# Parse every "*.hr" file found in $Dir into %hash. Records are stored as
# $hash{"<date>_<time>"}{$recordcnt}{<field>}.
my %hash;
my $Dir = "/root/updates/processed/";
# $exit is only referenced by the commented-out debugging line in the read loop.
my $exit = 0;
# Meant as a per-record counter, but nothing in this script ever increments it.
my $recordcnt = 0 ;
opendir(DIRECTORY, $Dir) or die $!;

while (my $file = readdir(DIRECTORY)) { 

    # Skip every directory entry whose name does not end in ".hr".
    unless ($file=~/\.hr$/){next;}


    # Build the path to open; from here on this inner $file hides the
    # directory-entry name declared in the while condition.
    my $file = $Dir."/".$file;
    print "$file\n";

    open (IN, $file) or die "error reading file: ", $file,"\n";

    # State of the record currently being read; reset at every TIME line.
    my $record_id = "";
    #my $recordcnt = 0 ;
    my $type = "";
    my $peer_ip = "";
    my $peer_as = "";
    my $local_ip = "";
    my $local_as = "";
    my $next_hop = "";
    my @nodes_and_index = ();
    my @withdraw_prefix = ();
    my @announce_prefix = ();



    while (<IN>) {          

        #$exit++; last if ($exit==5);

        if (/^TIME/) {

            # A TIME line starts a new record, so first store whatever was
            # collected for the previous record under its $record_id.
            if ($type) {$hash{$record_id}{$recordcnt}{'type'} = $type;}
            if ($peer_ip) {$hash{$record_id}{$recordcnt}{'peer_ip'} = $peer_ip;}
            if ($peer_as) {$hash{$record_id}{$recordcnt}{'peer_as'} = $peer_as;}
            if ($local_ip) {$hash{$record_id}{$recordcnt}{'local_ip'} = $local_ip;}
            if ($local_as) {$hash{$record_id}{$recordcnt}{'local_as'} = $local_as;}
            if ($next_hop) {$hash{$record_id}{$recordcnt}{'next_hop'} = $next_hop;}

            if (@nodes_and_index) {push @{$hash{$record_id}{$recordcnt}{'nodes_and_index'}}, @nodes_and_index;}  
            if (@withdraw_prefix) {push @{$hash{$record_id}{$recordcnt}{'withdraw_prefix'}}, @withdraw_prefix;}
            if (@announce_prefix) {push @{$hash{$record_id}{$recordcnt}{'announce_prefix'}}, @announce_prefix;}

            # Reset the per-record state.
            # NOTE: $recordcnt is set back to 0 here, so every record ends up
            # under the same second-level key and records sharing a timestamp
            # share one hash slot.
            $peer_as = "";
            $peer_ip = "";
            $type = "";
            $local_ip = "";
            $local_as = "";
            $next_hop = "";
            $record_id = "";
            $recordcnt = 0;
            @nodes_and_index = ();
            @withdraw_prefix = ();
            @announce_prefix = ();


            # "TIME: <date> <time>" split on single whitespace characters:
            # element 1 is the date, element 2 the time.
            my @time = split '\s', $_;
            $record_id = $time[1]."_".$time[2]; 

        } elsif (/^FROM/) {
            # Element 1 is the peer address, element 2 the peer AS; the first
            # "AS" in the latter is removed.
            my @from_tmp = split '\s', $_;
            $peer_ip = $from_tmp[1];
            $peer_as = $from_tmp[2];
            $peer_as =~ s/AS//;

        } elsif (/^TO/) {
            # Same layout as FROM, for the local side.
            my @to_tmp = split '\s', $_;
            $local_ip = $to_tmp[1];
            $local_as = $to_tmp[2];
            $local_as =~ s/AS//;
            #print "$local_ip\n"; 

        } elsif (/^ASPATH/) {

            # Drop the "ASPATH:" label, then append each path element followed
            # by its 1-based position. @nodes_and_index is therefore a flat
            # list: node, index, node, index, ...
            my @nodes_tmp = split '\s', $_;
                shift @nodes_tmp;       
            my $index = 0;

            foreach my $node (@nodes_tmp) {
                  $index++;
                  push @nodes_and_index, ($node , $index);
             }  

        }elsif (/^NEXT_HOP/) {

            my @next_hop_tmp = split '\s', $_;
            $next_hop = $next_hop_tmp[1];  

        }elsif (/^WITHDRAW/) {
            # Read the lines that follow from the same handle until one does
            # not start with a space. That terminating line has already been
            # consumed and is not examined by the if/elsif chain above.
            while (<IN>) {       
                     last if !/^ +/;     
                     # NOTE: there is no "push" here; the statement only
                     # evaluates the array and $_ and stores nothing, so
                     # @withdraw_prefix stays empty.
                     @withdraw_prefix, $_ ;             
                 }


        }elsif (/^ANNOUNCE/) {
                # Same reading pattern as WITHDRAW; each line is stored as
                # read, including leading spaces and the trailing newline.
                while (<IN>) {        
                        last if !/^ +/;    
                push @announce_prefix, $_;

                 }  
            }


        # This block is inside the read loop, so it runs after every input
        # line once a TIME line has been seen, not only for the last record.
        # The scalar fields are overwritten and the current array contents
        # are pushed again on each pass.
        if ($record_id) { # handle last result
            $hash{$record_id}{$recordcnt}{'peer_as'}    = $peer_as;
            $hash{$record_id}{$recordcnt}{'peer_ip'}    = $peer_ip;
            $hash{$record_id}{$recordcnt}{'local_as'}   = $local_as;
            $hash{$record_id}{$recordcnt}{'local_ip'}   = $local_ip;
            $hash{$record_id}{$recordcnt}{'next_hop'}   = $next_hop;

            push @{$hash{$record_id}{$recordcnt}{'nodes_and_index'}} ,@nodes_and_index;  
            push @{$hash{$record_id}{$recordcnt}{'withdraw_prefix'}} ,@withdraw_prefix;
            push @{$hash{$record_id}{$recordcnt}{'announce_prefix'}} ,@announce_prefix;

        }
    }
    close IN;
}  
# Dump every parsed record in sorted key order. The index/path lists below
# are never cleared, so they keep growing across nodes and across records.
my (@node_items, @item_parts);
my @index_column = ();
my @path_column  = ();

for my $record_key (sort keys %hash) {

    print "$record_key\n";

    # Scalar fields, one per line, in a fixed order.
    for my $field (qw(peer_ip peer_as local_ip local_as next_hop)) {
        print "\t $hash{$record_key}{$recordcnt}{$field}\n";
    }

    # Each stored item is split on a single whitespace character; part 0 is
    # collected as the path value and part 1 as the index value.
    @node_items = @{ $hash{$record_key}{$recordcnt}{'nodes_and_index'} };
    for my $item (@node_items) {
        @item_parts = split /\s/, $item;
        push @index_column, $item_parts[1];
        push @path_column,  $item_parts[0];
        print "\t index: @index_column";
        print "\t path: @path_column";
    }

    # The three list fields, space-joined by array interpolation.
    print "\t node index : @{ $hash{$record_key}{$recordcnt}{'nodes_and_index'} }\n";
    print "\t withdraw_prefix: @{ $hash{$record_key}{$recordcnt}{'withdraw_prefix'} }\n";
    print "\t announce: @{ $hash{$record_key}{$recordcnt}{'announce_prefix'} }\n";
}

==========================================================================================

New version advised by Foibs

#!/usr/bin/perl -w

use strict;
use warnings;


# Second attempt: parse every "*.hr" file in $Dir and collect the records as
# hash references in @datasetarray.
my @datasetarray;
my $Dir = "/root/updates\/processed/";
# $exit is only referenced by the commented-out debugging line in the read loop.
my $exit = 0;  

opendir(DIRECTORY, $Dir) or die $!;

while (my $file = readdir(DIRECTORY)) { 

    # Skip every directory entry whose name does not end in ".hr".
    unless ($file=~/\.hr$/){next;}
    #unless ($file=~/\.txt$/){next;}

    # Build the path to open; this inner $file hides the directory-entry name.
    my $file = $Dir."/".$file;
    print "$file\n";

    open (IN, $file) or die "error reading file: ", $file,"\n";

    # State of the record currently being read; reset at every TIME line.
    my $record_id = "";
    my $type = "";
    my $peer_ip = "";
    my $peer_as = "";
    my $local_ip = "";
    my $local_as = "";
    my $next_hop = "";
    my @nodes_and_index = ();
    my @withdraw_prefix = ();
    my @announce_prefix = ();


    # Hash reference that receives the fields of one record.
    my $tmphash = {};

    while (<IN>) {          

        #$exit++; last if ($exit==5);

        if (/^TIME/) {


            # A TIME line starts a new record: copy the scalar fields gathered
            # for the previous record into $tmphash. $record_id itself is not
            # copied, so the time is not part of the stored record.
            if ($type) {$tmphash->{'type'} = $type;}
            if ($peer_ip) {$tmphash->{'peer_ip'} = $peer_ip;}
            if ($peer_as) {$tmphash->{'peer_as'} = $peer_as;}
            if ($local_ip) {$tmphash->{'local_ip'} = $local_ip;}
            if ($local_as) {$tmphash->{'local_as'} = $local_as;}
            if ($next_hop) {$tmphash->{'next_hop'} = $next_hop;}
            # The three disabled lines below pass a bare {...} block to push;
            # push needs an array, i.e. @{ $tmphash->{...} }.
               #if (@nodes_and_index) {push {$tmphash->{'nodes_and_index'}}, @nodes_and_index;}  
            #if (@withdraw_prefix) {push {$tmphash->{'withdraw_prefix'}}, @withdraw_prefix;}
            #if (@announce_prefix) {push {$tmphash->{'announce_prefix'}}, @announce_prefix;}

#The three commented lines above provide error, thus i don't know if i am implementing it the right way, since they are array and different from the others.

            # Reset the per-record state for the record that starts here.
            $peer_as = "";
            $peer_ip = "";
            $type = "";
            $local_ip = "";
            $local_as = "";
            $next_hop = "";
            $record_id = "";
            @nodes_and_index = ();
            @withdraw_prefix = ();
            @announce_prefix = ();


            # "TIME: <date> <time>" split on single whitespace characters:
            # element 1 is the date, element 2 the time.
            my @time = split '\s', $_;
            $record_id = $time[1]."_".$time[2];


        } elsif (/^TYPE/) {
            my @type_tmp = split '\s', $_;
            $type = $type_tmp[1];

        } elsif (/^FROM/) {
            # Element 1 is the peer address, element 2 the peer AS; the first
            # "AS" in the latter is removed.
            my @from_tmp = split '\s', $_;
            $peer_ip = $from_tmp[1];
            $peer_as = $from_tmp[2];
            $peer_as =~ s/AS//;

        } elsif (/^TO/) {
            # Same layout as FROM, for the local side.
            my @to_tmp = split '\s', $_;
            $local_ip = $to_tmp[1];
            $local_as = $to_tmp[2];
            $local_as =~ s/AS//;

        } elsif (/^ASPATH/) {

            # Drop the "ASPATH:" label, then append each path element followed
            # by its 1-based position (flat list: node, index, node, index, ...).
            my @nodes_tmp = split '\s', $_;
                shift @nodes_tmp;       
            my $index = 0;

            foreach my $node (@nodes_tmp) {
                    $index++;
            push @nodes_and_index, ($node , $index); 
             }  

        }elsif (/^NEXT_HOP/) {

            my @next_hop_tmp = split '\s', $_;
            $next_hop = $next_hop_tmp[1];  

        }elsif (/^WITHDRAW/) {
            # Collect the following lines that start with a space. The first
            # line that does not ends the list; it has already been consumed
            # and is not examined by the if/elsif chain above.
            while (<IN>) {       
                     last if !/^ +/;    
                     push @withdraw_prefix, $_ ;           

                 }


        }elsif (/^ANNOUNCE/) {

                 # Same reading pattern as WITHDRAW.
                 while (<IN>) {        
                     last if !/^ +/;    
                     push @announce_prefix, $_;

                 }  

            }


        # This block is inside the read loop, so it runs after every input
        # line once a TIME line has been seen: one hash is pushed per line
        # read, not one per record, and $tmphash only holds data right after
        # a TIME line filled it.
        if ($record_id) { # handle last result
            push @datasetarray, $tmphash;
            $tmphash = {};
        }
    }
    close IN; 
}  

# Print the scalar fields of every hash collected in @datasetarray.
foreach my $row (@datasetarray) {


    # $_ is not set by this loop (the loop variable is $row), and the parsing
    # loop never copies the time into the record hash, so no time is printed.
    print $_, "\n";                       #Time doesn't get printed
    print "\t $row->{'peer_ip'}\n";       #OK
    print "\t $row->{'peer_as'}\n";       #OK
    print "\t $row->{'local_ip'}\n";      #OK
    print "\t $row->{'local_as'}\n";      #OK
    print "\t $row->{'next_hop'}\n";      #OK
# The three fields below are meant to hold array references. Printing such a
# value directly does not show the elements; it has to be dereferenced with
# @{ ... } first.
#   print "\t $row->{'nodes_and_index'}\n"; # Are these guys ok ? since they are arrays
#   print "\t $row->{'withdraw_prefix'}\n"; # Are these guys ok ? since they are arrays
#   print "\t $row->{'announce_prefix'}\n"; # Are these guys ok ? since they are arrays


}

============================================================================


Solution

  • The simplest fix seems to be to put your $recordcnt in the key itself, like this: $record_id = $recordcnt.'_'.$time[1]."_".$time[2]; and to make sure that it never gets zeroed inside the loop (you have a line that does $recordcnt = 0; which is wrong). Also, I didn't find any place where you actually increment $recordcnt.


    However, it seems to me that you'd be much better off by using an array of hashes instead of a simple hash. The array will be ordered the same as your input file but you can use sort to sort it any way and you don't get messed up with strange counters and such. It's not too hard to rewrite it with an array.

    First, create the array that will hold all your data in the beginning of your script (let's call it @myarray).

    Before your loop starts, create a hash ref (reference to a hash, easier to handle) which will contain one object.

    my $tmphash = {};
    while (<IN>) {    
    ......
    

    Now replace your $hash{$record_id}{$recordcnt} with just $tmphash->

    (e.g.

    if ($peer_ip) {$hash{$record_id}{$recordcnt}{'peer_ip'} = $peer_ip;}

    will now be

    if ($peer_ip) {$tmphash->{'peer_ip'} = $peer_ip;} and so on)

    When you know that you have gathered the whole object in the tmphash, just push the tmphash in the array, reinitialize the tmphash and continue with the next object.

    push @myarray, $tmphash;
    $tmphash = {};
    

    All done! Now all you need to do is loop through the array to print your data

    foreach my $row (@myarray) {
      print "\t $row->{'peer_ip'}\n";
      #... and so on
    

    EDIT

    I took the liberty of fixing your script. There were small errors and a major logical error. I didn't delete any of your code, but I commented out some lines and added some of my own. All lines that I changed or added are marked with a #~#~ at the end of the line so you can easily track them and see the differences.

    #!/usr/bin/perl -w
    
    use strict;
    use warnings;
    
    
    # Parse every "*.hr" file in $Dir and append one hash reference per
    # record to @datasetarray. A record is stored when the next TIME line is
    # reached, or at the end of the file for the last record.
    my @datasetarray;
    my $Dir = "/root/updates/processed/";
    # $exit is only referenced by the commented-out debugging line in the read loop.
    my $exit = 0;  
    
    opendir(DIRECTORY, $Dir) or die $!;
    
    while (my $file = readdir(DIRECTORY)) { 
    
        # Skip every directory entry whose name does not end in ".hr".
        unless ($file=~/\.hr$/){next;}
        #unless ($file=~/\.txt$/){next;}
    
        # Build the path to open; this inner $file hides the directory-entry name.
        my $file = $Dir."/".$file;
        print "$file\n";
    
        open (IN, $file) or die "error reading file: ", $file,"\n";
    
        # State of the record currently being read; reset at every TIME line.
        my $record_id = "";
        my $type = "";
        my $peer_ip = "";
        my $peer_as = "";
        my $local_ip = "";
        my $local_as = "";
        my $next_hop = "";
        my @nodes_and_index = ();
        my @withdraw_prefix = ();
        my @announce_prefix = ();
    
    
        # Hash reference that receives the fields of one record.
        my $tmphash = {};
    
        while (<IN>) {          
    
            #$exit++; last if ($exit==5);
    
            if (/^TIME/) {
    
    
                # A TIME line starts a new record: copy everything gathered
                # for the previous record into $tmphash. Empty fields are
                # skipped, so a record only has the keys it had data for.
                if ($type) {$tmphash->{'type'} = $type;}
                if ($peer_ip) {$tmphash->{'peer_ip'} = $peer_ip;}
                if ($peer_as) {$tmphash->{'peer_as'} = $peer_as;}
                if ($local_ip) {$tmphash->{'local_ip'} = $local_ip;}
                if ($local_as) {$tmphash->{'local_as'} = $local_as;}
                if ($next_hop) {$tmphash->{'next_hop'} = $next_hop;}
                if (@nodes_and_index) {push @{$tmphash->{'nodes_and_index'}}, @nodes_and_index;}  #~#~
                if (@withdraw_prefix) {push @{$tmphash->{'withdraw_prefix'}}, @withdraw_prefix;}  #~#~
                if (@announce_prefix) {push @{$tmphash->{'announce_prefix'}}, @announce_prefix;}  #~#~
    
                # $record_id is still empty at the first TIME line of a file,
                # so nothing is stored then. Otherwise add the time to the
                # finished record, append it to @datasetarray and start a
                # fresh hash for the next record.
                if ($record_id) {  #~#~
                    $tmphash->{'time'} = $record_id; #~#~
                    push @datasetarray, $tmphash;#~#~
                    $tmphash = {};#~#~
                } #~#~
    #The three commented lines above provide error, thus i don't know if i am implementing it the right way, since they are array and different from the others.
    
                # Reset the per-record state for the record that starts here.
                $peer_as = "";
                $peer_ip = "";
                $type = "";
                $local_ip = "";
                $local_as = "";
                $next_hop = "";
                $record_id = "";
                @nodes_and_index = ();
                @withdraw_prefix = ();
                @announce_prefix = ();
    
    
                # "TIME: <date> <time>" split on single whitespace characters:
                # element 1 is the date, element 2 the time.
                my @time = split '\s', $_;
                $record_id = $time[1]."_".$time[2];
    
    
            } elsif (/^TYPE/) {
                my @type_tmp = split '\s', $_;
                $type = $type_tmp[1];
    
            } elsif (/^FROM/) {
                # Element 1 is the peer address, element 2 the peer AS; the
                # first "AS" in the latter is removed.
                my @from_tmp = split '\s', $_;
                $peer_ip = $from_tmp[1];
                $peer_as = $from_tmp[2];
                $peer_as =~ s/AS//;
    
            } elsif (/^TO/) {
                # Same layout as FROM, for the local side.
                my @to_tmp = split '\s', $_;
                $local_ip = $to_tmp[1];
                $local_as = $to_tmp[2];
                $local_as =~ s/AS//;
    
            } elsif (/^ASPATH/) {
    
                # Drop the "ASPATH:" label, then append each path element
                # followed by its 1-based position (flat list: node, index,
                # node, index, ...).
                my @nodes_tmp = split '\s', $_;
                    shift @nodes_tmp;       
                my $index = 0;
    
                foreach my $node (@nodes_tmp) {
                        $index++;
                push @nodes_and_index, ($node , $index); 
                 }  
    
            }elsif (/^NEXT_HOP/) {
    
                my @next_hop_tmp = split '\s', $_;
                $next_hop = $next_hop_tmp[1];  
    
            }elsif (/^WITHDRAW/) {
                # Collect the following lines that start with a space. The
                # first line that does not ends the list; it has already been
                # consumed and is not examined by the if/elsif chain above.
                while (<IN>) {       
                         last if !/^ +/;    
                         push @withdraw_prefix, $_ ;           
    
                     }
    
    
            }elsif (/^ANNOUNCE/) {
    
                     # Same reading pattern as WITHDRAW; lines are stored as
                     # read, including leading spaces and the newline.
                     while (<IN>) {        
                         last if !/^ +/;    
                         push @announce_prefix, $_;
    
                     }  
    
                }
    
    
            #if ($record_id) { # handle last result #~#~
           #     push @datasetarray, $tmphash;#~#~
           #     $tmphash = {};#~#~
           # }#~#~
        }
        close IN; 
    
        # The last record of a file is not followed by another TIME line, so
        # it is copied and stored here, with the same steps as in the TIME
        # branch above.
        #insert the last element of the file
        if ($type) {$tmphash->{'type'} = $type;} #~#~
        if ($peer_ip) {$tmphash->{'peer_ip'} = $peer_ip;} #~#~
        if ($peer_as) {$tmphash->{'peer_as'} = $peer_as;} #~#~
        if ($local_ip) {$tmphash->{'local_ip'} = $local_ip;} #~#~
        if ($local_as) {$tmphash->{'local_as'} = $local_as;} #~#~
        if ($next_hop) {$tmphash->{'next_hop'} = $next_hop;}  #~#~
        if (@nodes_and_index) {push @{$tmphash->{'nodes_and_index'}}, @nodes_and_index;}  #~#~
        if (@withdraw_prefix) {push @{$tmphash->{'withdraw_prefix'}}, @withdraw_prefix;}  #~#~
        if (@announce_prefix) {push @{$tmphash->{'announce_prefix'}}, @announce_prefix;}  #~#~
    
        if ($record_id) {  #~#~
            $tmphash->{'time'} = $record_id; #~#~
            push @datasetarray, $tmphash;#~#~
            $tmphash = {};#~#~
        } #~#~
    }  
    
    # Print every record collected in @datasetarray: the scalar fields first,
    # then each list field that was actually stored for the record.
    foreach my $row (@datasetarray) {
    
    
        #print $_, "\n";                       #Time doesn't get printed #~#~
        print "\t $row->{'time'}\n";       #~#~
        print "\t $row->{'peer_ip'}\n";       #OK
        print "\t $row->{'peer_as'}\n";       #OK
        print "\t $row->{'local_ip'}\n";      #OK
        print "\t $row->{'local_as'}\n";      #OK
        print "\t $row->{'next_hop'}\n";      #OK
    #you can print array refs like this, just make a check that they are declared
    # (a list key exists only when the record had such entries, and
    # dereferencing an undefined value here is a fatal error under "use strict")
       print "\t @{$row->{'nodes_and_index'}}\n" if ref $row->{'nodes_and_index'} eq 'ARRAY';#~#~
       print "\t @{$row->{'withdraw_prefix'}}\n" if ref $row->{'withdraw_prefix'} eq 'ARRAY';#~#~
    # Fixed: this line used to print 'withdraw_prefix' a second time, so the
    # announced prefixes were never shown.
       print "\t @{$row->{'announce_prefix'}}\n" if ref $row->{'announce_prefix'} eq 'ARRAY';#~#~
    
    
    }