Search code examples
regexperloffsetskip

Perl regex start match search after (skip) N char


Probably basic question, but can't find the answer: I have a pattern match regex where I'm looking for 'G', 187 char and 'G' again, in a large buffer. This works well with $s =~ m/(G.{187}G)/s. Sometimes I want to add an offset of N bytes to the search (I don't want to start at position 0 of the buffer). I now I can do $s =~ m/.{N}(G.{187}G)/s but this sounds to me like not very efficient as I don't want to parse at all the beginning buffer (and it might be big). I tried to work with \G but could not get it to right.

Thanks


Solution

  • I tested the speed of the two approaches:

    use feature qw(say);
    use strict;
    use warnings;
    
    use Benchmark qw(:all);
    use Getopt::Long qw(GetOptions);
    
    GetOptions( "case=i" => \my $case ) or die("Error in command line arguments\n");
    
    my $num_cases = 5;
    sub case1 { return (187, 2000) };
    sub case2 { return (5, 10000) };
    sub case3 { return (5, 20000) };
    sub case4 { return (10000, 20000) };
    sub case5 { return (5, 40000) };
    
    my @cases = ( $case );
    if ( !defined $case ) {
        @cases = 1..$num_cases;
    }
    for my $case ( @cases ) {
        run_case( $case );
    }
    
    sub run_case {
        my ( $case ) = @_;
    
        my $case_coderef = \&{"case" . $case};
        my ( $M, $N ) = $case_coderef->();
        say "Running case $case: \$M = $M, \$N = $N";
    
        my $prefix = 'A' x $N;
        my $middle = 'B' x $M;
        my $match_str = 'G' . $middle . 'G';
    
        my $str = $prefix . $match_str;
    
        my %methods = map {; "method$_" => \&{"method" . $_} } 1..6;
    
        for my $meth (keys %methods) {
            my $res = eval {
                $methods{$meth}->($str, $M, $N)
            };
            if ( $@ ) {
                print "$@";
                say "Skipping method '$meth'..";
                delete $methods{$meth};
                next;
            }   
            die "Method '$meth' failed.\n" if $res ne $match_str;
        }
    
        my %code = map { $_ => eval ('sub { $methods{' . $_ . '}->($str, $M, $N) }') } sort keys %methods; 
    
        cmpthese(-5, \%code );
    }
    
    sub method1 {
        my ( $str, $M, $N ) = @_;
        $str =~ m/.{$N}(G.{$M}G)/;
        return $1;
    }
    
    sub method2 {
        my ( $str, $M, $N ) = @_;
        pos( $str ) = $N;
        $str =~ m/\G(G.{$M}G)/;
        return $1;
    }
    
    sub method3 {
        my ( $str, $M, $N ) = @_;
        pos( $str ) = $N;
        $str =~ m/(G.{$M}G)/g;
        return $1;
    }
    
    sub method4 {
        my ( $str, $M, $N ) = @_;
        $str =~ m/.{$N}(G.{$M}G)/s;
        return $1;
    }
    
    sub method5 {
        my ( $str, $M, $N ) = @_;
        pos( $str ) = $N;
        $str =~ m/\G(G.{$M}G)/s;
        return $1;
    }
    
    sub method6 {
        my ( $str, $M, $N ) = @_;
        pos( $str ) = $N;
        $str =~ m/(G.{$M}G)/gs;
        return $1;
    }
    

    Output:

    Running case 1: $M = 187, $N = 2000
                 Rate method1 method3 method2 method6 method5 method4
    method1  696485/s      --    -37%    -39%    -44%    -46%    -57%
    method3 1112322/s     60%      --     -3%    -10%    -13%    -32%
    method2 1146132/s     65%      3%      --     -7%    -10%    -30%
    method6 1234678/s     77%     11%      8%      --     -3%    -24%
    method5 1278898/s     84%     15%     12%      4%      --    -21%
    method4 1629148/s    134%     46%     42%     32%     27%      --
    Running case 2: $M = 5, $N = 10000
                 Rate method1 method2 method6 method5 method3 method4
    method1  226784/s      --    -72%    -72%    -72%    -72%    -78%
    method2  801020/s    253%      --     -0%     -2%     -3%    -23%
    method6  802386/s    254%      0%      --     -1%     -3%    -23%
    method5  814132/s    259%      2%      1%      --     -1%    -22%
    method3  823653/s    263%      3%      3%      1%      --    -21%
    method4 1046605/s    361%     31%     30%     29%     27%      --
    Running case 3: $M = 5, $N = 20000
                 Rate method1 method3 method2 method6 method5 method4
    method1  122763/s      --    -90%    -90%    -90%    -90%    -92%
    method3 1252858/s    921%      --     -0%     -1%     -2%    -23%
    method2 1258330/s    925%      0%      --     -1%     -1%    -22%
    method6 1265165/s    931%      1%      1%      --     -1%    -22%
    method5 1274309/s    938%      2%      1%      1%      --    -21%
    method4 1622943/s   1222%     30%     29%     28%     27%      --
    Running case 4: $M = 10000, $N = 20000
                 Rate method1 method3 method2 method6 method5 method4
    method1   90687/s      --    -62%    -62%    -93%    -93%    -94%
    method3  236835/s    161%      --     -1%    -81%    -81%    -86%
    method2  238334/s    163%      1%      --    -81%    -81%    -85%
    method6 1236025/s   1263%    422%    419%      --     -3%    -25%
    method5 1270943/s   1301%    437%    433%      3%      --    -22%
    method4 1638548/s   1707%    592%    588%     33%     29%      --
    Running case 5: $M = 5, $N = 40000
    Quantifier in {,} bigger than 32766 in regex; marked by <-- HERE in m/.{ <-- HERE 40000}(G.{5}G)/ at ./p.pl line 83.
    Skipping method 'method4'..
    Quantifier in {,} bigger than 32766 in regex; marked by <-- HERE in m/.{ <-- HERE 40000}(G.{5}G)/ at ./p.pl line 63.
    Skipping method 'method1'..
                 Rate method2 method3 method6 method5
    method2 1253528/s      --     -1%     -1%     -2%
    method3 1260746/s      1%      --     -0%     -1%
    method6 1263378/s      1%      0%      --     -1%
    method5 1278718/s      2%      1%      1%      --
    

    I ran this on my Laptop using Intel(R) Core(TM) i7-7500U CPU @ 2.70GHz running Ubuntu 16.10, the result shows that using the s modifier can speed up method1 dramatically ( compare with method4). But method1 or method4 cannot be used when $N exceeds 32766.