Search code examples
cpcre2

find all offsets of pcre2 matches


I want to write a function int** text_match(const char* input, const char* pattern); that returns all match offsets(start and end respectively)

So far I have implemented this, by iterating over the input with an input offset, finding out if there is a match within the input + its offset, obtaining the offset and setting the input offset to the end offset of the match.

This approach has some very significant disadvantages

  • some patterns exists that can cause an infinite amount of matches (^|\n|$)
  • even once these infinite execution issues are fixed with (^|\n) will always find a match at the beginning of the string-fragment, thus viewing every index as a match

I experimented with pcre2_get_ovector_count() a bit, however so far whenever I provide it with the pcre2_match_dataafter having run pcre2_match() it only returns 1 and not the actual amount of matches

How could I find out all match offsets with pcre2 directly? Or at least avoid the issues I have described above?

my implementation so far (will be licensed under the MIT)

int** text_match(const char* input, const char* regex){
    if(!input || !regex) return (int**)calloc(1, sizeof(int*));

    int errorcode;
    PCRE2_SIZE erroroffset;

    pcre2_code *compiled_regex = pcre2_compile((PCRE2_SPTR)regex, PCRE2_ZERO_TERMINATED, PCRE2_EXTENDED|PCRE2_UTF, &errorcode, &erroroffset, NULL);

    if(!compiled_regex) return (int**)calloc(1, sizeof(int*));
    
    pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(compiled_regex, NULL);
    
    if(!match_data){
        pcre2_code_free(compiled_regex);

        return (int**)calloc(1, sizeof(int*));
    }

    int** output = (int**)calloc(1, sizeof(int*));
    if(!output){
        pcre2_code_free(compiled_regex);
        pcre2_match_data_free(match_data);

        return (int**)calloc(1, sizeof(int*));
    }
    
    PCRE2_SIZE start_offset = 0;

    int matches_amount;
    
    size_t length_input = strlen(input);

    for(matches_amount = 0; ; matches_amount++){
        if(start_offset >= length_input) break;
    
        if(pcre2_match(compiled_regex, (PCRE2_SPTR)input, PCRE2_ZERO_TERMINATED, start_offset, 0, match_data, NULL) < 0) break;
        
        PCRE2_SIZE *offsets = pcre2_get_ovector_pointer(match_data);
        
        if(offsets == NULL) break;

        output[matches_amount] = (int*)malloc(2 * sizeof(int));
        
        if(output[matches_amount] == NULL) break;

        output[matches_amount][0] = offsets[0];
        output[matches_amount][1] = offsets[1];

        int** new_output = (int**)realloc(output, sizeof(int*) * (matches_amount + 2));
        if(new_output == NULL) break;
        output = new_output;

        if(start_offset == offsets[1]){
            start_offset++;
        }else{
            start_offset = offsets[1];
        }
    }
    
    output[matches_amount] = NULL;
    
    pcre2_code_free(compiled_regex);
    pcre2_match_data_free(match_data);

    return output;
}

Edit:

As pointed out by @ikegami the approach was right, however I did too little edge-case detection. This edit just serves one purpose. To provide the corrected function so that if someone finds this post at a later time and needs a function for this here you have it(will be licensed under the MIT btw so do with it whatever you want)

int** text_match(const char* input, const char* regex){
    if(!input || !regex) return (int**)calloc(1, sizeof(int*));

    int errorcode;
    PCRE2_SIZE erroroffset;

    pcre2_code *compiled_regex = pcre2_compile((PCRE2_SPTR)regex, PCRE2_ZERO_TERMINATED, PCRE2_EXTENDED|PCRE2_UTF, &errorcode, &erroroffset, NULL);

    if(!compiled_regex) return (int**)calloc(1, sizeof(int*));
    
    pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(compiled_regex, NULL);
    
    if(!match_data){
        pcre2_code_free(compiled_regex);

        return NULL;
    }

    int** output = (int**)calloc(1, sizeof(int*));
    if(!output){
        pcre2_code_free(compiled_regex);
        pcre2_match_data_free(match_data);

        return NULL;
    }
    
    PCRE2_SIZE start_offset = 0;

    int matches_amount;
    
    size_t length_input = strlen(input);

    uint32_t options = 0;

    for(matches_amount = 0; ; matches_amount++){
        if(pcre2_match(compiled_regex, (PCRE2_SPTR)input, PCRE2_ZERO_TERMINATED, start_offset, options, match_data, NULL) < 0) break;
        
        PCRE2_SIZE *offsets = pcre2_get_ovector_pointer(match_data);
        
        if(offsets == NULL) break;

        output[matches_amount] = (int*)malloc(2 * sizeof(int));
        
        if(!output[matches_amount]){
            for(int i = 0; output[i]; i++) free(output[i]);
            free(output);
            pcre2_code_free_8(compiled_regex);
            pcre2_match_data_free_8(match_data);
            
            return NULL;
        }

        output[matches_amount][0] = offsets[0];
        output[matches_amount][1] = offsets[1];

        int** new_output = (int**)realloc(output, sizeof(int*) * (matches_amount + 2));
        if(!new_output){
            for(int i = 0; output[i]; i++) free(output[i]);
            free(output);
            pcre2_code_free_8(compiled_regex);
            pcre2_match_data_free_8(match_data);
            
            return NULL;
        }
        output = new_output;
        
        if(offsets[0] == offsets[1]){
            if(offsets[0] == length_input) break;
            options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
        }else{
            size_t startchar = pcre2_get_startchar(match_data);
            if(start_offset > startchar) continue;
            start_offset = startchar + 1;
            
            for(; start_offset < length_input; start_offset++){
                if((input[start_offset] & 0xc0) != 0x80) break;
            }
            
        }
    }
    
    output[matches_amount] = NULL;
    
    pcre2_code_free(compiled_regex);
    pcre2_match_data_free(match_data);

    return output;
}

Solution

  • The pcre2demo man page provides a demo. It includes the following:

    /* Loop for second and subsequent matches */
    
    for (;;)
      {
      uint32_t options = 0;                   /* Normally no options */
      PCRE2_SIZE start_offset = ovector[1];   /* Start at end of previous match */
    
      /* If the previous match was for an empty string, we are finished if we are
      at the end of the subject. Otherwise, arrange to run another match at the
      same point to see if a non-empty match can be found. */
    
      if (ovector[0] == ovector[1])
        {
        if (ovector[0] == subject_length) break;
        options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
        }
    
      /* If the previous match was not an empty string, there is one tricky case to
      consider. If a pattern contains \K within a lookbehind assertion at the
      start, the end of the matched string can be at the offset where the match
      started. Without special action, this leads to a loop that keeps on matching
      the same substring. We must detect this case and arrange to move the start on
      by one character. The pcre2_get_startchar() function returns the starting
      offset that was passed to pcre2_match(). */
    
      else
        {
        PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
        if (start_offset <= startchar)
          {
          if (startchar >= subject_length) break;   /* Reached end of subject.   */
          start_offset = startchar + 1;             /* Advance by one character. */
          if (utf8)                                 /* If UTF-8, it may be more  */
            {                                       /*   than one code unit.     */
            for (; start_offset < subject_length; start_offset++)
              if ((subject[start_offset] & 0xc0) != 0x80) break;
            }
          }
        }
    
      ...
    }
    

    So you are indeed expected to manipulate the start position (and the options as well), although there are some issues with your current implementation.