Search code examples
c regex linux pcre

RegEx issue with PCRE library


Tried this with both the standard regex library as well as the PCRE library in ANSI C under Linux:

Need to catch the content between brackets, multiple times in the same string, but I can only get the first one or it matches the whole line (non-greedy match).

src    [] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\" device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\" device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";

So the result I want is getting 3 substrings with:

  • device 1
  • device 2
  • device 3
/*
 * Compile the pattern, run it once against src, and print every substring
 * captured by that single match (index 0 is the whole match).
 *
 * Returns 0 when the pattern matched, 1 on a compile error, on no match,
 * or on any other matching error.
 *
 * NOTE: one pcre_exec() call reports one match only. PCRE has no "global"
 * flag; to collect later matches, call pcre_exec() again with the start
 * offset set to the end of the previous match (ovector[1]).
 */
int main(int argc, char *argv[]) {
  pcre            *re;
  const char      *error;
  int             erroffset;
  int             ovector[OVECCOUNT];
  int             rc, i;

  /* Every inner double quote must be escaped, and the literal must be closed. */
  char            src    [] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\" device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\" device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";
  char            pattern   [] = ".+device=\"(.+(?R))\".+";

  (void)argc;
  (void)argv;

  re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
  if (re == NULL) {
          printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
          return 1;
  }

  rc = pcre_exec(re, NULL, src, (int)strlen(src), 0, 0, ovector, OVECCOUNT);
  if (rc < 0) {
          if (rc == PCRE_ERROR_NOMATCH) printf("Sorry, no match ...\n");
          else    printf("Matching error %d\n", rc);
          /* Memory obtained from pcre_compile() is released with pcre_free(). */
          pcre_free(re);
          return 1;
  }

  printf("\nOK, has matched ...\n\n");

  /* ovector holds (start, end) offset pairs into src, one pair per substring. */
  for (i = 0; i < rc; i++) {
          char *substring_start = src + ovector[2*i];
          int substring_length = ovector[2*i+1] - ovector[2*i];
          printf("%2d: %.*s\n", i, substring_length, substring_start);
  }

  pcre_free(re);

  return 0;
}

The regex 'testers' on the web can set the global flag which seems to work, but is not available in PCRE. What can I do?

Ideally I would prefer to use the standard regex.h lib, but PCRE is also fine if needed.


Solution

  • Standard POSIX extended pattern (^|[\t\v\f\r ])device="([^"]*)" works just fine. Then, the zeroth match is the entire match, the first match is the whitespace character preceding device= or empty string if it starts at the beginning of the line, and the second match is the contents of the device name:

    #define  _POSIX_C_SOURCE  200809L
    #include <stdlib.h>
    #include <sys/types.h>
    #include <regex.h>
    #include <string.h>
    #include <stdio.h>
    #include <errno.h>
    
    /* Sample input: three device records, one attribute per source line. */
    const char  data[] =
        "device=\"device 1\" "
        "device_name=\"the first device\" "
        "address=\"192.168.1.10\" "
        "device=\"device 2\" "
        "device_name=\"the second device\" "
        "address=\"192.168.1.12\" "
        "device=\"device 3\" "
        "device_name=\"the third device\" "
        "address=\"192.168.1.13\"";

    /* POSIX extended regular expression for one device="..." attribute. */
    const char pattern[] =
        "(^|[\t\v\f\r ])"   /* group 1: start of string, or a tab, VT, FF, CR or space */
        "device=\""
        "([^\"]*)"          /* group 2: everything up to the next double quote */
        "\"";
    
    /*
     * Print the contents of every device="..." attribute found in data[].
     *
     * The pattern is compiled once as a POSIX extended regular expression and
     * then applied repeatedly, each search starting where the previous match
     * ended. Empty values are not printed.
     *
     * Returns EXIT_FAILURE if the pattern does not compile, else EXIT_SUCCESS.
     */
    int main(void)
    {
        regex_t     expression;
        regmatch_t  match[3];
        size_t      pos;
        int         err;

        err = regcomp(&expression, pattern, REG_EXTENDED);
        if (err) {
            char  errbuf[1024];
            (void)regerror(err, &expression, errbuf, sizeof errbuf);
            fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
            return EXIT_FAILURE;
        }

        /*
         * After the first match the search no longer starts at the beginning
         * of the string, so REG_NOTBOL is passed to stop '^' from matching at
         * data + pos; otherwise a "device=" glued directly to the previous
         * match would wrongly be accepted.
         */
        for (pos = 0;
             regexec(&expression, data + pos, 3, match, pos > 0 ? REG_NOTBOL : 0) == 0;
             pos += (size_t)match[0].rm_eo) {
            /* match[2] is the quoted value; offsets are relative to data + pos. */
            if (match[2].rm_so >= 0 && match[2].rm_eo > match[2].rm_so) {
                const int  len = (int)(match[2].rm_eo - match[2].rm_so);
                printf("Matched '%.*s'.\n", len, data + pos + match[2].rm_so);
            }
        }

        regfree(&expression);
        return EXIT_SUCCESS;
    }
    

    As Jonathan Leffler mentioned in a comment to the question, the matches are obtained in a loop, where the next lookup starts where the previous match ended. The loop ends when there are no more matches.

    If you want to support multiple quotation styles, you could use something like ^device="([^"]*)"|^device='([^']*)'|^device=([^\t\v\f\r ]*)|[\t\v\f\r ]device="([^"]*)"|[\t\v\f\r ]device='([^']*)'|[\t\v\f\r ]device=([^\t\v\f\r ]*), with at least seven elements in the match[] array. Then, exactly one of the entries match[1] to match[6] will have .rm_so >= 0 (the unused ones are set to -1), and that will identify the desired contents:

    #define  _POSIX_C_SOURCE  200809L
    #include <stdlib.h>
    #include <sys/types.h>
    #include <regex.h>
    #include <string.h>
    #include <stdio.h>
    #include <errno.h>
    
    /* Sample input: three device records, one attribute per source line. */
    const char  data[] =
        "device=\"device 1\" "
        "device_name=\"the first device\" "
        "address=\"192.168.1.10\" "
        "device=\"device 2\" "
        "device_name=\"the second device\" "
        "address=\"192.168.1.12\" "
        "device=\"device 3\" "
        "device_name=\"the third device\" "
        "address=\"192.168.1.13\"";

    /*
     * Six alternatives, each with exactly one capture group: the value may be
     * double-quoted, single-quoted or unquoted, and "device=" must sit either
     * at the start of the string or right after a tab, VT, FF, CR or space.
     */
    const char pattern[] =
        "^device=\"([^\"]*)\"|"
        "^device='([^']*)'|"
        "^device=([^\t\v\f\r ]*)|"
        "[\t\v\f\r ]device=\"([^\"]*)\"|"
        "[\t\v\f\r ]device='([^']*)'|"
        "[\t\v\f\r ]device=([^\t\v\f\r ]*)";
    
    /*
     * Print the value of every device=... attribute found in data[], whether
     * it is double-quoted, single-quoted or unquoted.
     *
     * The pattern is compiled once as a POSIX extended regular expression and
     * then applied repeatedly, each search starting where the previous match
     * ended. Empty values are not printed.
     *
     * Returns EXIT_FAILURE if the pattern does not compile, else EXIT_SUCCESS.
     */
    int main(void)
    {
        /* Whole match plus one capture group per alternative in the pattern. */
        enum { NMATCH = 7 };

        regex_t     expression;
        regmatch_t  match[NMATCH];
        size_t      pos;
        int         err, k;

        err = regcomp(&expression, pattern, REG_EXTENDED);
        if (err) {
            char  errbuf[1024];
            (void)regerror(err, &expression, errbuf, sizeof errbuf);
            fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
            return EXIT_FAILURE;
        }

        /*
         * After the first match the search no longer starts at the beginning
         * of the string, so REG_NOTBOL is passed to stop '^' from matching at
         * data + pos; otherwise a "device=" glued directly to the previous
         * match would wrongly be accepted.
         */
        for (pos = 0;
             regexec(&expression, data + pos, NMATCH, match, pos > 0 ? REG_NOTBOL : 0) == 0;
             pos += (size_t)match[0].rm_eo) {
            /* Find the group that took part in the match; the others are -1. */
            for (k = 1; k < NMATCH; k++)
                if (match[k].rm_so >= 0)
                    break;
            if (k >= NMATCH)
                continue;

            /* Offsets are relative to data + pos, not to the start of data. */
            if (match[k].rm_eo > match[k].rm_so) {
                const int  len = (int)(match[k].rm_eo - match[k].rm_so);
                printf("Matched '%.*s'.\n", len, data + pos + match[k].rm_so);
            }
        }

        regfree(&expression);
        return EXIT_SUCCESS;
    }
    

    However, this variant also detects the desired content when the data[] is say

    device="device 1" device_name="the first device" address="192.168.1.10"
    device=device2 device_name=the_second_device address=192.168.1.12
    device='device 3' device_name='the third device' address='192.168.1.13'
    

    Personally, I would consider matching on pattern (^|[\t\v\f\r ])([A-Za-z0-9][-_A-Za-z0-9]*)=("[^"]*"|'[^']*'|[^\t\v\f\r ]*) instead, so that the zeroth match matches each pair, the first match is the separator (or empty at the start of the string), the second match the name part, and the third match the value part, possibly single- or double-quoted. Based on the name part, you could copy the value part (omitting the quotes if quoted) to dynamically allocated buffers.