Search code examples
cscanf

Read a string of zero or more characters with sscanf


The following code:

#include <stdio.h>

// Demonstrates that a %[ scanset conversion does not accept an empty field.
int main(void) {
  // pre-filled so that an untouched buffer is visible in the output
  char buf[80] = "salam";
  // literal '"', then 1 to 79 characters other than '"', then a literal '"'
  const char *fmt = "\"%79[^\"]\"";
  int n;

  // input is "" (an empty quoted string): the scanset matches no characters,
  // so sscanf reports 0 conversions and buf keeps its previous contents
  n = sscanf("\"\"", fmt, buf);
  printf("%d fields were read. buf = '%s'\n", n, buf);
  // input is "hamidi": the scanset matches, so one field is stored in buf
  n = sscanf("\"hamidi\"", fmt, buf);
  printf("%d fields were read. buf = '%s'\n", n, buf);
}

outputs:

0 fields were read. buf = 'salam'
1 fields were read. buf = 'hamidi'

This indicates that the sscanf function can't read empty strings with the specified format string. Is there a good replacement for the scanset-based format string that lets sscanf read strings of length zero to at most 79 characters? In other words, I expect the first sscanf call to store '\0' in buf[0] and return 1 instead of 0. Is there a way?


Solution

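  • Indeed, sscanf and friends cannot handle empty fields: the %[ and %s conversions fail unless they match at least one character. strtok cannot handle them either, because it treats a run of consecutive delimiters as a single separator. There is no direct alternative in the Standard library, but you can write a simple scanner with a loop or by using strchr, strpbrk or strcspn.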

    Here is a simple example:

    #include <stdio.h>
    #include <string.h>
    
    // Scans a double-quoted string (possibly empty) from the start of src.
    //
    // On success returns 1, copies the characters between the quotes into
    // dest (truncated to size - 1 characters and always terminated when
    // size > 0) and, if endp is not NULL, stores the position just past the
    // closing quote in *endp. Returns 0 and writes nothing when src does not
    // start with '"' or when no closing '"' follows.
    int scan_string(char *dest, size_t size, const char *src, const char **endp) {
        // the input must open with a string delimiter
        if (src[0] != '"')
            return 0;

        const char *body = src + 1;
        const char *closing = strchr(body, '"');
        // no matching string delimiter
        if (closing == NULL)
            return 0;

        if (endp != NULL)
            *endp = closing + 1;

        // a zero-sized destination cannot even hold the terminator
        if (size == 0)
            return 1;

        size_t count = (size_t)(closing - body);
        if (count > size - 1)
            count = size - 1;
        memcpy(dest, body, count);
        dest[count] = '\0';
        return 1;
    }
    
    // Runs scan_string on src and prints the input next to every output.
    // Both outputs start as the sentinel "<unchanged>" so the printed line
    // shows whether scan_string wrote to them.
    void test(const char *src) {
        const char *rest = "<unchanged>";
        char out[80] = "<unchanged>";
        int status;

        status = scan_string(out, sizeof out, src, &rest);
        printf("src: '%s', n: %d, buf: '%s', end: '%s'\n", src, status, out, rest);
    }
    
    // Feeds a fixed list of sample inputs to test(), in order.
    int main(void) {
        static const char *const inputs[] = {
            "",
            "\"",
            "\"\"",
            "''",
            "\"hamidi\"",
            "\"Hello\" \"world\"",
            "\"\\\"\"",
            "\"Hello world\\n\"",
        };

        for (size_t k = 0; k < sizeof inputs / sizeof inputs[0]; k++) {
            test(inputs[k]);
        }
        return 0;
    }
    

    Output:

    src: '', n: 0, buf: '<unchanged>', end: '<unchanged>'
    src: '"', n: 0, buf: '<unchanged>', end: '<unchanged>'
    src: '""', n: 1, buf: '', end: ''
    src: '''', n: 0, buf: '<unchanged>', end: '<unchanged>'
    src: '"hamidi"', n: 1, buf: 'hamidi', end: ''
    src: '"Hello" "world"', n: 1, buf: 'Hello', end: ' "world"'
    src: '"\""', n: 1, buf: '\', end: '"'
    src: '"Hello world\n"', n: 1, buf: 'Hello world\n', end: ''
    

    Here is a slightly more complicated version that handles some backslash escape sequences and accepts either single or double quotes as the delimiter:

    // Scans a quoted string from the start of src into dest.
    //
    // The first character of src selects the delimiter and must be '"' or
    // '\''; the string ends at the next unescaped occurrence of that same
    // character. A backslash followed by f, n, r, t or v is translated to the
    // matching control character; a backslash followed by any other character
    // yields that character itself, which is how the delimiter or a backslash
    // is embedded. Octal and hex escapes are not handled.
    //
    // dest receives at most size - 1 characters plus a terminator; longer
    // strings are silently truncated. With size == 0, dest is never written.
    //
    // Returns 1 on success and, if endp is not NULL, stores the position just
    // past the closing delimiter in *endp. Returns 0 if src does not start
    // with a quote (dest untouched) or if the closing delimiter is missing
    // (dest holds the terminated partial string); *endp is untouched on failure.
    int scan_string(char *dest, size_t size, const char *src, const char **endp) {
        char sep = *src++;
        char ch;
        size_t i = 0;

        // handle both single and double quotes
        if (sep != '"' && sep != '\'')
            return 0;
        while ((ch = *src++) != sep) {
            if (ch == '\0') {
                // unterminated string: keep what was copied so far
                if (i < size)
                    dest[i] = '\0';
                return 0;
            }
            // a backslash right before the terminator is kept literally
            if (ch == '\\' && *src != '\0') {
                switch (ch = *src++) {
                case 'f': ch = '\f'; break;
                case 'n': ch = '\n'; break;
                case 'r': ch = '\r'; break;
                case 't': ch = '\t'; break;
                case 'v': ch = '\v'; break;
                // handle octal and hex sequences...
                default:
                    // ch already holds the escaped character; reading *src
                    // again here would drop it, swallow the next character
                    // and could step past the terminator
                    break;
                }
            }
            // always leave room for the terminator
            if (i + 1 < size)
                dest[i++] = ch;
        }
        if (i < size)
            dest[i] = '\0';
        if (endp)
            *endp = src;
        return 1;
    }
    

    Output:

    src: '', n: 0, buf: '<unchanged>', end: '<unchanged>'
    src: '"', n: 0, buf: '', end: '<unchanged>'
    src: '""', n: 1, buf: '', end: ''
    src: '''', n: 1, buf: '', end: ''
    src: '"hamidi"', n: 1, buf: 'hamidi', end: ''
    src: '"Hello" "world"', n: 1, buf: 'Hello', end: ' "world"'
    src: '"\""', n: 0, buf: '"', end: '<unchanged>'
    src: '"Hello world\n"', n: 1, buf: 'Hello world
    ', end: ''