Search code examples
cstringalgorithmcommand-line-interfacetokenize

Fastest algorithm for splitting a string in C?


I'm writing a CLI input parser and I want to know what is the fastest algorithm to split a string into tokens.

Rules:

  • A space means end of the token.
  • Any char can be escaped with a backslash which means that I take it as is without any specal meaning. (currently only used for escaping a space)

Here is the code I am currently using:

#define POINTER_ARRAY_STEP 10
#define STRING_STEP 255

char **parse_line(char *line)
{
    char **array;
    size_t array_len;
    size_t array_index;
    size_t token_len;
    size_t token_index;

    array_len = POINTER_ARRAY_STEP;
    array = malloc((array_len + 1) * sizeof(char*)); /* +1 to leave space for NULL */
    array_index = 0;

    token_len = STRING_STEP;
    array[array_index] = malloc(token_len + 1);
    token_index = 0;

    for (; *line; ++line) {
        if (array_index == array_len) {
            array_len += POINTER_ARRAY_STEP;
            array = realloc(array, (array_len + 1) * sizeof(char*));
        }
        if (token_index == token_len) {
            token_len += STRING_STEP;
            array[array_index] = realloc(array[array_index], token_len + 1);
        }
        if (*line == '\\') {
            array[array_index][token_index++] = *++line;
            continue;
        }
        if (*line == ' ') {
            array[array_index][token_index] = 0;
            array[++array_index] = malloc(token_len + 1);
            token_index = 0;
            continue;
        }
        array[array_index][token_index++] = *line;
    }
    /* Null terminate the last array element */
    array[array_index][token_index] = 0;

    /* Null terminate the array */
    array[array_index + 1] = NULL;
    return array;
}

Solution

  • Your method is both unsafe and inefficient: you do not check for memory allocation failure and you call realloc() way too many times.

    Here is another approach:

    • make a first pass to count the number of tokens and escapes,
    • allocate the pointer array and a buffer for the tokens
    • make a second pass, copying the characters into the buffer, splitting the tokens and making the pointer array point to the tokens.
    • return the array pointer.

    The memory can later be freed by calling free() on both the pointer array and its first element.

    Here is the code:

    #include <stdlib.h>
    
    char **parse_line(const char *line) {
        size_t len, i, j, k, items = 1, escapes = 0;
        char **array;
        char *buf;
    
        for (len = 0; line[len]; len++) {
            if (line[len] == '\\') {
                escapes++;
                if (line[++len] == '\0')
                    break;
            } else
            if (line[len] == ' ') {
                items++;
            }
        }
        if (len == escapes) {
            /* special case empty line */
            array = malloc(sizeof(*array));
            if (array != NULL) {
                array[0] = NULL;
            }
            return array;
        }
        array = malloc((items + 1) * sizeof(*array));
        buf = malloc(len + 1 - escapes);
    
        if (array == NULL || buf == NULL) {
            free(array);
            free(buf);
            return NULL;
        }
        items[0] = buf;
        k = 1;
        for (i = j = 0; i < len; i++) {
            if (line[i] == '\\') {
                if (++i == len)
                    break;
                buf[j++] = line[i];
            } else
            if (line[i] == ' ') {
                buf[j++] = '\0';
                items[k++] = buf + j;
            } else {
                buf[j++] = line[i];
            }
        }
        buf[j] = '\0';
        items[k] = NULL;
        return items;
    }