c string algorithm command-line-interface tokenize

Fastest algorithm for splitting a string in C?

I'm writing a CLI input parser and I want to know what is the fastest algorithm to split a string into tokens.

Rules:

A space means end of the token.
Any char can be escaped with a backslash which means that I take it as is without any specal meaning. (currently only used for escaping a space)

Here is the code I am currently using:

#define POINTER_ARRAY_STEP 10
#define STRING_STEP 255

char **parse_line(char *line)
{
    char **array;
    size_t array_len;
    size_t array_index;
    size_t token_len;
    size_t token_index;

    array_len = POINTER_ARRAY_STEP;
    array = malloc((array_len + 1) * sizeof(char*)); /* +1 to leave space for NULL */
    array_index = 0;

    token_len = STRING_STEP;
    array[array_index] = malloc(token_len + 1);
    token_index = 0;

    for (; *line; ++line) {
        if (array_index == array_len) {
            array_len += POINTER_ARRAY_STEP;
            array = realloc(array, (array_len + 1) * sizeof(char*));
        }
        if (token_index == token_len) {
            token_len += STRING_STEP;
            array[array_index] = realloc(array[array_index], token_len + 1);
        }
        if (*line == '\\') {
            array[array_index][token_index++] = *++line;
            continue;
        }
        if (*line == ' ') {
            array[array_index][token_index] = 0;
            array[++array_index] = malloc(token_len + 1);
            token_index = 0;
            continue;
        }
        array[array_index][token_index++] = *line;
    }
    /* Null terminate the last array element */
    array[array_index][token_index] = 0;

    /* Null terminate the array */
    array[array_index + 1] = NULL;
    return array;
}

Solution

Your method is both unsafe and inefficient: you do not check for memory allocation failure and you call realloc() way too many times.

Here is another approach:

make a first pass to count the number of tokens and escapes,
allocate the pointer array and a buffer for the tokens
make a second pass, copying the characters into the buffer, splitting the tokens and making the pointer array point to the tokens.
return the array pointer.

The memory can later be freed by calling free() on both the pointer array and its first element.

Here is the code:

#include <stdlib.h>

char **parse_line(const char *line) {
    size_t len, i, j, k, items = 1, escapes = 0;
    char **array;
    char *buf;

    for (len = 0; line[len]; len++) {
        if (line[len] == '\\') {
            escapes++;
            if (line[++len] == '\0')
                break;
        } else
        if (line[len] == ' ') {
            items++;
        }
    }
    if (len == escapes) {
        /* special case empty line */
        array = malloc(sizeof(*array));
        if (array != NULL) {
            array[0] = NULL;
        }
        return array;
    }
    array = malloc((items + 1) * sizeof(*array));
    buf = malloc(len + 1 - escapes);

    if (array == NULL || buf == NULL) {
        free(array);
        free(buf);
        return NULL;
    }
    items[0] = buf;
    k = 1;
    for (i = j = 0; i < len; i++) {
        if (line[i] == '\\') {
            if (++i == len)
                break;
            buf[j++] = line[i];
        } else
        if (line[i] == ' ') {
            buf[j++] = '\0';
            items[k++] = buf + j;
        } else {
            buf[j++] = line[i];
        }
    }
    buf[j] = '\0';
    items[k] = NULL;
    return items;
}