Search code examples
Tags: c, string, split, string-matching, c-strings

Breaking up a string into a list of tokens using another string as a delimiter?


Let's say I have this string:

char *myTestString = "Hello AND test AND test2";

I want to break this down into the set { Hello, test, test2 }, which I can finally iterate over.

Or I have

char *myTestString2 = "Hi AND there AND test AND test2";

I want to break this down into the set { Hi, there, test, test2 }, which I can later iterate over.

How do I achieve this using C?

EDIT: Another example: splitting "Hello there AND test" should give the set { Hello there, test }. To clarify, "AND" is the delimiter here.


Solution

  • Here you are.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    /*
     * Splits s1 into words separated by spaces and tabs and returns every
     * word that is not exactly equal to the delimiter word s2.
     *
     * s1  source string to tokenize (not modified).
     * s2  delimiter word; only a whole word equal to s2 is skipped, so with
     *     s2 == "AND" the word "ANDROID" is kept as a token.
     *
     * Returns a dynamically allocated array of dynamically allocated strings
     * terminated by a null pointer, or NULL if any allocation fails (nothing
     * is leaked in that case). The caller frees every string and then the
     * array itself.
     */
    char ** split( const char *s1, const char *s2 )
    {
        const char *delim = " \t";
        const size_t len = strlen( s2 );
        size_t n = 0;                       /* tokens stored so far */

        char **tokens = malloc( sizeof *tokens );
        if ( tokens == NULL ) return NULL;
        tokens[0] = NULL;

        while ( *s1 )
        {
            s1 += strspn( s1, delim );      /* skip leading blanks */
            if ( *s1 == '\0' ) break;

            const char *p = s1;             /* start of the current word */
            s1 += strcspn( s1, delim );     /* one past its end */
            size_t word_len = ( size_t )( s1 - p );

            /* Compare the whole word, not just a prefix of it. */
            if ( word_len == len && memcmp( p, s2, len ) == 0 ) continue;

            /* Room for the new token plus the terminating null pointer. */
            char **tmp = realloc( tokens, ( n + 2 ) * sizeof *tokens );
            if ( tmp == NULL ) goto fail;
            tokens = tmp;

            char *word = malloc( word_len + 1 );
            if ( word == NULL ) goto fail;

            memcpy( word, p, word_len );
            word[word_len] = '\0';

            tokens[n++] = word;
            tokens[n] = NULL;
        }

        return tokens;

    fail:
        /* tokens[0..n-1] are the strings allocated so far. */
        for ( size_t i = 0; i < n; i++ ) free( tokens[i] );
        free( tokens );
        return NULL;
    }
    
    /*
     * Demo driver: splits a sample string on the word "AND", prints each
     * token on its own line and releases everything split() allocated.
     */
    int main(void) 
    {
        const char *s1 = "Hi AND there AND test AND test2";
        const char *s2 = "AND";
    
        char **tokens = split( s1, s2 );
    
        if ( tokens != NULL )
        {
            for ( char **p = tokens; *p != NULL; ++p )
            {
                puts( *p );
            }
    
            /* Test each pointer before freeing it: a pointer's value is
               indeterminate once the memory it refers to has been freed. */
            for ( char **p = tokens; *p != NULL; ++p )
            {
                free( *p );
            }
    
            free( tokens );
        }
    
        return 0;
    }
    

    The program output is

    Hi
    there
    test
    test2
    

    The function returns NULL if a memory allocation was not successful. Otherwise it returns a pointer to an array of `char *` elements, the last of which is a null pointer.

    The words in the source string are split on tabs and spaces. You can change the delimiters as you like.

    After your comment on my previous solution, it seems you need the following:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    /*
     * Splits s1 into the sub-strings that lie between occurrences of the
     * delimiter string s2.
     *
     * Leading spaces and tabs of each sub-string are skipped, consecutive
     * delimiters produce no empty tokens, and trailing blanks of a sub-string
     * are kept as they are. An empty s2 means "no delimiter": the remainder
     * of s1 after its leading blanks becomes a single token.
     *
     * s1  source string to tokenize (not modified).
     * s2  delimiter string, matched anywhere in s1 via strstr().
     *
     * Returns a dynamically allocated array of dynamically allocated strings
     * terminated by a null pointer, or NULL if any allocation fails (nothing
     * is leaked in that case). The caller frees every string and then the
     * array itself.
     */
    char ** split( const char *s1, const char *s2 )
    {
        const char *delim = " \t";
        const size_t len2 = strlen( s2 );
        size_t n = 0;                       /* tokens stored so far */

        char **tokens = malloc( sizeof *tokens );
        if ( tokens == NULL ) return NULL;
        tokens[0] = NULL;

        while ( *s1 )
        {
            /* Skip leading blanks and any run of adjacent delimiters.
               The len2 check keeps an empty delimiter from matching
               forever without advancing s1. */
            for ( ;; )
            {
                s1 += strspn( s1, delim );
                if ( len2 == 0 || strncmp( s1, s2, len2 ) != 0 ) break;
                s1 += len2;
            }

            if ( *s1 == '\0' ) break;

            /* s1 does not start with the delimiter here, so the token
               is never empty and the loop always makes progress. */
            const char *p = len2 != 0 ? strstr( s1, s2 ) : NULL;
            size_t len1 = p == NULL ? strlen( s1 ) : ( size_t )( p - s1 );

            /* Room for the new token plus the terminating null pointer. */
            char **tmp = realloc( tokens, ( n + 2 ) * sizeof *tokens );
            if ( tmp == NULL ) goto fail;
            tokens = tmp;

            char *token = malloc( len1 + 1 );
            if ( token == NULL ) goto fail;

            memcpy( token, s1, len1 );
            token[len1] = '\0';

            tokens[n++] = token;
            tokens[n] = NULL;

            /* Step over the token and the delimiter that ended it. */
            s1 += p == NULL ? len1 : len1 + len2;
        }

        return tokens;

    fail:
        /* tokens[0..n-1] are the strings allocated so far. */
        for ( size_t i = 0; i < n; i++ ) free( tokens[i] );
        free( tokens );
        return NULL;
    }
    
    /*
     * Demo driver: splits a sample string on the delimiter "AND", prints
     * each sub-string on its own line and releases everything split()
     * allocated.
     */
    int main(void) 
    {
        const char *s1 = "Hi there AND test test2";
        const char *s2 = "AND";
    
        char **tokens = split( s1, s2 );
    
        if ( tokens != NULL )
        {
            for ( char **p = tokens; *p != NULL; ++p )
            {
                puts( *p );
            }
    
            /* Test each pointer before freeing it: a pointer's value is
               indeterminate once the memory it refers to has been freed. */
            for ( char **p = tokens; *p != NULL; ++p )
            {
                free( *p );
            }
    
            free( tokens );
        }
    
        return 0;
    }
    

    The program output is

    Hi there 
    test test2
    

    You may also need to remove the trailing blanks of each extracted sub-string, which I hope you can do yourself. :)