Search code examples
carraysstringparsingnested-loops

How to Tokenize string[array]?


I need to tokenize a string stored in an array: I need just three words, ignoring all tabs '\t' and spaces ' '.

the array line[] is just a test case.

I debugged my code: the first array (which is supposed to hold only the first word) got filled with spaces and letters from all three words, instead of stopping after the first word when a tab or space is met. My program also crashed; I suspect it is going out of the array bounds.

What am I doing wrong?

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Attempts to split `line` into three words (array1, array2, array3),
// treating spaces and tabs as separators, and then prints the three arrays.
// NOTE(review): as written this does not do that - see the notes below on
// the missing '\0' checks, the stray ';' and the missing bounds checks.
int main()
{
    char line[] = "         CLOOP       LDA             buffer,x";

    // Initialising with "" zero-fills all 20 bytes, so each array starts out
    // as an empty string and array[0] == '\0' means "not filled yet".
    char array1[20] ="";
    char array2[20] ="";
    char array3[20] ="";

    int i = 0;
    // The first clause `i` is an expression with no effect; i is already 0.
    for( i ; i<strlen(line) ; i++)
    {
        // Entered when line[i] starts a word. None of the loops below test
        // for '\0', so on the last word i can advance past the end of line.
        while ( (line[i] != ' ') && (line[i] != '\t'))
        {

            if(array1[0] == '\0')
            {
                int j = 0;

                // Copies up to the next space/tab; j is never checked
                // against the array size (20).
                while(line[i] != ' ' && line[i] != '\t')
                {
                    array1[j] = line[i];
                    i++;
                    j++;
                }
            }

            // The ';' right after the condition ends the if statement, so
            // the braces below are a plain block that runs on every pass,
            // whether or not array2 is still empty.
            if(array2[0] =='\0');
            {
                int k = 0;

                // Same copy loop as above: no '\0' check, no bound on k.
                while(line[i] != ' ' && line[i] != '\t')
                {
                    array2[k] = line[i];
                    i++;
                    k++;  
                }   
            }

            if(array3[0] == '\0')
            {
                int g = 0;

                // Same copy loop as above: no '\0' check, no bound on g.
                while(line[i] != ' ' && line[i] != '\t')
                {
                    array3[g] = line[i];
                    i++;
                    g++;
                }

            }

        }
    }

    printf("array 1: %s\n array2: %s\n array3: %s\n", array1, array2, array3);

    return(0);
}

Solution

  • You are over-complicating things. First of all, it is difficult to fill all 3 arrays at the same time. The processing of one token should be completely finished before moving on to the next token.

    I would propose to "eat" all the white spaces before starting to process a token. That is done by:

    // skip leading delimiters
    // While the flag is set, each delimiter character is dropped via
    // `continue`. The first non-delimiter clears the flag, so the code that
    // follows this check gets to process that character.
    if( skip_leading_delimiters )
    {
         if( is_delimiter( delimiters, line[i]) ) continue;
         skip_leading_delimiters = 0;
    }
    

    After a token is processed, you can move on to the next token and process it. I tried to preserve your concept and approach as much as I could. The number of while loops has been reduced to 0, since the "// skip leading delimiters" section takes care of it.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    /*
     * Report whether character c is a member of a delimiter set.
     *
     * delimiters: NUL-terminated string; each of its characters is a delimiter.
     * c:          character to look up.
     *
     * Returns 1 when strchr() finds c in delimiters, 0 otherwise.
     * Note: strchr() also matches the terminating NUL, so c == '\0' yields 1.
     */
    int is_delimiter(const char * delimiters, char c)
    {
        return strchr(delimiters, c) != NULL;
    }
    
    /*
     * Split a fixed test line into at most three tokens separated by spaces
     * and/or tabs, then print the tokens.
     *
     * The line is scanned once, character by character. At most one of
     * con1/con2/con3 is set at a time and selects the array being filled.
     * skip_leading_delimiters makes the scan swallow the run of delimiters
     * in front of each token. A token longer than its array can hold is
     * truncated instead of overflowing the array.
     *
     * Returns 0.
     */
    int main(void)
    {
        char line[] = "         CLOOP       LDA             buffer,x";

        // Start as empty strings so printing is well defined even when the
        // line contains fewer than three tokens.
        char array1[20] = "";
        char array2[20] = "";
        char array3[20] = "";

        // conN != 0 means token N is the one currently being collected.
        int con1 = 1;
        int con2 = 0;
        int con3 = 0;

        size_t i = 0;
        size_t j = 0;                        // write index into the current array

        int skip_leading_delimiters = 1;
        const char * delimiters = " \t";     // space and tab
        const size_t len = strlen(line);     // computed once, not per iteration

        for(i = 0; i < len; i++)
        {
            // skip leading delimiters
            if( skip_leading_delimiters )
            {
                if( is_delimiter( delimiters, line[i]) ) continue;
                skip_leading_delimiters = 0;
            }

            // From here on the current token has already started: the first
            // character to get past the check above is never a delimiter.

            if(con1)
            {
                if( !is_delimiter( delimiters, line[i]) )
                {
                    // Keep one byte free for the terminator.
                    if(j < sizeof array1 - 1)
                    {
                        array1[j] = line[i];
                        j++;
                        array1[j] = 0;
                    }
                }
                else
                {
                    // Token 1 is complete; switch to token 2.
                    con1 = 0;
                    con2 = 1;
                    skip_leading_delimiters = 1;
                    j = 0;
                    continue;
                }
            }

            if(con2)
            {
                if( !is_delimiter( delimiters, line[i]) )
                {
                    if(j < sizeof array2 - 1)
                    {
                        array2[j] = line[i];
                        j++;
                        array2[j] = 0;
                    }
                }
                else
                {
                    // Token 2 is complete; switch to token 3.
                    con2 = 0;
                    con3 = 1;
                    skip_leading_delimiters = 1;
                    j = 0;
                    continue;
                }
            }

            if(con3)
            {
                if( !is_delimiter( delimiters, line[i]) )
                {
                    if(j < sizeof array3 - 1)
                    {
                        array3[j] = line[i];
                        j++;
                        array3[j] = 0;
                    }
                }
                else
                {
                    // All three tokens collected; the rest of the line is ignored.
                    con3 = 0;
                    break;
                }
            }
        }

        printf(" array1: %s\n array2: %s\n array3: %s\n", array1, array2, array3);

        return(0);
    }
    

    Output:

     array1: CLOOP                                                                                                                               
     array2: LDA                                                                                                                                 
     array3: buffer,x