Search code examples
c++tokenize

Simple C++ tokenizer


I'm writing a program for a challenge on HackerRank, and I need to parse HRML, a markup language similar to HTML:

<tag1 value = "Hello World"></tag1>

As a part of the program I have a function which is supposed to fill a vector of strings with the string tokens. It works fine with the tags, but I also need to tokenize queries, which are in the following format:

tag1.tag2.tag3~attribute_name

The function behaves as if the string iterator stops advancing once it encounters a tilde: everything after the tilde (`name` in the test below) never shows up in the output. Here is the code:

#include<algorithm>
#include<cctype>
#include<iostream>
#include<string>
#include<vector>
using namespace std;

// Scans `str` character by character and appends tokens to `vector`.
// A token is a run of consecutive alphanumeric characters; every other
// character acts as a delimiter.
//
// vector: output container; tokens are appended to whatever it already holds.
// str:    the text to tokenize (taken by value).
void tokenize_string(vector<string>& vector, string str)
{
    // Accumulates the characters of the token currently being read
    string current_token;

    for (auto i = str.begin(); i != str.end(); i++)
    {

        if (isalnum(*i))
        {
            current_token += *i;
        }
        else
        {
            // A non-alphanumeric character was reached: store whatever has been
            // accumulated so far (possibly an empty string) and start a new token.
            // This is the only place in the scan where a token is pushed.
            vector.push_back(current_token);
            current_token = "";
        }
    }

    // Remove empty strings that the previous loop placed into the vector.
    // After each erase the scan restarts from vector.begin(); note that the
    // loop header's i++ still runs after that reset.
    for (auto i = vector.begin(); i != vector.end(); i++)
    {
         if (*i == "")
        {
            vector.erase(i);
            i = vector.begin();
        }
    }
} 
// Small manual test: tokenizes one HRML tag and one query, then prints each
// token list on its own line with the tokens separated by spaces.
int main()
{
    std::vector<std::string> tag_tokens;
    std::vector<std::string> query_tokens;

    tokenize_string(tag_tokens, "<tag1 name=\"Hello\">");
    tokenize_string(query_tokens, "tag1.tag2.tag3~name");

    // Tokens extracted from the tag
    for (const std::string& token : tag_tokens)
    {
        std::cout << token << ' ';
    }
    std::cout << '\n';

    // Tokens extracted from the query
    for (const std::string& token : query_tokens)
    {
        std::cout << token << ' ';
    }
    std::cout << '\n';

    // Wait for a key press so the console window stays open
    std::cin.get();
    return 0;
}

Solution

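  • This happens because the last token is never stored: a token is only pushed when a non-alphanumeric character is encountered, so whatever is still in current_token when the loop reaches str.end() is lost.
    Add vector.push_back(current_token); after the for loop, as shown below, so that the last token is included.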

    // Splits `str` into tokens and appends them to `vector`.
    // A token is a maximal run of alphanumeric characters; every other
    // character acts as a delimiter and is discarded.
    //
    // vector: output container; tokens are appended after its existing
    //         elements. On return it contains no empty strings at all
    //         (any that were already present are removed as well).
    // str:    the text to tokenize.
    void tokenize_string(std::vector<std::string>& vector, std::string str)
    {
        std::string current_token;

        for (const char ch : str)
        {
            // The <cctype> classifiers are only defined for values representable
            // as unsigned char (or EOF), so a plain char must be converted first.
            if (std::isalnum(static_cast<unsigned char>(ch)))
            {
                current_token += ch;
            }
            else
            {
                // A delimiter ends the current token (which may be empty)
                vector.push_back(current_token);
                current_token.clear();
            }
        }
        // Pushes the last token: the input need not end with a delimiter,
        // so the token still being accumulated has to be stored here.
        vector.push_back(current_token);

        // Remove the empty strings produced by leading, trailing or consecutive
        // delimiters. Erase-remove does this in one pass and avoids erasing
        // elements one by one while iterating over the container.
        vector.erase(std::remove(vector.begin(), vector.end(), std::string()),
                     vector.end());
    }