Tags: python, nlp, information-retrieval

How to vectorize dictionary of word tokens (bag of words implementation)


I'm creating my own bag-of-words algorithm but I'm stuck. So far I've tokenized the words (from a list of strings and a user-entered string) and put them in a dictionary. Now I would like to create word vectors where 0 indicates the word is not in the document and 1 means it's present. My idea is to create a zero vector whose size is the number of unique words, then make copies of that base vector, update each copy's values for the corresponding document, and store them in an array. This is the part where I'm stuck.

import more_itertools as mit
import re
from collections import OrderedDict

def get_vector(lexicon, text):
   
    # Creates a dictionary with initial value 0 for all unique words in the vocabulary
    zero_vector = OrderedDict((token, 0) for token in lexicon)
    corpus_tokens = list(mit.collapse(text.split()))

def BoW(corpus: list, search_doc: str):
    
    word_count = {}
    
    # Regex to grab words here because it's just a string
    search_doc_tokens = re.split(r'[-\s.,;!?]+', search_doc)
    
    # I have to do all this business here because it's a list of strings
    grab_words = [word.split() for word in corpus]
    corpus_tokens = list(mit.collapse(grab_words))
    
    # Concatenating the two lists
    vocabulary = corpus_tokens + search_doc_tokens
    
    # Filling dictionary
    for token in vocabulary:
        if token not in word_count:
            word_count[token] = 1
        else:
            word_count[token] += 1
                    
    
    # Unique words in vocab. Used to determine the size of the zero vector
    lexicon = sorted(set(vocabulary))
    zero_vector = OrderedDict((token, 0) for token in lexicon)
    
    print(zero_vector)

documents = ["This is a text document", "This is another text document", "Get the picture?"]
BoW(documents, "hello there") 
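
To make the goal concrete, the vector I'd want for the first document would look something like this (lexicon sorted, 1 = word present):

OrderedDict([('Get', 0), ('This', 1), ('a', 1), ('another', 0),
             ('document', 1), ('hello', 0), ('is', 1), ('picture?', 0),
             ('text', 1), ('the', 0), ('there', 0)])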

Solution

  • I think that you should construct the lexicon dictionary only from the corpus list.

    I think you can write something like this:

    import more_itertools as mit
    import re
    from collections import OrderedDict
    
    def get_vector(lexicon, text):
        zero_vector = OrderedDict((token, 0) for token in lexicon)
        corpus_tokens = list(mit.collapse(text.split()))
        for token in corpus_tokens:
            if token in zero_vector:
                zero_vector[token] = 1
        return zero_vector
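
    For a quick sanity check with tiny hypothetical inputs: get_vector(['a', 'text'], "a short text") returns OrderedDict([('a', 1), ('text', 1)]); 'short' is simply ignored because it is not in the lexicon.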
        
    
    def BoW(corpus: list, search_doc: str):
        
        word_count = {}
        
        # Regex to grab words here because it's just a string
        search_doc_tokens = re.split(r'[-\s.,;!?]+', search_doc)
        
        # I have to do all this business here because it's a list of strings
        grab_words = [word.split() for word in corpus]
        corpus_tokens = list(mit.collapse(grab_words))
        
        # Concatenating the two lists  (why???)
        vocabulary = corpus_tokens #  + search_doc_tokens
        
        # Filling dictionary
        for token in vocabulary:
            if token not in word_count:
                word_count[token] = 1
            else:
                word_count[token] += 1
                        
        
        # Unique words in vocab. Used to determine the size of the zero vector
        lexicon = sorted(set(vocabulary))
        
        for text in corpus:
            text_vector = get_vector(lexicon, text)
            print(text_vector)
            
        text_vector = get_vector(lexicon, search_doc)
        print(text_vector)
    

    But it would be much better to have the vector not as an ordered dict but as a NumPy array.

    To transform the ordered dict you can use something like this:

    import numpy as np
    tv_vec = np.array(list(text_vector.values()))
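
    Note that because lexicon is sorted and an OrderedDict preserves insertion order, every vector produced by get_vector has its 0/1 flags in the same positions, so these arrays are directly comparable and can be stacked into a matrix.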
    

    So the question is: why do you need this BoW? How do you want to construct the final matrix with the vectorized texts? Do you want to include all the corpus texts and search_doc together in the matrix?

    EDIT:

    I think you can do something like this, replacing the print loop at the end of BoW:

        corpus_mat = np.zeros((len(lexicon), len(corpus)))
        for ind, text in enumerate(corpus):
            text_vector = get_vector(lexicon, text)
            corpus_mat[:, ind] = np.array(list(text_vector.values()))
            
        text_vector = get_vector(lexicon, search_doc)
        text_vector = np.array(list(text_vector.values()))
        return corpus_mat, text_vector
    

    And then use corpus_mat and text_vector to compute similarity with a dot product:

    cm, tv = BoW(documents, "hello there") 
    print(cm.T @ tv)
    

    The output is going to be three zeros, as the search_doc text has no words in common with the corpus texts.
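
    If the query does share words with the corpus, the dot product counts how many lexicon words each document has in common with it. For example (assuming the EDIT version of BoW that returns corpus_mat and the query vector):

    cm, tv = BoW(documents, "text document")
    print(cm.T @ tv)  # [2. 2. 0.]: "text" and "document" occur in the first two documents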