Tags: python, pandas, pos-tagger, hidden-markov-models, viterbi

Why does Viterbi algorithm (POS tagging) always predict one tag?


Here is my HMM model class:

import numpy as np
import pandas as pd
import nltk
from collections import OrderedDict, deque

class HiddenMarkovModel:
    def __init__(self):
    
        pass
        
    def fit(self, train_tokens_tags_list):
        """
        train_tokens_tags_list: list of sentences, each a list of (word, tag) pairs (training data)
        """
        tags = [tag for sent in train_tokens_tags_list
                for (word, tag) in sent]
        words = [word for sent in train_tokens_tags_list
                 for (word, tag) in sent]
        
        tag_num = pd.Series(data = nltk.FreqDist(tags)).sort_index()
        word_num = pd.Series(data = nltk.FreqDist(words)).sort_values(ascending=False)
         
        self.tags = tag_num.index
        self.words = word_num.index
        
        A = pd.DataFrame({'{}'.format(tag) : [0] * len(tag_num) for tag in tag_num.index}, index=tag_num.index)
        B = pd.DataFrame({'{}'.format(tag) : [0] * len(word_num) for tag in tag_num.index}, index=word_num.index)
        
        # Compute matrices A (tag -> next tag) and B (word given tag) from raw counts
        
        # sent - sentence
        # sent[i][0] - the i-th word of the sentence, sent[i][1] - the i-th tag
        for sent in train_tokens_tags_list:
            for i in range(len(sent)):
                B.loc[sent[i][0], sent[i][1]] += 1 # emission count for the i-th (word, tag) pair
                if len(sent) - 1 != i: # the last tag has no next tag
                    A.loc[sent[i][1], sent[i + 1][1]] += 1 # transition count: tag -> next tag
                
        
        # now to probabilities
        
        # normalize each row, i.e. over all possible next tags
        A = A.divide(A.sum(axis=1), axis=0)
        
        # normalize each column, i.e. over all words for a given tag
        B = B / np.sum(B, axis=0)
        
        self.A = A
        self.B = B
        
        return self
        
    
    def predict(self, test_tokens_list):
        """
        test_tokens_list : list of sentences, each a list of words (test data)
        """
        predict_tags = OrderedDict({i : np.array([]) for i in range(len(test_tokens_list))})
        
        for i_sent in range(len(test_tokens_list)):
            
            current_sent = test_tokens_list[i_sent] # current sentence
            len_sent = len(current_sent) # length of the sentence
            
            q = np.zeros(shape=(len_sent + 1, len(self.tags)))
            q[0] = 1 # zero state (equal distribution for all s)
            back_point = np.zeros(shape=(len_sent + 1, len(self.tags))) # argmax back-pointers
            
            for t in range(len_sent):
                
                # if we haven't seen this word during training,
                # substitute the most frequent word of the most common tag (NOUN)
                # (findtags and brown_tagged_words are assumed to be defined elsewhere)
                if current_sent[t] not in self.words:
                    # print('this word is not in the dictionary', current_sent[t])
                    tagdict = findtags('NOUN', brown_tagged_words)
                    current_sent[t] = tagdict['NOUN'][0][0]
                    
                # for each possible current tag s, use max to choose the best previous tag
                for i_s in range(len(self.tags)):
                    # i_s - index for tag
                    s = self.tags[i_s] # tag
                    
                    # formula (1): score of the best path ending in tag s at step t + 1
                    probability = q[t][i_s] * self.A.loc[:, s] * self.B.loc[current_sent[t], s]

                    q[t + 1][i_s] = np.max(probability)

                    # argmax of formula (1): remember which previous tag gave the maximum,
                    # so the tag sequence can be rebuilt during the back-trace
                    back_point[t + 1][i_s] = probability.reset_index()[s].idxmax() # positional index
                    
            back_point = back_point.astype('int')

            # back-trace: collect the tags and restore their original order
            back_tag = deque()
            current_tag = np.argmax(q[len_sent])

            for t in range(len_sent, 0, -1):
                back_tag.appendleft(self.tags[current_tag])
                current_tag = back_point[t, current_tag]
             
            predict_tags[i_sent] = np.array(back_tag)
        
        
        return predict_tags

Most of this code was already written before I got it; my job was to fill in some gaps (above all in the Viterbi algorithm, where the q and back_point matrices are computed). But I think I did something wrong, because my model always predicts something like this:

OrderedDict([(0, array(['.', '.', '.'], dtype='<U1'))]) 

In the example above I gave it this sentence: [['he', 'can', 'stay']]

I've checked: all of these words are in the training data.
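
After fitting, a check along these lines returns True (a small sketch, assuming the class above):

model = HiddenMarkovModel().fit(train_sents)

# every word of the test sentence is known to the model
print(all(word in model.words for word in ['he', 'can', 'stay']))  # True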

I've printed out matrices A and B. Here they are:

A:

[screenshot of the transition matrix A]

B:

[screenshot of the emission matrix B]

This shows that the first part of the model is working fine.
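
One quick sanity check of the fit step (using the train_sents split shown further down) is that every row of A and every column of B sums to 1; a minimal sketch:

model = HiddenMarkovModel().fit(train_sents)

# transition probabilities: each row of A sums to 1 over all possible next tags
print(model.A.sum(axis=1))

# emission probabilities: each column of B sums to 1 over all words
print(model.B.sum(axis=0))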

These are the formulas I had for help (the Viterbi recursion):

formula (1):    q[t + 1][s] = max over s' of ( q[t][s'] * A[s', s] * B[word[t], s] )

back-pointer:   back_point[t + 1][s] = argmax over s' of ( q[t][s'] * A[s', s] * B[word[t], s] )
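
To make formula (1) concrete, here is a minimal NumPy sketch of one step of the recursion for a single tag s (three tags, made-up numbers purely for illustration):

import numpy as np

q_t = np.array([0.10, 0.30, 0.05])      # q[t]: best path score ending in each tag at step t
A_col_s = np.array([0.20, 0.50, 0.10])  # A[:, s]: transition probability from every tag into s
b_word_s = 0.40                         # B[word[t], s]: emission probability of the current word under s

scores = q_t * A_col_s * b_word_s       # formula (1) before taking the max
q_next_s = scores.max()                 # q[t + 1][s]
best_prev = scores.argmax()             # back_point[t + 1][s]

print(q_next_s, best_prev)              # ~0.06 and 1: tag index 1 is the best predecessor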

This is the data:

from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(tagset="universal", categories='humor')

my_brown_tagged_sents = []
for sent in brown_tagged_sents:
    my_brown_tagged_sents.append(list(map(lambda x: (x[0].lower(), x[1]), sent)))
my_brown_tagged_sents = np.array(my_brown_tagged_sents)

from sklearn.model_selection import train_test_split
train_sents, test_sents = train_test_split(my_brown_tagged_sents, random_state=0, test_size=0.1)
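
For completeness, the model is then trained and queried roughly like this (a sketch using the class and the example sentence from above):

model = HiddenMarkovModel().fit(train_sents)

# the example from above: one test sentence given as a list of words
print(model.predict([['he', 'can', 'stay']]))
# currently prints: OrderedDict([(0, array(['.', '.', '.'], dtype='<U1'))])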

What could be wrong with my model?


Solution

  • The probability should be computed like this:

    probability = q[t, :] * self.A.loc[:, s] * self.B.loc[current_sent[t], s]

    instead of this:

    probability = q[t][i_s] * self.A.loc[:, s] * self.B.loc[current_sent[t], s]

    q[t, :] is the vector of scores of all possible previous tags, so its element-wise
    product with the transition column self.A.loc[:, s] lets np.max and the argmax step
    actually compare predecessor tags, exactly as formula (1) requires. q[t][i_s] is a
    single scalar: every predecessor is scaled by the same number, the back-pointers no
    longer depend on the path scores, and the tagger collapses to the same tag for the
    whole sentence.
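
To see the difference concretely, here is a small self-contained sketch (plain NumPy, two tags, made-up numbers purely for illustration) comparing the two versions of one Viterbi step:

import numpy as np

A = np.array([[0.9, 0.1],     # A[prev, next]: transition probabilities
              [0.2, 0.8]])
b = np.array([0.3, 0.2])      # emission probability of the current word under each tag
q_t = np.array([0.05, 0.50])  # scores of the best paths ending in each tag at step t

for i_s in range(2):
    correct = q_t * A[:, i_s] * b[i_s]     # vector over all previous tags (the fix)
    wrong = q_t[i_s] * A[:, i_s] * b[i_s]  # scalar q_t[i_s] -- the original bug

    # the fixed version picks the predecessor with the best path score;
    # the buggy version's argmax depends only on the transition column A[:, i_s]
    print(i_s, correct.argmax(), wrong.argmax())

# prints "0 1 0" and "1 1 1": for tag 0 the two versions already disagree
# about the best predecessor, which is how the back-pointers go wrong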