Tags: python, tensorflow, nlp, lstm, recurrent-neural-network

How to set up an LSTM to use n-grams instead of a fixed sequence length?


I currently have an LSTM that takes a fixed-length character sequence as input, which means it can only make predictions when the input is exactly that sequence length. Instead, I want the LSTM to work with n-grams so that I can predict full words.

Examples:

So for an input (sequence length = 10):

Input: "no sweet t"
Output (5 options): ['he ', 'o ', 'aste ', 'ime ', 'errible ']

What I want is for the input to be able to look like this:

Input: "No sweet"
Output: ['tea', 'taste', 'but', 'the', 'and']

That way I can predict full words and am not constrained by the sequence length.

My current code:

#Setup
import numpy as np
import tensorflow as tf
from numpy.core.multiarray import dtype
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM, Dropout
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Dense, Activation, Dropout, RepeatVector
from tensorflow.keras.optimizers import RMSprop
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pickle
import sys
import heapq
import seaborn as sns
from pylab import rcParams



#Loading the data
path = 'text_2.txt'
text = open(path, encoding='utf8').read().lower()
# print ('Corpus length: ',len(text))

#Preprocessing
#Finding all the unique characters in the corpus
chars = sorted(list(set(text)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

print ("unique chars: ",len(chars))

#Cutting the corpus into chunks of 10 chars, spacing the sequences by 3 characters
#We will additionally store the next character (the one we need to predict) for every sequence

SEQUENCE_LENGTH = 10
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - SEQUENCE_LENGTH, step):
    sentences.append(text[i:i+SEQUENCE_LENGTH])
    next_chars.append(text[i+SEQUENCE_LENGTH])
print ('num training examples: ',len(sentences))

#Generating features and labels.
#Using previously generated sequences and characters that need to be predicted to create one-hot encoded vectors

X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1



#Building the model

model = Sequential();
model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))


#Training
optimizer = RMSprop(lr= 0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

history = model.fit(X, y, validation_split=0.05, batch_size=128, epochs=1, shuffle=True).history

#Predicting

#Testing
def prepare_input(text):
    x = np.zeros((1, SEQUENCE_LENGTH, len(chars)))
    for t, char in enumerate(text):
        x[0, t, char_indices[char]] = 1
    return x
#The input sequence must be SEQUENCE_LENGTH (10) chars long, so the tensor has shape (1, 10, len(chars))



#The sample function
#This function allows us to ask our model what are the next probable characters (The heap simplifies the job)
def sample(preds, top_n = 3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return heapq.nlargest(top_n, range(len(preds)), preds.take)


#Prediction function
def predict_completion(text):
    original_text = text
    generalised = text
    completion = ''
    while True:
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_char = indices_char[next_index]

        text = text[1:] + next_char
        completion += next_char

        if len(original_text + completion) + 2 > len(original_text) and next_char == ' ':
            return completion

#This method wraps everything and allows us to predict multiple completions
def predict_completions(text, n = 3):
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    return [indices_char[idx] + predict_completion(text[1:] + indices_char[idx]) for idx in next_indices]
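
For reference, this is how I call these functions at the moment (only a sketch of my current usage; the seed has to be exactly SEQUENCE_LENGTH = 10 characters, which is the constraint I want to remove):

#Current usage: the seed must match SEQUENCE_LENGTH exactly
seed = "no sweet t"  # exactly 10 characters
print(predict_completions(seed, n=5))
#e.g. ['he ', 'o ', 'aste ', 'ime ', 'errible ']  (the example output shown above)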








Solution

  • To provide sequences of words instead of characters to an LSTM and have it predict words, the text has to be tokenized differently. Instead of splitting the document into sequences of n characters, split it into sequences of m words, and instead of a vocabulary of distinct characters (the index mappings), build a vocabulary of distinct words from your corpus. I have revised the provided code to illustrate this idea. Some of the parameters were changed for testing purposes (I created my own small text file for evaluation), but they can be adjusted for your document(s).
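
    As a quick toy illustration of the difference between the two vocabularies (the sentence here is made up; the revised code below does the same thing on the actual corpus):

    sample_text = "no sweet tea"
    chars = sorted(set(sample_text))          # character vocabulary: [' ', 'a', 'e', 'n', 'o', 's', 't', 'w']
    words = sorted(set(sample_text.split()))  # word vocabulary: ['no', 'sweet', 'tea']
    char_indices = {c: i for i, c in enumerate(chars)}  # one index per character
    word_indices = {w: i for i, w in enumerate(words)}  # one index per word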

    # Setup
    import heapq
    import re

    import numpy as np
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Activation, LSTM
    from tensorflow.keras.optimizers import RMSprop
    
    # Loading the data
    path = 'text_2.txt'
    text = open(path, encoding='utf8').read().lower()
    # print ('Corpus length: ',len(text))
    
    # Preprocessing
    # Removing punctuation and finding all the unique words in the corpus
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    words = sorted(list(set(text.split()))) # split into distinct words
    # define vocabulary
    word_indices = dict((c, i) for i, c in enumerate(words))
    indices_words = dict((i, c) for i, c in enumerate(words))
    
    
    print("unique chars: ", len(words))
    
    # Building training sequences of SEQUENCE_LENGTH words, stepping by `step` words
    # We additionally store the next word (the one we need to predict) for every sequence

    SEQUENCE_LENGTH = 3
    step = 3
    sentences = []
    next_words = []
    for i in range(0, len(words) - SEQUENCE_LENGTH, step):
        sentences.append(words[i:i + SEQUENCE_LENGTH])
        next_words.append(words[i + SEQUENCE_LENGTH])
    print('num training examples: ', len(sentences))
    
    # Generating features and labels.
    # Using the previously generated sequences and the words that need to be predicted to create one-hot encoded vectors
    
    X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(words)), dtype=bool)  # plain bool: the np.bool alias is deprecated/removed in newer NumPy
    y = np.zeros((len(sentences), len(words)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, word in enumerate(sentence):
            X[i, t, word_indices[word]] = 1
        y[i, word_indices[next_words[i]]] = 1
    
    # Building the model
    
    model = Sequential()
    model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, len(words))))
    model.add(Dense(len(words)))
    model.add(Activation('softmax'))
    
    # Training
    optimizer = RMSprop(learning_rate=0.01)  # 'lr' was renamed to learning_rate in recent Keras versions
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    history = model.fit(X, y, validation_split=0.05, batch_size=128, epochs=1, shuffle=True).history
    
    
    # Predicting
    
    # Testing
    def prepare_input(text):
        x = np.zeros((1, SEQUENCE_LENGTH, len(words)))
        # keep only the most recent SEQUENCE_LENGTH words so longer inputs
        # don't overflow the tensor (a simple sliding-window guard)
        for t, word in enumerate(text.split()[-SEQUENCE_LENGTH:]):
            x[0, t, word_indices[word]] = 1
        return x


    # Inputs shorter than SEQUENCE_LENGTH words simply leave the remaining timesteps as zeros
    
    
    # The sample function
    # This function allows us to ask our model what the most probable next words are (the heap simplifies the job)
    def sample(preds, top_n=3):
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds)
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        return heapq.nlargest(top_n, range(len(preds)), preds.take)
    
    
    # Prediction function
    def predict_completion(text):
        prediction = []
        while len(prediction) < SEQUENCE_LENGTH:
            x = prepare_input(text)
            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, top_n=1)[0]
            next_word = indices_words[next_index]
    
            text = " ".join([text, next_word])
            prediction += [next_word]
    
        return " ".join(prediction)
    
    
    # This method wraps everything and allows us to predict multiple completions
    def predict_completions(text, n=3):
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_indices = sample(preds, n)
        return [indices_words[idx] + " " + predict_completion(" ".join([text, indices_words[idx]]))
                for idx in next_indices]
    
    print(predict_completion("hello"))
    
    

    The text file (text_2.txt) used to evaluate the tokenization step and verify that it can be fed to the model was:

    hello. how are you today? i am doing well thank you for asking. i like sweet tea.
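
    Given that file, a quick sanity check of the word-level prediction functions might look like this (only a sketch; the exact words returned will vary, since the model is trained for a single epoch on a tiny corpus):

    # The seed is now split into words, so it is no longer tied to a fixed number of characters.
    # Every seed word must exist in the vocabulary built from text_2.txt.
    print(predict_completion("i like"))        # continues the seed with up to SEQUENCE_LENGTH words
    print(predict_completions("i like", n=3))  # top-3 candidate next words, each with its continuation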
    

    Update

    I may have misunderstood part of the question. Regarding how to train on n-grams specifically: you can compute the n-grams (e.g., bigrams and trigrams) and add them to your training data, using the word that follows each n-gram as the prediction target. The tokenization changes are the same as above, but the preprocessing of the corpus would be revised as follows:

    def compute_n_gram(words, n):
        # compute n-grams
        return [words[i:i+n] for i in range(len(words)-n+1)]
    
    
    def compute_n_gram_with_next_word(words, n):
        n_gram = compute_n_gram(words, n)[:-1]
        next_words = [words[i+n] for i in range(len(n_gram))]
        return n_gram, next_words
    
    words = text.split()
    vocab = sorted(list(set(words)))  # determine distinct words from corpus
    vocab_size = len(vocab)
    
    bigrams, next_word_bigrams = compute_n_gram_with_next_word(words, 2)
    trigrams, next_word_trigrams = compute_n_gram_with_next_word(words, 3)
    sentences = bigrams + trigrams
    next_words = next_word_bigrams + next_word_trigrams
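
    For example, on a toy word list the helpers above produce the following (each n-gram is paired with the word that immediately follows it):

    toy = ["i", "like", "sweet", "tea"]
    print(compute_n_gram(toy, 2))
    # -> [['i', 'like'], ['like', 'sweet'], ['sweet', 'tea']]
    print(compute_n_gram_with_next_word(toy, 2))
    # -> ([['i', 'like'], ['like', 'sweet']], ['sweet', 'tea'])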
    

    So now, rather than taking every subsentence of three words and predicting the fourth, we use the bigrams and trigrams and predict the third or fourth word, respectively. The same logic applies if you decide to use higher orders (four-grams, five-grams, etc.). SEQUENCE_LENGTH has to be greater than or equal to the largest n-gram size you use, since it controls the size of the input (any shorter input is padded with zeros, as the short check after the full listing below illustrates). With the current prediction code it also controls the output size, because the generated text can't exceed that length (unless you opt for a windowing approach). The entire updated code is similar to the previous snippet, but for completeness:

    # Setup
    import heapq
    import re

    import numpy as np
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Activation, LSTM
    from tensorflow.keras.optimizers import RMSprop
    
    def compute_n_gram(words, n):
        # compute n-grams
        return [words[i:i+n] for i in range(len(words)-n+1)]
    
    
    def compute_n_gram_with_next_word(words, n):
        n_gram = compute_n_gram(words, n)[:-1]
        next_words = [words[i+n] for i in range(len(n_gram))]
        return n_gram, next_words
    
    # Loading the data
    path = 'text_2.txt'
    text = open(path, encoding='utf8').read().lower()
    # print ('Corpus length: ',len(text))
    
    # Preprocessing
    # Removing punctuation and finding all the unique words in the corpus
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    words = text.split()
    vocab = sorted(list(set(words)))  # determine distinct words from corpus
    vocab_size = len(vocab)
    # define vocabulary
    word_indices = dict((c, i) for i, c in enumerate(vocab))
    indices_words = dict((i, c) for i, c in enumerate(vocab))
    
    
    print("unique chars: ", len(words))
    
    # Building the n-gram training sequences: bigrams and trigrams from the corpus,
    # each paired with the word that immediately follows it (the prediction target)
    
    SEQUENCE_LENGTH = 10
    bigrams, next_word_bigrams = compute_n_gram_with_next_word(words, 2)
    trigrams, next_word_trigrams = compute_n_gram_with_next_word(words, 3)
    sentences = bigrams + trigrams
    next_words = next_word_bigrams + next_word_trigrams
    print('num training examples: ', len(sentences))
    
    # Generating features and labels.
    # Using the previously generated sequences and the words that need to be predicted to create one-hot encoded vectors
    
    X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(vocab)), dtype=bool)  # plain bool: the np.bool alias is deprecated/removed in newer NumPy
    y = np.zeros((len(sentences), len(vocab)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, word in enumerate(sentence):
            X[i, t, word_indices[word]] = 1
        y[i, word_indices[next_words[i]]] = 1
    
    # Building the model
    
    model = Sequential()
    model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, vocab_size)))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    
    # Training
    optimizer = RMSprop(learning_rate=0.01)  # 'lr' was renamed to learning_rate in recent Keras versions
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    history = model.fit(X, y, validation_split=0.05, batch_size=128, epochs=1, shuffle=True).history
    
    
    # Predicting
    
    # Testing
    def prepare_input(text):
        x = np.zeros((1, SEQUENCE_LENGTH, vocab_size))
        # keep only the most recent SEQUENCE_LENGTH words so longer inputs
        # don't overflow the tensor (a simple sliding-window guard)
        for t, word in enumerate(text.split()[-SEQUENCE_LENGTH:]):
            x[0, t, word_indices[word]] = 1
        return x


    # Inputs shorter than SEQUENCE_LENGTH words simply leave the remaining timesteps as zeros (zero padding)
    
    
    # The sample function
    # This function allows us to ask our model what the most probable next words are (the heap simplifies the job)
    def sample(preds, top_n=3):
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds)
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        return heapq.nlargest(top_n, range(len(preds)), preds.take)
    
    
    # Prediction function
    def predict_completion(text):
        prediction = []
        while len(prediction) < SEQUENCE_LENGTH:
            x = prepare_input(text)
            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, top_n=1)[0]
            next_word = indices_words[next_index]
    
            text = " ".join([text, next_word])
            prediction += [next_word]
    
        return " ".join(prediction)
    
    
    # This method wraps everything and allows us to predict multiple completions
    def predict_completions(text, n=3):
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_indices = sample(preds, n)
        return [indices_words[idx] + " " + predict_completion(" ".join([text, indices_words[idx]]))
                for idx in next_indices]
    
    print(predict_completion("hello"))