Below is the full code:
import spacy
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding
def read_file(filepath, *, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        The decoded file contents.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()
# Read the raw text and run it through spaCy's small English pipeline.
moby_text = read_file('moby_dick.txt')
nlp = spacy.load('en_core_web_sm')
doc = nlp(moby_text)

# Lower-cased text of every spaCy token.
tokens = [tok.text.lower() for tok in doc]

# Discard any token that occurs as a substring of the punctuation/whitespace
# string below (`in` on a str is a substring test, not a per-character test).
tokens = [
    word
    for word in tokens
    if word not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
]
# Each training window holds 10 input tokens followed by 1 target token.
train_len = 10 + 1

# Slide a window of `train_len` tokens over the text, one token at a time.
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens))
]
# Learn a word -> integer index mapping from the token windows, then encode
# every window as a list of those indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

# Show the index -> word mapping for the first encoded window.
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

# One row per window.
sequences = np.array(sequences)

# Number of distinct words the tokenizer has seen.
# NOTE: Tokenizer indices start at 1 (0 is reserved), so the largest index
# that can appear in `sequences` is equal to this value.
vocabulary_size = len(tokenizer.word_counts)
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarize a two-layer LSTM classifier.

    Args:
        vocabulary_size: Used as the first argument of the Embedding layer
            and as the number of units in the softmax output layer.
        seq_len: Passed to the Embedding layer as ``input_length``.

    Returns:
        The compiled ``Sequential`` model (its summary is printed first).
    """
    layers = [
        Embedding(vocabulary_size, 25, input_length=seq_len),
        # First LSTM returns the full sequence so the second LSTM can consume it.
        LSTM(100, return_sequences=True),
        LSTM(100),
        Dense(100, activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    return model
# Every column except the last is the model input.
X = sequences[:, :-1]
# The last column is the target word index, one-hot encoded over
# `vocabulary_size` classes.
y = to_categorical(sequences[:, -1], num_classes=vocabulary_size)
The call to to_categorical above is where I get the following error. I don't understand why it happens, and even after reading many articles I still don't know how to solve it.
IndexError: index 2718 is out of bounds for axis 1 with size 2718
# Number of input tokens per sample (the column count of X).
seq_len = X.shape[1]

# Build the network and train it on the encoded windows.
model = create_model(vocabulary_size, seq_len)
model.fit(
    X,
    y,
    epochs=100,
    verbose=1,
)
I don't understand the error. I have searched for it and tried different ways to solve it, but nothing I found works. My guess is that it happens because list indices start at 0, so I tried this:
# Attempted workaround: shift the target indices down by one so they start
# at 0 before one-hot encoding. Only `y` is shifted; `X` is left unchanged.
y = y - 1
y = to_categorical(y, num_classes=vocabulary_size)
but this doesn't work either, because the model then raises the error below. So I am back to square one.
Node: 'sequential/embedding/embedding_lookup'
indices[13,9] = 2718 is not in [0, 2718)
[[{{node sequential/embedding/embedding_lookup}}]] [Op:__inference_train_function_5647]
So how can I solve it? Can someone please help me out? Thank you!!!
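The Keras Tokenizer doesn't use index 0; it starts counting at 1:
0 is a reserved index that won't be assigned to any word.
The word indices therefore run from 1 to len(tokenizer.word_counts), so both to_categorical and the Embedding layer need one extra slot to hold the largest index.
Try this: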
# Index 0 is reserved and word indices start at 1, so allow one slot more
# than the number of distinct words.
vocabulary_size = 1 + len(tokenizer.word_counts)