python, tensorflow, machine-learning, keras, word2vec

How do I adapt my code to make a CNN model compatible with a higher-dimensional word embedding?


I have been following an online tutorial on 1D CNNs for text classification. I have got the model to work with a self-trained 100-dimensional word2vec embedding, but I want to see how the model would perform when given a higher-dimensional word embedding.

I have tried downloading a 300-dimensional word2vec model, pointing the CNN code at its .txt file, and changing every dimension from 100 to 300. The model runs but produces bad results: the accuracy is 'nan' and the loss is 0.000 for every epoch.

What would I have to change for the model to work with the 300-dimensional word2vec model? Thanks, I have added the code below:

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 
# turn a doc into clean tokens
def clean_doc(doc, vocab):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens
 
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_trian and filename.startswith('cv9'):
            continue
        if not is_trian and not filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc, vocab)
        # add to list
        documents.append(tokens)
    return documents
 
# load embedding as a dict
def load_embedding(filename):
    # load embedding into memory, skip first line
    file = open(filename,'r')
    lines = file.readlines()[1:]
    file.close()
    # create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
        # key is string word, value is numpy array for vector
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding
 
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, 100))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        weight_matrix[i] = embedding.get(word)
    return weight_matrix
 
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
 
# load all training reviews
positive_docs = process_docs('txt_sentoken/pos', vocab, True)
negative_docs = process_docs('txt_sentoken/neg', vocab, True)
train_docs = negative_docs + positive_docs
 
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)
 
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels
ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])
 
# load all test reviews
positive_docs = process_docs('txt_sentoken/pos', vocab, False)
negative_docs = process_docs('txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array([0 for _ in range(100)] + [1 for _ in range(100)])
 
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1
 
# load embedding from file
raw_embedding = load_embedding('embedding_word2vec.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# create the embedding layer
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=max_length, trainable=False)
 
# define model
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Solution

  • If you are using 300-dimensional vectors, you need to change two things in your code. This line:

    weight_matrix = zeros((vocab_size, 100))
    

    To:

    weight_matrix = zeros((vocab_size, 300))
    

    And this line:

    embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=max_length, trainable=False)
    

    To:

    embedding_layer = Embedding(vocab_size, 300, weights=[embedding_vectors], input_length=max_length, trainable=False)
    

    That way, each integer representing a word will be mapped to its 300-dimensional word vector.
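
    If you want to avoid keeping those two numbers in sync by hand, you could also read the dimension off the loaded embedding itself. A minimal sketch, assuming raw_embedding is the dict returned by load_embedding; you would pass embedding_dim into get_weight_matrix (or compute it there) so the matrix and the layer always agree:

    # infer the vector size from any entry of the loaded embedding dict
    embedding_dim = len(next(iter(raw_embedding.values())))

    # then use embedding_dim in both places instead of a hard-coded 300
    weight_matrix = zeros((vocab_size, embedding_dim))          # inside get_weight_matrix
    embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_vectors], input_length=max_length, trainable=False)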

    Update 1

    If you download the pretrained w2v model from gensim:

    import gensim.downloader as api
    
    wv = api.load('word2vec-google-news-300')
    
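
    The object returned by api.load is a gensim KeyedVectors model, so before wiring it into the network you can sanity-check its dimensionality and lookups with the standard gensim attributes, for example:

    print(wv.vector_size)      # 300 for this model
    print(wv['movie'].shape)   # (300,) -- the raw vector for one word
    print('movie' in wv)       # True if the word is in the pretrained vocabulary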

    This seems to work:

    from string import punctuation
    from os import listdir
    from numpy import array
    from numpy import asarray
    from numpy import zeros
    import numpy as np
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import Flatten
    from keras.layers import Embedding
    from keras.layers.convolutional import Conv1D
    from keras.layers.convolutional import MaxPooling1D
    
    
    # load doc into memory
    def load_doc(filename):
        # open the file as read only
        file = open(filename, 'r')
        # read all text
        text = file.read()
        # close the file
        file.close()
        return text
    
    
    # turn a doc into clean tokens
    def clean_doc(doc, vocab):
        # split into tokens by white space
        tokens = doc.split()
        # remove punctuation from each token
        table = str.maketrans('', '', punctuation)
        tokens = [w.translate(table) for w in tokens]
        # filter out tokens not in vocab
        tokens = [w for w in tokens if w in vocab]
        tokens = ' '.join(tokens)
        return tokens
    
    
    # load all docs in a directory
    def process_docs(directory, vocab, is_trian):
        documents = list()
        # walk through all files in the folder
        for filename in listdir(directory):
            # skip any reviews in the test set
            if is_trian and filename.startswith('cv9'):
                continue
            if not is_trian and not filename.startswith('cv9'):
                continue
            # create the full path of the file to open
            path = directory + '/' + filename
            # load the doc
            doc = load_doc(path)
            # clean doc
            tokens = clean_doc(doc, vocab)
            # add to list
            documents.append(tokens)
        return documents
    
    
    # load embedding as a dict
    def load_embedding(filename):
        # load embedding into memory, skip first line
        file = open(filename, 'r')
        lines = file.readlines()[1:]
        file.close()
        # create a map of words to vectors
        embedding = dict()
        for line in lines:
            parts = line.split()
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')
        return embedding
    
    
    # create a weight matrix for the Embedding layer from a loaded embedding
    def get_weight_matrix(embedding, vocab):
        # total vocabulary size plus 0 for unknown words
        vocab_size = len(vocab) + 1
        # define weight matrix dimensions with all 0
        weight_matrix = zeros((vocab_size, 300))
        # step vocab, store vectors using the Tokenizer's integer mapping
        for word, i in vocab.items():
            # fall back to a small random vector for words the pretrained model does not contain
            try:
                weight_matrix[i] = embedding[word]
            except KeyError:
                weight_matrix[i] = np.random.uniform(size=300)
        return weight_matrix
    
    # load the vocabulary
    vocab_filename = 'vocab.txt'
    vocab = load_doc(vocab_filename)
    vocab = vocab.split()
    vocab = set(vocab)
    
    # load all training reviews
    positive_docs = process_docs('/content/txt_sentoken/pos', vocab, True)
    negative_docs = process_docs('/content/txt_sentoken/neg', vocab, True)
    train_docs = negative_docs + positive_docs
    
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(train_docs)
    # pad sequences
    max_length = max([len(s.split()) for s in train_docs])
    Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define training labels
    ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])
    
    # load all test reviews
    positive_docs = process_docs('/content/txt_sentoken/pos', vocab, False)
    negative_docs = process_docs('/content/txt_sentoken/neg', vocab, False)
    test_docs = negative_docs + positive_docs
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(test_docs)
    # pad sequences
    Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define test labels
    ytest = array([0 for _ in range(100)] + [1 for _ in range(100)])
    
    # define vocabulary size (largest integer value)
    vocab_size = len(tokenizer.word_index) + 1
    
    # build the weight matrix directly from the gensim vectors, in the Tokenizer's integer order
    embedding_vectors = get_weight_matrix(wv, tokenizer.word_index)
    # create the embedding layer
    embedding_layer = Embedding(vocab_size, 300, weights=[embedding_vectors], input_length=max_length, trainable=False)
    
    # define model
    model = Sequential()
    model.add(embedding_layer)
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    print(model.summary())
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit network
    model.fit(Xtrain, ytrain, epochs=10, verbose=2)
    # evaluate
    loss, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy: %f' % (acc * 100))
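
    As an aside, the nan you saw after switching to 300 dimensions most likely comes from the original get_weight_matrix: embedding.get(word) returns None for every vocabulary word the downloaded model does not contain, and that None ends up as a nan row (or an error, depending on the NumPy version) in the float weight matrix, which then poisons the loss. Your self-trained 100-dimensional model presumably covered the whole vocabulary, so the problem never showed up there. The try/except above sidesteps it by substituting a small random vector for missing words. A quick check you could run against the matrix your original code produced (a minimal sketch, assuming that matrix is still called embedding_vectors):

    import numpy as np

    # count rows that ended up as nan, i.e. vocabulary words missing from the pretrained model
    bad_rows = np.isnan(embedding_vectors).any(axis=1).sum()
    print('rows containing nan:', bad_rows)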