Tags: python, for-loop, nlp, nltk

Converting Sentences to a Matrix for NLP


I am attempting to take a set of sentences, build a dictionary of the possible terms they contain, and then convert the sentences into a matrix: each row represents a sentence (Sentence 1, Sentence 2, etc.) and each column represents a possible word, with a 1 or 0 indicating whether that word appears in the corresponding sentence. Below is what I've done so far. The first half is working as intended (I believe), but there is clearly something wrong with my logic in the section titled "Find word tokens in sentences and mark for future analysis" that I can't quite figure out. I also suspect there is a better way to do this in NLTK, but I haven't been able to find it yet.
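
For concreteness, with the two example sentences used below, the goal is a matrix along these lines (the exact column order depends on how the vocabulary list comes out, so treat the ordering here as illustrative):

                  I  run  faster  own  four  computer
    Sentence 1    1   1     1      0    0      0
    Sentence 2    1   0     0      1    1      1

(Note that "I" survives the stop-word filter as the code stands, because the tokens are never lowercased while NLTK's stop word list is all lowercase.)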

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import numpy as np
import string

# The punkt tokenizer models, WordNet data and stop word lists must be
# downloaded once, e.g. nltk.download('punkt'), nltk.download('wordnet'),
# nltk.download('stopwords')

text_array = ["I run faster than you","I own four computers"]

# =============================================================================
# Create an array of possible words that has punctuation removed and is lemmatized
# =============================================================================

wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

all_words = []

for sentence in text_array:
    #Take every word in each sentence and tokenize it
    tokenize_word = word_tokenize(sentence)
    for word in tokenize_word:
        #Find the lemma of every word previously tokenized, for example "running" should become something like "run"
        word_mod = wordnet_lemmatizer.lemmatize(word)
        #Remove punctuation from individual words
        word_mod = word_mod.translate(str.maketrans('','',string.punctuation))
        #Check if word tokens are empty (because some tokenized words were just punctuation) or are a stop word, don't append in either case
        if word_mod != '' and word_mod not in stop_words:
            all_words.append(word_mod)

unique_words = set(all_words)
unique_words_list = list(unique_words)
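
# NOTE: sets have no guaranteed ordering, so the vocabulary (and with it the
# matrix columns) can come out in a different order on every run. Sorting
# would make the column order reproducible:
# unique_words_list = sorted(unique_words)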

# =============================================================================
# Find word tokens in sentences and mark for future analysis
# =============================================================================


text_array_ex = text_array
results_matrix = np.zeros(shape=(len(text_array_ex),len(unique_words_list)),dtype='int') 

for i in range(0,len(text_array_ex)):
    sentence = text_array_ex[i]
    #Take every word in each sentence and tokenize it
    tokenize_word = word_tokenize(sentence)
    for word in tokenize_word:
        #Find the lemma of every word previously tokenized, for example "running" should become something like "run"
        word_mod = wordnet_lemmatizer.lemmatize(word)
        #Remove punctuation from individual words
        word_mod = word_mod.translate(str.maketrans('','',string.punctuation))
        for j in range(0,len(unique_words_list)):
            if unique_words_list[j] == word_mod:
                results_matrix[i,j] = 1
            else:
                results_matrix[i,j] = 0

Solution

  • Well, I was able to figure out my error; of course it was a silly one. The else branch was resetting results_matrix[i,j] back to 0 whenever a later vocabulary word failed to match, which wiped out matches that had already been recorded. Dropping the else (and switching to += so repeated words are counted) fixes it:

    for i in range(0,len(text_array_ex)):
        sentence = text_array_ex[i]
        #Take every word in each sentence and tokenize it
        tokenize_word = word_tokenize(sentence)
        for word in tokenize_word:
            #Find the lemma of every word previously tokenized, for example "running" should become something like "run"
            word_mod = wordnet_lemmatizer.lemmatize(word)
            #Remove punctuation from individual words
            word_mod = word_mod.translate(str.maketrans('','',string.punctuation))
            for j in range(0,len(unique_words_list)):
                if unique_words_list[j] == word_mod:
                    results_matrix[i,j] += 1
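
  • A side note on the fix: with += the matrix now holds word counts rather than 0/1 indicators. If a strictly binary matrix is wanted, keep = 1 and simply drop the else branch.

    As for a more standard way to do this: it isn't NLTK, but scikit-learn's CountVectorizer builds exactly this kind of term-document matrix in one step. A minimal sketch, assuming scikit-learn is installed (lemmatization is omitted here; a custom callable could be passed via CountVectorizer's tokenizer parameter to add it back):

    from sklearn.feature_extraction.text import CountVectorizer

    text_array = ["I run faster than you", "I own four computers"]

    # binary=True gives 1/0 indicators instead of raw counts;
    # stop_words='english' uses scikit-learn's built-in stop word list
    vectorizer = CountVectorizer(binary=True, stop_words='english')
    results_matrix = vectorizer.fit_transform(text_array).toarray()

    # Column labels (use get_feature_names() on older scikit-learn versions)
    print(vectorizer.get_feature_names_out())
    print(results_matrix)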