Tags: python, nltk, tokenize, word2vec

Error in loading NLTK resources: "Please use the NLTK Downloader to obtain the resource:\n\n"


I adapted the following code from Susan Li's post, but ran into an error when the code tries to tokenize text using NLTK's resources (or there could be something wrong with the keyed vectors loaded from the web). The error occurs in the 5th code block (see below; the code may take a while to run because the keyed vectors are loaded from the web):

## 1. load packages and data

import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import sent_tokenize
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
%matplotlib inline

df = pd.read_csv('https://www.dropbox.com/s/b2w7iqi7c92uztt/stack-overflow-data.csv?dl=1')
df = df[pd.notnull(df['tags'])]

my_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']

## 2. cleaning

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):

    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
    
df['post'] = df['post'].apply(clean_text)
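
As a quick sanity check, clean_text applied to a small made-up string (not from the dataset) behaves like this:

# illustrative example only, not taken from the dataset
print(clean_text('<p>How do I parse JSON in Python 3?</p>'))
# prints: parse json python 3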

## 3. train test split

X = df.post
y = df.tags
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

## 4. load keyed vectors from the web: will take a while to load

import gensim
word2vec_path = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
wv = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
wv.init_sims(replace=True)
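
If memory or load time is an issue, load_word2vec_format also accepts a limit argument to read only the first N vectors (a sketch, not part of the original post):

# optional: load only the first 500k vectors to cut memory and load time (sketch)
wv = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=500000)
wv.init_sims(replace=True)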


## 5. this is where it goes wrong

def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train, test = train_test_split(df, test_size=0.3, random_state = 42)

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)
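
(Note: word_averaging_list is not shown above; it comes from the same post. A minimal sketch of what it does, assuming wv and np as loaded earlier, is:)

# sketch of the averaging helpers assumed above (adapted, not the post's exact code)
def word_averaging(wv, words):
    # average the vectors of the in-vocabulary tokens of one post
    vectors = [wv[word] for word in words if word in wv]
    if not vectors:
        # posts with no known tokens fall back to a zero vector
        return np.zeros(wv.vector_size)
    return np.mean(vectors, axis=0)

def word_averaging_list(wv, text_list):
    # stack one averaged vector per tokenized post
    return np.vstack([word_averaging(wv, tokens) for tokens in text_list])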


## 6. perform logistic regression test

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['tags'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test.tags))
print(classification_report(test.tags, y_pred,target_names=my_tags))

Update on part 5 (per @luigigi's comments)

## 5. download the punkt resource and use apply() without a lambda

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import sent_tokenize

def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

train, test = train_test_split(df, test_size=0.3, random_state=42)

test_tokenized = test['post'].apply(w2v_tokenize_text).values
train_tokenized = train['post'].apply(w2v_tokenize_text).values

X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)

## now run the test

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['tags'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test.tags))
print(classification_report(test.tags, y_pred,target_names=my_tags))

This should work.


Solution

  • The NLTK tokenizer expects the punkt resource, so you have to download it first (a guarded variant is sketched at the end of this answer):

    nltk.download('punkt')
    

    Also, you don't need a lambda expression to apply your tokenizer function. You can simply use:

    test_tokenized = test['post'].apply(w2v_tokenize_text).values
    train_tokenized = train['post'].apply(w2v_tokenize_text).values
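
    For reference, a guarded download avoids re-triggering the downloader on every run (a sketch along these lines, not part of the original suggestion):

    # only download punkt if it is not already available locally (sketch)
    import nltk
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')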