UserWarning: Your stop_words may be inconsistent with your preprocessing

I am following this tutorial to make a chatbot with the following code.

import nltk
import numpy as np
import random
import string
import bs4 as bs
import urllib.request
import re

# Creating the Corpus
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/Tennis')
raw_html = raw_html.read()
article_html = bs.BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')

article_text = ''

for para in article_paragraphs:
    article_text += para.text

article_text = article_text.lower()

# Text Preprocessing
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)
article_sentences = nltk.sent_tokenize(article_text)
article_words = nltk.word_tokenize(article_text)

wnlemmatizer = nltk.stem.WordNetLemmatizer()

# Helper Function
def perform_lemmatization(tokens):
    return [wnlemmatizer.lemmatize(token) for token in tokens]

punctuation_removal = dict((ord(punctuation), None) for punctuation in string.punctuation)

def get_processed_text(document):
    return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))

# Responding to Greetings
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi", "whatsup")
greeting_responses = ["hey", "hey hows you?", "*nods*", "hello, how you doing", "hello", "Welcome, I am good and you"]

def generate_greeting_response(greeting):
    for token in greeting.split():
        if token.lower() in greeting_inputs:
            return random.choice(greeting_responses)
        else:
            return 'Try again'

# Responding to User Queries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def generate_response(user_input):
    tennisrobo_response = ''
    article_sentences.append(user_input)

    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
    all_word_vectors = word_vectorizer.fit_transform(article_sentences)
    similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
    similar_sentence_number = similar_vector_values.argsort()[0][-2]

    matched_vector = similar_vector_values.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]

    if vector_matched == 0:
        tennisrobo_response = tennisrobo_response + "I am sorry, I could not understand you"
        return tennisrobo_response
    else:
        tennisrobo_response = tennisrobo_response + article_sentences[similar_sentence_number]
        return tennisrobo_response
print(generate_response('tennis'))

Running the code, I get the following error:

UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.
  warnings.warn('Your stop_words may be inconsistent with '

After searching google I got linked to this answer saying that there may be an inconsistency between my stop words and tokenizer. However, I am very new to python and NLTK and cannot find where the inconsistency is.

Where is the part of the code that is causing this error?

Solution

The code runs with no issues, and please note what you get is not an error, it is a warning. Note you can suppress all warnings with

import warnings
warnings.filterwarnings("ignore")

The warning appears due to the fact that you are using a custom preprocessor / tokenizer. See the get_processed_text method that calls perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal))). If you remove the lemmatization, you won't see the warning:

def get_processed_text(document):
    return nltk.word_tokenize(document.lower().translate(punctuation_removal))