I am working on an example where the training data and training labels are lists, but when I fit the pipeline it throws an error. I suspect the problem is with the text pre-processing class.
Below is the code where I created the pipeline; it fails with "expected str or bytes-like object" or sometimes "'list' object has no attribute 'lower'".
import string
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.base import BaseEstimator, TransformerMixin
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
def __init__(self, stopwords=None, punct=None,
lower=False, strip=True):
self.lower = lower
self.strip = strip
self.stopwords = stopwords or set(sw.words('english'))
self.punct = punct or set(string.punctuation)
self.lemmatizer = WordNetLemmatizer()
def fit(self, X, y=None):
return self
def inverse_transform(self, X):
return [" ".join(doc) for doc in X]
def transform(self, X):
return [
list(self.tokenize(doc)) for doc in X
]
def tokenize(self, document):
# Break the document into sentences
for sent in sent_tokenize(document):
# Break the sentence into part of speech tagged tokens
for token, tag in pos_tag(wordpunct_tokenize(sent)):
# Apply preprocessing to the token
token = token.lower() if self.lower else token
token = token.strip() if self.strip else token
token = token.strip('_') if self.strip else token
token = token.strip('*') if self.strip else token
# If stopword, ignore token and continue
if token in self.stopwords:
continue
# If punctuation, ignore token and continue
if all(char in self.punct for char in token):
continue
# Lemmatize the token and yield
lemma = self.lemmatize(token, tag)
yield lemma
def lemmatize(self, token, tag):
tag = {
'N': wn.NOUN,
'V': wn.VERB,
'R': wn.ADV,
'J': wn.ADJ
}.get(tag[0], wn.NOUN)
return self.lemmatizer.lemmatize(token, tag)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
text_clf = Pipeline([('preprocess', NLTKPreprocessor()),
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(smooth_idf=True, use_idf=True)),
                     ('clf', SGDClassifier(loss='log', penalty='l2',
                                           alpha=1e-3, random_state=42)),
                     ])
#---show only the best performance results: accuracy, other metrics, and confusion matrix
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print ('Accuracy Score (%):', accuracy_score(y_test, y_pred)*100)
print(metrics.classification_report(y_test, y_pred,
target_names=docs_data.target_names))
You get the error because of this method:
def transform(self, X):
return [
list(self.tokenize(doc)) for doc in X
]
X is a DataFrame, and when you iterate over a DataFrame you get its column labels, not its rows. So on the first iteration doc is the integer 0 (the default label of the first column), and sent_tokenize fails because it expects a string.
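You can see this with a minimal illustration (the 0 here is just the default column label of a freshly built DataFrame):
import pandas as pd
df = pd.DataFrame([['some text. more text.'], ['another sentence.']])
for doc in df:
    print(doc)   # prints 0 -- the column label, not the row's text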
I am not sure about the rest of the pipeline, but you can fix this one step like this. Note that this only works with a single column.
def transform(self, X):
return [
list(self.tokenize(doc[0])) for doc in X.values
]
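An alternative sketch, assuming X is always a DataFrame with the text in its first column, is to iterate that column directly, since iterating a Series yields the cell values rather than labels:
def transform(self, X):
    # X.iloc[:, 0] is the first column as a Series; iterating it yields the raw strings
    return [list(self.tokenize(doc)) for doc in X.iloc[:, 0]]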
The method above works with the following fit inputs:
X_train = pd.DataFrame([['asfas saf asf. dwqdwqwd '],['asdasdasd32d23 wedw ed wed. dwqdwq. ']])
y_train = [[1], [2]]
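To check just this step in isolation (a quick sanity check of the transformer, not the full pipeline):
pre = NLTKPreprocessor()
print(pre.transform(X_train))
# one list of lemmatized tokens per row, with stopwords and punctuation removed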