I am working on an example where the training data and training labels are lists, but when I fit the pipeline it throws an error. I suspect the problem is with the text pre-processing class.
Below is the code where I created the pipeline; it fails with "expected str or bytes-like object" or sometimes "'list' object has no attribute 'lower'".
import string
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.base import BaseEstimator, TransformerMixin
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
def __init__(self, stopwords=None, punct=None,
lower=False, strip=True):
self.lower = lower
self.strip = strip
self.stopwords = stopwords or set(sw.words('english'))
self.punct = punct or set(string.punctuation)
self.lemmatizer = WordNetLemmatizer()
def fit(self, X, y=None):
return self
def inverse_transform(self, X):
return [" ".join(doc) for doc in X]
def transform(self, X):
return [
list(self.tokenize(doc)) for doc in X
]
def tokenize(self, document):
# Break the document into sentences
for sent in sent_tokenize(document):
# Break the sentence into part of speech tagged tokens
for token, tag in pos_tag(wordpunct_tokenize(sent)):
# Apply preprocessing to the token
token = token.lower() if self.lower else token
token = token.strip() if self.strip else token
token = token.strip('_') if self.strip else token
token = token.strip('*') if self.strip else token
# If stopword, ignore token and continue
if token in self.stopwords:
continue
# If punctuation, ignore token and continue
if all(char in self.punct for char in token):
continue
# Lemmatize the token and yield
lemma = self.lemmatize(token, tag)
yield lemma
def lemmatize(self, token, tag):
tag = {
'N': wn.NOUN,
'V': wn.VERB,
'R': wn.ADV,
'J': wn.ADJ
}.get(tag[0], wn.NOUN)
return self.lemmatizer.lemmatize(token, tag)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
text_clf = Pipeline([('preprocess', NLTKPreprocessor()),
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(smooth_idf=True, use_idf=True)),
                     ('clf', SGDClassifier(loss='log', penalty='l2',
                                           alpha=1e-3, random_state=42)),
                     ])
#---show only the best performance results: accuracy, other metrics, and confusion matrix
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print ('Accuracy Score (%):', accuracy_score(y_test, y_pred)*100)
print(metrics.classification_report(y_test, y_pred,
target_names=docs_data.target_names))
You get the error because of this method:
def transform(self, X):
return [
list(self.tokenize(doc)) for doc in X
]
X is a DataFrame, and when you iterate over a DataFrame you get its column labels, not its rows. So on the first iteration doc is the integer 0 (the default label of the first column), and sent_tokenize fails because it expects a string.
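You can see this with a minimal illustration (the 0 here is just the default column label of a freshly built DataFrame):
import pandas as pd
df = pd.DataFrame([['some text. more text.'], ['another sentence.']])
for doc in df:
    print(doc)   # prints 0 -- the column label, not the row's text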
I am not sure about the rest of the pipeline, but you can fix this one step like this. Note that this only works with a single column.
def transform(self, X):
return [
list(self.tokenize(doc[0])) for doc in X.values
]
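An alternative sketch, assuming X is always a DataFrame with the text in its first column, is to iterate that column directly, since iterating a Series yields the cell values rather than labels:
def transform(self, X):
    # X.iloc[:, 0] is the first column as a Series; iterating it yields the raw strings
    return [list(self.tokenize(doc)) for doc in X.iloc[:, 0]]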
The method above works with the following fit inputs:
X_train = pd.DataFrame([['asfas saf asf. dwqdwqwd '],['asdasdasd32d23 wedw ed wed. dwqdwq. ']])
y_train = [[1], [2]]
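To check just this step in isolation (a quick sanity check of the transformer, not the full pipeline):
pre = NLTKPreprocessor()
print(pre.transform(X_train))
# one list of lemmatized tokens per row, with stopwords and punctuation removed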