I am trying to perform sentiment analysis on product reviews collected from various websites. I've been able to follow along with the article until it gets to the model coefficient visualization step.
When I run my program, I get the following error:
ValueError: Number of coefficients 6021 doesn't match number of feature names 6290.
Any advice on how to ensure the number of coefficients matches the number of feature names in my dataset?
Below is my code:
import numpy as np
import mglearn

y = reviews['Review Type']
X = reviews['Review Comment']
#Split the data into training and test sets
from sklearn.model_selection import train_test_split
text_train, text_test, y_train, y_test = train_test_split(X, y, random_state=0)
#run the feature extraction on training & test independent variables with bag of words
#changing the variable back to X_train after transforming it.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print(repr(X_train))
X_test = vect.transform(text_test)
print(repr(X_test))
feature_names = vect.get_feature_names()
print(len(feature_names))
#running a logistic regression model to predict whether a review is positive
#or negative
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=10000, class_weight='balanced', random_state=0)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(logreg, param_grid, scoring='roc_auc', cv=5)
logreg_train = grid.fit(X_train, y_train)
pred_logreg = logreg_train.predict(X_test)
confusion = confusion_matrix(y_test, pred_logreg)
print(confusion)
print("Classification accuracy is: ", (confusion[0][0] + confusion[1][1]) / np.sum(confusion))
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import seaborn as sns; sns.set();
fpr, tpr, thresholds = roc_curve(y_test, grid.decision_function(X_test))
# find threshold closest to zero:
close_zero = np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label='threshold zero (default)', fillstyle='none', c='k', mew=2)
plt.plot([0, 1], [0, 1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
plt.plot(fpr, tpr, label='ROC Curve')
plt.legend(loc=4)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (recall)')
plt.title('ROC Curve');
from sklearn.metrics import auc
print('AUC score is: ', auc(fpr, tpr));
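#plot a precision-recall curve for the same model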
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(
    y_test, logreg_train.decision_function(X_test))
close_zero = np.argmin(np.abs(thresholds))
plt.plot(precision[close_zero], recall[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c="k", mew=2)
plt.plot(precision, recall, label="precision recall curve")
plt.xlabel("precision")
plt.ylabel("recall")
plt.title("Precision Recall Curve")
plt.legend(loc="best");
from sklearn.feature_extraction.text import TfidfVectorizer
logreg = LogisticRegression(max_iter=10000, class_weight="balanced", random_state=0)
pipe = make_pipeline(TfidfVectorizer(norm=None, stop_words='english'), logreg)
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, scoring="roc_auc", cv=5)
logreg_train = grid.fit(text_train, y_train)
fpr, tpr, thresholds = roc_curve(y_test, grid.decision_function(text_test))
pred_logreg = logreg_train.predict(text_test)
confusion = confusion_matrix(y_test, pred_logreg)
print(confusion)
print("Classification accuracy is: ", (confusion[0][0] + confusion[1][1]) / np.sum(confusion))
print("Test AUC score is: ", auc(fpr, tpr));
mglearn.tools.visualize_coefficients(
    grid.best_estimator_.named_steps["logisticregression"].coef_,
    feature_names, n_top_features=25)
You've defined feature_names in terms of the features from a CountVectorizer with the default stop_words=None, but the model in your last block of code uses a TfidfVectorizer with stop_words='english', so the two vocabularies (and hence the feature counts) don't match. Take the feature names from the vectorizer inside the fitted pipeline instead:
feature_names = grid.best_estimator_.named_steps["tfidfvectorizer"].get_feature_names()
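To see where the two numbers in the error come from, you can compare the vocabulary sizes directly. This is a quick sanity check reusing the fitted vect and grid objects from your code (the exact counts will depend on your data):
#vocabulary of the standalone CountVectorizer (no stop-word filtering)
print(len(vect.get_feature_names()))  #6290 in your run
#vocabulary of the TfidfVectorizer inside the fitted pipeline;
#stop_words='english' drops a few hundred common words
tfidf = grid.best_estimator_.named_steps["tfidfvectorizer"]
print(len(tfidf.get_feature_names()))  #6021 in your run
#number of coefficients in the fitted logistic regression
print(grid.best_estimator_.named_steps["logisticregression"].coef_.shape[1])  #matches 6021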