I am trying to perform sentiment analysis on product reviews collected from various websites. I've been able to follow along with the article until it gets to the model coefficient visualization step.
When I run my program, I get the following error:
ValueError: Number of coefficients 6021 doesn't match number of feature names 6290.
Any advice on how to ensure the number of coefficients matches the number of feature names in my dataset?
Below is my code:
import numpy as np
import mglearn

y = reviews['Review Type']
X = reviews['Review Comment']
#Split the data into training and test sets
from sklearn.model_selection import train_test_split
text_train, text_test, y_train, y_test = train_test_split(X, y, random_state=0)
#run the feature extraction on training & test independent variables with bag of words
#changing the variable back to X_train after transforming it.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print(repr(X_train))
X_test = vect.transform(text_test)
print(repr(X_test))
feature_names = vect.get_feature_names()
print(len(feature_names))
#running a logistic regression model to predict whether a review is positive
#or negative
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=10000, class_weight='balanced', random_state=0)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(logreg, param_grid, scoring='roc_auc', cv=5)
logreg_train = grid.fit(X_train, y_train)
pred_logreg = logreg_train.predict(X_test)
confusion = confusion_matrix(y_test, pred_logreg)
print(confusion)
print("Classification accuracy is: ", (confusion[0][0] + confusion[1][1]) / np.sum(confusion))
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import seaborn as sns; sns.set();
fpr, tpr, thresholds = roc_curve(y_test, grid.decision_function(X_test))
# find threshold closest to zero:
close_zero = np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label='threshold zero (default)', fillstyle='none', c='k', mew=2)
plt.plot([0, 1], [0, 1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
plt.plot(fpr, tpr, label='ROC Curve')
plt.legend(loc=4)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (recall)')
plt.title('ROC Curve');
from sklearn.metrics import auc
print('AUC score is: ', auc(fpr, tpr));
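#plot a precision-recall curve for the same model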
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(
    y_test, logreg_train.decision_function(X_test))
close_zero = np.argmin(np.abs(thresholds))
plt.plot(precision[close_zero], recall[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c="k", mew=2)
plt.plot(precision, recall, label="precision recall curve")
plt.xlabel("precision")
plt.ylabel("recall")
plt.title("Precision Recall Curve")
plt.legend(loc="best");
from sklearn.feature_extraction.text import TfidfVectorizer
logreg = LogisticRegression(max_iter=10000, class_weight="balanced", random_state=0)
pipe = make_pipeline(TfidfVectorizer(norm=None, stop_words='english'), logreg)
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, scoring="roc_auc", cv=5)
logreg_train = grid.fit(text_train, y_train)
fpr, tpr, thresholds = roc_curve(y_test, grid.decision_function(text_test))
pred_logreg = logreg_train.predict(text_test)
confusion = confusion_matrix(y_test, pred_logreg)
print(confusion)
print("Classification accuracy is: ", (confusion[0][0] + confusion[1][1]) / np.sum(confusion))
print("Test AUC score is: ", auc(fpr, tpr));
mglearn.tools.visualize_coefficients(
    grid.best_estimator_.named_steps["logisticregression"].coef_,
    feature_names, n_top_features=25)
You've defined feature_names in terms of the features from a CountVectorizer with the default stop_words=None, but the model in your last block of code uses a TfidfVectorizer with stop_words='english', so the two vocabularies (and hence the feature counts) don't match. Take the feature names from the vectorizer inside the fitted pipeline instead:
feature_names = grid.best_estimator_.named_steps["tfidfvectorizer"].get_feature_names()
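To see where the two numbers in the error come from, you can compare the vocabulary sizes directly. This is a quick sanity check reusing the fitted vect and grid objects from your code (the exact counts will depend on your data):
#vocabulary of the standalone CountVectorizer (no stop-word filtering)
print(len(vect.get_feature_names()))  #6290 in your run
#vocabulary of the TfidfVectorizer inside the fitted pipeline;
#stop_words='english' drops a few hundred common words
tfidf = grid.best_estimator_.named_steps["tfidfvectorizer"]
print(len(tfidf.get_feature_names()))  #6021 in your run
#number of coefficients in the fitted logistic regression
print(grid.best_estimator_.named_steps["logisticregression"].coef_.shape[1])  #matches 6021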