I am trying to do sentiment analysis on Twitter data by following Chapter 6 of the book Building Machine Learning Systems with Python.
I am using the dataset: https://raw.githubusercontent.com/zfz/twitter_corpus/master/full-corpus.csv
It uses a pipeline of a TF-IDF vectorizer and a naive Bayes classifier as the estimator.
Then I am using GridSearchCV() to find the best parameters for the estimator.
The code is as follows:
from load_data import load_data
from sklearn.cross_validation import ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
def pipeline_tfidf_nb():
    """Build a Pipeline of word-level TF-IDF features followed by naive Bayes.

    The two steps are registered as 'vect' (a TfidfVectorizer using the
    "word" analyzer) and 'nbclf' (a MultinomialNB with default settings).
    These step names are the prefixes used to address each step's
    parameters, e.g. ``vect__min_df`` or ``nbclf__alpha``.
    """
    steps = [
        ('vect', TfidfVectorizer(analyzer="word")),
        ('nbclf', MultinomialNB()),
    ]
    return Pipeline(steps)
# Load the tweet texts (X) and their sentiment labels (y) from the corpus.
input_file = "full-corpus.csv"
X, y = load_data(input_file)
print("%s %s" % (X.shape, y.shape))

clf = pipeline_tfidf_nb()

# One shuffled split that holds out 30% of the samples for scoring.
cv = ShuffleSplit(n=len(X), test_size=0.3, n_iter=1, random_state=0)

# Candidate values to search, keyed as <pipeline step>__<parameter name>.
clf_param_grid = {
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vect__min_df": [1, 2],
    "vect__smooth_idf": [False, True],
    "vect__use_idf": [False, True],
    "vect__sublinear_tf": [False, True],
    "vect__binary": [False, True],
    "nbclf__alpha": [0, 0.01, 0.05, 0.1, 0.5, 1],
}

# NOTE: scoring is handed the bare metric function f1_score here; this is the
# configuration that produces the TypeError shown in the traceback below.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=clf_param_grid,
    cv=cv,
    scoring=f1_score,
)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
load_data() extracts the rows of the CSV file that have positive or negative sentiment.
X is an array of strings (the TweetText column) and y is an array of bool values (True for positive sentiment).
The error is:
runfile('C:/Users/saurabh.s1/Downloads/Python_ml/ch6/main.py', wdir='C:/Users/saurabh.s1/Downloads/Python_ml/ch6')
Reloaded modules: load_data
negative : 572
positive : 519
(1091,) (1091,)
Traceback (most recent call last):
File "<ipython-input-25-823b07c4ff26>", line 1, in <module>
runfile('C:/Users/saurabh.s1/Downloads/Python_ml/ch6/main.py', wdir='C:/Users/saurabh.s1/Downloads/Python_ml/ch6')
File "C:\anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 866, in runfile
execfile(filename, namespace)
File "C:\anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/saurabh.s1/Downloads/Python_ml/ch6/main.py", line 31, in <module>
grid_search.fit(X, y)
File "C:\anaconda2\lib\site-packages\sklearn\grid_search.py", line 804, in fit
return self._fit(X, y, ParameterGrid(self.param_grid))
File "C:\anaconda2\lib\site-packages\sklearn\grid_search.py", line 553, in _fit
for parameters in parameter_iterable
File "C:\anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 800, in __call__
while self.dispatch_one_batch(iterator):
File "C:\anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 658, in dispatch_one_batch
self._dispatch(tasks)
File "C:\anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 566, in _dispatch
job = ImmediateComputeBatch(batch)
File "C:\anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 180, in __init__
self.results = batch()
File "C:\anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\anaconda2\lib\site-packages\sklearn\cross_validation.py", line 1550, in _fit_and_score
test_score = _score(estimator, X_test, y_test, scorer)
File "C:\anaconda2\lib\site-packages\sklearn\cross_validation.py", line 1606, in _score
score = scorer(estimator, X_test, y_test)
File "C:\anaconda2\lib\site-packages\sklearn\metrics\classification.py", line 639, in f1_score
sample_weight=sample_weight)
File "C:\anaconda2\lib\site-packages\sklearn\metrics\classification.py", line 756, in fbeta_score
sample_weight=sample_weight)
File "C:\anaconda2\lib\site-packages\sklearn\metrics\classification.py", line 956, in precision_recall_fscore_support
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "C:\anaconda2\lib\site-packages\sklearn\metrics\classification.py", line 72, in _check_targets
check_consistent_length(y_true, y_pred)
File "C:\anaconda2\lib\site-packages\sklearn\utils\validation.py", line 173, in check_consistent_length
uniques = np.unique([_num_samples(X) for X in arrays if X is not None])
File "C:\anaconda2\lib\site-packages\sklearn\utils\validation.py", line 112, in _num_samples
'estimator %s' % x)
TypeError: Expected sequence or array-like, got estimator Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm=u'l2', preprocessor=None,
smooth_i...e_idf=False, vocabulary=None)), ('nbclf', MultinomialNB(alpha=0, class_prior=None, fit_prior=True))])
I have tried reshaping X,y but that is not working.
Let me know if you need more data or if I have missed something.
Thanks!
This error occurs because you are passing the wrong kind of object by using scoring=f1_score
in the GridSearchCV constructor.
Have a look at documentation of GridSearchCV.
In scoring param, it asks for:
A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y). If None, the score method of the estimator is used.
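You are passing f1_score, a metric function with signature (y_true, y_pred[, ...]), which is the wrong kind of callable.
GridSearchCV calls scorer(estimator, X_test, y_test), so f1_score receives the estimator itself as y_true. That's why you are getting the "Expected sequence or array-like, got estimator" error.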
You should either pass a string as defined here for scoring, or pass a callable with signature (estimator, X, y).
The latter can be done by using make_scorer.
Change this line in your code:
# Original call: f1_score is a plain metric taking (y_true, y_pred), not a scorer.
grid_search = GridSearchCV(estimator = clf, param_grid = clf_param_grid,
cv = cv, scoring = f1_score)
to this:
# scoring also accepts the name of a predefined scorer as a string.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=clf_param_grid,
    cv=cv,
    scoring='f1',
)
OR
# make_scorer wraps a metric with signature (y_true, y_pred) into a scorer
# callable with the (estimator, X, y) signature that GridSearchCV expects.
# It is not among the question's imports, so it has to be imported first.
from sklearn.metrics import make_scorer

grid_search = GridSearchCV(estimator=clf, param_grid=clf_param_grid,
                           cv=cv, scoring=make_scorer(f1_score))
I have answered the same type of problem in this answer here.