
Multiple metrics for neural network model with cross validation


I am trying to get the F1 score, precision, and recall from cross-validation for an LSTM model.

I know how to show the accuracies, but when I try to show the other metrics using cross_validate I get many different errors.

My code is the following:

def nn_model():
    model_lstm1 = Sequential()
    model_lstm1.add(Embedding(20000, 100, input_length=49))
    model_lstm1.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model_lstm1.add(Dense(2, activation='sigmoid'))
    model_lstm1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_lstm1

classifier = KerasClassifier(build_fn=nn_model, batch_size=10,nb_epoch=10)

scoring = {'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

results = cross_validate(classifier, X_train, y_train, cv=skf, scoring = scoring)

print("F1 score SVM: %0.2f (+/- %0.2f)" % (np.mean(results[f1_score]), np.std(results[f1_score])))

print("precision score SVM: %0.2f (+/- %0.2f)" % (np.mean(results[precision]), np.std(results[precision])))
print("recall macro SVM: %0.2f (+/- %0.2f)" % (np.mean(results[recall]), np.std(results[recall])))

The error I get is the following:

Epoch 1/1 1086/1086 [==============================] - 18s 17ms/step - loss: 0.6014 - acc: 0.7035
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-40-5afe62c11676> in <module>
      6            'f1_score' : make_scorer(f1_score)}
      7 
----> 8 results = cross_validate(classifier, X_train, y_train, cv=skf, scoring = scoring)
      9 
     10 print("F1 score SVM: %0.2f (+/- %0.2f)" % (np.mean(results[f1_score]), np.std(results[f1_score])))

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    229             return_times=True, return_estimator=return_estimator,
    230             error_score=error_score)
--> 231         for train, test in cv.split(X, y, groups))
    232 
    233     zipped_scores = list(zip(*scores))

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
    919             # remaining jobs.
    920             self._iterating = False
--> 921             if self.dispatch_one_batch(iterator):
    922                 self._iterating = self._original_iterator is not None
    923 

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    552         fit_time = time.time() - start_time
    553         # _score will return dict if is_multimetric is True
--> 554         test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
    555         score_time = time.time() - start_time - fit_time
    556         if return_train_score:

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
    595     """
    596     if is_multimetric:
--> 597         return _multimetric_score(estimator, X_test, y_test, scorer)
    598     else:
    599         if y_test is None:

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
    625             score = scorer(estimator, X_test)
    626         else:
--> 627             score = scorer(estimator, X_test, y_test)
    628 
    629         if hasattr(score, 'item'):

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/metrics/scorer.py in __call__(self, estimator, X, y_true, sample_weight)
     95         else:
     96             return self._sign * self._score_func(y_true, y_pred,
---> 97                                                  **self._kwargs)
     98 
     99 

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/metrics/classification.py in precision_score(y_true, y_pred, labels, pos_label, average, sample_weight)
   1567                                                  average=average,
   1568                                                  warn_for=('precision',),
-> 1569                                                  sample_weight=sample_weight)
   1570     return p
   1571 

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/metrics/classification.py in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
   1413         raise ValueError("beta should be >0 in the F-beta score")
   1414     labels = _check_set_wise_labels(y_true, y_pred, average, labels,
-> 1415                                     pos_label)
   1416 
   1417     # Calculate tp_sum, pred_sum, true_sum ###

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/metrics/classification.py in _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
   1237                          str(average_options))
   1238 
-> 1239     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
   1240     present_labels = unique_labels(y_true, y_pred)
   1241     if average == 'binary':

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/metrics/classification.py in _check_targets(y_true, y_pred)
     79     if len(y_type) > 1:
     80         raise ValueError("Classification metrics can't handle a mix of {0} "
---> 81                          "and {1} targets".format(type_true, type_pred))
     82 
     83     # We can't have more than one value on y_type => The set is no more needed

ValueError: Classification metrics can't handle a mix of multilabel-indicator and binary targets

What am I doing wrong?


Solution

  • Issues in your code

    1. You can't use one-hot-encoded labels here; pass the raw integer labels instead (a minimal conversion sketch follows this list). With raw labels you can use the sparse_categorical_crossentropy loss.
    2. cross_validate returns the test scores under keys prefixed with test_, e.g. results['test_f1_score']. For train scores, set return_train_score=True.
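
    If your labels are currently one-hot encoded, converting them back is a one-liner. A minimal sketch, assuming y_train is an (n_samples, 2) one-hot array as the question's variable names suggest:

    import numpy as np
    
    # Assumed: y_train is one-hot, e.g. [[1, 0], [0, 1], ...];
    # argmax along axis 1 recovers the raw integer labels
    # (e.g. [0, 1, ...]) that the sklearn scorers expect.
    y_train = np.argmax(y_train, axis=1)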

    Corrected code

    import numpy as np
    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense
    from keras.wrappers.scikit_learn import KerasClassifier  # standalone Keras wrapper, as in the question
    from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
    from sklearn.model_selection import cross_validate
    
    def nn_model():
        model_lstm1 = Sequential()
        model_lstm1.add(Embedding(200, 100, input_length=10))
        model_lstm1.add(LSTM(10, dropout=0.2, recurrent_dropout=0.2))
        model_lstm1.add(Dense(2, activation='sigmoid'))
        model_lstm1.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model_lstm1
    
    classifier = KerasClassifier(build_fn=nn_model, batch_size=10,nb_epoch=10)
    
    scoring = {'precision' : make_scorer(precision_score),
               'recall' : make_scorer(recall_score), 
               'f1_score' : make_scorer(f1_score)}
    
    results = cross_validate(classifier, np.random.randint(0, 100, (1000, 10)),
                             np.random.randint(0, 2, 1000), scoring=scoring, cv=3, return_train_score=True)
    
    print("F1 score SVM: %0.2f (+/- %0.2f)" % (np.mean(results['test_f1_score']), np.std(results['test_f1_score'])))
    print("precision score SVM: %0.2f (+/- %0.2f)" % (np.mean(results['test_precision']), np.std(results['test_precision'])))
    print("recall macro SVM: %0.2f (+/- %0.2f)" % (np.mean(results['test_recall']), np.std(results['test_recall'])))
    

    Output

    Epoch 1/1
    666/666 [==============================] - 5s 7ms/step - loss: 0.6932 - acc: 0.5075
    Epoch 1/1
    667/667 [==============================] - 5s 7ms/step - loss: 0.6929 - acc: 0.5127
    Epoch 1/1
    667/667 [==============================] - 5s 7ms/step - loss: 0.6934 - acc: 0.5007
    F1 score SVM: 0.10 (+/- 0.09)
    precision score SVM: 0.43 (+/- 0.07)
    recall macro SVM: 0.06 (+/- 0.06)
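
    Note that precision_score, recall_score, and f1_score default to average='binary', which works here because the raw labels are 0/1. For a multiclass problem you would need to pass an explicit averaging mode to each scorer; a sketch (not part of the original answer):

    # For multiclass labels, pick an averaging mode (e.g. 'macro');
    # make_scorer forwards the keyword argument to the metric function.
    scoring = {'precision' : make_scorer(precision_score, average='macro'),
               'recall' : make_scorer(recall_score, average='macro'),
               'f1_score' : make_scorer(f1_score, average='macro')}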
    

    You might get

    UndefinedMetricWarning: ....
    

    warnings in the initial epochs (if the data is small), which you can ignore. They occur because the classifier assigns every sample to one class and none to the other, leaving precision/recall undefined for the empty class.
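
    If you prefer to silence these warnings explicitly, one option (a sketch, not part of the original answer) is:

    import warnings
    from sklearn.exceptions import UndefinedMetricWarning
    
    # Suppress the warning raised when a fold predicts only one class,
    # which leaves precision/recall/F1 undefined for the other class.
    warnings.filterwarnings('ignore', category=UndefinedMetricWarning)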