python, pandas, machine-learning, scikit-learn, benchmarking

Writing results tables with scikit-learn and pandas robust to interruptions


I would like to benchmark a classifier in scikit-learn on several datasets. For each dataset, this involves running a grid search over four classifier settings and producing a table recording the precision, recall, accuracy, and F1-score on an out-of-sample test set. (Ideally this table would be a pandas DataFrame.)
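For reference, the per-dataset metrics I have in mind could be computed along these lines (a minimal sketch; clf, X_test and y_test are placeholder names for a fitted classifier and a held-out test split):

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# clf is an already-fitted classifier; X_test / y_test form the out-of-sample test set.
y_pred = clf.predict(X_test)

# Macro-average over classes; any of sklearn's averaging modes would work here.
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred,
                                                           average='macro')
accuracy = accuracy_score(y_test, y_pred)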

Since this process will take some time and each dataset is independent, I would like to produce these results in such a way that, should the process be interrupted, the results from the datasets already processed are still written to a file.

What is the standard way of providing these 'live updates' within the scikit-learn and pandas framework?

Here is some code showing the kind of results that would be produced for each dataset:

import timeit
from warnings import warn

import numpy as np
from sklearn.model_selection import GridSearchCV

from sqizesvc import SeqSVC  # My own custom classifier

def getOptSeqSVC(X_data, y_data, cut_ord_dom=[(3,1), (2,1), (1,1)],
                 gamma_range=np.logspace(-3,3,num=5,base=10).tolist(),
                 C_range=np.logspace(-3,3,num=5,base=10).tolist(),
                 scale_range=np.logspace(-3,3,num=5,base=10).tolist(),
                 preprocess_range=range(4), kernel='linear'):
    """Runs cross-validation on an SVM estimator with sequentialised
    kernel to find the best values of the cut-off, the order, gamma, C, and
    preprocess."""

    # Note: list.append() returns None, so build the preprocess list explicitly.
    params = dict(cut_ord_pair=cut_ord_dom, C=C_range, scale=scale_range,
                  gamma=gamma_range,
                  preprocess=list(preprocess_range) + ['index'])

    grid = GridSearchCV(SeqSVC(kernel=kernel), param_grid=params, n_jobs=-1,
                        scoring='accuracy', error_score=0)

    grid.fit(X_data, y_data)

    # Warn if a best value sits on the edge of its search range,
    # which suggests the grid should be shifted.
    for hypar in ['C', 'gamma', 'scale']:
        if (np.isclose(grid.best_params_[hypar], np.min(params[hypar])) or
            np.isclose(grid.best_params_[hypar], np.max(params[hypar]))):
            msg = ("The best value of %s found was %g: this is at the edge "
                   "of the range. Please consider shifting the range so that "
                   "this value lies closer to its middle."
                   % (hypar, grid.best_params_[hypar]))

            warn(msg)


    return grid.best_estimator_


class UCRExperiment(object):
    """Class to find the best parameters for a SeqSVC classifier on a UCR
    training set, compute the accuracy of the classifier on the test set,
    and compare this to the best accuracy achieved by the classifiers
    in the Time Series Classification project.

    Parameters
    ----------
    dataset_names : list (of strings)
        Lists the names of the UCR datasets to be loaded in the experiment.
    """
    def __init__(self, dataset_names):
        self.dataset_names = dataset_names

    def runExperiment(self, kernel='linear', cut_ord_dom=[(3,1), (2,1), (1,1)],
                      gamma_range=np.logspace(-3,3,num=5,base=10).tolist(),
                      C_range=np.logspace(-3,3,num=5,base=10).tolist(),
                      scale_range=np.logspace(-3,3,num=5,base=10).tolist(),
                      preprocess_range=range(4)):
        # Structure of results table:
        # Col1: SeqSVC score; Col2: SeqSVC time; Col3: No. of training samples;
        # Col4: No. of test samples; Col5: No. of time points.
        results_table = np.zeros((len(self.dataset_names),5))
        for ind in range(len(self.dataset_names)):
            loader = LoadUCRData(self.dataset_names[ind])
            X_train, X_test, y_train, y_test = loader.getTrainTestData()

            seq_tic = timeit.default_timer()
            best_seq = getOptSeqSVC(X_train, y_train, kernel=kernel,
                                    cut_ord_dom=cut_ord_dom,
                                    gamma_range=gamma_range,
                                    C_range=C_range,
                                    scale_range=scale_range,
                                    preprocess_range=preprocess_range)
            seq_toc = timeit.default_timer()

            results_table[ind, 0] = best_seq.score(X_test, y_test)
            results_table[ind, 1] = seq_toc - seq_tic  # elapsed grid-search time
            results_table[ind, 2] = X_train.shape[0]
            results_table[ind, 3] = X_test.shape[0]
            results_table[ind, 4] = X_train.shape[1]
        return results_table

Solution

  • Instead of creating a single results_table in your experiment loop, you could generate one DataFrame per iteration and save it to disk as a CSV as soon as that dataset finishes.

    # Assumes pandas has been imported at module level: import pandas as pd
    for ind in range(len(self.dataset_names)):
        # execute your experiments as before
    
        df = pd.DataFrame({
            'best_score': [best_seq.score(X_test, y_test)],
            'duration': [seq_toc - seq_tic],
            'train_samples': [X_train.shape[0]],
            'test_samples': [X_test.shape[0]],
            'train_time_points': [X_train.shape[1]]
        })
    
        df.to_csv('%s_results.csv' % self.dataset_names[ind])
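
  • Once every dataset has been processed (or after an interruption), the per-dataset CSVs can be stitched back into a single table. This is a minimal sketch, assuming the files were written to the working directory with the '%s_results.csv' naming scheme above:

    import glob

    import pandas as pd

    # Gather every per-dataset results file and concatenate into one DataFrame.
    paths = sorted(glob.glob('*_results.csv'))
    all_results = pd.concat((pd.read_csv(p, index_col=0) for p in paths),
                            ignore_index=True)

    # Record which dataset each row came from.
    all_results['dataset'] = [p.replace('_results.csv', '') for p in paths]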