I have worked a lot with sklearn's SGDClassifier and LogisticRegression. The problem I have now requires:

1. working with sparse matrices (sklearn supports that);
2. multilabel instances where the label of each instance is a probability distribution over all the labels rather than a 0/1 indicator vector.
For example, the label of sample 1 could be:

0.1 0.0 0.4 0.5

which means that over the 4 labels we have a probability distribution, and I shouldn't threshold these values into 0/1 labels.
Does anybody know how to modify sklearn's linear models to achieve that? Or is there another library that supports this kind of multioutput target?
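To make the setup concrete, here is a toy sketch of the data layout I mean (made-up numbers, not my real data): a sparse CSR feature matrix together with a dense float target matrix whose rows each sum to 1.

import numpy as np
import scipy.sparse as sparse

# toy data: 2 samples, 3 features (sparse inputs), 4 labels (soft targets)
X = sparse.csr_matrix(np.array([[1.0, 0.0, 2.0],
                                [0.0, 3.0, 0.0]]))
Y = np.array([[0.1, 0.0, 0.4, 0.5],
              [0.7, 0.1, 0.1, 0.1]], dtype='float32')
assert np.allclose(Y.sum(axis=1), 1.0)   # each row is a probability distribution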
Note: I have implemented multioutput logistic regression in lasagne/theano, but it doesn't accept sparse matrices and it is very slow compared to sklearn.
Update 2: It seems that increasing the mini-batch size from 20 to 1000 makes the optimisation much faster, so I think I'll stick with this solution.
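(With the nn_model function from the solution below, that just means passing a larger batch_size, e.g. nn_model(X_train, Y_train, X_test, Y_test, batch_size=1000, complete_prob=True).)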
Update 1: The following solution is very slow:
A solution:
In the end I used Lasagne to implement a logistic regression model that accepts sparse matrices as input and supports multioutput targets (e.g. a probability distribution over the labels). The code is below. I have copied two classes from lasagne.layers.sparse directly into my code; this is not needed if your Lasagne is up to date.

The main idea is that wherever a DenseLayer or DropoutLayer would receive the sparse input, you should use the corresponding SparseInputDenseLayer or SparseInputDropoutLayer instead. Note that the input layer itself stays unchanged, which is what confused me at first: I expected a change in the input layer, but since all the computation involving the input happens in the next layer (either a hidden or an output layer), it is that second layer that changes.

Note that the logistic regression is easily converted into an MLP by adding one or more dense layers.

Note that the only layer that changes is the one directly after the input layer; the others stay as they are.
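To make that concrete, here is a stripped-down sketch of the network construction (it relies on the imports and the SparseInputDenseLayer class from the full listing below; in_size and out_size stand for the number of features and labels):

X_sym = S.csr_matrix(name='inputs', dtype='float32')   # sparse symbolic input
y_sym = T.matrix()                                      # soft-label targets

l_in = lasagne.layers.InputLayer(shape=(None, in_size), input_var=X_sym)  # unchanged
# only the layer that touches the sparse input needs the sparse variant
l_hid = SparseInputDenseLayer(l_in, num_units=300,
                              nonlinearity=lasagne.nonlinearities.rectify)
# activations are dense from here on, so a plain DenseLayer is fine
l_out = lasagne.layers.DenseLayer(l_hid, num_units=out_size,
                                  nonlinearity=lasagne.nonlinearities.softmax)

output = lasagne.layers.get_output(l_out, X_sym)
loss = lasagne.objectives.categorical_crossentropy(output, y_sym).mean()

Dropping l_hid and attaching the softmax SparseInputDenseLayer directly to l_in gives the plain logistic regression variant.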
I also experimented with glmnet (the Python wrapper) and sklearn's MLPClassifier, but to my knowledge they only support either 1-d target labels or 0/1 multilabel indicators, and they don't accept a probability distribution over labels as the target.
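One quick way to see why such targets are refused (a sketch, assuming a reasonably recent sklearn): sklearn infers the target type, and a matrix of probabilities is classified as continuous multioutput rather than as any kind of classification target.

from sklearn.utils.multiclass import type_of_target
import numpy as np

print(type_of_target(np.array([0, 2, 1])))                  # 'multiclass'
print(type_of_target(np.array([[0, 1], [1, 0]])))           # 'multilabel-indicator'
print(type_of_target(np.array([[0.1, 0.9], [0.5, 0.5]])))   # 'continuous-multioutput'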
'''
Created on 22 Apr 2016
@author: af
'''
import pdb
import numpy as np
import sys
from os import path
import scipy as sp
import theano
import theano.tensor as T
import lasagne
from lasagne.regularization import regularize_layer_params_weighted, l2, l1
from lasagne.regularization import regularize_layer_params
import theano.sparse as S
from lasagne.layers import DenseLayer, DropoutLayer
sys.path.append(path.abspath('../geolocation'))
import params
import geolocate
import logging
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
class SparseInputDenseLayer(DenseLayer):
    def get_output_for(self, input, **kwargs):
        if not isinstance(input, (S.SparseVariable, S.SparseConstant,
                                  S.sharedvar.SparseTensorSharedVariable)):
            raise ValueError("Input for this layer must be sparse")

        activation = S.dot(input, self.W)
        if self.b is not None:
            activation = activation + self.b.dimshuffle('x', 0)
        return self.nonlinearity(activation)
class SparseInputDropoutLayer(DropoutLayer):
    def get_output_for(self, input, deterministic=False, **kwargs):
        if not isinstance(input, (S.SparseVariable, S.SparseConstant,
                                  S.sharedvar.SparseTensorSharedVariable)):
            raise ValueError("Input for this layer must be sparse")

        if deterministic or self.p == 0:
            return input
        else:
            # Using Theano constant to prevent upcasting
            one = T.constant(1, name='one')
            retain_prob = one - self.p
            if self.rescale:
                input = S.mul(input, one / retain_prob)

            input_shape = self.input_shape
            if any(s is None for s in input_shape):
                input_shape = input.shape

            return input * self._srng.binomial(input_shape, p=retain_prob,
                                               dtype=input.dtype)
def load_data():
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer
    #categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    categories = None
    #remove = ('headers', 'footers', 'quotes')
    remove = ()
    train_set = fetch_20newsgroups(subset='train',
                                   remove=remove,
                                   categories=categories)
    test_set = fetch_20newsgroups(subset='test',
                                  remove=remove,
                                  categories=categories)
    Y_train, Y_test = train_set.target, test_set.target
    vectorizer = TfidfVectorizer(min_df=2, stop_words='english')
    X_train = vectorizer.fit_transform(train_set.data)
    X_test = vectorizer.transform(test_set.data)
    return X_train, Y_train, X_test, Y_test
def load_geolocation_data(mindf=10, complete_prob=True):
    geolocate.initialize(granularity=params.BUCKET_SIZE, write=False, readText=True,
                         reload_init=False, regression=params.do_not_discretize)
    params.X_train, params.Y_train, params.U_train, \
        params.X_dev, params.Y_dev, params.U_dev, \
        params.X_test, params.Y_test, params.U_test, \
        params.categories, params.feature_names = geolocate.feature_extractor(
            norm=params.norm, use_mention_dictionary=False, min_df=mindf, max_df=0.2,
            stop_words='english', binary=True, sublinear_tf=False, vocab=None,
            use_idf=True, save_vectorizer=False)
    if complete_prob:
        # turn the 1-d class indices into one-hot rows (a degenerate probability distribution)
        Y_train = np.zeros((params.X_train.shape[0], len(params.categories)), dtype='int32')
        Y_dev = np.zeros((params.X_dev.shape[0], len(params.categories)), dtype='int32')
        for i in range(params.Y_dev.shape[0]):
            Y_dev[i, params.Y_dev[i]] = 1
        for i in range(params.Y_train.shape[0]):
            Y_train[i, params.Y_train[i]] = 1
        return params.X_train, Y_train, params.X_dev, Y_dev
    else:
        return params.X_train, params.Y_train, params.X_dev, params.Y_dev
# ############################# Batch iterator ###############################
# This is just a simple helper function iterating over training data in
# mini-batches of a particular size, optionally in random order. It assumes
# data is available as numpy arrays. For big datasets, you could load numpy
# arrays as memory-mapped files (np.load(..., mmap_mode='r')), or write your
# own custom data iteration function. For small datasets, you can also copy
# them to GPU at once for slightly improved performance. This would involve
# several changes in the main program, though, and is not demonstrated here.
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert inputs.shape[0] == targets.shape[0]
    if shuffle:
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)
    for start_idx in range(0, inputs.shape[0] - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]
def nn_model(X_train, Y_train, X_test, Y_test, n_epochs=20, batch_size=50,
             init_parameters=None, complete_prob=True, add_hidden=False, regul_coef=5e-5):
    logging.info('building the network...')
    in_size = X_train.shape[1]
    if complete_prob:
        out_size = Y_train.shape[1]
    else:
        out_size = len(set(Y_train.tolist()))
    # Prepare Theano variables for inputs and targets
    if not sp.sparse.issparse(X_train):
        X_sym = T.matrix()
    else:
        X_sym = S.csr_matrix(name='inputs', dtype='float32')
    if complete_prob:
        y_sym = T.matrix()
    else:
        y_sym = T.ivector()
    l_in = lasagne.layers.InputLayer(shape=(None, in_size),
                                     input_var=X_sym)
    drop_input = False
    if drop_input:
        l_in = lasagne.layers.dropout(l_in, p=0.2)
    if add_hidden:
        if not sp.sparse.issparse(X_train):
            l_hid1 = lasagne.layers.DenseLayer(
                l_in, num_units=300,
                nonlinearity=lasagne.nonlinearities.rectify,
                W=lasagne.init.GlorotUniform())
        else:
            l_hid1 = SparseInputDenseLayer(
                l_in, num_units=300,
                nonlinearity=lasagne.nonlinearities.rectify,
                W=lasagne.init.GlorotUniform())
        l_out = lasagne.layers.DenseLayer(
            l_hid1, num_units=out_size,
            nonlinearity=lasagne.nonlinearities.softmax)
    else:
        if not sp.sparse.issparse(X_train):
            l_out = lasagne.layers.DenseLayer(
                l_in, num_units=out_size,
                nonlinearity=lasagne.nonlinearities.softmax)
        else:
            l_out = SparseInputDenseLayer(
                l_in, num_units=out_size,
                nonlinearity=lasagne.nonlinearities.softmax)
    if add_hidden:
        embedding = lasagne.layers.get_output(l_hid1, X_sym)
        f_get_embeddings = theano.function([X_sym], embedding)
    output = lasagne.layers.get_output(l_out, X_sym)
    pred = output.argmax(-1)
    # categorical_crossentropy accepts either integer class labels (ivector)
    # or a matrix of target distributions, so soft labels work here directly
    loss = lasagne.objectives.categorical_crossentropy(output, y_sym)
    #loss = lasagne.objectives.multiclass_hinge_loss(output, y_sym)
    l1_share = 0.9
    l1_penalty = lasagne.regularization.regularize_layer_params(l_out, l1) * regul_coef * l1_share
    l2_penalty = lasagne.regularization.regularize_layer_params(l_out, l2) * regul_coef * (1 - l1_share)
    loss = loss + l1_penalty + l2_penalty
    loss = loss.mean()
    if complete_prob:
        y_sym_one_hot = y_sym.argmax(-1)
        acc = T.mean(T.eq(pred, y_sym_one_hot))
    else:
        acc = T.mean(T.eq(pred, y_sym))
    if init_parameters:
        lasagne.layers.set_all_param_values(l_out, init_parameters)
    parameters = lasagne.layers.get_all_params(l_out, trainable=True)
    #print(params)
    #updates = lasagne.updates.nesterov_momentum(loss, parameters, learning_rate=0.01, momentum=0.9)
    #updates = lasagne.updates.sgd(loss, parameters, learning_rate=0.01)
    #updates = lasagne.updates.adagrad(loss, parameters, learning_rate=0.1, epsilon=1e-6)
    #updates = lasagne.updates.adadelta(loss, parameters, learning_rate=0.1, rho=0.95, epsilon=1e-6)
    updates = lasagne.updates.adam(loss, parameters, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8)
    f_train = theano.function([X_sym, y_sym], [loss, acc], updates=updates)
    f_val = theano.function([X_sym, y_sym], [loss, acc])
    f_predict = theano.function([X_sym], pred)
    f_predict_proba = theano.function([X_sym], output)
    do_scale = False
    #X_train = X_train.todense()
    #X_test = X_test.todense()
    if do_scale:
        from sklearn import preprocessing
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    #X = X_train.todense().astype(theano.config.floatX)
    #Xt = X_test.todense().astype(theano.config.floatX)
    X = X_train.astype('float32')
    Xt = X_test.astype('float32')
    #X = X_train.astype(theano.config.floatX)
    #Xt = X_test.astype(theano.config.floatX)
    if complete_prob:
        Y = Y_train.astype('float32')
        Yt = Y_test.astype('float32')
    else:
        Y = Y_train.astype('int32')
        Yt = Y_test.astype('int32')
    logging.info('training (n_epochs, batch_size) = (' + str(n_epochs) + ', ' + str(batch_size) + ')')
    for n in xrange(n_epochs):
        for batch in iterate_minibatches(X, Y, batch_size, shuffle=True):
            x_batch, y_batch = batch
            l_train, acc_train = f_train(x_batch, y_batch)
        # evaluate on the held-out set once per epoch
        l_val, acc_val = f_val(Xt, Yt)
        logging.info('epoch ' + str(n) + ' ,train_loss ' + str(l_train) + ' ,acc ' + str(acc_train) +
                     ' ,val_loss ' + str(l_val) + ' ,acc ' + str(acc_val))
        geolocate.loss(f_predict(Xt), U_eval=params.U_dev)
    logging.info(str(regul_coef))
    if add_hidden:
        X_embs = f_get_embeddings(X)
        Xt_embs = f_get_embeddings(Xt)
    train_probs = f_predict_proba(X)
    return train_probs
    #pdb.set_trace()
if __name__ == '__main__':
    #X_train, Y_train, X_test, Y_test = load_data()
    X_train, Y_train, X_test, Y_test = load_geolocation_data(complete_prob=True)
    if params.DATASET_NUMBER == 1:
        if params.TEXT_ONLY:
            _coef = 1e-4
        else:
            _coef = 9e-5
    #for _coef in [1e-7, 2e-7, 4e-7, 8e-7, 1e-6, 8e-6]:
    #for _coef in [1e-5, 2e-5, 4e-5, 6e-5, 8e-5, 1e-4, 2e-4]:
    for _coef in [_coef]:
        logging.info(str(_coef))
        nn_model(X_train, Y_train, X_test, Y_test, complete_prob=True, regul_coef=_coef)