GRU implementation in Theano

Tags: python, neural-network, theano, deep-learning, gated-recurrent-unit


Starting from the LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py), I changed the LSTM layer code (i.e. the functions lstm_layer() and param_init_lstm()) so that it implements a GRU instead.

The provided LSTM code trains well, but my GRU does not: the training-set accuracy with the LSTM goes up to 1 (train cost = 0), while with the GRU it stagnates at 0.7 (train cost = 0.3).

Below is the code I use for the GRU. I kept the same function names as in the tutorial, so that one can copy-paste the code directly into it. What could explain the poor performance of the GRU?

import numpy as np
# theano, tensor, config, ortho_weight, _p and numpy_floatX are the ones
# imported/defined in the tutorial's lstm.py, into which this code is pasted.

def param_init_lstm(options, params, prefix='lstm'):
    """
    GRU
    """
    W = np.concatenate([ortho_weight(options['dim_proj']),   # input weights for the reset gate
                        ortho_weight(options['dim_proj']),   # input weights for the update gate
                        ortho_weight(options['dim_proj'])],  # input weights for the candidate hidden state
                       axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(options['dim_proj']),   # recurrent weights for the reset gate
                        ortho_weight(options['dim_proj']),   # recurrent weights for the update gate
                        ortho_weight(options['dim_proj'])],  # recurrent weights for the candidate hidden state
                       axis=1)
    params[_p(prefix, 'U')] = U

    b = np.zeros((3 * options['dim_proj'],))  # biases for the reset gate, the update gate and the candidate
    params[_p(prefix, 'b')] = b.astype(config.floatX)
    return params


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) # reset gate
        u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) # update gate

        U_h_t = _slice(tparams[_p(prefix, 'U')], 2, options['dim_proj'])
        x_h_t = _slice(x_, 2, options['dim_proj'])

        h_t_temp = tensor.tanh(tensor.dot(r * h_, U_h_t) + x_h_t)  # candidate hidden state
        h = (1. - u) * h_ + u * h_t_temp
        h = m_[:, None] * h + (1. - m_)[:, None] * h_  # keep the previous state where the mask is 0

        return h

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj'] 
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    return rval[0]

Solution

  • The issue comes from the last line, return rval[0]: it should instead be return rval.

    The LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py) uses return rval[0] because outputs_info contains 2 elements:

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]
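
    In this case rval is a list of two Theano variables, the sequence of hidden states h and the sequence of memory cells c, and rval[0] selects the hidden states, which are the only part needed downstream.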
    

    In the GRU, outputs_info contains just one element:

    outputs_info=[tensor.alloc(numpy_floatX(0.),
                               n_samples,
                               dim_proj)],
    

    and despite the brackets, scan won't return a list of Theano variables representing its outputs, but directly a single Theano variable (the full sequence of hidden states).
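
    Here is a minimal standalone sketch (assuming Theano is installed; the variable names are made up for the example) illustrating this behaviour of theano.scan:

    import theano
    import theano.tensor as tensor

    x = tensor.matrix('x')  # shape: (n_steps, dim)

    # one entry in outputs_info -> scan returns a single Theano variable
    single, _ = theano.scan(lambda x_t, acc: acc + x_t,
                            sequences=[x],
                            outputs_info=[tensor.zeros_like(x[0])])

    # two entries in outputs_info -> scan returns a list of two Theano variables
    pair, _ = theano.scan(lambda x_t, acc, prod: (acc + x_t, prod * x_t),
                          sequences=[x],
                          outputs_info=[tensor.zeros_like(x[0]),
                                        tensor.ones_like(x[0])])

    print(type(single))  # a TensorVariable, not a list
    print(type(pair))    # a plain Python list of two TensorVariables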

    The rval is then fed to a pooling layer (in this case, a mean pooling layer):

    [figure: the tutorial's model, showing the whole sequence of hidden states returned by the recurrent layer being fed to the mean pooling layer]

    By taking only rval[0] in the GRU, since in the GRU code rval is a single Theano variable and not a list of Theano variables, you removed the part in the red rectangle:

    [figure: the same model, with the part that gets discarded by rval[0] outlined in red]

    which means you tried to perform the sentence classification using only the hidden state of the first timestep, i.e. just the first word.
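
    In other words, the only change needed in the GRU code from the question is the last line of lstm_layer(); as a sketch, the end of the function becomes:

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    # rval is already the full sequence of hidden states (a single Theano
    # variable of shape (nsteps, n_samples, dim_proj)), so return it directly
    return rval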


    Another GRU implementation that can be plugged into the LSTM tutorial:

    # weight initializer, normal by default
    def norm_weight(nin, nout=None, scale=0.01, ortho=True):
        if nout is None:
            nout = nin
        if nout == nin and ortho:
            W = ortho_weight(nin)
        else:
            W = scale * numpy.random.randn(nin, nout)
        return W.astype('float32')
    
    def param_init_lstm(options, params, prefix='lstm'):
        """
        GRU. Source: https://github.com/kyunghyuncho/dl4mt-material/blob/master/session0/lm.py
        """
        nin = options['dim_proj']
        dim = options['dim_proj']
        # embedding to gates transformation weights, biases
        W = numpy.concatenate([norm_weight(nin, dim),
                               norm_weight(nin, dim)], axis=1)
        params[_p(prefix, 'W')] = W
        params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
    
        # recurrent transformation weights for gates
        U = numpy.concatenate([ortho_weight(dim),
                               ortho_weight(dim)], axis=1)
        params[_p(prefix, 'U')] = U
    
        # embedding to hidden state proposal weights, biases
        Wx = norm_weight(nin, dim)
        params[_p(prefix, 'Wx')] = Wx
        params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')
    
        # recurrent transformation weights for hidden state proposal
        Ux = ortho_weight(dim)
        params[_p(prefix, 'Ux')] = Ux
        return params
    
    
    def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    
        nsteps = state_below.shape[0]
    
        if state_below.ndim == 3:
            n_samples = state_below.shape[1]
        else:
            n_samples = state_below.shape[0]
    
        dim = tparams[_p(prefix, 'Ux')].shape[1]
    
        if mask is None:
            mask = tensor.alloc(1., state_below.shape[0], 1)
    
        # utility function to slice a tensor
        def _slice(_x, n, dim):
            if _x.ndim == 3:
                return _x[:, :, n*dim:(n+1)*dim]
            return _x[:, n*dim:(n+1)*dim]
    
        # state_below is the input word embeddings
        # input to the gates, concatenated
        state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
            tparams[_p(prefix, 'b')]
        # input to compute the hidden state proposal
        state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
            tparams[_p(prefix, 'bx')]
    
        # step function to be used by scan
        # arguments    | sequences |outputs-info| non-seqs
        def _step_slice(m_, x_, xx_,  h_,          U, Ux):
            preact = tensor.dot(h_, U)
            preact += x_
    
            # reset and update gates
            r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
            u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
    
            # compute the hidden state proposal
            preactx = tensor.dot(h_, Ux)
            preactx = preactx * r
            preactx = preactx + xx_
    
            # hidden state proposal
            h = tensor.tanh(preactx)
    
            # leaky integrate and obtain next hidden state
            h = u * h_ + (1. - u) * h
            h = m_[:, None] * h + (1. - m_)[:, None] * h_
    
            return h
    
        # prepare scan arguments
        seqs = [mask, state_below_, state_belowx]
        _step = _step_slice
        shared_vars = [tparams[_p(prefix, 'U')],
                       tparams[_p(prefix, 'Ux')]]
    
        init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0)
    
        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info=[init_state],
                                    non_sequences=shared_vars,
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    strict=True)
        return rval
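
    For context, in the tutorial the sequence of hidden states returned by this layer (proj) is then mean-pooled over time before the classifier. Roughly, the consuming code in the tutorial's build_model() looks like this (calling the layer function directly here for brevity, and reusing the same mask):

    proj = lstm_layer(tparams, emb, options, prefix=options['encoder'], mask=mask)
    if options['encoder'] == 'lstm':
        # mean pooling over time, ignoring padded positions thanks to the mask
        proj = (proj * mask[:, :, None]).sum(axis=0)
        proj = proj / mask.sum(axis=0)[:, None]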
    

    As a side note, Keras guards against this inconsistency in scan's return type as follows:

    results, _ = theano.scan(
        _step,
        sequences=inputs,
        outputs_info=[None] + initial_states,
        go_backwards=go_backwards)
    
    # deal with Theano API inconsistency
    if type(results) is list:
        outputs = results[0]
        states = results[1:]
    else:
        outputs = results
        states = []
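
    This way, downstream code always receives the per-step outputs in outputs and any extra recurrent states in states, regardless of whether theano.scan returned a list or a single variable.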