GRU implementation in Theano

Tags: python, neural-network, theano, deep-learning, gated-recurrent-unit


Starting from the LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py), I changed the LSTM layer code (i.e. the functions lstm_layer() and param_init_lstm()) so that it implements a GRU instead.

The provided LSTM code trains well, but my GRU does not: the training-set accuracy with the LSTM goes up to 1 (train cost = 0), while with the GRU it stagnates at 0.7 (train cost = 0.3).

Below is the code I use for the GRU. I kept the same function names as in the tutorial, so that one can copy-paste the code directly into it. What could explain the poor performance of the GRU?

import numpy as np
# theano, tensor, config, ortho_weight, _p and numpy_floatX are the ones
# imported/defined in the tutorial's lstm.py, into which this code is pasted.

def param_init_lstm(options, params, prefix='lstm'):
    """
    GRU
    """
    W = np.concatenate([ortho_weight(options['dim_proj']),   # input weights for the reset gate
                        ortho_weight(options['dim_proj']),   # input weights for the update gate
                        ortho_weight(options['dim_proj'])],  # input weights for the candidate hidden state
                       axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(options['dim_proj']),   # recurrent weights for the reset gate
                        ortho_weight(options['dim_proj']),   # recurrent weights for the update gate
                        ortho_weight(options['dim_proj'])],  # recurrent weights for the candidate hidden state
                       axis=1)
    params[_p(prefix, 'U')] = U

    b = np.zeros((3 * options['dim_proj'],))  # biases for the reset gate, the update gate and the candidate
    params[_p(prefix, 'b')] = b.astype(config.floatX)
    return params


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) # reset gate
        u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) # update gate

        U_h_t = _slice(tparams[_p(prefix, 'U')], 2, options['dim_proj'])
        x_h_t = _slice(x_, 2, options['dim_proj'])

        h_t_temp = tensor.tanh(tensor.dot(r * h_, U_h_t) + x_h_t)  # candidate hidden state
        h = (1. - u) * h_ + u * h_t_temp
        h = m_[:, None] * h + (1. - m_)[:, None] * h_  # keep the previous state where the mask is 0

        return h

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj'] 
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    return rval[0]

Solution

  • The issue comes from the last line, return rval[0]: it should instead be return rval.

    The LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py) uses return rval[0] because outputs_info contains 2 elements:

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]
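
    In this case rval is a list of two Theano variables, the sequence of hidden states h and the sequence of memory cells c, and rval[0] selects the hidden states, which are the only part needed downstream.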
    

    In the GRU, outputs_info contains just one element:

    outputs_info=[tensor.alloc(numpy_floatX(0.),
                               n_samples,
                               dim_proj)],
    

    and despite the brackets, scan won't return a list of Theano variables representing its outputs, but directly a single Theano variable (the full sequence of hidden states).
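
    Here is a minimal standalone sketch (assuming Theano is installed; the variable names are made up for the example) illustrating this behaviour of theano.scan:

    import theano
    import theano.tensor as tensor

    x = tensor.matrix('x')  # shape: (n_steps, dim)

    # one entry in outputs_info -> scan returns a single Theano variable
    single, _ = theano.scan(lambda x_t, acc: acc + x_t,
                            sequences=[x],
                            outputs_info=[tensor.zeros_like(x[0])])

    # two entries in outputs_info -> scan returns a list of two Theano variables
    pair, _ = theano.scan(lambda x_t, acc, prod: (acc + x_t, prod * x_t),
                          sequences=[x],
                          outputs_info=[tensor.zeros_like(x[0]),
                                        tensor.ones_like(x[0])])

    print(type(single))  # a TensorVariable, not a list
    print(type(pair))    # a plain Python list of two TensorVariables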

    The rval is then fed to a pooling layer (in this case, a mean pooling layer):

    [figure: the tutorial's model, showing the whole sequence of hidden states returned by the recurrent layer being fed to the mean pooling layer]

    By taking only rval[0] in the GRU, since in the GRU code rval is a single Theano variable and not a list of Theano variables, you removed the part in the red rectangle:

    [figure: the same model, with the part that gets discarded by rval[0] outlined in red]

    which means you tried to perform the sentence classification using only the hidden state of the first timestep, i.e. just the first word.
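
    In other words, the only change needed in the GRU code from the question is the last line of lstm_layer(); as a sketch, the end of the function becomes:

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    # rval is already the full sequence of hidden states (a single Theano
    # variable of shape (nsteps, n_samples, dim_proj)), so return it directly
    return rval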


    Another GRU implementation that can be plugged into the LSTM tutorial:

    # weight initializer, normal by default
    def norm_weight(nin, nout=None, scale=0.01, ortho=True):
        if nout is None:
            nout = nin
        if nout == nin and ortho:
            W = ortho_weight(nin)
        else:
            W = scale * numpy.random.randn(nin, nout)
        return W.astype('float32')
    
    def param_init_lstm(options, params, prefix='lstm'):
        """
        GRU. Source: https://github.com/kyunghyuncho/dl4mt-material/blob/master/session0/lm.py
        """
        nin = options['dim_proj']
        dim = options['dim_proj']
        # embedding to gates transformation weights, biases
        W = numpy.concatenate([norm_weight(nin, dim),
                               norm_weight(nin, dim)], axis=1)
        params[_p(prefix, 'W')] = W
        params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
    
        # recurrent transformation weights for gates
        U = numpy.concatenate([ortho_weight(dim),
                               ortho_weight(dim)], axis=1)
        params[_p(prefix, 'U')] = U
    
        # embedding to hidden state proposal weights, biases
        Wx = norm_weight(nin, dim)
        params[_p(prefix, 'Wx')] = Wx
        params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')
    
        # recurrent transformation weights for hidden state proposal
        Ux = ortho_weight(dim)
        params[_p(prefix, 'Ux')] = Ux
        return params
    
    
    def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    
        nsteps = state_below.shape[0]
    
        if state_below.ndim == 3:
            n_samples = state_below.shape[1]
        else:
            n_samples = state_below.shape[0]
    
        dim = tparams[_p(prefix, 'Ux')].shape[1]
    
        if mask is None:
            mask = tensor.alloc(1., state_below.shape[0], 1)
    
        # utility function to slice a tensor
        def _slice(_x, n, dim):
            if _x.ndim == 3:
                return _x[:, :, n*dim:(n+1)*dim]
            return _x[:, n*dim:(n+1)*dim]
    
        # state_below is the input word embeddings
        # input to the gates, concatenated
        state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
            tparams[_p(prefix, 'b')]
        # input to compute the hidden state proposal
        state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
            tparams[_p(prefix, 'bx')]
    
        # step function to be used by scan
        # arguments    | sequences |outputs-info| non-seqs
        def _step_slice(m_, x_, xx_,  h_,          U, Ux):
            preact = tensor.dot(h_, U)
            preact += x_
    
            # reset and update gates
            r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
            u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
    
            # compute the hidden state proposal
            preactx = tensor.dot(h_, Ux)
            preactx = preactx * r
            preactx = preactx + xx_
    
            # hidden state proposal
            h = tensor.tanh(preactx)
    
            # leaky integrate and obtain next hidden state
            h = u * h_ + (1. - u) * h
            h = m_[:, None] * h + (1. - m_)[:, None] * h_
    
            return h
    
        # prepare scan arguments
        seqs = [mask, state_below_, state_belowx]
        _step = _step_slice
        shared_vars = [tparams[_p(prefix, 'U')],
                       tparams[_p(prefix, 'Ux')]]
    
        init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0)
    
        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info=[init_state],
                                    non_sequences=shared_vars,
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    strict=True)
        return rval
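
    For context, in the tutorial the sequence of hidden states returned by this layer (proj) is then mean-pooled over time before the classifier. Roughly, the consuming code in the tutorial's build_model() looks like this (calling the layer function directly here for brevity, and reusing the same mask):

    proj = lstm_layer(tparams, emb, options, prefix=options['encoder'], mask=mask)
    if options['encoder'] == 'lstm':
        # mean pooling over time, ignoring padded positions thanks to the mask
        proj = (proj * mask[:, :, None]).sum(axis=0)
        proj = proj / mask.sum(axis=0)[:, None]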
    

    As a side note, Keras guards against this inconsistency in scan's return type as follows:

    results, _ = theano.scan(
        _step,
        sequences=inputs,
        outputs_info=[None] + initial_states,
        go_backwards=go_backwards)
    
    # deal with Theano API inconsistency
    if type(results) is list:
        outputs = results[0]
        states = results[1:]
    else:
        outputs = results
        states = []
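
    This way, downstream code always receives the per-step outputs in outputs and any extra recurrent states in states, regardless of whether theano.scan returned a list or a single variable.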