python-3.x, tensorflow, machine-learning, keras, nlp

How to add an attention layer to a Bi-LSTM


I am developing a Bi-LSTM model and want to add an attention layer to it, but I am not sure how to do it.

My current code for the model is:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, BatchNormalization, Activation,
                                     Dropout, Bidirectional, LSTM, Dense)

# max_words, max_len and the pretrained `embeddings` matrix are defined elsewhere
model = Sequential()
model.add(Embedding(max_words, 1152, input_length=max_len, weights=[embeddings]))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()

And the model summary is:

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 1152, 1152)        278396928 
_________________________________________________________________
batch_normalization_1 (Batch (None, 1152, 1152)        4608      
_________________________________________________________________
activation_1 (Activation)    (None, 1152, 1152)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1152, 1152)        0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                303360    
_________________________________________________________________
batch_normalization_2 (Batch (None, 64)                256       
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
=================================================================
Total params: 278,705,217
Trainable params: 278,702,785
Non-trainable params: 2,432

Solution

  • A possible solution is a custom layer that computes attention over the positional/temporal dimension:

    from tensorflow.keras.layers import Layer
    from tensorflow.keras import backend as K

    class Attention(Layer):

        def __init__(self, return_sequences=True):
            self.return_sequences = return_sequences
            super(Attention, self).__init__()

        def build(self, input_shape):
            # one scoring weight per feature and one bias per timestep
            self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                     initializer="normal")
            self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                     initializer="zeros")
            super(Attention, self).build(input_shape)

        def call(self, x):
            # e: unnormalized score per timestep, a: attention weights (softmax over time)
            e = K.tanh(K.dot(x, self.W) + self.b)
            a = K.softmax(e, axis=1)
            output = x * a  # weight each timestep by its attention

            if self.return_sequences:
                return output  # 3D: attention-weighted sequence

            return K.sum(output, axis=1)  # 2D: weighted sum over timesteps
    

    It's built to receive 3D tensors and to output either 3D tensors (return_sequences=True) or 2D tensors (return_sequences=False). A quick shape check and a dummy example follow below.
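
    As a quick sanity check (a small sketch of my own, not part of the original code), applying the layer to a random 3D tensor shows both output shapes:

    # Hypothetical shape check, assuming the Attention class defined above
    import tensorflow as tf

    x = tf.random.normal((2, 10, 8))  # (batch, timesteps, features)
    print(Attention(return_sequences=True)(x).shape)   # (2, 10, 8): weighted sequence
    print(Attention(return_sequences=False)(x).shape)  # (2, 8): weighted sum over timesteps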

    # dummy data creation

    import numpy as np
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

    max_len = 100
    max_words = 333
    emb_dim = 126

    n_sample = 5
    X = np.random.randint(0, max_words, (n_sample, max_len))
    Y = np.random.randint(0, 2, n_sample)

    With return_sequences=True:

    model = Sequential()
    model.add(Embedding(max_words, emb_dim, input_length=max_len))
    model.add(Bidirectional(LSTM(32, return_sequences=True)))
    model.add(Attention(return_sequences=True)) # receive 3D and output 3D
    model.add(LSTM(32))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    
    model.compile('adam', 'binary_crossentropy')
    model.fit(X,Y, epochs=3)
    

    With return_sequences=False:

    model = Sequential()
    model.add(Embedding(max_words, emb_dim, input_length=max_len))
    model.add(Bidirectional(LSTM(32, return_sequences=True)))
    model.add(Attention(return_sequences=False)) # receive 3D and output 2D
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    
    model.compile('adam', 'binary_crossentropy')
    model.fit(X,Y, epochs=3)
    

    You can easily integrate it into your own network.
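
    For instance (a sketch on my side, not taken from the original notebook), the layer could be dropped into your posted architecture like this; the key change is return_sequences=True on the Bidirectional LSTM so the attention layer receives the full sequence:

    # Sketch: your original model with the Attention layer inserted.
    # Assumes max_words, max_len and the pretrained `embeddings` matrix from your question.
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import (Embedding, BatchNormalization, Activation,
                                         Dropout, Bidirectional, LSTM, Dense)

    model = Sequential()
    model.add(Embedding(max_words, 1152, input_length=max_len, weights=[embeddings]))
    model.add(BatchNormalization())
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(32, return_sequences=True)))  # keep the time dimension
    model.add(Attention(return_sequences=False))               # 3D in, 2D out
    model.add(BatchNormalization())
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()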

    Here is the running notebook.