
Adding Luong attention Layer to CNN


I'm using Keras to implement a functional CNN model for images of size 64x64x1, with six convolutional layers, like this:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model

num_classes = 5

def get_model():
    # Creating the CNN with the functional API.
    input_ = keras.layers.Input(shape=[64, 64, 1])
    # First block of convolutional layers.
    Conv1 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)(input_)
    Conv12 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)(Conv1)
    Conv13 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)(Conv12)
    # Max pooling with a stride of 2.
    Max1 = keras.layers.MaxPool2D(2, strides=2)(Conv13)
    # Second block of convolutional layers.
    Conv2 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)(Max1)
    Conv21 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)(Conv2)
    Conv23 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)(Conv21)
    # Another max-pooling layer.
    Max2 = keras.layers.MaxPool2D(2, strides=2)(Conv23)
    # Here I flatten the feature maps; this is the step I want to replace with the attention layer.
    Flat = keras.layers.Flatten()(Max2)
    # Dense head.
    Dense1 = keras.layers.Dense(2048, activation=tf.nn.relu)(Flat)
    Dense2 = keras.layers.Dense(700, activation=tf.nn.relu)(Dense1)
    # Output layer with softmax for class prediction.
    output = keras.layers.Dense(num_classes, activation=tf.nn.softmax)(Dense2)
    model = Model(inputs=input_, outputs=output)
    # Loss function and optimizer; I use RMSprop.
    optimizer_rmsprop = keras.optimizers.RMSprop(learning_rate=0.001, epsilon=1e-08)
    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer_rmsprop, metrics=["accuracy"])
    return model

To get better performance, I want to add this attention layer to the above CNN:

# already imported
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dropout

# Int sequences of length 4096 (= 64*64*1).
query_input = tf.keras.Input(shape=(4096,), dtype='int32')
value_input = tf.keras.Input(shape=(4096,), dtype='int32')

# Embedding lookup.
token_embedding = tf.keras.layers.Embedding(input_dim=1000, output_dim=64)
# Query embeddings of shape [batch_size, Tq, dimension].
query_embeddings = token_embedding(query_input)
# Value embeddings of shape [batch_size, Tv, dimension].
value_embeddings = token_embedding(value_input)

# CNN layer.
cnn_layer = tf.keras.layers.Conv1D(
    filters=100,
    kernel_size=4,
    # Use 'same' padding so outputs have the same shape as inputs.
    padding='same')
# Query encoding of shape [batch_size, Tq, filters].
query_seq_encoding = cnn_layer(query_embeddings)
# Value encoding of shape [batch_size, Tv, filters].
value_seq_encoding = cnn_layer(value_embeddings)

# Query-value attention of shape [batch_size, Tq, filters].
query_value_attention_seq = tf.keras.layers.Attention()(
    [query_seq_encoding, value_seq_encoding])

# Reduce over the sequence axis to produce encodings of shape
# [batch_size, filters].
query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
    query_seq_encoding)
query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
    query_value_attention_seq)

# Concatenate query and document encodings to produce a DNN input layer.
input_layer = tf.keras.layers.Concatenate()(
    [query_encoding, query_value_attention])

But the problem is that I don't know how to link the attention layer to my CNN model, because when I connect the first convolutional layer to the attention output like this: Conv1 = keras.layers.Conv2D(32, kernel_size=5)(input_layer) I get this error:

ValueError: Input 0 of layer "conv2d" is incompatible with the layer: expected min_ndim=4, found ndim=2. Full shape received: (None, 200)
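
Tracing the shapes in the attention snippet shows where that 2-D tensor comes from (None is the batch dimension):

# query_seq_encoding: (None, 4096, 100)  <- Conv1D with padding='same', filters=100
# query_encoding:     (None, 100)        <- GlobalAveragePooling1D over the sequence axis
# input_layer:        (None, 200)        <- Concatenate of two (None, 100) tensors
print(input_layer.shape)  # (None, 200): rank 2, but Conv2D needs rank-4 input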

Can someone show me how to add an attention layer to the CNN model?


Solution

  • Updated Answer: wrap the attention computation in a custom tf.keras.layers.Layer that maps each 64x64x1 input to an attended tensor of the same shape, then feed that output into the CNN via model subclassing:

    import numpy as np
    import tensorflow as tf
    from tensorflow import keras

    keras.backend.clear_session()

    class AttentionLayer(tf.keras.layers.Layer):
        def __init__(self, output_dims):
            super(AttentionLayer, self).__init__()
            self.output_dims = output_dims
            # The embedding treats each pixel as a token index, so input_dim
            # must exceed the largest integer pixel value fed to the layer.
            self.embeddings = tf.keras.layers.Embedding(input_dim=4096, output_dim=output_dims)
            self.conv = tf.keras.layers.Conv1D(2048, 4, padding='same')
            self.attn_layer = tf.keras.layers.Attention()
            self.global_pooling_1 = tf.keras.layers.GlobalAveragePooling1D()
            self.global_pooling_2 = tf.keras.layers.GlobalAveragePooling1D()
            self.concat = tf.keras.layers.Concatenate()

        def call(self, query_input, value_input):
            batch_size = tf.shape(query_input)[0]
            # Flatten each 64x64x1 image into a sequence of 4096 ints.
            query_input = tf.reshape(query_input, (batch_size, 4096))
            value_input = tf.reshape(value_input, (batch_size, 4096))
            # Query embeddings of shape [batch_size, Tq, dimension].
            query_embeddings = self.embeddings(query_input)
            # Value embeddings of shape [batch_size, Tv, dimension].
            value_embeddings = self.embeddings(value_input)
            # Query encoding of shape [batch_size, Tq, filters].
            query_seq_encoding = self.conv(query_embeddings)
            # Value encoding of shape [batch_size, Tv, filters].
            value_seq_encoding = self.conv(value_embeddings)

            # Query-value attention of shape [batch_size, Tq, filters].
            query_value_attention_seq = self.attn_layer(
                [query_seq_encoding, value_seq_encoding])

            # Reduce over the sequence axis to produce encodings of shape
            # [batch_size, filters].
            query_encoding = self.global_pooling_1(query_seq_encoding)
            query_value_attention = self.global_pooling_2(query_value_attention_seq)

            # Concatenate the two encodings: (batch_size, 2048 + 2048 = 4096).
            input_layer = self.concat([query_encoding, query_value_attention])
            # 4096 = 64 * 64, so reshape back to image form for the CNN.
            input_layer = tf.reshape(input_layer, (batch_size, 64, 64, 1))

            return input_layer
    
    num_classes = 5
    class CNNModel(tf.keras.Model):
        def __init__(self):
            super(CNNModel, self).__init__()
            self.attn_layer = AttentionLayer(64)
            # First block of convolutional layers.
            self.Conv1 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)
            self.Conv12 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)
            self.Conv13 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)
            # Max pooling with a stride of 2.
            self.Max1 = keras.layers.MaxPool2D(2, strides=2)
            # Second block of convolutional layers.
            self.Conv2 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)
            self.Conv21 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)
            self.Conv23 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)
            # Another max-pooling layer.
            self.Max2 = keras.layers.MaxPool2D(2, strides=2)
            # Flatten before the dense head.
            self.Flat = keras.layers.Flatten()
            self.Dense1 = keras.layers.Dense(2048, activation=tf.nn.relu)
            self.Dense2 = keras.layers.Dense(700, activation=tf.nn.relu)
            # Output layer with softmax for class prediction.
            self.outputs = keras.layers.Dense(num_classes, activation=tf.nn.softmax)
            
        def call(self, x):
            x = self.attn_layer(x , x)
            x = self.Conv1(x)
            x = self.Conv12(x)
            x = self.Conv13(x)
            x = self.Max1(x)
            x = self.Conv2(x)
            x = self.Conv21(x)
            x = self.Conv23(x)
            x = self.Max2(x)
            
            x = self.Flat(x)
            x = self.Dense1(x)
            x = self.Dense2(x)
            
            return self.outputs(x)
    
    model = CNNModel()
    # Smoke test on a random batch; pixel values are 0/1 so they stay
    # inside the embedding's input_dim.
    x = np.random.randint(0, 2, size=(8, 64, 64, 1))
    y = np.random.randint(0, 5, size=(8, 1))
    print(model(x).shape)  # (8, 5)
    
    optimize_rmsprop = keras.optimizers.RMSprop(learning_rate=0.001, epsilon=1e-08)
    
    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimize_rmsprop,metrics=["accuracy"])
    
    model.fit(x, y, epochs=1)
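
    A note on the design: the AttentionLayer treats every pixel as a token index, so the Embedding's input_dim (4096 here) must be larger than the biggest integer pixel value you feed in; for 8-bit images you would rescale or bucket the values first. The Conv1D uses 2048 filters so that the two pooled encodings concatenate to 4096 = 64*64 values, which is what makes the reshape back to (64, 64, 1) possible.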
    

    Output: the shape check prints (8, 5), then the model trains for one epoch on the random batch.
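
    If you would rather keep the functional API and skip embedding raw pixels, a common alternative (not from the original answer; layer sizes here are illustrative) is to apply the same dot-product Attention layer to the CNN's own feature maps, flattening the spatial grid into a sequence. A minimal sketch, assuming the same 64x64x1 inputs and five classes:

    import tensorflow as tf
    from tensorflow import keras

    num_classes = 5
    inputs = keras.layers.Input(shape=(64, 64, 1))
    x = keras.layers.Conv2D(32, kernel_size=5, activation="relu")(inputs)  # -> (60, 60, 32)
    x = keras.layers.MaxPool2D(2, strides=2)(x)                            # -> (30, 30, 32)
    x = keras.layers.Conv2D(64, kernel_size=5, activation="relu")(x)       # -> (26, 26, 64)
    x = keras.layers.MaxPool2D(2, strides=2)(x)                            # -> (13, 13, 64)
    # Flatten the 13x13 spatial grid into a sequence of 169 feature vectors.
    seq = keras.layers.Reshape((13 * 13, 64))(x)
    # Self-attention over spatial positions (query = value = the feature sequence).
    attended = keras.layers.Attention()([seq, seq])
    x = keras.layers.GlobalAveragePooling1D()(attended)
    x = keras.layers.Dense(256, activation="relu")(x)
    outputs = keras.layers.Dense(num_classes, activation="softmax")(x)
    model = keras.Model(inputs, outputs)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
    model.summary()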