
Adding Luong attention Layer to CNN


I'm using Keras to implement a functional CNN model for images of size 64x64x1, with six convolutional layers, like this:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model

num_classes = 5

def get_model():
    # Creating the CNN with the functional API.
    input_ = keras.layers.Input(shape=[64, 64, 1])
    # First block of convolutional layers.
    Conv1 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)(input_)
    Conv12 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)(Conv1)
    Conv13 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)(Conv12)
    # Max pooling with a stride of 2.
    Max1 = keras.layers.MaxPool2D(2, strides=2)(Conv13)
    # Second block of convolutional layers.
    Conv2 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)(Max1)
    Conv21 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)(Conv2)
    Conv23 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)(Conv21)
    # Another max-pooling layer.
    Max2 = keras.layers.MaxPool2D(2, strides=2)(Conv23)
    # Here I flatten the feature maps; this is the step I want to replace with the attention layer.
    Flat = keras.layers.Flatten()(Max2)
    # Dense head.
    Dense1 = keras.layers.Dense(2048, activation=tf.nn.relu)(Flat)
    Dense2 = keras.layers.Dense(700, activation=tf.nn.relu)(Dense1)
    # Output layer with softmax for class prediction.
    output = keras.layers.Dense(num_classes, activation=tf.nn.softmax)(Dense2)
    model = Model(inputs=input_, outputs=output)
    # Loss function and optimizer; I use RMSprop.
    optimizer_rmsprop = keras.optimizers.RMSprop(learning_rate=0.001, epsilon=1e-08)
    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer_rmsprop, metrics=["accuracy"])
    return model

To get better performance, I want to add this attention layer to the above CNN:

# already imported
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dropout

# Int sequences of length 4096 (= 64*64*1).
query_input = tf.keras.Input(shape=(4096,), dtype='int32')
value_input = tf.keras.Input(shape=(4096,), dtype='int32')

# Embedding lookup.
token_embedding = tf.keras.layers.Embedding(input_dim=1000, output_dim=64)
# Query embeddings of shape [batch_size, Tq, dimension].
query_embeddings = token_embedding(query_input)
# Value embeddings of shape [batch_size, Tv, dimension].
value_embeddings = token_embedding(value_input)

# CNN layer.
cnn_layer = tf.keras.layers.Conv1D(
    filters=100,
    kernel_size=4,
    # Use 'same' padding so outputs have the same shape as inputs.
    padding='same')
# Query encoding of shape [batch_size, Tq, filters].
query_seq_encoding = cnn_layer(query_embeddings)
# Value encoding of shape [batch_size, Tv, filters].
value_seq_encoding = cnn_layer(value_embeddings)

# Query-value attention of shape [batch_size, Tq, filters].
query_value_attention_seq = tf.keras.layers.Attention()(
    [query_seq_encoding, value_seq_encoding])

# Reduce over the sequence axis to produce encodings of shape
# [batch_size, filters].
query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
    query_seq_encoding)
query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
    query_value_attention_seq)

# Concatenate query and document encodings to produce a DNN input layer.
input_layer = tf.keras.layers.Concatenate()(
    [query_encoding, query_value_attention])

But the problem is that I don't know how to link the attention layer to my CNN model, because when I connect the first convolutional layer to the attention output like this: Conv1 = keras.layers.Conv2D(32, kernel_size=5)(input_layer) I get this error:

ValueError: Input 0 of layer "conv2d" is incompatible with the layer: expected min_ndim=4, found ndim=2. Full shape received: (None, 200)
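
Tracing the shapes in the attention snippet shows where that 2-D tensor comes from (None is the batch dimension):

# query_seq_encoding: (None, 4096, 100)  <- Conv1D with padding='same', filters=100
# query_encoding:     (None, 100)        <- GlobalAveragePooling1D over the sequence axis
# input_layer:        (None, 200)        <- Concatenate of two (None, 100) tensors
print(input_layer.shape)  # (None, 200): rank 2, but Conv2D needs rank-4 input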

Can someone show me how to add an attention layer to the CNN model?


Solution

  • Updated Answer: wrap the attention computation in a custom tf.keras.layers.Layer that maps each 64x64x1 input to an attended tensor of the same shape, then feed that output into the CNN via model subclassing:

    import numpy as np
    import tensorflow as tf
    from tensorflow import keras

    keras.backend.clear_session()

    class AttentionLayer(tf.keras.layers.Layer):
        def __init__(self, output_dims):
            super(AttentionLayer, self).__init__()
            self.output_dims = output_dims
            # The embedding treats each pixel as a token index, so input_dim
            # must exceed the largest integer pixel value fed to the layer.
            self.embeddings = tf.keras.layers.Embedding(input_dim=4096, output_dim=output_dims)
            self.conv = tf.keras.layers.Conv1D(2048, 4, padding='same')
            self.attn_layer = tf.keras.layers.Attention()
            self.global_pooling_1 = tf.keras.layers.GlobalAveragePooling1D()
            self.global_pooling_2 = tf.keras.layers.GlobalAveragePooling1D()
            self.concat = tf.keras.layers.Concatenate()

        def call(self, query_input, value_input):
            batch_size = tf.shape(query_input)[0]
            # Flatten each 64x64x1 image into a sequence of 4096 ints.
            query_input = tf.reshape(query_input, (batch_size, 4096))
            value_input = tf.reshape(value_input, (batch_size, 4096))
            # Query embeddings of shape [batch_size, Tq, dimension].
            query_embeddings = self.embeddings(query_input)
            # Value embeddings of shape [batch_size, Tv, dimension].
            value_embeddings = self.embeddings(value_input)
            # Query encoding of shape [batch_size, Tq, filters].
            query_seq_encoding = self.conv(query_embeddings)
            # Value encoding of shape [batch_size, Tv, filters].
            value_seq_encoding = self.conv(value_embeddings)

            # Query-value attention of shape [batch_size, Tq, filters].
            query_value_attention_seq = self.attn_layer(
                [query_seq_encoding, value_seq_encoding])

            # Reduce over the sequence axis to produce encodings of shape
            # [batch_size, filters].
            query_encoding = self.global_pooling_1(query_seq_encoding)
            query_value_attention = self.global_pooling_2(query_value_attention_seq)

            # Concatenate the two encodings: (batch_size, 2048 + 2048 = 4096).
            input_layer = self.concat([query_encoding, query_value_attention])
            # 4096 = 64 * 64, so reshape back to image form for the CNN.
            input_layer = tf.reshape(input_layer, (batch_size, 64, 64, 1))

            return input_layer
    
    num_classes = 5
    class CNNModel(tf.keras.Model):
        def __init__(self):
            super(CNNModel, self).__init__()
            self.attn_layer = AttentionLayer(64)
            # First block of convolutional layers.
            self.Conv1 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)
            self.Conv12 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)
            self.Conv13 = keras.layers.Conv2D(32, kernel_size=5, activation=tf.nn.relu)
            # Max pooling with a stride of 2.
            self.Max1 = keras.layers.MaxPool2D(2, strides=2)
            # Second block of convolutional layers.
            self.Conv2 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)
            self.Conv21 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)
            self.Conv23 = keras.layers.Conv2D(64, kernel_size=5, activation=tf.nn.relu)
            # Another max-pooling layer.
            self.Max2 = keras.layers.MaxPool2D(2, strides=2)
            # Flatten before the dense head.
            self.Flat = keras.layers.Flatten()
            self.Dense1 = keras.layers.Dense(2048, activation=tf.nn.relu)
            self.Dense2 = keras.layers.Dense(700, activation=tf.nn.relu)
            # Output layer with softmax for class prediction.
            self.outputs = keras.layers.Dense(num_classes, activation=tf.nn.softmax)
            
        def call(self, x):
            x = self.attn_layer(x , x)
            x = self.Conv1(x)
            x = self.Conv12(x)
            x = self.Conv13(x)
            x = self.Max1(x)
            x = self.Conv2(x)
            x = self.Conv21(x)
            x = self.Conv23(x)
            x = self.Max2(x)
            
            x = self.Flat(x)
            x = self.Dense1(x)
            x = self.Dense2(x)
            
            return self.outputs(x)
    
    model = CNNModel()
    # Smoke test on a random batch; pixel values are 0/1 so they stay
    # inside the embedding's input_dim.
    x = np.random.randint(0, 2, size=(8, 64, 64, 1))
    y = np.random.randint(0, 5, size=(8, 1))
    print(model(x).shape)  # (8, 5)
    
    optimize_rmsprop = keras.optimizers.RMSprop(learning_rate=0.001, epsilon=1e-08)
    
    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimize_rmsprop,metrics=["accuracy"])
    
    model.fit(x, y, epochs=1)
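
    A note on the design: the AttentionLayer treats every pixel as a token index, so the Embedding's input_dim (4096 here) must be larger than the biggest integer pixel value you feed in; for 8-bit images you would rescale or bucket the values first. The Conv1D uses 2048 filters so that the two pooled encodings concatenate to 4096 = 64*64 values, which is what makes the reshape back to (64, 64, 1) possible.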
    

    Output: the shape check prints (8, 5), then the model trains for one epoch on the random batch.
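
    If you would rather keep the functional API and skip embedding raw pixels, a common alternative (not from the original answer; layer sizes here are illustrative) is to apply the same dot-product Attention layer to the CNN's own feature maps, flattening the spatial grid into a sequence. A minimal sketch, assuming the same 64x64x1 inputs and five classes:

    import tensorflow as tf
    from tensorflow import keras

    num_classes = 5
    inputs = keras.layers.Input(shape=(64, 64, 1))
    x = keras.layers.Conv2D(32, kernel_size=5, activation="relu")(inputs)  # -> (60, 60, 32)
    x = keras.layers.MaxPool2D(2, strides=2)(x)                            # -> (30, 30, 32)
    x = keras.layers.Conv2D(64, kernel_size=5, activation="relu")(x)       # -> (26, 26, 64)
    x = keras.layers.MaxPool2D(2, strides=2)(x)                            # -> (13, 13, 64)
    # Flatten the 13x13 spatial grid into a sequence of 169 feature vectors.
    seq = keras.layers.Reshape((13 * 13, 64))(x)
    # Self-attention over spatial positions (query = value = the feature sequence).
    attended = keras.layers.Attention()([seq, seq])
    x = keras.layers.GlobalAveragePooling1D()(attended)
    x = keras.layers.Dense(256, activation="relu")(x)
    outputs = keras.layers.Dense(num_classes, activation="softmax")(x)
    model = keras.Model(inputs, outputs)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
    model.summary()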