Tags: tensorflow, keras, tpu

How to create a bi-input TPU model for images?


I want to convert my GPU model to a TPU model. My GPU model takes two input images and produces the same output for both of them. I use a custom data generator for this. There are two parallel networks, one for each input.

Following this StackOverflow question, I tried to solve this, but failed. Here is what I tried:

dataset_12 = tf.data.Dataset.from_tensor_slices((left_train_paths, right_train_paths))
dataset_label = tf.data.Dataset.from_tensor_slices(train_labels) 
dataset = tf.data.Dataset.zip((dataset_12, dataset_label)).batch(2).repeat()
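
Note that zip nests the element structure here: each element is ((left_path, right_path), label), not a flat three-tuple, so a function passed to .map() receives two arguments, the path pair as a single tuple plus the label. A quick way to confirm this (a sketch, using the dataset variable above):

print(dataset.element_spec)
# Expected structure (exact shapes depend on your data):
# ((TensorSpec(shape=(None,), dtype=tf.string, name=None),
#   TensorSpec(shape=(None,), dtype=tf.string, name=None)),
#  TensorSpec(shape=(None, 8), dtype=tf.int64, name=None))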

The problem I am facing is that I am unable to decode the pair of input images. Here is the decoder function:

def decode_image(filename, label=None, image_size=(IMG_SIZE_h, IMG_SIZE_w)):
    bits = tf.io.read_file(filename)
    image = tf.image.decode_jpeg(bits, channels=3)
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.image.resize(image, image_size)
    
    # convert to numpy and do some cv2 stuff here, maybe?
    
    if label is None:
        return image
    else:
        return image, label

The issue is that I am unable to pass both images to the decoder function at the same time. How can I resolve this?

I also tried to decode the images in the following way:

def decode(img, image_size=(IMG_SIZE_h, IMG_SIZE_w)):
    bits = tf.io.read_file(img)
    image = tf.image.decode_jpeg(bits, channels=3)
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.image.resize(image, image_size)
    return image

def decode_image(left, right, labels=None):
    if labels is None:
        return decode(left), decode(right)
    else:
        return decode(left), decode(right), labels

image = tf.data.Dataset.from_tensor_slices((left_train_paths, right_train_paths, train_labels))
dataset = image.map(decode_image, num_parallel_calls=AUTO).repeat().shuffle(512).batch(BATCH_SIZE).prefetch(AUTO)
dataset

The dataset variable now evaluates to <PrefetchDataset shapes: ((None, 760, 760, 3), (None, 760, 760, 3), (None, 8)), types: (tf.float32, tf.float32, tf.int64)>.

How can I pass it to the model now?

Model

def get_model():
    
    left_tensor = Input(shape=(IMG_SIZE_h,IMG_SIZE_w,3))
    right_tensor = Input(shape=(IMG_SIZE_h,IMG_SIZE_w,3))

    left_model = EfficientNetB3(input_shape=(IMG_SIZE_h, IMG_SIZE_w, 3), include_top=False, weights='imagenet', input_tensor=left_tensor)
    right_model = EfficientNetB3(input_shape=(IMG_SIZE_h, IMG_SIZE_w, 3), include_top=False, weights='imagenet', input_tensor=right_tensor)
    con = concatenate([left_model.output, right_model.output])
    GAP = GlobalAveragePooling2D()(con)
    out = Dense(8, activation='sigmoid')(GAP)
    model = Model(inputs=[left_tensor, right_tensor], outputs=out)

    return model
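
One direct way to feed the three-tuple dataset from above into this two-input model is to repack each element as ((left, right), label), which is the nesting Keras expects for multi-input models. A minimal sketch, assuming the image dataset and the constants defined earlier (the name pairs and the compile settings are illustrative):

pairs = image.map(decode_image, num_parallel_calls=AUTO)
# Repack (left, right, label) as ((left, right), label) to match the two model inputs
pairs = pairs.map(lambda left, right, label: ((left, right), label))
pairs = pairs.repeat().shuffle(512).batch(BATCH_SIZE).prefetch(AUTO)

model = get_model()
model.compile(optimizer='adam', loss='binary_crossentropy')  # illustrative settings
model.fit(pairs, steps_per_epoch=100, epochs=10)  # steps_per_epoch needed with repeat()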

Solution

  • I found a pretty elegant solution. I will explain it step by step, since it may be a bit different from what you had in mind:

    1. When decoding the images, stack both of them into a single tensor, so the model input will be of shape [2, IMAGE_H, IMAGE_W, 3]:
    import tensorflow as tf

    def decode_single(im_path, image_size):
        bits = tf.io.read_file(im_path)
        image = tf.image.decode_jpeg(bits, channels=3)
        image = tf.cast(image, tf.float32) / 255.0
        image = tf.image.resize(image, image_size)
        return image
    
    # Note that the image paths are packed in a tuple, and we unpack them inside the function
    def decode(paths, label=None, image_size=(128, 128)):
        image_path1, image_path2 = paths
        im1 = decode_single(image_path1, image_size)
        im2 = decode_single(image_path2, image_size)
        images = tf.stack([im1, im2])
    
        if label is not None:
            return images, label
    
        return images
    
    2. I declare the data pipeline so the paths are packed in a tuple:
    label_ds = ...
    ds = tf.data.Dataset.from_tensor_slices((left_paths, right_paths))
    ds = tf.data.Dataset.zip((ds, label_ds))  # yields ((im_path1, im_path2), label), not (im_path1, im_path2, label)
    ds = ds.map(decode).batch(4)
    print(ds)
    # Out: <BatchDataset shapes: ((None, 2, 128, 128, 3), ((None,),)), types: (tf.float32, (tf.int32,))>
    
    3. Since we are feeding batches of stacked image pairs of shape (None, 2, 128, 128, 3), declare the model with a single input of shape (2, HEIGHT, WIDTH, 3) and then split that input into the two images:
    import tensorflow as tf
    from tensorflow.keras.layers import Input, Lambda, Reshape, Conv2D, Concatenate, GlobalAveragePooling2D, Dense

    def get_model():
        input_layer = Input(shape=(2, IMAGE_H, IMAGE_W, 3))
        # Split the stacked tensor into the two images (stacked as [left, right] in decode above)
        left_image, right_image = Lambda(lambda x: tf.split(x, 2, axis=1))(input_layer)
        
        right_image = Reshape([IMAGE_H, IMAGE_W, 3])(right_image)
        left_image = Reshape([IMAGE_H, IMAGE_W, 3])(left_image)
        # Replace by EfficientNets
        left_model =  Conv2D(64, 3)(left_image)
        right_model = Conv2D(64, 3)(right_image)
        con = Concatenate(-1)([left_model, right_model])
        GAP = GlobalAveragePooling2D()(con)
        out = Dense(8, activation = 'sigmoid')(GAP)
        model = tf.keras.Model(inputs=input_layer, outputs=out)
    
        return model
    
    4. Finally, compile and train the model as usual (a TPU setup sketch follows after these steps):
    model = get_model()
    model.compile(...)
    model.fit(ds, epochs=10)
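
    Since the question is about running this on a TPU, the model should be created and compiled inside a TPU distribution strategy scope. A minimal sketch of the standard TF 2.x setup (assuming a Colab/Cloud TPU runtime; optimizer and loss are illustrative):

    import tensorflow as tf

    # Standard TF 2.x TPU initialization; tpu='' works on Colab.
    # On TF < 2.3 use tf.distribute.experimental.TPUStrategy instead.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)

    # Create and compile the model under the strategy scope so its variables
    # are placed on the TPU.
    with strategy.scope():
        model = get_model()
        model.compile(optimizer='adam', loss='binary_crossentropy')

    model.fit(ds, epochs=10)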