Tags: tensorflow, keras, tpu

How to create a bi-input TPU model for images?


I want to convert my GPU model to a TPU model. My GPU model takes two input images and produces the same output for both of them. I use a custom data generator for this. There are two parallel networks, one for each input.

Following this StackOverflow question, I tried to solve this, but failed. Here is what I tried:

dataset_12 = tf.data.Dataset.from_tensor_slices((left_train_paths, right_train_paths))
dataset_label = tf.data.Dataset.from_tensor_slices(train_labels) 
dataset = tf.data.Dataset.zip((dataset_12, dataset_label)).batch(2).repeat()
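
Note that zip nests the element structure here: each element is ((left_path, right_path), label), not a flat three-tuple, so a function passed to .map() receives two arguments, the path pair as a single tuple plus the label. A quick way to confirm this (a sketch, using the dataset variable above):

print(dataset.element_spec)
# Expected structure (exact shapes depend on your data):
# ((TensorSpec(shape=(None,), dtype=tf.string, name=None),
#   TensorSpec(shape=(None,), dtype=tf.string, name=None)),
#  TensorSpec(shape=(None, 8), dtype=tf.int64, name=None))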

The problem I am facing is that I am unable to decode the pair of input images. Here is the decoder function:

def decode_image(filename, label=None, image_size=(IMG_SIZE_h, IMG_SIZE_w)):
    bits = tf.io.read_file(filename)
    image = tf.image.decode_jpeg(bits, channels=3)
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.image.resize(image, image_size)
    
    # convert to numpy and do some cv2 stuff here, maybe?
    
    if label is None:
        return image
    else:
        return image, label

The issue is that I am unable to pass both images to the decoder function at the same time. How can I resolve this?

I also tried to decode the images in the following way:

def decode(img, image_size=(IMG_SIZE_h, IMG_SIZE_w)):
    bits = tf.io.read_file(img)
    image = tf.image.decode_jpeg(bits, channels=3)
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.image.resize(image, image_size)
    return image

def decode_image(left, right, labels=None):
    if labels is None:
        return decode(left), decode(right)
    else:
        return decode(left), decode(right), labels

image = tf.data.Dataset.from_tensor_slices((left_train_paths, right_train_paths, train_labels))
dataset = image.map(decode_image, num_parallel_calls=AUTO).repeat().shuffle(512).batch(BATCH_SIZE).prefetch(AUTO)
dataset

The dataset variable now evaluates to <PrefetchDataset shapes: ((None, 760, 760, 3), (None, 760, 760, 3), (None, 8)), types: (tf.float32, tf.float32, tf.int64)>.

How can I pass it to the model now?

Model

def get_model():
    
    left_tensor = Input(shape=(IMG_SIZE_h,IMG_SIZE_w,3))
    right_tensor = Input(shape=(IMG_SIZE_h,IMG_SIZE_w,3))

    left_model = EfficientNetB3(input_shape=(IMG_SIZE_h, IMG_SIZE_w, 3), include_top=False, weights='imagenet', input_tensor=left_tensor)
    right_model = EfficientNetB3(input_shape=(IMG_SIZE_h, IMG_SIZE_w, 3), include_top=False, weights='imagenet', input_tensor=right_tensor)
    con = concatenate([left_model.output, right_model.output])
    GAP = GlobalAveragePooling2D()(con)
    out = Dense(8, activation='sigmoid')(GAP)
    model = Model(inputs=[left_tensor, right_tensor], outputs=out)

    return model
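
One direct way to feed the three-tuple dataset from above into this two-input model is to repack each element as ((left, right), label), which is the nesting Keras expects for multi-input models. A minimal sketch, assuming the image dataset and the constants defined earlier (the name pairs and the compile settings are illustrative):

pairs = image.map(decode_image, num_parallel_calls=AUTO)
# Repack (left, right, label) as ((left, right), label) to match the two model inputs
pairs = pairs.map(lambda left, right, label: ((left, right), label))
pairs = pairs.repeat().shuffle(512).batch(BATCH_SIZE).prefetch(AUTO)

model = get_model()
model.compile(optimizer='adam', loss='binary_crossentropy')  # illustrative settings
model.fit(pairs, steps_per_epoch=100, epochs=10)  # steps_per_epoch needed with repeat()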

Solution

  • I found a pretty elegant solution. I will explain it step by step, since it may be a bit different from what you had in mind:

    1. When decoding the images, stack both of them into a single tensor, so the model input will be of shape [2, IMAGE_H, IMAGE_W, 3]:
    import tensorflow as tf

    def decode_single(im_path, image_size):
        bits = tf.io.read_file(im_path)
        image = tf.image.decode_jpeg(bits, channels=3)
        image = tf.cast(image, tf.float32) / 255.0
        image = tf.image.resize(image, image_size)
        return image
    
    # Note that the image paths are packed in a tuple, and we unpack them inside the function
    def decode(paths, label=None, image_size=(128, 128)):
        image_path1, image_path2 = paths
        im1 = decode_single(image_path1, image_size)
        im2 = decode_single(image_path2, image_size)
        images = tf.stack([im1, im2])
    
        if label is not None:
            return images, label
    
        return images
    
    2. I declare the data pipeline so the paths are packed in a tuple:
    label_ds = ...
    ds = tf.data.Dataset.from_tensor_slices((left_paths, right_paths))
    ds = tf.data.Dataset.zip((ds, label_ds))  # yields ((im_path1, im_path2), label), not (im_path1, im_path2, label)
    ds = ds.map(decode).batch(4)
    print(ds)
    # Out: <BatchDataset shapes: ((None, 2, 128, 128, 3), ((None,),)), types: (tf.float32, (tf.int32,))>
    
    3. Since we are feeding batches of stacked image pairs of shape (None, 2, 128, 128, 3), declare the model with a single input of shape (2, HEIGHT, WIDTH, 3) and then split that input into the two images:
    import tensorflow as tf
    from tensorflow.keras.layers import Input, Lambda, Reshape, Conv2D, Concatenate, GlobalAveragePooling2D, Dense

    def get_model():
        input_layer = Input(shape=(2, IMAGE_H, IMAGE_W, 3))
        # Split the stacked tensor into the two images (stacked as [left, right] in decode above)
        left_image, right_image = Lambda(lambda x: tf.split(x, 2, axis=1))(input_layer)
        
        right_image = Reshape([IMAGE_H, IMAGE_W, 3])(right_image)
        left_image = Reshape([IMAGE_H, IMAGE_W, 3])(left_image)
        # Replace by EfficientNets
        left_model =  Conv2D(64, 3)(left_image)
        right_model = Conv2D(64, 3)(right_image)
        con = Concatenate(-1)([left_model, right_model])
        GAP = GlobalAveragePooling2D()(con)
        out = Dense(8, activation = 'sigmoid')(GAP)
        model = tf.keras.Model(inputs=input_layer, outputs=out)
    
        return model
    
    4. Finally, compile and train the model as usual (a TPU setup sketch follows after these steps):
    model = get_model()
    model.compile(...)
    model.fit(ds, epochs=10)
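
    Since the question is about running this on a TPU, the model should be created and compiled inside a TPU distribution strategy scope. A minimal sketch of the standard TF 2.x setup (assuming a Colab/Cloud TPU runtime; optimizer and loss are illustrative):

    import tensorflow as tf

    # Standard TF 2.x TPU initialization; tpu='' works on Colab.
    # On TF < 2.3 use tf.distribute.experimental.TPUStrategy instead.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)

    # Create and compile the model under the strategy scope so its variables
    # are placed on the TPU.
    with strategy.scope():
        model = get_model()
        model.compile(optimizer='adam', loss='binary_crossentropy')

    model.fit(ds, epochs=10)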