Tags: python, tensorflow, computer-vision, object-detection, tf.keras

Efficient way to preprocess data for DETR model (object detection)


import keras_core as keras  # so that I can use keras_cv
import keras_cv
from keras_cv import bounding_box
from keras_cv import visualization
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

def visualize_dataset(inputs, value_range, rows, cols, bounding_box_format):
    inputs = next(iter(inputs.take(1)))
    images, bounding_boxes = inputs["images"], inputs["bounding_boxes"]
    visualization.plot_bounding_box_gallery(
        images,
        value_range=value_range,
        rows=rows,
        cols=cols,
        y_true=bounding_boxes,
        scale=5,
        font_scale=0.7,
        bounding_box_format=bounding_box_format,
        class_mapping=class_mapping,
    )


def unpackage_raw_tfds_inputs(inputs, bounding_box_format):
  image = inputs["image"]
  boxes = keras_cv.bounding_box.convert_format(
      inputs["objects"]["bbox"],
      images=image,
      source="rel_yxyx",
      target=bounding_box_format,
  )
  bounding_boxes = {
      "classes": tf.cast(inputs["objects"]["label"] + 1, dtype=tf.float32),
      "boxes": tf.cast(boxes, dtype=tf.float32),
  }
  return {"images": tf.cast(image, tf.float32), "bounding_boxes": bounding_boxes}

def load_pascal_voc(split, dataset, bounding_box_format):
  ds = tfds.load(dataset, split=split, with_info=False, shuffle_files=True)
  ds = ds.map(
      lambda x: unpackage_raw_tfds_inputs(x, bounding_box_format=bounding_box_format),
      num_parallel_calls=tf.data.AUTOTUNE,
  )
  return ds


train_ds = load_pascal_voc(split="test", dataset="voc/2007", bounding_box_format="xywh")
val_ds = load_pascal_voc(split="validation", dataset="voc/2007", bounding_box_format="xywh")
test_ds = load_pascal_voc(split="train", dataset="voc/2007", bounding_box_format="xywh")
print(tf.data.experimental.cardinality(train_ds))
print(tf.data.experimental.cardinality(val_ds))
print(tf.data.experimental.cardinality(test_ds))

# batch into ragged batches of 1 so that I can call visualize_dataset (see the sketch below)
train_ds = train_ds.ragged_batch(1, drop_remainder=True)
val_ds = val_ds.ragged_batch(1, drop_remainder=True)
test_ds = test_ds.ragged_batch(1, drop_remainder=True)
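
class_mapping (used inside visualize_dataset) is defined elsewhere in my script; roughly, it maps the shifted label ids (label + 1) to the VOC class names, something like this:

class_names = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]
# illustrative mapping: labels were shifted by +1 above, so id 0 stays free for padding
class_mapping = dict(zip(range(1, len(class_names) + 1), class_names))

visualize_dataset(train_ds, value_range=(0, 255), rows=1, cols=1, bounding_box_format="xywh")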

Padding the classes and bounding boxes for training a DETR model from scratch in TensorFlow:

def pad(dataset):
  # target shapes: images (m, None, None, 3), classes (m, 42), boxes (m, 42, 4)
  images, classes, boxes = [], [], []

  for x in dataset:
    # collect the single image of this ragged batch as a nested Python list
    images.append(list(x["images"][0].numpy()))

    # pad the class ids with trailing zeros to a fixed length of 42
    classes.append(list(tf.keras.utils.pad_sequences(
        x["bounding_boxes"]["classes"].numpy(), maxlen=42, padding='post')[0]))

    # pad the boxes with zero rows to a fixed shape of (42, 4)
    padded_bboxes = np.zeros((1, 42, 4), dtype=np.float32)
    padded_bboxes[:, :np.shape(x["bounding_boxes"]["boxes"][0])[0], :] = x["bounding_boxes"]["boxes"][0]
    boxes.append(list(padded_bboxes[0]))

  dataset = tf.data.Dataset.from_tensor_slices((images, (classes, boxes)))
  return dataset

Is there a more efficient way to do this without converting from the tf.data.Dataset format to arrays and then back to a tf.data.Dataset?

I tried applying .map() to the datasets instead, but got an error saying that tf.keras.utils.pad_sequences can only be used in eager mode. Roughly, that attempt looked like the sketch below.
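
(Reconstructed from memory, not the exact code; pad_sequences works on NumPy arrays and Python lists, so it cannot run on the symbolic tensors that .map() passes in while tracing the graph.)

def pad_with_keras_utils(x):
  # fails inside .map(): x["bounding_boxes"]["classes"] is a symbolic tensor,
  # not a NumPy array, so pad_sequences raises an error during tracing
  padded_classes = tf.keras.utils.pad_sequences(
      [x["bounding_boxes"]["classes"]], maxlen=42, padding="post")[0]
  return {"images": x["images"],
          "bounding_boxes": {"classes": padded_classes,
                             "boxes": x["bounding_boxes"]["boxes"]}}

# train_ds.map(pad_with_keras_utils)  # -> error about eager mode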


Solution

  • Use the tf.pad function, computing the padding amount from the dynamic number of boxes per image:

    train_ds = load_pascal_voc(split="validation", dataset="voc/2007", bounding_box_format="xywh")
    
    N = 42                            # maximum number of boxes per image
    max_height, max_width = 500, 500  # fixed image size after resizing
    resize = keras_cv.layers.Resizing(
        max_height, max_width, bounding_box_format="xywh", pad_to_aspect_ratio=True)

    def preprocess(x):
      classes = x["bounding_boxes"]["classes"]
      boxes = x["bounding_boxes"]["boxes"]
      pad_amount = N - tf.shape(classes)[0]  # number of missing boxes, computed per image
      # zero-pad classes to length N and boxes to shape (N, 4); assumes at most N objects
      return resize(x["images"]), (tf.pad(classes, [[0, pad_amount]]),
                                   tf.pad(boxes, [[0, pad_amount], [0, 0]]))

    TRAIN_DS = train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    
    for i in TRAIN_DS.take(2):  # sanity-check the output shapes
      print(i[0].numpy().shape)     # image:   (500, 500, 3)
      print(i[1][0].numpy().shape)  # classes: (42,)
      print(i[1][1].numpy().shape)  # boxes:   (42, 4)
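
    Since every element now has the same runtime shape (image (500, 500, 3), classes (42,), boxes (42, 4)), you can batch the mapped dataset with a plain .batch() and feed it to training; a sketch, with the batch size only as an illustration:

    BATCH_SIZE = 8  # illustrative value, pick whatever fits your memory
    TRAIN_DS = (TRAIN_DS
                .shuffle(1000)
                .batch(BATCH_SIZE, drop_remainder=True)
                .prefetch(tf.data.AUTOTUNE))
    # model.fit(TRAIN_DS, ...)  # yields ((batch, 500, 500, 3), ((batch, 42), (batch, 42, 4)))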