I am trying to parse a dataset in COCO format, consisting of (among other things) an input image and a list of images (masks) as output. The dataset has been converted to TFRecords using efficientdet/dataset_tools/create_coco_record.py.
Following is a snippet of the serialization:
# Image-level features: the two dimensions and the encoded image bytes.
feature_dict = {
    'image/height':
        tfrecord_util.int64_feature(image_height),
    'image/width':
        tfrecord_util.int64_feature(image_width),
    'image/encoded':
        tfrecord_util.bytes_feature(encoded_jpg),
}
...
for object_annotations in bbox_annotations:
    # Turn the annotation's segmentation into a run-length encoding and
    # decode that into a mask array (`mask` is presumably pycocotools.mask
    # -- confirm against the script's imports).
    run_len_encoding = mask.frPyObjects(object_annotations['segmentation'],
                                        image_height, image_width)
    binary_mask = mask.decode(run_len_encoding)
    # Reduce along axis 2 with a max so a single 2-D mask remains.
    binary_mask = np.amax(binary_mask, axis=2)
    pil_image = PIL.Image.fromarray(binary_mask)
    # Serialize the mask to PNG bytes in memory and collect them.
    output_io = io.BytesIO()
    pil_image.save(output_io, format='PNG')
    encoded_mask_png.append(output_io.getvalue())
if include_masks:
    # All per-object PNG byte strings go into one bytes-list feature.
    feature_dict['image/object/mask'] = (
        tfrecord_util.bytes_list_feature(encoded_mask_png))
My problem comes with the decoding of the tfrecords, where I am unable to decode the images within the mask tensor.
Following is my parsing function:
def parse_example(serialized_example):
    """Parse one serialized example into image, labels and masks.

    Args:
        serialized_example: a serialized example passed to
            `tf.io.parse_single_example`.

    NOTE: this is the failing version from the question; the final
    `decode_png` call raises the ValueError quoted below.
    """
    feature_dict = {
        'image/height': tf.io.FixedLenFeature([], tf.int64),
        'image/width': tf.io.FixedLenFeature([], tf.int64),
        'image/encoded': tf.io.FixedLenFeature([], tf.string),
        'image/object/class/label': tf.io.FixedLenSequenceFeature(
            [], tf.int64, allow_missing=True),
        'image/object/mask': tf.io.FixedLenSequenceFeature(
            [], tf.string, allow_missing=True),
    }
    example = tf.io.parse_single_example(
        serialized_example, features=feature_dict)
    raw_height = tf.cast(example['image/height'], tf.int64)
    raw_width = tf.cast(example['image/width'], tf.int64)
    image = tf.image.decode_png(example['image/encoded'], channels=3)
    image = tf.image.resize(image, (512, 512))
    labels = example['image/object/class/label']
    # Fails: 'image/object/mask' is parsed as a rank-1 tensor of strings
    # (one entry per object), but decode_png requires a rank-0 string.
    masks = tf.image.decode_png(example['image/object/mask'], channels=3)
The error I receive:
ValueError: Shape must be rank 0 but is rank 1 for '{{node DecodePng_1}} = DecodePngchannels=3, dtype=DT_UINT8' with input shapes: [?].
How would I go about decoding multiple images in a vector?
The solution was found in tf_example_decoder.py.
The following are a few snippets of the code:
Read the images as VarLenFeatures with type string
keys_to_features = {
    # ... (other features elided)
    # Variable-length list of strings: one PNG-encoded mask per object.
    'image/object/mask': tf.io.VarLenFeature(tf.string),
}
# VarLenFeature entries come back from parsing as tf.SparseTensor values.
parsed_tensors = tf.io.parse_single_example(
    serialized=serialized_example, features=keys_to_features)
Convert the sparse tensors to dense tensors:
# Densify every string-typed sparse tensor, filling missing entries with
# the empty string. Entries that are not string SparseTensors are left
# untouched. Only existing keys are reassigned, so updating the dict while
# iterating over it is safe.
for key, tensor in parsed_tensors.items():
    if isinstance(tensor, tf.SparseTensor) and tensor.dtype == tf.string:
        parsed_tensors[key] = tf.sparse.to_dense(tensor, default_value='')
Then decode the masks using:
def _decode_masks(self, parsed_tensors):
    """Decode a set of PNG masks to a tf.float32 tensor.

    Args:
        parsed_tensors: mapping that provides 'image/height',
            'image/width' and 'image/object/mask'. The mask entry is
            assumed to be a dense rank-1 string tensor holding one
            PNG-encoded mask per object -- confirm against the parsing
            code that produces it.

    Returns:
        A tf.float32 tensor with one decoded 2-D mask per entry of
        'image/object/mask'. When there are no masks, a zero tensor of
        shape [0, height, width] is returned instead.
    """
    def _decode_png_mask(png_bytes):
        # Decode as single-channel uint8, then drop the channel axis.
        mask = tf.squeeze(
            tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8),
            axis=-1)
        mask = tf.cast(mask, dtype=tf.float32)
        # Rank is 2; both dimensions are left unspecified.
        mask.set_shape([None, None])
        return mask

    height = parsed_tensors['image/height']
    width = parsed_tensors['image/width']
    masks = parsed_tensors['image/object/mask']
    # decode_png only takes a scalar string, so each element is decoded
    # separately with map_fn. The empty case is handled on its own branch.
    return tf.cond(
        pred=tf.greater(tf.size(input=masks), 0),
        true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
        false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32))