Tags: python, json, tensorflow, tensorflow2.0, tensorflow-datasets

How to load and map a list of dictionaries/JSONs with tf.data.Dataset


I have a dataset whose records are stored in a list of dictionaries. The dictionaries can be fairly complex. I would like to load this list through the TensorFlow Dataset API (tf.data). How can I do this? I tried something like the following, but it does not work:

import tensorflow as tf
import json

LABELS_IDS = ["cat", "dog", "animal"]

def parse_record(record):
    image = tf.io.read_file(record["_file"])
    image = tf.image.decode_jpeg(image)
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, [224, 224])
    image = tf.image.random_flip_left_right(image, seed=None)

    labels = []
    for element in record["_categories"]:
        if element in LABELS_IDS:
            labels.append(LABELS_IDS.index(element))

    one_hot_labels = tf.reduce_sum(tf.one_hot(labels, len(LABELS_IDS)), axis=0)
    return image, one_hot_labels

records = [{"_file":"images/test.jpg", "_categories": ["cat", "animal"]}]
    
train_x = tf.data.Dataset.from_tensor_slices(records).map(parse_record)
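
(As far as I can tell, the snippet above fails because from_tensor_slices cannot turn a Python list of dictionaries into tensors, especially when the "_categories" lists have different lengths, and the Python for loop and membership test inside parse_record run on symbolic tensors once the function is traced. from_tensor_slices does accept a dictionary of equal-length lists; a small, purely illustrative example:)

# from_tensor_slices can slice a dict of equal-length lists into per-record elements.
ds = tf.data.Dataset.from_tensor_slices({"_file": ["images/a.jpg", "images/b.jpg"]})
for element in ds:
    print(element["_file"])  # tf.Tensor(b'images/a.jpg', shape=(), dtype=string), ...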

Edit:

I found an answer to this: you can preprocess the records with separate plain-Python functions and build the dataset from the resulting file paths and label vectors:

LABELS_IDS = ["cat", "dog", "animal"]
records = [{"_file":"images/test.jpg", "_categories": ["cat", "animal"]}]

def _load_files(records):
    return [record["_file"] for record in records]

def _load_labels(records):
    vectors = []
    for record in records:
        labels = []
        for element in record["_categories"]:
            if element in LABELS_IDS:
                labels.append(LABELS_IDS.index(element))

        # multi-hot encode the label indices collected above
        one_hot = tf.reduce_sum(tf.one_hot(labels, len(LABELS_IDS)), axis=0)
        vectors.append(one_hot.numpy())
    return vectors

def _load_data(file_path, label):
    image = tf.io.read_file(file_path)
    image = tf.image.decode_image(image, channels=3, expand_animations=False)
    return image, label

data = (
  _load_files(records),
  _load_labels(records)
)

train_x = tf.data.Dataset.from_tensor_slices(data).map(_load_data)
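
The resulting train_x yields (image, label) pairs, where label is a multi-hot vector over LABELS_IDS; a quick, purely illustrative way to check it:

for image, label in train_x.take(1):
    print(image.shape)  # (height, width, 3); sizes vary until the images are resized
    print(label)        # e.g. [1. 0. 1.] for ["cat", "animal"]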

Solution

  • For the benefit of the community, I am adding @Cospel's answer here:

    LABELS_IDS = ["cat", "dog", "animal"]
    records = [{"_file":"images/test.jpg", "_categories": ["cat", "animal"]}]
    
    def _load_files(records):
        return [record["_file"] for record in records]
    
    def _load_labels(records):
        vectors = []
        for record in records:
            labels = []
            for element in record["_categories"]:
                if element in LABELS_IDS:
                    labels.append(LABELS_IDS.index(element))
    
            # multi-hot encode the label indices collected above
            one_hot = tf.reduce_sum(tf.one_hot(labels, len(LABELS_IDS)), axis=0)
            vectors.append(one_hot.numpy())
        return vectors
    
    def _load_data(file_path, label):
        image = tf.io.read_file(file_path)
        image = tf.image.decode_image(image, channels=3, expand_animations=False)
        return image, label
    
    data = (
      _load_files(records),
      _load_labels(records)
    )
    
    train_x = tf.data.Dataset.from_tensor_slices(data).map(_load_data)
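
One possible follow-up: the original attempt also converted the image to float32, resized it to 224x224, and applied a random horizontal flip, and batching requires a common image size. A minimal sketch of folding that preprocessing into _load_data (the 224x224 target and the batch size of 32 are example values, not part of the original answer):

def _load_data(file_path, label):
    image = tf.io.read_file(file_path)
    image = tf.image.decode_image(image, channels=3, expand_animations=False)
    image.set_shape([None, None, 3])                 # ensure a static rank for resize
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, [224, 224])       # fixed size so batches stack
    image = tf.image.random_flip_left_right(image)   # simple train-time augmentation
    return image, label

train_x = tf.data.Dataset.from_tensor_slices(data).map(_load_data).batch(32)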