Search code examples
python-2.7, tensorflow, deep-learning, tensorflow-datasets, tfrecord

TFRecords for videos


I am trying to create TFRecords from a custom video dataset and I am having problems fully understanding how to set them up.

To prepare my data for storage, I wrote a script that, for a given video feed, outputs a 3D video cube (an array of shape [N_FRAMES, WIDTH, HEIGHT, CHANNEL]). I then create a TFRecord as follows:

def _int64_feature(self, value):
  """Wrap a single integer in a ``tf.train.Feature`` backed by an ``Int64List``."""
  # Int64List expects a sequence, so the scalar is placed in a one-element list.
  int_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int_list)

def _bytes_feature(self, value):
  """Wrap a single byte string in a ``tf.train.Feature`` backed by a ``BytesList``."""
  # BytesList expects a sequence, so the value is placed in a one-element list.
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)

def createDataRecord(self, file_name, locations, categories):
    """Write one ``tf.train.Example`` per video to a TFRecord file.

    Args:
        file_name: Path of the TFRecord file to create.
        locations: Iterable of video locations; each one is handed to the
            ``3DVideo`` helper on ``self`` to obtain the video array.
        categories: Iterable of integer labels, paired with ``locations``
            by position (``zip`` stops at the shorter of the two).
    """
    writer = tf.python_io.TFRecordWriter(file_name)

    # try/finally guarantees the file is flushed and closed even if loading
    # or serializing one of the videos raises.
    try:
        for loc, category in zip(locations, categories):
            # ``3DVideo`` is not a valid Python identifier, so writing
            # ``self.3DVideo`` is a SyntaxError; look the helper up by name.
            # The final array has shape [N_FRAMES, WIDTH, HEIGHT, CHANNEL].
            data = getattr(self, '3DVideo')(loc)

            # Build a fresh mapping per record so nothing leaks between videos.
            feature = {
                'height': self._int64_feature(self.height),
                'width': self._int64_feature(self.width),
                'depth': self._int64_feature(self.depth),
                # tobytes() is the non-deprecated spelling of tostring().
                'data': self._bytes_feature(data.tobytes()),
                'category': self._int64_feature(category),
            }

            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
    finally:
        writer.close()

My current parser function looks like this:

def readDataRecord(self, record):
  """Read one serialized example from a TFRecord file and decode it.

  Args:
    record: Path of the TFRecord file to read.

  Returns:
    A ``(video3D, label)`` pair: the ``data`` bytes decoded as uint8 and the
    ``category`` value cast to int32. The decoded bytes are returned as
    produced by ``tf.decode_raw``; no reshape to the video cube is applied.
  """
  filename_queue = tf.train.string_input_producer([record], num_epochs=1)

  reader = tf.TFRecordReader()
  _, serialized_example = reader.read(filename_queue)

  # The opening brace must sit on the assignment line: a bare ``feature =``
  # followed by a newline is a SyntaxError.
  feature = {
    'height': tf.FixedLenFeature([], tf.int64),
    'width': tf.FixedLenFeature([], tf.int64),
    'depth': tf.FixedLenFeature([], tf.int64),
    'data': tf.FixedLenFeature([], tf.string),
    'category': tf.FixedLenFeature([], tf.int64),
  }

  example = tf.parse_single_example(serialized_example, features=feature)

  # 'data' is parsed as a scalar string; decode_raw turns its bytes into uint8.
  video3D_buffer = tf.reshape(example['data'], shape=[])
  video3D = tf.decode_raw(video3D_buffer, tf.uint8)

  label = tf.cast(example['category'], tf.int32)

  return video3D, label

With that being said, my questions are:

  1. I know that readDataRecord() is wrong since it's working on individual frames. How exactly do I get it to return individual 3D cubes of shape [N_FRAMES, WIDTH, HEIGHT, CHANNEL] along with their respective category?

  2. Is it even a good idea to simply save the entire 3D cube?

Any help or guidance will be greatly appreciated :)

PS: I have looked into other methods including video2tfrecord but most of them seem to be saving individual frames for each video and I don't want that.


Solution

  • So this is what I ended up doing to achieve this without having to encode individual frames.

    I flattened the cube and wrote that out instead, as shown below:

    def _cube_feature(self, value):
        """Wrap a sequence of floats in a ``tf.train.Feature`` backed by a ``FloatList``."""
        # Unlike the scalar helpers, ``value`` is passed through as the whole sequence.
        float_list = tf.train.FloatList(value=value)
        return tf.train.Feature(float_list=float_list)
    
    def createDataRecord(self, name, locations, categories):
        """Write one ``tf.train.Example`` per video to the TFRecord file ``name``.

        ``locations`` and ``categories`` are zipped, so they are paired by
        position. Each video's array is flattened and stored as a float list
        under the ``'data'`` key; its label is stored under ``'category'``.
        """
    
        writer = tf.python_io.TFRecordWriter(name)
    
        feature = {}
    
        for loc, category in zip(locations, categories):
            data = self.3DVideo(loc)
            # NOTE(review): the dotted line below is an elision in the original
            # post -- presumably the 'height'/'width'/'depth' features, since the
            # parser expects them; confirm before running.
            .............
            # flatten() yields a 1-D array, which _cube_feature stores in a FloatList.
            feature['data'] = self._cube_feature(data.flatten())
            feature['category'] = self._int64_feature(category)
    
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
    
        writer.close()
    

    The resulting parser is:

    def readDataRecord(self, record):
        """Parse one serialized example into a ``(cube, label)`` pair.

        Returns the ``'data'`` feature cast to uint8 and the ``'category'``
        feature cast to int32.
        """
        # NOTE(review): the dotted line below is an elision in the original
        # post -- presumably the reader setup that defines
        # ``serialized_example``; confirm before running.
        ..........
        # 'data' is declared with the full (NUM_FRAMES, WIDTH, HEIGHT, CHANNEL)
        # shape, so the parsed float tensor already has the cube's dimensions.
        feature = \
        {'height': tf.FixedLenFeature([], tf.int64),
         'width': tf.FixedLenFeature([], tf.int64),
         'depth': tf.FixedLenFeature([], tf.int64),
         'data': tf.FixedLenFeature((NUM_FRAMES, WIDTH, HEIGHT, CHANNEL), tf.float32),
         'category': tf.FixedLenFeature([], tf.int64),
        }
    
        example = tf.parse_single_example(serialized_example, features=feature)
    
        # The values are parsed as float32; cast to uint8
        # (presumably the original pixel dtype -- TODO confirm).
        cube = tf.cast(example['data'], tf.uint8)
        label = tf.cast(example['category'], tf.int32)
    
        return cube, label