Import huge non-image dataset in TensorFlow

I have a big dataset (300,000 examples × 33,000 features), which of course does not fit in memory. The data are saved in HDF5 format. The values are mostly zeros (sparse data). They look like this:

           Attr1    52  52  52  52  52  52  52  52 ...
           Attr2    umb umb umb umb umb umb umb umb ...
           CellID   TGC-1 TGG-1 CAG-1 TTC-1 GTG-1 GTA-1 CAA-1 CAC-1 ...

Acc     Gene                                      ...
243485  RP11-.3     0   0   0   0   0   0   0   0 ...
237613  FAM138A     0   0   0   0   0   0   0   0 ...
186092  OR4F5       0   0   0   0   0   0   0   0 ...
238009  RP11-.7     0   0   0   0   0   0   0   0 ...
239945  RP11-.8     0   0   0   0   0   0   0   0 ...
279457  FO538.2     0   0   0   0   0   0   0   0 ...
228463  AP006.2     0   0   0   0   0   0   0   0 ...
...     ...         ... ... ... ... ... ... ... ...

I have written the following, which works, to load the whole dataset into TensorFlow (loompy is just a package that uses HDF5 in the background):

import tensorflow as tf
import numpy as np
import loompy as lp

# Feed the loom file to TensorFlow one slab of `batch_size` columns at a time
# through placeholders, re-initialising the iterator for every slab.
#
# NOTE(review): this snippet was damaged by text extraction (URL-like tokens
# such as `ds.ca.*`, `tf.data.*` and `sess.run(...)` were stripped). The calls
# below are reconstructed from the surrounding code; the column-attribute
# names are taken from the table shown earlier in the question.
batch_size = 1000

# `lp` matches the `import loompy as lp` alias used above.
with lp.connect(filename, 'r') as ds:
    # One example per column of the loom file, so a batch is
    # (batch_size, n_rows) after transposing.
    ds_shape = (batch_size, ds.shape[0])
    ds_dtype = ds[0:1, 0:1].dtype

    # Only column 1 (Attr1) is used as the label further down.
    # NOTE(review): the first attribute name was lost; CellID is a guess -- confirm.
    labels = np.asarray([ds.ca.CellID, ds.ca.Attr1]).T
    labels_shape = (batch_size, 1)

data_placeholder = tf.placeholder(ds_dtype, ds_shape)
labels_placeholder = tf.placeholder(labels[:,1].dtype, labels_shape)

dataset = tf.data.Dataset.from_tensor_slices((data_placeholder, labels_placeholder))
dataset = dataset.prefetch(batch_size)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    with lp.connect(filename, 'r') as ds:
        for i in range(0, ds.shape[1], batch_size):
            # Read a slab of `batch_size` columns and transpose it so that
            # each row is one example.
            batch = ds[0 : ds_shape[1], i : i + batch_size].T
            batch_labels = np.asarray([ds.ca.CellID[i : i + batch_size],
                                       ds.ca.Attr1[i : i + batch_size]]).T[:,1]

            # Re-initialise the iterator with the slab that was just read.
            sess.run(iterator.initializer, feed_dict = {data_placeholder: batch,
                       labels_placeholder: batch_labels.reshape(batch_size, 1)})

            for _ in range(batch_size):
                print(sess.run(next_element))

(array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([b'52'], dtype=object))

(array([0, 0, 0, ..., 0, 0, 0], dtype=int32), array([b'52'], dtype=object))


This way, however, I am not able to split my data into train, test and evaluation sets. Also, I can only shuffle the data within each batch, which is not effective because most of the time the data in a batch belong to the same class.

How do I manipulate this kind of data to be able to load them as train, test, evaluation sets, and perform shuffling etc. (preferably by utilizing my TitanX GPU as much as possible)?


  • In case someone is still interested in this topic, here is my solution to the problem I had. In the end I stuck with the Loompy file format, as it is really convenient for what I am doing (take a look at Loompy). To import such a big volume of data into my model, I used the from_generator() function of the TensorFlow API. I also created a generator to yield the data as needed.

    Below is how my input function looks:

    import loompy as lp
    import tensorflow as tf
    from sklearn.model_selection import train_test_split
    model_input_name = ""
    input_size = 10000
    batch_size = 32
    epochs = 10
    # Input functions for train, test and eval sets.
    def train_input_fn():
        """Input function for the train set; delegates to _input_fn."""
        return _input_fn(mode='TRAIN')
    def test_input_fn():
        """Input function for the test set; delegates to _input_fn."""
        return _input_fn(mode='TEST')
    def eval_input_fn():
        """Input function for the evaluation set; delegates to _input_fn."""
        return _input_fn(mode='EVAL')
    # General purpose input function
    def _input_fn(mode='TRAIN'):
        """Build the tf.data input pipeline for one of the dataset splits.

        Arguments
            mode : 'TRAIN', 'TEST', 'EVAL'

        Returns
            The result of iterator.get_next() on the shuffled, batched,
            prefetched and repeated dataset: a (features, labels) pair where
            features is a dict keyed by model_input_name.

        Raises
            ValueError: if mode is not one of the three choices above.

        Note
            The train/test/eval split is recomputed on every call, so
            train_test_set_idx_split() must return the same partition each
            time, otherwise the three sets will overlap.

        NOTE(review): this function was damaged by text extraction; the
        from_generator() and map() calls and the else-branches below are
        reconstructed from the surrounding comments and code.
        """
        # Fail fast instead of continuing with "indices" undefined.
        if mode not in ('TRAIN', 'TEST', 'EVAL'):
            raise ValueError("Wrong mode choice: {}".format(mode))

        # Get the indices for train, test and eval sets and keep the
        # one that was asked for.
        train_idx, test_idx, eval_idx = train_test_set_idx_split(TRAIN_RT, TEST_RT, EVAL_RT)
        indices = {'TRAIN': train_idx, 'TEST': test_idx, 'EVAL': eval_idx}[mode]

        # A generator to yield data and labels from the given FILE,
        # based on the indices assigned to the "indices" variable.
        # If you change the labels, remember to update the from_generator()
        # parameters below, to reflect their datatype.
        def gen():
            with lp.connect(FILE, 'r') as ds:
                if ae:
                    # The input doubles as the label.
                    # presumably "ae" means autoencoder -- confirm.
                    for i in indices:
                        column = ds[:, i]
                        yield {model_input_name: column}, column
                else:
                    # NOTE(review): the name of the column attribute holding
                    # the labels was lost from the source. LABEL_ATTR is a
                    # placeholder that has to be defined next to FILE / ae /
                    # TRAIN_RT with the name of your label attribute.
                    labels = ds.ca[LABEL_ATTR]
                    for i in indices:
                        yield {model_input_name: ds[:, i]}, labels[i]

        dataset = tf.data.Dataset.from_generator(
            gen,
            ({model_input_name: tf.int64}, tf.int64),
            output_shapes=({model_input_name: [input_size,]}, []))

        # Shuffle, batch, map, prefetch and repeat your dataset.
        # If you need to do some preprocessing on the data, create your function on
        # the cell above, and call it within a map() function.
        dataset = dataset.shuffle(buffer_size=batch_size*50)
        dataset = dataset.batch(batch_size)
        # NOTE(review): two map() calls were lost here. _reshape_labels is the
        # only mapping function defined in this file, so it is assumed to be
        # one of them; the other one is unknown.
        dataset = dataset.map(_reshape_labels)
        # Map on whatever other functions you need, e.g.
        #     dataset = dataset.map(my_preprocessing_fn)
        dataset = dataset.prefetch(2)
        dataset = dataset.repeat(epochs)
        iterator = dataset.make_one_shot_iterator()
        return iterator.get_next()
    # Get train, test, eval indices for the given dataset
    def train_test_set_idx_split(train_rt, test_rt, eval_rt, random_state=0):
        """ This function returns indices for the train, test and evaluation sets,
            given an input Dataset.

            Arguments
                train_rt: ratio of the train dataset
                test_rt:  ratio of the test dataset
                eval_rt:  ratio of the evaluation dataset
                random_state: seed passed to both train_test_split() calls.
                    The default is a fixed seed so that repeated calls return
                    the same partition; pass None for a fresh random split
                    on every call.

            Returns
                train_idx: indices (of the given dataset) for the train dataset
                test_idx:  indices (of the given dataset) for the test dataset
                eval_idx:  indices (of the given dataset) for the evaluation dataset

            Note
                The held-out part (test_rt + eval_rt) is divided between the
                test and evaluation sets in proportion to test_rt and eval_rt,
                so the two ratios do not have to be equal.
        """
        # Only the number of columns is needed from the file.
        with lp.connect(FILE, 'r') as ds:
            idx = np.arange(ds.shape[1])

        # First carve out the train set; the rest is shared by test and eval.
        holdout_rt = test_rt + eval_rt
        train_idx, holdout_idx = train_test_split(
            idx, train_size=train_rt, test_size=holdout_rt,
            random_state=random_state)

        # eval_rt / holdout_rt is 0.5 when test_rt == eval_rt, which matches
        # the previously hard-coded 50/50 split.
        test_idx, eval_idx = train_test_split(
            holdout_idx, test_size=eval_rt / holdout_rt,
            random_state=random_state)
        return train_idx, test_idx, eval_idx
    # Reshape labels as needed
    def _reshape_labels(data, labels):
        """Return data unchanged together with labels reshaped to (-1, 1)."""
        column_labels = tf.reshape(labels, (-1,1))
        return data, column_labels