Search code examples
Tags: python, string, tensorflow, type-conversion, tensorflow-datasets

TensorFlow: Integer tensor to file pattern string


In a dataset pipeline (.map()), I'm converting an int tensor into a file pattern string for make_csv_dataset(...).

I get an error:

ValueError: No files match `file_pattern` dataset/PAMAP2_Dataset/train/*_Tensor("strided_slice:0", shape=(), dtype=int32).csv.

The problem is the Tensor("strided_slice:0", shape=(), dtype=int32) part of the pattern: it should be an integer number, not the tensor's textual representation.

Code

# Candidate label values; presumably PAMAP2 activity IDs, given the
# `activityID` name and the dataset path used below.
labels = [ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24 ]
# Intended to build a batched CSV dataset for the label found at position
# `idx` of a freshly shuffled `labels`.
# NOTE(review): as posted, the parenthesis opened at `dataset = (` inside this
# function is never closed and the function has no `return` statement.
def _make_dataset(idx):
    # shuffle
    activityID = tf.random.shuffle(labels)

    dataset = (
        tf.data.experimental.make_csv_dataset(
           # str() of the tensor element produces its textual representation
           # (Tensor("strided_slice:0", shape=(), dtype=int32)) rather than the
           # integer value, which is what ends up in the reported file pattern.
           file_pattern=("dataset/PAMAP2_Dataset/train/*_" + str(activityID[idx]) + ".csv"),
           batch_size=1,
           num_epochs=1,
           shuffle=False,
        ).batch(64, drop_remainder=True).shuffle(64)
        
# Single-element range dataset whose element is passed to _make_dataset as
# `idx` through interleave.
dataset = (
    tf.data.Dataset.range(1)
    .interleave(
        map_func=_make_dataset,
        cycle_length=tf.data.AUTOTUNE,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False,
    )
)

Solution

  • There is a bug related to using tf.data.Dataset.interleave together with make_csv_dataset. For now, the recommendation is to use the CsvDataset API instead. Try something like this, using tf.io.matching_files to resolve the file pattern:

    import pandas as pd
    import tensorflow as tf
    
    # Label values; one dummy CSV file is generated per label below.
    labels = [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24]

    # Two-column dummy table that is written unchanged to every per-label file.
    data = {"id": [420, 380, 390], "duration": [50, 40, 45]}
    df = pd.DataFrame(data)

    # Write test_<label>.csv into the current working directory, omitting the
    # DataFrame index column.
    for label in labels:
        df.to_csv(f"test_{label}.csv", index=False)
      
    def get_random_path(x, idx):
      """Return the glob pattern ``/content/*_<value>.csv`` for element ``idx`` of ``x``.

      ``x[idx]`` must expose ``.numpy()``; the string form of that result is
      inserted into the pattern.
      """
      chosen = x[idx].numpy()
      return "".join(("/content/*_", str(chosen), ".csv"))
    
    def _make_dataset(idx):
        """Return a CsvDataset over the files matching one randomly chosen label.

        ``labels`` is shuffled and the entry at position ``idx`` is selected;
        every file matching ``/content/*_<label>.csv`` is then parsed as two
        int32 columns, with the header row skipped.

        Args:
            idx: Scalar integer tensor supplied by ``interleave``; used as the
                index into the shuffled labels.

        Returns:
            A ``tf.data.experimental.CsvDataset`` yielding ``(int32, int32)``
            records.
        """
        # shuffle
        activityID = tf.random.shuffle(labels)

        # Build the glob pattern with graph-native string ops instead of a
        # tf.py_function round-trip. str() on a symbolic tensor would give its
        # textual representation, whereas tf.strings.as_string gives the
        # decimal digits and keeps the map function free of Python callbacks.
        path = tf.strings.join(
            ["/content/*_", tf.strings.as_string(activityID[idx]), ".csv"]
        )

        dataset = tf.data.experimental.CsvDataset(
            filenames=tf.io.matching_files(path),
            record_defaults=[tf.int32, tf.int32],
            header=True,
        )
        return dataset
    
    # Single-element source dataset; its one element is passed to
    # _make_dataset as `idx`.
    index_source = tf.data.Dataset.range(1)
    dataset = index_source.interleave(
        _make_dataset,
        cycle_length=tf.data.AUTOTUNE,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False,
    )

    # Print every parsed CSV record (a tuple of two int32 tensors).
    for record in dataset:
      print(record)
    
    (<tf.Tensor: shape=(), dtype=int32, numpy=420>, <tf.Tensor: shape=(), dtype=int32, numpy=50>)
    (<tf.Tensor: shape=(), dtype=int32, numpy=380>, <tf.Tensor: shape=(), dtype=int32, numpy=40>)
    (<tf.Tensor: shape=(), dtype=int32, numpy=390>, <tf.Tensor: shape=(), dtype=int32, numpy=45>)
    

    For more details, check the docs.

    Update 1:

    print(tf.io.matching_files("/content/*csv"))
    
    tf.Tensor(
    [b'/content/test_1.csv' b'/content/test_10.csv' b'/content/test_11.csv'
     b'/content/test_12.csv' b'/content/test_13.csv' b'/content/test_16.csv'
     b'/content/test_17.csv' b'/content/test_18.csv' b'/content/test_19.csv'
     b'/content/test_2.csv' b'/content/test_20.csv' b'/content/test_24.csv'
     b'/content/test_3.csv' b'/content/test_4.csv' b'/content/test_5.csv'
     b'/content/test_6.csv' b'/content/test_7.csv' b'/content/test_9.csv'], shape=(18,), dtype=string)