python string tensorflow type-conversion tensorflow-datasets

TensorFlow: Integer tensor to file pattern string

In a dataset pipeline (.map()), I convert an integer tensor to a file-pattern string for make_csv_dataset(...).

I get an error:

ValueError: No files match `file_pattern` dataset/PAMAP2_Dataset/train/*_Tensor("strided_slice:0", shape=(), dtype=int32).csv.

The problem is this part: Tensor("strided_slice:0", shape=(), dtype=int32) - it should be an integer number, not the tensor's string representation.

Code

labels = [ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24 ]


def _make_dataset(idx):
    """Build a batched CSV dataset for the activity ID at position ``idx``.

    This is the failing version from the question, kept as a reproduction:
    the ``str(...)`` call below is what produces the reported ``ValueError``.

    Args:
        idx: Index into the shuffled ``labels``; supplied by the
            ``interleave`` call that uses this function as ``map_func``.

    Returns:
        The CSV dataset, re-batched into groups of 64 (incomplete final
        batch dropped) and shuffled with a buffer of 64.
    """
    # shuffle
    activityID = tf.random.shuffle(labels)

    dataset = (
        tf.data.experimental.make_csv_dataset(
            # str() on the tensor yields its textual representation
            # (`Tensor("strided_slice:0", shape=(), dtype=int32)`), not the
            # integer value, so the resulting pattern matches no files.
            file_pattern=("dataset/PAMAP2_Dataset/train/*_" + str(activityID[idx]) + ".csv"),
            batch_size=1,
            num_epochs=1,
            shuffle=False,
        ).batch(64, drop_remainder=True).shuffle(64)
    )
    # interleave() needs map_func to hand back a dataset.
    return dataset
        
# Feed the single index 0 through _make_dataset and flatten the datasets it
# returns; element order is allowed to vary (deterministic=False).
dataset = tf.data.Dataset.range(1).interleave(
    map_func=_make_dataset,
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False,
)

Solution

There is a bug related to using tf.data.Dataset.interleave together with make_csv_dataset. For now, the recommendation is to use the CsvDataset API instead. Try something like this, using tf.io.matching_files to resolve the file pattern:

import pandas as pd
import tensorflow as tf

# Activity IDs; one dummy CSV file is generated per ID below.
labels = [
    1, 2, 3, 4, 5, 6, 7, 9, 10,
    11, 12, 13, 16, 17, 18, 19, 20, 24,
]

# Two integer columns with three rows of placeholder values.
data = {"id": [420, 380, 390], "duration": [50, 40, 45]}

# Create dummy data
df = pd.DataFrame(data)

# Write the same frame to test_<label>.csv for every label, omitting the index.
for i in labels:
    df.to_csv(f"test_{i}.csv", index=False)
  
def get_random_path(x, idx):
    """Return the glob pattern for the CSV file(s) of the label at ``x[idx]``.

    ``x[idx]`` must expose ``.numpy()``; the value it returns is embedded in
    the pattern as text.
    """
    label_value = x[idx].numpy()
    return f"/content/*_{label_value}.csv"

def _make_dataset(idx):
    """Return a ``CsvDataset`` over the files of one randomly picked label.

    The labels are shuffled, the entry at ``idx`` is turned into a glob
    pattern by ``get_random_path`` (run through ``tf.py_function`` so that
    ``.numpy()`` is available), and the files matching that pattern are read
    as two int32 columns with the header row skipped.
    """
    shuffled_ids = tf.random.shuffle(labels)
    pattern = tf.py_function(get_random_path, [shuffled_ids, idx], Tout=[tf.string])
    matched_files = tf.io.matching_files(pattern)

    return tf.data.experimental.CsvDataset(
        filenames=matched_files,
        record_defaults=[tf.int32, tf.int32],
        header=True,
    )

# Feed the single index 0 through _make_dataset and flatten the resulting
# per-label dataset; element order is allowed to vary (deterministic=False).
dataset = tf.data.Dataset.range(1).interleave(
    _make_dataset,
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False,
)

# Print every parsed CSV row.
for x in dataset:
    print(x)

(<tf.Tensor: shape=(), dtype=int32, numpy=420>, <tf.Tensor: shape=(), dtype=int32, numpy=50>)
(<tf.Tensor: shape=(), dtype=int32, numpy=380>, <tf.Tensor: shape=(), dtype=int32, numpy=40>)
(<tf.Tensor: shape=(), dtype=int32, numpy=390>, <tf.Tensor: shape=(), dtype=int32, numpy=45>)

For more details, check the docs.

Update 1:

# List every path under /content whose name ends in "csv".
print(tf.io.matching_files("/content/*csv"))

tf.Tensor(
[b'/content/test_1.csv' b'/content/test_10.csv' b'/content/test_11.csv'
 b'/content/test_12.csv' b'/content/test_13.csv' b'/content/test_16.csv'
 b'/content/test_17.csv' b'/content/test_18.csv' b'/content/test_19.csv'
 b'/content/test_2.csv' b'/content/test_20.csv' b'/content/test_24.csv'
 b'/content/test_3.csv' b'/content/test_4.csv' b'/content/test_5.csv'
 b'/content/test_6.csv' b'/content/test_7.csv' b'/content/test_9.csv'], shape=(18,), dtype=string)