Search code examples
python numpy datetime tensorflow2.0 tensorflow-datasets

Creates a dataset of sliding windows over a timeseries from a pandas datetime index


Consider the following code:

import pandas as pd
import numpy as np
import tensorflow as tf


def random_dates(start, end, n=10):
    """Draw ``n`` random timestamps from the half-open range ``[start, end)``.

    ``start`` and ``end`` are expected to be pandas timestamps: their integer
    ``.value`` is floor-divided by 10**9 and the sampled integers are read
    back as epoch seconds, so the results have one-second resolution.

    Returns whatever ``pd.to_datetime`` builds from the sampled integer
    array (a ``DatetimeIndex``).
    """
    nanos_per_second = 10**9
    low, high = (ts.value // nanos_per_second for ts in (start, end))
    epoch_seconds = np.random.randint(low, high, n)
    return pd.to_datetime(epoch_seconds, unit='s')


# Sampling window: from the start of 2015 up to (not including) the start of 2018.
start, end = (pd.to_datetime(day) for day in ('2015-01-01', '2018-01-01'))
dates = random_dates(start, end)

This code creates random dates with the following output:

print(dates)
DatetimeIndex(['2015-06-25 22:00:34', '2015-05-05 19:20:11',
               '2016-04-11 21:52:28', '2015-10-23 21:07:46',
               '2017-04-06 04:01:23', '2015-07-17 06:13:32',
               '2017-06-18 12:33:27', '2015-11-04 06:48:28',
               '2017-08-20 17:10:17', '2016-04-14 07:46:59'],
              dtype='datetime64[ns]', freq=None)

I would like to create a dataset of sliding windows using the datetime index as input with the following command:

# Window the raw datetime64[ns] values directly; the second argument (None)
# means no targets are supplied.
tensorflow_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
    dates.values,
    None,
    sequence_length=1,
    sequence_stride=2,
    batch_size=1,
)

When I do this, I get the following error:

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported numpy type: NPY_DATETIME).

Any ideas on how to solve this?


Solution

  • You can try converting each NumPy datetime object to a string:

    import pandas as pd
    import numpy as np
    import tensorflow as tf
    
    def random_dates(start, end, n=10):
        """Return ``n`` random timestamps in the half-open range ``[start, end)``.

        The ``.value`` of each bound is floor-divided by 10**9, integers are
        sampled between the two results, and the samples are interpreted as
        epoch seconds by ``pd.to_datetime``.
        """
        lower_s, upper_s = (bound.value // 10**9 for bound in (start, end))
        sampled_seconds = np.random.randint(lower_s, upper_s, n)
        return pd.to_datetime(sampled_seconds, unit='s')
    
    start, end = (pd.to_datetime(day) for day in ('2015-01-01', '2018-01-01'))
    dates = random_dates(start, end)

    # TensorFlow rejects the datetime64 array, so hand it ISO 8601 strings instead.
    date_strings = np.datetime_as_string(dates.values)
    tensorflow_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
        date_strings,
        None,
        sequence_length=1,
        sequence_stride=2,
        batch_size=1,
    )

    for window in tensorflow_dataset:
        print(window)
    
    tf.Tensor([[b'2016-11-16T02:46:49.000000000']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2015-07-27T04:07:14.000000000']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2015-09-10T14:57:51.000000000']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2017-11-01T20:48:49.000000000']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2017-08-25T11:34:42.000000000']], shape=(1, 1), dtype=string)
    

    Afterwards, you can convert the strings to anything you want. You could also use the unit parameter of np.datetime_as_string to get a different output.

    With np.datetime_as_string(dates.values, unit='D'):

    tf.Tensor([[b'2016-04-22']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2015-04-03']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2015-02-14']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2017-02-09']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2015-02-19']], shape=(1, 1), dtype=string)
    

    With np.datetime_as_string(dates.values, unit='h'):

    tf.Tensor([[b'2017-01-19T15']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2015-11-02T15']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2016-12-11T06']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2017-07-24T04']], shape=(1, 1), dtype=string)
    tf.Tensor([[b'2016-06-22T04']], shape=(1, 1), dtype=string)