python, tensorflow, keras, dataset, tensorflow-datasets

How to fit a Keras model with a dataset?


I'm trying to fit my Keras model with a set of CSV files (I don't want to load the files into memory and concatenate them). I tried to build a dataset with "tf.data.experimental.make_csv_dataset" (I think it works like a MATLAB datastore?) and feed my model with "next" and "iter", but I couldn't solve the problems with the input size and/or input type. I would appreciate any help. Thanks in advance.

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


dataset = tf.data.experimental.make_csv_dataset(
    "data/Testdata/*.csv",
    batch_size=128,
    field_delim=",",
    num_epochs=1,
    select_columns=['A', 'B', 'C'],
    label_name='C')

# MLP Model
model = Sequential()
model.add(Dense(1, input_dim=5))  
model.add(Dense(5, activation='relu'))
model.add(Dense(1, activation='linear'))
model.summary()

model.compile(loss='mean_absolute_error', optimizer="adam", metrics=['mean_squared_error'])

# for batch in dataset:
X, y = next(iter(dataset))

res = model.fit(X, y, epochs=5)

Solution

  • You can feed your dataset directly to model.fit with a few changes:

    Create dummy data:

    import pandas as pd
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    
    
    df = pd.DataFrame(data={'A': [50.1, 1.23, 4.5, 4.3, 3.2], 'B':[50.1, 1.23, 4.5, 4.3, 3.2], 'C':[5.2, 3.1, 2.2, 1., 3.]})
    
    df.to_csv('data1.csv', index=False)
    df.to_csv('data2.csv', index=False)
    

    Preprocess data:

    dataset = tf.data.experimental.make_csv_dataset(
        "/content/*.csv",
        batch_size=2,
        field_delim=",",
        num_epochs=1,
        select_columns=['A', 'B', 'C'],
        label_name='C')
    

    Before processing:

    for x in dataset:
      print(x)
    
    (OrderedDict([('A', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([4.5 , 1.23], dtype=float32)>), ('B', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([4.5 , 1.23], dtype=float32)>)]), <tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2, 3.1], dtype=float32)>)
    (OrderedDict([('A', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([50.1,  4.5], dtype=float32)>), ('B', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([50.1,  4.5], dtype=float32)>)]), <tf.Tensor: shape=(2,), dtype=float32, numpy=array([5.2, 2.2], dtype=float32)>)
    (OrderedDict([('A', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 4.3, 50.1], dtype=float32)>), ('B', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 4.3, 50.1], dtype=float32)>)]), <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1. , 5.2], dtype=float32)>)
    (OrderedDict([('A', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.23, 4.3 ], dtype=float32)>), ('B', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.23, 4.3 ], dtype=float32)>)]), <tf.Tensor: shape=(2,), dtype=float32, numpy=array([3.1, 1. ], dtype=float32)>)
    (OrderedDict([('A', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([3.2, 3.2], dtype=float32)>), ('B', <tf.Tensor: shape=(2,), dtype=float32, numpy=array([3.2, 3.2], dtype=float32)>)]), <tf.Tensor: shape=(2,), dtype=float32, numpy=array([3., 3.], dtype=float32)>)
    

    Note that the shuffle parameter of make_csv_dataset defaults to True, which is why the rows come back in a different (shuffled) order on each pass.
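
    If you want a deterministic row order while debugging, one option is to disable shuffling via the documented shuffle argument of make_csv_dataset (a sketch of the same call as above, not required for training):

    dataset = tf.data.experimental.make_csv_dataset(
        "/content/*.csv",
        batch_size=2,
        field_delim=",",
        num_epochs=1,
        select_columns=['A', 'B', 'C'],
        label_name='C',
        shuffle=False)  # rows are now returned in a fixed order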

    After preprocessing, each input sample has 2 features built from columns A and B (a more general version of this mapping is sketched after the output below):

    dataset = dataset.map(
        lambda x, y: (tf.concat([tf.expand_dims(x['A'], axis=-1),
                                 tf.expand_dims(x['B'], axis=-1)], axis=-1), y))
    for x in dataset:
      print(x)
    
    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
    array([[4.5 , 4.5 ],
           [1.23, 1.23]], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2, 3.1], dtype=float32)>)
    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
    array([[4.3, 4.3],
           [4.3, 4.3]], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1., 1.], dtype=float32)>)
    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
    array([[ 1.23,  1.23],
           [50.1 , 50.1 ]], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([3.1, 5.2], dtype=float32)>)
    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
    array([[50.1, 50.1],
           [ 3.2,  3.2]], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([5.2, 3. ], dtype=float32)>)
    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
    array([[4.5, 4.5],
           [3.2, 3.2]], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2, 3. ], dtype=float32)>)
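
    If more feature columns are selected, writing an expand_dims per column gets tedious. The same mapping can instead be written generically (a sketch, assuming every selected feature is a numeric scalar column like A and B here):

    # Stack every feature tensor in the OrderedDict into a single
    # (batch_size, num_features) tensor and keep the label unchanged.
    def pack_features(features, label):
        return tf.stack(list(features.values()), axis=-1), label

    dataset = dataset.map(pack_features)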
    

    Train your model:

    model = Sequential()
    model.add(Dense(1, input_dim=2))  
    model.add(Dense(5, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.summary()
    
    model.compile(loss='mean_absolute_error', optimizer="adam", metrics=['mean_squared_error'])
    
    res = model.fit(dataset, epochs=5)
    
    Model: "sequential_7"
    _________________________________________________________________
     Layer (type)                Output Shape              Param #   
    =================================================================
     dense_21 (Dense)            (None, 1)                 3         
                                                                     
     dense_22 (Dense)            (None, 5)                 10        
                                                                     
     dense_23 (Dense)            (None, 1)                 6         
                                                                     
    =================================================================
    Total params: 19
    Trainable params: 19
    Non-trainable params: 0
    _________________________________________________________________
    Epoch 1/5
    5/5 [==============================] - 1s 21ms/step - loss: 10.2060 - mean_squared_error: 247.2872
    Epoch 2/5
    5/5 [==============================] - 0s 10ms/step - loss: 10.0791 - mean_squared_error: 241.0892
    Epoch 3/5
    5/5 [==============================] - 0s 8ms/step - loss: 9.9328 - mean_squared_error: 233.3316
    Epoch 4/5
    5/5 [==============================] - 0s 6ms/step - loss: 9.7714 - mean_squared_error: 224.4764
    Epoch 5/5
    5/5 [==============================] - 0s 8ms/step - loss: 9.6863 - mean_squared_error: 221.0282
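
    Once training finishes, the same finite dataset (num_epochs=1) can be reused for evaluation or inference with the standard Keras calls (a minimal sketch, nothing specific to make_csv_dataset):

    # Evaluate on the (features, label) dataset; Keras consumes it directly.
    loss, mse = model.evaluate(dataset)

    # For predictions, drop the labels so the dataset yields features only.
    preds = model.predict(dataset.map(lambda x, y: x))
    print(preds.shape)  # one prediction per CSV row, e.g. (10, 1) here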