Tags: python, pandas, tensorflow, keras, tensorflow-datasets

TensorFlow labels for classification aren't loaded properly into the model


I'm having issues with the categories in my data: I can't set the final Dense softmax layer to 3 units (for my 3 categories) instead of 1.

I assume the issue is in vectorize_text, but I'm not completely sure; it may also be that I'm not setting the label tensors correctly.

import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense

# Start of data generation

dummy_data = {'text': ['Love', 'Money', 'War'],
              'labels': [1,2,3]
              }
dummy_data['text'] = dummy_data['text']*500
dummy_data['labels'] = dummy_data['labels']*500

df_train_bogus = pd.DataFrame(dummy_data)  


def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  ds = tf.data.Dataset.from_tensor_slices(dict(dataframe)).batch(batch_size)
  return ds

batch_size = 32
train_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
val_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)

# Model constants (can be lower but that doesn't matter for this example)
sequence_length = 128
max_features = 20000  # vocab size
embedding_dim = 128
# End of data generation
#  Start of vectorization
vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

def vectorize_text(text, labels):
  print(text)
  print(labels)

  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), labels

vectorize_layer.adapt(df_train_bogus['text'])

train_ds_vectorized = train_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
val_ds_vectorized = val_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))

"""
Output:
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)

"""
#  The model

model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=sequence_length))
model.add(LSTM(embedding_dim, input_shape=(None, sequence_length)))

model.add(Dense(3, activation='softmax'))
#  Fails with this error:
#      ValueError: Shapes (None, 1) and (None, 3) are incompatible

model.summary()

model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])  # model 4

epochs = 10

# Fit the model using the train and validation datasets.
history = model.fit(train_ds_vectorized, validation_data=val_ds_vectorized, epochs=epochs)

Solution

  • Your labels in the dummy data are causing the problem. Since they are not one-hot encoded, I would suggest using the sparse_categorical_crossentropy loss function instead, which works directly on the integer targets you already have (if you prefer one-hot targets, see the sketch at the end of this answer). Check out the docs for more information. Here is a complete working example:

    import tensorflow as tf
    import pandas as pd
    
    dummy_data = {'text': ['Love', 'Money', 'War'],
                  'labels': [0, 1, 2]
                  }
    dummy_data['text'] = dummy_data['text']*500
    dummy_data['labels'] = dummy_data['labels']*500
    
    df_train_bogus = pd.DataFrame(dummy_data)  
    
    
    def df_to_dataset(dataframe, shuffle=True, batch_size=32):
      # Note: the shuffle argument is currently unused; the dataframe is only batched
      ds = tf.data.Dataset.from_tensor_slices(dict(dataframe)).batch(batch_size)
      return ds
    
    batch_size = 32
    train_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
    val_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
    
    # Model constants (can be lower but that doesn't matter for this example)
    sequence_length = 128
    max_features = 20000  # vocab size
    embedding_dim = 128
    
    #  Start of vectorization
    vectorize_layer = tf.keras.layers.TextVectorization(
        standardize='lower_and_strip_punctuation',
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )
    
    def vectorize_text(text, labels):
      # These prints run once while tf.data traces this function, which is why
      # the output below shows symbolic tensors rather than concrete values
      print(text)
      print(labels)

      text = tf.expand_dims(text, -1)  # (batch,) -> (batch, 1) for TextVectorization
      return vectorize_layer(text), labels
    
    vectorize_layer.adapt(df_train_bogus['text'])
    
    train_ds_vectorized = train_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
    val_ds_vectorized = val_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
    
    """
    Output:
    Tensor("args_1:0", shape=(None,), dtype=string)
    Tensor("args_0:0", shape=(None,), dtype=int64)
    Tensor("args_1:0", shape=(None,), dtype=string)
    Tensor("args_0:0", shape=(None,), dtype=int64)
    
    """
    
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(max_features, embedding_dim, input_length=sequence_length))
    model.add(tf.keras.layers.LSTM(embedding_dim))  # input shape is inferred from the Embedding layer above
    
    model.add(tf.keras.layers.Dense(3, activation='softmax'))
    
    model.summary()
    
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["sparse_categorical_accuracy"])  # model 4
    
    epochs = 10
    
    history = model.fit(train_ds_vectorized, validation_data=val_ds_vectorized, epochs=epochs)
    """
    Output:
    Tensor("args_1:0", shape=(None,), dtype=string)
    Tensor("args_0:0", shape=(None,), dtype=int64)
    Tensor("args_1:0", shape=(None,), dtype=string)
    Tensor("args_0:0", shape=(None,), dtype=int64)
    
    """
    
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(max_features, embedding_dim, input_length=sequence_length))
    model.add(tf.keras.layers.LSTM(embedding_dim, input_shape=(None, sequence_length)))
    
    model.add(tf.keras.layers.Dense(3, activation='softmax'))
    
    model.summary()
    
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])  # model 4
    
    epochs = 10
    
    history = model.fit(train_ds_vectorized, validation_data=val_ds_vectorized, epochs=epochs)
    

    Note that your labels need to run from 0 to n-1 for n classes, since sparse_categorical_crossentropy treats each label as an index into the softmax output, and class indices start at 0.
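
    As a quick illustration (my own addition, not part of the original code): if your labels happen to start at 1, you can remap them to 0-based class indices before building the dataset. pd.factorize is just one option; any stable mapping works:

    labels = pd.Series([1, 2, 3] * 500)
    codes, uniques = pd.factorize(labels)  # codes are 0-based integer class indices
    # uniques records which original label each index stands for
    # Or, for consecutive integers starting at 1, simply:
    zero_based = labels - 1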

    Update: the accuracy of 0.333 is expected here, since you have 3 classes with an equal number of samples per class; you need a larger dataset to see any meaningful results.
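
    For completeness, here is a minimal sketch of the one-hot alternative mentioned above (my own addition; it assumes the train_ds_vectorized / val_ds_vectorized pipelines and the model from the example):

    num_classes = 3

    def to_one_hot(text, label):
      # tf.one_hot turns an integer class index into a length-num_classes vector
      return text, tf.one_hot(label, depth=num_classes)

    train_ds_onehot = train_ds_vectorized.map(to_one_hot)
    val_ds_onehot = val_ds_vectorized.map(to_one_hot)

    # With one-hot targets, the plain categorical_crossentropy loss applies
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    history = model.fit(train_ds_onehot, validation_data=val_ds_onehot, epochs=epochs)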