Search code examples
python, tensorflow, keras, tensorflow-datasets, recommendation-engine

TensorFlow Recommenders - ValueError: Shape must be rank 2 but is rank 3


NOTE - Since I need to stream the data from disk instead of loading it into memory, please show the example using tf.data.experimental.make_csv_dataset. Also, please show an example using my exact dataset.

I'm trying to replicate this TensorFlow Recommenders tutorial with a toy dataset. However, I'm getting the error below:

Epoch 1/5

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_7920/1393870474.py in <module>
    106 
    107 # Train.
--> 108 model.fit(interactions, epochs=5)
    109 
    110 # Evaluate.

~/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
     65     except Exception as e:  # pylint: disable=broad-except
     66       filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67       raise e.with_traceback(filtered_tb) from None
     68     finally:
     69       del filtered_tb

~/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
   1127           except Exception as e:  # pylint:disable=broad-except
   1128             if hasattr(e, "ag_error_metadata"):
-> 1129               raise e.ag_error_metadata.to_exception(e)
   1130             else:
   1131               raise

ValueError: in user code:

    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/models/base.py", line 68, in train_step
        loss = self.compute_loss(inputs, training=True)
    File "/tmp/ipykernel_7920/1393870474.py", line 94, in compute_loss
        return self.task(user_embeddings, channel_embeddings)
    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer "retrieval" (type Retrieval).
    
    in user code:
    
        File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/tasks/retrieval.py", line 143, in call  *
            metric_update_ops.append(
        File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/metrics/factorized_top_k.py", line 84, in update_state  *
            top_k_predictions, _ = self._candidates(query_embeddings, k=self._k)
        File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
    
        ValueError: Exception encountered when calling layer "streaming" (type Streaming).
        
        in user code:
        
            File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/layers/factorized_top_k.py", line 441, in top_k  *
                joined_scores = tf.concat([state_scores, x_scores], axis=1)
        
            ValueError: Shape must be rank 2 but is rank 3 for '{{node concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](args_0, args_2, concat/axis)' with input shapes: [1,0], [?,1,1], [].
        
        
        Call arguments received:
          • queries=tf.Tensor(shape=(1, 64), dtype=float32)
          • k=100
    
    
    Call arguments received:
      • query_embeddings=tf.Tensor(shape=(1, 64), dtype=float32)
      • candidate_embeddings=tf.Tensor(shape=(1, 64), dtype=float32)
      • sample_weight=None
      • candidate_sampling_probability=None
      • candidate_ids=None
      • compute_metrics=True


Here's my code:

from typing import Dict, Text
import pandas as pd
from pathlib import Path

import tensorflow as tf 
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Toy interaction log: one row per (user, channel) pair. Both columns hold
# string ids; several of them contain letters and dashes.
df_interactions = pd.DataFrame({
    'user_id': [
        '00001446-da5f-4d17', 
        '00001446-da5f-4d17',
        '00005ab5-c9e0-4b05-',
        '00005ab5-c9e0-4b05-',
        '000093dd-1a11-4600', 
        '000093dd-1a11-4600',
        '00009b34-65b5-42c1', 
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd'
    ], 
    'channel_id': [
        '1', '2', 'A56',
        '3', 'B72', '2', 
        'M63', '2', '5', 'A56'
    ]
})

# Write the interactions to disk (without the index column) so they can be
# read back as a streamed CSV dataset.
df_interactions.to_csv('experiment_interactions.csv', index=False)

# Channel catalogue: one row per channel id with a display name.
df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', 'A56', 'B72', 'M63' 
    ],
    'channel_name': [
        'Popular', 
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice', 
        'Retro',
        'Modern'
    ]
})

# Write the catalogue to disk as well, again without the index column.
df_channels.to_csv('experiment_channels.csv', index=False)


# Stream both CSV files, reading every column as a string.
# NOTE(review): batch_size=1 means make_csv_dataset presumably yields features
# that already carry a leading batch dimension - confirm before batching again.
interactions = tf.data.experimental.make_csv_dataset(
    file_pattern='experiment_interactions.csv', 
    column_defaults=[tf.string, tf.string], 
    batch_size=1
)
channels = tf.data.experimental.make_csv_dataset(
    file_pattern='experiment_channels.csv', 
    column_defaults=[tf.string, tf.string], 
    batch_size=1
)


# Select the basic features.
# NOTE(review): ids such as '00001446-da5f-4d17' or 'A56' are not purely
# numeric; tf.strings.to_number is expected to reject them - confirm.
interactions = interactions.map(lambda x: {
    "user_id": tf.strings.to_number(x["user_id"]),
    "channel_id": tf.strings.to_number(x["channel_id"])
})
# Keep only the (converted) channel id of each catalogue row.
channels = channels.map(lambda x: tf.strings.to_number(x["channel_id"]))


# Build a model.
class Model(tfrs.Model):
  """Retrieval model with one embedding table for users and one for channels."""

  def __init__(self):
    super().__init__()

    # Set up user representation.
    self.user_model = tf.keras.layers.Embedding(
        input_dim=2000, output_dim=64)
    # Set up channel representation.
    self.item_model = tf.keras.layers.Embedding(
        input_dim=2000, output_dim=64)
    # Set up a retrieval task and evaluation metrics over the
    # entire dataset of candidates.
    # NOTE(review): `channels` was created with batch_size=1 and is batched
    # again here, so the candidate embeddings presumably gain an extra
    # dimension - compare the rank-3 shape in the reported error.
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=channels.batch(1).map(self.item_model)
        )
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed the user and channel ids of `features` and return the task loss.

    Args:
      features: Dict holding the "user_id" and "channel_id" tensors.
      training: Accepted for interface compatibility; not used in this body.

    Returns:
      The value produced by the retrieval task for the two embedding tensors.
    """

    user_embeddings = self.user_model(features["user_id"])
    channel_embeddings = self.item_model(features["channel_id"])

    return self.task(user_embeddings, channel_embeddings)


# Instantiate the model and compile it with Adagrad (learning rate 0.5).
model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Seed TensorFlow's global random generator. The shuffle/split below is
# currently disabled, so the full interactions stream is used for training.
tf.random.set_seed(42)
#shuffled = interactions.shuffle(100000, seed=42, reshuffle_each_iteration=False)

#train = shuffled.take(80000)
#test = shuffled.skip(80000).take(20000)

# Train.
model.fit(interactions, epochs=5)

Additional info:

  • TensorFlow version: '2.7.0'
  • TensorFlow Datasets version: '4.4.0'
  • Pandas version: '1.3.4'

Solution

  • You seem to be preprocessing your data incorrectly. For example, you cannot use tf.strings.to_number to convert 00001446-da5f-4d17 into a number: it throws an error because the string contains characters other than digits. Also, each sample in your dataset was an array instead of a single value: channel 1, for example, was not 1 but [1], because tf.data.experimental.make_csv_dataset already batches its output. Batching the channels again added an extra dimension, which caused the original error in your question. Here is a simplified working example based on your code:

    from typing import Dict, Text
    import pandas as pd
    from pathlib import Path
    
    import tensorflow as tf 
    import tensorflow_datasets as tfds
    import tensorflow_recommenders as tfrs
    
    # Toy interaction log: one row per (user, channel) pair. The ids are
    # shortened strings whose digits can be parsed as numbers later on.
    # Rows 8-10 belong to the same user, so they share one id.
    df_interactions = pd.DataFrame({
        'user_id': [
            '4d17',
            '4d17',
            '4b05',
            '4b05',
            '93dd',
            '93dd',
            '9b34',
            '4bcd',
            '4bcd',
            '4bcd',
        ],
        'channel_id': [
            '1', '2', '6',
            '3', '7', '2',
            '8', '2', '5', '6',
        ],
    })

    # Channel catalogue (defined once): every channel_id used in the
    # interactions above appears here.
    df_channels = pd.DataFrame({
        'channel_id': [
            '1', '2', '3', '5', '6', '7', '8',
        ],
        'channel_name': [
            'Popular',
            'Best',
            'Highest Rated',
            'Large Following',
            'Nice',
            'Retro',
            'Modern',
        ],
    })
    
    # Build the interactions dataset from the in-memory DataFrame; each element
    # is a dict with one "user_id" and one "channel_id" string.
    interactions = tf.data.Dataset.from_tensor_slices((dict(df_interactions)))
    interactions = interactions.map(lambda x: {
        # Drop every non-digit so the remainder parses as a number. The class is
        # "[^0-9]" (no second "^" inside the brackets), so literal carets are
        # removed too instead of being passed on to to_number.
        "user_id": tf.strings.to_number(tf.strings.regex_replace(x["user_id"], r"[^0-9]", "")),
        "channel_id": tf.strings.to_number(x["channel_id"])
    })

    # Candidate dataset: only the numeric channel id of each catalogue row.
    channels = tf.data.Dataset.from_tensor_slices((dict(df_channels)))
    channels = channels.map(lambda x: tf.strings.to_number(x["channel_id"]))
    
    # Build a model.
    class Model(tfrs.Model):
      """Two-tower retrieval model: one embedding table for users, one for channels."""

      def __init__(self):
        super().__init__()

        # Query tower: numeric user id -> 64-dimensional embedding.
        self.user_model = tf.keras.layers.Embedding(input_dim=2000, output_dim=64)
        # Candidate tower: numeric channel id -> 64-dimensional embedding.
        self.item_model = tf.keras.layers.Embedding(input_dim=2000, output_dim=64)

        # Embed every channel (one id per batch) to form the candidate set the
        # top-K metrics are evaluated against, then wrap it in a retrieval task.
        candidate_embeddings = channels.batch(1).map(self.item_model)
        top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

      def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch of interactions.

        Args:
          features: Dict holding the "user_id" and "channel_id" tensors.
          training: Accepted for interface compatibility; not used here.

        Returns:
          The value produced by the retrieval task for the two embedding tensors.
        """
        return self.task(
            self.user_model(features["user_id"]),
            self.item_model(features["channel_id"]),
        )
    
    
    # Seed the global random generator before the model is constructed, so that
    # any weights created while building it are reproducible as well.
    tf.random.set_seed(42)

    model = Model()
    model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

    # NOTE(review): with a batch size of 1 the loss is reported as 0 in the
    # training log; presumably the in-batch softmax has only one candidate to
    # rank. Consider a larger batch for real training - confirm.
    model.fit(interactions.batch(1), epochs=5)
    
    Epoch 1/5
    10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
    Epoch 2/5
    10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
    Epoch 3/5
    10/10 [==============================] - 1s 60ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
    Epoch 4/5
    10/10 [==============================] - 1s 60ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
    Epoch 5/5
    10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
    <keras.callbacks.History at 0x7fe480d22f50>
    

    If you want to read your files into a dataset, try something like this:

    
    ################## ORIGINAL DATASET ################## 
    # Original interaction log: one row per (user, channel) pair, raw string ids.
    df_interactions = pd.DataFrame({
        'user_id': [
            '00001446-da5f-4d17',
            '00001446-da5f-4d17',
            '00005ab5-c9e0-4b05-',
            '00005ab5-c9e0-4b05-',
            '000093dd-1a11-4600',
            '000093dd-1a11-4600',
            '00009b34-65b5-42c1',
            '0000ae32-4a91-4bcd',
            '0000ae32-4a91-4bcd',
            '0000ae32-4a91-4bcd',
        ],
        'channel_id': [
            '1', '2', 'A56',
            '3', 'B72', '2',
            'M63', '2', '5', 'A56',
        ],
    })

    # Original channel catalogue. The ids must match the ones referenced by the
    # interactions ('A56', 'B72', 'M63'), otherwise those interactions point at
    # channels that are missing from the candidate vocabulary.
    df_channels = pd.DataFrame({
        'channel_id': [
            '1', '2', '3', '5', 'A56', 'B72', 'M63',
        ],
        'channel_name': [
            'Popular',
            'Best',
            'Highest Rated',
            'Large Following',
            'Nice',
            'Retro',
            'Modern',
        ],
    })
    
    """
    ################## MODIFIED DATASET ##################
    df_interactions = pd.DataFrame({
        'user_id': [
            '4d17', 
            '4d17',
            '4b05',
            '4b05',
            '93dd', 
            '93dd',
            '9b34', 
            '4bcd',
            '-4bcd',
            '4bcd'
        ], 
        'channel_id': [
            '1', '2', '6',
            '3', '7', '2', 
            '8', '2', '5', '6'
        ]
    })
    
    df_channels = pd.DataFrame({
        'channel_id': [
            '1', '2', '3', '5', '6', '7', '8' 
        ],
        'channel_name': [
            'Popular', 
            'Best',
            'Highest Rated',
            'Large Following',
            'Nice', 
            'Retro',
            'Modern'
        ]
    })
    """
    
    # Write both tables to disk without the pandas index column.
    df_channels.to_csv(path_or_buf='experiment_channels.csv', index=False)
    df_interactions.to_csv(path_or_buf='experiment_interactions.csv', index=False)

    # Stream the files back row by row: two string columns each, header skipped.
    channels = tf.data.experimental.CsvDataset(
        filenames='experiment_channels.csv',
        record_defaults=[tf.string, tf.string],
        header=True,
    )
    interactions = tf.data.experimental.CsvDataset(
        filenames='experiment_interactions.csv',
        record_defaults=[tf.string, tf.string],
        header=True,
    )
    
    def preprocess_channels(x, y):
      """Keep only the first column of a channels CSV row.

      Args:
        x: Value of the first column (the channel id).
        y: Value of the second column (the channel name); ignored.

      Returns:
        `x`, unchanged.
      """
      channel_id = x
      return channel_id
    
    def preprocess_interactions(x, y):
      """Turn one interactions CSV row into the feature dict fed to the model.

      Args:
        x: Raw `user_id` string (first CSV column).
        y: Raw `channel_id` string (second CSV column).

      Returns:
        A dict with "user_id" reduced to its digits and "channel_id" unchanged.
      """
      return {
        # "[^0-9]" matches every non-digit character. The class has no second
        # "^" inside the brackets, so literal carets are stripped as well.
        "user_id": tf.strings.regex_replace(x, r"[^0-9]", ""),
        "channel_id": y
      }
    
    # Reduce each CSV row to the fields the model consumes.
    channels = channels.map(preprocess_channels)
    interactions = interactions.map(preprocess_interactions)
    
    # Learn the user-id vocabulary from the interactions stream (no mask token).
    interactions_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
    interactions_vocabulary.adapt(interactions.map(lambda x: x["user_id"]))
    
    # Learn the channel-id vocabulary from the channels stream (no mask token).
    channels_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
    channels_vocabulary.adapt(channels)
    
    # Build a model.
    class Model(tfrs.Model):
      """Two-tower retrieval model operating directly on string ids."""

      def __init__(self):
        super().__init__()

        # Query tower: user-id string -> vocabulary index -> 64-d embedding.
        user_embedding = tf.keras.layers.Embedding(
            interactions_vocabulary.vocabulary_size(), 64)
        self.user_model = tf.keras.Sequential([interactions_vocabulary, user_embedding])

        # Candidate tower: channel-id string -> vocabulary index -> 64-d embedding.
        channel_embedding = tf.keras.layers.Embedding(
            channels_vocabulary.vocabulary_size(), 64)
        self.item_model = tf.keras.Sequential([channels_vocabulary, channel_embedding])

        # Embed every channel (one id per batch) to form the candidate set the
        # top-K metrics are evaluated against, then wrap it in a retrieval task.
        candidate_embeddings = channels.batch(1).map(self.item_model)
        top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

      def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch of interactions.

        Args:
          features: Dict holding the "user_id" and "channel_id" tensors.
          training: Accepted for interface compatibility; not used here.

        Returns:
          The value produced by the retrieval task for the two embedding tensors.
        """
        return self.task(
            self.user_model(features["user_id"]),
            self.item_model(features["channel_id"]),
        )
    
    
    # Seed the global random generator before the model is constructed, so that
    # any weights created while building it are reproducible as well.
    tf.random.set_seed(42)

    model = Model()
    model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

    # NOTE(review): a batch size of 1 presumably leaves the in-batch softmax
    # with a single candidate per step; consider a larger batch for real
    # training - confirm.
    model.fit(interactions.batch(1), epochs=5)
    

    Note this example uses tf.keras.layers.StringLookup.

    The tf.data.experimental.CsvDataset class provides a minimal CSV Dataset interface.

    However, it gives you far more flexibility than a higher-level API like tf.data.experimental.make_csv_dataset. Check out the docs for more information.