AssertionError: Size mismatch between tensors when modelling timeseries data

I am new to PyTorch and am trying to use it to model some time series data. I have two lists, x and y. Each item in x contains 29 timesteps with 4 features. The corresponding item in y contains the 30th timestep, with the same 4 columns.

When I try to create the TensorDataset, I get the following error:

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Input In [35], in <cell line: 2>()
      1 # Step 2: Concatenate 'x' and 'y' tensors properly
----> 2 train_dataset = TensorDataset(*x_train_tensors, *y_train_tensors)
      3 valid_dataset = TensorDataset(*x_valid_tensors, *y_valid_tensors)
      5 # Step 3: Create DataLoader for training and validation sets

File ~\anaconda3\lib\site-packages\torch\utils\data\dataset.py:192, in TensorDataset.__init__(self, *tensors)
    191 def __init__(self, *tensors: Tensor) -> None:
--> 192     assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors), "Size mismatch between tensors"
    193     self.tensors = tensors

AssertionError: Size mismatch between tensors

I have tried various approaches, including flattening the arrays, but I am not able to figure out what's wrong. Here's my code for reference. For ease of reading, I have marked the area that's throwing the error with the comment 'Error block':

import pickle
import numpy as np
import pandas as pd

# Read the pickled pair back from disk; the file is expected to hold exactly
# two objects, which become the input list 'x' and the target list 'y'.
with open('lists_data.pkl', 'rb') as file:
    payload = pickle.load(file)
x, y = payload

from sklearn.preprocessing import MinMaxScaler
# Min-max normalise every (x, y) pair on its own: the input rows and the
# target row are scaled together, then split apart again.
scaler = MinMaxScaler()

norm_x = []  # normalised input rows, one DataFrame per sample
norm_y = []  # normalised target row, one single-row DataFrame per sample

for window, target in zip(x, y):
    # Put the target row underneath the input rows so both share one scaling.
    combined = pd.concat([window, target])
    # The scaler is re-fitted on every sample, so each sample gets its own range.
    scaled = pd.DataFrame(scaler.fit_transform(combined), columns=combined.columns)
    history_rows = scaled.iloc[:-1]   # everything except the last row
    target_row = scaled.iloc[[-1]]    # last row, kept as a one-row DataFrame
    norm_x.append(history_rows)
    norm_y.append(target_row)

from sklearn.model_selection import train_test_split
# Fixed seed so both splits below are reproducible
random_seed = 42

# Turn the lists of normalised samples into numpy arrays before splitting
x_array = np.array(norm_x)
y_array = np.array(norm_y)

# First split: 70% training, 30% held out
x_train, x_temp, y_train, y_temp = train_test_split(
    x_array,
    y_array,
    test_size=0.3,
    random_state=random_seed,
)

# Second split: halve the held-out part into validation (15%) and test (15%)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size=0.5,
    random_state=random_seed,
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def _as_float_tensors(samples):
    """Return a list holding one float32 tensor per entry of *samples*."""
    return [torch.tensor(sample, dtype=torch.float32) for sample in samples]


# Inputs: one tensor per sample for each split
x_train_tensors = _as_float_tensors(x_train)
x_valid_tensors = _as_float_tensors(x_valid)
x_test_tensors = _as_float_tensors(x_test)

# Targets: one tensor per sample for each split
y_train_tensors = _as_float_tensors(y_train)
y_valid_tensors = _as_float_tensors(y_valid)
y_test_tensors = _as_float_tensors(y_test)

class RNNModel(nn.Module):
    """Single-layer RNN followed by a linear layer applied to the last step."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the recurrent layer and the linear output layer.

        Args:
            input_size: Number of features per timestep fed to the RNN.
            hidden_size: Size of the RNN hidden state.
            output_size: Number of values produced by the linear layer.
        """
        super().__init__()
        # batch_first=True: the RNN takes input as (batch, seq_len, input_size).
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the RNN over *x* and project the final timestep's output.

        Args:
            x: Batch-first input tensor, (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        sequence_output, _final_hidden = self.rnn(x)
        # Only the output at the last position along the sequence axis is used.
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

# Error block
# TensorDataset takes one tensor per field (inputs, targets) and asserts that
# they all share the same size along dimension 0, the sample axis. Unpacking
# the per-sample lists with '*' passed every sample as its own tensor, so an
# x sample (29 rows) was compared against a y sample (1 row) and the
# assertion failed. Stack each list into a single tensor instead and pass
# exactly two tensors.
batch_size = 64

x_train_stacked = torch.stack(x_train_tensors)             # (N, 29, 4)
# squeeze(1) drops the singleton timestep axis: (N, 1, 4) -> (N, 4), matching
# the (batch, output_size) shape returned by RNNModel.forward.
y_train_stacked = torch.stack(y_train_tensors).squeeze(1)
train_dataset = TensorDataset(x_train_stacked, y_train_stacked)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

x_valid_stacked = torch.stack(x_valid_tensors)
y_valid_stacked = torch.stack(y_valid_tensors).squeeze(1)
valid_dataset = TensorDataset(x_valid_stacked, y_valid_stacked)
# No shuffling for validation.
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

Solution

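TensorDataset asserts that every tensor it receives has the same size along dimension 0 (the number of samples), so x and y each need one entry per sample along that dimension. Try squeezing the extra singleton dimension out of your target tensor with the .squeeze() method before building the TensorDataset.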

use:

# Combine the per-sample tensors into one batched tensor per field.
# torch.stack adds a new leading sample axis: a list of N (29, 4) tensors
# becomes (N, 29, 4) and a list of N (1, 4) tensors becomes (N, 1, 4).
x_train_tensors = torch.stack(x_train_tensors)
y_train_tensors = torch.stack(y_train_tensors)

# Drop only the singleton timestep axis, (N, 1, 4) -> (N, 4). A bare
# .squeeze() would also remove the sample axis when N == 1.
y_train_tensors = y_train_tensors.squeeze(1)
# Pass the two batched tensors directly. Unpacking them with '*' would hand
# TensorDataset one tensor per sample and trigger the size-mismatch assertion.
train_dataset = TensorDataset(x_train_tensors, y_train_tensors)