I am new to PyTorch, and am trying to use it to model some time series data. I have two lists: x and y. Each item in x contains 29 timesteps with 4 features. Each corresponding value in y contains the 30th timestep with 4 columns.
When I try to create the Tensor data set, I get the following error:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Input In [35], in <cell line: 2>()
1 # Step 2: Concatenate 'x' and 'y' tensors properly
----> 2 train_dataset = TensorDataset(*x_train_tensors, *y_train_tensors)
3 valid_dataset = TensorDataset(*x_valid_tensors, *y_valid_tensors)
5 # Step 3: Create DataLoader for training and validation sets
File ~\anaconda3\lib\site-packages\torch\utils\data\dataset.py:192, in TensorDataset.__init__(self, *tensors)
191 def __init__(self, *tensors: Tensor) -> None:
--> 192 assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors), "Size mismatch between tensors"
193 self.tensors = tensors
AssertionError: Size mismatch between tensors
I have tried various approaches, including flattening the arrays but am not able to figure out what's wrong. Here's my code for reference. For reading ease, I have marked the area that's throwing the error with the comment 'Error block':
import pickle
import numpy as np
import pandas as pd
with open('lists_data.pkl', 'rb') as file:
x, y = pickle.load(file)
from sklearn.preprocessing import MinMaxScaler
# Create empty lists to store the normalized data
norm_x = []
norm_y = []
scaler = MinMaxScaler()
# Iterate over corresponding dataframes from 'x' and 'y' using zip
for temp_x, temp_y in zip(x, y):
temp = pd.concat([temp_x, temp_y])
norm_temp = pd.DataFrame(scaler.fit_transform(temp), columns=temp.columns)
norm_x.append(norm_temp.iloc[:-1]) # Append all rows except the last one to 'norm_x'
norm_y.append(norm_temp.iloc[[-1]]) # Append the last row to 'norm_y'
from sklearn.model_selection import train_test_split
# Convert 'x' and 'y' into numpy arrays for easier manipulation
x_array = np.array(norm_x)
y_array = np.array(norm_y)
# Set the random seed for reproducibility
random_seed = 42
# Split 'x' and 'y' into training (70%), validation (15%), and test (15%) sets
x_train, x_temp, y_train, y_temp = train_test_split(x_array, y_array, test_size=0.3, random_state=random_seed)
x_valid, x_test, y_valid, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=random_seed)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
x_train_tensors = [torch.tensor(x, dtype=torch.float32) for x in x_train]
x_valid_tensors = [torch.tensor(x, dtype=torch.float32) for x in x_valid]
x_test_tensors = [torch.tensor(x, dtype=torch.float32) for x in x_test]
y_train_tensors = [torch.tensor(y, dtype=torch.float32) for y in y_train]
y_valid_tensors = [torch.tensor(y, dtype=torch.float32) for y in y_valid]
y_test_tensors = [torch.tensor(y, dtype=torch.float32) for y in y_test]
class RNNModel(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(RNNModel, self).__init__()
self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x):
out, _ = self.rnn(x)
out = self.fc(out[:, -1, :])
return out
# Error block
batch_size = 64
train_dataset = TensorDataset(*x_train_tensors, *y_train_tensors)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataset = TensorDataset(*x_valid_tensors, *y_valid_tensors)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
Try squeezing your tensors using .squeeze()
method before applying TensorDataset
use:
x_train_tensors = torch.FloatTensor(x_train_tensors)
y_train_tensors = torch.FloatTensor(y_train_tensors)
y_train_tensors = y_train_tensors.squeeze()
train_dataset = TensorDataset(*x_train_tensors, *y_train_tensors)