I was wondering if someone could share some ideas about why my training loss starts at a higher level than the test loss?
I am running an LSTM on daily stock return data as the only input, using the 10 previous days' returns to predict the return on the next day. The training/validation/test sets do not overlap, so there is no leakage, and I am not using any regularisation that would affect the training data only.
I am really confused at the moment, as I cannot seem to find the error.
I will include the code below, but it's quite long.
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
# Defining the LSTM class
class LSTM(nn.Module):
    def __init__(self, n_inputs, n_hidden, num_layers, n_outputs):
        super(LSTM, self).__init__()
        self.D = n_inputs    # number of input features
        self.M = n_hidden    # hidden state size
        self.K = n_outputs   # number of outputs
        self.L = num_layers  # number of stacked LSTM layers
        self.rnn = nn.LSTM(
            input_size=self.D,
            hidden_size=self.M,
            num_layers=self.L,
            batch_first=True)
        self.fc = nn.Linear(self.M, self.K)

    def forward(self, X):
        # initial hidden and cell states
        h0 = torch.zeros(self.L, X.size(0), self.M).to(device)
        c0 = torch.zeros(self.L, X.size(0), self.M).to(device)
        # get RNN unit output
        out, _ = self.rnn(X, (h0, c0))
        # we only want h(T) at the final time step
        out = self.fc(out[:, -1, :])
        return out
# Defining a function to train the LSTM with full-batch gradient descent
def full_gd(model,
            loss_function,
            optimizer,
            X_train,
            y_train,
            X_test,
            y_test,
            no_epochs):
    # Arrays to store the per-epoch losses
    train_losses = np.zeros(no_epochs)
    test_losses = np.zeros(no_epochs)
    for it in range(no_epochs):
        # zero the parameter gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(X_train)
        loss = loss_function(outputs, y_train)
        # Backward and optimize
        loss.backward()
        optimizer.step()
        # Save losses
        train_losses[it] = loss.item()
        # Test loss (no gradients needed for evaluation)
        with torch.no_grad():
            test_outputs = model(X_test)
            test_loss = loss_function(test_outputs, y_test)
        test_losses[it] = test_loss.item()
        if (it + 1) % 10 == 0:
            print(f'Epoch {it+1}/{no_epochs}, Train Loss: {loss.item():.4f}, Test Loss: {test_loss.item():.4f}')
    return train_losses, test_losses
# Scale the returns data with sklearn's StandardScaler, fitting only on the training portion
# (`data` holds the raw daily returns series)
scaler = StandardScaler()
scaler.fit(data[:3*len(data)//5])
historical_returns = scaler.transform(data)
# Creating the dataset to train the LSTM. D is the number of input features. T is the number of previous data points used in forecasting
T = 10
D = 1
X = []
Y = []
for t in range(len(historical_returns) - T):
    x = historical_returns[t:t+T]
    X.append(x)
    y = historical_returns[t+T]
    Y.append(y)
X_historical = np.array(X).reshape(-1, T, 1)
Y_historical = np.array(Y).reshape(-1, 1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Splitting the data into a 60/20/20 train/validation/test split. No random split is used here as this is a time series dataset
x_train1 = torch.from_numpy(X_historical[:3*len(historical_returns)//5].astype(np.float32))
y_train1 = torch.from_numpy(Y_historical[:3*len(historical_returns)//5].astype(np.float32))
x_val1 = torch.from_numpy(X_historical[-2*len(historical_returns)//5: -1*len(historical_returns)//5].astype(np.float32))
y_val1 = torch.from_numpy(Y_historical[-2*len(historical_returns)//5: -1*len(historical_returns)//5].astype(np.float32))
x_test1 = torch.from_numpy(X_historical[-1*len(historical_returns)//5:].astype(np.float32))
y_test1 = torch.from_numpy(Y_historical[-1*len(historical_returns)//5:].astype(np.float32))
# move data to GPU
x_train1, y_train1 = x_train1.to(device), y_train1.to(device)
x_val1, y_val1 = x_val1.to(device), y_val1.to(device)
x_test1, y_test1 = x_test1.to(device), y_test1.to(device)
x_train1 = x_train1.reshape(-1, T, 1)
x_test1 = x_test1.reshape(-1, T, 1)
x_val1 = x_val1.reshape(-1, T, 1)
# Define the model parameters
Hidden = 10
model = LSTM(1, Hidden, 1, 1)
model.to(device)
loss_function = nn.MSELoss()
learning_rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Train the model
no_epochs = 200
train_losses, validation_losses = full_gd(model,
                                          loss_function,
                                          optimizer,
                                          x_train1,
                                          y_train1,
                                          x_val1,
                                          y_val1,
                                          no_epochs)
# Plot training and validation loss
plt.figure(figsize=(12,8))
plt.plot(train_losses, label='train loss')
plt.plot(validation_losses, label='validation loss')
plt.legend()
plt.show()
Well, there might be several reasons.
One natural reason arises from the dataset sizes: the validation split is considerably smaller than the training split. Early in training the model is essentially making random guesses (that is roughly its initial state), and a loss averaged over a large number of training samples is a much steadier estimate than one averaged over a small validation split, which can happen to come out lower purely by chance.
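As a rough, self-contained illustration of that size effect (purely synthetic numbers, nothing from your returns data), the mean squared error measured on a small split simply fluctuates more from draw to draw than the one measured on a large split, so it can start out lower just by luck:
import numpy as np

rng = np.random.default_rng(0)
# hypothetical per-observation errors of an untrained, random-guessing model
errors = rng.standard_normal(10_000)
for n in (6000, 2000):  # roughly "training-sized" vs "validation-sized" splits
    # MSE recomputed on 1,000 random subsets of size n
    mses = [np.mean(rng.choice(errors, size=n) ** 2) for _ in range(1_000)]
    print(f"split size {n}: mean MSE {np.mean(mses):.3f}, std {np.std(mses):.3f}")
Both means come out around 1, but the spread is noticeably larger for the smaller split.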
Also, your model does not seem to have learned much; it performs poorly even on the training data, which is not what you want. Keep in mind that RNNs are hard to train, though. You can try some potential aids, such as increasing the number of epochs or making the model more complex. If you can compare your results against another published experiment on similar data, you should do so; that would give you a sense of how well or badly your own experiment turned out.
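For example, reusing your own LSTM class and full_gd function, a higher-capacity, longer run might look like the sketch below; the hidden size, layer count, learning rate and epoch count are only illustrative guesses, not tuned values:
# Illustrative only: a larger model trained for longer, built from the code in the question
Hidden = 64                    # more hidden units than the original 10
model = LSTM(1, Hidden, 2, 1)  # two stacked LSTM layers instead of one
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # smaller learning rate than 0.01
no_epochs = 1000               # more epochs than the original 200
train_losses, validation_losses = full_gd(model, loss_function, optimizer,
                                          x_train1, y_train1, x_val1, y_val1,
                                          no_epochs)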