Search code examples
pythonpytorchlstm

Multivariate input LSTM in pytorch


I would like to implement an LSTM for multivariate input in PyTorch.

Following this article (https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/), which uses Keras, the input data have the shape (number of samples, number of timesteps, number of parallel features):

# two parallel input series; the target series is their element-wise sum
in_seq1 = array(range(10, 100, 10))
in_seq2 = array(range(15, 100, 10))
out_seq = in_seq1 + in_seq2
. . . 
Input     Output
[[10 15]
 [20 25]
 [30 35]] 65
[[20 25]
 [30 35]
 [40 45]] 85
[[30 35]
 [40 45]
 [50 55]] 105
[[40 45]
 [50 55]
 [60 65]] 125
[[50 55]
 [60 65]
 [70 75]] 145
[[60 65]
 [70 75]
 [80 85]] 165
[[70 75]
 [80 85]
 [90 95]] 185

n_timesteps = 3
n_features = 2

In keras it seems to be easy:

model.add(LSTM(50, activation='relu', input_shape=(n_timesteps, n_features)))

Can it be done in some other way than creating n_features separate LSTMs as the first layer, feeding each one separately (imagine multiple streams of sequences), and then flattening their outputs into a linear layer?

I'm not 100% sure, but by the nature of an LSTM the input cannot be flattened and passed as a 1D array, because each sequence "plays by different rules" which the LSTM is supposed to learn.

So how does such a Keras implementation map to the PyTorch input of shape (seq_len, batch, input_size)? (Source: https://pytorch.org/docs/stable/nn.html#lstm)


Edit:

Can it be done in some other way than creating n_features separate LSTMs as the first layer, feeding each one separately (imagine multiple streams of sequences), and then flattening their outputs into a linear layer?

According to the PyTorch docs, the input_size parameter actually means the number of features (assuming "features" here means the number of parallel sequences).


Solution

  • I hope the tricky parts are commented well enough to make sense:

    Data preparation

    import random
    import numpy as np
    import torch
    
    # multivariate data preparation
    from numpy import array
    from numpy import hstack
     
    # split a multivariate sequence into samples
    def split_sequences(sequences, n_steps):
        """Slice a 2-D multivariate series into sliding-window samples.

        Every column of ``sequences`` except the last is treated as an input
        feature and the last column as the target. Each sample pairs
        ``n_steps`` consecutive feature rows with the target value found on
        the final row of that window.

        Returns a pair ``(X, y)`` of numpy arrays built from the collected
        windows and their targets.
        """
        total_rows = len(sequences)
        windows, targets = [], []
        start = 0
        # slide one row at a time; stop once the window would run past the data
        while start < total_rows and start + n_steps <= total_rows:
            stop = start + n_steps
            windows.append(sequences[start:stop, :-1])
            targets.append(sequences[stop - 1, -1])
            start += 1
        return array(windows), array(targets)
     
    # build two parallel input series and their element-wise sum as the target
    in_seq1 = array(range(0, 100, 10))
    in_seq2 = array(range(5, 105, 10))
    out_seq = in_seq1 + in_seq2
    # turn every 1-D series into a single column: [rows, 1]
    in_seq1 = in_seq1.reshape(-1, 1)
    in_seq2 = in_seq2.reshape(-1, 1)
    out_seq = out_seq.reshape(-1, 1)
    # place the columns side by side: (feature 1, feature 2, target)
    dataset = hstack([in_seq1, in_seq2, out_seq])
    

    Multivariate LSTM Network

    class MV_LSTM(torch.nn.Module):
        """LSTM regressor for multivariate sequences.

        A batch-first LSTM reads inputs of shape
        ``(batch_size, seq_length, n_features)``; its per-timestep outputs are
        flattened and mapped to a single value per sample by a linear layer.

        The recurrent state is stored on ``self.hidden`` and is overwritten by
        every ``forward`` call. Call ``init_hidden(batch_size)`` before a batch
        to reset it to zeros; until it is called for the first time the LSTM
        uses its own default zero state.
        """

        def __init__(self, n_features, seq_length):
            super(MV_LSTM, self).__init__()
            self.n_features = n_features
            self.seq_len = seq_length
            self.n_hidden = 20  # size of the LSTM hidden state
            self.n_layers = 1  # number of stacked LSTM layers
            # No state yet: forward() passes None to the LSTM, which then
            # starts from zeros, so the model is usable before init_hidden().
            self.hidden = None

            self.l_lstm = torch.nn.LSTM(input_size=n_features,
                                        hidden_size=self.n_hidden,
                                        num_layers=self.n_layers,
                                        batch_first=True)
            # according to the pytorch docs the LSTM output is
            # (batch_size, seq_len, num_directions * hidden_size)
            # when batch_first = True, so the flattened size per sample is
            # n_hidden * seq_len
            self.l_linear = torch.nn.Linear(self.n_hidden * self.seq_len, 1)

        def init_hidden(self, batch_size):
            """Reset the stored (hidden, cell) state to zeros for ``batch_size`` samples."""
            # even with batch_first = True the state layout stays
            # (num_layers, batch, hidden_size), as in the docs
            state_shape = (self.n_layers, batch_size, self.n_hidden)
            # allocate on the same device/dtype as the weights so the model
            # keeps working after .to(device) or .double()
            reference = next(self.parameters())
            hidden_state = torch.zeros(state_shape, device=reference.device,
                                       dtype=reference.dtype)
            cell_state = torch.zeros(state_shape, device=reference.device,
                                     dtype=reference.dtype)
            self.hidden = (hidden_state, cell_state)

        def forward(self, x):
            """Return one prediction per sample, shape ``(batch_size, 1)``.

            ``x`` must be 3-D: ``(batch_size, seq_len, n_features)``.
            """
            batch_size, _, _ = x.size()

            # self.hidden is None until init_hidden() is called; the LSTM
            # treats None as an all-zero initial state
            lstm_out, self.hidden = self.l_lstm(x, self.hidden)
            # lstm_out (with batch_first = True) is
            # (batch_size, seq_len, num_directions * hidden_size);
            # keep the batch dimension and merge the rest for the linear layer
            # .contiguous() -> guarantees .view() can reinterpret the memory
            flat = lstm_out.contiguous().view(batch_size, -1)
            return self.l_linear(flat)
    

    Initialization

    n_timesteps = 3  # window length: number of time steps per sample
    n_features = 2  # number of parallel input series per time step

    # slice the stacked dataset into windowed inputs and their targets
    X, y = split_sequences(dataset, n_timesteps)
    print(X.shape, y.shape)

    # training hyper-parameters
    batch_size = 16
    train_episodes = 500

    # model, loss and optimizer
    mv_net = MV_LSTM(n_features, n_timesteps)
    # default reduction is kept: reduction='sum' created a huge loss value
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(mv_net.parameters(), lr=0.1)
    

    Training

    mv_net.train()

    # Convert the whole dataset to float32 tensors once: the data does not
    # change between epochs, so converting inside the loop only repeated work.
    X_all = torch.tensor(X, dtype=torch.float32)
    y_all = torch.tensor(y, dtype=torch.float32)

    for t in range(train_episodes):
        for b in range(0, len(X), batch_size):
            x_batch = X_all[b:b + batch_size]
            y_batch = y_all[b:b + batch_size]

            # start every batch from a zero state sized to this batch
            # (the last batch may be smaller than batch_size)
            mv_net.init_hidden(x_batch.size(0))
            output = mv_net(x_batch)
            # flatten the predictions so they line up with the 1-D targets
            loss = criterion(output.view(-1), y_batch)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # report the loss of the last batch of this epoch
        print('step : ' , t , 'loss : ' , loss.item())
    

    Results

    step :  499 loss :  0.0010267728939652443 # probably overfitted due to 500 training episodes