machine-learning deep-learning time-series pytorch lstm

Feeding Multiple Inputs to LSTM for Time-Series Forecasting using PyTorch

I'm currently working on building an LSTM network to forecast time-series data using PyTorch. Following Roman's blog post, I implemented a simple LSTM for univariate time-series data, please see the class definitions below. However, it's been a few days since I ground to a halt on adding more features to the input data, say an hour of the day, day of the week, week of the year, and sorts.

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Model, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = nn.LSTMCell(self.input_size, self.hidden_size)
        self.linear = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, future=0, y=None):
        outputs = []

        # reset the state of LSTM
        # the state is kept till the end of the sequence
        h_t = torch.zeros(input.size(0), self.hidden_size, dtype=torch.float32)
        c_t = torch.zeros(input.size(0), self.hidden_size, dtype=torch.float32)

        for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):
            h_t, c_t = self.lstm(input_t, (h_t, c_t))
            output = self.linear(h_t)
            outputs += [output]

        for i in range(future):
            if y is not None and random.random() > 0.5:
                output = y[:, [i]]  # teacher forcing
            h_t, c_t = self.lstm(output, (h_t, c_t))
            output = self.linear(h_t)
            outputs += [output]
        outputs = torch.stack(outputs, 1).squeeze(2)
        return outputs


class Optimization:
    "A helper class to train, test and diagnose the LSTM"

    def __init__(self, model, loss_fn, optimizer, scheduler):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.train_losses = []
        self.val_losses = []
        self.futures = []

    @staticmethod
    def generate_batch_data(x, y, batch_size):
        for batch, i in enumerate(range(0, len(x) - batch_size, batch_size)):
            x_batch = x[i : i + batch_size]
            y_batch = y[i : i + batch_size]
            yield x_batch, y_batch, batch

    def train(
        self,
        x_train,
        y_train,
        x_val=None,
        y_val=None,
        batch_size=100,
        n_epochs=20,
        dropout=0.2,
        do_teacher_forcing=None,
    ):
        seq_len = x_train.shape[1]
        for epoch in range(n_epochs):
            start_time = time.time()
            self.futures = []

            train_loss = 0
            for x_batch, y_batch, batch in self.generate_batch_data(x_train, y_train, batch_size):
                y_pred = self._predict(x_batch, y_batch, seq_len, do_teacher_forcing)
                self.optimizer.zero_grad()
                loss = self.loss_fn(y_pred, y_batch)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()
            self.scheduler.step()
            train_loss /= batch
            self.train_losses.append(train_loss)

            self._validation(x_val, y_val, batch_size)

            elapsed = time.time() - start_time
            print(
                "Epoch %d Train loss: %.2f. Validation loss: %.2f. Avg future: %.2f. Elapsed time: %.2fs."
                % (epoch + 1, train_loss, self.val_losses[-1], np.average(self.futures), elapsed)
            )

    def _predict(self, x_batch, y_batch, seq_len, do_teacher_forcing):
        if do_teacher_forcing:
            future = random.randint(1, int(seq_len) / 2)
            limit = x_batch.size(1) - future
            y_pred = self.model(x_batch[:, :limit], future=future, y=y_batch[:, limit:])
        else:
            future = 0
            y_pred = self.model(x_batch)
        self.futures.append(future)
        return y_pred

    def _validation(self, x_val, y_val, batch_size):
        if x_val is None or y_val is None:
            return
        with torch.no_grad():
            val_loss = 0
            batch = 1
            for x_batch, y_batch, batch in self.generate_batch_data(x_val, y_val, batch_size):
                y_pred = self.model(x_batch)
                loss = self.loss_fn(y_pred, y_batch)
                val_loss += loss.item()
            val_loss /= batch
            self.val_losses.append(val_loss)

    def evaluate(self, x_test, y_test, batch_size, future=1):
        with torch.no_grad():
            test_loss = 0
            actual, predicted = [], []
            for x_batch, y_batch, batch in self.generate_batch_data(x_test, y_test, batch_size):
                y_pred = self.model(x_batch, future=future)
                y_pred = (
                    y_pred[:, -len(y_batch) :] if y_pred.shape[1] > y_batch.shape[1] else y_pred
                )
                loss = self.loss_fn(y_pred, y_batch)
                test_loss += loss.item()
                actual += torch.squeeze(y_batch[:, -1]).data.cpu().numpy().tolist()
                predicted += torch.squeeze(y_pred[:, -1]).data.cpu().numpy().tolist()
            test_loss /= batch
            return actual, predicted, test_loss

    def plot_losses(self):
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Validation loss")
        plt.legend()
        plt.title("Losses")

You can find some of the helper functions that help me split and format data before feeding it to my LSTM network.

def to_dataframe(actual, predicted):
    return pd.DataFrame({"value": actual, "prediction": predicted})

def inverse_transform(scaler, df, columns):
    for col in columns:
        df[col] = scaler.inverse_transform(df[col])
    return df

def split_sequences(sequences, n_steps):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)


def train_val_test_split_new(df, test_ratio=0.2, seq_len = 100):
    y = df['value']
    X = df.drop(columns = ['value'])
    tarin_ratio = 1 - test_ratio
    val_ratio = 1 - ((train_ratio - test_ratio) / train_ratio)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio)

    return X_train, y_train, X_val, y_val, X_test, y_test

I use the following data frames to train my model.

# df_train 
value   weekday monthday    hour
timestamp               
2014-07-01 00:00:00 10844   1   1   0
2014-07-01 00:30:00 8127    1   1   0
2014-07-01 01:00:00 6210    1   1   1
2014-07-01 01:30:00 4656    1   1   1
2014-07-01 02:00:00 3820    1   1   2
... ... ... ... ...
2015-01-31 21:30:00 24670   5   31  21
2015-01-31 22:00:00 25721   5   31  22
2015-01-31 22:30:00 27309   5   31  22
2015-01-31 23:00:00 26591   5   31  23
2015-01-31 23:30:00 26288   5   31  23
10320 rows × 4 columns

# x_train 
weekday monthday    hour
timestamp           
2014-08-26 16:30:00 1   26  16
2014-08-18 16:30:00 0   18  16
2014-10-22 20:00:00 2   22  20
2014-12-10 08:00:00 2   10  8
2014-07-27 22:00:00 6   27  22
... ... ... ...
2014-08-24 05:30:00 6   24  5
2014-11-24 12:00:00 0   24  12
2014-12-18 06:00:00 3   18  6
2014-07-27 17:00:00 6   27  17
2014-12-05 21:00:00 4   5   21
6192 rows × 3 columns

# y_train 
timestamp
2014-08-26 16:30:00    14083
2014-08-18 16:30:00    14465
2014-10-22 20:00:00    25195
2014-12-10 08:00:00    21348
2014-07-27 22:00:00    16356
                       ...  
2014-08-24 05:30:00     2948
2014-11-24 12:00:00    16292
2014-12-18 06:00:00     7029
2014-07-27 17:00:00    18883
2014-12-05 21:00:00    26284
Name: value, Length: 6192, dtype: int64

After transforming and splitting time-series data into smaller batches, the training data set for X and y becomes as follows:

X_data shape is (6093, 100, 3)
y_data shape is (6093,)
tensor([[[-1.0097,  1.1510,  0.6508],
         [-1.5126,  0.2492,  0.6508],
         [-0.5069,  0.7001,  1.2238],
         ...,
         [ 1.5044, -1.4417, -1.6413],
         [ 1.0016, -0.0890,  0.7941],
         [ 1.5044, -0.9908, -0.2087]],

        [[-1.5126,  0.2492,  0.6508],
         [-0.5069,  0.7001,  1.2238],
         [-0.5069, -0.6526, -0.4952],
         ...,
         [ 1.0016, -0.0890,  0.7941],
         [ 1.5044, -0.9908, -0.2087],
         [ 0.4988,  0.5874,  0.5076]],

        [[-0.5069,  0.7001,  1.2238],
         [-0.5069, -0.6526, -0.4952],
         [ 1.5044,  1.2637,  1.5104],
         ...,
         [ 1.5044, -0.9908, -0.2087],
         [ 0.4988,  0.5874,  0.5076],
         [ 0.4988,  0.5874, -0.6385]],

        ...,

        [[ 1.0016,  0.9255, -1.2115],
         [-1.0097, -0.9908,  1.0806],
         [-0.0041,  0.8128,  0.3643],
         ...,
         [ 1.5044,  0.9255, -0.9250],
         [-1.5126,  0.9255,  0.0778],
         [-0.0041,  0.2492, -0.7818]],

        [[-1.0097, -0.9908,  1.0806],
         [-0.0041,  0.8128,  0.3643],
         [-0.5069,  1.3765, -0.0655],
         ...,
         [-1.5126,  0.9255,  0.0778],
         [-0.0041,  0.2492, -0.7818],
         [ 1.5044,  1.2637,  0.7941]],

        [[-0.0041,  0.8128,  0.3643],
         [-0.5069,  1.3765, -0.0655],
         [-0.0041, -1.6672, -0.4952],
         ...,
         [-0.0041,  0.2492, -0.7818],
         [ 1.5044,  1.2637,  0.7941],
         [ 0.4988, -1.2163,  1.3671]]])
tensor([ 0.4424,  0.1169,  0.0148,  ..., -1.1653,  0.5394,  1.6037])

Finally, just to check if the dimensions of all these training, validation, and test datasets are correct, I print out their shapes.

train shape is: torch.Size([6093, 100, 3])
train label shape is: torch.Size([6093])
val shape is: torch.Size([1965, 100, 3])
val label shape is: torch.Size([1965])
test shape is: torch.Size([1965, 100, 3])
test label shape is: torch.Size([1965])

When I try to build the model as follows, I end up getting a RuntimeError pointing at inconsistent input sizes.

model_params = {'train_ratio': 0.8, 
                'validation_ratio': 0.2,
                'sequence_length': 100,
                'teacher_forcing': False,
                'dropout_rate': 0.2,
                'batch_size': 100,
                'num_of_epochs': 5,
                'hidden_size': 24,
                'n_features': 3,
                'learning_rate': 1e-3
               }

train_ratio = model_params['train_ratio']
val_ratio = model_params['validation_ratio']
seq_len = model_params['sequence_length']
teacher_forcing = model_params['teacher_forcing']
dropout_rate = model_params['dropout_rate']
batch_size = model_params['batch_size']
n_epochs = model_params['num_of_epochs']
hidden_size = model_params['hidden_size']
n_features = model_params['n_features']
lr = model_params['learning_rate']


model = Model(input_size=n_features, hidden_size=hidden_size, output_size=1)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
optimization = Optimization(model, loss_fn, optimizer, scheduler)

start_time = datetime.now()
optimization.train(x_train, y_train, x_val, y_val, 
                     batch_size=batch_size, 
                     n_epochs=n_epochs,
                     dropout=dropout_rate, 
                     do_teacher_forcing=teacher_forcing)

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-192-6fc406c0113d> in <module>
      6 
      7 start_time = datetime.now()
----> 8 optimization.train(x_train, y_train, x_val, y_val, 
      9                      batch_size=batch_size,
     10                      n_epochs=n_epochs,

<ipython-input-189-c18d20430910> in train(self, x_train, y_train, x_val, y_val, batch_size, n_epochs, dropout, do_teacher_forcing)
     68             train_loss = 0
     69             for x_batch, y_batch, batch in self.generate_batch_data(x_train, y_train, batch_size):
---> 70                 y_pred = self._predict(x_batch, y_batch, seq_len, do_teacher_forcing)
     71                 self.optimizer.zero_grad()
     72                 loss = self.loss_fn(y_pred, y_batch)

<ipython-input-189-c18d20430910> in _predict(self, x_batch, y_batch, seq_len, do_teacher_forcing)
     93         else:
     94             future = 0
---> 95             y_pred = self.model(x_batch)
     96         self.futures.append(future)
     97         return y_pred

~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

<ipython-input-189-c18d20430910> in forward(self, input, future, y)
     17 
     18         for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):
---> 19             h_t, c_t = self.lstm(input_t, (h_t, c_t))
     20             output = self.linear(h_t)
     21             outputs += [output]

~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~\Anaconda3\lib\site-packages\torch\nn\modules\rnn.py in forward(self, input, hx)
    963 
    964     def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
--> 965         self.check_forward_input(input)
    966         if hx is None:
    967             zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)

~\Anaconda3\lib\site-packages\torch\nn\modules\rnn.py in check_forward_input(self, input)
    789     def check_forward_input(self, input: Tensor) -> None:
    790         if input.size(1) != self.input_size:
--> 791             raise RuntimeError(
    792                 "input has inconsistent input_size: got {}, expected {}".format(
    793                     input.size(1), self.input_size))

RuntimeError: input has inconsistent input_size: got 1, expected 3

I suspect my current LSTM model class does not support data with multiple features, and I've been trying out different approaches lately with no luck so far. Feel free to share your thoughts or point me in the right direction that could help me solve this problem.

As suggested by @stackoverflowuser2010, I printed out the shapes of the tensors input_t, h_t and c_t that is fed into the forward step before the error is thrown.

input_t
torch.Size([100, 1, 3])
h_t
torch.Size([100, 24])
c_t
torch.Size([100, 24])

Solution

After muddling through for a couple of weeks, I solved the issue. This has been a fruitful journey for me, so I'd like to share what I have discovered. If you'd like to have a look at the complete walk-through with code, please check out my Medium post on the matter.

Just as in Pandas, I found that things tend to work faster and smoother when I stick to the PyTorch way. Both libraries rely on NumPy, and I'm sure one can do pretty much all the table and matrix operations explicitly with NumPy arrays and functions. However, doing so does eliminate all the nice abstractions and performance improvements these libraries provide and turn each step into a CS exercise. It's fun until it isn't.

Rather than shaping all the training and validation sets manually to pass them to the model, PyTorch's TensorDataset and DataLoaders classes have immensely helped me. Scaling the feature and target sets for training and validation, we then have NumPy arrays. We can transform these arrays into Tensors and use these Tensors to create our TensorDataset, or a custom Dataset depending on your requirements. Finally, DataLoaders allow us to iterate over such datasets with much less hassle than otherwise as they already provide built-in batching, shuffling, and dropping the last batch options.

train_features = torch.Tensor(X_train_arr)
train_targets = torch.Tensor(y_train_arr)

val_features = torch.Tensor(X_val_arr)
val_targets = torch.Tensor(y_val_arr)

train = TensorDataset(train_features, train_targets)
train_loader = DataLoader(train, batch_size=64, shuffle=False, drop_last=True)

val = TensorDataset(val_features, val_targets)
val_loader = DataLoader(val, batch_size=64, shuffle=False, drop_last=True)

After transforming our data into iterable datasets, they can later be used to do mini-batch training. Instead of explicitly defining batches or wrestling with matrix operations, we can easily iterate over them via DataLoaders as follows.

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)

criterion = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=1e-2)

train_losses = []
val_losses = []
train_step = make_train_step(model, criterion, optimizer)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for epoch in range(n_epochs):
    batch_losses = []
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.view([batch_size, -1, n_features]).to(device)
        y_batch = y_batch.to(device)
        loss = train_step(x_batch, y_batch)
        batch_losses.append(loss)
    training_loss = np.mean(batch_losses)
    train_losses.append(training_loss)    
    with torch.no_grad():
        batch_val_losses = []
        for x_val, y_val in val_loader:
            x_val = x_val.view([batch_size, -1, n_features]).to(device)
            y_val = y_val.to(device)        
            model.eval()
            yhat = model(x_val)
            val_loss = criterion(y_val, yhat).item()
            batch_val_losses.append(val_loss)
        validation_loss = np.mean(batch_val_losses)
        val_losses.append(validation_loss)
    
    print(f"[{epoch+1}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}")

Another cool feature that PyTorch provides is the view() function, which allows faster and memory-efficient reshaping of tensors. Since I earlier defined my LSTM model with batch_first = True, the batch tensor for the feature set must have the shape of (batch size, time steps, number of features). The line in the code above x_batch = x_batch.view([batch_size, -1, n_features]).to(device) just does that.

I hope this answer helps those dealing with similar problems or at least gives an idea of which direction to take. I had changed a lot in the code shared in the original post, but I'll not put them all here for the sake of simplicity. Feel free to check out the rest of it in my other SO post here.