I am trying to understand LSTMs and wanted to implement a simple example: classify a sequence as "0" if the number of 1s in the sequence is odd and as "1" if the number of 1s is even. This is my data generation and training routine:
import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from Dataset import LSTMDataset # Custom Dataset
from Network import LSTMNet # Custom Network
if __name__ == "__main__":
    numSamples = 1000
    sampleLength = 5

    samples = np.ndarray( shape=( numSamples, sampleLength ), dtype=np.float32 )
    labels = np.ndarray( shape=( numSamples ), dtype=np.float32 )
    for s in range( numSamples ):
        sample = np.random.choice( [ 0, 1 ], size=sampleLength )
        samples[ s ] = sample
        even = np.count_nonzero( sample == 1 ) % 2 == 0
        labels[ s ] = int( even )

    X_train, X_test, y_train, y_test = train_test_split( samples, labels, test_size=0.25, random_state=42 )
    trainingSet = LSTMDataset( X_train, y_train )
    testSet = LSTMDataset( X_test, y_test )
    training_loader = DataLoader( trainingSet, batch_size=1, shuffle=True )
    validation_loader = DataLoader( testSet, batch_size=1, shuffle=False )

    model = LSTMNet( sequenceLength=sampleLength )
    optimizer = torch.optim.SGD( model.parameters(), lr=0.001, momentum=0.9 )
    loss_fn = torch.nn.BCELoss()

    for epoch in range( 10 ):
        yPredicted = []
        yTruth = []
        for i, data in enumerate( training_loader ):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model( inputs )
            loss = loss_fn( outputs, labels )
            loss.backward()
            optimizer.step()
            yTruth.append( int( labels.item() ) )
            yPredicted.append( int( torch.round( outputs ).item() ) )
        accuracy = accuracy_score( yTruth, yPredicted )
        print( f"Accuracy: {accuracy:.2f}" )
My dataset and network:
import torch.nn as nn
from torch.utils.data import Dataset

class LSTMDataset( Dataset ):
    def __init__( self, x, y ):
        self.x = x
        self.y = y

    def __len__( self ):
        return self.y.shape[ 0 ]

    def __getitem__( self, idx ):
        sample, label = self.x[ idx ], self.y[ idx ]
        return sample.reshape( ( -1, 1 ) ), label.reshape( ( 1 ) )

class LSTMNet( nn.Module ):
    def __init__( self, sequenceLength ):
        super().__init__()
        self.hidden_size = 10
        self.lstm = nn.LSTM( input_size=1, hidden_size=self.hidden_size, num_layers=2, batch_first=True )
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.ReLU(),
            nn.Linear( sequenceLength * self.hidden_size, 1 ),
            nn.Sigmoid()
        )

    def forward( self, x ):
        x, _ = self.lstm( x )
        x = self.net( x )
        return x
But unfortunately, my training accuracy never goes beyond 53%. Does anyone have any tips on what I am doing wrong?
The input shape to my network is ( 1, 5, 1 ); I wanted to feed the sequence elements to the network one after another, which is why I chose ( 1, 5, 1 ) and not ( 1, 1, 5 ).
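For reference, a quick shape check of what the DataLoader produces (not part of my original script, just to illustrate the shapes I mean):

inputs, label = next( iter( training_loader ) )
print( inputs.shape )  # torch.Size([1, 5, 1]) -> (batch, sequence length, features per step)
print( label.shape )   # torch.Size([1, 1])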
You're putting a bunch of 0 values directly into the network. Any value multiplied by 0 is 0, so the 0s are killing your signal through the model. Replace the raw inputs with a learned embedding:
class LSTMNet( nn.Module ):
    def __init__( self, sequenceLength ):
        super().__init__()
        self.hidden_size = 10
        # added embedding layer
        self.embedding = nn.Embedding( 2, self.hidden_size )
        self.lstm = nn.LSTM( input_size=self.hidden_size, hidden_size=self.hidden_size,
                             num_layers=1, batch_first=True )
        self.net = nn.Sequential(
            nn.Flatten(),
            # added layer here, see note
            nn.Linear( sequenceLength * self.hidden_size, sequenceLength * self.hidden_size ),
            nn.ReLU(),
            nn.Linear( sequenceLength * self.hidden_size, 1 ),
            nn.Sigmoid()
        )

    def forward( self, x ):
        # remove unit axis so x is size (batch_size, sequence_length),
        # convert to long type for embedding
        x = self.embedding( x.squeeze( -1 ).long() )
        x, _ = self.lstm( x )
        x = self.net( x )
        return x
The model has an added embedding layer. I also added another linear layer in the sequential section. Strictly speaking, it is optional, but it greatly improves convergence. The output of the LSTM passes through a tanh, which means roughly half of the values are below 0. Going LSTM -> ReLU throws those values away. The model can compensate, but it will learn faster with a linear layer between the LSTM and the ReLU.
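You can see this for yourself with a quick illustrative check (a sketch, assuming a freshly initialized LSTM fed random inputs, not part of the model above):

import torch
import torch.nn as nn

# The LSTM hidden state is gated through a tanh, so its outputs lie in (-1, 1),
# and at initialization roughly half of them are negative.
lstm = nn.LSTM( input_size=10, hidden_size=10, batch_first=True )
out, _ = lstm( torch.randn( 4, 5, 10 ) )
print( out.min().item(), out.max().item() )                 # all values in (-1, 1)
print( ( torch.relu( out ) == 0 ).float().mean().item() )   # ~0.5: ReLU zeroes the negative half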
Full code:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
class LSTMDataset( Dataset ):
    def __init__( self, x, y ):
        self.x = x
        self.y = y

    def __len__( self ):
        return self.y.shape[ 0 ]

    def __getitem__( self, idx ):
        sample, label = self.x[ idx ], self.y[ idx ]
        return sample.reshape( ( -1, 1 ) ), label.reshape( ( 1 ) )

class LSTMNet( nn.Module ):
    def __init__( self, sequenceLength ):
        super().__init__()
        self.hidden_size = 10
        # added embedding layer
        self.embedding = nn.Embedding( 2, self.hidden_size )
        self.lstm = nn.LSTM( input_size=self.hidden_size, hidden_size=self.hidden_size,
                             num_layers=1, batch_first=True )
        self.net = nn.Sequential(
            nn.Flatten(),
            # added layer here, see note
            nn.Linear( sequenceLength * self.hidden_size, sequenceLength * self.hidden_size ),
            nn.ReLU(),
            nn.Linear( sequenceLength * self.hidden_size, 1 ),
            nn.Sigmoid()
        )

    def forward( self, x ):
        # remove unit axis so x is size (batch_size, sequence_length),
        # convert to long type for embedding
        x = self.embedding( x.squeeze( -1 ).long() )
        x, _ = self.lstm( x )
        x = self.net( x )
        return x
numSamples = 1000
sampleLength = 5
samples = np.ndarray( shape=( numSamples, sampleLength ), dtype=np.float32 )
labels = np.ndarray( shape=( numSamples ), dtype=np.float32 )
for s in range( numSamples ):
    sample = np.random.choice( [ 0, 1 ], size=sampleLength )
    samples[ s ] = sample
    even = np.count_nonzero( sample == 1 ) % 2 == 0
    labels[ s ] = int( even )
X_train, X_test, y_train, y_test = train_test_split( samples, labels, test_size=0.25, random_state=42 )
trainingSet = LSTMDataset( X_train, y_train )
testSet = LSTMDataset( X_test, y_test )
# note you should use a larger batch size
training_loader = DataLoader( trainingSet, batch_size=1, shuffle=True )
validation_loader = DataLoader( testSet, batch_size=1, shuffle=False )
model = LSTMNet( sampleLength )
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
loss_fn = torch.nn.BCELoss()
for epoch in range( 20 ):
    yPredicted = []
    yTruth = []
    for i, data in enumerate( training_loader ):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model( inputs )
        loss = loss_fn( outputs, labels )
        loss.backward()
        optimizer.step()
        yTruth.append( labels.detach() )
        yPredicted.append( torch.round( outputs.detach() ) )
    accuracy = accuracy_score( torch.cat( yTruth ), torch.cat( yPredicted ) )
    print( f"Accuracy: {accuracy:.2f}" )
All that said, your model hard-codes the sequence length. In this scenario, it doesn't really make sense to use an LSTM to begin with; LSTMs are for variable-length sequence tasks. If you have a hard-coded sequence length, you can just use an MLP:
class MLPNet( nn.Module ):
    def __init__( self, sequenceLength ):
        super().__init__()
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear( sequenceLength, sequenceLength ),
            nn.ReLU(),
            nn.Linear( sequenceLength, 1 ),
            nn.Sigmoid()
        )

    def forward( self, x ):
        x = self.net( x )
        return x
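It should drop straight into the training script above (a sketch using the same names as that script; nothing else needs to change), since the Flatten layer collapses the ( batch, 5, 1 ) input back to ( batch, 5 ):

# swap the model in the script above; dataset, loss function and training loop stay the same
model = MLPNet( sampleLength )
optimizer = torch.optim.SGD( model.parameters(), lr=0.001, momentum=0.9 )
loss_fn = torch.nn.BCELoss()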