python tensorflow keras deep-learning pytorch

Convert TensorFlow model to PyTorch model - model isn't learning

I'm trying to port a tensorflow neural network to pytorch, as an exercise to familiarize myself with both / their nuances. This is the tensorflow network I'm porting to pytorch:

import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.datasets import imdb

# Load the IMDB sentiment data with the vocabulary capped at 5000 word ids.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)

# Bring every review to a fixed length of 400 ids, padding at the end.
x_train = sequence.pad_sequences(x_train, maxlen=400, padding="post")
x_test = sequence.pad_sequences(x_test, maxlen=400, padding="post")

# Same layer stack as the incremental model.add(...) form, declared in one go:
# embedding -> dropout -> conv (+ReLU) -> global max pool -> dense -> dropout
# -> ReLU -> dense -> sigmoid.
model = Sequential([
    Embedding(5000, 50, input_length=400),
    Dropout(0.2),
    Conv1D(250, 3, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(250),
    Dropout(0.2),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# Train for 10 epochs in batches of 32, evaluating on the test split each epoch.
h2 = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)

The output shape of each layer is shown below:

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 400, 50)           250000    
 dropout (Dropout)           (None, 400, 50)           0         
 conv1d (Conv1D)             (None, 398, 250)          37750     
 global_max_pooling1d (Globa  (None, 250)              0         
 lMaxPooling1D)                                                  
 dense (Dense)               (None, 250)               62750     
 dropout_1 (Dropout)         (None, 250)               0         
 activation (Activation)     (None, 250)               0         
 dense_1 (Dense)             (None, 1)                 251       
 activation_1 (Activation)   (None, 1)                 0         
                                                                 
=================================================================
Total params: 350,751
Trainable params: 350,751
Non-trainable params: 0

And the output of the tensorflow model is:

Epoch 1/10
loss: 0.4043 - accuracy: 0.8021 - val_loss: 0.2764 - val_accuracy: 0.8854
Epoch 2/10
loss: 0.2332 - accuracy: 0.9052 - val_loss: 0.2690 - val_accuracy: 0.8888
Epoch 3/10
loss: 0.1598 - accuracy: 0.9389 - val_loss: 0.2948 - val_accuracy: 0.8832
Epoch 4/10
loss: 0.1112 - accuracy: 0.9600 - val_loss: 0.3015 - val_accuracy: 0.8906
Epoch 5/10
loss: 0.0810 - accuracy: 0.9700 - val_loss: 0.3057 - val_accuracy: 0.8868
Epoch 6/10
loss: 0.0537 - accuracy: 0.9811 - val_loss: 0.4055 - val_accuracy: 0.8868
Epoch 7/10
loss: 0.0408 - accuracy: 0.9860 - val_loss: 0.4083 - val_accuracy: 0.8852
Epoch 8/10
loss: 0.0411 - accuracy: 0.9845 - val_loss: 0.4789 - val_accuracy: 0.8789
Epoch 9/10
loss: 0.0380 - accuracy: 0.9862 - val_loss: 0.4828 - val_accuracy: 0.8827
Epoch 10/10
loss: 0.0329 - accuracy: 0.9879 - val_loss: 0.4999 - val_accuracy: 0.8825

Here's what I have in my PyTorch port:

from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

class CustomDataset(Dataset):
    """Map-style dataset pairing each entry of ``x`` with the matching entry of ``y``."""

    def __init__(self, x, y):
        # Both containers are stored as given; nothing is copied or converted.
        self.x, self.y = x, y

    def __len__(self):
        # The sample count is taken from the targets.
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, idx):
        # Hand back the (input, target) pair stored at position ``idx``.
        features = self.x[idx]
        target = self.y[idx]
        return features, target
    
# torch.Tensor(...) produces float32 tensors, so the token ids travel through
# the loaders as floats; the labels need to be float for BCELoss anyway.
train_dataloader = DataLoader(
    dataset=CustomDataset(x=torch.Tensor(x_train), y=torch.Tensor(y_train)),
    batch_size=32,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=CustomDataset(x=torch.Tensor(x_test), y=torch.Tensor(y_test)),
    batch_size=32,
    shuffle=True,
)

class MyModel(torch.nn.Module):
    """First PyTorch port of the Keras 1-D CNN classifier (as posted in the question).

    Layers: embedding -> dropout -> Conv1d -> adaptive max pool (size 1) ->
    linear -> dropout -> ReLU -> linear -> sigmoid.

    NOTE(review): this is the version that fails to learn; see the notes in
    ``forward`` for the places where it differs from the Keras model.
    """

    def __init__(self, vocab_size=5000, input_len=400, embedding_dims=50, kernel_size=3, filters=250, hidden_dims=250):
        super(MyModel, self).__init__()
        # Kept because forward() uses both values to build its view() shape.
        self.embedding_dims = embedding_dims
        self.input_len = input_len
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dims)
        self.dropout1 = torch.nn.Dropout(p=0.2)
        # Conv1d expects input laid out as (batch, in_channels, length).
        self.conv1d = torch.nn.Conv1d(in_channels=embedding_dims, out_channels=filters, kernel_size=kernel_size, padding=(0,), stride=1)
        # Output size 1 reduces the length axis to a single max per channel.
        self.pool = torch.nn.AdaptiveMaxPool1d(1)
        # NOTE(review): the input to this layer has `filters` features (the conv
        # output channels); in_features=hidden_dims only matches while
        # filters == hidden_dims, as with the defaults.
        self.linear1 = torch.nn.Linear(in_features=hidden_dims, out_features=hidden_dims)
        self.dropout2 = torch.nn.Dropout(p=0.2)
        self.activation = torch.nn.ReLU()
        self.output = torch.nn.Linear(in_features=hidden_dims, out_features=1)
        self.activation2 = torch.nn.Sigmoid()

    def forward(self, x):
        # Cast the incoming ids to integer indices and embed them; the
        # embedding output is (batch, input_len, embedding_dims).
        x = self.dropout1(self.embedding(x.type(torch.LongTensor)))
        # NOTE(review): view() only reinterprets the existing memory with a new
        # shape; it does not swap the length and embedding axes, so the values
        # handed to the convolution are not a transpose of the embeddings.
        # NOTE(review): no ReLU is applied to the convolution output here,
        # whereas the Keras Conv1D layer uses activation='relu'.
        x = self.conv1d(x.view(-1, self.embedding_dims, self.input_len))
        x = self.pool(x)
        # Flatten (batch, filters, 1) to (batch, filters) before the dense layers.
        x = self.activation(self.dropout2(self.linear1(x.view(-1,x.size()[1]))))
        x = self.activation2(self.output(x))
        return x

class FitTorchModel():
    """Minimal Keras-style training loop for a binary classifier.

    Trains ``model`` with Adam (lr=0.001) and mean-reduced binary
    cross-entropy, recording the per-epoch mean loss and accuracy on both the
    training and the validation loader.

    Args:
        model: module whose output, flattened with ``view(-1)``, is passed to
            ``BCELoss`` (so it must already be a probability per sample).
        num_epochs: number of epochs run by ``fit``.
        steps_per_epoch: maximum number of batches drawn from a loader in one
            pass (used for training and validation alike). A loader with
            fewer batches is simply consumed to its end.
    """

    def __init__(self, model, num_epochs=10, steps_per_epoch=782):
        self.model = model
        self.epochs = num_epochs
        self.steps_per_epoch = steps_per_epoch

    def fit(self, train_dataloader, test_dataloader):
        """Train the model and return the per-epoch metrics.

        Args:
            train_dataloader: iterable of ``(x, y)`` training batches.
            test_dataloader: iterable of ``(x, y)`` validation batches.

        Returns:
            A DataFrame with one row per epoch and the columns ``Loss``,
            ``Accuracy``, ``Val_Loss`` and ``Val_Acc``.

        Raises:
            ValueError: if a pass over either loader processes no batch.
        """
        opt = torch.optim.Adam(self.model.parameters(), lr=0.001)
        crit = torch.nn.BCELoss(reduction="mean")
        columns = ["Loss", "Accuracy", "Val_Loss", "Val_Acc"]
        # Rows are collected in a list and turned into a frame once at the
        # end, instead of concatenating onto an empty DataFrame every epoch.
        history = []
        for epoch in range(self.epochs):
            self.model.train()
            print(f"Epoch {epoch}")
            epoch_loss, epoch_acc = self._run_batches(train_dataloader, crit, opt)
            val_loss, val_acc = self.predict_proba(test_dataloader, crit)
            history.append({"Loss": epoch_loss,
                            "Accuracy": epoch_acc,
                            "Val_Loss": val_loss, "Val_Acc": val_acc})
        return pd.DataFrame(history, columns=columns)

    def predict_proba(self, test_dataloader, crit):
        """Evaluate the model without gradient tracking.

        Leaves the model in eval mode.

        Args:
            test_dataloader: iterable of ``(x, y)`` batches.
            crit: loss callable applied as ``crit(y_pred, y)``.

        Returns:
            ``(mean_loss, mean_accuracy)`` averaged over the processed batches.

        Raises:
            ValueError: if no batch was processed.
        """
        self.model.eval()
        with torch.no_grad():
            return self._run_batches(test_dataloader, crit)

    def _run_batches(self, dataloader, crit, opt=None):
        """Run up to ``steps_per_epoch`` batches; optimise when ``opt`` is given.

        Returns the per-batch mean of the loss and of the accuracy.
        """
        loss_sum = 0.0
        acc_sum = 0.0
        n_batches = 0
        batches = iter(dataloader)
        for _ in tqdm(range(self.steps_per_epoch)):
            # Stop cleanly when the loader runs out before steps_per_epoch,
            # rather than letting StopIteration escape from next().
            batch = next(batches, None)
            if batch is None:
                break
            x, y = batch
            if opt is not None:
                opt.zero_grad()
            y_pred = self.model(x).view(-1)
            loss = crit(y_pred, y)
            loss_sum += loss.item()
            # Predictions are thresholded at 0.5 and compared with y == 1.
            acc_sum += accuracy_score(y == 1, y_pred > 0.5)
            if opt is not None:
                loss.backward()
                opt.step()
            n_batches += 1
        if n_batches == 0:
            raise ValueError(
                "no batches were processed: the dataloader is empty or "
                "steps_per_epoch is 0"
            )
        return loss_sum / n_batches, acc_sum / n_batches

# Train a freshly initialised MyModel for 10 epochs of 782 batches each and
# keep the per-epoch metrics table.
ftm = FitTorchModel(
    model=MyModel(),
    num_epochs=10,
    steps_per_epoch=782,
)
history_df = ftm.fit(
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
)

The shape of each layer is:

After embedding layer: torch.Size([32, 400, 50])
After dropout1 layer: torch.Size([32, 400, 50])
After convolution1d layer: torch.Size([32, 250, 398])
After maxpooling layer: torch.Size([32, 250, 1])
After linear1 layer: torch.Size([32, 250])
After dropout2 layer: torch.Size([32, 250])
After activation layer: torch.Size([32, 250])
After output layer: torch.Size([32, 1])
After activation2 layer: torch.Size([32, 1])

The output of the pytorch model training is:

       Loss  Accuracy  Val_Loss   Val_Acc
0  0.697899  0.505874  0.692495  0.511629
1  0.693063  0.503477  0.693186  0.503637
2  0.693190  0.496044  0.693149  0.499201
3  0.693181  0.501359  0.693082  0.502038
4  0.693169  0.503237  0.693234  0.495964
5  0.693177  0.500240  0.693154  0.500679
6  0.693069  0.507473  0.693258  0.498881
7  0.693948  0.500320  0.693145  0.501598
8  0.693196  0.499640  0.693164  0.496324
9  0.693170  0.500759  0.693140  0.501918

Couple things: the accuracy hovers around guessing (this is a binary classification task), no matter how many epochs have passed. Secondly, the training loss barely improves. I set the learning rate to the default learning rate described by tensorflow's Adam Optimizer docs. What else am I missing here? I had some trouble with the input / output dimensions for the various layers - did I mess those up at all?

Solution

Some observations:

Use BCEWithLogitsLoss as loss on the output of the last linear layer, before the sigmoid. This includes the sigmoid activation in a more numerically stable fashion.
The TensorFlow model has a ReLU after the convolution; the PyTorch implementation does not.

In general, for debugging, one might want to look at weight.grad of some of your weights after the loss.backward() and see whether gradients are actually being calculated. Also, printing out the value of one of the weights in each iteration to see if your optimizer actually changes the weights can help...

Also, it can depend on the input data: (Are you sure that x_test is scaled correctly?) If you are transforming your inputs to Long before embedding them and all x_test, for example, are floats between 0 and 1, they will all be converted to 0! And the network will have a hard time predicting the labels from all zeros as constant input!

But now to the actual issue in this particular case: Be careful with .view! It might not do what you expect. It only reinterprets the tensor's existing memory with a new shape; it does not move the data around, so it is not a transpose. What you really want is .moveaxis(-1, -2) (equivalently .transpose(1, 2)) instead!!

        Loss    Accuracy    Val_Loss    Val_Acc
0   0.573489    0.671715    0.402601    0.819413
1   0.376908    0.830163    0.33786     0.850783
2   0.308343    0.868646    0.296171    0.872323
3   0.258806    0.893342    0.319121    0.865849
4   0.227044    0.907649    0.3172      0.868326
5   0.202789    0.918478    0.281184    0.886549
6   0.179744    0.928549    0.291027    0.886589
7   0.161205    0.93702     0.329196    0.879156
8   0.145447    0.944094    0.294914    0.889746
9   0.133034    0.949568    0.291476    0.889826

The results above were obtained after adding the ReLU after the convolution and, more importantly, fixing the view:

class MyModel(torch.nn.Module):
    """1-D CNN binary text classifier (PyTorch port of the Keras model).

    Pipeline: embedding -> dropout -> Conv1d + ReLU -> global max pool ->
    linear -> dropout -> ReLU -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        input_len: nominal sequence length. Stored for reference only;
            ``forward`` does not depend on it.
        embedding_dims: size of each embedding vector (conv input channels).
        kernel_size: width of the 1-D convolution kernel.
        filters: number of convolution output channels.
        hidden_dims: width of the hidden dense layer.
    """

    def __init__(self, vocab_size=5000, input_len=400, embedding_dims=50, kernel_size=3, filters=250, hidden_dims=250):
        super().__init__()
        self.embedding_dims = embedding_dims
        self.input_len = input_len
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dims)
        self.dropout1 = torch.nn.Dropout(p=0.2)
        # padding=0 is the equivalent of Keras' padding='valid'.
        self.conv1d = torch.nn.Conv1d(in_channels=embedding_dims, out_channels=filters, kernel_size=kernel_size, padding=0, stride=1)
        self.pool = torch.nn.AdaptiveMaxPool1d(1)
        # The pooled conv output has `filters` features, so that (not
        # hidden_dims) is the input width of the first dense layer. The two
        # only coincide with the default arguments.
        self.linear1 = torch.nn.Linear(in_features=filters, out_features=hidden_dims)
        self.dropout2 = torch.nn.Dropout(p=0.2)
        self.activation = torch.nn.ReLU()
        self.output = torch.nn.Linear(in_features=hidden_dims, out_features=1)
        self.activation2 = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a batch of token ids to per-sample probabilities.

        Args:
            x: tensor of token ids, shape (batch, length); float inputs are
                cast to integer indices.

        Returns:
            Tensor of shape (batch, 1) with values in [0, 1].
        """
        # .long() changes only the dtype and keeps the tensor on its device,
        # unlike .type(torch.LongTensor), which is a CPU tensor type.
        x = self.dropout1(self.embedding(x.long()))
        # Embedding output is (batch, length, embedding_dims); Conv1d wants
        # (batch, channels, length), which needs a real axis swap, not view().
        x = self.activation(self.conv1d(x.moveaxis(-1, -2)))
        # Global max over the length axis, then drop the size-1 axis.
        x = self.pool(x).squeeze(-1)
        x = self.activation(self.dropout2(self.linear1(x)))
        return self.activation2(self.output(x))