Search code examples
pythonconv-neural-networktorchtorchvision

Why is test accuracy much higher than train accuracy while training a CNN?


Python 3.9.5 torch 1.13.0+cu117 torchvision 0.14.0+cu117

I am currently training a Convolutional Neural Network (CNN) for an image classification task. I have observed that during the training process, the test accuracy consistently surpasses the train accuracy, which is contrary to what is expected. The network is trained on the MNIST dataset. Here are my training results:


epoch=1, train loss=0.8197974562644958, train acc=0.7494, test loss=0.1455492526292801, test acc=0.9616

epoch=2, train loss=0.7107925415039062, train acc=0.7788333333333334, test loss=0.1208220049738884, test acc=0.9689

epoch=3, train loss=0.6579669713973999, train acc=0.7906666666666666, test loss=0.11497163027524948, test acc=0.9676

epoch=4, train loss=0.6305248141288757, train acc=0.7994333333333333, test loss=0.10593992471694946, test acc=0.97

epoch=5, train loss=0.5982099771499634, train acc=0.80585, test loss=0.09132635593414307, test acc=0.9714

epoch=6, train loss=0.5825754404067993, train acc=0.8125333333333333, test loss=0.09170813113451004, test acc=0.9723

epoch=7, train loss=0.5688086748123169, train acc=0.8155166666666667, test loss=0.08628570288419724, test acc=0.9737

epoch=8, train loss=0.5556393265724182, train acc=0.8193166666666667, test loss=0.08203426003456116, test acc=0.9762

epoch=9, train loss=0.546567976474762, train acc=0.8213833333333334, test loss=0.08405696600675583, test acc=0.9754

epoch=10, train loss=0.5374698638916016, train acc=0.8239333333333333, test loss=0.07133891433477402, test acc=0.9788

epoch=11, train loss=0.5179286599159241, train acc=0.82975, test loss=0.0744888037443161, test acc=0.9792

epoch=12, train loss=0.5131004452705383, train acc=0.8329, test loss=0.07630482316017151, test acc=0.9778

epoch=14, train loss=0.49787914752960205, train acc=0.8366666666666667, test loss=0.07209591567516327, test acc=0.9779

epoch=15, train loss=0.4968840777873993, train acc=0.83475, test loss=0.07035819441080093, test acc=0.9801

epoch=16, train loss=0.4877821207046509, train acc=0.83925, test loss=0.07009950280189514, test acc=0.9777

epoch=17, train loss=0.48330068588256836, train acc=0.84045, test loss=0.06527410447597504, test acc=0.9809

epoch=18, train loss=0.48005640506744385, train acc=0.8404166666666667, test loss=0.06624794006347656, test acc=0.9781

epoch=19, train loss=0.47614845633506775, train acc=0.8418833333333333, test loss=0.07185563445091248, test acc=0.9788

training code:

from pathlib import Path

import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from CNN import CNNmodel

SEED = 5
device = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16
data_root = Path("data/")

# Seed the CPU and CUDA random number generators.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# NOTE: random augmentation is applied to the training images only. Any metric
# computed on `train_dataloader` is therefore measured on augmented inputs,
# while `test_dataloader` yields unaugmented images, so train and test numbers
# are not directly comparable.
train_transform = transforms.Compose([
    transforms.TrivialAugmentWide(num_magnitude_bins=8),
    transforms.ToTensor(),
])

test_transform = transforms.ToTensor()

train_data = datasets.MNIST(
    root=data_root / "train",
    train=True,
    download=True,
    transform=train_transform,
)

test_data = datasets.MNIST(
    root=data_root / "test",
    train=False,
    download=True,
    transform=test_transform,
)

train_dataloader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

test_dataloader = DataLoader(
    test_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# First dimension of the first transformed training image; passed to the model
# as its number of input channels.
channel_num = train_data[0][0].shape[0]
model = CNNmodel(
    in_shape=channel_num,
    hidden_shape=8,
    out_shape=len(train_data.classes),
).to(device)
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()
epochs = 20

def train_step(dataloader, loss_fn, optimizer, model, device):
train_loss = 0
train_acc = 0

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
    
        y_pred = model(X)
    
        loss = loss_fn(y_pred, y)
        train_loss += loss
        
        optimizer.zero_grad()
    
        loss.backward()
    
        optimizer.step()
    
        y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
        train_acc += (y_pred_class == y).sum().item()/len(y_pred)
    
    train_loss /= len(dataloader)
    train_acc /= len(dataloader)
    
    return (train_loss, train_acc)

    def test_step(dataloader, loss_fn, model, device):
test_loss = 0
test_acc = 0

    with torch.inference_mode():
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
    
            y_pred = model(X)
    
            loss = loss_fn(y_pred, y)
            test_loss += loss
    
            y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
            test_acc += (y_pred_class == y).sum().item()/len(y_pred)
        
        test_loss /= len(dataloader)
        test_acc /= len(dataloader)
    
    return (test_loss, test_acc)

# Alternate one training pass and one evaluation pass per epoch, printing the
# metrics returned by each step.
for epoch in range(epochs):
    train_loss, train_acc = train_step(
        dataloader=train_dataloader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        model=model,
        device=device,
    )

    test_loss, test_acc = test_step(
        dataloader=test_dataloader,
        loss_fn=loss_fn,
        model=model,
        device=device,
    )

    torch.cuda.empty_cache()
    print(f"epoch={epoch}, train loss={train_loss}, train acc={train_acc}, test loss={test_loss}, test acc={test_acc}\n")

And here is my model architecture:

class CNNmodel(nn.Module):
    """Two convolutional blocks followed by a single linear classifier.

    Each block is Conv-ReLU-Conv-ReLU-MaxPool. The 3x3 convolutions use
    ``padding=1`` and ``stride=1`` and so keep the spatial size; each 2x2
    max-pool halves it (rounding down).

    Args:
        in_shape: Number of channels of the input images.
        hidden_shape: Number of channels produced by every convolution.
        out_shape: Number of output scores (classes).
        image_size: Spatial size of the input images, either a single int for
            square images or a ``(height, width)`` pair. Defaults to 28, which
            yields the ``hidden_shape * 7 * 7`` classifier input.
    """

    def __init__(self, in_shape, hidden_shape, out_shape, image_size=28) -> None:
        super().__init__()
        if isinstance(image_size, int):
            height = width = image_size
        else:
            height, width = image_size

        self.conv_block_1 = self._conv_block(in_shape, hidden_shape)
        self.conv_block_2 = self._conv_block(hidden_shape, hidden_shape)

        # Two 2x2 max-pools shrink each spatial dimension by floor division,
        # once per block.
        pooled_height = height // 2 // 2
        pooled_width = width // 2 // 2
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(
                in_features=hidden_shape * pooled_height * pooled_width,
                out_features=out_shape,
            ),
        )

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one Conv-ReLU-Conv-ReLU-MaxPool block."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=out_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def forward(self, x):
        """Return the classifier scores for a batch of images ``x``."""
        return self.classifier(self.conv_block_2(self.conv_block_1(x)))

I thought the problem was in how the dataset is set up, but I couldn't find anything.


Solution

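  • I fixed this problem by setting the batch size to 32. (Note that in the code above the training metrics are computed on images augmented by `TrivialAugmentWide`, while the test metrics are computed on unaugmented images, which also lowers train accuracy relative to test accuracy.)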