pytorch

pytorch - input & model size mismatch error on grayscale image classification model


I have (650, 650) grayscale images with labels, and I am trying to build a PyTorch CNN model.

I think it is almost finished, but I get the error below:

RuntimeError: Given groups=1, weight of size [32, 1, 3, 3], expected input[1, 32, 650, 650] to have 1 channels, but got 32 channels instead
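
For reference, the weight size in the message is (out_channels, in_channels, kernel_h, kernel_w) and the input size is (batch, channels, height, width). A quick check with a throwaway layer (illustration only, not part of my model) reproduces the weight shape from the error:

import torch.nn as nn
conv = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
print(conv.weight.shape)  # torch.Size([32, 1, 3, 3]), the "weight of size" in the error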

My whole code is below:

import numpy as np
import pandas as pd
import torch
import torchvision
from torchvision import transforms
from torchvision.datasets import ImageFolder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.nn as nn
df = pd.DataFrame({"Img": [np.random.randint(0, 255, size=(650, 650)) for _ in range(100)], "label": [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 100})  # pseudo data (100 random samples so the splits below work); only the shape and dtype match my real data
df.Img=df.Img.apply(lambda x: x.astype(np.single))
X_train, X_test, y_train, y_test = train_test_split(df, df.label, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
X = torch.tensor(X_train['Img'].tolist(), dtype=torch.float32)
y = torch.tensor(X_train['label'].tolist())
dataset = TensorDataset(X, y)
val_dataset = TensorDataset(torch.tensor(X_val['Img'].tolist()), torch.tensor(X_val['label'].tolist()))
batch_size=32

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train_dl = dataloader
val_dl = val_dataloader

class ImageClassificationBase(nn.Module):
    
    def training_step(self, batch):
        images, labels = batch 
        out = self(images)                  # Generate predictions
        loss = F.cross_entropy(out, labels) # Calculate loss
        return loss
    
    def validation_step(self, batch):
        images, labels = batch 
        out = self(images)                    # Generate predictions
        loss = F.cross_entropy(out, labels)   # Calculate loss
        acc = accuracy(out, labels)           # Calculate accuracy
        return {'val_loss': loss.detach(), 'val_acc': acc}
        
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}
    
    def epoch_end(self, epoch, result):
        print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['train_loss'], result['val_loss'], result['val_acc']))

class Net(ImageClassificationBase):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Flatten(),
            nn.Linear(6561, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(512, 13)
        )
    
    def forward(self, xb):
        return self.network(xb)
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1)
    return torch.tensor(torch.sum(preds == labels).item() / len(preds))

  
@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

  
def fit(epochs, lr, model, train_loader, val_loader, opt_func = torch.optim.SGD):
    
    history = []
    optimizer = opt_func(model.parameters(),lr)
    for epoch in range(epochs):
        
        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch)
            train_losses.append(loss)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    
    return history
num_epochs = 30
opt_func = torch.optim.Adam
lr = 0.001
# fit the model on the training data and record the result after each epoch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Net().to(device)
model  # in a notebook, this bare expression displays the model architecture
history = fit(num_epochs, lr, model, train_dl, val_dl, opt_func)
# RuntimeError: Given groups=1, weight of size [32, 1, 3, 3], expected input[1, 32, 650, 650] to have 1 channels, but got 32 channels instead

I googled it and found a similar problem:

RuntimeError: Given groups=1, weight of size [64, 32, 3, 3], expected input[128, 64, 32, 32] to have 32 channels, but got 64 channels instead

The point is shape matching inside the model, but even knowing that, I cannot find where the problem is. I feel ashamed...


Solution

  • Your model expects the input to be of shape (bs, ch, h, w), which in this case would be (32, 1, 650, 650). Your actual input is of shape (1, 32, 650, 650).

    The reason this happens is that your images have shape (650, 650). The dataloader stacks them into (bs, 650, 650). nn.Conv2d expects a four-dimensional input, so it treats the three-dimensional batch as a single unbatched (channels, height, width) image and adds a unit batch axis, converting the input to (1, bs, 650, 650): one image with bs channels (see the sketch below).

    The solution is to add a unit channel axis to your images:

    np.random.randint(0, 255, size=(1, 650, 650))

    This will result in your dataloader outputs having shape (bs, 1, 650, 650).
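
    Below is a minimal sketch of both the failure and the fix, using random tensors that only copy your shapes (nothing else about your data is assumed):

    import torch
    import torch.nn as nn

    conv = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)

    # (650, 650) images stack to (bs, 650, 650); Conv2d reads a 3-dim tensor
    # as one unbatched (channels, height, width) image and unsqueezes it to
    # (1, bs, 650, 650), which triggers the channel mismatch.
    bad_batch = torch.randn(32, 650, 650)
    # conv(bad_batch)  # RuntimeError: ... expected input[1, 32, 650, 650] to have 1 channels

    # (1, 650, 650) images stack to (bs, 1, 650, 650), exactly what Conv2d expects.
    good_batch = torch.randn(32, 1, 650, 650)
    print(conv(good_batch).shape)  # torch.Size([32, 32, 650, 650])

    # A batch that was already stacked as (bs, 650, 650) can also be fixed directly:
    print(conv(bad_batch.unsqueeze(1)).shape)  # torch.Size([32, 32, 650, 650])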