python, nlp, pytorch, conv-neural-network

PyTorch ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([15]))


I'm currently using this repo to perform NLP and learn more about CNNs on my own dataset, and I keep running into an error about a shape mismatch:

ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([15]))

     10 }
     11 for epoch in tqdm(range(params['epochs'])):
---> 12     train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
     13     valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
     14     epoch_mins, epoch_secs = epoch_time(start_time, end_time)

     57         print("PredictionShapeAfter:")
     58         print(predictions.shape)
---> 59         loss = criterion(predictions, batch.l)
     60 
     61         acc = binary_accuracy(predictions, batch.l)

Doing some digging, I found that my CNN's prediction is a different size from the ground-truth labels it's being compared against:

  Input Shape:
    torch.Size([15, 64])
    Truth Shape:
    torch.Size([64])
    embedded unsqueezed: torch.Size([15, 1, 64, 100])
    cat shape: torch.Size([15, 300])
    Prediction Shape Before Squeeze:
    torch.Size([15, 1])
    PredictionShapeAfter:
    torch.Size([15])

The model's prediction size (the last value in this list) ends up matching the first dimension of the input ([15]) rather than the batch size ([64]). Is this a common problem, and is there a way to rectify it?
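Here is a minimal, self-contained reproduction of the mismatch (assuming an nn.BCEWithLogitsLoss criterion, which produces this exact error message when the prediction and target sizes differ):

import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()              # assumption: the criterion isn't shown in the snippets above
predictions = torch.randn(15)                   # one logit per sentence position, not per example
targets = torch.randint(0, 2, (64,)).float()    # one label per example in the batch

loss = criterion(predictions, targets)
# ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([15]))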

My Model:

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text): 
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        print(f"embedded unsqueezed: {embedded.shape}")
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]          
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim = 1))   
        print(f"cat shape: {cat.shape}")       
        return self.fc(cat)

My Training function:

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()

        print("InputShape:")
        print(batch.t.shape)
        print("Truth Shape:")
        print(batch.l.shape)

        predictions = model(batch.t)
        print("Prediction Shape Before Squeeze:")
        print(predictions.shape)

        predictions = predictions.squeeze(1)
        print("PredictionShapeAfter:")
        print(predictions.shape)
        loss = criterion(predictions, batch.l)
        
        acc = binary_accuracy(predictions, batch.l)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

My full code can be found at this link.


Solution

  • Your issue is here:

    self.convs = nn.ModuleList([
                                nn.Conv2d(in_channels = 1, 
                                          out_channels = n_filters, 
                                          kernel_size = (fs, embedding_dim)) 
                                for fs in filter_sizes
                                ])
    

    You are feeding in data of shape [15, 1, 64, 100], which the convolutions interpret as a batch of 15 single-channel images of height 64 and width 100 ([N, C, H, W]).

    What you appear to want is a batch size of 64, so swap those dimensions first:

    ...
    embedded = embedded.swapdims(0,2)
    conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs] 
    ...
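
    For clarity, here is a sketch of the whole forward pass with the shapes that result after the swap (assuming text arrives as [sentence length, batch size] = [15, 64] with embedding_dim = 100 and len(filter_sizes) * n_filters = 300, which is what the printed shapes above indicate):

    def forward(self, text):                    # text: [sent len, batch size] = [15, 64]
        embedded = self.embedding(text)         # [15, 64, 100]
        embedded = embedded.unsqueeze(1)        # [15, 1, 64, 100]
        embedded = embedded.swapdims(0, 2)      # [64, 1, 15, 100] -> [N, C, H, W]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]           # each [64, n_filters, 15 - fs + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]    # each [64, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))                                  # [64, 300]
        return self.fc(cat)                     # [64, 1]; squeeze(1) in train() gives [64] to match batch.l

    If your PyTorch version doesn't have swapdims, embedded.transpose(0, 2) does the same thing.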