Pytorch CNN input dimensions not matching

Im currently trying to implement a CNN for a Alpha Zero game player, using pytorch but im getting an error regarding matrices multiplication. My input consists of 3 channels with 10x10 matrices.

model = Net(10, 10**2+1)
print(summary(model,(3,10,10)))

giving me the following error:

RuntimeError                              Traceback (most recent call last)
<ipython-input-32-0a101a882eb1> in <cell line: 3>()
      1 model = Net(10, 10**2+1)
      2 
----> 3 print(summary(model,(3,10,10)))

9 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py in forward(self, input)
    112 
    113     def forward(self, input: Tensor) -> Tensor:
--> 114         return F.linear(input, self.weight, self.bias)
    115 
    116     def extra_repr(self) -> str:

RuntimeError: mat1 and mat2 shapes cannot be multiplied (40x10 and 2x101)

This is the current architecture:

def conv3x3(in_planes, out_planes):
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, padding=1)

def conv1x1(in_planes, out_planes):
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, padding=0)


class Net(nn.Module):
    def __init__(self, board_size, action_size, num_resBlocks=20, num_hidden=128):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initial convolution 
        self.startBlock = nn.Sequential(
            conv3x3(3, num_hidden),
            nn.BatchNorm2d(num_hidden),
            nn.ReLU()
        )
        
        # Loop of all 20 Residual Layers
        self.backBone = nn.ModuleList(
            [ResBlock(num_hidden) for i in range(num_resBlocks)]
        )
        
        
        # Outputs expected value of the state 
        self.valueHead = nn.Sequential(
            conv1x1(num_hidden, 1),
            nn.BatchNorm2d(1),
            nn.ReLU(),

            nn.Linear(in_features=1, out_features=num_hidden),
            nn.ReLU(),

            nn.Linear(in_features=num_hidden, out_features=1),
            nn.Tanh()
        )

    
        # Outputs the probabilities of each possible action 
        self.policyHead = nn.Sequential(
            conv1x1(num_hidden, 2),
            nn.BatchNorm2d(2),
            nn.ReLU(),
            nn.Linear(2, out_features=(action_size)),
            nn.Softmax(dim=1)
        )


        self.to(self.device)

    def forward(self, x):
        x = self.startBlock(x)

        for resBlock in self.backBone:
            x = resBlock(x)

        policy = self.policyHead(x)
        value = self.valueHead(x)

        return policy, value
    

class ResBlock(nn.Module):
    def __init__(self, num_hidden):
        super().__init__()
        self.conv1 = conv3x3(num_hidden, num_hidden)
        self.bn1 = nn.BatchNorm2d(num_hidden)
        self.conv2 = conv3x3(num_hidden, num_hidden)
        self.bn2 = nn.BatchNorm2d(num_hidden)
        self.relu = nn.ReLU()

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # Skip connections
        out = self.relu(out + identity)

        return out

Thank you so much for your help, i really appreciate it!

Solution

The output of your residual blocks in backBone is a 4D tensor shaped (N, 128, h, w). However, a linear layer in PyTorch works with inputs (*, H_in) and outputs (*, H_out). So in valueHead and policyHead the input of the first linear layer (nn.Linear(in_features=1, out_features=num_hidden) for the value head and nn.Linear(2, out_features=(action_size)) for the policy head) need to be permuted from (B, 128, h, w) to (B, h, w, 128).