Search code examples
pytorchfeature-extractionimage-compression

Pytorch: The size of tensor a (24) must match the size of tensor b (48) at non-singleton dimension 3


Below code works fine and generate proper results.

import torch
import torch.nn as nn
import torch.nn.functional as F

from modules import ConvLSTMCell, Sign


class EncoderCell(nn.Module):
    def __init__(self):
        super(EncoderCell, self).__init__()

        self.conv = nn.Conv2d(
            3, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.rnn1 = ConvLSTMCell(
            64,
            256,
            kernel_size=3,
            stride=2,
            padding=1,
            hidden_kernel_size=1,
            bias=False)
        self.rnn2 = ConvLSTMCell(
            256,
            512,
            kernel_size=3,
            stride=2,
            padding=1,
            hidden_kernel_size=1,
            bias=False)
        self.rnn3 = ConvLSTMCell(
            512,
            512,
            kernel_size=3,
            stride=2,
            padding=1,
            hidden_kernel_size=1,
            bias=False)

    def forward(self, input, hidden1, hidden2, hidden3):
        x = self.conv(input)

        hidden1 = self.rnn1(x, hidden1)
        x = hidden1[0]

        hidden2 = self.rnn2(x, hidden2)
        x = hidden2[0]

        hidden3 = self.rnn3(x, hidden3)
        x = hidden3[0]

        return x, hidden1, hidden2, hidden3


class Binarizer(nn.Module):
    def __init__(self):
        super(Binarizer, self).__init__()
        self.conv = nn.Conv2d(512, 32, kernel_size=1, bias=False)
        self.sign = Sign()

    def forward(self, input):
        feat = self.conv(input)
        x = F.tanh(feat)
        return self.sign(x)


class DecoderCell(nn.Module):
    def __init__(self):
        super(DecoderCell, self).__init__()

        self.conv1 = nn.Conv2d(
            32, 512, kernel_size=1, stride=1, padding=0, bias=False)
        self.rnn1 = ConvLSTMCell(
            512,
            512,
            kernel_size=3,
            stride=1,
            padding=1,
            hidden_kernel_size=1,
            bias=False)
        self.rnn2 = ConvLSTMCell(
            128,
            512,
            kernel_size=3,
            stride=1,
            padding=1,
            hidden_kernel_size=1,
            bias=False)
        self.rnn3 = ConvLSTMCell(
            128,
            256,
            kernel_size=3,
            stride=1,
            padding=1,
            hidden_kernel_size=3,
            bias=False)
        self.rnn4 = ConvLSTMCell(
            64,
            128,
            kernel_size=3,
            stride=1,
            padding=1,
            hidden_kernel_size=3,
            bias=False)
        self.conv2 = nn.Conv2d(
            32, 3, kernel_size=1, stride=1, padding=0, bias=False)

    def forward(self, input, hidden1, hidden2, hidden3, hidden4):
        x = self.conv1(input)

        hidden1 = self.rnn1(x, hidden1)
        x = hidden1[0]
        x = F.pixel_shuffle(x, 2)

        hidden2 = self.rnn2(x, hidden2)
        x = hidden2[0]
        x = F.pixel_shuffle(x, 2)

        hidden3 = self.rnn3(x, hidden3)
        x = hidden3[0]
        x = F.pixel_shuffle(x, 2)

        hidden4 = self.rnn4(x, hidden4)
        x = hidden4[0]
        x = F.pixel_shuffle(x, 2)

        x = F.tanh(self.conv2(x)) / 2
        return x, hidden1, hidden2, hidden3, hidden4

Now i have changed in self.con and add pretrained resent with layer. Now it shows tensor mismatched error after training. All things are same just add this line in code. I put ** in those line

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models


from modules import ConvLSTMCell, Sign


class EncoderCell(nn.Module):
    def __init__(self):
        super(EncoderCell, self).__init__()

        #self.conv = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False)

        **resConv = models.resnet50(pretrained=True)
        resConv.layer4 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.conv = resConv.layer4**


        self.rnn1 = ConvLSTMCell(
            64,
            256,
            kernel_size=3,
            stride=2,
            padding=1,
            hidden_kernel_size=1,
            bias=False)
        self.rnn2 = ConvLSTMCell(
            256,
            512,
            kernel_size=3,
            stride=2,
            padding=1,
            hidden_kernel_size=1,
            bias=False)
        self.rnn3 = ConvLSTMCell(
            512,
            512,
            kernel_size=3,
            stride=2,
            padding=1,
            hidden_kernel_size=1,
            bias=False)

    def forward(self, input, hidden1, hidden2, hidden3):

        x = self.conv(input)

        hidden1 = self.rnn1(x, hidden1)
        x = hidden1[0]

        hidden2 = self.rnn2(x, hidden2)
        x = hidden2[0]

        hidden3 = self.rnn3(x, hidden3)
        x = hidden3[0]

        return x, hidden1, hidden2, hidden3


class Binarizer(nn.Module):
    def __init__(self):
        super(Binarizer, self).__init__()
        self.conv = nn.Conv2d(512, 32, kernel_size=1, bias=False)
        self.sign = Sign()

    def forward(self, input):
        feat = self.conv(input)
        x = F.tanh(feat)
        return self.sign(x)


class DecoderCell(nn.Module):
    def __init__(self):
        super(DecoderCell, self).__init__()

        **resConv = models.resnet50(pretrained=True)
        resConv.layer4 = nn.Conv2d(32, 512, kernel_size=3, stride=2, padding=1, bias=False)
        self.conv1 = resConv.layer4**

        self.rnn1 = ConvLSTMCell(
            512,
            512,
            kernel_size=3,
            stride=1,
            padding=1,
            hidden_kernel_size=1,
            bias=False)
        self.rnn2 = ConvLSTMCell(
            128,
            512,
            kernel_size=3,
            stride=1,
            padding=1,
            hidden_kernel_size=1,
            bias=False)
        self.rnn3 = ConvLSTMCell(
            128,
            256,
            kernel_size=3,
            stride=1,
            padding=1,
            hidden_kernel_size=3,
            bias=False)
        self.rnn4 = ConvLSTMCell(
            64,
            128,
            kernel_size=3,
            stride=1,
            padding=1,
            hidden_kernel_size=3,
            bias=False)

        **resConv2 = models.resnet50(pretrained=True)
        resConv2.layer4 = nn.Conv2d(32, 3, kernel_size=1, stride=1, padding=0, bias=False)
        self.conv2 = resConv2.layer4**

    def forward(self, input, hidden1, hidden2, hidden3, hidden4):
        x = self.conv1(input)

        hidden1 = self.rnn1(x, hidden1)
        x = hidden1[0]
        x = F.pixel_shuffle(x, 2)

        hidden2 = self.rnn2(x, hidden2)
        x = hidden2[0]
        x = F.pixel_shuffle(x, 2)

        hidden3 = self.rnn3(x, hidden3)
        x = hidden3[0]
        x = F.pixel_shuffle(x, 2)

        hidden4 = self.rnn4(x, hidden4)
        x = hidden4[0]
        x = F.pixel_shuffle(x, 2)

        x = F.tanh(self.conv2(x)) / 2
        return x, hidden1, hidden2, hidden3, hidden4

Solution

  • You are doing it a wrong way, some explanation is,

        **resConv = models.resnet50(pretrained=True) # you are reading a model
    

    now you are replacing the layer in that model with newly initialized layer. Secondly, layer4 in resnet50 is a sequential block containing multiple layers. Use print to see exact the layers in model.

        resConv.layer4 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False)
    

    here you are using new layer.

    self.conv = resConv.layer4**
    

    As per your query regarding usage of pretrained layer, you should do it like this,

    resConv = models.resnet50(pretrained=True)
    print(resConv) #see the layer which you want to use
    self.conv = resConv.conv1 # replace conv1 with that layer
    # note: conv1 is the name of first conv layer in resnet
    

    To add to this, I would also recommend acquiring and adding this layer (or the weights and biases) outside of the object initialization. Something like:

    enc = EncoderCell()
    resnet50 = models.resnet50(pretrained=True)
    

    and then either

    enc.conv = resnet50.conv1
    

    or more ideally

    enc.conv.load_state_dict(resnet50.layer1.state_dict())
    

    The reason being, calling state_dict() on a nn.Module class creates a clone of the parameters (weights and biases in this case) which can be loaded via nn.Module.load_state_dict() method as long as the two instances of nn.Module share the same shape. So you get the pretrained weights and they are completely detached from the pretrained model. Then you can get rid of the pretrained model since it could be rather large in memory.

    del resnet50