Tags: python, deep-learning, pytorch, conv-neural-network, huggingface-transformers

The output of the PyTorch DL network doesn't match the last layer provided in the network


I am trying to build a PyTorch model that predicts the rank of a text, where the output is a float between 0 and 1.

My input details are:

  1. My batch size is 32.
  2. Max length for the tokenizer is 116
  3. In addition to the IDs and masks generated by the tokenizer, I am adding 11 values generated through preprocessing of the input text.

So the entire input shape would be 32 for the batch dimension, with an array of 127 items for each sample text.
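
For reference, here is a minimal sketch of how one sample's 127-item input ends up being assembled (a rough illustration only; the real values come from the tokenizer and the preprocessing step):

import torch as t

max_len = 116                               # tokenizer max length
extra_feats = t.ones(11, dtype=t.long)      # stand-in for the 11 preprocessed values
token_ids = t.zeros(max_len, dtype=t.long)  # stand-in for the tokenizer output
sample = t.cat([extra_feats, token_ids], 0)
print(sample.shape)                         # torch.Size([127])

batch = t.stack([sample] * 32)              # a batch of 32 such samples
print(batch.shape)                          # torch.Size([32, 127])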

My layers are as follows:

  1. A DistilBERT uncased transformer, with the DistilBERT tokenizer applied to the text.
  2. The following layer is a CNN that takes the output of the DistilBERT (127 channels) as input and provides 64 output channels, with a kernel size of 1.
  3. After this, 6 CNN layers, each with 64 input and 64 output channels, a kernel size of 3, and dilation increasing from 2 to 32. On top of each CNN there is a ReLU and a max pooling with a kernel size of 2.
  4. My last CNN layer (and where the issue is happening) has 64 input channels and 32 output channels with a kernel size of 1, followed by a ReLU and an AdaptiveMaxPool1d with an output size of 32.
  5. A linear layer that takes 32 and outputs 16.
  6. A linear layer that takes 16 and outputs 1.

Below is my code:

import torch as t
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import DistilBertModel, DistilBertTokenizer


class Dataset(Dataset):
    def __init__(self, df, max_len, bert_model_name, multi=1):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            bert_model_name, 
            do_lower_case=True,
            strip_accents=True,
            wordpieces_prefix=None,
            use_fast=True
        )
        self.multiplier = multi

    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        
        return (
            t.LongTensor(t.cat([
                t.LongTensor([
                    row.n_total_cells * self.multiplier, 
                    row.n_code_cells * self.multiplier,
                    row.n_markdown_cells * self.multiplier,
                    row.word_counts * self.multiplier,
                    row.line_counts * self.multiplier,
                    row.empty_line_counts * self.multiplier,
                    row.full_lines_count * self.multiplier,
                    row.text_lines_count * self.multiplier,
                    row.tag_lines_count * self.multiplier,
                    row.weight * self.multiplier,
                    row.weight_counts * self.multiplier,
                ]), 
                t.LongTensor(inputs['input_ids']),
            ], 0)), 
            
            t.LongTensor(t.cat([
                t.ones(11, dtype=t.long),
                t.LongTensor(inputs['attention_mask']),
            ], 0)),
        )
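
As a quick check of the Dataset output shapes, here is a hypothetical usage sketch; the dataframe values and the "distilbert-base-uncased" checkpoint name are placeholders, not my real data:

import pandas as pd

# Hypothetical single-row dataframe with the columns __getitem__ expects
df = pd.DataFrame([{
    "source": "print('hello world')",
    "n_total_cells": 1, "n_code_cells": 1, "n_markdown_cells": 0,
    "word_counts": 3, "line_counts": 1, "empty_line_counts": 0,
    "full_lines_count": 1, "text_lines_count": 0, "tag_lines_count": 0,
    "weight": 1, "weight_counts": 1,
}])

ds = Dataset(df, max_len=116, bert_model_name="distilbert-base-uncased")
ids, masks = ds[0]
print(ids.shape, masks.shape)   # torch.Size([127]) torch.Size([127])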

class BModel(nn.Module):
    def __init__(self, bert_model_name):
        super(BModel, self).__init__()
        self.distill_bert = DistilBertModel.from_pretrained(bert_model_name)       

        self.hidden_size = self.distill_bert.config.hidden_size
        print(self.hidden_size) # 768
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        
        self.cnn_layers()

    def forward(self, inputs):
        dbert = self.cnn_forward(inputs[0], inputs[1])
        return dbert

    def cnn_layers(self):
        self.layers = 4
        kernel_size = 3
        inp = 127
        out = 32
        grades = [2, 4, 8, 16, 32, 64, ]
        
        self.convs = nn.ModuleList()
        self.relus = nn.ModuleList()
        self.maxs = nn.ModuleList()
        self.norms = nn.ModuleList()
        
        self.start_conv = nn.Conv1d(
            in_channels=inp,
            out_channels=64,
            kernel_size=1,
            bias=True
        )
        
        for i in range(self.layers):
            # dilated convolutions
            self.convs.append(nn.Conv1d(
                in_channels=64,
                out_channels=64,
                kernel_size = kernel_size,
                bias=False,
                dilation=grades[i]
            ))

            self.relus.append(nn.ReLU())

            self.maxs.append(nn.MaxPool1d(
                kernel_size=kernel_size-1,
            ))

            self.norms.append(nn.BatchNorm1d(
                num_features=64,
            ))


        self.end_conv = nn.Conv1d(
            in_channels=64,
            out_channels=out,
            kernel_size=1,
            bias=True
        )
        
        self.max_pool = nn.AdaptiveMaxPool1d(out)
        
        self.top1 = nn.Linear(out, 16) 
        self.top2 = nn.Linear(16, 1)
        
    def cnn_forward(self, ids, masks):
        x = self.distill_bert(ids, masks)[0]
        x = self.relu(x)
        x = self.dropout(x)
        print(f"X size after BERT:", x.size())
        
        x = self.start_conv(x)
        print(f"X size after First Conv:", x.size())
        for i in range(self.layers):
            x = self.norms[i](self.maxs[i](self.relus[i](self.convs[i](x))))
            print(f"X size after {i} CNN dilation:", x.size())
            
        x = self.max_pool(t.abs(self.end_conv(x)))
        print("X size after AdaptiveMaxPool1d:", x.size())
        
        x = self.top1(x)
        print("X size after before-last linear:", x.size())
        
        x = self.top2(x)
        print("X size after last linear:", x.size())
        return x
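
To reproduce the shape prints below without the full training pipeline, a dummy forward pass like the following should work (random token ids, shapes only; "distilbert-base-uncased" is assumed as the checkpoint):

model = BModel("distilbert-base-uncased")
model.eval()

ids = t.randint(0, model.distill_bert.config.vocab_size, (32, 127))
masks = t.ones(32, 127, dtype=t.long)

with t.no_grad():
    out = model((ids, masks))
print(out.shape)   # torch.Size([32, 32, 1]) -- three dimensions instead of two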

Printing the output size after each layer gives the following:

X size after First Conv: torch.Size([32, 64, 768])
X size after 0 CNN dilation: torch.Size([32, 64, 382])
X size after 1 CNN dilation: torch.Size([32, 64, 187])
X size after 2 CNN dilation: torch.Size([32, 64, 85])
X size after 3 CNN dilation: torch.Size([32, 64, 26])
X size after AdaptiveMaxPool1d: torch.Size([32, 32, 32])
X size after before-last linear: torch.Size([32, 32, 16])
X size after last linear: torch.Size([32, 32, 1])
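
The shrinking sequence length follows directly from the Conv1d and MaxPool1d length formulas (stride 1 and no padding for the convolutions, kernel size 2 for the pooling):

# Conv1d: L_out = L_in - dilation * (kernel_size - 1); MaxPool1d(2): L_out = L_in // 2
L = 768
for dilation in [2, 4, 8, 16]:
    L = (L - dilation * (3 - 1)) // 2
    print(L)   # 382, 187, 85, 26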

The issue I am facing is that, after the AdaptiveMaxPool1d, the output of this layer is supposed to have 2 dimensions instead of 3: [32, 32] instead of [32, 32, 32].

The output of AdaptiveMaxPool1d still fits into the linear layers, but it carries one extra dimension, causing the predicted output to differ in shape from the true targets.

When I check the prediction size vs. the true size, I get:

y_pred shape (12480,)
y_val shape (390,)
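
Note that 12480 is exactly 390 × 32, which is what you would get if the (batch, 32, 1) model outputs were flattened into a single prediction vector (a guess at how y_pred is collected in the training loop, which is not shown here):

import numpy as np

fake_preds = np.zeros((390, 32, 1))      # 390 validation samples, 32 values each
print(fake_preds.reshape(-1).shape)      # (12480,)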

and the code blows up with this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [13], in <cell line: 21>()
     17 # print(mkdn_train_loader, mkdn_val_loader)
     18 
     19 ########################################################################################################################

File E:\KAGGLE_COMP\pt_model.py:796, in train(model, train_loader, val_loader, epochs, patience, path)
    793 print('y_val shape', y_val.shape)
    794 print(y_pred[:10])
--> 796 print("Validation MSE:", np.round(mean_squared_error(y_val, y_pred), 4))
    797 print()
    799 early_stopping(np.round(mean_squared_error(y_val, y_pred), 4), model)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:438, in mean_squared_error(y_true, y_pred, sample_weight, multioutput, squared)
    378 def mean_squared_error(
    379     y_true, y_pred, *, sample_weight=None, multioutput="uniform_average", squared=True
    380 ):
    381     """Mean squared error regression loss.
    382 
    383     Read more in the :ref:`User Guide <mean_squared_error>`.
   (...)
    436     0.825...
    437     """
--> 438     y_type, y_true, y_pred, multioutput = _check_reg_targets(
    439         y_true, y_pred, multioutput
    440     )
    441     check_consistent_length(y_true, y_pred, sample_weight)
    442     output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:94, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
     60 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
     61     """Check that y_true and y_pred belong to the same regression task.
     62 
     63     Parameters
   (...)
     92         the dtype argument passed to check_array.
     93     """
---> 94     check_consistent_length(y_true, y_pred)
     95     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
     96     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:332, in check_consistent_length(*arrays)
    330 uniques = np.unique(lengths)
    331 if len(uniques) > 1:
--> 332     raise ValueError(
    333         "Found input variables with inconsistent numbers of samples: %r"
    334         % [int(l) for l in lengths]
    335     )

ValueError: Found input variables with inconsistent numbers of samples: [390, 12480]

I need to know what I must change to make this run so that the output is passed with the correct shape.


Solution

  • From the AdaptiveMaxPool1d documentation: if the input has shape (N, C, L_in), then the output will have shape (N, C, L_out).

    Since your input to the AdaptiveMaxPool1d has shape (32, 32, 26) and you've set output_size to 32 (the value of the "out" variable), your output shape comes out as (32, 32, 32). I suggest setting output_size to 1 and using squeeze(2) to drop that dimension. Something like this:

    # For initialization of maxpool layer.
    nn.AdaptiveMaxPool1d(1)
    # ---------
    # In forward add squeeze(2) after max_pool like this:
    x = self.max_pool(t.abs(self.end_conv(x))).squeeze(2)
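
    With that change the shapes become (32, 32) after the pool, (32, 16) after top1 and (32, 1) after top2, so each sample produces a single score. A rough sanity check (the final squeeze before computing the MSE is an assumption about how y_pred is collected):

    out = model((ids, masks))        # now torch.Size([32, 1])
    y_pred_batch = out.squeeze(1)    # torch.Size([32]) -- one prediction per sample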