Tags: python, deep-learning, pytorch, conv-neural-network, huggingface-transformers

The output of the PyTorch DL network doesn't match the last layer provided in the network


I am trying to build a PyTorch model that predicts the rank of a text, where the output is a float between 0 and 1.

My input details are:

  1. My batch size is 32.
  2. Max length for the tokenizer is 116
  3. In addition to the IDs and masks generated by the tokenizer, I am adding 11 values generated through preprocessing of the input text.

So the entire input shape would be 32 for the batch dimension, with an array of 127 items for each sample text.
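
For reference, here is a minimal sketch of how one sample's 127-item input ends up being assembled (a rough illustration only; the real values come from the tokenizer and the preprocessing step):

import torch as t

max_len = 116                               # tokenizer max length
extra_feats = t.ones(11, dtype=t.long)      # stand-in for the 11 preprocessed values
token_ids = t.zeros(max_len, dtype=t.long)  # stand-in for the tokenizer output
sample = t.cat([extra_feats, token_ids], 0)
print(sample.shape)                         # torch.Size([127])

batch = t.stack([sample] * 32)              # a batch of 32 such samples
print(batch.shape)                          # torch.Size([32, 127])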

My layers are as follows:

  1. A DistilBERT uncased transformer, with the DistilBERT tokenizer applied to the text.
  2. The following layer is a CNN that takes the output of the DistilBERT (127 channels) as input and provides 64 output channels, with a kernel size of 1.
  3. After this, 6 CNN layers, each with 64 input and 64 output channels, a kernel size of 3, and dilation increasing from 2 to 32. On top of each CNN there is a ReLU and a max pooling with a kernel size of 2.
  4. My last CNN layer (and where the issue is happening) has 64 input channels and 32 output channels with a kernel size of 1, followed by a ReLU and an AdaptiveMaxPool1d with an output size of 32.
  5. A linear layer that takes 32 and outputs 16.
  6. A linear layer that takes 16 and outputs 1.

Below is my code:

import torch as t
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import DistilBertModel, DistilBertTokenizer


class Dataset(Dataset):
    def __init__(self, df, max_len, bert_model_name, multi=1):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            bert_model_name, 
            do_lower_case=True,
            strip_accents=True,
            wordpieces_prefix=None,
            use_fast=True
        )
        self.multiplier = multi

    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        
        return (
            t.LongTensor(t.cat([
                t.LongTensor([
                    row.n_total_cells * self.multiplier, 
                    row.n_code_cells * self.multiplier,
                    row.n_markdown_cells * self.multiplier,
                    row.word_counts * self.multiplier,
                    row.line_counts * self.multiplier,
                    row.empty_line_counts * self.multiplier,
                    row.full_lines_count * self.multiplier,
                    row.text_lines_count * self.multiplier,
                    row.tag_lines_count * self.multiplier,
                    row.weight * self.multiplier,
                    row.weight_counts * self.multiplier,
                ]), 
                t.LongTensor(inputs['input_ids']),
            ], 0)), 
            
            t.LongTensor(t.cat([
                t.ones(11, dtype=t.long),
                t.LongTensor(inputs['attention_mask']),
            ], 0)),
        )
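
As a quick check of the Dataset output shapes, here is a hypothetical usage sketch; the dataframe values and the "distilbert-base-uncased" checkpoint name are placeholders, not my real data:

import pandas as pd

# Hypothetical single-row dataframe with the columns __getitem__ expects
df = pd.DataFrame([{
    "source": "print('hello world')",
    "n_total_cells": 1, "n_code_cells": 1, "n_markdown_cells": 0,
    "word_counts": 3, "line_counts": 1, "empty_line_counts": 0,
    "full_lines_count": 1, "text_lines_count": 0, "tag_lines_count": 0,
    "weight": 1, "weight_counts": 1,
}])

ds = Dataset(df, max_len=116, bert_model_name="distilbert-base-uncased")
ids, masks = ds[0]
print(ids.shape, masks.shape)   # torch.Size([127]) torch.Size([127])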

class BModel(nn.Module):
    def __init__(self, bert_model_name):
        super(BModel, self).__init__()
        self.distill_bert = DistilBertModel.from_pretrained(bert_model_name)       

        self.hidden_size = self.distill_bert.config.hidden_size
        print(self.hidden_size) # 768
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        
        self.cnn_layers()

    def forward(self, inputs):
        dbert = self.cnn_forward(inputs[0], inputs[1])
        return dbert

    def cnn_layers(self):
        self.layers = 4
        kernel_size = 3
        inp = 127
        out = 32
        grades = [2, 4, 8, 16, 32, 64, ]
        
        self.convs = nn.ModuleList()
        self.relus = nn.ModuleList()
        self.maxs = nn.ModuleList()
        self.norms = nn.ModuleList()
        
        self.start_conv = nn.Conv1d(
            in_channels=inp,
            out_channels=64,
            kernel_size=1,
            bias=True
        )
        
        for i in range(self.layers):
            # dilated convolutions
            self.convs.append(nn.Conv1d(
                in_channels=64,
                out_channels=64,
                kernel_size = kernel_size,
                bias=False,
                dilation=grades[i]
            ))

            self.relus.append(nn.ReLU())

            self.maxs.append(nn.MaxPool1d(
                kernel_size=kernel_size-1,
            ))

            self.norms.append(nn.BatchNorm1d(
                num_features=64,
            ))


        self.end_conv = nn.Conv1d(
            in_channels=64,
            out_channels=out,
            kernel_size=1,
            bias=True
        )
        
        self.max_pool = nn.AdaptiveMaxPool1d(out)
        
        self.top1 = nn.Linear(out, 16) 
        self.top2 = nn.Linear(16, 1)
        
    def cnn_forward(self, ids, masks):
        x = self.distill_bert(ids, masks)[0]
        x = self.relu(x)
        x = self.dropout(x)
        print(f"X size after BERT:", x.size())
        
        x = self.start_conv(x)
        print(f"X size after First Conv:", x.size())
        for i in range(self.layers):
            x = self.norms[i](self.maxs[i](self.relus[i](self.convs[i](x))))
            print(f"X size after {i} CNN dilation:", x.size())
            
        x = self.max_pool(t.abs(self.end_conv(x)))
        print("X size after AdaptiveMaxPool1d:", x.size())
        
        x = self.top1(x)
        print("X size after before-last linear:", x.size())
        
        x = self.top2(x)
        print("X size after last linear:", x.size())
        return x
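
To reproduce the shape prints below without the full training pipeline, a dummy forward pass like the following should work (random token ids, shapes only; "distilbert-base-uncased" is assumed as the checkpoint):

model = BModel("distilbert-base-uncased")
model.eval()

ids = t.randint(0, model.distill_bert.config.vocab_size, (32, 127))
masks = t.ones(32, 127, dtype=t.long)

with t.no_grad():
    out = model((ids, masks))
print(out.shape)   # torch.Size([32, 32, 1]) -- three dimensions instead of two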

Printing the output size after each layer gives the following:

X size after First Conv: torch.Size([32, 64, 768])
X size after 0 CNN dilation: torch.Size([32, 64, 382])
X size after 1 CNN dilation: torch.Size([32, 64, 187])
X size after 2 CNN dilation: torch.Size([32, 64, 85])
X size after 3 CNN dilation: torch.Size([32, 64, 26])
X size after AdaptiveMaxPool1d: torch.Size([32, 32, 32])
X size after before-last linear: torch.Size([32, 32, 16])
X size after last linear: torch.Size([32, 32, 1])
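
The shrinking sequence length follows directly from the Conv1d and MaxPool1d length formulas (stride 1 and no padding for the convolutions, kernel size 2 for the pooling):

# Conv1d: L_out = L_in - dilation * (kernel_size - 1); MaxPool1d(2): L_out = L_in // 2
L = 768
for dilation in [2, 4, 8, 16]:
    L = (L - dilation * (3 - 1)) // 2
    print(L)   # 382, 187, 85, 26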

The issue I am facing is that, after the AdaptiveMaxPool1d, the output of this layer is supposed to have 2 dimensions instead of 3: [32, 32] instead of [32, 32, 32].

The output of AdaptiveMaxPool1d still fits into the linear layers, but it carries one extra dimension, causing the predicted output to differ in shape from the true targets.

When I check the prediction size vs. the true size, I get:

y_pred shape (12480,)
y_val shape (390,)
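
Note that 12480 is exactly 390 × 32, which is what you would get if the (batch, 32, 1) model outputs were flattened into a single prediction vector (a guess at how y_pred is collected in the training loop, which is not shown here):

import numpy as np

fake_preds = np.zeros((390, 32, 1))      # 390 validation samples, 32 values each
print(fake_preds.reshape(-1).shape)      # (12480,)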

and the code blows up with this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [13], in <cell line: 21>()
     17 # print(mkdn_train_loader, mkdn_val_loader)
     18 
     19 ########################################################################################################################

File E:\KAGGLE_COMP\pt_model.py:796, in train(model, train_loader, val_loader, epochs, patience, path)
    793 print('y_val shape', y_val.shape)
    794 print(y_pred[:10])
--> 796 print("Validation MSE:", np.round(mean_squared_error(y_val, y_pred), 4))
    797 print()
    799 early_stopping(np.round(mean_squared_error(y_val, y_pred), 4), model)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:438, in mean_squared_error(y_true, y_pred, sample_weight, multioutput, squared)
    378 def mean_squared_error(
    379     y_true, y_pred, *, sample_weight=None, multioutput="uniform_average", squared=True
    380 ):
    381     """Mean squared error regression loss.
    382 
    383     Read more in the :ref:`User Guide <mean_squared_error>`.
   (...)
    436     0.825...
    437     """
--> 438     y_type, y_true, y_pred, multioutput = _check_reg_targets(
    439         y_true, y_pred, multioutput
    440     )
    441     check_consistent_length(y_true, y_pred, sample_weight)
    442     output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:94, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
     60 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
     61     """Check that y_true and y_pred belong to the same regression task.
     62 
     63     Parameters
   (...)
     92         the dtype argument passed to check_array.
     93     """
---> 94     check_consistent_length(y_true, y_pred)
     95     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
     96     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:332, in check_consistent_length(*arrays)
    330 uniques = np.unique(lengths)
    331 if len(uniques) > 1:
--> 332     raise ValueError(
    333         "Found input variables with inconsistent numbers of samples: %r"
    334         % [int(l) for l in lengths]
    335     )

ValueError: Found input variables with inconsistent numbers of samples: [390, 12480]

I need to know what I must change to make this run so that the output is passed with the correct shape.


Solution

  • From the AdaptiveMaxPool1d documentation: if the input has shape (N, C, L_in), then the output will have shape (N, C, L_out).

    Since your input to the AdaptiveMaxPool1d has shape (32, 32, 26) and you've set output_size to 32 (the value of the "out" variable), your output shape comes out as (32, 32, 32). I suggest setting output_size to 1 and using squeeze(2) to drop that dimension. Something like this:

    # For initialization of maxpool layer.
    nn.AdaptiveMaxPool1d(1)
    # ---------
    # In forward add squeeze(2) after max_pool like this:
    x = self.max_pool(t.abs(self.end_conv(x))).squeeze(2)
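
    With that change the shapes become (32, 32) after the pool, (32, 16) after top1 and (32, 1) after top2, so each sample produces a single score. A rough sanity check (the final squeeze before computing the MSE is an assumption about how y_pred is collected):

    out = model((ids, masks))        # now torch.Size([32, 1])
    y_pred_batch = out.squeeze(1)    # torch.Size([32]) -- one prediction per sample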