
Output from BERT into a CNN model


I am trying to combine a BERT model with a 1-D CNN using PyTorch. I used the code below, but I do not understand the meaning of in_channels and out_channels in nn.Conv1d when the input to the CNN model has shape torch.Size([256, 64, 768]).

import torch
import torch.nn as nn
from transformers import AutoModel

class MixModel(nn.Module):
    def __init__(self, pre_trained='distilbert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
        self.hidden_size = self.bert.config.hidden_size
        self.conv = nn.Conv1d(in_channels=1, out_channels=256, kernel_size=5, padding='valid', stride=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=256 - 5 + 1)
        self.dropout = nn.Dropout(0.3)
        self.clf = nn.Linear(self.hidden_size * 2, 6)

    def forward(self, inputs, mask, labels):
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        x = cls_hs
        # x = torch.cat(cls_hs[0])  # x = [416, 64, 768]
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = self.clf(x)
        return x
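
For reference, nn.Conv1d expects its input as (batch, in_channels, length), so the channel arguments map onto tensor dimensions as in this small standalone sketch (the numbers are illustrative, not taken from the model above):

import torch
import torch.nn as nn

# in_channels must equal the size of dimension 1 of the input;
# out_channels is the number of filters, i.e. the size of dimension 1 of the output.
conv = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=5)
x = torch.randn(2, 768, 64)   # (batch, in_channels, length)
print(conv(x).shape)          # torch.Size([2, 256, 60]): (batch, out_channels, 64 - 5 + 1)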

Edit: I used the recommended answer and changed the parameters, but I got an error.

class MixModel(nn.Module):
    def __init__(self, pre_trained='bert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
        self.hidden_size = self.bert.config.hidden_size
        self.conv = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=5, padding='valid', stride=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=64 - 5 + 1)
        self.dropout = nn.Dropout(0.3)
        self.clf = nn.Linear(self.hidden_size * 2, 6)

    def forward(self, inputs, mask, labels):
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        x = cls_hs[0]
        print(cls_hs[0])
        print(len(cls_hs[0]))
        print(cls_hs[0].size())
        # x = torch.cat(cls_hs, 0)  # x = [416, 64, 768]
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = self.clf(x)
        return x

The error is:

5 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
   1846     if has_torch_function_variadic(input, weight, bias):
   1847         return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias)
-> 1848     return torch._C._nn.linear(input, weight, bias)
   1849
   1850

RuntimeError: mat1 and mat2 shapes cannot be multiplied (65536x1 and 1536x6)
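
The mismatch can be reproduced by tracing the tensor shapes through the layers, as in this sketch (assuming batch 256, sequence length 64, and hidden size 768, as in the question):

import torch
import torch.nn as nn

x = torch.randn(256, 768, 64)              # permuted BERT output: (batch, hidden, seq-len)
x = nn.Conv1d(768, 256, kernel_size=5)(x)  # -> (256, 256, 60), since 64 - 5 + 1 = 60
x = nn.MaxPool1d(kernel_size=60)(x)        # -> (256, 256, 1): one value left per channel
# nn.Linear(768 * 2, 6) then sees 256 * 256 = 65536 rows of 1 feature each,
# while its weight expects 1536 input features -> the RuntimeError above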


Solution

  • The output of BERT (and many other transformer-based models) has shape batch × seq-len × feature-dim: that is, your input is a batch of 256 sequences of 64 tokens each (probably with padding), where each token is represented by a feature vector of dimension 768.

    In order to apply a 1-D convolution along the sequence-length dimension, you first need to permute x to shape batch × dim × len:

    x = x.permute(0, 2, 1)
    

    Now you can apply nn.Conv1d, where in_channels is the feature dimension of x, i.e. 768. out_channels is up to you: it will be the hidden dimension of your model (see the full sketch below).
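
Putting this together: with kernel_size=5 and no padding, the conv output has length 64 - 5 + 1 = 60, and max-pooling with kernel 60 leaves a single value per channel, so the classifier receives 256 features per example rather than the hidden_size*2 = 1536 that nn.Linear(self.hidden_size*2, 6) expects; that is exactly the shape mismatch in the error. Below is a minimal corrected sketch; note that sizing the classifier as nn.Linear(256, 6) is one way to resolve the mismatch (256 matches the conv's out_channels), an assumption on my part rather than something dictated by the original answer:

import torch
import torch.nn as nn
from transformers import AutoModel

class MixModel(nn.Module):
    def __init__(self, pre_trained='distilbert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pre_trained)
        hidden = self.bert.config.hidden_size                  # 768
        self.conv = nn.Conv1d(in_channels=hidden, out_channels=256,
                              kernel_size=5, padding='valid', stride=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=64 - 5 + 1)       # assumes seq-len fixed at 64
        self.dropout = nn.Dropout(0.3)
        self.clf = nn.Linear(256, 6)                           # 256 = conv out_channels

    def forward(self, inputs, mask, labels=None):              # labels kept for API parity; unused
        x = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)[0]
        x = x.permute(0, 2, 1)           # (batch, 768, seq-len)
        x = self.relu(self.conv(x))      # (batch, 256, seq-len - 4)
        x = self.pool(x)                 # (batch, 256, 1)
        x = self.dropout(x.squeeze(-1))  # (batch, 256)
        return self.clf(x)               # (batch, 6)

If the sequence length is not always 64, replacing the fixed pool with nn.AdaptiveMaxPool1d(1) avoids hard-coding the 64 - 5 + 1 kernel size.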