Tags: python, machine-learning, deep-learning, nlp, transformer-model

How to remove layers in Huggingface's transformers GPT2 pre-trained models?


My code:

from transformers import GPT2Config, GPT2Model
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
print(model)

Here is the output of the console, listing the model architecture:

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

I want to remove the first layer:

(wte): Embedding(50257, 768)

I've tried the following way:

import copy
import torch.nn as nn

def deleteEncodingLayers(model, num_layers_to_keep):  # must pass in the full BERT model
    oldModuleList = model.bert.encoder.layer
    newModuleList = nn.ModuleList()

    # Now iterate over all layers, keeping only the relevant ones.
    for i in range(num_layers_to_keep):
        newModuleList.append(oldModuleList[i])

    # Create a copy of the model, swap in the new layer list, and return it.
    copyOfModel = copy.deepcopy(model)
    copyOfModel.bert.encoder.layer = newModuleList

    return copyOfModel

But it didn't work, since the GPT-2 model has no model.bert.encoder.layer attribute. How can I fix it?


Solution

  • Subclass GPT2Model and bypass the embedding layer by feeding inputs_embeds directly:

    class GPT2WithoutWTE(GPT2Model):
        def __init__(self, config):
            super().__init__(config)
            # Remove the word token embedding layer
            del self.wte

        def forward(
            self,
            inputs_embeds,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            past_key_values=None,
            use_cache=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
        ):
            # Bypass the embedding layer and use inputs_embeds directly
            return super().forward(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
    

    For the input embeddings, you can use something like the following:

    import torch

    # shape: (batch_size, sequence_length, hidden_size)
    inputs_embeds = torch.rand(1, 10, config.n_embd)
    

    Load the GPT-2 config, pass it to the class, and then feed inputs_embeds to the new model, as in the sketch below.
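
    A minimal end-to-end sketch of those steps, assuming the GPT2WithoutWTE class above; the model here is randomly initialized from the config rather than loaded from pretrained weights:

    import torch
    from transformers import GPT2Config

    # Hypothetical usage of the GPT2WithoutWTE class defined above
    config = GPT2Config.from_pretrained("openai-community/gpt2")
    model = GPT2WithoutWTE(config)  # randomly initialized from the config

    # One sequence of 10 "token" embeddings of size n_embd (768 for GPT-2 small)
    inputs_embeds = torch.rand(1, 10, config.n_embd)

    outputs = model(inputs_embeds=inputs_embeds)
    print(outputs.last_hidden_state.shape)  # torch.Size([1, 10, 768])

    If the goal is instead to keep only some of the transformer blocks (as in the BERT-style attempt in the question), note that GPT-2 stores them in model.transformer.h for GPT2LMHeadModel (or model.h on a bare GPT2Model), so the same ModuleList-slicing idea applies to that attribute rather than model.bert.encoder.layer.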