My code:
from transformers import GPT2Config, GPT2Model
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
print(model)
Here is the output of the console, listing the model architecture:
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
I want to remove the first layer:
(wte): Embedding(50257, 768)
I've tried the following approach (adapted from a BERT example):
import copy
from torch import nn

def deleteEncodingLayers(model, num_layers_to_keep):  # must pass in the full BERT model
    oldModuleList = model.bert.encoder.layer
    newModuleList = nn.ModuleList()
    # Iterate over all layers, keeping only the first num_layers_to_keep of them.
    for i in range(0, num_layers_to_keep):
        newModuleList.append(oldModuleList[i])
    # Create a copy of the model, swap in the new list, and return it.
    copyOfModel = copy.deepcopy(model)
    copyOfModel.bert.encoder.layer = newModuleList
    return copyOfModel
But it didn't work (GPT-2 has no bert.encoder.layer attribute). Does anyone know how to do this?
Try subclassing GPT2Model to bypass the embedding layer:
class GPT2WithoutWTE(GPT2Model):
    def __init__(self, config):
        super().__init__(config)
        # Remove the word token embedding layer
        del self.wte

    def forward(
        self,
        inputs_embeds,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        # Bypass the embedding layer and use inputs_embeds directly
        return super().forward(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
For the input embeddings, you can use something like the following:
inputs_embeds = torch.rand(1, 10, config.n_embd)
Load the GPT-2 config, pass it to the class, and then feed inputs_embeds to the new model.
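Putting it together, here is a minimal usage sketch, assuming the GPT2WithoutWTE class above is in scope. Note that building the model from the config alone gives randomly initialized weights, not the pretrained GPT-2 weights:

import torch
from transformers import GPT2Config

# Load the standard GPT-2 configuration and build the embedding-free model.
# Instantiating from the config gives fresh (random) weights.
config = GPT2Config.from_pretrained("openai-community/gpt2")
model = GPT2WithoutWTE(config)

# Precomputed embeddings: batch size 1, sequence length 10, hidden size n_embd (768).
inputs_embeds = torch.rand(1, 10, config.n_embd)

outputs = model(inputs_embeds=inputs_embeds)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 10, 768])

If you want embeddings for real tokens rather than random vectors, one option is to take them from the pretrained model's own embedding table, e.g. model.transformer.wte(input_ids) on the GPT2LMHeadModel loaded in the question, and pass the result as inputs_embeds.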