Search code examples
Tags: machine-learning, pytorch, nlp, huggingface-transformers, huggingface

Why does my transformer model have more parameters than the Huggingface implementation?


I'm loading a GPT model from huggingface as follows:

# Build a GPT-2 language model from a configuration and report its parameter count.
# NOTE: `tokenizer`, `context_length` and `device` are not defined in this
# snippet; they are assumed to have been created earlier.
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the "gpt2" configuration and override the vocabulary size,
# `n_ctx`, and the BOS/EOS token ids with values taken from the tokenizer
# and from `context_length`.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Instantiate the model directly from the config (no checkpoint is named here)
# and move it to `device`.
standard_gpt2 = GPT2LMHeadModel(config).to(device)
# Add up the element counts of every tensor yielded by `.parameters()`.
standard_gpt2_model_size = sum(t.numel() for t in standard_gpt2.parameters())
# Report the total in millions (divided by 1000**2, one decimal place).
print(f"GPT-2 size: {standard_gpt2_model_size/1000**2:.1f}M parameters")
# >>> GPT-2 size: 124.4M parameters

If I print the model architecture I get:

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

Focusing on the last layer -- lm_head, it has in_features=768, out_features=50257

So why do I get a different parameter count when I replace just that one layer with a new layer that has exactly the same number of parameters?

# Replace the output head with a newly constructed, bias-free Linear layer of
# the same shape as the printed lm_head (768 -> 50257), wrapped in nn.Sequential.
# NOTE: `nn` is not imported in this snippet (presumably `from torch import nn`).
standard_gpt2.lm_head = nn.Sequential(
    nn.Linear(in_features = 768, out_features = 50257, bias=False)
)
# Recount the parameters with the same formula as before, for comparison.
standard_gpt2_model_size = sum(t.numel() for t in standard_gpt2.parameters())
print(f"GPT-2 size: {standard_gpt2_model_size/1000**2:.1f}M parameters")
# >>> GPT-2 size: 163.0M parameters


Solution

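  • That is because the linear layer of lm_head doesn't have separate weights of its own: it shares its weight tensor with the token embedding layer (transformer.wte). You can confirm this with data_ptr(), which returns a memory address. In the code below it is called on each weight's untyped_storage(), so it returns the address of the underlying storage; two weights that report the same address share the same memory: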

    # Demonstration: print the parameter count and the storage addresses of the
    # token-embedding weight and the lm_head weight, first for the pretrained
    # model and then again after lm_head has been replaced.
    from torch import nn
    from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig
    
    model_id = "gpt2"
    
    # Loaded here, but `tokenizer` is not used again in this snippet.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    # Load the pretrained model and count the elements of every parameter tensor.
    standard_gpt2 = GPT2LMHeadModel.from_pretrained(model_id)
    standard_gpt2_model_size = sum(t.numel() for t in standard_gpt2.parameters())
    print(f"GPT-2 size: {standard_gpt2_model_size} parameters")
    
    # Address of the storage behind the token-embedding weight ...
    print(f"Token embedding layer address {standard_gpt2.transformer.wte.weight.untyped_storage().data_ptr()}")
    
    # ... and behind the lm_head weight; equal addresses mean shared memory.
    print(f"LM_head address {standard_gpt2.lm_head.weight.untyped_storage().data_ptr()}")
    
    # Replacing the default head
    # The new Linear layer is constructed with its own weight tensor.
    standard_gpt2.lm_head = nn.Linear(in_features = 768, out_features = 50257, bias=False)
    # Recount after the replacement to see the effect on the total.
    standard_gpt2_model_size = sum(t.numel() for t in standard_gpt2.parameters())
    print(f"GPT-2 size after replacing lm_head: {standard_gpt2_model_size} parameters")
    
    # Print both addresses again so they can be compared with the values above.
    print(f"Token embedding layer address after replacing lm_head {standard_gpt2.transformer.wte.weight.untyped_storage().data_ptr()}")
    
    print(f"LM_head address after replacing lm_head {standard_gpt2.lm_head.weight.untyped_storage().data_ptr()}")
    

    Output:

    GPT-2 size: 124439808 parameters
    Token embedding layer address: 96251233152832
    LM_head address: 96251233152832
    GPT-2 size after replacing lm_head: 163037184 parameters
    Token embedding layer address after replacing lm_head: 96251233152832
    LM_head address after replacing lm_head: 134800505946176
    

    I assume you want to keep sharing the weights. In that case, right after assigning your new head, point its weight at the token embedding's weight, for example:

    # Install a custom head: a bias-free Linear (768 -> 50257) wrapped in
    # nn.Sequential. Relies on `nn` and `standard_gpt2` from the previous snippet.
    standard_gpt2.lm_head = nn.Sequential(
        nn.Linear(in_features = 768, out_features = 50257, bias=False)
    )
    
    # Tie the weights: make the Linear inside the Sequential (index 0) use the
    # token-embedding weight object instead of its own freshly created weight.
    standard_gpt2.lm_head[0].weight = standard_gpt2.transformer.wte.weight
    
    
    # Recount the parameters now that the head's weight is the embedding weight.
    standard_gpt2_model_size = sum(t.numel() for t in standard_gpt2.parameters())
    print(f"GPT-2 size with tied weights+custom head: {standard_gpt2_model_size} parameters")
    
    # Print both storage addresses; they should match if the weights are tied.
    print(f"Token embedding layer address with tied weights+custom head: {standard_gpt2.transformer.wte.weight.untyped_storage().data_ptr()}")
    
    print(f"LM_head address with tied weights+custom head: {standard_gpt2.lm_head[0].weight.untyped_storage().data_ptr()}")
    

    Output:

    GPT-2 size: 124439808 parameters
    Token embedding layer address 134800505946176
    LM_head address 134800505946176
    GPT-2 size with tied weights+custom head: 124439808 parameters
    Token embedding layer address with tied weights+custom head: 134800505946176
    LM_head address with tied weights+custom head: 134800505946176