pytorch, huggingface-transformers

How to load a pretrained model into a transformers pipeline and specify multiple GPUs?


I have a local server with multiple GPUs, and I am trying to load a local model while specifying which GPUs to use, since we want to split the GPUs between team members.

I can successfully pin a smaller model to a single GPU with device_map='cuda:3'. How do I do the same for a larger model across several GPUs, e.g. CUDA devices 4, 5 and 6?

(I tried device_map = 'auto', 'balanced' and 'sequential', which spread the model automatically, but that is not what we want...)

import torch
from transformers import LlamaForCausalLM

model_dir = '/models/Llama-2-13b-chat-hf'

# tried: 'auto', 'balanced', 'sequential', 'balanced_low_0' and 'cuda:3' (only the single-GPU form works)
model = LlamaForCausalLM.from_pretrained(model_dir,
                                         device_map='cuda:[3,4,5]',  # how to make this work?
                                         torch_dtype=torch.float32)

Solution

  • I guess the easiest way to achieve what you want is to set CUDA_VISIBLE_DEVICES so that only the GPUs reserved for you are visible to the process:

    import os
    # set before CUDA is initialized (safest: before importing torch)
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"    # expose a single GPU
    # or
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # expose several GPUs

    import torch
    from transformers import LlamaForCausalLM

    model_dir = '/models/Llama-2-13b-chat-hf'
    # 'auto' now spreads the model only over the visible GPUs
    model = LlamaForCausalLM.from_pretrained(model_dir,
                                             device_map='auto')
    

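    Adapted to the question (a sketch: the GPU IDs 3,4,5 and the text-generation task are my assumptions), the same trick also covers the pipeline part of the title, since you can hand the already-placed model and tokenizer straight to transformers.pipeline:

    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5"  # only these GPUs are visible

    import torch
    from transformers import AutoTokenizer, LlamaForCausalLM, pipeline

    model_dir = '/models/Llama-2-13b-chat-hf'
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # inside this process the visible GPUs are renumbered 0, 1, 2
    model = LlamaForCausalLM.from_pretrained(model_dir,
                                             device_map='auto',
                                             torch_dtype=torch.float16)

    pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)
    print(pipe('Hello, my name is', max_new_tokens=20)[0]['generated_text'])
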
    If you want to pass your own device_map, you have to map each module to a device yourself:

    # using distilroberta here because it is smaller
    
    from transformers import AutoModelForMaskedLM
    
    model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
    # parameter names
    print([x[0] for x in model.named_parameters()])
    

    Output:

    ['roberta.embeddings.word_embeddings.weight',
     'roberta.embeddings.position_embeddings.weight',
     'roberta.embeddings.token_type_embeddings.weight',
     'roberta.embeddings.LayerNorm.weight',
     'roberta.embeddings.LayerNorm.bias',
     'roberta.encoder.layer.0.attention.self.query.weight',
     'roberta.encoder.layer.0.attention.self.query.bias',
    ...
     'roberta.encoder.layer.5.output.LayerNorm.weight',
     'roberta.encoder.layer.5.output.LayerNorm.bias',
     'lm_head.bias',
     'lm_head.dense.weight',
     'lm_head.dense.bias',
     'lm_head.layer_norm.weight',
     'lm_head.layer_norm.bias']
    

    You don't need to map every single weight; mapping whole modules is enough:

    # device map example for distilroberta: embeddings and lm_head on CPU, encoder on GPU 0
    from transformers import AutoModelForMaskedLM

    device_map = {'roberta.embeddings': 'cpu', 'roberta.encoder': 0, 'lm_head': 'cpu'}

    model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base", device_map=device_map)
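
    For the bigger Llama model from the question, typing that dictionary by hand gets tedious. Here is a sketch (assumptions: GPUs 3-5 are the ones reserved for you and roughly 20GiB per card is a safe budget) that lets accelerate build the per-module map for you, restricted to those devices via max_memory:

    import torch
    from accelerate import infer_auto_device_map, init_empty_weights
    from transformers import AutoConfig, LlamaForCausalLM

    model_dir = '/models/Llama-2-13b-chat-hf'
    config = AutoConfig.from_pretrained(model_dir)

    # a weight-less (meta) model, only used to compute the placement
    with init_empty_weights():
        empty_model = LlamaForCausalLM(config)

    # only the devices listed in max_memory are considered
    device_map = infer_auto_device_map(
        empty_model,
        max_memory={3: '20GiB', 4: '20GiB', 5: '20GiB'},
        no_split_module_classes=['LlamaDecoderLayer'],
        dtype=torch.float16,
    )
    print(device_map)  # e.g. {'model.embed_tokens': 3, 'model.layers.0': 3, ...}

    model = LlamaForCausalLM.from_pretrained(model_dir,
                                             device_map=device_map,
                                             torch_dtype=torch.float16)

    The same restriction should also work in one step by passing max_memory together with device_map='auto' directly to from_pretrained.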