I am deploying a quantized, fine-tuned LLaMA 3-8B model and want to use vLLM for faster inference. I am currently loading the model with the following Python code:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import bitsandbytes as bnb
import accelerate

# model_id = "meta-llama/Meta-Llama-3-8B"  # "mistralai/Mistral-7B-Instruct-v0.1"
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=quantization_config,  # load_in_8bit=True
    device_map="auto",
    token=MYTOKEN,
)

peft_model = "BojanaBas/Meta-Llama-3-8B-Instruct-pqa-10"
model = PeftModel.from_pretrained(base_model, peft_model)
The code loads the model successfully, but I am not sure how to integrate it with vLLM to get faster inference. I have read that it is not possible to load a PEFT adapter directly in vLLM; instead, the adapter needs to be merged into the base model and the merged weights pushed to the Hugging Face Hub.
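Roughly, that merge step looks like the sketch below (my own sketch using the standard PEFT merge_and_unload API rather than the article's exact code; the base model is loaded in bfloat16 instead of 4-bit so the merged weights are full precision, and the target repo name is just a placeholder):

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

base_id = "meta-llama/Meta-Llama-3-8B-Instruct"
adapter_id = "BojanaBas/Meta-Llama-3-8B-Instruct-pqa-10"

# Load the base model in full precision so the merged weights are not tied to bitsandbytes.
base = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.bfloat16, device_map="auto", token=MYTOKEN
)
tokenizer = AutoTokenizer.from_pretrained(base_id, token=MYTOKEN)

# Attach the LoRA adapter and fold its weights back into the base model.
merged = PeftModel.from_pretrained(base, adapter_id).merge_and_unload()

# Push the plain merged weights (and tokenizer) to the Hub so vLLM can load them directly.
merged.push_to_hub("your-username/Meta-Llama-3-8B-Instruct-pqa-10-merged")  # placeholder repo
tokenizer.push_to_hub("your-username/Meta-Llama-3-8B-Instruct-pqa-10-merged")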
I have merged the adapter and pushed the resulting model to the Hugging Face Hub as described in the article. After that, I try to load the pushed model in vLLM with the following code:
from vllm import LLM
merged_peft_model_name = "lcass00/Meta-Llama-3-8B-Instruct-pqa-10-merged-peft"
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
llm = LLM(model=merged_peft_model_name, tokenizer=model_id)
However, when I try to load the model in vLLM, I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-14-c306a36d9c21> in <cell line: 3>()
1 from vllm import LLM
2
----> 3 llm = LLM(model=merged_peft_model_name, tokenizer=model_id)
/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py in __init__(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, disable_custom_all_reduce, **kwargs)
142 **kwargs,
143 )
--> 144 self.llm_engine = LLMEngine.from_engine_args(
145 engine_args, usage_context=UsageContext.LLM_CLASS)
146 self.request_counter = Counter()
/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py in from_engine_args(cls, engine_args, usage_context)
333 """Creates an LLM engine from the engine arguments."""
334 # Create the engine configs.
--> 335 engine_config = engine_args.create_engine_config()
336 distributed_executor_backend = (
337 engine_config.parallel_config.distributed_executor_backend)
/usr/local/lib/python3.10/dist-packages/vllm/engine/arg_utils.py in create_engine_config(self)
557 def create_engine_config(self, ) -> EngineConfig:
558 device_config = DeviceConfig(self.device)
--> 559 model_config = ModelConfig(
560 self.model, self.tokenizer, self.tokenizer_mode,
561 self.trust_remote_code, self.dtype, self.seed, self.revision,
/usr/local/lib/python3.10/dist-packages/vllm/config.py in __init__(self, model, tokenizer, tokenizer_mode, trust_remote_code, dtype, seed, revision, code_revision, rope_scaling, tokenizer_revision, max_model_len, quantization, quantization_param_path, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, max_logprobs, disable_sliding_window, skip_tokenizer_init, served_model_name)
141 self._verify_tokenizer_mode()
142 self._verify_embedding_mode()
--> 143 self._verify_quantization()
144 self._verify_cuda_graph()
145
/usr/local/lib/python3.10/dist-packages/vllm/config.py in _verify_quantization(self)
201 if self.quantization is not None:
202 if self.quantization not in supported_quantization:
--> 203 raise ValueError(
204 f"Unknown quantization method: {self.quantization}. Must "
205 f"be one of {supported_quantization}.")
ValueError: Unknown quantization method: bitsandbytes. Must be one of ['aqlm', 'awq', 'deepspeedfp', 'fp8', 'marlin', 'gptq_marlin_24', 'gptq_marlin', 'gptq', 'squeezellm', 'sparseml'].
How can I load a quantized fine-tuned model in vLLM?
Unfortunately, vLLM does not yet support the bitsandbytes quantization technique. You may want to use a GPTQ checkpoint such as Mixtral-8x7B-Instruct-v0.1-GPTQ instead, as the GPTQ and AWQ quantization techniques are already supported.
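As a minimal sketch, loading an already-GPTQ-quantized checkpoint works out of the box (the repo name below is assumed to be the TheBloke build of the model mentioned above and is only an example; any GPTQ or AWQ checkpoint is loaded the same way):

from vllm import LLM, SamplingParams

# Point vLLM at a GPTQ-quantized checkpoint instead of a bitsandbytes one.
llm = LLM(
    model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
    quantization="gptq",  # optional: vLLM can also detect this from the HF config
)

# Quick smoke test of the loaded model.
outputs = llm.generate(["What is vLLM?"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)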