So I have fine-tuned an LLM (https://huggingface.co/HuggingFaceH4/starchat-beta) and would like to deploy a quantized version of it to an AWS SageMaker endpoint, but it keeps giving me the following error:
The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.
I took a look at the logs and this is what I found:
2023-10-09T00:38:42.791+04:00 INFO  ModelManager Loading model on Python:[0]
2023-10-09T00:38:42.791+04:00 INFO  WorkerPool loading model model on gpu(0) ...
2023-10-09T00:38:42.791+04:00 INFO  ModelInfo Available CPU memory: 14343 MB, required: 0 MB, reserved: 500 MB
2023-10-09T00:38:42.791+04:00 INFO  ModelInfo Available GPU memory: 22483 MB, required: 0 MB, reserved: 500 MB
2023-10-09T00:38:42.791+04:00 INFO  ModelInfo Loading model model on gpu(0)
2023-10-09T00:38:42.791+04:00 ERROR ModelServer Failed register workflow
2023-10-09T00:38:42.791+04:00 java.util.concurrent.CompletionException: java.io.FileNotFoundException: .py file not found in: /opt/ml/model
2023-10-09T00:38:42.791+04:00     at ai.djl.serving.wlm.WorkerPool.initWorkers(WorkerPool.java:189) ~[wlm-0.23.0.jar:?]
2023-10-09T00:38:42.791+04:00     at ai.djl.serving.models.ModelManager.initWorkers(ModelManager.java:214) ~[serving-0.23.0.jar:?]
2023-10-09T00:38:42.791+04:00     at ai.djl.serving.models.ModelManager.lambda$registerWorkflow$2(ModelManager.java:132) ~[serving-0.23.0.jar:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1700) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.CompletableFuture$AsyncSupply.exec(CompletableFuture.java:1692) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183) [?:?]
2023-10-09T00:38:42.791+04:00 Caused by: java.io.FileNotFoundException: .py file not found in: /opt/ml/model
2023-10-09T00:38:42.791+04:00     at ai.djl.python.engine.PyModel.load(PyModel.java:164) ~[python-0.23.0.jar:?]
2023-10-09T00:38:42.792+04:00     at ai.djl.repository.zoo.BaseModelLoader.loadModel(BaseModelLoader.java:161) ~[api-0.23.0.jar:?]
2023-10-09T00:38:42.792+04:00     at ai.djl.repository.zoo.Criteria.loadModel(Criteria.java:172) ~[api-0.23.0.jar:?]
2023-10-09T00:38:42.792+04:00     at ai.djl.serving.wlm.ModelInfo.load(ModelInfo.java:246) ~[wlm-0.23.0.jar:?]
2023-10-09T00:38:42.792+04:00     at ai.djl.serving.wlm.WorkerPool.initWorkers(WorkerPool.java:187) ~[wlm-0.23.0.jar:?]
2023-10-09T00:38:45.796+04:00     ... 9 more
2023-10-09T00:38:47.801+04:00 INFO  ModelServer Stopping model server.
2023-10-09T00:38:47.801+04:00 INFO  ModelServer Initialize BOTH server with: EpollServerSocketChannel.
2023-10-09T00:38:47.801+04:00 INFO  ModelServer BOTH API bind to: http://0.0.0.0:8080
2023-10-09T00:38:47.801+04:00 INFO  ModelServer Stopping model server.
2023-10-09T00:38:51.808+04:00 INFO  ModelServer BOTH listener stopped.
If anyone is able to help, it would be highly appreciated. I will also include my code below:
!mkdir code
%%writefile code/inference.py
from typing import Dict, List, Any

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def model_fn(model_dir):
    # load model and tokenizer from model_dir
    # Activate 4-bit precision base model loading
    use_4bit = True
    # Compute dtype for 4-bit base models
    bnb_4bit_compute_dtype = "bfloat16"
    # Quantization type (fp4 or nf4)
    bnb_4bit_quant_type = "nf4"
    # Activate nested quantization for 4-bit base models (double quantization)
    use_nested_quant = True
    # Model Name
    # model_name = "MODEL_HF_PATH"

    # Load tokenizer and model with QLoRA configuration
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_use_double_quant=use_nested_quant,
        bnb_4bit_compute_dtype=compute_dtype,
    )

    # Load base model in 4-bit
    llm_model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        quantization_config=bnb_config,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return llm_model, tokenizer
from distutils.dir_util import copy_tree
from pathlib import Path
from tempfile import TemporaryDirectory

from huggingface_hub import snapshot_download

HF_MODEL_ID = 'MODEL_HF_PATH'

# create model dir
model_tar_dir = Path(HF_MODEL_ID.split("/")[-1])
model_tar_dir.mkdir()

# set up a temporary directory
with TemporaryDirectory() as tmpdir:
    # download snapshot
    snapshot_dir = snapshot_download(repo_id=HF_MODEL_ID, cache_dir=tmpdir, resume_download=True)
    # copy snapshot to model dir
    print('Copying...')
    copy_tree(snapshot_dir, str(model_tar_dir))
    copy_tree("code/", str(model_tar_dir.joinpath("code")))
import os
import tarfile

# helper to create the model.tar.gz
def compress(tar_dir=None, output_file="model.tar.gz"):
    parent_dir = os.getcwd()
    os.chdir(tar_dir)
    with tarfile.open(os.path.join(parent_dir, output_file), "w:gz") as tar:
        for item in os.listdir('.'):
            print(item)
            tar.add(item, arcname=item)
    os.chdir(parent_dir)

compress(str(model_tar_dir))
import sagemaker
from sagemaker.s3 import S3Uploader

sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# upload model.tar.gz to S3
s3_model_uri = S3Uploader.upload(local_path="model.tar.gz", desired_s3_uri=f"s3://{sess.default_bucket()}/model_name")
from sagemaker.huggingface.model import HuggingFaceModel

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    model_data=s3_model_uri,  # path to your model and script
    role=role,                # IAM role with permissions to create an endpoint
    image_uri='763104351884.dkr.ecr.us-east-2.amazonaws.com/djl-inference:0.23.0-fastertransformer5.3.0-cu118'
)

# deploy the endpoint
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.xlarge"
)
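Once the endpoint passes the health check, I plan to invoke it roughly like this (just a sketch assuming the container accepts and returns JSON; the prompt and parameters are only examples):

# quick smoke test of the deployed endpoint (assumes JSON in/out)
response = predictor.predict({
    "inputs": "Write a Python function that reverses a string.",
    "parameters": {"max_new_tokens": 128}
})
print(response)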
If you are using the LMI (DJL) container, your model.tar.gz needs a serving.properties file and, for the Python engine, a model.py entry point; the ".py file not found in: /opt/ml/model" error in your logs means the DJL Python engine could not find one. For example:
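Here is a minimal sketch of what the archive could contain, assuming the quantized weights sit at the root of model.tar.gz and the DJL Python engine is used (the entryPoint name, option values, and request/response format below are illustrative, not taken from your setup):

serving.properties:

engine=Python
option.entryPoint=model.py
option.tensor_parallel_degree=1

model.py (a bare-bones handler; DJL Serving calls handle() for every request, including the empty ping/warm-up request):

from djl_python import Input, Output
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model = None
tokenizer = None

def load_model(properties):
    # "model_dir" is assumed to point at /opt/ml/model inside the container
    model_dir = properties.get("model_dir", "/opt/ml/model")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    m = AutoModelForCausalLM.from_pretrained(model_dir, quantization_config=bnb_config, device_map="auto")
    t = AutoTokenizer.from_pretrained(model_dir)
    return m, t

def handle(inputs: Input):
    global model, tokenizer
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())
    if inputs.is_empty():
        # empty request: health check / warm-up, nothing to return
        return None
    data = inputs.get_as_json()
    prompt = data["inputs"]
    params = data.get("parameters", {})
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(**encoded, max_new_tokens=params.get("max_new_tokens", 128))
    text = tokenizer.decode(generated[0], skip_special_tokens=True)
    return Output().add_as_json({"generated_text": text})

As far as I know, DJL Serving looks for both files at the root of /opt/ml/model (i.e., the root of model.tar.gz), next to the model weights, not inside a code/ subfolder; the code/inference.py plus model_fn layout is the HuggingFace DLC convention, which the DJL container does not use.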