So I have fine-tuned an LLM (https://huggingface.co/HuggingFaceH4/starchat-beta) and would like to deploy a quantized version of it to an AWS SageMaker endpoint, but it keeps giving me the following error:
The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.
I took a look at the logs and this is what I found:
2023-10-09T00:38:42.791+04:00 INFO  ModelManager Loading model on Python:[0]
2023-10-09T00:38:42.791+04:00 INFO  WorkerPool loading model model on gpu(0) ...
2023-10-09T00:38:42.791+04:00 INFO  ModelInfo Available CPU memory: 14343 MB, required: 0 MB, reserved: 500 MB
2023-10-09T00:38:42.791+04:00 INFO  ModelInfo Available GPU memory: 22483 MB, required: 0 MB, reserved: 500 MB
2023-10-09T00:38:42.791+04:00 INFO  ModelInfo Loading model model on gpu(0)
2023-10-09T00:38:42.791+04:00 ERROR ModelServer Failed register workflow
2023-10-09T00:38:42.791+04:00 java.util.concurrent.CompletionException: java.io.FileNotFoundException: .py file not found in: /opt/ml/model
2023-10-09T00:38:42.791+04:00     at ai.djl.serving.wlm.WorkerPool.initWorkers(WorkerPool.java:189) ~[wlm-0.23.0.jar:?]
2023-10-09T00:38:42.791+04:00     at ai.djl.serving.models.ModelManager.initWorkers(ModelManager.java:214) ~[serving-0.23.0.jar:?]
2023-10-09T00:38:42.791+04:00     at ai.djl.serving.models.ModelManager.lambda$registerWorkflow$2(ModelManager.java:132) ~[serving-0.23.0.jar:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1700) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.CompletableFuture$AsyncSupply.exec(CompletableFuture.java:1692) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) [?:?]
2023-10-09T00:38:42.791+04:00     at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183) [?:?]
2023-10-09T00:38:42.791+04:00 Caused by: java.io.FileNotFoundException: .py file not found in: /opt/ml/model
2023-10-09T00:38:42.791+04:00     at ai.djl.python.engine.PyModel.load(PyModel.java:164) ~[python-0.23.0.jar:?]
2023-10-09T00:38:42.792+04:00     at ai.djl.repository.zoo.BaseModelLoader.loadModel(BaseModelLoader.java:161) ~[api-0.23.0.jar:?]
2023-10-09T00:38:42.792+04:00     at ai.djl.repository.zoo.Criteria.loadModel(Criteria.java:172) ~[api-0.23.0.jar:?]
2023-10-09T00:38:42.792+04:00     at ai.djl.serving.wlm.ModelInfo.load(ModelInfo.java:246) ~[wlm-0.23.0.jar:?]
2023-10-09T00:38:42.792+04:00     at ai.djl.serving.wlm.WorkerPool.initWorkers(WorkerPool.java:187) ~[wlm-0.23.0.jar:?]
2023-10-09T00:38:45.796+04:00     ... 9 more
2023-10-09T00:38:47.801+04:00 INFO  ModelServer Stopping model server.
2023-10-09T00:38:47.801+04:00 INFO  ModelServer Initialize BOTH server with: EpollServerSocketChannel.
2023-10-09T00:38:47.801+04:00 INFO  ModelServer BOTH API bind to: http://0.0.0.0:8080
2023-10-09T00:38:47.801+04:00 INFO  ModelServer Stopping model server.
2023-10-09T00:38:51.808+04:00 INFO  ModelServer BOTH listener stopped.
If anyone is able to help, it would be highly appreciated. I will also include my code below:
!mkdir code
%%writefile code/inference.py
from typing import Dict, List, Any

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def model_fn(model_dir):
    # load model and tokenizer from model_dir
    # Activate 4-bit precision base model loading
    use_4bit = True
    # Compute dtype for 4-bit base models
    bnb_4bit_compute_dtype = "bfloat16"
    # Quantization type (fp4 or nf4)
    bnb_4bit_quant_type = "nf4"
    # Activate nested quantization for 4-bit base models (double quantization)
    use_nested_quant = True
    # Model Name
    # model_name = "MODEL_HF_PATH"

    # Load tokenizer and model with QLoRA configuration
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_use_double_quant=use_nested_quant,
        bnb_4bit_compute_dtype=compute_dtype,
    )

    # Load base model in 4-bit
    llm_model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        quantization_config=bnb_config,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return llm_model, tokenizer
from distutils.dir_util import copy_tree
from pathlib import Path
from tempfile import TemporaryDirectory

from huggingface_hub import snapshot_download

HF_MODEL_ID = 'MODEL_HF_PATH'

# create model dir
model_tar_dir = Path(HF_MODEL_ID.split("/")[-1])
model_tar_dir.mkdir()

# set up a temporary directory
with TemporaryDirectory() as tmpdir:
    # download snapshot
    snapshot_dir = snapshot_download(repo_id=HF_MODEL_ID, cache_dir=tmpdir, resume_download=True)
    # copy snapshot to model dir
    print('Copying...')
    copy_tree(snapshot_dir, str(model_tar_dir))
    copy_tree("code/", str(model_tar_dir.joinpath("code")))
import os
import tarfile

# helper to create the model.tar.gz
def compress(tar_dir=None, output_file="model.tar.gz"):
    parent_dir = os.getcwd()
    os.chdir(tar_dir)
    with tarfile.open(os.path.join(parent_dir, output_file), "w:gz") as tar:
        for item in os.listdir('.'):
            print(item)
            tar.add(item, arcname=item)
    os.chdir(parent_dir)

compress(str(model_tar_dir))
import sagemaker
from sagemaker.s3 import S3Uploader

sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# upload model.tar.gz to S3
s3_model_uri = S3Uploader.upload(local_path="model.tar.gz", desired_s3_uri=f"s3://{sess.default_bucket()}/model_name")
from sagemaker.huggingface.model import HuggingFaceModel

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    model_data=s3_model_uri,  # path to your model and script
    role=role,                # IAM role with permissions to create an endpoint
    image_uri='763104351884.dkr.ecr.us-east-2.amazonaws.com/djl-inference:0.23.0-fastertransformer5.3.0-cu118'
)

# deploy the endpoint
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.xlarge"
)
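Once the endpoint passes the health check, I plan to invoke it roughly like this (just a sketch assuming the container accepts and returns JSON; the prompt and parameters are only examples):

# quick smoke test of the deployed endpoint (assumes JSON in/out)
response = predictor.predict({
    "inputs": "Write a Python function that reverses a string.",
    "parameters": {"max_new_tokens": 128}
})
print(response)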
If you are using the LMI (DJL) container, your model.tar.gz needs a serving.properties file and, for the Python engine, a model.py entry point; the ".py file not found in: /opt/ml/model" error in your logs means the DJL Python engine could not find one. For example:
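Here is a minimal sketch of what the archive could contain, assuming the quantized weights sit at the root of model.tar.gz and the DJL Python engine is used (the entryPoint name, option values, and request/response format below are illustrative, not taken from your setup):

serving.properties:

engine=Python
option.entryPoint=model.py
option.tensor_parallel_degree=1

model.py (a bare-bones handler; DJL Serving calls handle() for every request, including the empty ping/warm-up request):

from djl_python import Input, Output
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model = None
tokenizer = None

def load_model(properties):
    # "model_dir" is assumed to point at /opt/ml/model inside the container
    model_dir = properties.get("model_dir", "/opt/ml/model")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    m = AutoModelForCausalLM.from_pretrained(model_dir, quantization_config=bnb_config, device_map="auto")
    t = AutoTokenizer.from_pretrained(model_dir)
    return m, t

def handle(inputs: Input):
    global model, tokenizer
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())
    if inputs.is_empty():
        # empty request: health check / warm-up, nothing to return
        return None
    data = inputs.get_as_json()
    prompt = data["inputs"]
    params = data.get("parameters", {})
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(**encoded, max_new_tokens=params.get("max_new_tokens", 128))
    text = tokenizer.decode(generated[0], skip_special_tokens=True)
    return Output().add_as_json({"generated_text": text})

As far as I know, DJL Serving looks for both files at the root of /opt/ml/model (i.e., the root of model.tar.gz), next to the model weights, not inside a code/ subfolder; the code/inference.py plus model_fn layout is the HuggingFace DLC convention, which the DJL container does not use.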