From c695fc8d4dce42ddd419daadfda41c3b7656b526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Mon, 6 Apr 2026 11:22:26 +0000 Subject: [PATCH 01/14] [WIP] Add HuggingFace LlamaCpp support --- dlc_developer_config.toml | 5 +- huggingface/llamacpp/buildspec.yaml | 91 +++++++++++++++++++ .../llamacpp/docker/b8672/Dockerfile.cpu | 0 .../docker/b8672/cu129/Dockerfile.gpu | 38 ++++++++ .../docker/b8672/cu130/Dockerfile.gpu | 42 +++++++++ src/constants.py | 1 + test/test_utils/__init__.py | 2 + test/test_utils/sagemaker.py | 4 + test/testrunner.py | 9 ++ 9 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 huggingface/llamacpp/buildspec.yaml create mode 100644 huggingface/llamacpp/docker/b8672/Dockerfile.cpu create mode 100644 huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu create mode 100644 huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 89b740a5e315..08ad2014d650 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -36,7 +36,7 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. 
-# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] +# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] build_frameworks = [] @@ -192,5 +192,8 @@ dlc-pr-huggingface-vllm = "" # HuggingFace SGLang dlc-pr-huggingface-sglang = "" +# Huggingface Llamacpp +dlc-pr-huggingface-llamacpp = "" + # sglang dlc-pr-sglang = "" diff --git a/huggingface/llamacpp/buildspec.yaml b/huggingface/llamacpp/buildspec.yaml new file mode 100644 index 000000000000..2d1b2e360c93 --- /dev/null +++ b/huggingface/llamacpp/buildspec.yaml @@ -0,0 +1,91 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +base_framework: &BASE_FRAMEWORK llamacpp +framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK] +version: &VERSION "b8672" +short_version: &SHORT_VERSION "b8672" +arch_type: &ARCH_TYPE x86_64 +autopatch_build: "False" + +repository_info: + build_repository: &BUILD_REPOSITORY + image_type: &IMAGE_TYPE inference + root: huggingface/llamacpp + repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + build_context: &BUILD_CONTEXT + deep_learning_container: + source: 
../../src/deep_learning_container.py + target: deep_learning_container.py + start_cuda_compat: + source: build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + sagemaker_entrypoint: + source: build_artifacts/sagemaker_entrypoint.sh + target: sagemaker_entrypoint.sh + + +images: + BuildHuggingFaceLlamacppGpuCu129DockerImage: + <<: *BUILD_REPOSITORY + context: + <<: *BUILD_CONTEXT + image_size_baseline: 40000 + device_type: &DEVICE_TYPE gpu + cuda_version: &CUDA_VERSION cu129 + os_version: &OS_VERSION ubuntu24.04 + transformers_version: &TRANSFORMERS_VERSION 4.57.3 + tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + build: true + enable_common_stage_build: false + test_configs: + test_platforms: + - sanity + - security + - sagemaker + + BuildHuggingFaceLlamacppGpuCu130DockerImage: + <<: *BUILD_REPOSITORY + context: + <<: *BUILD_CONTEXT + image_size_baseline: 40000 + device_type: &DEVICE_TYPE gpu + cuda_version: &CUDA_VERSION cu130 + os_version: &OS_VERSION ubuntu24.04 + transformers_version: &TRANSFORMERS_VERSION 4.57.3 + tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + build: true + enable_common_stage_build: false + test_configs: + test_platforms: + - sanity + - security + - sagemaker + + BuildHuggingFaceLlamacppCpuDockerImage: + <<: *BUILD_REPOSITORY + context: + <<: *BUILD_CONTEXT + image_size_baseline: 40000 + device_type: &DEVICE_TYPE cpu + os_version: &OS_VERSION ubuntu24.04 + transformers_version: &TRANSFORMERS_VERSION 4.57.3 + tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ] + docker_file: !join [ docker/, 
*SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + build: true + enable_common_stage_build: false + test_configs: + test_platforms: + - sanity + - security + - sagemaker diff --git a/huggingface/llamacpp/docker/b8672/Dockerfile.cpu b/huggingface/llamacpp/docker/b8672/Dockerfile.cpu new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu b/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu new file mode 100644 index 000000000000..6e37bac2e95e --- /dev/null +++ b/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu @@ -0,0 +1,38 @@ +FROM ghcr.io/ggml-org/llama.cpp:server-cuda12-b8672 as base + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG HUGGINGFACE_HUB_VERSION=1.9.0 +ARG HF_XET_VERSION=1.2.0 + +WORKDIR / + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh + +# ====================== ec2 ========================================= +FROM base AS llamacpp-ec2 + +RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \ + apt-get update && \ + apt-get upgrade -y && \ + apt-get clean + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"] + +# ====================== sagemaker ========================================= +FROM base AS llamacpp-sagemaker + +RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \ + apt-get update && \ + apt-get upgrade -y && \ + apt-get clean + +COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh +RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh + +ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] diff --git a/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu new file mode 100644 index 
000000000000..e1cc3efa6e9a --- /dev/null +++ b/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu @@ -0,0 +1,42 @@ +FROM ghcr.io/ggml-org/llama.cpp:server-cuda13-b8672 as base + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ENV DEBIAN_FRONTEND=noninteractive \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 \ + DLC_CONTAINER_TYPE=base \ + LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \ + PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" + +WORKDIR / + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh + +# ====================== ec2 ========================================= +FROM base AS llamacpp-ec2 + +RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \ + apt-get update && \ + apt-get upgrade -y && \ + apt-get clean + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"] + +# ====================== sagemaker ========================================= +FROM base AS llamacpp-sagemaker + +RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \ + apt-get update && \ + apt-get upgrade -y && \ + apt-get clean + +COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh +RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh + +ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] diff --git a/src/constants.py b/src/constants.py index 037414380bca..42275c5532f1 100644 --- a/src/constants.py +++ b/src/constants.py @@ -29,6 +29,7 @@ "sglang", "huggingface_vllm", "huggingface_sglang", + "huggingface_llamacpp", } DEVICE_TYPES = {"cpu", "gpu", "hpu", "eia", "inf", "neuron", "neuronx"} IMAGE_TYPES = {"training", "inference"} diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py 
index d593deea76e7..51ca9a276922 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -1822,6 +1822,7 @@ def get_framework_and_version_from_tag(image_uri): "huggingface_pytorch", "huggingface_vllm", "huggingface_sglang", + "huggingface_llamacpp", "stabilityai_pytorch", "pytorch_trcomp", "tensorflow", @@ -1939,6 +1940,7 @@ def get_framework_from_image_uri(image_uri): "huggingface-pytorch": "huggingface_pytorch", "huggingface-vllm": "huggingface_vllm", "huggingface-sglang": "huggingface_sglang", + "huggingface-llamacpp": "huggingface_llamacpp", "stabilityai-pytorch": "stabilityai_pytorch", "mxnet": "mxnet", "pytorch": "pytorch", diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index 0ab4d69e4829..24f256f66253 100644 --- a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -164,6 +164,8 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type): path = os.path.join("test", "sagemaker_tests", "huggingface", "vllm") elif framework == "huggingface_sglang": path = os.path.join("test", "sagemaker_tests", "huggingface", "sglang") + elif framework == "huggingface_llamacpp": + path = os.path.join("test", "sagemaker_tests", "huggingface", "llamacpp") else: path = os.path.join("test", "sagemaker_tests", framework, job_type) aws_id_arg = "--aws-id" @@ -286,6 +288,8 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type): path = os.path.join("test", "sagemaker_tests", "huggingface", "vllm") elif "huggingface" in framework and "sglang" in framework: path = os.path.join("test", "sagemaker_tests", "huggingface", "sglang") + elif "huggingface" in framework and "llamacpp" in framework: + path = os.path.join("test", "sagemaker_tests", "huggingface", "llamacpp") elif "huggingface" in framework and job_type == "inference": path = os.path.join("test", "sagemaker_tests", "huggingface", "inference") if "trcomp" in framework: diff --git a/test/testrunner.py b/test/testrunner.py index 2d7deb2cfe24..9773d11e1604 
100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -629,6 +629,15 @@ def main(): sm_utils.generate_empty_report(report, test_type, "sglang") return + # Skip base llamacpp (not huggingface_llamacpp) - huggingface_llamacpp has local tests + if "llamacpp" in dlc_images and "huggingface" not in dlc_images: + LOGGER.info( + f"Skipping - there are no local mode tests for base Llamacpp. Images: {dlc_images}" + ) + report = os.path.join(os.getcwd(), "test", f"{test_type}.xml") + sm_utils.generate_empty_report(report, test_type, "llamacpp") + return + testing_image_list = [ image for image in standard_images_list From 5d5118fe3bb684aecdf7904bb4662745d2b45a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Fri, 24 Apr 2026 15:08:44 +0000 Subject: [PATCH 02/14] [WIP] Add HuggingFace LlamaCpp support with Dockerfiles, buildspec, and serving scripts --- .../llamacpp_sagemaker_serve.py | 211 ++++++++++ .../build_artifacts/sagemaker_entrypoint.sh | 95 +++++ .../build_artifacts/start_cuda_compat.sh | 25 ++ .../{buildspec.yaml => buildspec.yml} | 31 +- .../docker/b8672/cu129/Dockerfile.gpu | 38 -- .../docker/b8672/cu130/Dockerfile.gpu | 42 -- .../docker/{b8672 => b8882}/Dockerfile.cpu | 0 .../docker/b8882/cu130/Dockerfile.gpu | 32 ++ src/image_builder.py | 1 + .../huggingface/llamacpp/__init__.py | 13 + .../huggingface/llamacpp/conftest.py | 391 ++++++++++++++++++ .../llamacpp/integration/__init__.py | 119 ++++++ .../llamacpp/integration/local/__init__.py | 13 + .../integration/local/test_serving.py | 109 +++++ .../integration/sagemaker/__init__.py | 12 + .../integration/sagemaker/test_sglang.py | 116 ++++++ .../llamacpp/integration/sagemaker/timeout.py | 66 +++ .../resources/qwen3.5-0.8b/.gitattributes | 61 +++ .../huggingface/llamacpp/utils/__init__.py | 36 ++ .../huggingface/llamacpp/utils/image_utils.py | 67 +++ .../llamacpp/utils/local_mode_utils.py | 46 +++ test/test_utils/__init__.py | 1 + 22 files changed, 1423 
insertions(+), 102 deletions(-) create mode 100644 huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py create mode 100644 huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh create mode 100644 huggingface/llamacpp/build_artifacts/start_cuda_compat.sh rename huggingface/llamacpp/{buildspec.yaml => buildspec.yml} (77%) delete mode 100644 huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu delete mode 100644 huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu rename huggingface/llamacpp/docker/{b8672 => b8882}/Dockerfile.cpu (100%) create mode 100644 huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu create mode 100644 test/sagemaker_tests/huggingface/llamacpp/__init__.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/conftest.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/local/__init__.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/__init__.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes create mode 100644 test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py create mode 100644 test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py diff --git a/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py new file mode 100644 index 000000000000..38da145b33e0 --- /dev/null +++ b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py @@ -0,0 +1,211 @@ +# 
logger = logging.getLogger("llamacpp_sagemaker")

# Base URL of the upstream llama-server. The trailing slash is stripped so that
# request paths can be appended with plain string concatenation.
BACKEND = os.environ.get("LLAMACPP_SAGEMAKER_BACKEND_URL", "http://127.0.0.1:8081").rstrip("/")

# Routes whose JSON bodies are converted to multipart/form-data when they are
# selected through the ``route=`` custom attribute.
FORM_DATA_ROUTES = frozenset({"/v1/videos", "/v1/videos/sync"})

# Request headers that are never copied to the upstream request.
# content-length is listed because it is recomputed from the (possibly
# rewritten) body in _forward_request_headers.
_HOP_BY_HOP = frozenset(
    {
        "connection",
        "keep-alive",
        "proxy-authenticate",
        "proxy-authorization",
        "te",
        "trailers",
        "transfer-encoding",
        "upgrade",
        "host",
        "content-length",
    }
)

# Upstream response headers that must not be relayed to the caller.
# content-encoding is dropped because httpx hands us the *decoded* body
# (``Response.content`` / ``Response.aiter_bytes()``); relaying the original
# header would make clients try to decompress plain bytes.
_RESP_DROP = frozenset({"transfer-encoding", "content-length", "connection", "content-encoding"})

# ``route=/path`` as a standalone attribute: the look-behind rejects longer
# attribute names that merely end in "route" (e.g. ``subroute=/x``).
_ROUTE_ATTRIBUTE = re.compile(r"(?<![\w-])route=(/[^\s,]+)")


def _parse_route_from_header(raw: str | None) -> str | None:
    """Return the path of a ``route=/path`` attribute in *raw*, or ``None``.

    Args:
        raw: Value of the custom-attributes header; may be ``None`` or empty.

    Returns:
        The path (starting with ``/``, ending at whitespace or a comma) of the
        first ``route=`` attribute, or ``None`` when there is none.
    """
    if not raw:
        return None
    match = _ROUTE_ATTRIBUTE.search(raw)
    return match.group(1) if match else None


def _parse_route(request: Request) -> str | None:
    """Return the route override carried by the request's custom-attributes header."""
    return _parse_route_from_header(request.headers.get("x-amzn-sagemaker-custom-attributes"))


def _escape_form_field_name(name: str) -> str:
    """Percent-escape characters that would break out of a quoted ``name="..."``."""
    return name.replace('"', "%22").replace("\r", "%0D").replace("\n", "%0A")


def _form_field_value(value: object) -> str:
    """Render one form value; containers are sent as JSON rather than Python repr."""
    if isinstance(value, (dict, list)):
        return json.dumps(value)
    return str(value)


def _build_multipart_body(data: dict, boundary: str) -> bytes:
    """Encode *data* as a ``multipart/form-data`` body with one part per key.

    Args:
        data: Mapping of field names to values (as decoded from a JSON object).
        boundary: Multipart boundary, without the leading ``--``.

    Returns:
        The UTF-8 encoded body, terminated by the closing boundary line.
    """
    parts: list[str] = []
    for key, value in data.items():
        # Field names come from the request body, so they must not be able to
        # inject extra headers or terminate the quoted name early.
        name = _escape_form_field_name(str(key))
        parts.append(
            f'--{boundary}\r\nContent-Disposition: form-data; name="{name}"\r\n\r\n'
            f"{_form_field_value(value)}\r\n"
        )
    parts.append(f"--{boundary}--\r\n")
    return "".join(parts).encode()


def _default_path_for_invocation(content_type: str, body: bytes) -> str:
    """Infer the llama-server path for an ``/invocations`` call without a route.

    The decision uses only top-level keys of a JSON object body:
    ``messages`` -> chat completions, ``prompt`` -> completions,
    ``input`` together with ``model`` -> embeddings. Anything else (non-JSON
    content type, unparsable body, non-object JSON) falls back to chat
    completions.
    """
    fallback = "/v1/chat/completions"
    if "json" not in (content_type or "").lower():
        return fallback
    try:
        data = json.loads(body)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return fallback
    if not isinstance(data, dict):
        return fallback
    if "messages" in data:
        return fallback
    if "prompt" in data:
        return "/v1/completions"
    if "input" in data and "model" in data:
        return "/v1/embeddings"
    return fallback


def _forward_request_headers(request: Request, body_len: int, content_type: str | None) -> dict[str, str]:
    """Build the header dict sent upstream.

    Hop-by-hop headers and the SageMaker custom-attributes header are removed,
    ``content-length`` is set from *body_len*, and ``content-type`` is replaced
    when *content_type* is not ``None``.

    Header names are stored lower-cased (they are case-insensitive) so the
    overrides below always replace the incoming value instead of adding a
    second, differently-cased key.
    """
    out: dict[str, str] = {}
    for key, value in request.headers.items():
        lk = key.lower()
        if lk in _HOP_BY_HOP or lk == "x-amzn-sagemaker-custom-attributes":
            continue
        out[lk] = value
    out["content-length"] = str(body_len)
    if content_type is not None:
        out["content-type"] = content_type
    return out


def _response_headers_from_httpx(resp: httpx.Response) -> dict[str, str]:
    """Return the upstream response headers that are safe to relay (see ``_RESP_DROP``)."""
    return {key: value for key, value in resp.headers.items() if key.lower() not in _RESP_DROP}


async def ping(request: Request) -> Response:
    """Health check: proxy ``GET /ping`` to ``GET {BACKEND}/health``.

    Returns the backend's status code, body and relayable headers, or a 503
    JSON error when the backend cannot be reached.
    """
    url = f"{BACKEND}/health"
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=2.0)) as client:
            r = await client.get(url)
    except httpx.RequestError as e:
        logger.warning("Backend health request failed: %s", e)
        return Response(
            status_code=503,
            content=b'{"error":"backend_unavailable"}',
            media_type="application/json",
        )
    return Response(
        status_code=r.status_code,
        content=r.content,
        headers=_response_headers_from_httpx(r),
    )
async def invocations(request: Request) -> Response:
    """Proxy ``POST /invocations`` to the matching llama-server endpoint.

    The target path is the ``route=`` custom attribute when present, otherwise
    it is inferred from the request body. When an explicit route is one of
    ``FORM_DATA_ROUTES`` and the body is a JSON object, the body is re-encoded
    as multipart/form-data. The upstream response is streamed back unchanged
    apart from header filtering.

    Returns:
        A streaming response mirroring the upstream status and headers, a 405
        for non-POST methods, or a 502 JSON error when the upstream request
        fails at the transport level.
    """
    if request.method != "POST":
        return Response(status_code=405, content=b"Method Not Allowed")

    body = await request.body()
    route = _parse_route(request)
    content_type = request.headers.get("content-type")

    if route:
        target = route
        logger.info("Rerouting /invocations -> %s", target)
        ct = (content_type or "").lower()
        if target in FORM_DATA_ROUTES and "json" in ct:
            try:
                data = json.loads(body)
            except (json.JSONDecodeError, UnicodeDecodeError):
                data = None
            # Only JSON objects can be mapped onto form fields; anything else
            # is forwarded as received.
            if isinstance(data, dict):
                boundary = uuid.uuid4().hex
                body = _build_multipart_body(data, boundary)
                content_type = f"multipart/form-data; boundary={boundary}"
                logger.info("Converted JSON to form-data for %s", target)
    else:
        target = _default_path_for_invocation(content_type or "", body)
        logger.info("Inferred /invocations -> %s", target)

    url = f"{BACKEND}{target}"
    fwd_headers = _forward_request_headers(request, len(body), content_type)

    # The client cannot be used as a context manager here: it has to outlive
    # this function so the response body can be streamed by stream_body().
    client = httpx.AsyncClient(timeout=httpx.Timeout(600.0, connect=30.0))
    try:
        req = client.build_request("POST", url, headers=fwd_headers, content=body)
        r = await client.send(req, stream=True)
    except httpx.RequestError as e:
        await client.aclose()
        logger.exception("Upstream request failed: %s", e)
        return Response(
            status_code=502,
            content=json.dumps({"error": "upstream_error"}).encode(),
            media_type="application/json",
        )
    except BaseException:
        # Not every failure is a RequestError (e.g. httpx.InvalidURL for a
        # malformed route, or task cancellation). Close the client before
        # propagating so its connection pool is not leaked.
        await client.aclose()
        raise

    async def stream_body() -> AsyncIterator[bytes]:
        """Yield upstream chunks, then release the response and the client."""
        try:
            async for chunk in r.aiter_bytes():
                yield chunk
        finally:
            # Close the client even if closing the response itself fails.
            try:
                await r.aclose()
            finally:
                await client.aclose()

    return StreamingResponse(
        stream_body(),
        status_code=r.status_code,
        headers=_response_headers_from_httpx(r),
        media_type=r.headers.get("content-type"),
    )


# SageMaker only ever calls these two endpoints on the container.
routes = [
    Route("/ping", ping, methods=["GET"]),
    Route("/invocations", invocations, methods=["POST"]),
]

app = Starlette(routes=routes)

# force=True replaces any handlers installed before this module was imported.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    force=True,
)
#!/bin/bash
set -euo pipefail

# Run the telemetry hook on a best-effort basis: a missing script or a failing
# run is silenced and must never prevent the server from starting.
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# Source CUDA compat for older drivers (e.g., g5 instances)
if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then
    source /usr/local/bin/start_cuda_compat.sh
fi

# SageMaker sends traffic to port 8080 on /ping and /invocations. llama-server
# listens on a loopback-only port; a small Python proxy (llamacpp_sagemaker_serve)
# binds 8080 and forwards to llama-server, similar to vLLM-Omni middleware.
INTERNAL_HOST="${LLAMACPP_SAGEMAKER_INTERNAL_HOST:-127.0.0.1}"
INTERNAL_PORT="${LLAMACPP_SAGEMAKER_INTERNAL_PORT:-8081}"
PROXY_PORT="${LLAMACPP_SAGEMAKER_PROXY_PORT:-8080}"
export LLAMACPP_SAGEMAKER_BACKEND_URL="${LLAMACPP_SAGEMAKER_BACKEND_URL:-http://${INTERNAL_HOST}:${INTERNAL_PORT}}"
# Seconds to wait for llama-server to report healthy before giving up.
STARTUP_TIMEOUT="${LLAMACPP_SAGEMAKER_STARTUP_TIMEOUT:-120}"

PREFIX="SM_LLAMACPP_"
ARG_PREFIX="--"

ARGS=()

# Translate SM_LLAMACPP_FOO_BAR=value into "--foo-bar value". A variable with
# an empty value becomes a bare flag.
while IFS='=' read -r key value; do
    arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

    # --host / --port are owned by this script so inference stays on the
    # internal bind. Filtering by name here (instead of skipping "the next
    # argument" afterwards) keeps an empty-valued SM_LLAMACPP_HOST/PORT from
    # swallowing the unrelated flag that follows it.
    if [ "$arg_name" = "host" ] || [ "$arg_name" = "port" ]; then
        continue
    fi

    ARGS+=("${ARG_PREFIX}${arg_name}")
    if [ -n "$value" ]; then
        ARGS+=("$value")
    fi
done < <(env | grep "^${PREFIX}" || true)

ARGS+=(--host "$INTERNAL_HOST" --port "$INTERNAL_PORT")

echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2

/app/llama-server "${ARGS[@]}" &
LLAMA_PID=$!

# Poll the llama-server health endpoint once per second until it answers, the
# process exits, or STARTUP_TIMEOUT seconds have passed.
wait_for_llama() {
    local i
    for ((i = 0; i < STARTUP_TIMEOUT; i++)); do
        if curl -sf "http://${INTERNAL_HOST}:${INTERNAL_PORT}/health" >/dev/null 2>&1; then
            return 0
        fi
        # Fail fast when llama-server has already exited (e.g. bad arguments).
        if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
            return 1
        fi
        sleep 1
    done
    return 1
}

if ! wait_for_llama; then
    echo "[sagemaker] llama-server did not become healthy on ${INTERNAL_HOST}:${INTERNAL_PORT}" >&2
    kill -TERM "$LLAMA_PID" 2>/dev/null || true
    wait "$LLAMA_PID" 2>/dev/null || true
    exit 1
fi

# Initialised before the trap is installed so shutdown() is safe under `set -u`
# even if a signal arrives before uvicorn has been started.
UVICORN_PID=""

# Terminate both child processes and reap them; safe to call more than once.
shutdown() {
    if [ -n "$UVICORN_PID" ]; then
        kill -TERM "$UVICORN_PID" 2>/dev/null || true
    fi
    kill -TERM "$LLAMA_PID" 2>/dev/null || true
    if [ -n "$UVICORN_PID" ]; then
        wait "$UVICORN_PID" 2>/dev/null || true
    fi
    wait "$LLAMA_PID" 2>/dev/null || true
}

trap shutdown SIGTERM SIGINT

if [ -n "${PYTHONPATH:-}" ]; then
    export PYTHONPATH="${PYTHONPATH}:/usr/local/lib/llamacpp_sagemaker"
else
    export PYTHONPATH="/usr/local/lib/llamacpp_sagemaker"
fi
python3 -m uvicorn llamacpp_sagemaker_serve:app --host 0.0.0.0 --port "$PROXY_PORT" --log-level info &
UVICORN_PID=$!

# `wait` returns the child's exit status; the `||` keeps `set -e` from aborting
# the script on a non-zero status before shutdown has run.
exit_code=0
wait "$UVICORN_PID" || exit_code=$?
shutdown
exit "$exit_code"
#!/bin/bash

# Return 0 only when version $1 sorts strictly before version $2 in
# `sort -V` order. Despite the name, equal versions return 1.
verlte() {
    if [ "$1" = "$2" ]; then
        return 1
    fi
    [ "$1" = "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f "$COMPAT_FILE" ]; then
    # The link target's name carries the version from its third dot-separated
    # field onwards.
    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink "$COMPAT_FILE" | cut -d'.' -f 3-)
    echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
    if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
        NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
    fi
    echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
    if [ -z "$NVIDIA_DRIVER_VERSION" ] || [ -z "$CUDA_COMPAT_MAX_DRIVER_VERSION" ]; then
        # Without both versions there is nothing to compare. Calling verlte
        # with a missing argument would also abort a caller running `set -u`.
        echo "Skipping CUDA compat setup as the driver version could not be determined"
    elif verlte "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
        echo "Adding CUDA compat to LD_LIBRARY_PATH"
        # Append the previous value only when there is one: this is safe under
        # `set -u` and avoids a trailing ':' entry in the search path.
        export LD_LIBRARY_PATH="/usr/local/cuda/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
        echo "$LD_LIBRARY_PATH"
    else
        echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
    fi
else
    echo "Skipping CUDA compat setup as package not found"
fi
os_version: &OS_VERSION ubuntu24.04 - transformers_version: &TRANSFORMERS_VERSION 4.57.3 - tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] - target: sagemaker - build: true - enable_common_stage_build: false - test_configs: - test_platforms: - - sanity - - security - - sagemaker - BuildHuggingFaceLlamacppGpuCu130DockerImage: <<: *BUILD_REPOSITORY context: @@ -59,6 +42,8 @@ images: device_type: &DEVICE_TYPE gpu cuda_version: &CUDA_VERSION cu130 os_version: &OS_VERSION ubuntu24.04 + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 transformers_version: &TRANSFORMERS_VERSION 4.57.3 tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] @@ -78,6 +63,8 @@ images: image_size_baseline: 40000 device_type: &DEVICE_TYPE cpu os_version: &OS_VERSION ubuntu24.04 + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 transformers_version: &TRANSFORMERS_VERSION 4.57.3 tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ] diff --git a/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu b/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu deleted file mode 100644 index 6e37bac2e95e..000000000000 --- a/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu +++ /dev/null @@ -1,38 +0,0 @@ -FROM ghcr.io/ggml-org/llama.cpp:server-cuda12-b8672 as base - -LABEL maintainer="Amazon AI" -LABEL dlc_major_version="1" - -ARG HUGGINGFACE_HUB_VERSION=1.9.0 -ARG HF_XET_VERSION=1.2.0 - -WORKDIR / - -COPY deep_learning_container.py 
/usr/local/bin/deep_learning_container.py -COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh - -# ====================== ec2 ========================================= -FROM base AS llamacpp-ec2 - -RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \ - apt-get update && \ - apt-get upgrade -y && \ - apt-get clean - -COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh -RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh - -ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"] - -# ====================== sagemaker ========================================= -FROM base AS llamacpp-sagemaker - -RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \ - apt-get update && \ - apt-get upgrade -y && \ - apt-get clean - -COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh -RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh - -ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] diff --git a/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu deleted file mode 100644 index e1cc3efa6e9a..000000000000 --- a/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu +++ /dev/null @@ -1,42 +0,0 @@ -FROM ghcr.io/ggml-org/llama.cpp:server-cuda13-b8672 as base - -LABEL maintainer="Amazon AI" -LABEL dlc_major_version="1" - -ENV DEBIAN_FRONTEND=noninteractive \ - LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 \ - DLC_CONTAINER_TYPE=base \ - LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \ - PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" - -WORKDIR / - -COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py -COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh - -# ====================== ec2 ========================================= -FROM base AS llamacpp-ec2 - -RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print 
# Base stage: upstream llama.cpp server image, tag server-cuda13-b8882.
FROM ghcr.io/ggml-org/llama.cpp:server-cuda13-b8882 AS base

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

# SageMaker stage: layers the Python HTTP dependencies, helper scripts and
# the SageMaker entrypoint on top of the base stage.
FROM base AS sagemaker

WORKDIR /app
# Prepend /app to the shared-library search path.
# NOTE(review): presumably the llama.cpp shared libraries live in /app in the
# upstream image - confirm.
ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH}

# Install OS packages (CA certificates, curl, Python 3 + pip) and the Python
# packages httpx, starlette and uvicorn, then drop the apt package lists so
# they are not kept in the layer.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    python3 \
    python3-pip \
    && pip3 install --no-cache-dir --break-system-packages \
    "httpx>=0.27,<1" \
    "starlette>=0.37,<1" \
    "uvicorn[standard]>=0.27,<1" \
    && rm -rf /var/lib/apt/lists/*

# Helper scripts and the container entrypoint.
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
# Python module copied into its own directory under /usr/local/lib.
RUN mkdir -p /usr/local/lib/llamacpp_sagemaker
COPY llamacpp_sagemaker_serve.py /usr/local/lib/llamacpp_sagemaker/llamacpp_sagemaker_serve.py
# Make the two shell scripts executable.
RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh \
    && chmod +x /usr/local/bin/start_cuda_compat.sh

ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
# Regions in which tests that request an "ml.p4*" instance type are skipped
# (consulted by skip_gpu_instance_restricted_regions).
NO_P4_REGIONS = [
    "af-south-1",
    "ap-east-1",
    "ap-northeast-3",
    "ap-southeast-1",
    "ap-southeast-2",
    "ap-south-1",
    "ca-central-1",
    "eu-central-1",
    "eu-north-1",
    "eu-west-2",
    "eu-west-3",
    "eu-south-1",
    "me-south-1",
    "sa-east-1",
    "us-west-1",
    "cn-northwest-1",
    "il-central-1",
]

# Regions in which tests that request an "ml.g5*" instance type are skipped
# (consulted by skip_gpu_instance_restricted_regions).
NO_G5_REGIONS = [
    "us-west-1",
    "ca-west-1",
    # Was misspelled "mx-cental-1", which can never equal a real region name.
    "mx-central-1",
    "af-south-1",
    "ap-east-1",
    "ap-south-2",
    "ap-southeast-5",
    "ap-southeast-4",
    "ap-northeast-3",
    "ap-southeast-1",
    "ap-southeast-7",
    "eu-south-1",
    "eu-west-3",
    "eu-south-2",
    "eu-central-2",
    "me-south-1",
]


logger = logging.getLogger(__name__)
# Cap the verbosity of the AWS SDK / HTTP loggers at INFO.
logging.getLogger("boto").setLevel(logging.INFO)
logging.getLogger("boto3").setLevel(logging.INFO)
logging.getLogger("botocore").setLevel(logging.INFO)
logging.getLogger("factory.py").setLevel(logging.INFO)
logging.getLogger("auth.py").setLevel(logging.INFO)
logging.getLogger("connectionpool.py").setLevel(logging.INFO)


# Directory containing this conftest; used to locate sibling resources.
dir_path = os.path.dirname(os.path.realpath(__file__))


def pytest_addoption(parser):
    """Register the command-line options understood by this test suite.

    :param parser: pytest option parser supplied by the ``pytest_addoption`` hook.
    """
    parser.addoption("--build-image", "-D", action="store_true")
    parser.addoption("--build-base-image", "-B", action="store_true")
    parser.addoption("--aws-id")
    parser.addoption("--instance-type")
    parser.addoption("--accelerator-type", default=None)
    # This suite tests the llama.cpp images, so default to that framework
    # (the previous default was a leftover from the SGLang conftest).
    parser.addoption("--docker-base-name", default="huggingface_llamacpp")
    parser.addoption("--region", default="us-west-2")
    parser.addoption("--framework-version", default="")
    parser.addoption(
        "--py-version",
        choices=["2", "3", "37", "38", "39", "310", "311", "312"],
        default=str(sys.version_info.major),
    )
    # Processor is still "cpu" for EIA tests
    parser.addoption(
        "--processor", choices=["gpu", "cpu", "eia", "neuron", "neuronx"], default="cpu"
    )
    # If not specified, will default to {framework-version}-{processor}-py{py-version}
    parser.addoption("--tag", default=None)
    parser.addoption(
        "--generate-coverage-doc",
        default=False,
        action="store_true",
        help="use this option to generate test coverage doc",
    )
    parser.addoption(
        "--efa",
        action="store_true",
        default=False,
        help="Run only efa tests",
    )
    parser.addoption("--sagemaker-regions", default="us-west-2")


def pytest_configure(config):
    """Register the custom ``efa`` marker with pytest.

    :param config: pytest config object supplied by the ``pytest_configure`` hook.
    """
    config.addinivalue_line("markers", "efa(): explicitly mark to run efa tests")


def pytest_runtest_setup(item):
    """Skip tests without the ``efa`` marker when ``--efa`` was requested.

    :param item: the test item about to be set up.
    """
    if item.config.getoption("--efa"):
        if not list(item.iter_markers(name="efa")):
            pytest.skip("Skipping non-efa tests")


def pytest_collection_modifyitems(session, config, items):
    """Record each test's ``team`` marker and optionally emit the coverage doc.

    :param session: pytest session (unused).
    :param config: pytest config object; ``--generate-coverage-doc`` is read from it.
    :param items: collected test items; their ``user_properties`` lists are
        extended in place with ``("team_marker", <team>)`` tuples.
    """
    for item in items:
        print(f"item {item}")
        for marker in item.iter_markers(name="team"):
            print(f"item {marker}")
            team_name = marker.args[0]
            item.user_properties.append(("team_marker", team_name))
        print(f"item.user_properties {item.user_properties}")

    if config.getoption("--generate-coverage-doc"):
        # Imported lazily so the reporting helper is only needed when the
        # coverage document is actually requested.
        from test.test_utils.test_reporting import TestReportGenerator

        report_generator = TestReportGenerator(items, is_sagemaker=True)
        # Report under the llama.cpp framework (was "huggingface_sglang",
        # copied over from the SGLang suite).
        report_generator.generate_coverage_doc(
            framework="huggingface_llamacpp", job_type="inference"
        )
@pytest.fixture(autouse=True)
def skip_by_device_type(request, use_gpu, instance_type, accelerator_type):
    """Skip the current test when its device marker does not match the target instance.

    The test's ``neuron_test`` / ``neuronx_test`` / ``gpu_test`` / ``cpu_test`` /
    ``eia_test`` markers are compared with what is derived from ``use_gpu``,
    ``instance_type`` and ``accelerator_type``; the first mismatching case in
    the chain below triggers ``pytest.skip``.

    :param request: pytest request object for the running test
    :param use_gpu: bool True when the processor option is "gpu"
    :param instance_type: str instance type under test (e.g. "ml.g5.xlarge", "local")
    :param accelerator_type: accelerator type option, or None when not provided
    """
    # Fourth character of the instance type, e.g. the "g" in "ml.g5.xlarge".
    # NOTE(review): indexing raises IndexError for values shorter than four
    # characters - confirm such values can never be passed.
    is_gpu = use_gpu or instance_type[3] in ["g", "p"]
    is_eia = accelerator_type is not None
    is_neuron = instance_type.startswith("ml.inf1")
    is_neuronx = instance_type.startswith("ml.inf2") or instance_type.startswith("ml.trn1")

    # Separate out cases for clearer logic.
    # When running Neuron test, skip CPU and GPU test.
    if request.node.get_closest_marker("neuron_test") and not is_neuron:
        pytest.skip("Skipping because running on '{}' instance".format(instance_type))
    elif request.node.get_closest_marker("neuronx_test") and not is_neuronx:
        pytest.skip("Skipping because running on '{}' instance".format(instance_type))

    # When running GPU test, skip CPU and neuron test. When running CPU test, skip GPU and neuron test.
    elif (request.node.get_closest_marker("gpu_test") and not is_gpu) or (
        request.node.get_closest_marker("cpu_test") and (is_gpu or is_neuron or is_neuronx)
    ):
        pytest.skip("Skipping because running on '{}' instance".format(instance_type))

    # When running EIA test, skip the CPU, GPU and Neuron functions
    elif (
        request.node.get_closest_marker("neuron_test")
        or request.node.get_closest_marker("gpu_test")
        or request.node.get_closest_marker("cpu_test")
    ) and is_eia:
        pytest.skip("Skipping because running on '{}' instance".format(instance_type))

    # When running CPU or GPU or Neuron test, skip EIA test.
    elif request.node.get_closest_marker("eia_test") and not is_eia:
        pytest.skip("Skipping because running on '{}' instance".format(instance_type))
@pytest.fixture(autouse=True)
def skip_test_successfully_executed_before(request):
    """
    Intended to skip tests that already passed in a previous attempt for this commit.

    "cache/lastfailed" contains information about failed tests only. We're running SM tests in separate threads for each image.
    So when we retry SM tests, successfully executed tests are executed again because pytest doesn't have that info in /.cache.
    But the flag "--last-failed-no-failures all" requires pytest to execute all the available tests.
    The only sign that a test passed last time is that the lastfailed file exists and the test name isn't in that file.
    The method checks whether the lastfailed file exists and the test name is not in it.

    NOTE(review): the skip itself is commented out below, so this fixture
    currently only reads the cache entry and never skips anything - confirm
    whether it should be re-enabled.
    """
    test_name = request.node.name
    # None when pytest's cache has no "cache/lastfailed" entry.
    lastfailed = request.config.cache.get("cache/lastfailed", None)

    # if lastfailed is not None and not any(
    #     test_name in failed_test_name for failed_test_name in lastfailed.keys()
    # ):
    #     pytest.skip(f"Skipping {test_name} because it was successfully executed for this commit")
# Path to test resources
resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources"))

# Model artifacts for local mode tests - downloaded from HuggingFace Hub at runtime
MODEL_ID = "unsloth/Qwen3.5-0.8B-GGUF"
model_dir = os.path.join(resources_path, "qwen3.5-0.8b")
model_data = "qwen3.5-0.8b.tar.gz"
model_data_path = os.path.join(model_dir, model_data)


def ensure_model_downloaded():
    """Download the model from HuggingFace Hub and package it as a tarball.

    If the tarball already exists it is reused as-is. Otherwise the repository
    snapshot is downloaded into a scratch directory, archived, and the scratch
    directory is removed.

    The archive is first written to a ``.partial`` file and moved into place
    with ``os.replace`` only once it is complete, so an interrupted run can
    never leave a truncated tarball that later runs would mistake for a
    finished one.

    :return: str path of the ``.tar.gz`` model archive
    """
    if os.path.exists(model_data_path):
        return model_data_path

    # Imported lazily so this module can be imported without huggingface_hub.
    from huggingface_hub import snapshot_download

    os.makedirs(model_dir, exist_ok=True)
    local_model_dir = os.path.join(model_dir, "model")
    partial_path = model_data_path + ".partial"

    print(f"Downloading {MODEL_ID} from HuggingFace Hub...")
    # NOTE(review): only "*.onnx" files are excluded, so every other file in
    # the repository is downloaded - confirm whether restricting the download
    # to a single GGUF file is intended.
    snapshot_download(
        repo_id=MODEL_ID, local_dir=local_model_dir, ignore_patterns=["*.onnx"]
    )

    # Remove cache folder if present
    cache_dir = os.path.join(local_model_dir, ".cache")
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)

    print(f"Creating tarball {model_data}...")
    try:
        with tarfile.open(partial_path, "w:gz") as tar:
            for item in os.listdir(local_model_dir):
                tar.add(os.path.join(local_model_dir, item), arcname=item)
        # Publish the finished archive in one step.
        os.replace(partial_path, model_data_path)
    finally:
        # Only still present when archiving failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    # Clean up extracted model
    shutil.rmtree(local_model_dir)

    print(f"Model ready at {model_data_path}")
    return model_data_path


# Role for local mode (not used but required by SageMaker SDK)
ROLE = "dummy/unused-role"
DEFAULT_TIMEOUT = 45

# Llama.cpp SageMaker images listen on port 8080 with a small HTTP shim (/ping,
# /invocations) that proxies to llama-server on loopback (see llamacpp_sagemaker_serve).
# Do not set SM_LLAMACPP_HOST or SM_LLAMACPP_PORT expecting external access to
# llama-server; the entrypoint pins the server to localhost and exposes the shim on 8080.
class NoLogStreamFoundError(Exception):
    """Raised when no "AllTraffic" log stream is found for an endpoint."""


class SageMakerEndpointFailure(Exception):
    """Raised to report an endpoint failure, with its CloudWatch events when available."""


def dump_logs_from_cloudwatch(e, region="us-west-2"):
    """
    Surface the CloudWatch logs of a failed endpoint while handling an error.

    The endpoint name is extracted from the text of ``e``; when no endpoint
    name can be found nothing happens and None is returned. A missing log
    group is handled gracefully.

    :param e: Exception whose message names the failed endpoint
    :param region: str AWS region of the CloudWatch Logs client
    :raises NoLogStreamFoundError: no log stream starting with "AllTraffic" exists
    :raises SageMakerEndpointFailure: carries the fetched log events, or a
        notice that no logs are available when the log group does not exist
    """
    message = str(e)
    match = re.search(r"Error hosting endpoint ((\w|-)+):", message) or re.search(
        r"/aws/sagemaker/Endpoints/((\w|-)+)", message
    )
    if not match:
        return

    endpoint = match.group(1)
    group_name = f"/aws/sagemaker/Endpoints/{endpoint}"
    cloudwatch = boto3.client("logs", region_name=region)
    try:
        streams = cloudwatch.describe_log_streams(logGroupName=group_name).get("logStreams", [])
        stream_names = (stream.get("logStreamName") for stream in streams)
        # First stream whose name starts with "AllTraffic", or "" when none does.
        traffic_stream = next(
            (name for name in stream_names if name.startswith("AllTraffic")), ""
        )
        if not traffic_stream:
            raise NoLogStreamFoundError(
                f"Cannot find all traffic log streams for endpoint {endpoint}"
            ) from e
        events = cloudwatch.get_log_events(
            logGroupName=group_name, logStreamName=traffic_stream
        )
        raise SageMakerEndpointFailure(
            f"Error from endpoint {endpoint}:\n{json.dumps(events, indent=4)}"
        ) from e
    except cloudwatch.exceptions.ResourceNotFoundException:
        # Log group doesn't exist yet - endpoint may have failed before creating logs
        raise SageMakerEndpointFailure(
            f"Endpoint {endpoint} failed. No CloudWatch logs available yet."
        ) from e
@contextmanager
def _predictor(image, sagemaker_local_session, instance_type):
    """Deploy the Llama.cpp model locally and always tear the endpoint down.

    SageMaker extracts the model_data tar.gz to /opt/ml/model. The container
    entrypoint runs llama-server behind a SageMaker-compatible proxy on port
    8080 (/ping, /invocations -> OpenAI routes on llama-server).

    Yields the deployed predictor; the endpoint is deleted on exit if the
    deployment succeeded.
    """
    # Fetch the model archive from HuggingFace Hub unless it is already there.
    archive_path = ensure_model_downloaded()

    local_model = Model(
        model_data=f"file://{archive_path}",
        role=ROLE,
        image_uri=image,
        env={"SM_LLAMACPP_MODEL": "/opt/ml/model"},
        sagemaker_session=sagemaker_local_session,
        predictor_cls=Predictor,
    )

    with local_mode_utils.lock():
        endpoint = None
        try:
            endpoint = local_model.deploy(1, instance_type)
            yield endpoint
        finally:
            # Nothing to delete when deploy() itself raised.
            if endpoint is not None:
                endpoint.delete_endpoint()
def _assert_llamacpp_chat_prediction_explicit_route(predictor):
    """Chat-completion check that pins the target path through CustomAttributes (route=)."""
    predictor.serializer = JSONSerializer()
    predictor.deserializer = JSONDeserializer()

    user_message = {"role": "user", "content": "Say hello in one word."}
    payload = {
        "messages": [user_message],
        "max_tokens": 16,
        "temperature": 0.3,
    }
    response = predictor.predict(payload, custom_attributes="route=/v1/chat/completions")

    assert response is not None
    assert "choices" in response
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py new file mode 100644 index 000000000000..f00668e50844 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py @@ -0,0 +1,116 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import json +import logging + +import pytest +import sagemaker +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer +from sagemaker.deserializers import JSONDeserializer + +from ...integration import dump_logs_from_cloudwatch +from ...integration.sagemaker.timeout import timeout_and_delete_endpoint +from ..... 
import invoke_sm_endpoint_helper_function + +LOGGER = logging.getLogger(__name__) + + +@pytest.mark.model("qwen3.5-0.8b") +@pytest.mark.processor("gpu") +@pytest.mark.gpu_test +@pytest.mark.team("sagemaker-1p-algorithms") +def test_llamacpp_qwen(framework_version, ecr_image, instance_type, sagemaker_regions): + invoke_sm_endpoint_helper_function( + ecr_image=ecr_image, + sagemaker_regions=sagemaker_regions, + test_function=_test_llamacpp_model, + dump_logs_from_cloudwatch=dump_logs_from_cloudwatch, + framework_version=framework_version, + instance_type=instance_type, + model_id="unsloth/Qwen3.5-0.8B-GGUF", + ) + + +def _test_llamacpp_model( + image_uri, + sagemaker_session, + instance_type, + model_id, + framework_version=None, + **kwargs, +): + """Test Llama.cpp model deployment and inference using OpenAI-compatible API format + + Uses sagemaker.model.Model for SDK v3 compatibility instead of HuggingFaceModel. + + Args: + image_uri: ECR image URI + sagemaker_session: SageMaker session + instance_type: ML instance type + model_id: HuggingFace model ID + framework_version: Optional version info + **kwargs: Additional args from helper (boto_session, sagemaker_client, etc.) 
+ """ + endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-hf-llamacpp-serving") + + env = { + "SM_LLAMACPP_MODEL": model_id, + } + + model = Model( + name=endpoint_name, + image_uri=image_uri, + role="SageMakerRole", + env=env, + sagemaker_session=sagemaker_session, + predictor_cls=Predictor, + ) + + with timeout_and_delete_endpoint(endpoint_name, sagemaker_session, minutes=45): + predictor = model.deploy( + initial_instance_count=1, + instance_type=instance_type, + endpoint_name=endpoint_name, + container_startup_health_check_timeout=1800, + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + ) + + predictor.serializer = JSONSerializer() + predictor.deserializer = JSONDeserializer() + + # Llama.cpp SageMaker uses OpenAI-compatible chat completions API format + data = { + "messages": [{"role": "user", "content": "What is Deep Learning?"}], + "max_tokens": 50, + "temperature": 0.7, + } + + LOGGER.info(f"Running inference with data: {data}") + output = predictor.predict(data) + LOGGER.info(f"Output: {json.dumps(output)}") + + assert output is not None + assert "choices" in output + + # Explicit route= mirrors vLLM-Omni-style CustomAttributes routing in the container proxy. + output_routed = predictor.predict( + data, + custom_attributes="route=/v1/chat/completions", + ) + LOGGER.info(f"Output (routed): {json.dumps(output_routed)}") + assert output_routed is not None + assert "choices" in output_routed diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py new file mode 100644 index 000000000000..1d13878031f7 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py @@ -0,0 +1,66 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. 
A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import +import signal +from contextlib import contextmanager +import logging + +from botocore.exceptions import ClientError + +LOGGER = logging.getLogger("timeout") + + +class TimeoutError(Exception): + pass + + +@contextmanager +def timeout(seconds=0, minutes=0, hours=0): + """Add a signal-based timeout to any block of code. + If multiple time units are specified, they will be added together to determine time limit. + Usage: + with timeout(seconds=5): + my_slow_function(...) + Args: + - seconds: The time limit, in seconds. + - minutes: The time limit, in minutes. + - hours: The time limit, in hours. + """ + + limit = seconds + 60 * minutes + 3600 * hours + + def handler(signum, frame): + raise TimeoutError("timed out after {} seconds".format(limit)) + + try: + signal.signal(signal.SIGALRM, handler) + signal.alarm(limit) + + yield + finally: + signal.alarm(0) + + +@contextmanager +def timeout_and_delete_endpoint(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0): + with timeout(seconds=seconds, minutes=minutes, hours=hours) as t: + try: + yield [t] + finally: + try: + sagemaker_session.delete_endpoint(endpoint_name) + LOGGER.info("deleted endpoint {}".format(endpoint_name)) + except ClientError as ce: + if ce.response["Error"]["Code"] == "ValidationException": + # avoids the inner exception to be overwritten + pass diff --git a/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes b/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes new file mode 100644 index 000000000000..6ba5bc1386f8 --- /dev/null +++ 
b/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes @@ -0,0 +1,61 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +mmproj-BF16.gguf filter=lfs diff=lfs merge=lfs -text +mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text +mmproj-F32.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-UD-IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-UD-Q2_K_XL.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q4_K_M.gguf filter=lfs diff=lfs 
merge=lfs -text +Qwen3.5-0.8B-UD-Q5_K_XL.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-UD-Q4_K_XL.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-UD-IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-UD-IQ2_XXS.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q4_1.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-UD-Q6_K_XL.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-UD-Q8_K_XL.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text +imatrix_unsloth.gguf_file filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-BF16.gguf filter=lfs diff=lfs merge=lfs -text +Qwen3.5-0.8B-UD-Q3_K_XL.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py new file mode 100644 index 000000000000..6932ed1abd5b --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. 
This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import boto3 +import botocore + + +def _botocore_resolver(): + """ + Get the DNS suffix for the given region. + :return: endpoint object + """ + loader = botocore.loaders.create_loader() + return botocore.regions.EndpointResolver(loader.load_data("endpoints")) + + +def get_ecr_registry(account, region): + """ + Get prefix of ECR image URI + :param account: Account ID + :param region: region where ECR repo exists + :return: AWS ECR registry + """ + endpoint_data = _botocore_resolver().construct_endpoint("ecr", region) + return "{}.dkr.{}".format(account, endpoint_data["hostname"]) diff --git a/test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py b/test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py new file mode 100644 index 000000000000..3421e6ce2b42 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py @@ -0,0 +1,67 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import subprocess +import sys + +CYAN_COLOR = "\033[36m" +END_COLOR = "\033[0m" + + +def build_base_image( + framework_name, framework_version, py_version, processor, base_image_tag, cwd="." 
+): + base_image_uri = get_base_image_uri(framework_name, base_image_tag) + + dockerfile_location = os.path.join( + "docker", framework_version, "base", "Dockerfile.{}".format(processor) + ) + + subprocess.check_call( + [ + "docker", + "build", + "-t", + base_image_uri, + "-f", + dockerfile_location, + "--build-arg", + "py_version={}".format(py_version[-1]), + cwd, + ], + cwd=cwd, + ) + print("created image {}".format(base_image_uri)) + return base_image_uri + + +def get_base_image_uri(framework_name, base_image_tag): + return "{}-base:{}".format(framework_name, base_image_tag) + + +def get_image_uri(framework_name, tag): + return "{}:{}".format(framework_name, tag) + + +def _check_call(cmd, *popenargs, **kwargs): + if isinstance(cmd, str): + cmd = cmd.split(" ") + _print_cmd(cmd) + subprocess.check_call(cmd, *popenargs, **kwargs) + + +def _print_cmd(cmd): + print("executing docker command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR)) + sys.stdout.flush() diff --git a/test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py b/test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py new file mode 100644 index 000000000000..fa6b3cf00c36 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py @@ -0,0 +1,46 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import + +from contextlib import contextmanager +import fcntl +import os +import tarfile +import time + +from ..integration import resources_path + +LOCK_PATH = os.path.join(resources_path, "local_mode_lock") + + +@contextmanager +def lock(): + # Since Local Mode uses the same port for serving, we need a lock in order + # to allow concurrent test execution. + local_mode_lock_fd = open(LOCK_PATH, "w") + local_mode_lock = local_mode_lock_fd.fileno() + + fcntl.lockf(local_mode_lock, fcntl.LOCK_EX) + + try: + yield + finally: + time.sleep(5) + fcntl.lockf(local_mode_lock, fcntl.LOCK_UN) + + +def assert_files_exist(output_path, directory_file_map): + for directory, files in directory_file_map.items(): + with tarfile.open(os.path.join(output_path, "{}.tar.gz".format(directory))) as tar: + for f in files: + tar.getmember(f) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 51ca9a276922..571b9fb26ed3 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -2082,6 +2082,7 @@ def get_job_type_from_image(image_uri): "base": "general", "vllm": "general", "sglang": "general", + "llamacpp": "general", } for key, job_type in job_type_mapping.items(): From d2cc69f4278ca986199522e252abbd02dcb44843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Mon, 27 Apr 2026 09:34:51 +0000 Subject: [PATCH 03/14] Update Docker base name and coverage report framework for HuggingFace LlamaCpp support --- test/sagemaker_tests/huggingface/llamacpp/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sagemaker_tests/huggingface/llamacpp/conftest.py b/test/sagemaker_tests/huggingface/llamacpp/conftest.py index cbf61a194072..57374310db49 100644 --- a/test/sagemaker_tests/huggingface/llamacpp/conftest.py +++ b/test/sagemaker_tests/huggingface/llamacpp/conftest.py @@ -87,7 +87,7 @@ def pytest_addoption(parser): parser.addoption("--aws-id") 
parser.addoption("--instance-type") parser.addoption("--accelerator-type", default=None) - parser.addoption("--docker-base-name", default="huggingface_sglang") + parser.addoption("--docker-base-name", default="huggingface_llamacpp") parser.addoption("--region", default="us-west-2") parser.addoption("--framework-version", default="") parser.addoption( @@ -140,7 +140,7 @@ def pytest_collection_modifyitems(session, config, items): from test.test_utils.test_reporting import TestReportGenerator report_generator = TestReportGenerator(items, is_sagemaker=True) - report_generator.generate_coverage_doc(framework="huggingface_sglang", job_type="inference") + report_generator.generate_coverage_doc(framework="huggingface_llamacpp", job_type="inference") @pytest.fixture(scope="session", name="docker_base_name") From 1afc8338d068717d5e75d04c8b5d74b641f7e374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Mon, 27 Apr 2026 16:18:45 +0000 Subject: [PATCH 04/14] Update local & sagemaker tests for llama.cpp DLC --- .../llamacpp/integration/__init__.py | 32 +++++++++++++++---- .../integration/local/test_serving.py | 2 +- .../{test_sglang.py => test_llamacpp.py} | 2 +- .../huggingface/llamacpp/requirements.txt | 29 +++++++++++++++++ 4 files changed, 57 insertions(+), 8 deletions(-) rename test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/{test_sglang.py => test_llamacpp.py} (98%) create mode 100644 test/sagemaker_tests/huggingface/llamacpp/requirements.txt diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py index 9befa612dc56..79a9cbb4223d 100644 --- a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py @@ -25,24 +25,44 @@ # Model artifacts for local mode tests - downloaded from HuggingFace Hub at runtime MODEL_ID = "unsloth/Qwen3.5-0.8B-GGUF" 
+MODEL_FILENAME = "Qwen3.5-0.8B-UD-IQ2_XXS.gguf" model_dir = os.path.join(resources_path, "qwen3.5-0.8b") model_data = "qwen3.5-0.8b.tar.gz" model_data_path = os.path.join(model_dir, model_data) +def _tar_contains_expected_model(tar_path): + if not os.path.exists(tar_path): + return False + try: + with tarfile.open(tar_path, "r:gz") as tar: + return any( + os.path.basename(member.name) == MODEL_FILENAME + for member in tar.getmembers() + if member.isfile() + ) + except tarfile.TarError: + return False + + def ensure_model_downloaded(): """Download model from HuggingFace Hub and create tarball if not already present.""" - if os.path.exists(model_data_path): + if _tar_contains_expected_model(model_data_path): return model_data_path - from huggingface_hub import snapshot_download + from huggingface_hub import hf_hub_download os.makedirs(model_dir, exist_ok=True) local_model_dir = os.path.join(model_dir, "model") - - print(f"Downloading {MODEL_ID} from HuggingFace Hub...") - snapshot_download( - repo_id=MODEL_ID, local_dir=local_model_dir, ignore_patterns=["*.onnx"] + if os.path.exists(local_model_dir): + shutil.rmtree(local_model_dir) + os.makedirs(local_model_dir, exist_ok=True) + + print(f"Downloading {MODEL_FILENAME} from {MODEL_ID} on HuggingFace Hub...") + hf_hub_download( + repo_id=MODEL_ID, + filename=MODEL_FILENAME, + local_dir=local_model_dir, ) # Remove cache folder if present diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py index a5f10f4de27f..68691d05b559 100644 --- a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py @@ -37,7 +37,7 @@ def _predictor(image, sagemaker_local_session, instance_type): model_data_path = ensure_model_downloaded() env = { - "SM_LLAMACPP_MODEL": "/opt/ml/model", + "SM_LLAMACPP_MODEL": 
"/opt/ml/model/Qwen3.5-0.8B-UD-IQ2_XXS.gguf", } model = Model( diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py similarity index 98% rename from test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py rename to test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py index f00668e50844..b22b32f27543 100644 --- a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py @@ -68,7 +68,7 @@ def _test_llamacpp_model( endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-hf-llamacpp-serving") env = { - "SM_LLAMACPP_MODEL": model_id, + "SM_LLAMACPP_HF_REPO": model_id, } model = Model( diff --git a/test/sagemaker_tests/huggingface/llamacpp/requirements.txt b/test/sagemaker_tests/huggingface/llamacpp/requirements.txt new file mode 100644 index 000000000000..890bbe499718 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/requirements.txt @@ -0,0 +1,29 @@ +boto3 +coverage +# Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local) +docker>=5,<=6.1.3 +flake8==3.7.7 +Flask==1.1.1 +mock +pytest==8.3.5 +pytest-cov +pytest-rerunfailures +pytest-xdist +PyYAML +protobuf>=3.20,<=3.20.2 +sagemaker>=2.237.0,<3 +six +requests<2.32.0 +requests_mock +Pillow +retrying==1.3.3 +urllib3>=1.26.8 +pluggy>=1.5,<2 +requests_mock +sagemaker-inference +tenacity +fabric +invoke +gitpython +toml +huggingface_hub From ecdd5a2474d8ce616e5e1f1f22e8997a6ea3c982 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Mon, 27 Apr 2026 16:22:20 +0000 Subject: [PATCH 05/14] Update dlc_developer_config.toml --- dlc_developer_config.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml 
index 08ad2014d650..585a65356d74 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -1,6 +1,6 @@ [dev] # Set to "huggingface", for example, if you are a huggingface developer. Default is "" -partner_developer = "" +partner_developer = "huggingface" # Please only set it to true if you are preparing an EI related PR # Do remember to revert it back to false before merging any PR (including EI dedicated PR) ei_mode = false @@ -37,7 +37,7 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["huggingface_llamacpp"] # By default we build both training and inference containers. Set true/false values to determine which to build. @@ -193,7 +193,7 @@ dlc-pr-huggingface-vllm = "" dlc-pr-huggingface-sglang = "" # Huggingface Llamacpp -dlc-pr-huggingface-llamacpp = "" +dlc-pr-huggingface-llamacpp = "/huggingface/llamacpp/buildspec.yml" # sglang dlc-pr-sglang = "" From 3f69b07d7d2b58b369a31ab82702e0f08ff18607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Mon, 27 Apr 2026 16:40:28 +0000 Subject: [PATCH 06/14] Disable training container build in dlc_developer_config.toml --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 585a65356d74..60593de299b7 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -41,7 +41,7 @@ build_frameworks = ["huggingface_llamacpp"] # By default we build both training and inference containers. 
Set true/false values to determine which to build. -build_training = true +build_training = false build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR From f5bcbc35232cc1cce3755d53562285891e22e3f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Wed, 29 Apr 2026 12:25:06 +0200 Subject: [PATCH 07/14] Refactor SageMaker integration for llama.cpp: replace Python proxy with custom llama-server build --- .../llamacpp_sagemaker_serve.py | 211 ------------------ .../llamacpp_sagemaker_server.patch | 133 +++++++++++ .../build_artifacts/sagemaker_entrypoint.sh | 62 +---- huggingface/llamacpp/buildspec.yml | 6 +- .../llamacpp/docker/b8882/Dockerfile.cpu | 74 ++++++ .../docker/b8882/cu130/Dockerfile.gpu | 83 +++++-- .../llamacpp/integration/__init__.py | 6 +- .../integration/local/test_serving.py | 6 +- .../integration/sagemaker/test_llamacpp.py | 2 +- 9 files changed, 293 insertions(+), 290 deletions(-) delete mode 100644 huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py create mode 100644 huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch diff --git a/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py deleted file mode 100644 index 38da145b33e0..000000000000 --- a/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 -"""SageMaker HTTP proxy for llama.cpp llama-server. - -SageMaker invokes POST /invocations and GET /ping on port 8080. llama-server -speaks OpenAI-style routes (e.g. /v1/chat/completions) and does not expose -/invocations. - -Behavior mirrors scripts/vllm/omni_sagemaker_serve.py routing: - -- GET /ping is proxied to GET {backend}/health. 
-- POST /invocations: if ``X-Amzn-SageMaker-Custom-Attributes`` contains - ``route=/some/path``, the request is forwarded to that path on llama-server. - Otherwise the target path is inferred from the JSON body (messages -> - /v1/chat/completions, prompt -> /v1/completions, input+model -> /v1/embeddings), - defaulting to /v1/chat/completions. - -For routes that require multipart/form-data (parity with vLLM-Omni), JSON bodies -are converted when ``route=`` targets those paths. - -Environment: - -- LLAMACPP_SAGEMAKER_BACKEND_URL: upstream base URL (default http://127.0.0.1:8081) -""" - -from __future__ import annotations - -import json -import logging -import os -import re -import uuid -from collections.abc import AsyncIterator - -import httpx -from starlette.applications import Starlette -from starlette.requests import Request -from starlette.responses import Response, StreamingResponse -from starlette.routing import Route - -logger = logging.getLogger("llamacpp_sagemaker") - -BACKEND = os.environ.get("LLAMACPP_SAGEMAKER_BACKEND_URL", "http://127.0.0.1:8081").rstrip("/") - -FORM_DATA_ROUTES = frozenset({"/v1/videos", "/v1/videos/sync"}) - -_HOP_BY_HOP = frozenset( - { - "connection", - "keep-alive", - "proxy-authenticate", - "proxy-authorization", - "te", - "trailers", - "transfer-encoding", - "upgrade", - "host", - "content-length", - } -) - -_RESP_DROP = frozenset({"transfer-encoding", "content-length", "connection"}) - - -def _parse_route_from_header(raw: str | None) -> str | None: - if not raw: - return None - m = re.search(r"route=(/[^\s,]+)", raw) - return m.group(1) if m else None - - -def _parse_route(request: Request) -> str | None: - h = request.headers - v = h.get("x-amzn-sagemaker-custom-attributes") - return _parse_route_from_header(v) - - -def _build_multipart_body(data: dict, boundary: str) -> bytes: - parts: list[str] = [] - for key, value in data.items(): - parts.append( - f'--{boundary}\r\nContent-Disposition: form-data; 
name="{key}"\r\n\r\n{value}\r\n' - ) - parts.append(f"--{boundary}--\r\n") - return "".join(parts).encode() - - -def _default_path_for_invocation(content_type: str, body: bytes) -> str: - ct = (content_type or "").lower() - if "json" not in ct: - return "/v1/chat/completions" - try: - data = json.loads(body) - except (json.JSONDecodeError, UnicodeDecodeError): - return "/v1/chat/completions" - if not isinstance(data, dict): - return "/v1/chat/completions" - if "messages" in data: - return "/v1/chat/completions" - if "prompt" in data: - return "/v1/completions" - if "input" in data and "model" in data: - return "/v1/embeddings" - return "/v1/chat/completions" - - -def _forward_request_headers(request: Request, body_len: int, content_type: str | None) -> dict[str, str]: - out: dict[str, str] = {} - for key, value in request.headers.items(): - lk = key.lower() - if lk in _HOP_BY_HOP or lk == "x-amzn-sagemaker-custom-attributes": - continue - out[key] = value - out["content-length"] = str(body_len) - if content_type is not None: - out["content-type"] = content_type - return out - - -def _response_headers_from_httpx(resp: httpx.Response) -> dict[str, str]: - h: dict[str, str] = {} - for key, value in resp.headers.items(): - lk = key.lower() - if lk in _RESP_DROP: - continue - h[key] = value - return h - - -async def ping(request: Request) -> Response: - url = f"{BACKEND}/health" - try: - async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=2.0)) as client: - r = await client.get(url) - except httpx.RequestError as e: - logger.warning("Backend health request failed: %s", e) - return Response(status_code=503, content=b'{"error":"backend_unavailable"}') - return Response( - status_code=r.status_code, - content=r.content, - headers=_response_headers_from_httpx(r), - ) - - -async def invocations(request: Request) -> Response: - if request.method != "POST": - return Response(status_code=405, content=b"Method Not Allowed") - - body = await request.body() - route = 
_parse_route(request) - content_type = request.headers.get("content-type") - - if route: - target = route - logger.info("Rerouting /invocations -> %s", target) - ct = (content_type or "").lower() - if target in FORM_DATA_ROUTES and "json" in ct: - try: - data = json.loads(body) - except (json.JSONDecodeError, UnicodeDecodeError): - data = None - if isinstance(data, dict): - boundary = uuid.uuid4().hex - body = _build_multipart_body(data, boundary) - content_type = f"multipart/form-data; boundary={boundary}" - logger.info("Converted JSON to form-data for %s", target) - else: - target = _default_path_for_invocation(content_type or "", body) - logger.info("Inferred /invocations -> %s", target) - - url = f"{BACKEND}{target}" - fwd_headers = _forward_request_headers(request, len(body), content_type) - - timeout = httpx.Timeout(600.0, connect=30.0) - client = httpx.AsyncClient(timeout=timeout) - try: - req = client.build_request("POST", url, headers=fwd_headers, content=body) - r = await client.send(req, stream=True) - except httpx.RequestError as e: - await client.aclose() - logger.exception("Upstream request failed: %s", e) - return Response(status_code=502, content=json.dumps({"error": "upstream_error"}).encode()) - - async def stream_body() -> AsyncIterator[bytes]: - try: - async for chunk in r.aiter_bytes(): - yield chunk - finally: - await r.aclose() - await client.aclose() - - return StreamingResponse( - stream_body(), - status_code=r.status_code, - headers=_response_headers_from_httpx(r), - media_type=r.headers.get("content-type"), - ) - - -routes = [ - Route("/ping", ping, methods=["GET"]), - Route("/invocations", invocations, methods=["POST"]), -] - -app = Starlette(routes=routes) - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s %(levelname)s %(name)s %(message)s", - force=True, -) diff --git a/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch new file 
mode 100644 index 000000000000..a8491c93a26a --- /dev/null +++ b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch @@ -0,0 +1,133 @@ +--- a/tools/server/server.cpp ++++ b/tools/server/server.cpp +@@ -11,7 +11,9 @@ + #include "llama.h" + #include "log.h" + ++#include + #include ++#include + #include + #include + #include +@@ -69,6 +71,81 @@ + } + return res; + }; ++} ++ ++static std::string sagemaker_header(const server_http_req & req, const std::string & name) { ++ for (const auto & h : req.headers) { ++ std::string key = h.first; ++ std::transform(key.begin(), key.end(), key.begin(), [](unsigned char c) { return std::tolower(c); }); ++ if (key == name) { ++ return h.second; ++ } ++ } ++ return ""; ++} ++ ++static std::string sagemaker_route_from_attrs(const server_http_req & req) { ++ const std::string attrs = sagemaker_header(req, "x-amzn-sagemaker-custom-attributes"); ++ const std::string key = "route="; ++ const size_t pos = attrs.find(key); ++ if (pos == std::string::npos) { ++ return ""; ++ } ++ const size_t start = pos + key.size(); ++ const size_t end = attrs.find_first_of(",; \t\r\n", start); ++ return attrs.substr(start, end == std::string::npos ? 
std::string::npos : end - start); ++} ++ ++static bool sagemaker_route_syntax_ok(const std::string & route) { ++ return !route.empty() && route[0] == '/' && route.find("..") == std::string::npos && ++ route.find("://") == std::string::npos && route.find('?') == std::string::npos && ++ route.find('#') == std::string::npos; ++} ++ ++static std::string sagemaker_default_route(const server_http_req & req) { ++ const json body = json::parse(req.body, nullptr, false); ++ if (body.is_object()) { ++ if (body.contains("messages")) { ++ return "/v1/chat/completions"; ++ } ++ if (body.contains("prompt")) { ++ return "/v1/completions"; ++ } ++ if (body.contains("input")) { ++ return "/v1/embeddings"; ++ } ++ } ++ return "/v1/chat/completions"; ++} ++ ++static server_http_res_ptr sagemaker_error(int status, const std::string & message) { ++ auto res = std::make_unique(); ++ res->status = status; ++ res->data = safe_json_to_str({ ++ { "error", { ++ { "code", status }, ++ { "message", message }, ++ { "type", "invalid_request_error" }, ++ } }, ++ }); ++ return res; ++} ++ ++static server_http_res_ptr sagemaker_invocations( ++ const server_http_req & req, ++ const std::map & routes) { ++ const std::string requested = sagemaker_route_from_attrs(req); ++ const std::string route = requested.empty() ? 
sagemaker_default_route(req) : requested; ++ if (!sagemaker_route_syntax_ok(route)) { ++ return sagemaker_error(400, "invalid SageMaker route: " + route); ++ } ++ const auto it = routes.find(route); ++ if (it == routes.end()) { ++ return sagemaker_error(400, "unsupported SageMaker route: " + route); ++ } ++ server_http_req routed_req = req; ++ routed_req.path = route; ++ return it->second(routed_req); + } + + int main(int argc, char ** argv) { +@@ -169,6 +246,38 @@ + ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload)); + } + ++ ++ const std::map sagemaker_routes = { ++ {"/props", routes.post_props}, ++ {"/completion", routes.post_completions}, ++ {"/completions", routes.post_completions}, ++ {"/v1/completions", routes.post_completions_oai}, ++ {"/chat/completions", routes.post_chat_completions}, ++ {"/v1/chat/completions", routes.post_chat_completions}, ++ {"/v1/responses", routes.post_responses_oai}, ++ {"/responses", routes.post_responses_oai}, ++ {"/v1/audio/transcriptions", routes.post_transcriptions_oai}, ++ {"/audio/transcriptions", routes.post_transcriptions_oai}, ++ {"/v1/messages", routes.post_anthropic_messages}, ++ {"/v1/messages/count_tokens", routes.post_anthropic_count_tokens}, ++ {"/infill", routes.post_infill}, ++ {"/embedding", routes.post_embeddings}, ++ {"/embeddings", routes.post_embeddings}, ++ {"/v1/embeddings", routes.post_embeddings_oai}, ++ {"/rerank", routes.post_rerank}, ++ {"/reranking", routes.post_rerank}, ++ {"/v1/rerank", routes.post_rerank}, ++ {"/v1/reranking", routes.post_rerank}, ++ {"/tokenize", routes.post_tokenize}, ++ {"/detokenize", routes.post_detokenize}, ++ {"/apply-template", routes.post_apply_template}, ++ {"/lora-adapters", routes.post_lora_adapters}, ++ }; ++ ++ ctx_http.get ("/ping", ex_wrapper(routes.get_health)); // SageMaker health endpoint ++ ctx_http.post("/invocations", ex_wrapper([&sagemaker_routes](const server_http_req & req) { ++ return sagemaker_invocations(req, 
sagemaker_routes); ++ })); + ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics)); diff --git a/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh b/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh index 7f55bf5967d8..1d06097ef569 100644 --- a/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh +++ b/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh @@ -6,17 +6,16 @@ set -euo pipefail bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true # Source CUDA compat for older drivers (e.g., g5 instances) -if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then +if [ -f /usr/local/bin/start_cuda_compat.sh ] \ + && command -v nvidia-smi >/dev/null 2>&1 \ + && command -v nvcc >/dev/null 2>&1; then source /usr/local/bin/start_cuda_compat.sh fi -# SageMaker sends traffic to port 8080 on /ping and /invocations. llama-server -# listens on a loopback-only port; a small Python proxy (llamacpp_sagemaker_serve) -# binds 8080 and forwards to llama-server, similar to vLLM-Omni middleware. -INTERNAL_HOST="${LLAMACPP_SAGEMAKER_INTERNAL_HOST:-127.0.0.1}" -INTERNAL_PORT="${LLAMACPP_SAGEMAKER_INTERNAL_PORT:-8081}" -PROXY_PORT="${LLAMACPP_SAGEMAKER_PROXY_PORT:-8080}" -export LLAMACPP_SAGEMAKER_BACKEND_URL="${LLAMACPP_SAGEMAKER_BACKEND_URL:-http://${INTERNAL_HOST}:${INTERNAL_PORT}}" +# SageMaker sends traffic to port 8080 on /ping and /invocations. The custom +# llama-server build handles those routes directly. 
+HOST="${LLAMACPP_SAGEMAKER_HOST:-0.0.0.0}" +PORT="${SAGEMAKER_BIND_TO_PORT:-${LLAMACPP_SAGEMAKER_PORT:-8080}}" PREFIX="SM_LLAMACPP_" ARG_PREFIX="--" @@ -32,7 +31,7 @@ while IFS='=' read -r key value; do fi done < <(env | grep "^${PREFIX}" || true) -# Drop any user-supplied --host / --port so inference stays on the internal bind. +# Drop any user-supplied --host / --port so SageMaker can always reach the server. normalized=() skip_next=0 for a in "${ARGS[@]}"; do @@ -47,49 +46,8 @@ for a in "${ARGS[@]}"; do normalized+=("$a") done ARGS=("${normalized[@]}") -ARGS+=(--host "$INTERNAL_HOST" --port "$INTERNAL_PORT") +ARGS+=(--host "$HOST" --port "$PORT") echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2 -/app/llama-server "${ARGS[@]}" & -LLAMA_PID=$! - -wait_for_llama() { - local i - for i in $(seq 1 120); do - if curl -sf "http://${INTERNAL_HOST}:${INTERNAL_PORT}/health" >/dev/null 2>&1; then - return 0 - fi - sleep 1 - done - return 1 -} - -if ! wait_for_llama; then - echo "[sagemaker] llama-server did not become healthy on ${INTERNAL_HOST}:${INTERNAL_PORT}" >&2 - kill -TERM "$LLAMA_PID" 2>/dev/null || true - wait "$LLAMA_PID" 2>/dev/null || true - exit 1 -fi - -shutdown() { - kill -TERM "$UVICORN_PID" 2>/dev/null || true - kill -TERM "$LLAMA_PID" 2>/dev/null || true - wait "$UVICORN_PID" 2>/dev/null || true - wait "$LLAMA_PID" 2>/dev/null || true -} - -trap shutdown SIGTERM SIGINT - -if [ -n "${PYTHONPATH:-}" ]; then - export PYTHONPATH="${PYTHONPATH}:/usr/local/lib/llamacpp_sagemaker" -else - export PYTHONPATH="/usr/local/lib/llamacpp_sagemaker" -fi -python3 -m uvicorn llamacpp_sagemaker_serve:app --host 0.0.0.0 --port "$PROXY_PORT" --log-level info & -UVICORN_PID=$! - -wait "$UVICORN_PID" -exit_code=$? 
-shutdown -exit "$exit_code" +exec /app/llama-server "${ARGS[@]}" diff --git a/huggingface/llamacpp/buildspec.yml b/huggingface/llamacpp/buildspec.yml index bed20118458b..8b05831cbadf 100644 --- a/huggingface/llamacpp/buildspec.yml +++ b/huggingface/llamacpp/buildspec.yml @@ -28,9 +28,9 @@ context: sagemaker_entrypoint: source: build_artifacts/sagemaker_entrypoint.sh target: sagemaker_entrypoint.sh - llamacpp_sagemaker_serve: - source: build_artifacts/llamacpp_sagemaker_serve.py - target: llamacpp_sagemaker_serve.py + llamacpp_sagemaker_server_patch: + source: build_artifacts/llamacpp_sagemaker_server.patch + target: llamacpp_sagemaker_server.patch images: diff --git a/huggingface/llamacpp/docker/b8882/Dockerfile.cpu b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu index e69de29bb2d1..813ba6eaf042 100644 --- a/huggingface/llamacpp/docker/b8882/Dockerfile.cpu +++ b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu @@ -0,0 +1,74 @@ +ARG UBUNTU_VERSION=24.04 +ARG LLAMACPP_VERSION=b8882 + +FROM ubuntu:${UBUNTU_VERSION} AS build + +ARG LLAMACPP_VERSION + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + gcc-14 \ + g++-14 \ + git \ + libgomp1 \ + libssl-dev \ + patch \ + python3 \ + && rm -rf /var/lib/apt/lists/* + +ENV CC=gcc-14 \ + CXX=g++-14 + +WORKDIR /src/llama.cpp + +RUN git clone --branch "${LLAMACPP_VERSION}" --depth 1 https://github.com/ggml-org/llama.cpp.git . + +COPY llamacpp_sagemaker_server.patch /tmp/llamacpp_sagemaker_server.patch + +RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch +RUN cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_BACKEND_DL=ON \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + . 
\ + && cmake --build build --config Release -j"$(nproc)" --target llama-server + +RUN mkdir -p /app/lib \ + && find build -name "*.so*" -exec cp -P {} /app/lib \; \ + && cp build/bin/llama-server /app/llama-server + +FROM ubuntu:${UBUNTU_VERSION} AS base + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +WORKDIR /app + +ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH} + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + libgomp1 \ + && apt-get autoremove -y \ + && apt-get clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=build /app/lib/ /app/ +COPY --from=build /app/llama-server /app/llama-server + +FROM base AS sagemaker + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh + +ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] diff --git a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu index 187f8f75ea0a..c1001bb61fee 100644 --- a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu +++ b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu @@ -1,10 +1,60 @@ -FROM ghcr.io/ggml-org/llama.cpp:server-cuda13-b8882 AS base +ARG UBUNTU_VERSION=24.04 +ARG CUDA_VERSION=13.0.2 +ARG LLAMACPP_VERSION=b8882 +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} AS build + +ARG LLAMACPP_VERSION +ARG CUDA_DOCKER_ARCH=default + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + 
build-essential \ + ca-certificates \ + cmake \ + gcc-14 \ + g++-14 \ + git \ + libgomp1 \ + libssl-dev \ + patch \ + python3 \ + && rm -rf /var/lib/apt/lists/* + +ENV CC=gcc-14 \ + CXX=g++-14 \ + CUDAHOSTCXX=g++-14 + +WORKDIR /src/llama.cpp + +RUN git clone --branch "${LLAMACPP_VERSION}" --depth 1 https://github.com/ggml-org/llama.cpp.git . +COPY llamacpp_sagemaker_server.patch /tmp/llamacpp_sagemaker_server.patch +RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch +RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ + fi \ + && cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_CUDA=ON \ + -DGGML_BACKEND_DL=ON \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + ${CMAKE_ARGS:-} \ + -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ + . \ + && cmake --build build --config Release -j"$(nproc)" --target llama-server + +RUN mkdir -p /app/lib \ + && find build -name "*.so*" -exec cp -P {} /app/lib \; \ + && cp build/bin/llama-server /app/llama-server + +FROM ${BASE_CUDA_RUN_CONTAINER} AS base LABEL maintainer="Amazon AI" LABEL dlc_major_version="1" -FROM base AS sagemaker - WORKDIR /app ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH} @@ -12,21 +62,22 @@ RUN apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ ca-certificates \ curl \ - python3 \ - python3-pip \ - && pip3 install --no-cache-dir --break-system-packages \ - "httpx>=0.27,<1" \ - "starlette>=0.37,<1" \ - "uvicorn[standard]>=0.27,<1" \ - && rm -rf /var/lib/apt/lists/* + libgomp1 \ + && apt-get autoremove -y \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app/ +COPY --from=build /app/llama-server /app/llama-server + +FROM base AS sagemaker COPY deep_learning_container.py 
/usr/local/bin/deep_learning_container.py COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh -COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh -COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh -RUN mkdir -p /usr/local/lib/llamacpp_sagemaker -COPY llamacpp_sagemaker_serve.py /usr/local/lib/llamacpp_sagemaker/llamacpp_sagemaker_serve.py -RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh \ - && chmod +x /usr/local/bin/start_cuda_compat.sh +COPY --chmod=0755 start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py index 79a9cbb4223d..44e2b20386e5 100644 --- a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py @@ -86,10 +86,8 @@ def ensure_model_downloaded(): ROLE = "dummy/unused-role" DEFAULT_TIMEOUT = 45 -# Llama.cpp SageMaker images listen on port 8080 with a small HTTP shim (/ping, -# /invocations) that proxies to llama-server on loopback (see llamacpp_sagemaker_serve). -# Do not set SM_LLAMACPP_HOST or SM_LLAMACPP_PORT expecting external access to -# llama-server; the entrypoint pins the server to localhost and exposes the shim on 8080. +# Llama.cpp SageMaker images listen on port 8080 with a custom llama-server build +# that serves SageMaker-compatible /ping and /invocations routes directly. 
class NoLogStreamFoundError(Exception): diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py index 68691d05b559..f4807f5c4cf0 100644 --- a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py @@ -30,8 +30,8 @@ def _predictor(image, sagemaker_local_session, instance_type): """Context manager for Llama.cpp model deployment and cleanup. Model is extracted to /opt/ml/model by SageMaker from model_data tar.gz. - The container entrypoint runs llama-server behind a SageMaker-compatible - proxy on port 8080 (/ping, /invocations -> OpenAI routes on llama-server). + The container entrypoint runs a custom llama-server build with + SageMaker-compatible /ping and /invocations routes on port 8080. """ # Download model from HuggingFace Hub if not already present model_data_path = ensure_model_downloaded() @@ -81,7 +81,7 @@ def _assert_llamacpp_chat_prediction(predictor): def _assert_llamacpp_chat_prediction_explicit_route(predictor): - """Same as chat test but forces target path via SageMaker CustomAttributes (proxy route=).""" + """Same as chat test but forces target path via SageMaker CustomAttributes route=.""" predictor.serializer = JSONSerializer() predictor.deserializer = JSONDeserializer() diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py index b22b32f27543..370ae0f51e1b 100644 --- a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py @@ -106,7 +106,7 @@ def _test_llamacpp_model( assert output is not None assert "choices" in output - # Explicit route= mirrors vLLM-Omni-style CustomAttributes routing in the container 
proxy. + # Explicit route= uses SageMaker CustomAttributes routing in the custom llama-server build. output_routed = predictor.predict( data, custom_attributes="route=/v1/chat/completions", From 529537df1f8cf26804bbda6489c66fc48efef101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Thu, 30 Apr 2026 11:01:24 +0000 Subject: [PATCH 08/14] Remove unnecessary resources file --- .../resources/qwen3.5-0.8b/.gitattributes | 61 ------------------- 1 file changed, 61 deletions(-) delete mode 100644 test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes diff --git a/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes b/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes deleted file mode 100644 index 6ba5bc1386f8..000000000000 --- a/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes +++ /dev/null @@ -1,61 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs
diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text -mmproj-BF16.gguf filter=lfs diff=lfs merge=lfs -text -mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text -mmproj-F32.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-UD-IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-UD-Q2_K_XL.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-UD-Q5_K_XL.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-UD-Q4_K_XL.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-UD-IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-UD-IQ2_XXS.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q4_1.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-UD-Q6_K_XL.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-UD-Q8_K_XL.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text -imatrix_unsloth.gguf_file filter=lfs diff=lfs 
merge=lfs -text -Qwen3.5-0.8B-BF16.gguf filter=lfs diff=lfs merge=lfs -text -Qwen3.5-0.8B-UD-Q3_K_XL.gguf filter=lfs diff=lfs merge=lfs -text From eeb004940321a0f7a94ffa7369a38c156e90d978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Thu, 30 Apr 2026 12:17:28 +0000 Subject: [PATCH 09/14] Minimal style changes --- .../docker/b8882/cu130/Dockerfile.gpu | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu index c1001bb61fee..2c74a6ef212c 100644 --- a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu +++ b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu @@ -10,18 +10,18 @@ ARG LLAMACPP_VERSION ARG CUDA_DOCKER_ARCH=default RUN apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - cmake \ - gcc-14 \ - g++-14 \ - git \ - libgomp1 \ - libssl-dev \ - patch \ - python3 \ - && rm -rf /var/lib/apt/lists/* + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + gcc-14 \ + g++-14 \ + git \ + libgomp1 \ + libssl-dev \ + patch \ + python3 \ + && rm -rf /var/lib/apt/lists/* ENV CC=gcc-14 \ CXX=g++-14 \ @@ -35,20 +35,20 @@ RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi \ - && cmake -B build \ - -DGGML_NATIVE=OFF \ - -DGGML_CUDA=ON \ - -DGGML_BACKEND_DL=ON \ - -DGGML_CPU_ALL_VARIANTS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - ${CMAKE_ARGS:-} \ - -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - . 
\ - && cmake --build build --config Release -j"$(nproc)" --target llama-server + && cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_CUDA=ON \ + -DGGML_BACKEND_DL=ON \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + ${CMAKE_ARGS:-} \ + -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ + . \ + && cmake --build build --config Release -j"$(nproc)" --target llama-server RUN mkdir -p /app/lib \ - && find build -name "*.so*" -exec cp -P {} /app/lib \; \ - && cp build/bin/llama-server /app/llama-server + && find build -name "*.so*" -exec cp -P {} /app/lib \; \ + && cp build/bin/llama-server /app/llama-server FROM ${BASE_CUDA_RUN_CONTAINER} AS base @@ -59,16 +59,16 @@ WORKDIR /app ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH} RUN apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - ca-certificates \ - curl \ - libgomp1 \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + libgomp1 \ + && apt-get autoremove -y \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete COPY --from=build /app/lib/ /app/ COPY --from=build /app/llama-server /app/llama-server From 0773825df945b1f45ce5dab91cdc0feb69436eae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Thu, 30 Apr 2026 20:46:49 +0200 Subject: [PATCH 10/14] Update Dockerfiles to address multiple CVEs --- .../llamacpp/docker/b8882/Dockerfile.cpu | 17 +++++++++++++++++ .../llamacpp/docker/b8882/cu130/Dockerfile.gpu | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff 
--git a/huggingface/llamacpp/docker/b8882/Dockerfile.cpu b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu index 813ba6eaf042..a577306960f9 100644 --- a/huggingface/llamacpp/docker/b8882/Dockerfile.cpu +++ b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu @@ -71,4 +71,21 @@ COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh +# Fix several CVEs: +# CVE-2026-31789, CVE-2025-15467, CVE-2025-13151, CVE-2025-15281, +# CVE-2025-69419, CVE-2025-68973, CVE-2025-69421, CVE-2026-28390, +# CVE-2025-69420, CVE-2026-0915, CVE-2026-0861, CVE-2026-28388, +# CVE-2026-31790, CVE-2026-28387, CVE-2026-28389 +RUN apt-get update \ + && apt-get install -y --only-upgrade \ + libssl3t64 \ + openssl \ + libtasn1-6 \ + libc6 \ + libc-bin \ + gnupg \ + gpg \ + gpgv \ + && rm -rf /var/lib/apt/lists/* + ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] diff --git a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu index 2c74a6ef212c..c7110bd0b007 100644 --- a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu +++ b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu @@ -80,4 +80,21 @@ COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh COPY --chmod=0755 start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh +# Fix several CVEs: +# CVE-2026-31789, CVE-2025-15467, CVE-2025-13151, CVE-2025-15281, +# CVE-2025-69419, CVE-2025-68973, CVE-2025-69421, CVE-2026-28390, +# CVE-2025-69420, CVE-2026-0915, CVE-2026-0861, CVE-2026-28388, +# CVE-2026-31790, CVE-2026-28387, CVE-2026-28389 +RUN apt-get update \ + && apt-get install -y --only-upgrade \ + libssl3t64 \ + openssl \ + libtasn1-6 \ + libc6 \ + libc-bin \ + gnupg \ + gpg \ + gpgv \ + && rm -rf /var/lib/apt/lists/* + ENTRYPOINT 
["/usr/local/bin/sagemaker_entrypoint.sh"] From 988efb43bac1df766b939917a40d32fc6cf8ce37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Tue, 12 May 2026 21:37:29 +0200 Subject: [PATCH 11/14] Fix path for Huggingface Llamacpp buildspec in dlc_developer_config.toml --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 60593de299b7..3a0f53e59503 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -193,7 +193,7 @@ dlc-pr-huggingface-vllm = "" dlc-pr-huggingface-sglang = "" # Huggingface Llamacpp -dlc-pr-huggingface-llamacpp = "/huggingface/llamacpp/buildspec.yml" +dlc-pr-huggingface-llamacpp = "huggingface/llamacpp/buildspec.yml" # sglang dlc-pr-sglang = "" From af6ba95d79f89628ded5aab87b320f755a335636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Wed, 13 May 2026 12:44:42 +0200 Subject: [PATCH 12/14] Update py_version extraction in generate_sagemaker_pytest_cmd to handle None case --- test/test_utils/sagemaker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index 24f256f66253..829d0f2c0986 100644 --- a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -183,7 +183,8 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type): else "gpu" if "gpu" in image else "eia" if "eia" in image else "cpu" ) ) - py_version = re.search(r"py\d+", tag).group() + match = re.search(r"py\d+", tag) + py_version = match.group() if match else None sm_local_py_version = ( "37" if py_version == "py37" From 473a6d6ee6c8790b7794239b92128ac7d9b36ff0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Wed, 13 May 2026 12:51:16 +0200 Subject: [PATCH 13/14] Remove transformers version from buildspec tag generation in Llamacpp configuration --- 
huggingface/llamacpp/buildspec.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/huggingface/llamacpp/buildspec.yml b/huggingface/llamacpp/buildspec.yml index 8b05831cbadf..60ac246a989d 100644 --- a/huggingface/llamacpp/buildspec.yml +++ b/huggingface/llamacpp/buildspec.yml @@ -44,8 +44,7 @@ images: os_version: &OS_VERSION ubuntu24.04 python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 - transformers_version: &TRANSFORMERS_VERSION 4.57.3 - tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ] + tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker build: true @@ -65,8 +64,7 @@ images: os_version: &OS_VERSION ubuntu24.04 python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py312 - transformers_version: &TRANSFORMERS_VERSION 4.57.3 - tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ] + tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker build: true From 3cc4e73ab892e9c0aa4fd293a0a7dd97af3bd10b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?= Date: Tue, 19 May 2026 12:38:29 +0200 Subject: [PATCH 14/14] Enhance image_builder to require transformers_version for HuggingFace builds and update tests to include Llamacpp in upstream types --- src/image_builder.py | 9 +++++++-- .../test_boottime_container_security.py | 2 +- test/dlc_tests/sanity/test_dlc_labels.py | 5 ++++- test/dlc_tests/sanity/test_pre_release.py | 19 ++++++++++++------- test/dlc_tests/sanity/test_safety_check.py | 3 +++ .../sanity/test_safety_report_file.py | 2 +- test/test_utils/__init__.py | 9 ++++++++- 7 files changed, 
36 insertions(+), 13 deletions(-) diff --git a/src/image_builder.py b/src/image_builder.py index cc401a6a5e11..48d8ef49d5cb 100644 --- a/src/image_builder.py +++ b/src/image_builder.py @@ -191,10 +191,15 @@ def image_builder(buildspec, image_types=[], device_types=[]): transformers_version = image_config.get("transformers_version") - if str(BUILDSPEC["framework"]).startswith("huggingface"): + buildspec_framework = str(BUILDSPEC["framework"]) + requires_transformers_version = buildspec_framework.startswith( + "huggingface" + ) and buildspec_framework != "huggingface_llamacpp" + + if buildspec_framework.startswith("huggingface"): if transformers_version: extra_build_args["TRANSFORMERS_VERSION"] = transformers_version - else: + elif requires_transformers_version: raise KeyError( f"HuggingFace buildspec.yml must contain 'transformers_version' field for each image" ) diff --git a/test/dlc_tests/sanity/test_boottime_container_security.py b/test/dlc_tests/sanity/test_boottime_container_security.py index ded6c61b0e3d..7de90dc3dbc6 100644 --- a/test/dlc_tests/sanity/test_boottime_container_security.py +++ b/test/dlc_tests/sanity/test_boottime_container_security.py @@ -6,7 +6,7 @@ @pytest.mark.model("N/A") @pytest.mark.canary("Run security test regularly on production images") def test_security(image): - upstream_types = ["vllm"] + upstream_types = ["vllm", "llamacpp"] if any(t in image for t in upstream_types): pytest.skip( f"{', '.join(upstream_types)} images do not require boot time security check as they are managed by upstream devs. Skipping test." 
diff --git a/test/dlc_tests/sanity/test_dlc_labels.py b/test/dlc_tests/sanity/test_dlc_labels.py index 87f68922eb9d..5fef1bb50b3a 100644 --- a/test/dlc_tests/sanity/test_dlc_labels.py +++ b/test/dlc_tests/sanity/test_dlc_labels.py @@ -31,7 +31,7 @@ def test_dlc_major_version_label(image, region): @pytest.mark.integration("dlc_labels") @pytest.mark.model("N/A") def test_dlc_standard_labels(image, region): - upstream_types = ["vllm", "sglang"] + upstream_types = ["vllm", "sglang", "llamacpp"] if any(t in image for t in upstream_types): pytest.skip( f"{', '.join(upstream_types)} images do not require test_dlc_standard_labels check as they are managed by upstream devs. Skipping test." @@ -130,6 +130,9 @@ def test_dlc_major_version_dockerfiles(image): :param image: ECR image URI """ + if "llamacpp" in image: + pytest.skip("Llamacpp images do not include Python versioned Dockerfile paths.") + dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0] job_type = test_utils.get_job_type_from_image(image) framework, fw_version = test_utils.get_framework_and_version_from_tag(image) diff --git a/test/dlc_tests/sanity/test_pre_release.py b/test/dlc_tests/sanity/test_pre_release.py index 49745f8c9436..9d5c1f387f87 100644 --- a/test/dlc_tests/sanity/test_pre_release.py +++ b/test/dlc_tests/sanity/test_pre_release.py @@ -109,7 +109,7 @@ def test_stray_files(image): :param image: ECR image URI """ - upstream_types = ["vllm", "sglang"] + upstream_types = ["vllm", "sglang", "llamacpp"] if any(t in image for t in upstream_types): pytest.skip( f"{', '.join(upstream_types)} images do not require pip check as they are managed by upstream devs. Skipping test." @@ -347,6 +347,8 @@ def test_framework_version_cpu(image): """ if "base" in image: pytest.skip("Base images do not contain a framework version in the tag. Skipping test.") + if "llamacpp" in image: + pytest.skip("Llamacpp images do not expose a Python framework version. 
Skipping test.") if "gpu" in image: pytest.skip( "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu" @@ -554,6 +556,9 @@ def test_dataclasses_check(image): ctx = Context() pip_package = "dataclasses" + if "llamacpp" in image: + pytest.skip("Llamacpp images do not include Python. Skipping test.") + container_name = get_container_name("dataclasses-check", image) python_version = get_python_version_from_image_uri(image).replace("py", "") @@ -583,7 +588,7 @@ def test_pip_check(image): :param image: ECR image URI """ - upstream_types = ["vllm", "sglang"] + upstream_types = ["vllm", "sglang", "llamacpp"] if any(t in image for t in upstream_types): pytest.skip( f"{', '.join(upstream_types)} images do not require pip check as they are managed by upstream devs. Skipping test." @@ -759,7 +764,7 @@ def test_cuda_paths(gpu): :param gpu: gpu image uris """ image = gpu - general_types = ["base", "vllm", "sglang"] + general_types = ["base", "vllm", "sglang", "llamacpp"] if any(t in image for t in general_types): pytest.skip( f"{', '.join(general_types)} DLC doesn't have the same directory structure and buildspec as other images" @@ -902,7 +907,7 @@ def _test_framework_and_cuda_version(gpu, ec2_connection): :param ec2_connection: fixture to establish connection with an ec2 instance """ image = gpu - general_types = ["base", "vllm", "sglang"] + general_types = ["base", "vllm", "sglang", "llamacpp"] if any(t in image for t in general_types): pytest.skip( f"{', '.join(general_types)} images do not follow the assumptions made by inference/training. Skipping test." @@ -1098,7 +1103,7 @@ def test_license_file(image): """ Check that license file within the container is readable and valid """ - general_types = ["base", "vllm", "sglang"] + general_types = ["base", "vllm", "sglang", "llamacpp"] if any(t in image for t in general_types): pytest.skip(f"{', '.join(general_types)} DLC doesn't embed license.txt. 
Skipping test.") @@ -1223,7 +1228,7 @@ def test_core_package_version(image): In this test, we ensure that if a core_packages.json file exists for an image, the packages installed in the image satisfy the version constraints specified in the core_packages.json file. """ - general_types = ["base", "vllm", "sglang"] + general_types = ["base", "vllm", "sglang", "llamacpp"] if any(t in image for t in general_types): pytest.skip(f"{', '.join(general_types)} images do not have core packages. Skipping test.") @@ -1275,7 +1280,7 @@ def test_package_version_regression_in_image(image): keys in the buildspec - as these keys are used to extract the released image uri. Additionally, if the image is not already released, this test would be skipped. """ - general_types = ["base", "vllm", "sglang"] + general_types = ["base", "vllm", "sglang", "llamacpp"] if any(t in image for t in general_types): pytest.skip( f"{', '.join(general_types)} images don't have python packages that needs to be checked. Skipping test." diff --git a/test/dlc_tests/sanity/test_safety_check.py b/test/dlc_tests/sanity/test_safety_check.py index 78c023433f44..99031bea51c0 100644 --- a/test/dlc_tests/sanity/test_safety_check.py +++ b/test/dlc_tests/sanity/test_safety_check.py @@ -1092,6 +1092,9 @@ def test_safety(image): Runs safety check on a container with the capability to ignore safety issues that cannot be fixed, and only raise error if an issue is fixable. """ + if "llamacpp" in image: + pytest.skip("Llamacpp images do not include Python safety tooling. 
Skipping test.") + from dlc.safety_check import SafetyCheck safety_check = SafetyCheck() diff --git a/test/dlc_tests/sanity/test_safety_report_file.py b/test/dlc_tests/sanity/test_safety_report_file.py index f8860d53784a..8f876396ce05 100644 --- a/test/dlc_tests/sanity/test_safety_report_file.py +++ b/test/dlc_tests/sanity/test_safety_report_file.py @@ -74,7 +74,7 @@ def test_safety_file_exists_and_is_valid(image): "Base images do not require safety file as there isn't much python libs in it. Skipping test." ) - upstream_types = ["vllm", "sglang"] + upstream_types = ["vllm", "sglang", "llamacpp"] if any(t in image for t in upstream_types): pytest.skip( f"{', '.join(upstream_types)} images do not require safety file as they are managed by upstream devs. Skipping test." diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 571b9fb26ed3..bb11664d93fd 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -1836,7 +1836,14 @@ def get_framework_and_version_from_tag(image_uri): f"from allowed frameworks {allowed_frameworks}" ) - tag_framework_version = re.search(r"(\d+(\.\d+){1,2})", image_uri).groups()[0] + _, image_tag = get_repository_and_tag_from_image_uri(image_uri) + if tested_framework == "huggingface_llamacpp": + tag_framework_version = image_tag.split("-")[0] + else: + version_match = re.search(r"(\d+(\.\d+){1,2})", image_tag) + if not version_match: + raise RuntimeError(f"Cannot find framework version in image tag {image_tag}") + tag_framework_version = version_match.group(1) return tested_framework, tag_framework_version