From c695fc8d4dce42ddd419daadfda41c3b7656b526 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Mon, 6 Apr 2026 11:22:26 +0000
Subject: [PATCH 01/14] [WIP] Add HuggingFace LlamaCpp support

---
 dlc_developer_config.toml                     |  5 +-
 huggingface/llamacpp/buildspec.yaml           | 91 +++++++++++++++++++
 .../llamacpp/docker/b8672/Dockerfile.cpu      |  0
 .../docker/b8672/cu129/Dockerfile.gpu         | 38 ++++++++
 .../docker/b8672/cu130/Dockerfile.gpu         | 42 +++++++++
 src/constants.py                              |  1 +
 test/test_utils/__init__.py                   |  2 +
 test/test_utils/sagemaker.py                  |  4 +
 test/testrunner.py                            |  9 ++
 9 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 huggingface/llamacpp/buildspec.yaml
 create mode 100644 huggingface/llamacpp/docker/b8672/Dockerfile.cpu
 create mode 100644 huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu
 create mode 100644 huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 89b740a5e315..08ad2014d650 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -36,7 +36,7 @@ deep_canary_mode = false
 
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
-# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
+# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
 build_frameworks = []
 
 
@@ -192,5 +192,8 @@ dlc-pr-huggingface-vllm = ""
 # HuggingFace SGLang
 dlc-pr-huggingface-sglang = ""
 
+# Huggingface Llamacpp
+dlc-pr-huggingface-llamacpp = ""
+
 # sglang
 dlc-pr-sglang = ""
diff --git a/huggingface/llamacpp/buildspec.yaml b/huggingface/llamacpp/buildspec.yaml
new file mode 100644
index 000000000000..2d1b2e360c93
--- /dev/null
+++ b/huggingface/llamacpp/buildspec.yaml
@@ -0,0 +1,91 @@
+account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
+prod_account_id: &PROD_ACCOUNT_ID 763104351884
+region: &REGION <set-$REGION-in-environment>
+base_framework: &BASE_FRAMEWORK llamacpp
+framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
+version: &VERSION "b8672"
+short_version: &SHORT_VERSION "b8672"
+arch_type: &ARCH_TYPE x86_64
+autopatch_build: "False"
+
+repository_info:
+  build_repository: &BUILD_REPOSITORY
+    image_type: &IMAGE_TYPE inference
+    root: huggingface/llamacpp
+    repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ]
+    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
+
+context:
+  build_context: &BUILD_CONTEXT
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
+    start_cuda_compat:
+      source: build_artifacts/start_cuda_compat.sh
+      target: start_cuda_compat.sh
+    sagemaker_entrypoint:
+      source: build_artifacts/sagemaker_entrypoint.sh
+      target: sagemaker_entrypoint.sh
+
+
+images:
+  BuildHuggingFaceLlamacppGpuCu129DockerImage:
+    <<: *BUILD_REPOSITORY
+    context:
+      <<: *BUILD_CONTEXT
+    image_size_baseline: 40000
+    device_type: &DEVICE_TYPE gpu
+    cuda_version: &CUDA_VERSION cu129
+    os_version: &OS_VERSION ubuntu24.04
+    transformers_version: &TRANSFORMERS_VERSION 4.57.3
+    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: sagemaker
+    build: true
+    enable_common_stage_build: false
+    test_configs:
+      test_platforms:
+        - sanity
+        - security
+        - sagemaker
+  
+  BuildHuggingFaceLlamacppGpuCu130DockerImage:
+    <<: *BUILD_REPOSITORY
+    context:
+      <<: *BUILD_CONTEXT
+    image_size_baseline: 40000
+    device_type: &DEVICE_TYPE gpu
+    cuda_version: &CUDA_VERSION cu130
+    os_version: &OS_VERSION ubuntu24.04
+    transformers_version: &TRANSFORMERS_VERSION 4.57.3
+    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: sagemaker
+    build: true
+    enable_common_stage_build: false
+    test_configs:
+      test_platforms:
+        - sanity
+        - security
+        - sagemaker
+
+  BuildHuggingFaceLlamacppCpuDockerImage:
+    <<: *BUILD_REPOSITORY
+    context:
+      <<: *BUILD_CONTEXT
+    image_size_baseline: 40000
+    device_type: &DEVICE_TYPE cpu
+    os_version: &OS_VERSION ubuntu24.04
+    transformers_version: &TRANSFORMERS_VERSION 4.57.3
+    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: sagemaker
+    build: true
+    enable_common_stage_build: false
+    test_configs:
+      test_platforms:
+        - sanity
+        - security
+        - sagemaker
diff --git a/huggingface/llamacpp/docker/b8672/Dockerfile.cpu b/huggingface/llamacpp/docker/b8672/Dockerfile.cpu
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu b/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu
new file mode 100644
index 000000000000..6e37bac2e95e
--- /dev/null
+++ b/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu
@@ -0,0 +1,38 @@
+FROM ghcr.io/ggml-org/llama.cpp:server-cuda12-b8672 as base
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+ARG HUGGINGFACE_HUB_VERSION=1.9.0
+ARG HF_XET_VERSION=1.2.0
+
+WORKDIR /
+
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
+
+# ====================== ec2 =========================================
+FROM base AS llamacpp-ec2 
+
+RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \
+    apt-get update && \
+    apt-get upgrade -y && \
+    apt-get clean
+
+COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
+RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh 
+
+ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"]
+
+# ====================== sagemaker =========================================
+FROM base AS llamacpp-sagemaker
+
+RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \
+    apt-get update && \
+    apt-get upgrade -y && \
+    apt-get clean
+
+COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
+RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh
+
+ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
diff --git a/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu
new file mode 100644
index 000000000000..e1cc3efa6e9a
--- /dev/null
+++ b/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu
@@ -0,0 +1,42 @@
+FROM ghcr.io/ggml-org/llama.cpp:server-cuda13-b8672 as base
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    DLC_CONTAINER_TYPE=base \
+    LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
+    PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"
+
+WORKDIR /
+
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
+
+# ====================== ec2 =========================================
+FROM base AS llamacpp-ec2 
+
+RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \
+    apt-get update && \
+    apt-get upgrade -y && \
+    apt-get clean
+
+COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
+RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh 
+
+ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"]
+
+# ====================== sagemaker =========================================
+FROM base AS llamacpp-sagemaker
+
+RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \
+    apt-get update && \
+    apt-get upgrade -y && \
+    apt-get clean
+
+COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
+RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh
+
+ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
diff --git a/src/constants.py b/src/constants.py
index 037414380bca..42275c5532f1 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -29,6 +29,7 @@
     "sglang",
     "huggingface_vllm",
     "huggingface_sglang",
+    "huggingface_llamacpp",
 }
 DEVICE_TYPES = {"cpu", "gpu", "hpu", "eia", "inf", "neuron", "neuronx"}
 IMAGE_TYPES = {"training", "inference"}
diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
index d593deea76e7..51ca9a276922 100644
--- a/test/test_utils/__init__.py
+++ b/test/test_utils/__init__.py
@@ -1822,6 +1822,7 @@ def get_framework_and_version_from_tag(image_uri):
         "huggingface_pytorch",
         "huggingface_vllm",
         "huggingface_sglang",
+        "huggingface_llamacpp",
         "stabilityai_pytorch",
         "pytorch_trcomp",
         "tensorflow",
@@ -1939,6 +1940,7 @@ def get_framework_from_image_uri(image_uri):
         "huggingface-pytorch": "huggingface_pytorch",
         "huggingface-vllm": "huggingface_vllm",
         "huggingface-sglang": "huggingface_sglang",
+        "huggingface-llamacpp": "huggingface_llamacpp",
         "stabilityai-pytorch": "stabilityai_pytorch",
         "mxnet": "mxnet",
         "pytorch": "pytorch",
diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py
index 0ab4d69e4829..24f256f66253 100644
--- a/test/test_utils/sagemaker.py
+++ b/test/test_utils/sagemaker.py
@@ -164,6 +164,8 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type):
         path = os.path.join("test", "sagemaker_tests", "huggingface", "vllm")
     elif framework == "huggingface_sglang":
         path = os.path.join("test", "sagemaker_tests", "huggingface", "sglang")
+    elif framework == "huggingface_llamacpp":
+        path = os.path.join("test", "sagemaker_tests", "huggingface", "llamacpp")
     else:
         path = os.path.join("test", "sagemaker_tests", framework, job_type)
     aws_id_arg = "--aws-id"
@@ -286,6 +288,8 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type):
         path = os.path.join("test", "sagemaker_tests", "huggingface", "vllm")
     elif "huggingface" in framework and "sglang" in framework:
         path = os.path.join("test", "sagemaker_tests", "huggingface", "sglang")
+    elif "huggingface" in framework and "llamacpp" in framework:
+        path = os.path.join("test", "sagemaker_tests", "huggingface", "llamacpp")
     elif "huggingface" in framework and job_type == "inference":
         path = os.path.join("test", "sagemaker_tests", "huggingface", "inference")
     if "trcomp" in framework:
diff --git a/test/testrunner.py b/test/testrunner.py
index 2d7deb2cfe24..9773d11e1604 100644
--- a/test/testrunner.py
+++ b/test/testrunner.py
@@ -629,6 +629,15 @@ def main():
             sm_utils.generate_empty_report(report, test_type, "sglang")
             return
 
+        # Skip base llamacpp (not huggingface_llamacpp) - huggingface_llamacpp has local tests
+        if "llamacpp" in dlc_images and "huggingface" not in dlc_images:
+            LOGGER.info(
+                f"Skipping - there are no local mode tests for base Llamacpp. Images: {dlc_images}"
+            )
+            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
+            sm_utils.generate_empty_report(report, test_type, "llamacpp")
+            return
+
         testing_image_list = [
             image
             for image in standard_images_list

From 5d5118fe3bb684aecdf7904bb4662745d2b45a0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Fri, 24 Apr 2026 15:08:44 +0000
Subject: [PATCH 02/14] [WIP] Add HuggingFace LlamaCpp support with
 Dockerfiles, buildspec, and serving scripts

---
 .../llamacpp_sagemaker_serve.py               | 211 ++++++++++
 .../build_artifacts/sagemaker_entrypoint.sh   |  95 +++++
 .../build_artifacts/start_cuda_compat.sh      |  25 ++
 .../{buildspec.yaml => buildspec.yml}         |  31 +-
 .../docker/b8672/cu129/Dockerfile.gpu         |  38 --
 .../docker/b8672/cu130/Dockerfile.gpu         |  42 --
 .../docker/{b8672 => b8882}/Dockerfile.cpu    |   0
 .../docker/b8882/cu130/Dockerfile.gpu         |  32 ++
 src/image_builder.py                          |   1 +
 .../huggingface/llamacpp/__init__.py          |  13 +
 .../huggingface/llamacpp/conftest.py          | 391 ++++++++++++++++++
 .../llamacpp/integration/__init__.py          | 119 ++++++
 .../llamacpp/integration/local/__init__.py    |  13 +
 .../integration/local/test_serving.py         | 109 +++++
 .../integration/sagemaker/__init__.py         |  12 +
 .../integration/sagemaker/test_sglang.py      | 116 ++++++
 .../llamacpp/integration/sagemaker/timeout.py |  66 +++
 .../resources/qwen3.5-0.8b/.gitattributes     |  61 +++
 .../huggingface/llamacpp/utils/__init__.py    |  36 ++
 .../huggingface/llamacpp/utils/image_utils.py |  67 +++
 .../llamacpp/utils/local_mode_utils.py        |  46 +++
 test/test_utils/__init__.py                   |   1 +
 22 files changed, 1423 insertions(+), 102 deletions(-)
 create mode 100644 huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py
 create mode 100644 huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
 create mode 100644 huggingface/llamacpp/build_artifacts/start_cuda_compat.sh
 rename huggingface/llamacpp/{buildspec.yaml => buildspec.yml} (77%)
 delete mode 100644 huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu
 delete mode 100644 huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu
 rename huggingface/llamacpp/docker/{b8672 => b8882}/Dockerfile.cpu (100%)
 create mode 100644 huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/__init__.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/conftest.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/local/__init__.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/__init__.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py

diff --git a/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py
new file mode 100644
index 000000000000..38da145b33e0
--- /dev/null
+++ b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py
@@ -0,0 +1,211 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""SageMaker HTTP proxy for llama.cpp llama-server.
+
+SageMaker invokes POST /invocations and GET /ping on port 8080. llama-server
+speaks OpenAI-style routes (e.g. /v1/chat/completions) and does not expose
+/invocations.
+
+Behavior mirrors scripts/vllm/omni_sagemaker_serve.py routing:
+
+- GET /ping is proxied to GET {backend}/health.
+- POST /invocations: if ``X-Amzn-SageMaker-Custom-Attributes`` contains
+  ``route=/some/path``, the request is forwarded to that path on llama-server.
+  Otherwise the target path is inferred from the JSON body (messages ->
+  /v1/chat/completions, prompt -> /v1/completions, input+model -> /v1/embeddings),
+  defaulting to /v1/chat/completions.
+
+For routes that require multipart/form-data (parity with vLLM-Omni), JSON bodies
+are converted when ``route=`` targets those paths.
+
+Environment:
+
+- LLAMACPP_SAGEMAKER_BACKEND_URL: upstream base URL (default http://127.0.0.1:8081)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+import uuid
+from collections.abc import AsyncIterator
+
+import httpx
+from starlette.applications import Starlette
+from starlette.requests import Request
+from starlette.responses import Response, StreamingResponse
+from starlette.routing import Route
+
+logger = logging.getLogger("llamacpp_sagemaker")
+
+BACKEND = os.environ.get("LLAMACPP_SAGEMAKER_BACKEND_URL", "http://127.0.0.1:8081").rstrip("/")
+
+FORM_DATA_ROUTES = frozenset({"/v1/videos", "/v1/videos/sync"})
+
+_HOP_BY_HOP = frozenset(
+    {
+        "connection",
+        "keep-alive",
+        "proxy-authenticate",
+        "proxy-authorization",
+        "te",
+        "trailers",
+        "transfer-encoding",
+        "upgrade",
+        "host",
+        "content-length",
+    }
+)
+# httpx hands back decoded bodies (.content / aiter_bytes), so upstream content-encoding must not be relayed.
+_RESP_DROP = frozenset({"transfer-encoding", "content-length", "connection", "content-encoding"})
+
+
+def _parse_route_from_header(raw: str | None) -> str | None:
+    """Return the ``/path`` that follows ``route=`` in *raw*, or None when there is none."""
+    # A missing or empty header value carries no route override.
+    found = re.search(r"route=(/[^\s,]+)", raw) if raw else None
+    return found.group(1) if found else None
+
+
+def _parse_route(request: Request) -> str | None:
+    """Read the optional ``route=`` override from the SageMaker custom-attributes header."""
+    attributes = request.headers.get("x-amzn-sagemaker-custom-attributes")
+    return _parse_route_from_header(attributes)
+
+
+def _build_multipart_body(data: dict, boundary: str) -> bytes:
+    """Encode each item of *data* as a multipart/form-data text field delimited by *boundary*."""
+    fields = [
+        f'--{boundary}\r\nContent-Disposition: form-data; name="{key}"\r\n\r\n{value}\r\n'
+        for key, value in data.items()
+    ]
+    fields.append(f"--{boundary}--\r\n")
+    return "".join(fields).encode()
+
+
+def _default_path_for_invocation(content_type: str, body: bytes) -> str:
+    """Infer the llama-server path for an /invocations request that names no route.
+
+    Non-JSON content types, unparsable bodies and non-object JSON fall back to chat completions.
+    """
+    fallback = "/v1/chat/completions"
+    if "json" not in (content_type or "").lower():
+        return fallback
+    try:
+        payload = json.loads(body)
+    except (json.JSONDecodeError, UnicodeDecodeError):
+        return fallback
+    if not isinstance(payload, dict) or "messages" in payload:
+        return fallback
+    if "prompt" in payload:
+        return "/v1/completions"
+    return "/v1/embeddings" if "input" in payload and "model" in payload else fallback
+
+
+def _forward_request_headers(request: Request, body_len: int, content_type: str | None) -> dict[str, str]:
+    """Build the upstream headers: client headers minus hop-by-hop and SageMaker routing ones."""
+    skipped = _HOP_BY_HOP | {"x-amzn-sagemaker-custom-attributes"}
+    forwarded = {
+        name: value for name, value in request.headers.items() if name.lower() not in skipped
+    }
+    # content-length always reflects the body actually sent; content-type only when one is given.
+    forwarded["content-length"] = str(body_len)
+    if content_type is not None:
+        forwarded["content-type"] = content_type
+    return forwarded
+
+
+def _response_headers_from_httpx(resp: httpx.Response) -> dict[str, str]:
+    """Return the upstream response headers minus the names listed in ``_RESP_DROP``.
+
+    Names are lowercased before the lookup, so matching is case-insensitive.
+    """
+    return {
+        name: value for name, value in resp.headers.items() if name.lower() not in _RESP_DROP
+    }
+
+
+async def ping(request: Request) -> Response:
+    """Answer GET /ping by relaying the status, body and headers of llama-server's /health."""
+    health_url = f"{BACKEND}/health"
+    limits = httpx.Timeout(10.0, connect=2.0)
+    try:
+        async with httpx.AsyncClient(timeout=limits) as client:
+            upstream = await client.get(health_url)
+    except httpx.RequestError as e:
+        # Transport-level failures (e.g. backend not reachable) are reported as 503.
+        logger.warning("Backend health request failed: %s", e)
+        return Response(status_code=503, content=b'{"error":"backend_unavailable"}')
+    relayed = _response_headers_from_httpx(upstream)
+    return Response(status_code=upstream.status_code, content=upstream.content, headers=relayed)
+
+
+async def invocations(request: Request) -> Response:
+    if request.method != "POST":
+        return Response(status_code=405, content=b"Method Not Allowed")
+    # The full body is read up front: it is inspected below and may be re-encoded before forwarding.
+    body = await request.body()
+    route = _parse_route(request)
+    content_type = request.headers.get("content-type")
+    # A route= custom attribute selects the upstream path; otherwise it is inferred from the body.
+    if route:
+        target = route
+        logger.info("Rerouting /invocations -> %s", target)
+        ct = (content_type or "").lower()
+        if target in FORM_DATA_ROUTES and "json" in ct:
+            try:
+                data = json.loads(body)
+            except (json.JSONDecodeError, UnicodeDecodeError):
+                data = None  # unparsable JSON: forward the original body untouched
+            if isinstance(data, dict):
+                boundary = uuid.uuid4().hex
+                body = _build_multipart_body(data, boundary)
+                content_type = f"multipart/form-data; boundary={boundary}"
+                logger.info("Converted JSON to form-data for %s", target)
+    else:
+        target = _default_path_for_invocation(content_type or "", body)
+        logger.info("Inferred /invocations -> %s", target)
+    # content-length is recomputed from the (possibly rewritten) body by the header helper.
+    url = f"{BACKEND}{target}"
+    fwd_headers = _forward_request_headers(request, len(body), content_type)
+    # No `async with` here: the client must stay open after this function returns, while the body streams.
+    timeout = httpx.Timeout(600.0, connect=30.0)
+    client = httpx.AsyncClient(timeout=timeout)
+    try:
+        req = client.build_request("POST", url, headers=fwd_headers, content=body)
+        r = await client.send(req, stream=True)
+    except httpx.RequestError as e:
+        await client.aclose()
+        logger.exception("Upstream request failed: %s", e)
+        return Response(status_code=502, content=json.dumps({"error": "upstream_error"}).encode())
+    # Relays upstream chunks as they arrive; the finally block closes the response and the client.
+    async def stream_body() -> AsyncIterator[bytes]:
+        try:
+            async for chunk in r.aiter_bytes():
+                yield chunk
+        finally:
+            await r.aclose()
+            await client.aclose()
+
+    return StreamingResponse(
+        stream_body(),
+        status_code=r.status_code,
+        headers=_response_headers_from_httpx(r),
+        media_type=r.headers.get("content-type"),
+    )
+
+
+routes = [
+    Route("/ping", ping, methods=["GET"]),
+    Route("/invocations", invocations, methods=["POST"]),
+]
+
+app = Starlette(routes=routes)
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    force=True,
+)
diff --git a/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh b/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
new file mode 100644
index 000000000000..7f55bf5967d8
--- /dev/null
+++ b/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+set -euo pipefail
+
+# Run the DLC telemetry script on a best-effort basis: a missing script or a
+# failing run is ignored (output discarded) so it can never block startup.
+bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true
+
+# Source CUDA compat for older drivers (e.g., g5 instances)
+if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then
+    source /usr/local/bin/start_cuda_compat.sh
+fi
+
+# SageMaker sends traffic to port 8080 on /ping and /invocations. llama-server
+# listens on a loopback-only port; a small Python proxy (llamacpp_sagemaker_serve)
+# binds 8080 and forwards to llama-server, similar to vLLM-Omni middleware.
+INTERNAL_HOST="${LLAMACPP_SAGEMAKER_INTERNAL_HOST:-127.0.0.1}"
+INTERNAL_PORT="${LLAMACPP_SAGEMAKER_INTERNAL_PORT:-8081}"
+PROXY_PORT="${LLAMACPP_SAGEMAKER_PROXY_PORT:-8080}"
+export LLAMACPP_SAGEMAKER_BACKEND_URL="${LLAMACPP_SAGEMAKER_BACKEND_URL:-http://${INTERNAL_HOST}:${INTERNAL_PORT}}"
+
+PREFIX="SM_LLAMACPP_"
+ARG_PREFIX="--"
+
+ARGS=()
+# Map each SM_LLAMACPP_<NAME>=<value> env var to "--<name> <value>" (name lowercased, "_" -> "-").
+while IFS='=' read -r key value; do
+    arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+    # An empty value produces a bare flag with no argument.
+    ARGS+=("${ARG_PREFIX}${arg_name}")
+    if [ -n "$value" ]; then
+        ARGS+=("$value")
+    fi
+done < <(env | grep "^${PREFIX}" || true)
+
+# Drop any user-supplied --host / --port so inference stays on the internal bind.
+normalized=()
+skip_next=0
+for a in "${ARGS[@]}"; do
+    if [ "$skip_next" -eq 1 ]; then
+        skip_next=0
+        continue
+    fi
+    if [ "$a" = "--host" ] || [ "$a" = "--port" ]; then
+        skip_next=1
+        continue
+    fi
+    normalized+=("$a")
+done
+ARGS=("${normalized[@]}")
+ARGS+=(--host "$INTERNAL_HOST" --port "$INTERNAL_PORT")
+
+echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2
+
+/app/llama-server "${ARGS[@]}" &
+LLAMA_PID=$!
+
+wait_for_llama() {
+    # Poll /health once per second, up to 120 times; stop early once the llama-server process is gone.
+    local i
+    for i in $(seq 1 120); do
+        kill -0 "$LLAMA_PID" 2>/dev/null || return 1
+        curl -sf "http://${INTERNAL_HOST}:${INTERNAL_PORT}/health" >/dev/null 2>&1 && return 0
+        sleep 1
+    done
+    return 1
+}
+
+if ! wait_for_llama; then
+    echo "[sagemaker] llama-server did not become healthy on ${INTERNAL_HOST}:${INTERNAL_PORT}" >&2
+    kill -TERM "$LLAMA_PID" 2>/dev/null || true
+    wait "$LLAMA_PID" 2>/dev/null || true
+    exit 1
+fi
+
+shutdown() {
+    kill -TERM "$UVICORN_PID" 2>/dev/null || true
+    kill -TERM "$LLAMA_PID" 2>/dev/null || true
+    wait "$UVICORN_PID" 2>/dev/null || true
+    wait "$LLAMA_PID" 2>/dev/null || true
+}
+
+trap shutdown SIGTERM SIGINT
+
+if [ -n "${PYTHONPATH:-}" ]; then
+    export PYTHONPATH="${PYTHONPATH}:/usr/local/lib/llamacpp_sagemaker"
+else
+    export PYTHONPATH="/usr/local/lib/llamacpp_sagemaker"
+fi
+python3 -m uvicorn llamacpp_sagemaker_serve:app --host 0.0.0.0 --port "$PROXY_PORT" --log-level info &
+UVICORN_PID=$!
+
+exit_code=0
+wait "$UVICORN_PID" || exit_code=$?  # `||` keeps `set -e` from exiting here before shutdown runs
+shutdown
+exit "$exit_code"
diff --git a/huggingface/llamacpp/build_artifacts/start_cuda_compat.sh b/huggingface/llamacpp/build_artifacts/start_cuda_compat.sh
new file mode 100644
index 000000000000..791d355c5abe
--- /dev/null
+++ b/huggingface/llamacpp/build_artifacts/start_cuda_compat.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+verlte() {  # NOTE: despite the name this is a strict "less than" test - equal versions return 1.
+  [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
+}
+
+COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
+if [ -f "$COMPAT_FILE" ]; then
+  CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink "$COMPAT_FILE" | cut -d'.' -f 3-)
+  echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
+  NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
+  if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
+    NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
+  fi
+  echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
+  # Sourced under `set -u`: skip when no driver version was detected and quote every expansion.
+  if [ -n "$NVIDIA_DRIVER_VERSION" ] && verlte "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
+    echo "Adding CUDA compat to LD_LIBRARY_PATH"
+    export LD_LIBRARY_PATH="/usr/local/cuda/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"; echo "$LD_LIBRARY_PATH"
+  else
+    echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
+  fi
+else
+  echo "Skipping CUDA compat setup as package not found"
+fi
diff --git a/huggingface/llamacpp/buildspec.yaml b/huggingface/llamacpp/buildspec.yml
similarity index 77%
rename from huggingface/llamacpp/buildspec.yaml
rename to huggingface/llamacpp/buildspec.yml
index 2d1b2e360c93..bed20118458b 100644
--- a/huggingface/llamacpp/buildspec.yaml
+++ b/huggingface/llamacpp/buildspec.yml
@@ -3,8 +3,8 @@ prod_account_id: &PROD_ACCOUNT_ID 763104351884
 region: &REGION <set-$REGION-in-environment>
 base_framework: &BASE_FRAMEWORK llamacpp
 framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
-version: &VERSION "b8672"
-short_version: &SHORT_VERSION "b8672"
+version: &VERSION "b8882"
+short_version: &SHORT_VERSION "b8882"
 arch_type: &ARCH_TYPE x86_64
 autopatch_build: "False"
 
@@ -28,29 +28,12 @@ context:
     sagemaker_entrypoint:
       source: build_artifacts/sagemaker_entrypoint.sh
       target: sagemaker_entrypoint.sh
+    llamacpp_sagemaker_serve:
+      source: build_artifacts/llamacpp_sagemaker_serve.py
+      target: llamacpp_sagemaker_serve.py
 
 
 images:
-  BuildHuggingFaceLlamacppGpuCu129DockerImage:
-    <<: *BUILD_REPOSITORY
-    context:
-      <<: *BUILD_CONTEXT
-    image_size_baseline: 40000
-    device_type: &DEVICE_TYPE gpu
-    cuda_version: &CUDA_VERSION cu129
-    os_version: &OS_VERSION ubuntu24.04
-    transformers_version: &TRANSFORMERS_VERSION 4.57.3
-    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    target: sagemaker
-    build: true
-    enable_common_stage_build: false
-    test_configs:
-      test_platforms:
-        - sanity
-        - security
-        - sagemaker
-  
   BuildHuggingFaceLlamacppGpuCu130DockerImage:
     <<: *BUILD_REPOSITORY
     context:
@@ -59,6 +42,8 @@ images:
     device_type: &DEVICE_TYPE gpu
     cuda_version: &CUDA_VERSION cu130
     os_version: &OS_VERSION ubuntu24.04
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
     transformers_version: &TRANSFORMERS_VERSION 4.57.3
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
@@ -78,6 +63,8 @@ images:
     image_size_baseline: 40000
     device_type: &DEVICE_TYPE cpu
     os_version: &OS_VERSION ubuntu24.04
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
     transformers_version: &TRANSFORMERS_VERSION 4.57.3
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ]
diff --git a/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu b/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu
deleted file mode 100644
index 6e37bac2e95e..000000000000
--- a/huggingface/llamacpp/docker/b8672/cu129/Dockerfile.gpu
+++ /dev/null
@@ -1,38 +0,0 @@
-FROM ghcr.io/ggml-org/llama.cpp:server-cuda12-b8672 as base
-
-LABEL maintainer="Amazon AI"
-LABEL dlc_major_version="1"
-
-ARG HUGGINGFACE_HUB_VERSION=1.9.0
-ARG HF_XET_VERSION=1.2.0
-
-WORKDIR /
-
-COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
-COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
-
-# ====================== ec2 =========================================
-FROM base AS llamacpp-ec2 
-
-RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \
-    apt-get update && \
-    apt-get upgrade -y && \
-    apt-get clean
-
-COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
-RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh 
-
-ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"]
-
-# ====================== sagemaker =========================================
-FROM base AS llamacpp-sagemaker
-
-RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \
-    apt-get update && \
-    apt-get upgrade -y && \
-    apt-get clean
-
-COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
-RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh
-
-ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
diff --git a/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu
deleted file mode 100644
index e1cc3efa6e9a..000000000000
--- a/huggingface/llamacpp/docker/b8672/cu130/Dockerfile.gpu
+++ /dev/null
@@ -1,42 +0,0 @@
-FROM ghcr.io/ggml-org/llama.cpp:server-cuda13-b8672 as base
-
-LABEL maintainer="Amazon AI"
-LABEL dlc_major_version="1"
-
-ENV DEBIAN_FRONTEND=noninteractive \
-    LANG=C.UTF-8 \
-    LC_ALL=C.UTF-8 \
-    DLC_CONTAINER_TYPE=base \
-    LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
-    PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"
-
-WORKDIR /
-
-COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
-COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
-
-# ====================== ec2 =========================================
-FROM base AS llamacpp-ec2 
-
-RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \
-    apt-get update && \
-    apt-get upgrade -y && \
-    apt-get clean
-
-COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
-RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh 
-
-ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"]
-
-# ====================== sagemaker =========================================
-FROM base AS llamacpp-sagemaker
-
-RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold && \
-    apt-get update && \
-    apt-get upgrade -y && \
-    apt-get clean
-
-COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
-RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh
-
-ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
diff --git a/huggingface/llamacpp/docker/b8672/Dockerfile.cpu b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu
similarity index 100%
rename from huggingface/llamacpp/docker/b8672/Dockerfile.cpu
rename to huggingface/llamacpp/docker/b8882/Dockerfile.cpu
diff --git a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
new file mode 100644
index 000000000000..187f8f75ea0a
--- /dev/null
+++ b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
@@ -0,0 +1,32 @@
+FROM ghcr.io/ggml-org/llama.cpp:server-cuda13-b8882 AS base
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+FROM base AS sagemaker
+
+WORKDIR /app
+ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH}
+
+RUN apt-get update \
+ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    python3 \
+    python3-pip \
+ && pip3 install --no-cache-dir --break-system-packages \
+    "httpx>=0.27,<1" \
+    "starlette>=0.37,<1" \
+    "uvicorn[standard]>=0.27,<1" \
+ && rm -rf /var/lib/apt/lists/*
+
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
+COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
+COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
+RUN mkdir -p /usr/local/lib/llamacpp_sagemaker
+COPY llamacpp_sagemaker_serve.py /usr/local/lib/llamacpp_sagemaker/llamacpp_sagemaker_serve.py
+RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh \
+ && chmod +x /usr/local/bin/start_cuda_compat.sh
+
+ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
diff --git a/src/image_builder.py b/src/image_builder.py
index f101f33fa7fb..cc401a6a5e11 100644
--- a/src/image_builder.py
+++ b/src/image_builder.py
@@ -686,6 +686,7 @@ def get_job_type(image_repo_uri):
         "base": "general",
         "vllm": "general",
         "sglang": "general",
+        "llamacpp": "general",
     }
 
     for key, job_type in job_type_mapping.items():
diff --git a/test/sagemaker_tests/huggingface/llamacpp/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/__init__.py
new file mode 100644
index 000000000000..199e66b95926
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
diff --git a/test/sagemaker_tests/huggingface/llamacpp/conftest.py b/test/sagemaker_tests/huggingface/llamacpp/conftest.py
new file mode 100644
index 000000000000..cbf61a194072
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/conftest.py
@@ -0,0 +1,391 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import json
+import logging
+import os
+import platform
+import shutil
+import sys
+import tempfile
+
+import boto3
+import pytest
+
+from botocore.exceptions import ClientError
+from sagemaker import LocalSession, Session
+from sagemaker.pytorch import PyTorch
+
+from .utils import image_utils, get_ecr_registry
+
+NO_P4_REGIONS = [
+    "af-south-1",
+    "ap-east-1",
+    "ap-northeast-3",
+    "ap-southeast-1",
+    "ap-southeast-2",
+    "ap-south-1",
+    "ca-central-1",
+    "eu-central-1",
+    "eu-north-1",
+    "eu-west-2",
+    "eu-west-3",
+    "eu-south-1",
+    "me-south-1",
+    "sa-east-1",
+    "us-west-1",
+    "cn-northwest-1",
+    "il-central-1",
+]
+
+NO_G5_REGIONS = [  # regions where tests on ml.g5* instance types are skipped
+    "us-west-1",
+    "ca-west-1",
+    "mx-central-1",
+    "af-south-1",
+    "ap-east-1",
+    "ap-south-2",
+    "ap-southeast-5",
+    "ap-southeast-4",
+    "ap-northeast-3",
+    "ap-southeast-1",
+    "ap-southeast-7",
+    "eu-south-1",
+    "eu-west-3",
+    "eu-south-2",
+    "eu-central-2",
+    "me-south-1",
+]
+
+
+logger = logging.getLogger(__name__)
+logging.getLogger("boto").setLevel(logging.INFO)
+logging.getLogger("boto3").setLevel(logging.INFO)
+logging.getLogger("botocore").setLevel(logging.INFO)
+logging.getLogger("factory.py").setLevel(logging.INFO)
+logging.getLogger("auth.py").setLevel(logging.INFO)
+logging.getLogger("connectionpool.py").setLevel(logging.INFO)
+
+
+dir_path = os.path.dirname(os.path.realpath(__file__))
+
+
+def pytest_addoption(parser):
+    parser.addoption("--build-image", "-D", action="store_true")
+    parser.addoption("--build-base-image", "-B", action="store_true")
+    parser.addoption("--aws-id")
+    parser.addoption("--instance-type")
+    parser.addoption("--accelerator-type", default=None)
+    parser.addoption("--docker-base-name", default="huggingface_llamacpp")
+    parser.addoption("--region", default="us-west-2")
+    parser.addoption("--framework-version", default="")
+    parser.addoption(
+        "--py-version",
+        choices=["2", "3", "37", "38", "39", "310", "311", "312"],
+        default=str(sys.version_info.major),
+    )
+    # Processor is still "cpu" for EIA tests
+    parser.addoption(
+        "--processor", choices=["gpu", "cpu", "eia", "neuron", "neuronx"], default="cpu"
+    )
+    # If not specified, will default to {framework-version}-{processor}-py{py-version}
+    parser.addoption("--tag", default=None)
+    parser.addoption(
+        "--generate-coverage-doc",
+        default=False,
+        action="store_true",
+        help="use this option to generate test coverage doc",
+    )
+    parser.addoption(
+        "--efa",
+        action="store_true",
+        default=False,
+        help="Run only efa tests",
+    )
+    parser.addoption("--sagemaker-regions", default="us-west-2")
+
+
+def pytest_configure(config):
+    config.addinivalue_line("markers", "efa(): explicitly mark to run efa tests")
+
+
+def pytest_runtest_setup(item):
+    if item.config.getoption("--efa"):
+        efa_tests = [mark for mark in item.iter_markers(name="efa")]
+        if not efa_tests:
+            pytest.skip("Skipping non-efa tests")
+
+
+def pytest_collection_modifyitems(session, config, items):
+    for item in items:
+        print(f"item {item}")
+        for marker in item.iter_markers(name="team"):
+            print(f"item {marker}")
+            team_name = marker.args[0]
+            item.user_properties.append(("team_marker", team_name))
+            print(f"item.user_properties {item.user_properties}")
+
+    if config.getoption("--generate-coverage-doc"):
+        from test.test_utils.test_reporting import TestReportGenerator
+
+        reporter = TestReportGenerator(items, is_sagemaker=True)
+        reporter.generate_coverage_doc(framework="huggingface_llamacpp", job_type="inference")
+
+
+@pytest.fixture(scope="session", name="docker_base_name")
+def fixture_docker_base_name(request):
+    return request.config.getoption("--docker-base-name")
+
+
+@pytest.fixture(scope="session", name="region")
+def fixture_region(request):
+    return request.config.getoption("--region")
+
+
+@pytest.fixture(scope="session", name="framework_version")
+def fixture_framework_version(request):
+    return request.config.getoption("--framework-version")
+
+
+@pytest.fixture(scope="session", name="py_version")
+def fixture_py_version(request):
+    return "py{}".format(int(request.config.getoption("--py-version")))
+
+
+@pytest.fixture(scope="session", name="processor")
+def fixture_processor(request):
+    return request.config.getoption("--processor")
+
+
+@pytest.fixture(scope="session", name="tag")
+def fixture_tag(request, framework_version, processor, py_version):
+    provided_tag = request.config.getoption("--tag")
+    default_tag = "{}-{}-{}".format(framework_version, processor, py_version)
+    return provided_tag if provided_tag else default_tag
+
+
+@pytest.fixture(scope="session", name="docker_image")
+def fixture_docker_image(docker_base_name, tag):
+    return "{}:{}".format(docker_base_name, tag)
+
+
+@pytest.fixture
+def opt_ml():
+    tmp = tempfile.mkdtemp()
+    os.mkdir(os.path.join(tmp, "output"))
+
+    # Docker cannot mount Mac OS /var folder properly see
+    # https://forums.docker.com/t/var-folders-isnt-mounted-properly/9600
+    opt_ml_dir = "/private{}".format(tmp) if platform.system() == "Darwin" else tmp
+    yield opt_ml_dir
+
+    shutil.rmtree(tmp, True)
+
+
+@pytest.fixture(scope="session", name="use_gpu")
+def fixture_use_gpu(processor):
+    return processor == "gpu"
+
+
+@pytest.fixture(scope="session", name="build_base_image", autouse=True)
+def fixture_build_base_image(
+    request, framework_version, py_version, processor, tag, docker_base_name
+):
+    build_base_image = request.config.getoption("--build-base-image")
+    if build_base_image:
+        return image_utils.build_base_image(
+            framework_name=docker_base_name,
+            framework_version=framework_version,
+            py_version=py_version,
+            base_image_tag=tag,
+            processor=processor,
+            cwd=os.path.join(dir_path, ".."),
+        )
+
+    return tag
+
+
+@pytest.fixture(scope="session", name="sagemaker_session")
+def fixture_sagemaker_session(region):
+    return Session(boto_session=boto3.Session(region_name=region))
+
+
+@pytest.fixture(scope="session", name="sagemaker_regions")
+def fixture_sagemaker_regions(request):
+    sagemaker_regions = request.config.getoption("--sagemaker-regions")
+    return sagemaker_regions.split(",")
+
+
+@pytest.fixture(scope="session", name="sagemaker_local_session")
+def fixture_sagemaker_local_session(region):
+    return LocalSession(boto_session=boto3.Session(region_name=region))
+
+
+@pytest.fixture(name="aws_id", scope="session")
+def fixture_aws_id(request):
+    return request.config.getoption("--aws-id")
+
+
+@pytest.fixture(name="instance_type", scope="session")
+def fixture_instance_type(request, processor):
+    provided_instance_type = request.config.getoption("--instance-type")
+    default_instance_type = "local" if processor == "cpu" else "local_gpu"
+    return provided_instance_type or default_instance_type
+
+
+@pytest.fixture(name="accelerator_type", scope="session")
+def fixture_accelerator_type(request):
+    return request.config.getoption("--accelerator-type")
+
+
+@pytest.fixture(name="docker_registry", scope="session")
+def fixture_docker_registry(aws_id, region):
+    return get_ecr_registry(aws_id, region)
+
+
+@pytest.fixture(name="ecr_image", scope="session")
+def fixture_ecr_image(docker_registry, docker_base_name, tag):
+    return "{}/{}:{}".format(docker_registry, docker_base_name, tag)
+
+
+@pytest.fixture(autouse=True)
+def skip_by_device_type(request, use_gpu, instance_type, accelerator_type):
+    is_gpu = use_gpu or instance_type[3] in ["g", "p"]  # 4th char: letter after "ml."
+    is_eia = accelerator_type is not None
+    is_neuron = instance_type.startswith("ml.inf1")
+    is_neuronx = instance_type.startswith("ml.inf2") or instance_type.startswith("ml.trn1")
+
+    # Separate out cases for clearer logic.
+    # When running Neuron test, skip CPU and GPU test.
+    if request.node.get_closest_marker("neuron_test") and not is_neuron:
+        pytest.skip("Skipping because running on '{}' instance".format(instance_type))
+    elif request.node.get_closest_marker("neuronx_test") and not is_neuronx:
+        pytest.skip("Skipping because running on '{}' instance".format(instance_type))
+
+    # When running GPU test, skip CPU and neuron test. When running CPU test, skip GPU and neuron test.
+    elif (request.node.get_closest_marker("gpu_test") and not is_gpu) or (
+        request.node.get_closest_marker("cpu_test") and (is_gpu or is_neuron or is_neuronx)
+    ):
+        pytest.skip("Skipping because running on '{}' instance".format(instance_type))
+
+    # When running EIA test, skip the CPU, GPU and Neuron functions
+    elif (
+        request.node.get_closest_marker("neuron_test")
+        or request.node.get_closest_marker("gpu_test")
+        or request.node.get_closest_marker("cpu_test")
+    ) and is_eia:
+        pytest.skip("Skipping because running on '{}' instance".format(instance_type))
+
+    # When running CPU or GPU or Neuron test, skip EIA test.
+    elif request.node.get_closest_marker("eia_test") and not is_eia:
+        pytest.skip("Skipping because running on '{}' instance".format(instance_type))
+
+
+@pytest.fixture(autouse=True)
+def skip_by_py_version(request, py_version):
+    if request.node.get_closest_marker("skip_py2") and py_version != "py3":
+        pytest.skip("Skipping the test because Python 2 is not supported.")
+
+
+@pytest.fixture(autouse=True)
+def skip_gpu_instance_restricted_regions(region, instance_type):
+    if (region in NO_P4_REGIONS and instance_type.startswith("ml.p4")) or (
+        region in NO_G5_REGIONS and instance_type.startswith("ml.g5")
+    ):
+        pytest.skip(
+            "Skipping GPU test in region {} with instance type {}".format(region, instance_type)
+        )
+
+
+@pytest.fixture(autouse=True)
+def skip_gpu_py2(request, use_gpu, instance_type, py_version, framework_version):
+    is_gpu = use_gpu or instance_type[3] in ["g", "p"]
+    if (
+        request.node.get_closest_marker("skip_gpu_py2")
+        and is_gpu
+        and py_version != "py3"
+        and framework_version == "1.4.0"
+    ):
+        pytest.skip("Skipping the test until mms issue resolved.")
+
+
+def _get_remote_override_flags():
+    try:
+        s3_client = boto3.client("s3")
+        sts_client = boto3.client("sts")
+        account_id = sts_client.get_caller_identity().get("Account")
+        result = s3_client.get_object(
+            Bucket=f"dlc-cicd-helper-{account_id}", Key="override_tests_flags.json"
+        )
+        json_content = json.loads(result["Body"].read().decode("utf-8"))
+    except ClientError as e:
+        logger.warning("ClientError when performing S3/STS operation: {}".format(e))
+        json_content = {}
+    return json_content
+
+
+def _is_test_disabled(test_name, build_name, version):
+    """
+    Expected format of remote_override_flags:
+    {
+        "CB Project Name for Test Type A": {
+            "CodeBuild Resolved Source Version": ["test_type_A_test_function_1", "test_type_A_test_function_2"]
+        },
+        "CB Project Name for Test Type B": {
+            "CodeBuild Resolved Source Version": ["test_type_B_test_function_1", "test_type_B_test_function_2"]
+        }
+    }
+
+    :param test_name: str Test Function node name (includes parametrized values in string)
+    :param build_name: str Build Project name of current execution
+    :param version: str Source Version of current execution
+    :return: bool True if test is disabled as per remote override, False otherwise
+    """
+    remote_override_flags = _get_remote_override_flags()
+    remote_override_build = remote_override_flags.get(build_name, {})
+    if version in remote_override_build:
+        return not remote_override_build[version] or any(
+            [test_keyword in test_name for test_keyword in remote_override_build[version]]
+        )
+    return False
+
+
+@pytest.fixture(autouse=True)
+def disable_test(request):
+    test_name = request.node.name
+    # We do not have a regex pattern to find CB name, which means we must resort to string splitting
+    build_arn = os.getenv("CODEBUILD_BUILD_ARN")
+    build_name = build_arn.split("/")[-1].split(":")[0] if build_arn else None
+    version = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
+
+    if build_name and version and _is_test_disabled(test_name, build_name, version):
+        pytest.skip(f"Skipping {test_name} test because it has been disabled.")
+
+
+@pytest.fixture(autouse=True)
+def skip_test_successfully_executed_before(request):
+    """
+    "cache/lastfailed" contains information about failed tests only. We're running SM tests in separate threads for each image.
+    So when we retry SM tests, successfully executed tests are executed again because pytest doesn't have that info in /.cache.
+    But the flag "--last-failed-no-failures all" requires pytest to execute all the available tests.
+    The only sign that a test passed last time: the lastfailed file exists and the test name isn't in that file.
+    The method checks whether the lastfailed file exists and the test name is not in it.
+    """
+    test_name = request.node.name
+    lastfailed = request.config.cache.get("cache/lastfailed", None)
+    # NOTE: the skip below is commented out, so this fixture is currently a no-op.
+    # if lastfailed is not None and not any(
+    #     test_name in failed_test_name for failed_test_name in lastfailed.keys()
+    # ):
+    #     pytest.skip(f"Skipping {test_name} because it was successfully executed for this commit")
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py
new file mode 100644
index 000000000000..9befa612dc56
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py
@@ -0,0 +1,119 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import json
+import os
+import re
+import shutil
+import tarfile
+
+import boto3
+
+# Path to test resources
+resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources"))
+
+# Model artifacts for local mode tests - downloaded from HuggingFace Hub at runtime
+MODEL_ID = "unsloth/Qwen3.5-0.8B-GGUF"
+model_dir = os.path.join(resources_path, "qwen3.5-0.8b")
+model_data = "qwen3.5-0.8b.tar.gz"
+model_data_path = os.path.join(model_dir, model_data)
+
+
+def ensure_model_downloaded():
+    """Download model from HuggingFace Hub and create tarball if not already present."""
+    if os.path.exists(model_data_path):
+        return model_data_path
+
+    from huggingface_hub import snapshot_download
+
+    os.makedirs(model_dir, exist_ok=True)
+    local_model_dir = os.path.join(model_dir, "model")
+
+    print(f"Downloading {MODEL_ID} from HuggingFace Hub...")
+    snapshot_download(
+        repo_id=MODEL_ID, local_dir=local_model_dir, ignore_patterns=["*.onnx"]
+    )
+
+    # Remove cache folder if present
+    cache_dir = os.path.join(local_model_dir, ".cache")
+    if os.path.exists(cache_dir):
+        shutil.rmtree(cache_dir)
+
+    print(f"Creating tarball {model_data}...")
+    with tarfile.open(model_data_path, "w:gz") as tar:
+        for item in os.listdir(local_model_dir):
+            tar.add(os.path.join(local_model_dir, item), arcname=item)
+
+    # Clean up extracted model
+    shutil.rmtree(local_model_dir)
+
+    print(f"Model ready at {model_data_path}")
+    return model_data_path
+
+
+# Role for local mode (not used but required by SageMaker SDK)
+ROLE = "dummy/unused-role"
+DEFAULT_TIMEOUT = 45
+
+# Llama.cpp SageMaker images listen on port 8080 with a small HTTP shim (/ping,
+# /invocations) that proxies to llama-server on loopback (see llamacpp_sagemaker_serve).
+# Do not set SM_LLAMACPP_HOST or SM_LLAMACPP_PORT expecting external access to
+# llama-server; the entrypoint pins the server to localhost and exposes the shim on 8080.
+
+
+class NoLogStreamFoundError(Exception):
+    pass
+
+
+class SageMakerEndpointFailure(Exception):
+    pass
+
+
+def dump_logs_from_cloudwatch(e, region="us-west-2"):
+    """
+    Function to dump logs from cloudwatch during error handling.
+    Gracefully handles missing log groups/streams.
+    """
+    error_hosting_endpoint_regex = re.compile(r"Error hosting endpoint ((\w|-)+):")
+    endpoint_url_regex = re.compile(r"/aws/sagemaker/Endpoints/((\w|-)+)")
+    endpoint_match = error_hosting_endpoint_regex.search(str(e)) or endpoint_url_regex.search(
+        str(e)
+    )
+    if endpoint_match:
+        logs_client = boto3.client("logs", region_name=region)
+        endpoint = endpoint_match.group(1)
+        log_group_name = f"/aws/sagemaker/Endpoints/{endpoint}"
+        try:
+            log_stream_resp = logs_client.describe_log_streams(logGroupName=log_group_name)
+            all_traffic_log_stream = ""
+            for log_stream in log_stream_resp.get("logStreams", []):
+                log_stream_name = log_stream.get("logStreamName")
+                if log_stream_name.startswith("AllTraffic"):
+                    all_traffic_log_stream = log_stream_name
+                    break
+            if not all_traffic_log_stream:
+                raise NoLogStreamFoundError(
+                    f"Cannot find all traffic log streams for endpoint {endpoint}"
+                ) from e
+            events = logs_client.get_log_events(
+                logGroupName=log_group_name, logStreamName=all_traffic_log_stream
+            )
+            raise SageMakerEndpointFailure(
+                f"Error from endpoint {endpoint}:\n{json.dumps(events, indent=4)}"
+            ) from e
+        except logs_client.exceptions.ResourceNotFoundException:
+            # Log group doesn't exist yet - endpoint may have failed before creating logs
+            raise SageMakerEndpointFailure(
+                f"Endpoint {endpoint} failed. No CloudWatch logs available yet."
+            ) from e
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/local/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/local/__init__.py
new file mode 100644
index 000000000000..199e66b95926
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/local/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py
new file mode 100644
index 000000000000..a5f10f4de27f
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py
@@ -0,0 +1,109 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+from contextlib import contextmanager
+
+import pytest
+import requests
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+from sagemaker.deserializers import JSONDeserializer
+
+from ...integration import ROLE, ensure_model_downloaded
+from ...utils import local_mode_utils
+
+
+@contextmanager
+def _predictor(image, sagemaker_local_session, instance_type):
+    """Context manager for Llama.cpp model deployment and cleanup.
+
+    Model is extracted to /opt/ml/model by SageMaker from model_data tar.gz.
+    The container entrypoint runs llama-server behind a SageMaker-compatible
+    proxy on port 8080 (/ping, /invocations -> OpenAI routes on llama-server).
+    """
+    # Download model from HuggingFace Hub if not already present
+    model_data_path = ensure_model_downloaded()
+
+    env = {
+        "SM_LLAMACPP_MODEL": "/opt/ml/model",
+    }
+
+    model = Model(
+        model_data=f"file://{model_data_path}",
+        role=ROLE,
+        image_uri=image,
+        env=env,
+        sagemaker_session=sagemaker_local_session,
+        predictor_cls=Predictor,
+    )
+    with local_mode_utils.lock():
+        predictor = None
+        try:
+            predictor = model.deploy(1, instance_type)
+            yield predictor
+        finally:
+            if predictor is not None:
+                predictor.delete_endpoint()
+
+
+def _assert_sagemaker_ping_local():
+    """SageMaker contract: GET /ping on the container HTTP port (local mode: 8080)."""
+    response = requests.get("http://127.0.0.1:8080/ping", timeout=60)
+    assert response.status_code == 200
+
+
+def _assert_llamacpp_chat_prediction(predictor):
+    """Test Llama.cpp inference using OpenAI-compatible chat completions API."""
+    predictor.serializer = JSONSerializer()
+    predictor.deserializer = JSONDeserializer()
+
+    data = {
+        "messages": [{"role": "user", "content": "What is Deep Learning?"}],
+        "max_tokens": 50,
+        "temperature": 0.7,
+    }
+    output = predictor.predict(data)
+
+    assert output is not None
+    assert "choices" in output
+
+
+def _assert_llamacpp_chat_prediction_explicit_route(predictor):
+    """Same as chat test but forces target path via SageMaker CustomAttributes (proxy route=)."""
+    predictor.serializer = JSONSerializer()
+    predictor.deserializer = JSONDeserializer()
+
+    data = {
+        "messages": [{"role": "user", "content": "Say hello in one word."}],
+        "max_tokens": 16,
+        "temperature": 0.3,
+    }
+    output = predictor.predict(
+        data,
+        custom_attributes="route=/v1/chat/completions",
+    )
+
+    assert output is not None
+    assert "choices" in output
+
+
+@pytest.mark.model("qwen3.5-0.8b")
+@pytest.mark.team("sagemaker-1p-algorithms")
+def test_llamacpp_local_chat(docker_image, sagemaker_local_session, instance_type):
+    """Test Llama.cpp local deployment: /ping shim, /invocations chat, and explicit route=."""
+    with _predictor(docker_image, sagemaker_local_session, instance_type) as predictor:
+        _assert_sagemaker_ping_local()
+        _assert_llamacpp_chat_prediction(predictor)
+        _assert_llamacpp_chat_prediction_explicit_route(predictor)
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/__init__.py
new file mode 100644
index 000000000000..04fbf5d9a144
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/__init__.py
@@ -0,0 +1,12 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py
new file mode 100644
index 000000000000..f00668e50844
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py
@@ -0,0 +1,116 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import json
+import logging
+
+import pytest
+import sagemaker
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+from sagemaker.deserializers import JSONDeserializer
+
+from ...integration import dump_logs_from_cloudwatch
+from ...integration.sagemaker.timeout import timeout_and_delete_endpoint
+from ..... import invoke_sm_endpoint_helper_function
+
+LOGGER = logging.getLogger(__name__)
+
+
+@pytest.mark.model("qwen3.5-0.8b")
+@pytest.mark.processor("gpu")
+@pytest.mark.gpu_test
+@pytest.mark.team("sagemaker-1p-algorithms")
+def test_llamacpp_qwen(framework_version, ecr_image, instance_type, sagemaker_regions):
+    invoke_sm_endpoint_helper_function(
+        ecr_image=ecr_image,
+        sagemaker_regions=sagemaker_regions,
+        test_function=_test_llamacpp_model,
+        dump_logs_from_cloudwatch=dump_logs_from_cloudwatch,
+        framework_version=framework_version,
+        instance_type=instance_type,
+        model_id="unsloth/Qwen3.5-0.8B-GGUF",
+    )
+
+
+def _test_llamacpp_model(
+    image_uri,
+    sagemaker_session,
+    instance_type,
+    model_id,
+    framework_version=None,
+    **kwargs,
+):
+    """Test Llama.cpp model deployment and inference using OpenAI-compatible API format
+
+    Uses sagemaker.model.Model for SDK v3 compatibility instead of HuggingFaceModel.
+
+    Args:
+        image_uri: ECR image URI
+        sagemaker_session: SageMaker session
+        instance_type: ML instance type
+        model_id: HuggingFace model ID
+        framework_version: Optional version info
+        **kwargs: Additional args from helper (boto_session, sagemaker_client, etc.)
+    """
+    endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-hf-llamacpp-serving")
+
+    env = {
+        "SM_LLAMACPP_MODEL": model_id,
+    }
+
+    model = Model(
+        name=endpoint_name,
+        image_uri=image_uri,
+        role="SageMakerRole",
+        env=env,
+        sagemaker_session=sagemaker_session,
+        predictor_cls=Predictor,
+    )
+
+    with timeout_and_delete_endpoint(endpoint_name, sagemaker_session, minutes=45):
+        predictor = model.deploy(
+            initial_instance_count=1,
+            instance_type=instance_type,
+            endpoint_name=endpoint_name,
+            container_startup_health_check_timeout=1800,
+            inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
+        )
+
+        predictor.serializer = JSONSerializer()
+        predictor.deserializer = JSONDeserializer()
+
+        # Llama.cpp SageMaker uses OpenAI-compatible chat completions API format
+        data = {
+            "messages": [{"role": "user", "content": "What is Deep Learning?"}],
+            "max_tokens": 50,
+            "temperature": 0.7,
+        }
+
+        LOGGER.info(f"Running inference with data: {data}")
+        output = predictor.predict(data)
+        LOGGER.info(f"Output: {json.dumps(output)}")
+
+        assert output is not None
+        assert "choices" in output
+
+        # Explicit route= mirrors vLLM-Omni-style CustomAttributes routing in the container proxy.
+        output_routed = predictor.predict(
+            data,
+            custom_attributes="route=/v1/chat/completions",
+        )
+        LOGGER.info(f"Output (routed): {json.dumps(output_routed)}")
+        assert output_routed is not None
+        assert "choices" in output_routed
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py
new file mode 100644
index 000000000000..1d13878031f7
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py
@@ -0,0 +1,66 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+import signal
+from contextlib import contextmanager
+import logging
+
+from botocore.exceptions import ClientError
+
+LOGGER = logging.getLogger("timeout")
+
+
+class TimeoutError(Exception):
+    """Raised by the SIGALRM handler in timeout() when the time limit is exceeded."""
+
+
+@contextmanager
+def timeout(seconds=0, minutes=0, hours=0):
+    """Add a signal-based timeout to any block of code.
+    If multiple time units are specified, they will be added together to determine time limit.
+    When the limit elapses, the SIGALRM handler raises this module's TimeoutError.
+    Usage: ``with timeout(seconds=5): my_slow_function(...)``
+    The pending alarm is cancelled when the block exits, whether or not it raised.
+    Args:
+        - seconds: The time limit, in seconds.
+        - minutes: The time limit, in minutes.
+        - hours: The time limit, in hours.
+    """
+
+    limit = seconds + 60 * minutes + 3600 * hours
+
+    def handler(signum, frame):
+        raise TimeoutError("timed out after {} seconds".format(limit))
+
+    try:
+        signal.signal(signal.SIGALRM, handler)
+        signal.alarm(limit)
+
+        yield
+    finally:
+        signal.alarm(0)
+
+
+@contextmanager
+def timeout_and_delete_endpoint(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0):
+    with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
+        try:
+            yield [t]
+        finally:
+            try:
+                sagemaker_session.delete_endpoint(endpoint_name)
+                LOGGER.info("deleted endpoint {}".format(endpoint_name))
+            except ClientError as ce:
+                if ce.response["Error"]["Code"] == "ValidationException":
+                    # don't mask the inner exception (NOTE: other ClientErrors are dropped too)
+                    pass
diff --git a/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes b/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes
new file mode 100644
index 000000000000..6ba5bc1386f8
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes
@@ -0,0 +1,61 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+mmproj-BF16.gguf filter=lfs diff=lfs merge=lfs -text
+mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text
+mmproj-F32.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-UD-IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-UD-Q2_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-UD-Q5_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-UD-Q4_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-UD-IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-UD-IQ2_XXS.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q4_1.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-UD-Q6_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-UD-Q8_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
+imatrix_unsloth.gguf_file filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-BF16.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen3.5-0.8B-UD-Q3_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
diff --git a/test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py
new file mode 100644
index 000000000000..6932ed1abd5b
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import boto3
+import botocore
+
+
+def _botocore_resolver():
+    """
+    Build an endpoint resolver from the "endpoints" data loaded by a botocore loader.
+    :return: botocore.regions.EndpointResolver
+    """
+    loader = botocore.loaders.create_loader()
+    return botocore.regions.EndpointResolver(loader.load_data("endpoints"))
+
+
+def get_ecr_registry(account, region):
+    """
+    Build the ECR registry host name for an account in a region.
+    :param account: Account ID
+    :param region: region where ECR repo exists
+    :return: "<account>.dkr.<hostname of the resolved ECR endpoint>"
+    """
+    hostname = _botocore_resolver().construct_endpoint("ecr", region)["hostname"]
+    return "{}.dkr.{}".format(account, hostname)
diff --git a/test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py b/test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py
new file mode 100644
index 000000000000..3421e6ce2b42
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py
@@ -0,0 +1,67 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+import subprocess
+import sys
+
+CYAN_COLOR = "\033[36m"
+END_COLOR = "\033[0m"
+
+
+def build_base_image(
+    framework_name, framework_version, py_version, processor, base_image_tag, cwd="."
+):
+    """Build the framework's base image with ``docker build`` and return its URI.
+
+    The image is tagged ``<framework_name>-base:<base_image_tag>`` and built from
+    docker/<framework_version>/base/Dockerfile.<processor>. ``cwd`` is both the working
+    directory of the docker command and the build-context argument passed to it.
+    The ``py_version`` build arg receives only the final character of ``py_version``.
+
+    Raises subprocess.CalledProcessError if the docker command exits non-zero.
+    """
+    image = get_base_image_uri(framework_name, base_image_tag)
+    dockerfile_path = os.path.join(
+        "docker", framework_version, "base", "Dockerfile.{}".format(processor)
+    )
+    # Only the last character of py_version is forwarded as the build arg.
+    py_build_arg = "py_version={}".format(py_version[-1])
+
+    docker_cmd = ["docker", "build", "-t", image, "-f", dockerfile_path]
+    docker_cmd += ["--build-arg", py_build_arg, cwd]
+    subprocess.check_call(docker_cmd, cwd=cwd)
+
+    print("created image {}".format(image))
+    return image
+
+
+def get_base_image_uri(framework_name, base_image_tag):
+    return "{}-base:{}".format(framework_name, base_image_tag)
+
+
+def get_image_uri(framework_name, tag):
+    return "{}:{}".format(framework_name, tag)
+
+
+def _check_call(cmd, *popenargs, **kwargs):
+    # A string command is tokenized on single spaces before being echoed and run.
+    argv = cmd.split(" ") if isinstance(cmd, str) else cmd
+    _print_cmd(argv)
+    subprocess.check_call(argv, *popenargs, **kwargs)
+
+
+def _print_cmd(cmd):
+    print("executing docker command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR))
+    sys.stdout.flush()
diff --git a/test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py b/test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py
new file mode 100644
index 000000000000..fa6b3cf00c36
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py
@@ -0,0 +1,46 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+from contextlib import contextmanager
+import fcntl
+import os
+import tarfile
+import time
+
+from ..integration import resources_path
+
+LOCK_PATH = os.path.join(resources_path, "local_mode_lock")
+
+
+@contextmanager
+def lock():
+    """Hold an exclusive lock on LOCK_PATH for the duration of the ``with`` body."""
+    # Since Local Mode uses the same port for serving, we need a lock in order
+    # to allow concurrent test execution.
+    # ``with open`` closes the lock file on exit instead of leaking the handle.
+    with open(LOCK_PATH, "w") as local_mode_lock_fd:
+        local_mode_lock = local_mode_lock_fd.fileno()
+        fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
+        try:
+            yield
+        finally:
+            time.sleep(5)
+            fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
+
+
+def assert_files_exist(output_path, directory_file_map):
+    for directory, files in directory_file_map.items():
+        with tarfile.open(os.path.join(output_path, "{}.tar.gz".format(directory))) as tar:
+            for f in files:
+                tar.getmember(f)
diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
index 51ca9a276922..571b9fb26ed3 100644
--- a/test/test_utils/__init__.py
+++ b/test/test_utils/__init__.py
@@ -2082,6 +2082,7 @@ def get_job_type_from_image(image_uri):
         "base": "general",
         "vllm": "general",
         "sglang": "general",
+        "llamacpp": "general",
     }
 
     for key, job_type in job_type_mapping.items():

From d2cc69f4278ca986199522e252abbd02dcb44843 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Mon, 27 Apr 2026 09:34:51 +0000
Subject: [PATCH 03/14] Update Docker base name and coverage report framework
 for HuggingFace LlamaCpp support

---
 test/sagemaker_tests/huggingface/llamacpp/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/sagemaker_tests/huggingface/llamacpp/conftest.py b/test/sagemaker_tests/huggingface/llamacpp/conftest.py
index cbf61a194072..57374310db49 100644
--- a/test/sagemaker_tests/huggingface/llamacpp/conftest.py
+++ b/test/sagemaker_tests/huggingface/llamacpp/conftest.py
@@ -87,7 +87,7 @@ def pytest_addoption(parser):
     parser.addoption("--aws-id")
     parser.addoption("--instance-type")
     parser.addoption("--accelerator-type", default=None)
-    parser.addoption("--docker-base-name", default="huggingface_sglang")
+    parser.addoption("--docker-base-name", default="huggingface_llamacpp")
     parser.addoption("--region", default="us-west-2")
     parser.addoption("--framework-version", default="")
     parser.addoption(
@@ -140,7 +140,7 @@ def pytest_collection_modifyitems(session, config, items):
         from test.test_utils.test_reporting import TestReportGenerator
 
         report_generator = TestReportGenerator(items, is_sagemaker=True)
-        report_generator.generate_coverage_doc(framework="huggingface_sglang", job_type="inference")
+        report_generator.generate_coverage_doc(framework="huggingface_llamacpp", job_type="inference")
 
 
 @pytest.fixture(scope="session", name="docker_base_name")

From 1afc8338d068717d5e75d04c8b5d74b641f7e374 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Mon, 27 Apr 2026 16:18:45 +0000
Subject: [PATCH 04/14] Update local & sagemaker tests for llama.cpp DLC

---
 .../llamacpp/integration/__init__.py          | 32 +++++++++++++++----
 .../integration/local/test_serving.py         |  2 +-
 .../{test_sglang.py => test_llamacpp.py}      |  2 +-
 .../huggingface/llamacpp/requirements.txt     | 29 +++++++++++++++++
 4 files changed, 57 insertions(+), 8 deletions(-)
 rename test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/{test_sglang.py => test_llamacpp.py} (98%)
 create mode 100644 test/sagemaker_tests/huggingface/llamacpp/requirements.txt

diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py
index 9befa612dc56..79a9cbb4223d 100644
--- a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py
@@ -25,24 +25,44 @@
 
 # Model artifacts for local mode tests - downloaded from HuggingFace Hub at runtime
 MODEL_ID = "unsloth/Qwen3.5-0.8B-GGUF"
+MODEL_FILENAME = "Qwen3.5-0.8B-UD-IQ2_XXS.gguf"
 model_dir = os.path.join(resources_path, "qwen3.5-0.8b")
 model_data = "qwen3.5-0.8b.tar.gz"
 model_data_path = os.path.join(model_dir, model_data)
 
 
+def _tar_contains_expected_model(tar_path):
+    if not os.path.exists(tar_path):
+        return False
+    try:
+        with tarfile.open(tar_path, "r:gz") as tar:
+            return any(
+                os.path.basename(member.name) == MODEL_FILENAME
+                for member in tar.getmembers()
+                if member.isfile()
+            )
+    except tarfile.TarError:
+        return False
+
+
 def ensure_model_downloaded():
     """Download model from HuggingFace Hub and create tarball if not already present."""
-    if os.path.exists(model_data_path):
+    if _tar_contains_expected_model(model_data_path):
         return model_data_path
 
-    from huggingface_hub import snapshot_download
+    from huggingface_hub import hf_hub_download
 
     os.makedirs(model_dir, exist_ok=True)
     local_model_dir = os.path.join(model_dir, "model")
-
-    print(f"Downloading {MODEL_ID} from HuggingFace Hub...")
-    snapshot_download(
-        repo_id=MODEL_ID, local_dir=local_model_dir, ignore_patterns=["*.onnx"]
+    if os.path.exists(local_model_dir):
+        shutil.rmtree(local_model_dir)
+    os.makedirs(local_model_dir, exist_ok=True)
+
+    print(f"Downloading {MODEL_FILENAME} from {MODEL_ID} on HuggingFace Hub...")
+    hf_hub_download(
+        repo_id=MODEL_ID,
+        filename=MODEL_FILENAME,
+        local_dir=local_model_dir,
     )
 
     # Remove cache folder if present
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py
index a5f10f4de27f..68691d05b559 100644
--- a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py
@@ -37,7 +37,7 @@ def _predictor(image, sagemaker_local_session, instance_type):
     model_data_path = ensure_model_downloaded()
 
     env = {
-        "SM_LLAMACPP_MODEL": "/opt/ml/model",
+        "SM_LLAMACPP_MODEL": "/opt/ml/model/Qwen3.5-0.8B-UD-IQ2_XXS.gguf",
     }
 
     model = Model(
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py
similarity index 98%
rename from test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py
rename to test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py
index f00668e50844..b22b32f27543 100644
--- a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_sglang.py
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py
@@ -68,7 +68,7 @@ def _test_llamacpp_model(
     endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-hf-llamacpp-serving")
 
     env = {
-        "SM_LLAMACPP_MODEL": model_id,
+        "SM_LLAMACPP_HF_REPO": model_id,
     }
 
     model = Model(
diff --git a/test/sagemaker_tests/huggingface/llamacpp/requirements.txt b/test/sagemaker_tests/huggingface/llamacpp/requirements.txt
new file mode 100644
index 000000000000..890bbe499718
--- /dev/null
+++ b/test/sagemaker_tests/huggingface/llamacpp/requirements.txt
@@ -0,0 +1,29 @@
+boto3
+coverage
+# Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local)
+docker>=5,<=6.1.3
+flake8==3.7.7
+Flask==1.1.1
+mock
+pytest==8.3.5
+pytest-cov
+pytest-rerunfailures
+pytest-xdist
+PyYAML
+protobuf>=3.20,<=3.20.2
+sagemaker>=2.237.0,<3
+six
+requests<2.32.0
+requests_mock
+Pillow
+retrying==1.3.3
+urllib3>=1.26.8
+pluggy>=1.5,<2
+requests_mock
+sagemaker-inference
+tenacity
+fabric
+invoke
+gitpython
+toml
+huggingface_hub

From ecdd5a2474d8ce616e5e1f1f22e8997a6ea3c982 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Mon, 27 Apr 2026 16:22:20 +0000
Subject: [PATCH 05/14] Update dlc_developer_config.toml

---
 dlc_developer_config.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 08ad2014d650..585a65356d74 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -1,6 +1,6 @@
 [dev]
 # Set to "huggingface", for example, if you are a huggingface developer. Default is ""
-partner_developer = ""
+partner_developer = "huggingface"
 # Please only set it to true if you are preparing an EI related PR
 # Do remember to revert it back to false before merging any PR (including EI dedicated PR)
 ei_mode = false
@@ -37,7 +37,7 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["huggingface_llamacpp"]
 
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
@@ -193,7 +193,7 @@ dlc-pr-huggingface-vllm = ""
 dlc-pr-huggingface-sglang = ""
 
 # Huggingface Llamacpp
-dlc-pr-huggingface-llamacpp = ""
+dlc-pr-huggingface-llamacpp = "/huggingface/llamacpp/buildspec.yml"
 
 # sglang
 dlc-pr-sglang = ""

From 3f69b07d7d2b58b369a31ab82702e0f08ff18607 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Mon, 27 Apr 2026 16:40:28 +0000
Subject: [PATCH 06/14] Disable training container build in
 dlc_developer_config.toml

---
 dlc_developer_config.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 585a65356d74..60593de299b7 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -41,7 +41,7 @@ build_frameworks = ["huggingface_llamacpp"]
 
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
-build_training = true
+build_training = false
 build_inference = true
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR

From f5bcbc35232cc1cce3755d53562285891e22e3f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Wed, 29 Apr 2026 12:25:06 +0200
Subject: [PATCH 07/14] Refactor SageMaker integration for llama.cpp: replace
 Python proxy with custom llama-server build

---
 .../llamacpp_sagemaker_serve.py               | 211 ------------------
 .../llamacpp_sagemaker_server.patch           | 133 +++++++++++
 .../build_artifacts/sagemaker_entrypoint.sh   |  62 +----
 huggingface/llamacpp/buildspec.yml            |   6 +-
 .../llamacpp/docker/b8882/Dockerfile.cpu      |  74 ++++++
 .../docker/b8882/cu130/Dockerfile.gpu         |  83 +++++--
 .../llamacpp/integration/__init__.py          |   6 +-
 .../integration/local/test_serving.py         |   6 +-
 .../integration/sagemaker/test_llamacpp.py    |   2 +-
 9 files changed, 293 insertions(+), 290 deletions(-)
 delete mode 100644 huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py
 create mode 100644 huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch

diff --git a/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py
deleted file mode 100644
index 38da145b33e0..000000000000
--- a/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# SPDX-License-Identifier: Apache-2.0
-"""SageMaker HTTP proxy for llama.cpp llama-server.
-
-SageMaker invokes POST /invocations and GET /ping on port 8080. llama-server
-speaks OpenAI-style routes (e.g. /v1/chat/completions) and does not expose
-/invocations.
-
-Behavior mirrors scripts/vllm/omni_sagemaker_serve.py routing:
-
-- GET /ping is proxied to GET {backend}/health.
-- POST /invocations: if ``X-Amzn-SageMaker-Custom-Attributes`` contains
-  ``route=/some/path``, the request is forwarded to that path on llama-server.
-  Otherwise the target path is inferred from the JSON body (messages ->
-  /v1/chat/completions, prompt -> /v1/completions, input+model -> /v1/embeddings),
-  defaulting to /v1/chat/completions.
-
-For routes that require multipart/form-data (parity with vLLM-Omni), JSON bodies
-are converted when ``route=`` targets those paths.
-
-Environment:
-
-- LLAMACPP_SAGEMAKER_BACKEND_URL: upstream base URL (default http://127.0.0.1:8081)
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-import os
-import re
-import uuid
-from collections.abc import AsyncIterator
-
-import httpx
-from starlette.applications import Starlette
-from starlette.requests import Request
-from starlette.responses import Response, StreamingResponse
-from starlette.routing import Route
-
-logger = logging.getLogger("llamacpp_sagemaker")
-
-BACKEND = os.environ.get("LLAMACPP_SAGEMAKER_BACKEND_URL", "http://127.0.0.1:8081").rstrip("/")
-
-FORM_DATA_ROUTES = frozenset({"/v1/videos", "/v1/videos/sync"})
-
-_HOP_BY_HOP = frozenset(
-    {
-        "connection",
-        "keep-alive",
-        "proxy-authenticate",
-        "proxy-authorization",
-        "te",
-        "trailers",
-        "transfer-encoding",
-        "upgrade",
-        "host",
-        "content-length",
-    }
-)
-
-_RESP_DROP = frozenset({"transfer-encoding", "content-length", "connection"})
-
-
-def _parse_route_from_header(raw: str | None) -> str | None:
-    if not raw:
-        return None
-    m = re.search(r"route=(/[^\s,]+)", raw)
-    return m.group(1) if m else None
-
-
-def _parse_route(request: Request) -> str | None:
-    h = request.headers
-    v = h.get("x-amzn-sagemaker-custom-attributes")
-    return _parse_route_from_header(v)
-
-
-def _build_multipart_body(data: dict, boundary: str) -> bytes:
-    parts: list[str] = []
-    for key, value in data.items():
-        parts.append(
-            f'--{boundary}\r\nContent-Disposition: form-data; name="{key}"\r\n\r\n{value}\r\n'
-        )
-    parts.append(f"--{boundary}--\r\n")
-    return "".join(parts).encode()
-
-
-def _default_path_for_invocation(content_type: str, body: bytes) -> str:
-    ct = (content_type or "").lower()
-    if "json" not in ct:
-        return "/v1/chat/completions"
-    try:
-        data = json.loads(body)
-    except (json.JSONDecodeError, UnicodeDecodeError):
-        return "/v1/chat/completions"
-    if not isinstance(data, dict):
-        return "/v1/chat/completions"
-    if "messages" in data:
-        return "/v1/chat/completions"
-    if "prompt" in data:
-        return "/v1/completions"
-    if "input" in data and "model" in data:
-        return "/v1/embeddings"
-    return "/v1/chat/completions"
-
-
-def _forward_request_headers(request: Request, body_len: int, content_type: str | None) -> dict[str, str]:
-    out: dict[str, str] = {}
-    for key, value in request.headers.items():
-        lk = key.lower()
-        if lk in _HOP_BY_HOP or lk == "x-amzn-sagemaker-custom-attributes":
-            continue
-        out[key] = value
-    out["content-length"] = str(body_len)
-    if content_type is not None:
-        out["content-type"] = content_type
-    return out
-
-
-def _response_headers_from_httpx(resp: httpx.Response) -> dict[str, str]:
-    h: dict[str, str] = {}
-    for key, value in resp.headers.items():
-        lk = key.lower()
-        if lk in _RESP_DROP:
-            continue
-        h[key] = value
-    return h
-
-
-async def ping(request: Request) -> Response:
-    url = f"{BACKEND}/health"
-    try:
-        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=2.0)) as client:
-            r = await client.get(url)
-    except httpx.RequestError as e:
-        logger.warning("Backend health request failed: %s", e)
-        return Response(status_code=503, content=b'{"error":"backend_unavailable"}')
-    return Response(
-        status_code=r.status_code,
-        content=r.content,
-        headers=_response_headers_from_httpx(r),
-    )
-
-
-async def invocations(request: Request) -> Response:
-    if request.method != "POST":
-        return Response(status_code=405, content=b"Method Not Allowed")
-
-    body = await request.body()
-    route = _parse_route(request)
-    content_type = request.headers.get("content-type")
-
-    if route:
-        target = route
-        logger.info("Rerouting /invocations -> %s", target)
-        ct = (content_type or "").lower()
-        if target in FORM_DATA_ROUTES and "json" in ct:
-            try:
-                data = json.loads(body)
-            except (json.JSONDecodeError, UnicodeDecodeError):
-                data = None
-            if isinstance(data, dict):
-                boundary = uuid.uuid4().hex
-                body = _build_multipart_body(data, boundary)
-                content_type = f"multipart/form-data; boundary={boundary}"
-                logger.info("Converted JSON to form-data for %s", target)
-    else:
-        target = _default_path_for_invocation(content_type or "", body)
-        logger.info("Inferred /invocations -> %s", target)
-
-    url = f"{BACKEND}{target}"
-    fwd_headers = _forward_request_headers(request, len(body), content_type)
-
-    timeout = httpx.Timeout(600.0, connect=30.0)
-    client = httpx.AsyncClient(timeout=timeout)
-    try:
-        req = client.build_request("POST", url, headers=fwd_headers, content=body)
-        r = await client.send(req, stream=True)
-    except httpx.RequestError as e:
-        await client.aclose()
-        logger.exception("Upstream request failed: %s", e)
-        return Response(status_code=502, content=json.dumps({"error": "upstream_error"}).encode())
-
-    async def stream_body() -> AsyncIterator[bytes]:
-        try:
-            async for chunk in r.aiter_bytes():
-                yield chunk
-        finally:
-            await r.aclose()
-            await client.aclose()
-
-    return StreamingResponse(
-        stream_body(),
-        status_code=r.status_code,
-        headers=_response_headers_from_httpx(r),
-        media_type=r.headers.get("content-type"),
-    )
-
-
-routes = [
-    Route("/ping", ping, methods=["GET"]),
-    Route("/invocations", invocations, methods=["POST"]),
-]
-
-app = Starlette(routes=routes)
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s %(levelname)s %(name)s %(message)s",
-    force=True,
-)
diff --git a/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch
new file mode 100644
index 000000000000..a8491c93a26a
--- /dev/null
+++ b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch
@@ -0,0 +1,133 @@
+--- a/tools/server/server.cpp
++++ b/tools/server/server.cpp
+@@ -11,7 +11,9 @@
+ #include "llama.h"
+ #include "log.h"
+ 
++#include <algorithm>
+ #include <atomic>
++#include <cctype>
+ #include <clocale>
+ #include <exception>
+ #include <signal.h>
+@@ -69,6 +71,81 @@
+         }
+         return res;
+     };
++}
++
++static std::string sagemaker_header(const server_http_req & req, const std::string & name) {
++    for (const auto & h : req.headers) {
++        std::string key = h.first;
++        std::transform(key.begin(), key.end(), key.begin(), [](unsigned char c) { return std::tolower(c); });
++        if (key == name) {
++            return h.second;
++        }
++    }
++    return "";
++}
++
++static std::string sagemaker_route_from_attrs(const server_http_req & req) {
++    const std::string attrs = sagemaker_header(req, "x-amzn-sagemaker-custom-attributes");
++    const std::string key = "route=";
++    const size_t pos = attrs.find(key);
++    if (pos == std::string::npos) {
++        return "";
++    }
++    const size_t start = pos + key.size();
++    const size_t end = attrs.find_first_of(",; \t\r\n", start);
++    return attrs.substr(start, end == std::string::npos ? std::string::npos : end - start);
++}
++
++static bool sagemaker_route_syntax_ok(const std::string & route) {
++    return !route.empty() && route[0] == '/' && route.find("..") == std::string::npos &&
++           route.find("://") == std::string::npos && route.find('?') == std::string::npos &&
++           route.find('#') == std::string::npos;
++}
++
++static std::string sagemaker_default_route(const server_http_req & req) {
++    const json body = json::parse(req.body, nullptr, false);
++    if (body.is_object()) {
++        if (body.contains("messages")) {
++            return "/v1/chat/completions";
++        }
++        if (body.contains("prompt")) {
++            return "/v1/completions";
++        }
++        if (body.contains("input")) {
++            return "/v1/embeddings";
++        }
++    }
++    return "/v1/chat/completions";
++}
++
++static server_http_res_ptr sagemaker_error(int status, const std::string & message) {
++    auto res = std::make_unique<server_http_res>();
++    res->status = status;
++    res->data = safe_json_to_str({
++        { "error", {
++            { "code", status },
++            { "message", message },
++            { "type", "invalid_request_error" },
++        } },
++    });
++    return res;
++}
++
++static server_http_res_ptr sagemaker_invocations(
++        const server_http_req & req,
++        const std::map<std::string, server_http_context::handler_t> & routes) {
++    const std::string requested = sagemaker_route_from_attrs(req);
++    const std::string route = requested.empty() ? sagemaker_default_route(req) : requested;
++    if (!sagemaker_route_syntax_ok(route)) {
++        return sagemaker_error(400, "invalid SageMaker route: " + route);
++    }
++    const auto it = routes.find(route);
++    if (it == routes.end()) {
++        return sagemaker_error(400, "unsupported SageMaker route: " + route);
++    }
++    server_http_req routed_req = req;
++    routed_req.path = route;
++    return it->second(routed_req);
+ }
+ 
+ int main(int argc, char ** argv) {
+@@ -169,6 +246,38 @@
+         ctx_http.post("/models/unload",        ex_wrapper(models_routes->post_router_models_unload));
+     }
+ 
++
++    const std::map<std::string, server_http_context::handler_t> sagemaker_routes = {
++        {"/props", routes.post_props},
++        {"/completion", routes.post_completions},
++        {"/completions", routes.post_completions},
++        {"/v1/completions", routes.post_completions_oai},
++        {"/chat/completions", routes.post_chat_completions},
++        {"/v1/chat/completions", routes.post_chat_completions},
++        {"/v1/responses", routes.post_responses_oai},
++        {"/responses", routes.post_responses_oai},
++        {"/v1/audio/transcriptions", routes.post_transcriptions_oai},
++        {"/audio/transcriptions", routes.post_transcriptions_oai},
++        {"/v1/messages", routes.post_anthropic_messages},
++        {"/v1/messages/count_tokens", routes.post_anthropic_count_tokens},
++        {"/infill", routes.post_infill},
++        {"/embedding", routes.post_embeddings},
++        {"/embeddings", routes.post_embeddings},
++        {"/v1/embeddings", routes.post_embeddings_oai},
++        {"/rerank", routes.post_rerank},
++        {"/reranking", routes.post_rerank},
++        {"/v1/rerank", routes.post_rerank},
++        {"/v1/reranking", routes.post_rerank},
++        {"/tokenize", routes.post_tokenize},
++        {"/detokenize", routes.post_detokenize},
++        {"/apply-template", routes.post_apply_template},
++        {"/lora-adapters", routes.post_lora_adapters},
++    };
++
++    ctx_http.get ("/ping", ex_wrapper(routes.get_health)); // SageMaker health endpoint
++    ctx_http.post("/invocations", ex_wrapper([&sagemaker_routes](const server_http_req & req) {
++        return sagemaker_invocations(req, sagemaker_routes);
++    }));
+     ctx_http.get ("/health",                   ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+     ctx_http.get ("/v1/health",                ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+     ctx_http.get ("/metrics",                  ex_wrapper(routes.get_metrics));
diff --git a/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh b/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
index 7f55bf5967d8..1d06097ef569 100644
--- a/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
+++ b/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
@@ -6,17 +6,16 @@ set -euo pipefail
 bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true
 
 # Source CUDA compat for older drivers (e.g., g5 instances)
-if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then
+if [ -f /usr/local/bin/start_cuda_compat.sh ] \
+    && command -v nvidia-smi >/dev/null 2>&1 \
+    && command -v nvcc >/dev/null 2>&1; then
     source /usr/local/bin/start_cuda_compat.sh
 fi
 
-# SageMaker sends traffic to port 8080 on /ping and /invocations. llama-server
-# listens on a loopback-only port; a small Python proxy (llamacpp_sagemaker_serve)
-# binds 8080 and forwards to llama-server, similar to vLLM-Omni middleware.
-INTERNAL_HOST="${LLAMACPP_SAGEMAKER_INTERNAL_HOST:-127.0.0.1}"
-INTERNAL_PORT="${LLAMACPP_SAGEMAKER_INTERNAL_PORT:-8081}"
-PROXY_PORT="${LLAMACPP_SAGEMAKER_PROXY_PORT:-8080}"
-export LLAMACPP_SAGEMAKER_BACKEND_URL="${LLAMACPP_SAGEMAKER_BACKEND_URL:-http://${INTERNAL_HOST}:${INTERNAL_PORT}}"
+# SageMaker sends traffic to port 8080 on /ping and /invocations. The custom
+# llama-server build handles those routes directly.
+HOST="${LLAMACPP_SAGEMAKER_HOST:-0.0.0.0}"
+PORT="${SAGEMAKER_BIND_TO_PORT:-${LLAMACPP_SAGEMAKER_PORT:-8080}}"
 
 PREFIX="SM_LLAMACPP_"
 ARG_PREFIX="--"
@@ -32,7 +31,7 @@ while IFS='=' read -r key value; do
     fi
 done < <(env | grep "^${PREFIX}" || true)
 
-# Drop any user-supplied --host / --port so inference stays on the internal bind.
+# Drop any user-supplied --host / --port so SageMaker can always reach the server.
 normalized=()
 skip_next=0
 for a in "${ARGS[@]}"; do
@@ -47,49 +46,8 @@ for a in "${ARGS[@]}"; do
     normalized+=("$a")
 done
 ARGS=("${normalized[@]}")
-ARGS+=(--host "$INTERNAL_HOST" --port "$INTERNAL_PORT")
+ARGS+=(--host "$HOST" --port "$PORT")
 
 echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2
 
-/app/llama-server "${ARGS[@]}" &
-LLAMA_PID=$!
-
-wait_for_llama() {
-    local i
-    for i in $(seq 1 120); do
-        if curl -sf "http://${INTERNAL_HOST}:${INTERNAL_PORT}/health" >/dev/null 2>&1; then
-            return 0
-        fi
-        sleep 1
-    done
-    return 1
-}
-
-if ! wait_for_llama; then
-    echo "[sagemaker] llama-server did not become healthy on ${INTERNAL_HOST}:${INTERNAL_PORT}" >&2
-    kill -TERM "$LLAMA_PID" 2>/dev/null || true
-    wait "$LLAMA_PID" 2>/dev/null || true
-    exit 1
-fi
-
-shutdown() {
-    kill -TERM "$UVICORN_PID" 2>/dev/null || true
-    kill -TERM "$LLAMA_PID" 2>/dev/null || true
-    wait "$UVICORN_PID" 2>/dev/null || true
-    wait "$LLAMA_PID" 2>/dev/null || true
-}
-
-trap shutdown SIGTERM SIGINT
-
-if [ -n "${PYTHONPATH:-}" ]; then
-    export PYTHONPATH="${PYTHONPATH}:/usr/local/lib/llamacpp_sagemaker"
-else
-    export PYTHONPATH="/usr/local/lib/llamacpp_sagemaker"
-fi
-python3 -m uvicorn llamacpp_sagemaker_serve:app --host 0.0.0.0 --port "$PROXY_PORT" --log-level info &
-UVICORN_PID=$!
-
-wait "$UVICORN_PID"
-exit_code=$?
-shutdown
-exit "$exit_code"
+exec /app/llama-server "${ARGS[@]}"
diff --git a/huggingface/llamacpp/buildspec.yml b/huggingface/llamacpp/buildspec.yml
index bed20118458b..8b05831cbadf 100644
--- a/huggingface/llamacpp/buildspec.yml
+++ b/huggingface/llamacpp/buildspec.yml
@@ -28,9 +28,9 @@ context:
     sagemaker_entrypoint:
       source: build_artifacts/sagemaker_entrypoint.sh
       target: sagemaker_entrypoint.sh
-    llamacpp_sagemaker_serve:
-      source: build_artifacts/llamacpp_sagemaker_serve.py
-      target: llamacpp_sagemaker_serve.py
+    llamacpp_sagemaker_server_patch:
+      source: build_artifacts/llamacpp_sagemaker_server.patch
+      target: llamacpp_sagemaker_server.patch
 
 
 images:
diff --git a/huggingface/llamacpp/docker/b8882/Dockerfile.cpu b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu
index e69de29bb2d1..813ba6eaf042 100644
--- a/huggingface/llamacpp/docker/b8882/Dockerfile.cpu
+++ b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu
@@ -0,0 +1,74 @@
+ARG UBUNTU_VERSION=24.04
+ARG LLAMACPP_VERSION=b8882
+
+FROM ubuntu:${UBUNTU_VERSION} AS build
+
+ARG LLAMACPP_VERSION
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        gcc-14 \
+        g++-14 \
+        git \
+        libgomp1 \
+        libssl-dev \
+        patch \
+        python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV CC=gcc-14 \
+    CXX=g++-14
+
+WORKDIR /src/llama.cpp
+
+RUN git clone --branch "${LLAMACPP_VERSION}" --depth 1 https://github.com/ggml-org/llama.cpp.git .
+
+COPY llamacpp_sagemaker_server.patch /tmp/llamacpp_sagemaker_server.patch
+
+RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch
+RUN cmake -B build \
+    -DGGML_NATIVE=OFF \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON \
+    -DLLAMA_BUILD_TESTS=OFF \
+    . \
+    && cmake --build build --config Release -j"$(nproc)" --target llama-server
+
+RUN mkdir -p /app/lib \
+    && find build -name "*.so*" -exec cp -P {} /app/lib \; \
+    && cp build/bin/llama-server /app/llama-server
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+WORKDIR /app
+
+ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH}
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        libgomp1 \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=build /app/lib/ /app/
+COPY --from=build /app/llama-server /app/llama-server
+
+FROM base AS sagemaker
+
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
+COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
+
+ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
diff --git a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
index 187f8f75ea0a..c1001bb61fee 100644
--- a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
+++ b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
@@ -1,10 +1,60 @@
-FROM ghcr.io/ggml-org/llama.cpp:server-cuda13-b8882 AS base
+ARG UBUNTU_VERSION=24.04
+ARG CUDA_VERSION=13.0.2
+ARG LLAMACPP_VERSION=b8882
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+ARG LLAMACPP_VERSION
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update \
+ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    gcc-14 \
+    g++-14 \
+    git \
+    libgomp1 \
+    libssl-dev \
+    patch \
+    python3 \
+ && rm -rf /var/lib/apt/lists/*
+
+ENV CC=gcc-14 \
+    CXX=g++-14 \
+    CUDAHOSTCXX=g++-14
+
+WORKDIR /src/llama.cpp
+
+RUN git clone --branch "${LLAMACPP_VERSION}" --depth 1 https://github.com/ggml-org/llama.cpp.git .
+COPY llamacpp_sagemaker_server.patch /tmp/llamacpp_sagemaker_server.patch
+RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi \
+ && cmake -B build \
+    -DGGML_NATIVE=OFF \
+    -DGGML_CUDA=ON \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON \
+    -DLLAMA_BUILD_TESTS=OFF \
+    ${CMAKE_ARGS:-} \
+    -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+    . \
+ && cmake --build build --config Release -j"$(nproc)" --target llama-server
+
+RUN mkdir -p /app/lib \
+ && find build -name "*.so*" -exec cp -P {} /app/lib \; \
+ && cp build/bin/llama-server /app/llama-server
+
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
 
 LABEL maintainer="Amazon AI"
 LABEL dlc_major_version="1"
 
-FROM base AS sagemaker
-
 WORKDIR /app
 ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH}
 
@@ -12,21 +62,22 @@ RUN apt-get update \
  && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     ca-certificates \
     curl \
-    python3 \
-    python3-pip \
- && pip3 install --no-cache-dir --break-system-packages \
-    "httpx>=0.27,<1" \
-    "starlette>=0.37,<1" \
-    "uvicorn[standard]>=0.27,<1" \
- && rm -rf /var/lib/apt/lists/*
+    libgomp1 \
+ && apt-get autoremove -y \
+ && apt-get clean -y \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/* /var/tmp/* \
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+ && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app/
+COPY --from=build /app/llama-server /app/llama-server
+
+FROM base AS sagemaker
 
 COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
 COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
-COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
-COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
-RUN mkdir -p /usr/local/lib/llamacpp_sagemaker
-COPY llamacpp_sagemaker_serve.py /usr/local/lib/llamacpp_sagemaker/llamacpp_sagemaker_serve.py
-RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh \
- && chmod +x /usr/local/bin/start_cuda_compat.sh
+COPY --chmod=0755 start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
+COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
 
 ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py
index 79a9cbb4223d..44e2b20386e5 100644
--- a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py
@@ -86,10 +86,8 @@ def ensure_model_downloaded():
 ROLE = "dummy/unused-role"
 DEFAULT_TIMEOUT = 45
 
-# Llama.cpp SageMaker images listen on port 8080 with a small HTTP shim (/ping,
-# /invocations) that proxies to llama-server on loopback (see llamacpp_sagemaker_serve).
-# Do not set SM_LLAMACPP_HOST or SM_LLAMACPP_PORT expecting external access to
-# llama-server; the entrypoint pins the server to localhost and exposes the shim on 8080.
+# Llama.cpp SageMaker images listen on port 8080 with a custom llama-server build
+# that serves SageMaker-compatible /ping and /invocations routes directly.
 
 
 class NoLogStreamFoundError(Exception):
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py
index 68691d05b559..f4807f5c4cf0 100644
--- a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py
@@ -30,8 +30,8 @@ def _predictor(image, sagemaker_local_session, instance_type):
     """Context manager for Llama.cpp model deployment and cleanup.
 
     Model is extracted to /opt/ml/model by SageMaker from model_data tar.gz.
-    The container entrypoint runs llama-server behind a SageMaker-compatible
-    proxy on port 8080 (/ping, /invocations -> OpenAI routes on llama-server).
+    The container entrypoint runs a custom llama-server build with
+    SageMaker-compatible /ping and /invocations routes on port 8080.
     """
     # Download model from HuggingFace Hub if not already present
     model_data_path = ensure_model_downloaded()
@@ -81,7 +81,7 @@ def _assert_llamacpp_chat_prediction(predictor):
 
 
 def _assert_llamacpp_chat_prediction_explicit_route(predictor):
-    """Same as chat test but forces target path via SageMaker CustomAttributes (proxy route=)."""
+    """Same as chat test but forces target path via SageMaker CustomAttributes route=."""
     predictor.serializer = JSONSerializer()
     predictor.deserializer = JSONDeserializer()
 
diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py
index b22b32f27543..370ae0f51e1b 100644
--- a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py
+++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py
@@ -106,7 +106,7 @@ def _test_llamacpp_model(
         assert output is not None
         assert "choices" in output
 
-        # Explicit route= mirrors vLLM-Omni-style CustomAttributes routing in the container proxy.
+        # Explicit route= uses SageMaker CustomAttributes routing in the custom llama-server build.
         output_routed = predictor.predict(
             data,
             custom_attributes="route=/v1/chat/completions",

From 529537df1f8cf26804bbda6489c66fc48efef101 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Thu, 30 Apr 2026 11:01:24 +0000
Subject: [PATCH 08/14] Remove unnecessary resources file

---
 .../resources/qwen3.5-0.8b/.gitattributes     | 61 -------------------
 1 file changed, 61 deletions(-)
 delete mode 100644 test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes

diff --git a/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes b/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes
deleted file mode 100644
index 6ba5bc1386f8..000000000000
--- a/test/sagemaker_tests/huggingface/llamacpp/resources/qwen3.5-0.8b/.gitattributes
+++ /dev/null
@@ -1,61 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-mmproj-BF16.gguf filter=lfs diff=lfs merge=lfs -text
-mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text
-mmproj-F32.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-UD-IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-UD-Q2_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-UD-Q5_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-UD-Q4_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-UD-IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-UD-IQ2_XXS.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q4_1.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-UD-Q6_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-UD-Q8_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
-imatrix_unsloth.gguf_file filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-BF16.gguf filter=lfs diff=lfs merge=lfs -text
-Qwen3.5-0.8B-UD-Q3_K_XL.gguf filter=lfs diff=lfs merge=lfs -text

From eeb004940321a0f7a94ffa7369a38c156e90d978 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Thu, 30 Apr 2026 12:17:28 +0000
Subject: [PATCH 09/14] Minimal style changes

---
 .../docker/b8882/cu130/Dockerfile.gpu         | 68 +++++++++----------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
index c1001bb61fee..2c74a6ef212c 100644
--- a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
+++ b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
@@ -10,18 +10,18 @@ ARG LLAMACPP_VERSION
 ARG CUDA_DOCKER_ARCH=default
 
 RUN apt-get update \
- && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    build-essential \
-    ca-certificates \
-    cmake \
-    gcc-14 \
-    g++-14 \
-    git \
-    libgomp1 \
-    libssl-dev \
-    patch \
-    python3 \
- && rm -rf /var/lib/apt/lists/*
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        gcc-14 \
+        g++-14 \
+        git \
+        libgomp1 \
+        libssl-dev \
+        patch \
+        python3 \
+    && rm -rf /var/lib/apt/lists/*
 
 ENV CC=gcc-14 \
     CXX=g++-14 \
@@ -35,20 +35,20 @@ RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi \
- && cmake -B build \
-    -DGGML_NATIVE=OFF \
-    -DGGML_CUDA=ON \
-    -DGGML_BACKEND_DL=ON \
-    -DGGML_CPU_ALL_VARIANTS=ON \
-    -DLLAMA_BUILD_TESTS=OFF \
-    ${CMAKE_ARGS:-} \
-    -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-    . \
- && cmake --build build --config Release -j"$(nproc)" --target llama-server
+    && cmake -B build \
+        -DGGML_NATIVE=OFF \
+        -DGGML_CUDA=ON \
+        -DGGML_BACKEND_DL=ON \
+        -DGGML_CPU_ALL_VARIANTS=ON \
+        -DLLAMA_BUILD_TESTS=OFF \
+        ${CMAKE_ARGS:-} \
+        -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+        . \
+    && cmake --build build --config Release -j"$(nproc)" --target llama-server
 
 RUN mkdir -p /app/lib \
- && find build -name "*.so*" -exec cp -P {} /app/lib \; \
- && cp build/bin/llama-server /app/llama-server
+    && find build -name "*.so*" -exec cp -P {} /app/lib \; \
+    && cp build/bin/llama-server /app/llama-server
 
 FROM ${BASE_CUDA_RUN_CONTAINER} AS base
 
@@ -59,16 +59,16 @@ WORKDIR /app
 ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH}
 
 RUN apt-get update \
- && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    ca-certificates \
-    curl \
-    libgomp1 \
- && apt-get autoremove -y \
- && apt-get clean -y \
- && rm -rf /var/lib/apt/lists/* \
- && rm -rf /tmp/* /var/tmp/* \
- && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
- && find /var/cache -type f -delete
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        libgomp1 \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
 
 COPY --from=build /app/lib/ /app/
 COPY --from=build /app/llama-server /app/llama-server

From 0773825df945b1f45ce5dab91cdc0feb69436eae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Thu, 30 Apr 2026 20:46:49 +0200
Subject: [PATCH 10/14] Update Dockerfiles to address multiple CVEs

---
 .../llamacpp/docker/b8882/Dockerfile.cpu        | 17 +++++++++++++++++
 .../llamacpp/docker/b8882/cu130/Dockerfile.gpu  | 17 +++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/huggingface/llamacpp/docker/b8882/Dockerfile.cpu b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu
index 813ba6eaf042..a577306960f9 100644
--- a/huggingface/llamacpp/docker/b8882/Dockerfile.cpu
+++ b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu
@@ -71,4 +71,21 @@ COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
 COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
 COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
 
+# Upgrade already-installed system packages to pick up fixes for these CVEs:
+# CVE-2026-31789, CVE-2025-15467, CVE-2025-13151, CVE-2025-15281,
+# CVE-2025-69419, CVE-2025-68973, CVE-2025-69421, CVE-2026-28390,
+# CVE-2025-69420, CVE-2026-0915, CVE-2026-0861, CVE-2026-28388,
+# CVE-2026-31790, CVE-2026-28387, CVE-2026-28389
+RUN apt-get update \
+    && apt-get install -y --only-upgrade \
+        libssl3t64 \
+        openssl \
+        libtasn1-6 \
+        libc6 \
+        libc-bin \
+        gnupg \
+        gpg \
+        gpgv \
+    && rm -rf /var/lib/apt/lists/*
+
 ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
diff --git a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
index 2c74a6ef212c..c7110bd0b007 100644
--- a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
+++ b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu
@@ -80,4 +80,21 @@ COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
 COPY --chmod=0755 start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
 COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
 
+# Upgrade already-installed system packages to pick up fixes for these CVEs:
+# CVE-2026-31789, CVE-2025-15467, CVE-2025-13151, CVE-2025-15281,
+# CVE-2025-69419, CVE-2025-68973, CVE-2025-69421, CVE-2026-28390,
+# CVE-2025-69420, CVE-2026-0915, CVE-2026-0861, CVE-2026-28388,
+# CVE-2026-31790, CVE-2026-28387, CVE-2026-28389
+RUN apt-get update \
+    && apt-get install -y --only-upgrade \
+        libssl3t64 \
+        openssl \
+        libtasn1-6 \
+        libc6 \
+        libc-bin \
+        gnupg \
+        gpg \
+        gpgv \
+    && rm -rf /var/lib/apt/lists/*
+
 ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]

From 988efb43bac1df766b939917a40d32fc6cf8ce37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Tue, 12 May 2026 21:37:29 +0200
Subject: [PATCH 11/14] Fix path for Huggingface Llamacpp buildspec in
 dlc_developer_config.toml

---
 dlc_developer_config.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 60593de299b7..3a0f53e59503 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -193,7 +193,7 @@ dlc-pr-huggingface-vllm = ""
 dlc-pr-huggingface-sglang = ""
 
 # Huggingface Llamacpp
-dlc-pr-huggingface-llamacpp = "/huggingface/llamacpp/buildspec.yml"
+dlc-pr-huggingface-llamacpp = "huggingface/llamacpp/buildspec.yml"
 
 # sglang
 dlc-pr-sglang = ""

From af6ba95d79f89628ded5aab87b320f755a335636 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Wed, 13 May 2026 12:44:42 +0200
Subject: [PATCH 12/14] Update py_version extraction in
 generate_sagemaker_pytest_cmd to handle None case

---
 test/test_utils/sagemaker.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py
index 24f256f66253..829d0f2c0986 100644
--- a/test/test_utils/sagemaker.py
+++ b/test/test_utils/sagemaker.py
@@ -183,7 +183,8 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type):
             else "gpu" if "gpu" in image else "eia" if "eia" in image else "cpu"
         )
     )
-    py_version = re.search(r"py\d+", tag).group()
+    match = re.search(r"py\d+", tag)
+    py_version = match.group() if match else None
     sm_local_py_version = (
         "37"
         if py_version == "py37"

From 473a6d6ee6c8790b7794239b92128ac7d9b36ff0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Wed, 13 May 2026 12:51:16 +0200
Subject: [PATCH 13/14] Remove transformers version from buildspec tag
 generation in Llamacpp configuration

---
 huggingface/llamacpp/buildspec.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/huggingface/llamacpp/buildspec.yml b/huggingface/llamacpp/buildspec.yml
index 8b05831cbadf..60ac246a989d 100644
--- a/huggingface/llamacpp/buildspec.yml
+++ b/huggingface/llamacpp/buildspec.yml
@@ -44,8 +44,7 @@ images:
     os_version: &OS_VERSION ubuntu24.04
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py312
-    transformers_version: &TRANSFORMERS_VERSION 4.57.3
-    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
+    tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     build: true
@@ -65,8 +64,7 @@ images:
     os_version: &OS_VERSION ubuntu24.04
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py312
-    transformers_version: &TRANSFORMERS_VERSION 4.57.3
-    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ]
+    tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     build: true

From 3cc4e73ab892e9c0aa4fd293a0a7dd97af3bd10b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enrique=20Hern=C3=A1ndez=20Calabr=C3=A9s?=
 <ehcalabres@gmail.com>
Date: Tue, 19 May 2026 12:38:29 +0200
Subject: [PATCH 14/14] Exempt Llamacpp from the transformers_version
 requirement in image_builder and add Llamacpp to upstream types in tests

---
 src/image_builder.py                          |  9 +++++++--
 .../test_boottime_container_security.py       |  2 +-
 test/dlc_tests/sanity/test_dlc_labels.py      |  5 ++++-
 test/dlc_tests/sanity/test_pre_release.py     | 19 ++++++++++++-------
 test/dlc_tests/sanity/test_safety_check.py    |  3 +++
 .../sanity/test_safety_report_file.py         |  2 +-
 test/test_utils/__init__.py                   |  9 ++++++++-
 7 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/src/image_builder.py b/src/image_builder.py
index cc401a6a5e11..48d8ef49d5cb 100644
--- a/src/image_builder.py
+++ b/src/image_builder.py
@@ -191,10 +191,15 @@ def image_builder(buildspec, image_types=[], device_types=[]):
 
         transformers_version = image_config.get("transformers_version")
 
-        if str(BUILDSPEC["framework"]).startswith("huggingface"):
+        buildspec_framework = str(BUILDSPEC["framework"])
+        requires_transformers_version = buildspec_framework.startswith(
+            "huggingface"
+        ) and buildspec_framework != "huggingface_llamacpp"
+
+        if buildspec_framework.startswith("huggingface"):
             if transformers_version:
                 extra_build_args["TRANSFORMERS_VERSION"] = transformers_version
-            else:
+            elif requires_transformers_version:
                 raise KeyError(
                     f"HuggingFace buildspec.yml must contain 'transformers_version' field for each image"
                 )
diff --git a/test/dlc_tests/sanity/test_boottime_container_security.py b/test/dlc_tests/sanity/test_boottime_container_security.py
index ded6c61b0e3d..7de90dc3dbc6 100644
--- a/test/dlc_tests/sanity/test_boottime_container_security.py
+++ b/test/dlc_tests/sanity/test_boottime_container_security.py
@@ -6,7 +6,7 @@
 @pytest.mark.model("N/A")
 @pytest.mark.canary("Run security test regularly on production images")
 def test_security(image):
-    upstream_types = ["vllm"]
+    upstream_types = ["vllm", "llamacpp"]
     if any(t in image for t in upstream_types):
         pytest.skip(
             f"{', '.join(upstream_types)} images do not require boot time security check as they are managed by upstream devs. Skipping test."
diff --git a/test/dlc_tests/sanity/test_dlc_labels.py b/test/dlc_tests/sanity/test_dlc_labels.py
index 87f68922eb9d..5fef1bb50b3a 100644
--- a/test/dlc_tests/sanity/test_dlc_labels.py
+++ b/test/dlc_tests/sanity/test_dlc_labels.py
@@ -31,7 +31,7 @@ def test_dlc_major_version_label(image, region):
 @pytest.mark.integration("dlc_labels")
 @pytest.mark.model("N/A")
 def test_dlc_standard_labels(image, region):
-    upstream_types = ["vllm", "sglang"]
+    upstream_types = ["vllm", "sglang", "llamacpp"]
     if any(t in image for t in upstream_types):
         pytest.skip(
             f"{', '.join(upstream_types)} images do not require test_dlc_standard_labels check as they are managed by upstream devs. Skipping test."
@@ -130,6 +130,9 @@ def test_dlc_major_version_dockerfiles(image):
 
     :param image: <str> ECR image URI
     """
+    if "llamacpp" in image:
+        pytest.skip("Llamacpp images do not include Python versioned Dockerfile paths.")
+
     dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
     job_type = test_utils.get_job_type_from_image(image)
     framework, fw_version = test_utils.get_framework_and_version_from_tag(image)
diff --git a/test/dlc_tests/sanity/test_pre_release.py b/test/dlc_tests/sanity/test_pre_release.py
index 49745f8c9436..9d5c1f387f87 100644
--- a/test/dlc_tests/sanity/test_pre_release.py
+++ b/test/dlc_tests/sanity/test_pre_release.py
@@ -109,7 +109,7 @@ def test_stray_files(image):
 
     :param image: ECR image URI
     """
-    upstream_types = ["vllm", "sglang"]
+    upstream_types = ["vllm", "sglang", "llamacpp"]
     if any(t in image for t in upstream_types):
         pytest.skip(
             f"{', '.join(upstream_types)} images do not require pip check as they are managed by upstream devs. Skipping test."
@@ -347,6 +347,8 @@ def test_framework_version_cpu(image):
     """
     if "base" in image:
         pytest.skip("Base images do not contain a framework version in the tag. Skipping test.")
+    if "llamacpp" in image:
+        pytest.skip("Llamacpp images do not expose a Python framework version. Skipping test.")
     if "gpu" in image:
         pytest.skip(
             "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu"
@@ -554,6 +556,9 @@ def test_dataclasses_check(image):
     ctx = Context()
     pip_package = "dataclasses"
 
+    if "llamacpp" in image:
+        pytest.skip("Llamacpp images do not include Python. Skipping test.")
+
     container_name = get_container_name("dataclasses-check", image)
 
     python_version = get_python_version_from_image_uri(image).replace("py", "")
@@ -583,7 +588,7 @@ def test_pip_check(image):
 
     :param image: ECR image URI
     """
-    upstream_types = ["vllm", "sglang"]
+    upstream_types = ["vllm", "sglang", "llamacpp"]
     if any(t in image for t in upstream_types):
         pytest.skip(
             f"{', '.join(upstream_types)} images do not require pip check as they are managed by upstream devs. Skipping test."
@@ -759,7 +764,7 @@ def test_cuda_paths(gpu):
     :param gpu: gpu image uris
     """
     image = gpu
-    general_types = ["base", "vllm", "sglang"]
+    general_types = ["base", "vllm", "sglang", "llamacpp"]
     if any(t in image for t in general_types):
         pytest.skip(
             f"{', '.join(general_types)} DLC doesn't have the same directory structure and buildspec as other images"
@@ -902,7 +907,7 @@ def _test_framework_and_cuda_version(gpu, ec2_connection):
     :param ec2_connection: fixture to establish connection with an ec2 instance
     """
     image = gpu
-    general_types = ["base", "vllm", "sglang"]
+    general_types = ["base", "vllm", "sglang", "llamacpp"]
     if any(t in image for t in general_types):
         pytest.skip(
             f"{', '.join(general_types)} images do not follow the assumptions made by inference/training. Skipping test."
@@ -1098,7 +1103,7 @@ def test_license_file(image):
     """
     Check that license file within the container is readable and valid
     """
-    general_types = ["base", "vllm", "sglang"]
+    general_types = ["base", "vllm", "sglang", "llamacpp"]
     if any(t in image for t in general_types):
         pytest.skip(f"{', '.join(general_types)} DLC doesn't embed license.txt. Skipping test.")
 
@@ -1223,7 +1228,7 @@ def test_core_package_version(image):
     In this test, we ensure that if a core_packages.json file exists for an image, the packages installed in the image
     satisfy the version constraints specified in the core_packages.json file.
     """
-    general_types = ["base", "vllm", "sglang"]
+    general_types = ["base", "vllm", "sglang", "llamacpp"]
     if any(t in image for t in general_types):
         pytest.skip(f"{', '.join(general_types)} images do not have core packages. Skipping test.")
 
@@ -1275,7 +1280,7 @@ def test_package_version_regression_in_image(image):
     keys in the buildspec - as these keys are used to extract the released image uri. Additionally, if the image is not already
     released, this test would be skipped.
     """
-    general_types = ["base", "vllm", "sglang"]
+    general_types = ["base", "vllm", "sglang", "llamacpp"]
     if any(t in image for t in general_types):
         pytest.skip(
             f"{', '.join(general_types)} images don't have python packages that needs to be checked. Skipping test."
diff --git a/test/dlc_tests/sanity/test_safety_check.py b/test/dlc_tests/sanity/test_safety_check.py
index 78c023433f44..99031bea51c0 100644
--- a/test/dlc_tests/sanity/test_safety_check.py
+++ b/test/dlc_tests/sanity/test_safety_check.py
@@ -1092,6 +1092,9 @@ def test_safety(image):
     Runs safety check on a container with the capability to ignore safety issues that cannot be fixed, and only raise
     error if an issue is fixable.
     """
+    if "llamacpp" in image:
+        pytest.skip("Llamacpp images do not include Python safety tooling. Skipping test.")
+
     from dlc.safety_check import SafetyCheck
 
     safety_check = SafetyCheck()
diff --git a/test/dlc_tests/sanity/test_safety_report_file.py b/test/dlc_tests/sanity/test_safety_report_file.py
index f8860d53784a..8f876396ce05 100644
--- a/test/dlc_tests/sanity/test_safety_report_file.py
+++ b/test/dlc_tests/sanity/test_safety_report_file.py
@@ -74,7 +74,7 @@ def test_safety_file_exists_and_is_valid(image):
             "Base images do not require safety file as there isn't much python libs in it. Skipping test."
         )
 
-    upstream_types = ["vllm", "sglang"]
+    upstream_types = ["vllm", "sglang", "llamacpp"]
     if any(t in image for t in upstream_types):
         pytest.skip(
             f"{', '.join(upstream_types)} images do not require safety file as they are managed by upstream devs. Skipping test."
diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
index 571b9fb26ed3..bb11664d93fd 100644
--- a/test/test_utils/__init__.py
+++ b/test/test_utils/__init__.py
@@ -1836,7 +1836,14 @@ def get_framework_and_version_from_tag(image_uri):
             f"from allowed frameworks {allowed_frameworks}"
         )
 
-    tag_framework_version = re.search(r"(\d+(\.\d+){1,2})", image_uri).groups()[0]
+    _, image_tag = get_repository_and_tag_from_image_uri(image_uri)
+    if tested_framework == "huggingface_llamacpp":
+        tag_framework_version = image_tag.split("-")[0]
+    else:
+        version_match = re.search(r"(\d+(\.\d+){1,2})", image_tag)
+        if not version_match:
+            raise RuntimeError(f"Cannot find framework version in image tag {image_tag}")
+        tag_framework_version = version_match.group(1)
 
     return tested_framework, tag_framework_version