5 changes: 5 additions & 0 deletions config/charts/inference-extension/templates/_deployment.yaml
@@ -95,6 +95,11 @@ spec:
- "{{ (split "/" .Values.inferencePool.apiVersion)._0 }}"
{{- end }}
{{- end }}
# SGLang-specific metrics.
{{- if eq $modelServerType "sglang" }}
- "--total-queued-requests-metric=sglang:num_queue_reqs"
- "--kv-cache-usage-percentage-metric=sglang:token_usage"
{{- end }}
{{- if eq $modelServerType "triton-tensorrt-llm" }}
- --total-queued-requests-metric
- "nv_trt_llm_request_metrics{request_type=waiting}"
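To confirm the new flags render as intended, the chart can be templated locally. This is a sketch, assuming the chart's default values are otherwise sufficient and that it is run from the repository root:

```bash
# Minimal sketch: render the chart with modelServerType=sglang and confirm
# the SGLang metric flags appear in the endpoint picker's container args.
helm template sgl-test ./config/charts/inference-extension \
  --set inferencePool.modelServers.matchLabels.app=sgl-llama3-8b-instruct \
  --set inferencePool.modelServerType=sglang \
  | grep -- "-metric"
```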
71 changes: 71 additions & 0 deletions config/manifests/sglang/gpu-deployment.yaml
@@ -0,0 +1,71 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: sgl-llama3-8b-instruct
labels:
app: sgl-llama3-8b-instruct
spec:
replicas: 3
selector:
matchLabels:
app: sgl-llama3-8b-instruct
template:
metadata:
labels:
app: sgl-llama3-8b-instruct
spec:
containers:
- name: sglang
image: lmsysorg/sglang:latest
command: ["python3", "-m", "sglang.launch_server"]
args:
- "--model-path=meta-llama/Llama-3.1-8B-Instruct"
- "--host=0.0.0.0"
- "--port=8000"
- "--dtype=bfloat16"
- "--kv-cache-dtype=auto"
- "--tp=1"
- "--mem-fraction-static=0.90" # Equivalent to vllm's gpu-memory-utilization
- "--trust-remote-code"
- "--enable-metrics"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: token
optional: true
ports:
- containerPort: 8000
name: http
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: model-cache
mountPath: /root/.cache/huggingface
- name: dshm
mountPath: /dev/shm
readinessProbe:
httpGet:
path: /health_generate
port: 8000
periodSeconds: 40
timeoutSeconds: 30
startupProbe:
httpGet:
path: /health_generate
port: 8000
# Give the container 10 minutes (30 * 20s) to download and load weights
failureThreshold: 30
periodSeconds: 20
volumes:
- name: model-cache
emptyDir: {}
- name: dshm
emptyDir:
medium: Memory
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
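Once a replica is Ready, the SGLang metrics that the endpoint picker scrapes can be spot-checked. A sketch:

```bash
# Sketch: port-forward one pod and confirm the metrics referenced by the
# chart (sglang:num_queue_reqs, sglang:token_usage) are exported.
kubectl port-forward deployment/sgl-llama3-8b-instruct 8000:8000 &
sleep 2  # give the port-forward a moment to establish
curl -s http://localhost:8000/metrics | grep -E "sglang:(num_queue_reqs|token_usage)"
kill $!  # stop the background port-forward
```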
47 changes: 47 additions & 0 deletions site-src/_includes/epp-sglang.md
@@ -0,0 +1,47 @@
=== "GKE"

```bash
export GATEWAY_PROVIDER=gke
helm install sgl-llama3-8b-instruct \
--set inferencePool.modelServers.matchLabels.app=sgl-llama3-8b-instruct \
--set provider.name=$GATEWAY_PROVIDER \
--set inferencePool.modelServerType=sglang \
--version $IGW_CHART_VERSION \
oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
```

=== "Istio"

```bash
export GATEWAY_PROVIDER=istio
helm install sgl-llama3-8b-instruct \
--set inferencePool.modelServers.matchLabels.app=sgl-llama3-8b-instruct \
--set provider.name=$GATEWAY_PROVIDER \
--set inferencePool.modelServerType=sglang \
--version $IGW_CHART_VERSION \
oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
```

=== "Kgateway"

```bash
export GATEWAY_PROVIDER=none
helm install sgl-llama3-8b-instruct \
--set inferencePool.modelServers.matchLabels.app=sgl-llama3-8b-instruct \
--set provider.name=$GATEWAY_PROVIDER \
--set inferencePool.modelServerType=sglang \
--version $IGW_CHART_VERSION \
oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
```

=== "NGINX Gateway Fabric"

```bash
export GATEWAY_PROVIDER=none
helm install sgl-llama3-8b-instruct \
--set inferencePool.modelServers.matchLabels.app=sgl-llama3-8b-instruct \
--set provider.name=$GATEWAY_PROVIDER \
--set inferencePool.modelServerType=sglang \
--version $IGW_CHART_VERSION \
oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
```
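After installing with any of the providers above, a quick sanity check that the release and its InferencePool exist (a sketch; resource names follow the release name used above):

```bash
helm status sgl-llama3-8b-instruct
kubectl get inferencepools   # the pool created by the chart should be listed
```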
2 changes: 1 addition & 1 deletion site-src/_includes/model-server-sim.md
@@ -1,4 +1,4 @@
=== "vLLM Simulator Model Server"
=== "vLLM Simulator deployment"

This option uses the [vLLM simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) to simulate a backend model server.
This setup uses the least amount of compute resources, does not require GPUs, and is ideal for test/dev environments.
8 changes: 8 additions & 0 deletions site-src/_includes/sglang-gpu.md
@@ -0,0 +1,8 @@
=== "GPU-Based SGLang deployment"

For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas as needed.
Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
Ensure that the token grants access to this model.
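
If the secret does not already exist in the cluster, it can be created with the same command used in the vLLM flow:

```bash
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to the set of Llama models
```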

Deploy a sample SGLang model server configured with the proper protocol to work with the Inference Gateway.

@@ -1,4 +1,4 @@
=== "CPU-Based Model Server"
=== "CPU-Based vLLM deployment"

???+ warning

@@ -1,4 +1,4 @@
=== "GPU-Based Model Server"
=== "GPU-Based vLLM deployment"

For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas as needed.
Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
4 changes: 2 additions & 2 deletions site-src/guides/getting-started-latest.md
Original file line number Diff line number Diff line change
@@ -16,14 +16,14 @@

### Deploy Sample Model Server

--8<-- "site-src/_includes/model-server-gpu.md"
--8<-- "site-src/_includes/vllm-gpu.md"

```bash
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to the set of Llama models
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml
```

--8<-- "site-src/_includes/model-server-cpu.md"
--8<-- "site-src/_includes/vllm-cpu.md"

```bash
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml
16 changes: 13 additions & 3 deletions site-src/guides/index.md
@@ -21,14 +21,14 @@ IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gatewa

### Deploy Sample Model Server

--8<-- "site-src/_includes/model-server-gpu.md"
--8<-- "site-src/_includes/vllm-gpu.md"

```bash
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to the set of Llama models
kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/gpu-deployment.yaml
```

--8<-- "site-src/_includes/model-server-cpu.md"
--8<-- "site-src/_includes/vllm-cpu.md"

```bash
kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/cpu-deployment.yaml
@@ -40,6 +40,12 @@ IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gatewa
kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/sim-deployment.yaml
```

--8<-- "site-src/_includes/sglang-gpu.md"

```bash
kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml
```
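
Model download and weight loading can take several minutes; the startup probe in the manifest allows up to 10. A sketch for waiting on readiness:

```bash
# Block until the SGLang deployment reports Available (or time out).
kubectl wait --for=condition=Available deployment/sgl-llama3-8b-instruct --timeout=600s
```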

### Install the Inference Extension CRDs

```bash
@@ -210,6 +216,10 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens

--8<-- "site-src/_includes/epp.md"

For an SGLang deployment:

--8<-- "site-src/_includes/epp-sglang.md"

### Verify HttpRoute and InferencePool Status

--8<-- "site-src/_includes/verify-status.md"
@@ -330,4 +340,4 @@ You have now deployed a basic Inference Gateway with a simple routing strategy.

```bash
kubectl delete ns nginx-gateway
```