Commit f9ef40e: Update docs with sglang deployment
1 parent 7f9c878

4 files changed: 111 additions & 15 deletions

Lines changed: 18 additions & 1 deletion
@@ -1 +1,18 @@
-# Sample http route for GKE Gateway to route traffic to sglang InferencePool
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: sgl-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
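
A quick way to sanity-check the route once it is applied (a minimal sketch; it assumes the HTTPRoute and Gateway live in the current namespace and that your gateway controller populates the standard status conditions):

```bash
# The route should report Accepted=True and ResolvedRefs=True once the
# Gateway accepts it and the InferencePool backendRef resolves.
kubectl get httproute llm-route -o yaml
kubectl describe gateway inference-gateway
```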

config/manifests/sglang/gpu-deployment.yaml

Lines changed: 29 additions & 12 deletions
@@ -1,33 +1,47 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: sgl-llama3-8b-instruct
+  name: sgl-deepseek-v3
   labels:
-    app: sgl-llama3-8b-instruct
+    app: sgl-deepseek-v3
 spec:
-  replicas: 3
+  replicas: 1
   selector:
     matchLabels:
-      app: sgl-llama3-8b-instruct
+      app: sgl-deepseek-v3
   template:
     metadata:
       labels:
-        app: sgl-llama3-8b-instruct
+        app: sgl-deepseek-v3
     spec:
       containers:
       - name: sglang
-        image: lmsysorg/sglang:latest
+        image: lmsysorg/sglang:v0.5.6.post2-cu129-arm64
         command: ["python3", "-m", "sglang.launch_server"]
         args:
-        - "--model-path=meta-llama/Llama-3.1-8B-Instruct"
+        - "--model-path=nvidia/DeepSeek-V3.1-NVFP4"
         - "--host=0.0.0.0"
         - "--port=8000"
        - "--dtype=bfloat16"
         - "--kv-cache-dtype=auto"
-        - "--tp=1"
-        - "--mem-fraction-static=0.90" # Equivalent to vllm's gpu-memory-utilization
+        - "--mem-fraction-static=0.90"
         - "--trust-remote-code"
         - "--enable-metrics"
+        # Hardware & Backend Optimization
+        - "--tp=4"
+        - "--quantization=modelopt_fp4"
+        - "--attention-backend=trtllm_mla"
+        - "--moe-runner-backend=flashinfer_trtllm"
+        - "--enable-flashinfer-allreduce-fusion"
+        # Scheduling & Traffic Control
+        - "--max-running-requests=256"
+        - "--schedule-low-priority-values-first"
+        - "--enable-priority-scheduling"
+        - "--priority-scheduling-preemption-threshold=1000"
+        # Logging
+        - "--log-requests"
+        - "--log-requests-level=1"
+        - "--enable-request-time-stats-logging"
         env:
         - name: HF_TOKEN
           valueFrom:
@@ -40,7 +54,7 @@ spec:
           name: http
         resources:
           limits:
-            nvidia.com/gpu: 1
+            nvidia.com/gpu: 4
         volumeMounts:
         - name: model-cache
           mountPath: /root/.cache/huggingface
@@ -56,8 +70,7 @@ spec:
           httpGet:
             path: /health_generate
             port: 8000
-          # Give the container 10 minutes (30 * 20s) to download and load weights
-          failureThreshold: 30
+          failureThreshold: 90
           periodSeconds: 20
       volumes:
       - name: model-cache
@@ -68,4 +81,8 @@ spec:
       tolerations:
      - key: "nvidia.com/gpu"
         operator: "Exists"
+        effect: "NoSchedule"
+      - key: "kubernetes.io/arch"
+        operator: "Equal"
+        value: "arm64"
         effect: "NoSchedule"

site-src/_includes/epp-sglang.md

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+=== "GKE"
+
+    ```bash
+    export GATEWAY_PROVIDER=gke
+    helm install sgl-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sgl-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "Istio"
+
+    ```bash
+    export GATEWAY_PROVIDER=istio
+    helm install sgl-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sgl-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "Kgateway"
+
+    ```bash
+    export GATEWAY_PROVIDER=none
+    helm install sgl-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sgl-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "NGINX Gateway Fabric"
+
+    ```bash
+    export GATEWAY_PROVIDER=none
+    helm install sgl-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=sgl-llama3-8b-instruct \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version $IGW_CHART_VERSION \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
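
After running the tab that matches your provider, you can confirm the release and the resulting InferencePool (a minimal sketch; resource names follow from the release name used above). Note that inferencePool.modelServers.matchLabels.app must match the app label on the model server pods, so keep it in sync with the Deployment manifest.

```bash
# The chart creates an InferencePool and its endpoint picker (EPP).
helm status sgl-llama3-8b-instruct
kubectl get inferencepools.inference.networking.k8s.io sgl-llama3-8b-instruct
kubectl get pods   # the EPP pod should appear alongside the model server pods
```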

site-src/guides/index.md

Lines changed: 21 additions & 2 deletions
@@ -37,13 +37,13 @@ IGW_LATEST_RELEASE=$(curl -s https://api.github.com/repos/kubernetes-sigs/gatewa
 ```
 
 --8<-- "site-src/_includes/model-server-sim.md"
-
+
 ```bash
 kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/vllm/sim-deployment.yaml
 ```
 
 --8<-- "site-src/_includes/sglang-gpu.md"
-
+
 ```bash
 kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml
 ```
@@ -135,6 +135,11 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
 
 --8<-- "site-src/_includes/epp.md"
 
+For sglang deployment:
+
+--8<-- "site-src/_includes/epp-sglang.md"
+
+
 ### Deploy an Inference Gateway
 
 Choose one of the following options to deploy an Inference Gateway.
@@ -280,6 +285,12 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
 kubectl describe inferencepools.inference.networking.k8s.io vllm-llama3-8b-instruct
 ```
 
+For sglang deployment:
+
+```bash
+kubectl describe inferencepools.inference.networking.k8s.io sgl-llama3-8b-instruct
+```
+
 Check that the status shows Accepted=True and ResolvedRefs=True. This confirms the InferencePool is ready to handle traffic.
 
 For more information, see the [NGINX Gateway Fabric - Inference Gateway Setup guide](https://docs.nginx.com/nginx-gateway-fabric/how-to/gateway-api-inference-extension/#overview)
@@ -319,6 +330,14 @@ You have now deployed a basic Inference Gateway with a simple routing strategy.
     kubectl delete secret hf-token --ignore-not-found
     ```
 
+    For sglang deployment:
+
+    ```bash
+    helm uninstall sgl-llama3-8b-instruct
+    kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/sglang/gpu-deployment.yaml --ignore-not-found
+    kubectl delete secret hf-token --ignore-not-found
+    ```
+
 1. Uninstall the Gateway API Inference Extension CRDs:
 
     ```bash
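
With the model server, InferencePool, HTTPRoute, and Gateway in place, an end-to-end smoke test looks roughly like this (a sketch; it assumes the Gateway publishes an address in its status, that the listener uses port 80, and that the model name matches what the server was launched with):

```bash
# Send a completion request through the Inference Gateway.
IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
PORT=80

curl -i ${IP}:${PORT}/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "nvidia/DeepSeek-V3.1-NVFP4",
    "prompt": "Write as if you were a critic: San Francisco",
    "max_tokens": 100,
    "temperature": 0
  }'
```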
