Commit 74ea0ac

final
Signed-off-by: Adrian Cole <adrian@tetrate.io>
1 parent 4f8df81

3 files changed: 3 additions & 10 deletions


inference-platforms/chat.py

Lines changed: 2 additions & 4 deletions
@@ -39,10 +39,8 @@ def main():
 
     # vllm-specific switch to disable thinking, ignored by other inference platforms.
     # See https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes
-    if "qwen3" in model.lower():
-        extra_body = {"chat_template_kwargs": {"enable_thinking": False}}
-    else:
-        extra_body = {}
+    extra_body = {"chat_template_kwargs": {"enable_thinking": False}} if model.startswith("Qwen/Qwen3") else None
+
     if args.use_responses_api:
         response = client.responses.create(
             model=model, input=messages[0]["content"], temperature=0, extra_body=extra_body
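As a sketch of what this hunk changes (the helper name `thinking_extra_body` is ours, not in chat.py): the new one-liner only matches fully-qualified `Qwen/Qwen3*` model IDs, whereas the old check matched any model name containing `qwen3`, and it now falls back to `None` rather than an empty dict.

```python
def thinking_extra_body(model: str):
    # vLLM-specific switch to disable thinking; other inference platforms ignore it.
    # See https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes
    return (
        {"chat_template_kwargs": {"enable_thinking": False}}
        if model.startswith("Qwen/Qwen3")
        else None
    )

print(thinking_extra_body("Qwen/Qwen3-0.6B"))  # the chat_template_kwargs dict
print(thinking_extra_body("qwen3:0.6b"))       # None: Ollama-style tags no longer match
```

Passing `extra_body=None` lets the client omit the field entirely, instead of sending an empty object to platforms that don't understand it.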

inference-platforms/llama-stack/README.md

Lines changed: 0 additions & 4 deletions
@@ -43,15 +43,11 @@ uv run --exact -q --env-file env.local ../agent.py --use-responses-api
 
 * Llama Stack's Responses API connects to MCP servers server-side (unlike aigw
   which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`.
-
 * Uses the `starter` distribution with its built-in `remote::openai` provider,
   pointing to Ollama via `OPENAI_BASE_URL` environment variable.
 * Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`)
-* Until [this issue][docker] resolves, running docker on Apple Silicon
-  requires emulation.
 
 ---
-[docker]: https://github.com/llamastack/llama-stack/issues/406
 [docs]: https://llama-stack.readthedocs.io/en/latest/index.html
 [otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
 [uv]: https://docs.astral.sh/uv/getting-started/installation/

inference-platforms/llama-stack/docker-compose.yml

Lines changed: 1 addition & 2 deletions
@@ -15,8 +15,7 @@ services:
     depends_on:
       ollama-pull:
         condition: service_completed_successfully
-    image: llamastack/distribution-starter:0.4.1
-    platform: linux/amd64 # ARM64 not published: https://github.com/llamastack/llama-stack/issues/406
+    image: llamastack/distribution-starter:0.5.0
     container_name: llama-stack
     tty: true
     env_file:
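Reconstructed from the hunk, the resulting service block would read roughly as below. The service key name and the `env_file` value are assumptions taken from surrounding context (`container_name` and the README's `env.local`), not shown in the diff. Presumably 0.5.0 images are published for ARM64, so the `platform: linux/amd64` emulation override is dropped.

```yaml
services:
  llama-stack:           # service key assumed from container_name
    depends_on:
      ollama-pull:
        condition: service_completed_successfully
    image: llamastack/distribution-starter:0.5.0   # was 0.4.1 + platform: linux/amd64
    container_name: llama-stack
    tty: true
    env_file:
      - env.local        # assumed; the actual value is outside the hunk
```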
