mudler · eureka928 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/backend/backend.proto b/backend/backend.proto
@@ -162,7 +162,7 @@ message PredictOptions {
   string ToolChoice = 49;  // JSON string or object specifying tool choice behavior
   int32 Logprobs = 50;  // Number of top logprobs to return (maps to OpenAI logprobs parameter)
   int32 TopLogprobs = 51;  // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter)
-  map<string, string> Metadata = 52;  // Generic per-request metadata (e.g., enable_thinking)
+  map<string, string> Metadata = 52;  // Generic per-request metadata (e.g., enable_thinking, json_schema, response_format)
 }
 
 // The response message containing the result

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
@@ -2,9 +2,11 @@
 import asyncio
 from concurrent import futures
 import argparse
+import json
 import signal
 import sys
 import os
+import time
 from typing import List
 from PIL import Image
 
@@ -15,6 +17,21 @@
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
+
+# vLLM renamed GuidedDecodingParams to StructuredOutputsParams in newer versions, and the matching
+# SamplingParams field from guided_decoding to structured_outputs. Try the new name first, then the old one; if neither imports, structured output is disabled.
+try:
+    from vllm.sampling_params import StructuredOutputsParams
+    _structured_output_cls = StructuredOutputsParams
+    _structured_output_field = "structured_outputs"
+except ImportError:
+    try:
+        from vllm.sampling_params import GuidedDecodingParams
+        _structured_output_cls = GuidedDecodingParams
+        _structured_output_field = "guided_decoding"
+    except ImportError:
+        _structured_output_cls = None
+        _structured_output_field = None
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image
@@ -218,7 +235,6 @@ async def _predict(self, request, context, streaming=False):
             "SkipSpecialTokens": "skip_special_tokens",
             "SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
             "TruncatePromptTokens": "truncate_prompt_tokens",
-            "GuidedDecoding": "guided_decoding",
         }
 
         sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
@@ -229,6 +245,21 @@ async def _predict(self, request, context, streaming=False):
                 if value not in (None, 0, [], False):
                     setattr(sampling_params, param_field, value)
 
+        # Apply a structured-output constraint (guided decoding / structured outputs) when one is requested.
+        # json_schema and response_format are read from the Metadata map (avoids extra proto fields); precedence: json_schema, then json_object, then Grammar.
+        if _structured_output_cls is not None:
+            metadata = dict(request.Metadata) if hasattr(request, 'Metadata') and request.Metadata else {}
+            constraint = None
+            if metadata.get("json_schema"):
+                constraint = _structured_output_cls(json=metadata["json_schema"])
+            elif metadata.get("response_format") == "json_object":
+                constraint = _structured_output_cls(json_object=True)
+            elif hasattr(request, 'Grammar') and request.Grammar:
+                constraint = _structured_output_cls(grammar=request.Grammar)
+
+            if constraint is not None:
+                setattr(sampling_params, _structured_output_field, constraint)
+
         # Extract image paths and process images
         prompt = request.Prompt
 

diff --git a/core/backend/options.go b/core/backend/options.go
@@ -263,6 +263,12 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
 			metadata["enable_thinking"] = "true"
 		}
 	}
+	if c.ResponseFormat != "" {
+		metadata["response_format"] = c.ResponseFormat
+	}
+	for k, v := range c.RequestMetadata {
+		metadata[k] = v
+	}
 	pbOpts.Metadata = metadata
 
 	// Logprobs and TopLogprobs are set by the caller if provided

diff --git a/core/config/model_config.go b/core/config/model_config.go
@@ -51,6 +51,7 @@ type ModelConfig struct {
 	functionCallString, functionCallNameString string                 `yaml:"-" json:"-"`
 	ResponseFormat                             string                 `yaml:"-" json:"-"`
 	ResponseFormatMap                          map[string]interface{} `yaml:"-" json:"-"`
+	RequestMetadata                            map[string]string      `yaml:"-" json:"-"`
 
 	FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"`
 	ReasoningConfig reasoning.Config          `yaml:"reasoning,omitempty" json:"reasoning,omitempty"`

diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
@@ -430,7 +430,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			switch d.Type {
 			case "json_object":
 				input.Grammar = functions.JSONBNF
+				config.ResponseFormat = "json_object"
 			case "json_schema":
+				config.ResponseFormat = "json_schema"
 				d := schema.JsonSchemaRequest{}
 				dat, err := json.Marshal(config.ResponseFormatMap)
 				if err != nil {
@@ -440,6 +442,16 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 				if err != nil {
 					return err
 				}
+
+				// Pass raw JSON schema via metadata for backends that support native structured output
+				schemaBytes, err := json.Marshal(d.JsonSchema.Schema)
+				if err == nil {
+					if config.RequestMetadata == nil {
+						config.RequestMetadata = map[string]string{}
+					}
+					config.RequestMetadata["json_schema"] = string(schemaBytes)
+				}
+
 				fs := &functions.JSONFunctionStructure{
 					AnyOf: []functions.Item{d.JsonSchema.Schema},
 				}

diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go
@@ -87,8 +87,34 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 			d := schema.ChatCompletionResponseFormat{}
 			dat, _ := json.Marshal(config.ResponseFormatMap)
 			_ = json.Unmarshal(dat, &d)
-			if d.Type == "json_object" {
+			switch d.Type {
+			case "json_object":
 				input.Grammar = functions.JSONBNF
+				config.ResponseFormat = "json_object"
+			case "json_schema":
+				config.ResponseFormat = "json_schema"
+				jsr := schema.JsonSchemaRequest{}
+				dat, err := json.Marshal(config.ResponseFormatMap)
+				if err == nil {
+					if err := json.Unmarshal(dat, &jsr); err == nil {
+						schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema)
+						if err == nil {
+							if config.RequestMetadata == nil {
+								config.RequestMetadata = map[string]string{}
+							}
+							config.RequestMetadata["json_schema"] = string(schemaBytes)
+						}
+						fs := &functions.JSONFunctionStructure{
+							AnyOf: []functions.Item{jsr.JsonSchema.Schema},
+						}
+						g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...)
+						if err == nil {
+							input.Grammar = g
+						} else {
+							xlog.Error("Failed generating grammar", "error", err)
+						}
+					}
+				}
 			}
 		}
 

diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go
@@ -128,9 +128,42 @@ func ResponsesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eval
 			Functions: funcs,
 		}
 
-		// Handle text_format -> response_format conversion
+		// Handle text_format -> response_format conversion and structured output
 		if input.TextFormat != nil {
-			openAIReq.ResponseFormat = convertTextFormatToResponseFormat(input.TextFormat)
+			responseFormat := convertTextFormatToResponseFormat(input.TextFormat)
+			openAIReq.ResponseFormat = responseFormat
+
+			// Generate grammar and pass schema for structured output (like OpenAI chat/completion)
+			if rfMap, ok := responseFormat.(map[string]interface{}); ok {
+				if rfType, _ := rfMap["type"].(string); rfType == "json_object" {
+					cfg.Grammar = functions.JSONBNF
+					cfg.ResponseFormat = "json_object"
+				} else if rfType == "json_schema" {
+					cfg.ResponseFormat = "json_schema"
+					d := schema.JsonSchemaRequest{}
+					dat, err := json.Marshal(rfMap)
+					if err == nil {
+						if err := json.Unmarshal(dat, &d); err == nil {
+							schemaBytes, err := json.Marshal(d.JsonSchema.Schema)
+							if err == nil {
+								if cfg.RequestMetadata == nil {
+									cfg.RequestMetadata = map[string]string{}
+								}
+								cfg.RequestMetadata["json_schema"] = string(schemaBytes)
+							}
+							fs := &functions.JSONFunctionStructure{
+								AnyOf: []functions.Item{d.JsonSchema.Schema},
+							}
+							g, err := fs.Grammar(cfg.FunctionsConfig.GrammarOptions()...)
+							if err == nil {
+								cfg.Grammar = g
+							} else {
+								xlog.Error("Open Responses - Failed generating grammar for json_schema", "error", err)
+							}
+						}
+					}
+				}
+			}
 		}
 
 		// Generate grammar for function calling (similar to OpenAI chat endpoint)

diff --git a/docs/content/features/constrained_grammars.md b/docs/content/features/constrained_grammars.md
@@ -10,7 +10,11 @@ url = "/features/constrained_grammars/"
 The `chat` endpoint supports the `grammar` parameter, which allows users to specify a grammar in Backus-Naur Form (BNF). This feature enables the Large Language Model (LLM) to generate outputs adhering to a user-defined schema, such as `JSON`, `YAML`, or any other format that can be defined using BNF. For more details about BNF, see [Backus-Naur Form on Wikipedia](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form).
 
 {{% notice note %}}
-**Compatibility Notice:** This feature is only supported by models that use the [llama.cpp](https://github.com/ggerganov/llama.cpp) backend. For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. For technical details, see the related pull requests: [PR #1773](https://github.com/ggerganov/llama.cpp/pull/1773) and [PR #1887](https://github.com/ggerganov/llama.cpp/pull/1887).
+**Compatibility Notice:** Grammar and structured output support is available for the following backends:
+- **llama.cpp** — supports the `grammar` parameter (GBNF syntax) and `response_format` with `json_schema`/`json_object`
+- **vLLM** — supports the `grammar` parameter (via xgrammar) and `response_format` with `json_schema` (native JSON schema enforcement) or `json_object`
+
+For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page.
  {{% /notice %}}
 
 ## Setup
@@ -66,6 +70,96 @@ For more complex grammars, you can define multi-line BNF rules. The grammar pars
 - Character classes (`[a-z]`)
 - String literals (`"text"`)
 
+## vLLM Backend
+
+The vLLM backend supports structured output via three methods:
+
+### JSON Schema (recommended)
+
+Use the OpenAI-compatible `response_format` parameter with `json_schema` to enforce a specific JSON structure:
+
+```bash
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "my-vllm-model",
+  "messages": [{"role": "user", "content": "Generate a person object"}],
+  "response_format": {
+    "type": "json_schema",
+    "json_schema": {
+      "name": "person",
+      "schema": {
+        "type": "object",
+        "properties": {
+          "name": {"type": "string"},
+          "age": {"type": "integer"}
+        },
+        "required": ["name", "age"]
+      }
+    }
+  }
+}'
+```
+
+### JSON Object
+
+Force the model to output valid JSON (without a specific schema):
+
+```bash
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "my-vllm-model",
+  "messages": [{"role": "user", "content": "Generate a person as JSON"}],
+  "response_format": {"type": "json_object"}
+}'
+```
+
+### Grammar
+
+The `grammar` parameter also works with vLLM via xgrammar:
+
+```bash
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "my-vllm-model",
+  "messages": [{"role": "user", "content": "Do you like apples?"}],
+  "grammar": "root ::= (\"yes\" | \"no\")"
+}'
+```
+
+## Open Responses API
+
+The Open Responses API (`/v1/responses`) also supports structured output via the `text_format` parameter:
+
+### JSON Schema
+
+```bash
+curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{
+  "model": "my-model",
+  "input": "Generate a person object",
+  "text_format": {
+    "type": "json_schema",
+    "json_schema": {
+      "name": "person",
+      "schema": {
+        "type": "object",
+        "properties": {
+          "name": {"type": "string"},
+          "age": {"type": "integer"}
+        },
+        "required": ["name", "age"]
+      }
+    }
+  }
+}'
+```
+
+### JSON Object
+
+```bash
+curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{
+  "model": "my-model",
+  "input": "Generate a person as JSON",
+  "text_format": {"type": "json_object"}
+}'
+```
+
 ## Related Features
 
 - [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs