diff --git a/backend/backend.proto b/backend/backend.proto index 6312036b28cf..0a48cd6b735e 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -162,7 +162,7 @@ message PredictOptions { string ToolChoice = 49; // JSON string or object specifying tool choice behavior int32 Logprobs = 50; // Number of top logprobs to return (maps to OpenAI logprobs parameter) int32 TopLogprobs = 51; // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter) - map<string, string> Metadata = 52; // Generic per-request metadata (e.g., enable_thinking) + map<string, string> Metadata = 52; // Generic per-request metadata (e.g., enable_thinking, json_schema, response_format) } // The response message containing the result diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 56698a54e5f5..3a0e6b455330 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -2,9 +2,11 @@ import asyncio from concurrent import futures import argparse +import json import signal import sys import os +import time from typing import List from PIL import Image @@ -15,6 +17,21 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams + +# vLLM renamed GuidedDecodingParams to StructuredOutputsParams in newer versions. +# The corresponding SamplingParams field also changed from guided_decoding to structured_outputs. 
+try: + from vllm.sampling_params import StructuredOutputsParams + _structured_output_cls = StructuredOutputsParams + _structured_output_field = "structured_outputs" +except ImportError: + try: + from vllm.sampling_params import GuidedDecodingParams + _structured_output_cls = GuidedDecodingParams + _structured_output_field = "guided_decoding" + except ImportError: + _structured_output_cls = None + _structured_output_field = None from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image @@ -218,7 +235,6 @@ async def _predict(self, request, context, streaming=False): "SkipSpecialTokens": "skip_special_tokens", "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", "TruncatePromptTokens": "truncate_prompt_tokens", - "GuidedDecoding": "guided_decoding", } sampling_params = SamplingParams(top_p=0.9, max_tokens=200) @@ -229,6 +245,21 @@ async def _predict(self, request, context, streaming=False): if value not in (None, 0, [], False): setattr(sampling_params, param_field, value) + # Handle structured output via guided decoding / structured outputs + # Read json_schema and response_format from Metadata map (avoids extra proto fields) + if _structured_output_cls is not None: + metadata = dict(request.Metadata) if hasattr(request, 'Metadata') and request.Metadata else {} + constraint = None + if metadata.get("json_schema"): + constraint = _structured_output_cls(json=metadata["json_schema"]) + elif metadata.get("response_format") == "json_object": + constraint = _structured_output_cls(json_object=True) + elif hasattr(request, 'Grammar') and request.Grammar: + constraint = _structured_output_cls(grammar=request.Grammar) + + if constraint is not None: + setattr(sampling_params, _structured_output_field, constraint) + # Extract image paths and process images prompt = request.Prompt diff --git a/core/backend/options.go b/core/backend/options.go index 3268c9287554..61056a9669d4 100644 --- 
a/core/backend/options.go +++ b/core/backend/options.go @@ -263,6 +263,12 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions metadata["enable_thinking"] = "true" } } + if c.ResponseFormat != "" { + metadata["response_format"] = c.ResponseFormat + } + for k, v := range c.RequestMetadata { + metadata[k] = v + } pbOpts.Metadata = metadata // Logprobs and TopLogprobs are set by the caller if provided diff --git a/core/config/model_config.go b/core/config/model_config.go index bcb6105ac04c..9d3a6f2ccd7f 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -51,6 +51,7 @@ type ModelConfig struct { functionCallString, functionCallNameString string `yaml:"-" json:"-"` ResponseFormat string `yaml:"-" json:"-"` ResponseFormatMap map[string]interface{} `yaml:"-" json:"-"` + RequestMetadata map[string]string `yaml:"-" json:"-"` FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"` ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"` diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 8f4a44a07469..394918769622 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -430,7 +430,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator switch d.Type { case "json_object": input.Grammar = functions.JSONBNF + config.ResponseFormat = "json_object" case "json_schema": + config.ResponseFormat = "json_schema" d := schema.JsonSchemaRequest{} dat, err := json.Marshal(config.ResponseFormatMap) if err != nil { @@ -440,6 +442,16 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator if err != nil { return err } + + // Pass raw JSON schema via metadata for backends that support native structured output + schemaBytes, err := json.Marshal(d.JsonSchema.Schema) + if err == nil { + if config.RequestMetadata == nil { + 
config.RequestMetadata = map[string]string{} + } + config.RequestMetadata["json_schema"] = string(schemaBytes) + } + fs := &functions.JSONFunctionStructure{ AnyOf: []functions.Item{d.JsonSchema.Schema}, } diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index 25935120d44d..be4de35686db 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -87,8 +87,34 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva d := schema.ChatCompletionResponseFormat{} dat, _ := json.Marshal(config.ResponseFormatMap) _ = json.Unmarshal(dat, &d) - if d.Type == "json_object" { + switch d.Type { + case "json_object": input.Grammar = functions.JSONBNF + config.ResponseFormat = "json_object" + case "json_schema": + config.ResponseFormat = "json_schema" + jsr := schema.JsonSchemaRequest{} + dat, err := json.Marshal(config.ResponseFormatMap) + if err == nil { + if err := json.Unmarshal(dat, &jsr); err == nil { + schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema) + if err == nil { + if config.RequestMetadata == nil { + config.RequestMetadata = map[string]string{} + } + config.RequestMetadata["json_schema"] = string(schemaBytes) + } + fs := &functions.JSONFunctionStructure{ + AnyOf: []functions.Item{jsr.JsonSchema.Schema}, + } + g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) 
+ if err == nil { + input.Grammar = g + } else { + xlog.Error("Failed generating grammar", "error", err) + } + } + } } } diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go index 540f29a514bf..4aa49f36b73e 100644 --- a/core/http/endpoints/openresponses/responses.go +++ b/core/http/endpoints/openresponses/responses.go @@ -128,9 +128,42 @@ func ResponsesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eval Functions: funcs, } - // Handle text_format -> response_format conversion + // Handle text_format -> response_format conversion and structured output if input.TextFormat != nil { - openAIReq.ResponseFormat = convertTextFormatToResponseFormat(input.TextFormat) + responseFormat := convertTextFormatToResponseFormat(input.TextFormat) + openAIReq.ResponseFormat = responseFormat + + // Generate grammar and pass schema for structured output (like OpenAI chat/completion) + if rfMap, ok := responseFormat.(map[string]interface{}); ok { + if rfType, _ := rfMap["type"].(string); rfType == "json_object" { + cfg.Grammar = functions.JSONBNF + cfg.ResponseFormat = "json_object" + } else if rfType == "json_schema" { + cfg.ResponseFormat = "json_schema" + d := schema.JsonSchemaRequest{} + dat, err := json.Marshal(rfMap) + if err == nil { + if err := json.Unmarshal(dat, &d); err == nil { + schemaBytes, err := json.Marshal(d.JsonSchema.Schema) + if err == nil { + if cfg.RequestMetadata == nil { + cfg.RequestMetadata = map[string]string{} + } + cfg.RequestMetadata["json_schema"] = string(schemaBytes) + } + fs := &functions.JSONFunctionStructure{ + AnyOf: []functions.Item{d.JsonSchema.Schema}, + } + g, err := fs.Grammar(cfg.FunctionsConfig.GrammarOptions()...) 
+ if err == nil { + cfg.Grammar = g + } else { + xlog.Error("Open Responses - Failed generating grammar for json_schema", "error", err) + } + } + } + } + } } // Generate grammar for function calling (similar to OpenAI chat endpoint) diff --git a/docs/content/features/constrained_grammars.md b/docs/content/features/constrained_grammars.md index 33d50c900ba5..44dde2e58658 100644 --- a/docs/content/features/constrained_grammars.md +++ b/docs/content/features/constrained_grammars.md @@ -10,7 +10,11 @@ url = "/features/constrained_grammars/" The `chat` endpoint supports the `grammar` parameter, which allows users to specify a grammar in Backus-Naur Form (BNF). This feature enables the Large Language Model (LLM) to generate outputs adhering to a user-defined schema, such as `JSON`, `YAML`, or any other format that can be defined using BNF. For more details about BNF, see [Backus-Naur Form on Wikipedia](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form). {{% notice note %}} -**Compatibility Notice:** This feature is only supported by models that use the [llama.cpp](https://github.com/ggerganov/llama.cpp) backend. For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. For technical details, see the related pull requests: [PR #1773](https://github.com/ggerganov/llama.cpp/pull/1773) and [PR #1887](https://github.com/ggerganov/llama.cpp/pull/1887). +**Compatibility Notice:** Grammar and structured output support is available for the following backends: +- **llama.cpp** — supports the `grammar` parameter (GBNF syntax) and `response_format` with `json_schema`/`json_object` +- **vLLM** — supports the `grammar` parameter (via xgrammar), `response_format` with `json_schema` (native JSON schema enforcement), and `json_object` + +For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. 
{{% /notice %}} ## Setup @@ -66,6 +70,96 @@ For more complex grammars, you can define multi-line BNF rules. The grammar pars - Character classes (`[a-z]`) - String literals (`"text"`) +## vLLM Backend + +The vLLM backend supports structured output via three methods: + +### JSON Schema (recommended) + +Use the OpenAI-compatible `response_format` parameter with `json_schema` to enforce a specific JSON structure: + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Generate a person object"}], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "person", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"} + }, + "required": ["name", "age"] + } + } + } +}' +``` + +### JSON Object + +Force the model to output valid JSON (without a specific schema): + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Generate a person as JSON"}], + "response_format": {"type": "json_object"} +}' +``` + +### Grammar + +The `grammar` parameter also works with vLLM via xgrammar: + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Do you like apples?"}], + "grammar": "root ::= (\"yes\" | \"no\")" +}' +``` + +## Open Responses API + +The Open Responses API (`/v1/responses`) also supports structured output via the `text_format` parameter: + +### JSON Schema + +```bash +curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{ + "model": "my-model", + "input": "Generate a person object", + "text_format": { + "type": "json_schema", + "json_schema": { + "name": "person", + "schema": { + "type": "object", + "properties": { + "name": 
{"type": "string"}, + "age": {"type": "integer"} + }, + "required": ["name", "age"] + } + } + } +}' +``` + +### JSON Object + +```bash +curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{ + "model": "my-model", + "input": "Generate a person as JSON", + "text_format": {"type": "json_object"} +}' +``` + ## Related Features - [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs