diff --git a/demos/continuous_batching/README.md b/demos/continuous_batching/README.md index 79576dc503..7bf0f5789d 100644 --- a/demos/continuous_batching/README.md +++ b/demos/continuous_batching/README.md @@ -16,7 +16,7 @@ ovms_demos_continuous_batching_accuracy ``` This demo shows how to deploy LLM models in the OpenVINO Model Server using continuous batching and paged attention algorithms. -Text generation use case is exposed via OpenAI API `chat/completions` and `completions` endpoints. +Text generation use case is exposed via OpenAI API `chat/completions`, `completions` and `responses` endpoints. That makes it easy to use and efficient especially on on Intel® Xeon® processors and ARC GPUs. > **Note:** This demo was tested on 4th - 6th generation Intel® Xeon® Scalable Processors, and Intel® Core Ultra Series on Ubuntu24 and Windows11. @@ -73,7 +73,7 @@ curl http://localhost:8000/v3/models ## Request Generation -Model exposes both `chat/completions` and `completions` endpoints with and without stream capabilities. +Model exposes both `chat/completions`, `completions` and `responses` endpoints with and without stream capabilities. Chat endpoint is expected to be used for scenarios where conversation context should be pasted by the client and the model prompt is created by the server based on the jinja model template. Completion endpoint should be used to pass the prompt directly by the client and for models without the jinja template. Here is demonstrated model `Qwen/Qwen3-30B-A3B-Instruct-2507` in int4 precision. It has chat capability so `chat/completions` endpoint will be employed: @@ -148,9 +148,76 @@ curl -s http://localhost:8000/v3/chat/completions -H "Content-Type: application/ ::: +### Unary calls to responses endpoint using cURL + +::::{tab-set} + +:::{tab-item} Linux +```bash +curl http://localhost:8000/v3/responses \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "max_output_tokens":30, + "input": "What is OpenVINO?" + }'| jq . 
+``` +::: + +:::{tab-item} Windows +Windows Powershell +```powershell +(Invoke-WebRequest -Uri "http://localhost:8000/v3/responses" ` + -Method POST ` + -Headers @{ "Content-Type" = "application/json" } ` + -Body '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "max_output_tokens": 30, "input": "What is OpenVINO?"}').Content +``` + +Windows Command Prompt +```bat +curl -s http://localhost:8000/v3/responses -H "Content-Type: application/json" -d "{\"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\", \"max_output_tokens\": 30, \"input\": \"What is OpenVINO?\"}" +``` +::: + +:::: + +:::{dropdown} Expected Response +```json +{ + "id": "resp-1724405400", + "object": "response", + "created_at": 1724405400, + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "status": "completed", + "output": [ + { + "id": "msg-0", + "type": "message", + "role": "assistant", + "status": "completed", + "content": [ + { + "type": "output_text", + "text": "OpenVINO is an open-source software framework developed by Intel for optimizing and deploying computer vision, machine learning, and deep learning models on various devices,", + "annotations": [] + } + ] + } + ], + "usage": { + "input_tokens": 27, + "input_tokens_details": { "cached_tokens": 0 }, + "output_tokens": 30, + "output_tokens_details": { "reasoning_tokens": 0 }, + "total_tokens": 57 + } +} +``` +::: + ### OpenAI Python package -The endpoints `chat/completions` and `completions` are compatible with OpenAI client so it can be easily used to generate code also in streaming mode: +The endpoints `chat/completions`, `completions` and `responses` are compatible with OpenAI client so it can be easily used to generate code also in streaming mode: Install the client library: ```console @@ -262,6 +329,31 @@ So, **6 = 3**. ``` ::: +:::{tab-item} Responses +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:8000/v3", + api_key="unused" +) + +stream = client.responses.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + input="Say this is a test", + stream=True, +) +for event in stream: + if event.type == "response.output_text.delta": + print(event.delta, end="", flush=True) +``` + +Output: +``` +It looks like you're testing me! +``` +::: + :::: ## Check how to use AI agents with MCP servers and language models @@ -300,5 +392,6 @@ Check the [guide of using lm-evaluation-harness](./accuracy/README.md) - [Official OpenVINO LLM models in HuggingFace](https://huggingface.co/collections/OpenVINO/llm) - [Chat Completions API](../../docs/model_server_rest_api_chat.md) - [Completions API](../../docs/model_server_rest_api_completions.md) +- [Responses API](../../docs/model_server_rest_api_responses.md) - [Writing client code](../../docs/clients_genai.md) - [LLM calculator reference](../../docs/llm/reference.md) diff --git a/demos/continuous_batching/vlm/README.md b/demos/continuous_batching/vlm/README.md index 0de93e27b8..2580360bd3 100644 --- a/demos/continuous_batching/vlm/README.md +++ b/demos/continuous_batching/vlm/README.md @@ -9,7 +9,7 @@ ovms_demos_vlm_npu ``` This demo shows how to deploy Vision Language Models in the OpenVINO Model Server. -Text generation use case is exposed via OpenAI API `chat/completions` endpoint. +Text generation use case is exposed via OpenAI API `chat/completions` and `responses` endpoints. > **Note:** This demo was tested on 4th - 6th generation Intel® Xeon® Scalable Processors, Intel® Arc™ GPU Series and Intel® Core Ultra Series on Ubuntu24, RedHat9 and Windows11. 
@@ -119,6 +119,45 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js ``` ::: +:::{dropdown} **Unary call with curl using responses endpoint** +**Note**: using urls in request requires `--allowed_media_domains` parameter described [here](../../../docs/parameters.md) + +```bash +curl http://localhost:8000/v3/responses -H "Content-Type: application/json" -d "{ \"model\": \"OpenGVLab/InternVL2-2B\", \"input\":[{\"role\": \"user\", \"content\": [{\"type\": \"input_text\", \"text\": \"Describe what is on the picture.\"},{\"type\": \"input_image\", \"image_url\": \"http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/3/demos/common/static/images/zebra.jpeg\"}]}], \"max_output_tokens\": 100}" +``` +```json +{ + "id": "resp-1741731554", + "object": "response", + "created_at": 1741731554, + "model": "OpenGVLab/InternVL2-2B", + "status": "completed", + "output": [ + { + "id": "msg-0", + "type": "message", + "role": "assistant", + "status": "completed", + "content": [ + { + "type": "output_text", + "text": "The picture features a zebra standing in a grassy plain. Zebras are known for their distinctive black and white striped patterns, which help them blend in for camouflage purposes.", + "annotations": [] + } + ] + } + ], + "usage": { + "input_tokens": 19, + "input_tokens_details": { "cached_tokens": 0 }, + "output_tokens": 83, + "output_tokens_details": { "reasoning_tokens": 0 }, + "total_tokens": 102 + } +} +``` +::: + :::{dropdown} **Unary call with python requests library** ```console @@ -177,9 +216,9 @@ print(response.text) } ``` ::: -:::{dropdown} **Streaming request with OpenAI client** +:::{dropdown} **Streaming request with OpenAI client using chat/completions** -The endpoints `chat/completions` is compatible with OpenAI client so it can be easily used to generate code also in streaming mode: +The endpoints `chat/completions` and `responses` are compatible with OpenAI client so it can be easily used to generate code also in streaming mode: Install the client library: ```console @@ -223,6 +262,79 @@ The picture features a zebra standing in a grassy area. The zebra is characteriz ::: +:::{dropdown} **Streaming request with OpenAI client using responses endpoint** + +```console +pip3 install openai +``` +```python +from openai import OpenAI +import base64 +base_url='http://localhost:8080/v3' +model_name = "OpenGVLab/InternVL2-2B" + +client = OpenAI(api_key='unused', base_url=base_url) + +def convert_image(Image): + with open(Image,'rb' ) as file: + base64_image = base64.b64encode(file.read()).decode("utf-8") + return base64_image + +stream = client.responses.create( + model=model_name, + input=[ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "Describe what is on the picture."}, + {"type": "input_image", "image_url": f"data:image/jpeg;base64,{convert_image('zebra.jpeg')}"} + ] + } + ], + stream=True, +) +for event in stream: + if event.type == "response.output_text.delta": + print(event.delta, end="", flush=True) +``` + +Output: +``` +The picture features a zebra standing in a grassy area. The zebra is characterized by its distinctive black and white striped pattern, which covers its entire body, including its legs, neck, and head. Zebras have small, rounded ears and a long, flowing tail. The background appears to be a natural grassy habitat, typical of a savanna or plain. 
+``` + +::: + +## Benchmarking text generation with high concurrency + +OpenVINO Model Server employs efficient parallelization for text generation. It can be used to generate text also in high concurrency in the environment shared by multiple clients. +It can be demonstrated using benchmarking app from vLLM repository: +```console +git clone --branch v0.7.3 --depth 1 https://github.com/vllm-project/vllm +cd vllm +pip3 install -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +cd benchmarks +python benchmark_serving.py --backend openai-chat --dataset-name hf --dataset-path lmarena-ai/vision-arena-bench-v0.1 --hf-split train --host localhost --port 8000 --model OpenGVLab/InternVL2-2B --endpoint /v3/chat/completions --max-concurrency 1 --num-prompts 100 --trust-remote-code + +Burstiness factor: 1.0 (Poisson process) +Maximum request concurrency: None +============ Serving Benchmark Result ============ +Successful requests: 100 +Benchmark duration (s): 287.81 +Total input tokens: 15381 +Total generated tokens: 20109 +Request throughput (req/s): 0.35 +Output token throughput (tok/s): 69.87 +Total Token throughput (tok/s): 123.31 +---------------Time to First Token---------------- +Mean TTFT (ms): 1513.96 +Median TTFT (ms): 1368.93 +P99 TTFT (ms): 2647.45 +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): 6.68 +Median TPOT (ms): 6.68 +P99 TPOT (ms): 8.02 +``` ## Testing the model accuracy over serving API @@ -237,5 +349,6 @@ Check [VLM usage with NPU acceleration](../../vlm_npu/README.md) - [Export models to OpenVINO format](../common/export_models/README.md) - [Supported VLM models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) - [Chat Completions API](../../../docs/model_server_rest_api_chat.md) +- [Responses API](../../../docs/model_server_rest_api_responses.md) - [Writing client code](../../../docs/clients_genai.md) - [LLM calculator reference](../../../docs/llm/reference.md) diff --git a/docs/llm/reference.md b/docs/llm/reference.md index 654c9b6d90..222777e4a7 100644 --- a/docs/llm/reference.md +++ b/docs/llm/reference.md @@ -44,7 +44,7 @@ struct HttpPayload { std::shared_ptr client; }; ``` -The input json content should be compatible with the [chat completions](../model_server_rest_api_chat.md) or [completions](../model_server_rest_api_completions.md) API. +The input json content should be compatible with the [chat completions](../model_server_rest_api_chat.md), [completions](../model_server_rest_api_completions.md) or [responses](../model_server_rest_api_responses.md) API. The input also includes a side packet with a reference to `LLM_NODE_RESOURCES` which is a shared object representing an LLM engine. It loads the model, runs the generation cycles and reports the generated results to the LLM calculator via a generation handler. 
diff --git a/src/http_rest_api_handler.cpp b/src/http_rest_api_handler.cpp index afe163e6dc..ab6a2c8868 100644 --- a/src/http_rest_api_handler.cpp +++ b/src/http_rest_api_handler.cpp @@ -531,7 +531,7 @@ static Status createV3HttpPayload( return Status(StatusCode::JSON_INVALID, "model field is not a string"); } - bool isTextGenerationEndpoint = uri.find("completions") != std::string_view::npos; + bool isTextGenerationEndpoint = (uri.find("completions") != std::string_view::npos) || (uri.find("responses") != std::string_view::npos); if (isTextGenerationEndpoint) { auto streamIt = parsedJson->FindMember("stream"); if (streamIt != parsedJson->MemberEnd()) { diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 6898b51604..7915d88ede 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -17,6 +17,7 @@ #include "openai_completions.hpp" #include +#include #include #include #include @@ -95,8 +96,231 @@ ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& v throw std::invalid_argument("Unsupported JSON value type"); } +std::string serializeResponsesEvent(const std::function&)>& eventSerializer) { + StringBuffer eventBuffer; + Writer eventWriter(eventBuffer); + eventSerializer(eventWriter); + return std::string(eventBuffer.GetString()); +} + } // namespace +void OpenAIChatCompletionsHandler::serializeResponsesToolChoice(Writer& writer) const { + writer.String("tool_choice"); + if (request.toolChoice.empty()) { + writer.String("auto"); + } else if (request.toolChoice == "auto" || request.toolChoice == "none" || request.toolChoice == "required") { + writer.String(request.toolChoice.c_str()); + } else { + writer.StartObject(); + writer.String("type"); + writer.String("function"); + writer.String("name"); + writer.String(request.toolChoice.c_str()); + writer.EndObject(); + } +} + +void OpenAIChatCompletionsHandler::serializeResponsesTools(Writer& writer) const { + writer.String("tools"); + writer.StartArray(); + for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) { + writer.StartObject(); + writer.String("type"); + writer.String("function"); + writer.String("name"); + writer.String(toolName.c_str()); + writer.String("parameters"); + writer.RawValue(toolSchemaWrapper.stringRepr.c_str(), toolSchemaWrapper.stringRepr.size(), rapidjson::kObjectType); + writer.EndObject(); + } + writer.EndArray(); +} + +void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer& writer, const std::string& responseId, int64_t createdAt, + const char* status, const std::string& fullOutputText, bool includeUsage) const { + writer.StartObject(); + writer.String("id"); + writer.String(responseId.c_str()); + writer.String("object"); + writer.String("response"); + writer.String("created_at"); + writer.Int64(createdAt); + if (std::string(status) == "completed") { + writer.String("completed_at"); + writer.Int64(createdAt); + } + writer.String("model"); + writer.String(request.model.c_str()); + writer.String("status"); + writer.String(status); + + writer.String("parallel_tool_calls"); + writer.Bool(false); + serializeResponsesToolChoice(writer); + serializeResponsesTools(writer); + + if (request.maxTokens.has_value()) { + writer.String("max_output_tokens"); + writer.Uint64(static_cast(request.maxTokens.value())); + } + + writer.String("output"); + writer.StartArray(); + if (!fullOutputText.empty()) { + writer.StartObject(); + writer.String("id"); + writer.String("msg-0"); + 
writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String(std::string(status) == "completed" ? "completed" : "in_progress"); + writer.String("content"); + writer.StartArray(); + serializeResponsesPart(writer, fullOutputText); + writer.EndArray(); + writer.EndObject(); + } + writer.EndArray(); + + if (includeUsage) { + writer.String("usage"); + writer.StartObject(); + writer.String("input_tokens"); + writer.Uint64(static_cast(usage.promptTokens)); + writer.String("input_tokens_details"); + writer.StartObject(); + writer.String("cached_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("output_tokens"); + writer.Uint64(static_cast(usage.completionTokens)); + writer.String("output_tokens_details"); + writer.StartObject(); + writer.String("reasoning_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("total_tokens"); + writer.Uint64(static_cast(usage.calculateTotalTokens())); + writer.EndObject(); + } + + writer.EndObject(); +} + +void OpenAIChatCompletionsHandler::serializeResponsesOutputItem(Writer& writer, const std::string& outputItemId, + const std::string& text, const char* status, bool withContent) { + writer.StartObject(); + writer.String("id"); + writer.String(outputItemId.c_str()); + writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String(status); + writer.String("content"); + writer.StartArray(); + if (withContent) { + serializeResponsesPart(writer, text); + } + writer.EndArray(); + writer.EndObject(); +} + +void OpenAIChatCompletionsHandler::serializeResponsesPart(Writer& writer, const std::string& text) { + writer.StartObject(); + writer.String("type"); + writer.String("output_text"); + writer.String("text"); + writer.String(text.c_str()); + writer.String("annotations"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); +} + +std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const std::vector& parsedOutputs) const { + const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const std::string responseId = "resp-" + std::to_string(createdAt); + + StringBuffer buffer; + Writer writer(buffer); + + writer.StartObject(); + writer.String("id"); + writer.String(responseId.c_str()); + writer.String("object"); + writer.String("response"); + writer.String("created_at"); + writer.Int64(createdAt); + writer.String("completed_at"); + writer.Int64(createdAt); + writer.String("model"); + writer.String(request.model.c_str()); + writer.String("status"); + writer.String("completed"); + + writer.String("parallel_tool_calls"); + writer.Bool(false); + serializeResponsesToolChoice(writer); + serializeResponsesTools(writer); + + if (request.maxTokens.has_value()) { + writer.String("max_output_tokens"); + writer.Uint64(static_cast(request.maxTokens.value())); + } + + writer.String("output"); + writer.StartArray(); + int outputIndex = 0; + for (const auto& parsedOutput : parsedOutputs) { + const std::string outputId = "msg-" + std::to_string(outputIndex++); + + writer.StartObject(); + writer.String("id"); + writer.String(outputId.c_str()); + writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String("completed"); + writer.String("content"); + writer.StartArray(); + serializeResponsesPart(writer, parsedOutput.content); + 
writer.EndArray(); + writer.EndObject(); + } + writer.EndArray(); + + writer.String("usage"); + writer.StartObject(); + writer.String("input_tokens"); + writer.Uint64(static_cast(usage.promptTokens)); + writer.String("input_tokens_details"); + writer.StartObject(); + writer.String("cached_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("output_tokens"); + writer.Uint64(static_cast(usage.completionTokens)); + writer.String("output_tokens_details"); + writer.StartObject(); + writer.String("reasoning_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("total_tokens"); + writer.Uint64(static_cast(usage.calculateTotalTokens())); + writer.EndObject(); + + writer.EndObject(); + + return buffer.GetString(); +} + absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() { // prompt: string auto it = doc.FindMember("prompt"); @@ -247,6 +471,193 @@ absl::Status OpenAIChatCompletionsHandler::ensureArgumentsInToolCalls(Value& mes return absl::OkStatus(); } +absl::Status OpenAIChatCompletionsHandler::parseResponsesInputDirectly(std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains) { + auto inputIt = doc.FindMember("input"); + if (inputIt == doc.MemberEnd()) { + return absl::InvalidArgumentError("input missing in request"); + } + + auto& allocator = doc.GetAllocator(); + rapidjson::Value messages(rapidjson::kArrayType); + + if (inputIt->value.IsString()) { + request.prompt = inputIt->value.GetString(); + if (!request.prompt.has_value() || request.prompt.value().empty()) { + return absl::InvalidArgumentError("input cannot be empty"); + } + + request.chatHistory.push_back({}); + request.chatHistory.last()["role"] = "user"; + request.chatHistory.last()["content"] = request.prompt.value(); + + rapidjson::Value messageObj(rapidjson::kObjectType); + messageObj.AddMember("role", "user", allocator); + messageObj.AddMember("content", rapidjson::Value(request.prompt->c_str(), allocator), allocator); + messages.PushBack(messageObj, allocator); + } else if (inputIt->value.IsArray()) { + if (inputIt->value.GetArray().Size() == 0) { + return absl::InvalidArgumentError("Messages array cannot be empty"); + } + + for (size_t i = 0; i < inputIt->value.GetArray().Size(); ++i) { + auto& item = inputIt->value.GetArray()[i]; + if (!item.IsObject()) { + return absl::InvalidArgumentError("input array items must be objects"); + } + + auto itemObj = item.GetObject(); + auto roleIt = itemObj.FindMember("role"); + if (roleIt == itemObj.MemberEnd() || !roleIt->value.IsString()) { + return absl::InvalidArgumentError("input item role is missing or invalid"); + } + + request.chatHistory.push_back({}); + request.chatHistory.last()["role"] = roleIt->value.GetString(); + + rapidjson::Value messageObj(rapidjson::kObjectType); + messageObj.AddMember("role", rapidjson::Value(roleIt->value.GetString(), allocator), allocator); + + auto contentIt = itemObj.FindMember("content"); + if (contentIt == itemObj.MemberEnd()) { + return absl::InvalidArgumentError("input item content is missing"); + } + + if (contentIt->value.IsString()) { + messageObj.AddMember("content", rapidjson::Value(contentIt->value.GetString(), allocator), allocator); + request.chatHistory.last()["content"] = contentIt->value.GetString(); + messages.PushBack(messageObj, allocator); + continue; + } + + if (!contentIt->value.IsArray()) { + return absl::InvalidArgumentError("input item content must be a string or array"); + } + if (contentIt->value.GetArray().Size() == 0) { + return absl::InvalidArgumentError("Invalid 
message structure - content array is empty"); + } + + std::string contentText; + for (auto& contentItem : contentIt->value.GetArray()) { + if (!contentItem.IsObject()) { + return absl::InvalidArgumentError("input content items must be objects"); + } + auto contentObj = contentItem.GetObject(); + auto typeIt = contentObj.FindMember("type"); + if (typeIt == contentObj.MemberEnd() || !typeIt->value.IsString()) { + return absl::InvalidArgumentError("input content item type is missing or invalid"); + } + + const std::string type = typeIt->value.GetString(); + if (type == "input_text") { + auto textIt = contentObj.FindMember("text"); + if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) { + return absl::InvalidArgumentError("input_text requires a valid text field"); + } + contentText = textIt->value.GetString(); + } else if (type == "input_image") { + std::string imageUrl; + auto imageUrlIt = contentObj.FindMember("image_url"); + if (imageUrlIt == contentObj.MemberEnd()) { + return absl::InvalidArgumentError("input_image requires image_url field"); + } + if (imageUrlIt->value.IsString()) { + imageUrl = imageUrlIt->value.GetString(); + } else if (imageUrlIt->value.IsObject()) { + auto imageUrlObj = imageUrlIt->value.GetObject(); + auto urlIt = imageUrlObj.FindMember("url"); + if (urlIt == imageUrlObj.MemberEnd() || !urlIt->value.IsString()) { + return absl::InvalidArgumentError("input_image.image_url.url is missing or invalid"); + } + imageUrl = urlIt->value.GetString(); + } else { + return absl::InvalidArgumentError("input_image.image_url must be a string or object"); + } + + std::string pattern = "base64,"; + std::size_t pos = imageUrl.find(pattern); + std::string decoded; + ov::Tensor tensor; + if (pos != std::string::npos) { + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Loading image from base64 string"); + size_t offset = pos + pattern.length(); + if (!absl::Base64Unescape(std::string_view(imageUrl.data() + offset, imageUrl.size() - offset), &decoded)) { + return absl::InvalidArgumentError("Invalid base64 string in request"); + } + try { + tensor = loadImageStbiFromMemory(decoded); + } catch (std::runtime_error& e) { + std::stringstream ss; + ss << "Image parsing failed: " << e.what(); + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str()); + return absl::InvalidArgumentError(ss.str()); + } + } else if (std::regex_match(imageUrl.c_str(), std::regex("^(http|https|ftp|sftp|)://(.*)"))) { + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Loading image using curl"); + int64_t sizeLimit = 20000000; // restrict single image size to 20MB + if (!allowedMediaDomains.has_value() || !isDomainAllowed(allowedMediaDomains.value(), imageUrl.c_str())) { + return absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains"); + } + auto status = downloadImage(imageUrl.c_str(), decoded, sizeLimit); + if (status != absl::OkStatus()) { + return status; + } + try { + tensor = loadImageStbiFromMemory(decoded); + } catch (std::runtime_error& e) { + std::stringstream ss; + ss << "Image parsing failed: " << e.what(); + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str()); + return absl::InvalidArgumentError("Image parsing failed"); + } + } else { + if (!allowedLocalMediaPath.has_value()) { + return absl::InvalidArgumentError("Loading images from local filesystem is disabled."); + } + if (FileSystem::isPathEscaped(imageUrl)) { + std::stringstream ss; + ss << "Path " << imageUrl.c_str() << " escape with .. 
is forbidden."; + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str()); + return absl::InvalidArgumentError(ss.str()); + } + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Loading image from local filesystem"); + const auto firstMissmatch = std::mismatch(imageUrl.begin(), imageUrl.end(), allowedLocalMediaPath.value().begin(), allowedLocalMediaPath.value().end()); + if (firstMissmatch.second != allowedLocalMediaPath.value().end()) { + return absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path"); + } + try { + tensor = loadImageStbiFromFile(imageUrl.c_str()); + } catch (std::runtime_error& e) { + std::stringstream ss; + ss << "Image file " << imageUrl.c_str() << " parsing failed: " << e.what(); + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str()); + return absl::InvalidArgumentError(ss.str()); + } + } + request.imageHistory.push_back({i, tensor}); + } else { + return absl::InvalidArgumentError("Unsupported content type"); + } + } + + messageObj.AddMember("content", rapidjson::Value(contentText.c_str(), allocator), allocator); + request.chatHistory.last()["content"] = contentText; + messages.PushBack(messageObj, allocator); + } + } else { + return absl::InvalidArgumentError("input is not a string or array"); + } + + auto existingMessages = doc.FindMember("messages"); + if (existingMessages != doc.MemberEnd()) { + existingMessages->value = messages; + } else { + doc.AddMember("messages", messages, allocator); + } + + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Parsed responses input directly to chat history"); + return absl::OkStatus(); +} + absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains) { auto it = doc.FindMember("messages"); if (it == doc.MemberEnd()) @@ -415,8 +826,9 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") return absl::InvalidArgumentError("tool_choice should be either 'none' or 'auto' or 'required'"); } else if (tool_choice_it->value.IsObject()) { - auto tool_choice_functionIt = tool_choice_it->value.GetObject().FindMember("function"); - if (tool_choice_functionIt != tool_choice_it->value.GetObject().MemberEnd() && tool_choice_functionIt->value.IsObject()) { + auto toolChoiceObj = tool_choice_it->value.GetObject(); + auto tool_choice_functionIt = toolChoiceObj.FindMember("function"); + if (tool_choice_functionIt != toolChoiceObj.MemberEnd() && tool_choice_functionIt->value.IsObject()) { auto nameIt = tool_choice_functionIt->value.GetObject().FindMember("name"); if (nameIt != tool_choice_functionIt->value.GetObject().MemberEnd() && nameIt->value.IsString()) { tool_choice = nameIt->value.GetString(); @@ -424,7 +836,16 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { return absl::InvalidArgumentError("tool_choice.function.name is not a valid string"); } } else { - return absl::InvalidArgumentError("tool_choice.function is not a valid JSON object"); + auto typeIt = toolChoiceObj.FindMember("type"); + auto nameIt = toolChoiceObj.FindMember("name"); + if (typeIt != toolChoiceObj.MemberEnd() && typeIt->value.IsString() && std::string(typeIt->value.GetString()) == "function") { + if (nameIt == toolChoiceObj.MemberEnd() || !nameIt->value.IsString()) { + return absl::InvalidArgumentError("tool_choice.name is not a valid string"); + } + tool_choice = nameIt->value.GetString(); + } else { + return absl::InvalidArgumentError("tool_choice.function is not a valid JSON object"); 
+ } } } else { return absl::InvalidArgumentError("tool_choice is not a valid JSON object or string"); @@ -444,38 +865,71 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { auto& obj = it->value.GetArray()[i]; if (!obj.IsObject()) return absl::InvalidArgumentError("Tool is not a JSON object"); + rapidjson::Value* functionObj = nullptr; + rapidjson::Value* parametersValue = nullptr; + const char* functionNameCStr = nullptr; + auto functionIt = obj.FindMember("function"); - if (functionIt != obj.MemberEnd() && functionIt->value.IsObject()) { - auto nameIt = functionIt->value.GetObject().FindMember("name"); - if (nameIt != functionIt->value.GetObject().MemberEnd() && nameIt->value.IsString()) { - std::string functionName = nameIt->value.GetString(); - // If tool_choice is set to "auto", we keep all tools - // If tool_choice is set to a specific function name, we keep only that tool - if (tool_choice != "auto" && tool_choice != "required" && tool_choice != functionName) { - it->value.Erase(&obj); - jsonChanged = true; - } else { - i++; - // If we keep the tool, add tool name and schema to the request - auto parametersIt = functionIt->value.GetObject().FindMember("parameters"); - if (parametersIt != functionIt->value.GetObject().MemberEnd() && parametersIt->value.IsObject()) { - // now we want to insert to a mapping of - // tool name -> tool schema representations struct - // Dump parameters object to string since this is the schema format expected by GenAI - // Keep the rapidjson::Value object as well to avoid re-parsing in outputParsers - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - parametersIt->value.Accept(writer); - std::string parametersStr = buffer.GetString(); - ToolSchemaWrapper schemaReprs{¶metersIt->value, std::move(parametersStr)}; - request.toolNameSchemaMap[nameIt->value.GetString()] = std::move(schemaReprs); - } - } - } else { + if (functionIt != obj.MemberEnd()) { + if (!functionIt->value.IsObject()) { + return absl::InvalidArgumentError("Function is not a valid JSON object"); + } + functionObj = &functionIt->value; + auto nameIt = functionObj->GetObject().FindMember("name"); + if (nameIt == functionObj->GetObject().MemberEnd() || !nameIt->value.IsString()) { return absl::InvalidArgumentError("Function object does not contain a valid name field"); } + functionNameCStr = nameIt->value.GetString(); + auto parametersIt = functionObj->GetObject().FindMember("parameters"); + if (parametersIt != functionObj->GetObject().MemberEnd()) { + parametersValue = ¶metersIt->value; + } } else { - return absl::InvalidArgumentError("Function is not a valid JSON object"); + auto typeIt = obj.FindMember("type"); + if (typeIt == obj.MemberEnd() || !typeIt->value.IsString()) { + return absl::InvalidArgumentError("Tool type is missing or invalid"); + } + if (std::string(typeIt->value.GetString()) != "function") { + return absl::InvalidArgumentError("Only function tools are supported"); + } + + auto nameIt = obj.FindMember("name"); + if (nameIt == obj.MemberEnd() || !nameIt->value.IsString()) { + return absl::InvalidArgumentError("Function object does not contain a valid name field"); + } + functionNameCStr = nameIt->value.GetString(); + + auto parametersIt = obj.FindMember("parameters"); + if (parametersIt != obj.MemberEnd()) { + parametersValue = ¶metersIt->value; + } + } + + std::string functionName = functionNameCStr; + // If tool_choice is set to "auto", we keep all tools + // If tool_choice is set to a specific function name, we keep only that tool + if 
(tool_choice != "auto" && tool_choice != "required" && tool_choice != functionName) { + it->value.Erase(&obj); + jsonChanged = true; + continue; + } + + i++; + // If we keep the tool, add tool name and schema to the request + if (parametersValue != nullptr) { + if (!parametersValue->IsObject()) { + return absl::InvalidArgumentError("Function parameters are not a valid JSON object"); + } + // now we want to insert to a mapping of + // tool name -> tool schema representations struct + // Dump parameters object to string since this is the schema format expected by GenAI + // Keep the rapidjson::Value object as well to avoid re-parsing in outputParsers + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + parametersValue->Accept(writer); + std::string parametersStr = buffer.GetString(); + ToolSchemaWrapper schemaReprs{parametersValue, std::move(parametersStr)}; + request.toolNameSchemaMap[functionNameCStr] = std::move(schemaReprs); } } } else { @@ -656,6 +1110,74 @@ absl::Status OpenAIChatCompletionsHandler::parseChatCompletionsPart(std::optiona return absl::OkStatus(); } +absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optional maxTokensLimit, std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains) { + // input: string; required + auto it = doc.FindMember("input"); + if (it == doc.MemberEnd()) { + return absl::InvalidArgumentError("input missing in request"); + } + + auto messagesStatus = parseResponsesInputDirectly(allowedLocalMediaPath, allowedMediaDomains); + if (!messagesStatus.ok()) { + return messagesStatus; + } + + // logprobs: bool; optional - defaults to false + it = doc.FindMember("logprobs"); + if (it != doc.MemberEnd() && !it->value.IsNull()) { + if (!it->value.IsBool()) + return absl::InvalidArgumentError("logprobs accepts values true or false"); + request.logprobschat = it->value.GetBool(); + } + if (request.logprobschat && request.stream) { + return absl::InvalidArgumentError("logprobs are not supported in streaming mode."); + } + + auto toolsStatus = parseTools(); + if (!toolsStatus.ok()) { + return toolsStatus; + } + + // max_output_tokens: uint; optional + // OpenAI Responses API uses this field for output token limit. 
+ it = doc.FindMember("max_output_tokens"); + if (it != doc.MemberEnd() && !it->value.IsNull()) { + if (!it->value.IsUint()) { + if (it->value.IsUint64()) + return absl::InvalidArgumentError("max_output_tokens value can't be greater than 4294967295"); + return absl::InvalidArgumentError("max_output_tokens is not an unsigned integer"); + } + if (maxTokensLimit.has_value() && it->value.GetUint() > maxTokensLimit.value()) + return absl::InvalidArgumentError(absl::StrCat("max_output_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); + request.maxTokens = it->value.GetUint(); + } + + // specific part of max_output_tokens validation + if (request.maxTokens == 0) { + return absl::InvalidArgumentError("max_output_tokens value should be greater than 0"); + } + + // parse response_format + it = doc.FindMember("response_format"); + if (it != doc.MemberEnd()) { + if (it->value.IsNull()) + return absl::OkStatus(); + if (!it->value.IsObject()) + return absl::InvalidArgumentError("response_format is not an object"); + const rapidjson::Value& responseFormat = it->value; + request.responseFormat = convertOpenAIResponseFormatToStructuralTagStringFormat(responseFormat); + } + + { + StringBuffer buffer; + Writer writer(buffer); + doc.Accept(writer); + request.processedJson = buffer.GetString(); + } + + return absl::OkStatus(); +} + absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength) { OVMS_PROFILE_FUNCTION(); // stream: bool; optional @@ -712,16 +1234,23 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsUint()) { - if (it->value.IsUint64()) - return absl::InvalidArgumentError("max_tokens value can't be greater than 4294967295"); - return absl::InvalidArgumentError("max_tokens is not an unsigned integer"); + // Not applicable for RESPONSES endpoint which uses max_output_tokens instead + if (endpoint != Endpoint::RESPONSES) { + it = doc.FindMember("max_tokens"); + if (it != doc.MemberEnd()) { + if (!it->value.IsUint()) { + if (it->value.IsUint64()) + return absl::InvalidArgumentError("max_tokens value can't be greater than 4294967295"); + return absl::InvalidArgumentError("max_tokens is not an unsigned integer"); + } + if (maxTokensLimit.has_value() && !(it->value.GetUint() < maxTokensLimit.value())) + return absl::InvalidArgumentError(absl::StrCat("max_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); + request.maxTokens = it->value.GetUint(); + } else { + if (maxTokensLimit.has_value()) { + request.maxTokens = maxTokensLimit.value(); + } } - if (maxTokensLimit.has_value() && !(it->value.GetUint() < maxTokensLimit.value())) - return absl::InvalidArgumentError(absl::StrCat("max_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); - request.maxTokens = it->value.GetUint(); } else { if (maxTokensLimit.has_value()) { request.maxTokens = maxTokensLimit.value(); @@ -848,6 +1377,7 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsNull()) { if (!it->value.IsUint()) @@ -863,12 +1393,15 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsNull()) { if (!it->value.IsUint()) return absl::InvalidArgumentError("n is not an unsigned integer"); if (it->value.GetUint() == 0) return absl::InvalidArgumentError("n value should be greater than 0"); + if (endpoint == Endpoint::RESPONSES && request.stream && it->value.GetUint() > 1) + return absl::InvalidArgumentError("n greater 
than 1 is not supported for responses streaming"); size_t bestOf = request.bestOf.has_value() ? request.bestOf.value() : 1; // 1 is default best_of value if (bestOf < it->value.GetUint()) { return absl::InvalidArgumentError("n value cannot be greater than best_of"); @@ -913,6 +1446,7 @@ std::optional OpenAIChatCompletionsHandler::getNumReturnSequences() const { StreamOptions OpenAIChatCompletionsHandler::getStreamOptions() const { return request.streamOptions; } bool OpenAIChatCompletionsHandler::isStream() const { return request.stream; } +Endpoint OpenAIChatCompletionsHandler::getEndpoint() const { return endpoint; } std::string OpenAIChatCompletionsHandler::getModel() const { return request.model; } std::string OpenAIChatCompletionsHandler::getToolChoice() const { return request.toolChoice; } const std::unique_ptr& OpenAIChatCompletionsHandler::getOutputParser() const { return outputParser; } @@ -937,6 +1471,8 @@ absl::Status OpenAIChatCompletionsHandler::parseRequest(std::optional return status; if (endpoint == Endpoint::COMPLETIONS) status = parseCompletionsPart(); + else if (endpoint == Endpoint::RESPONSES) + status = parseResponsesPart(maxTokensLimit, allowedLocalMediaPath, allowedMediaDomains); else status = parseChatCompletionsPart(maxTokensLimit, allowedLocalMediaPath, allowedMediaDomains); @@ -987,6 +1523,16 @@ ParsedOutput OpenAIChatCompletionsHandler::parseOutputIfNeeded(const std::vector std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vector& generationOutputs) { OVMS_PROFILE_FUNCTION(); + if (endpoint == Endpoint::RESPONSES) { + std::vector parsedOutputs; + usage.completionTokens = 0; + for (const ov::genai::GenerationOutput& generationOutput : generationOutputs) { + updateUsage(usage, generationOutput.generated_ids, request.echo); + parsedOutputs.push_back(parseOutputIfNeeded(generationOutput.generated_ids)); + } + return serializeResponsesUnaryResponse(parsedOutputs); + } + OpenAiJsonResponse jsonResponse; jsonResponse.StartObject(); @@ -1112,6 +1658,15 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco OVMS_PROFILE_FUNCTION(); usage.promptTokens = results.perf_metrics.get_num_input_tokens(); usage.completionTokens = results.perf_metrics.get_num_generated_tokens(); + if (endpoint == Endpoint::RESPONSES) { + std::vector parsedOutputs; + for (const auto& tokens : results.tokens) { + updateUsage(usage, tokens, request.echo); + parsedOutputs.push_back(parseOutputIfNeeded(tokens)); + } + return serializeResponsesUnaryResponse(parsedOutputs); + } + OpenAiJsonResponse jsonResponse; jsonResponse.StartObject(); @@ -1172,6 +1727,27 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD OVMS_PROFILE_FUNCTION(); usage.promptTokens = results.perf_metrics.get_num_input_tokens(); usage.completionTokens = results.perf_metrics.get_num_generated_tokens(); + if (endpoint == Endpoint::RESPONSES) { + std::vector parsedOutputs; + usage.completionTokens = 0; + for (const std::string& text : results.texts) { + auto result = tokenizer.encode(text); + auto& input_ids = result.input_ids; + if (input_ids.get_shape().size() != 2) + throw std::runtime_error("input_ids should have 2 dimensions"); + if (input_ids.get_shape()[0] != 1) + throw std::runtime_error("input_ids should have 1 batch size"); + if (input_ids.get_element_type() != ov::element::i64) + throw std::runtime_error("input_ids should have i64 element type"); + + int64_t* input_ids_data = reinterpret_cast(input_ids.data()); + std::vector 
generatedTokens(input_ids_data, input_ids_data + input_ids.get_shape()[1]); + updateUsage(usage, generatedTokens, request.echo); + parsedOutputs.push_back(parseOutputIfNeeded(generatedTokens)); + } + return serializeResponsesUnaryResponse(parsedOutputs); + } + OpenAiJsonResponse jsonResponse; jsonResponse.StartObject(); @@ -1248,6 +1824,166 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) { OVMS_PROFILE_FUNCTION(); + if (endpoint == Endpoint::RESPONSES) { + const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const std::string responseId = "resp-" + std::to_string(createdAt); + const std::string outputItemId = "msg-0"; + + std::vector events; + if (!responsesStreamingInitialized) { + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.created"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.in_progress"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.output_item.added"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("item"); + serializeResponsesOutputItem(writer, outputItemId, "", "in_progress", false); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.content_part.added"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(outputItemId.c_str()); + writer.String("part"); + serializeResponsesPart(writer, ""); + writer.EndObject(); + })); + + responsesStreamingInitialized = true; + } + + if (!chunkResponse.empty()) { + responsesStreamingOutputText += chunkResponse; + events.emplace_back(serializeResponsesEvent([this, &chunkResponse, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.output_text.delta"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(outputItemId.c_str()); + writer.String("delta"); + writer.String(chunkResponse.c_str()); + writer.String("logprobs"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + })); + } + + if (finishReason != 
ov::genai::GenerationFinishReason::NONE) { + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.output_text.done"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(outputItemId.c_str()); + writer.String("text"); + writer.String(responsesStreamingOutputText.c_str()); + writer.String("logprobs"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.content_part.done"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(outputItemId.c_str()); + writer.String("part"); + serializeResponsesPart(writer, responsesStreamingOutputText); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.output_item.done"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("item"); + serializeResponsesOutputItem(writer, outputItemId, responsesStreamingOutputText, "completed", true); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.completed"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "completed", responsesStreamingOutputText, true); + writer.EndObject(); + })); + } + + if (events.empty()) { + return ""; + } + + std::stringstream ss; + ss << events.front(); + for (size_t i = 1; i < events.size(); ++i) { + ss << "\n\ndata: " << events[i]; + } + return ss.str(); + } + Document doc; doc.SetObject(); Document::AllocatorType& allocator = doc.GetAllocator(); @@ -1334,6 +2070,9 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str std::string OpenAIChatCompletionsHandler::serializeStreamingUsageChunk() { OVMS_PROFILE_FUNCTION(); + if (endpoint == Endpoint::RESPONSES) { + return ""; + } StringBuffer buffer; Writer writer(buffer); diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 516133f03a..c79514a4a6 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -47,6 +47,7 @@ namespace ovms { enum class Endpoint { CHAT_COMPLETIONS, COMPLETIONS, + RESPONSES, TOKENIZE, }; @@ -69,17 +70,32 @@ class OpenAIChatCompletionsHandler { std::chrono::time_point created; ov::genai::Tokenizer tokenizer; size_t processedTokens = 0; // tracks overall number of tokens processed by the pipeline + size_t responsesStreamingSequenceNumber = 0; + bool responsesStreamingInitialized = false; + std::string responsesStreamingOutputText; // Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning. 
std::unique_ptr outputParser = nullptr; absl::Status parseCompletionsPart(); absl::Status parseChatCompletionsPart(std::optional maxTokensLimit, std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains); + absl::Status parseResponsesPart(std::optional maxTokensLimit, std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains); + absl::Status parseResponsesInputDirectly(std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains); absl::Status parseCommonPart(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength); ParsedOutput parseOutputIfNeeded(const std::vector& generatedIds); absl::Status ensureArgumentsInToolCalls(Value& messageObj, bool& jsonChanged); + // Responses API serialization helpers + void serializeResponsesToolChoice(Writer& writer) const; + void serializeResponsesTools(Writer& writer) const; + void serializeResponsesResponseObject(Writer& writer, const std::string& responseId, int64_t createdAt, + const char* status, const std::string& fullOutputText, bool includeUsage) const; + static void serializeResponsesOutputItem(Writer& writer, const std::string& outputItemId, + const std::string& text, const char* status, bool withContent); + static void serializeResponsesPart(Writer& writer, const std::string& text); + std::string serializeResponsesUnaryResponse(const std::vector& parsedOutputs) const; + public: OpenAIChatCompletionsHandler(Document& doc, Endpoint endpoint, std::chrono::time_point creationTime, ov::genai::Tokenizer tokenizer, const std::string& toolParserName = "", const std::string& reasoningParserName = "") : @@ -106,6 +122,7 @@ class OpenAIChatCompletionsHandler { std::optional getResponseFormat() const; bool isStream() const; + Endpoint getEndpoint() const; std::string getModel() const; std::string getToolChoice() const; const std::unique_ptr& getOutputParser() const; diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 6d9810ae5f..3b36aecb94 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -68,10 +68,12 @@ absl::Status GenAiServable::loadRequest(std::shared_ptrendpoint = Endpoint::CHAT_COMPLETIONS; } else if (payload.uri == "/v3/completions" || payload.uri == "/v3/v1/completions") { executionContext->endpoint = Endpoint::COMPLETIONS; + } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") { + executionContext->endpoint = Endpoint::RESPONSES; } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) { executionContext->endpoint = Endpoint::TOKENIZE; } else { - return absl::InvalidArgumentError("Wrong endpoint. Allowed endpoints: /v3/chat/completions, /v3/completions"); + return absl::InvalidArgumentError("Wrong endpoint. 
Allowed endpoints: /v3/chat/completions, /v3/completions, /v3/responses, /v3/tokenize"); } executionContext->payload = payload; return absl::OkStatus(); @@ -204,6 +206,50 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->getChatHistory().size() > 0) { +#if (PYTHON_DISABLE == 0) + bool success; + if (executionContext->apiHandler->getProcessedJson().size() > 0) { + success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); + } else { + success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText); + } + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, inputText); + } +#else + ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory(); + constexpr bool add_generation_prompt = true; + auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer(); + if (!toolsStatus.ok()) { + return toolsStatus.status(); + } + const auto& tools = toolsStatus.value(); + auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); + if (!chatTemplateKwargsStatus.ok()) { + return chatTemplateKwargsStatus.status(); + } + const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); + try { + inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools, chatTemplateKwargs); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); + return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); + } +#endif + if (inputText.size() == 0) { + return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); + } + } else { + auto prompt = executionContext->apiHandler->getPrompt(); + if (!prompt.has_value()) { + return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); + } + inputText = prompt.value(); + } + break; + } case Endpoint::COMPLETIONS: { inputText = executionContext->apiHandler->getPrompt().value(); break; @@ -264,7 +310,10 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptr 0) { + // For RESPONSES endpoint, always call serializeStreamingChunk so that + // initialization events (response.created, response.in_progress, etc.) + // are emitted immediately, even before the tokenizer produces text. 
+ if (lastTextChunk.size() > 0 || executionContext->apiHandler->getEndpoint() == Endpoint::RESPONSES) { std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason); if (!serializedChunk.empty()) { executionContext->response = wrapTextInServerSideEventMessage(serializedChunk); @@ -286,8 +335,12 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptrresponse = wrapTextInServerSideEventMessage(serializedChunk); } - if (executionContext->apiHandler->getStreamOptions().includeUsage) - executionContext->response += wrapTextInServerSideEventMessage(executionContext->apiHandler->serializeStreamingUsageChunk()); + if (executionContext->apiHandler->getStreamOptions().includeUsage) { + std::string usageChunk = executionContext->apiHandler->serializeStreamingUsageChunk(); + if (!usageChunk.empty()) { + executionContext->response += wrapTextInServerSideEventMessage(usageChunk); + } + } executionContext->response += wrapTextInServerSideEventMessage("[DONE]"); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index be33838d9f..94aef05387 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -45,10 +45,12 @@ absl::Status VisualLanguageModelServable::loadRequest(std::shared_ptrendpoint = Endpoint::CHAT_COMPLETIONS; + } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") { + executionContext->endpoint = Endpoint::RESPONSES; } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) { executionContext->endpoint = Endpoint::TOKENIZE; } else { - return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize"); + return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses endpoint or /v3/tokenize"); } executionContext->payload = payload; return absl::OkStatus(); @@ -67,7 +69,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrapiHandler == nullptr) { return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized"); } - if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) { + if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); for (size_t i = 0; i < chatHistory.size(); i++) { diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 2834072410..307723415a 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -53,10 +53,12 @@ absl::Status VisualLanguageModelLegacyServable::loadRequest(std::shared_ptrendpoint = Endpoint::CHAT_COMPLETIONS; + } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") { + executionContext->endpoint = Endpoint::RESPONSES; } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) { executionContext->endpoint = Endpoint::TOKENIZE; } else { - return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize"); + return absl::InvalidArgumentError("Wrong endpoint. 
+        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses endpoint or /v3/tokenize");
     }
     executionContext->payload = payload;
     return absl::OkStatus();
@@ -237,7 +239,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr
     if (executionContext->apiHandler == nullptr) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
     }
-    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) {
+    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
         ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();
         for (size_t i = 0; i < chatHistory.size(); i++) {
diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 94648d0e68..ecb5fc89e9 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //*****************************************************************************
 #include
+#include
 #include
 #include
 #include
@@ -269,55 +270,29 @@ TEST_F(HttpOpenAIHandlerTest, Stream) {
     ASSERT_EQ(response, "");
 }
 
-TEST_F(HttpOpenAIHandlerTest, BodyNotAJson) {
-    std::string requestBody = "not a json";
-
-    EXPECT_CALL(*writer, PartialReplyEnd()).Times(0);
-    EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0);
-    EXPECT_CALL(*writer, IsDisconnected()).Times(0);
-
-    auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser);
-    ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID);
-    ASSERT_EQ(status.string(), "The file is not valid json - Cannot parse JSON body");
-}
-
-TEST_F(HttpOpenAIHandlerTest, JsonBodyValidButNotAnObject) {
-    std::string requestBody = "[1, 2, 3]";
-
-    EXPECT_CALL(*writer, PartialReplyEnd()).Times(0);
-    EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0);
-    EXPECT_CALL(*writer, IsDisconnected()).Times(0);
-
-    auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser);
-    ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID);
-    ASSERT_EQ(status.string(), "The file is not valid json - JSON body must be an object");
-}
-
-TEST_F(HttpOpenAIHandlerTest, ModelFieldMissing) {
+TEST_F(HttpOpenAIHandlerTest, ResponsesStream) {
     std::string requestBody = R"(
-        {
-            "stream": true,
-            "messages": []
-        }
+        {
+            "model": "gpt",
+            "stream": true,
+            "input": "What is OpenVINO?"
+ } )"; - EXPECT_CALL(*writer, PartialReplyEnd()).Times(0); - EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0); - EXPECT_CALL(*writer, IsDisconnected()).Times(0); + EXPECT_CALL(*writer, PartialReplyBegin(::testing::_)).WillOnce(testing::Invoke([](std::function fn) { fn(); })); + EXPECT_CALL(*writer, PartialReplyEnd()).Times(1); + EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(9); + EXPECT_CALL(*writer, IsDisconnected()).Times(9); - auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser); - ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID); - ASSERT_EQ(status.string(), "The file is not valid json - model field is missing in JSON body"); + ASSERT_EQ( + handler->dispatchToProcessor("/v3/responses", requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::PARTIAL_END); + + ASSERT_EQ(response, ""); } -TEST_F(HttpOpenAIHandlerTest, ModelFieldNotAString) { - std::string requestBody = R"( - { - "model": 2, - "stream": true, - "messages": [] - } - )"; +TEST_F(HttpOpenAIHandlerTest, BodyNotAJson) { + std::string requestBody = "not a json"; EXPECT_CALL(*writer, PartialReplyEnd()).Times(0); EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0); @@ -325,26 +300,19 @@ TEST_F(HttpOpenAIHandlerTest, ModelFieldNotAString) { auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser); ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID); - ASSERT_EQ(status.string(), "The file is not valid json - model field is not a string"); + ASSERT_EQ(status.string(), "The file is not valid json - Cannot parse JSON body"); } -TEST_F(HttpOpenAIHandlerTest, StreamFieldNotABoolean) { - std::string requestBody = R"( - { - "model": "gpt", - "stream": 2, - "messages": [] - } - )"; +TEST_F(HttpOpenAIHandlerTest, JsonBodyValidButNotAnObject) { + std::string requestBody = "[1, 2, 3]"; - EXPECT_CALL(*writer, PartialReplyBegin(::testing::_)).Times(0); EXPECT_CALL(*writer, PartialReplyEnd()).Times(0); EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0); EXPECT_CALL(*writer, IsDisconnected()).Times(0); auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser); ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID); - ASSERT_EQ(status.string(), "The file is not valid json - stream field is not a boolean"); + ASSERT_EQ(status.string(), "The file is not valid json - JSON body must be an object"); } TEST_F(HttpOpenAIHandlerTest, GraphWithANameDoesNotExist) { @@ -402,6 +370,220 @@ class HttpOpenAIHandlerParsingTest : public ::testing::Test { } }; +class HttpOpenAIHandlerCommonParsingValidationTest : public HttpOpenAIHandlerParsingTest, + public ::testing::WithParamInterface { +protected: + ovms::Endpoint endpoint() const { + return GetParam(); + } + + std::string createRequestWithRawStreamValue(const std::string& streamRawValue) const { + if (endpoint() == ovms::Endpoint::COMPLETIONS) { + return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"prompt\":\"valid prompt\"}"; + } + if (endpoint() == ovms::Endpoint::RESPONSES) { + return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"input\":\"valid prompt\"}"; + } + return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; + } + + std::string createRequestWithoutModel() 
const { + if (endpoint() == ovms::Endpoint::COMPLETIONS) { + return "{\"prompt\":\"valid prompt\"}"; + } + if (endpoint() == ovms::Endpoint::RESPONSES) { + return "{\"input\":\"valid prompt\"}"; + } + return "{\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; + } + + std::string createRequestWithNonStringModel() const { + if (endpoint() == ovms::Endpoint::COMPLETIONS) { + return "{\"model\":2,\"prompt\":\"valid prompt\"}"; + } + if (endpoint() == ovms::Endpoint::RESPONSES) { + return "{\"model\":2,\"input\":\"valid prompt\"}"; + } + return "{\"model\":2,\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; + } +}; + +TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, StreamFieldNotABooleanFails) { + std::string json = createRequestWithRawStreamValue("2"); + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); + + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("Stream is not bool")); +} + +TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, ModelFieldMissingFails) { + std::string json = createRequestWithoutModel(); + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); + + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model missing in request")); +} + +TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, ModelFieldNotStringFails) { + std::string json = createRequestWithNonStringModel(); + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); + + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model is not a string")); +} + +INSTANTIATE_TEST_SUITE_P( + CommonParsingValidation, + HttpOpenAIHandlerCommonParsingValidationTest, + ::testing::Values(ovms::Endpoint::CHAT_COMPLETIONS, ovms::Endpoint::COMPLETIONS, ovms::Endpoint::RESPONSES), + [](const testing::TestParamInfo& info) { + switch (info.param) { + case ovms::Endpoint::CHAT_COMPLETIONS: + return "ChatCompletions"; + case ovms::Endpoint::COMPLETIONS: + return "Completions"; + case ovms::Endpoint::RESPONSES: + return "Responses"; + default: + return "Unknown"; + } + }); + +class HttpOpenAIHandlerChatAndResponsesParsingTest : public HttpOpenAIHandlerParsingTest, + public ::testing::WithParamInterface { +protected: + ovms::Endpoint endpoint() const { + return GetParam(); + } + + std::string createTextRequest(const std::string& text, const std::string& extraJsonFields = "") const { + if (endpoint() == ovms::Endpoint::RESPONSES) { + return std::string("{\"model\":\"llama\",\"input\":\"") + text + "\"" + extraJsonFields + "}"; + } + return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"") + text + "\"}]" + extraJsonFields + "}"; + } + + std::string createMultimodalRequestWithImageUrl(const std::string& dataUrl) const { + if (endpoint() == 
ovms::Endpoint::RESPONSES) { + return std::string("{\"model\":\"llama\",\"input\":[{\"role\":\"user\",\"content\":[{\"type\":\"input_text\",\"text\":\"what is in this image?\"},{\"type\":\"input_image\",\"image_url\":\"") + dataUrl + "\"}]}] }"; + } + return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"what is in this image?\"},{\"type\":\"image_url\",\"image_url\":{\"url\":\"") + dataUrl + "\"}}]}]}"; + } + + std::string createToolRequest(const std::string& toolChoiceJson) const { + std::string base = createTextRequest("What is the weather like in Boston today?", ",\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_current_weather\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\"}},\"required\":[\"location\"]}}}]"); + if (toolChoiceJson.empty()) { + return base; + } + base.pop_back(); // remove trailing '}' + base += ",\"tool_choice\":" + toolChoiceJson + "}"; + return base; + } + + std::shared_ptr parseCurrentRequest(const std::string& json) { + doc.Parse(json.c_str()); + EXPECT_FALSE(doc.HasParseError()) << json; + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()) << json; + return apiHandler; + } +}; + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUserChatMessage) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + + auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 1); + ASSERT_TRUE(chatHistory[0].contains("role")); + ASSERT_TRUE(chatHistory[0].contains("content")); + EXPECT_EQ(chatHistory[0]["role"], "user"); + EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); + if (endpoint() == ovms::Endpoint::RESPONSES) { + EXPECT_NE(apiHandler->getProcessedJson().find("\"messages\""), std::string::npos); + } else { + EXPECT_TRUE(apiHandler->getProcessedJson().empty()); + } +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTokenLimitSetsMaxTokens) { + std::string tokenField = endpoint() == ovms::Endpoint::RESPONSES ? 
"max_output_tokens" : "max_completion_tokens"; + std::string json = createTextRequest("valid prompt", ",\"" + tokenField + "\":7"); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->getMaxTokens().has_value()); + EXPECT_EQ(apiHandler->getMaxTokens().value(), 7); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingFunctionToolsWithAutoChoiceSucceeds) { + std::string json = createToolRequest("\"auto\""); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "auto"); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceFunctionObjectSucceeds) { + std::string json = createToolRequest("{\"type\":\"function\",\"function\":{\"name\":\"get_current_weather\"}}"); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "get_current_weather"); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceNoneRemovesTools) { + std::string json = createToolRequest("\"none\""); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_FALSE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "none"); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingMultimodalInputImageSucceeds) { + const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; + std::string json = createMultimodalRequestWithImageUrl(base64Image); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_EQ(apiHandler->getImageHistory().size(), 1); +} + +INSTANTIATE_TEST_SUITE_P( + ChatAndResponses, + HttpOpenAIHandlerChatAndResponsesParsingTest, + ::testing::Values(ovms::Endpoint::CHAT_COMPLETIONS, ovms::Endpoint::RESPONSES), + [](const testing::TestParamInfo& info) { + switch (info.param) { + case ovms::Endpoint::CHAT_COMPLETIONS: + return "ChatCompletions"; + case ovms::Endpoint::RESPONSES: + return "Responses"; + default: + return "Unknown"; + } + }); + static std::vector createHermes3ToolCallTokens(ov::genai::Tokenizer& tokenizer) { std::string toolCall = R"({"name": "example_tool", "arguments": {"arg1": "value1", "arg2": 42}})"; auto generatedTensor = tokenizer.encode(toolCall, ov::genai::add_special_tokens(true)).input_ids; @@ -557,6 +739,118 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsF ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized; } +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsOutputText) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "max_output_tokens": 5 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::EncodedResults results; + ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + results.tokens = 
{std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1])}; + + std::string serialized = apiHandler->serializeUnaryResponse(results); + ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"output\":"), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"type\":\"output_text\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"text\":"), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesContainsRequiredEvents) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Phase 1: Init events emitted even with empty text (before tokenizer produces output) + std::string initChunk = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(initChunk.find("\"type\":\"response.created\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.in_progress\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << initChunk; + // No delta event when text is empty + ASSERT_EQ(initChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << initChunk; + + // Verify correct event ordering: created < in_progress < output_item.added < content_part.added + auto createdPos = initChunk.find("\"type\":\"response.created\""); + auto inProgressPos = initChunk.find("\"type\":\"response.in_progress\""); + auto outputItemAddedPos = initChunk.find("\"type\":\"response.output_item.added\""); + auto contentPartAddedPos = initChunk.find("\"type\":\"response.content_part.added\""); + ASSERT_LT(createdPos, inProgressPos) << "response.created must come before response.in_progress"; + ASSERT_LT(inProgressPos, outputItemAddedPos) << "response.in_progress must come before response.output_item.added"; + ASSERT_LT(outputItemAddedPos, contentPartAddedPos) << "response.output_item.added must come before response.content_part.added"; + + // Phase 2: Second call should only contain delta, no repeated init events + std::string secondChunk = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::NONE); + ASSERT_TRUE(secondChunk.empty()) << "Empty text after init should produce no output: " << secondChunk; + + // Phase 3: Text delta + std::string deltaChunk = apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(deltaChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << deltaChunk; + ASSERT_NE(deltaChunk.find("\"delta\":\"Hello\""), std::string::npos) << deltaChunk; + ASSERT_EQ(deltaChunk.find("\"type\":\"response.created\""), std::string::npos) << "No repeated init events: " << deltaChunk; + + // Phase 4: Final chunk with finish reason + std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP); + 
ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.content_part.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.output_item.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"text\":\"Hello world\""), std::string::npos) << finalChunk; + + // Verify correct event ordering in final chunk: delta < output_text.done < content_part.done < output_item.done < completed + auto deltaPos = finalChunk.find("\"type\":\"response.output_text.delta\""); + auto textDonePos = finalChunk.find("\"type\":\"response.output_text.done\""); + auto partDonePos = finalChunk.find("\"type\":\"response.content_part.done\""); + auto itemDonePos = finalChunk.find("\"type\":\"response.output_item.done\""); + auto completedPos = finalChunk.find("\"type\":\"response.completed\""); + ASSERT_LT(deltaPos, textDonePos) << "delta must come before output_text.done"; + ASSERT_LT(textDonePos, partDonePos) << "output_text.done must come before content_part.done"; + ASSERT_LT(partDonePos, itemDonePos) << "content_part.done must come before output_item.done"; + ASSERT_LT(itemDonePos, completedPos) << "output_item.done must come before response.completed"; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingUsageChunkForResponsesIsEmpty) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ASSERT_EQ(apiHandler->serializeStreamingUsageChunk(), ""); +} + TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { std::string json = R"({ "model": "llama", @@ -1318,6 +1612,434 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingRequestWithNullParametersCompletions } } +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxOutputTokensSetsLimit) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "max_output_tokens": 42 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_TRUE(apiHandler->getMaxTokens().has_value()); + EXPECT_EQ(apiHandler->getMaxTokens().value(), 42); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxCompletionTokensIsIgnored) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "max_completion_tokens": 50 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + 
ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + // max_completion_tokens should be ignored for RESPONSES endpoint, so maxTokens should not be 50 + EXPECT_FALSE(apiHandler->getMaxTokens().has_value() && apiHandler->getMaxTokens().value() == 50); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxTokensIsIgnored) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "max_tokens": 50 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + // max_tokens should be ignored for RESPONSES endpoint, so maxTokens should not be 50 + EXPECT_FALSE(apiHandler->getMaxTokens().has_value() && apiHandler->getMaxTokens().value() == 50); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesNStreamingIsRejected) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "stream": true, + "n": 2 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("n greater than 1 is not supported for responses streaming")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesNUnaryIsAccepted) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "best_of": 3, + "n": 2 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 100; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesFlatFunctionToolsSucceeds) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": "auto", + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["location", "unit"] + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "auto"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectSucceeds) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": { + "type": "function", + "name": "get_current_weather" + }, + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + }, + { + "type": "function", + "name": "unused_tool", + "parameters": { + "type": "object", + "properties": { + "arg": {"type": "string"} + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "get_current_weather"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunctionTools) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": "auto", + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::EncodedResults results; + ov::Tensor outputIds = tokenizer->encode("Sunny", ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + results.tokens = {std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1])}; + + std::string serialized = apiHandler->serializeUnaryResponse(results); + ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"tools\":[{"), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"type\":\"function\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"name\":\"get_current_weather\""), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, 
SerializeResponsesUnaryResponseContainsFunctionToolChoiceObject) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": { + "type": "function", + "name": "get_current_weather" + }, + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::EncodedResults results; + ov::Tensor outputIds = tokenizer->encode("Sunny", ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + results.tokens = {std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1])}; + + std::string serialized = apiHandler->serializeUnaryResponse(results); + ASSERT_NE(serialized.find("\"tool_choice\":{"), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"type\":\"function\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"name\":\"get_current_weather\""), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectMissingNameFails) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": { + "type": "function" + }, + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("tool_choice.name is not a valid string")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectNameNotStringFails) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": { + "type": "function", + "name": 7 + }, + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("tool_choice.name is not a valid string")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageUrlObjectSucceeds) { + std::string json = R"({ + "model": "llama", + "input": [ + { + "role": 
"user", + "content": [ + {"type": "input_text", "text": "what is in this image?"}, + {"type": "input_image", "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="}} + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_EQ(apiHandler->getImageHistory().size(), 1); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageWithoutImageUrlFails) { + std::string json = R"({ + "model": "llama", + "input": [ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "what is in this image?"}, + {"type": "input_image"} + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("input_image requires image_url field")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageUrlInvalidTypeFails) { + std::string json = R"({ + "model": "llama", + "input": [ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "what is in this image?"}, + {"type": "input_image", "image_url": 123} + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("input_image.image_url must be a string or object")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesUnsupportedToolTypeFails) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": "auto", + "tools": [ + { + "type": "web_search_preview" + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("Only function tools are supported")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceNoneRemovesTools) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": "none", + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, 
std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_FALSE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "none"); +} + // Provide get_weather2 but take none TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided1_ChoiceNone) { std::string providedTools = R"( diff --git a/src/test/llm/visual_language_model/complete_flow_test.cpp b/src/test/llm/visual_language_model/complete_flow_test.cpp index 4dc22d6fa3..5f2b380556 100644 --- a/src/test/llm/visual_language_model/complete_flow_test.cpp +++ b/src/test/llm/visual_language_model/complete_flow_test.cpp @@ -49,6 +49,7 @@ class VLMServableExecutionTest : public ::testing::Test { std::unordered_map headers{{"content-type", "application/json"}}; ovms::HttpRequestComponents comp; const std::string endpointChatCompletions = "/v3/chat/completions"; + const std::string endpointResponses = "/v3/responses"; std::shared_ptr writer; std::shared_ptr multiPartParser; std::string response; @@ -129,6 +130,50 @@ static std::string createRequestBody(const std::string& modelName, const std::ve return oss.str(); } +static std::string createResponsesRequestBody(const std::string& modelName, const std::vector>& fields, bool includeText = true, int numberOfImages = 1, const std::string contentOfTheFirstMessage = "What is in this image?") { + std::ostringstream oss; + oss << R"( + { + "model": ")" + << modelName << R"(", + "input": [ + { + "role": "user", + "content": [)"; + if (includeText) { + oss << R"( + { + "type": "input_text", + "text": ")"; + oss << contentOfTheFirstMessage; + oss << R"("})"; + if (numberOfImages > 0) { + oss << ","; + } + } + for (int i = 0; i < numberOfImages; i++) { + oss << R"( + { + "type": "input_image", + "image_url": "data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGIy+/oREAAA//8DiQIftNKCRwAAAABJRU5ErkJggg==" + })"; + if (i < numberOfImages - 1) { + oss << ","; + } + } + oss << R"( + ] + } + ] + )"; + for (const auto& field : fields) { + oss << R"(, ")" << field.first << R"(": )" << field.second << R"()" + << "\n"; + } + oss << "\n}"; + return oss.str(); +} + class VLMServableExecutionTestParameterized : public VLMServableExecutionTest, public ::testing::WithParamInterface {}; // Unary flow @@ -304,6 +349,152 @@ TEST_P(VLMServableExecutionTestParameterized, unaryBasicWithTools) { EXPECT_STREQ(parsedResponse["model"].GetString(), modelName.c_str()); } +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesWithImageInput) { + auto modelName = GetParam(); + std::vector> fields = { + {"max_output_tokens", "5"}, + {"temperature", "0.0"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("object")); + EXPECT_STREQ(parsedResponse["object"].GetString(), "response"); + ASSERT_TRUE(parsedResponse.HasMember("model")); + EXPECT_STREQ(parsedResponse["model"].GetString(), modelName.c_str()); + ASSERT_TRUE(parsedResponse.HasMember("output")); + 
ASSERT_TRUE(parsedResponse["output"].IsArray()); + ASSERT_GT(parsedResponse["output"].GetArray().Size(), 0); + ASSERT_TRUE(parsedResponse["output"][0].IsObject()); + ASSERT_TRUE(parsedResponse["output"][0].HasMember("type")); + EXPECT_STREQ(parsedResponse["output"][0]["type"].GetString(), "message"); + ASSERT_TRUE(parsedResponse["output"][0].HasMember("content")); + ASSERT_TRUE(parsedResponse["output"][0]["content"].IsArray()); + ASSERT_GT(parsedResponse["output"][0]["content"].GetArray().Size(), 0); + ASSERT_TRUE(parsedResponse["output"][0]["content"][0].HasMember("type")); + EXPECT_STREQ(parsedResponse["output"][0]["content"][0]["type"].GetString(), "output_text"); + + ASSERT_TRUE(parsedResponse.HasMember("usage")); + ASSERT_TRUE(parsedResponse["usage"].IsObject()); + ASSERT_TRUE(parsedResponse["usage"].HasMember("input_tokens")); + ASSERT_TRUE(parsedResponse["usage"].HasMember("output_tokens")); + ASSERT_TRUE(parsedResponse["usage"].HasMember("total_tokens")); +} + +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesOnlyImageInput) { + auto modelName = GetParam(); + std::vector> fields = { + {"max_output_tokens", "5"}, + {"temperature", "0.0"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields, false, 1); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("object")); + EXPECT_STREQ(parsedResponse["object"].GetString(), "response"); + ASSERT_TRUE(parsedResponse.HasMember("output")); + ASSERT_TRUE(parsedResponse["output"].IsArray()); + ASSERT_GT(parsedResponse["output"].GetArray().Size(), 0); +} + +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesWithTools) { + auto modelName = GetParam(); + std::vector> fields = { + {"max_output_tokens", "5"}, + {"temperature", "0.0"}, + {"tool_choice", R"("auto")"}, + {"tools", R"([ + { + "type": "function", + "name": "get_weather", + "description": "Get weather by city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string" + } + }, + "required": ["city"] + } + } + ])"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("object")); + EXPECT_STREQ(parsedResponse["object"].GetString(), "response"); + ASSERT_TRUE(parsedResponse.HasMember("tools")); + ASSERT_TRUE(parsedResponse["tools"].IsArray()); + ASSERT_GT(parsedResponse["tools"].GetArray().Size(), 0); + ASSERT_TRUE(parsedResponse.HasMember("tool_choice")); + ASSERT_TRUE(parsedResponse["tool_choice"].IsString()); + EXPECT_STREQ(parsedResponse["tool_choice"].GetString(), "auto"); +} + +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesWithFunctionToolChoiceObject) { + auto modelName = GetParam(); + std::vector> fields = { + 
{"max_output_tokens", "5"}, + {"temperature", "0.0"}, + {"tool_choice", R"({"type":"function","name":"get_weather"})"}, + {"tools", R"([ + { + "type": "function", + "name": "get_weather", + "description": "Get weather by city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string" + } + }, + "required": ["city"] + } + } + ])"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("tool_choice")); + ASSERT_TRUE(parsedResponse["tool_choice"].IsObject()); + ASSERT_TRUE(parsedResponse["tool_choice"].HasMember("type")); + EXPECT_STREQ(parsedResponse["tool_choice"]["type"].GetString(), "function"); + ASSERT_TRUE(parsedResponse["tool_choice"].HasMember("name")); + EXPECT_STREQ(parsedResponse["tool_choice"]["name"].GetString(), "get_weather"); +} + // Stream flow TEST_P(VLMServableExecutionTestParameterized, streamBasic) {