From a41d43d84d3eb595e8d7afe81030bf6299d10273 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 25 May 2026 15:49:02 +0200 Subject: [PATCH 1/5] Fix reporting output tokens in responses for legacy pipelines --- src/llm/language_model/legacy/servable.cpp | 5 +- .../visual_language_model/legacy/servable.cpp | 4 +- src/test/http_openai_handler_test.cpp | 101 ++++++++++++++++++ 3 files changed, 105 insertions(+), 5 deletions(-) diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp index 8e244df219..d8faa20b42 100644 --- a/src/llm/language_model/legacy/servable.cpp +++ b/src/llm/language_model/legacy/servable.cpp @@ -234,13 +234,12 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptrresults.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0]; + executionContext->apiHandler->setPromptTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_input_tokens()); + executionContext->apiHandler->setCompletionTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_generated_tokens()); std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason); if (!serializedChunk.empty()) { executionContext->response = wrapTextInServerSideEventMessage(serializedChunk); } - - executionContext->apiHandler->setPromptTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_input_tokens()); - executionContext->apiHandler->setCompletionTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_generated_tokens()); if (executionContext->apiHandler->getStreamOptions().includeUsage) executionContext->response += wrapTextInServerSideEventMessage(executionContext->apiHandler->serializeStreamingUsageChunk()); diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 033cb8641d..6e6fcadcd0 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -250,12 +250,12 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar } // Legacy generation path always runs with batch=1, so we read the single finish reason at index 0. ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0]; + executionContext->apiHandler->setPromptTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_input_tokens()); + executionContext->apiHandler->setCompletionTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_generated_tokens()); std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason); if (!serializedChunk.empty()) { executionContext->response = wrapTextInServerSideEventMessage(serializedChunk); } - executionContext->apiHandler->setPromptTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_input_tokens()); - executionContext->apiHandler->setCompletionTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_generated_tokens()); if (executionContext->apiHandler->getStreamOptions().includeUsage) executionContext->response += wrapTextInServerSideEventMessage(executionContext->apiHandler->serializeStreamingUsageChunk()); diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 46a31a3337..44321e1e6e 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -5652,3 +5652,104 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesStreamOptionsRejected) { EXPECT_EQ(status, absl::InvalidArgumentError("stream_options is not supported in Responses API.")); } + +TEST_F(HttpOpenAIHandlerParsingTest, streamingResponsesCompletedEventHasCorrectUsageWhenSetBeforeFinalChunk) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared( + doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + apiHandler->serializeStreamingCreatedEvent(); + apiHandler->serializeStreamingInProgressEvent(); + + // Simulate a mid-stream token delta + apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + + // Fixed order: set usage BEFORE the final serializeStreamingChunk call + apiHandler->setPromptTokensUsage(7); + apiHandler->setCompletionTokensUsage(3); + std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP); + + // The response.completed event must carry the correct usage values + ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"input_tokens\":7"), std::string::npos) + << "input_tokens must reflect value set before final chunk: " << finalChunk; + ASSERT_NE(finalChunk.find("\"output_tokens\":3"), std::string::npos) + << "output_tokens must reflect value set before final chunk: " << finalChunk; + ASSERT_NE(finalChunk.find("\"total_tokens\":10"), std::string::npos) + << "total_tokens must be input+output: " << finalChunk; +} + +TEST_F(HttpOpenAIHandlerParsingTest, streamingResponsesCompletedEventHasZeroUsageWhenSetAfterFinalChunk) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared( + doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + apiHandler->serializeStreamingCreatedEvent(); + apiHandler->serializeStreamingInProgressEvent(); + apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + + // Buggy order: final chunk first — response.completed is built with usage still at 0 + std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP); + apiHandler->setPromptTokensUsage(7); + apiHandler->setCompletionTokensUsage(3); + + // Confirm the bug: output_tokens in the completed event is 0 + ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"output_tokens\":0"), std::string::npos) + << "output_tokens must be 0 when usage is set after the final chunk (documents the pre-fix bug): " << finalChunk; +} + +TEST_F(HttpOpenAIHandlerParsingTest, streamingChatCompletionsUsageChunkCorrectRegardlessOfSetOrder) { + std::string json = R"({ + "model": "llama", + "stream": true, + "stream_options": {"include_usage": true}, + "messages": [{"role": "user", "content": "hi"}] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared( + doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + uint32_t maxTokensLimit = 100; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + + // For chat/completions the final chunk does NOT embed usage, so setting it after is fine + apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::STOP); + apiHandler->setPromptTokensUsage(7); + apiHandler->setCompletionTokensUsage(3); + + std::string usageChunk = apiHandler->serializeStreamingUsageChunk(); + ASSERT_NE(usageChunk.find("\"prompt_tokens\":7"), std::string::npos) + << "prompt_tokens must be correct in usage chunk: " << usageChunk; + ASSERT_NE(usageChunk.find("\"completion_tokens\":3"), std::string::npos) + << "completion_tokens must be correct in usage chunk: " << usageChunk; + ASSERT_NE(usageChunk.find("\"total_tokens\":10"), std::string::npos) + << "total_tokens must be correct in usage chunk: " << usageChunk; +} From 38f7358675b826160e2fadf9fc53315161c59f44 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Tue, 26 May 2026 11:07:54 +0200 Subject: [PATCH 2/5] fix --- src/test/http_openai_handler_test.cpp | 224 ++++++++++++++++---------- 1 file changed, 142 insertions(+), 82 deletions(-) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 44321e1e6e..48b34224f0 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -28,6 +28,9 @@ #include "../filesystem/filesystem.hpp" #include "../llm/apis/openai_completions.hpp" #include "../llm/apis/openai_responses.hpp" +#include "../llm/language_model/legacy/servable.hpp" +#include "../llm/visual_language_model/legacy/servable.hpp" +#include "../client_connection.hpp" #include #include "../module_names.hpp" #include "../servablemanagermodule.hpp" @@ -5653,103 +5656,160 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesStreamOptionsRejected) { absl::InvalidArgumentError("stream_options is not supported in Responses API.")); } -TEST_F(HttpOpenAIHandlerParsingTest, streamingResponsesCompletedEventHasCorrectUsageWhenSetBeforeFinalChunk) { - std::string json = R"({ - "model": "llama", - "input": "What is OpenVINO?", - "stream": true - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); +// Stub client that is never disconnected, used by the LM legacy servable tests below. +namespace { +struct NeverDisconnectedClient : public ovms::ClientConnection { + bool isDisconnected() const override { return false; } + void registerDisconnectionCallback(std::function) override {} +}; +} // namespace + +static std::shared_ptr makeLegacyResponsesContext( + const std::shared_ptr& tok, + size_t numInputTokens, size_t numGeneratedTokens, + ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP) { + auto ctx = std::make_shared(); + + ctx->payload.client = std::make_shared(); + ctx->payload.parsedJson = std::make_shared(); + ctx->payload.parsedJson->Parse(R"({"model":"llama","input":"test","stream":true})"); + ctx->endpoint = ovms::Endpoint::RESPONSES; auto apiHandler = std::make_shared( - doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES, + std::chrono::system_clock::now(), *tok); std::optional maxTokensLimit; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); + ctx->apiHandler = apiHandler; - apiHandler->serializeStreamingCreatedEvent(); - apiHandler->serializeStreamingInProgressEvent(); + ctx->results.finish_reasons.push_back(finishReason); + ctx->results.perf_metrics.num_input_tokens = numInputTokens; + ctx->results.perf_metrics.num_generated_tokens = numGeneratedTokens; + ctx->success = true; + // Signal that generation is done so preparePartialResponse goes straight to + // the "finish generation" branch without waiting. + ctx->readySignal.set_value(); - // Simulate a mid-stream token delta - apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + ctx->textStreamer = std::make_shared( + *tok, [](std::string) { return ov::genai::StreamingStatus::RUNNING; }); - // Fixed order: set usage BEFORE the final serializeStreamingChunk call - apiHandler->setPromptTokensUsage(7); - apiHandler->setCompletionTokensUsage(3); - std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP); - - // The response.completed event must carry the correct usage values - ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; - ASSERT_NE(finalChunk.find("\"input_tokens\":7"), std::string::npos) - << "input_tokens must reflect value set before final chunk: " << finalChunk; - ASSERT_NE(finalChunk.find("\"output_tokens\":3"), std::string::npos) - << "output_tokens must reflect value set before final chunk: " << finalChunk; - ASSERT_NE(finalChunk.find("\"total_tokens\":10"), std::string::npos) - << "total_tokens must be input+output: " << finalChunk; + return ctx; } -TEST_F(HttpOpenAIHandlerParsingTest, streamingResponsesCompletedEventHasZeroUsageWhenSetAfterFinalChunk) { - std::string json = R"({ - "model": "llama", - "input": "What is OpenVINO?", - "stream": true - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); +TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseResponsesEndpointHasCorrectUsageInCompletedEvent) { + auto ctx = makeLegacyResponsesContext(tokenizer, /*numInputTokens=*/10, /*numGeneratedTokens=*/5); + std::shared_ptr ctxBase = ctx; - auto apiHandler = std::make_shared( - doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); - std::optional maxTokensLimit; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + ovms::LegacyServable servable; + ASSERT_EQ(servable.preparePartialResponse(ctxBase), absl::OkStatus()); - apiHandler->serializeStreamingCreatedEvent(); - apiHandler->serializeStreamingInProgressEvent(); - apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + const std::string& response = ctxBase->response; + ASSERT_NE(response.find("\"type\":\"response.completed\""), std::string::npos) + << "response.completed event must be present: " << response; + ASSERT_NE(response.find("\"output_tokens\":5"), std::string::npos) + << "output_tokens must equal num_generated_tokens from perf_metrics: " << response; + ASSERT_NE(response.find("\"input_tokens\":10"), std::string::npos) + << "input_tokens must equal num_input_tokens from perf_metrics: " << response; + ASSERT_NE(response.find("\"total_tokens\":15"), std::string::npos) + << "total_tokens must be input+output: " << response; + ASSERT_FALSE(ctxBase->sendLoopbackSignal); +} - // Buggy order: final chunk first — response.completed is built with usage still at 0 - std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP); - apiHandler->setPromptTokensUsage(7); - apiHandler->setCompletionTokensUsage(3); +TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseResponsesEndpointHasCorrectUsageOnLength) { + auto ctx = makeLegacyResponsesContext(tokenizer, /*numInputTokens=*/8, /*numGeneratedTokens=*/3, + ov::genai::GenerationFinishReason::LENGTH); + std::shared_ptr ctxBase = ctx; - // Confirm the bug: output_tokens in the completed event is 0 - ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; - ASSERT_NE(finalChunk.find("\"output_tokens\":0"), std::string::npos) - << "output_tokens must be 0 when usage is set after the final chunk (documents the pre-fix bug): " << finalChunk; + ovms::LegacyServable servable; + ASSERT_EQ(servable.preparePartialResponse(ctxBase), absl::OkStatus()); + + const std::string& response = ctxBase->response; + ASSERT_NE(response.find("\"type\":\"response.incomplete\""), std::string::npos) + << "response.incomplete event must be present for LENGTH finish reason: " << response; + ASSERT_NE(response.find("\"output_tokens\":3"), std::string::npos) + << "output_tokens must equal num_generated_tokens from perf_metrics: " << response; + ASSERT_NE(response.find("\"input_tokens\":8"), std::string::npos) + << "input_tokens must equal num_input_tokens from perf_metrics: " << response; } -TEST_F(HttpOpenAIHandlerParsingTest, streamingChatCompletionsUsageChunkCorrectRegardlessOfSetOrder) { - std::string json = R"({ - "model": "llama", - "stream": true, - "stream_options": {"include_usage": true}, - "messages": [{"role": "user", "content": "hi"}] - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); +TEST_F(HttpOpenAIHandlerParsingTest, vlmLegacyServablePreparePartialResponseResponsesEndpointHasCorrectUsageInCompletedEvent) { + auto ctx = std::make_shared(); - auto apiHandler = std::make_shared( - doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - uint32_t maxTokensLimit = 100; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + ctx->payload.client = std::make_shared(); + ctx->payload.parsedJson = std::make_shared(); + ctx->payload.parsedJson->Parse(R"({"model":"llama","input":"test","stream":true})"); + ctx->endpoint = ovms::Endpoint::RESPONSES; - apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + auto apiHandler = std::make_shared( + *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES, + std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); + ctx->apiHandler = apiHandler; + + ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP); + ctx->results.perf_metrics.num_input_tokens = 12; + ctx->results.perf_metrics.num_generated_tokens = 6; + ctx->success = true; + ctx->readySignal.set_value(); + ctx->textStreamer = std::make_shared( + *tokenizer, [](std::string) { return ov::genai::StreamingStatus::RUNNING; }); + + ovms::VisualLanguageModelLegacyServable servable; + std::shared_ptr ctxBase = ctx; + ASSERT_EQ(servable.preparePartialResponse(ctxBase), absl::OkStatus()); + + const std::string& response = ctxBase->response; + ASSERT_NE(response.find("\"type\":\"response.completed\""), std::string::npos) + << "response.completed event must be present: " << response; + ASSERT_NE(response.find("\"output_tokens\":6"), std::string::npos) + << "output_tokens must equal num_generated_tokens from perf_metrics: " << response; + ASSERT_NE(response.find("\"input_tokens\":12"), std::string::npos) + << "input_tokens must equal num_input_tokens from perf_metrics: " << response; + ASSERT_NE(response.find("\"total_tokens\":18"), std::string::npos) + << "total_tokens must be input+output: " << response; + ASSERT_FALSE(ctxBase->sendLoopbackSignal); +} + +TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseChatCompletionsStreamingHasCorrectUsageChunk) { + auto ctx = std::make_shared(); + + ctx->payload.client = std::make_shared(); + ctx->payload.parsedJson = std::make_shared(); + ctx->payload.parsedJson->Parse( + R"({"model":"llama","stream":true,"stream_options":{"include_usage":true},"messages":[{"role":"user","content":"hi"}]})"); + ctx->endpoint = ovms::Endpoint::CHAT_COMPLETIONS; - // For chat/completions the final chunk does NOT embed usage, so setting it after is fine - apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::STOP); - apiHandler->setPromptTokensUsage(7); - apiHandler->setCompletionTokensUsage(3); - - std::string usageChunk = apiHandler->serializeStreamingUsageChunk(); - ASSERT_NE(usageChunk.find("\"prompt_tokens\":7"), std::string::npos) - << "prompt_tokens must be correct in usage chunk: " << usageChunk; - ASSERT_NE(usageChunk.find("\"completion_tokens\":3"), std::string::npos) - << "completion_tokens must be correct in usage chunk: " << usageChunk; - ASSERT_NE(usageChunk.find("\"total_tokens\":10"), std::string::npos) - << "total_tokens must be correct in usage chunk: " << usageChunk; + auto apiHandler = std::make_shared( + *ctx->payload.parsedJson, ovms::Endpoint::CHAT_COMPLETIONS, + std::chrono::system_clock::now(), *tokenizer); + uint32_t maxTokensLimit = 100; + apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); + ctx->apiHandler = apiHandler; + + ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP); + ctx->results.perf_metrics.num_input_tokens = 10; + ctx->results.perf_metrics.num_generated_tokens = 5; + ctx->success = true; + ctx->readySignal.set_value(); + ctx->textStreamer = std::make_shared( + *tokenizer, [](std::string) { return ov::genai::StreamingStatus::RUNNING; }); + + ovms::LegacyServable servable; + std::shared_ptr ctxBase = ctx; + ASSERT_EQ(servable.preparePartialResponse(ctxBase), absl::OkStatus()); + + // For chat_completions, usage is in the separate SSE usage chunk (not in the + // final delta chunk), so it should be present even though set*Usage was called + // before serializeStreamingChunk in the fixed code. + const std::string& response = ctxBase->response; + ASSERT_NE(response.find("\"completion_tokens\":5"), std::string::npos) + << "completion_tokens must be in usage chunk: " << response; + ASSERT_NE(response.find("\"prompt_tokens\":10"), std::string::npos) + << "prompt_tokens must be in usage chunk: " << response; + ASSERT_NE(response.find("\"total_tokens\":15"), std::string::npos) + << "total_tokens must be in usage chunk: " << response; + ASSERT_NE(response.find("[DONE]"), std::string::npos) + << "[DONE] must be present: " << response; + ASSERT_FALSE(ctxBase->sendLoopbackSignal); } From 04444d74afe30a08fc6b4d5e4a7295ba87715fa3 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 10 Jun 2026 15:58:08 +0200 Subject: [PATCH 3/5] fix --- src/test/http_openai_handler_test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 48b34224f0..e6b8cbe344 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -5679,7 +5679,7 @@ static std::shared_ptr makeLegacyResponses *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tok); std::optional maxTokensLimit; - apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); + static_cast(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt)); ctx->apiHandler = apiHandler; ctx->results.finish_reasons.push_back(finishReason); @@ -5744,7 +5744,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, vlmLegacyServablePreparePartialResponseResp *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); std::optional maxTokensLimit; - apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); + static_cast(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt)); ctx->apiHandler = apiHandler; ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP); @@ -5784,7 +5784,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseChatCom *ctx->payload.parsedJson, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); uint32_t maxTokensLimit = 100; - apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); + static_cast(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt)); ctx->apiHandler = apiHandler; ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP); From 07334e05cf2981a1b9221212c0c551f7af8df312 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 10 Jun 2026 16:00:38 +0200 Subject: [PATCH 4/5] fix --- src/test/http_openai_handler_test.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index e6b8cbe344..87daa7daa3 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -5679,7 +5679,8 @@ static std::shared_ptr makeLegacyResponses *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tok); std::optional maxTokensLimit; - static_cast(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt)); + const absl::Status parseStatus = apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); + ASSERT_TRUE(parseStatus.ok()) << parseStatus; ctx->apiHandler = apiHandler; ctx->results.finish_reasons.push_back(finishReason); @@ -5744,7 +5745,8 @@ TEST_F(HttpOpenAIHandlerParsingTest, vlmLegacyServablePreparePartialResponseResp *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); std::optional maxTokensLimit; - static_cast(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt)); + const absl::Status parseStatus = apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); + ASSERT_TRUE(parseStatus.ok()) << parseStatus; ctx->apiHandler = apiHandler; ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP); @@ -5784,7 +5786,8 @@ TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseChatCom *ctx->payload.parsedJson, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); uint32_t maxTokensLimit = 100; - static_cast(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt)); + const absl::Status parseStatus = apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); + ASSERT_TRUE(parseStatus.ok()) << parseStatus; ctx->apiHandler = apiHandler; ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP); From a8c5c90b1a683f724cf3a385675d6a65f729aae5 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 10 Jun 2026 16:44:29 +0200 Subject: [PATCH 5/5] fix --- src/test/http_openai_handler_test.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 87daa7daa3..3e36b68a0b 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -5680,7 +5680,10 @@ static std::shared_ptr makeLegacyResponses std::chrono::system_clock::now(), *tok); std::optional maxTokensLimit; const absl::Status parseStatus = apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt); - ASSERT_TRUE(parseStatus.ok()) << parseStatus; + EXPECT_TRUE(parseStatus.ok()) << parseStatus; + if (!parseStatus.ok()) { + return nullptr; + } ctx->apiHandler = apiHandler; ctx->results.finish_reasons.push_back(finishReason);