From a41d43d84d3eb595e8d7afe81030bf6299d10273 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski <michal.kulakowski@intel.com>
Date: Mon, 25 May 2026 15:49:02 +0200
Subject: [PATCH 1/5] Fix reporting output tokens in responses for legacy
 pipelines

---
 src/llm/language_model/legacy/servable.cpp    |   5 +-
 .../visual_language_model/legacy/servable.cpp |   4 +-
 src/test/http_openai_handler_test.cpp         | 101 ++++++++++++++++++
 3 files changed, 105 insertions(+), 5 deletions(-)

diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp
index 8e244df219..d8faa20b42 100644
--- a/src/llm/language_model/legacy/servable.cpp
+++ b/src/llm/language_model/legacy/servable.cpp
@@ -234,13 +234,12 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr<GenAiServabl
         }
         // Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
         ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0];
+        executionContext->apiHandler->setPromptTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_input_tokens());
+        executionContext->apiHandler->setCompletionTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_generated_tokens());
         std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
         if (!serializedChunk.empty()) {
             executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
         }
-
-        executionContext->apiHandler->setPromptTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_input_tokens());
-        executionContext->apiHandler->setCompletionTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_generated_tokens());
         if (executionContext->apiHandler->getStreamOptions().includeUsage)
             executionContext->response += wrapTextInServerSideEventMessage(executionContext->apiHandler->serializeStreamingUsageChunk());
 
diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp
index 033cb8641d..6e6fcadcd0 100644
--- a/src/llm/visual_language_model/legacy/servable.cpp
+++ b/src/llm/visual_language_model/legacy/servable.cpp
@@ -250,12 +250,12 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
         }
         // Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
         ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0];
+        executionContext->apiHandler->setPromptTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_input_tokens());
+        executionContext->apiHandler->setCompletionTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_generated_tokens());
         std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
         if (!serializedChunk.empty()) {
             executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
         }
-        executionContext->apiHandler->setPromptTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_input_tokens());
-        executionContext->apiHandler->setCompletionTokensUsage(legacyExecutionContext->results.perf_metrics.get_num_generated_tokens());
         if (executionContext->apiHandler->getStreamOptions().includeUsage)
             executionContext->response += wrapTextInServerSideEventMessage(executionContext->apiHandler->serializeStreamingUsageChunk());
 
diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 46a31a3337..44321e1e6e 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -5652,3 +5652,104 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesStreamOptionsRejected) {
     EXPECT_EQ(status,
         absl::InvalidArgumentError("stream_options is not supported in Responses API."));
 }
+
+TEST_F(HttpOpenAIHandlerParsingTest, streamingResponsesCompletedEventHasCorrectUsageWhenSetBeforeFinalChunk) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "stream": true
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(
+        doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    apiHandler->serializeStreamingCreatedEvent();
+    apiHandler->serializeStreamingInProgressEvent();
+
+    // Simulate a mid-stream token delta
+    apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE);
+
+    // Fixed order: set usage BEFORE the final serializeStreamingChunk call
+    apiHandler->setPromptTokensUsage(7);
+    apiHandler->setCompletionTokensUsage(3);
+    std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP);
+
+    // The response.completed event must carry the correct usage values
+    ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk;
+    ASSERT_NE(finalChunk.find("\"input_tokens\":7"), std::string::npos)
+        << "input_tokens must reflect value set before final chunk: " << finalChunk;
+    ASSERT_NE(finalChunk.find("\"output_tokens\":3"), std::string::npos)
+        << "output_tokens must reflect value set before final chunk: " << finalChunk;
+    ASSERT_NE(finalChunk.find("\"total_tokens\":10"), std::string::npos)
+        << "total_tokens must be input+output: " << finalChunk;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, streamingResponsesCompletedEventHasZeroUsageWhenSetAfterFinalChunk) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "stream": true
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(
+        doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    apiHandler->serializeStreamingCreatedEvent();
+    apiHandler->serializeStreamingInProgressEvent();
+    apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE);
+
+    // Buggy order: final chunk first — response.completed is built with usage still at 0
+    std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP);
+    apiHandler->setPromptTokensUsage(7);
+    apiHandler->setCompletionTokensUsage(3);
+
+    // Confirm the bug: output_tokens in the completed event is 0
+    ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk;
+    ASSERT_NE(finalChunk.find("\"output_tokens\":0"), std::string::npos)
+        << "output_tokens must be 0 when usage is set after the final chunk (documents the pre-fix bug): " << finalChunk;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, streamingChatCompletionsUsageChunkCorrectRegardlessOfSetOrder) {
+    std::string json = R"({
+        "model": "llama",
+        "stream": true,
+        "stream_options": {"include_usage": true},
+        "messages": [{"role": "user", "content": "hi"}]
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(
+        doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
+    uint32_t maxTokensLimit = 100;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE);
+
+    // For chat/completions the final chunk does NOT embed usage, so setting it after is fine
+    apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::STOP);
+    apiHandler->setPromptTokensUsage(7);
+    apiHandler->setCompletionTokensUsage(3);
+
+    std::string usageChunk = apiHandler->serializeStreamingUsageChunk();
+    ASSERT_NE(usageChunk.find("\"prompt_tokens\":7"), std::string::npos)
+        << "prompt_tokens must be correct in usage chunk: " << usageChunk;
+    ASSERT_NE(usageChunk.find("\"completion_tokens\":3"), std::string::npos)
+        << "completion_tokens must be correct in usage chunk: " << usageChunk;
+    ASSERT_NE(usageChunk.find("\"total_tokens\":10"), std::string::npos)
+        << "total_tokens must be correct in usage chunk: " << usageChunk;
+}

From 38f7358675b826160e2fadf9fc53315161c59f44 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski <michal.kulakowski@intel.com>
Date: Tue, 26 May 2026 11:07:54 +0200
Subject: [PATCH 2/5] Exercise preparePartialResponse in usage-reporting tests

---
 src/test/http_openai_handler_test.cpp | 224 ++++++++++++++++----------
 1 file changed, 142 insertions(+), 82 deletions(-)

diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 44321e1e6e..48b34224f0 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -28,6 +28,9 @@
 #include "../filesystem/filesystem.hpp"
 #include "../llm/apis/openai_completions.hpp"
 #include "../llm/apis/openai_responses.hpp"
+#include "../llm/language_model/legacy/servable.hpp"
+#include "../llm/visual_language_model/legacy/servable.hpp"
+#include "../client_connection.hpp"
 #include <openvino/genai/visual_language/pipeline.hpp>
 #include "../module_names.hpp"
 #include "../servablemanagermodule.hpp"
@@ -5653,103 +5656,160 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesStreamOptionsRejected) {
         absl::InvalidArgumentError("stream_options is not supported in Responses API."));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, streamingResponsesCompletedEventHasCorrectUsageWhenSetBeforeFinalChunk) {
-    std::string json = R"({
-        "model": "llama",
-        "input": "What is OpenVINO?",
-        "stream": true
-    })";
-    doc.Parse(json.c_str());
-    ASSERT_FALSE(doc.HasParseError());
+// Stub client that is never disconnected, used by the legacy servable tests below.
+namespace {
+struct NeverDisconnectedClient : public ovms::ClientConnection {
+    bool isDisconnected() const override { return false; }
+    void registerDisconnectionCallback(std::function<void()>) override {}
+};
+}  // namespace
+
+static std::shared_ptr<ovms::LegacyServableExecutionContext> makeLegacyResponsesContext(
+    const std::shared_ptr<ov::genai::Tokenizer>& tok,
+    size_t numInputTokens, size_t numGeneratedTokens,
+    ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP) {
+    auto ctx = std::make_shared<ovms::LegacyServableExecutionContext>();
+
+    ctx->payload.client = std::make_shared<NeverDisconnectedClient>();
+    ctx->payload.parsedJson = std::make_shared<rapidjson::Document>();
+    ctx->payload.parsedJson->Parse(R"({"model":"llama","input":"test","stream":true})");
+    ctx->endpoint = ovms::Endpoint::RESPONSES;
 
     auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(
-        doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+        *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES,
+        std::chrono::system_clock::now(), *tok);
     std::optional<uint32_t> maxTokensLimit;
-    uint32_t bestOfLimit = 0;
-    std::optional<uint32_t> maxModelLength;
-    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
+    ctx->apiHandler = apiHandler;
 
-    apiHandler->serializeStreamingCreatedEvent();
-    apiHandler->serializeStreamingInProgressEvent();
+    ctx->results.finish_reasons.push_back(finishReason);
+    ctx->results.perf_metrics.num_input_tokens = numInputTokens;
+    ctx->results.perf_metrics.num_generated_tokens = numGeneratedTokens;
+    ctx->success = true;
+    // Signal that generation is done so preparePartialResponse goes straight to
+    // the "finish generation" branch without waiting.
+    ctx->readySignal.set_value();
 
-    // Simulate a mid-stream token delta
-    apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE);
+    ctx->textStreamer = std::make_shared<ov::genai::TextStreamer>(
+        *tok, [](std::string) { return ov::genai::StreamingStatus::RUNNING; });
 
-    // Fixed order: set usage BEFORE the final serializeStreamingChunk call
-    apiHandler->setPromptTokensUsage(7);
-    apiHandler->setCompletionTokensUsage(3);
-    std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP);
-
-    // The response.completed event must carry the correct usage values
-    ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk;
-    ASSERT_NE(finalChunk.find("\"input_tokens\":7"), std::string::npos)
-        << "input_tokens must reflect value set before final chunk: " << finalChunk;
-    ASSERT_NE(finalChunk.find("\"output_tokens\":3"), std::string::npos)
-        << "output_tokens must reflect value set before final chunk: " << finalChunk;
-    ASSERT_NE(finalChunk.find("\"total_tokens\":10"), std::string::npos)
-        << "total_tokens must be input+output: " << finalChunk;
+    return ctx;
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, streamingResponsesCompletedEventHasZeroUsageWhenSetAfterFinalChunk) {
-    std::string json = R"({
-        "model": "llama",
-        "input": "What is OpenVINO?",
-        "stream": true
-    })";
-    doc.Parse(json.c_str());
-    ASSERT_FALSE(doc.HasParseError());
+TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseResponsesEndpointHasCorrectUsageInCompletedEvent) {
+    auto ctx = makeLegacyResponsesContext(tokenizer, /*numInputTokens=*/10, /*numGeneratedTokens=*/5);
+    std::shared_ptr<ovms::GenAiServableExecutionContext> ctxBase = ctx;
 
-    auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(
-        doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
-    std::optional<uint32_t> maxTokensLimit;
-    uint32_t bestOfLimit = 0;
-    std::optional<uint32_t> maxModelLength;
-    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    ovms::LegacyServable servable;
+    ASSERT_EQ(servable.preparePartialResponse(ctxBase), absl::OkStatus());
 
-    apiHandler->serializeStreamingCreatedEvent();
-    apiHandler->serializeStreamingInProgressEvent();
-    apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE);
+    const std::string& response = ctxBase->response;
+    ASSERT_NE(response.find("\"type\":\"response.completed\""), std::string::npos)
+        << "response.completed event must be present: " << response;
+    ASSERT_NE(response.find("\"output_tokens\":5"), std::string::npos)
+        << "output_tokens must equal num_generated_tokens from perf_metrics: " << response;
+    ASSERT_NE(response.find("\"input_tokens\":10"), std::string::npos)
+        << "input_tokens must equal num_input_tokens from perf_metrics: " << response;
+    ASSERT_NE(response.find("\"total_tokens\":15"), std::string::npos)
+        << "total_tokens must be input+output: " << response;
+    ASSERT_FALSE(ctxBase->sendLoopbackSignal);
+}
 
-    // Buggy order: final chunk first — response.completed is built with usage still at 0
-    std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP);
-    apiHandler->setPromptTokensUsage(7);
-    apiHandler->setCompletionTokensUsage(3);
+TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseResponsesEndpointHasCorrectUsageOnLength) {
+    auto ctx = makeLegacyResponsesContext(tokenizer, /*numInputTokens=*/8, /*numGeneratedTokens=*/3,
+        ov::genai::GenerationFinishReason::LENGTH);
+    std::shared_ptr<ovms::GenAiServableExecutionContext> ctxBase = ctx;
 
-    // Confirm the bug: output_tokens in the completed event is 0
-    ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk;
-    ASSERT_NE(finalChunk.find("\"output_tokens\":0"), std::string::npos)
-        << "output_tokens must be 0 when usage is set after the final chunk (documents the pre-fix bug): " << finalChunk;
+    ovms::LegacyServable servable;
+    ASSERT_EQ(servable.preparePartialResponse(ctxBase), absl::OkStatus());
+
+    const std::string& response = ctxBase->response;
+    ASSERT_NE(response.find("\"type\":\"response.incomplete\""), std::string::npos)
+        << "response.incomplete event must be present for LENGTH finish reason: " << response;
+    ASSERT_NE(response.find("\"output_tokens\":3"), std::string::npos)
+        << "output_tokens must equal num_generated_tokens from perf_metrics: " << response;
+    ASSERT_NE(response.find("\"input_tokens\":8"), std::string::npos)
+        << "input_tokens must equal num_input_tokens from perf_metrics: " << response;
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, streamingChatCompletionsUsageChunkCorrectRegardlessOfSetOrder) {
-    std::string json = R"({
-        "model": "llama",
-        "stream": true,
-        "stream_options": {"include_usage": true},
-        "messages": [{"role": "user", "content": "hi"}]
-    })";
-    doc.Parse(json.c_str());
-    ASSERT_FALSE(doc.HasParseError());
+TEST_F(HttpOpenAIHandlerParsingTest, vlmLegacyServablePreparePartialResponseResponsesEndpointHasCorrectUsageInCompletedEvent) {
+    auto ctx = std::make_shared<ovms::VisualLanguageModelLegacyServableExecutionContext>();
 
-    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(
-        doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    uint32_t maxTokensLimit = 100;
-    uint32_t bestOfLimit = 0;
-    std::optional<uint32_t> maxModelLength;
-    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    ctx->payload.client = std::make_shared<NeverDisconnectedClient>();
+    ctx->payload.parsedJson = std::make_shared<rapidjson::Document>();
+    ctx->payload.parsedJson->Parse(R"({"model":"llama","input":"test","stream":true})");
+    ctx->endpoint = ovms::Endpoint::RESPONSES;
 
-    apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE);
+    auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(
+        *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES,
+        std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
+    ctx->apiHandler = apiHandler;
+
+    ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP);
+    ctx->results.perf_metrics.num_input_tokens = 12;
+    ctx->results.perf_metrics.num_generated_tokens = 6;
+    ctx->success = true;
+    ctx->readySignal.set_value();
+    ctx->textStreamer = std::make_shared<ov::genai::TextStreamer>(
+        *tokenizer, [](std::string) { return ov::genai::StreamingStatus::RUNNING; });
+
+    ovms::VisualLanguageModelLegacyServable servable;
+    std::shared_ptr<ovms::GenAiServableExecutionContext> ctxBase = ctx;
+    ASSERT_EQ(servable.preparePartialResponse(ctxBase), absl::OkStatus());
+
+    const std::string& response = ctxBase->response;
+    ASSERT_NE(response.find("\"type\":\"response.completed\""), std::string::npos)
+        << "response.completed event must be present: " << response;
+    ASSERT_NE(response.find("\"output_tokens\":6"), std::string::npos)
+        << "output_tokens must equal num_generated_tokens from perf_metrics: " << response;
+    ASSERT_NE(response.find("\"input_tokens\":12"), std::string::npos)
+        << "input_tokens must equal num_input_tokens from perf_metrics: " << response;
+    ASSERT_NE(response.find("\"total_tokens\":18"), std::string::npos)
+        << "total_tokens must be input+output: " << response;
+    ASSERT_FALSE(ctxBase->sendLoopbackSignal);
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseChatCompletionsStreamingHasCorrectUsageChunk) {
+    auto ctx = std::make_shared<ovms::LegacyServableExecutionContext>();
+
+    ctx->payload.client = std::make_shared<NeverDisconnectedClient>();
+    ctx->payload.parsedJson = std::make_shared<rapidjson::Document>();
+    ctx->payload.parsedJson->Parse(
+        R"({"model":"llama","stream":true,"stream_options":{"include_usage":true},"messages":[{"role":"user","content":"hi"}]})");
+    ctx->endpoint = ovms::Endpoint::CHAT_COMPLETIONS;
 
-    // For chat/completions the final chunk does NOT embed usage, so setting it after is fine
-    apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::STOP);
-    apiHandler->setPromptTokensUsage(7);
-    apiHandler->setCompletionTokensUsage(3);
-
-    std::string usageChunk = apiHandler->serializeStreamingUsageChunk();
-    ASSERT_NE(usageChunk.find("\"prompt_tokens\":7"), std::string::npos)
-        << "prompt_tokens must be correct in usage chunk: " << usageChunk;
-    ASSERT_NE(usageChunk.find("\"completion_tokens\":3"), std::string::npos)
-        << "completion_tokens must be correct in usage chunk: " << usageChunk;
-    ASSERT_NE(usageChunk.find("\"total_tokens\":10"), std::string::npos)
-        << "total_tokens must be correct in usage chunk: " << usageChunk;
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(
+        *ctx->payload.parsedJson, ovms::Endpoint::CHAT_COMPLETIONS,
+        std::chrono::system_clock::now(), *tokenizer);
+    uint32_t maxTokensLimit = 100;
+    apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
+    ctx->apiHandler = apiHandler;
+
+    ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP);
+    ctx->results.perf_metrics.num_input_tokens = 10;
+    ctx->results.perf_metrics.num_generated_tokens = 5;
+    ctx->success = true;
+    ctx->readySignal.set_value();
+    ctx->textStreamer = std::make_shared<ov::genai::TextStreamer>(
+        *tokenizer, [](std::string) { return ov::genai::StreamingStatus::RUNNING; });
+
+    ovms::LegacyServable servable;
+    std::shared_ptr<ovms::GenAiServableExecutionContext> ctxBase = ctx;
+    ASSERT_EQ(servable.preparePartialResponse(ctxBase), absl::OkStatus());
+
+    // For chat_completions, usage is reported in the separate SSE usage chunk (not in
+    // the final delta chunk), so it must still be correct now that set*Usage is called
+    // before serializeStreamingChunk in the fixed code.
+    const std::string& response = ctxBase->response;
+    ASSERT_NE(response.find("\"completion_tokens\":5"), std::string::npos)
+        << "completion_tokens must be in usage chunk: " << response;
+    ASSERT_NE(response.find("\"prompt_tokens\":10"), std::string::npos)
+        << "prompt_tokens must be in usage chunk: " << response;
+    ASSERT_NE(response.find("\"total_tokens\":15"), std::string::npos)
+        << "total_tokens must be in usage chunk: " << response;
+    ASSERT_NE(response.find("[DONE]"), std::string::npos)
+        << "[DONE] must be present: " << response;
+    ASSERT_FALSE(ctxBase->sendLoopbackSignal);
 }

From 04444d74afe30a08fc6b4d5e4a7295ba87715fa3 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski <michal.kulakowski@intel.com>
Date: Wed, 10 Jun 2026 15:58:08 +0200
Subject: [PATCH 3/5] Explicitly discard parseRequest status in tests

---
 src/test/http_openai_handler_test.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 48b34224f0..e6b8cbe344 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -5679,7 +5679,7 @@ static std::shared_ptr<ovms::LegacyServableExecutionContext> makeLegacyResponses
         *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES,
         std::chrono::system_clock::now(), *tok);
     std::optional<uint32_t> maxTokensLimit;
-    apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
+    static_cast<void>(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt));
     ctx->apiHandler = apiHandler;
 
     ctx->results.finish_reasons.push_back(finishReason);
@@ -5744,7 +5744,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, vlmLegacyServablePreparePartialResponseResp
         *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES,
         std::chrono::system_clock::now(), *tokenizer);
     std::optional<uint32_t> maxTokensLimit;
-    apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
+    static_cast<void>(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt));
     ctx->apiHandler = apiHandler;
 
     ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP);
@@ -5784,7 +5784,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseChatCom
         *ctx->payload.parsedJson, ovms::Endpoint::CHAT_COMPLETIONS,
         std::chrono::system_clock::now(), *tokenizer);
     uint32_t maxTokensLimit = 100;
-    apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
+    static_cast<void>(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt));
     ctx->apiHandler = apiHandler;
 
     ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP);

From 07334e05cf2981a1b9221212c0c551f7af8df312 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski <michal.kulakowski@intel.com>
Date: Wed, 10 Jun 2026 16:00:38 +0200
Subject: [PATCH 4/5] Assert parseRequest succeeds in legacy servable tests

---
 src/test/http_openai_handler_test.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index e6b8cbe344..87daa7daa3 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -5679,7 +5679,8 @@ static std::shared_ptr<ovms::LegacyServableExecutionContext> makeLegacyResponses
         *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES,
         std::chrono::system_clock::now(), *tok);
     std::optional<uint32_t> maxTokensLimit;
-    static_cast<void>(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt));
+    const absl::Status parseStatus = apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
+    ASSERT_TRUE(parseStatus.ok()) << parseStatus;
     ctx->apiHandler = apiHandler;
 
     ctx->results.finish_reasons.push_back(finishReason);
@@ -5744,7 +5745,8 @@ TEST_F(HttpOpenAIHandlerParsingTest, vlmLegacyServablePreparePartialResponseResp
         *ctx->payload.parsedJson, ovms::Endpoint::RESPONSES,
         std::chrono::system_clock::now(), *tokenizer);
     std::optional<uint32_t> maxTokensLimit;
-    static_cast<void>(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt));
+    const absl::Status parseStatus = apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
+    ASSERT_TRUE(parseStatus.ok()) << parseStatus;
     ctx->apiHandler = apiHandler;
 
     ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP);
@@ -5784,7 +5786,8 @@ TEST_F(HttpOpenAIHandlerParsingTest, legacyServablePreparePartialResponseChatCom
         *ctx->payload.parsedJson, ovms::Endpoint::CHAT_COMPLETIONS,
         std::chrono::system_clock::now(), *tokenizer);
     uint32_t maxTokensLimit = 100;
-    static_cast<void>(apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt));
+    const absl::Status parseStatus = apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
+    ASSERT_TRUE(parseStatus.ok()) << parseStatus;
     ctx->apiHandler = apiHandler;
 
     ctx->results.finish_reasons.push_back(ov::genai::GenerationFinishReason::STOP);

From a8c5c90b1a683f724cf3a385675d6a65f729aae5 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski <michal.kulakowski@intel.com>
Date: Wed, 10 Jun 2026 16:44:29 +0200
Subject: [PATCH 5/5] Use EXPECT_TRUE in non-void test helper

---
 src/test/http_openai_handler_test.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 87daa7daa3..3e36b68a0b 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -5680,7 +5680,10 @@ static std::shared_ptr<ovms::LegacyServableExecutionContext> makeLegacyResponses
         std::chrono::system_clock::now(), *tok);
     std::optional<uint32_t> maxTokensLimit;
     const absl::Status parseStatus = apiHandler->parseRequest(maxTokensLimit, 0, std::nullopt);
-    ASSERT_TRUE(parseStatus.ok()) << parseStatus;
+    EXPECT_TRUE(parseStatus.ok()) << parseStatus;
+    if (!parseStatus.ok()) {
+        return nullptr;
+    }
     ctx->apiHandler = apiHandler;
 
     ctx->results.finish_reasons.push_back(finishReason);