diff --git a/src/capi_frontend/server_settings.hpp b/src/capi_frontend/server_settings.hpp index dfcbe2328b..2cb91b7dde 100644 --- a/src/capi_frontend/server_settings.hpp +++ b/src/capi_frontend/server_settings.hpp @@ -226,6 +226,7 @@ struct ServerSettingsImpl { std::optional> allowedMediaDomains; std::string logLevel = "INFO"; std::string logPath; + bool verboseResponse = false; bool allowCredentials = false; std::string allowedOrigins{"*"}; std::string allowedMethods{"*"}; diff --git a/src/cli_parser.cpp b/src/cli_parser.cpp index 9e56e478be..e05109d905 100644 --- a/src/cli_parser.cpp +++ b/src/cli_parser.cpp @@ -105,6 +105,11 @@ std::variant> CLIParser::parse(int argc, char* ("log_path", "Optional path to the log file", cxxopts::value(), "LOG_PATH") + ("verbose_response", + "When enabled, responses include an extra " + "\"__verbose\" object with additional debug information.", + cxxopts::value()->default_value("false"), + "VERBOSE_RESPONSE") #ifdef MTR_ENABLED ("trace_path", "Path to the trace file", @@ -502,6 +507,8 @@ void CLIParser::prepareServer(ServerSettingsImpl& serverSettings) { serverSettings.logLevel = result->operator[]("log_level").as(); if (result->count("log_path")) serverSettings.logPath = result->operator[]("log_path").as(); + if (result->count("verbose_response")) + serverSettings.verboseResponse = result->operator[]("verbose_response").as(); if (result->count("grpc_channel_arguments")) serverSettings.grpcChannelArguments = result->operator[]("grpc_channel_arguments").as(); diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp index 7c56bcbf95..30d29c0d21 100644 --- a/src/llm/apis/openai_api_handler.hpp +++ b/src/llm/apis/openai_api_handler.hpp @@ -99,6 +99,17 @@ class OpenAIApiHandler { // Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning. std::unique_ptr outputParser = nullptr; + // Verbose response support (enabled via --verbose_response). When set, the + // serialized response includes a "__verbose" object with the raw prompt + // (post chat template application) and raw decoded model output + // (before tool/reasoning parsing). + bool verboseResponse = false; + + std::string verbosePrompt; + // Streaming accumulators for raw model output. + std::vector verboseRawTokens; + std::string verboseRawText; + // Shared parsing helpers absl::Status parseCommonPart(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength); absl::Status parseResponseFormat(); @@ -156,6 +167,26 @@ class OpenAIApiHandler { std::string getToolChoice() const; const std::unique_ptr& getOutputParser() const; + // Verbose response configuration + void enableVerboseResponse(const std::string& promptAfterTemplate) { + verboseResponse = true; + verbosePrompt = promptAfterTemplate; + } + bool isVerboseResponse() const { return verboseResponse; } + const std::string& getVerbosePrompt() const { return verbosePrompt; } + // Accumulators used to assemble the "raw model output" for streaming responses. + void appendVerboseRawTokens(const std::vector& tokens) { + verboseRawTokens.insert(verboseRawTokens.end(), tokens.begin(), tokens.end()); + } + void appendVerboseRawText(const std::string& chunk) { + verboseRawText.append(chunk); + } + void setVerboseRawText(std::string text) { + verboseRawText = std::move(text); + } + const std::vector& getVerboseRawTokens() const { return verboseRawTokens; } + const std::string& getVerboseRawText() const { return verboseRawText; } + // Usage tracking void setPromptTokensUsage(size_t promptTokens); void setCompletionTokensUsage(size_t completionTokens); diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 2fd5e12005..89009c0d74 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -394,6 +394,17 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. + if (isVerboseResponse()) { + jsonResponse.StartObject("__verbose"); + jsonResponse.String("prompt", getVerbosePrompt()); + std::string rawContent; + if (!generationOutputs.empty()) { + rawContent = tokenizer.decode(generationOutputs.front().generated_ids, ov::genai::skip_special_tokens(false)); + } + jsonResponse.String("content", rawContent); + jsonResponse.EndObject(); + } + // finish response object jsonResponse.EndObject(); return jsonResponse.ToString(); @@ -458,6 +469,17 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. + if (isVerboseResponse()) { + jsonResponse.StartObject("__verbose"); + jsonResponse.String("prompt", getVerbosePrompt()); + std::string rawContent; + if (!results.tokens.empty()) { + rawContent = tokenizer.decode(results.tokens.front(), ov::genai::skip_special_tokens(false)); + } + jsonResponse.String("content", rawContent); + jsonResponse.EndObject(); + } + // finish response object jsonResponse.EndObject(); return jsonResponse.ToString(); @@ -528,6 +550,14 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. + if (isVerboseResponse()) { + jsonResponse.StartObject("__verbose"); + jsonResponse.String("prompt", getVerbosePrompt()); + // For VLM the raw decoded text is provided by GenAI directly. + jsonResponse.String("content", textResponse); + jsonResponse.EndObject(); + } + // finish response object jsonResponse.EndObject(); return jsonResponse.ToString(); @@ -622,6 +652,21 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. + // Verbose mode: attach prompt and raw model output to the FINAL chunk only. + if (isVerboseResponse() && finishReason != ov::genai::GenerationFinishReason::NONE) { + std::string rawOutput; + if (!getVerboseRawTokens().empty()) { + rawOutput = tokenizer.decode(getVerboseRawTokens(), ov::genai::skip_special_tokens(false)); + } else { + rawOutput = getVerboseRawText(); + } + + Value verboseObject(kObjectType); + verboseObject.AddMember("prompt", Value(getVerbosePrompt().c_str(), allocator), allocator); + verboseObject.AddMember("content", Value(rawOutput.c_str(), allocator), allocator); + doc.AddMember("__verbose", verboseObject, allocator); + } + StringBuffer buffer; Writer writer(buffer); doc.Accept(writer); diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp index 8e244df219..7dffb22afe 100644 --- a/src/llm/language_model/legacy/servable.cpp +++ b/src/llm/language_model/legacy/servable.cpp @@ -234,6 +234,9 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptrresults.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0]; + if (executionContext->apiHandler->isVerboseResponse() && !legacyExecutionContext->results.tokens.empty()) { + executionContext->apiHandler->appendVerboseRawTokens(legacyExecutionContext->results.tokens[0]); + } std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason); if (!serializedChunk.empty()) { executionContext->response = wrapTextInServerSideEventMessage(serializedChunk); diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index cb8d87fcee..b4d5ca8185 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -269,6 +269,9 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->enableVerboseResponse(inputText); + } bool encodeAddSpecialTokens = (executionContext->endpoint == Endpoint::COMPLETIONS); executionContext->inputIds = getProperties()->tokenizer.encode(inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids; if (getProperties()->maxModelLength.has_value()) { @@ -305,6 +308,9 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptrgenerationOutputs[0]; executionContext->apiHandler->incrementProcessedTokens(generationOutput.generated_ids.size()); + if (executionContext->apiHandler->isVerboseResponse()) { + executionContext->apiHandler->appendVerboseRawTokens(generationOutput.generated_ids); + } std::stringstream ss; executionContext->textStreamer->write(generationOutput.generated_ids); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 13b7e73a62..7779d9c0be 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -22,6 +22,7 @@ #include #include +#include "../../../config.hpp" #include "../../../logging.hpp" #include "../../text_utils.hpp" #include "../../../tokenize/tokenize_parser.hpp" @@ -121,6 +122,10 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrapiHandler->enableVerboseResponse(vlmExecutionContext->inputText); + } + // Below logic is used only for the statistics and debugging purposes and does not affect the model execution. SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText); bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 033cb8641d..9c8e02c5df 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -224,6 +224,9 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar lastTextChunk = executionContext->lastStreamerCallbackOutput; executionContext->lastStreamerCallbackOutput = ""; } + if (executionContext->apiHandler->isVerboseResponse() && !lastTextChunk.empty()) { + executionContext->apiHandler->appendVerboseRawText(lastTextChunk); + } if (generationStatus != std::future_status::ready) { // continue // For RESPONSES endpoint, always call serializeStreamingChunk so that // output item initialization events are emitted even before the tokenizer produces text. @@ -244,6 +247,9 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar // if streamer::put returned a value, streamer::end() result will not contain it, so we add it manually if (!executionContext->lastStreamerCallbackOutput.empty()) { lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput; + if (executionContext->apiHandler->isVerboseResponse()) { + executionContext->apiHandler->appendVerboseRawText(executionContext->lastStreamerCallbackOutput); + } } if (legacyExecutionContext->results.finish_reasons.empty()) { SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in legacy VLM streaming generation result, defaulting to STOP"); @@ -316,6 +322,10 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrapiHandler->enableVerboseResponse(vlmExecutionContext->inputText); + } + // Below logic is used only for the statistics and debugging purposes and does not affect the model execution. SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText); bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 46a31a3337..4fd28e771d 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -2213,6 +2213,133 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseChatCompletionsVLMDec ASSERT_NE(serialized.find("\"finish_reason\":\"length\""), std::string::npos) << serialized; } +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseCompletionsIncludesVerbosePayloadWhenEnabled) { + std::string json = R"({ + "model": "llama", + "stream": false, + "prompt": "What is OpenVINO?" + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + uint32_t maxTokensLimit = 100; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + apiHandler->enableVerboseResponse("templated prompt"); + + ov::genai::EncodedResults results; + ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids; + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + results.tokens = {std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1])}; + + rapidjson::Document parsed; + parsed.Parse(apiHandler->serializeUnaryResponse(results).c_str()); + ASSERT_FALSE(parsed.HasParseError()); + ASSERT_TRUE(parsed.HasMember("__verbose")); + ASSERT_TRUE(parsed["__verbose"].IsObject()); + ASSERT_STREQ(parsed["__verbose"]["prompt"].GetString(), "templated prompt"); + ASSERT_STREQ(parsed["__verbose"]["content"].GetString(), "OVMS"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseCompletionsGenerationOutputIncludesVerbosePayloadWhenEnabled) { + std::string json = R"({ + "model": "llama", + "stream": false, + "prompt": "What is OpenVINO?" + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + uint32_t maxTokensLimit = 100; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + apiHandler->enableVerboseResponse("templated prompt"); + + ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids; + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + ov::genai::GenerationOutput generationOutput; + generationOutput.generated_ids = std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1]); + generationOutput.finish_reason = ov::genai::GenerationFinishReason::STOP; + + rapidjson::Document parsed; + parsed.Parse(apiHandler->serializeUnaryResponse(std::vector{generationOutput}).c_str()); + ASSERT_FALSE(parsed.HasParseError()); + ASSERT_TRUE(parsed.HasMember("__verbose")); + ASSERT_TRUE(parsed["__verbose"].IsObject()); + ASSERT_STREQ(parsed["__verbose"]["prompt"].GetString(), "templated prompt"); + ASSERT_STREQ(parsed["__verbose"]["content"].GetString(), "OVMS"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseCompletionsVLMDecodedResultsIncludesVerbosePayloadWhenEnabled) { + std::string json = R"({ + "model": "llama", + "stream": false, + "prompt": "What is OpenVINO?" + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + uint32_t maxTokensLimit = 100; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + apiHandler->enableVerboseResponse("templated prompt"); + + ov::genai::VLMDecodedResults results; + std::string text = "OVMS"; + results.texts = {text}; + results.finish_reasons = {ov::genai::GenerationFinishReason::STOP}; + + rapidjson::Document parsed; + parsed.Parse(apiHandler->serializeUnaryResponse(results, text).c_str()); + ASSERT_FALSE(parsed.HasParseError()); + ASSERT_TRUE(parsed.HasMember("__verbose")); + ASSERT_TRUE(parsed["__verbose"].IsObject()); + ASSERT_STREQ(parsed["__verbose"]["prompt"].GetString(), "templated prompt"); + ASSERT_STREQ(parsed["__verbose"]["content"].GetString(), "OVMS"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkCompletionsIncludesVerbosePayloadOnlyOnFinalChunk) { + std::string json = R"({ + "model": "llama", + "stream": true, + "prompt": "What is OpenVINO?" + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + uint32_t maxTokensLimit = 100; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + apiHandler->enableVerboseResponse("templated prompt"); + + rapidjson::Document intermediate; + intermediate.Parse(apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE).c_str()); + ASSERT_FALSE(intermediate.HasParseError()); + ASSERT_FALSE(intermediate.HasMember("__verbose")); + + apiHandler->setVerboseRawText("Hello world"); + + rapidjson::Document finalChunk; + finalChunk.Parse(apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP).c_str()); + ASSERT_FALSE(finalChunk.HasParseError()); + ASSERT_TRUE(finalChunk.HasMember("__verbose")); + ASSERT_TRUE(finalChunk["__verbose"].IsObject()); + ASSERT_STREQ(finalChunk["__verbose"]["prompt"].GetString(), "templated prompt"); + ASSERT_STREQ(finalChunk["__verbose"]["content"].GetString(), "Hello world"); +} + TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { std::string json = R"({ "model": "llama",