diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index 1eea93511..ac8ffe582 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -1119,6 +1119,39 @@ async def post_processing(
         mot.generation.model = self._model_id
         mot.generation.provider = "openai"
 
+        # content=None with stop+tokens means thinking-only mode; surface it rather than returning "".
+        finish_reason = choice_response.get("finish_reason")
+        completion_tokens = usage.get("completion_tokens", 0) if usage else 0
+        if (
+            not mot._underlying_value
+            and finish_reason == "stop"
+            and completion_tokens > 0
+            and not mot.tool_calls
+        ):
+            thinking_note = (
+                f" Reasoning content ({len(mot._thinking)} chars) is in mot._thinking."
+                if mot._thinking
+                else ""
+            )
+            err = RuntimeError(
+                "OpenAI backend received an empty response (content=None) with "
+                f"finish_reason=stop and completion_tokens={completion_tokens}. "
+                "This typically indicates a thinking-mode model (e.g. Qwen3 via vLLM "
+                "with --reasoning-parser) that emitted only reasoning tokens."
+                + thinking_note
+                + " For vLLM/Qwen3, disable thinking via model_options, e.g.: "
+                'model_options={"extra_body": {"chat_template_kwargs": '
+                '{"enable_thinking": False}}}.'
+                " For other providers, consult your runtime's documentation."
+            )
+            span = mot._meta.pop("_telemetry_span", None)
+            if span is not None:
+                from ..telemetry import end_backend_span, set_span_error
+
+                set_span_error(span, err)
+                end_backend_span(span)
+            raise err
+
         # Record telemetry now that response is available
         span = mot._meta.get("_telemetry_span")
         if span is not None:
diff --git a/test/backends/test_openai_unit.py b/test/backends/test_openai_unit.py
index 09524df8c..d3084d279 100644
--- a/test/backends/test_openai_unit.py
+++ b/test/backends/test_openai_unit.py
@@ -1,13 +1,16 @@
 """Unit tests for OpenAI backend pure-logic helpers — no API calls required.
 
 Covers filter_openai_client_kwargs, filter_chat_completions_kwargs,
-_simplify_and_merge, and _make_backend_specific_and_remove.
+_simplify_and_merge, _make_backend_specific_and_remove, and post_processing
+error detection for empty thinking-mode responses.
 """
 
 import pytest
 
 from mellea.backends import ModelOption
 from mellea.backends.openai import OpenAIBackend
+from mellea.core import ModelOutputThunk
+from mellea.stdlib.components import Message
 
 
 def _make_backend(model_options: dict | None = None) -> OpenAIBackend:
@@ -168,5 +171,124 @@ def test_make_backend_specific_unknown_mellea_keys_removed(backend):
     assert ModelOption.SYSTEM_PROMPT not in result
 
 
+# --- post_processing: empty thinking-mode response detection ---
+
+
+def _build_mot_for_empty_content_check(
+    finish_reason: str = "stop",
+    content: str | None = None,
+    completion_tokens: int = 9,
+    tool_calls: list[dict] | None = None,
+) -> ModelOutputThunk:
+    """Construct a ModelOutputThunk in the state post_processing expects after processing()."""
+    mot = ModelOutputThunk(value=None)
+    mot._action = Message("user", "What is 2 + 2?")
+    mot._model_options = {}
+    mot._underlying_value = content if content is not None else ""
+    choice = {
+        "finish_reason": finish_reason,
+        "index": 0,
+        "message": {"content": content, "role": "assistant", "tool_calls": tool_calls},
+    }
+    full_response = {
+        "id": "chatcmpl-test",
+        "object": "chat.completion",
+        "choices": [choice],
+        "usage": {
+            "prompt_tokens": 10,
+            "completion_tokens": completion_tokens,
+            "total_tokens": 10 + completion_tokens,
+        },
+    }
+    mot._meta["oai_chat_response"] = full_response
+    mot._meta["oai_chat_response_choice"] = choice
+    return mot
+
+
+async def test_post_processing_raises_on_empty_content_with_tokens(backend):
+    """Thinking model with content=None, finish_reason=stop, non-zero tokens -> RuntimeError."""
+    mot = _build_mot_for_empty_content_check()
+    with pytest.raises(RuntimeError, match="enable_thinking"):
+        await backend.post_processing(
+            mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
+        )
+
+
+async def test_post_processing_raises_on_empty_string_content(backend):
+    """content='' is treated the same as None when finish_reason=stop and tokens>0."""
+    mot = _build_mot_for_empty_content_check(content="")
+    with pytest.raises(RuntimeError, match="empty response"):
+        await backend.post_processing(
+            mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
+        )
+
+
+async def test_post_processing_accepts_empty_content_with_zero_tokens(backend):
+    """Empty content with zero completion_tokens is not a thinking-mode failure."""
+    mot = _build_mot_for_empty_content_check(completion_tokens=0)
+    # Should not raise.
+    await backend.post_processing(
+        mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
+    )
+
+
+async def test_post_processing_accepts_empty_content_with_length_finish(backend):
+    """finish_reason=length (truncated) is a different failure mode, not raised here."""
+    mot = _build_mot_for_empty_content_check(finish_reason="length")
+    await backend.post_processing(
+        mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
+    )
+
+
+async def test_post_processing_accepts_non_empty_content(backend):
+    """Normal response with content is unaffected."""
+    mot = _build_mot_for_empty_content_check(content="The answer is 4.")
+    await backend.post_processing(
+        mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
+    )
+    assert mot._underlying_value == "The answer is 4."
+
+
+async def test_post_processing_streaming_raises_on_empty_content(backend):
+    """Streaming path: oai_chat_response is a choice-shaped dict (chat_completion_delta_merge output); guard still fires."""
+    mot = ModelOutputThunk(value=None)
+    mot._action = Message("user", "What is 2 + 2?")
+    mot._model_options = {}
+    mot._underlying_value = ""
+    # Streaming: oai_chat_response is the merged choice dict — finish_reason at the top level.
+    mot._meta["oai_chat_response"] = {
+        "finish_reason": "stop",
+        "index": 0,
+        "logprobs": None,
+        "stop_reason": None,
+        "message": {
+            "content": None,
+            "reasoning_content": "2+2=4",
+            "role": "assistant",
+            "tool_calls": [],
+        },
+    }
+    mot._meta["oai_streaming_usage"] = {
+        "prompt_tokens": 10,
+        "completion_tokens": 9,
+        "total_tokens": 19,
+    }
+    # oai_chat_response_choice intentionally absent — this is the streaming code path.
+    with pytest.raises(RuntimeError, match="enable_thinking"):
+        await backend.post_processing(
+            mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
+        )
+
+
+async def test_post_processing_skips_when_tool_calls_present(backend):
+    """Empty content with active tool calls must not raise — tool calls legitimately have no text."""
+    mot = _build_mot_for_empty_content_check()
+    mot.tool_calls = {"get_weather": {"name": "get_weather", "arguments": "{}"}}  # type: ignore[assignment]
+    # Should not raise.
+    await backend.post_processing(
+        mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
+    )
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
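A note on the workaround the new error message recommends. Below is a minimal sketch of what it looks like from the caller's side, assuming OpenAIBackend accepts a model_options mapping at construction (as the tests' _make_backend helper suggests); the model_id and base_url values are illustrative placeholders, not part of this change:

    from mellea.backends.openai import OpenAIBackend

    # Hypothetical setup for Qwen3 served by vLLM with --reasoning-parser;
    # the constructor signature here is assumed, not confirmed by this patch.
    backend = OpenAIBackend(
        model_id="Qwen/Qwen3-8B",             # placeholder model name
        base_url="http://localhost:8000/v1",  # placeholder vLLM endpoint
        model_options={
            # extra_body is forwarded into the chat completions request body;
            # chat_template_kwargs tells the Qwen3 chat template to skip the
            # thinking block, so choices[0].message.content is populated
            # instead of only reasoning_content.
            "extra_body": {"chat_template_kwargs": {"enable_thinking": False}},
        },
    )

If thinking is left enabled and the guard fires, the raised RuntimeError reports whether reasoning text was captured on mot._thinking and how long it was (the thinking_note branch above), so the failure is diagnosable rather than surfacing as a silently empty string.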