From 601a22434bc2b85e3849f3f0f1de505dd0038957 Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Tue, 12 May 2026 14:24:38 +0100
Subject: [PATCH 1/2] fix(backends): raise error when OpenAI backend receives
 content=None

Thinking-mode models (e.g. Qwen3 served via vLLM with --reasoning-parser
qwen3) return content=None with finish_reason=stop and non-zero
completion_tokens when they emit only reasoning tokens. The OpenAI backend
silently skipped content accumulation, leaving the caller with an empty
string and no indication that anything went wrong.

Detect this case in post_processing and raise RuntimeError with a message
suggesting enable_thinking: False via model_options. The check covers both
streaming and non-streaming paths (post_processing is shared) and skips
when tool_calls are present, since those legitimately permit empty content.
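
For illustration, the caller-side workaround the new message suggests looks
roughly like this (sketch only: the start_session()/chat() calls are
illustrative, and only the model_options payload is taken from the error
text):

    # Sketch of the suggested workaround; session/chat usage is
    # illustrative, the model_options payload is what the error suggests.
    from mellea import start_session

    m = start_session()
    answer = m.chat(
        "What is 2 + 2?",
        model_options={
            "extra_body": {"chat_template_kwargs": {"enable_thinking": False}}
        },
    )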
""" import pytest from mellea.backends import ModelOption from mellea.backends.openai import OpenAIBackend +from mellea.core import ModelOutputThunk +from mellea.stdlib.components import Message def _make_backend(model_options: dict | None = None) -> OpenAIBackend: @@ -168,5 +171,88 @@ def test_make_backend_specific_unknown_mellea_keys_removed(backend): assert ModelOption.SYSTEM_PROMPT not in result +# --- post_processing: empty thinking-mode response detection --- + + +def _build_mot_for_empty_content_check( + finish_reason: str = "stop", + content: str | None = None, + completion_tokens: int = 9, + tool_calls: list | None = None, +) -> ModelOutputThunk: + """Construct a ModelOutputThunk in the state post_processing expects after processing().""" + mot = ModelOutputThunk(value=None) + mot._action = Message("user", "What is 2 + 2?") + mot._model_options = {} + mot._underlying_value = content if content is not None else "" + choice = { + "finish_reason": finish_reason, + "index": 0, + "message": {"content": content, "role": "assistant", "tool_calls": tool_calls}, + } + full_response = { + "id": "chatcmpl-test", + "object": "chat.completion", + "choices": [choice], + "usage": { + "prompt_tokens": 10, + "completion_tokens": completion_tokens, + "total_tokens": 10 + completion_tokens, + }, + } + mot._meta["oai_chat_response"] = full_response + mot._meta["oai_chat_response_choice"] = choice + return mot + + +@pytest.mark.asyncio +async def test_post_processing_raises_on_empty_content_with_tokens(backend): + """Thinking model with content=None, finish_reason=stop, non-zero tokens -> RuntimeError.""" + mot = _build_mot_for_empty_content_check() + with pytest.raises(RuntimeError, match="enable_thinking"): + await backend.post_processing( + mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None + ) + + +@pytest.mark.asyncio +async def test_post_processing_raises_on_empty_string_content(backend): + """content='' is treated the same as None when finish_reason=stop and tokens>0.""" + mot = _build_mot_for_empty_content_check(content="") + with pytest.raises(RuntimeError, match="empty response"): + await backend.post_processing( + mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None + ) + + +@pytest.mark.asyncio +async def test_post_processing_accepts_empty_content_with_zero_tokens(backend): + """Empty content with zero completion_tokens is not a thinking-mode failure.""" + mot = _build_mot_for_empty_content_check(completion_tokens=0) + # Should not raise. + await backend.post_processing( + mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None + ) + + +@pytest.mark.asyncio +async def test_post_processing_accepts_empty_content_with_length_finish(backend): + """finish_reason=length (truncated) is a different failure mode, not raised here.""" + mot = _build_mot_for_empty_content_check(finish_reason="length") + await backend.post_processing( + mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None + ) + + +@pytest.mark.asyncio +async def test_post_processing_accepts_non_empty_content(backend): + """Normal response with content is unaffected.""" + mot = _build_mot_for_empty_content_check(content="The answer is 4.") + await backend.post_processing( + mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None + ) + assert mot._underlying_value == "The answer is 4." 

From 42e83d9d7e95e0b9b98524853c04e8ca9f2c8088 Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Wed, 13 May 2026 12:11:34 +0100
Subject: [PATCH 2/2] fix(backends): close telemetry span and broaden error
 message on empty-content raise
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before this commit, the RuntimeError for thinking-mode empty responses was
raised mid-post_processing, leaving the OTel span unclosed (the cleanup
block runs later). base.py only catches generation-time exceptions from the
chunk queue, not exceptions from _post_process itself, so the span leaked
on every thinking-mode failure.

Changes:

- Move the model/provider metadata assignment before the guard so all
  fields satisfy the backend telemetry contract even when raising
- Build the RuntimeError first, close the span via
  set_span_error/end_backend_span, then raise — no span leaks on the error
  path
- Scope the enable_thinking hint to vLLM/Qwen3; add a generic "consult your
  runtime's docs" pointer for other providers
- Include the reasoning content length in the error message when
  mot._thinking is set
- Add a streaming-path regression test (oai_chat_response_choice absent)
- Add a tool_calls bypass test
- Remove redundant @pytest.mark.asyncio decorators (asyncio_mode=auto in
  pyproject; see the snippet below)
- Fix the tool_calls param type annotation to list[dict] | None
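
For reference, the pytest-asyncio setting this relies on looks like the
following (sketch; the exact contents and location in our pyproject.toml
may differ slightly):

    # pyproject.toml — asyncio_mode=auto makes pytest-asyncio collect
    # bare `async def` tests without per-test decorators.
    [tool.pytest.ini_options]
    asyncio_mode = "auto"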

Assisted-by: Claude Code
---
 mellea/backends/openai.py         | 28 +++++++++++++-----
 test/backends/test_openai_unit.py | 48 +++++++++++++++++++++++++++----
 2 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index 389084190..ac8ffe582 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -1115,6 +1115,10 @@ async def post_processing(
         if usage:
             mot.generation.usage = usage
 
+        # Populate model and provider metadata
+        mot.generation.model = self._model_id
+        mot.generation.provider = "openai"
+
         # content=None with stop+tokens means thinking-only mode; surface it rather than returning "".
         finish_reason = choice_response.get("finish_reason")
         completion_tokens = usage.get("completion_tokens", 0) if usage else 0
@@ -1124,19 +1128,29 @@ async def post_processing(
             and completion_tokens > 0
             and not mot.tool_calls
         ):
-            raise RuntimeError(
+            thinking_note = (
+                f" Reasoning content ({len(mot._thinking)} chars) is in mot._thinking."
+                if mot._thinking
+                else ""
+            )
+            err = RuntimeError(
                 "OpenAI backend received an empty response (content=None) with "
                 f"finish_reason=stop and completion_tokens={completion_tokens}. "
-                "This typically indicates a thinking-mode model that emitted only "
-                "reasoning tokens. For OpenAI-compatible thinking models, disable "
-                "thinking via model_options, e.g.: "
+                "This typically indicates a thinking-mode model (e.g. Qwen3 via vLLM "
+                "with --reasoning-parser) that emitted only reasoning tokens."
+                + thinking_note
+                + " For vLLM/Qwen3, disable thinking via model_options, e.g.: "
                 'model_options={"extra_body": {"chat_template_kwargs": '
                 '{"enable_thinking": False}}}.'
+                " For other providers, consult your runtime's documentation."
             )
+            span = mot._meta.pop("_telemetry_span", None)
+            if span is not None:
+                from ..telemetry import end_backend_span, set_span_error
 
-        # Populate model and provider metadata
-        mot.generation.model = self._model_id
-        mot.generation.provider = "openai"
+                set_span_error(span, err)
+                end_backend_span(span)
+            raise err
 
         # Record telemetry now that response is available
         span = mot._meta.get("_telemetry_span")
diff --git a/test/backends/test_openai_unit.py b/test/backends/test_openai_unit.py
index eba075190..d3084d279 100644
--- a/test/backends/test_openai_unit.py
+++ b/test/backends/test_openai_unit.py
@@ -178,7 +178,7 @@ def _build_mot_for_empty_content_check(
     finish_reason: str = "stop",
     content: str | None = None,
     completion_tokens: int = 9,
-    tool_calls: list | None = None,
+    tool_calls: list[dict] | None = None,
 ) -> ModelOutputThunk:
     """Construct a ModelOutputThunk in the state post_processing expects after processing()."""
     mot = ModelOutputThunk(value=None)
@@ -205,7 +205,6 @@ def _build_mot_for_empty_content_check(
     return mot
 
 
-@pytest.mark.asyncio
 async def test_post_processing_raises_on_empty_content_with_tokens(backend):
     """Thinking model with content=None, finish_reason=stop, non-zero tokens -> RuntimeError."""
     mot = _build_mot_for_empty_content_check()
@@ -215,7 +214,6 @@ async def test_post_processing_raises_on_empty_content_with_tokens(backend):
     )
 
 
-@pytest.mark.asyncio
 async def test_post_processing_raises_on_empty_string_content(backend):
     """content='' is treated the same as None when finish_reason=stop and tokens>0."""
     mot = _build_mot_for_empty_content_check(content="")
@@ -225,7 +223,6 @@ async def test_post_processing_raises_on_empty_string_content(backend):
     )
 
 
-@pytest.mark.asyncio
 async def test_post_processing_accepts_empty_content_with_zero_tokens(backend):
     """Empty content with zero completion_tokens is not a thinking-mode failure."""
     mot = _build_mot_for_empty_content_check(completion_tokens=0)
@@ -235,7 +232,6 @@ async def test_post_processing_accepts_empty_content_with_zero_tokens(backend):
     )
 
 
-@pytest.mark.asyncio
 async def test_post_processing_accepts_empty_content_with_length_finish(backend):
     """finish_reason=length (truncated) is a different failure mode, not raised here."""
     mot = _build_mot_for_empty_content_check(finish_reason="length")
@@ -244,7 +240,6 @@ async def test_post_processing_accepts_empty_content_with_length_finish(backend)
     )
 
 
-@pytest.mark.asyncio
 async def test_post_processing_accepts_non_empty_content(backend):
     """Normal response with content is unaffected."""
     mot = _build_mot_for_empty_content_check(content="The answer is 4.")
@@ -254,5 +249,46 @@ async def test_post_processing_accepts_non_empty_content(backend):
     assert mot._underlying_value == "The answer is 4."
 
 
+async def test_post_processing_streaming_raises_on_empty_content(backend):
+    """Streaming path: oai_chat_response is a choice-shaped dict (chat_completion_delta_merge output); guard still fires."""
+    mot = ModelOutputThunk(value=None)
+    mot._action = Message("user", "What is 2 + 2?")
+    mot._model_options = {}
+    mot._underlying_value = ""
+    # Streaming: oai_chat_response is the merged choice dict — finish_reason at the top level.
+    mot._meta["oai_chat_response"] = {
+        "finish_reason": "stop",
+        "index": 0,
+        "logprobs": None,
+        "stop_reason": None,
+        "message": {
+            "content": None,
+            "reasoning_content": "2+2=4",
+            "role": "assistant",
+            "tool_calls": [],
+        },
+    }
+    mot._meta["oai_streaming_usage"] = {
+        "prompt_tokens": 10,
+        "completion_tokens": 9,
+        "total_tokens": 19,
+    }
+    # oai_chat_response_choice intentionally absent — this is the streaming code path.
+    with pytest.raises(RuntimeError, match="enable_thinking"):
+        await backend.post_processing(
+            mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
+        )
+
+
+async def test_post_processing_skips_when_tool_calls_present(backend):
+    """Empty content with active tool calls must not raise — tool calls legitimately have no text."""
+    mot = _build_mot_for_empty_content_check()
+    mot.tool_calls = {"get_weather": {"name": "get_weather", "arguments": "{}"}}  # type: ignore[assignment]
+    # Should not raise.
+    await backend.post_processing(
+        mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
+    )
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])