33 changes: 33 additions & 0 deletions mellea/backends/openai.py
@@ -1119,6 +1119,39 @@ async def post_processing(
mot.generation.model = self._model_id
mot.generation.provider = "openai"

# content=None with stop+tokens means thinking-only mode; surface it rather than returning "".
finish_reason = choice_response.get("finish_reason")
completion_tokens = usage.get("completion_tokens", 0) if usage else 0
Contributor:

When usage is unavailable (e.g. streaming without stream_options={"include_usage": True}, or an OpenAI-compatible provider that omits it), completion_tokens falls back to 0 and the guard silently passes — same empty-string symptom the fix is meant to surface. Not a regression, but worth a comment here or a line in the error so the next person hitting it knows why the guard didn't fire for them.
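
A minimal client-side sketch of the situation this comment describes, using the stock openai client (the model name is illustrative): when streaming, usage is only populated if explicitly requested, so without stream_options the guard's completion_tokens check falls back to 0.

# Sketch: request usage on streaming responses so completion_tokens is
# populated rather than silently defaulting to 0 in the guard below.
from openai import OpenAI

client = OpenAI()
stream = client.chat.completions.create(
    model="gpt-4o-mini",  # illustrative model name
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    stream=True,
    stream_options={"include_usage": True},
)
for chunk in stream:
    # With include_usage, the final chunk has empty choices and carries usage.
    if chunk.usage is not None:
        print(chunk.usage.completion_tokens)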

if (
    not mot._underlying_value
    and finish_reason == "stop"
    and completion_tokens > 0
    and not mot.tool_calls
):
    thinking_note = (
        f" Reasoning content ({len(mot._thinking)} chars) is in mot._thinking."
Contributor:

Nit: the error message points users at mot._thinking, which is a private attribute today. #909 item 4 is considering whether to promote it to public — happy to leave this as-is and let the message track whatever shakes out of that issue, but worth flagging.

        if mot._thinking
        else ""
    )
    err = RuntimeError(
        "OpenAI backend received an empty response (content=None) with "
        f"finish_reason=stop and completion_tokens={completion_tokens}. "
        "This typically indicates a thinking-mode model (e.g. Qwen3 via vLLM "
        "with --reasoning-parser) that emitted only reasoning tokens."
        + thinking_note
        + " For vLLM/Qwen3, disable thinking via model_options, e.g.: "
        'model_options={"extra_body": {"chat_template_kwargs": '
        '{"enable_thinking": False}}}.'
        " For other providers, consult your runtime's documentation."
    )
    span = mot._meta.pop("_telemetry_span", None)
    if span is not None:
        from ..telemetry import end_backend_span, set_span_error

        set_span_error(span, err)
        end_backend_span(span)
    raise err

# Record telemetry now that response is available
span = mot._meta.get("_telemetry_span")
if span is not None:
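As a caller-side reference, a hedged sketch of the workaround the error message suggests. The OpenAIBackend constructor arguments here are assumptions for illustration (the tests below construct backends through a _make_backend helper); only the model_options payload is quoted from the error text.

# Hypothetical sketch: constructor kwargs are assumed, not taken from this diff.
# The model_options payload is the one quoted in the RuntimeError above.
backend = OpenAIBackend(
    model_id="Qwen/Qwen3-8B",  # assumed model id for illustration
    model_options={
        "extra_body": {"chat_template_kwargs": {"enable_thinking": False}}
    },
)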
124 changes: 123 additions & 1 deletion test/backends/test_openai_unit.py
@@ -1,13 +1,16 @@
"""Unit tests for OpenAI backend pure-logic helpers — no API calls required.

Covers filter_openai_client_kwargs, filter_chat_completions_kwargs,
_simplify_and_merge, _make_backend_specific_and_remove, and post_processing
error detection for empty thinking-mode responses.
"""

import pytest

from mellea.backends import ModelOption
from mellea.backends.openai import OpenAIBackend
from mellea.core import ModelOutputThunk
from mellea.stdlib.components import Message


def _make_backend(model_options: dict | None = None) -> OpenAIBackend:
@@ -168,5 +171,124 @@ def test_make_backend_specific_unknown_mellea_keys_removed(backend):
    assert ModelOption.SYSTEM_PROMPT not in result


# --- post_processing: empty thinking-mode response detection ---


def _build_mot_for_empty_content_check(
    finish_reason: str = "stop",
    content: str | None = None,
    completion_tokens: int = 9,
    tool_calls: list[dict] | None = None,
Contributor:

Optional: the helper hand-assembles mot._meta["oai_chat_response"] and oai_chat_response_choice. Driving a real ChatCompletion fixture through processing() instead would catch future drift in those meta keys. Fine as a unit test; mentioning in case you want the broader coverage. (A hedged sketch follows the helper below.)

) -> ModelOutputThunk:
    """Construct a ModelOutputThunk in the state post_processing expects after processing()."""
    mot = ModelOutputThunk(value=None)
    mot._action = Message("user", "What is 2 + 2?")
    mot._model_options = {}
    mot._underlying_value = content if content is not None else ""
    choice = {
        "finish_reason": finish_reason,
        "index": 0,
        "message": {"content": content, "role": "assistant", "tool_calls": tool_calls},
    }
    full_response = {
        "id": "chatcmpl-test",
        "object": "chat.completion",
        "choices": [choice],
        "usage": {
            "prompt_tokens": 10,
            "completion_tokens": completion_tokens,
            "total_tokens": 10 + completion_tokens,
        },
    }
    mot._meta["oai_chat_response"] = full_response
    mot._meta["oai_chat_response_choice"] = choice
    return mot
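

Following up on the contributor comment above, a hedged sketch of building the same payload through openai's typed model, so a rename in those response fields fails loudly at validation time; whether processing() can be driven with it directly is an open assumption.

# Sketch only (not in this PR): openai's ChatCompletion is a pydantic model,
# so validating the fixture against it catches drift in field names.
from openai.types.chat import ChatCompletion


def _typed_fixture(completion_tokens: int = 9) -> tuple[dict, dict]:
    completion = ChatCompletion.model_validate(
        {
            "id": "chatcmpl-test",
            "object": "chat.completion",
            "created": 0,  # required by the typed model, absent from the hand-built dict
            "model": "test-model",  # likewise required here
            "choices": [
                {
                    "finish_reason": "stop",
                    "index": 0,
                    "message": {"content": None, "role": "assistant"},
                }
            ],
            "usage": {
                "prompt_tokens": 10,
                "completion_tokens": completion_tokens,
                "total_tokens": 10 + completion_tokens,
            },
        }
    )
    full_response = completion.model_dump()
    # Same shapes the helper above stores in mot._meta.
    return full_response, full_response["choices"][0]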


async def test_post_processing_raises_on_empty_content_with_tokens(backend):
    """Thinking model with content=None, finish_reason=stop, non-zero tokens -> RuntimeError."""
    mot = _build_mot_for_empty_content_check()
    with pytest.raises(RuntimeError, match="enable_thinking"):
        await backend.post_processing(
            mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
        )


async def test_post_processing_raises_on_empty_string_content(backend):
    """content='' is treated the same as None when finish_reason=stop and tokens>0."""
    mot = _build_mot_for_empty_content_check(content="")
    with pytest.raises(RuntimeError, match="empty response"):
        await backend.post_processing(
            mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
        )


async def test_post_processing_accepts_empty_content_with_zero_tokens(backend):
    """Empty content with zero completion_tokens is not a thinking-mode failure."""
    mot = _build_mot_for_empty_content_check(completion_tokens=0)
    # Should not raise.
    await backend.post_processing(
        mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
    )


async def test_post_processing_accepts_empty_content_with_length_finish(backend):
    """finish_reason=length (truncated) is a different failure mode, not raised here."""
    mot = _build_mot_for_empty_content_check(finish_reason="length")
    await backend.post_processing(
        mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
    )


async def test_post_processing_accepts_non_empty_content(backend):
    """Normal response with content is unaffected."""
    mot = _build_mot_for_empty_content_check(content="The answer is 4.")
    await backend.post_processing(
        mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
    )
    assert mot._underlying_value == "The answer is 4."


async def test_post_processing_streaming_raises_on_empty_content(backend):
    """Streaming path: oai_chat_response is a choice-shaped dict (chat_completion_delta_merge output); guard still fires."""
    mot = ModelOutputThunk(value=None)
    mot._action = Message("user", "What is 2 + 2?")
    mot._model_options = {}
    mot._underlying_value = ""
    # Streaming: oai_chat_response is the merged choice dict, with finish_reason at the top level.
    mot._meta["oai_chat_response"] = {
        "finish_reason": "stop",
        "index": 0,
        "logprobs": None,
        "stop_reason": None,
        "message": {
            "content": None,
            "reasoning_content": "2+2=4",
            "role": "assistant",
            "tool_calls": [],
        },
    }
    mot._meta["oai_streaming_usage"] = {
        "prompt_tokens": 10,
        "completion_tokens": 9,
        "total_tokens": 19,
    }
    # oai_chat_response_choice intentionally absent; this is the streaming code path.
    with pytest.raises(RuntimeError, match="enable_thinking"):
        await backend.post_processing(
            mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
        )


async def test_post_processing_skips_when_tool_calls_present(backend):
    """Empty content with active tool calls must not raise; tool calls legitimately have no text."""
    mot = _build_mot_for_empty_content_check()
    mot.tool_calls = {"get_weather": {"name": "get_weather", "arguments": "{}"}}  # type: ignore[assignment]
    # Should not raise.
    await backend.post_processing(
        mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
    )


if __name__ == "__main__":
    pytest.main([__file__, "-v"])