From 42cee396714e09d48b5a691b45417bacd981d1c6 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Tue, 12 May 2026 14:27:13 +0100 Subject: [PATCH 1/2] fix(backends): capture vLLM reasoning field in mot._thinking vLLM served with --reasoning-parser qwen3 (and other thinking models) surfaces the reasoning trace under the "reasoning" key of the raw message dict, not as a Python attribute on the openai SDK object. The existing hasattr(message, "reasoning_content") probe therefore missed it, and mot._thinking was silently left unpopulated for Qwen3 and similar models. Add a fallback in both the non-streaming (ChatCompletion) and streaming (ChatCompletionChunk) branches of processing(): when reasoning_content is absent, probe the raw .model_dump() for a "reasoning" key. The existing reasoning_content path is preserved untouched. Closes #1061 Assisted-by: Claude Code --- mellea/backends/openai.py | 12 +++ test/backends/test_openai_unit.py | 119 ++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index 1eea93511..f8b639c6f 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -995,6 +995,12 @@ async def processing( thinking_chunk = message.reasoning_content # type: ignore if thinking_chunk is not None: mot._thinking += thinking_chunk + else: + # vLLM uses "reasoning" in the raw dict rather than a reasoning_content attribute. + raw_message = message.model_dump() + thinking_chunk = raw_message.get("reasoning") + if thinking_chunk is not None: + mot._thinking += thinking_chunk content_chunk = message.content if content_chunk is not None: @@ -1019,6 +1025,12 @@ async def processing( thinking_chunk = message_delta.reasoning_content # type: ignore if thinking_chunk is not None: mot._thinking += thinking_chunk + else: + # vLLM streaming: same "reasoning" fallback for delta chunks. + raw_delta = message_delta.model_dump() + thinking_chunk = raw_delta.get("reasoning") + if thinking_chunk is not None: + mot._thinking += thinking_chunk content_chunk = message_delta.content if content_chunk is not None: diff --git a/test/backends/test_openai_unit.py b/test/backends/test_openai_unit.py index 09524df8c..304b77171 100644 --- a/test/backends/test_openai_unit.py +++ b/test/backends/test_openai_unit.py @@ -5,9 +5,12 @@ """ import pytest +from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage +from openai.types.chat.chat_completion import Choice from mellea.backends import ModelOption from mellea.backends.openai import OpenAIBackend +from mellea.core.base import ModelOutputThunk def _make_backend(model_options: dict | None = None) -> OpenAIBackend: @@ -168,5 +171,121 @@ def test_make_backend_specific_unknown_mellea_keys_removed(backend): assert ModelOption.SYSTEM_PROMPT not in result +# --- processing(): reasoning / thinking trace extraction --- + + +def _vllm_chat_completion(reasoning: str, content: str | None) -> ChatCompletion: + """Build a ChatCompletion that matches vLLM's thinking-model response shape.""" + message = ChatCompletionMessage.model_validate( + {"role": "assistant", "content": content, "reasoning": reasoning} + ) + return ChatCompletion( + id="vllm-test", + created=0, + model="qwen3", + object="chat.completion", + choices=[Choice(index=0, finish_reason="stop", message=message)], + ) + + +async def test_processing_captures_vllm_reasoning_field(backend): + """Non-streaming: mot._thinking captures the raw ``reasoning`` key from vLLM.""" + mot: ModelOutputThunk = ModelOutputThunk(value=None) + chunk = _vllm_chat_completion(reasoning="2 + 2 equals 4.", content="4") + # Sanity check: the SDK object does not expose reasoning_content + assert not hasattr(chunk.choices[0].message, "reasoning_content") + + await backend.processing(mot, chunk) + + assert mot._thinking == "2 + 2 equals 4." + assert mot._underlying_value == "4" + + +async def test_processing_vllm_reasoning_with_null_content(backend): + """Non-streaming: reasoning is captured even when ``content`` is null.""" + mot: ModelOutputThunk = ModelOutputThunk(value=None) + chunk = _vllm_chat_completion(reasoning="some thinking", content=None) + + await backend.processing(mot, chunk) + + assert mot._thinking == "some thinking" + assert mot._underlying_value == "" + + +async def test_processing_streaming_captures_vllm_reasoning_field(backend): + """Streaming: per-chunk ``reasoning`` deltas accumulate into mot._thinking.""" + mot: ModelOutputThunk = ModelOutputThunk(value=None) + chunk_a = ChatCompletionChunk.model_validate( + { + "id": "vllm-stream", + "created": 0, + "model": "qwen3", + "object": "chat.completion.chunk", + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + "content": None, + "reasoning": "first ", + }, + "finish_reason": None, + } + ], + } + ) + chunk_b = ChatCompletionChunk.model_validate( + { + "id": "vllm-stream", + "created": 0, + "model": "qwen3", + "object": "chat.completion.chunk", + "choices": [ + { + "index": 0, + "delta": {"content": "ans", "reasoning": "second"}, + "finish_reason": None, + } + ], + } + ) + + await backend.processing(mot, chunk_a) + await backend.processing(mot, chunk_b) + + assert mot._thinking == "first second" + assert mot._underlying_value == "ans" + + +async def test_processing_reasoning_content_still_used(backend): + """Regression guard: the pre-existing ``reasoning_content`` path is preserved. + + Some providers surface the trace as ``reasoning_content`` on the message + object itself. The fix must not regress that path in favour of the raw-dict + fallback. + """ + message = ChatCompletionMessage.model_validate( + { + "role": "assistant", + "content": "answer", + "reasoning_content": "attribute-style trace", + } + ) + chunk = ChatCompletion( + id="rc-test", + created=0, + model="fake", + object="chat.completion", + choices=[Choice(index=0, finish_reason="stop", message=message)], + ) + assert hasattr(chunk.choices[0].message, "reasoning_content") + + mot: ModelOutputThunk = ModelOutputThunk(value=None) + await backend.processing(mot, chunk) + + assert mot._thinking == "attribute-style trace" + assert mot._underlying_value == "answer" + + if __name__ == "__main__": pytest.main([__file__, "-v"]) From 0326cfea91b6b8a707c86fe5adf88c3b08553847 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 13 May 2026 12:08:47 +0100 Subject: [PATCH 2/2] fix(backends): replace hasattr guard with getattr+model_extra for reasoning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the `hasattr(message, "reasoning_content") / else model_dump()` pattern with a unified value-based probe: thinking_chunk = getattr(obj, "reasoning_content", None) if thinking_chunk is None: thinking_chunk = (obj.model_extra or {}).get("reasoning") This closes two gaps identified in code review: 1. Edge case: if a proxy sends `{"reasoning_content": null, "reasoning": "trace"}`, the old hasattr guard was True, the None check short-circuited, and the else branch never fired — silently dropping the trace. The value-based check falls through correctly. 2. Performance: the old else branch called model_dump() on every streaming delta for non-thinking models (gpt-4o etc.), allocating a full dict per token. model_extra is already a plain dict — O(1) lookup, no serialisation. Also adds test_processing_reasoning_content_takes_precedence_over_reasoning to pin that reasoning_content wins when both fields are present on the same message object. Closes #1061 Assisted-by: Claude Code --- mellea/backends/openai.py | 32 ++++++++++++------------------- test/backends/test_openai_unit.py | 24 +++++++++++++++++++++++ 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index f8b639c6f..63a0fc2b1 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -991,16 +991,13 @@ async def processing( if isinstance(chunk, ChatCompletion): message = chunk.choices[0].message - if hasattr(message, "reasoning_content"): - thinking_chunk = message.reasoning_content # type: ignore - if thinking_chunk is not None: - mot._thinking += thinking_chunk - else: - # vLLM uses "reasoning" in the raw dict rather than a reasoning_content attribute. - raw_message = message.model_dump() - thinking_chunk = raw_message.get("reasoning") - if thinking_chunk is not None: - mot._thinking += thinking_chunk + # reasoning_content (Anthropic/DeepSeek attribute path) takes priority; + # fall back to the "reasoning" extra field used by vLLM and compatible servers. + thinking_chunk = getattr(message, "reasoning_content", None) + if thinking_chunk is None: + thinking_chunk = (message.model_extra or {}).get("reasoning") + if thinking_chunk is not None: + mot._thinking += thinking_chunk content_chunk = message.content if content_chunk is not None: @@ -1021,16 +1018,11 @@ async def processing( return message_delta = chunk.choices[0].delta - if hasattr(message_delta, "reasoning_content"): - thinking_chunk = message_delta.reasoning_content # type: ignore - if thinking_chunk is not None: - mot._thinking += thinking_chunk - else: - # vLLM streaming: same "reasoning" fallback for delta chunks. - raw_delta = message_delta.model_dump() - thinking_chunk = raw_delta.get("reasoning") - if thinking_chunk is not None: - mot._thinking += thinking_chunk + thinking_chunk = getattr(message_delta, "reasoning_content", None) + if thinking_chunk is None: + thinking_chunk = (message_delta.model_extra or {}).get("reasoning") + if thinking_chunk is not None: + mot._thinking += thinking_chunk content_chunk = message_delta.content if content_chunk is not None: diff --git a/test/backends/test_openai_unit.py b/test/backends/test_openai_unit.py index 304b77171..77a0745a4 100644 --- a/test/backends/test_openai_unit.py +++ b/test/backends/test_openai_unit.py @@ -287,5 +287,29 @@ async def test_processing_reasoning_content_still_used(backend): assert mot._underlying_value == "answer" +async def test_processing_reasoning_content_takes_precedence_over_reasoning(backend): + """reasoning_content attribute wins when both it and raw ``reasoning`` are present.""" + message = ChatCompletionMessage.model_validate( + { + "role": "assistant", + "content": "answer", + "reasoning_content": "attr-trace", + "reasoning": "raw-trace", + } + ) + chunk = ChatCompletion( + id="prec-test", + created=0, + model="fake", + object="chat.completion", + choices=[Choice(index=0, finish_reason="stop", message=message)], + ) + mot: ModelOutputThunk = ModelOutputThunk(value=None) + await backend.processing(mot, chunk) + + assert mot._thinking == "attr-trace" + assert mot._underlying_value == "answer" + + if __name__ == "__main__": pytest.main([__file__, "-v"])