From 42cee396714e09d48b5a691b45417bacd981d1c6 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Tue, 12 May 2026 14:27:13 +0100
Subject: [PATCH 1/2] fix(backends): capture vLLM reasoning field in
 mot._thinking

vLLM serving a thinking model with a reasoning parser enabled (e.g.
--reasoning-parser qwen3) returns the reasoning trace under the
"reasoning" key of the message, not under "reasoning_content". The
openai SDK object therefore has no reasoning_content attribute, so the
existing hasattr(message, "reasoning_content") probe missed the trace
and mot._thinking was silently left unpopulated for those models.

Add a fallback in both the non-streaming (ChatCompletion) and streaming
(ChatCompletionChunk) branches of processing(): when reasoning_content
is absent, probe the raw .model_dump() for a "reasoning" key. The
existing reasoning_content path is preserved untouched.

Closes #1061

Assisted-by: Claude Code
---
 mellea/backends/openai.py         |  12 +++
 test/backends/test_openai_unit.py | 119 ++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)

diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index 1eea93511..f8b639c6f 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -995,6 +995,12 @@ async def processing(
                 thinking_chunk = message.reasoning_content  # type: ignore
                 if thinking_chunk is not None:
                     mot._thinking += thinking_chunk
+            else:
+                # vLLM uses "reasoning" in the raw dict rather than a reasoning_content attribute.
+                raw_message = message.model_dump()
+                thinking_chunk = raw_message.get("reasoning")
+                if thinking_chunk is not None:
+                    mot._thinking += thinking_chunk
 
             content_chunk = message.content
             if content_chunk is not None:
@@ -1019,6 +1025,12 @@ async def processing(
                 thinking_chunk = message_delta.reasoning_content  # type: ignore
                 if thinking_chunk is not None:
                     mot._thinking += thinking_chunk
+            else:
+                # vLLM streaming: same "reasoning" fallback for delta chunks.
+                raw_delta = message_delta.model_dump()
+                thinking_chunk = raw_delta.get("reasoning")
+                if thinking_chunk is not None:
+                    mot._thinking += thinking_chunk
 
             content_chunk = message_delta.content
             if content_chunk is not None:
diff --git a/test/backends/test_openai_unit.py b/test/backends/test_openai_unit.py
index 09524df8c..304b77171 100644
--- a/test/backends/test_openai_unit.py
+++ b/test/backends/test_openai_unit.py
@@ -5,9 +5,12 @@
 """
 
 import pytest
+from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage
+from openai.types.chat.chat_completion import Choice
 
 from mellea.backends import ModelOption
 from mellea.backends.openai import OpenAIBackend
+from mellea.core.base import ModelOutputThunk
 
 
 def _make_backend(model_options: dict | None = None) -> OpenAIBackend:
@@ -168,5 +171,121 @@ def test_make_backend_specific_unknown_mellea_keys_removed(backend):
     assert ModelOption.SYSTEM_PROMPT not in result
 
 
+# --- processing(): reasoning / thinking trace extraction ---
+
+
+def _vllm_chat_completion(reasoning: str, content: str | None) -> ChatCompletion:
+    """Build a ChatCompletion that matches vLLM's thinking-model response shape."""
+    message = ChatCompletionMessage.model_validate(
+        {"role": "assistant", "content": content, "reasoning": reasoning}
+    )
+    return ChatCompletion(
+        id="vllm-test",
+        created=0,
+        model="qwen3",
+        object="chat.completion",
+        choices=[Choice(index=0, finish_reason="stop", message=message)],
+    )
+
+
+async def test_processing_captures_vllm_reasoning_field(backend):
+    """Non-streaming: mot._thinking captures the raw ``reasoning`` key from vLLM."""
+    mot: ModelOutputThunk = ModelOutputThunk(value=None)
+    chunk = _vllm_chat_completion(reasoning="2 + 2 equals 4.", content="4")
+    # Sanity check: the SDK object does not expose reasoning_content
+    assert not hasattr(chunk.choices[0].message, "reasoning_content")
+
+    await backend.processing(mot, chunk)
+
+    assert mot._thinking == "2 + 2 equals 4."
+    assert mot._underlying_value == "4"
+
+
+async def test_processing_vllm_reasoning_with_null_content(backend):
+    """Non-streaming: reasoning is captured even when ``content`` is null."""
+    mot: ModelOutputThunk = ModelOutputThunk(value=None)
+    chunk = _vllm_chat_completion(reasoning="some thinking", content=None)
+
+    await backend.processing(mot, chunk)
+
+    assert mot._thinking == "some thinking"
+    assert mot._underlying_value == ""
+
+
+async def test_processing_streaming_captures_vllm_reasoning_field(backend):
+    """Streaming: per-chunk ``reasoning`` deltas accumulate into mot._thinking."""
+    mot: ModelOutputThunk = ModelOutputThunk(value=None)
+    chunk_a = ChatCompletionChunk.model_validate(
+        {
+            "id": "vllm-stream",
+            "created": 0,
+            "model": "qwen3",
+            "object": "chat.completion.chunk",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "role": "assistant",
+                        "content": None,
+                        "reasoning": "first ",
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        }
+    )
+    chunk_b = ChatCompletionChunk.model_validate(
+        {
+            "id": "vllm-stream",
+            "created": 0,
+            "model": "qwen3",
+            "object": "chat.completion.chunk",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": "ans", "reasoning": "second"},
+                    "finish_reason": None,
+                }
+            ],
+        }
+    )
+
+    await backend.processing(mot, chunk_a)
+    await backend.processing(mot, chunk_b)
+
+    assert mot._thinking == "first second"
+    assert mot._underlying_value == "ans"
+
+
+async def test_processing_reasoning_content_still_used(backend):
+    """Regression guard: the pre-existing ``reasoning_content`` path is preserved.
+
+    Some providers surface the trace as ``reasoning_content`` on the message
+    object itself. The fix must not regress that path in favour of the raw-dict
+    fallback.
+    """
+    message = ChatCompletionMessage.model_validate(
+        {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning_content": "attribute-style trace",
+        }
+    )
+    chunk = ChatCompletion(
+        id="rc-test",
+        created=0,
+        model="fake",
+        object="chat.completion",
+        choices=[Choice(index=0, finish_reason="stop", message=message)],
+    )
+    assert hasattr(chunk.choices[0].message, "reasoning_content")
+
+    mot: ModelOutputThunk = ModelOutputThunk(value=None)
+    await backend.processing(mot, chunk)
+
+    assert mot._thinking == "attribute-style trace"
+    assert mot._underlying_value == "answer"
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From 0326cfea91b6b8a707c86fe5adf88c3b08553847 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 13 May 2026 12:08:47 +0100
Subject: [PATCH 2/2] fix(backends): replace hasattr guard with
 getattr+model_extra for reasoning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the `hasattr(message, "reasoning_content") / else model_dump()`
pattern with a unified value-based probe:

    thinking_chunk = getattr(obj, "reasoning_content", None)
    if thinking_chunk is None:
        thinking_chunk = (obj.model_extra or {}).get("reasoning")

This closes two gaps identified in code review:

1. Edge case: if a proxy sends `{"reasoning_content": null, "reasoning":
   "trace"}`, the old hasattr guard was True, the None check
   short-circuited, and the else branch never fired — silently dropping
   the trace. The value-based check falls through correctly.

2. Performance: the old else branch called model_dump() on every streaming
   delta for non-thinking models (gpt-4o etc.), allocating a full dict per
   token. model_extra is an already-built dict (or None, hence the
   `or {}` guard), so the probe is an O(1) lookup with no serialisation.

Also adds test_processing_reasoning_content_takes_precedence_over_reasoning
to pin that reasoning_content wins when both fields are present on the same
message object.

Closes #1061

Assisted-by: Claude Code
---
 mellea/backends/openai.py         | 32 ++++++++++++-------------------
 test/backends/test_openai_unit.py | 24 +++++++++++++++++++++++
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index f8b639c6f..63a0fc2b1 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -991,16 +991,13 @@ async def processing(
         if isinstance(chunk, ChatCompletion):
             message = chunk.choices[0].message
 
-            if hasattr(message, "reasoning_content"):
-                thinking_chunk = message.reasoning_content  # type: ignore
-                if thinking_chunk is not None:
-                    mot._thinking += thinking_chunk
-            else:
-                # vLLM uses "reasoning" in the raw dict rather than a reasoning_content attribute.
-                raw_message = message.model_dump()
-                thinking_chunk = raw_message.get("reasoning")
-                if thinking_chunk is not None:
-                    mot._thinking += thinking_chunk
+            # reasoning_content (Anthropic/DeepSeek attribute path) takes priority; if absent
+            # or None, fall back to the "reasoning" extra field (vLLM and compatible servers).
+            thinking_chunk = getattr(message, "reasoning_content", None)
+            if thinking_chunk is None:
+                thinking_chunk = (message.model_extra or {}).get("reasoning")
+            if thinking_chunk is not None:
+                mot._thinking += thinking_chunk
 
             content_chunk = message.content
             if content_chunk is not None:
@@ -1021,16 +1018,11 @@ async def processing(
                 return
 
             message_delta = chunk.choices[0].delta
-            if hasattr(message_delta, "reasoning_content"):
-                thinking_chunk = message_delta.reasoning_content  # type: ignore
-                if thinking_chunk is not None:
-                    mot._thinking += thinking_chunk
-            else:
-                # vLLM streaming: same "reasoning" fallback for delta chunks.
-                raw_delta = message_delta.model_dump()
-                thinking_chunk = raw_delta.get("reasoning")
-                if thinking_chunk is not None:
-                    mot._thinking += thinking_chunk
+            thinking_chunk = getattr(message_delta, "reasoning_content", None)
+            if thinking_chunk is None:
+                thinking_chunk = (message_delta.model_extra or {}).get("reasoning")
+            if thinking_chunk is not None:
+                mot._thinking += thinking_chunk
 
             content_chunk = message_delta.content
             if content_chunk is not None:
diff --git a/test/backends/test_openai_unit.py b/test/backends/test_openai_unit.py
index 304b77171..77a0745a4 100644
--- a/test/backends/test_openai_unit.py
+++ b/test/backends/test_openai_unit.py
@@ -287,5 +287,58 @@ async def test_processing_reasoning_content_still_used(backend):
     assert mot._underlying_value == "answer"
 
 
+async def test_processing_reasoning_content_takes_precedence_over_reasoning(backend):
+    """reasoning_content attribute wins when both it and raw ``reasoning`` are present."""
+    message = ChatCompletionMessage.model_validate(
+        {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning_content": "attr-trace",
+            "reasoning": "raw-trace",
+        }
+    )
+    chunk = ChatCompletion(
+        id="prec-test",
+        created=0,
+        model="fake",
+        object="chat.completion",
+        choices=[Choice(index=0, finish_reason="stop", message=message)],
+    )
+    mot: ModelOutputThunk = ModelOutputThunk(value=None)
+    await backend.processing(mot, chunk)
+
+    assert mot._thinking == "attr-trace"
+    assert mot._underlying_value == "answer"
+
+
+async def test_processing_null_reasoning_content_falls_back_to_reasoning(backend):
+    """A null ``reasoning_content`` must not mask the raw ``reasoning`` trace.
+
+    Pins the proxy shape ``{"reasoning_content": null, "reasoning": "trace"}``:
+    the probe is value-based, so a present-but-None ``reasoning_content`` falls
+    through to the ``reasoning`` extra field instead of dropping the trace.
+    """
+    message = ChatCompletionMessage.model_validate(
+        {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning_content": None,
+            "reasoning": "raw-trace",
+        }
+    )
+    chunk = ChatCompletion(
+        id="null-rc-test",
+        created=0,
+        model="fake",
+        object="chat.completion",
+        choices=[Choice(index=0, finish_reason="stop", message=message)],
+    )
+    mot: ModelOutputThunk = ModelOutputThunk(value=None)
+    await backend.processing(mot, chunk)
+
+    assert mot._thinking == "raw-trace"
+    assert mot._underlying_value == "answer"
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])