From 3e8d665bc60de1da1d28f562fc790a0aa9b2ec00 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Thu, 7 May 2026 14:55:51 +0100
Subject: [PATCH 01/10] feat(telemetry): add is_content_tracing_enabled and
 add_span_event helpers

Add MELLEA_TRACE_CONTENT env-var gate (also recognises the standard
OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT) and expose
add_span_event() as a safe no-op wrapper. Both exported from
mellea.telemetry and mellea.telemetry.tracing.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/telemetry/__init__.py |  4 ++++
 mellea/telemetry/tracing.py  | 37 ++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/mellea/telemetry/__init__.py b/mellea/telemetry/__init__.py
index ea9351252..6fb433a6c 100644
--- a/mellea/telemetry/__init__.py
+++ b/mellea/telemetry/__init__.py
@@ -92,9 +92,11 @@ def my_function():
 )
 from .pricing import is_pricing_enabled
 from .tracing import (
+    add_span_event,
     end_backend_span,
     is_application_tracing_enabled,
     is_backend_tracing_enabled,
+    is_content_tracing_enabled,
     set_span_attribute,
     set_span_error,
     start_backend_span,
@@ -104,6 +106,7 @@ def my_function():
 
 __all__ = [
     "MelleaContextFilter",
+    "add_span_event",
     "async_with_context",
     "create_counter",
     "create_histogram",
@@ -118,6 +121,7 @@ def my_function():
     "get_session_id",
     "is_application_tracing_enabled",
     "is_backend_tracing_enabled",
+    "is_content_tracing_enabled",
     "is_metrics_enabled",
     "is_pricing_enabled",
     "record_cost",
diff --git a/mellea/telemetry/tracing.py b/mellea/telemetry/tracing.py
index 12b7b486f..48a3bf5e6 100644
--- a/mellea/telemetry/tracing.py
+++ b/mellea/telemetry/tracing.py
@@ -10,6 +10,9 @@
 Configuration via environment variables:
 - MELLEA_TRACE_APPLICATION: Enable/disable application tracing (default: false)
 - MELLEA_TRACE_BACKEND: Enable/disable backend tracing (default: false)
+- MELLEA_TRACE_CONTENT: Capture prompt/response content in spans (default: false).
+  Content may include PII — enable only in controlled environments.
+  Also recognised: OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT (OTel standard).
 - OTEL_EXPORTER_OTLP_ENDPOINT: OTLP endpoint for trace export
 - OTEL_SERVICE_NAME: Service name for traces (default: mellea)
 """
@@ -42,6 +45,11 @@
 _TRACE_BACKEND_ENABLED = _OTEL_AVAILABLE and os.getenv(
     "MELLEA_TRACE_BACKEND", "false"
 ).lower() in ("true", "1", "yes")
+_TRACE_CONTENT_ENABLED = _OTEL_AVAILABLE and (
+    os.getenv("MELLEA_TRACE_CONTENT", "false").lower() in ("true", "1", "yes")
+    or os.getenv("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "false").lower()
+    in ("true", "1", "yes")
+)
 _OTLP_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
 _SERVICE_NAME = os.getenv("OTEL_SERVICE_NAME", "mellea")
 _CONSOLE_EXPORT = os.getenv("MELLEA_TRACE_CONSOLE", "false").lower() in (
@@ -113,6 +121,33 @@ def is_backend_tracing_enabled() -> bool:
     return _TRACE_BACKEND_ENABLED
 
 
+def is_content_tracing_enabled() -> bool:
+    """Check if content capture is enabled.
+
+    Content capture records prompt and response text on spans and may contain PII.
+    Enable only in controlled environments.
+
+    Returns:
+        True if enabled via ``MELLEA_TRACE_CONTENT`` or
+        ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT``.
+    """
+    return _TRACE_CONTENT_ENABLED
+
+
+def add_span_event(
+    span: Any, name: str, attributes: dict[str, Any] | None = None
+) -> None:
+    """Add a named event to a span; no-op if span is None or OTel is unavailable.
+
+    Args:
+        span: The span object (may be None if tracing is disabled).
+        name: Event name.
+        attributes: Optional event attributes; ``None`` is sent as an empty dict.
+    """
+    if span is not None and _OTEL_AVAILABLE:
+        span.add_event(name, attributes=attributes or {})
+
+
 @contextmanager
 def trace_application(name: str, **attributes: Any) -> Generator[Any, None, None]:
     """Create an application trace span if application tracing is enabled.
@@ -246,9 +281,11 @@ def set_span_error(span: Any, exception: Exception) -> None:
 
 
 __all__ = [
+    "add_span_event",
     "end_backend_span",
     "is_application_tracing_enabled",
     "is_backend_tracing_enabled",
+    "is_content_tracing_enabled",
     "set_span_attribute",
     "set_span_error",
     "start_backend_span",

From 30c36864ccf411573cc492ffb4e08557b54b586b Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Thu, 7 May 2026 14:57:04 +0100
Subject: [PATCH 02/10] feat(telemetry): surface gen_ai.provider.name,
 conversation.id, template attrs

Five OTel GenAI semconv gaps closed (issue #1035):

1. gen_ai.provider.name emitted alongside legacy gen_ai.system (semconv
   v1.37.0 migration; keep both for dashboard back-compat).

2. gen_ai.conversation.id mapped from existing session_id ContextVar; the
   existing mellea.session_id attribute is preserved alongside it.

3. llm.prompt_template.template emitted unconditionally from Instruction
   and GenerativeStub; llm.prompt_template.variables gated behind
   MELLEA_TRACE_CONTENT (user data).

4. error.type (Stable OTel) set on the error path in the new
   finalize_backend_span() helper alongside set_span_error().
   finalize_backend_span() replaces the three-line record_token_usage +
   record_response_metadata + end_backend_span pattern in each backend.

5. gen_ai.input.messages, gen_ai.output.messages, gen_ai.system_instructions
   emitted as structured JSON (spec v1.37.0 schema) when MELLEA_TRACE_CONTENT
   is enabled. No deprecated per-role events (gen_ai.user.message etc.) are
   emitted. A gen_ai.client.inference.operation.details span event is added
   as a marker for log-oriented receivers.

Also adds gen_ai.request.temperature/top_p/top_k/frequency_penalty/
presence_penalty from model_options, and cache/reasoning token attrs in
record_token_usage().

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/telemetry/backend_instrumentation.py | 251 +++++++++++++++++++-
 1 file changed, 249 insertions(+), 2 deletions(-)

diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py
index 737da43ab..939fe2d6a 100644
--- a/mellea/telemetry/backend_instrumentation.py
+++ b/mellea/telemetry/backend_instrumentation.py
@@ -2,12 +2,24 @@
 
 Follows OpenTelemetry Gen-AI semantic conventions:
 https://opentelemetry.io/docs/specs/semconv/gen-ai/
+
+Content capture (``gen_ai.input.messages``, ``gen_ai.output.messages``,
+``gen_ai.system_instructions``) is opt-in and gated by ``is_content_tracing_enabled()``.
+These attributes may contain PII — enable only in controlled environments.
 """
 
+import json
 from typing import Any
 
 from ..backends.utils import get_value
-from .tracing import set_span_attribute, trace_backend
+from .tracing import (
+    add_span_event,
+    end_backend_span,
+    is_content_tracing_enabled,
+    set_span_attribute,
+    set_span_error,
+    trace_backend,
+)
 
 
 def get_model_id_str(backend: Any) -> str:
@@ -30,6 +42,9 @@ def get_model_id_str(backend: Any) -> str:
 def get_system_name(backend: Any) -> str:
     """Get the Gen-AI system name from backend.
 
+    Kept for back-compatibility with existing dashboards keyed on ``gen_ai.system``.
+    New code should prefer ``get_provider_name()``.
+
     Args:
         backend: Backend instance
 
@@ -51,6 +66,21 @@ def get_system_name(backend: Any) -> str:
         return backend.__class__.__name__
 
 
+def get_provider_name(backend: Any) -> str:
+    """Get the Gen-AI provider name from backend.
+
+    Returns the value for ``gen_ai.provider.name`` (semconv v1.37.0+), which
+    supersedes the deprecated ``gen_ai.system`` attribute.
+
+    Args:
+        backend: Backend instance
+
+    Returns:
+        Provider name (e.g., 'openai', 'ollama', 'huggingface')
+    """
+    return get_system_name(backend)
+
+
 def get_context_size(ctx: Any) -> int:
     """Get the size of a context.
 
@@ -95,6 +125,7 @@ def instrument_generate_from_context(
         **{
             # Gen-AI semantic convention attributes
             "gen_ai.system": system_name,
+            "gen_ai.provider.name": system_name,
             "gen_ai.request.model": model_id,
             "gen_ai.operation.name": "chat",
             # Mellea-specific attributes
@@ -109,7 +140,13 @@ def instrument_generate_from_context(
 
 
 def start_generate_span(
-    backend: Any, action: Any, ctx: Any, format: Any = None, tool_calls: bool = False
+    backend: Any,
+    action: Any,
+    ctx: Any,
+    format: Any = None,
+    tool_calls: bool = False,
+    *,
+    model_options: dict | None = None,
 ):
     """Start a backend trace span for generate_from_context (without auto-closing).
 
@@ -122,6 +159,7 @@ def start_generate_span(
         ctx: Context
         format: Response format (BaseModel subclass or None)
         tool_calls: Whether tool calling is enabled
+        model_options: Raw model options dict for request-parameter attributes
 
     Returns:
         Span object or None if tracing is disabled
@@ -137,6 +175,7 @@ def start_generate_span(
     span_attrs: dict = {
         # Gen-AI semantic convention attributes
         "gen_ai.system": system_name,
+        "gen_ai.provider.name": system_name,
         "gen_ai.request.model": model_id,
         "gen_ai.operation.name": "chat",
         # Mellea-specific attributes
@@ -147,10 +186,39 @@ def start_generate_span(
         "mellea.format_type": format.__name__ if format else None,
         "mellea.tool_calls_enabled": tool_calls,
     }
+
     # Propagate telemetry context to span
     for key, value in telemetry_ctx.items():
         span_attrs[f"mellea.{key}"] = value
 
+    # gen_ai.conversation.id maps from the existing session_id ContextVar
+    session_id = telemetry_ctx.get("session_id")
+    if session_id is not None:
+        span_attrs["gen_ai.conversation.id"] = session_id
+
+    # Request parameters from model_options (plain-string keys only)
+    if model_options:
+        for mellea_key, otel_key in _REQUEST_PARAM_MAP.items():
+            val = model_options.get(mellea_key)
+            if val is not None:
+                span_attrs[otel_key] = val
+
+    # Prompt template attributes (duck-typed; works for Instruction and GenerativeStub)
+    tmpl = getattr(action, "prompt_template_metadata", None)
+    if callable(tmpl):
+        metadata: Any = tmpl()
+        if metadata is not None:
+            template_text, template_vars, template_version = metadata
+            if template_text:
+                span_attrs["llm.prompt_template.template"] = template_text
+            if template_version:
+                span_attrs["llm.prompt_template.version"] = template_version
+            # Variables contain user-provided values — only emit with content gate
+            if template_vars and is_content_tracing_enabled():
+                span_attrs["llm.prompt_template.variables"] = _serialize_json(
+                    template_vars
+                )
+
     return start_backend_span("chat", **span_attrs)
 
 
@@ -178,6 +246,7 @@ def instrument_generate_from_raw(
         **{
             # Gen-AI semantic convention attributes
             "gen_ai.system": system_name,
+            "gen_ai.provider.name": system_name,
             "gen_ai.request.model": model_id,
             "gen_ai.operation.name": "text_completion",
             # Mellea-specific attributes
@@ -214,6 +283,22 @@ def record_token_usage(span: Any, usage: Any) -> None:
         total_tokens = get_value(usage, "total_tokens")
         if total_tokens is not None:
             set_span_attribute(span, "gen_ai.usage.total_tokens", total_tokens)
+
+        cache_read = get_value(usage, "cache_read_input_tokens")
+        if cache_read is not None:
+            set_span_attribute(span, "gen_ai.usage.cache_read.input_tokens", cache_read)
+
+        cache_creation = get_value(usage, "cache_creation_input_tokens")
+        if cache_creation is not None:
+            set_span_attribute(
+                span, "gen_ai.usage.cache_creation.input_tokens", cache_creation
+            )
+
+        reasoning_tokens = get_value(usage, "reasoning_tokens")
+        if reasoning_tokens is not None:
+            set_span_attribute(
+                span, "gen_ai.usage.reasoning.output_tokens", reasoning_tokens
+            )
     except Exception:
         # Don't fail if we can't extract token usage
         pass
@@ -260,12 +345,174 @@ def record_response_metadata(
         pass
 
 
+def finalize_backend_span(
+    span: Any,
+    *,
+    response: Any = None,
+    usage: Any = None,
+    model_id: str | None = None,
+    error: Exception | None = None,
+    conversation: list[dict] | None = None,
+    output_text: str | None = None,
+    finish_reason: str | None = None,
+) -> None:
+    """Close a backend span, recording telemetry on both success and error paths.
+
+    On the error path, records the exception, sets ``error.type``, and marks
+    the span with ERROR status before closing.  On the success path, records
+    token usage, response metadata, and (when content capture is enabled)
+    structured input/output message attributes.
+
+    This replaces the three-line ``record_token_usage`` + ``record_response_metadata``
+    + ``end_backend_span`` pattern used in each backend's ``post_processing``.
+
+    Args:
+        span: The span to finalise (no-op when ``None``).
+        response: Raw backend response (for model id, finish reason, response id).
+        usage: Token usage object or dict.
+        model_id: Explicit model id override.
+        error: Exception to record on the error path.
+        conversation: The prompt conversation (``list[dict]`` with ``role``/``content``
+            keys).  Used for ``gen_ai.input.messages`` and
+            ``gen_ai.system_instructions`` when content capture is enabled.
+        output_text: The assistant's reply text.  Used for
+            ``gen_ai.output.messages`` when content capture is enabled.
+        finish_reason: Finish reason string (defaults to ``"stop"`` when omitted).
+    """
+    if span is None:
+        return
+
+    try:
+        try:
+            if error is not None:
+                set_span_error(span, error)
+                # error.type is a Stable OTel cross-signal attribute
+                set_span_attribute(span, "error.type", type(error).__name__)
+            else:
+                record_token_usage(span, usage)
+                record_response_metadata(span, response, model_id=model_id)
+
+                if is_content_tracing_enabled() and conversation is not None:
+                    _emit_content_attributes(
+                        span,
+                        conversation=conversation,
+                        output_text=output_text,
+                        finish_reason=finish_reason,
+                        response=response,
+                    )
+        except Exception:
+            # Telemetry helpers must never break application code.
+            pass
+    finally:
+        end_backend_span(span)
+
+
+# ---------------------------------------------------------------------------
+# Private helpers
+# ---------------------------------------------------------------------------
+
+# Mapping from Mellea/OpenAI plain-string model_options keys to OTel request attrs.
+_REQUEST_PARAM_MAP: dict[str, str] = {
+    "temperature": "gen_ai.request.temperature",
+    "top_p": "gen_ai.request.top_p",
+    "top_k": "gen_ai.request.top_k",
+    "frequency_penalty": "gen_ai.request.frequency_penalty",
+    "presence_penalty": "gen_ai.request.presence_penalty",
+}
+
+
+def _serialize_json(obj: Any) -> str:
+    """Serialise *obj* to a JSON string, coercing non-serialisable values to str."""
+    return json.dumps(obj, default=str, ensure_ascii=False)
+
+
+def _conversation_to_parts(conversation: list[dict]) -> tuple[list[dict], list[dict]]:
+    """Split a conversation into system instructions and input messages.
+
+    Args:
+        conversation: List of ``{"role": ..., "content": ...}`` dicts.
+
+    Returns:
+        Tuple of ``(system_parts, input_messages)`` in the spec JSON shape.
+        ``system_parts`` is a list of ``{"type": "text", "content": ...}`` items.
+        ``input_messages`` is a list of
+        ``{"role": ..., "parts": [{"type": "text", "content": ...}]}`` items.
+    """
+    system_parts: list[dict] = []
+    input_messages: list[dict] = []
+    for msg in conversation:
+        role = msg.get("role", "")
+        content = msg.get("content", "")
+        if role == "system":
+            system_parts.append({"type": "text", "content": str(content)})
+        else:
+            input_messages.append(
+                {"role": role, "parts": [{"type": "text", "content": str(content)}]}
+            )
+    return system_parts, input_messages
+
+
+def _emit_content_attributes(
+    span: Any,
+    *,
+    conversation: list[dict],
+    output_text: str | None,
+    finish_reason: str | None,
+    response: Any = None,
+) -> None:
+    """Set structured content attributes on the span (content gate must be checked by caller)."""
+    try:
+        system_parts, input_messages = _conversation_to_parts(conversation)
+
+        if system_parts:
+            set_span_attribute(
+                span, "gen_ai.system_instructions", _serialize_json(system_parts)
+            )
+        if input_messages:
+            set_span_attribute(
+                span, "gen_ai.input.messages", _serialize_json(input_messages)
+            )
+
+        # Attempt to derive output text from an OpenAI-format response if not provided
+        if output_text is None and response is not None:
+            try:
+                choices = get_value(response, "choices")
+                if choices:
+                    first = choices[0] if isinstance(choices, list) else choices
+                    msg = get_value(first, "message")
+                    if msg is not None:
+                        output_text = str(get_value(msg, "content") or "")
+            except Exception:
+                pass
+
+        if output_text is not None:
+            output_msg = [
+                {
+                    "role": "assistant",
+                    "parts": [{"type": "text", "content": output_text}],
+                    "finish_reason": finish_reason or "stop",
+                }
+            ]
+            set_span_attribute(
+                span, "gen_ai.output.messages", _serialize_json(output_msg)
+            )
+
+        # Marker event only (no attributes attached) for log-oriented receivers.
+        add_span_event(span, "gen_ai.client.inference.operation.details")
+    except Exception:
+        # Content capture is best-effort — never fail the span close
+        pass
+
+
 __all__ = [
+    "finalize_backend_span",
     "get_context_size",
     "get_model_id_str",
+    "get_provider_name",
     "get_system_name",
     "instrument_generate_from_context",
     "instrument_generate_from_raw",
     "record_response_metadata",
     "record_token_usage",
+    "start_generate_span",
 ]

From a0d507459a1be130cad0fd4fb49194a8e824f5f8 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Thu, 7 May 2026 14:57:16 +0100
Subject: [PATCH 03/10] feat(components): retain prompt template text and
 variables for telemetry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instruction: capture _template_description (raw string before Jinja
substitution) and _user_variables (copy) in __init__; expose via
prompt_template_metadata() returning (template, variables, version)|None.

GenerativeStub: capture f_kwargs on each call; expose via
prompt_template_metadata() using the function docstring as the template
and f_kwargs as the variables.

Neither change affects runtime behaviour — data is retained for
duck-typed use by start_generate_span().

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/stdlib/components/genstub.py     | 15 +++++++++++++++
 mellea/stdlib/components/instruction.py | 23 +++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py
index 05ca11088..94572b557 100644
--- a/mellea/stdlib/components/genstub.py
+++ b/mellea/stdlib/components/genstub.py
@@ -355,6 +355,7 @@ def __init__(self, func: Callable[P, R]):
 
         self._function = Function(func)
         self._arguments: Arguments | None = None
+        self._template_variables: dict = {}
         functools.update_wrapper(self, func)
 
         self._response_model = create_response_format(self._function._func)
@@ -520,6 +521,18 @@ def _parse(self, computed: ModelOutputThunk) -> R:
 
         return function_response.result
 
+    def prompt_template_metadata(self) -> tuple[str, dict, None] | None:
+        """Return prompt template metadata for telemetry.
+
+        Returns:
+            Tuple of ``(docstring, variables, version)`` when the function has
+            a docstring, otherwise ``None``.
+        """
+        docstring = self._function._function_dict.get("docstring")
+        if not docstring:
+            return None
+        return str(docstring), dict(self._template_variables), None
+
 
 class SyncGenerativeStub(GenerativeStub, Generic[P, R]):
     """A synchronous generative stub that blocks until the LLM response is ready.
@@ -587,6 +600,7 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R:
                 for r in extracted.precondition_requirements
             ]
 
+        stub_copy._template_variables = dict(extracted.f_kwargs)
         arguments = bind_function_arguments(self._function._func, **extracted.f_kwargs)
         if arguments:
             stub_args: list[Argument] = []
@@ -720,6 +734,7 @@ def __call__(self, *args, **kwargs) -> Coroutine[Any, Any, tuple[R, Context] | R
                 for r in extracted.precondition_requirements
             ]
 
+        stub_copy._template_variables = dict(extracted.f_kwargs)
         arguments = bind_function_arguments(self._function._func, **extracted.f_kwargs)
         if arguments:
             stub_args: list[Argument] = []
diff --git a/mellea/stdlib/components/instruction.py b/mellea/stdlib/components/instruction.py
index 30faaea20..b814b4bf3 100644
--- a/mellea/stdlib/components/instruction.py
+++ b/mellea/stdlib/components/instruction.py
@@ -63,6 +63,15 @@ def __init__(
         icl_examples = [] if icl_examples is None else icl_examples
         grounding_context = dict() if grounding_context is None else grounding_context
 
+        # Retain raw template before Jinja substitution for telemetry.
+        # Template text is the static prompt structure; variables may contain user data.
+        self._template_description: str | None = (
+            description if isinstance(description, str) else None
+        )
+        self._user_variables: dict[str, str] | None = (
+            dict(user_variables) if user_variables else None
+        )
+
         # Apply templates. All inputs must be strings if provided.
         if user_variables is not None:
             if description is not None:
@@ -189,6 +198,20 @@ def format_for_llm(self) -> TemplateRepresentation:
             template_order=["*", "Instruction"],
         )
 
+    def prompt_template_metadata(self) -> tuple[str, dict[str, str], None] | None:
+        """Return prompt template metadata for telemetry.
+
+        This method always returns the variables; it performs no content-capture
+        check, so callers must gate them (they may contain user data).
+
+        Returns:
+            Tuple of ``(template_text, variables, version)`` when a string
+            description was provided, otherwise ``None``.
+        """
+        if self._template_description is None:
+            return None
+        return self._template_description, dict(self._user_variables or {}), None
+
     @staticmethod
     def apply_user_dict_from_jinja(user_dict: dict[str, str], s: str) -> str:
         """Render a Jinja2 template string using the provided variable dictionary.

From e6c40e7ac254df44ff01d72d1c071ec7b06b0c39 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Thu, 7 May 2026 14:57:37 +0100
Subject: [PATCH 04/10] refactor(backends): wire finalize_backend_span into all
 five backends

Replace the duplicated record_token_usage + record_response_metadata +
end_backend_span pattern in each backend's post_processing() with a
single finalize_backend_span() call that also passes the conversation
and output text for content capture.

Pass model_options into start_generate_span() so request-parameter
attributes (temperature, top_p, etc.) are surfaced on the span.

The stream error path in core/base.py is also consolidated through
finalize_backend_span(error=...).

No behaviour change on the success path; error spans now carry
error.type and ERROR status instead of silently closing.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/backends/huggingface.py | 28 ++++++++++++++--------------
 mellea/backends/litellm.py     | 28 ++++++++++++++--------------
 mellea/backends/ollama.py      | 34 +++++++++++++++++++---------------
 mellea/backends/openai.py      | 26 ++++++++++++++------------
 mellea/backends/watsonx.py     | 28 ++++++++++++++--------------
 mellea/core/base.py            |  5 ++---
 6 files changed, 77 insertions(+), 72 deletions(-)

diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py
index 84507299e..f21e50c06 100644
--- a/mellea/backends/huggingface.py
+++ b/mellea/backends/huggingface.py
@@ -388,7 +388,12 @@ async def _generate_from_context(
                 and an updated context that includes ``action`` and the new output.
         """
         span = start_generate_span(
-            backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls
+            backend=self,
+            action=action,
+            ctx=ctx,
+            format=format,
+            tool_calls=tool_calls,
+            model_options=model_options,
         )
 
         with with_context(
@@ -1249,20 +1254,15 @@ class used during generation, if any.
 
         # Record tracing if span exists
         if span is not None:
-            from ..telemetry import end_backend_span
-            from ..telemetry.backend_instrumentation import (
-                record_response_metadata,
-                record_token_usage,
+            from ..telemetry.backend_instrumentation import finalize_backend_span
+
+            finalize_backend_span(
+                span,
+                usage=mot.generation.usage if mot.generation.usage else None,
+                model_id=self._get_hf_model_id(),
+                conversation=conversation,
+                output_text=str(mot.value) if mot.value is not None else None,
             )
-
-            if isinstance(hf_output, GenerateDecoderOnlyOutput):
-                record_response_metadata(span, hf_output)
-                if mot.generation.usage:
-                    record_token_usage(span, mot.generation.usage)
-
-            # Close the span now that async operation is complete
-            end_backend_span(span)
-            # Clean up span reference
             del mot._meta["_telemetry_span"]
 
         # When caching is disabled, clear hf_output from meta to free GPU memory.
diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py
index f7d912516..da98dec0e 100644
--- a/mellea/backends/litellm.py
+++ b/mellea/backends/litellm.py
@@ -164,7 +164,12 @@ async def _generate_from_context(
             "The Openai backend only supports chat-like contexts."
         )
         span = start_generate_span(
-            backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls
+            backend=self,
+            action=action,
+            ctx=ctx,
+            format=format,
+            tool_calls=tool_calls,
+            model_options=model_options,
         )
 
         _model_id_str = str(getattr(self, "model_id", "unknown"))
@@ -561,21 +566,16 @@ async def post_processing(
         # Record telemetry now that response is available
         span = mot._meta.get("_telemetry_span")
         if span is not None:
-            from ..telemetry import end_backend_span
-            from ..telemetry.backend_instrumentation import (
-                record_response_metadata,
-                record_token_usage,
-            )
+            from ..telemetry.backend_instrumentation import finalize_backend_span
 
             response = mot._meta.get("litellm_chat_response")
-            if response:
-                # LiteLLM responses have usage information
-                if usage:
-                    record_token_usage(span, usage)
-                record_response_metadata(span, response)
-            # Close the span now that async operation is complete
-            end_backend_span(span)
-            # Clean up the span reference
+            finalize_backend_span(
+                span,
+                response=response,
+                usage=usage,
+                model_id=str(self.model_id),
+                conversation=conversation,
+            )
             del mot._meta["_telemetry_span"]
 
     @staticmethod
diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
index 5b50cd709..651902319 100644
--- a/mellea/backends/ollama.py
+++ b/mellea/backends/ollama.py
@@ -289,7 +289,9 @@ async def _generate_from_context(
                 and an updated context that includes ``action`` and the new output.
         """
         # Start span without auto-closing (will be closed in post_processing)
-        span = start_generate_span(self, action, ctx, format, tool_calls)
+        span = start_generate_span(
+            self, action, ctx, format, tool_calls, model_options=model_options
+        )
 
         assert ctx.is_chat_context, (
             "The ollama backend only supports chat-like contexts."
@@ -720,21 +722,26 @@ async def post_processing(
         # Record telemetry and close span now that response is available
         span = mot._meta.get("_telemetry_span")
         if span is not None:
-            from ..telemetry import end_backend_span
-            from ..telemetry.backend_instrumentation import (
-                record_response_metadata,
-                record_token_usage,
+            from ..telemetry.backend_instrumentation import finalize_backend_span
+
+            output_text: str | None = None
+            if response is not None:
+                try:
+                    msg = getattr(response, "message", None)
+                    if msg is not None:
+                        output_text = str(getattr(msg, "content", "") or "")
+                except Exception:
+                    pass
+
+            # Forward the raw response so response metadata is still recorded,
+            # as the replaced record_response_metadata(span, response) call did.
+            finalize_backend_span(
+                span,
+                response=response,
+                usage=mot.generation.usage if mot.generation.usage else None,
+                conversation=conversation,
+                output_text=output_text,
             )
-
-            if response:
-                if mot.generation.usage:
-                    record_token_usage(span, mot.generation.usage)
-                record_response_metadata(span, response)
-
-            # Close the span now that telemetry is recorded
-            end_backend_span(span)
-
-            # Clean up the span reference
             del mot._meta["_telemetry_span"]
 
 
diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index 1eea93511..ace9c544e 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -467,7 +467,12 @@ async def _generate_from_context(
 
         # Start span without auto-closing (will be closed in post_processing)
         span = start_generate_span(
-            backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls
+            backend=self,
+            action=action,
+            ctx=ctx,
+            format=format,
+            tool_calls=tool_calls,
+            model_options=model_options,
         )
 
         _model_id_str = str(getattr(self, "model_id", "unknown"))
@@ -1122,18 +1127,15 @@ async def post_processing(
         # Record telemetry now that response is available
         span = mot._meta.get("_telemetry_span")
         if span is not None:
-            from ..telemetry import end_backend_span
-            from ..telemetry.backend_instrumentation import (
-                record_response_metadata,
-                record_token_usage,
+            from ..telemetry.backend_instrumentation import finalize_backend_span
+
+            finalize_backend_span(
+                span,
+                response=response,
+                usage=usage,
+                model_id=self._model_id,
+                conversation=conversation,
             )
-
-            if usage:
-                record_token_usage(span, usage)
-            record_response_metadata(span, response)
-            # Close the span now that async operation is complete
-            end_backend_span(span)
-            # Clean up the span reference
             del mot._meta["_telemetry_span"]
 
     @overload
diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py
index 87a0697d6..f0bcf80e2 100644
--- a/mellea/backends/watsonx.py
+++ b/mellea/backends/watsonx.py
@@ -303,7 +303,12 @@ async def _generate_from_context(
             "The watsonx.ai backend only supports chat-like contexts."
         )
         span = start_generate_span(
-            backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls
+            backend=self,
+            action=action,
+            ctx=ctx,
+            format=format,
+            tool_calls=tool_calls,
+            model_options=model_options,
         )
 
         _model_id_str = str(getattr(self, "model_id", "unknown"))
@@ -606,20 +611,15 @@ async def post_processing(
         # Record tracing if span exists
         span = mot._meta.get("_telemetry_span")
         if span is not None:
-            from ..telemetry import end_backend_span
-            from ..telemetry.backend_instrumentation import (
-                record_response_metadata,
-                record_token_usage,
+            from ..telemetry.backend_instrumentation import finalize_backend_span
+
+            finalize_backend_span(
+                span,
+                response=response,
+                usage=usage,
+                model_id=str(self._get_watsonx_model_id()),
+                conversation=conversation,
             )
-
-            if usage:
-                record_token_usage(span, usage)
-            if response is not None:
-                record_response_metadata(span, response)
-
-            # Close the span now that async operation is complete
-            end_backend_span(span)
-            # Clean up span reference
             del mot._meta["_telemetry_span"]
 
         # Generate the log for this ModelOutputThunk.
diff --git a/mellea/core/base.py b/mellea/core/base.py
index 5ab4aa935..3472bf4db 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -523,10 +523,9 @@ async def astream(self) -> str:
             # but we must not leak the span.
             span = self._meta.get("_telemetry_span")
             if span is not None:
-                from ..telemetry import end_backend_span, set_span_error
+                from ..telemetry.backend_instrumentation import finalize_backend_span
 
-                set_span_error(span, chunks[-1])
-                end_backend_span(span)
+                finalize_backend_span(span, error=chunks[-1])
                 del self._meta["_telemetry_span"]
 
             # Fire generation_error hook (FIRE_AND_FORGET — does not block the raise)

From ef992678e5f5d866f30680d7701d1b803fb4e92b Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Thu, 7 May 2026 14:57:52 +0100
Subject: [PATCH 05/10] test(telemetry): add unit tests and otelite example for
 semconv gaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

test/telemetry/test_genai_semconv_emission.py — 20 pure-unit tests
covering each of the five gaps (no live backend or OTel SDK required):
  - gen_ai.provider.name + gen_ai.system dual-emission
  - gen_ai.conversation.id from session_id ContextVar
  - llm.prompt_template.* from Instruction (always / gated)
  - error.type + ERROR status via finalize_backend_span
  - gen_ai.input/output.messages structured JSON (gated)
  - no deprecated per-role events emitted
  - finalize_backend_span robustness (None span, broken span)

docs/examples/telemetry/otel_genai_semconv_example.py — runnable
example for human verification against otelite, demonstrating all
five attributes and the error path.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 docs/examples/telemetry/README.md             |   4 +
 .../telemetry/otel_genai_semconv_example.py   | 132 ++++++
 test/telemetry/test_genai_semconv_emission.py | 378 ++++++++++++++++++
 3 files changed, 514 insertions(+)
 create mode 100644 docs/examples/telemetry/otel_genai_semconv_example.py
 create mode 100644 test/telemetry/test_genai_semconv_emission.py

diff --git a/docs/examples/telemetry/README.md b/docs/examples/telemetry/README.md
index fc79b1b6f..9458e9db0 100644
--- a/docs/examples/telemetry/README.md
+++ b/docs/examples/telemetry/README.md
@@ -6,6 +6,10 @@ This directory contains examples demonstrating OpenTelemetry tracing and metrics
 
 - **`telemetry_example.py`** - Demonstrates distributed tracing (application and backend traces)
 - **`metrics_example.py`** - Demonstrates token usage metrics collection
+- **`otel_genai_semconv_example.py`** - Exercises the OTel GenAI semantic convention attributes
+  added in issue #1035 (`gen_ai.provider.name`, `gen_ai.conversation.id`,
+  `llm.prompt_template.*`, `error.type`, content capture). Designed for human
+  verification against [otelite](https://github.com/planetf1/otelite).
 
 ## Quick Start
 
diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py
new file mode 100644
index 000000000..9279ad98e
--- /dev/null
+++ b/docs/examples/telemetry/otel_genai_semconv_example.py
@@ -0,0 +1,132 @@
+# pytest: ollama, e2e
+
+"""Example demonstrating OTel GenAI semantic convention attributes (issue #1035).
+
+Exercises the five emission-gap fixes added in this issue so they can be verified
+in otelite or any OTel-compatible backend:
+
+  gen_ai.provider.name      — provider identity (alongside legacy gen_ai.system)
+  gen_ai.conversation.id    — mapped from session_id ContextVar
+  llm.prompt_template.*     — template text (always) and variables (opt-in)
+  error.type                — set on the error path alongside ERROR status
+  gen_ai.input/output.messages — structured content (opt-in via MELLEA_TRACE_CONTENT)
+
+Run against otelite for human verification:
+
+  # Terminal 1 — start otelite (OTLP gRPC :4317, UI :8080)
+  docker run --rm -p 4317:4317 -p 8080:8080 ghcr.io/planetf1/otelite:latest
+
+  # Terminal 2 — run with all attributes visible
+  export MELLEA_TRACE_BACKEND=1
+  export MELLEA_TRACE_CONTENT=1
+  export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+  export OTEL_SERVICE_NAME=mellea-semconv-demo
+  python otel_genai_semconv_example.py
+
+  Then open http://localhost:8080 → select mellea-semconv-demo service.
+
+What to verify per span in otelite
+-----------------------------------
+  Span "chat"
+    gen_ai.system              = "ollama"     (back-compat)
+    gen_ai.provider.name       = "ollama"     (new, semconv v1.37.0)
+    gen_ai.conversation.id     = "demo-session-1"
+    mellea.session_id          = "demo-session-1"  (preserved)
+    llm.prompt_template.template = "Summarise {{topic}} in one sentence."
+    llm.prompt_template.variables = {"topic": "quantum tunnelling"}  (only with MELLEA_TRACE_CONTENT)
+    gen_ai.input.messages      = [...]        (only with MELLEA_TRACE_CONTENT)
+    gen_ai.output.messages     = [...]        (only with MELLEA_TRACE_CONTENT)
+
+  Span "chat" (error path)
+    error.type  = "OllamaRequestError" (or similar)
+    status      = ERROR
+"""
+
+from mellea import start_session
+from mellea.telemetry import (
+    is_backend_tracing_enabled,
+    is_content_tracing_enabled,
+    with_context,
+)
+
+
+def _section(title: str) -> None:
+    print(f"\n{'=' * 60}")
+    print(f"  {title}")
+    print("=" * 60)
+
+
+def main() -> None:
+    _section("Mellea OTel GenAI Semantic Convention Demo")
+    print(f"Backend tracing:  {is_backend_tracing_enabled()}")
+    print(f"Content capture:  {is_content_tracing_enabled()}")
+    if not is_backend_tracing_enabled():
+        print("\nSet MELLEA_TRACE_BACKEND=1 to enable backend spans.")
+
+    # -----------------------------------------------------------------------
+    # 1. Provider name + conversation id + prompt template attrs
+    # -----------------------------------------------------------------------
+    _section("1. Provider name / conversation id / template attrs")
+    print("Expected span attrs:")
+    print("  gen_ai.system              = 'ollama'")
+    print("  gen_ai.provider.name       = 'ollama'")
+    print("  gen_ai.conversation.id     = 'demo-session-1'")
+    print("  llm.prompt_template.template = 'Summarise {{topic}} in one sentence.'")
+
+    with with_context(session_id="demo-session-1"):
+        with start_session() as m:
+            result = m.instruct(
+                "Summarise {{topic}} in one sentence.",
+                user_variables={"topic": "quantum tunnelling"},
+            )
+    print(f"\nOutput: {str(result)[:120]}")
+
+    # -----------------------------------------------------------------------
+    # 2. Content capture (opt-in)
+    # -----------------------------------------------------------------------
+    if is_content_tracing_enabled():
+        _section("2. Content capture (MELLEA_TRACE_CONTENT=1)")
+        print("Expected span attrs:")
+        print("  gen_ai.system_instructions — serialised system turns")
+        print("  gen_ai.input.messages      — [{'role':'user','parts':[...]}]")
+        print(
+            "  gen_ai.output.messages     — [{'role':'assistant','parts':[...],'finish_reason':'stop'}]"
+        )
+        print("  llm.prompt_template.variables = {'name': 'Ada'}")
+
+        with start_session() as m2:
+            result2 = m2.instruct(
+                "Write a one-line greeting for {{name}}.",
+                user_variables={"name": "Ada"},
+            )
+        print(f"\nOutput: {str(result2)[:120]}")
+    else:
+        _section("2. Content capture (skipped — set MELLEA_TRACE_CONTENT=1)")
+
+    # -----------------------------------------------------------------------
+    # 3. Error path: error.type + ERROR status
+    # -----------------------------------------------------------------------
+    _section("3. Error path — error.type on span")
+    print("Expected span attrs:")
+    print("  status     = ERROR")
+    print("  error.type = <exception class name>")
+
+    try:
+        with start_session() as m3:
+            # Use a model name guaranteed to be absent on any Ollama instance.
+            m3._backend.model_id = "mellea-semconv-nonexistent-xyz"  # type: ignore[attr-defined]
+            m3.instruct("Hello")
+    except Exception as exc:
+        print(f"\nGot expected error: {exc.__class__.__name__}")
+    else:
+        print(
+            "\n(No error — check the span for error.type if the model unexpectedly exists)"
+        )
+
+    _section("Done")
+    print("If OTEL_EXPORTER_OTLP_ENDPOINT is set, check your trace backend.")
+    print("If MELLEA_TRACE_CONSOLE=1, spans were printed to stdout above.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py
new file mode 100644
index 000000000..a3b2c3d0e
--- /dev/null
+++ b/test/telemetry/test_genai_semconv_emission.py
@@ -0,0 +1,378 @@
+"""Unit tests for OTel GenAI semantic convention emission gaps (issue #1035).
+
+All tests use a fake span object and do not require a live backend or
+OpenTelemetry SDK installation.
+"""
+
+import json
+from unittest.mock import MagicMock, patch
+
+from mellea.telemetry.backend_instrumentation import (
+    finalize_backend_span,
+    get_provider_name,
+    get_system_name,
+    start_generate_span,
+)
+from mellea.telemetry.context import with_context
+from mellea.telemetry.tracing import is_content_tracing_enabled
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _mock_span() -> MagicMock:
+    return MagicMock()
+
+
+def _fake_backend(class_name: str) -> object:
+    return type(class_name, (), {})()
+
+
+def _span_attrs(span: MagicMock) -> dict:
+    """Collect all set_attribute calls into a flat dict."""
+    return {call.args[0]: call.args[1] for call in span.set_attribute.call_args_list}
+
+
+# ---------------------------------------------------------------------------
+# gen_ai.provider.name alongside gen_ai.system
+# ---------------------------------------------------------------------------
+
+
+def test_provider_name_equals_system_name():
+    backend = _fake_backend("OpenAIBackend")
+    assert get_provider_name(backend) == get_system_name(backend) == "openai"
+
+
+def test_provider_name_emitted_in_start_generate_span():
+    """Both gen_ai.system and gen_ai.provider.name should be set on the span."""
+    backend = _fake_backend("OpenAIBackend")
+    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
+    action = MagicMock()
+    action.prompt_template_metadata = None
+
+    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
+        mock_start.return_value = _mock_span()
+        start_generate_span(backend, action, ctx=[], format=None, tool_calls=False)
+
+    call_kwargs = mock_start.call_args[1]
+    assert call_kwargs.get("gen_ai.system") == "openai"
+    assert call_kwargs.get("gen_ai.provider.name") == "openai"
+
+
+# ---------------------------------------------------------------------------
+# gen_ai.conversation.id from session_id ContextVar
+# ---------------------------------------------------------------------------
+
+
+def test_conversation_id_emitted_from_session_id():
+    backend = _fake_backend("OpenAIBackend")
+    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
+    action = MagicMock()
+    action.prompt_template_metadata = None
+
+    with with_context(session_id="sess-abc"):
+        with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
+            mock_start.return_value = _mock_span()
+            start_generate_span(backend, action, ctx=[], format=None, tool_calls=False)
+
+    call_kwargs = mock_start.call_args[1]
+    assert call_kwargs.get("gen_ai.conversation.id") == "sess-abc"
+    assert call_kwargs.get("mellea.session_id") == "sess-abc"
+
+
+def test_conversation_id_absent_when_no_session():
+    backend = _fake_backend("OpenAIBackend")
+    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
+    action = MagicMock()
+    action.prompt_template_metadata = None
+
+    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
+        mock_start.return_value = _mock_span()
+        start_generate_span(backend, action, ctx=[], format=None, tool_calls=False)
+
+    call_kwargs = mock_start.call_args[1]
+    assert "gen_ai.conversation.id" not in call_kwargs
+
+
+# ---------------------------------------------------------------------------
+# llm.prompt_template.* from Instruction
+# ---------------------------------------------------------------------------
+
+
+def test_prompt_template_attrs_from_instruction():
+    from mellea.stdlib.components.instruction import Instruction
+
+    instr = Instruction(
+        description="Summarise {{topic}} in one sentence.",
+        user_variables={"topic": "quantum tunnelling"},
+    )
+
+    backend = _fake_backend("OpenAIBackend")
+    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
+
+    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
+        mock_start.return_value = _mock_span()
+        start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
+
+    call_kwargs = mock_start.call_args[1]
+    # Template text is always emitted
+    assert call_kwargs.get("llm.prompt_template.template") == (
+        "Summarise {{topic}} in one sentence."
+    )
+    # Variables are NOT emitted when content capture is off (default)
+    assert "llm.prompt_template.variables" not in call_kwargs
+
+
+def test_prompt_template_variables_emitted_when_content_enabled(monkeypatch):
+    from mellea.stdlib.components.instruction import Instruction
+
+    instr = Instruction(description="Hello {{name}}", user_variables={"name": "World"})
+
+    backend = _fake_backend("OpenAIBackend")
+    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
+
+    # Patch the content gate to True
+    monkeypatch.setattr(
+        "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled",
+        lambda: True,
+    )
+
+    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
+        mock_start.return_value = _mock_span()
+        start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
+
+    call_kwargs = mock_start.call_args[1]
+    variables_json = call_kwargs.get("llm.prompt_template.variables")
+    assert variables_json is not None
+    parsed = json.loads(variables_json)
+    assert parsed == {"name": "World"}
+
+
+def test_instruction_without_user_variables_emits_template():
+    from mellea.stdlib.components.instruction import Instruction
+
+    instr = Instruction(description="Tell me about {{topic}}")
+    # No user_variables — template is retained as-is
+
+    backend = _fake_backend("OpenAIBackend")
+    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
+
+    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
+        mock_start.return_value = _mock_span()
+        start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
+
+    call_kwargs = mock_start.call_args[1]
+    assert call_kwargs.get("llm.prompt_template.template") == "Tell me about {{topic}}"
+
+
+def test_instruction_with_no_description_emits_no_template():
+    from mellea.stdlib.components.instruction import Instruction
+
+    instr = Instruction()  # no description
+
+    backend = _fake_backend("OpenAIBackend")
+    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
+
+    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
+        mock_start.return_value = _mock_span()
+        start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
+
+    call_kwargs = mock_start.call_args[1]
+    assert "llm.prompt_template.template" not in call_kwargs
+
+
+# ---------------------------------------------------------------------------
+# ERROR span status + error.type (finalize_backend_span error path)
+# ---------------------------------------------------------------------------
+
+
+def test_error_sets_status_and_error_type():
+    span = _mock_span()
+    exc = RuntimeError("model rejected")
+
+    with (
+        patch(
+            "mellea.telemetry.backend_instrumentation.set_span_error"
+        ) as mock_set_err,
+        patch("mellea.telemetry.backend_instrumentation.end_backend_span") as mock_end,
+    ):
+        finalize_backend_span(span, error=exc)
+
+    mock_set_err.assert_called_once_with(span, exc)
+    attrs = _span_attrs(span)
+    assert attrs.get("error.type") == "RuntimeError"
+    mock_end.assert_called_once_with(span)
+
+
+def test_error_path_always_closes_span():
+    span = _mock_span()
+    with patch("mellea.telemetry.backend_instrumentation.set_span_error"):
+        with patch(
+            "mellea.telemetry.backend_instrumentation.end_backend_span"
+        ) as mock_end:
+            finalize_backend_span(span, error=ValueError("x"))
+    mock_end.assert_called_once()
+
+
+def test_finalize_never_raises_on_span_error(monkeypatch):
+    """finalize_backend_span must not propagate exceptions from helpers."""
+    span = _mock_span()
+    span.set_attribute.side_effect = RuntimeError("span broke")
+
+    with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
+        with patch("mellea.telemetry.backend_instrumentation.set_span_error"):
+            # Should not raise even though set_attribute raises
+            finalize_backend_span(span, error=ValueError("test"))
+
+
+def test_finalize_none_span_is_noop():
+    finalize_backend_span(None, error=RuntimeError("x"))  # no exception
+
+
+# ---------------------------------------------------------------------------
+# Content capture (gen_ai.input.messages etc.) gated by MELLEA_TRACE_CONTENT
+# ---------------------------------------------------------------------------
+
+
+def test_content_capture_disabled_by_default():
+    span = _mock_span()
+    conversation = [
+        {"role": "system", "content": "You are helpful."},
+        {"role": "user", "content": "Hello"},
+    ]
+    with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
+        finalize_backend_span(span, conversation=conversation, output_text="Hi there")
+
+    attrs = _span_attrs(span)
+    assert "gen_ai.input.messages" not in attrs
+    assert "gen_ai.output.messages" not in attrs
+    assert "gen_ai.system_instructions" not in attrs
+
+
+def test_content_capture_emits_structured_attributes(monkeypatch):
+    monkeypatch.setattr(
+        "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled",
+        lambda: True,
+    )
+    span = _mock_span()
+    conversation = [
+        {"role": "system", "content": "You are helpful."},
+        {"role": "user", "content": "Tell me a joke."},
+    ]
+    with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
+        with patch("mellea.telemetry.backend_instrumentation.add_span_event"):
+            finalize_backend_span(
+                span,
+                conversation=conversation,
+                output_text="Why did the chicken cross the road?",
+            )
+
+    attrs = _span_attrs(span)
+
+    # System instructions
+    sys_json = attrs.get("gen_ai.system_instructions")
+    assert sys_json is not None
+    sys_parts = json.loads(sys_json)
+    assert sys_parts == [{"type": "text", "content": "You are helpful."}]
+
+    # Input messages (non-system)
+    in_json = attrs.get("gen_ai.input.messages")
+    assert in_json is not None
+    in_msgs = json.loads(in_json)
+    assert len(in_msgs) == 1
+    assert in_msgs[0]["role"] == "user"
+    assert in_msgs[0]["parts"] == [{"type": "text", "content": "Tell me a joke."}]
+
+    # Output messages
+    out_json = attrs.get("gen_ai.output.messages")
+    assert out_json is not None
+    out_msgs = json.loads(out_json)
+    assert out_msgs[0]["role"] == "assistant"
+    assert out_msgs[0]["parts"][0]["content"] == "Why did the chicken cross the road?"
+    assert "finish_reason" in out_msgs[0]
+
+
+def test_content_capture_no_deprecated_per_role_events(monkeypatch):
+    """The deprecated gen_ai.user.message / gen_ai.assistant.message events must not be emitted."""
+    monkeypatch.setattr(
+        "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled",
+        lambda: True,
+    )
+    span = _mock_span()
+    with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
+        finalize_backend_span(
+            span, conversation=[{"role": "user", "content": "hi"}], output_text="hello"
+        )
+
+    event_names = [call.args[0] for call in span.add_event.call_args_list]
+    deprecated = {
+        "gen_ai.user.message",
+        "gen_ai.assistant.message",
+        "gen_ai.system.message",
+    }
+    assert not deprecated.intersection(event_names)
+
+
+def test_content_span_event_emitted(monkeypatch):
+    monkeypatch.setattr(
+        "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled",
+        lambda: True,
+    )
+    span = _mock_span()
+    with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
+        with patch(
+            "mellea.telemetry.backend_instrumentation.add_span_event"
+        ) as mock_event:
+            finalize_backend_span(
+                span,
+                conversation=[{"role": "user", "content": "hi"}],
+                output_text="hello",
+            )
+    event_names = [call.args[1] for call in mock_event.call_args_list]
+    assert "gen_ai.client.inference.operation.details" in event_names
+
+
+# ---------------------------------------------------------------------------
+# _TRACE_CONTENT_ENABLED recognises OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT
+# ---------------------------------------------------------------------------
+
+
+def test_content_tracing_enabled_via_mellea_env(monkeypatch):
+    monkeypatch.setenv("MELLEA_TRACE_CONTENT", "true")
+    import mellea.telemetry.tracing as tracing_mod
+
+    # Force re-evaluation of module-level constant
+    with patch.object(tracing_mod, "_TRACE_CONTENT_ENABLED", True):
+        assert tracing_mod.is_content_tracing_enabled()
+
+
+def test_content_tracing_disabled_by_default():
+    assert not is_content_tracing_enabled()
+
+
+# ---------------------------------------------------------------------------
+# Success path of finalize_backend_span calls record helpers
+# ---------------------------------------------------------------------------
+
+
+def test_success_path_calls_record_token_usage():
+    span = _mock_span()
+    usage = {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
+    with patch(
+        "mellea.telemetry.backend_instrumentation.record_token_usage"
+    ) as mock_rtu:
+        with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
+            finalize_backend_span(span, usage=usage)
+    mock_rtu.assert_called_once_with(span, usage)
+
+
+def test_success_path_calls_record_response_metadata():
+    span = _mock_span()
+    response = {"model": "gpt-4", "id": "resp-1"}
+    with patch(
+        "mellea.telemetry.backend_instrumentation.record_response_metadata"
+    ) as mock_rrm:
+        with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
+            finalize_backend_span(span, response=response, model_id="gpt-4")
+    mock_rrm.assert_called_once_with(span, response, model_id="gpt-4")

From d5db21f1c212711286853ed174a74774092cd4d0 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Thu, 7 May 2026 15:13:52 +0100
Subject: [PATCH 06/10] refactor: trim PR #1035 to gaps 1-4, defer content
 capture (gap 5)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove finalize_backend_span success-path consolidation and all content
capture helpers (_emit_content_attributes, _conversation_to_parts).
Revert all five backend files to upstream/main — gap 5 requires touching
every backend and is better reviewed in isolation.

finalize_backend_span is kept as an error-path-only helper (sets
error.type + ERROR status, then closes the span) used by the stream
error path in ModelOutputThunk.__aiter__.

Full implementation including gap 5 is preserved on cs/issue-1035-full.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 .../telemetry/otel_genai_semconv_example.py   |  67 ++-----
 mellea/backends/huggingface.py                |  28 +--
 mellea/backends/litellm.py                    |  28 +--
 mellea/backends/ollama.py                     |  34 ++--
 mellea/backends/openai.py                     |  26 ++-
 mellea/backends/watsonx.py                    |  28 +--
 mellea/telemetry/backend_instrumentation.py   | 145 ++------------
 test/telemetry/test_genai_semconv_emission.py | 177 ++----------------
 8 files changed, 116 insertions(+), 417 deletions(-)

diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py
index 9279ad98e..83a12aa82 100644
--- a/docs/examples/telemetry/otel_genai_semconv_example.py
+++ b/docs/examples/telemetry/otel_genai_semconv_example.py
@@ -2,25 +2,23 @@
 
 """Example demonstrating OTel GenAI semantic convention attributes (issue #1035).
 
-Exercises the five emission-gap fixes added in this issue so they can be verified
-in otelite or any OTel-compatible backend:
+Exercises gaps 1-4 so they can be verified in otelite or any OTel-compatible backend.
+Gap 5 (content capture) is deferred — see cs/issue-1035-full for that implementation.
 
   gen_ai.provider.name      — provider identity (alongside legacy gen_ai.system)
   gen_ai.conversation.id    — mapped from session_id ContextVar
   llm.prompt_template.*     — template text (always) and variables (opt-in)
   error.type                — set on the error path alongside ERROR status
-  gen_ai.input/output.messages — structured content (opt-in via MELLEA_TRACE_CONTENT)
 
 Run against otelite for human verification:
 
   # Terminal 1 — start otelite (OTLP gRPC :4317, UI :8080)
   docker run --rm -p 4317:4317 -p 8080:8080 ghcr.io/planetf1/otelite:latest
 
-  # Terminal 2 — run with all attributes visible
+  # Terminal 2
   export MELLEA_TRACE_BACKEND=1
-  export MELLEA_TRACE_CONTENT=1
   export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
-  export OTEL_SERVICE_NAME=mellea-semconv-demo
+  export OTEL_SERVICE_NAME=mellea-semconv-demo
   python otel_genai_semconv_example.py
 
   Then open http://localhost:8080 → select mellea-semconv-demo service.
@@ -33,9 +31,6 @@
     gen_ai.conversation.id     = "demo-session-1"
     mellea.session_id          = "demo-session-1"  (preserved)
     llm.prompt_template.template = "Summarise {{topic}} in one sentence."
-    llm.prompt_template.variables = {"topic": "quantum tunnelling"}  (only with MELLEA_TRACE_CONTENT)
-    gen_ai.input.messages      = [...]        (only with MELLEA_TRACE_CONTENT)
-    gen_ai.output.messages     = [...]        (only with MELLEA_TRACE_CONTENT)
 
   Span "chat" (error path)
     error.type  = "OllamaRequestError" (or similar)
@@ -43,30 +38,23 @@
 """
 
 from mellea import start_session
-from mellea.telemetry import (
-    is_backend_tracing_enabled,
-    is_content_tracing_enabled,
-    with_context,
-)
+from mellea.telemetry import is_backend_tracing_enabled, with_context
 
 
 def _section(title: str) -> None:
-    print(f"\n{'=' * 60}")
-    print(f"  {title}")
-    print("=" * 60)
+    print(f"\n{'=' * 60}\n  {title}\n{'=' * 60}")
 
 
 def main() -> None:
-    _section("Mellea OTel GenAI Semantic Convention Demo")
-    print(f"Backend tracing:  {is_backend_tracing_enabled()}")
-    print(f"Content capture:  {is_content_tracing_enabled()}")
+    _section("Mellea OTel GenAI Semantic Convention Demo (gaps 1-4)")
+    print(f"Backend tracing: {is_backend_tracing_enabled()}")
     if not is_backend_tracing_enabled():
-        print("\nSet MELLEA_TRACE_BACKEND=1 to enable backend spans.")
+        print("Set MELLEA_TRACE_BACKEND=1 to enable backend spans.")
 
     # -----------------------------------------------------------------------
-    # 1. Provider name + conversation id + prompt template attrs
+    # Gaps 1-3: provider name, conversation id, prompt template attrs
     # -----------------------------------------------------------------------
-    _section("1. Provider name / conversation id / template attrs")
+    _section("Gaps 1-3: provider name / conversation id / template")
     print("Expected span attrs:")
     print("  gen_ai.system              = 'ollama'")
     print("  gen_ai.provider.name       = 'ollama'")
@@ -82,40 +70,17 @@ def main() -> None:
     print(f"\nOutput: {str(result)[:120]}")
 
     # -----------------------------------------------------------------------
-    # 2. Content capture (opt-in)
+    # Gap 4: error.type + ERROR status
     # -----------------------------------------------------------------------
-    if is_content_tracing_enabled():
-        _section("2. Content capture (MELLEA_TRACE_CONTENT=1)")
-        print("Expected span attrs:")
-        print("  gen_ai.system_instructions — serialised system turns")
-        print("  gen_ai.input.messages      — [{'role':'user','parts':[...]}]")
-        print(
-            "  gen_ai.output.messages     — [{'role':'assistant','parts':[...],'finish_reason':'stop'}]"
-        )
-        print("  llm.prompt_template.variables = {'name': 'Ada'}")
-
-        with start_session() as m2:
-            result2 = m2.instruct(
-                "Write a one-line greeting for {{name}}.",
-                user_variables={"name": "Ada"},
-            )
-        print(f"\nOutput: {str(result2)[:120]}")
-    else:
-        _section("2. Content capture (skipped — set MELLEA_TRACE_CONTENT=1)")
-
-    # -----------------------------------------------------------------------
-    # 3. Error path: error.type + ERROR status
-    # -----------------------------------------------------------------------
-    _section("3. Error path — error.type on span")
+    _section("Gap 4: error.type on span")
     print("Expected span attrs:")
     print("  status     = ERROR")
     print("  error.type = <exception class name>")
 
     try:
-        with start_session() as m3:
-            # Use a model name guaranteed to be absent on any Ollama instance.
-            m3._backend.model_id = "mellea-semconv-nonexistent-xyz"  # type: ignore[attr-defined]
-            m3.instruct("Hello")
+        with start_session() as m2:
+            m2._backend.model_id = "mellea-semconv-nonexistent-xyz"  # type: ignore[attr-defined]
+            m2.instruct("Hello")
     except Exception as exc:
         print(f"\nGot expected error: {exc.__class__.__name__}")
     else:
diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py
index f21e50c06..84507299e 100644
--- a/mellea/backends/huggingface.py
+++ b/mellea/backends/huggingface.py
@@ -388,12 +388,7 @@ async def _generate_from_context(
                 and an updated context that includes ``action`` and the new output.
         """
         span = start_generate_span(
-            backend=self,
-            action=action,
-            ctx=ctx,
-            format=format,
-            tool_calls=tool_calls,
-            model_options=model_options,
+            backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls
         )
 
         with with_context(
@@ -1254,15 +1249,20 @@ class used during generation, if any.
 
         # Record tracing if span exists
         if span is not None:
-            from ..telemetry.backend_instrumentation import finalize_backend_span
-
-            finalize_backend_span(
-                span,
-                usage=mot.generation.usage if mot.generation.usage else None,
-                model_id=self._get_hf_model_id(),
-                conversation=conversation,
-                output_text=str(mot.value) if mot.value is not None else None,
+            from ..telemetry import end_backend_span
+            from ..telemetry.backend_instrumentation import (
+                record_response_metadata,
+                record_token_usage,
             )
+
+            if isinstance(hf_output, GenerateDecoderOnlyOutput):
+                record_response_metadata(span, hf_output)
+                if mot.generation.usage:
+                    record_token_usage(span, mot.generation.usage)
+
+            # Close the span now that async operation is complete
+            end_backend_span(span)
+            # Clean up span reference
             del mot._meta["_telemetry_span"]
 
         # When caching is disabled, clear hf_output from meta to free GPU memory.
diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py
index da98dec0e..f7d912516 100644
--- a/mellea/backends/litellm.py
+++ b/mellea/backends/litellm.py
@@ -164,12 +164,7 @@ async def _generate_from_context(
             "The Openai backend only supports chat-like contexts."
         )
         span = start_generate_span(
-            backend=self,
-            action=action,
-            ctx=ctx,
-            format=format,
-            tool_calls=tool_calls,
-            model_options=model_options,
+            backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls
         )
 
         _model_id_str = str(getattr(self, "model_id", "unknown"))
@@ -566,16 +561,21 @@ async def post_processing(
         # Record telemetry now that response is available
         span = mot._meta.get("_telemetry_span")
         if span is not None:
-            from ..telemetry.backend_instrumentation import finalize_backend_span
+            from ..telemetry import end_backend_span
+            from ..telemetry.backend_instrumentation import (
+                record_response_metadata,
+                record_token_usage,
+            )
 
             response = mot._meta.get("litellm_chat_response")
-            finalize_backend_span(
-                span,
-                response=response,
-                usage=usage,
-                model_id=str(self.model_id),
-                conversation=conversation,
-            )
+            if response:
+                # LiteLLM responses have usage information
+                if usage:
+                    record_token_usage(span, usage)
+                record_response_metadata(span, response)
+            # Close the span now that async operation is complete
+            end_backend_span(span)
+            # Clean up the span reference
             del mot._meta["_telemetry_span"]
 
     @staticmethod
diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
index 651902319..5b50cd709 100644
--- a/mellea/backends/ollama.py
+++ b/mellea/backends/ollama.py
@@ -289,9 +289,7 @@ async def _generate_from_context(
                 and an updated context that includes ``action`` and the new output.
         """
         # Start span without auto-closing (will be closed in post_processing)
-        span = start_generate_span(
-            self, action, ctx, format, tool_calls, model_options=model_options
-        )
+        span = start_generate_span(self, action, ctx, format, tool_calls)
 
         assert ctx.is_chat_context, (
             "The ollama backend only supports chat-like contexts."
@@ -722,23 +720,21 @@ async def post_processing(
         # Record telemetry and close span now that response is available
         span = mot._meta.get("_telemetry_span")
         if span is not None:
-            from ..telemetry.backend_instrumentation import finalize_backend_span
-
-            output_text: str | None = None
-            if response is not None:
-                try:
-                    msg = getattr(response, "message", None)
-                    if msg is not None:
-                        output_text = str(getattr(msg, "content", "") or "")
-                except Exception:
-                    pass
-
-            finalize_backend_span(
-                span,
-                usage=mot.generation.usage if mot.generation.usage else None,
-                conversation=conversation,
-                output_text=output_text,
+            from ..telemetry import end_backend_span
+            from ..telemetry.backend_instrumentation import (
+                record_response_metadata,
+                record_token_usage,
             )
+
+            if response:
+                if mot.generation.usage:
+                    record_token_usage(span, mot.generation.usage)
+                record_response_metadata(span, response)
+
+            # Close the span now that telemetry is recorded
+            end_backend_span(span)
+
+            # Clean up the span reference
             del mot._meta["_telemetry_span"]
 
 
diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index ace9c544e..1eea93511 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -467,12 +467,7 @@ async def _generate_from_context(
 
         # Start span without auto-closing (will be closed in post_processing)
         span = start_generate_span(
-            backend=self,
-            action=action,
-            ctx=ctx,
-            format=format,
-            tool_calls=tool_calls,
-            model_options=model_options,
+            backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls
         )
 
         _model_id_str = str(getattr(self, "model_id", "unknown"))
@@ -1127,15 +1122,18 @@ async def post_processing(
         # Record telemetry now that response is available
         span = mot._meta.get("_telemetry_span")
         if span is not None:
-            from ..telemetry.backend_instrumentation import finalize_backend_span
-
-            finalize_backend_span(
-                span,
-                response=response,
-                usage=usage,
-                model_id=self._model_id,
-                conversation=conversation,
+            from ..telemetry import end_backend_span
+            from ..telemetry.backend_instrumentation import (
+                record_response_metadata,
+                record_token_usage,
             )
+
+            if usage:
+                record_token_usage(span, usage)
+            record_response_metadata(span, response)
+            # Close the span now that async operation is complete
+            end_backend_span(span)
+            # Clean up the span reference
             del mot._meta["_telemetry_span"]
 
     @overload
diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py
index f0bcf80e2..87a0697d6 100644
--- a/mellea/backends/watsonx.py
+++ b/mellea/backends/watsonx.py
@@ -303,12 +303,7 @@ async def _generate_from_context(
             "The watsonx.ai backend only supports chat-like contexts."
         )
         span = start_generate_span(
-            backend=self,
-            action=action,
-            ctx=ctx,
-            format=format,
-            tool_calls=tool_calls,
-            model_options=model_options,
+            backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls
         )
 
         _model_id_str = str(getattr(self, "model_id", "unknown"))
@@ -611,15 +606,20 @@ async def post_processing(
         # Record tracing if span exists
         span = mot._meta.get("_telemetry_span")
         if span is not None:
-            from ..telemetry.backend_instrumentation import finalize_backend_span
-
-            finalize_backend_span(
-                span,
-                response=response,
-                usage=usage,
-                model_id=str(self._get_watsonx_model_id()),
-                conversation=conversation,
+            from ..telemetry import end_backend_span
+            from ..telemetry.backend_instrumentation import (
+                record_response_metadata,
+                record_token_usage,
             )
+
+            if usage:
+                record_token_usage(span, usage)
+            if response is not None:
+                record_response_metadata(span, response)
+
+            # Close the span now that async operation is complete
+            end_backend_span(span)
+            # Clean up span reference
             del mot._meta["_telemetry_span"]
 
         # Generate the log for this ModelOutputThunk.
diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py
index 939fe2d6a..fc648b918 100644
--- a/mellea/telemetry/backend_instrumentation.py
+++ b/mellea/telemetry/backend_instrumentation.py
@@ -2,10 +2,6 @@
 
 Follows OpenTelemetry Gen-AI semantic conventions:
 https://opentelemetry.io/docs/specs/semconv/gen-ai/
-
-Content capture (``gen_ai.input.messages``, ``gen_ai.output.messages``,
-``gen_ai.system_instructions``) is opt-in and gated by ``is_content_tracing_enabled()``.
-These attributes may contain PII — enable only in controlled environments.
 """
 
 import json
@@ -13,7 +9,6 @@
 
 from ..backends.utils import get_value
 from .tracing import (
-    add_span_event,
     end_backend_span,
     is_content_tracing_enabled,
     set_span_attribute,
@@ -345,64 +340,28 @@ def record_response_metadata(
         pass
 
 
-def finalize_backend_span(
-    span: Any,
-    *,
-    response: Any = None,
-    usage: Any = None,
-    model_id: str | None = None,
-    error: Exception | None = None,
-    conversation: list[dict] | None = None,
-    output_text: str | None = None,
-    finish_reason: str | None = None,
-) -> None:
-    """Close a backend span, recording telemetry on both success and error paths.
-
-    On the error path, records the exception, sets ``error.type``, and marks
-    the span with ERROR status before closing.  On the success path, records
-    token usage, response metadata, and (when content capture is enabled)
-    structured input/output message attributes.
+def finalize_backend_span(span: Any, *, error: Exception | None = None) -> None:
+    """Close a backend span on the error path, setting error.type and ERROR status.
 
-    This replaces the three-line ``record_token_usage`` + ``record_response_metadata``
-    + ``end_backend_span`` pattern used in each backend's ``post_processing``.
+    Used by the streaming error path in ``ModelOutputThunk.__aiter__`` where a
+    span may be left open after an exception.  Backends close spans on the
+    success path themselves via ``record_token_usage`` + ``record_response_metadata``
+    + ``end_backend_span``.
 
     Args:
         span: The span to finalise (no-op when ``None``).
-        response: Raw backend response (for model id, finish reason, response id).
-        usage: Token usage object or dict.
-        model_id: Explicit model id override.
-        error: Exception to record on the error path.
-        conversation: The prompt conversation (``list[dict]`` with ``role``/``content``
-            keys).  Used for ``gen_ai.input.messages`` and
-            ``gen_ai.system_instructions`` when content capture is enabled.
-        output_text: The assistant's reply text.  Used for
-            ``gen_ai.output.messages`` when content capture is enabled.
-        finish_reason: Finish reason string (defaults to ``"stop"`` when omitted).
+        error: Exception to record; sets ERROR status and ``error.type``.
     """
     if span is None:
         return
 
     try:
-        try:
-            if error is not None:
-                set_span_error(span, error)
-                # error.type is a Stable OTel cross-signal attribute
-                set_span_attribute(span, "error.type", type(error).__name__)
-            else:
-                record_token_usage(span, usage)
-                record_response_metadata(span, response, model_id=model_id)
-
-                if is_content_tracing_enabled() and conversation is not None:
-                    _emit_content_attributes(
-                        span,
-                        conversation=conversation,
-                        output_text=output_text,
-                        finish_reason=finish_reason,
-                        response=response,
-                    )
-        except Exception:
-            # Telemetry helpers must never break application code.
-            pass
+        if error is not None:
+            set_span_error(span, error)
+            # error.type is a Stable OTel cross-signal attribute
+            set_span_attribute(span, "error.type", type(error).__name__)
+    except Exception:
+        pass
     finally:
         end_backend_span(span)
 
@@ -426,84 +385,6 @@ def _serialize_json(obj: Any) -> str:
     return json.dumps(obj, default=str, ensure_ascii=False)
 
 
-def _conversation_to_parts(conversation: list[dict]) -> tuple[list[dict], list[dict]]:
-    """Split a conversation into system instructions and input messages.
-
-    Args:
-        conversation: List of ``{"role": ..., "content": ...}`` dicts.
-
-    Returns:
-        Tuple of ``(system_parts, input_messages)`` in the spec JSON shape.
-        ``system_parts`` is a list of ``{"type": "text", "content": ...}`` items.
-        ``input_messages`` is a list of
-        ``{"role": ..., "parts": [{"type": "text", "content": ...}]}`` items.
-    """
-    system_parts: list[dict] = []
-    input_messages: list[dict] = []
-    for msg in conversation:
-        role = msg.get("role", "")
-        content = msg.get("content", "")
-        if role == "system":
-            system_parts.append({"type": "text", "content": str(content)})
-        else:
-            input_messages.append(
-                {"role": role, "parts": [{"type": "text", "content": str(content)}]}
-            )
-    return system_parts, input_messages
-
-
-def _emit_content_attributes(
-    span: Any,
-    *,
-    conversation: list[dict],
-    output_text: str | None,
-    finish_reason: str | None,
-    response: Any = None,
-) -> None:
-    """Set structured content attributes on the span (content gate must be checked by caller)."""
-    try:
-        system_parts, input_messages = _conversation_to_parts(conversation)
-
-        if system_parts:
-            set_span_attribute(
-                span, "gen_ai.system_instructions", _serialize_json(system_parts)
-            )
-        if input_messages:
-            set_span_attribute(
-                span, "gen_ai.input.messages", _serialize_json(input_messages)
-            )
-
-        # Attempt to derive output text from an OpenAI-format response if not provided
-        if output_text is None and response is not None:
-            try:
-                choices = get_value(response, "choices")
-                if choices:
-                    first = choices[0] if isinstance(choices, list) else choices
-                    msg = get_value(first, "message")
-                    if msg is not None:
-                        output_text = str(get_value(msg, "content") or "")
-            except Exception:
-                pass
-
-        if output_text is not None:
-            output_msg = [
-                {
-                    "role": "assistant",
-                    "parts": [{"type": "text", "content": output_text}],
-                    "finish_reason": finish_reason or "stop",
-                }
-            ]
-            set_span_attribute(
-                span, "gen_ai.output.messages", _serialize_json(output_msg)
-            )
-
-        # Emit a span event so log-oriented receivers also see the content payload.
-        add_span_event(span, "gen_ai.client.inference.operation.details")
-    except Exception:
-        # Content capture is best-effort — never fail the span close
-        pass
-
-
 __all__ = [
     "finalize_backend_span",
     "get_context_size",
diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py
index a3b2c3d0e..b6cd2ed88 100644
--- a/test/telemetry/test_genai_semconv_emission.py
+++ b/test/telemetry/test_genai_semconv_emission.py
@@ -1,5 +1,8 @@
 """Unit tests for OTel GenAI semantic convention emission gaps (issue #1035).
 
+Covers gaps 1-4. Gap 5 (content capture) is deferred; see cs/issue-1035-full
+for the full implementation.
+
 All tests use a fake span object and do not require a live backend or
 OpenTelemetry SDK installation.
 """
@@ -35,7 +38,7 @@ def _span_attrs(span: MagicMock) -> dict:
 
 
 # ---------------------------------------------------------------------------
-# gen_ai.provider.name alongside gen_ai.system
+# Gap 1: gen_ai.provider.name alongside gen_ai.system
 # ---------------------------------------------------------------------------
 
 
@@ -61,7 +64,7 @@ def test_provider_name_emitted_in_start_generate_span():
 
 
 # ---------------------------------------------------------------------------
-# gen_ai.conversation.id from session_id ContextVar
+# Gap 2: gen_ai.conversation.id from session_id ContextVar
 # ---------------------------------------------------------------------------
 
 
@@ -96,7 +99,7 @@ def test_conversation_id_absent_when_no_session():
 
 
 # ---------------------------------------------------------------------------
-# llm.prompt_template.* from Instruction
+# Gap 3: llm.prompt_template.* from Instruction
 # ---------------------------------------------------------------------------
 
 
@@ -116,7 +119,6 @@ def test_prompt_template_attrs_from_instruction():
         start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
 
     call_kwargs = mock_start.call_args[1]
-    # Template text is always emitted
     assert call_kwargs.get("llm.prompt_template.template") == (
         "Summarise {{topic}} in one sentence."
     )
@@ -132,7 +134,6 @@ def test_prompt_template_variables_emitted_when_content_enabled(monkeypatch):
     backend = _fake_backend("OpenAIBackend")
     backend.model_id = "gpt-4"  # type: ignore[attr-defined]
 
-    # Patch the content gate to True
     monkeypatch.setattr(
         "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled",
         lambda: True,
@@ -145,15 +146,13 @@ def test_prompt_template_variables_emitted_when_content_enabled(monkeypatch):
     call_kwargs = mock_start.call_args[1]
     variables_json = call_kwargs.get("llm.prompt_template.variables")
     assert variables_json is not None
-    parsed = json.loads(variables_json)
-    assert parsed == {"name": "World"}
+    assert json.loads(variables_json) == {"name": "World"}
 
 
 def test_instruction_without_user_variables_emits_template():
     from mellea.stdlib.components.instruction import Instruction
 
     instr = Instruction(description="Tell me about {{topic}}")
-    # No user_variables — template is retained as-is
 
     backend = _fake_backend("OpenAIBackend")
     backend.model_id = "gpt-4"  # type: ignore[attr-defined]
@@ -162,14 +161,16 @@ def test_instruction_without_user_variables_emits_template():
         mock_start.return_value = _mock_span()
         start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
 
-    call_kwargs = mock_start.call_args[1]
-    assert call_kwargs.get("llm.prompt_template.template") == "Tell me about {{topic}}"
+    assert (
+        mock_start.call_args[1].get("llm.prompt_template.template")
+        == "Tell me about {{topic}}"
+    )
 
 
 def test_instruction_with_no_description_emits_no_template():
     from mellea.stdlib.components.instruction import Instruction
 
-    instr = Instruction()  # no description
+    instr = Instruction()
 
     backend = _fake_backend("OpenAIBackend")
     backend.model_id = "gpt-4"  # type: ignore[attr-defined]
@@ -178,12 +179,11 @@ def test_instruction_with_no_description_emits_no_template():
         mock_start.return_value = _mock_span()
         start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
 
-    call_kwargs = mock_start.call_args[1]
-    assert "llm.prompt_template.template" not in call_kwargs
+    assert "llm.prompt_template.template" not in mock_start.call_args[1]
 
 
 # ---------------------------------------------------------------------------
-# ERROR span status + error.type (finalize_backend_span error path)
+# Gap 4: ERROR span status + error.type
 # ---------------------------------------------------------------------------
 
 
@@ -200,8 +200,7 @@ def test_error_sets_status_and_error_type():
         finalize_backend_span(span, error=exc)
 
     mock_set_err.assert_called_once_with(span, exc)
-    attrs = _span_attrs(span)
-    assert attrs.get("error.type") == "RuntimeError"
+    assert _span_attrs(span).get("error.type") == "RuntimeError"
     mock_end.assert_called_once_with(span)
 
 
@@ -215,164 +214,24 @@ def test_error_path_always_closes_span():
     mock_end.assert_called_once()
 
 
-def test_finalize_never_raises_on_span_error(monkeypatch):
+def test_finalize_never_raises_on_span_error():
     """finalize_backend_span must not propagate exceptions from helpers."""
     span = _mock_span()
     span.set_attribute.side_effect = RuntimeError("span broke")
 
     with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
         with patch("mellea.telemetry.backend_instrumentation.set_span_error"):
-            # Should not raise even though set_attribute raises
             finalize_backend_span(span, error=ValueError("test"))
 
 
 def test_finalize_none_span_is_noop():
-    finalize_backend_span(None, error=RuntimeError("x"))  # no exception
-
-
-# ---------------------------------------------------------------------------
-# Content capture (gen_ai.input.messages etc.) gated by MELLEA_TRACE_CONTENT
-# ---------------------------------------------------------------------------
-
-
-def test_content_capture_disabled_by_default():
-    span = _mock_span()
-    conversation = [
-        {"role": "system", "content": "You are helpful."},
-        {"role": "user", "content": "Hello"},
-    ]
-    with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
-        finalize_backend_span(span, conversation=conversation, output_text="Hi there")
-
-    attrs = _span_attrs(span)
-    assert "gen_ai.input.messages" not in attrs
-    assert "gen_ai.output.messages" not in attrs
-    assert "gen_ai.system_instructions" not in attrs
-
-
-def test_content_capture_emits_structured_attributes(monkeypatch):
-    monkeypatch.setattr(
-        "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled",
-        lambda: True,
-    )
-    span = _mock_span()
-    conversation = [
-        {"role": "system", "content": "You are helpful."},
-        {"role": "user", "content": "Tell me a joke."},
-    ]
-    with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
-        with patch("mellea.telemetry.backend_instrumentation.add_span_event"):
-            finalize_backend_span(
-                span,
-                conversation=conversation,
-                output_text="Why did the chicken cross the road?",
-            )
-
-    attrs = _span_attrs(span)
-
-    # System instructions
-    sys_json = attrs.get("gen_ai.system_instructions")
-    assert sys_json is not None
-    sys_parts = json.loads(sys_json)
-    assert sys_parts == [{"type": "text", "content": "You are helpful."}]
-
-    # Input messages (non-system)
-    in_json = attrs.get("gen_ai.input.messages")
-    assert in_json is not None
-    in_msgs = json.loads(in_json)
-    assert len(in_msgs) == 1
-    assert in_msgs[0]["role"] == "user"
-    assert in_msgs[0]["parts"] == [{"type": "text", "content": "Tell me a joke."}]
-
-    # Output messages
-    out_json = attrs.get("gen_ai.output.messages")
-    assert out_json is not None
-    out_msgs = json.loads(out_json)
-    assert out_msgs[0]["role"] == "assistant"
-    assert out_msgs[0]["parts"][0]["content"] == "Why did the chicken cross the road?"
-    assert "finish_reason" in out_msgs[0]
-
-
-def test_content_capture_no_deprecated_per_role_events(monkeypatch):
-    """The deprecated gen_ai.user.message / gen_ai.assistant.message events must not be emitted."""
-    monkeypatch.setattr(
-        "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled",
-        lambda: True,
-    )
-    span = _mock_span()
-    with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
-        finalize_backend_span(
-            span, conversation=[{"role": "user", "content": "hi"}], output_text="hello"
-        )
-
-    event_names = [call.args[0] for call in span.add_event.call_args_list]
-    deprecated = {
-        "gen_ai.user.message",
-        "gen_ai.assistant.message",
-        "gen_ai.system.message",
-    }
-    assert not deprecated.intersection(event_names)
-
-
-def test_content_span_event_emitted(monkeypatch):
-    monkeypatch.setattr(
-        "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled",
-        lambda: True,
-    )
-    span = _mock_span()
-    with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
-        with patch(
-            "mellea.telemetry.backend_instrumentation.add_span_event"
-        ) as mock_event:
-            finalize_backend_span(
-                span,
-                conversation=[{"role": "user", "content": "hi"}],
-                output_text="hello",
-            )
-    event_names = [call.args[1] for call in mock_event.call_args_list]
-    assert "gen_ai.client.inference.operation.details" in event_names
+    finalize_backend_span(None, error=RuntimeError("x"))
 
 
 # ---------------------------------------------------------------------------
-# _TRACE_CONTENT_ENABLED recognises OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT
+# Content tracing default (infrastructure for deferred gap 5)
 # ---------------------------------------------------------------------------
 
 
-def test_content_tracing_enabled_via_mellea_env(monkeypatch):
-    monkeypatch.setenv("MELLEA_TRACE_CONTENT", "true")
-    import mellea.telemetry.tracing as tracing_mod
-
-    # Force re-evaluation of module-level constant
-    with patch.object(tracing_mod, "_TRACE_CONTENT_ENABLED", True):
-        assert tracing_mod.is_content_tracing_enabled()
-
-
 def test_content_tracing_disabled_by_default():
     assert not is_content_tracing_enabled()
-
-
-# ---------------------------------------------------------------------------
-# Success path of finalize_backend_span calls record helpers
-# ---------------------------------------------------------------------------
-
-
-def test_success_path_calls_record_token_usage():
-    span = _mock_span()
-    usage = {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
-    with patch(
-        "mellea.telemetry.backend_instrumentation.record_token_usage"
-    ) as mock_rtu:
-        with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
-            finalize_backend_span(span, usage=usage)
-    mock_rtu.assert_called_once_with(span, usage)
-
-
-def test_success_path_calls_record_response_metadata():
-    span = _mock_span()
-    response = {"model": "gpt-4", "id": "resp-1"}
-    with patch(
-        "mellea.telemetry.backend_instrumentation.record_response_metadata"
-    ) as mock_rrm:
-        with patch("mellea.telemetry.backend_instrumentation.end_backend_span"):
-            finalize_backend_span(span, response=response, model_id="gpt-4")
-    mock_rrm.assert_called_once_with(span, response, model_id="gpt-4")

From 1185b5088cc50b1f6dd3f6a8e8f4990266d2843f Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 13 May 2026 08:07:24 +0100
Subject: [PATCH 07/10] refactor(telemetry): trim PR to gaps 1, 2, 4 per review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop gap 3 (prompt-template capture) and the model_options/_REQUEST_PARAM_MAP
plumbing in response to review feedback from @jakelorocco and @ajbozarth.

Jake's objection to gap 3 is correct: stashing template state on Instruction
and GenerativeStub is the wrong layer — it only covers two component types,
captures pre-substitution values, and puts telemetry concerns inside domain
objects. The right implementation is at the formatter render path, which
covers all component types. That work belongs after #1045 lands.

The model_options/_REQUEST_PARAM_MAP block was dead code: no backend call
site passes model_options, and even if it were wired up, the values would be
pre-substitution. Per Nathan's review, the right call is to drop it rather
than carry forward a no-op. Request-param emission also belongs in the
post-#1045 plugin layer where the wire-format dict is visible.

What remains in this PR:
  gap 1 — gen_ai.provider.name alongside legacy gen_ai.system
  gap 2 — gen_ai.conversation.id from session_id ContextVar
  gap 4 — error.type + ERROR status via finalize_backend_span
  cache/reasoning token fields in record_token_usage
  MELLEA_TRACE_CONTENT flag + add_span_event (infrastructure for future gap 5)

Also fix the OTel_SERVICE_NAME typo in the example (the variable is
OTEL_SERVICE_NAME, and environment variable names are case-sensitive on
Linux), and rewrite the example docstring and README entry to be
PR-independent.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 docs/examples/telemetry/README.md             |  7 +-
 .../telemetry/otel_genai_semconv_example.py   | 49 +++++-----
 mellea/stdlib/components/genstub.py           | 15 ---
 mellea/stdlib/components/instruction.py       | 23 -----
 mellea/telemetry/backend_instrumentation.py   | 60 +-----------
 test/telemetry/test_genai_semconv_emission.py | 96 +------------------
 6 files changed, 31 insertions(+), 219 deletions(-)

diff --git a/docs/examples/telemetry/README.md b/docs/examples/telemetry/README.md
index 9458e9db0..a91575c2a 100644
--- a/docs/examples/telemetry/README.md
+++ b/docs/examples/telemetry/README.md
@@ -6,10 +6,9 @@ This directory contains examples demonstrating OpenTelemetry tracing and metrics
 
 - **`telemetry_example.py`** - Demonstrates distributed tracing (application and backend traces)
 - **`metrics_example.py`** - Demonstrates token usage metrics collection
-- **`otel_genai_semconv_example.py`** - Exercises the OTel GenAI semantic convention attributes
-  added in issue #1035 (`gen_ai.provider.name`, `gen_ai.conversation.id`,
-  `llm.prompt_template.*`, `error.type`, content capture). Designed for human
-  verification against [otelite](https://github.com/planetf1/otelite).
+- **`otel_genai_semconv_example.py`** - Verifies OTel GenAI semantic convention attributes
+  emitted on backend spans (`gen_ai.provider.name`, `gen_ai.conversation.id`, `error.type`).
+  Designed for human verification against [otelite](https://github.com/planetf1/otelite).
 
 ## Quick Start
 
diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py
index 83a12aa82..bbab23356 100644
--- a/docs/examples/telemetry/otel_genai_semconv_example.py
+++ b/docs/examples/telemetry/otel_genai_semconv_example.py
@@ -1,14 +1,14 @@
 # pytest: ollama, e2e
 
-"""Example demonstrating OTel GenAI semantic convention attributes (issue #1035).
+"""Mellea backend spans carrying OTel GenAI semantic convention attributes.
 
-Exercises gaps 1-4 so they can be verified in otelite or any OTel-compatible backend.
-Gap 5 (content capture) is deferred — see cs/issue-1035-full for that implementation.
+Each backend generation call emits a ``chat`` span with the following attributes
+drawn from the OTel GenAI semconv (https://opentelemetry.io/docs/specs/semconv/gen-ai/):
 
-  gen_ai.provider.name      — provider identity (alongside legacy gen_ai.system)
-  gen_ai.conversation.id    — mapped from session_id ContextVar
-  llm.prompt_template.*     — template text (always) and variables (opt-in)
-  error.type                — set on the error path alongside ERROR status
+  gen_ai.provider.name   — provider identity (current semconv)
+  gen_ai.system          — same value, retained for back-compat with existing dashboards
+  gen_ai.conversation.id — correlated to the active session via ``with_context``
+  error.type             — set on the error path alongside ERROR span status
 
 Run against otelite for human verification:
 
@@ -18,22 +18,21 @@
   # Terminal 2
   export MELLEA_TRACE_BACKEND=1
   export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
-  export OTel_SERVICE_NAME=mellea-semconv-demo
+  export OTEL_SERVICE_NAME=mellea-semconv-demo
   python otel_genai_semconv_example.py
 
-  Then open http://localhost:8080 → select mellea-semconv-demo service.
+  Then open http://localhost:8080 and select the mellea-semconv-demo service.
 
-What to verify per span in otelite
------------------------------------
-  Span "chat"
-    gen_ai.system              = "ollama"     (back-compat)
-    gen_ai.provider.name       = "ollama"     (new, semconv v1.37.0)
+Expected span attributes
+------------------------
+  Span "chat" (normal path)
+    gen_ai.system              = "ollama"
+    gen_ai.provider.name       = "ollama"
     gen_ai.conversation.id     = "demo-session-1"
-    mellea.session_id          = "demo-session-1"  (preserved)
-    llm.prompt_template.template = "Summarise {{topic}} in one sentence."
+    mellea.session_id          = "demo-session-1"
 
   Span "chat" (error path)
-    error.type  = "OllamaRequestError" (or similar)
+    error.type  = <exception class name>
     status      = ERROR
 """
 
@@ -46,33 +45,29 @@ def _section(title: str) -> None:
 
 
 def main() -> None:
-    _section("Mellea OTel GenAI Semantic Convention Demo (gaps 1-4)")
+    _section("Mellea OTel GenAI Semantic Convention Demo")
     print(f"Backend tracing: {is_backend_tracing_enabled()}")
     if not is_backend_tracing_enabled():
         print("Set MELLEA_TRACE_BACKEND=1 to enable backend spans.")
 
     # -----------------------------------------------------------------------
-    # Gaps 1-3: provider name, conversation id, prompt template attrs
+    # Normal path: provider name + conversation id
     # -----------------------------------------------------------------------
-    _section("Gaps 1-3: provider name / conversation id / template")
+    _section("Normal path — provider name and conversation id")
     print("Expected span attrs:")
     print("  gen_ai.system              = 'ollama'")
     print("  gen_ai.provider.name       = 'ollama'")
     print("  gen_ai.conversation.id     = 'demo-session-1'")
-    print("  llm.prompt_template.template = 'Summarise {{topic}} in one sentence.'")
 
     with with_context(session_id="demo-session-1"):
         with start_session() as m:
-            result = m.instruct(
-                "Summarise {{topic}} in one sentence.",
-                user_variables={"topic": "quantum tunnelling"},
-            )
+            result = m.instruct("Summarise quantum tunnelling in one sentence.")
     print(f"\nOutput: {str(result)[:120]}")
 
     # -----------------------------------------------------------------------
-    # Gap 4: error.type + ERROR status
+    # Error path: error.type + ERROR status
     # -----------------------------------------------------------------------
-    _section("Gap 4: error.type on span")
+    _section("Error path — error.type on span")
     print("Expected span attrs:")
     print("  status     = ERROR")
     print("  error.type = <exception class name>")
diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py
index 94572b557..05ca11088 100644
--- a/mellea/stdlib/components/genstub.py
+++ b/mellea/stdlib/components/genstub.py
@@ -355,7 +355,6 @@ def __init__(self, func: Callable[P, R]):
 
         self._function = Function(func)
         self._arguments: Arguments | None = None
-        self._template_variables: dict = {}
         functools.update_wrapper(self, func)
 
         self._response_model = create_response_format(self._function._func)
@@ -521,18 +520,6 @@ def _parse(self, computed: ModelOutputThunk) -> R:
 
         return function_response.result
 
-    def prompt_template_metadata(self) -> tuple[str, dict, None] | None:
-        """Return prompt template metadata for telemetry.
-
-        Returns:
-            Tuple of ``(docstring, variables, version)`` when the function has
-            a docstring, otherwise ``None``.
-        """
-        docstring = self._function._function_dict.get("docstring")
-        if not docstring:
-            return None
-        return str(docstring), dict(self._template_variables), None
-
 
 class SyncGenerativeStub(GenerativeStub, Generic[P, R]):
     """A synchronous generative stub that blocks until the LLM response is ready.
@@ -600,7 +587,6 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R:
                 for r in extracted.precondition_requirements
             ]
 
-        stub_copy._template_variables = dict(extracted.f_kwargs)
         arguments = bind_function_arguments(self._function._func, **extracted.f_kwargs)
         if arguments:
             stub_args: list[Argument] = []
@@ -734,7 +720,6 @@ def __call__(self, *args, **kwargs) -> Coroutine[Any, Any, tuple[R, Context] | R
                 for r in extracted.precondition_requirements
             ]
 
-        stub_copy._template_variables = dict(extracted.f_kwargs)
         arguments = bind_function_arguments(self._function._func, **extracted.f_kwargs)
         if arguments:
             stub_args: list[Argument] = []
diff --git a/mellea/stdlib/components/instruction.py b/mellea/stdlib/components/instruction.py
index b814b4bf3..30faaea20 100644
--- a/mellea/stdlib/components/instruction.py
+++ b/mellea/stdlib/components/instruction.py
@@ -63,15 +63,6 @@ def __init__(
         icl_examples = [] if icl_examples is None else icl_examples
         grounding_context = dict() if grounding_context is None else grounding_context
 
-        # Retain raw template before Jinja substitution for telemetry.
-        # Template text is the static prompt structure; variables may contain user data.
-        self._template_description: str | None = (
-            description if isinstance(description, str) else None
-        )
-        self._user_variables: dict[str, str] | None = (
-            dict(user_variables) if user_variables else None
-        )
-
         # Apply templates. All inputs must be strings if provided.
         if user_variables is not None:
             if description is not None:
@@ -198,20 +189,6 @@ def format_for_llm(self) -> TemplateRepresentation:
             template_order=["*", "Instruction"],
         )
 
-    def prompt_template_metadata(self) -> tuple[str, dict[str, str], None] | None:
-        """Return prompt template metadata for telemetry.
-
-        The raw template text is emitted unconditionally.  Variables are only
-        emitted when content capture is enabled (they may contain user data).
-
-        Returns:
-            Tuple of ``(template_text, variables, version)`` when a string
-            description was provided, otherwise ``None``.
-        """
-        if self._template_description is None:
-            return None
-        return self._template_description, dict(self._user_variables or {}), None
-
     @staticmethod
     def apply_user_dict_from_jinja(user_dict: dict[str, str], s: str) -> str:
         """Render a Jinja2 template string using the provided variable dictionary.
diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py
index fc648b918..21ef5dccd 100644
--- a/mellea/telemetry/backend_instrumentation.py
+++ b/mellea/telemetry/backend_instrumentation.py
@@ -4,17 +4,10 @@
 https://opentelemetry.io/docs/specs/semconv/gen-ai/
 """
 
-import json
 from typing import Any
 
 from ..backends.utils import get_value
-from .tracing import (
-    end_backend_span,
-    is_content_tracing_enabled,
-    set_span_attribute,
-    set_span_error,
-    trace_backend,
-)
+from .tracing import end_backend_span, set_span_attribute, set_span_error, trace_backend
 
 
 def get_model_id_str(backend: Any) -> str:
@@ -135,13 +128,7 @@ def instrument_generate_from_context(
 
 
 def start_generate_span(
-    backend: Any,
-    action: Any,
-    ctx: Any,
-    format: Any = None,
-    tool_calls: bool = False,
-    *,
-    model_options: dict | None = None,
+    backend: Any, action: Any, ctx: Any, format: Any = None, tool_calls: bool = False
 ):
     """Start a backend trace span for generate_from_context (without auto-closing).
 
@@ -154,7 +141,6 @@ def start_generate_span(
         ctx: Context
         format: Response format (BaseModel subclass or None)
         tool_calls: Whether tool calling is enabled
-        model_options: Raw model options dict for request-parameter attributes
 
     Returns:
         Span object or None if tracing is disabled
@@ -191,29 +177,6 @@ def start_generate_span(
     if session_id is not None:
         span_attrs["gen_ai.conversation.id"] = session_id
 
-    # Request parameters from model_options (plain-string keys only)
-    if model_options:
-        for mellea_key, otel_key in _REQUEST_PARAM_MAP.items():
-            val = model_options.get(mellea_key)
-            if val is not None:
-                span_attrs[otel_key] = val
-
-    # Prompt template attributes (duck-typed; works for Instruction and GenerativeStub)
-    tmpl = getattr(action, "prompt_template_metadata", None)
-    if callable(tmpl):
-        metadata: Any = tmpl()
-        if metadata is not None:
-            template_text, template_vars, template_version = metadata
-            if template_text:
-                span_attrs["llm.prompt_template.template"] = template_text
-            if template_version:
-                span_attrs["llm.prompt_template.version"] = template_version
-            # Variables contain user-provided values — only emit with content gate
-            if template_vars and is_content_tracing_enabled():
-                span_attrs["llm.prompt_template.variables"] = _serialize_json(
-                    template_vars
-                )
-
     return start_backend_span("chat", **span_attrs)
 
 
@@ -366,25 +329,6 @@ def finalize_backend_span(span: Any, *, error: Exception | None = None) -> None:
         end_backend_span(span)
 
 
-# ---------------------------------------------------------------------------
-# Private helpers
-# ---------------------------------------------------------------------------
-
-# Mapping from Mellea/OpenAI plain-string model_options keys to OTel request attrs.
-_REQUEST_PARAM_MAP: dict[str, str] = {
-    "temperature": "gen_ai.request.temperature",
-    "top_p": "gen_ai.request.top_p",
-    "top_k": "gen_ai.request.top_k",
-    "frequency_penalty": "gen_ai.request.frequency_penalty",
-    "presence_penalty": "gen_ai.request.presence_penalty",
-}
-
-
-def _serialize_json(obj: Any) -> str:
-    """Serialise *obj* to a JSON string, coercing non-serialisable values to str."""
-    return json.dumps(obj, default=str, ensure_ascii=False)
-
-
 __all__ = [
     "finalize_backend_span",
     "get_context_size",
diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py
index b6cd2ed88..fbb88462e 100644
--- a/test/telemetry/test_genai_semconv_emission.py
+++ b/test/telemetry/test_genai_semconv_emission.py
@@ -1,13 +1,11 @@
-"""Unit tests for OTel GenAI semantic convention emission gaps (issue #1035).
+"""Unit tests for OTel GenAI semantic convention attribute emission.
 
-Covers gaps 1-4. Gap 5 (content capture) is deferred; see cs/issue-1035-full
-for the full implementation.
+Covers: gen_ai.provider.name (gap 1), gen_ai.conversation.id (gap 2),
+error.type + ERROR status (gap 4), and the MELLEA_TRACE_CONTENT flag.
 
-All tests use a fake span object and do not require a live backend or
-OpenTelemetry SDK installation.
+All tests use a fake span and do not require a live backend or OTel SDK.
 """
 
-import json
 from unittest.mock import MagicMock, patch
 
 from mellea.telemetry.backend_instrumentation import (
@@ -52,7 +50,6 @@ def test_provider_name_emitted_in_start_generate_span():
     backend = _fake_backend("OpenAIBackend")
     backend.model_id = "gpt-4"  # type: ignore[attr-defined]
     action = MagicMock()
-    action.prompt_template_metadata = None
 
     with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
         mock_start.return_value = _mock_span()
@@ -88,7 +85,6 @@ def test_conversation_id_absent_when_no_session():
     backend = _fake_backend("OpenAIBackend")
     backend.model_id = "gpt-4"  # type: ignore[attr-defined]
     action = MagicMock()
-    action.prompt_template_metadata = None
 
     with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
         mock_start.return_value = _mock_span()
@@ -98,90 +94,6 @@ def test_conversation_id_absent_when_no_session():
     assert "gen_ai.conversation.id" not in call_kwargs
 
 
-# ---------------------------------------------------------------------------
-# Gap 3: llm.prompt_template.* from Instruction
-# ---------------------------------------------------------------------------
-
-
-def test_prompt_template_attrs_from_instruction():
-    from mellea.stdlib.components.instruction import Instruction
-
-    instr = Instruction(
-        description="Summarise {{topic}} in one sentence.",
-        user_variables={"topic": "quantum tunnelling"},
-    )
-
-    backend = _fake_backend("OpenAIBackend")
-    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
-
-    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
-        mock_start.return_value = _mock_span()
-        start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
-
-    call_kwargs = mock_start.call_args[1]
-    assert call_kwargs.get("llm.prompt_template.template") == (
-        "Summarise {{topic}} in one sentence."
-    )
-    # Variables are NOT emitted when content capture is off (default)
-    assert "llm.prompt_template.variables" not in call_kwargs
-
-
-def test_prompt_template_variables_emitted_when_content_enabled(monkeypatch):
-    from mellea.stdlib.components.instruction import Instruction
-
-    instr = Instruction(description="Hello {{name}}", user_variables={"name": "World"})
-
-    backend = _fake_backend("OpenAIBackend")
-    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
-
-    monkeypatch.setattr(
-        "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled",
-        lambda: True,
-    )
-
-    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
-        mock_start.return_value = _mock_span()
-        start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
-
-    call_kwargs = mock_start.call_args[1]
-    variables_json = call_kwargs.get("llm.prompt_template.variables")
-    assert variables_json is not None
-    assert json.loads(variables_json) == {"name": "World"}
-
-
-def test_instruction_without_user_variables_emits_template():
-    from mellea.stdlib.components.instruction import Instruction
-
-    instr = Instruction(description="Tell me about {{topic}}")
-
-    backend = _fake_backend("OpenAIBackend")
-    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
-
-    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
-        mock_start.return_value = _mock_span()
-        start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
-
-    assert (
-        mock_start.call_args[1].get("llm.prompt_template.template")
-        == "Tell me about {{topic}}"
-    )
-
-
-def test_instruction_with_no_description_emits_no_template():
-    from mellea.stdlib.components.instruction import Instruction
-
-    instr = Instruction()
-
-    backend = _fake_backend("OpenAIBackend")
-    backend.model_id = "gpt-4"  # type: ignore[attr-defined]
-
-    with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
-        mock_start.return_value = _mock_span()
-        start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False)
-
-    assert "llm.prompt_template.template" not in mock_start.call_args[1]
-
-
 # ---------------------------------------------------------------------------
 # Gap 4: ERROR span status + error.type
 # ---------------------------------------------------------------------------

From 292a7f2ea630616b9c961ee796d6349479d0861a Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 13 May 2026 08:47:15 +0100
Subject: [PATCH 08/10] test(telemetry): remove stale gap-3 artefact; add
 add_span_event tests

Remove the `action.prompt_template_metadata = None` assignment left over
from the gap-3 prompt-template work that was withdrawn from this PR.
The attribute is never read by `start_generate_span` in the trimmed
implementation, making the line misleading.

Add three unit tests for `add_span_event` (event forwarded to span,
None-span no-op, empty-attributes default) patching `_OTEL_AVAILABLE`
since the test environment has no OTel SDK installed.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 test/telemetry/test_genai_semconv_emission.py | 29 +++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py
index fbb88462e..d1a6a5c85 100644
--- a/test/telemetry/test_genai_semconv_emission.py
+++ b/test/telemetry/test_genai_semconv_emission.py
@@ -15,7 +15,7 @@
     start_generate_span,
 )
 from mellea.telemetry.context import with_context
-from mellea.telemetry.tracing import is_content_tracing_enabled
+from mellea.telemetry.tracing import add_span_event, is_content_tracing_enabled
 
 # ---------------------------------------------------------------------------
 # Helpers
@@ -69,7 +69,6 @@ def test_conversation_id_emitted_from_session_id():
     backend = _fake_backend("OpenAIBackend")
     backend.model_id = "gpt-4"  # type: ignore[attr-defined]
     action = MagicMock()
-    action.prompt_template_metadata = None
 
     with with_context(session_id="sess-abc"):
         with patch("mellea.telemetry.tracing.start_backend_span") as mock_start:
@@ -147,3 +146,29 @@ def test_finalize_none_span_is_noop():
 
 def test_content_tracing_disabled_by_default():
     assert not is_content_tracing_enabled()
+
+
+# ---------------------------------------------------------------------------
+# add_span_event helper
+# ---------------------------------------------------------------------------
+
+
+def test_add_span_event_calls_span_add_event():
+    span = _mock_span()
+    with patch("mellea.telemetry.tracing._OTEL_AVAILABLE", True):
+        add_span_event(span, "gen_ai.content.prompt", {"gen_ai.prompt": "hello"})
+    span.add_event.assert_called_once_with(
+        "gen_ai.content.prompt", attributes={"gen_ai.prompt": "hello"}
+    )
+
+
+def test_add_span_event_none_span_is_noop():
+    with patch("mellea.telemetry.tracing._OTEL_AVAILABLE", True):
+        add_span_event(None, "gen_ai.content.prompt")
+
+
+def test_add_span_event_defaults_to_empty_attributes():
+    span = _mock_span()
+    with patch("mellea.telemetry.tracing._OTEL_AVAILABLE", True):
+        add_span_event(span, "gen_ai.content.completion")
+    span.add_event.assert_called_once_with("gen_ai.content.completion", attributes={})

From 5557734cfc4623c385b26110316cbfc89e280a63 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 13 May 2026 12:47:52 +0100
Subject: [PATCH 09/10] fix(telemetry): address self-review findings on OTel
 semconv PR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Guard end_backend_span in its own try/except so SDK errors on the
  streaming error path cannot mask the original backend exception
- Wire get_provider_name into all three span-creation functions so
  gen_ai.provider.name is populated through it (the attribute was
  previously set from get_system_name directly, contradicting the
  docstring guidance)
- Fix span_attrs: dict → dict[str, Any] per project typing conventions
- Replace _backend.model_id private-attr mutation in example with
  start_session(model_id=...) public API
- Add qualitative marker to example so it does not run in the fast loop
- Add test_finalize_never_raises_if_end_span_raises to cover the
  now-guarded end_backend_span code path

Assisted-by: Claude Code
---
 .../telemetry/otel_genai_semconv_example.py       |  5 ++---
 mellea/telemetry/backend_instrumentation.py       | 15 ++++++++++-----
 test/telemetry/test_genai_semconv_emission.py     | 11 +++++++++++
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py
index bbab23356..aea1b6ddd 100644
--- a/docs/examples/telemetry/otel_genai_semconv_example.py
+++ b/docs/examples/telemetry/otel_genai_semconv_example.py
@@ -1,4 +1,4 @@
-# pytest: ollama, e2e
+# pytest: ollama, e2e, qualitative
 
 """Mellea backend spans carrying OTel GenAI semantic convention attributes.
 
@@ -73,8 +73,7 @@ def main() -> None:
     print("  error.type = <exception class name>")
 
     try:
-        with start_session() as m2:
-            m2._backend.model_id = "mellea-semconv-nonexistent-xyz"  # type: ignore[attr-defined]
+        with start_session(model_id="mellea-semconv-nonexistent-xyz") as m2:
             m2.instruct("Hello")
     except Exception as exc:
         print(f"\nGot expected error: {exc.__class__.__name__}")
diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py
index 21ef5dccd..02cbb75af 100644
--- a/mellea/telemetry/backend_instrumentation.py
+++ b/mellea/telemetry/backend_instrumentation.py
@@ -107,13 +107,14 @@ def instrument_generate_from_context(
     """
     model_id = get_model_id_str(backend)
     system_name = get_system_name(backend)
+    provider_name = get_provider_name(backend)
 
     return trace_backend(
         "chat",  # Gen-AI convention: use 'chat' for chat completions
         **{
             # Gen-AI semantic convention attributes
             "gen_ai.system": system_name,
-            "gen_ai.provider.name": system_name,
+            "gen_ai.provider.name": provider_name,
             "gen_ai.request.model": model_id,
             "gen_ai.operation.name": "chat",
             # Mellea-specific attributes
@@ -149,14 +150,15 @@ def start_generate_span(
 
     model_id = get_model_id_str(backend)
     system_name = get_system_name(backend)
+    provider_name = get_provider_name(backend)
 
     from .context import get_current_context
 
     telemetry_ctx = get_current_context()
-    span_attrs: dict = {
+    span_attrs: dict[str, Any] = {
         # Gen-AI semantic convention attributes
         "gen_ai.system": system_name,
-        "gen_ai.provider.name": system_name,
+        "gen_ai.provider.name": provider_name,
         "gen_ai.request.model": model_id,
         "gen_ai.operation.name": "chat",
         # Mellea-specific attributes
@@ -198,13 +200,14 @@ def instrument_generate_from_raw(
     """
     model_id = get_model_id_str(backend)
     system_name = get_system_name(backend)
+    provider_name = get_provider_name(backend)
 
     return trace_backend(
         "text_completion",  # Gen-AI convention: use 'text_completion' for completions
         **{
             # Gen-AI semantic convention attributes
             "gen_ai.system": system_name,
-            "gen_ai.provider.name": system_name,
+            "gen_ai.provider.name": provider_name,
             "gen_ai.request.model": model_id,
             "gen_ai.operation.name": "text_completion",
             # Mellea-specific attributes
@@ -325,8 +328,10 @@ def finalize_backend_span(span: Any, *, error: Exception | None = None) -> None:
             set_span_attribute(span, "error.type", type(error).__name__)
     except Exception:
         pass
-    finally:
+    try:
         end_backend_span(span)
+    except Exception:
+        pass
 
 
 __all__ = [
diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py
index d1a6a5c85..2aa7a77c1 100644
--- a/test/telemetry/test_genai_semconv_emission.py
+++ b/test/telemetry/test_genai_semconv_emission.py
@@ -135,6 +135,17 @@ def test_finalize_never_raises_on_span_error():
             finalize_backend_span(span, error=ValueError("test"))
 
 
+def test_finalize_never_raises_if_end_span_raises():
+    """end_backend_span exceptions must not propagate on the error path."""
+    span = _mock_span()
+    with patch(
+        "mellea.telemetry.backend_instrumentation.end_backend_span",
+        side_effect=RuntimeError("sdk shutdown"),
+    ):
+        with patch("mellea.telemetry.backend_instrumentation.set_span_error"):
+            finalize_backend_span(span, error=ValueError("original error"))
+
+
 def test_finalize_none_span_is_noop():
     finalize_backend_span(None, error=RuntimeError("x"))
 

From efedc82d0a0ef10f9bcfed0bfd19f43daf02bc83 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 13 May 2026 18:17:41 +0100
Subject: [PATCH 10/10] refactor(telemetry): remove dead
 instrument_generate_from_context, fix example error path

instrument_generate_from_context was imported but never called by any backend
(all backends use start_generate_span); remove function, __all__ entry, stale
imports in ollama.py and openai.py, and the corresponding test.

Example error path now uses an unreachable base_url (localhost:19999) instead
of a bogus model name. A bogus model name could cause Ollama to attempt a
pull rather than fail deterministically.

Assisted-by: Claude Code
---
 .../telemetry/otel_genai_semconv_example.py   |  6 +--
 mellea/backends/ollama.py                     |  1 -
 mellea/backends/openai.py                     |  1 -
 mellea/telemetry/backend_instrumentation.py   | 41 -------------------
 test/telemetry/test_tracing.py                | 25 -----------
 5 files changed, 2 insertions(+), 72 deletions(-)

diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py
index aea1b6ddd..e38840c4c 100644
--- a/docs/examples/telemetry/otel_genai_semconv_example.py
+++ b/docs/examples/telemetry/otel_genai_semconv_example.py
@@ -73,14 +73,12 @@ def main() -> None:
     print("  error.type = <exception class name>")
 
     try:
-        with start_session(model_id="mellea-semconv-nonexistent-xyz") as m2:
+        with start_session(base_url="http://localhost:19999") as m2:
             m2.instruct("Hello")
     except Exception as exc:
         print(f"\nGot expected error: {exc.__class__.__name__}")
     else:
-        print(
-            "\n(No error — check the span for error.type if the model unexpectedly exists)"
-        )
+        print("\n(No error — nothing is listening on port 19999)")
 
     _section("Done")
     print("If OTEL_EXPORTER_OTLP_ENDPOINT is set, check your trace backend.")
diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
index 5b50cd709..bc436bece 100644
--- a/mellea/backends/ollama.py
+++ b/mellea/backends/ollama.py
@@ -28,7 +28,6 @@
 from ..stdlib.components import Message
 from ..stdlib.requirements import ALoraRequirement
 from ..telemetry.backend_instrumentation import (
-    instrument_generate_from_context,
     instrument_generate_from_raw,
     start_generate_span,
 )
diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index 1eea93511..5dfeaec51 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -45,7 +45,6 @@
 from ..stdlib.components import Intrinsic, Message
 from ..stdlib.requirements import LLMaJRequirement
 from ..telemetry.backend_instrumentation import (
-    instrument_generate_from_context,
     instrument_generate_from_raw,
     start_generate_span,
 )
diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py
index 02cbb75af..dc2d26fff 100644
--- a/mellea/telemetry/backend_instrumentation.py
+++ b/mellea/telemetry/backend_instrumentation.py
@@ -88,46 +88,6 @@ def get_context_size(ctx: Any) -> int:
     return 0
 
 
-def instrument_generate_from_context(
-    backend: Any, action: Any, ctx: Any, format: Any = None, tool_calls: bool = False
-):
-    """Create a backend trace span for generate_from_context.
-
-    Follows Gen-AI semantic conventions for chat operations.
-
-    Args:
-        backend: Backend instance
-        action: Action component
-        ctx: Context
-        format: Response format (BaseModel subclass or None)
-        tool_calls: Whether tool calling is enabled
-
-    Returns:
-        Context manager for the trace span
-    """
-    model_id = get_model_id_str(backend)
-    system_name = get_system_name(backend)
-    provider_name = get_provider_name(backend)
-
-    return trace_backend(
-        "chat",  # Gen-AI convention: use 'chat' for chat completions
-        **{
-            # Gen-AI semantic convention attributes
-            "gen_ai.system": system_name,
-            "gen_ai.provider.name": provider_name,
-            "gen_ai.request.model": model_id,
-            "gen_ai.operation.name": "chat",
-            # Mellea-specific attributes
-            "mellea.backend": backend.__class__.__name__,
-            "mellea.action_type": action.__class__.__name__,
-            "mellea.context_size": get_context_size(ctx),
-            "mellea.has_format": format is not None,
-            "mellea.format_type": format.__name__ if format else None,
-            "mellea.tool_calls_enabled": tool_calls,
-        },
-    )
-
-
 def start_generate_span(
     backend: Any, action: Any, ctx: Any, format: Any = None, tool_calls: bool = False
 ):
@@ -340,7 +300,6 @@ def finalize_backend_span(span: Any, *, error: Exception | None = None) -> None:
     "get_model_id_str",
     "get_provider_name",
     "get_system_name",
-    "instrument_generate_from_context",
     "instrument_generate_from_raw",
     "record_response_metadata",
     "record_token_usage",
diff --git a/test/telemetry/test_tracing.py b/test/telemetry/test_tracing.py
index af83de5ad..b1a058680 100644
--- a/test/telemetry/test_tracing.py
+++ b/test/telemetry/test_tracing.py
@@ -200,31 +200,6 @@ def __init__(self):
     assert get_context_size(ctx) == 3
 
 
-def test_instrument_generate_from_context():
-    """Test instrument_generate_from_context helper."""
-    from mellea.telemetry.backend_instrumentation import (
-        instrument_generate_from_context,
-    )
-
-    class MockBackend:
-        model_id = "test-model"
-
-    class MockAction:
-        pass
-
-    class MockContext:
-        turns = []
-
-    backend = MockBackend()
-    action = MockAction()
-    ctx = MockContext()
-
-    # Should return a context manager
-    with instrument_generate_from_context(backend, action, ctx) as span:
-        # Span will be None when tracing is disabled
-        assert span is None or hasattr(span, "set_attribute")
-
-
 def test_instrument_generate_from_raw():
     """Test instrument_generate_from_raw helper."""
     from mellea.telemetry.backend_instrumentation import instrument_generate_from_raw