From 3e8d665bc60de1da1d28f562fc790a0aa9b2ec00 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 7 May 2026 14:55:51 +0100 Subject: [PATCH 01/10] feat(telemetry): add is_content_tracing_enabled and add_span_event helpers Add MELLEA_TRACE_CONTENT env-var gate (also recognises the standard OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT) and expose add_span_event() as a safe no-op wrapper. Both exported from mellea.telemetry and mellea.telemetry.tracing. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/telemetry/__init__.py | 4 ++++ mellea/telemetry/tracing.py | 37 ++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/mellea/telemetry/__init__.py b/mellea/telemetry/__init__.py index ea9351252..6fb433a6c 100644 --- a/mellea/telemetry/__init__.py +++ b/mellea/telemetry/__init__.py @@ -92,9 +92,11 @@ def my_function(): ) from .pricing import is_pricing_enabled from .tracing import ( + add_span_event, end_backend_span, is_application_tracing_enabled, is_backend_tracing_enabled, + is_content_tracing_enabled, set_span_attribute, set_span_error, start_backend_span, @@ -104,6 +106,7 @@ def my_function(): __all__ = [ "MelleaContextFilter", + "add_span_event", "async_with_context", "create_counter", "create_histogram", @@ -118,6 +121,7 @@ def my_function(): "get_session_id", "is_application_tracing_enabled", "is_backend_tracing_enabled", + "is_content_tracing_enabled", "is_metrics_enabled", "is_pricing_enabled", "record_cost", diff --git a/mellea/telemetry/tracing.py b/mellea/telemetry/tracing.py index 12b7b486f..48a3bf5e6 100644 --- a/mellea/telemetry/tracing.py +++ b/mellea/telemetry/tracing.py @@ -10,6 +10,9 @@ Configuration via environment variables: - MELLEA_TRACE_APPLICATION: Enable/disable application tracing (default: false) - MELLEA_TRACE_BACKEND: Enable/disable backend tracing (default: false) +- MELLEA_TRACE_CONTENT: Capture prompt/response content in spans (default: false). 
+ Content may include PII — enable only in controlled environments. + Also recognised: OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT (OTel standard). - OTEL_EXPORTER_OTLP_ENDPOINT: OTLP endpoint for trace export - OTEL_SERVICE_NAME: Service name for traces (default: mellea) """ @@ -42,6 +45,11 @@ _TRACE_BACKEND_ENABLED = _OTEL_AVAILABLE and os.getenv( "MELLEA_TRACE_BACKEND", "false" ).lower() in ("true", "1", "yes") +_TRACE_CONTENT_ENABLED = _OTEL_AVAILABLE and ( + os.getenv("MELLEA_TRACE_CONTENT", "false").lower() in ("true", "1", "yes") + or os.getenv("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "false").lower() + in ("true", "1", "yes") +) _OTLP_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") _SERVICE_NAME = os.getenv("OTEL_SERVICE_NAME", "mellea") _CONSOLE_EXPORT = os.getenv("MELLEA_TRACE_CONSOLE", "false").lower() in ( @@ -113,6 +121,33 @@ def is_backend_tracing_enabled() -> bool: return _TRACE_BACKEND_ENABLED +def is_content_tracing_enabled() -> bool: + """Check if content capture is enabled. + + Content capture records prompt and response text on spans and may contain PII. + Enable only in controlled environments. + + Returns: + True if OpenTelemetry is available and enabled via ``MELLEA_TRACE_CONTENT`` or + ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT``. + """ + return _TRACE_CONTENT_ENABLED + + +def add_span_event( + span: Any, name: str, attributes: dict[str, Any] | None = None +) -> None: + """Add a named event to a span; no-op if span is None or OTel is unavailable. + + Args: + span: The span object (may be None if tracing is disabled). + name: Event name. + attributes: Optional event attributes. + """ + if span is not None and _OTEL_AVAILABLE: + span.add_event(name, attributes=attributes or {}) + + @contextmanager def trace_application(name: str, **attributes: Any) -> Generator[Any, None, None]: """Create an application trace span if application tracing is enabled. 
@@ -246,9 +281,11 @@ def set_span_error(span: Any, exception: Exception) -> None: __all__ = [ + "add_span_event", "end_backend_span", "is_application_tracing_enabled", "is_backend_tracing_enabled", + "is_content_tracing_enabled", "set_span_attribute", "set_span_error", "start_backend_span", From 30c36864ccf411573cc492ffb4e08557b54b586b Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 7 May 2026 14:57:04 +0100 Subject: [PATCH 02/10] feat(telemetry): surface gen_ai.provider.name, conversation.id, template attrs Five OTel GenAI semconv gaps closed (issue #1035): 1. gen_ai.provider.name emitted alongside legacy gen_ai.system (semconv v1.37.0 migration; keep both for dashboard back-compat). 2. gen_ai.conversation.id mapped from existing session_id ContextVar; the existing mellea.session_id attribute is preserved alongside it. 3. llm.prompt_template.template emitted unconditionally from Instruction and GenerativeStub; llm.prompt_template.variables gated behind MELLEA_TRACE_CONTENT (user data). 4. error.type (Stable OTel) set on the error path in the new finalize_backend_span() helper alongside set_span_error(). finalize_backend_span() replaces the three-line record_token_usage + record_response_metadata + end_backend_span pattern in each backend. 5. gen_ai.input.messages, gen_ai.output.messages, gen_ai.system_instructions emitted as structured JSON (spec v1.37.0 schema) when MELLEA_TRACE_CONTENT is enabled. No deprecated per-role events (gen_ai.user.message etc.) are emitted. A gen_ai.client.inference.operation.details span event is added as a marker for log-oriented receivers. Also adds gen_ai.request.temperature/top_p/top_k/frequency_penalty/ presence_penalty from model_options, and cache/reasoning token attrs in record_token_usage(). 
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/telemetry/backend_instrumentation.py | 251 +++++++++++++++++++- 1 file changed, 249 insertions(+), 2 deletions(-) diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py index 737da43ab..939fe2d6a 100644 --- a/mellea/telemetry/backend_instrumentation.py +++ b/mellea/telemetry/backend_instrumentation.py @@ -2,12 +2,24 @@ Follows OpenTelemetry Gen-AI semantic conventions: https://opentelemetry.io/docs/specs/semconv/gen-ai/ + +Content capture (``gen_ai.input.messages``, ``gen_ai.output.messages``, +``gen_ai.system_instructions``) is opt-in and gated by ``is_content_tracing_enabled()``. +These attributes may contain PII — enable only in controlled environments. """ +import json from typing import Any from ..backends.utils import get_value -from .tracing import set_span_attribute, trace_backend +from .tracing import ( + add_span_event, + end_backend_span, + is_content_tracing_enabled, + set_span_attribute, + set_span_error, + trace_backend, +) def get_model_id_str(backend: Any) -> str: @@ -30,6 +42,9 @@ def get_model_id_str(backend: Any) -> str: def get_system_name(backend: Any) -> str: """Get the Gen-AI system name from backend. + Kept for back-compatibility with existing dashboards keyed on ``gen_ai.system``. + New code should prefer ``get_provider_name()``. + Args: backend: Backend instance @@ -51,6 +66,21 @@ def get_system_name(backend: Any) -> str: return backend.__class__.__name__ +def get_provider_name(backend: Any) -> str: + """Get the Gen-AI provider name from backend. + + Returns the value for ``gen_ai.provider.name`` (semconv v1.37.0+), which + supersedes the deprecated ``gen_ai.system`` attribute. + + Args: + backend: Backend instance + + Returns: + Provider name (e.g., 'openai', 'ollama', 'huggingface') + """ + return get_system_name(backend) + + def get_context_size(ctx: Any) -> int: """Get the size of a context. 
@@ -95,6 +125,7 @@ def instrument_generate_from_context( **{ # Gen-AI semantic convention attributes "gen_ai.system": system_name, + "gen_ai.provider.name": system_name, "gen_ai.request.model": model_id, "gen_ai.operation.name": "chat", # Mellea-specific attributes @@ -109,7 +140,13 @@ def instrument_generate_from_context( def start_generate_span( - backend: Any, action: Any, ctx: Any, format: Any = None, tool_calls: bool = False + backend: Any, + action: Any, + ctx: Any, + format: Any = None, + tool_calls: bool = False, + *, + model_options: dict | None = None, ): """Start a backend trace span for generate_from_context (without auto-closing). @@ -122,6 +159,7 @@ def start_generate_span( ctx: Context format: Response format (BaseModel subclass or None) tool_calls: Whether tool calling is enabled + model_options: Raw model options dict for request-parameter attributes Returns: Span object or None if tracing is disabled @@ -137,6 +175,7 @@ def start_generate_span( span_attrs: dict = { # Gen-AI semantic convention attributes "gen_ai.system": system_name, + "gen_ai.provider.name": system_name, "gen_ai.request.model": model_id, "gen_ai.operation.name": "chat", # Mellea-specific attributes @@ -147,10 +186,39 @@ def start_generate_span( "mellea.format_type": format.__name__ if format else None, "mellea.tool_calls_enabled": tool_calls, } + # Propagate telemetry context to span for key, value in telemetry_ctx.items(): span_attrs[f"mellea.{key}"] = value + # gen_ai.conversation.id maps from the existing session_id ContextVar + session_id = telemetry_ctx.get("session_id") + if session_id is not None: + span_attrs["gen_ai.conversation.id"] = session_id + + # Request parameters from model_options (plain-string keys only) + if model_options: + for mellea_key, otel_key in _REQUEST_PARAM_MAP.items(): + val = model_options.get(mellea_key) + if val is not None: + span_attrs[otel_key] = val + + # Prompt template attributes (duck-typed; works for Instruction and GenerativeStub) + tmpl 
= getattr(action, "prompt_template_metadata", None) + if callable(tmpl): + metadata: Any = tmpl() + if metadata is not None: + template_text, template_vars, template_version = metadata + if template_text: + span_attrs["llm.prompt_template.template"] = template_text + if template_version: + span_attrs["llm.prompt_template.version"] = template_version + # Variables contain user-provided values — only emit with content gate + if template_vars and is_content_tracing_enabled(): + span_attrs["llm.prompt_template.variables"] = _serialize_json( + template_vars + ) + return start_backend_span("chat", **span_attrs) @@ -178,6 +246,7 @@ def instrument_generate_from_raw( **{ # Gen-AI semantic convention attributes "gen_ai.system": system_name, + "gen_ai.provider.name": system_name, "gen_ai.request.model": model_id, "gen_ai.operation.name": "text_completion", # Mellea-specific attributes @@ -214,6 +283,22 @@ def record_token_usage(span: Any, usage: Any) -> None: total_tokens = get_value(usage, "total_tokens") if total_tokens is not None: set_span_attribute(span, "gen_ai.usage.total_tokens", total_tokens) + + cache_read = get_value(usage, "cache_read_input_tokens") + if cache_read is not None: + set_span_attribute(span, "gen_ai.usage.cache_read.input_tokens", cache_read) + + cache_creation = get_value(usage, "cache_creation_input_tokens") + if cache_creation is not None: + set_span_attribute( + span, "gen_ai.usage.cache_creation.input_tokens", cache_creation + ) + + reasoning_tokens = get_value(usage, "reasoning_tokens") + if reasoning_tokens is not None: + set_span_attribute( + span, "gen_ai.usage.reasoning.output_tokens", reasoning_tokens + ) except Exception: # Don't fail if we can't extract token usage pass @@ -260,12 +345,174 @@ def record_response_metadata( pass +def finalize_backend_span( + span: Any, + *, + response: Any = None, + usage: Any = None, + model_id: str | None = None, + error: Exception | None = None, + conversation: list[dict] | None = None, + output_text: 
str | None = None, + finish_reason: str | None = None, +) -> None: + """Close a backend span, recording telemetry on both success and error paths. + + On the error path, records the exception, sets ``error.type``, and marks + the span with ERROR status before closing. On the success path, records + token usage, response metadata, and (when content capture is enabled) + structured input/output message attributes. + + This replaces the three-line ``record_token_usage`` + ``record_response_metadata`` + + ``end_backend_span`` pattern used in each backend's ``post_processing``. + + Args: + span: The span to finalise (no-op when ``None``). + response: Raw backend response (for model id, finish reason, response id). + usage: Token usage object or dict. + model_id: Explicit model id override. + error: Exception to record on the error path. + conversation: The prompt conversation (``list[dict]`` with ``role``/``content`` + keys). Used for ``gen_ai.input.messages`` and + ``gen_ai.system_instructions`` when content capture is enabled. + output_text: The assistant's reply text. Used for + ``gen_ai.output.messages`` when content capture is enabled. + finish_reason: Finish reason string (defaults to ``"stop"`` when omitted). + """ + if span is None: + return + + try: + try: + if error is not None: + set_span_error(span, error) + # error.type is a Stable OTel cross-signal attribute + set_span_attribute(span, "error.type", type(error).__name__) + else: + record_token_usage(span, usage) + record_response_metadata(span, response, model_id=model_id) + + if is_content_tracing_enabled() and conversation is not None: + _emit_content_attributes( + span, + conversation=conversation, + output_text=output_text, + finish_reason=finish_reason, + response=response, + ) + except Exception: + # Telemetry helpers must never break application code. 
+ pass + finally: + end_backend_span(span) + + +# --------------------------------------------------------------------------- +# Private helpers +# --------------------------------------------------------------------------- + +# Mapping from Mellea/OpenAI plain-string model_options keys to OTel request attrs. +_REQUEST_PARAM_MAP: dict[str, str] = { + "temperature": "gen_ai.request.temperature", + "top_p": "gen_ai.request.top_p", + "top_k": "gen_ai.request.top_k", + "frequency_penalty": "gen_ai.request.frequency_penalty", + "presence_penalty": "gen_ai.request.presence_penalty", +} + + +def _serialize_json(obj: Any) -> str: + """Serialise *obj* to a JSON string, coercing non-serialisable values to str.""" + return json.dumps(obj, default=str, ensure_ascii=False) + + +def _conversation_to_parts(conversation: list[dict]) -> tuple[list[dict], list[dict]]: + """Split a conversation into system instructions and input messages. + + Args: + conversation: List of ``{"role": ..., "content": ...}`` dicts. + + Returns: + Tuple of ``(system_parts, input_messages)`` in the spec JSON shape. + ``system_parts`` is a list of ``{"type": "text", "content": ...}`` items. + ``input_messages`` is a list of + ``{"role": ..., "parts": [{"type": "text", "content": ...}]}`` items. 
+ """ + system_parts: list[dict] = [] + input_messages: list[dict] = [] + for msg in conversation: + role = msg.get("role", "") + content = msg.get("content", "") + if role == "system": + system_parts.append({"type": "text", "content": str(content)}) + else: + input_messages.append( + {"role": role, "parts": [{"type": "text", "content": str(content)}]} + ) + return system_parts, input_messages + + +def _emit_content_attributes( + span: Any, + *, + conversation: list[dict], + output_text: str | None, + finish_reason: str | None, + response: Any = None, +) -> None: + """Set structured content attributes on the span (content gate must be checked by caller).""" + try: + system_parts, input_messages = _conversation_to_parts(conversation) + + if system_parts: + set_span_attribute( + span, "gen_ai.system_instructions", _serialize_json(system_parts) + ) + if input_messages: + set_span_attribute( + span, "gen_ai.input.messages", _serialize_json(input_messages) + ) + + # Attempt to derive output text from an OpenAI-format response if not provided + if output_text is None and response is not None: + try: + choices = get_value(response, "choices") + if choices: + first = choices[0] if isinstance(choices, list) else choices + msg = get_value(first, "message") + if msg is not None: + output_text = str(get_value(msg, "content") or "") + except Exception: + pass + + if output_text is not None: + output_msg = [ + { + "role": "assistant", + "parts": [{"type": "text", "content": output_text}], + "finish_reason": finish_reason or "stop", + } + ] + set_span_attribute( + span, "gen_ai.output.messages", _serialize_json(output_msg) + ) + + # Emit a marker span event for log-oriented receivers (carries no payload). 
+ add_span_event(span, "gen_ai.client.inference.operation.details") + except Exception: + # Content capture is best-effort — never fail the span close + pass + + __all__ = [ + "finalize_backend_span", "get_context_size", "get_model_id_str", + "get_provider_name", "get_system_name", "instrument_generate_from_context", "instrument_generate_from_raw", "record_response_metadata", "record_token_usage", + "start_generate_span", ] From a0d507459a1be130cad0fd4fb49194a8e824f5f8 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 7 May 2026 14:57:16 +0100 Subject: [PATCH 03/10] feat(components): retain prompt template text and variables for telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instruction: capture _template_description (raw string before Jinja substitution) and _user_variables (copy) in __init__; expose via prompt_template_metadata() returning (template, variables, version)|None. GenerativeStub: capture f_kwargs on each call; expose via prompt_template_metadata() using the function docstring as the template and f_kwargs as the variables. Neither change affects runtime behaviour — data is retained for duck-typed use by start_generate_span(). 
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/stdlib/components/genstub.py | 15 +++++++++++++++ mellea/stdlib/components/instruction.py | 23 +++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py index 05ca11088..94572b557 100644 --- a/mellea/stdlib/components/genstub.py +++ b/mellea/stdlib/components/genstub.py @@ -355,6 +355,7 @@ def __init__(self, func: Callable[P, R]): self._function = Function(func) self._arguments: Arguments | None = None + self._template_variables: dict = {} functools.update_wrapper(self, func) self._response_model = create_response_format(self._function._func) @@ -520,6 +521,18 @@ def _parse(self, computed: ModelOutputThunk) -> R: return function_response.result + def prompt_template_metadata(self) -> tuple[str, dict, None] | None: + """Return prompt template metadata for telemetry. + + Returns: + Tuple of ``(docstring, variables, version)`` when the function has + a docstring, otherwise ``None``. + """ + docstring = self._function._function_dict.get("docstring") + if not docstring: + return None + return str(docstring), dict(self._template_variables), None + class SyncGenerativeStub(GenerativeStub, Generic[P, R]): """A synchronous generative stub that blocks until the LLM response is ready. 
@@ -587,6 +600,7 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R: for r in extracted.precondition_requirements ] + stub_copy._template_variables = dict(extracted.f_kwargs) arguments = bind_function_arguments(self._function._func, **extracted.f_kwargs) if arguments: stub_args: list[Argument] = [] @@ -720,6 +734,7 @@ def __call__(self, *args, **kwargs) -> Coroutine[Any, Any, tuple[R, Context] | R for r in extracted.precondition_requirements ] + stub_copy._template_variables = dict(extracted.f_kwargs) arguments = bind_function_arguments(self._function._func, **extracted.f_kwargs) if arguments: stub_args: list[Argument] = [] diff --git a/mellea/stdlib/components/instruction.py b/mellea/stdlib/components/instruction.py index 30faaea20..b814b4bf3 100644 --- a/mellea/stdlib/components/instruction.py +++ b/mellea/stdlib/components/instruction.py @@ -63,6 +63,15 @@ def __init__( icl_examples = [] if icl_examples is None else icl_examples grounding_context = dict() if grounding_context is None else grounding_context + # Retain raw template before Jinja substitution for telemetry. + # Template text is the static prompt structure; variables may contain user data. + self._template_description: str | None = ( + description if isinstance(description, str) else None + ) + self._user_variables: dict[str, str] | None = ( + dict(user_variables) if user_variables else None + ) + # Apply templates. All inputs must be strings if provided. if user_variables is not None: if description is not None: @@ -189,6 +198,20 @@ def format_for_llm(self) -> TemplateRepresentation: template_order=["*", "Instruction"], ) + def prompt_template_metadata(self) -> tuple[str, dict[str, str], None] | None: + """Return prompt template metadata for telemetry. + + Both values are always returned; callers emit the template text freely but + must gate the variables on content capture (they may contain user data). 
+ + Returns: + Tuple of ``(template_text, variables, version)`` when a string + description was provided, otherwise ``None``. + """ + if self._template_description is None: + return None + return self._template_description, dict(self._user_variables or {}), None + @staticmethod def apply_user_dict_from_jinja(user_dict: dict[str, str], s: str) -> str: """Render a Jinja2 template string using the provided variable dictionary. From e6c40e7ac254df44ff01d72d1c071ec7b06b0c39 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 7 May 2026 14:57:37 +0100 Subject: [PATCH 04/10] refactor(backends): wire finalize_backend_span into all five backends Replace the duplicated record_token_usage + record_response_metadata + end_backend_span pattern in each backend's post_processing() with a single finalize_backend_span() call that also passes the conversation and output text for content capture. Pass model_options into start_generate_span() so request-parameter attributes (temperature, top_p, etc.) are surfaced on the span. The stream error path in core/base.py is also consolidated through finalize_backend_span(error=...). No behaviour change on the success path; error spans now carry error.type and ERROR status instead of silently closing. 
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/backends/huggingface.py | 28 ++++++++++++++-------------- mellea/backends/litellm.py | 28 ++++++++++++++-------------- mellea/backends/ollama.py | 34 +++++++++++++++++++--------------- mellea/backends/openai.py | 26 ++++++++++++++------------ mellea/backends/watsonx.py | 28 ++++++++++++++-------------- mellea/core/base.py | 5 ++--- 6 files changed, 77 insertions(+), 72 deletions(-) diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index 84507299e..f21e50c06 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -388,7 +388,12 @@ async def _generate_from_context( and an updated context that includes ``action`` and the new output. """ span = start_generate_span( - backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls + backend=self, + action=action, + ctx=ctx, + format=format, + tool_calls=tool_calls, + model_options=model_options, ) with with_context( @@ -1249,20 +1254,15 @@ class used during generation, if any. # Record tracing if span exists if span is not None: - from ..telemetry import end_backend_span - from ..telemetry.backend_instrumentation import ( - record_response_metadata, - record_token_usage, + from ..telemetry.backend_instrumentation import finalize_backend_span + + finalize_backend_span( + span, + usage=mot.generation.usage if mot.generation.usage else None, + model_id=self._get_hf_model_id(), + conversation=conversation, + output_text=str(mot.value) if mot.value is not None else None, ) - - if isinstance(hf_output, GenerateDecoderOnlyOutput): - record_response_metadata(span, hf_output) - if mot.generation.usage: - record_token_usage(span, mot.generation.usage) - - # Close the span now that async operation is complete - end_backend_span(span) - # Clean up span reference del mot._meta["_telemetry_span"] # When caching is disabled, clear hf_output from meta to free GPU memory. 
diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py index f7d912516..da98dec0e 100644 --- a/mellea/backends/litellm.py +++ b/mellea/backends/litellm.py @@ -164,7 +164,12 @@ async def _generate_from_context( "The Openai backend only supports chat-like contexts." ) span = start_generate_span( - backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls + backend=self, + action=action, + ctx=ctx, + format=format, + tool_calls=tool_calls, + model_options=model_options, ) _model_id_str = str(getattr(self, "model_id", "unknown")) @@ -561,21 +566,16 @@ async def post_processing( # Record telemetry now that response is available span = mot._meta.get("_telemetry_span") if span is not None: - from ..telemetry import end_backend_span - from ..telemetry.backend_instrumentation import ( - record_response_metadata, - record_token_usage, - ) + from ..telemetry.backend_instrumentation import finalize_backend_span response = mot._meta.get("litellm_chat_response") - if response: - # LiteLLM responses have usage information - if usage: - record_token_usage(span, usage) - record_response_metadata(span, response) - # Close the span now that async operation is complete - end_backend_span(span) - # Clean up the span reference + finalize_backend_span( + span, + response=response, + usage=usage, + model_id=str(self.model_id), + conversation=conversation, + ) del mot._meta["_telemetry_span"] @staticmethod diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index 5b50cd709..651902319 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -289,7 +289,9 @@ async def _generate_from_context( and an updated context that includes ``action`` and the new output. 
""" # Start span without auto-closing (will be closed in post_processing) - span = start_generate_span(self, action, ctx, format, tool_calls) + span = start_generate_span( + self, action, ctx, format, tool_calls, model_options=model_options + ) assert ctx.is_chat_context, ( "The ollama backend only supports chat-like contexts." @@ -720,21 +722,23 @@ async def post_processing( # Record telemetry and close span now that response is available span = mot._meta.get("_telemetry_span") if span is not None: - from ..telemetry import end_backend_span - from ..telemetry.backend_instrumentation import ( - record_response_metadata, - record_token_usage, + from ..telemetry.backend_instrumentation import finalize_backend_span + + output_text: str | None = None + if response is not None: + try: + msg = getattr(response, "message", None) + if msg is not None: + output_text = str(getattr(msg, "content", "") or "") + except Exception: + pass + + finalize_backend_span( + span, + usage=mot.generation.usage if mot.generation.usage else None, + conversation=conversation, + output_text=output_text, ) - - if response: - if mot.generation.usage: - record_token_usage(span, mot.generation.usage) - record_response_metadata(span, response) - - # Close the span now that telemetry is recorded - end_backend_span(span) - - # Clean up the span reference del mot._meta["_telemetry_span"] diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index 1eea93511..ace9c544e 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -467,7 +467,12 @@ async def _generate_from_context( # Start span without auto-closing (will be closed in post_processing) span = start_generate_span( - backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls + backend=self, + action=action, + ctx=ctx, + format=format, + tool_calls=tool_calls, + model_options=model_options, ) _model_id_str = str(getattr(self, "model_id", "unknown")) @@ -1122,18 +1127,15 @@ async def post_processing( # 
Record telemetry now that response is available span = mot._meta.get("_telemetry_span") if span is not None: - from ..telemetry import end_backend_span - from ..telemetry.backend_instrumentation import ( - record_response_metadata, - record_token_usage, + from ..telemetry.backend_instrumentation import finalize_backend_span + + finalize_backend_span( + span, + response=response, + usage=usage, + model_id=self._model_id, + conversation=conversation, ) - - if usage: - record_token_usage(span, usage) - record_response_metadata(span, response) - # Close the span now that async operation is complete - end_backend_span(span) - # Clean up the span reference del mot._meta["_telemetry_span"] @overload diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py index 87a0697d6..f0bcf80e2 100644 --- a/mellea/backends/watsonx.py +++ b/mellea/backends/watsonx.py @@ -303,7 +303,12 @@ async def _generate_from_context( "The watsonx.ai backend only supports chat-like contexts." ) span = start_generate_span( - backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls + backend=self, + action=action, + ctx=ctx, + format=format, + tool_calls=tool_calls, + model_options=model_options, ) _model_id_str = str(getattr(self, "model_id", "unknown")) @@ -606,20 +611,15 @@ async def post_processing( # Record tracing if span exists span = mot._meta.get("_telemetry_span") if span is not None: - from ..telemetry import end_backend_span - from ..telemetry.backend_instrumentation import ( - record_response_metadata, - record_token_usage, + from ..telemetry.backend_instrumentation import finalize_backend_span + + finalize_backend_span( + span, + response=response, + usage=usage, + model_id=str(self._get_watsonx_model_id()), + conversation=conversation, ) - - if usage: - record_token_usage(span, usage) - if response is not None: - record_response_metadata(span, response) - - # Close the span now that async operation is complete - end_backend_span(span) - # Clean up span 
reference del mot._meta["_telemetry_span"] # Generate the log for this ModelOutputThunk. diff --git a/mellea/core/base.py b/mellea/core/base.py index 5ab4aa935..3472bf4db 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -523,10 +523,9 @@ async def astream(self) -> str: # but we must not leak the span. span = self._meta.get("_telemetry_span") if span is not None: - from ..telemetry import end_backend_span, set_span_error + from ..telemetry.backend_instrumentation import finalize_backend_span - set_span_error(span, chunks[-1]) - end_backend_span(span) + finalize_backend_span(span, error=chunks[-1]) del self._meta["_telemetry_span"] # Fire generation_error hook (FIRE_AND_FORGET — does not block the raise) From ef992678e5f5d866f30680d7701d1b803fb4e92b Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 7 May 2026 14:57:52 +0100 Subject: [PATCH 05/10] test(telemetry): add unit tests and otelite example for semconv gaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test/telemetry/test_genai_semconv_emission.py — 20 pure-unit tests covering each of the five gaps (no live backend or OTel SDK required): - gen_ai.provider.name + gen_ai.system dual-emission - gen_ai.conversation.id from session_id ContextVar - llm.prompt_template.* from Instruction (always / gated) - error.type + ERROR status via finalize_backend_span - gen_ai.input/output.messages structured JSON (gated) - no deprecated per-role events emitted - finalize_backend_span robustness (None span, broken span) docs/examples/telemetry/otel_genai_semconv_example.py — runnable example for human verification against otelite, demonstrating all five attributes and the error path. 
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- docs/examples/telemetry/README.md | 4 + .../telemetry/otel_genai_semconv_example.py | 132 ++++++ test/telemetry/test_genai_semconv_emission.py | 378 ++++++++++++++++++ 3 files changed, 514 insertions(+) create mode 100644 docs/examples/telemetry/otel_genai_semconv_example.py create mode 100644 test/telemetry/test_genai_semconv_emission.py diff --git a/docs/examples/telemetry/README.md b/docs/examples/telemetry/README.md index fc79b1b6f..9458e9db0 100644 --- a/docs/examples/telemetry/README.md +++ b/docs/examples/telemetry/README.md @@ -6,6 +6,10 @@ This directory contains examples demonstrating OpenTelemetry tracing and metrics - **`telemetry_example.py`** - Demonstrates distributed tracing (application and backend traces) - **`metrics_example.py`** - Demonstrates token usage metrics collection +- **`otel_genai_semconv_example.py`** - Exercises the OTel GenAI semantic convention attributes + added in issue #1035 (`gen_ai.provider.name`, `gen_ai.conversation.id`, + `llm.prompt_template.*`, `error.type`, content capture). Designed for human + verification against [otelite](https://github.com/planetf1/otelite). ## Quick Start diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py new file mode 100644 index 000000000..9279ad98e --- /dev/null +++ b/docs/examples/telemetry/otel_genai_semconv_example.py @@ -0,0 +1,132 @@ +# pytest: ollama, e2e + +"""Example demonstrating OTel GenAI semantic convention attributes (issue #1035). 
+ +Exercises the five emission-gap fixes added in this issue so they can be verified +in otelite or any OTel-compatible backend: + + gen_ai.provider.name — provider identity (alongside legacy gen_ai.system) + gen_ai.conversation.id — mapped from session_id ContextVar + llm.prompt_template.* — template text (always) and variables (opt-in) + error.type — set on the error path alongside ERROR status + gen_ai.input/output.messages — structured content (opt-in via MELLEA_TRACE_CONTENT) + +Run against otelite for human verification: + + # Terminal 1 — start otelite (OTLP gRPC :4317, UI :8080) + docker run --rm -p 4317:4317 -p 8080:8080 ghcr.io/planetf1/otelite:latest + + # Terminal 2 — run with all attributes visible + export MELLEA_TRACE_BACKEND=1 + export MELLEA_TRACE_CONTENT=1 + export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 + export OTEL_SERVICE_NAME=mellea-semconv-demo + python otel_genai_semconv_example.py + + Then open http://localhost:8080 → select mellea-semconv-demo service. + +What to verify per span in otelite +----------------------------------- + Span "chat" + gen_ai.system = "ollama" (back-compat) + gen_ai.provider.name = "ollama" (new, semconv v1.37.0) + gen_ai.conversation.id = "demo-session-1" + mellea.session_id = "demo-session-1" (preserved) + llm.prompt_template.template = "Summarise {{topic}} in one sentence." + llm.prompt_template.variables = {"topic": "quantum tunnelling"} (only with MELLEA_TRACE_CONTENT) + gen_ai.input.messages = [...] (only with MELLEA_TRACE_CONTENT) + gen_ai.output.messages = [...] 
(only with MELLEA_TRACE_CONTENT) + + Span "chat" (error path) + error.type = "OllamaRequestError" (or similar) + status = ERROR +""" + +from mellea import start_session +from mellea.telemetry import ( + is_backend_tracing_enabled, + is_content_tracing_enabled, + with_context, +) + + +def _section(title: str) -> None: + print(f"\n{'=' * 60}") + print(f" {title}") + print("=" * 60) + + +def main() -> None: + _section("Mellea OTel GenAI Semantic Convention Demo") + print(f"Backend tracing: {is_backend_tracing_enabled()}") + print(f"Content capture: {is_content_tracing_enabled()}") + if not is_backend_tracing_enabled(): + print("\nSet MELLEA_TRACE_BACKEND=1 to enable backend spans.") + + # ----------------------------------------------------------------------- + # 1. Provider name + conversation id + prompt template attrs + # ----------------------------------------------------------------------- + _section("1. Provider name / conversation id / template attrs") + print("Expected span attrs:") + print(" gen_ai.system = 'ollama'") + print(" gen_ai.provider.name = 'ollama'") + print(" gen_ai.conversation.id = 'demo-session-1'") + print(" llm.prompt_template.template = 'Summarise {{topic}} in one sentence.'") + + with with_context(session_id="demo-session-1"): + with start_session() as m: + result = m.instruct( + "Summarise {{topic}} in one sentence.", + user_variables={"topic": "quantum tunnelling"}, + ) + print(f"\nOutput: {str(result)[:120]}") + + # ----------------------------------------------------------------------- + # 2. Content capture (opt-in) + # ----------------------------------------------------------------------- + if is_content_tracing_enabled(): + _section("2. 
Content capture (MELLEA_TRACE_CONTENT=1)") + print("Expected span attrs:") + print(" gen_ai.system_instructions — serialised system turns") + print(" gen_ai.input.messages — [{'role':'user','parts':[...]}]") + print( + " gen_ai.output.messages — [{'role':'assistant','parts':[...],'finish_reason':'stop'}]" + ) + print(" llm.prompt_template.variables = {'name': 'Ada'}") + + with start_session() as m2: + result2 = m2.instruct( + "Write a one-line greeting for {{name}}.", + user_variables={"name": "Ada"}, + ) + print(f"\nOutput: {str(result2)[:120]}") + else: + _section("2. Content capture (skipped — set MELLEA_TRACE_CONTENT=1)") + + # ----------------------------------------------------------------------- + # 3. Error path: error.type + ERROR status + # ----------------------------------------------------------------------- + _section("3. Error path — error.type on span") + print("Expected span attrs:") + print(" status = ERROR") + print(" error.type = ") + + try: + with start_session() as m3: + # Use a model name guaranteed to be absent on any Ollama instance. + m3._backend.model_id = "mellea-semconv-nonexistent-xyz" # type: ignore[attr-defined] + m3.instruct("Hello") + except Exception as exc: + print(f"\nGot expected error: {exc.__class__.__name__}") + else: + print( + "\n(No error — check the span for error.type if the model unexpectedly exists)" + ) + + _section("Done") + print("If OTEL_EXPORTER_OTLP_ENDPOINT is set, check your trace backend.") + print("If MELLEA_TRACE_CONSOLE=1, spans were printed to stdout above.") + + +if __name__ == "__main__": + main() diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py new file mode 100644 index 000000000..a3b2c3d0e --- /dev/null +++ b/test/telemetry/test_genai_semconv_emission.py @@ -0,0 +1,378 @@ +"""Unit tests for OTel GenAI semantic convention emission gaps (issue #1035). 
+ +All tests use a fake span object and do not require a live backend or +OpenTelemetry SDK installation. +""" + +import json +from unittest.mock import MagicMock, patch + +from mellea.telemetry.backend_instrumentation import ( + finalize_backend_span, + get_provider_name, + get_system_name, + start_generate_span, +) +from mellea.telemetry.context import with_context +from mellea.telemetry.tracing import is_content_tracing_enabled + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _mock_span() -> MagicMock: + return MagicMock() + + +def _fake_backend(class_name: str) -> object: + return type(class_name, (), {})() + + +def _span_attrs(span: MagicMock) -> dict: + """Collect all set_attribute calls into a flat dict.""" + return {call.args[0]: call.args[1] for call in span.set_attribute.call_args_list} + + +# --------------------------------------------------------------------------- +# gen_ai.provider.name alongside gen_ai.system +# --------------------------------------------------------------------------- + + +def test_provider_name_equals_system_name(): + backend = _fake_backend("OpenAIBackend") + assert get_provider_name(backend) == get_system_name(backend) == "openai" + + +def test_provider_name_emitted_in_start_generate_span(): + """Both gen_ai.system and gen_ai.provider.name should be set on the span.""" + backend = _fake_backend("OpenAIBackend") + backend.model_id = "gpt-4" # type: ignore[attr-defined] + action = MagicMock() + action.prompt_template_metadata = None + + with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: + mock_start.return_value = _mock_span() + start_generate_span(backend, action, ctx=[], format=None, tool_calls=False) + + call_kwargs = mock_start.call_args[1] + assert call_kwargs.get("gen_ai.system") == "openai" + assert call_kwargs.get("gen_ai.provider.name") == "openai" + + +# 
--------------------------------------------------------------------------- +# gen_ai.conversation.id from session_id ContextVar +# --------------------------------------------------------------------------- + + +def test_conversation_id_emitted_from_session_id(): + backend = _fake_backend("OpenAIBackend") + backend.model_id = "gpt-4" # type: ignore[attr-defined] + action = MagicMock() + action.prompt_template_metadata = None + + with with_context(session_id="sess-abc"): + with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: + mock_start.return_value = _mock_span() + start_generate_span(backend, action, ctx=[], format=None, tool_calls=False) + + call_kwargs = mock_start.call_args[1] + assert call_kwargs.get("gen_ai.conversation.id") == "sess-abc" + assert call_kwargs.get("mellea.session_id") == "sess-abc" + + +def test_conversation_id_absent_when_no_session(): + backend = _fake_backend("OpenAIBackend") + backend.model_id = "gpt-4" # type: ignore[attr-defined] + action = MagicMock() + action.prompt_template_metadata = None + + with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: + mock_start.return_value = _mock_span() + start_generate_span(backend, action, ctx=[], format=None, tool_calls=False) + + call_kwargs = mock_start.call_args[1] + assert "gen_ai.conversation.id" not in call_kwargs + + +# --------------------------------------------------------------------------- +# llm.prompt_template.* from Instruction +# --------------------------------------------------------------------------- + + +def test_prompt_template_attrs_from_instruction(): + from mellea.stdlib.components.instruction import Instruction + + instr = Instruction( + description="Summarise {{topic}} in one sentence.", + user_variables={"topic": "quantum tunnelling"}, + ) + + backend = _fake_backend("OpenAIBackend") + backend.model_id = "gpt-4" # type: ignore[attr-defined] + + with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: + 
mock_start.return_value = _mock_span() + start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) + + call_kwargs = mock_start.call_args[1] + # Template text is always emitted + assert call_kwargs.get("llm.prompt_template.template") == ( + "Summarise {{topic}} in one sentence." + ) + # Variables are NOT emitted when content capture is off (default) + assert "llm.prompt_template.variables" not in call_kwargs + + +def test_prompt_template_variables_emitted_when_content_enabled(monkeypatch): + from mellea.stdlib.components.instruction import Instruction + + instr = Instruction(description="Hello {{name}}", user_variables={"name": "World"}) + + backend = _fake_backend("OpenAIBackend") + backend.model_id = "gpt-4" # type: ignore[attr-defined] + + # Patch the content gate to True + monkeypatch.setattr( + "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled", + lambda: True, + ) + + with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: + mock_start.return_value = _mock_span() + start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) + + call_kwargs = mock_start.call_args[1] + variables_json = call_kwargs.get("llm.prompt_template.variables") + assert variables_json is not None + parsed = json.loads(variables_json) + assert parsed == {"name": "World"} + + +def test_instruction_without_user_variables_emits_template(): + from mellea.stdlib.components.instruction import Instruction + + instr = Instruction(description="Tell me about {{topic}}") + # No user_variables — template is retained as-is + + backend = _fake_backend("OpenAIBackend") + backend.model_id = "gpt-4" # type: ignore[attr-defined] + + with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: + mock_start.return_value = _mock_span() + start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) + + call_kwargs = mock_start.call_args[1] + assert call_kwargs.get("llm.prompt_template.template") == "Tell me about 
{{topic}}" + + +def test_instruction_with_no_description_emits_no_template(): + from mellea.stdlib.components.instruction import Instruction + + instr = Instruction() # no description + + backend = _fake_backend("OpenAIBackend") + backend.model_id = "gpt-4" # type: ignore[attr-defined] + + with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: + mock_start.return_value = _mock_span() + start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) + + call_kwargs = mock_start.call_args[1] + assert "llm.prompt_template.template" not in call_kwargs + + +# --------------------------------------------------------------------------- +# ERROR span status + error.type (finalize_backend_span error path) +# --------------------------------------------------------------------------- + + +def test_error_sets_status_and_error_type(): + span = _mock_span() + exc = RuntimeError("model rejected") + + with ( + patch( + "mellea.telemetry.backend_instrumentation.set_span_error" + ) as mock_set_err, + patch("mellea.telemetry.backend_instrumentation.end_backend_span") as mock_end, + ): + finalize_backend_span(span, error=exc) + + mock_set_err.assert_called_once_with(span, exc) + attrs = _span_attrs(span) + assert attrs.get("error.type") == "RuntimeError" + mock_end.assert_called_once_with(span) + + +def test_error_path_always_closes_span(): + span = _mock_span() + with patch("mellea.telemetry.backend_instrumentation.set_span_error"): + with patch( + "mellea.telemetry.backend_instrumentation.end_backend_span" + ) as mock_end: + finalize_backend_span(span, error=ValueError("x")) + mock_end.assert_called_once() + + +def test_finalize_never_raises_on_span_error(monkeypatch): + """finalize_backend_span must not propagate exceptions from helpers.""" + span = _mock_span() + span.set_attribute.side_effect = RuntimeError("span broke") + + with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): + with 
patch("mellea.telemetry.backend_instrumentation.set_span_error"): + # Should not raise even though set_attribute raises + finalize_backend_span(span, error=ValueError("test")) + + +def test_finalize_none_span_is_noop(): + finalize_backend_span(None, error=RuntimeError("x")) # no exception + + +# --------------------------------------------------------------------------- +# Content capture (gen_ai.input.messages etc.) gated by MELLEA_TRACE_CONTENT +# --------------------------------------------------------------------------- + + +def test_content_capture_disabled_by_default(): + span = _mock_span() + conversation = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, + ] + with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): + finalize_backend_span(span, conversation=conversation, output_text="Hi there") + + attrs = _span_attrs(span) + assert "gen_ai.input.messages" not in attrs + assert "gen_ai.output.messages" not in attrs + assert "gen_ai.system_instructions" not in attrs + + +def test_content_capture_emits_structured_attributes(monkeypatch): + monkeypatch.setattr( + "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled", + lambda: True, + ) + span = _mock_span() + conversation = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Tell me a joke."}, + ] + with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): + with patch("mellea.telemetry.backend_instrumentation.add_span_event"): + finalize_backend_span( + span, + conversation=conversation, + output_text="Why did the chicken cross the road?", + ) + + attrs = _span_attrs(span) + + # System instructions + sys_json = attrs.get("gen_ai.system_instructions") + assert sys_json is not None + sys_parts = json.loads(sys_json) + assert sys_parts == [{"type": "text", "content": "You are helpful."}] + + # Input messages (non-system) + in_json = attrs.get("gen_ai.input.messages") + assert in_json 
is not None + in_msgs = json.loads(in_json) + assert len(in_msgs) == 1 + assert in_msgs[0]["role"] == "user" + assert in_msgs[0]["parts"] == [{"type": "text", "content": "Tell me a joke."}] + + # Output messages + out_json = attrs.get("gen_ai.output.messages") + assert out_json is not None + out_msgs = json.loads(out_json) + assert out_msgs[0]["role"] == "assistant" + assert out_msgs[0]["parts"][0]["content"] == "Why did the chicken cross the road?" + assert "finish_reason" in out_msgs[0] + + +def test_content_capture_no_deprecated_per_role_events(monkeypatch): + """The deprecated gen_ai.user.message / gen_ai.assistant.message events must not be emitted.""" + monkeypatch.setattr( + "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled", + lambda: True, + ) + span = _mock_span() + with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): + finalize_backend_span( + span, conversation=[{"role": "user", "content": "hi"}], output_text="hello" + ) + + event_names = [call.args[0] for call in span.add_event.call_args_list] + deprecated = { + "gen_ai.user.message", + "gen_ai.assistant.message", + "gen_ai.system.message", + } + assert not deprecated.intersection(event_names) + + +def test_content_span_event_emitted(monkeypatch): + monkeypatch.setattr( + "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled", + lambda: True, + ) + span = _mock_span() + with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): + with patch( + "mellea.telemetry.backend_instrumentation.add_span_event" + ) as mock_event: + finalize_backend_span( + span, + conversation=[{"role": "user", "content": "hi"}], + output_text="hello", + ) + event_names = [call.args[1] for call in mock_event.call_args_list] + assert "gen_ai.client.inference.operation.details" in event_names + + +# --------------------------------------------------------------------------- +# _TRACE_CONTENT_ENABLED recognises OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT +# 
--------------------------------------------------------------------------- + + +def test_content_tracing_enabled_via_mellea_env(monkeypatch): + monkeypatch.setenv("MELLEA_TRACE_CONTENT", "true") + import mellea.telemetry.tracing as tracing_mod + + # Force re-evaluation of module-level constant + with patch.object(tracing_mod, "_TRACE_CONTENT_ENABLED", True): + assert tracing_mod.is_content_tracing_enabled() + + +def test_content_tracing_disabled_by_default(): + assert not is_content_tracing_enabled() + + +# --------------------------------------------------------------------------- +# Success path of finalize_backend_span calls record helpers +# --------------------------------------------------------------------------- + + +def test_success_path_calls_record_token_usage(): + span = _mock_span() + usage = {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + with patch( + "mellea.telemetry.backend_instrumentation.record_token_usage" + ) as mock_rtu: + with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): + finalize_backend_span(span, usage=usage) + mock_rtu.assert_called_once_with(span, usage) + + +def test_success_path_calls_record_response_metadata(): + span = _mock_span() + response = {"model": "gpt-4", "id": "resp-1"} + with patch( + "mellea.telemetry.backend_instrumentation.record_response_metadata" + ) as mock_rrm: + with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): + finalize_backend_span(span, response=response, model_id="gpt-4") + mock_rrm.assert_called_once_with(span, response, model_id="gpt-4") From d5db21f1c212711286853ed174a74774092cd4d0 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 7 May 2026 15:13:52 +0100 Subject: [PATCH 06/10] refactor: trim PR #1035 to gaps 1-4, defer content capture (gap 5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove finalize_backend_span success-path consolidation and all content capture helpers 
(_emit_content_attributes, _conversation_to_parts). Revert all five backend files to upstream/main — gap 5 requires touching every backend and is better reviewed in isolation. finalize_backend_span is kept as an error-path-only helper (sets error.type + ERROR status, then closes the span) used by the stream error path in ModelOutputThunk.__aiter__. Full implementation including gap 5 is preserved on cs/issue-1035-full. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- .../telemetry/otel_genai_semconv_example.py | 67 ++----- mellea/backends/huggingface.py | 28 +-- mellea/backends/litellm.py | 28 +-- mellea/backends/ollama.py | 34 ++-- mellea/backends/openai.py | 26 ++- mellea/backends/watsonx.py | 28 +-- mellea/telemetry/backend_instrumentation.py | 145 ++------------ test/telemetry/test_genai_semconv_emission.py | 177 ++---------------- 8 files changed, 116 insertions(+), 417 deletions(-) diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py index 9279ad98e..83a12aa82 100644 --- a/docs/examples/telemetry/otel_genai_semconv_example.py +++ b/docs/examples/telemetry/otel_genai_semconv_example.py @@ -2,25 +2,23 @@ """Example demonstrating OTel GenAI semantic convention attributes (issue #1035). -Exercises the five emission-gap fixes added in this issue so they can be verified -in otelite or any OTel-compatible backend: +Exercises gaps 1-4 so they can be verified in otelite or any OTel-compatible backend. +Gap 5 (content capture) is deferred — see cs/issue-1035-full for that implementation. 
gen_ai.provider.name — provider identity (alongside legacy gen_ai.system) gen_ai.conversation.id — mapped from session_id ContextVar llm.prompt_template.* — template text (always) and variables (opt-in) error.type — set on the error path alongside ERROR status - gen_ai.input/output.messages — structured content (opt-in via MELLEA_TRACE_CONTENT) Run against otelite for human verification: # Terminal 1 — start otelite (OTLP gRPC :4317, UI :8080) docker run --rm -p 4317:4317 -p 8080:8080 ghcr.io/planetf1/otelite:latest - # Terminal 2 — run with all attributes visible + # Terminal 2 export MELLEA_TRACE_BACKEND=1 - export MELLEA_TRACE_CONTENT=1 export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 export OTEL_SERVICE_NAME=mellea-semconv-demo python otel_genai_semconv_example.py Then open http://localhost:8080 → select mellea-semconv-demo service. @@ -33,9 +31,6 @@ gen_ai.conversation.id = "demo-session-1" mellea.session_id = "demo-session-1" (preserved) llm.prompt_template.template = "Summarise {{topic}} in one sentence." - llm.prompt_template.variables = {"topic": "quantum tunnelling"} (only with MELLEA_TRACE_CONTENT) - gen_ai.input.messages = [...] (only with MELLEA_TRACE_CONTENT) - gen_ai.output.messages = [...] 
(only with MELLEA_TRACE_CONTENT) Span "chat" (error path) error.type = "OllamaRequestError" (or similar) @@ -43,30 +38,23 @@ """ from mellea import start_session -from mellea.telemetry import ( - is_backend_tracing_enabled, - is_content_tracing_enabled, - with_context, -) +from mellea.telemetry import is_backend_tracing_enabled, with_context def _section(title: str) -> None: - print(f"\n{'=' * 60}") - print(f" {title}") - print("=" * 60) + print(f"\n{'=' * 60}\n {title}\n{'=' * 60}") def main() -> None: - _section("Mellea OTel GenAI Semantic Convention Demo") - print(f"Backend tracing: {is_backend_tracing_enabled()}") - print(f"Content capture: {is_content_tracing_enabled()}") + _section("Mellea OTel GenAI Semantic Convention Demo (gaps 1-4)") + print(f"Backend tracing: {is_backend_tracing_enabled()}") if not is_backend_tracing_enabled(): - print("\nSet MELLEA_TRACE_BACKEND=1 to enable backend spans.") + print("Set MELLEA_TRACE_BACKEND=1 to enable backend spans.") # ----------------------------------------------------------------------- - # 1. Provider name + conversation id + prompt template attrs + # Gaps 1-3: provider name, conversation id, prompt template attrs # ----------------------------------------------------------------------- - _section("1. Provider name / conversation id / template attrs") + _section("Gaps 1-3: provider name / conversation id / template") print("Expected span attrs:") print(" gen_ai.system = 'ollama'") print(" gen_ai.provider.name = 'ollama'") @@ -82,40 +70,17 @@ def main() -> None: print(f"\nOutput: {str(result)[:120]}") # ----------------------------------------------------------------------- - # 2. Content capture (opt-in) + # Gap 4: error.type + ERROR status # ----------------------------------------------------------------------- - if is_content_tracing_enabled(): - _section("2. 
Content capture (MELLEA_TRACE_CONTENT=1)") - print("Expected span attrs:") - print(" gen_ai.system_instructions — serialised system turns") - print(" gen_ai.input.messages — [{'role':'user','parts':[...]}]") - print( - " gen_ai.output.messages — [{'role':'assistant','parts':[...],'finish_reason':'stop'}]" - ) - print(" llm.prompt_template.variables = {'name': 'Ada'}") - - with start_session() as m2: - result2 = m2.instruct( - "Write a one-line greeting for {{name}}.", - user_variables={"name": "Ada"}, - ) - print(f"\nOutput: {str(result2)[:120]}") - else: - _section("2. Content capture (skipped — set MELLEA_TRACE_CONTENT=1)") - - # ----------------------------------------------------------------------- - # 3. Error path: error.type + ERROR status - # ----------------------------------------------------------------------- - _section("3. Error path — error.type on span") + _section("Gap 4: error.type on span") print("Expected span attrs:") print(" status = ERROR") print(" error.type = ") try: - with start_session() as m3: - # Use a model name guaranteed to be absent on any Ollama instance. - m3._backend.model_id = "mellea-semconv-nonexistent-xyz" # type: ignore[attr-defined] - m3.instruct("Hello") + with start_session() as m2: + m2._backend.model_id = "mellea-semconv-nonexistent-xyz" # type: ignore[attr-defined] + m2.instruct("Hello") except Exception as exc: print(f"\nGot expected error: {exc.__class__.__name__}") else: diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index f21e50c06..84507299e 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -388,12 +388,7 @@ async def _generate_from_context( and an updated context that includes ``action`` and the new output. 
""" span = start_generate_span( - backend=self, - action=action, - ctx=ctx, - format=format, - tool_calls=tool_calls, - model_options=model_options, + backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls ) with with_context( @@ -1254,15 +1249,20 @@ class used during generation, if any. # Record tracing if span exists if span is not None: - from ..telemetry.backend_instrumentation import finalize_backend_span - - finalize_backend_span( - span, - usage=mot.generation.usage if mot.generation.usage else None, - model_id=self._get_hf_model_id(), - conversation=conversation, - output_text=str(mot.value) if mot.value is not None else None, + from ..telemetry import end_backend_span + from ..telemetry.backend_instrumentation import ( + record_response_metadata, + record_token_usage, ) + + if isinstance(hf_output, GenerateDecoderOnlyOutput): + record_response_metadata(span, hf_output) + if mot.generation.usage: + record_token_usage(span, mot.generation.usage) + + # Close the span now that async operation is complete + end_backend_span(span) + # Clean up span reference del mot._meta["_telemetry_span"] # When caching is disabled, clear hf_output from meta to free GPU memory. diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py index da98dec0e..f7d912516 100644 --- a/mellea/backends/litellm.py +++ b/mellea/backends/litellm.py @@ -164,12 +164,7 @@ async def _generate_from_context( "The Openai backend only supports chat-like contexts." 
) span = start_generate_span( - backend=self, - action=action, - ctx=ctx, - format=format, - tool_calls=tool_calls, - model_options=model_options, + backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls ) _model_id_str = str(getattr(self, "model_id", "unknown")) @@ -566,16 +561,21 @@ async def post_processing( # Record telemetry now that response is available span = mot._meta.get("_telemetry_span") if span is not None: - from ..telemetry.backend_instrumentation import finalize_backend_span + from ..telemetry import end_backend_span + from ..telemetry.backend_instrumentation import ( + record_response_metadata, + record_token_usage, + ) response = mot._meta.get("litellm_chat_response") - finalize_backend_span( - span, - response=response, - usage=usage, - model_id=str(self.model_id), - conversation=conversation, - ) + if response: + # LiteLLM responses have usage information + if usage: + record_token_usage(span, usage) + record_response_metadata(span, response) + # Close the span now that async operation is complete + end_backend_span(span) + # Clean up the span reference del mot._meta["_telemetry_span"] @staticmethod diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index 651902319..5b50cd709 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -289,9 +289,7 @@ async def _generate_from_context( and an updated context that includes ``action`` and the new output. """ # Start span without auto-closing (will be closed in post_processing) - span = start_generate_span( - self, action, ctx, format, tool_calls, model_options=model_options - ) + span = start_generate_span(self, action, ctx, format, tool_calls) assert ctx.is_chat_context, ( "The ollama backend only supports chat-like contexts." 
@@ -722,23 +720,21 @@ async def post_processing( # Record telemetry and close span now that response is available span = mot._meta.get("_telemetry_span") if span is not None: - from ..telemetry.backend_instrumentation import finalize_backend_span - - output_text: str | None = None - if response is not None: - try: - msg = getattr(response, "message", None) - if msg is not None: - output_text = str(getattr(msg, "content", "") or "") - except Exception: - pass - - finalize_backend_span( - span, - usage=mot.generation.usage if mot.generation.usage else None, - conversation=conversation, - output_text=output_text, + from ..telemetry import end_backend_span + from ..telemetry.backend_instrumentation import ( + record_response_metadata, + record_token_usage, ) + + if response: + if mot.generation.usage: + record_token_usage(span, mot.generation.usage) + record_response_metadata(span, response) + + # Close the span now that telemetry is recorded + end_backend_span(span) + + # Clean up the span reference del mot._meta["_telemetry_span"] diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index ace9c544e..1eea93511 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -467,12 +467,7 @@ async def _generate_from_context( # Start span without auto-closing (will be closed in post_processing) span = start_generate_span( - backend=self, - action=action, - ctx=ctx, - format=format, - tool_calls=tool_calls, - model_options=model_options, + backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls ) _model_id_str = str(getattr(self, "model_id", "unknown")) @@ -1127,15 +1122,18 @@ async def post_processing( # Record telemetry now that response is available span = mot._meta.get("_telemetry_span") if span is not None: - from ..telemetry.backend_instrumentation import finalize_backend_span - - finalize_backend_span( - span, - response=response, - usage=usage, - model_id=self._model_id, - conversation=conversation, + from ..telemetry 
import end_backend_span + from ..telemetry.backend_instrumentation import ( + record_response_metadata, + record_token_usage, ) + + if usage: + record_token_usage(span, usage) + record_response_metadata(span, response) + # Close the span now that async operation is complete + end_backend_span(span) + # Clean up the span reference del mot._meta["_telemetry_span"] @overload diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py index f0bcf80e2..87a0697d6 100644 --- a/mellea/backends/watsonx.py +++ b/mellea/backends/watsonx.py @@ -303,12 +303,7 @@ async def _generate_from_context( "The watsonx.ai backend only supports chat-like contexts." ) span = start_generate_span( - backend=self, - action=action, - ctx=ctx, - format=format, - tool_calls=tool_calls, - model_options=model_options, + backend=self, action=action, ctx=ctx, format=format, tool_calls=tool_calls ) _model_id_str = str(getattr(self, "model_id", "unknown")) @@ -611,15 +606,20 @@ async def post_processing( # Record tracing if span exists span = mot._meta.get("_telemetry_span") if span is not None: - from ..telemetry.backend_instrumentation import finalize_backend_span - - finalize_backend_span( - span, - response=response, - usage=usage, - model_id=str(self._get_watsonx_model_id()), - conversation=conversation, + from ..telemetry import end_backend_span + from ..telemetry.backend_instrumentation import ( + record_response_metadata, + record_token_usage, ) + + if usage: + record_token_usage(span, usage) + if response is not None: + record_response_metadata(span, response) + + # Close the span now that async operation is complete + end_backend_span(span) + # Clean up span reference del mot._meta["_telemetry_span"] # Generate the log for this ModelOutputThunk. 
diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py index 939fe2d6a..fc648b918 100644 --- a/mellea/telemetry/backend_instrumentation.py +++ b/mellea/telemetry/backend_instrumentation.py @@ -2,10 +2,6 @@ Follows OpenTelemetry Gen-AI semantic conventions: https://opentelemetry.io/docs/specs/semconv/gen-ai/ - -Content capture (``gen_ai.input.messages``, ``gen_ai.output.messages``, -``gen_ai.system_instructions``) is opt-in and gated by ``is_content_tracing_enabled()``. -These attributes may contain PII — enable only in controlled environments. """ import json @@ -13,7 +9,6 @@ from ..backends.utils import get_value from .tracing import ( - add_span_event, end_backend_span, is_content_tracing_enabled, set_span_attribute, @@ -345,64 +340,28 @@ def record_response_metadata( pass -def finalize_backend_span( - span: Any, - *, - response: Any = None, - usage: Any = None, - model_id: str | None = None, - error: Exception | None = None, - conversation: list[dict] | None = None, - output_text: str | None = None, - finish_reason: str | None = None, -) -> None: - """Close a backend span, recording telemetry on both success and error paths. - - On the error path, records the exception, sets ``error.type``, and marks - the span with ERROR status before closing. On the success path, records - token usage, response metadata, and (when content capture is enabled) - structured input/output message attributes. +def finalize_backend_span(span: Any, *, error: Exception | None = None) -> None: + """Close a backend span on the error path, setting error.type and ERROR status. - This replaces the three-line ``record_token_usage`` + ``record_response_metadata`` - + ``end_backend_span`` pattern used in each backend's ``post_processing``. + Used by the streaming error path in ``ModelOutputThunk.__aiter__`` where a + span may be left open after an exception. 
Backends close spans on the + success path themselves via ``record_token_usage`` + ``record_response_metadata`` + + ``end_backend_span``. Args: span: The span to finalise (no-op when ``None``). - response: Raw backend response (for model id, finish reason, response id). - usage: Token usage object or dict. - model_id: Explicit model id override. - error: Exception to record on the error path. - conversation: The prompt conversation (``list[dict]`` with ``role``/``content`` - keys). Used for ``gen_ai.input.messages`` and - ``gen_ai.system_instructions`` when content capture is enabled. - output_text: The assistant's reply text. Used for - ``gen_ai.output.messages`` when content capture is enabled. - finish_reason: Finish reason string (defaults to ``"stop"`` when omitted). + error: Exception to record; sets ERROR status and ``error.type``. """ if span is None: return try: - try: - if error is not None: - set_span_error(span, error) - # error.type is a Stable OTel cross-signal attribute - set_span_attribute(span, "error.type", type(error).__name__) - else: - record_token_usage(span, usage) - record_response_metadata(span, response, model_id=model_id) - - if is_content_tracing_enabled() and conversation is not None: - _emit_content_attributes( - span, - conversation=conversation, - output_text=output_text, - finish_reason=finish_reason, - response=response, - ) - except Exception: - # Telemetry helpers must never break application code. - pass + if error is not None: + set_span_error(span, error) + # error.type is a Stable OTel cross-signal attribute + set_span_attribute(span, "error.type", type(error).__name__) + except Exception: + pass finally: end_backend_span(span) @@ -426,84 +385,6 @@ def _serialize_json(obj: Any) -> str: return json.dumps(obj, default=str, ensure_ascii=False) -def _conversation_to_parts(conversation: list[dict]) -> tuple[list[dict], list[dict]]: - """Split a conversation into system instructions and input messages. 
- - Args: - conversation: List of ``{"role": ..., "content": ...}`` dicts. - - Returns: - Tuple of ``(system_parts, input_messages)`` in the spec JSON shape. - ``system_parts`` is a list of ``{"type": "text", "content": ...}`` items. - ``input_messages`` is a list of - ``{"role": ..., "parts": [{"type": "text", "content": ...}]}`` items. - """ - system_parts: list[dict] = [] - input_messages: list[dict] = [] - for msg in conversation: - role = msg.get("role", "") - content = msg.get("content", "") - if role == "system": - system_parts.append({"type": "text", "content": str(content)}) - else: - input_messages.append( - {"role": role, "parts": [{"type": "text", "content": str(content)}]} - ) - return system_parts, input_messages - - -def _emit_content_attributes( - span: Any, - *, - conversation: list[dict], - output_text: str | None, - finish_reason: str | None, - response: Any = None, -) -> None: - """Set structured content attributes on the span (content gate must be checked by caller).""" - try: - system_parts, input_messages = _conversation_to_parts(conversation) - - if system_parts: - set_span_attribute( - span, "gen_ai.system_instructions", _serialize_json(system_parts) - ) - if input_messages: - set_span_attribute( - span, "gen_ai.input.messages", _serialize_json(input_messages) - ) - - # Attempt to derive output text from an OpenAI-format response if not provided - if output_text is None and response is not None: - try: - choices = get_value(response, "choices") - if choices: - first = choices[0] if isinstance(choices, list) else choices - msg = get_value(first, "message") - if msg is not None: - output_text = str(get_value(msg, "content") or "") - except Exception: - pass - - if output_text is not None: - output_msg = [ - { - "role": "assistant", - "parts": [{"type": "text", "content": output_text}], - "finish_reason": finish_reason or "stop", - } - ] - set_span_attribute( - span, "gen_ai.output.messages", _serialize_json(output_msg) - ) - - # Emit a span 
event so log-oriented receivers also see the content payload. - add_span_event(span, "gen_ai.client.inference.operation.details") - except Exception: - # Content capture is best-effort — never fail the span close - pass - - __all__ = [ "finalize_backend_span", "get_context_size", diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py index a3b2c3d0e..b6cd2ed88 100644 --- a/test/telemetry/test_genai_semconv_emission.py +++ b/test/telemetry/test_genai_semconv_emission.py @@ -1,5 +1,8 @@ """Unit tests for OTel GenAI semantic convention emission gaps (issue #1035). +Covers gaps 1-4. Gap 5 (content capture) is deferred; see cs/issue-1035-full +for the full implementation. + All tests use a fake span object and do not require a live backend or OpenTelemetry SDK installation. """ @@ -35,7 +38,7 @@ def _span_attrs(span: MagicMock) -> dict: # --------------------------------------------------------------------------- -# gen_ai.provider.name alongside gen_ai.system +# Gap 1: gen_ai.provider.name alongside gen_ai.system # --------------------------------------------------------------------------- @@ -61,7 +64,7 @@ def test_provider_name_emitted_in_start_generate_span(): # --------------------------------------------------------------------------- -# gen_ai.conversation.id from session_id ContextVar +# Gap 2: gen_ai.conversation.id from session_id ContextVar # --------------------------------------------------------------------------- @@ -96,7 +99,7 @@ def test_conversation_id_absent_when_no_session(): # --------------------------------------------------------------------------- -# llm.prompt_template.* from Instruction +# Gap 3: llm.prompt_template.* from Instruction # --------------------------------------------------------------------------- @@ -116,7 +119,6 @@ def test_prompt_template_attrs_from_instruction(): start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) call_kwargs = mock_start.call_args[1] 
- # Template text is always emitted assert call_kwargs.get("llm.prompt_template.template") == ( "Summarise {{topic}} in one sentence." ) @@ -132,7 +134,6 @@ def test_prompt_template_variables_emitted_when_content_enabled(monkeypatch): backend = _fake_backend("OpenAIBackend") backend.model_id = "gpt-4" # type: ignore[attr-defined] - # Patch the content gate to True monkeypatch.setattr( "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled", lambda: True, @@ -145,15 +146,13 @@ def test_prompt_template_variables_emitted_when_content_enabled(monkeypatch): call_kwargs = mock_start.call_args[1] variables_json = call_kwargs.get("llm.prompt_template.variables") assert variables_json is not None - parsed = json.loads(variables_json) - assert parsed == {"name": "World"} + assert json.loads(variables_json) == {"name": "World"} def test_instruction_without_user_variables_emits_template(): from mellea.stdlib.components.instruction import Instruction instr = Instruction(description="Tell me about {{topic}}") - # No user_variables — template is retained as-is backend = _fake_backend("OpenAIBackend") backend.model_id = "gpt-4" # type: ignore[attr-defined] @@ -162,14 +161,16 @@ def test_instruction_without_user_variables_emits_template(): mock_start.return_value = _mock_span() start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) - call_kwargs = mock_start.call_args[1] - assert call_kwargs.get("llm.prompt_template.template") == "Tell me about {{topic}}" + assert ( + mock_start.call_args[1].get("llm.prompt_template.template") + == "Tell me about {{topic}}" + ) def test_instruction_with_no_description_emits_no_template(): from mellea.stdlib.components.instruction import Instruction - instr = Instruction() # no description + instr = Instruction() backend = _fake_backend("OpenAIBackend") backend.model_id = "gpt-4" # type: ignore[attr-defined] @@ -178,12 +179,11 @@ def test_instruction_with_no_description_emits_no_template(): mock_start.return_value = 
_mock_span() start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) - call_kwargs = mock_start.call_args[1] - assert "llm.prompt_template.template" not in call_kwargs + assert "llm.prompt_template.template" not in mock_start.call_args[1] # --------------------------------------------------------------------------- -# ERROR span status + error.type (finalize_backend_span error path) +# Gap 4: ERROR span status + error.type # --------------------------------------------------------------------------- @@ -200,8 +200,7 @@ def test_error_sets_status_and_error_type(): finalize_backend_span(span, error=exc) mock_set_err.assert_called_once_with(span, exc) - attrs = _span_attrs(span) - assert attrs.get("error.type") == "RuntimeError" + assert _span_attrs(span).get("error.type") == "RuntimeError" mock_end.assert_called_once_with(span) @@ -215,164 +214,24 @@ def test_error_path_always_closes_span(): mock_end.assert_called_once() -def test_finalize_never_raises_on_span_error(monkeypatch): +def test_finalize_never_raises_on_span_error(): """finalize_backend_span must not propagate exceptions from helpers.""" span = _mock_span() span.set_attribute.side_effect = RuntimeError("span broke") with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): with patch("mellea.telemetry.backend_instrumentation.set_span_error"): - # Should not raise even though set_attribute raises finalize_backend_span(span, error=ValueError("test")) def test_finalize_none_span_is_noop(): - finalize_backend_span(None, error=RuntimeError("x")) # no exception - - -# --------------------------------------------------------------------------- -# Content capture (gen_ai.input.messages etc.) 
gated by MELLEA_TRACE_CONTENT -# --------------------------------------------------------------------------- - - -def test_content_capture_disabled_by_default(): - span = _mock_span() - conversation = [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": "Hello"}, - ] - with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): - finalize_backend_span(span, conversation=conversation, output_text="Hi there") - - attrs = _span_attrs(span) - assert "gen_ai.input.messages" not in attrs - assert "gen_ai.output.messages" not in attrs - assert "gen_ai.system_instructions" not in attrs - - -def test_content_capture_emits_structured_attributes(monkeypatch): - monkeypatch.setattr( - "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled", - lambda: True, - ) - span = _mock_span() - conversation = [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": "Tell me a joke."}, - ] - with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): - with patch("mellea.telemetry.backend_instrumentation.add_span_event"): - finalize_backend_span( - span, - conversation=conversation, - output_text="Why did the chicken cross the road?", - ) - - attrs = _span_attrs(span) - - # System instructions - sys_json = attrs.get("gen_ai.system_instructions") - assert sys_json is not None - sys_parts = json.loads(sys_json) - assert sys_parts == [{"type": "text", "content": "You are helpful."}] - - # Input messages (non-system) - in_json = attrs.get("gen_ai.input.messages") - assert in_json is not None - in_msgs = json.loads(in_json) - assert len(in_msgs) == 1 - assert in_msgs[0]["role"] == "user" - assert in_msgs[0]["parts"] == [{"type": "text", "content": "Tell me a joke."}] - - # Output messages - out_json = attrs.get("gen_ai.output.messages") - assert out_json is not None - out_msgs = json.loads(out_json) - assert out_msgs[0]["role"] == "assistant" - assert out_msgs[0]["parts"][0]["content"] == "Why 
did the chicken cross the road?" - assert "finish_reason" in out_msgs[0] - - -def test_content_capture_no_deprecated_per_role_events(monkeypatch): - """The deprecated gen_ai.user.message / gen_ai.assistant.message events must not be emitted.""" - monkeypatch.setattr( - "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled", - lambda: True, - ) - span = _mock_span() - with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): - finalize_backend_span( - span, conversation=[{"role": "user", "content": "hi"}], output_text="hello" - ) - - event_names = [call.args[0] for call in span.add_event.call_args_list] - deprecated = { - "gen_ai.user.message", - "gen_ai.assistant.message", - "gen_ai.system.message", - } - assert not deprecated.intersection(event_names) - - -def test_content_span_event_emitted(monkeypatch): - monkeypatch.setattr( - "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled", - lambda: True, - ) - span = _mock_span() - with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): - with patch( - "mellea.telemetry.backend_instrumentation.add_span_event" - ) as mock_event: - finalize_backend_span( - span, - conversation=[{"role": "user", "content": "hi"}], - output_text="hello", - ) - event_names = [call.args[1] for call in mock_event.call_args_list] - assert "gen_ai.client.inference.operation.details" in event_names + finalize_backend_span(None, error=RuntimeError("x")) # --------------------------------------------------------------------------- -# _TRACE_CONTENT_ENABLED recognises OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT +# Content tracing default (infrastructure for deferred gap 5) # --------------------------------------------------------------------------- -def test_content_tracing_enabled_via_mellea_env(monkeypatch): - monkeypatch.setenv("MELLEA_TRACE_CONTENT", "true") - import mellea.telemetry.tracing as tracing_mod - - # Force re-evaluation of module-level constant - with 
patch.object(tracing_mod, "_TRACE_CONTENT_ENABLED", True): - assert tracing_mod.is_content_tracing_enabled() - - def test_content_tracing_disabled_by_default(): assert not is_content_tracing_enabled() - - -# --------------------------------------------------------------------------- -# Success path of finalize_backend_span calls record helpers -# --------------------------------------------------------------------------- - - -def test_success_path_calls_record_token_usage(): - span = _mock_span() - usage = {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} - with patch( - "mellea.telemetry.backend_instrumentation.record_token_usage" - ) as mock_rtu: - with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): - finalize_backend_span(span, usage=usage) - mock_rtu.assert_called_once_with(span, usage) - - -def test_success_path_calls_record_response_metadata(): - span = _mock_span() - response = {"model": "gpt-4", "id": "resp-1"} - with patch( - "mellea.telemetry.backend_instrumentation.record_response_metadata" - ) as mock_rrm: - with patch("mellea.telemetry.backend_instrumentation.end_backend_span"): - finalize_backend_span(span, response=response, model_id="gpt-4") - mock_rrm.assert_called_once_with(span, response, model_id="gpt-4") From 1185b5088cc50b1f6dd3f6a8e8f4990266d2843f Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 13 May 2026 08:07:24 +0100 Subject: [PATCH 07/10] refactor(telemetry): trim PR to gaps 1, 2, 4 per review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop gap 3 (prompt-template capture) and the model_options/_REQUEST_PARAM_MAP plumbing in response to review feedback from @jakelorocco and @ajbozarth. Jake's objection to gap 3 is correct: stashing template state on Instruction and GenerativeStub is the wrong layer — it only covers two component types, captures pre-substitution values, and puts telemetry concerns inside domain objects. 
The right implementation is at the formatter render path, which covers all component types. That work belongs after #1045 lands. The model_options/_REQUEST_PARAM_MAP block was dead code: no backend call site passes model_options, and even if wired the values would be pre-substitution. Per Nathan's review, the right call is to drop rather than carry forward a no-op. Request-param emission also belongs in the post-#1045 plugin layer where the wire-format dict is visible. What remains in this PR: gap 1 — gen_ai.provider.name alongside legacy gen_ai.system gap 2 — gen_ai.conversation.id from session_id ContextVar gap 4 — error.type + ERROR status via finalize_backend_span cache/reasoning token fields in record_token_usage MELLEA_TRACE_CONTENT flag + add_span_event (infrastructure for future gap 5) Also fix OTel_SERVICE_NAME typo in the example (case-sensitive on Linux) and rewrite the example docstring and README entry to be PR-independent. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- docs/examples/telemetry/README.md | 7 +- .../telemetry/otel_genai_semconv_example.py | 49 +++++----- mellea/stdlib/components/genstub.py | 15 --- mellea/stdlib/components/instruction.py | 23 ----- mellea/telemetry/backend_instrumentation.py | 60 +----------- test/telemetry/test_genai_semconv_emission.py | 96 +------------------ 6 files changed, 31 insertions(+), 219 deletions(-) diff --git a/docs/examples/telemetry/README.md b/docs/examples/telemetry/README.md index 9458e9db0..a91575c2a 100644 --- a/docs/examples/telemetry/README.md +++ b/docs/examples/telemetry/README.md @@ -6,10 +6,9 @@ This directory contains examples demonstrating OpenTelemetry tracing and metrics - **`telemetry_example.py`** - Demonstrates distributed tracing (application and backend traces) - **`metrics_example.py`** - Demonstrates token usage metrics collection -- **`otel_genai_semconv_example.py`** - Exercises the OTel GenAI semantic convention attributes - added in issue #1035 (`gen_ai.provider.name`, 
`gen_ai.conversation.id`, - `llm.prompt_template.*`, `error.type`, content capture). Designed for human - verification against [otelite](https://github.com/planetf1/otelite). +- **`otel_genai_semconv_example.py`** - Verifies OTel GenAI semantic convention attributes + emitted on backend spans (`gen_ai.provider.name`, `gen_ai.conversation.id`, `error.type`). + Designed for human verification against [otelite](https://github.com/planetf1/otelite). ## Quick Start diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py index 83a12aa82..bbab23356 100644 --- a/docs/examples/telemetry/otel_genai_semconv_example.py +++ b/docs/examples/telemetry/otel_genai_semconv_example.py @@ -1,14 +1,14 @@ # pytest: ollama, e2e -"""Example demonstrating OTel GenAI semantic convention attributes (issue #1035). +"""Mellea backend spans carrying OTel GenAI semantic convention attributes. -Exercises gaps 1-4 so they can be verified in otelite or any OTel-compatible backend. -Gap 5 (content capture) is deferred — see cs/issue-1035-full for that implementation. 
+Each backend generation call emits a ``chat`` span with the following attributes +drawn from the OTel GenAI semconv (https://opentelemetry.io/docs/specs/semconv/gen-ai/): - gen_ai.provider.name — provider identity (alongside legacy gen_ai.system) - gen_ai.conversation.id — mapped from session_id ContextVar - llm.prompt_template.* — template text (always) and variables (opt-in) - error.type — set on the error path alongside ERROR status + gen_ai.provider.name — provider identity (current semconv) + gen_ai.system — same value, retained for back-compat with existing dashboards + gen_ai.conversation.id — correlated to the active session via ``with_context`` + error.type — set on the error path alongside ERROR span status Run against otelite for human verification: @@ -18,22 +18,21 @@ # Terminal 2 export MELLEA_TRACE_BACKEND=1 export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 - export OTel_SERVICE_NAME=mellea-semconv-demo + export OTEL_SERVICE_NAME=mellea-semconv-demo python otel_genai_semconv_example.py - Then open http://localhost:8080 → select mellea-semconv-demo service. + Then open http://localhost:8080 and select the mellea-semconv-demo service. -What to verify per span in otelite ------------------------------------ - Span "chat" - gen_ai.system = "ollama" (back-compat) - gen_ai.provider.name = "ollama" (new, semconv v1.37.0) +Expected span attributes +------------------------ + Span "chat" (normal path) + gen_ai.system = "ollama" + gen_ai.provider.name = "ollama" gen_ai.conversation.id = "demo-session-1" - mellea.session_id = "demo-session-1" (preserved) - llm.prompt_template.template = "Summarise {{topic}} in one sentence." 
+ mellea.session_id = "demo-session-1" Span "chat" (error path) - error.type = "OllamaRequestError" (or similar) + error.type = status = ERROR """ @@ -46,33 +45,29 @@ def _section(title: str) -> None: def main() -> None: - _section("Mellea OTel GenAI Semantic Convention Demo (gaps 1-4)") + _section("Mellea OTel GenAI Semantic Convention Demo") print(f"Backend tracing: {is_backend_tracing_enabled()}") if not is_backend_tracing_enabled(): print("Set MELLEA_TRACE_BACKEND=1 to enable backend spans.") # ----------------------------------------------------------------------- - # Gaps 1-3: provider name, conversation id, prompt template attrs + # Normal path: provider name + conversation id # ----------------------------------------------------------------------- - _section("Gaps 1-3: provider name / conversation id / template") + _section("Normal path — provider name and conversation id") print("Expected span attrs:") print(" gen_ai.system = 'ollama'") print(" gen_ai.provider.name = 'ollama'") print(" gen_ai.conversation.id = 'demo-session-1'") - print(" llm.prompt_template.template = 'Summarise {{topic}} in one sentence.'") with with_context(session_id="demo-session-1"): with start_session() as m: - result = m.instruct( - "Summarise {{topic}} in one sentence.", - user_variables={"topic": "quantum tunnelling"}, - ) + result = m.instruct("Summarise quantum tunnelling in one sentence.") print(f"\nOutput: {str(result)[:120]}") # ----------------------------------------------------------------------- - # Gap 4: error.type + ERROR status + # Error path: error.type + ERROR status # ----------------------------------------------------------------------- - _section("Gap 4: error.type on span") + _section("Error path — error.type on span") print("Expected span attrs:") print(" status = ERROR") print(" error.type = ") diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py index 94572b557..05ca11088 100644 --- a/mellea/stdlib/components/genstub.py 
+++ b/mellea/stdlib/components/genstub.py @@ -355,7 +355,6 @@ def __init__(self, func: Callable[P, R]): self._function = Function(func) self._arguments: Arguments | None = None - self._template_variables: dict = {} functools.update_wrapper(self, func) self._response_model = create_response_format(self._function._func) @@ -521,18 +520,6 @@ def _parse(self, computed: ModelOutputThunk) -> R: return function_response.result - def prompt_template_metadata(self) -> tuple[str, dict, None] | None: - """Return prompt template metadata for telemetry. - - Returns: - Tuple of ``(docstring, variables, version)`` when the function has - a docstring, otherwise ``None``. - """ - docstring = self._function._function_dict.get("docstring") - if not docstring: - return None - return str(docstring), dict(self._template_variables), None - class SyncGenerativeStub(GenerativeStub, Generic[P, R]): """A synchronous generative stub that blocks until the LLM response is ready. @@ -600,7 +587,6 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R: for r in extracted.precondition_requirements ] - stub_copy._template_variables = dict(extracted.f_kwargs) arguments = bind_function_arguments(self._function._func, **extracted.f_kwargs) if arguments: stub_args: list[Argument] = [] @@ -734,7 +720,6 @@ def __call__(self, *args, **kwargs) -> Coroutine[Any, Any, tuple[R, Context] | R for r in extracted.precondition_requirements ] - stub_copy._template_variables = dict(extracted.f_kwargs) arguments = bind_function_arguments(self._function._func, **extracted.f_kwargs) if arguments: stub_args: list[Argument] = [] diff --git a/mellea/stdlib/components/instruction.py b/mellea/stdlib/components/instruction.py index b814b4bf3..30faaea20 100644 --- a/mellea/stdlib/components/instruction.py +++ b/mellea/stdlib/components/instruction.py @@ -63,15 +63,6 @@ def __init__( icl_examples = [] if icl_examples is None else icl_examples grounding_context = dict() if grounding_context is None else 
grounding_context - # Retain raw template before Jinja substitution for telemetry. - # Template text is the static prompt structure; variables may contain user data. - self._template_description: str | None = ( - description if isinstance(description, str) else None - ) - self._user_variables: dict[str, str] | None = ( - dict(user_variables) if user_variables else None - ) - # Apply templates. All inputs must be strings if provided. if user_variables is not None: if description is not None: @@ -198,20 +189,6 @@ def format_for_llm(self) -> TemplateRepresentation: template_order=["*", "Instruction"], ) - def prompt_template_metadata(self) -> tuple[str, dict[str, str], None] | None: - """Return prompt template metadata for telemetry. - - The raw template text is emitted unconditionally. Variables are only - emitted when content capture is enabled (they may contain user data). - - Returns: - Tuple of ``(template_text, variables, version)`` when a string - description was provided, otherwise ``None``. - """ - if self._template_description is None: - return None - return self._template_description, dict(self._user_variables or {}), None - @staticmethod def apply_user_dict_from_jinja(user_dict: dict[str, str], s: str) -> str: """Render a Jinja2 template string using the provided variable dictionary. 
diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py index fc648b918..21ef5dccd 100644 --- a/mellea/telemetry/backend_instrumentation.py +++ b/mellea/telemetry/backend_instrumentation.py @@ -4,17 +4,10 @@ https://opentelemetry.io/docs/specs/semconv/gen-ai/ """ -import json from typing import Any from ..backends.utils import get_value -from .tracing import ( - end_backend_span, - is_content_tracing_enabled, - set_span_attribute, - set_span_error, - trace_backend, -) +from .tracing import end_backend_span, set_span_attribute, set_span_error, trace_backend def get_model_id_str(backend: Any) -> str: @@ -135,13 +128,7 @@ def instrument_generate_from_context( def start_generate_span( - backend: Any, - action: Any, - ctx: Any, - format: Any = None, - tool_calls: bool = False, - *, - model_options: dict | None = None, + backend: Any, action: Any, ctx: Any, format: Any = None, tool_calls: bool = False ): """Start a backend trace span for generate_from_context (without auto-closing). 
@@ -154,7 +141,6 @@ def start_generate_span( ctx: Context format: Response format (BaseModel subclass or None) tool_calls: Whether tool calling is enabled - model_options: Raw model options dict for request-parameter attributes Returns: Span object or None if tracing is disabled @@ -191,29 +177,6 @@ def start_generate_span( if session_id is not None: span_attrs["gen_ai.conversation.id"] = session_id - # Request parameters from model_options (plain-string keys only) - if model_options: - for mellea_key, otel_key in _REQUEST_PARAM_MAP.items(): - val = model_options.get(mellea_key) - if val is not None: - span_attrs[otel_key] = val - - # Prompt template attributes (duck-typed; works for Instruction and GenerativeStub) - tmpl = getattr(action, "prompt_template_metadata", None) - if callable(tmpl): - metadata: Any = tmpl() - if metadata is not None: - template_text, template_vars, template_version = metadata - if template_text: - span_attrs["llm.prompt_template.template"] = template_text - if template_version: - span_attrs["llm.prompt_template.version"] = template_version - # Variables contain user-provided values — only emit with content gate - if template_vars and is_content_tracing_enabled(): - span_attrs["llm.prompt_template.variables"] = _serialize_json( - template_vars - ) - return start_backend_span("chat", **span_attrs) @@ -366,25 +329,6 @@ def finalize_backend_span(span: Any, *, error: Exception | None = None) -> None: end_backend_span(span) -# --------------------------------------------------------------------------- -# Private helpers -# --------------------------------------------------------------------------- - -# Mapping from Mellea/OpenAI plain-string model_options keys to OTel request attrs. 
-_REQUEST_PARAM_MAP: dict[str, str] = { - "temperature": "gen_ai.request.temperature", - "top_p": "gen_ai.request.top_p", - "top_k": "gen_ai.request.top_k", - "frequency_penalty": "gen_ai.request.frequency_penalty", - "presence_penalty": "gen_ai.request.presence_penalty", -} - - -def _serialize_json(obj: Any) -> str: - """Serialise *obj* to a JSON string, coercing non-serialisable values to str.""" - return json.dumps(obj, default=str, ensure_ascii=False) - - __all__ = [ "finalize_backend_span", "get_context_size", diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py index b6cd2ed88..fbb88462e 100644 --- a/test/telemetry/test_genai_semconv_emission.py +++ b/test/telemetry/test_genai_semconv_emission.py @@ -1,13 +1,11 @@ -"""Unit tests for OTel GenAI semantic convention emission gaps (issue #1035). +"""Unit tests for OTel GenAI semantic convention attribute emission. -Covers gaps 1-4. Gap 5 (content capture) is deferred; see cs/issue-1035-full -for the full implementation. +Covers: gen_ai.provider.name (gap 1), gen_ai.conversation.id (gap 2), +error.type + ERROR status (gap 4), and the MELLEA_TRACE_CONTENT flag. -All tests use a fake span object and do not require a live backend or -OpenTelemetry SDK installation. +All tests use a fake span and do not require a live backend or OTel SDK. 
""" -import json from unittest.mock import MagicMock, patch from mellea.telemetry.backend_instrumentation import ( @@ -52,7 +50,6 @@ def test_provider_name_emitted_in_start_generate_span(): backend = _fake_backend("OpenAIBackend") backend.model_id = "gpt-4" # type: ignore[attr-defined] action = MagicMock() - action.prompt_template_metadata = None with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: mock_start.return_value = _mock_span() @@ -88,7 +85,6 @@ def test_conversation_id_absent_when_no_session(): backend = _fake_backend("OpenAIBackend") backend.model_id = "gpt-4" # type: ignore[attr-defined] action = MagicMock() - action.prompt_template_metadata = None with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: mock_start.return_value = _mock_span() @@ -98,90 +94,6 @@ def test_conversation_id_absent_when_no_session(): assert "gen_ai.conversation.id" not in call_kwargs -# --------------------------------------------------------------------------- -# Gap 3: llm.prompt_template.* from Instruction -# --------------------------------------------------------------------------- - - -def test_prompt_template_attrs_from_instruction(): - from mellea.stdlib.components.instruction import Instruction - - instr = Instruction( - description="Summarise {{topic}} in one sentence.", - user_variables={"topic": "quantum tunnelling"}, - ) - - backend = _fake_backend("OpenAIBackend") - backend.model_id = "gpt-4" # type: ignore[attr-defined] - - with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: - mock_start.return_value = _mock_span() - start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) - - call_kwargs = mock_start.call_args[1] - assert call_kwargs.get("llm.prompt_template.template") == ( - "Summarise {{topic}} in one sentence." 
- ) - # Variables are NOT emitted when content capture is off (default) - assert "llm.prompt_template.variables" not in call_kwargs - - -def test_prompt_template_variables_emitted_when_content_enabled(monkeypatch): - from mellea.stdlib.components.instruction import Instruction - - instr = Instruction(description="Hello {{name}}", user_variables={"name": "World"}) - - backend = _fake_backend("OpenAIBackend") - backend.model_id = "gpt-4" # type: ignore[attr-defined] - - monkeypatch.setattr( - "mellea.telemetry.backend_instrumentation.is_content_tracing_enabled", - lambda: True, - ) - - with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: - mock_start.return_value = _mock_span() - start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) - - call_kwargs = mock_start.call_args[1] - variables_json = call_kwargs.get("llm.prompt_template.variables") - assert variables_json is not None - assert json.loads(variables_json) == {"name": "World"} - - -def test_instruction_without_user_variables_emits_template(): - from mellea.stdlib.components.instruction import Instruction - - instr = Instruction(description="Tell me about {{topic}}") - - backend = _fake_backend("OpenAIBackend") - backend.model_id = "gpt-4" # type: ignore[attr-defined] - - with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: - mock_start.return_value = _mock_span() - start_generate_span(backend, instr, ctx=[], format=None, tool_calls=False) - - assert ( - mock_start.call_args[1].get("llm.prompt_template.template") - == "Tell me about {{topic}}" - ) - - -def test_instruction_with_no_description_emits_no_template(): - from mellea.stdlib.components.instruction import Instruction - - instr = Instruction() - - backend = _fake_backend("OpenAIBackend") - backend.model_id = "gpt-4" # type: ignore[attr-defined] - - with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: - mock_start.return_value = _mock_span() - start_generate_span(backend, 
instr, ctx=[], format=None, tool_calls=False) - - assert "llm.prompt_template.template" not in mock_start.call_args[1] - - # --------------------------------------------------------------------------- # Gap 4: ERROR span status + error.type # --------------------------------------------------------------------------- From 292a7f2ea630616b9c961ee796d6349479d0861a Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 13 May 2026 08:47:15 +0100 Subject: [PATCH 08/10] test(telemetry): remove stale gap-3 artefact; add add_span_event tests Remove the `action.prompt_template_metadata = None` assignment left over from the gap-3 prompt-template work that was withdrawn from this PR. The attribute is never read by `start_generate_span` in the trimmed implementation, making the line misleading. Add three unit tests for `add_span_event` (event forwarded to span, None-span no-op, empty-attributes default) patching `_OTEL_AVAILABLE` since the test environment has no OTel SDK installed. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- test/telemetry/test_genai_semconv_emission.py | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py index fbb88462e..d1a6a5c85 100644 --- a/test/telemetry/test_genai_semconv_emission.py +++ b/test/telemetry/test_genai_semconv_emission.py @@ -15,7 +15,7 @@ start_generate_span, ) from mellea.telemetry.context import with_context -from mellea.telemetry.tracing import is_content_tracing_enabled +from mellea.telemetry.tracing import add_span_event, is_content_tracing_enabled # --------------------------------------------------------------------------- # Helpers @@ -69,7 +69,6 @@ def test_conversation_id_emitted_from_session_id(): backend = _fake_backend("OpenAIBackend") backend.model_id = "gpt-4" # type: ignore[attr-defined] action = MagicMock() - action.prompt_template_metadata = None with 
with_context(session_id="sess-abc"): with patch("mellea.telemetry.tracing.start_backend_span") as mock_start: @@ -147,3 +146,29 @@ def test_finalize_none_span_is_noop(): def test_content_tracing_disabled_by_default(): assert not is_content_tracing_enabled() + + +# --------------------------------------------------------------------------- +# add_span_event helper +# --------------------------------------------------------------------------- + + +def test_add_span_event_calls_span_add_event(): + span = _mock_span() + with patch("mellea.telemetry.tracing._OTEL_AVAILABLE", True): + add_span_event(span, "gen_ai.content.prompt", {"gen_ai.prompt": "hello"}) + span.add_event.assert_called_once_with( + "gen_ai.content.prompt", attributes={"gen_ai.prompt": "hello"} + ) + + +def test_add_span_event_none_span_is_noop(): + with patch("mellea.telemetry.tracing._OTEL_AVAILABLE", True): + add_span_event(None, "gen_ai.content.prompt") + + +def test_add_span_event_defaults_to_empty_attributes(): + span = _mock_span() + with patch("mellea.telemetry.tracing._OTEL_AVAILABLE", True): + add_span_event(span, "gen_ai.content.completion") + span.add_event.assert_called_once_with("gen_ai.content.completion", attributes={}) From 5557734cfc4623c385b26110316cbfc89e280a63 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 13 May 2026 12:47:52 +0100 Subject: [PATCH 09/10] fix(telemetry): address self-review findings on OTel semconv PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Guard end_backend_span in its own try/except so SDK errors on the streaming error path cannot mask the original backend exception - Wire get_provider_name into all three span-creation functions so internal code uses it (was set but calling get_system_name directly, contradicting the docstring guidance) - Fix span_attrs: dict → dict[str, Any] per project typing conventions - Replace _backend.model_id private-attr mutation in example with start_session(model_id=...) 
public API - Add qualitative marker to example so it does not run in the fast loop - Add test_finalize_never_raises_if_end_span_raises to cover the now-guarded end_backend_span code path Assisted-by: Claude Code --- .../telemetry/otel_genai_semconv_example.py | 5 ++--- mellea/telemetry/backend_instrumentation.py | 15 ++++++++++----- test/telemetry/test_genai_semconv_emission.py | 11 +++++++++++ 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py index bbab23356..aea1b6ddd 100644 --- a/docs/examples/telemetry/otel_genai_semconv_example.py +++ b/docs/examples/telemetry/otel_genai_semconv_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, e2e +# pytest: ollama, e2e, qualitative """Mellea backend spans carrying OTel GenAI semantic convention attributes. @@ -73,8 +73,7 @@ def main() -> None: print(" error.type = ") try: - with start_session() as m2: - m2._backend.model_id = "mellea-semconv-nonexistent-xyz" # type: ignore[attr-defined] + with start_session(model_id="mellea-semconv-nonexistent-xyz") as m2: m2.instruct("Hello") except Exception as exc: print(f"\nGot expected error: {exc.__class__.__name__}") diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py index 21ef5dccd..02cbb75af 100644 --- a/mellea/telemetry/backend_instrumentation.py +++ b/mellea/telemetry/backend_instrumentation.py @@ -107,13 +107,14 @@ def instrument_generate_from_context( """ model_id = get_model_id_str(backend) system_name = get_system_name(backend) + provider_name = get_provider_name(backend) return trace_backend( "chat", # Gen-AI convention: use 'chat' for chat completions **{ # Gen-AI semantic convention attributes "gen_ai.system": system_name, - "gen_ai.provider.name": system_name, + "gen_ai.provider.name": provider_name, "gen_ai.request.model": model_id, "gen_ai.operation.name": "chat", # Mellea-specific attributes @@ -149,14 
+150,15 @@ def start_generate_span( model_id = get_model_id_str(backend) system_name = get_system_name(backend) + provider_name = get_provider_name(backend) from .context import get_current_context telemetry_ctx = get_current_context() - span_attrs: dict = { + span_attrs: dict[str, Any] = { # Gen-AI semantic convention attributes "gen_ai.system": system_name, - "gen_ai.provider.name": system_name, + "gen_ai.provider.name": provider_name, "gen_ai.request.model": model_id, "gen_ai.operation.name": "chat", # Mellea-specific attributes @@ -198,13 +200,14 @@ def instrument_generate_from_raw( """ model_id = get_model_id_str(backend) system_name = get_system_name(backend) + provider_name = get_provider_name(backend) return trace_backend( "text_completion", # Gen-AI convention: use 'text_completion' for completions **{ # Gen-AI semantic convention attributes "gen_ai.system": system_name, - "gen_ai.provider.name": system_name, + "gen_ai.provider.name": provider_name, "gen_ai.request.model": model_id, "gen_ai.operation.name": "text_completion", # Mellea-specific attributes @@ -325,8 +328,10 @@ def finalize_backend_span(span: Any, *, error: Exception | None = None) -> None: set_span_attribute(span, "error.type", type(error).__name__) except Exception: pass - finally: + try: end_backend_span(span) + except Exception: + pass __all__ = [ diff --git a/test/telemetry/test_genai_semconv_emission.py b/test/telemetry/test_genai_semconv_emission.py index d1a6a5c85..2aa7a77c1 100644 --- a/test/telemetry/test_genai_semconv_emission.py +++ b/test/telemetry/test_genai_semconv_emission.py @@ -135,6 +135,17 @@ def test_finalize_never_raises_on_span_error(): finalize_backend_span(span, error=ValueError("test")) +def test_finalize_never_raises_if_end_span_raises(): + """end_backend_span exceptions must not propagate on the error path.""" + span = _mock_span() + with patch( + "mellea.telemetry.backend_instrumentation.end_backend_span", + side_effect=RuntimeError("sdk shutdown"), + ): + with 
patch("mellea.telemetry.backend_instrumentation.set_span_error"): + finalize_backend_span(span, error=ValueError("original error")) + + def test_finalize_none_span_is_noop(): finalize_backend_span(None, error=RuntimeError("x")) From efedc82d0a0ef10f9bcfed0bfd19f43daf02bc83 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 13 May 2026 18:17:41 +0100 Subject: [PATCH 10/10] refactor(telemetry): remove dead instrument_generate_from_context, fix example error path instrument_generate_from_context was imported but never called by any backend (all backends use start_generate_span); remove function, __all__ entry, stale imports in ollama.py and openai.py, and the corresponding test. Example error path now uses an unreachable base_url (localhost:19999) instead of a bogus model name, which could cause Ollama to attempt a pull rather than fail deterministically. Assisted-by: Claude Code --- .../telemetry/otel_genai_semconv_example.py | 6 +-- mellea/backends/ollama.py | 1 - mellea/backends/openai.py | 1 - mellea/telemetry/backend_instrumentation.py | 41 ------------------- test/telemetry/test_tracing.py | 25 ----------- 5 files changed, 2 insertions(+), 72 deletions(-) diff --git a/docs/examples/telemetry/otel_genai_semconv_example.py b/docs/examples/telemetry/otel_genai_semconv_example.py index aea1b6ddd..e38840c4c 100644 --- a/docs/examples/telemetry/otel_genai_semconv_example.py +++ b/docs/examples/telemetry/otel_genai_semconv_example.py @@ -73,14 +73,12 @@ def main() -> None: print(" error.type = ") try: - with start_session(model_id="mellea-semconv-nonexistent-xyz") as m2: + with start_session(base_url="http://localhost:19999") as m2: m2.instruct("Hello") except Exception as exc: print(f"\nGot expected error: {exc.__class__.__name__}") else: - print( - "\n(No error — check the span for error.type if the model unexpectedly exists)" - ) + print("\n(No error — nothing is listening on port 19999)") _section("Done") print("If OTEL_EXPORTER_OTLP_ENDPOINT is set, check your 
trace backend.") diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index 5b50cd709..bc436bece 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -28,7 +28,6 @@ from ..stdlib.components import Message from ..stdlib.requirements import ALoraRequirement from ..telemetry.backend_instrumentation import ( - instrument_generate_from_context, instrument_generate_from_raw, start_generate_span, ) diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index 1eea93511..5dfeaec51 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -45,7 +45,6 @@ from ..stdlib.components import Intrinsic, Message from ..stdlib.requirements import LLMaJRequirement from ..telemetry.backend_instrumentation import ( - instrument_generate_from_context, instrument_generate_from_raw, start_generate_span, ) diff --git a/mellea/telemetry/backend_instrumentation.py b/mellea/telemetry/backend_instrumentation.py index 02cbb75af..dc2d26fff 100644 --- a/mellea/telemetry/backend_instrumentation.py +++ b/mellea/telemetry/backend_instrumentation.py @@ -88,46 +88,6 @@ def get_context_size(ctx: Any) -> int: return 0 -def instrument_generate_from_context( - backend: Any, action: Any, ctx: Any, format: Any = None, tool_calls: bool = False -): - """Create a backend trace span for generate_from_context. - - Follows Gen-AI semantic conventions for chat operations. 
- - Args: - backend: Backend instance - action: Action component - ctx: Context - format: Response format (BaseModel subclass or None) - tool_calls: Whether tool calling is enabled - - Returns: - Context manager for the trace span - """ - model_id = get_model_id_str(backend) - system_name = get_system_name(backend) - provider_name = get_provider_name(backend) - - return trace_backend( - "chat", # Gen-AI convention: use 'chat' for chat completions - **{ - # Gen-AI semantic convention attributes - "gen_ai.system": system_name, - "gen_ai.provider.name": provider_name, - "gen_ai.request.model": model_id, - "gen_ai.operation.name": "chat", - # Mellea-specific attributes - "mellea.backend": backend.__class__.__name__, - "mellea.action_type": action.__class__.__name__, - "mellea.context_size": get_context_size(ctx), - "mellea.has_format": format is not None, - "mellea.format_type": format.__name__ if format else None, - "mellea.tool_calls_enabled": tool_calls, - }, - ) - - def start_generate_span( backend: Any, action: Any, ctx: Any, format: Any = None, tool_calls: bool = False ): @@ -340,7 +300,6 @@ def finalize_backend_span(span: Any, *, error: Exception | None = None) -> None: "get_model_id_str", "get_provider_name", "get_system_name", - "instrument_generate_from_context", "instrument_generate_from_raw", "record_response_metadata", "record_token_usage", diff --git a/test/telemetry/test_tracing.py b/test/telemetry/test_tracing.py index af83de5ad..b1a058680 100644 --- a/test/telemetry/test_tracing.py +++ b/test/telemetry/test_tracing.py @@ -200,31 +200,6 @@ def __init__(self): assert get_context_size(ctx) == 3 -def test_instrument_generate_from_context(): - """Test instrument_generate_from_context helper.""" - from mellea.telemetry.backend_instrumentation import ( - instrument_generate_from_context, - ) - - class MockBackend: - model_id = "test-model" - - class MockAction: - pass - - class MockContext: - turns = [] - - backend = MockBackend() - action = MockAction() - 
ctx = MockContext() - - # Should return a context manager - with instrument_generate_from_context(backend, action, ctx) as span: - # Span will be None when tracing is disabled - assert span is None or hasattr(span, "set_attribute") - - def test_instrument_generate_from_raw(): """Test instrument_generate_from_raw helper.""" from mellea.telemetry.backend_instrumentation import instrument_generate_from_raw