Skip to content

Commit 8fff014

Browse files
committed
fix async iterator issue
1 parent 4916a18 commit 8fff014

File tree

2 files changed

+121
-31
lines changed

2 files changed

+121
-31
lines changed

py/src/braintrust/integrations/agentscope/test_agentscope.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,85 @@ async def _stream():
219219
assert llm_span["metrics"]["tokens"] == 32
220220

221221

222+
@pytest.mark.asyncio
async def test_model_call_wrapper_stream_span_covers_full_stream_duration(memory_logger):
    """The LLM span must stay open until the wrapped stream is fully consumed.

    If the wrapper finalized the span as soon as it returned the stream (before
    the caller drained it), the recorded duration would be near zero instead of
    roughly the 300ms the fake stream takes to produce its chunks.
    """
    import asyncio

    from braintrust.integrations.agentscope.tracing import _model_call_wrapper

    # Sanity: no spans logged yet.
    assert not memory_logger.pop()

    class FakeModel:
        model_name = "gpt-4o-mini"

    async def fake_call(*_args, **_kwargs):
        async def chunk_source():
            for i in range(3):
                await asyncio.sleep(0.1)
                yield {"content": [{"type": "text", "text": f"chunk-{i}"}]}

        return chunk_source()

    result_stream = await _model_call_wrapper(
        fake_call,
        FakeModel(),
        args=([{"role": "user", "content": "hi"}],),
        kwargs={},
    )
    # Drain the stream; only after this should the span be finalized.
    async for _chunk in result_stream:
        pass

    logged = memory_logger.pop()
    assert len(logged) == 1
    m = logged[0].get("metrics", {})
    duration_ms = (m["end"] - m["start"]) * 1000
    # Three chunks at 100ms each → ~300ms total; the 200ms bound leaves slack
    # for scheduler jitter while still catching a span closed up front.
    assert duration_ms >= 200, f"Span duration {duration_ms:.0f}ms is too short; span ended before stream was consumed"
258+
259+
260+
@pytest.mark.asyncio
async def test_toolkit_call_tool_function_wrapper_stream_span_covers_full_stream_duration(memory_logger):
    """The tool span must stay open until the wrapped stream is fully consumed.

    A span finalized when the wrapper returns (before the caller drains the
    stream) would report a near-zero duration instead of roughly the 300ms the
    fake stream takes to yield its chunks.
    """
    import asyncio

    from braintrust.integrations.agentscope.tracing import _toolkit_call_tool_function_wrapper

    # Sanity: no spans logged yet.
    assert not memory_logger.pop()

    class FakeToolkit:
        pass

    class FakeToolCall:
        name = "my_tool"

    async def fake_tool(*_args, **_kwargs):
        async def chunk_source():
            for i in range(3):
                await asyncio.sleep(0.1)
                yield f"chunk-{i}"

        return chunk_source()

    result_stream = await _toolkit_call_tool_function_wrapper(
        fake_tool,
        FakeToolkit(),
        args=(FakeToolCall(),),
        kwargs={},
    )
    # Drain the stream; only after this should the span be finalized.
    async for _chunk in result_stream:
        pass

    logged = memory_logger.pop()
    assert len(logged) == 1
    m = logged[0].get("metrics", {})
    duration_ms = (m["end"] - m["start"]) * 1000
    # Three chunks at 100ms each → ~300ms total; the 200ms bound leaves slack
    # for scheduler jitter while still catching a span closed up front.
    assert duration_ms >= 200, f"Span duration {duration_ms:.0f}ms is too short; span ended before stream was consumed"
299+
300+
222301
class TestAutoInstrumentAgentScope:
223302
def test_auto_instrument_agentscope(self):
224303
verify_autoinstrument_script("test_auto_agentscope.py")

py/src/braintrust/integrations/agentscope/tracing.py

Lines changed: 42 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""AgentScope-specific span creation and stream aggregation."""
22

3+
import contextlib
34
from contextlib import aclosing
45
from typing import Any
56

@@ -200,29 +201,34 @@ async def _fanout_pipeline_wrapper(wrapped: Any, instance: Any, args: Any, kwarg
200201
async def _toolkit_call_tool_function_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any:
201202
tool_call = args[0] if args else kwargs.get("tool_call")
202203
tool_name = _tool_name(tool_call)
203-
with start_span(
204-
name=f"{tool_name}.execute",
205-
type=SpanTypeAttribute.TOOL,
206-
input=_clean(
207-
{
208-
"tool_name": tool_name,
209-
"tool_call": tool_call,
210-
}
211-
),
212-
metadata=_clean({"toolkit_class": instance.__class__.__name__}),
213-
) as span:
204+
with contextlib.ExitStack() as stack:
205+
span = stack.enter_context(
206+
start_span(
207+
name=f"{tool_name}.execute",
208+
type=SpanTypeAttribute.TOOL,
209+
input=_clean(
210+
{
211+
"tool_name": tool_name,
212+
"tool_call": tool_call,
213+
}
214+
),
215+
metadata=_clean({"toolkit_class": instance.__class__.__name__}),
216+
)
217+
)
214218
try:
215219
result = await wrapped(*args, **kwargs)
216220
if _is_async_iterator(result):
221+
deferred = stack.pop_all()
217222

218223
async def _trace():
219-
last_chunk = None
220-
async with aclosing(result) as agen:
221-
async for chunk in agen:
222-
last_chunk = chunk
223-
yield chunk
224-
if last_chunk is not None:
225-
span.log(output=last_chunk)
224+
with deferred:
225+
last_chunk = None
226+
async with aclosing(result) as agen:
227+
async for chunk in agen:
228+
last_chunk = chunk
229+
yield chunk
230+
if last_chunk is not None:
231+
span.log(output=last_chunk)
226232

227233
return _trace()
228234

@@ -241,24 +247,29 @@ def _is_async_iterator(value: Any) -> bool:
241247

242248

243249
async def _model_call_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: dict[str, Any]) -> Any:
244-
with start_span(
245-
name=f"{_model_provider_name(instance)}.call",
246-
type=SpanTypeAttribute.LLM,
247-
input=_model_call_input(args, kwargs),
248-
metadata=_model_call_metadata(instance, kwargs),
249-
) as span:
250+
with contextlib.ExitStack() as stack:
251+
span = stack.enter_context(
252+
start_span(
253+
name=f"{_model_provider_name(instance)}.call",
254+
type=SpanTypeAttribute.LLM,
255+
input=_model_call_input(args, kwargs),
256+
metadata=_model_call_metadata(instance, kwargs),
257+
)
258+
)
250259
try:
251260
result = await wrapped(*args, **kwargs)
252261
if _is_async_iterator(result):
262+
deferred = stack.pop_all()
253263

254264
async def _trace():
255-
last_chunk = None
256-
async with aclosing(result) as agen:
257-
async for chunk in agen:
258-
last_chunk = chunk
259-
yield chunk
260-
if last_chunk is not None:
261-
span.log(output=_model_call_output(last_chunk), metrics=_extract_metrics(last_chunk))
265+
with deferred:
266+
last_chunk = None
267+
async with aclosing(result) as agen:
268+
async for chunk in agen:
269+
last_chunk = chunk
270+
yield chunk
271+
if last_chunk is not None:
272+
span.log(output=_model_call_output(last_chunk), metrics=_extract_metrics(last_chunk))
262273

263274
return _trace()
264275

0 commit comments

Comments
 (0)