diff --git a/langfuse/batch_evaluation.py b/langfuse/batch_evaluation.py
index 35e1ea938..4a360773a 100644
--- a/langfuse/batch_evaluation.py
+++ b/langfuse/batch_evaluation.py
@@ -20,6 +20,7 @@
     Protocol,
     Tuple,
     Union,
+    cast,
 )
 
 from langfuse.api.resources.commons.types import (
@@ -1220,6 +1221,9 @@ async def _process_batch_evaluation_item(
            self._create_score_for_scope(
                scope=scope,
                item_id=item_id,
+                trace_id=cast(ObservationsView, item).trace_id
+                if scope == "observations"
+                else None,
                evaluation=evaluation,
                additional_metadata=metadata,
            )
@@ -1242,6 +1246,9 @@ async def _process_batch_evaluation_item(
            self._create_score_for_scope(
                scope=scope,
                item_id=item_id,
+                trace_id=cast(ObservationsView, item).trace_id
+                if scope == "observations"
+                else None,
                evaluation=composite_eval,
                additional_metadata=metadata,
            )
@@ -1361,8 +1368,10 @@ async def _run_composite_evaluator(
 
    def _create_score_for_scope(
        self,
+        *,
        scope: str,
        item_id: str,
+        trace_id: Optional[str] = None,
        evaluation: Evaluation,
        additional_metadata: Optional[Dict[str, Any]],
    ) -> None:
@@ -1371,6 +1380,7 @@ def _create_score_for_scope(
        Args:
            scope: The type of entity ("traces", "observations").
            item_id: The ID of the entity.
+            trace_id: The trace ID of the entity; required when scope is "observations".
            evaluation: The evaluation result to create a score from.
            additional_metadata: Additional metadata to merge with evaluation metadata.
        """
@@ -1393,6 +1403,7 @@ def _create_score_for_scope(
        elif scope == "observations":
            self.client.create_score(
                observation_id=item_id,
+                trace_id=trace_id,
                name=evaluation.name,
                value=evaluation.value,  # type: ignore
                comment=evaluation.comment,
diff --git a/tests/test_batch_evaluation.py b/tests/test_batch_evaluation.py
index 9bf18348d..448172a55 100644
--- a/tests/test_batch_evaluation.py
+++ b/tests/test_batch_evaluation.py
@@ -25,7 +25,7 @@
 # ============================================================================
 
 
-pytestmark = pytest.mark.skip(reason="Github CI runner overwhelmed by score volume")
+# pytestmark = pytest.mark.skip(reason="Github CI runner overwhelmed by score volume")
 
 
 @pytest.fixture
@@ -67,6 +67,32 @@ def simple_evaluator(*, input, output, expected_output=None, metadata=None, **kw
 # ============================================================================
 
 
+def test_run_batched_evaluation_on_observations_basic(langfuse_client):
+    """Test basic batch evaluation on observations."""
+    result = langfuse_client.run_batched_evaluation(
+        scope="observations",
+        mapper=simple_trace_mapper,
+        evaluators=[simple_evaluator],
+        max_items=1,
+        verbose=True,
+    )
+
+    # Validate result structure
+    assert isinstance(result, BatchEvaluationResult)
+    assert result.total_items_fetched >= 0
+    assert result.total_items_processed >= 0
+    assert result.total_scores_created >= 0
+    assert result.completed is True
+    assert isinstance(result.duration_seconds, float)
+    assert result.duration_seconds > 0
+
+    # Verify evaluator stats
+    assert len(result.evaluator_stats) == 1
+    stats = result.evaluator_stats[0]
+    assert isinstance(stats, EvaluatorStats)
+    assert stats.name == "simple_evaluator"
+
+
 def test_run_batched_evaluation_on_traces_basic(langfuse_client):
    """Test basic batch evaluation on traces."""
    result = langfuse_client.run_batched_evaluation(
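
Reviewer note: the snippet below is a minimal, self-contained sketch of the trace-id resolution this patch adds for observation-scoped scores. The ObservationsView class here is a hypothetical stand-in for the real type imported from langfuse.api.resources.commons.types, defined only so the sketch runs on its own; just the cast-and-select logic mirrors the diff.

# Not part of the patch: illustrative sketch of the trace_id selection performed
# in _process_batch_evaluation_item before calling _create_score_for_scope.
from typing import Optional, cast


class ObservationsView:  # hypothetical stand-in, not the real API type
    def __init__(self, id: str, trace_id: str) -> None:
        self.id = id
        self.trace_id = trace_id


def resolve_trace_id(scope: str, item: object) -> Optional[str]:
    # Observation scores must reference their parent trace; trace scores need no
    # extra id, so any other scope yields None.
    return cast(ObservationsView, item).trace_id if scope == "observations" else None


item = ObservationsView(id="obs-123", trace_id="trace-456")
assert resolve_trace_id("observations", item) == "trace-456"
assert resolve_trace_id("traces", item) is None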