diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 22f49dc3c2b3..5129c136bd67 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -15,6 +15,7 @@
 ### Bugs Fixed
 
 - Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run.
+- Fixed `task_adherence` red team scoring by adding `scenario=redteam` to the RAI scorer evaluation payload, ensuring the server-side score mapping correctly routes to the Direct mapping for attack-success determination.
 - Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
 - Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json
index e34432a85348..db9dc964c8df 100644
--- a/sdk/evaluation/azure-ai-evaluation/assets.json
+++ b/sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_67d91b0617"
+  "Tag": "python/evaluation/azure-ai-evaluation_0748353c8d"
 }
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_rai_scorer.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_rai_scorer.py
index 13a748a56b9d..5d7dffd1c079 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_rai_scorer.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_rai_scorer.py
@@ -180,6 +180,7 @@ async def _score_piece_async(
             "query": "query",
             "response": response_text,
             "context": context,
+            "scenario": "redteam",
         }
 
         # Get metric and annotation task for risk category
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py
index ccb44ffe05d3..55ac871cb12f 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py
@@ -825,7 +825,59 @@ async def test_score_async_error_handling(self, mock_credential, mock_azure_ai_p
         with pytest.raises(Exception, match="RAI service error"):
             await scorer.score_async(mock_message, objective="Test attack")
 
+    @pytest.mark.asyncio
+    async def test_score_async_sends_redteam_scenario(self, mock_credential, mock_azure_ai_project, mock_logger):
+        """Regression: RAIServiceScorer must include scenario=redteam in eval_input
+        so the server-side ContextDependent score mapping routes to Direct for red team."""
+        scorer = RAIServiceScorer(
+            credential=mock_credential,
+            azure_ai_project=mock_azure_ai_project,
+            risk_category=RiskCategory.Violence,
+            logger=mock_logger,
+        )
+
+        mock_piece = MagicMock()
+        mock_piece.id = "test-id"
+        mock_piece.converted_value = "Test response"
+        mock_piece.original_value = "Original content"
+        mock_piece.labels = {}
+        mock_piece.api_role = "assistant"
+
+        mock_message = MagicMock()
+        mock_message.message_pieces = [mock_piece]
+
+        mock_result = MagicMock(
+            metric="violence",
+            score=5,
+            reason="Harmful content",
+            threshold=3,
+            passed=False,
+            label="high",
+        )
+        # `name` is reserved by the MagicMock constructor (it names the mock
+        # itself), so set it as an attribute for result.name to be "violence".
+        mock_result.name = "violence"
+
+        mock_eval_result = MagicMock()
+        mock_eval_result.results = [mock_result]
+
+        with patch(
+            "azure.ai.evaluation.red_team._foundry._rai_scorer.evaluate_with_rai_service_sync",
+            new_callable=AsyncMock,
+        ) as mock_eval:
+            mock_eval.return_value = mock_eval_result
+
+            await scorer.score_async(mock_message, objective="Test attack")
+
+            # Verify scenario=redteam was included in the eval_input
+            call_args = mock_eval.call_args
+            eval_input = call_args[0][0]  # first positional argument
+            assert eval_input.get("scenario") == "redteam", (
+                "RAIServiceScorer must send scenario=redteam so the server-side "
+                "ContextDependent mapping routes to Direct for red team evaluations"
+            )
+
     def test_get_context_for_piece_from_labels(self, mock_credential, mock_azure_ai_project, mock_logger):
         """Test context retrieval from message labels."""
         scorer = RAIServiceScorer(
             credential=mock_credential,
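
Reviewer note: a minimal, runnable sketch of the payload shape this change touches, for anyone skimming the diff. `build_eval_input` is a hypothetical stand-in for the dict construction inside `_score_piece_async`, not the SDK's actual code; only the key names and the new `"scenario"` value are taken from the hunk above.

    # Hypothetical helper mirroring the eval_input payload assembled in
    # _score_piece_async (illustrative only).
    def build_eval_input(response_text: str, context: str) -> dict:
        return {
            "query": "query",
            "response": response_text,
            "context": context,
            # New in this change: tells the server-side score mapping to take
            # the Direct (red team) route instead of ContextDependent when
            # determining attack success.
            "scenario": "redteam",
        }

    if __name__ == "__main__":
        payload = build_eval_input("Test response", "")
        # Mirrors the assertion in test_score_async_sends_redteam_scenario.
        assert payload.get("scenario") == "redteam"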