Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
### Bugs Fixed

- Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run.
- Fixed `task_adherence` red team scoring by adding `scenario=redteam` to the RAI scorer evaluation payload, ensuring the server-side score mapping correctly routes to Direct mapping for attack success determination.
- Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
- Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
- Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
Expand Down
4 changes: 2 additions & 2 deletions sdk/evaluation/azure-ai-evaluation/assets.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_67d91b0617"
}
"Tag": "python/evaluation/azure-ai-evaluation_0748353c8d"
}
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ async def _score_piece_async(
"query": "query",
"response": response_text,
"context": context,
"scenario": "redteam",
}

# Get metric and annotation task for risk category
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -825,7 +825,55 @@ async def test_score_async_error_handling(self, mock_credential, mock_azure_ai_p
with pytest.raises(Exception, match="RAI service error"):
await scorer.score_async(mock_message, objective="Test attack")

def test_get_context_for_piece_from_labels(self, mock_credential, mock_azure_ai_project, mock_logger):
@pytest.mark.asyncio
async def test_score_async_sends_redteam_scenario(self, mock_credential, mock_azure_ai_project, mock_logger):
    """Regression: RAIServiceScorer must include scenario=redteam in eval_input
    so the server-side ContextDependent score mapping routes to Direct for red team."""
    scorer = RAIServiceScorer(
        credential=mock_credential,
        azure_ai_project=mock_azure_ai_project,
        risk_category=RiskCategory.Violence,
        logger=mock_logger,
    )

    # Minimal message: a single assistant piece with no labels/context.
    mock_piece = MagicMock()
    mock_piece.id = "test-id"
    mock_piece.converted_value = "Test response"
    mock_piece.original_value = "Original content"
    mock_piece.labels = {}
    mock_piece.api_role = "assistant"

    mock_message = MagicMock()
    mock_message.message_pieces = [mock_piece]

    # NOTE: MagicMock(name=...) does NOT set a `.name` attribute -- `name` is a
    # reserved Mock-constructor kwarg that names the mock object itself. Assign
    # `.name` after construction so code reading `result.name` sees the metric
    # string rather than an auto-generated child mock.
    mock_result = MagicMock(
        metric="violence",
        score=5,
        reason="Harmful content",
        threshold=3,
        passed=False,
        label="high",
    )
    mock_result.name = "violence"

    mock_eval_result = MagicMock()
    mock_eval_result.results = [mock_result]

    with patch(
        "azure.ai.evaluation.red_team._foundry._rai_scorer.evaluate_with_rai_service_sync",
        new_callable=AsyncMock,
    ) as mock_eval:
        mock_eval.return_value = mock_eval_result

        await scorer.score_async(mock_message, objective="Test attack")

        # Verify scenario=redteam was included in the eval_input
        call_args = mock_eval.call_args
        eval_input = call_args[0][0]  # First positional arg
        assert eval_input.get("scenario") == "redteam", (
            "RAIServiceScorer must send scenario=redteam so server-side "
            "ContextDependent mapping routes to Direct for red team evaluations"
        )
"""Test context retrieval from message labels."""
scorer = RAIServiceScorer(
credential=mock_credential,
Expand Down