diff --git a/sdk/ai/azure-ai-projects/assets.json b/sdk/ai/azure-ai-projects/assets.json index 9c6015cc80f1..490af2ec735e 100644 --- a/sdk/ai/azure-ai-projects/assets.json +++ b/sdk/ai/azure-ai-projects/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/ai/azure-ai-projects", - "Tag": "python/ai/azure-ai-projects_257daffdb5" + "Tag": "python/ai/azure-ai-projects_6f9985fe6d" } diff --git a/sdk/ai/azure-ai-projects/tests/conftest.py b/sdk/ai/azure-ai-projects/tests/conftest.py index dfae2c1550cb..982d1b3ab7bb 100644 --- a/sdk/ai/azure-ai-projects/tests/conftest.py +++ b/sdk/ai/azure-ai-projects/tests/conftest.py @@ -122,6 +122,18 @@ def sanitize_url_paths(): # Sanitize eval dataset names with timestamps (e.g., eval-data-2026-01-19_040648_UTC) add_general_regex_sanitizer(regex=r"eval-data-\d{4}-\d{2}-\d{2}_\d{6}_UTC", value="eval-data-sanitized-timestamp") + # Sanitize Unix timestamps in eval names (from sample_redteam_evaluations.py) + # Pattern 1: "Red Team Agent Safety Evaluation -" + add_general_regex_sanitizer( + regex=r"Evaluation -\d{10}", + value="Evaluation -SANITIZED-TS" + ) + # Pattern 2: "Eval Run for -" (agent name already sanitized) + add_general_regex_sanitizer( + regex=r"sanitized-agent-name -\d{10}", + value="sanitized-agent-name -SANITIZED-TS" + ) + # Sanitize API key from service response (this includes Application Insights connection string) add_body_key_sanitizer(json_path="credentials.key", value="sanitized-api-key") diff --git a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py index d23b17295716..a15b9b72b31d 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py +++ b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py @@ -11,6 +11,7 @@ import inspect import importlib.util import functools + from dataclasses import dataclass, field from typing import overload, Union, Optional from pydantic import BaseModel @@ -22,7 +23,12 @@ from azure.core.credentials_async import AsyncTokenCredential from devtools_testutils.fake_credentials import FakeTokenCredential from devtools_testutils.fake_credentials_async import AsyncFakeCredential +from devtools_testutils import is_live from azure.ai.projects import AIProjectClient + +# Fixed timestamp for playback mode (Nov 2023). +# Must match the timestamp sanitizers in conftest.py (e.g., `Evaluation -\d{10}`). +PLAYBACK_TIMESTAMP = 1700000000 from pytest import MonkeyPatch from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient @@ -312,6 +318,16 @@ def execute(self, patched_open_fn=None): mock.patch("builtins.open", side_effect=patched_open_fn), ): self.spec.loader.exec_module(self.module) + # In playback mode, patch time functions on the module: + # - time.sleep: avoid waiting for polling loops (instant) + # - time.time: return fixed value for deterministic request bodies + # Must be done after exec_module so the module's 'time' reference can be patched. 
+ if not is_live() and hasattr(self.module, "time"): + self.module.time.sleep = lambda _: None + self.module.time.time = lambda: PLAYBACK_TIMESTAMP + # Call main() if it exists (samples wrap their code in main()) + if hasattr(self.module, "main") and callable(self.module.main): + self.module.main() def validate_print_calls_by_llm( self, @@ -379,7 +395,16 @@ async def execute_async(self, patched_open_fn=None): if self.spec.loader is None: raise ImportError(f"Could not load module {self.spec.name} from {self.sample_path}") self.spec.loader.exec_module(self.module) - await self.module.main() + # In playback mode, patch time functions on the module: + # - time.sleep: avoid waiting for polling loops (instant) + # - time.time: return fixed value for deterministic request bodies + # Must be done after exec_module so the module's 'time' reference can be patched. + if not is_live() and hasattr(self.module, "time"): + self.module.time.sleep = lambda _: None + self.module.time.time = lambda: PLAYBACK_TIMESTAMP + # Call main() if it exists (samples wrap their code in main()) + if hasattr(self.module, "main") and callable(self.module.main): + await self.module.main() async def validate_print_calls_by_llm_async( self, diff --git a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py index ef671dd22369..acb3c5c2fd04 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py +++ b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py @@ -4,6 +4,7 @@ # Licensed under the MIT License. # ------------------------------------ import functools +import os import pytest from devtools_testutils import recorded_by_proxy, AzureRecordedTestCase, RecordedTransport, EnvironmentVariableLoader from sample_executor import ( @@ -44,7 +45,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase): """ Tests for evaluation samples. - Included samples (9): + Included samples (25): + + Main evaluation samples (10): - sample_agent_evaluation.py - sample_model_evaluation.py - sample_agent_response_evaluation.py @@ -54,8 +57,24 @@ class TestSamplesEvaluations(AzureRecordedTestCase): - sample_eval_catalog_code_based_evaluators.py - sample_eval_catalog_prompt_based_evaluators.py - sample_evaluation_compare_insight.py + - sample_redteam_evaluations.py - More samples will be added in the future. + Agentic evaluator samples (15): + - sample_coherence.py + - sample_fluency.py + - sample_groundedness.py + - sample_intent_resolution.py + - sample_relevance.py + - sample_response_completeness.py + - sample_task_adherence.py + - sample_task_completion.py + - sample_task_navigation_efficiency.py + - sample_tool_call_accuracy.py + - sample_tool_call_success.py + - sample_tool_input_accuracy.py + - sample_tool_output_utilization.py + - sample_tool_selection.py + - sample_generic_agentic_evaluator.py Excluded samples and reasons: @@ -76,13 +95,11 @@ class TestSamplesEvaluations(AzureRecordedTestCase): - sample_evaluations_builtin_with_traces.py: Requires Azure Application Insights and uses azure-monitor-query to fetch traces. - sample_scheduled_evaluations.py: Requires Azure RBAC assignment via - azure-mgmt-authorization and azure-mgmt-resource. + azure-mgmt-authorization and azure-mgmt-resource, AND uploads Dataset. Complex prerequisites (require manual portal setup): - sample_continuous_evaluation_rule.py: Requires manual RBAC assignment in Azure Portal to enable continuous evaluation. 
- - sample_redteam_evaluations.py: Red team evaluations may require special - permissions or setup. """ # To run this test with a specific sample, use: @@ -102,6 +119,7 @@ class TestSamplesEvaluations(AzureRecordedTestCase): "sample_eval_catalog_prompt_based_evaluators.py", "sample_evaluation_compare_insight.py", "sample_agent_response_evaluation_with_function_tool.py", + "sample_redteam_evaluations.py", ], ), ) @@ -116,3 +134,65 @@ def test_evaluation_samples(self, sample_path: str, **kwargs) -> None: project_endpoint=kwargs["azure_ai_project_endpoint"], model=kwargs["azure_ai_model_deployment_name"], ) + + # To run this test with a specific sample, use: + # pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_agentic_evaluator_samples[sample_coherence] + @evaluationsPreparer() + @pytest.mark.parametrize( + "sample_path", + get_sample_paths( + "evaluations/agentic_evaluators", + samples_to_test=[ + "sample_coherence.py", + "sample_fluency.py", + "sample_groundedness.py", + "sample_intent_resolution.py", + "sample_relevance.py", + "sample_response_completeness.py", + "sample_task_adherence.py", + "sample_task_completion.py", + "sample_task_navigation_efficiency.py", + "sample_tool_call_accuracy.py", + "sample_tool_call_success.py", + "sample_tool_input_accuracy.py", + "sample_tool_output_utilization.py", + "sample_tool_selection.py", + ], + ), + ) + @SamplePathPasser() + @recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX) + def test_agentic_evaluator_samples(self, sample_path: str, **kwargs) -> None: + env_var_mapping = get_sample_environment_variables_map(kwargs) + executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs) + executor.execute() + executor.validate_print_calls_by_llm( + instructions=evaluations_instructions, + project_endpoint=kwargs["azure_ai_project_endpoint"], + model=kwargs["azure_ai_model_deployment_name"], + ) + + # To run this test, use: + # pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_generic_agentic_evaluator_sample + @evaluationsPreparer() + @recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX) + def test_generic_agentic_evaluator_sample(self, **kwargs) -> None: + # Manually construct path to nested sample + current_dir = os.path.dirname(os.path.abspath(__file__)) + samples_folder = os.path.normpath(os.path.join(current_dir, os.pardir, os.pardir)) + sample_path = os.path.join( + samples_folder, + "samples", + "evaluations", + "agentic_evaluators", + "sample_generic_agentic_evaluator", + "sample_generic_agentic_evaluator.py", + ) + env_var_mapping = get_sample_environment_variables_map(kwargs) + executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs) + executor.execute() + executor.validate_print_calls_by_llm( + instructions=evaluations_instructions, + project_endpoint=kwargs["azure_ai_project_endpoint"], + model=kwargs["azure_ai_model_deployment_name"], + )
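
Reviewer note: a minimal standalone sketch (not part of this change) of the playback-mode patching that the sample_executor.py hunks above rely on. It assumes the sample does a plain `import time` at module level and wraps its work in `main()`; the `load_and_run` helper and the "sample" module name are hypothetical and only illustrate why the attributes must be replaced after `exec_module`.

# Hypothetical sketch; not part of this diff. Assumes the sample uses "import time"
# at module level and wraps its work in main().
import importlib.util

PLAYBACK_TIMESTAMP = 1700000000  # fixed 10-digit value, so the conftest.py \d{10} sanitizers still apply


def load_and_run(sample_path: str, live: bool) -> None:
    spec = importlib.util.spec_from_file_location("sample", sample_path)
    assert spec is not None and spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # runs the sample's top-level code, including "import time"
    if not live and hasattr(module, "time"):
        # module.time is the time module object bound by the sample's "import time";
        # replacing its functions after exec_module makes time.time() deterministic
        # and time.sleep() instant inside main().
        module.time.sleep = lambda _: None
        module.time.time = lambda: PLAYBACK_TIMESTAMP
    if hasattr(module, "main") and callable(module.main):
        module.main()

Because `import time` binds the interpreter-wide time module object, the override in this sketch is visible process-wide until the original functions are restored; that is tolerable here only because one sample runs per test invocation.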
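Similarly, a small illustration (hypothetical eval names; the 1762345678 timestamp is made up) of what the two new conftest.py regex sanitizers normalize. Per the comment added in sample_executor.py, PLAYBACK_TIMESTAMP is also a 10-digit value so the same `\d{10}` patterns apply to request bodies produced in playback mode.

# Hypothetical illustration; not part of this diff.
import re

names = [
    "Red Team Agent Safety Evaluation -1762345678",
    "Eval Run for sanitized-agent-name -1762345678",
]
for name in names:
    # Same patterns and replacement values as the add_general_regex_sanitizer calls above.
    name = re.sub(r"Evaluation -\d{10}", "Evaluation -SANITIZED-TS", name)
    name = re.sub(r"sanitized-agent-name -\d{10}", "sanitized-agent-name -SANITIZED-TS", name)
    print(name)
# Red Team Agent Safety Evaluation -SANITIZED-TS
# Eval Run for sanitized-agent-name -SANITIZED-TS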