From 05786e6aa525eeab4581e0f6cfb50b7dcdbc7fd6 Mon Sep 17 00:00:00 2001 From: April Kwong Date: Thu, 22 Jan 2026 16:37:03 -0800 Subject: [PATCH 1/3] Added more eval sample tests --- sdk/ai/azure-ai-projects/assets.json | 2 +- sdk/ai/azure-ai-projects/tests/conftest.py | 7 ++ .../tests/samples/sample_executor.py | 23 +++++ .../tests/samples/test_samples_evaluations.py | 90 +++++++++++++++++-- 4 files changed, 116 insertions(+), 6 deletions(-) diff --git a/sdk/ai/azure-ai-projects/assets.json b/sdk/ai/azure-ai-projects/assets.json index 9c6015cc80f1..490af2ec735e 100644 --- a/sdk/ai/azure-ai-projects/assets.json +++ b/sdk/ai/azure-ai-projects/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/ai/azure-ai-projects", - "Tag": "python/ai/azure-ai-projects_257daffdb5" + "Tag": "python/ai/azure-ai-projects_6f9985fe6d" } diff --git a/sdk/ai/azure-ai-projects/tests/conftest.py b/sdk/ai/azure-ai-projects/tests/conftest.py index b47a817688e7..7712fad211e0 100644 --- a/sdk/ai/azure-ai-projects/tests/conftest.py +++ b/sdk/ai/azure-ai-projects/tests/conftest.py @@ -126,6 +126,13 @@ def sanitize_url_paths(): value="eval-data-sanitized-timestamp" ) + # Sanitize Unix timestamps (10-digit numbers) in eval names + # These appear in patterns like "Evaluation -1769111901" or "Eval Run for agent-test -1769111901" + add_general_regex_sanitizer( + regex=r"-17\d{8}", + value="-SANITIZED-TS" + ) + # Sanitize API key from service response (this includes Application Insights connection string) add_body_key_sanitizer(json_path="credentials.key", value="sanitized-api-key") diff --git a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py index d23b17295716..b872ac83c173 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py +++ b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py @@ -11,6 +11,7 @@ import inspect import importlib.util import functools + from dataclasses import dataclass, field from typing import overload, Union, Optional from pydantic import BaseModel @@ -22,7 +23,12 @@ from azure.core.credentials_async import AsyncTokenCredential from devtools_testutils.fake_credentials import FakeTokenCredential from devtools_testutils.fake_credentials_async import AsyncFakeCredential +from devtools_testutils import is_live from azure.ai.projects import AIProjectClient + +# Fixed timestamp for playback mode (Nov 2023). +# Must match the sanitizer regex `-17\d{8}` in conftest.py which replaces it with `-SANITIZED-TS`. +PLAYBACK_TIMESTAMP = 1700000000 from pytest import MonkeyPatch from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient @@ -312,6 +318,16 @@ def execute(self, patched_open_fn=None): mock.patch("builtins.open", side_effect=patched_open_fn), ): self.spec.loader.exec_module(self.module) + # In playback mode, patch time functions on the module: + # - time.sleep: avoid waiting for polling loops (instant) + # - time.time: return fixed value for deterministic request bodies + # Must be done after exec_module so the module's 'time' reference can be patched. 
+ if not is_live() and hasattr(self.module, "time"): + self.module.time.sleep = lambda _: None + self.module.time.time = lambda: PLAYBACK_TIMESTAMP + # Call main() if it exists (samples wrap their code in main()) + if hasattr(self.module, "main") and callable(self.module.main): + self.module.main() def validate_print_calls_by_llm( self, @@ -379,6 +395,13 @@ async def execute_async(self, patched_open_fn=None): if self.spec.loader is None: raise ImportError(f"Could not load module {self.spec.name} from {self.sample_path}") self.spec.loader.exec_module(self.module) + # In playback mode, patch time functions on the module: + # - time.sleep: avoid waiting for polling loops (instant) + # - time.time: return fixed value for deterministic request bodies + # Must be done after exec_module so the module's 'time' reference can be patched. + if not is_live() and hasattr(self.module, "time"): + self.module.time.sleep = lambda _: None + self.module.time.time = lambda: PLAYBACK_TIMESTAMP await self.module.main() async def validate_print_calls_by_llm_async( diff --git a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py index 241d4652b581..4a3fa5acf24b 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py +++ b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py @@ -4,6 +4,7 @@ # Licensed under the MIT License. # ------------------------------------ import functools +import os import pytest from devtools_testutils import recorded_by_proxy, AzureRecordedTestCase, RecordedTransport, EnvironmentVariableLoader from sample_executor import ( @@ -43,7 +44,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase): """ Tests for evaluation samples. - Included samples (9): + Included samples (25): + + Main evaluation samples (10): - sample_agent_evaluation.py - sample_model_evaluation.py - sample_agent_response_evaluation.py @@ -53,8 +56,24 @@ class TestSamplesEvaluations(AzureRecordedTestCase): - sample_eval_catalog_code_based_evaluators.py - sample_eval_catalog_prompt_based_evaluators.py - sample_evaluation_compare_insight.py + - sample_redteam_evaluations.py - More samples will be added in the future. + Agentic evaluator samples (15): + - sample_coherence.py + - sample_fluency.py + - sample_groundedness.py + - sample_intent_resolution.py + - sample_relevance.py + - sample_response_completeness.py + - sample_task_adherence.py + - sample_task_completion.py + - sample_task_navigation_efficiency.py + - sample_tool_call_accuracy.py + - sample_tool_call_success.py + - sample_tool_input_accuracy.py + - sample_tool_output_utilization.py + - sample_tool_selection.py + - sample_generic_agentic_evaluator.py Excluded samples and reasons: @@ -75,13 +94,11 @@ class TestSamplesEvaluations(AzureRecordedTestCase): - sample_evaluations_builtin_with_traces.py: Requires Azure Application Insights and uses azure-monitor-query to fetch traces. - sample_scheduled_evaluations.py: Requires Azure RBAC assignment via - azure-mgmt-authorization and azure-mgmt-resource. + azure-mgmt-authorization and azure-mgmt-resource, AND uploads Dataset. Complex prerequisites (require manual portal setup): - sample_continuous_evaluation_rule.py: Requires manual RBAC assignment in Azure Portal to enable continuous evaluation. - - sample_redteam_evaluations.py: Red team evaluations may require special - permissions or setup. 
""" # To run this test with a specific sample, use: @@ -101,6 +118,7 @@ class TestSamplesEvaluations(AzureRecordedTestCase): "sample_eval_catalog_prompt_based_evaluators.py", "sample_evaluation_compare_insight.py", "sample_agent_response_evaluation_with_function_tool.py", + "sample_redteam_evaluations.py", ], ), ) @@ -115,3 +133,65 @@ def test_evaluation_samples(self, sample_path: str, **kwargs) -> None: project_endpoint=kwargs["azure_ai_project_endpoint"], model=kwargs["azure_ai_model_deployment_name"], ) + + # To run this test with a specific sample, use: + # pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_agentic_evaluator_samples[sample_coherence] + @evaluationsPreparer() + @pytest.mark.parametrize( + "sample_path", + get_sample_paths( + "evaluations/agentic_evaluators", + samples_to_test=[ + "sample_coherence.py", + "sample_fluency.py", + "sample_groundedness.py", + "sample_intent_resolution.py", + "sample_relevance.py", + "sample_response_completeness.py", + "sample_task_adherence.py", + "sample_task_completion.py", + "sample_task_navigation_efficiency.py", + "sample_tool_call_accuracy.py", + "sample_tool_call_success.py", + "sample_tool_input_accuracy.py", + "sample_tool_output_utilization.py", + "sample_tool_selection.py", + ], + ), + ) + @SamplePathPasser() + @recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX) + def test_agentic_evaluator_samples(self, sample_path: str, **kwargs) -> None: + env_var_mapping = get_sample_environment_variables_map(kwargs) + executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs) + executor.execute() + executor.validate_print_calls_by_llm( + instructions=evaluations_instructions, + project_endpoint=kwargs["azure_ai_project_endpoint"], + model=kwargs["azure_ai_model_deployment_name"], + ) + + # To run this test, use: + # pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_generic_agentic_evaluator_sample + @evaluationsPreparer() + @recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX) + def test_generic_agentic_evaluator_sample(self, **kwargs) -> None: + # Manually construct path to nested sample + current_dir = os.path.dirname(os.path.abspath(__file__)) + samples_folder = os.path.normpath(os.path.join(current_dir, os.pardir, os.pardir)) + sample_path = os.path.join( + samples_folder, + "samples", + "evaluations", + "agentic_evaluators", + "sample_generic_agentic_evaluator", + "sample_generic_agentic_evaluator.py", + ) + env_var_mapping = get_sample_environment_variables_map(kwargs) + executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs) + executor.execute() + executor.validate_print_calls_by_llm( + instructions=evaluations_instructions, + project_endpoint=kwargs["azure_ai_project_endpoint"], + model=kwargs["azure_ai_model_deployment_name"], + ) From 7afdcbb43e6355998f194821c696a0e7fd139dd0 Mon Sep 17 00:00:00 2001 From: April Kwong Date: Thu, 22 Jan 2026 16:58:42 -0800 Subject: [PATCH 2/3] Address PR review comments: async executor consistency, robust timestamp regex --- sdk/ai/azure-ai-projects/tests/conftest.py | 2 +- sdk/ai/azure-ai-projects/tests/samples/sample_executor.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/ai/azure-ai-projects/tests/conftest.py b/sdk/ai/azure-ai-projects/tests/conftest.py index 7712fad211e0..8ab9b41a7523 100644 --- a/sdk/ai/azure-ai-projects/tests/conftest.py +++ b/sdk/ai/azure-ai-projects/tests/conftest.py @@ -129,7 
+129,7 @@ def sanitize_url_paths():
 # Sanitize Unix timestamps (10-digit numbers) in eval names
 # These appear in patterns like "Evaluation -1769111901" or "Eval Run for agent-test -1769111901"
 add_general_regex_sanitizer(
- regex=r"-17\d{8}",
+ regex=r"-\d{10}",
 value="-SANITIZED-TS"
 )
diff --git a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
index b872ac83c173..096383282ea7 100644
--- a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
+++ b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
@@ -27,7 +27,7 @@ from azure.ai.projects import AIProjectClient
 # Fixed timestamp for playback mode (Nov 2023).
-# Must match the sanitizer regex `-17\d{8}` in conftest.py which replaces it with `-SANITIZED-TS`.
+# Must match the sanitizer regex `-\d{10}` in conftest.py which replaces it with `-SANITIZED-TS`.
 PLAYBACK_TIMESTAMP = 1700000000
 from pytest import MonkeyPatch
 from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient
@@ -402,7 +402,9 @@ async def execute_async(self, patched_open_fn=None):
 if not is_live() and hasattr(self.module, "time"):
 self.module.time.sleep = lambda _: None
 self.module.time.time = lambda: PLAYBACK_TIMESTAMP
- await self.module.main()
+ # Call main() if it exists (samples wrap their code in main())
+ if hasattr(self.module, "main") and callable(self.module.main):
+ await self.module.main()
 async def validate_print_calls_by_llm_async(
 self,
From 39ba3b3b4e7f24f4562a5353702159e5f4b7c8af Mon Sep 17 00:00:00 2001
From: April Kwong
Date: Thu, 22 Jan 2026 21:51:34 -0800
Subject: [PATCH 3/3] Use specific regex patterns for timestamp sanitization
---
 sdk/ai/azure-ai-projects/tests/conftest.py | 13 +++++++++----
 .../tests/samples/sample_executor.py | 2 +-
 2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/sdk/ai/azure-ai-projects/tests/conftest.py b/sdk/ai/azure-ai-projects/tests/conftest.py
index 8ab9b41a7523..9fef075701c0 100644
--- a/sdk/ai/azure-ai-projects/tests/conftest.py
+++ b/sdk/ai/azure-ai-projects/tests/conftest.py
@@ -126,11 +126,16 @@ def sanitize_url_paths():
 value="eval-data-sanitized-timestamp"
 )
- # Sanitize Unix timestamps (10-digit numbers) in eval names
- # These appear in patterns like "Evaluation -1769111901" or "Eval Run for agent-test -1769111901"
+ # Sanitize Unix timestamps in eval names (from sample_redteam_evaluations.py)
+ # Pattern 1: "Red Team Agent Safety Evaluation -<timestamp>"
 add_general_regex_sanitizer(
- regex=r"-\d{10}",
- value="-SANITIZED-TS"
+ regex=r"Evaluation -\d{10}",
+ value="Evaluation -SANITIZED-TS"
+ )
+ # Pattern 2: "Eval Run for <agent-name> -<timestamp>" (agent name already sanitized)
+ add_general_regex_sanitizer(
+ regex=r"sanitized-agent-name -\d{10}",
+ value="sanitized-agent-name -SANITIZED-TS"
+ )
 # Sanitize API key from service response (this includes Application Insights connection string)
 add_body_key_sanitizer(json_path="credentials.key", value="sanitized-api-key")
diff --git a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
index 096383282ea7..a15b9b72b31d 100644
--- a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
+++ b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
@@ -27,7 +27,7 @@ from azure.ai.projects import AIProjectClient
 # Fixed timestamp for playback mode (Nov 2023).
-# Must match the sanitizer regex `-\d{10}` in conftest.py which replaces it with `-SANITIZED-TS`.
+# Must match the timestamp sanitizers in conftest.py (e.g., `Evaluation -\d{10}`).
PLAYBACK_TIMESTAMP = 1700000000 from pytest import MonkeyPatch from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient
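
Note on the playback-mode time patching used in these patches (a minimal sketch, not part of the patch series): after the executor calls exec_module on a sample, it reaches the stdlib time module through the sample's own `time` attribute and rebinds sleep to a no-op and time to the fixed PLAYBACK_TIMESTAMP, so polling loops return immediately and any eval-run name built from the current time is deterministic. The sketch below shows the same idea against a hypothetical, self-contained sample module; the sample source, its printed name format, and the helper names are invented for illustration, and the real executor loads samples from files via importlib rather than from a string.

import time
import types

PLAYBACK_TIMESTAMP = 1700000000  # same fixed value the patches pin time.time() to

# Hypothetical sample: polls with time.sleep and names its run from time.time(),
# roughly like the evaluation samples the executor drives.
SAMPLE_SOURCE = '''
import time

def main():
    time.sleep(5)  # stands in for a result-polling loop
    print(f"Evaluation -{int(time.time())}")
'''

def load_sample(name: str, source: str) -> types.ModuleType:
    # The real executor builds a module spec from a file path and calls
    # spec.loader.exec_module(); executing source into a fresh module object
    # keeps this sketch self-contained.
    module = types.ModuleType(name)
    exec(compile(source, name, "exec"), module.__dict__)
    return module

def run_in_playback(module: types.ModuleType) -> None:
    # Rebinding through module.time mutates the shared stdlib time module for
    # the whole process, so the originals are restored afterwards.
    original_sleep, original_time = time.sleep, time.time
    if hasattr(module, "time"):
        module.time.sleep = lambda _: None             # polling loops finish instantly
        module.time.time = lambda: PLAYBACK_TIMESTAMP  # deterministic generated names
    try:
        if hasattr(module, "main") and callable(module.main):
            module.main()
    finally:
        time.sleep, time.time = original_sleep, original_time

if __name__ == "__main__":
    sample = load_sample("hypothetical_sample", SAMPLE_SOURCE)
    run_in_playback(sample)  # prints "Evaluation -1700000000" without sleeping

In the patches, the fixed value 1700000000 is chosen so that names generated during playback still have the 10-digit timestamp shape the conftest.py sanitizers look for.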