Merged
2 changes: 1 addition & 1 deletion sdk/ai/azure-ai-projects/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/ai/azure-ai-projects",
"Tag": "python/ai/azure-ai-projects_257daffdb5"
"Tag": "python/ai/azure-ai-projects_6f9985fe6d"
}
12 changes: 12 additions & 0 deletions sdk/ai/azure-ai-projects/tests/conftest.py
@@ -122,6 +122,18 @@ def sanitize_url_paths():
# Sanitize eval dataset names with timestamps (e.g., eval-data-2026-01-19_040648_UTC)
add_general_regex_sanitizer(regex=r"eval-data-\d{4}-\d{2}-\d{2}_\d{6}_UTC", value="eval-data-sanitized-timestamp")

# Sanitize Unix timestamps in eval names (from sample_redteam_evaluations.py)
# Pattern 1: "Red Team Agent Safety Evaluation -<timestamp>"
add_general_regex_sanitizer(
regex=r"Evaluation -\d{10}",
value="Evaluation -SANITIZED-TS"
)
# Pattern 2: "Eval Run for <agent_name> -<timestamp>" (agent name already sanitized)
add_general_regex_sanitizer(
regex=r"sanitized-agent-name -\d{10}",
value="sanitized-agent-name -SANITIZED-TS"
)

# Sanitize API key from service response (this includes Application Insights connection string)
add_body_key_sanitizer(json_path="credentials.key", value="sanitized-api-key")

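The two regex sanitizers added above strip Unix timestamps out of evaluation names so that recordings stay stable between runs. A minimal standalone sketch of the substitution they perform (the input strings below are made up for illustration, not taken from a real recording):

import re

# Hypothetical names as a sample might generate them (values are illustrative).
raw_names = [
    "Red Team Agent Safety Evaluation -1700000000",
    "Eval Run for sanitized-agent-name -1700000000",
]
for name in raw_names:
    name = re.sub(r"Evaluation -\d{10}", "Evaluation -SANITIZED-TS", name)
    name = re.sub(r"sanitized-agent-name -\d{10}", "sanitized-agent-name -SANITIZED-TS", name)
    print(name)
# -> Red Team Agent Safety Evaluation -SANITIZED-TS
# -> Eval Run for sanitized-agent-name -SANITIZED-TS
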
27 changes: 26 additions & 1 deletion sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
@@ -11,6 +11,7 @@
import inspect
import importlib.util
import functools

from dataclasses import dataclass, field
from typing import overload, Union, Optional
from pydantic import BaseModel
@@ -22,7 +23,12 @@
from azure.core.credentials_async import AsyncTokenCredential
from devtools_testutils.fake_credentials import FakeTokenCredential
from devtools_testutils.fake_credentials_async import AsyncFakeCredential
from devtools_testutils import is_live
from azure.ai.projects import AIProjectClient

# Fixed timestamp for playback mode (Nov 2023).
# Must match the timestamp sanitizers in conftest.py (e.g., `Evaluation -\d{10}`).
PLAYBACK_TIMESTAMP = 1700000000
from pytest import MonkeyPatch
from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient

@@ -312,6 +318,16 @@ def execute(self, patched_open_fn=None):
mock.patch("builtins.open", side_effect=patched_open_fn),
):
self.spec.loader.exec_module(self.module)
# In playback mode, patch time functions on the module:
# - time.sleep: avoid waiting for polling loops (instant)
# - time.time: return fixed value for deterministic request bodies
# Must be done after exec_module so the module's 'time' reference can be patched.
if not is_live() and hasattr(self.module, "time"):
self.module.time.sleep = lambda _: None
self.module.time.time = lambda: PLAYBACK_TIMESTAMP
# Call main() if it exists (samples wrap their code in main())
if hasattr(self.module, "main") and callable(self.module.main):
self.module.main()

def validate_print_calls_by_llm(
self,
@@ -379,7 +395,16 @@ async def execute_async(self, patched_open_fn=None):
if self.spec.loader is None:
raise ImportError(f"Could not load module {self.spec.name} from {self.sample_path}")
self.spec.loader.exec_module(self.module)
await self.module.main()
# In playback mode, patch time functions on the module:
# - time.sleep: avoid waiting for polling loops (instant)
# - time.time: return fixed value for deterministic request bodies
# Must be done after exec_module so the module's 'time' reference can be patched.
if not is_live() and hasattr(self.module, "time"):
self.module.time.sleep = lambda _: None
self.module.time.time = lambda: PLAYBACK_TIMESTAMP
# Call main() if it exists (samples wrap their code in main())
if hasattr(self.module, "main") and callable(self.module.main):
await self.module.main()

async def validate_print_calls_by_llm_async(
self,
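The executor changes above work by patching attributes on the sample module's own time reference after exec_module has run, so the sample's main() sees the fixed PLAYBACK_TIMESTAMP instead of the wall clock. A minimal sketch of the idea, using a hypothetical in-memory sample module rather than the real test harness:

import types

# Hypothetical sample source; the real samples build evaluation names from time.time().
SAMPLE_SOURCE = '''
import time

def main():
    print(f"Red Team Agent Safety Evaluation -{int(time.time())}")
'''

module = types.ModuleType("fake_sample")
exec(SAMPLE_SOURCE, module.__dict__)

# The same patching the executor performs in playback mode.
module.time.sleep = lambda _: None
module.time.time = lambda: 1700000000  # PLAYBACK_TIMESTAMP

module.main()  # prints "Red Team Agent Safety Evaluation -1700000000"

Because module.time is the shared stdlib module object, the assignment is effectively process-wide until it is restored.
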
sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py
@@ -4,6 +4,7 @@
# Licensed under the MIT License.
# ------------------------------------
import functools
import os
import pytest
from devtools_testutils import recorded_by_proxy, AzureRecordedTestCase, RecordedTransport, EnvironmentVariableLoader
from sample_executor import (
@@ -44,7 +45,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase):
"""
Tests for evaluation samples.

Included samples (9):
Included samples (25):

Main evaluation samples (10):
- sample_agent_evaluation.py
- sample_model_evaluation.py
- sample_agent_response_evaluation.py
@@ -54,8 +57,24 @@
- sample_eval_catalog_code_based_evaluators.py
- sample_eval_catalog_prompt_based_evaluators.py
- sample_evaluation_compare_insight.py
- sample_redteam_evaluations.py

More samples will be added in the future.
Agentic evaluator samples (15):
- sample_coherence.py
- sample_fluency.py
- sample_groundedness.py
- sample_intent_resolution.py
- sample_relevance.py
- sample_response_completeness.py
- sample_task_adherence.py
- sample_task_completion.py
- sample_task_navigation_efficiency.py
- sample_tool_call_accuracy.py
- sample_tool_call_success.py
- sample_tool_input_accuracy.py
- sample_tool_output_utilization.py
- sample_tool_selection.py
- sample_generic_agentic_evaluator.py

Excluded samples and reasons:

@@ -76,13 +95,11 @@ class TestSamplesEvaluations(AzureRecordedTestCase):
- sample_evaluations_builtin_with_traces.py: Requires Azure Application Insights and
uses azure-monitor-query to fetch traces.
- sample_scheduled_evaluations.py: Requires Azure RBAC assignment via
azure-mgmt-authorization and azure-mgmt-resource.
azure-mgmt-authorization and azure-mgmt-resource, AND uploads Dataset.

Complex prerequisites (require manual portal setup):
- sample_continuous_evaluation_rule.py: Requires manual RBAC assignment in Azure
Portal to enable continuous evaluation.
- sample_redteam_evaluations.py: Red team evaluations may require special
permissions or setup.
"""

# To run this test with a specific sample, use:
@@ -102,6 +119,7 @@ class TestSamplesEvaluations(AzureRecordedTestCase):
"sample_eval_catalog_prompt_based_evaluators.py",
"sample_evaluation_compare_insight.py",
"sample_agent_response_evaluation_with_function_tool.py",
"sample_redteam_evaluations.py",
],
),
)
@@ -116,3 +134,65 @@ def test_evaluation_samples(self, sample_path: str, **kwargs) -> None:
project_endpoint=kwargs["azure_ai_project_endpoint"],
model=kwargs["azure_ai_model_deployment_name"],
)

# To run this test with a specific sample, use:
# pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_agentic_evaluator_samples[sample_coherence]
@evaluationsPreparer()
@pytest.mark.parametrize(
"sample_path",
get_sample_paths(
"evaluations/agentic_evaluators",
samples_to_test=[
"sample_coherence.py",
"sample_fluency.py",
"sample_groundedness.py",
"sample_intent_resolution.py",
"sample_relevance.py",
"sample_response_completeness.py",
"sample_task_adherence.py",
"sample_task_completion.py",
"sample_task_navigation_efficiency.py",
"sample_tool_call_accuracy.py",
"sample_tool_call_success.py",
"sample_tool_input_accuracy.py",
"sample_tool_output_utilization.py",
"sample_tool_selection.py",
],
),
)
@SamplePathPasser()
@recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX)
def test_agentic_evaluator_samples(self, sample_path: str, **kwargs) -> None:
env_var_mapping = get_sample_environment_variables_map(kwargs)
executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs)
executor.execute()
executor.validate_print_calls_by_llm(
instructions=evaluations_instructions,
project_endpoint=kwargs["azure_ai_project_endpoint"],
model=kwargs["azure_ai_model_deployment_name"],
)

# To run this test, use:
# pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_generic_agentic_evaluator_sample
@evaluationsPreparer()
@recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX)
def test_generic_agentic_evaluator_sample(self, **kwargs) -> None:
# Manually construct path to nested sample
current_dir = os.path.dirname(os.path.abspath(__file__))
samples_folder = os.path.normpath(os.path.join(current_dir, os.pardir, os.pardir))
sample_path = os.path.join(
samples_folder,
"samples",
"evaluations",
"agentic_evaluators",
"sample_generic_agentic_evaluator",
"sample_generic_agentic_evaluator.py",
)
env_var_mapping = get_sample_environment_variables_map(kwargs)
executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs)
executor.execute()
executor.validate_print_calls_by_llm(
instructions=evaluations_instructions,
project_endpoint=kwargs["azure_ai_project_endpoint"],
model=kwargs["azure_ai_model_deployment_name"],
)
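
For reference, a quick sketch of how the manual join in test_generic_agentic_evaluator_sample resolves the nested sample path (the checkout location below is hypothetical):

import os

current_dir = "/work/azure-sdk-for-python/sdk/ai/azure-ai-projects/tests/samples"  # hypothetical clone path
samples_folder = os.path.normpath(os.path.join(current_dir, os.pardir, os.pardir))
# samples_folder -> /work/azure-sdk-for-python/sdk/ai/azure-ai-projects  (the package root)
sample_path = os.path.join(
    samples_folder,
    "samples",
    "evaluations",
    "agentic_evaluators",
    "sample_generic_agentic_evaluator",
    "sample_generic_agentic_evaluator.py",
)
print(sample_path)
# -> /work/azure-sdk-for-python/sdk/ai/azure-ai-projects/samples/evaluations/
#    agentic_evaluators/sample_generic_agentic_evaluator/sample_generic_agentic_evaluator.py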