From 05786e6aa525eeab4581e0f6cfb50b7dcdbc7fd6 Mon Sep 17 00:00:00 2001 From: April Kwong Date: Thu, 22 Jan 2026 16:37:03 -0800 Subject: [PATCH 1/3] Added more eval sample tests --- sdk/ai/azure-ai-projects/assets.json | 2 +- sdk/ai/azure-ai-projects/tests/conftest.py | 7 ++ .../tests/samples/sample_executor.py | 23 +++++ .../tests/samples/test_samples_evaluations.py | 90 +++++++++++++++++-- 4 files changed, 116 insertions(+), 6 deletions(-) diff --git a/sdk/ai/azure-ai-projects/assets.json b/sdk/ai/azure-ai-projects/assets.json index 9c6015cc80f1..490af2ec735e 100644 --- a/sdk/ai/azure-ai-projects/assets.json +++ b/sdk/ai/azure-ai-projects/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/ai/azure-ai-projects", - "Tag": "python/ai/azure-ai-projects_257daffdb5" + "Tag": "python/ai/azure-ai-projects_6f9985fe6d" } diff --git a/sdk/ai/azure-ai-projects/tests/conftest.py b/sdk/ai/azure-ai-projects/tests/conftest.py index b47a817688e7..7712fad211e0 100644 --- a/sdk/ai/azure-ai-projects/tests/conftest.py +++ b/sdk/ai/azure-ai-projects/tests/conftest.py @@ -126,6 +126,13 @@ def sanitize_url_paths(): value="eval-data-sanitized-timestamp" ) + # Sanitize Unix timestamps (10-digit numbers) in eval names + # These appear in patterns like "Evaluation -1769111901" or "Eval Run for agent-test -1769111901" + add_general_regex_sanitizer( + regex=r"-17\d{8}", + value="-SANITIZED-TS" + ) + # Sanitize API key from service response (this includes Application Insights connection string) add_body_key_sanitizer(json_path="credentials.key", value="sanitized-api-key") diff --git a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py index d23b17295716..b872ac83c173 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py +++ b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py @@ -11,6 +11,7 @@ import inspect import importlib.util import functools + from dataclasses import dataclass, field from typing import overload, Union, Optional from pydantic import BaseModel @@ -22,7 +23,12 @@ from azure.core.credentials_async import AsyncTokenCredential from devtools_testutils.fake_credentials import FakeTokenCredential from devtools_testutils.fake_credentials_async import AsyncFakeCredential +from devtools_testutils import is_live from azure.ai.projects import AIProjectClient + +# Fixed timestamp for playback mode (Nov 2023). +# Must match the sanitizer regex `-17\d{8}` in conftest.py which replaces it with `-SANITIZED-TS`. +PLAYBACK_TIMESTAMP = 1700000000 from pytest import MonkeyPatch from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient @@ -312,6 +318,16 @@ def execute(self, patched_open_fn=None): mock.patch("builtins.open", side_effect=patched_open_fn), ): self.spec.loader.exec_module(self.module) + # In playback mode, patch time functions on the module: + # - time.sleep: avoid waiting for polling loops (instant) + # - time.time: return fixed value for deterministic request bodies + # Must be done after exec_module so the module's 'time' reference can be patched. 
+ if not is_live() and hasattr(self.module, "time"): + self.module.time.sleep = lambda _: None + self.module.time.time = lambda: PLAYBACK_TIMESTAMP + # Call main() if it exists (samples wrap their code in main()) + if hasattr(self.module, "main") and callable(self.module.main): + self.module.main() def validate_print_calls_by_llm( self, @@ -379,6 +395,13 @@ async def execute_async(self, patched_open_fn=None): if self.spec.loader is None: raise ImportError(f"Could not load module {self.spec.name} from {self.sample_path}") self.spec.loader.exec_module(self.module) + # In playback mode, patch time functions on the module: + # - time.sleep: avoid waiting for polling loops (instant) + # - time.time: return fixed value for deterministic request bodies + # Must be done after exec_module so the module's 'time' reference can be patched. + if not is_live() and hasattr(self.module, "time"): + self.module.time.sleep = lambda _: None + self.module.time.time = lambda: PLAYBACK_TIMESTAMP await self.module.main() async def validate_print_calls_by_llm_async( diff --git a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py index 241d4652b581..4a3fa5acf24b 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py +++ b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py @@ -4,6 +4,7 @@ # Licensed under the MIT License. # ------------------------------------ import functools +import os import pytest from devtools_testutils import recorded_by_proxy, AzureRecordedTestCase, RecordedTransport, EnvironmentVariableLoader from sample_executor import ( @@ -43,7 +44,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase): """ Tests for evaluation samples. - Included samples (9): + Included samples (25): + + Main evaluation samples (10): - sample_agent_evaluation.py - sample_model_evaluation.py - sample_agent_response_evaluation.py @@ -53,8 +56,24 @@ class TestSamplesEvaluations(AzureRecordedTestCase): - sample_eval_catalog_code_based_evaluators.py - sample_eval_catalog_prompt_based_evaluators.py - sample_evaluation_compare_insight.py + - sample_redteam_evaluations.py - More samples will be added in the future. + Agentic evaluator samples (15): + - sample_coherence.py + - sample_fluency.py + - sample_groundedness.py + - sample_intent_resolution.py + - sample_relevance.py + - sample_response_completeness.py + - sample_task_adherence.py + - sample_task_completion.py + - sample_task_navigation_efficiency.py + - sample_tool_call_accuracy.py + - sample_tool_call_success.py + - sample_tool_input_accuracy.py + - sample_tool_output_utilization.py + - sample_tool_selection.py + - sample_generic_agentic_evaluator.py Excluded samples and reasons: @@ -75,13 +94,11 @@ class TestSamplesEvaluations(AzureRecordedTestCase): - sample_evaluations_builtin_with_traces.py: Requires Azure Application Insights and uses azure-monitor-query to fetch traces. - sample_scheduled_evaluations.py: Requires Azure RBAC assignment via - azure-mgmt-authorization and azure-mgmt-resource. + azure-mgmt-authorization and azure-mgmt-resource, AND uploads Dataset. Complex prerequisites (require manual portal setup): - sample_continuous_evaluation_rule.py: Requires manual RBAC assignment in Azure Portal to enable continuous evaluation. - - sample_redteam_evaluations.py: Red team evaluations may require special - permissions or setup. 
""" # To run this test with a specific sample, use: @@ -101,6 +118,7 @@ class TestSamplesEvaluations(AzureRecordedTestCase): "sample_eval_catalog_prompt_based_evaluators.py", "sample_evaluation_compare_insight.py", "sample_agent_response_evaluation_with_function_tool.py", + "sample_redteam_evaluations.py", ], ), ) @@ -115,3 +133,65 @@ def test_evaluation_samples(self, sample_path: str, **kwargs) -> None: project_endpoint=kwargs["azure_ai_project_endpoint"], model=kwargs["azure_ai_model_deployment_name"], ) + + # To run this test with a specific sample, use: + # pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_agentic_evaluator_samples[sample_coherence] + @evaluationsPreparer() + @pytest.mark.parametrize( + "sample_path", + get_sample_paths( + "evaluations/agentic_evaluators", + samples_to_test=[ + "sample_coherence.py", + "sample_fluency.py", + "sample_groundedness.py", + "sample_intent_resolution.py", + "sample_relevance.py", + "sample_response_completeness.py", + "sample_task_adherence.py", + "sample_task_completion.py", + "sample_task_navigation_efficiency.py", + "sample_tool_call_accuracy.py", + "sample_tool_call_success.py", + "sample_tool_input_accuracy.py", + "sample_tool_output_utilization.py", + "sample_tool_selection.py", + ], + ), + ) + @SamplePathPasser() + @recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX) + def test_agentic_evaluator_samples(self, sample_path: str, **kwargs) -> None: + env_var_mapping = get_sample_environment_variables_map(kwargs) + executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs) + executor.execute() + executor.validate_print_calls_by_llm( + instructions=evaluations_instructions, + project_endpoint=kwargs["azure_ai_project_endpoint"], + model=kwargs["azure_ai_model_deployment_name"], + ) + + # To run this test, use: + # pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_generic_agentic_evaluator_sample + @evaluationsPreparer() + @recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX) + def test_generic_agentic_evaluator_sample(self, **kwargs) -> None: + # Manually construct path to nested sample + current_dir = os.path.dirname(os.path.abspath(__file__)) + samples_folder = os.path.normpath(os.path.join(current_dir, os.pardir, os.pardir)) + sample_path = os.path.join( + samples_folder, + "samples", + "evaluations", + "agentic_evaluators", + "sample_generic_agentic_evaluator", + "sample_generic_agentic_evaluator.py", + ) + env_var_mapping = get_sample_environment_variables_map(kwargs) + executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs) + executor.execute() + executor.validate_print_calls_by_llm( + instructions=evaluations_instructions, + project_endpoint=kwargs["azure_ai_project_endpoint"], + model=kwargs["azure_ai_model_deployment_name"], + ) From 7afdcbb43e6355998f194821c696a0e7fd139dd0 Mon Sep 17 00:00:00 2001 From: April Kwong Date: Thu, 22 Jan 2026 16:58:42 -0800 Subject: [PATCH 2/3] Address PR review comments: async executor consistency, robust timestamp regex --- sdk/ai/azure-ai-projects/tests/conftest.py | 2 +- sdk/ai/azure-ai-projects/tests/samples/sample_executor.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/ai/azure-ai-projects/tests/conftest.py b/sdk/ai/azure-ai-projects/tests/conftest.py index 7712fad211e0..8ab9b41a7523 100644 --- a/sdk/ai/azure-ai-projects/tests/conftest.py +++ b/sdk/ai/azure-ai-projects/tests/conftest.py @@ -129,7 
+129,7 @@ def sanitize_url_paths():
 # Sanitize Unix timestamps (10-digit numbers) in eval names
 # These appear in patterns like "Evaluation -1769111901" or "Eval Run for agent-test -1769111901"
 add_general_regex_sanitizer(
- regex=r"-17\d{8}",
+ regex=r"-\d{10}",
 value="-SANITIZED-TS"
 )
diff --git a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
index b872ac83c173..096383282ea7 100644
--- a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
+++ b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
@@ -27,7 +27,7 @@ from azure.ai.projects import AIProjectClient
 # Fixed timestamp for playback mode (Nov 2023).
-# Must match the sanitizer regex `-17\d{8}` in conftest.py which replaces it with `-SANITIZED-TS`.
+# Must match the sanitizer regex `-\d{10}` in conftest.py which replaces it with `-SANITIZED-TS`.
 PLAYBACK_TIMESTAMP = 1700000000
 from pytest import MonkeyPatch
 from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient
@@ -402,7 +402,9 @@ async def execute_async(self, patched_open_fn=None):
 if not is_live() and hasattr(self.module, "time"):
 self.module.time.sleep = lambda _: None
 self.module.time.time = lambda: PLAYBACK_TIMESTAMP
- await self.module.main()
+ # Call main() if it exists (samples wrap their code in main())
+ if hasattr(self.module, "main") and callable(self.module.main):
+ await self.module.main()
 async def validate_print_calls_by_llm_async(
 self,
From 39ba3b3b4e7f24f4562a5353702159e5f4b7c8af Mon Sep 17 00:00:00 2001
From: April Kwong
Date: Thu, 22 Jan 2026 21:51:34 -0800
Subject: [PATCH 3/3] Use specific regex patterns for timestamp sanitization
---
 sdk/ai/azure-ai-projects/tests/conftest.py | 13 +++++++++----
 .../tests/samples/sample_executor.py | 2 +-
 2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/sdk/ai/azure-ai-projects/tests/conftest.py b/sdk/ai/azure-ai-projects/tests/conftest.py
index 8ab9b41a7523..9fef075701c0 100644
--- a/sdk/ai/azure-ai-projects/tests/conftest.py
+++ b/sdk/ai/azure-ai-projects/tests/conftest.py
@@ -126,11 +126,16 @@ def sanitize_url_paths():
 value="eval-data-sanitized-timestamp"
 )
- # Sanitize Unix timestamps (10-digit numbers) in eval names
- # These appear in patterns like "Evaluation -1769111901" or "Eval Run for agent-test -1769111901"
+ # Sanitize Unix timestamps in eval names (from sample_redteam_evaluations.py)
+ # Pattern 1: "Red Team Agent Safety Evaluation -<timestamp>"
 add_general_regex_sanitizer(
- regex=r"-\d{10}",
- value="-SANITIZED-TS"
+ regex=r"Evaluation -\d{10}",
+ value="Evaluation -SANITIZED-TS"
+ )
+ # Pattern 2: "Eval Run for <agent-name> -<timestamp>" (agent name already sanitized)
+ add_general_regex_sanitizer(
+ regex=r"sanitized-agent-name -\d{10}",
+ value="sanitized-agent-name -SANITIZED-TS"
+ )
 # Sanitize API key from service response (this includes Application Insights connection string)
 add_body_key_sanitizer(json_path="credentials.key", value="sanitized-api-key")
diff --git a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
index 096383282ea7..a15b9b72b31d 100644
--- a/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
+++ b/sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
@@ -27,7 +27,7 @@ from azure.ai.projects import AIProjectClient
 # Fixed timestamp for playback mode (Nov 2023).
-# Must match the sanitizer regex `-\d{10}` in conftest.py which replaces it with `-SANITIZED-TS`.
+# Must match the timestamp sanitizers in conftest.py (e.g., `Evaluation -\d{10}`).
PLAYBACK_TIMESTAMP = 1700000000 from pytest import MonkeyPatch from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient
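
Note on the playback-mode time patching used in these patches (a minimal sketch, not part of the patch series): after the executor calls exec_module on a sample, it reaches the stdlib time module through the sample's own `time` attribute and rebinds sleep to a no-op and time to the fixed PLAYBACK_TIMESTAMP, so polling loops return immediately and any eval-run name built from the current time is deterministic. The sketch below shows the same idea against a hypothetical, self-contained sample module; the sample source, its printed name format, and the helper names are invented for illustration, and the real executor loads samples from files via importlib rather than from a string.

import time
import types

PLAYBACK_TIMESTAMP = 1700000000  # same fixed value the patches pin time.time() to

# Hypothetical sample: polls with time.sleep and names its run from time.time(),
# roughly like the evaluation samples the executor drives.
SAMPLE_SOURCE = '''
import time

def main():
    time.sleep(5)  # stands in for a result-polling loop
    print(f"Evaluation -{int(time.time())}")
'''

def load_sample(name: str, source: str) -> types.ModuleType:
    # The real executor builds a module spec from a file path and calls
    # spec.loader.exec_module(); executing source into a fresh module object
    # keeps this sketch self-contained.
    module = types.ModuleType(name)
    exec(compile(source, name, "exec"), module.__dict__)
    return module

def run_in_playback(module: types.ModuleType) -> None:
    # Rebinding through module.time mutates the shared stdlib time module for
    # the whole process, so the originals are restored afterwards.
    original_sleep, original_time = time.sleep, time.time
    if hasattr(module, "time"):
        module.time.sleep = lambda _: None             # polling loops finish instantly
        module.time.time = lambda: PLAYBACK_TIMESTAMP  # deterministic generated names
    try:
        if hasattr(module, "main") and callable(module.main):
            module.main()
    finally:
        time.sleep, time.time = original_sleep, original_time

if __name__ == "__main__":
    sample = load_sample("hypothetical_sample", SAMPLE_SOURCE)
    run_in_playback(sample)  # prints "Evaluation -1700000000" without sleeping

In the patches, the fixed value 1700000000 is chosen so that names generated during playback still have the 10-digit timestamp shape the conftest.py sanitizers look for.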