Merged
2 changes: 1 addition & 1 deletion sdk/ai/azure-ai-projects/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/ai/azure-ai-projects",
"Tag": "python/ai/azure-ai-projects_257daffdb5"
"Tag": "python/ai/azure-ai-projects_6f9985fe6d"
}
12 changes: 12 additions & 0 deletions sdk/ai/azure-ai-projects/tests/conftest.py
@@ -122,6 +122,18 @@ def sanitize_url_paths():
# Sanitize eval dataset names with timestamps (e.g., eval-data-2026-01-19_040648_UTC)
add_general_regex_sanitizer(regex=r"eval-data-\d{4}-\d{2}-\d{2}_\d{6}_UTC", value="eval-data-sanitized-timestamp")

# Sanitize Unix timestamps in eval names (from sample_redteam_evaluations.py)
# Pattern 1: "Red Team Agent Safety Evaluation -<timestamp>"
add_general_regex_sanitizer(
regex=r"Evaluation -\d{10}",
value="Evaluation -SANITIZED-TS"
)
# Pattern 2: "Eval Run for <agent_name> -<timestamp>" (agent name already sanitized)
add_general_regex_sanitizer(
regex=r"sanitized-agent-name -\d{10}",
value="sanitized-agent-name -SANITIZED-TS"
)

# Sanitize API key from service response (this includes Application Insights connection string)
add_body_key_sanitizer(json_path="credentials.key", value="sanitized-api-key")

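The two regex sanitizers added above strip Unix timestamps out of evaluation names so that recordings stay stable between runs. A minimal standalone sketch of the substitution they perform (the input strings below are made up for illustration, not taken from a real recording):

import re

# Hypothetical names as a sample might generate them (values are illustrative).
raw_names = [
    "Red Team Agent Safety Evaluation -1700000000",
    "Eval Run for sanitized-agent-name -1700000000",
]
for name in raw_names:
    name = re.sub(r"Evaluation -\d{10}", "Evaluation -SANITIZED-TS", name)
    name = re.sub(r"sanitized-agent-name -\d{10}", "sanitized-agent-name -SANITIZED-TS", name)
    print(name)
# -> Red Team Agent Safety Evaluation -SANITIZED-TS
# -> Eval Run for sanitized-agent-name -SANITIZED-TS
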
27 changes: 26 additions & 1 deletion sdk/ai/azure-ai-projects/tests/samples/sample_executor.py
@@ -11,6 +11,7 @@
import inspect
import importlib.util
import functools

from dataclasses import dataclass, field
from typing import overload, Union, Optional
from pydantic import BaseModel
@@ -22,7 +23,12 @@
from azure.core.credentials_async import AsyncTokenCredential
from devtools_testutils.fake_credentials import FakeTokenCredential
from devtools_testutils.fake_credentials_async import AsyncFakeCredential
from devtools_testutils import is_live
from azure.ai.projects import AIProjectClient

# Fixed timestamp for playback mode (Nov 2023).
# Must match the timestamp sanitizers in conftest.py (e.g., `Evaluation -\d{10}`).
PLAYBACK_TIMESTAMP = 1700000000
from pytest import MonkeyPatch
from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient

@@ -312,6 +318,16 @@ def execute(self, patched_open_fn=None):
mock.patch("builtins.open", side_effect=patched_open_fn),
):
self.spec.loader.exec_module(self.module)
# In playback mode, patch time functions on the module:
# - time.sleep: avoid waiting for polling loops (instant)
# - time.time: return fixed value for deterministic request bodies
# Must be done after exec_module so the module's 'time' reference can be patched.
if not is_live() and hasattr(self.module, "time"):
self.module.time.sleep = lambda _: None
self.module.time.time = lambda: PLAYBACK_TIMESTAMP
# Call main() if it exists (samples wrap their code in main())
if hasattr(self.module, "main") and callable(self.module.main):
self.module.main()

def validate_print_calls_by_llm(
self,
@@ -379,7 +395,16 @@ async def execute_async(self, patched_open_fn=None):
if self.spec.loader is None:
raise ImportError(f"Could not load module {self.spec.name} from {self.sample_path}")
self.spec.loader.exec_module(self.module)
await self.module.main()
# In playback mode, patch time functions on the module:
# - time.sleep: avoid waiting for polling loops (instant)
# - time.time: return fixed value for deterministic request bodies
# Must be done after exec_module so the module's 'time' reference can be patched.
if not is_live() and hasattr(self.module, "time"):
self.module.time.sleep = lambda _: None
self.module.time.time = lambda: PLAYBACK_TIMESTAMP
# Call main() if it exists (samples wrap their code in main())
if hasattr(self.module, "main") and callable(self.module.main):
await self.module.main()

async def validate_print_calls_by_llm_async(
self,
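The executor changes above work by patching attributes on the sample module's own time reference after exec_module has run, so the sample's main() sees the fixed PLAYBACK_TIMESTAMP instead of the wall clock. A minimal sketch of the idea, using a hypothetical in-memory sample module rather than the real test harness:

import types

# Hypothetical sample source; the real samples build evaluation names from time.time().
SAMPLE_SOURCE = '''
import time

def main():
    print(f"Red Team Agent Safety Evaluation -{int(time.time())}")
'''

module = types.ModuleType("fake_sample")
exec(SAMPLE_SOURCE, module.__dict__)

# The same patching the executor performs in playback mode.
module.time.sleep = lambda _: None
module.time.time = lambda: 1700000000  # PLAYBACK_TIMESTAMP

module.main()  # prints "Red Team Agent Safety Evaluation -1700000000"

Because module.time is the shared stdlib module object, the assignment is effectively process-wide until it is restored.
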
sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py
@@ -4,6 +4,7 @@
# Licensed under the MIT License.
# ------------------------------------
import functools
import os
import pytest
from devtools_testutils import recorded_by_proxy, AzureRecordedTestCase, RecordedTransport, EnvironmentVariableLoader
from sample_executor import (
@@ -44,7 +45,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase):
"""
Tests for evaluation samples.

Included samples (9):
Included samples (25):

Main evaluation samples (10):
- sample_agent_evaluation.py
- sample_model_evaluation.py
- sample_agent_response_evaluation.py
@@ -54,8 +57,24 @@
- sample_eval_catalog_code_based_evaluators.py
- sample_eval_catalog_prompt_based_evaluators.py
- sample_evaluation_compare_insight.py
- sample_redteam_evaluations.py

More samples will be added in the future.
Agentic evaluator samples (15):
- sample_coherence.py
- sample_fluency.py
- sample_groundedness.py
- sample_intent_resolution.py
- sample_relevance.py
- sample_response_completeness.py
- sample_task_adherence.py
- sample_task_completion.py
- sample_task_navigation_efficiency.py
- sample_tool_call_accuracy.py
- sample_tool_call_success.py
- sample_tool_input_accuracy.py
- sample_tool_output_utilization.py
- sample_tool_selection.py
- sample_generic_agentic_evaluator.py

Excluded samples and reasons:

@@ -76,13 +95,11 @@ class TestSamplesEvaluations(AzureRecordedTestCase):
- sample_evaluations_builtin_with_traces.py: Requires Azure Application Insights and
uses azure-monitor-query to fetch traces.
- sample_scheduled_evaluations.py: Requires Azure RBAC assignment via
azure-mgmt-authorization and azure-mgmt-resource.
azure-mgmt-authorization and azure-mgmt-resource, AND uploads Dataset.

Complex prerequisites (require manual portal setup):
- sample_continuous_evaluation_rule.py: Requires manual RBAC assignment in Azure
Portal to enable continuous evaluation.
- sample_redteam_evaluations.py: Red team evaluations may require special
permissions or setup.
"""

# To run this test with a specific sample, use:
@@ -102,6 +119,7 @@ class TestSamplesEvaluations(AzureRecordedTestCase):
"sample_eval_catalog_prompt_based_evaluators.py",
"sample_evaluation_compare_insight.py",
"sample_agent_response_evaluation_with_function_tool.py",
"sample_redteam_evaluations.py",
],
),
)
@@ -116,3 +134,65 @@ def test_evaluation_samples(self, sample_path: str, **kwargs) -> None:
project_endpoint=kwargs["azure_ai_project_endpoint"],
model=kwargs["azure_ai_model_deployment_name"],
)

# To run this test with a specific sample, use:
# pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_agentic_evaluator_samples[sample_coherence]
@evaluationsPreparer()
@pytest.mark.parametrize(
"sample_path",
get_sample_paths(
"evaluations/agentic_evaluators",
samples_to_test=[
"sample_coherence.py",
"sample_fluency.py",
"sample_groundedness.py",
"sample_intent_resolution.py",
"sample_relevance.py",
"sample_response_completeness.py",
"sample_task_adherence.py",
"sample_task_completion.py",
"sample_task_navigation_efficiency.py",
"sample_tool_call_accuracy.py",
"sample_tool_call_success.py",
"sample_tool_input_accuracy.py",
"sample_tool_output_utilization.py",
"sample_tool_selection.py",
],
),
)
@SamplePathPasser()
@recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX)
def test_agentic_evaluator_samples(self, sample_path: str, **kwargs) -> None:
env_var_mapping = get_sample_environment_variables_map(kwargs)
executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs)
executor.execute()
executor.validate_print_calls_by_llm(
instructions=evaluations_instructions,
project_endpoint=kwargs["azure_ai_project_endpoint"],
model=kwargs["azure_ai_model_deployment_name"],
)

# To run this test, use:
# pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_generic_agentic_evaluator_sample
@evaluationsPreparer()
@recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX)
def test_generic_agentic_evaluator_sample(self, **kwargs) -> None:
# Manually construct path to nested sample
current_dir = os.path.dirname(os.path.abspath(__file__))
samples_folder = os.path.normpath(os.path.join(current_dir, os.pardir, os.pardir))
sample_path = os.path.join(
samples_folder,
"samples",
"evaluations",
"agentic_evaluators",
"sample_generic_agentic_evaluator",
"sample_generic_agentic_evaluator.py",
)
env_var_mapping = get_sample_environment_variables_map(kwargs)
executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs)
executor.execute()
executor.validate_print_calls_by_llm(
instructions=evaluations_instructions,
project_endpoint=kwargs["azure_ai_project_endpoint"],
model=kwargs["azure_ai_model_deployment_name"],
)
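
For reference, a quick sketch of how the manual join in test_generic_agentic_evaluator_sample resolves the nested sample path (the checkout location below is hypothetical):

import os

current_dir = "/work/azure-sdk-for-python/sdk/ai/azure-ai-projects/tests/samples"  # hypothetical clone path
samples_folder = os.path.normpath(os.path.join(current_dir, os.pardir, os.pardir))
# samples_folder -> /work/azure-sdk-for-python/sdk/ai/azure-ai-projects  (the package root)
sample_path = os.path.join(
    samples_folder,
    "samples",
    "evaluations",
    "agentic_evaluators",
    "sample_generic_agentic_evaluator",
    "sample_generic_agentic_evaluator.py",
)
print(sample_path)
# -> /work/azure-sdk-for-python/sdk/ai/azure-ai-projects/samples/evaluations/
#    agentic_evaluators/sample_generic_agentic_evaluator/sample_generic_agentic_evaluator.py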