# pylint: disable=line-too-long,useless-suppression
# ------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# ------------------------------------
"""Customize generated code here.

Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
"""
import os
import logging
from typing import Any, IO, List, Tuple, Optional, Union
from pathlib import Path
from urllib.parse import urlsplit
from azure.storage.blob.aio import ContainerClient
from azure.core.tracing.decorator_async import distributed_trace_async
from azure.core.exceptions import HttpResponseError, ResourceNotFoundError
from ._operations import BetaEvaluatorsOperations as EvaluatorsOperationsGenerated, JSON
from ...models._models import (
    EvaluatorVersion,
)

logger = logging.getLogger(__name__)


class EvaluatorsOperations(EvaluatorsOperationsGenerated):
    """
    .. warning::
        **DO NOT** instantiate this class directly.

        Instead, you should access the following operations through
        :class:`~azure.ai.projects.aio.AIProjectClient`'s
        :attr:`beta.evaluators` attribute.
    """

    async def _start_pending_upload_and_get_container_client(
        self,
        name: str,
        version: str,
        connection_name: Optional[str] = None,
    ) -> Tuple[ContainerClient, str, str]:
        """Call startPendingUpload to get a SAS URI and return a ContainerClient and blob URI.

        :param name: The name of the evaluator.
        :type name: str
        :param version: The evaluator version the pending upload is started for.
        :type version: str
        :param connection_name: Optional storage connection name, sent as ``connectionName``.
        :type connection_name: str or None
        :return: A tuple of (container client built from the SAS URI, ``version`` unchanged, blob URI).
        :rtype: tuple[~azure.storage.blob.aio.ContainerClient, str, str]
        :raises ValueError: If the response lacks the blob reference, the credential, the SAS URI
            or the blob URI.
        """
        request_body: dict = {}
        if connection_name:
            request_body["connectionName"] = connection_name

        pending_upload_response = await self.pending_upload(
            name=name,
            version=version,
            pending_upload_request=request_body,
        )

        # The service returns blobReferenceForConsumption
        blob_ref = pending_upload_response.get("blobReferenceForConsumption")
        if not blob_ref:
            raise ValueError("Blob reference is not present in the pending upload response")

        # Nested values are only read from plain dicts; any other shape is reported as missing.
        credential = blob_ref.get("credential") if isinstance(blob_ref, dict) else None
        if not credential:
            raise ValueError("SAS credential is not present in the pending upload response")

        sas_uri = credential.get("sasUri") if isinstance(credential, dict) else None
        if not sas_uri:
            raise ValueError("SAS URI is missing or empty in the pending upload response")

        blob_uri = blob_ref.get("blobUri") if isinstance(blob_ref, dict) else None
        if not blob_uri:
            raise ValueError("Blob URI is missing or empty in the pending upload response")

        return (
            ContainerClient.from_container_url(container_url=sas_uri),
            version,
            blob_uri,
        )

    async def _get_next_version(self, name: str) -> str:
        """Get the next version number for an evaluator by fetching existing versions.

        Only versions made purely of decimal digits are considered. Returns ``"1"`` when the
        evaluator does not exist yet or has no numeric versions.

        :param name: The name of the evaluator.
        :type name: str
        :return: The highest numeric version plus one, as a string.
        :rtype: str
        """
        highest = 0
        try:
            async for item in self.list_versions(name=name):
                ver = item.get("version") if isinstance(item, dict) else getattr(item, "version", None)
                # isdecimal() (unlike isdigit()) only accepts characters that int() can parse.
                if isinstance(ver, str) and ver.isdecimal():
                    highest = max(highest, int(ver))
        except ResourceNotFoundError:
            return "1"
        return str(highest + 1)

    @staticmethod
    def _collect_files(folder: str) -> List[Tuple[str, str]]:
        """Walk ``folder`` and return ``(local_path, blob_name)`` pairs for every file to upload.

        Cache/VCS/virtual-env directories and compiled Python files are skipped. Blob names are
        paths relative to ``folder`` using forward slashes.
        """
        skip_dirs = {"__pycache__", ".git", ".venv", "venv", "node_modules"}
        skip_extensions = (".pyc", ".pyo")
        collected: List[Tuple[str, str]] = []
        for root, dirs, files in os.walk(folder):
            # Prune in place so os.walk does not descend into skipped directories.
            dirs[:] = [d for d in dirs if d not in skip_dirs]
            for file in files:
                if file.endswith(skip_extensions):
                    continue
                file_path = os.path.join(root, file)
                blob_name = os.path.relpath(file_path, folder).replace("\\", "/")
                collected.append((file_path, blob_name))
        return collected

    @staticmethod
    def _attach_blob_uri(evaluator_version: Any, blob_uri: str) -> None:
        """Store ``blob_uri`` on the definition of ``evaluator_version`` (mutates the argument).

        For a dict payload a missing (or ``None``) ``definition`` entry is created so the value is
        not written to a throwaway object. Payloads without a definition that can be modified
        (for example a byte stream) are left untouched and a warning is logged.
        """
        if isinstance(evaluator_version, dict):
            definition = evaluator_version.get("definition")
            if definition is None:
                definition = {}
                evaluator_version["definition"] = definition
            if isinstance(definition, dict):
                definition["blob_uri"] = blob_uri
            else:
                definition.blob_uri = blob_uri
            return

        definition = getattr(evaluator_version, "definition", None)
        if definition:
            definition.blob_uri = blob_uri
            return

        logger.warning(
            "[upload] Could not set blob_uri on the evaluator version definition; "
            "the evaluator version is created without a reference to the uploaded code."
        )

    @distributed_trace_async
    async def upload(
        self,
        name: str,
        evaluator_version: Union[EvaluatorVersion, JSON, IO[bytes]],
        *,
        folder: str,
        connection_name: Optional[str] = None,
        **kwargs: Any,
    ) -> EvaluatorVersion:
        """Upload all files in a folder to blob storage and create a code-based evaluator version
        that references the uploaded code.

        This method calls startPendingUpload to get a SAS URI, uploads files from the folder
        to blob storage, then creates an evaluator version referencing the uploaded blob.

        The version is automatically determined by incrementing the latest existing version.
        The folder is validated before any service call is made. Additional keyword arguments
        are forwarded to ``ContainerClient.upload_blob`` for every uploaded file.

        :param name: The name of the evaluator. Required.
        :type name: str
        :param evaluator_version: The evaluator version definition. This is the same object accepted
         by ``create_version``. Is one of the following types: EvaluatorVersion, JSON,
         IO[bytes]. Required. The ``blob_uri`` of its definition is set in place; this is not
         possible for IO[bytes], which is passed to ``create_version`` unchanged.
        :type evaluator_version: ~azure.ai.projects.models.EvaluatorVersion or JSON or IO[bytes]
        :keyword folder: Path to the folder containing the evaluator Python code. Required.
        :paramtype folder: str
        :keyword connection_name: The name of an Azure Storage Account connection where the files
         should be uploaded. If not specified, the default Azure Storage Account connection will be
         used. Optional.
        :paramtype connection_name: str
        :return: The created evaluator version.
        :rtype: ~azure.ai.projects.models.EvaluatorVersion
        :raises ValueError: If the folder does not exist, is a file, or contains no uploadable files,
         or if the pending upload response is incomplete.
        :raises ~azure.core.exceptions.HttpResponseError: If an error occurs during the HTTP request.
        """
        path_folder = Path(folder)
        if not path_folder.exists():
            raise ValueError(f"The provided folder `{folder}` does not exist.")
        if path_folder.is_file():
            raise ValueError("The provided path is a file, not a folder.")

        # Fail fast: do not resolve a version or start a pending upload for an empty folder.
        files_to_upload = self._collect_files(folder)
        if not files_to_upload:
            raise ValueError("The provided folder is empty.")

        version = await self._get_next_version(name)
        logger.info("[upload] Auto-resolved version to '%s'.", version)

        # Get SAS URI via startPendingUpload
        container_client, _, blob_uri = await self._start_pending_upload_and_get_container_client(
            name=name,
            version=version,
            connection_name=connection_name,
        )

        async with container_client:
            # Upload all files from the folder (including nested subdirectories)
            for file_path, blob_name in files_to_upload:
                logger.debug(
                    "[upload] Start uploading file `%s` as blob `%s`.",
                    file_path,
                    blob_name,
                )
                with open(file=file_path, mode="rb") as data:
                    try:
                        await container_client.upload_blob(name=str(blob_name), data=data, **kwargs)
                    except HttpResponseError as e:
                        if e.error_code == "AuthorizationPermissionMismatch":
                            # Re-raise with actionable guidance; the original error stays chained.
                            storage_account = urlsplit(container_client.url).hostname
                            raise HttpResponseError(
                                message=(
                                    f"Failed to upload file '{blob_name}' to blob storage: "
                                    f"permission denied. Ensure the identity that signed the SAS token "
                                    f"has the 'Storage Blob Data Contributor' role on the storage account "
                                    f"'{storage_account}'. "
                                    f"Original error: {e.message}"
                                ),
                                response=e.response,
                            ) from e
                        raise
                logger.debug("[upload] Done uploading file")
            logger.debug("[upload] Done uploading all files.")

        # Set the blob_uri in the evaluator version definition
        self._attach_blob_uri(evaluator_version, blob_uri)

        result = await self.create_version(
            name=name,
            evaluator_version=evaluator_version,
        )

        return result
# pylint: disable=line-too-long,useless-suppression
# ------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# ------------------------------------
"""Customize generated code here.

Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
"""
import os
import logging
from typing import Any, IO, List, Tuple, Optional, Union
from pathlib import Path
from urllib.parse import urlsplit
from azure.storage.blob import ContainerClient
from azure.core.tracing.decorator import distributed_trace
from azure.core.exceptions import HttpResponseError, ResourceNotFoundError
from ._operations import BetaEvaluatorsOperations as EvaluatorsOperationsGenerated, JSON
from ..models._models import (
    EvaluatorVersion,
)

logger = logging.getLogger(__name__)


class EvaluatorsOperations(EvaluatorsOperationsGenerated):
    """
    .. warning::
        **DO NOT** instantiate this class directly.

        Instead, you should access the following operations through
        :class:`~azure.ai.projects.AIProjectClient`'s
        :attr:`beta.evaluators` attribute.
    """

    def _start_pending_upload_and_get_container_client(
        self,
        name: str,
        version: str,
        connection_name: Optional[str] = None,
    ) -> Tuple[ContainerClient, str, str]:
        """Call startPendingUpload to get a SAS URI and return a ContainerClient and blob URI.

        :param name: The name of the evaluator.
        :type name: str
        :param version: The evaluator version the pending upload is started for.
        :type version: str
        :param connection_name: Optional storage connection name, sent as ``connectionName``.
        :type connection_name: str or None
        :return: A tuple of (container client built from the SAS URI, ``version`` unchanged, blob URI).
        :rtype: tuple[~azure.storage.blob.ContainerClient, str, str]
        :raises ValueError: If the response lacks the blob reference, the credential, the SAS URI
            or the blob URI.
        """
        request_body: dict = {}
        if connection_name:
            request_body["connectionName"] = connection_name

        pending_upload_response = self.pending_upload(
            name=name,
            version=version,
            pending_upload_request=request_body,
        )

        # The service returns blobReferenceForConsumption
        blob_ref = pending_upload_response.get("blobReferenceForConsumption")
        if not blob_ref:
            raise ValueError("Blob reference is not present in the pending upload response")

        # Nested values are only read from plain dicts; any other shape is reported as missing.
        credential = blob_ref.get("credential") if isinstance(blob_ref, dict) else None
        if not credential:
            raise ValueError("SAS credential is not present in the pending upload response")

        sas_uri = credential.get("sasUri") if isinstance(credential, dict) else None
        if not sas_uri:
            raise ValueError("SAS URI is missing or empty in the pending upload response")

        blob_uri = blob_ref.get("blobUri") if isinstance(blob_ref, dict) else None
        if not blob_uri:
            raise ValueError("Blob URI is missing or empty in the pending upload response")

        return (
            ContainerClient.from_container_url(container_url=sas_uri),
            version,
            blob_uri,
        )

    def _get_next_version(self, name: str) -> str:
        """Get the next version number for an evaluator by fetching existing versions.

        Only versions made purely of decimal digits are considered. Returns ``"1"`` when the
        evaluator does not exist yet or has no numeric versions.

        :param name: The name of the evaluator.
        :type name: str
        :return: The highest numeric version plus one, as a string.
        :rtype: str
        """
        highest = 0
        try:
            for item in self.list_versions(name=name):
                ver = item.get("version") if isinstance(item, dict) else getattr(item, "version", None)
                # isdecimal() (unlike isdigit()) only accepts characters that int() can parse.
                if isinstance(ver, str) and ver.isdecimal():
                    highest = max(highest, int(ver))
        except ResourceNotFoundError:
            return "1"
        return str(highest + 1)

    @staticmethod
    def _collect_files(folder: str) -> List[Tuple[str, str]]:
        """Walk ``folder`` and return ``(local_path, blob_name)`` pairs for every file to upload.

        Cache/VCS/virtual-env directories and compiled Python files are skipped. Blob names are
        paths relative to ``folder`` using forward slashes.
        """
        skip_dirs = {"__pycache__", ".git", ".venv", "venv", "node_modules"}
        skip_extensions = (".pyc", ".pyo")
        collected: List[Tuple[str, str]] = []
        for root, dirs, files in os.walk(folder):
            # Prune in place so os.walk does not descend into skipped directories.
            dirs[:] = [d for d in dirs if d not in skip_dirs]
            for file in files:
                if file.endswith(skip_extensions):
                    continue
                file_path = os.path.join(root, file)
                blob_name = os.path.relpath(file_path, folder).replace("\\", "/")
                collected.append((file_path, blob_name))
        return collected

    @staticmethod
    def _attach_blob_uri(evaluator_version: Any, blob_uri: str) -> None:
        """Store ``blob_uri`` on the definition of ``evaluator_version`` (mutates the argument).

        For a dict payload a missing (or ``None``) ``definition`` entry is created so the value is
        not written to a throwaway object. Payloads without a definition that can be modified
        (for example a byte stream) are left untouched and a warning is logged.
        """
        if isinstance(evaluator_version, dict):
            definition = evaluator_version.get("definition")
            if definition is None:
                definition = {}
                evaluator_version["definition"] = definition
            if isinstance(definition, dict):
                definition["blob_uri"] = blob_uri
            else:
                definition.blob_uri = blob_uri
            return

        definition = getattr(evaluator_version, "definition", None)
        if definition:
            definition.blob_uri = blob_uri
            return

        logger.warning(
            "[upload] Could not set blob_uri on the evaluator version definition; "
            "the evaluator version is created without a reference to the uploaded code."
        )

    @distributed_trace
    def upload(
        self,
        name: str,
        evaluator_version: Union[EvaluatorVersion, JSON, IO[bytes]],
        *,
        folder: str,
        connection_name: Optional[str] = None,
        **kwargs: Any,
    ) -> EvaluatorVersion:
        """Upload all files in a folder to blob storage and create a code-based evaluator version
        that references the uploaded code.

        This method calls startPendingUpload to get a SAS URI, uploads files from the folder
        to blob storage, then creates an evaluator version referencing the uploaded blob.

        The version is automatically determined by incrementing the latest existing version.
        The folder is validated before any service call is made. Additional keyword arguments
        are forwarded to ``ContainerClient.upload_blob`` for every uploaded file.

        :param name: The name of the evaluator. Required.
        :type name: str
        :param evaluator_version: The evaluator version definition. This is the same object accepted
         by ``create_version``. Is one of the following types: EvaluatorVersion, JSON,
         IO[bytes]. Required. The ``blob_uri`` of its definition is set in place; this is not
         possible for IO[bytes], which is passed to ``create_version`` unchanged.
        :type evaluator_version: ~azure.ai.projects.models.EvaluatorVersion or JSON or IO[bytes]
        :keyword folder: Path to the folder containing the evaluator Python code. Required.
        :paramtype folder: str
        :keyword connection_name: The name of an Azure Storage Account connection where the files
         should be uploaded. If not specified, the default Azure Storage Account connection will be
         used. Optional.
        :paramtype connection_name: str
        :return: The created evaluator version.
        :rtype: ~azure.ai.projects.models.EvaluatorVersion
        :raises ValueError: If the folder does not exist, is a file, or contains no uploadable files,
         or if the pending upload response is incomplete.
        :raises ~azure.core.exceptions.HttpResponseError: If an error occurs during the HTTP request.
        """
        path_folder = Path(folder)
        if not path_folder.exists():
            raise ValueError(f"The provided folder `{folder}` does not exist.")
        if path_folder.is_file():
            raise ValueError("The provided path is a file, not a folder.")

        # Fail fast: do not resolve a version or start a pending upload for an empty folder.
        files_to_upload = self._collect_files(folder)
        if not files_to_upload:
            raise ValueError("The provided folder is empty.")

        version = self._get_next_version(name)
        logger.info("[upload] Auto-resolved version to '%s'.", version)

        # Get SAS URI via startPendingUpload
        container_client, _, blob_uri = self._start_pending_upload_and_get_container_client(
            name=name,
            version=version,
            connection_name=connection_name,
        )

        with container_client:
            # Upload all files from the folder (including nested subdirectories)
            for file_path, blob_name in files_to_upload:
                logger.debug(
                    "[upload] Start uploading file `%s` as blob `%s`.",
                    file_path,
                    blob_name,
                )
                with open(file=file_path, mode="rb") as data:
                    try:
                        container_client.upload_blob(name=str(blob_name), data=data, **kwargs)
                    except HttpResponseError as e:
                        if e.error_code == "AuthorizationPermissionMismatch":
                            # Re-raise with actionable guidance; the original error stays chained.
                            storage_account = urlsplit(container_client.url).hostname
                            raise HttpResponseError(
                                message=(
                                    f"Failed to upload file '{blob_name}' to blob storage: "
                                    f"permission denied. Ensure the identity that signed the SAS token "
                                    f"has the 'Storage Blob Data Contributor' role on the storage account "
                                    f"'{storage_account}'. "
                                    f"Original error: {e.message}"
                                ),
                                response=e.response,
                            ) from e
                        raise
                logger.debug("[upload] Done uploading file")
            logger.debug("[upload] Done uploading all files.")

        # Set the blob_uri in the evaluator version definition
        self._attach_blob_uri(evaluator_version, blob_uri)

        result = self.create_version(
            name=name,
            evaluator_version=evaluator_version,
        )

        return result
"""Utility functions for custom evaluators."""

import json

# System prompt for the LLM judge. The JSON template names every field that
# parse_evaluation_result() reads back ("score", "label", "reason", "explanation").
FRIENDLINESS_SYSTEM_PROMPT = """You are an expert evaluator that assesses how friendly, warm, and approachable
a response is. You evaluate responses on a scale of 1 to 5 based on the following criteria:

Score 1 (Very Unfriendly): The response is cold, dismissive, rude, or hostile.
Score 2 (Unfriendly): The response is curt, impersonal, or lacks warmth.
Score 3 (Neutral): The response is acceptable but neither particularly friendly nor unfriendly.
Score 4 (Friendly): The response is warm, polite, and shows genuine interest in helping.
Score 5 (Very Friendly): The response is exceptionally warm, encouraging, empathetic, and makes the user feel valued.

You MUST respond in the following JSON format only:
{
    "score": <integer from 1 to 5>,
    "label": "<Pass or Fail>",
    "reason": "<brief reason for the score>",
    "explanation": "<detailed explanation of the evaluation>"
}

A score of 3 or above is considered "Pass", below 3 is "Fail".
"""


def build_evaluation_messages(query: str, response: str) -> list:
    """Build the messages list for the LLM evaluation call.

    :param query: The original user query.
    :param response: The response to evaluate for friendliness.
    :return: A list of message dicts for the chat completion API.
    """
    user_content = (
        f"Please evaluate the friendliness of the following response.\n\n"
        f"Original query: {query}\n\n"
        f"Response to evaluate: {response}"
    )
    return [
        {"role": "system", "content": FRIENDLINESS_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]


def parse_evaluation_result(raw_result: str) -> dict:
    """Parse the LLM's JSON response into a structured evaluation result.

    The score is clamped to the range 1-5. Any input that cannot be interpreted
    (not a string, invalid JSON, JSON that is not an object, or a score that is
    not convertible to an integer) yields a neutral fallback result instead of
    raising.

    :param raw_result: The raw string output from the LLM.
    :return: A dict with score, label, reason, and explanation.
    """
    fallback = {
        "score": 3,
        "label": "Pass",
        "reason": "Could not parse LLM response",
        "explanation": f"Raw LLM output: {raw_result}",
    }

    # The chat API may hand back no content at all; treat that as unparseable.
    if not isinstance(raw_result, str):
        return fallback

    # Try to extract JSON from the response (handle markdown code blocks)
    text = raw_result.strip()
    if text.startswith("```"):
        # Drop the opening fence line (e.g. "```json") and everything from the closing fence on.
        text = text.split("\n", 1)[1] if "\n" in text else text[3:]
        text = text.rsplit("```", 1)[0]

    try:
        result = json.loads(text.strip())
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        return fallback

    # Valid JSON that is not an object (list, number, string, null) carries no fields.
    if not isinstance(result, dict):
        return fallback

    try:
        score = int(result.get("score", 3))
    except (TypeError, ValueError, OverflowError):
        return fallback

    return {
        "score": max(1, min(5, score)),
        "label": result.get("label", "Pass" if score >= 3 else "Fail"),
        "reason": result.get("reason", "No reason provided"),
        "explanation": result.get("explanation", "No explanation provided"),
    }
"""Custom evaluator that uses an LLM to assess the friendliness of a response."""

from openai import AzureOpenAI
from common_util.util import build_evaluation_messages, parse_evaluation_result


class FriendlyEvaluator:
    """Evaluates how friendly and approachable a response is using an LLM judge.

    This evaluator sends the query and response to an LLM, which returns a
    friendliness score (1-5), a pass/fail label, a reason, and a detailed explanation.

    :param model_config: A dict containing Azure OpenAI connection info. Expected keys:
        - azure_endpoint: The Azure OpenAI endpoint URL. Required.
        - azure_deployment: The deployment/model name. Required.
        - api_version: The API version (default: "2024-06-01").
        - api_key: (Optional) The API key. If not provided, DefaultAzureCredential is used.
    :raises KeyError: If ``azure_endpoint`` or ``azure_deployment`` is missing from ``model_config``.
    """

    def __init__(self, *, model_config: dict):
        self.model_config = model_config
        # Read the required deployment first so a bad config fails before any client
        # or credential is constructed.
        self.deployment = model_config["azure_deployment"]

        client_kwargs = {
            "azure_endpoint": model_config["azure_endpoint"],
            "api_version": model_config.get("api_version", "2024-06-01"),
        }

        api_key = model_config.get("api_key")
        if api_key:
            client_kwargs["api_key"] = api_key
        else:
            # Imported lazily: azure-identity is only needed for key-less authentication.
            from azure.identity import DefaultAzureCredential, get_bearer_token_provider

            client_kwargs["azure_ad_token_provider"] = get_bearer_token_provider(
                DefaultAzureCredential(),
                "https://cognitiveservices.azure.com/.default",
            )

        self.client = AzureOpenAI(**client_kwargs)

    def __call__(self, *, query: str, response: str, **kwargs) -> dict:
        """Evaluate the friendliness of a response.

        :param query: The original user query.
        :param response: The response to evaluate.
        :return: A dict with score, label, reason, and explanation.
        """
        messages = build_evaluation_messages(query, response)

        completion = self.client.chat.completions.create(
            model=self.deployment,
            messages=messages,
            temperature=0.0,
            max_tokens=500,
        )

        # The message content may be None (e.g. filtered or empty completions); hand the
        # parser an empty string so it returns its fallback result instead of failing.
        raw_result = completion.choices[0].message.content or ""
        return parse_evaluation_result(raw_result)
# pylint: disable=line-too-long,useless-suppression
# ------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# ------------------------------------

"""
DESCRIPTION:
    Given an AIProjectClient, this sample demonstrates how to:
    1. Upload a local folder containing custom evaluator Python code and
       register it as a code-based evaluator version using `evaluators.upload()`.
    2. Create an evaluation (eval) that references the uploaded evaluator.
    3. Run the evaluation with inline data and poll for results.

USAGE:
    python sample_eval_upload_custom_evaluator.py

    Before running the sample:

    pip install "azure-ai-projects>=2.0.0b4" azure-storage-blob python-dotenv azure-identity openai

    Set these environment variables with your own values:
    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your
       Microsoft Foundry project. It has the form: https://<your-ai-services-account-name>.services.ai.azure.com/api/projects/<your-project-name>.
    2) FOUNDRY_MODEL_NAME - Optional. The name of the model deployment to use for evaluation.
"""

import os
import time
import random
import string
from pathlib import Path
from pprint import pprint

from dotenv import load_dotenv
from openai.types.evals.create_eval_jsonl_run_data_source_param import (
    CreateEvalJSONLRunDataSourceParam,
    SourceFileContent,
    SourceFileContentContent,
)
from openai.types.eval_create_params import DataSourceConfigCustom
from azure.identity import DefaultAzureCredential
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import (
    CodeBasedEvaluatorDefinition,
    EvaluatorCategory,
    EvaluatorMetric,
    EvaluatorMetricType,
    EvaluatorMetricDirection,
    EvaluatorType,
    EvaluatorVersion,
)

load_dotenv()

endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
# Optional: default to an empty string so an unset variable is not sent as the text "None".
model_deployment_name = os.environ.get("FOUNDRY_MODEL_NAME", "")

# The folder containing the AnswerLength evaluator code, relative to this sample file.
local_upload_folder = str(Path(__file__).parent / "custom_evaluators" / "answer_length_evaluator")

# Run statuses after which polling must stop. "canceled" is included so a canceled run
# does not keep the sample polling forever.
TERMINAL_RUN_STATUSES = ("completed", "failed", "canceled")

with (
    DefaultAzureCredential() as credential,
    AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
    project_client.get_openai_client() as client,
):
    # ---------------------------------------------------------------
    # 1. Upload evaluator code and create evaluator version
    #    upload() internally calls startPendingUpload to get a SAS URI,
    #    uploads the folder contents to blob storage, then creates the
    #    evaluator version with the blob URI.
    # ---------------------------------------------------------------
    suffix = "".join(random.choices(string.ascii_lowercase, k=5))
    evaluator_name = f"answer_length_evaluator_{suffix}"
    # NOTE(review): the metric below is named "score" (ordinal 1-5) while the uploaded
    # AnswerLengthEvaluator returns its value under the key "result" - confirm which is intended.
    evaluator_version = EvaluatorVersion(
        evaluator_type=EvaluatorType.CUSTOM,
        categories=[EvaluatorCategory.QUALITY],
        display_name="Answer Length Evaluator",
        description="Custom evaluator to calculate length of content",
        definition=CodeBasedEvaluatorDefinition(
            entry_point="answer_length_evaluator:AnswerLengthEvaluator",
            init_parameters={
                "type": "object",
                "properties": {"model_config": {"type": "string"}},
                "required": ["model_config"],
            },
            data_schema={
                "type": "object",
                "properties": {
                    "query": {"type": "string"},
                    "response": {"type": "string"},
                },
                "required": ["query", "response"],
            },
            metrics={
                "score": EvaluatorMetric(
                    type=EvaluatorMetricType.ORDINAL,
                    desirable_direction=EvaluatorMetricDirection.INCREASE,
                    min_value=1,
                    max_value=5,
                )
            },
        ),
    )

    print("Uploading custom evaluator code and creating evaluator version...")
    # Extra keyword arguments (here `overwrite`) are forwarded to the blob upload call.
    code_evaluator = project_client.beta.evaluators.upload(
        name=evaluator_name,
        evaluator_version=evaluator_version,
        folder=local_upload_folder,
        overwrite=True,
    )

    print(f"Evaluator created: name={code_evaluator.name}, version={code_evaluator.version}")
    print(f"Evaluator ID: {code_evaluator.id}")
    pprint(code_evaluator)

    # ---------------------------------------------------------------
    # 2. Create an evaluation referencing the uploaded evaluator
    # ---------------------------------------------------------------
    data_source_config = DataSourceConfigCustom(
        {
            "type": "custom",
            "item_schema": {
                "type": "object",
                "properties": {
                    "query": {"type": "string"},
                    "response": {"type": "string"},
                },
                "required": ["query", "response"],
            },
            "include_sample_schema": True,
        }
    )

    testing_criteria = [
        {
            "type": "azure_ai_evaluator",
            "name": evaluator_name,
            "evaluator_name": evaluator_name,
            "initialization_parameters": {
                "model_config": model_deployment_name,
            },
        }
    ]

    print("\nCreating evaluation...")
    eval_object = client.evals.create(
        name=f"Answer Length Evaluation - {suffix}",
        data_source_config=data_source_config,
        testing_criteria=testing_criteria,  # type: ignore
    )
    print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})")

    # ---------------------------------------------------------------
    # 3. Run the evaluation with inline data
    # ---------------------------------------------------------------
    print("\nCreating evaluation run with inline data...")
    eval_run_object = client.evals.runs.create(
        eval_id=eval_object.id,
        name=f"Answer Length Eval Run - {suffix}",
        metadata={"team": "eval-exp", "scenario": "answer-length-v1"},
        data_source=CreateEvalJSONLRunDataSourceParam(
            type="jsonl",
            source=SourceFileContent(
                type="file_content",
                content=[
                    SourceFileContentContent(
                        item={
                            "query": "What is the capital of France?",
                            "response": "Paris",
                        }
                    ),
                    SourceFileContentContent(
                        item={
                            "query": "Explain quantum computing",
                            "response": "Quantum computing leverages quantum mechanical phenomena like superposition and entanglement to process information in fundamentally different ways than classical computers.",
                        }
                    ),
                    SourceFileContentContent(
                        item={
                            "query": "What is AI?",
                            "response": "AI stands for Artificial Intelligence. It is a branch of computer science that aims to create intelligent machines that can perform tasks that typically require human intelligence, such as visual perception, speech recognition, decision-making, and language translation.",
                        }
                    ),
                    SourceFileContentContent(
                        item={
                            "query": "Say hello",
                            "response": "Hi!",
                        }
                    ),
                ],
            ),
        ),
    )

    print(f"Evaluation run created (id: {eval_run_object.id})")
    pprint(eval_run_object)

    # ---------------------------------------------------------------
    # 4. Poll for evaluation run completion
    # ---------------------------------------------------------------
    while True:
        run = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id)
        if run.status in TERMINAL_RUN_STATUSES:
            print(f"\nEvaluation run finished with status: {run.status}")
            output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
            pprint(output_items)
            print(f"\nEvaluation run Report URL: {run.report_url}")
            break
        time.sleep(5)
        print("Waiting for evaluation run to complete...")

    # ---------------------------------------------------------------
    # 5. Cleanup (uncomment to delete)
    # ---------------------------------------------------------------
    # print("\nCleaning up...")
    # project_client.beta.evaluators.delete_version(
    #     name=code_evaluator.name,
    #     version=code_evaluator.version,
    # )
    # client.evals.delete(eval_id=eval_object.id)
    # print("Cleanup done.")
    print("\nDone - upload, eval creation, and eval run verified successfully.")
+ + The FriendlyEvaluator calls Azure OpenAI to judge the friendliness of a + response and returns score, label, reason, and explanation. + +USAGE: + python sample_eval_upload_friendly_evaluator.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0b4" azure-storage-blob python-dotenv azure-identity openai + + Set these environment variables with your own values: + 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint. + 2) FOUNDRY_MODEL_NAME - Optional. The name of the model deployment to use for evaluation. +""" + +import os +import time +import random +import string +from pathlib import Path +from pprint import pprint + +from dotenv import load_dotenv +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, +) +from openai.types.eval_create_params import DataSourceConfigCustom +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from azure.ai.projects.models import ( + CodeBasedEvaluatorDefinition, + EvaluatorCategory, + EvaluatorMetric, + EvaluatorMetricType, + EvaluatorMetricDirection, + EvaluatorType, + EvaluatorVersion, +) + +load_dotenv() + +endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] +model_deployment_name = os.environ.get("FOUNDRY_MODEL_NAME") +azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"] +azure_openai_api_key = os.environ["AZURE_OPENAI_API_KEY"] + +# The folder containing the FriendlyEvaluator code, including common_util/ subfolder +local_upload_folder = str(Path(__file__).parent / "custom_evaluators" / "friendly_evaluator") + +with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, + project_client.get_openai_client() as client, +): + # --------------------------------------------------------------- + # 1. 
Upload evaluator code and create evaluator version + # The folder structure uploaded is: + # friendly_evaluator/ + # friendly_evaluator.py <- entry point + # common_util/ + # __init__.py + # util.py <- helper functions + # --------------------------------------------------------------- + suffix = "".join(random.choices(string.ascii_lowercase, k=5)) + evaluator_name = f"friendly_evaluator_{suffix}" + + evaluator_version = EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Friendliness Evaluator", + description="LLM-based evaluator that scores how friendly a response is (1-5)", + definition=CodeBasedEvaluatorDefinition( + entry_point="friendly_evaluator:FriendlyEvaluator", + init_parameters={ + "type": "object", + "properties": { + "model_config": { + "type": "object", + "description": "Azure OpenAI configuration for the LLM judge", + "properties": { + "azure_endpoint": {"type": "string"}, + "api_version": {"type": "string"}, + "api_key": {"type": "string"}, + }, + "required": ["azure_endpoint", "api_key"], + } + }, + "required": ["model_config"], + }, + data_schema={ + "type": "object", + "properties": { + "query": {"type": "string", "description": "The original user query"}, + "response": {"type": "string", "description": "The response to evaluate for friendliness"}, + }, + "required": ["query", "response"], + }, + metrics={ + "score": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + desirable_direction=EvaluatorMetricDirection.INCREASE, + min_value=1, + max_value=5, + ) + }, + ), + ) + + print("Uploading FriendlyEvaluator (with nested common_util folder)...") + friendly_evaluator = project_client.beta.evaluators.upload( + name=evaluator_name, + evaluator_version=evaluator_version, + folder=local_upload_folder, + overwrite=True, + ) + + print(f"\nEvaluator created: name={friendly_evaluator.name}, version={friendly_evaluator.version}") + print(f"Evaluator ID: {friendly_evaluator.id}") + 
pprint(friendly_evaluator) + + # --------------------------------------------------------------- + # 2. Create an evaluation referencing the uploaded evaluator + # --------------------------------------------------------------- + data_source_config = DataSourceConfigCustom( + { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "response": {"type": "string"}, + }, + "required": ["query", "response"], + }, + "include_sample_schema": True, + } + ) + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": evaluator_name, + "evaluator_name": evaluator_name, + "initialization_parameters": { + "model_config": { + "azure_endpoint": azure_openai_endpoint, + "api_key": f"{azure_openai_api_key}", + "api_version": "2024-06-01", + }, + }, + } + ] + + print("\nCreating evaluation...") + eval_object = client.evals.create( + name=f"Friendliness Evaluation - {suffix}", + data_source_config=data_source_config, + testing_criteria=testing_criteria, # type: ignore + ) + print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") + + # --------------------------------------------------------------- + # 3. Run the evaluation with inline data + # --------------------------------------------------------------- + print("\nCreating evaluation run with inline data...") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name=f"Friendliness Eval Run - {suffix}", + metadata={"team": "eval-exp", "scenario": "friendliness-v1"}, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent( + item={ + "query": "How do I reset my password?", + "response": "Go to settings and click reset. That's it.", + } + ), + SourceFileContentContent( + item={ + "query": "I'm having trouble with my account", + "response": "I'm really sorry to hear you're having trouble! I'd love to help you get this sorted out. 
Could you tell me a bit more about what's happening so I can assist you better?", + } + ), + SourceFileContentContent( + item={ + "query": "Can you help me?", + "response": "Read the docs.", + } + ), + SourceFileContentContent( + item={ + "query": "What's the weather like today?", + "response": "Great question! While I'm not a weather service, I'd be happy to suggest some wonderful weather apps that can give you accurate forecasts. Would you like some recommendations? 😊", + } + ), + ], + ), + ), + ) + + print(f"Evaluation run created (id: {eval_run_object.id})") + pprint(eval_run_object) + + # --------------------------------------------------------------- + # 4. Poll for evaluation run completion + # --------------------------------------------------------------- + while True: + run = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + if run.status in ("completed", "failed"): + print(f"\nEvaluation run finished with status: {run.status}") + output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) + pprint(output_items) + print(f"\nEvaluation run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for evaluation run to complete...") + + # --------------------------------------------------------------- + # 5. Cleanup (uncomment to delete) + # --------------------------------------------------------------- + # print("\nCleaning up...") + # project_client.beta.evaluators.delete_version( + # name=friendly_evaluator.name, + # version=friendly_evaluator.version, + # ) + # client.evals.delete(eval_id=eval_object.id) + # print("Cleanup done.") + print("\nDone - FriendlyEvaluator upload, eval creation, and eval run verified successfully.")