diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/data_folder/sample_evaluations_score_model_grader_with_audio.wav b/sdk/ai/azure-ai-projects/samples/evaluations/data_folder/sample_evaluations_score_model_grader_with_audio.wav
new file mode 100644
index 000000000000..6ce78c4215b3
Binary files /dev/null and b/sdk/ai/azure-ai-projects/samples/evaluations/data_folder/sample_evaluations_score_model_grader_with_audio.wav differ
diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_score_model_grader_with_audio.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_score_model_grader_with_audio.py
new file mode 100644
index 000000000000..64a7445a6b57
--- /dev/null
+++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_score_model_grader_with_audio.py
@@ -0,0 +1,199 @@
+# pylint: disable=line-too-long,useless-suppression
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+
+"""
+DESCRIPTION:
+    Given an AIProjectClient, this sample demonstrates how to use the synchronous
+    `openai.evals.*` methods to create an evaluation with a score model grader, run it
+    against audio input, and retrieve the eval run results.
+
+    The OpenAI official tutorial is here: https://cookbook.openai.com/examples/evaluation/use-cases/evalsapi_audio_inputs
+
+USAGE:
+    python sample_evaluations_score_model_grader_with_audio.py
+
+    Before running the sample:
+
+    pip install "azure-ai-projects>=2.0.0b2" azure-identity python-dotenv
+
+    Set these environment variables with your own values:
+    1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your
+       Microsoft Foundry project. It has the form: https://<your-ai-services-account-name>.services.ai.azure.com/api/projects/<your-project-name>.
+    2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation.
+    3) AZURE_AI_MODEL_DEPLOYMENT_NAME_FOR_AUDIO - Required.
+       The name of the audio model deployment to use for evaluation; "gpt-4o-audio-preview" is recommended.
+"""
+
+import os
+import base64
+import time
+from pprint import pprint
+
+from azure.identity import DefaultAzureCredential
+from azure.ai.projects import AIProjectClient
+from openai.types.evals.create_eval_completions_run_data_source_param import (
+    CreateEvalCompletionsRunDataSourceParam,
+    SourceFileContent,
+    SourceFileContentContent,
+    InputMessagesTemplate,
+    InputMessagesTemplateTemplateEvalItem,
+)
+from openai.types.responses import EasyInputMessageParam, ResponseInputAudioParam
+from openai.types.eval_create_params import DataSourceConfigCustom
+from dotenv import load_dotenv
+
+
+load_dotenv()
+file_path = os.path.abspath(__file__)
+folder_path = os.path.dirname(file_path)
+
+endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT", "")
+model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "")
+model_deployment_name_for_audio = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME_FOR_AUDIO", "")
+
+
+def audio_to_base64(audio_path: str) -> str:
+    """Read an audio file and return its base64-encoded content."""
+    with open(audio_path, "rb") as f:
+        return base64.b64encode(f.read()).decode()
+
+
+with (
+    DefaultAzureCredential() as credential,
+    AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
+    project_client.get_openai_client() as client,
+):
+
+    data_source_config = DataSourceConfigCustom(
+        {
+            "type": "custom",
+            "item_schema": {
+                "type": "object",
+                "properties": {
+                    "audio_data": {
+                        "type": "string",
+                        "description": "Base64-encoded WAV audio data.",
+                    },
+                    "expected": {
+                        "type": "string",
+                        "description": "The expected content in the audio.",
+                    },
+                },
+                "required": [
+                    "audio_data",
+                    "expected",
+                ],
+            },
+            "include_sample_schema": True,
+        }
+    )
+
+    testing_criteria = [
+        {
+            "type": "score_model",
+            "name": "score_grader",
+            "model": model_deployment_name,
+            "input": [
+                {
+                    "role": "system",
+                    "content": "You are an audio analyzer. Listen to the audio, return a float score in [0,1] where 1 means the audio has the same meaning as {{item.expected}}.",
+                },
+                {
+                    "role": "user",
+                    "content": "{{sample.output_text}}",
+                },
+            ],
+            "range": [0.0, 1.0],
+            "pass_threshold": 0.5,
+        },
+    ]
+
+    print("Creating evaluation")
+    eval_object = client.evals.create(
+        name="OpenAI graders test",
+        data_source_config=data_source_config,
+        testing_criteria=testing_criteria,  # type: ignore
+    )
+    print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})")
+
+    print("Get evaluation by Id")
+    eval_object_response = client.evals.retrieve(eval_object.id)
+    print("Evaluation Response:")
+    pprint(eval_object_response)
+
+    source_file_content_content = SourceFileContentContent(
+        item={
+            "audio_data": audio_to_base64(os.path.join(folder_path, "data_folder/sample_evaluations_score_model_grader_with_audio.wav")),
+            "expected": "Don't forget a jacket",
+        },
+    )
+    source_file_content = SourceFileContent(
+        type="file_content",
+        content=[source_file_content_content],
+    )
+    input_messages = InputMessagesTemplate(
+        type="template",
+        template=[
+            EasyInputMessageParam(
+                role="system",
+                content="You are an assistant that can analyze audio input. You will be given an audio input to analyze.",
+            ),
+            InputMessagesTemplateTemplateEvalItem(
+                role="user",
+                type="message",
+                content="Listen to the following audio and convert to text.",
+            ),
+            InputMessagesTemplateTemplateEvalItem(
+                role="user",
+                type="message",
+                content=ResponseInputAudioParam(
+                    type="input_audio",
+                    input_audio={
+                        "data": "{{item.audio_data}}",
+                        "format": "wav",
+                    },
+                ),
+            ),
+        ],
+    )
+
+    print("Creating Eval Run")
+    eval_run_object = client.evals.runs.create(
+        eval_id=eval_object.id,
+        name="Eval",
+        metadata={"team": "eval-exp", "scenario": "notifications-v1"},
+        data_source=CreateEvalCompletionsRunDataSourceParam(
+            type="completions",
+            source=source_file_content,
+            model=model_deployment_name_for_audio,
+            input_messages=input_messages,
+            sampling_params={
+                "temperature": 0.8,
+            },
+        ),
+    )
+    print(f"Eval Run created (id: {eval_run_object.id}, name: {eval_run_object.name})")
+    pprint(eval_run_object)
+
+    print("Get Eval Run by Id")
+    eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id)
+    print("Eval Run Response:")
+    pprint(eval_run_response)
+
+    while True:
+        run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id)
+        if run.status == "completed" or run.status == "failed":
+            output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
+            pprint(output_items)
+            print(f"Eval Run Report URL: {run.report_url}")
+
+            break
+        time.sleep(5)
+        print("Waiting for eval run to complete...")
+
+    client.evals.delete(eval_id=eval_object.id)
+    print("Evaluation deleted")
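
A note on the polling loop in the sample above: it waits indefinitely for the run to finish. Below is a minimal sketch (not part of the patch) of a bounded variant, reusing only the `client.evals.runs.retrieve` call and the "completed"/"failed" statuses already used in the sample; the `wait_for_eval_run` name, the `timeout_s`/`poll_interval_s` parameters, and the "canceled" status are illustrative assumptions.

import time


def wait_for_eval_run(client, eval_id: str, run_id: str, timeout_s: float = 600.0, poll_interval_s: float = 5.0):
    """Poll an eval run until it reaches a terminal state, or raise if the timeout expires (sketch)."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        run = client.evals.runs.retrieve(run_id=run_id, eval_id=eval_id)
        # "completed" and "failed" are the terminal statuses the sample checks;
        # "canceled" is assumed here to also be terminal.
        if run.status in ("completed", "failed", "canceled"):
            return run
        print("Waiting for eval run to complete...")
        time.sleep(poll_interval_s)
    raise TimeoutError(f"Eval run {run_id} did not reach a terminal state within {timeout_s} seconds")

With such a helper, the sample's `while True:` loop would reduce to `run = wait_for_eval_run(client, eval_object.id, eval_run_object.id)` followed by the existing output-item listing and report URL print.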