-
Notifications
You must be signed in to change notification settings - Fork 103
Expand file tree
/
Copy pathagent_evaluation_batch.py
More file actions
150 lines (124 loc) · 5.4 KB
/
agent_evaluation_batch.py
File metadata and controls
150 lines (124 loc) · 5.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""Batch evaluation of agent responses using Azure AI Evaluation's evaluate() function.
Reads evaluation data from a JSONL file (produced by agent_evaluation_generate.py) and runs
all evaluators in a single batch call. Optionally logs results to Azure AI Foundry
if AZURE_AI_PROJECT is set.
Usage:
python agent_evaluation_batch.py # uses eval_data.jsonl
AZURE_AI_PROJECT=<url> python agent_evaluation_batch.py # logs to Azure AI Foundry
"""
import logging
import os
from pathlib import Path
from azure.ai.evaluation import (
AzureOpenAIModelConfiguration,
IntentResolutionEvaluator,
OpenAIModelConfiguration,
ResponseCompletenessEvaluator,
TaskAdherenceEvaluator,
ToolCallAccuracyEvaluator,
evaluate,
)
from dotenv import load_dotenv
import rich
from rich.logging import RichHandler
from rich.table import Table
# Route log output through rich so messages and tracebacks render nicely in the terminal.
handler = RichHandler(show_path=False, rich_tracebacks=True, show_level=False)
# Quiet third-party loggers by default (WARNING); this module's own logger is raised to INFO below.
logging.basicConfig(level=logging.WARNING, handlers=[handler], force=True, format="%(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Load .env values, letting them override variables already set in the process environment.
load_dotenv(override=True)
# Select which model provider backs the LLM-judge evaluators (default: GitHub Models).
API_HOST = os.getenv("API_HOST", "github")
if API_HOST == "azure":
    # NOTE(review): no api_key is supplied here, so this presumably relies on keyless
    # (Entra ID) authentication — confirm the environment is signed in (az/azd login).
    model_config = AzureOpenAIModelConfiguration(
        type="azure_openai",
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
    )
elif API_HOST == "github":
    # GitHub Models: OpenAI-compatible inference endpoint authenticated with a GitHub token.
    model_config = OpenAIModelConfiguration(
        type="openai",
        base_url="https://models.github.ai/inference",
        api_key=os.environ["GITHUB_TOKEN"],
        model="openai/gpt-4.1-mini",
    )
else:
    # Any other API_HOST value falls back to the public OpenAI API.
    model_config = OpenAIModelConfiguration(
        type="openai",
        api_key=os.environ["OPENAI_API_KEY"],
        model=os.environ.get("OPENAI_MODEL", "gpt-4.1-mini"),
    )
# Optional: Set AZURE_AI_PROJECT in .env to log results to Azure AI Foundry.
# Example: https://your-account.services.ai.azure.com/api/projects/your-project
AZURE_AI_PROJECT = os.getenv("AZURE_AI_PROJECT")
def display_evaluation_results(eval_result: dict) -> None:
    """Render one rich table per evaluated row, with a line per evaluator.

    Each evaluator's score, pass/fail verdict (colorized), and reason are read
    from the flattened ``outputs.<key>.<key>*`` columns that evaluate() emits.
    """
    # Display label -> the key prefix used in the flattened result columns.
    evaluator_columns = {
        "IntentResolution": "intent_resolution",
        "ResponseCompleteness": "response_completeness",
        "TaskAdherence": "task_adherence",
        "ToolCallAccuracy": "tool_call_accuracy",
    }
    # Markup applied to the two recognized verdicts; anything else is shown verbatim.
    outcome_markup = {"pass": "[green]pass[/green]", "fail": "[red]fail[/red]"}
    for row_num, row in enumerate(eval_result.get("rows", []), start=1):
        table = Table(title=f"Evaluation Results - Row {row_num}", show_lines=True)
        table.add_column("Evaluator", style="cyan", width=28)
        table.add_column("Score", style="bold", justify="center", width=8)
        table.add_column("Result", justify="center", width=8)
        table.add_column("Reason", style="dim", width=70)
        for label, key in evaluator_columns.items():
            score = str(row.get(f"outputs.{key}.{key}", "N/A"))
            outcome = row.get(f"outputs.{key}.{key}_result", "N/A")
            reason = row.get(f"outputs.{key}.{key}_reason", "N/A")
            if outcome in ("pass", "fail"):
                verdict = outcome_markup[outcome]
            else:
                verdict = str(outcome)
            table.add_row(label, score, verdict, reason)
        rich.print()
        rich.print(table)
def main() -> None:
    """Run batch evaluation on a JSONL data file.

    Requires eval_data.jsonl (produced by agent_evaluation_generate.py) next to
    this script. Results go to Azure AI Foundry when AZURE_AI_PROJECT is set,
    otherwise to a local eval_results.json file.
    """
    here = Path(__file__).parent
    eval_data_file = here / "eval_data.jsonl"
    # Guard clause: without generated data there is nothing to evaluate.
    if not eval_data_file.exists():
        logger.error(f"Data file not found: {eval_data_file}")
        logger.error("Run agent_evaluation_generate.py first to generate evaluation data.")
        return

    logger.info(f"Running batch evaluation on {eval_data_file}...")
    # Results sink: Azure AI Foundry project when configured, else a local JSON file.
    if AZURE_AI_PROJECT:
        logger.info(f"Logging results to Azure AI project: {AZURE_AI_PROJECT}")
        optional_kwargs: dict = {"azure_ai_project": AZURE_AI_PROJECT}
    else:
        optional_kwargs = {"output_path": str(here / "eval_results.json")}

    # All four agent evaluators share the same judge model configuration.
    evaluators = {
        "intent_resolution": IntentResolutionEvaluator(model_config, is_reasoning_model=True),
        "response_completeness": ResponseCompletenessEvaluator(model_config, is_reasoning_model=True),
        "task_adherence": TaskAdherenceEvaluator(model_config, is_reasoning_model=True),
        "tool_call_accuracy": ToolCallAccuracyEvaluator(model_config, is_reasoning_model=True),
    }
    eval_result = evaluate(
        data=eval_data_file,
        evaluators=evaluators,
        # ResponseCompletenessEvaluator expects a plain text response, not a message list,
        # so we override its column mapping to use response_text and ground_truth.
        # Other evaluators auto-map correctly since data keys match param names.
        evaluator_config={
            "response_completeness": {
                "column_mapping": {
                    "response": "${data.response_text}",
                    "ground_truth": "${data.ground_truth}",
                }
            },
        },
        **optional_kwargs,
    )

    display_evaluation_results(eval_result)
    if not AZURE_AI_PROJECT:
        logger.info("Results saved to eval_results.json")
    elif studio_url := eval_result.get("studio_url"):
        print(f"\nView results in Azure AI Foundry:\n{studio_url}")
# Script entry point: run the batch evaluation only when executed directly, not on import.
if __name__ == "__main__":
    main()