diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py index 3a5b999d0f81..993f6aa5e686 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py @@ -212,6 +212,9 @@ def construct_prompty_model_config( if _is_aoi_model_config(model_config) and user_agent: prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent}) + if "extra_headers" in model_config: + prompty_model_config["parameters"]["extra_headers"].update(model_config["extra_headers"]) + return prompty_model_config diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py index 916dec2984e4..6e4efb84206b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py @@ -78,6 +78,7 @@ class OpenAIModelConfiguration(TypedDict): model: str base_url: NotRequired[str] organization: NotRequired[str] + extra_headers: NotRequired[Dict[str, str]] class AzureAIProject(TypedDict): diff --git a/sdk/evaluation/azure-ai-evaluation/samples/m365-llm-api-investigation.md b/sdk/evaluation/azure-ai-evaluation/samples/m365-llm-api-investigation.md new file mode 100644 index 000000000000..3140b88ac29e --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/samples/m365-llm-api-investigation.md @@ -0,0 +1,321 @@ +# Investigation: Calling M365 LLM API from Azure AI Evaluations SDK + +**Issue:** [M365-Copilot-Agent-Evals#138](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/138) +**Goal:** Route evaluation LLM calls through the M365 LLM API, so users of the M365 Copilot Agent Evals app no longer need to bring their own Azure OpenAI endpoint and key. 
+**SDK Contacts:** Waqas Javed, Hanchi Wang + +--- + +## 1. Background: How the Evals App Calls an LLM Today + +The M365 Copilot Agent Evals app uses the [Azure AI Evaluations SDK](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/evaluation/azure-ai-evaluation) to score agent responses. Today, users must provide their own Azure OpenAI endpoint and API key. We want to replace that with the M365 LLM API — an internal Microsoft service that provides OpenAI-compatible LLM access without requiring users to provision their own resources. + +The key question is: **can the SDK's client construction be pointed at the M365 LLM API instead of Azure OpenAI?** + +### Current SDK Usage in the Evals App + +The app constructs an `AzureOpenAIModelConfiguration` from environment variables and passes it to four LLM-based evaluators ([`main.py:108-125`](https://github.com/microsoft/M365-Copilot-Agent-Evals/blob/main/src/clients/cli/main.py#L108-L125)): + +```python +model_config = AzureOpenAIModelConfiguration( + azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"), + api_key=os.environ.get("AZURE_AI_API_KEY"), + api_version=os.environ.get("AZURE_AI_API_VERSION"), + azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"), +) + +relevance_evaluator = RelevanceEvaluator(model_config=model_config) +coherence_evaluator = CoherenceEvaluator(model_config=model_config) +groundedness_evaluator = GroundednessEvaluator(model_config=model_config) +tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config) +``` + +A fifth evaluator, `CitationsEvaluator`, uses regex-based matching and does not call an LLM. + +**Key characteristics of the current usage:** + +- **Evaluators are called individually** — each evaluator is invoked as a callable on a per-row basis (e.g., `relevance_evaluator(query=prompt, response=response)`). The SDK's batch `evaluate()` function is **not** used. 
+- **No Azure AI project connection** — `azure_ai_project` is never passed, so evaluation results stay local (written to console, JSON, CSV, and HTML report). **No data is uploaded to Azure ML or any Azure service.** +- **No RAI service evaluators** — content safety evaluators (violence, self-harm, hate, etc.) are not used, so there are no calls to Azure AI's Responsible AI service. +- **No AOAI Graders** — the app does not use server-side Azure OpenAI evaluation APIs (`client.evals.create()`). +- **Prompty-based evaluators only** — all four LLM-based evaluators use the SDK's built-in prompty runtime, which constructs an OpenAI client and calls `client.chat.completions.create()` locally. + +**Data flow — technical vs compliance perspective:** + +- **Technical (code-level):** Because the app uses individual prompty-based evaluators without `azure_ai_project`, the only external HTTP calls go to the configured LLM endpoint. User data (queries, responses, context) flows **exclusively between the client machine and the LLM endpoint** — no Azure services receive the data. +- **Compliance:** However, the Azure AI Evals SDK is considered Azure infrastructure from a compliance standpoint. M365 and Azure are **different compliance boundaries** — M365 is a SaaS model (Microsoft is data controller), while Azure is IaaS/PaaS (customer is controller, Microsoft is processor). The Copilot Evals team has concluded that even though the LLM endpoint is M365 LLM API, running evaluation logic through the Azure SDK means **data is processed in the Azure compliance boundary**, which may not pass Privacy reviews for M365 customer data. This applies to both the local SDK and the newer [cloud evaluation SDK](https://learn.microsoft.com/en-us/azure/foundry/how-to/develop/cloud-evaluation?view=foundry) (which runs evaluations server-side on Azure Foundry). +- **Implication:** Simply pointing the SDK at M365 LLM API may not be sufficient to satisfy M365 trust boundary requirements. 
Alternatives under consideration include running evaluation logic fully inside the M365 boundary (e.g., extracting reusable evaluation criteria into a standalone library, or building a custom evaluation runner within M365 infrastructure). + +--- + +## 2. How the SDK Constructs LLM Clients + +The SDK supports two model configuration types. Each one constructs a different OpenAI client with different URL and auth behaviors. Understanding the difference is critical to choosing the right path for LLM API integration. + +Client construction code: [`_prompty.py:296-320`](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_legacy/prompty/_prompty.py#L296-L320) + +### `AzureOpenAIModelConfiguration` → `AsyncAzureOpenAI` + +Designed for Azure OpenAI resources. The SDK **rewrites the URL** to include Azure-specific path segments. + +```python +# What the user provides: +model_config = AzureOpenAIModelConfiguration( + azure_endpoint="https://my-resource.openai.azure.com/", + azure_deployment="gpt-4", + api_key="...", # or credential=DefaultAzureCredential() +) + +# What the SDK constructs: +client = AsyncAzureOpenAI( + azure_endpoint="https://my-resource.openai.azure.com/", + azure_deployment="gpt-4", + api_key="..." or azure_ad_token_provider=token_provider, +) +# Resulting URL: https://my-resource.openai.azure.com/openai/deployments/gpt-4/chat/completions +# Auth header: api-key: <api_key> OR Authorization: Bearer <token> +``` + +Key behaviors: +- URL is rewritten to `/openai/deployments/{deployment}/...` — this is Azure OpenAI-specific and would break against LLM API +- Supports `credential` for auto-refreshing AAD tokens + +### `OpenAIModelConfiguration` → `AsyncOpenAI` + +Designed for non-Azure OpenAI endpoints. The SDK sends requests to the `base_url` **as-is** with no rewriting. 
+ +```python +# What the user provides: +model_config = OpenAIModelConfiguration( + base_url="https://api.openai.com/v1", + model="gpt-4", + api_key="...", +) + +# What the SDK constructs: +client = AsyncOpenAI( + base_url="https://api.openai.com/v1", + api_key="...", +) +# Resulting URL: https://api.openai.com/v1/chat/completions (no rewriting) +# Auth header: Authorization: Bearer <api_key> +``` + +Key behaviors: +- No URL rewriting — `base_url` + `/chat/completions` is sent directly +- `api_key` is sent as `Authorization: Bearer <api_key>` +- No `credential`/token provider support (static key only) + +### Why `OpenAIModelConfiguration` Is the Right Fit + +The M365 LLM API expects requests at `{endpoint}/chat/completions` — it does **not** use the Azure-specific `/openai/deployments/{dep}/...` URL pattern. `AzureOpenAIModelConfiguration` would rewrite the URL and break the request. + +`OpenAIModelConfiguration` sends requests as-is, which matches LLM API's URL pattern. Additionally, an AAD token can be passed as `api_key` — the SDK will send it as `Authorization: Bearer <token>`, which is exactly what LLM API expects. + +--- + +## 3. M365 LLM API Request Format + +The M365 LLM API is an internal Microsoft service that provides OpenAI-compatible LLM inference. Its request/response format closely mirrors OpenAI's, with additional headers that identify the calling scenario and model. + +**Request** (source: [`header_factory.py`](https://o365exchange.visualstudio.com/O365%20Core/_git/LLMApi?path=/sources/dev/llm_client_lib/code/llm_api_client/implementation/header_factory.py)): +``` +POST {endpoint}/chat/completions +Headers: + Authorization: <token> ← Bearer (dev) or PFT (prod); see Section 5 + X-ModelType: <model> ← specifies which model to route to + X-ScenarioGUID: <guid> ← identifies the registered use case (not always required) +Body: + {"messages": [...], "temperature": 0, "max_tokens": 800} +``` + +**Response:** OpenAI-compatible — `choices[0].message.content` contains the model output. 
+ +**Endpoints:** Available in TDF, SDF, MSIT, and PROD rings at `substrate-llmapi[-ring].trafficmanager.net`. + +--- + +## 4. Compatibility Analysis + +Given the SDK's `OpenAIModelConfiguration` path and the LLM API's request format, here's how well they align: + +| Aspect | Status | Details | +|---|---|---| +| Request/response format | ✅ Compatible | LLM API accepts OpenAI-shaped payloads (`messages`, `temperature`, `max_tokens`) and returns OpenAI-compatible responses (`choices[0].message.content`). The evaluators' prompts and parsing logic work unchanged. | +| Auth (Bearer token) | ✅ Dev only | An AAD token passed as `api_key` gets sent as `Authorization: Bearer <token>`, which works for LLM API dev scenarios. **Production requires PFT** (`Authorization: MSAuth1.0 ...`), which needs a middle-tier service (see Section 5). | +| Endpoint URL | ✅ Compatible | `OpenAIModelConfiguration` sends to `{base_url}/chat/completions` with no rewriting, matching LLM API's URL pattern exactly. | +| Custom headers | ✅ Validated (local patch) | LLM API requires `X-ModelType` header on every request. `OpenAIModelConfiguration` currently has no field for custom headers, but a **2-line local SDK patch** (adding `extra_headers` to the TypedDict and merging it into the per-call `extra_headers`) was validated end-to-end — see Section 5 for results. We suggest making this an official SDK change (see Appendix A). | +| Token refresh | ⚠️ Limitation | `OpenAIModelConfiguration` only accepts a static `api_key`. AAD tokens expire in ~60-90 min, which is sufficient for short eval runs but not for longer ones. **We suggest the SDK add `credential` + `token_scope` support to `OpenAIModelConfiguration`**, similar to how `AzureOpenAIModelConfiguration` already supports it. | + +**Summary:** The request/response format and URL structure are fully compatible. Custom headers were the only blocker — resolved with a 2-line local SDK patch, now validated end-to-end (see Section 5). + +--- + +## 5. 
Authentication + +Calling the M365 LLM API requires AAD authentication. The auth requirements differ between dev and production environments (see [App Authentication](https://eng.ms/docs/experiences-devices/m365-core-msai/platform/substrate-intelligence/llm-api/llm-api-partner-docs/onboarding/app-authentication)). + +### Dev: Validated End-to-End with LLM API Sample App ✅ + +AAD user-delegate auth is confirmed working for dev scenarios. We validated the full integration end-to-end in a dev/MSIT environment: using the LLM API team's sample app and a user-delegated AAD token, the Azure AI Eval SDK's `RelevanceEvaluator` successfully called the M365 LLM API and produced correct evaluation scores. + +| Detail | Value | +|---|---| +| Sample app ID | `68df66a4-cad9-4bfd-872b-c6ddde00d6b2` | +| Tenant | MSIT (`72f988bf-86f1-41af-91ab-2d7cd011db47`) | +| Scope | `https://substrate.office.com/llmapi/LLMAPI.dev` | +| Endpoint | `substrate-llmapi[-ring].trafficmanager.net` | +| Model | `dev-gpt-4o-gg` | +| Auth | MSAL `PublicClientApplication` with WAM broker | + +**Complete test script:** + +```python +import json +from msal import PublicClientApplication +from azure.ai.evaluation import RelevanceEvaluator + +# --- Configuration --- +CLIENT_ID = "68df66a4-cad9-4bfd-872b-c6ddde00d6b2" # LLM API sample app +TENANT_ID = "72f988bf-86f1-41af-91ab-2d7cd011db47" # MSIT tenant +TOKEN_SCOPE = "https://substrate.office.com/llmapi/LLMAPI.dev" +LLM_API_BASE_URL = "https://substrate-llmapi.trafficmanager.net" +MODEL = "dev-gpt-4o-gg" + +# --- Step 1: Acquire AAD token via MSAL WAM --- +app = PublicClientApplication( + CLIENT_ID, + authority=f"https://login.microsoftonline.com/{TENANT_ID}", + enable_broker_on_windows=True, +) + +accounts = app.get_accounts() +result = None +if accounts: + result = app.acquire_token_silent([TOKEN_SCOPE], account=accounts[0]) + +if not result or "access_token" not in result: + result = app.acquire_token_interactive( + scopes=[TOKEN_SCOPE], + 
parent_window_handle=app.CONSOLE_WINDOW_HANDLE, + ) + +token = result["access_token"] + +# --- Step 2: Build OpenAIModelConfiguration with LLM API routing --- +model_config = { + "type": "openai", + "api_key": token, # AAD token from MSAL WAM + "model": MODEL, + "base_url": LLM_API_BASE_URL, + "extra_headers": { # requires local SDK patch + "X-ModelType": MODEL, + }, +} + +# --- Step 3: Create evaluator and run --- +evaluator = RelevanceEvaluator(model_config=model_config) +result = evaluator( + query="What is the capital of Japan?", + response="The capital of Japan is Tokyo.", +) + +print(json.dumps(result, indent=2)) +``` + +**Result:** + +```json +{ + "relevance": 4.0, + "gpt_relevance": 4.0, + "relevance_result": "pass", + "relevance_threshold": 3, + "relevance_reason": "The response directly and accurately answers the user's question by correctly identifying Tokyo as the capital of Japan. It is clear, concise, and fully satisfies the user's request without unnecessary or unrelated information.", + "relevance_model": "gpt5-1-1p-exp-p-1125-2c99db-ev3-vis-6-treatment" +} +``` + +**Key observations:** +- The evaluator produced a valid score (4.0/5) with reasoning — the prompty template, LLM call, and response parsing all worked correctly +- `relevance_model` confirms the request was routed through LLM API (not Azure OpenAI) +- No code changes to the evaluator itself — only the model config and a 2-line local SDK patch (see Appendix A) + +**Local SDK patch (2 changes):** + +1. Add `extra_headers` field to `OpenAIModelConfiguration` TypedDict in [`_model_configurations.py`](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py): + ```python + extra_headers: NotRequired[Dict[str, str]] + ``` + +2. 
Merge `extra_headers` into per-call headers in [`_common/utils.py:215-216`](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py#L215): + ```python + if "extra_headers" in model_config: + prompty_model_config["parameters"]["extra_headers"].update(model_config["extra_headers"]) + ``` + +### Prod: Middle-Tier Required (PFT) + +Production auth requires **PFT (Protected Forwarded Token)**, a middle-tier server-to-server mechanism. The LLM API team has confirmed this is the required production auth flow. + +PFT requires: +- A middle-tier service with `ConfidentialClient` credentials (certificate-based) +- The SAL SDK (.NET) to transform the user's AAD token into a PFT +- The PFT is sent as `Authorization: MSAuth1.0 ...` (not `Bearer`) + +**A desktop CLI app using `PublicClientApplication` (no certificate) cannot generate PFT on its own.** This means production integration will require building a middle-tier service that the CLI calls, which in turn calls LLM API with PFT auth. + +--- + +## 6. Alternative Approaches Considered + +### Torus Multi-Tenant App with Dev Scope + +One approach to avoid the PFT middle-tier requirement would be to register the evals app as a multi-tenant app in the Torus tenant and use the dev user-facing auth flow (`LLMAPI.dev` scope) for all users. + +**Concerns:** +- The LLM API [App Authentication docs](https://eng.ms/docs/experiences-devices/m365-core-msai/platform/substrate-intelligence/llm-api/llm-api-partner-docs/onboarding/app-authentication) state: *"The LLM API 1P app permission is not directly available to user-facing production applications."* It's unclear whether a Torus dev app serving end users would be considered a "production application" by the LLM API team. +- Dev apps only have access to `dev-` prefixed models with minimal access to features and endpoints. It's untested whether dev model quality is sufficient for evaluation scoring. 
+- For external customer tenants, it's unclear whether users can acquire tokens for the `LLMAPI.dev` scope through a Torus multi-tenant app. The server-side auth config suggests all tenants are allowed (`TenantIds=*`), but this needs confirmation from the LLM API team. + +This approach may work as an **internal-only** path but has open questions for external customers. Worth validating with the LLM API team. + +### Sydney as Middle Tier + +We explored whether the evals app could call LLM API **through Sydney** (the M365 Copilot backend), since the app already authenticates to Sydney and calling through Sydney is auto-approved for prod (no PFT needed). + +**Current finding:** Sydney does not appear to expose a raw `/chat/completions` endpoint. However, since production requires a middle-tier service anyway, Sydney could potentially serve as that middle tier if it added such an endpoint — this would avoid building a new service. + +--- + +## 7. Summary + +1. **Dev scenario validated with local SDK patches.** The Azure AI Eval SDK can successfully call the M365 LLM API using `OpenAIModelConfiguration` with a 2-line local patch for custom headers. The full flow — MSAL WAM auth → AAD token → SDK evaluator → LLM API → evaluation score — works end-to-end. + +2. **Production requires a decision on middle-tier architecture.** The LLM API requires PFT for production user-facing apps, which is a middle-tier server-to-server mechanism. A desktop CLI app cannot generate PFT on its own. We need to decide whether to build a dedicated middle-tier service, leverage Sydney, or pursue an alternative approach (see Section 6). + +3. **Trust boundary concern remains open.** Even though the local SDK only sends HTTP requests to the configured LLM endpoint (no Azure service uploads), the Azure AI Evals SDK is considered Azure infrastructure from a compliance perspective. 
M365 and Azure are different compliance boundaries (M365 = Microsoft as data controller; Azure = customer as controller). The Copilot Evals team has concluded that processing M365 customer data through Azure SDK infrastructure may not pass Privacy reviews — regardless of where the LLM endpoint is hosted. This may require exploring alternatives such as extracting evaluation logic into a standalone library that runs fully within the M365 boundary. + +--- + +## Appendix A: Suggested SDK Changes + +These changes are validated by the end-to-end POC (Section 5). The 2-line local patch for P0 has been tested and confirmed working. These changes are sufficient for dev/MSIT scenarios (user-delegate auth with Bearer tokens). Production integration will require a middle-tier service in addition to these SDK changes. + +| Priority | Suggestion | Effort | Why | +|---|---|---|---| +| **P0** | Add `extra_headers: Dict[str, str]` to [`OpenAIModelConfiguration`](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py), and merge them into per-call `extra_headers` in [`_common/utils.py`](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py#L215) | **Easy — 2 lines** (validated locally) | **Unblocks integration.** LLM API requires `X-ModelType` header. Without this, there's no way to send custom headers through the SDK. | +| **P1** | Add `credential` + `token_scope` to `OpenAIModelConfiguration` for auto-refreshing AAD tokens | Medium | Enables long-running eval runs. Currently `api_key` is static and AAD tokens expire in ~60-90 min. `AzureOpenAIModelConfiguration` already supports this pattern. | +| **P2** | Support custom `Authorization` header scheme (e.g., `MSAuth1.0` for PFT) | Medium | May be needed for production since PFT is the required auth flow. 
Note: the `extra_headers` change (P0) can actually cover this — custom `Authorization` in `extra_headers` overrides the SDK's default `Bearer` header via the OpenAI SDK's `_merge_mappings()` logic. | + +--- + +## Appendix B: Key Files + +| File | Purpose | +|---|---| +| [`_model_configurations.py`](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py) | Model config TypedDicts (`OpenAIModelConfiguration`, `AzureOpenAIModelConfiguration`) | +| [`_prompty.py:296-383`](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_legacy/prompty/_prompty.py#L296-L383) | Client construction + `chat.completions.create()` | +| [`main.py:108-125`](https://github.com/microsoft/M365-Copilot-Agent-Evals/blob/main/src/clients/cli/main.py#L108-L125) | Current evaluator usage in agent evals app | +| [`header_factory.py`](https://o365exchange.visualstudio.com/O365%20Core/_git/LLMApi?path=/sources/dev/llm_client_lib/code/llm_api_client/implementation/header_factory.py) | LLM API required headers (`X-ScenarioGUID`, `X-ModelType`) | +| [`Authentication-User-Guide.md`](https://o365exchange.visualstudio.com/O365%20Core/_git/LLMApi?path=/DocumentationExternal/Onboarding/Authentication-User-Guide.md) | PFT auth flow documentation | diff --git a/sdk/evaluation/azure-ai-evaluation/samples/test_m365_llmapi_e2e.py b/sdk/evaluation/azure-ai-evaluation/samples/test_m365_llmapi_e2e.py new file mode 100644 index 000000000000..b495ba5a7ed1 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/samples/test_m365_llmapi_e2e.py @@ -0,0 +1,91 @@ +""" +End-to-end test: Azure AI Evaluation SDK → M365 LLM API + +Demonstrates calling the M365 LLM API as the judge LLM for prompty-based +evaluators, using the local SDK patch that adds `extra_headers` support +to `OpenAIModelConfiguration`. + +Prerequisites (run from repo root, requires Python 3.9+ and Windows): + 1. 
Clone the repo and checkout the PR branch: + git clone https://github.com/Azure/azure-sdk-for-python.git + cd azure-sdk-for-python + gh pr checkout 45687 + 2. Create and activate a virtual environment: + python -m venv sdk/evaluation/azure-ai-evaluation/.venv + sdk\\evaluation\\azure-ai-evaluation\\.venv\\Scripts\\activate + 3. Install the patched SDK and MSAL: + pip install -e sdk/evaluation/azure-ai-evaluation + pip install "msal[broker]>=1.20,<2" + 4. Run the script (a login window will pop up for AAD auth): + python sdk/evaluation/azure-ai-evaluation/samples/test_m365_llmapi_e2e.py +""" + +import json +from msal import PublicClientApplication +from azure.ai.evaluation import RelevanceEvaluator + +# --- Configuration --- +CLIENT_ID = "68df66a4-cad9-4bfd-872b-c6ddde00d6b2" # LLM API sample app +TENANT_ID = "72f988bf-86f1-41af-91ab-2d7cd011db47" # MSIT tenant +TOKEN_SCOPE = "https://substrate.office.com/llmapi/LLMAPI.dev" +LLM_API_BASE_URL = "https://substrate-llmapi.trafficmanager.net" +MODEL = "dev-gpt-4o-gg" + + +def acquire_token(): + """Acquire an AAD token via MSAL with WAM broker.""" + app = PublicClientApplication( + CLIENT_ID, + authority=f"https://login.microsoftonline.com/{TENANT_ID}", + enable_broker_on_windows=True, + ) + + accounts = app.get_accounts() + result = None + if accounts: + result = app.acquire_token_silent([TOKEN_SCOPE], account=accounts[0]) + + if not result or "access_token" not in result: + result = app.acquire_token_interactive( + scopes=[TOKEN_SCOPE], + parent_window_handle=app.CONSOLE_WINDOW_HANDLE, + ) + + if "access_token" not in result: + raise RuntimeError(f"Token acquisition failed: {result.get('error_description', result)}") + + return result["access_token"] + + +def main(): + print("Acquiring AAD token via MSAL WAM...") + token = acquire_token() + print("Token acquired.\n") + + # Build OpenAIModelConfiguration pointing at M365 LLM API. 
+ # - api_key: AAD token (sent as Authorization: Bearer ) + # - base_url: LLM API endpoint (no URL rewriting with OpenAI config) + # - extra_headers: custom headers required by LLM API (requires local SDK patch) + model_config = { + "type": "openai", + "api_key": token, + "model": MODEL, + "base_url": LLM_API_BASE_URL, + "extra_headers": { + "X-ModelType": MODEL, + }, + } + + print("Running RelevanceEvaluator via M365 LLM API...") + evaluator = RelevanceEvaluator(model_config=model_config) + result = evaluator( + query="What is the capital of Japan?", + response="The capital of Japan is Tokyo.", + ) + + print("\nResult:") + print(json.dumps(result, indent=2)) + + +if __name__ == "__main__": + main()