diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index c36dddb..5ca9ee2 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -47,12 +47,25 @@ jobs:
       - name: Install Python deps
         run: pip install -r agent/requirements.txt -r evals/requirements.txt

+      - name: Resolve steady-state Agent Bricks endpoint
+        run: |
+          endpoint="$(./scripts/resolve-agent-endpoint.sh demo 2>/dev/null || true)"
+          if [ -z "$endpoint" ]; then
+            echo "::error::Agent Bricks supervisor for demo is missing. Run ./scripts/bootstrap-demo.sh once for the first workspace bring-up, then rerun CI."
+            exit 1
+          fi
+          if ! databricks api get "/api/2.0/serving-endpoints/${endpoint}" >/dev/null 2>&1; then
+            echo "::error::Agent Bricks supervisor record points to endpoint ${endpoint}, but that serving endpoint is missing or not listable. Run ./scripts/bootstrap-demo.sh locally to repair workspace drift."
+            exit 1
+          fi
+          echo "AGENT_ENDPOINT_NAME=${endpoint}" >> "$GITHUB_ENV"
+
       - name: Deploy bundle (full — consumers already exist in steady-state)
-        # Pin warehouse_id so the dashboard + serving env match what
-        # wait_for_kpis / log_and_register use. Without --var, the bundle
+        # Pin warehouse_id so the dashboard and Agent Bricks bootstrap match
+        # wait_for_kpis. Without --var, the bundle
         # falls back to its `lookup: warehouse: Serverless Starter Warehouse`
         # default and silently picks a different ID.
-        run: databricks bundle deploy -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID"
+        run: databricks bundle deploy -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" --var "agent_endpoint_name=$AGENT_ENDPOINT_NAME"

       - name: Wait for Lakebase instance to be AVAILABLE
         # Lakebase already exists in steady-state but a config change can
@@ -81,7 +94,7 @@ while True:
         env:
           LAKEBASE_NAME: ${{ vars.DOCINTEL_LAKEBASE_NAME || 'docintel-demo-state-v1' }}

-      - name: Refresh data — upload samples, run pipeline, register new model version
+      - name: Refresh data and Agent Bricks configuration
         run: |
           for f in samples/*_10K_*.pdf; do
             databricks fs cp "$f" \
@@ -90,9 +103,12 @@ while True:
           done
           databricks bundle run -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" doc_intel_pipeline
           python scripts/wait_for_kpis.py --min-rows 3 --timeout 900
-          # --serving-endpoint repoints the existing endpoint to the new
-          # model version in-place (steady-state idempotent operation).
-          python agent/log_and_register.py --target demo --serving-endpoint analyst-agent-demo
+          databricks bundle run -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" --var "agent_endpoint_name=$AGENT_ENDPOINT_NAME" index_refresh
+          python scripts/bootstrap_agent_bricks.py \
+            --target demo \
+            --catalog "$DOCINTEL_CATALOG" \
+            --schema "$DOCINTEL_SCHEMA" \
+            --warehouse-id "$DOCINTEL_WAREHOUSE_ID"

       - name: Apply UC grants (catalog + schema; not DAB-supported)
         # UC requires the full chain: USE_CATALOG → USE_SCHEMA → SELECT/EXECUTE.
@@ -112,16 +128,14 @@ while True:
         # Databricks Apps deploy docs:
         # https://docs.databricks.com/aws/en/dev-tools/databricks-apps/deploy
         # `bundle deploy` alone uploads code but doesn't apply config/restart.
-        run: databricks bundle run -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" analyst_app
+        run: databricks bundle run -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" --var "agent_endpoint_name=$AGENT_ENDPOINT_NAME" analyst_app

       - name: Verify OBO scopes survived deploy
         # `bundle run` may wipe user_api_scopes (documented destructive-update
-        # behavior). Fail loudly so we re-apply. Skipped when user_api_scopes
-        # are not declared (workspace feature off).
+        # behavior). Fail loudly because user-token passthrough is mandatory.
         run: |
-          if grep -q '^  user_api_scopes:' resources/consumers/analyst.app.yml; then
-            databricks apps get doc-intel-analyst-demo --output json > /tmp/app.json
-            python -c "
+          databricks apps get doc-intel-analyst-demo --output json > /tmp/app.json
+          python -c "
 import json
 app = json.load(open('/tmp/app.json'))
 scopes = set(app.get('user_api_scopes') or [])
@@ -129,9 +143,6 @@ required = {'serving.serving-endpoints', 'sql'}
 missing = required - scopes
 assert not missing, f'OBO scopes missing: {sorted(missing)} (got {sorted(scopes)})'
 "
-          else
-            echo "user_api_scopes not declared (workspace feature off); skipping OBO scope check"
-          fi

       - name: CLEARS evaluation gate
-        run: python evals/clears_eval.py --endpoint analyst-agent-demo --dataset evals/dataset.jsonl
+        run: python evals/clears_eval.py --endpoint "$AGENT_ENDPOINT_NAME" --dataset evals/dataset.jsonl
diff --git a/CLAUDE.md b/CLAUDE.md
index 918750a..66acf46 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,10 +4,10 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co

 ## Repository status

-**Databricks Document Intelligence Agent — Reference Implementation.**
+**Databricks Document Intelligence Agent — Agent Bricks implementation.**
 Active feature: **001-doc-intel-10k** — demonstrated on synthetic SEC 10-K filings.
 Drives a Bronze→Silver→Gold pipeline (`ai_parse_document` / `ai_classify` / `ai_extract`),
-Mosaic AI Vector Search index, agent endpoint behind AI Gateway, Streamlit App on Databricks Apps,
+Mosaic AI Vector Search index, Agent Bricks Supervisor endpoint behind AI Gateway, Streamlit App on Databricks Apps,
 Lakebase state, Lakehouse Monitoring, and an MLflow CLEARS eval gate — all in one DAB.

 For an end-to-end overview written for humans, read [`README.md`](./README.md).
@@ -16,15 +16,15 @@ For an end-to-end overview written for humans, read [`README.md`](./README.md).

 The bundle has three chicken-egg dependencies that prevent a single `databricks bundle deploy -t demo` from succeeding on a fresh workspace:

-1. **Model Serving endpoint** references a registered model version that doesn't exist until `agent/log_and_register.py` runs.
+1. **Databricks App resource binding** references the Agent Bricks Supervisor endpoint that `scripts/bootstrap_agent_bricks.py` creates after the Vector Search index exists.
 2. **Lakehouse Monitor** (`resources/consumers/kpi_drift.yml`) attaches to `gold_filing_kpis`, which doesn't exist until the pipeline runs once.
 3. **Lakebase database_catalog + Databricks App** race the `database_instance` provisioning.

-**Canonical fix**: Run `./scripts/bootstrap-demo.sh` for fresh stand-ups; plain `databricks bundle deploy -t demo` for steady-state. The script does a **staged deploy** — `resources/` is split into `foundation/` (no data deps) and `consumers/` (need data). Stage 1 temporarily renames consumer YAMLs to `*.yml.skip` so the bundle glob skips them; stage 2 produces data and then runs full `bundle deploy`. **Both deploys succeed cleanly** — no "errors tolerated" hand-waving, no orphans to clean up on retry.
+**Canonical fix**: Run `./scripts/bootstrap-demo.sh` for fresh stand-ups; plain `databricks bundle deploy -t demo` for steady-state. The script does a **staged deploy** — `resources/` is split into `foundation/` (no data deps) and `consumers/` (need data). Stage 1 temporarily renames consumer YAMLs to `*.yml.skip` so the bundle glob skips them; stage 2 produces data and then runs a full `bundle deploy` (sketched below). Both deploys should succeed cleanly.
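+
+A minimal sketch of that two-stage skip pattern (illustrative only; `scripts/bootstrap-demo.sh` remains the canonical version and also handles waits, grants, and the data production between the two deploys):
+
+```bash
+# Stage 1: hide consumer resources so the bundle glob skips them,
+# then deploy foundation-only.
+for f in resources/consumers/*.yml; do mv "$f" "$f.skip"; done
+databricks bundle deploy -t demo
+# ... produce data here: upload samples, run the pipeline, sync the index ...
+# Stage 2: restore consumer YAMLs and deploy the full bundle.
+for f in resources/consumers/*.yml.skip; do mv "$f" "${f%.skip}"; done
+databricks bundle deploy -t demo
+```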

 **Do NOT try to "fix" these by:**

 - Adding `depends_on` between heterogeneous DAB resource types — DAB doesn't reliably honor it across instance↔catalog↔app.
-- Switching `resources/consumers/agent.serving.yml` to UC alias syntax (`@demo`) — DAB serving config may reject alias syntax; that's why `_promote_serving_endpoint` exists in `agent/log_and_register.py`.
+- Reintroducing a custom MLflow pyfunc serving endpoint. Agent Bricks Knowledge Assistant + Supervisor Agent is the production path.
 - Splitting monitors into a separate target overlay — adds complexity for a one-time concern.

 Full breakdown lives in [`docs/runbook.md`](./docs/runbook.md) §"Known deploy ordering gaps".
@@ -33,13 +33,13 @@ Full breakdown lives in [`docs/runbook.md`](./docs/runbook.md) §"Known deploy o

 ```
 pipelines/sql/         Lakeflow SDP — Bronze → Silver → Gold (SQL only, principle III)
-agent/                 Mosaic AI Agent Framework: pyfunc, retrieval, supervisor, UC tools, registration, OBO helpers
+agent/                 Deterministic Agent Bricks tool glue only
 app/                   Streamlit on Databricks Apps + Lakebase psycopg client
 evals/                 MLflow CLEARS gate (clears_eval.py + dataset.jsonl)
 jobs/                  Lakeflow Jobs Python tasks (retention, index_refresh)
 resources/foundation/  DAB resources with no data deps: catalog/schema/volume, pipeline, retention job, Lakebase instance
-resources/consumers/   DAB resources that depend on foundation data: serving endpoint, monitor, index-refresh job, app, dashboard, Lakebase catalog
+resources/consumers/   DAB resources that depend on foundation data: monitor, index-refresh job, app, dashboard, Lakebase catalog
-scripts/               Operational scripts (bootstrap-demo.sh, wait_for_kpis.py)
+scripts/               Operational scripts (bootstrap-demo.sh, bootstrap_agent_bricks.py, wait_for_kpis.py)
 samples/               Synthetic 10-K PDFs (regenerable via synthesize.py)
 specs/001-…            Spec-Kit artifacts (spec, plan, tasks, research, data-model, contracts, quickstart)
 docs/runbook.md        Day-2 ops + bring-up workflow
@@ -50,13 +50,13 @@ docs/runbook.md        Day-2 ops + bring-up workflow
 ```

 - Validate: `databricks bundle validate -t demo`
 - Fresh stand-up: `./scripts/bootstrap-demo.sh` (requires `DOCINTEL_CATALOG`, `DOCINTEL_SCHEMA`, `DOCINTEL_WAREHOUSE_ID`)
-- Steady-state deploy: `databricks bundle deploy -t demo`
+- Steady-state deploy: `databricks bundle deploy -t demo --var "agent_endpoint_name=$(./scripts/resolve-agent-endpoint.sh demo)"`
 - Run pipeline: `databricks bundle run -t demo doc_intel_pipeline`
-- Run eval: `python evals/clears_eval.py --endpoint analyst-agent-demo --dataset evals/dataset.jsonl`
+- Run eval: `python evals/clears_eval.py --endpoint "$(./scripts/resolve-agent-endpoint.sh demo)" --dataset evals/dataset.jsonl`

 ## Tests & validation

-- `pytest agent/tests/` — unit tests for retrieval, agent routing, supervisor
+- `pytest agent/tests/` — unit tests for deterministic Agent Bricks tool glue
 - `databricks bundle validate -t demo` and `-t prod` — schema check both targets before merging
 - The CLEARS eval is the deploy gate; principle V says no agent ships without it passing

@@ -66,19 +66,17 @@ These were discovered the painful way during the 2026-04-25 bring-up. Future ses

 - **SDP streaming chains require explicit `STREAM(...)`**: a temp view that reads from `STREAM(upstream_table)` is itself a streaming view, and downstream references must wrap it in `STREAM(...)` again.
   Reference: `pipelines/sql/02_silver_parse.sql:23` (`FROM STREAM(silver_with_parsed)`).
 - **One Auto Loader source per path**: split downstream tables off a single `STREAM read_files(...)` via a temp streaming view. Reference: `pipelines/sql/01_bronze.sql` (`raw_pdf_arrivals` view); Auto Loader docs: https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/.
-- **Section explosion fallback**: `pipelines/sql/03_gold_classify_extract.sql` POSEXPLODES `parsed:sections[*]` and falls back to a single `full_document` row when the VARIANT lacks `$.sections` so we never lose a filing.
-- **MLflow + UC requires both inputs AND outputs in signatures**: an inputs-only signature is rejected at registration. For variable-shape fields like `citations` (array of dicts), use `mlflow.types.schema.AnyType()` to avoid serving-time truncation. Reference: `agent/log_and_register.py:_signature`.
+- **Section normalization**: `pipelines/sql/03_gold_classify_extract.sql` POSEXPLODES `parsed:sections[*]` and represents sectionless VARIANT output as one `full_document` row so we never lose a filing.
 - **`lakebase_stopped: true` is rejected on instance creation**: the API doesn't allow creating a database_instance directly into stopped state. Default is `false`; flip to `true` only after the instance exists. Reference: `databricks.yml` variable description.
 - **macOS doesn't ship `python`**: scripts must prefer `.venv/bin/python` then fall back to `python3`. Reference: `scripts/bootstrap-demo.sh`.
-- **`agent/log_and_register.py` needs `PYTHONPATH`**: it imports the `agent` package; run with `PYTHONPATH=$REPO_ROOT` or use the bootstrap script which exports it.
-- **Serving endpoint version drifts from YAML**: `resources/consumers/agent.serving.yml` pins `entity_version: "1"` as the bootstrap value. Steady-state CI re-registers new versions and uses `_promote_serving_endpoint` to update the served entity in-place. The YAML and the live endpoint diverge over time — that's intentional, not drift.
+- **Agent Bricks resources are SDK-managed**: `scripts/bootstrap_agent_bricks.py` creates/updates the Knowledge Assistant, its Vector Search knowledge source, the UC KPI function, and the Supervisor Agent. DAB still manages the surrounding data/app/monitor resources.
 - **Streamlit on Databricks Apps requires CORS+XSRF off via env vars**: not flags. `STREAMLIT_SERVER_ENABLE_CORS=false` and `STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false` in `app/app.yaml`. Databricks Apps runtime config: https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime.
 - **`bundle deploy` doesn't apply app config / restart**: must follow with `databricks bundle run -t demo analyst_app` (or use `databricks apps deploy`). Databricks Apps deploy docs: https://docs.databricks.com/aws/en/dev-tools/databricks-apps/deploy.
 - **`bundle run` may wipe `user_api_scopes`**: documented as a destructive-update behavior in the Databricks Apps deploy docs. Bootstrap step 5c re-asserts; CI verifies. If you change the App resource, double-check OBO scopes after.
 - **OBO token never refreshes on Streamlit**: captured at HTTP request, then WebSocket. Long sessions need a page reload to re-acquire.
 - **Lakebase init runs at startup under whatever creds the app process has**: in deployed mode that's the App SP (per resource binding); in local dev, set `DATABRICKS_CLIENT_ID/SECRET` to the App SP or tables get user-owned and break the deployed App. `lakebase_client.init_schema` warns on identity mismatch. See `app/README.md`.
 - **Prod `bundle validate` fails without `service_principal_id`**: that's the safety. Pass `--var service_principal_id=` for any prod operation.
-- **Prod `run_as` rejected by app/monitor/serving when validated as a user**: DAB requires `run_as == owner`, and these three resource types set their owner to the deploying identity. Local `bundle validate -t prod --var service_principal_id=…` as a *user* fails with three errors; CI authenticated as the *SP* (matching `service_principal_id`) validates and deploys cleanly. This is correct enforcement, not a bug.
+- **Prod `run_as` rejected by app/monitor resources when validated as a user**: DAB requires `run_as == owner`, and these resource types set their owner to the deploying identity. Local `bundle validate -t prod --var service_principal_id=...` as a user can fail; CI authenticated as the SP matching `service_principal_id` is the production validation path.

 ## Spec-Kit cycle

diff --git a/PRODUCTION_READINESS.md b/PRODUCTION_READINESS.md
index bc46f33..dd0a594 100644
--- a/PRODUCTION_READINESS.md
+++ b/PRODUCTION_READINESS.md
@@ -6,37 +6,37 @@ This project is open-sourced as a Databricks reference implementation. Treat it

 | Level | Bar | Evidence |
 |---|---|---|
-| Reference-ready | Synthetic corpus demonstrates the full architecture | Dev bundle validates, staged bootstrap succeeds, synthetic CLEARS passes |
+| Reference-ready | Synthetic corpus demonstrates the full architecture | Demo bundle validates, staged bootstrap succeeds, synthetic CLEARS passes |
 | Pilot-ready | Real filings exercise document variability and cost/latency | Reference-ready plus a reviewed EDGAR pilot corpus |
 | Production-ready | Analysts can use it under governed identity and SLOs | Pilot-ready plus end-to-end OBO, dashboards, alerts, rollback, and runbook evidence |

 ## Reference-Ready Checklist

-- `databricks bundle validate --strict -t dev` passes.
-- `./scripts/bootstrap-dev.sh` succeeds in a clean dev workspace.
+- `databricks bundle validate --strict -t demo` passes.
+- `./scripts/bootstrap-demo.sh` succeeds in a clean demo workspace.
 - Synthetic PDFs in `samples/` produce at least ACME/BETA/GAMMA KPI rows.
-- Vector Search index sync completes and the endpoint answers a smoke question with citations.
-- `python evals/clears_eval.py --endpoint analyst-agent-dev --dataset evals/dataset.jsonl` passes.
-- App starts via `databricks bundle run -t dev analyst_app`.
+- Vector Search index sync completes and the Agent Bricks Supervisor endpoint answers a smoke question with citations.
+- `python evals/clears_eval.py --endpoint "$(./scripts/resolve-agent-endpoint.sh demo)" --dataset evals/dataset.jsonl` passes.
+- App starts via `databricks bundle run -t demo analyst_app`.

 ## Pilot-Ready Checklist

 - At least 5 representative public SEC 10-K filings are uploaded from EDGAR and processed.
-- Section explosion produces meaningful section labels, not only `full_document` fallbacks.
+- Section explosion produces meaningful section labels, not only `full_document` normalized rows.
 - KPI extraction is manually reviewed for revenue, EBITDA, segment revenue, and top risks.
 - Quality rubric distribution is reviewed; low-quality filings are retained in Gold but excluded from `gold_filing_sections_indexable`.
 - Latency p95 is measured for single-filing and cross-company prompts.
-- Estimated AI Functions, Vector Search, Model Serving, Lakebase, and Apps costs are documented.
+- Estimated AI Functions, Vector Search, Agent Bricks, AI Gateway, Lakebase, and Apps costs are documented.

 ## Production-Ready Checklist

 - Databricks Apps user-token passthrough is enabled in the workspace.
-- `resources/consumers/analyst.app.yml:user_api_scopes` is uncommented and survives `bundle run`.
-- Audit logs prove app requests, agent SQL, Vector Search, and downstream serving calls execute under the invoking user where required.
+- `resources/consumers/analyst.app.yml:user_api_scopes` is declared and survives `bundle run`.
+- Audit logs prove app requests, Agent Bricks, Knowledge Assistant, Vector Search, and structured KPI SQL calls execute under the invoking user where required.
 - Service principal `run_as` is configured for prod via `--var service_principal_id=`.
 - Analyst group grants include `USE_CATALOG`, `USE_SCHEMA`, `SELECT`, `EXECUTE`, `READ_VOLUME`, and `WRITE_VOLUME` as appropriate.
 - CLEARS passes against the pilot corpus and synthetic regression corpus.
-- Rollback is tested by re-pointing the UC model alias or served model version to a prior working version.
+- Rollback is tested by reverting Agent Bricks configuration and redeploying the previous known-good bundle.
 - Dashboards and monitors are deployed and reviewed by an owner.
 - Alerting exists for pipeline failures, index-refresh failures, endpoint errors, app startup failures, CLEARS failures, and Lakebase write failures.

@@ -45,4 +45,4 @@ This project is open-sourced as a Databricks reference implementation. Treat it

 - It is not a managed product.
 - It does not include a legal/compliance review for SEC filing usage.
 - It does not guarantee support for every 10-K layout or scanned PDF quality.
-- It does not make SP fallback acceptable for production row-level-security use cases.
+- It does not permit broad service-principal reads for production document Q&A.
diff --git a/README.md b/README.md
index a5933ef..93fe5d2 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 A **Databricks-native document intelligence + agent** stack: parse PDFs once with `ai_parse_document`, classify and extract structured KPIs with `ai_classify` / `ai_extract`, score quality on a 5-dimension rubric, index high-quality summaries into Mosaic AI Vector Search, and serve a cited-answer agent through a Streamlit app on Databricks Apps. **Demonstrated on synthetic SEC 10-K filings**, but the architecture works for any structured document corpus (contracts, invoices, research reports, regulatory filings).

 > [!IMPORTANT]
-> Open-source **reference implementation**. The repo demonstrates production-grade Databricks patterns end-to-end, but it is not a turnkey production deployment. Read [`PRODUCTION_READINESS.md`](./PRODUCTION_READINESS.md), [`SECURITY.md`](./SECURITY.md), and [`VALIDATION.md`](./VALIDATION.md) before pointing real users at it.
+> Open-source **reference implementation** for production-grade Databricks patterns. Read [`PRODUCTION_READINESS.md`](./PRODUCTION_READINESS.md), [`SECURITY.md`](./SECURITY.md), and [`VALIDATION.md`](./VALIDATION.md) before pointing real users at it.

 ```
 SEC 10-K PDF                          Analyst's question
@@ -55,7 +55,7 @@ For motivation, architecture diagrams, the Spec-Kit + Claude Code build workflow

 ## Features

 - **End-to-end document intelligence pipeline** — Auto Loader ingest → `ai_parse_document` → section explosion → `ai_classify` + `ai_extract` → 5-dim quality rubric → Vector Search Delta-Sync index (the endpoint is DAB-managed; the index is created/synced by `jobs/index_refresh/sync_index.py`).
   SQL-only pipeline (Lakeflow Spark Declarative Pipelines).
-- **Cited-answer agent** — Mosaic AI Agent Framework (MLflow `pyfunc`), hybrid retrieval + Mosaic re-ranker, single-filing and cross-company supervisor paths. Logged with auth_policy for end-to-end OBO when the workspace supports it.
+- **Cited-answer agent** — Agent Bricks-first runtime: Knowledge Assistant for cited document Q&A, Supervisor Agent for cross-company orchestration, and a deterministic KPI tool for structured comparisons. No custom pyfunc, retrieval loop, or supervisor runtime is retained.
 - **Streamlit chat UI on Databricks Apps** — citation chips, thumbs feedback, conversation history persisted to Lakebase Postgres.
 - **Eval-gated promotion** — `mlflow.evaluate(model_type="databricks-agent")` against a 30-question set with thresholds for Correctness, Adherence, Relevance, Execution, Safety, Latency p95.
 - **Reproducible synthetic corpus** — `samples/synthesize.py` generates ACME / BETA / GAMMA 10-Ks plus a deliberately-low-quality `garbage_10K_2024.pdf` for the rubric-exclusion test (SC-006). No EDGAR dependency in CI.
@@ -67,7 +67,7 @@ For motivation, architecture diagrams, the Spec-Kit + Claude Code build workflow

 | Level | Meaning | Required evidence |
 |---|---|---|
 | Reference-ready | Synthetic corpus deploys and demonstrates the architecture end-to-end | Demo bundle validates, bootstrap succeeds, synthetic CLEARS passes |
-| Pilot-ready | Real 10-K filings validate parse/extract/retrieval behavior | Reference-ready + small real EDGAR corpus + reviewed costs/latency |
+| Pilot-ready | Real 10-K filings validate parse/extract/cited-answer behavior | Reference-ready + small real EDGAR corpus + reviewed costs/latency |
 | Production-ready | Analysts can use it under governed identity and operational SLOs | Pilot-ready + app-level OBO enabled, audit proof, alerts/dashboards, rollback tested |

 Full checklists in [`PRODUCTION_READINESS.md`](./PRODUCTION_READINESS.md).
@@ -101,22 +101,22 @@ You need a workspace with **all** of the following enabled:

 - Serverless SQL warehouse (AI Functions GA — `ai_parse_document`, `ai_classify`, `ai_extract`, `ai_query`)
 - Mosaic AI Vector Search (endpoint + Delta-Sync index)
-- Mosaic AI Agent Framework (`databricks-agents`)
-- Mosaic AI Model Serving (CPU instances; AI Gateway)
+- Agent Bricks (Knowledge Assistant, Supervisor Agent, Custom Agents on Apps)
+- AI Gateway with OBO / identity enforcement
 - Lakebase Postgres (preview / GA depending on region)
 - Databricks Apps (Streamlit runtime)
 - Lakehouse Monitoring
 - Unity Catalog with permission to create catalogs/schemas/volumes (or an existing schema you can write to)

-**Optional** but recommended for production-tier OBO:
+**Required for production identity:**

-- Databricks Apps **user token passthrough** (workspace admin setting). Without it, the app falls back to service-principal auth — see [`SECURITY.md`](./SECURITY.md).
+- Databricks Apps **user token passthrough** (workspace admin setting). The app must not fall back to broad service-principal reads — see [`SECURITY.md`](./SECURITY.md).

 ### Free trial signup

-Don't have a workspace? The fastest path is the **14-day Premium trial** at databricks.com/try-databricks. Verify each entitlement above is enabled in your trial workspace and region — Mosaic AI Vector Search, Lakebase, Databricks Apps, and Model Serving rollout varies by cloud and region, so a Premium tier doesn't automatically guarantee every feature is on. Workspace settings → Previews / Compute → Mosaic AI is the place to check.
+Don't have a workspace? The fastest path is the **14-day Premium trial** at databricks.com/try-databricks. Verify each entitlement above is enabled in your trial workspace and region — Mosaic AI Vector Search, Lakebase, Databricks Apps, Agent Bricks, and AI Gateway rollout varies by cloud and region, so a Premium tier doesn't automatically guarantee every feature is on. Workspace settings → Previews / Compute → Mosaic AI is the place to check.

-> Note: **Free Edition** at databricks.com/learn/free-edition does not include Mosaic AI Vector Search or Model Serving and **cannot run this reference**. Use the Premium trial.
+> Note: **Free Edition** at databricks.com/learn/free-edition does not include the required governed agent services and **cannot run this implementation**. Use the Premium trial.

 After signup:

@@ -174,7 +174,7 @@ The script handles the chicken-egg ordering automatically — see [`docs/design.
 ```bash
 DOCINTEL_CATALOG=workspace DOCINTEL_SCHEMA=docintel_10k_demo \
   .venv/bin/python evals/clears_eval.py \
-  --endpoint analyst-agent-demo \
+  --endpoint "$(./scripts/resolve-agent-endpoint.sh demo)" \
   --dataset evals/dataset.jsonl
 ```

@@ -197,12 +197,9 @@ After the first bring-up, iteration depends on what changed:
 databricks bundle deploy -t demo
 databricks bundle run -t demo analyst_app   # apply app config + restart

-# Agent code changes (agent/*.py): register a new model version
-# and repoint the existing serving endpoint in-place.
-DOCINTEL_CATALOG=workspace \
-DOCINTEL_SCHEMA=docintel_10k_demo \
-DOCINTEL_WAREHOUSE_ID= \
-  .venv/bin/python agent/log_and_register.py --target demo --serving-endpoint analyst-agent-demo
+# Agent Bricks configuration / tool glue changes
+databricks bundle deploy -t demo
+databricks bundle run -t demo analyst_app

 # Pipeline SQL changes that need to re-process existing filings
 databricks bundle run -t demo doc_intel_pipeline
@@ -255,10 +252,7 @@ Implementation uses `mlflow.evaluate(model_type="databricks-agent")` for the fou

 | `service_principal_id` | `""` | **Required** for `-t prod`; `bundle validate -t prod` fails loudly without it |
 | `warehouse_id` | looked up from `Serverless Starter Warehouse` | Used by index-refresh + dashboards |
 | `embedding_model_endpoint_name` | `databricks-bge-large-en` | Vector Search embeddings |
-| `foundation_model_endpoint_name` | `databricks-meta-llama-3-3-70b-instruct` | Agent answer generation |
-| `rerank_model_endpoint_name` | `databricks-bge-rerank-v2` | Mosaic re-ranker |
 | `quality_threshold` | `22` | Section quality cutoff (0-30) for index inclusion |
-| `top_k` | `5` | Citations returned after re-rank |
 | `max_pdf_bytes` | `52428800` (50 MB) | Reject filings larger than this |
 | `analyst_group` | `account users` | UC group granted SELECT/USE on schema, READ/WRITE on volume |

@@ -270,7 +264,7 @@ Override via `--var name=value` on any `bundle` command.
 |---|---|---|
 | `DOCINTEL_CATALOG` | yes | Bootstrap, CI, eval |
 | `DOCINTEL_SCHEMA` | yes | Same |
-| `DOCINTEL_WAREHOUSE_ID` | yes | Bootstrap (passed to bundle as `--var warehouse_id`, used by kpi-poll + smoke); `agent/log_and_register.py` (auth-policy SQL warehouse resource); `agent/tools.py` UC Function tool |
+| `DOCINTEL_WAREHOUSE_ID` | yes | Bootstrap (passed to bundle as `--var warehouse_id`, used by kpi-poll + smoke); `agent/tools.py` structured KPI tool |
 | `DOCINTEL_TARGET` | no (default `demo`) | Bootstrap |
 | `DOCINTEL_ANALYST_GROUP` | no (default `account users`) | UC grants in bootstrap + CI |
 | `DOCINTEL_WAIT_SECONDS` | no (default 600) | Bootstrap KPI-table poll timeout |

@@ -282,7 +276,7 @@ Override via `--var name=value` on any `bundle` command.

 ## Testing & validation

 ```bash
-# Unit tests (18 tests covering retrieval, agent routing, supervisor)
+# Unit tests for Agent Bricks tool glue and app helpers
 .venv/bin/python -m pytest agent/tests/ -q

 # Bundle schema + interpolation
@@ -294,10 +288,10 @@ bash -n scripts/bootstrap-demo.sh

 # Compile checks for all modified Python
 .venv/bin/python -m py_compile \
-  agent/_obo.py agent/analyst_agent.py agent/log_and_register.py \
-  agent/retrieval.py agent/supervisor.py agent/tools.py \
+  agent/tools.py \
   app/app.py app/lakebase_client.py \
-  evals/clears_eval.py scripts/wait_for_kpis.py samples/synthesize.py
+  evals/clears_eval.py scripts/bootstrap_agent_bricks.py \
+  scripts/wait_for_kpis.py samples/synthesize.py
 ```

 End-to-end is exercised by [`./scripts/bootstrap-demo.sh`](./scripts/bootstrap-demo.sh) against a real workspace; see [`VALIDATION.md`](./VALIDATION.md) for the full procedure with expected outputs.

@@ -311,11 +305,10 @@ End-to-end is exercised by [`./scripts/bootstrap-demo.sh`](./scripts/bootstrap-d

 | `./scripts/bootstrap-demo.sh` | Fresh-workspace bring-up (or after `bundle destroy`). Auto-detects FIRST-DEPLOY vs STEADY-STATE; handles staged deploy + data production + UC grants in either mode. |
 | `databricks bundle deploy -t demo` | YAML / pipeline / job / app config changes after the first bring-up. |
 | `databricks bundle run -t demo analyst_app` | After any change to `app/` or `resources/consumers/analyst.app.yml` — required to apply runtime config + restart the app. |
-| `python agent/log_and_register.py --target demo --serving-endpoint analyst-agent-demo` | After agent code changes (`agent/*.py`). Registers a new UC model version and repoints the existing serving endpoint in-place. |
 | `databricks bundle deploy -t prod --var service_principal_id=` | Production deploy, run as the prod SP. |
-| GitHub Actions on push to `main` | Steady-state CI: full `bundle deploy` → wait for Lakebase AVAILABLE → upload samples + run pipeline + register/promote agent → UC grants → `bundle run analyst_app` → CLEARS eval gate. (The first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-demo.sh`.) |
+| GitHub Actions on push to `main` | Steady-state CI: full `bundle deploy` → wait for Lakebase AVAILABLE → upload samples + run pipeline → Agent Bricks / AI Gateway validation → UC grants → `bundle run analyst_app` → CLEARS eval gate. (The first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-demo.sh`.) |

-For day-2 ops (rolling agent versions, debugging low quality scores, inspecting CLEARS metrics in MLflow), see [`docs/runbook.md`](./docs/runbook.md). For the production-readiness checklist, see [`PRODUCTION_READINESS.md`](./PRODUCTION_READINESS.md).
+For day-2 ops (Agent Bricks configuration validation, debugging low quality scores, inspecting CLEARS metrics in MLflow), see [`docs/runbook.md`](./docs/runbook.md). For the production-readiness checklist, see [`PRODUCTION_READINESS.md`](./PRODUCTION_READINESS.md).

 ---

@@ -325,7 +318,7 @@
 ```
 databricks/
 ├── databricks.yml        # Bundle root — variables + demo/prod targets
 ├── pipelines/sql/        # Lakeflow SDP — Bronze → Silver → Gold (SQL only)
-├── agent/                # Mosaic AI Agent Framework — pyfunc, retrieval, OBO
+├── agent/                # Agent Bricks deterministic tool glue
 ├── app/                  # Streamlit on Databricks Apps + Lakebase client
 ├── evals/                # MLflow CLEARS eval gate (dataset + runner)
 ├── jobs/                 # Lakeflow Jobs (retention, index refresh)
@@ -344,7 +337,7 @@ Top-level docs: [`CLAUDE.md`](./CLAUDE.md) (runtime guidance for Claude Code), [

 ## Limitations

-This is a **pilot-scale** reference implementation, not a turnkey production deployment:
+This is a production-oriented reference implementation with conservative scale defaults:

 | Limit | Value | Source |
 |---|---|---|
@@ -370,7 +363,7 @@ See [`CONTRIBUTING.md`](./CONTRIBUTING.md) for local setup, the spec-kit workflo

 ## Security

-See [`SECURITY.md`](./SECURITY.md) for the identity model (App SP fallback vs end-to-end OBO), required UC grants, secrets-handling guidance, and how to report security issues in a fork or deployment.
+See [`SECURITY.md`](./SECURITY.md) for the mandatory end-to-end OBO identity model, required UC grants, secrets-handling guidance, and how to report security issues in a fork or deployment.

 ## License

 Released under the [**MIT License**](./LICENSE) — Copyright (c) 2026 Sathish K
@@ -381,4 +374,4 @@

 - [**Spec-Kit**](https://github.com/github/spec-kit) — spec-driven development workflow for AI coding agents.
 - [**Claude Code**](https://claude.com/claude-code) — Anthropic's CLI for AI-assisted development.
 - [**Agent Skills**](https://github.com/anthropics/skills) — general-purpose Claude Code skill bundles.
-- [**Databricks**](https://www.databricks.com/) — Unity Catalog, Lakeflow Spark Declarative Pipelines, Mosaic AI Vector Search, Agent Framework, Model Serving, AI Gateway, Databricks Apps, Lakebase, Lakehouse Monitoring.
+- [**Databricks**](https://www.databricks.com/) — Unity Catalog, Document Intelligence AI Functions, Lakeflow Spark Declarative Pipelines, Mosaic AI Vector Search, Agent Bricks, AI Gateway, Databricks Apps, Lakebase, Lakehouse Monitoring.
diff --git a/SECURITY.md b/SECURITY.md
index ee5d70d..476dd34 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -2,26 +2,25 @@

 ## Supported Security Posture

-This reference is designed for Databricks workspaces using Unity Catalog, service-principal deployment, Databricks Apps resource bindings, Model Serving auth policies, and optional end-to-end on-behalf-of (OBO) user identity.
+This reference is designed for Databricks workspaces using Unity Catalog, Agent Bricks, AI Gateway, Databricks Apps resource bindings, and mandatory end-to-end on-behalf-of (OBO) user identity.
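+
+The only supported app-side wiring is to build a per-request, user-scoped client from the token that Databricks Apps forwards when user-token passthrough is enabled, and to fail hard rather than fall back to the app service principal. A minimal sketch (assumes Streamlit >= 1.37 for `st.context.headers`; the helper name is illustrative, not code shipped in this repo):
+
+```python
+import streamlit as st
+from databricks.sdk import WorkspaceClient
+
+
+def user_client() -> WorkspaceClient:
+    # Databricks Apps injects the invoking user's token into this header
+    # only when user-token passthrough is enabled for the workspace.
+    token = st.context.headers.get("x-forwarded-access-token")
+    if not token:
+        # Mandatory-OBO posture: no service-principal fallback.
+        raise RuntimeError("OBO token missing; enable Databricks Apps user-token passthrough")
+    return WorkspaceClient(token=token, auth_type="pat")
+```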

 ## Identity Modes

 | Mode | Use | Production row-level security |
 |---|---|---|
-| App SP fallback | Local development, reference demos, workspaces without Apps user-token passthrough | No |
-| End-to-end OBO | Production analyst use | Yes, after audit verification |
+| End-to-end OBO | Demo and production analyst use | Yes, after audit verification |

-SP fallback is intentionally supported so the reference can run in workspaces that do not yet expose Databricks Apps user-token passthrough. It is not sufficient for production deployments that promise user-specific UC row/column enforcement.
+Service-principal fallback is not supported for the agent path. If Databricks Apps user-token passthrough, Agent Bricks OBO, or AI Gateway identity enforcement is unavailable, deployment must fail with an actionable prerequisite error.

 ## Enabling End-To-End OBO

 1. Workspace admin enables Databricks Apps user-token passthrough.
-2. Uncomment `user_api_scopes` in `resources/consumers/analyst.app.yml`.
+2. Declare the required `user_api_scopes` in `resources/consumers/analyst.app.yml`.
 3. Redeploy and run the app resource.
 4. Verify `serving.serving-endpoints` and `sql` scopes are present after deployment.
 5. Verify audit logs show downstream calls under the invoking user where required.

-The served agent also declares an MLflow auth policy in `agent/log_and_register.py` using Model Serving OBO scopes (`model-serving`, `vector-search`) and system resources.
+Agent Bricks / AI Gateway must enforce downstream access to document Q&A, SQL tools, models, and any external tools under the invoking user's identity. The previous custom MLflow auth-policy path has been removed from the production implementation.

 ## Secrets And Credentials

diff --git a/VALIDATION.md b/VALIDATION.md
index 5bf0acc..97aa9b1 100644
--- a/VALIDATION.md
+++ b/VALIDATION.md
@@ -6,10 +6,10 @@ Use this guide to prove the reference implementation works in a Databricks works

 ```bash
 python3 -m py_compile \
-  agent/_obo.py agent/analyst_agent.py agent/log_and_register.py \
-  agent/retrieval.py agent/supervisor.py agent/tools.py \
+  agent/tools.py \
   app/app.py app/lakebase_client.py \
-  evals/clears_eval.py scripts/wait_for_kpis.py samples/synthesize.py
+  evals/clears_eval.py scripts/bootstrap_agent_bricks.py \
+  scripts/wait_for_kpis.py samples/synthesize.py

 bash -n scripts/bootstrap-demo.sh
 pytest agent/tests

@@ -39,11 +39,11 @@ Expected outcomes:

 - Foundation resources deploy first.
 - Synthetic PDFs upload to the `raw_filings` volume.
 - Pipeline creates Gold rows.
-- Agent model registers in Unity Catalog.
+- Agent Bricks Knowledge Assistant and Supervisor Agent are created or updated.
 - Consumer resources deploy cleanly.
 - App config is applied with `bundle run analyst_app`.
-- Bootstrap prints either OBO scope verification or an explicit app-level OBO disabled warning.
-- Smoke query returns a grounded answer.
+- Bootstrap verifies mandatory OBO scopes.
+- Smoke query reaches the Agent Bricks supervisor endpoint.

 ## Data Checks

@@ -66,7 +66,7 @@ Expected:

 ```bash
 python evals/clears_eval.py \
-  --endpoint analyst-agent-demo \
+  --endpoint "$(./scripts/resolve-agent-endpoint.sh demo)" \
   --dataset evals/dataset.jsonl
 ```

@@ -85,14 +85,8 @@ Expected:

 ## OBO Verification

-If app-level OBO is enabled:
-
-- Confirm `resources/consumers/analyst.app.yml:user_api_scopes` is uncommented.
+- Confirm `resources/consumers/analyst.app.yml:user_api_scopes` is present.
 - Run `databricks bundle deploy -t demo && databricks bundle run -t demo analyst_app`.
 - Confirm bootstrap or CI verifies `serving.serving-endpoints` and `sql` scopes.
-- Check audit logs for user-scoped downstream access.
-
-If app-level OBO is not enabled:
-
-- Treat the deployment as reference/demo only.
-- Do not claim user-level UC row/column enforcement.
+- Check audit logs for user-scoped downstream access through Agent Bricks, Knowledge Assistant, and the structured KPI SQL function.
+- If the workspace cannot grant user-token passthrough, deployment is invalid and must fail.
diff --git a/agent/_obo.py b/agent/_obo.py
deleted file mode 100644
index 9d2c0bc..0000000
--- a/agent/_obo.py
+++ /dev/null
@@ -1,51 +0,0 @@
-"""On-behalf-of credentials helpers for the analyst pyfunc.
-
-Inside Model Serving, the model receives a per-request user context. The
-canonical wiring (per Databricks Model Serving OBO docs at
-https://docs.databricks.com/aws/en/generative-ai/agent-framework/agent-authentication-model-serving)
-is:
-
-  - WorkspaceClient(credentials_strategy=ModelServingUserCredentials())
-    (from databricks_ai_bridge — not databricks.sdk.credentials_provider)
-  - VectorSearchClient(
-        credential_strategy=CredentialStrategy.MODEL_SERVING_USER_CREDENTIALS,
-    )
-
-When the Model Serving deployment was logged WITHOUT a user auth_policy (or
-the workspace lacks user-token-passthrough), `ModelServingUserCredentials()`
-raises at instantiation. Callers fall back to default SP auth — which keeps
-tests, local dev, and OBO-disabled workspaces working without code changes.
-
-`agent/log_and_register.py` declares the matching `UserAuthPolicy` with the
-documented agent-side scopes `model-serving` and `vector-search`, and the
-SystemAuthPolicy with the resources the model touches; together they tell
-Model Serving to inject the per-request user token into `predict()`.
-
-(Databricks Apps OBO docs:
-https://docs.databricks.com/aws/en/dev-tools/databricks-apps/iam-auth.
-Project skill bundles — `databricks-apps`, `databricks-model-serving`, etc. —
-are distributed by Databricks via the CLI / Claude Code plugin channel and
-are not vendored in this repo; see `CONTRIBUTING.md` for the full mapping.)
-"""
-
-from __future__ import annotations
-
-import logging
-
-from databricks.sdk import WorkspaceClient
-from databricks_ai_bridge import ModelServingUserCredentials
-
-_log = logging.getLogger(__name__)
-
-
-def user_workspace() -> WorkspaceClient:
-    """User-scoped client when OBO is enabled; SP fallback otherwise.
-
-    The fallback path keeps tests, local dev, and OBO-disabled workspaces
-    working without code changes.
-    """
-    try:
-        return WorkspaceClient(credentials_strategy=ModelServingUserCredentials())
-    except Exception as exc:  # OBO disabled, outside Serving runtime, etc.
-        _log.debug("ModelServingUserCredentials unavailable (%s); falling back to default auth", exc)
-        return WorkspaceClient()
diff --git a/agent/analyst_agent.py b/agent/analyst_agent.py
deleted file mode 100644
index 9d5e5f8..0000000
--- a/agent/analyst_agent.py
+++ /dev/null
@@ -1,167 +0,0 @@
-"""Mosaic AI Custom Analyst Agent for the 10-K corpus.
-
-Routes single-filing questions to grounded retrieval + LLM generation,
-delegates cross-company questions to supervisor.handle().
-FR-007, FR-014 (no source -> "no grounded source found").
-""" - -from __future__ import annotations - -import json -import os -import re -import time -import uuid -from typing import Any - -import mlflow -from databricks.sdk import WorkspaceClient - -from agent import retrieval -from agent._obo import user_workspace -from agent.retrieval import Citation - - -FOUNDATION_MODEL = os.environ.get("DOCINTEL_FOUNDATION_ENDPOINT", "databricks-meta-llama-3-3-70b-instruct") -NO_SOURCE_MESSAGE = "No grounded source found for this question in the indexed 10-K corpus." - -_COMPARE_TOKENS = (" vs ", " versus ", "compare", "between", "across") - -# Capitalized tokens that aren't company names. Without this, "What are Apple's -# revenue and EBITDA?" mis-routes to the supervisor because EBITDA, What, etc. -# count as candidate companies. -_ROUTING_STOP_TOKENS = { - "what", "which", "how", "why", "when", "where", "who", - "the", "and", "but", "for", "with", "their", "most", "recent", - "between", "across", "compare", "vs", "versus", "fy", "fiscal", "year", - "ebitda", "revenue", "kpis", "kpi", "10-k", "10k", "form", - "company", "companies", "corp", "inc", "ltd", "llc", -} - - -class AnalystAgent(mlflow.pyfunc.PythonModel): - def predict(self, context: Any, model_input: Any) -> dict[str, Any]: - request = _coerce_request(model_input) - question = request["question"] - top_k = int(request.get("top_k") or 5) - - if _is_cross_company(question): - from agent import supervisor # lazy import; avoids cycle for tests - return supervisor.handle(question=question, top_k=top_k, conversation_id=request.get("conversation_id")) - - return _single_filing( - question=question, - top_k=top_k, - company_filter=request.get("company_filter"), - fiscal_year_filter=request.get("fiscal_year_filter"), - conversation_id=request.get("conversation_id"), - ) - - -def _single_filing( - *, - question: str, - top_k: int, - company_filter: str | None, - fiscal_year_filter: int | None, - conversation_id: str | None, -) -> dict[str, Any]: - started = time.monotonic() - citations, retrieved_count = retrieval.hybrid_retrieve( - question, - top_k=top_k, - company_filter=company_filter, - fiscal_year_filter=fiscal_year_filter, - ) - - if not citations: - return _response( - answer=NO_SOURCE_MESSAGE, - grounded=False, - citations=[], - retrieved_count=retrieved_count, - agent_path="analyst", - started=started, - conversation_id=conversation_id, - ) - - answer = _generate(question=question, citations=citations) - return _response( - answer=answer, - grounded=True, - citations=citations, - retrieved_count=retrieved_count, - agent_path="analyst", - started=started, - conversation_id=conversation_id, - ) - - -def _generate(*, question: str, citations: list[Citation]) -> str: - w = user_workspace() - sources = "\n\n".join( - f"[{i + 1}] {c.filename} — {c.section_label}\n{c.snippet}" - for i, c in enumerate(citations) - ) - prompt = ( - "You are an equity research assistant. Answer the analyst's question using ONLY the cited 10-K sections " - "below. Cite sources inline as [1], [2], etc. matching the section index. 
If the sources don't answer the " - f"question, reply '{NO_SOURCE_MESSAGE}'.\n\n" - f"QUESTION:\n{question}\n\nSOURCES:\n{sources}" - ) - out = w.serving_endpoints.query( - name=FOUNDATION_MODEL, - messages=[{"role": "user", "content": prompt}], - ) - if hasattr(out, "choices") and out.choices: - return out.choices[0].message.content - return out["choices"][0]["message"]["content"] - - -def _response( - *, - answer: str, - grounded: bool, - citations: list[Citation], - retrieved_count: int, - agent_path: str, - started: float, - conversation_id: str | None, -) -> dict[str, Any]: - return { - "answer": answer, - "grounded": grounded, - "citations": [c.to_dict() for c in citations], - "latency_ms": int((time.monotonic() - started) * 1000), - "retrieved_count": retrieved_count, - "agent_path": agent_path, - "conversation_id": conversation_id, - "turn_id": str(uuid.uuid4()), - } - - -def _coerce_request(model_input: Any) -> dict[str, Any]: - if hasattr(model_input, "to_dict"): - rows = model_input.to_dict(orient="records") - return rows[0] if rows else {} - if isinstance(model_input, list) and model_input: - return model_input[0] if isinstance(model_input[0], dict) else json.loads(model_input[0]) - if isinstance(model_input, dict): - return model_input - if isinstance(model_input, str): - return json.loads(model_input) - raise TypeError(f"Unsupported request type: {type(model_input)!r}") - - -def _is_cross_company(question: str) -> bool: - """Return True only when the question is a comparison AND mentions ≥ 2 plausible - company tokens. The capitalized-token heuristic strips question words, financial - metric names, and form-name boilerplate so a single-company question like - "What are Apple's revenue and EBITDA?" stays on the analyst path. - """ - lowered = question.lower() - if not any(token in lowered for token in _COMPARE_TOKENS): - return False - capitalized = re.findall(r"\b[A-Z][A-Za-z][A-Za-z0-9&\.\-]+\b", question) - candidates = {w for w in capitalized if w.lower() not in _ROUTING_STOP_TOKENS} - return len(candidates) >= 2 diff --git a/agent/log_and_register.py b/agent/log_and_register.py deleted file mode 100644 index d255b58..0000000 --- a/agent/log_and_register.py +++ /dev/null @@ -1,166 +0,0 @@ -"""Log the AnalystAgent as an MLflow pyfunc model, register it in UC, and assign the @ alias. - -Invoked from the GitHub Actions deploy step. Idempotent — re-running creates a new -version and re-points the alias. -""" - -from __future__ import annotations - -import argparse -import os -import sys - -import mlflow -from mlflow.models.auth_policy import AuthPolicy, SystemAuthPolicy, UserAuthPolicy -from mlflow.models.resources import ( - DatabricksServingEndpoint, - DatabricksSQLWarehouse, - DatabricksVectorSearchIndex, -) -from mlflow.models.signature import ModelSignature -from mlflow.types.schema import AnyType, ColSpec, Schema -from databricks.sdk import WorkspaceClient - -from agent.analyst_agent import AnalystAgent - - -# Foundation + re-rank endpoints called by the agent (resolved here so the -# log_model auth_policy can enumerate them). Defaults match databricks.yml. -_FOUNDATION_ENDPOINT = os.environ.get("DOCINTEL_FOUNDATION_ENDPOINT", "databricks-meta-llama-3-3-70b-instruct") -_RERANK_ENDPOINT = os.environ.get("DOCINTEL_RERANK_ENDPOINT", "databricks-bge-rerank-v2") - - -def _auth_policy(catalog: str, schema: str, warehouse_id: str) -> AuthPolicy: - """OBO-ready auth policy for the analyst pyfunc. 
- - System resources: enumerated so MLflow grants the deploying SP access at - deploy time (Databricks Apps service-principal permissions are - auto-granted by resource declaration — see - https://docs.databricks.com/aws/en/dev-tools/databricks-apps/access-data). - - User scopes: documented agent-side scopes per Databricks Model Serving - OBO docs (https://docs.databricks.com/aws/en/generative-ai/agent-framework/ - agent-authentication-model-serving) — `model-serving` for downstream - serving-endpoint calls (foundation + rerank), `vector-search` for the VS - index. App-side scopes (`serving.serving-endpoints`, - `vectorsearch.vector-search-indexes`) are different — those are declared - on the App resource, not here. - """ - resources = [ - DatabricksServingEndpoint(endpoint_name=_FOUNDATION_ENDPOINT), - DatabricksServingEndpoint(endpoint_name=_RERANK_ENDPOINT), - DatabricksVectorSearchIndex(index_name=f"{catalog}.{schema}.filings_summary_idx"), - DatabricksSQLWarehouse(warehouse_id=warehouse_id), - ] - return AuthPolicy( - system_auth_policy=SystemAuthPolicy(resources=resources), - user_auth_policy=UserAuthPolicy(api_scopes=[ - "model-serving", - "vector-search", - ]), - ) - - -def _signature() -> ModelSignature: - inputs = Schema( - [ - ColSpec("string", "question"), - ColSpec("integer", "top_k"), - ColSpec("string", "company_filter"), - ColSpec("integer", "fiscal_year_filter"), - ColSpec("string", "conversation_id"), - ] - ) - # UC requires both inputs and outputs in the signature. citations is an - # array of dicts whose shape varies between analyst and supervisor paths, - # so declare it as AnyType to avoid serving-time truncation of the nested - # structure while still satisfying UC's "outputs declared" check. - outputs = Schema( - [ - ColSpec("string", "answer"), - ColSpec("boolean", "grounded"), - ColSpec("long", "latency_ms"), - ColSpec("long", "retrieved_count"), - ColSpec("string", "agent_path"), - ColSpec("string", "conversation_id"), - ColSpec("string", "turn_id"), - ColSpec(AnyType(), "citations"), - ] - ) - return ModelSignature(inputs=inputs, outputs=outputs) - - -def _promote_serving_endpoint(endpoint_name: str, model_name: str, version: str) -> None: - """Point an existing serving endpoint at the newly registered UC model version. - - DAB alias syntax has been unreliable for this endpoint, so CI registers the - model and then updates the served entity explicitly. On first bring-up the - endpoint may not yet exist (the initial deploy can't create it without a - model version) — in that case skip silently and let the subsequent - `bundle deploy` create it from serving.yml. - """ - w = WorkspaceClient() - try: - endpoint = w.api_client.do("GET", f"/api/2.0/serving-endpoints/{endpoint_name}") - except Exception as exc: - msg = str(exc) - if "does not exist" in msg or "RESOURCE_DOES_NOT_EXIST" in msg or "404" in msg: - print(f"serving endpoint {endpoint_name!r} does not exist yet; skipping promote (will be created by next bundle deploy)") - return - raise - config = endpoint.get("config", {}) - served_entities = config.get("served_entities") or config.get("served_models") or [] - if not served_entities: - # Bootstrap edge case: the first `bundle deploy` created the endpoint shell - # but the served-entity creation failed (model didn't exist yet). Skip the - # in-place update — the subsequent `bundle deploy` reads serving.yml and - # populates the served entity from scratch with the bootstrap version. 
- print(f"serving endpoint {endpoint_name!r} has no served entities (likely UPDATE_FAILED on first deploy); skipping promote (next bundle deploy will populate from serving.yml)") - return - - entity = dict(served_entities[0]) - entity.update({"entity_name": model_name, "entity_version": str(version)}) - body = { - "served_entities": [entity], - "traffic_config": config.get( - "traffic_config", - {"routes": [{"served_model_name": entity["name"], "traffic_percentage": 100}]}, - ), - } - w.api_client.do("PUT", f"/api/2.0/serving-endpoints/{endpoint_name}/config", body=body) - - -def main() -> int: - p = argparse.ArgumentParser() - p.add_argument("--target", required=True, choices=["demo", "prod"]) - p.add_argument("--serving-endpoint", help="Existing serving endpoint to update to the new model version") - args = p.parse_args() - - catalog = os.environ["DOCINTEL_CATALOG"] - schema = os.environ["DOCINTEL_SCHEMA"] - warehouse_id = os.environ["DOCINTEL_WAREHOUSE_ID"] - name = f"{catalog}.{schema}.analyst_agent" - alias = args.target - - mlflow.set_registry_uri("databricks-uc") - with mlflow.start_run(run_name=f"analyst-agent-{alias}") as run: - info = mlflow.pyfunc.log_model( - name="analyst_agent", - python_model=AnalystAgent(), - registered_model_name=name, - signature=_signature(), - code_paths=["agent"], - pip_requirements=open("agent/requirements.txt").read().splitlines(), - auth_policy=_auth_policy(catalog, schema, warehouse_id), - ) - version = info.registered_model_version - client = mlflow.tracking.MlflowClient(registry_uri="databricks-uc") - client.set_registered_model_alias(name=name, alias=alias, version=version) - if args.serving_endpoint: - _promote_serving_endpoint(args.serving_endpoint, name, version) - print(f"registered {name} version={version} alias=@{alias} run_id={run.info.run_id}") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/agent/requirements.txt b/agent/requirements.txt index 4175e30..e1ed11c 100644 --- a/agent/requirements.txt +++ b/agent/requirements.txt @@ -1,6 +1,2 @@ -mlflow>=2.20,<3 -databricks-agents>=0.10,<1 -databricks-ai-bridge>=0.1,<1 -databricks-vectorsearch>=0.40,<1 -databricks-sdk>=0.40,<1 +databricks-sdk>=0.105,<1 pydantic>=2.6,<3 diff --git a/agent/retrieval.py b/agent/retrieval.py deleted file mode 100644 index 04a79d2..0000000 --- a/agent/retrieval.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Hybrid retrieval + re-rank for the 10-K Analyst. - -Top-25 hybrid (keyword + semantic) → Mosaic re-ranker → top-k. FR-007, SC-009. -Honors `embed_eligible` filter implicitly because the Vector Search index source view -already filters on it (see resources/vector_search/filings_index.yml + -pipelines/sql/04_gold_quality.sql). 
-""" - -from __future__ import annotations - -import os -import logging -from dataclasses import dataclass -from typing import Any - -from databricks.sdk import WorkspaceClient -from databricks.vector_search.client import CredentialStrategy, VectorSearchClient - -from agent._obo import user_workspace - - -CATALOG = os.environ["DOCINTEL_CATALOG"] -SCHEMA = os.environ["DOCINTEL_SCHEMA"] -INDEX_FQN = f"{CATALOG}.{SCHEMA}.filings_summary_idx" -ENDPOINT = os.environ.get("DOCINTEL_VS_ENDPOINT", f"docintel-{os.environ.get('DOCINTEL_TARGET', 'demo')}") -RERANK_ENDPOINT = os.environ.get("DOCINTEL_RERANK_ENDPOINT", "databricks-bge-rerank-v2") - -_RETURN_COLS = ["section_uid", "filename", "section_label", "original_label", "summary", "quality_score"] -_LOG = logging.getLogger(__name__) - - -@dataclass(frozen=True) -class Citation: - filename: str - section_label: str - score: float - snippet: str | None = None - char_offset: int | None = None - - def to_dict(self) -> dict[str, Any]: - return { - "filename": self.filename, - "section_label": self.section_label, - "score": round(float(self.score), 4), - "snippet": self.snippet, - "char_offset": self.char_offset, - } - - -def _filters(company: str | None, fiscal_year: int | None) -> dict[str, Any] | None: - out: dict[str, Any] = {} - if company: - out["company_filter_text LIKE"] = f"%{company}%" - if fiscal_year is not None: - out["fiscal_year ="] = fiscal_year - return out or None - - -def hybrid_retrieve( - question: str, - *, - top_k: int = 5, - company_filter: str | None = None, - fiscal_year_filter: int | None = None, - candidate_window: int = 25, -) -> tuple[list[Citation], int]: - """Pull `candidate_window` hybrid candidates, re-rank to `top_k`. Returns (citations, retrieved_count).""" - - # VS user-scope: per Databricks Model Serving OBO docs, pass - # CredentialStrategy.MODEL_SERVING_USER_CREDENTIALS rather than extracting - # the token manually. The strategy resolves the per-request user context - # the same way ModelServingUserCredentials does for WorkspaceClient. - try: - vsc = VectorSearchClient( - credential_strategy=CredentialStrategy.MODEL_SERVING_USER_CREDENTIALS, - disable_notice=True, - ) - except Exception: - # Outside Model Serving (tests, local dev) or OBO disabled — SP fallback. 
- vsc = VectorSearchClient(disable_notice=True) - index = vsc.get_index(endpoint_name=ENDPOINT, index_name=INDEX_FQN) - raw = index.similarity_search( - query_text=question, - columns=_RETURN_COLS, - num_results=candidate_window, - query_type="HYBRID", - filters=_filters(company_filter, fiscal_year_filter), - ) - rows = raw.get("result", {}).get("data_array", []) - if not rows: - return [], 0 - - documents = [{"text": _row(row, "summary"), "id": _row(row, "section_uid")} for row in rows] - if len(documents) > top_k: - order = _rerank(question, documents, top_k=top_k) - rows = [rows[i] for i in order] - else: - rows = rows[:top_k] - - citations = [ - Citation( - filename=_row(r, "filename"), - section_label=_row(r, "section_label"), - score=float(r[-1]), - snippet=_truncate(_row(r, "summary"), 240), - ) - for r in rows - ] - return citations, len(documents) - - -def _rerank(question: str, documents: list[dict[str, str]], *, top_k: int) -> list[int]: - """Calls the Mosaic re-ranker endpoint; returns the original-row indices ordered by relevance.""" - w = user_workspace() - try: - response = w.serving_endpoints.query( - name=RERANK_ENDPOINT, - inputs={"query": question, "documents": [d["text"] for d in documents], "top_n": top_k}, - ) - ranked = response.predictions if hasattr(response, "predictions") else response["predictions"] - return [item["index"] for item in ranked[:top_k]] - except Exception as exc: # pragma: no cover - workspace failure path - _LOG.warning("Rerank endpoint %s failed; falling back to vector-search order: %s", RERANK_ENDPOINT, exc) - return list(range(min(top_k, len(documents)))) - - -def _row(row: list[Any], col: str) -> Any: - return row[_RETURN_COLS.index(col)] - - -def _truncate(text: str | None, n: int) -> str | None: - if not text: - return None - return text if len(text) <= n else text[: n - 1] + "…" diff --git a/agent/supervisor.py b/agent/supervisor.py deleted file mode 100644 index 0f32a6e..0000000 --- a/agent/supervisor.py +++ /dev/null @@ -1,230 +0,0 @@ -"""Supervisor agent for cross-company aggregation (US3). - -Detects N >= 2 company tokens in the question, classifies the requested metric, -fans out per-company retrieval, and emits a markdown table whose columns match -what was asked. For metrics not in gold_filing_kpis (e.g. risks, R&D trends), -falls back to a per-company narrative built from retrieved sections rather -than fabricating a numeric table. -""" - -from __future__ import annotations - -import json -import re -import time -import uuid -from typing import Any - -from agent import retrieval, tools -from agent.retrieval import Citation - - -# Words to exclude when scanning for company tokens. Capitalized but not company names. -_STOP = { - "compare", "between", "across", "the", "and", "for", "with", "their", "most", - "recent", "what", "which", "how", "did", "does", "do", "does", "is", "are", - "vs", "versus", "against", "ebitda", "revenue", "fiscal", "year", "filing", - "filings", "10-k", "10k", "ten-k", "tenk", "company", "companies", "by", - "in", "on", "of", "to", "from", - "mda", "md&a", "risk", "risks", "financials", "notes", "note", "section", - "sections", "item", "items", "management", "discussion", "analysis", -} - -# Map question keywords -> column template + KPI extractor. 
-_INTENTS: list[tuple[set[str], str]] = [ - ({"segment", "segments"}, "segments"), - ({"risk", "risks"}, "risks"), - ({"ebitda"}, "ebitda"), - ({"r&d", "research", "development", "spending"}, "narrative"), - ({"acquisitions", "antitrust", "repurchase", "buyback", "climate"}, "narrative"), - ({"revenue", "sales"}, "revenue"), -] - - -def handle(*, question: str, top_k: int, conversation_id: str | None) -> dict[str, Any]: - started = time.monotonic() - companies = _extract_companies(question) - if len(companies) < 2: - return _empty(question=question, started=started, conversation_id=conversation_id) - - rows = tools.fetch_kpis_for_companies(companies) - if not rows: - return _empty(question=question, started=started, conversation_id=conversation_id) - - intent = _intent(question) - - # For any structured-table intent, also pull a few citations per company so the - # answer is verifiable. Narrative intents lean on retrieval entirely. - per_company_citations: dict[str, list[Citation]] = {} - for r in rows: - sub_citations, _ = retrieval.hybrid_retrieve( - f"{r['company_name']}: {question}", - top_k=2, - company_filter=r["company_name"], - ) - per_company_citations[r["company_name"]] = sub_citations - - if intent == "narrative": - body = _narrative(rows, per_company_citations, question) - else: - body = _table(rows, intent) - - citations = [c for cits in per_company_citations.values() for c in cits] - sources_line = ( - "**Sources:** " + ", ".join(f"`{c.filename}` ({c.section_label})" for c in citations) - if citations - else "" - ) - answer = f"### {question.strip().rstrip('?')}\n\n{body}\n\n{sources_line}".rstrip() - - found = {r["company_name"].lower() for r in rows} - missing = [c for c in companies if c.lower() not in found and not any(c.lower() in name for name in found)] - - return { - "answer": answer, - "grounded": True, - "citations": [c.to_dict() for c in citations], - "latency_ms": int((time.monotonic() - started) * 1000), - "retrieved_count": len(citations), - "agent_path": "supervisor", - "conversation_id": conversation_id, - "turn_id": str(uuid.uuid4()), - "missing_companies": missing, - } - - -def _extract_companies(question: str) -> list[str]: - found = re.findall(r"\b[A-Z][A-Za-z][A-Za-z0-9&\.\-]+\b", question) - return [w for w in dict.fromkeys(found) if w.lower() not in _STOP and len(w) > 2] - - -def _intent(question: str) -> str: - lower = question.lower() - for keywords, intent in _INTENTS: - if any(k in lower for k in keywords): - return intent - return "general" - - -def _table(rows: list[dict[str, Any]], intent: str) -> str: - if intent == "segments": - header = "| Company | Fiscal Year | Top Segments |" - sep = "|---|---|---|" - body = [] - for r in rows: - segments = _segments_text(r.get("segment_revenue_raw") or r.get("segment_revenue")) - body.append(f"| {r['company_name']} | {r.get('fiscal_year', '—')} | {segments} |") - return "\n".join([header, sep, *body]) - - if intent == "risks": - header = "| Company | Fiscal Year | Top Risks |" - sep = "|---|---|---|" - body = [] - for r in rows: - risks = _risks_text(r.get("top_risks_raw") or r.get("top_risks")) - body.append(f"| {r['company_name']} | {r.get('fiscal_year', '—')} | {risks} |") - return "\n".join([header, sep, *body]) - - if intent == "ebitda": - header = "| Company | Fiscal Year | EBITDA |" - sep = "|---|---|---|" - body = [f"| {r['company_name']} | {r.get('fiscal_year', '—')} | {_money(r.get('ebitda'))} |" for r in rows] - return "\n".join([header, sep, *body]) - - if intent == "revenue": - header = "| 
Company | Fiscal Year | Revenue |" - sep = "|---|---|---|" - body = [f"| {r['company_name']} | {r.get('fiscal_year', '—')} | {_money(r.get('revenue'))} |" for r in rows] - return "\n".join([header, sep, *body]) - - # general: include all KPI columns - header = "| Company | Fiscal Year | Revenue | EBITDA | Top Segments |" - sep = "|---|---|---|---|---|" - body = [] - for r in rows: - segments = _segments_text(r.get("segment_revenue_raw") or r.get("segment_revenue")) - body.append( - f"| {r['company_name']} | {r.get('fiscal_year', '—')} | " - f"{_money(r.get('revenue'))} | {_money(r.get('ebitda'))} | {segments} |" - ) - return "\n".join([header, sep, *body]) - - -def _narrative( - rows: list[dict[str, Any]], - citations_by_company: dict[str, list[Citation]], - question: str, -) -> str: - """For metrics not in gold_filing_kpis, summarize per-company from retrieved snippets.""" - parts = [] - for r in rows: - company = r["company_name"] - cits = citations_by_company.get(company, []) - if not cits: - parts.append(f"**{company}** — no grounded source for this question in the indexed corpus.") - continue - snippet = next((c.snippet for c in cits if c.snippet), None) or "no detail available in retrieved snippets." - parts.append(f"**{company}** — {snippet}") - return "\n\n".join(parts) - - -def _money(v: Any) -> str: - if v is None: - return "—" - try: - return f"${float(v) / 1e9:.2f}B" - except (TypeError, ValueError): - return str(v) - - -def _segments_text(raw: Any) -> str: - items = _coerce_list(raw) - if not items: - return "—" - rendered = [] - for item in items[:3]: - if isinstance(item, dict): - rendered.append(f"{item.get('name','?')}: {_money(item.get('revenue'))}") - else: - rendered.append(str(item)) - return ", ".join(rendered) - - -def _risks_text(raw: Any) -> str: - items = _coerce_list(raw) - if not items: - return "—" - return "; ".join(str(x) for x in items[:5]) - - -def _coerce_list(raw: Any) -> list[Any]: - if raw is None: - return [] - if isinstance(raw, list): - return raw - if isinstance(raw, str): - s = raw.strip() - if not s: - return [] - try: - v = json.loads(s) - return v if isinstance(v, list) else [v] - except (ValueError, TypeError): - return [s] - return [raw] - - -def _empty(*, question: str, started: float, conversation_id: str | None) -> dict[str, Any]: - from agent.analyst_agent import NO_SOURCE_MESSAGE - - return { - "answer": NO_SOURCE_MESSAGE, - "grounded": False, - "citations": [], - "latency_ms": int((time.monotonic() - started) * 1000), - "retrieved_count": 0, - "agent_path": "supervisor", - "conversation_id": conversation_id, - "turn_id": str(uuid.uuid4()), - "missing_companies": [], - } diff --git a/agent/tests/test_agent_bricks_response.py b/agent/tests/test_agent_bricks_response.py new file mode 100644 index 0000000..7d3d4f9 --- /dev/null +++ b/agent/tests/test_agent_bricks_response.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from app.agent_bricks_response import extract_citations, extract_text, normalise_agent_response + + +def test_extract_text_from_responses_output_shape() -> None: + payload = { + "output": [ + { + "content": [ + {"text": "Revenue increased."}, + {"text": "Risks were disclosed."}, + ] + } + ] + } + + assert extract_text(payload) == "Revenue increased.\nRisks were disclosed." 
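+    # extract_text joins the text parts within a single output message with "\n".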
+ + +def test_extract_text_from_chat_choices_shape() -> None: + payload = {"choices": [{"message": {"content": "Choice response"}}]} + + assert extract_text(payload) == "Choice response" + + +def test_extract_text_prefers_final_agent_bricks_message() -> None: + payload = { + "output": [ + {"type": "message", "content": [{"type": "output_text", "text": "Thinking"}]}, + {"type": "message", "content": [{"type": "output_text", "text": "Final answer"}]}, + ] + } + + assert extract_text(payload) == "Final answer" + + +def test_extract_citations_from_agent_bricks_footnotes() -> None: + payload = { + "output": [ + { + "type": "message", + "content": [ + { + "type": "output_text", + "text": "[^p1]: Revenue was $94.2B. _ACME_10K_2024.pdf_", + } + ], + } + ] + } + + citations = extract_citations(payload) + + assert citations[0]["filename"] == "ACME_10K_2024.pdf" + assert "Revenue was $94.2B" in citations[0]["snippet"] + + +def test_extract_citations_returns_empty_without_structured_sources_or_footnotes() -> None: + payload = { + "output": [ + { + "type": "message", + "content": [{"type": "output_text", "text": "Final answer without footnotes."}], + } + ] + } + + assert extract_citations(payload) == [] + + +def test_normalise_agent_response_coerces_citations_and_latency() -> None: + response = normalise_agent_response( + { + "output_text": "Grounded answer", + "sources": [{"doc_uri": "filing.pdf"}, "legacy-source"], + "latency_ms": "41", + }, + conversation_id="conversation-1", + ) + + assert response["answer"] == "Grounded answer" + assert response["grounded"] is True + assert response["retrieved_count"] == 2 + assert response["citations"] == [{"doc_uri": "filing.pdf"}, {"source": "legacy-source"}] + assert response["latency_ms"] == 41 + assert response["conversation_id"] == "conversation-1" + assert response["agent_path"] == "agent_bricks_supervisor" diff --git a/agent/tests/test_agent_routing.py b/agent/tests/test_agent_routing.py deleted file mode 100644 index 68ce19e..0000000 --- a/agent/tests/test_agent_routing.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Routing tests for AnalystAgent: cross-company → supervisor, single → grounded path.""" - -from __future__ import annotations - -import os -from unittest.mock import patch - -import pytest - - -os.environ.setdefault("DOCINTEL_CATALOG", "test_catalog") -os.environ.setdefault("DOCINTEL_SCHEMA", "test_schema") -os.environ.setdefault("DOCINTEL_WAREHOUSE_ID", "test_warehouse") - - -def test_no_grounded_source_returns_canonical_message() -> None: - from agent.analyst_agent import AnalystAgent, NO_SOURCE_MESSAGE - - with patch("agent.retrieval.hybrid_retrieve", return_value=([], 0)): - out = AnalystAgent().predict(None, {"question": "Tell me about XYZ Corp's risks"}) - - assert out["grounded"] is False - assert out["citations"] == [] - assert out["answer"] == NO_SOURCE_MESSAGE - - -def test_cross_company_question_routes_to_supervisor() -> None: - from agent.analyst_agent import AnalystAgent - - with patch("agent.supervisor.handle", return_value={"agent_path": "supervisor", "answer": "ok", "grounded": True, "citations": [], "latency_ms": 1, "retrieved_count": 0, "conversation_id": None, "turn_id": "t", "missing_companies": []}) as supervisor: - out = AnalystAgent().predict(None, {"question": "Compare Apple and Microsoft revenue"}) - - supervisor.assert_called_once() - assert out["agent_path"] == "supervisor" - - -def test_apple_revenue_and_ebitda_routes_to_analyst() -> None: - """Regression: 'and' + capitalized metric tokens shouldn't trigger supervisor. 
- Without the routing stop list, 'What are Apple's revenue and EBITDA?' would - be misrouted because EBITDA / What count as company candidates. - """ - from agent.analyst_agent import _is_cross_company - - assert _is_cross_company("What are Apple's revenue and EBITDA?") is False - assert _is_cross_company("How did Microsoft describe AI risks?") is False - - -def test_compare_two_companies_routes_to_supervisor() -> None: - from agent.analyst_agent import _is_cross_company - - assert _is_cross_company("Compare segment revenue between Apple and Microsoft") is True - assert _is_cross_company("Apple vs Google operating income") is True - - -def test_single_company_question_uses_analyst_path(monkeypatch) -> None: - from agent.analyst_agent import AnalystAgent - from agent.retrieval import Citation - - citations = [Citation("AAPL_10K_2024.pdf", "Risk", 0.9, snippet="snippet")] - - with patch("agent.retrieval.hybrid_retrieve", return_value=(citations, 25)), \ - patch("agent.analyst_agent._generate", return_value="generated answer with [1]"): - out = AnalystAgent().predict(None, {"question": "What are Apple's top risks?"}) - - assert out["agent_path"] == "analyst" - assert out["grounded"] is True - assert len(out["citations"]) == 1 diff --git a/agent/tests/test_retrieval.py b/agent/tests/test_retrieval.py deleted file mode 100644 index e6b906d..0000000 --- a/agent/tests/test_retrieval.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Unit tests for hybrid retrieval + re-rank. - -Mocks the VectorSearchClient already imported by `agent.retrieval` so tests -don't hit the workspace. -""" - -from __future__ import annotations - -import os -from unittest.mock import MagicMock, patch - -import pytest - - -os.environ.setdefault("DOCINTEL_CATALOG", "test_catalog") -os.environ.setdefault("DOCINTEL_SCHEMA", "test_schema") - - -def _candidates(n: int) -> list[list]: - # Order of values matches retrieval._RETURN_COLS, with the trailing score column. 
- return [ - [f"sec-{i}", "AAPL_10K_2024.pdf", "Risk", "Risk Factors", f"summary {i}", 25, 0.9 - i * 0.05] - for i in range(n) - ] - - -@pytest.fixture -def fake_index(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Patch VectorSearchClient on the already-loaded retrieval module.""" - from agent import retrieval - - client = MagicMock() - index = MagicMock() - client.return_value.get_index.return_value = index - monkeypatch.setattr(retrieval, "VectorSearchClient", client) - return index - - -def test_returns_top_k_after_rerank(fake_index: MagicMock) -> None: - fake_index.similarity_search.return_value = {"result": {"data_array": _candidates(25)}} - - with patch("agent.retrieval._rerank", return_value=list(range(5))) as rerank: - from agent import retrieval - - citations, retrieved = retrieval.hybrid_retrieve("top risks?", top_k=5) - - assert len(citations) == 5 - assert retrieved == 25 - rerank.assert_called_once() - - -def test_empty_index_returns_empty(fake_index: MagicMock) -> None: - fake_index.similarity_search.return_value = {"result": {"data_array": []}} - from agent import retrieval - - citations, retrieved = retrieval.hybrid_retrieve("nothing matches", top_k=5) - assert citations == [] - assert retrieved == 0 - - -def test_company_filter_passes_through(fake_index: MagicMock) -> None: - fake_index.similarity_search.return_value = {"result": {"data_array": _candidates(3)}} - from agent import retrieval - - retrieval.hybrid_retrieve("Apple risks", top_k=5, company_filter="Apple") - kwargs = fake_index.similarity_search.call_args.kwargs - assert kwargs.get("filters") == {"company_filter_text LIKE": "%Apple%"} - - -def test_company_and_year_filters_do_not_clobber(fake_index: MagicMock) -> None: - fake_index.similarity_search.return_value = {"result": {"data_array": _candidates(3)}} - from agent import retrieval - - retrieval.hybrid_retrieve("Apple FY2024 risks", top_k=5, company_filter="Apple", fiscal_year_filter=2024) - kwargs = fake_index.similarity_search.call_args.kwargs - assert kwargs.get("filters") == {"company_filter_text LIKE": "%Apple%", "fiscal_year =": 2024} - - -def test_rerank_failure_falls_back_to_vector_order(fake_index: MagicMock) -> None: - fake_index.similarity_search.return_value = {"result": {"data_array": _candidates(8)}} - from agent import retrieval - - with patch("agent.retrieval.WorkspaceClient") as workspace: - workspace.return_value.serving_endpoints.query.side_effect = RuntimeError("missing endpoint") - citations, retrieved = retrieval.hybrid_retrieve("top risks?", top_k=5) - - assert [c.snippet for c in citations] == [f"summary {i}" for i in range(5)] - assert retrieved == 8 diff --git a/agent/tests/test_supervisor.py b/agent/tests/test_supervisor.py deleted file mode 100644 index afaf10f..0000000 --- a/agent/tests/test_supervisor.py +++ /dev/null @@ -1,74 +0,0 @@ -"""Tests for the supervisor (US3 cross-company).""" - -from __future__ import annotations - -import os -from unittest.mock import patch - -import pytest - - -os.environ.setdefault("DOCINTEL_CATALOG", "test_catalog") -os.environ.setdefault("DOCINTEL_SCHEMA", "test_schema") -os.environ.setdefault("DOCINTEL_WAREHOUSE_ID", "test_warehouse") - - -@pytest.fixture -def fake_kpis() -> list[dict]: - return [ - { - "filename": "AAPL_10K_2024.pdf", - "company_name": "Apple", - "fiscal_year": 2024, - "revenue": 391_000_000_000, - "ebitda": 130_000_000_000, - "segment_revenue": [{"name": "iPhone", "revenue": 200_000_000_000}, {"name": "Services", "revenue": 96_000_000_000}], - }, - { - "filename": 
"MSFT_10K_2024.pdf", - "company_name": "Microsoft", - "fiscal_year": 2024, - "revenue": 245_000_000_000, - "ebitda": 130_000_000_000, - "segment_revenue": [{"name": "Azure", "revenue": 105_000_000_000}], - }, - ] - - -def test_supervisor_returns_markdown_table(fake_kpis: list[dict]) -> None: - from agent import supervisor - - with patch("agent.supervisor.tools.fetch_kpis_for_companies", return_value=fake_kpis), \ - patch("agent.supervisor.retrieval.hybrid_retrieve", return_value=([], 0)): - out = supervisor.handle(question="Compare Apple and Microsoft revenue", top_k=5, conversation_id=None) - - assert out["agent_path"] == "supervisor" - assert out["grounded"] is True - assert "| Company |" in out["answer"] - assert "Apple" in out["answer"] and "Microsoft" in out["answer"] - - -def test_supervisor_handles_missing_company(fake_kpis: list[dict]) -> None: - from agent import supervisor - - with patch("agent.supervisor.tools.fetch_kpis_for_companies", return_value=fake_kpis), \ - patch("agent.supervisor.retrieval.hybrid_retrieve", return_value=([], 0)): - out = supervisor.handle(question="Compare Apple, Microsoft, and ZZZCorp", top_k=5, conversation_id=None) - - assert "ZZZCorp" in out["missing_companies"] or "ZZZCorp" in out["answer"] or any("ZZZ" in m for m in out["missing_companies"]) - - -def test_supervisor_with_no_data_falls_back_to_no_source() -> None: - from agent import supervisor - - with patch("agent.supervisor.tools.fetch_kpis_for_companies", return_value=[]): - out = supervisor.handle(question="Compare Apple and Microsoft", top_k=5, conversation_id=None) - - assert out["grounded"] is False - assert out["citations"] == [] - - -def test_section_terms_are_not_extracted_as_companies() -> None: - from agent import supervisor - - assert supervisor._extract_companies("Compare Risk and MD&A coverage between filings") == [] diff --git a/agent/tests/test_supervisor_table.py b/agent/tests/test_supervisor_table.py deleted file mode 100644 index a361e9e..0000000 --- a/agent/tests/test_supervisor_table.py +++ /dev/null @@ -1,77 +0,0 @@ -"""Regression tests for supervisor question-aware table rendering.""" - -from __future__ import annotations - -import os -from unittest.mock import patch - -import pytest - - -os.environ.setdefault("DOCINTEL_CATALOG", "test_catalog") -os.environ.setdefault("DOCINTEL_SCHEMA", "test_schema") -os.environ.setdefault("DOCINTEL_WAREHOUSE_ID", "test_warehouse") - - -@pytest.fixture -def fake_kpis() -> list[dict]: - return [ - { - "filename": "AAPL_10K_2024.pdf", - "company_name": "Apple", - "fiscal_year": 2024, - "revenue": 391_000_000_000, - "ebitda": 130_000_000_000, - "segment_revenue_raw": '[{"name":"iPhone","revenue":200000000000},{"name":"Services","revenue":96000000000}]', - "top_risks_raw": '["macroeconomic conditions","supply chain"]', - }, - { - "filename": "MSFT_10K_2024.pdf", - "company_name": "Microsoft", - "fiscal_year": 2024, - "revenue": 245_000_000_000, - "ebitda": 130_000_000_000, - "segment_revenue_raw": '[{"name":"Azure","revenue":105000000000}]', - "top_risks_raw": '["AI risk","competition"]', - }, - ] - - -def _run(question: str, fake_kpis: list[dict]) -> str: - from agent import supervisor - - with patch("agent.supervisor.tools.fetch_kpis_for_companies", return_value=fake_kpis), \ - patch("agent.supervisor.retrieval.hybrid_retrieve", return_value=([], 0)): - out = supervisor.handle(question=question, top_k=5, conversation_id=None) - return out["answer"] - - -def test_risks_question_renders_risks_column(fake_kpis: list[dict]) -> None: - answer = 
_run("Compare top 3 risk factors between Apple and Microsoft", fake_kpis) - assert "Top Risks" in answer - assert "Revenue" not in answer.split("|")[1] # header line shouldn't lead with revenue - - -def test_segments_question_renders_segments_column(fake_kpis: list[dict]) -> None: - answer = _run("Compare segment revenue between Apple and Microsoft", fake_kpis) - assert "Top Segments" in answer - assert "iPhone" in answer - assert "Azure" in answer - - -def test_ebitda_question_renders_ebitda_only(fake_kpis: list[dict]) -> None: - answer = _run("Compare EBITDA across Apple and Microsoft", fake_kpis) - # Header should include EBITDA and not Revenue - header_line = next(line for line in answer.splitlines() if "| Company" in line) - assert "EBITDA" in header_line - assert "Revenue" not in header_line - - -def test_narrative_intent_skips_table(fake_kpis: list[dict]) -> None: - """Questions about R&D / repurchases / antitrust aren't in gold_filing_kpis - columns, so the supervisor should NOT fabricate a numeric table for them. - """ - answer = _run("Compare R&D spending trends between Apple and Microsoft", fake_kpis) - assert "**Apple**" in answer or "no grounded source" in answer.lower() - # Critical: must not fabricate a Revenue column for an R&D question. - assert "| Revenue |" not in answer diff --git a/agent/tests/test_tools.py b/agent/tests/test_tools.py new file mode 100644 index 0000000..4140182 --- /dev/null +++ b/agent/tests/test_tools.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import os +from types import SimpleNamespace +from unittest.mock import MagicMock + + +os.environ.setdefault("DOCINTEL_CATALOG", "test_catalog") +os.environ.setdefault("DOCINTEL_SCHEMA", "test_schema") +os.environ.setdefault("DOCINTEL_WAREHOUSE_ID", "test_warehouse") + + +def _statement(rows: list[list[object]]) -> SimpleNamespace: + return SimpleNamespace( + result=SimpleNamespace(data_array=rows), + manifest=SimpleNamespace( + schema=SimpleNamespace( + columns=[ + SimpleNamespace(name="filename"), + SimpleNamespace(name="company_name"), + SimpleNamespace(name="fiscal_year"), + ] + ) + ), + ) + + +def test_fetch_kpis_parameterizes_filename(monkeypatch) -> None: + from agent import tools + + client = MagicMock() + client.statement_execution.execute_statement.return_value = _statement( + [["ACME_10K_2024.pdf", "ACME", 2024]] + ) + monkeypatch.setattr(tools, "_workspace", lambda: client) + + row = tools.fetch_kpis("ACME_10K_2024.pdf") + + assert row == { + "filename": "ACME_10K_2024.pdf", + "company_name": "ACME", + "fiscal_year": 2024, + } + call = client.statement_execution.execute_statement.call_args.kwargs + assert call["warehouse_id"] == "test_warehouse" + assert call["parameters"] == [{"name": "filename", "value": "ACME_10K_2024.pdf"}] + + +def test_fetch_kpis_for_companies_builds_bound_parameters(monkeypatch) -> None: + from agent import tools + + client = MagicMock() + client.statement_execution.execute_statement.return_value = _statement( + [["ACME_10K_2024.pdf", "ACME", 2024]] + ) + monkeypatch.setattr(tools, "_workspace", lambda: client) + + rows = tools.fetch_kpis_for_companies(["ACME", "BETA"]) + + assert rows[0]["company_name"] == "ACME" + call = client.statement_execution.execute_statement.call_args.kwargs + assert call["parameters"] == [ + {"name": "c0", "value": "%acme%"}, + {"name": "c1", "value": "%beta%"}, + ] + assert "gold_filing_kpis" in call["statement"] diff --git a/agent/tools.py b/agent/tools.py index 71e36c7..5385dfa 100644 --- a/agent/tools.py +++ b/agent/tools.py @@ 
-1,7 +1,9 @@
-"""UC Function tools the Analyst Agent can call for deterministic SQL aggregation.
+"""Deterministic KPI tool glue for Agent Bricks.
 
-Wraps gold_filing_kpis so cross-company comparisons (US3) don't have to go through
-retrieval + LLM math.
+The production tool is a Unity Catalog SQL function created by
+`scripts/bootstrap_agent_bricks.py`. These helpers keep the SQL access pattern
+testable and available for local validation without reintroducing a custom
+agent runtime.
 """
 
 from __future__ import annotations
@@ -11,17 +13,25 @@
 from databricks.sdk import WorkspaceClient
 
-from agent._obo import user_workspace
-
 CATALOG = os.environ["DOCINTEL_CATALOG"]
 SCHEMA = os.environ["DOCINTEL_SCHEMA"]
 WAREHOUSE_ID = os.environ["DOCINTEL_WAREHOUSE_ID"]
 
 
+def _workspace() -> WorkspaceClient:
+    """Return the current Databricks client.
+
+    Hosted production calls are expected to run under Agent Bricks / AI Gateway
+    identity enforcement. Missing user identity is a deployment error, not an
+    alternate execution mode handled here.
+    """
+    return WorkspaceClient()
+
+
 def fetch_kpis(filename: str) -> dict[str, Any] | None:
     """Return the gold_filing_kpis row for one filing, or None if not present."""
-    w = user_workspace()
+    w = _workspace()
     statement = w.statement_execution.execute_statement(
         warehouse_id=WAREHOUSE_ID,
         statement=(
@@ -44,7 +54,7 @@ def fetch_kpis_for_companies(companies: list[str]) -> list[dict[str, Any]]:
     """
     if not companies:
         return []
-    w = user_workspace()
+    w = _workspace()
     clauses = []
     parameters: list[dict[str, str]] = []
     for i, c in enumerate(companies):
diff --git a/app/README.md b/app/README.md
index cc291ef..20e328c 100644
--- a/app/README.md
+++ b/app/README.md
@@ -1,6 +1,6 @@
 # Streamlit App — runtime + local-dev guide
 
-Source for the Databricks App `doc-intel-analyst-${target}`. Streamlit chat UI over the agent endpoint, with citation chips, thumbs feedback, and Lakebase persistence.
+Source for the Databricks App `doc-intel-analyst-${target}`. Streamlit chat UI over the Agent Bricks Supervisor endpoint, with citation chips, thumbs feedback, and Lakebase persistence.
 
 ## Files
 
@@ -23,7 +23,7 @@ The first request creates the `conversation_history`, `query_logs`, and `feedbac
 ## Running locally
 
-For iteration speed you may want to run the Streamlit app on your laptop against a deployed demo workspace. **Authenticate as the App's bound service principal** so Lakebase schema init produces the same ownership as the deployed App:
+For Lakebase UI work you may run the Streamlit app on your laptop against a deployed demo workspace. Authenticate as the App's bound service principal so Lakebase schema init produces the same ownership as the deployed App:
 
 ```bash
 export DATABRICKS_HOST=https://.cloud.databricks.com
@@ -36,11 +36,13 @@ eval "$(databricks apps get doc-intel-analyst-demo \
   --output json | jq -r '.resources[] | select(.name=="docintel-lakebase") | .database | @sh " export PGHOST=\(.host) PGPORT=\(.port) PGUSER=\(.username) PGPASSWORD=\(.password) PGDATABASE=\(.database)"')"
 
-export DOCINTEL_AGENT_ENDPOINT=analyst-agent-demo
+export DOCINTEL_AGENT_ENDPOINT="$(./scripts/resolve-agent-endpoint.sh demo)"
 streamlit run app/app.py
 ```
 
-If you accidentally run with user creds (`DATABRICKS_CLIENT_ID`/`SECRET` unset), `lakebase_client.init_schema()` logs a warning identifying the mismatch — the tables get created under your user account, not the App SP, and the deployed App will lose write access.
Drop the user-owned tables and re-init under the App SP to recover:
+Local runs do not have the Databricks Apps `x-forwarded-access-token` header, so they cannot validate the Agent Bricks OBO path. Use the deployed App for agent validation.
+
+If you accidentally run Lakebase schema initialization with user creds (`DATABRICKS_CLIENT_ID`/`SECRET` unset), `lakebase_client.init_schema()` logs a warning identifying the mismatch. The tables get created under your user account, not the App SP, and the deployed App will lose write access. Drop the user-owned tables and re-init under the App SP to recover:
 
 ```sql
 -- connected as the App SP via the local-dev env above
@@ -52,10 +54,10 @@ DROP TABLE IF EXISTS conversation_history CASCADE;
 
 ## OBO (on-behalf-of) flow
 
-The app forwards each user's `x-forwarded-access-token` header to the agent serving endpoint via a `WorkspaceClient(token=...)` cache (`app.py:_user_client`). Agent-side UC SQL calls then run as the user, not the App SP — UC ACLs are honored end-to-end.
+The app forwards each user's `x-forwarded-access-token` header to the Agent Bricks Supervisor endpoint via a `WorkspaceClient(token=...)` cache (`app.py:_user_client`). Agent Bricks, Knowledge Assistant, and the UC KPI function must run under the invoking user's identity rather than under broad App SP reads.
 
-`user_api_scopes` declared in `resources/consumers/analyst.app.yml` (`serving.serving-endpoints`, `sql`, `iam.access-control:read`, `iam.current-user:read`) — required for app-level OBO to invoke the serving endpoint as the user. The agent-side Model Serving auth policy separately declares `model-serving` and `vector-search`.
+`user_api_scopes` declared in `resources/consumers/analyst.app.yml` (`serving.serving-endpoints`, `sql`, `iam.access-control:read`, `iam.current-user:read`) are required for app-level OBO. Deployment is invalid if these scopes are not granted.
 
 **Streamlit gotcha** (per the [Databricks Apps runtime docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime)): the OBO token is captured at the initial HTTP request; the connection then upgrades to WebSocket and the token never refreshes. If a user's UC permissions change mid-session, ask them to reload the page.
 
-**Local-dev caveat**: `st.context.headers` won't have `x-forwarded-access-token` when running `streamlit run` outside the Databricks Apps reverse proxy, so the OBO helper falls back to the SP client. That's fine for development — UC ACLs in demo workspaces are usually permissive — but verify against deployed demo before assuming OBO works.
+**Local-dev caveat**: `st.context.headers` won't have `x-forwarded-access-token` when running `streamlit run` outside the Databricks Apps reverse proxy. The app raises a prerequisite error instead of using service-principal reads for agent calls.
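+
+For a quick non-OBO smoke test of the endpoint plumbing, the helpers added alongside the app can be called directly from a REPL. A minimal sketch under local credentials (whatever identity `WorkspaceClient()` resolves, so UC ACLs are your own or the SP's, not a forwarded user's); the question text is illustrative:
+
+```python
+# Smoke-test sketch only: local identity, NOT the Apps OBO path.
+import os
+from databricks.sdk import WorkspaceClient
+from app.agent_bricks_client import invoke_agent_endpoint
+from app.agent_bricks_response import normalise_agent_response
+
+payload = invoke_agent_endpoint(
+    WorkspaceClient(),                      # local auth, no x-forwarded-access-token
+    os.environ["DOCINTEL_AGENT_ENDPOINT"],  # exported via resolve-agent-endpoint.sh above
+    "What were ACME's top risk factors?",   # illustrative question
+)
+print(normalise_agent_response(payload)["answer"])
+```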
diff --git a/app/agent_bricks_client.py b/app/agent_bricks_client.py new file mode 100644 index 0000000..3539779 --- /dev/null +++ b/app/agent_bricks_client.py @@ -0,0 +1,55 @@ +"""Invoke Agent Bricks serving endpoints through the OpenAI-compatible path.""" + +from __future__ import annotations + +import json +import time +import urllib.error +import urllib.request +import uuid +from typing import Any + +from databricks.sdk import WorkspaceClient + + +def invoke_agent_endpoint( + client: WorkspaceClient, + endpoint_name: str, + question: str, + *, + client_request_id: str | None = None, + max_retries: int = 3, + timeout_seconds: int = 120, +) -> dict[str, Any]: + host = client.config.host.rstrip("/") + url = f"{host}/serving-endpoints/{endpoint_name}/invocations" + body = json.dumps({"input": [{"role": "user", "content": question}]}).encode("utf-8") + # For an OBO WorkspaceClient built with Config(token=), + # authenticate() emits that user token. There is no App SP fallback here. + headers = { + "Content-Type": "application/json", + "X-Request-ID": client_request_id or str(uuid.uuid4()), + **client.config.authenticate(), + } + + last_error: Exception | None = None + for attempt in range(1, max_retries + 1): + request = urllib.request.Request(url, data=body, headers=headers, method="POST") + try: + with urllib.request.urlopen(request, timeout=timeout_seconds) as response: + raw = response.read().decode("utf-8", errors="replace") + if raw.strip(): + return json.loads(raw) + last_error = RuntimeError("empty response body") + except json.JSONDecodeError as exc: + last_error = RuntimeError(f"non-JSON response body: {exc}") + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + if exc.code not in {429, 500, 502, 503, 504}: + raise RuntimeError(f"Agent Bricks endpoint {endpoint_name} returned HTTP {exc.code}: {detail}") from exc + last_error = RuntimeError(f"retryable HTTP {exc.code}: {detail}") + + if attempt < max_retries: + time.sleep(2 * attempt) + + raise RuntimeError(f"Agent Bricks endpoint {endpoint_name} returned no usable response after {max_retries} attempts") from last_error diff --git a/app/agent_bricks_response.py b/app/agent_bricks_response.py new file mode 100644 index 0000000..be0fb35 --- /dev/null +++ b/app/agent_bricks_response.py @@ -0,0 +1,123 @@ +"""Normalize Agent Bricks serving endpoint responses for app and eval paths.""" + +from __future__ import annotations + +import uuid +import re +from collections.abc import Mapping +from typing import Any + + +# Observed during 2026-04-26 demo deploy validation: Knowledge Assistant +# citations appear as markdown footnotes in intermediate Agent Bricks messages, +# e.g. `[^p1]: ... _ACME_10K_2024.pdf_`. This is not a public structured +# citation contract. If citation chips stop showing filenames, grep live payloads +# for `[^` and `.pdf_`; extraction falls back to filename="source" for footnotes +# without a parseable filename and [] when no footnotes are present. +APP_EMPTY_TEXT = "The Agent Bricks endpoint returned a response without displayable text." 
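+# Matches an italicized filename like `_ACME_10K_2024.pdf_` inside a footnote
+# snippet; extract_citations() falls back to filename="source" when no match.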
+FILENAME_RE = re.compile(r"_([A-Za-z0-9][A-Za-z0-9_.-]*\.pdf)_") + + +def _output_text_groups(payload: Mapping[str, Any]) -> list[str]: + output = payload.get("output") + if not isinstance(output, list): + return [] + + groups: list[str] = [] + for item in output: + if not isinstance(item, Mapping): + continue + content_items = item.get("content", []) + if not isinstance(content_items, list): + continue + parts: list[str] = [] + for content in content_items: + text = content.get("text") if isinstance(content, Mapping) else None + if isinstance(text, str): + parts.append(text) + if parts: + groups.append("\n".join(parts)) + return groups + + +def extract_text(payload: Mapping[str, Any], *, empty_text: str = "") -> str: + if isinstance(payload.get("output_text"), str): + return payload["output_text"] + if isinstance(payload.get("response"), str): + return payload["response"] + + choices = payload.get("choices") + if isinstance(choices, list) and choices: + first = choices[0] + message = first.get("message", {}) if isinstance(first, Mapping) else {} + content = message.get("content") if isinstance(message, Mapping) else None + if isinstance(content, str): + return content + + if isinstance(payload.get("output"), str): + return payload["output"] + output_groups = _output_text_groups(payload) + if output_groups: + return output_groups[-1] + + return empty_text + + +def extract_citations(payload: Mapping[str, Any]) -> list[dict[str, Any]]: + citations = payload.get("citations") or payload.get("sources") or [] + if not isinstance(citations, list): + return [] + normalized: list[dict[str, Any]] = [] + for citation in citations: + if isinstance(citation, Mapping): + normalized.append(dict(citation)) + elif citation is not None: + normalized.append({"source": str(citation)}) + if normalized: + return normalized + + # Walk all output groups, not just the final answer. The Supervisor's final + # message can omit citations that the Knowledge Assistant returned earlier. 
+ for text in _output_text_groups(payload): + for line in text.splitlines(): + stripped = line.strip() + if not stripped.startswith("[^") or "]:" not in stripped: + continue + snippet = stripped.split("]:", 1)[1].strip() + filename = "" + match = FILENAME_RE.search(snippet) + if match: + filename = match.group(1) + normalized.append({ + "filename": filename or "source", + "section_label": "Knowledge Assistant citation", + "snippet": snippet, + }) + return normalized + + +def normalise_agent_response( + payload: Mapping[str, Any], + *, + conversation_id: str | None = None, + agent_path: str = "agent_bricks_supervisor", + empty_text: str = APP_EMPTY_TEXT, +) -> dict[str, Any]: + citations = extract_citations(payload) + try: + latency_ms = int(payload.get("latency_ms") or 0) + except (TypeError, ValueError): + latency_ms = 0 + + response = { + "answer": extract_text(payload, empty_text=empty_text), + "grounded": bool(citations), + "citations": citations, + "latency_ms": latency_ms, + "retrieved_count": len(citations), + "agent_path": agent_path, + "turn_id": str(uuid.uuid4()), + } + if conversation_id is not None: + response["conversation_id"] = conversation_id + return response diff --git a/app/app.py b/app/app.py index fc34bd6..6ab1f04 100644 --- a/app/app.py +++ b/app/app.py @@ -14,20 +14,16 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.config import Config +from app.agent_bricks_client import invoke_agent_endpoint +from app.agent_bricks_response import normalise_agent_response from app import lakebase_client -AGENT_ENDPOINT = os.environ["DOCINTEL_AGENT_ENDPOINT"] # set via resource binding in resources/consumers/analyst.app.yml - - -@st.cache_resource -def _sp_client() -> WorkspaceClient: - """Service-principal-scoped client for app-owned operations (Lakebase init, etc.).""" - return WorkspaceClient() +AGENT_ENDPOINT = os.environ["DOCINTEL_AGENT_ENDPOINT"] # set by resources/consumers/analyst.app.yml @st.cache_resource(ttl=3600) -def _user_client(token: str | None) -> WorkspaceClient: +def _user_client(token: str) -> WorkspaceClient: """User-scoped (OBO) client built from the request's x-forwarded-access-token. Databricks Apps OBO docs: @@ -37,10 +33,9 @@ def _user_client(token: str | None) -> WorkspaceClient: token never refreshes. Long-lived sessions should reload the page after permission changes. - `token=None` → SP fallback (local dev, or unauthenticated requests). + Missing tokens are a deployment prerequisite failure. Production must run + through Databricks Apps user-token passthrough. """ - if not token: - return _sp_client() return WorkspaceClient(config=Config( host=os.environ["DATABRICKS_HOST"], token=token, @@ -48,7 +43,13 @@ def _user_client(token: str | None) -> WorkspaceClient: def _agent_client() -> WorkspaceClient: - return _user_client(st.context.headers.get("x-forwarded-access-token")) + token = st.context.headers.get("x-forwarded-access-token") + if not token: + raise RuntimeError( + "Databricks Apps user-token passthrough is required; no " + "x-forwarded-access-token header was present." 
+ ) + return _user_client(token) def _user_email() -> str: @@ -57,15 +58,11 @@ def _user_email() -> str: def _query_agent(question: str, conversation_id: str) -> dict: try: - out = _agent_client().serving_endpoints.query( - name=AGENT_ENDPOINT, - inputs=[{"question": question, "conversation_id": conversation_id, "top_k": 5}], - ) - raw = out.predictions if hasattr(out, "predictions") else out["predictions"] - return raw[0] if isinstance(raw, list) else raw + payload = invoke_agent_endpoint(_agent_client(), AGENT_ENDPOINT, question, client_request_id=conversation_id) + return normalise_agent_response(payload, conversation_id=conversation_id) except Exception as exc: return { - "answer": "The analyst agent is unavailable right now. Please try again.", + "answer": "The Agent Bricks supervisor endpoint is unavailable right now. Please try again.", "grounded": False, "citations": [], "latency_ms": 0, @@ -94,7 +91,11 @@ def _render_citations(citations: list[dict]) -> None: cols = st.columns(min(len(citations), 4)) for i, c in enumerate(citations[:4]): with cols[i]: - st.markdown(f"**`{c['filename']}`**\n\n{c['section_label']} — score {c['score']:.2f}") + filename = c.get("filename") or c.get("doc_uri") or c.get("source") or "source" + section = c.get("section_label") or c.get("title") or c.get("name") or "citation" + score = c.get("score") + suffix = f" - score {score:.2f}" if isinstance(score, (float, int)) else "" + st.markdown(f"**`{filename}`**\n\n{section}{suffix}") if c.get("snippet"): st.caption(c["snippet"]) diff --git a/app/app.yaml b/app/app.yaml index 75b3881..3f5537c 100644 --- a/app/app.yaml +++ b/app/app.yaml @@ -21,9 +21,3 @@ env: value: "false" - name: STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION value: "false" - - # Resolved at runtime from the resource bindings declared in - # resources/consumers/analyst.app.yml. The bindings are target-aware - # (analyst-agent-${bundle.target}) so demo and prod stay isolated. 
- - name: DOCINTEL_AGENT_ENDPOINT - valueFrom: agent-endpoint diff --git a/app/requirements.txt b/app/requirements.txt index c9b3d00..dc8f600 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1,3 +1,3 @@ streamlit>=1.32 -databricks-sdk>=0.40 +databricks-sdk>=0.105,<1 psycopg[binary]>=3.1 diff --git a/databricks.yml b/databricks.yml index af0bb79..1ee6c38 100644 --- a/databricks.yml +++ b/databricks.yml @@ -27,24 +27,18 @@ variables: embedding_model_endpoint_name: description: Mosaic AI embedding endpoint used by Vector Search default: databricks-bge-large-en - foundation_model_endpoint_name: - description: Foundation model endpoint used by the agent for generation - default: databricks-meta-llama-3-3-70b-instruct - rerank_model_endpoint_name: - description: Mosaic AI re-ranker endpoint - default: databricks-bge-rerank-v2 quality_threshold: description: Minimum quality_score (0-30) required for embed_eligible default: 22 - top_k: - description: Citations returned after re-rank - default: 5 max_pdf_bytes: description: Reject filings larger than this many bytes (50 MB) default: 52428800 analyst_group: description: UC group granted SELECT/USE on the catalog/schema default: account users + agent_endpoint_name: + description: Agent Bricks Supervisor serving endpoint name resolved by bootstrap_agent_bricks.py + default: UNSET_AGENT_BRICKS_ENDPOINT targets: demo: diff --git a/docs/design.md b/docs/design.md index e818682..32b1220 100644 --- a/docs/design.md +++ b/docs/design.md @@ -21,9 +21,11 @@ This document covers the *why*, the architecture, and the build workflow behind ## Why this exists -Databricks shipped a lot of new generative-AI surface area in 2025–2026: `ai_parse_document`, Mosaic AI Vector Search, the Agent Framework, AI Gateway, Lakebase, Databricks Apps. Tutorials show each piece in isolation; nobody shows them wired together with **eval gates, governance, and reproducible deploys** the way you'd actually ship to analysts. +Databricks shipped a lot of new generative-AI surface area in 2025–2026: Document Intelligence (`ai_parse_document`, `ai_classify`, `ai_extract`), Agent Bricks, AI Gateway, Lakebase, and Databricks Apps. The two source articles for this reference are Databricks' Document Intelligence launch article ("Why Your Agents Can't Read Enterprise Documents") and the Agent Bricks platform article. The reference exists to demonstrate those patterns end to end: parse messy enterprise PDFs into a governed document data layer, then build a governed agent on that enriched layer through Agent Bricks. -This repo is that worked example. Drop a PDF into a governed UC volume; ten minutes later, an analyst can ask cited questions in plain English with end-to-end audit. The whole stack is described declaratively as one **Databricks Asset Bundle (DAB)** plus a small bootstrap script. DAB manages catalog/schema/volume, pipeline, jobs, the Vector Search **endpoint**, the Lakebase instance, the serving endpoint, the monitor, the app, and the dashboard; the Vector Search **index** itself is created and synced by `jobs/index_refresh/sync_index.py` (DAB doesn't yet manage indexes as a resource type), and the agent model version is registered by `agent/log_and_register.py`. The bootstrap script orchestrates them in the right order. +This repo is that worked example. Drop a PDF into a governed UC volume; ten minutes later, an analyst can ask cited questions in plain English with end-to-end audit. 
The desired target architecture is **Agent Bricks-first**: Document Intelligence prepares the governed source of truth; Knowledge Assistant handles cited document Q&A; Supervisor Agent coordinates document Q&A with structured KPI tools; AI Gateway, Unity Catalog, OBO, Lakebase, and CLEARS provide the governance and operating layer. + +The earlier custom `mlflow.pyfunc` agent path diverged from that target by re-introducing custom serving lifecycle, auth-policy ordering, retrieval, and supervisor code that Agent Bricks is meant to absorb. The production path now uses Agent Bricks bootstrap instead of that custom runtime. It also demonstrates a development workflow: **Spec-Kit** for spec-driven design, **Claude Code** with Databricks skill bundles for AI-assisted implementation, six **non-negotiable constitution principles** that gate every plan. See [How it's built](#how-its-built--three-pillars). @@ -43,9 +45,9 @@ It also demonstrates a development workflow: **Spec-Kit** for spec-driven design BETA_10K.pdf │ (raw bytes, │ │ filings (parsed │ │ sections (one │ GAMMA_10K.pdf │ filename, │ │ VARIANT — │ │ row per parsed │ │ ingested_at) │ │ ai_parse_ │ │ $.sections[*]; │ - │ │ │ document) │ │ fallback to │ - │ >50MB rejects: │ │ │ │ full_document │ - │ bronze_filings │ │ Status: ok / │ │ if absent) │ + │ │ │ document) │ │ uses full_ │ + │ >50MB rejects: │ │ │ │ document when │ + │ bronze_filings │ │ Status: ok / │ │ sections absent)│ │ _rejected │ │ partial / error │ │ │ └─────────────────┘ └─────────────────┘ │ gold_filing_kpis │ 01_bronze.sql 02_silver_parse │ (typed columns: │ @@ -91,45 +93,40 @@ It also demonstrates a development workflow: **Spec-Kit** for spec-driven design "Quality before retrieval." ``` -**Ownership note**: DAB manages the Vector Search **endpoint** (`resources/foundation/filings_index.yml`) and the index-refresh **job** (`resources/consumers/index_refresh.job.yml`). The **index** itself isn't yet a DAB-managed resource type as of CLI 0.298 — `jobs/index_refresh/sync_index.py` creates the Delta-Sync index on first run and triggers a sync on subsequent runs. The endpoint lives in foundation so first-deploy bootstrap can materialize the index before `agent/log_and_register.py` logs the model auth policy that references it. +**Ownership note**: DAB manages the Vector Search **endpoint** (`resources/foundation/filings_index.yml`) and the index-refresh **job** (`resources/consumers/index_refresh.job.yml`). The **index** itself isn't yet a DAB-managed resource type as of CLI 0.298 — `jobs/index_refresh/sync_index.py` creates the Delta-Sync index on first run and triggers a sync on subsequent runs. The endpoint lives in foundation so first-deploy bootstrap can materialize the index before `scripts/bootstrap_agent_bricks.py` attaches it to Knowledge Assistant. -### Agent has two paths, one endpoint +### Agent Bricks target runtime ``` User question │ ▼ - ┌────────────────────────────────────────────┐ - │ AnalystAgent.predict() │ - │ ───────────────────── │ - │ contains "compare" / "vs" / │ - │ "between" + ≥2 company names? │ - └────────────┬─────────────────┬─────────────┘ - │ no │ yes - ▼ ▼ - ┌──────────────────────┐ ┌──────────────────────┐ - │ Single-filing path │ │ Supervisor path │ - │ │ │ │ - │ 1. Hybrid search │ │ For each company: │ - │ (keyword + vec) │ │ ▸ run analyst path │ - │ 2. Re-rank → top 5 │ │ ▸ pull KPIs from │ - │ 3. LLM generates │ │ gold_filing_kpis │ - │ answer w/ [1] [2] │ │ Format markdown │ - │ citations │ │ table with cites. 
│ - └──────────────────────┘ └──────────────────────┘ - │ │ - └────────┬────────┘ - ▼ - ┌──────────────────────┐ - │ Response JSON: │ - │ answer │ - │ citations[] │ - │ grounded: bool │ - │ latency_ms │ - └──────────────────────┘ + ┌─────────────────────────────────────────────┐ + │ Agent Bricks Supervisor Agent │ + │ - owns routing and orchestration │ + │ - runs under UC / AI Gateway governance │ + └────────────┬─────────────────────┬──────────┘ + │ │ + ▼ ▼ + ┌────────────────────────┐ ┌────────────────────────┐ + │ Knowledge Assistant │ │ Structured KPI tool │ + │ - cited document Q&A │ │ - reads Gold KPI table │ + │ - grounded in parsed │ │ - deterministic tables │ + │ document layer / VS │ │ for comparisons │ + └────────────────────────┘ └────────────────────────┘ + │ │ + └──────────┬──────────┘ + ▼ + ┌──────────────────────┐ + │ Response JSON / App │ + │ citations, feedback │ + │ latency, audit │ + └──────────────────────┘ ``` -The agent is an `mlflow.pyfunc` model registered in Unity Catalog and served behind an **AI Gateway** (rate limiting per-user, usage tracking, inference-table audit). Identity passthrough is implemented at the *App layer* when the workspace has Databricks Apps user-token passthrough enabled: the Streamlit app extracts the user's `x-forwarded-access-token` header and constructs a user-scoped `WorkspaceClient`. The served model is OBO-ready via MLflow `auth_policy` and Model Serving user credentials. If app-level passthrough is not enabled, the app falls back to service-principal auth and the repo must be treated as a reference/demo deployment, not a production row-level-security deployment. See [`../SECURITY.md`](../SECURITY.md) and [`../app/README.md`](../app/README.md). +Knowledge Assistant is the default single-filing Q&A path because the Agent Bricks article positions the hard part as governed context, identity, and observability rather than hand-building the agent loop. Supervisor Agent is the default cross-company orchestration path. Custom code is allowed only where it is business logic around Agent Bricks, such as a deterministic KPI table tool or the App-specific feedback UI. It must not replace Knowledge Assistant, Supervisor Agent, Agent Bricks serving, or Agent Bricks governance. + +**Removed divergence**: the custom `agent/analyst_agent.py`, `agent/retrieval.py`, `agent/supervisor.py`, `agent/log_and_register.py`, and `resources/consumers/agent.serving.yml` path has been removed. `scripts/bootstrap_agent_bricks.py` is now the production bootstrap for Knowledge Assistant, the UC KPI function, and Supervisor Agent configuration. 
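+
+As a concrete illustration of that boundary, the KPI glue in `agent/tools.py` stays a thin, parameterized SQL helper around `gold_filing_kpis`. A minimal local-validation sketch; it assumes `DOCINTEL_CATALOG`, `DOCINTEL_SCHEMA`, and `DOCINTEL_WAREHOUSE_ID` are exported, and production traffic goes through the UC SQL function the bootstrap registers, not this Python path:
+
+```python
+# Local validation sketch only; the served tool is the UC SQL function
+# created by scripts/bootstrap_agent_bricks.py.
+from agent.tools import fetch_kpis_for_companies
+
+# Company names bind as SQL parameters (c0, c1, ...), never string-formatted.
+for row in fetch_kpis_for_companies(["Apple", "Microsoft"]):
+    print(row["company_name"], row["fiscal_year"], row.get("revenue"))
+```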
### Runtime stack @@ -149,26 +146,27 @@ The agent is an `mlflow.pyfunc` model registered in Unity Catalog and served beh │ query │ feedback writes ▼ ▼ ┌────────────────────────┐ ┌────────────────────────┐ - │ Model Serving endpoint │ │ Lakebase Postgres │ - │ "analyst-agent-demo" │ │ ───────────────── │ - │ (CPU, scales to 0) │ │ conversation_history │ + │ Agent Bricks endpoint │ │ Lakebase Postgres │ + │ Knowledge Assistant + │ │ ───────────────── │ + │ Supervisor Agent │ │ conversation_history │ │ │ │ query_logs │ │ + AI Gateway: │ │ feedback │ - │ rate limit │ │ │ - │ (per-user key) │ │ (Postgres for tiny │ - │ inference-table │ │ per-turn writes — │ - │ audit │ │ Delta isn't great │ - │ usage tracking │ │ at row-by-row) │ + │ OBO, permissions, │ │ │ + │ audit, rate limits, │ │ (Postgres for tiny │ + │ guardrails │ │ per-turn writes — │ + │ │ │ Delta isn't great │ + │ │ │ at row-by-row) │ └────────────────────────┘ └────────────────────────┘ - OBO (user identity end-to-end, when enabled): + OBO (user identity end-to-end, mandatory): ────────────────────────────── - App reads `x-forwarded-access-token` from the request, builds - `WorkspaceClient(token=...)`, calls the serving endpoint with the - user's identity. The agent-side MLflow auth policy and Model Serving - OBO credentials let downstream calls run as the user. If the app-side - feature is unavailable, the bootstrap script prints an explicit warning - and the deployment remains reference/demo only. + App reads `x-forwarded-access-token` from the request and invokes the + Agent Bricks endpoint with the user's identity. AI Gateway and Unity + Catalog enforce identity, permissions, audit, and routing across the + agent, model, tools, and data. User token passthrough is a hard + prerequisite for production. If the workspace cannot provide end-to-end + OBO, deployment must fail rather than silently falling back to a service + principal identity. ``` **Why Postgres for state?** Delta tables are great for analytics but bad at "insert one tiny row per chat turn at high frequency." Lakebase is Databricks's managed Postgres — same governance, right tool for the job. @@ -211,14 +209,14 @@ This repo combines three things: Spec-Kit for spec-driven design, Databricks Ass | II | **Parse once, extract many** | `ai_parse_document` runs once at Silver → VARIANT; everything downstream reads the parsed output | | III | **Declarative over imperative** | SDP SQL pipelines, Lakeflow Jobs, DAB resources — no production notebooks | | IV | **Quality before retrieval** | 5-dim rubric scores every section; only ≥22/30 reach the index. Embed `summary`, not raw text | -| V | **Eval-gated agents** | MLflow CLEARS scores must clear thresholds before any deploy is considered complete | +| V | **Eval-gated Agent Bricks** | CLEARS scores must clear thresholds before any deploy is considered complete | | VI | **Reproducible deploys** | `databricks bundle deploy -t ` recreates the entire stack; `demo` and `prod` parity enforced | When you read `specs/001-doc-intel-10k/plan.md` you'll see a "Constitution Check" gate that maps each design decision back to the principle it satisfies. When you read `specs/001-doc-intel-10k/tasks.md` you'll see how each task derives from the plan, and how user-stories (P1, P2, P3) are independently demoable. ### Pillar 2 — Databricks Asset Bundles + the Claude Code skill suite -[**Databricks Asset Bundles**](https://docs.databricks.com/aws/en/dev-tools/bundles/) (DABs) describe most of the workspace state as YAML. 
One root `databricks.yml` declares variables and targets (`demo`, `prod`); `resources/**/*.yml` declares each resource (pipeline, jobs, Vector Search endpoint, index-refresh job, serving endpoint, app, monitor, dashboard, Lakebase instance + catalog). `databricks bundle deploy -t demo` reconciles workspace state to YAML. The two non-DAB-managed pieces — the Vector Search **index** itself and the registered **model version** — are produced at runtime by `jobs/index_refresh/sync_index.py` and `agent/log_and_register.py` respectively, which the bootstrap script orchestrates.
+[**Databricks Asset Bundles**](https://docs.databricks.com/aws/en/dev-tools/bundles/) (DABs) describe most of the workspace state as YAML. One root `databricks.yml` declares variables and targets (`demo`, `prod`); `resources/**/*.yml` declares each resource (pipeline, jobs, Vector Search endpoint, index-refresh job, Agent Bricks endpoint/configuration, app, monitor, dashboard, Lakebase instance + catalog). `databricks bundle deploy -t demo` reconciles workspace state to YAML. The Vector Search **index** is still created and synced by `jobs/index_refresh/sync_index.py` until DAB supports index resources directly.
 
 This repo was built with Databricks-specific Claude Code skill bundles. Those bundles are distributed by Databricks via the CLI / Claude Code plugin channel and **are not vendored in this open-source tree** — install them locally if you have access, or reference the canonical Databricks docs (mapping in [`../CONTRIBUTING.md`](../CONTRIBUTING.md)).
 
@@ -230,7 +228,7 @@
 | **databricks-jobs** | Lakeflow Jobs with retries, schedules, table-update / file-arrival triggers | [docs](https://docs.databricks.com/aws/en/jobs/) |
 | **databricks-apps** | Databricks Apps (Streamlit), App resource bindings | [docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/) |
 | **databricks-lakebase** | Lakebase Postgres instances, branches, computes, endpoint provisioning | [docs](https://docs.databricks.com/aws/en/oltp/) |
-| **databricks-model-serving** | Model Serving endpoints, AI Gateway, served entities, scaling config | [docs](https://docs.databricks.com/aws/en/machine-learning/model-serving/) |
+| **databricks-agent-bricks** | Knowledge Assistant, Supervisor Agent, UC tools, endpoint lifecycle | [docs](https://docs.databricks.com/aws/en/generative-ai/agent-bricks/knowledge-assistant) |
 
 Skills are loaded by Claude Code on demand. When you ask Claude to "wire up Vector Search," it should read the Databricks pipeline/model-serving guidance *before* writing YAML, so the output reflects current Databricks API shapes — not stale training data.
 
@@ -261,16 +259,16 @@ DABs deploy *everything in one shot*. But our resources have a chicken-and-egg p
   │  ▸ Pipeline    ────┐                           │
   │  ▸ Tables      ────┼──── all need each other   │
   │  ▸ Vector idx   ───┤                           │
-  │  ▸ Model      ───┤     Monitor wants the       │
-  │  ▸ Endpoint   ────┤     KPI table to exist     │
+  │  ▸ Agent Bricks ──┤     Monitor wants the      │
+  │  ▸ Endpoint   ────┤     KPI table to exist     │
   │  ▸ App       ───┤     BEFORE it can attach     │
   │  ▸ Monitor    ────┘                            │
   │  ▸ Lakebase   ────                             │
   └────────────────────────────────────────────────┘
 
-    Endpoint needs a registered model version.
-    Model version needs the model logged.
-    Model logging needs the agent code.
+    App needs the Agent Bricks Supervisor endpoint.
+    Supervisor needs Knowledge Assistant + UC function tools.
+    Knowledge Assistant needs the Vector Search index.
     Monitor needs the table populated.
Table needs the pipeline to run. @@ -289,18 +287,17 @@ The fix is a **staged deploy** orchestrated by `scripts/bootstrap-demo.sh`. Reso │ └── lakebase_instance.yml │ └── consumers/ ← need foundation to be RUNNING and producing data - ├── agent.serving.yml (needs registered model version) ├── kpi_drift.yml (needs gold_filing_kpis table) ├── index_refresh.job.yml (needs source table) - ├── analyst.app.yml (needs Lakebase + agent endpoint) + ├── analyst.app.yml (needs Lakebase + generated agent endpoint) ├── usage.dashboard.yml └── lakebase_catalog.yml (needs instance AVAILABLE) ``` -**The bootstrap script auto-detects which mode to run** by checking whether the agent serving endpoint already has a populated config: +**The bootstrap script auto-detects which mode to run** by checking whether the Agent Bricks Supervisor exists and has generated a serving endpoint: ``` - does analyst-agent-${target} have served entities? + does doc-intel-supervisor-${target} have endpoint_name? │ no ◀───────┴───────▶ yes │ │ @@ -315,9 +312,9 @@ The fix is a **staged deploy** orchestrated by `scripts/bootstrap-demo.sh`. Reso │ 2. bundle deploy │ │ 2. refresh data: │ │ (foundation) │ │ upload, run │ │ 3. produce data: │ │ pipeline, │ - │ upload, run, │ │ register new │ - │ register │ │ model version │ - │ model │ │ + repoint │ + │ upload, run, │ │ sync index, │ + │ sync index, │ │ update Agent │ + │ Agent Bricks │ │ Bricks │ │ 4. wait Lakebase │ │ serving in- │ │ AVAILABLE │ │ place │ │ 5. restore yamls │ │ │ @@ -335,11 +332,11 @@ The fix is a **staged deploy** orchestrated by `scripts/bootstrap-demo.sh`. Reso └──────────────────────────┘ ``` -**Why two modes?** DAB tracks resource state; if you run the temp-rename trick against an *existing* deployment, DAB sees the consumer YAMLs as removed and plans to **delete** the serving endpoint, app, monitor, etc. Safe-ish on a fresh workspace; destructive in steady-state. The script detects mode and does the right thing. +**Why two modes?** DAB tracks resource state; if you run the temp-rename trick against an existing deployment, DAB sees the consumer YAMLs as removed and plans to delete the app, monitor, dashboard, etc. Appropriate on a fresh workspace; destructive in steady-state. The script detects mode and does the right thing. -CI (`.github/workflows/deploy.yml`) assumes steady-state — the first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-demo.sh`. After that, every push to `main` runs the steady-state path: full `bundle deploy` → refresh data → repoint serving endpoint → grants → CLEARS gate. +CI (`.github/workflows/deploy.yml`) assumes steady-state — the first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-demo.sh`. After that, every push to `main` runs the steady-state path: full `bundle deploy` → refresh data → sync index → update Agent Bricks → grants → CLEARS gate. -For the per-step procedure and known failure modes, see [`runbook.md` § Known deploy ordering gaps](./runbook.md#known-deploy-ordering-gaps-discovered-in-the-2026-04-24-smoke-test). +For the per-step procedure and known failure modes, see [`runbook.md` § Known deploy ordering gaps](./runbook.md#known-deploy-ordering-gaps). --- @@ -347,9 +344,9 @@ For the per-step procedure and known failure modes, see [`runbook.md` § Known d - **Wiring `ai_parse_document` into Lakeflow SDP** — pattern for streaming-tables + `STREAM(...)` views + `APPLY CHANGES INTO` keyed on filename. 
- **Scoring document quality before retrieval** — five 0–6 dimensions in SQL, threshold filter on the index source. -- **Logging a Mosaic AI agent to UC** — `mlflow.pyfunc` with both inputs *and* outputs in the signature (UC requirement), `AnyType` for variable-shape fields, `auth_policy` + `resources` for OBO. -- **Grounding an agent with citations** — hybrid Vector Search → re-rank → top-k → LLM with explicit "cite sources [1] [2]" prompt. +- **Building on Agent Bricks instead of custom agent loops** — Knowledge Assistant for cited document Q&A, Supervisor Agent for orchestration, deterministic KPI tool glue for structured comparisons. +- **Grounding an agent with citations** — Document Intelligence output and the governed Vector Search / Knowledge Assistant source provide the citation-bearing context. - **Handling DAB deploy ordering** — chicken-egg dependencies between heterogeneous resources, solved with a 5-step bootstrap rather than `depends_on` (which DAB doesn't reliably honor across resource types). - **Gating deploys on MLflow eval** — `mlflow.evaluate(model_type="databricks-agent")` with documented metric keys, per-axis thresholds, exit-code gate in CI. -- **End-to-end OBO** — `ModelServingUserCredentials` from `databricks_ai_bridge`, `CredentialStrategy.MODEL_SERVING_USER_CREDENTIALS` for Vector Search, MLflow `auth_policy` with `model-serving` + `vector-search` user scopes, App-side `user_api_scopes` declaration. +- **End-to-end OBO** — Databricks Apps user-token passthrough, Agent Bricks / AI Gateway identity enforcement, UC permissions, and audit verification are production prerequisites. - **Spec-Kit + Claude Code + Databricks skills composing** — every artifact in `specs/` and `pipelines/` and `agent/` was generated through that loop. diff --git a/docs/runbook.md b/docs/runbook.md index 327ab6d..fd401ce 100644 --- a/docs/runbook.md +++ b/docs/runbook.md @@ -33,19 +33,22 @@ If a filing scores below threshold: - It is retained in `gold_filing_sections` and `gold_filing_kpis` for audit (FR-005, SC-006). - It is **excluded** from `gold_filing_sections_indexable` and therefore from Vector Search. -## Roll an agent endpoint version +## Update Agent Bricks configuration -The Model Serving endpoint follows the UC Model Alias `@demo` (or `@prod`), not a pinned version. To roll forward: +Agent Bricks resources are managed by `scripts/bootstrap_agent_bricks.py`. Run it after changes to Knowledge Assistant instructions, Supervisor instructions, or the KPI tool function: ```bash -DOCINTEL_CATALOG= DOCINTEL_SCHEMA= python agent/log_and_register.py --target demo +DOCINTEL_CATALOG= \ +DOCINTEL_SCHEMA= \ +DOCINTEL_WAREHOUSE_ID= \ +python scripts/bootstrap_agent_bricks.py --target demo ``` -This registers a new version and reassigns `@demo`. The serving endpoint will pick the new version on its next traffic refresh (a few minutes). To roll back, use the UC Model Registry UI to re-point the alias to the prior version. +This creates or updates the Knowledge Assistant, syncs the Vector Search knowledge source, creates or updates the UC SQL KPI function, and wires both into the Supervisor Agent endpoint. ## Inspect CLEARS metrics in MLflow -CI runs `python evals/clears_eval.py --endpoint analyst-agent-demo` after each `demo` deploy. Look for the experiment `/Shared/docintel-clears-`; each run logs: +CI resolves the generated Agent Bricks Supervisor serving endpoint, then runs `python evals/clears_eval.py --endpoint "$AGENT_ENDPOINT_NAME"` after each `demo` deploy. 
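For local debugging, the per-question querying inside the gate can be reproduced with the same app helpers this PR introduces; a minimal sketch, assuming the new `app.agent_bricks_client` and `app.agent_bricks_response` modules and using the demo target's default requested endpoint name (CI resolves the actual generated name at run time):

```python
# Mirrors _query() in evals/clears_eval.py; not an independent API.
import time

from databricks.sdk import WorkspaceClient
from app.agent_bricks_client import invoke_agent_endpoint
from app.agent_bricks_response import normalise_agent_response

w = WorkspaceClient()
started = time.monotonic()
payload = invoke_agent_endpoint(
    w,
    "analyst-agent-demo",  # default requested supervisor endpoint for -t demo
    "What was ACMEs revenue in fiscal 2024?",
    max_retries=2,
    timeout_seconds=90,
)
response = normalise_agent_response(payload, empty_text="")
print(response.get("answer", ""), int((time.monotonic() - started) * 1000), "ms")
```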
Look for the experiment `/Shared/docintel-clears-`; each run logs: - Per-axis metrics: `correctness`, `adherence`, `relevance`, `execution`, `safety`, `latency_p95_ms` - Per-category slices: `p2_correctness`, `p3_correctness` @@ -63,30 +66,26 @@ Failures are logged as a JSON list under the run tag `failures`. The script exit | Agent answers ignore user UC permissions | OBO scopes wiped by `bundle run` (documented destructive-update behavior — see [Databricks Apps deploy docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/deploy)) | Re-apply: `databricks apps update doc-intel-analyst-demo --user-api-scopes serving.serving-endpoints,sql,iam.access-control:read,iam.current-user:read` | | Streamlit user sees stale UC permissions | OBO token captured at WebSocket open; never refreshes ([Databricks Apps runtime docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime)) | Reload the page after permission changes | | Lakebase tables not writable from deployed App | Local-dev `streamlit run` initialised schema under user identity, not App SP | Connect as App SP and `DROP TABLE feedback, query_logs, conversation_history`; next App run re-creates them under SP. See `app/README.md` | -| CLEARS Latency axis fails | Re-rank window too large | Reduce candidate window in `agent/retrieval.py` from 25 to 15 | +| CLEARS Latency axis fails | Agent Bricks orchestration or Knowledge Assistant source is too broad | Narrow the Knowledge Assistant source, tune Supervisor instructions, or reduce structured-tool fan-out | | App errors connecting to Lakebase | Database resource binding missing Postgres env vars | Check the `docintel-lakebase` resource binding and `PGHOST`/`PGPORT`/`PGUSER`/`PGPASSWORD`/`PGDATABASE` in the App runtime | -## Enabling end-to-end OBO +## Verifying end-to-end OBO -If your workspace lacks the "Databricks Apps - user token passthrough" feature, OBO end-to-end is operationally disabled until an admin turns it on. The agent code is OBO-ready (`agent/_obo.user_workspace`, `VectorSearchClient(credential_strategy=CredentialStrategy.MODEL_SERVING_USER_CREDENTIALS)` in `retrieval.py`, `auth_policy` declared in `log_and_register.py`), and the app forwards `x-forwarded-access-token` via `app/app.py:_user_client`. **What's missing is the App-side scope declaration**, which the workspace rejects until the feature is enabled. +Databricks Apps user-token passthrough, Agent Bricks OBO, AI Gateway identity enforcement, and UC grants are production prerequisites. Bootstrap must fail if any required scope or workspace feature is missing. -**Bootstrap prints a `⚠ APP-LEVEL OBO IS OPERATIONALLY DISABLED` banner** whenever the `user_api_scopes` block in `resources/consumers/analyst.app.yml` is commented out, so this state is visible in every bring-up log. - -To enable OBO end-to-end: +To verify OBO end-to-end: 1. **Workspace admin** enables the "Databricks Apps - user token passthrough" feature in workspace settings. -2. Uncomment the `user_api_scopes` block in `resources/consumers/analyst.app.yml`. Required scopes for the analyst app's call chain: +2. Confirm the `user_api_scopes` block in `resources/consumers/analyst.app.yml` is present. 
Required scopes for the analyst app's call chain: ```yaml user_api_scopes: - - serving.serving-endpoints # invoke analyst-agent endpoint as user - - sql # agent's tools.py runs UC SQL + - serving.serving-endpoints # invoke Agent Bricks endpoint as user + - sql # structured KPI tool runs UC SQL - iam.access-control:read # default - iam.current-user:read # default ``` 3. Redeploy: `databricks bundle deploy -t demo && databricks bundle run -t demo analyst_app`. -4. Verify: bootstrap step 5's scope check now asserts (rather than skipping). Visit the deployed app, ask a question, confirm in audit logs that the agent's UC SQL runs under the user's identity (not the app SP). - -The agent-side `auth_policy` declared in `log_and_register.py` uses the **agent-side** scopes (`model-serving`, `vector-search`) per the Model Serving OBO docs — these are different from the App-side scopes above and need no workspace feature flag; they just plumb the per-request user token through the served pyfunc. +4. Verify: bootstrap scope checks assert required scopes. Visit the deployed app, ask a question, and confirm in audit logs that Agent Bricks, Knowledge Assistant, and structured KPI SQL calls run under the invoking user's identity. ## CLEARS thresholds @@ -117,32 +116,32 @@ P3 correctness: Latency p95: ``` -## Known deploy ordering gaps (discovered in the 2026-04-24 smoke test) +## Known deploy ordering gaps The bundle has three chicken-egg dependencies that a single `bundle deploy` cannot resolve on a fresh workspace. Each needs a phase-2 step after a prior side effect: -1. **Model Serving endpoint references a concrete agent model version** - - `resources/consumers/agent.serving.yml` must contain a numeric placeholder - because DAB serving config may reject UC alias syntax. - - CI registers a fresh model version and then calls - `agent/log_and_register.py --target demo --serving-endpoint analyst-agent-demo` - to update the served entity to the new version. - - **Fix**: for local deploys, run the same registration command after bundle - deploy, or bootstrap the endpoint once and let the script advance it. +1. **Databricks App binds to an Agent Bricks endpoint** + - Agent Bricks generates concrete Knowledge Assistant and Supervisor serving + endpoint names. + - `scripts/bootstrap_agent_bricks.py` returns the generated Supervisor + endpoint, and `resources/consumers/analyst.app.yml` injects it into + `DOCINTEL_AGENT_ENDPOINT` via the `agent_endpoint_name` bundle variable. + - **Fix**: bootstrap creates data and Agent Bricks resources before the full + consumer deploy. 2. **Lakehouse Monitor references `gold_filing_kpis` which the pipeline must create first** - `resources/consumers/kpi_drift.yml` attaches to a table that doesn't exist until the pipeline has run at least once. - - **Fix**: move the monitor into a separate `bundle deploy --include monitors` - step run after the first pipeline trigger, or comment out the monitor on - fresh deploys and add it after the first ingest. + - **Fix**: stage the first deploy so the pipeline runs before consumers are + reconciled. 3. **Lakebase `database_catalog` and `App` race the `database_instance` provisioning** - The catalog and app attach to the instance before the instance has finished coming up. Re-running `bundle deploy` immediately after the first attempt usually succeeds since the instance is then ready. - - **Fix**: `bundle deploy -t demo` twice on first stand-up, or add a wait task. 
+ - **Fix**: bootstrap waits for Lakebase to reach `AVAILABLE` before the full + consumer deploy. A clean fresh-workspace bring-up is a single command: @@ -158,28 +157,27 @@ The script implements a **staged deploy**: resources are split into data). Stage 1 temporarily renames consumer YAMLs to `*.yml.skip` so the bundle's `resources/**/*.yml` glob excludes them — foundation deploys cleanly. Stage 2 brings up data (sample upload, pipeline run, VS index -materialization, model register, Lakebase ready) and then runs full `bundle deploy`, with all +materialization, Agent Bricks bootstrap, Lakebase ready) and then runs full `bundle deploy`, with all consumer dependencies satisfied. The previous "errors tolerated on first deploy" workaround is gone — both deploys succeed cleanly. Six-step flow: -1. **Orphan detection** — delete a malformed serving endpoint with no - served entities (leftover from a prior partial run); fail loudly if +1. **Environment conflict checks** — fail loudly if the configured Lakebase name is in `DELETING` state (soft-delete retention conflict — bump the suffix and retry). 2. **Foundation deploy** — `resources/consumers/*.yml` renamed to `*.yml.skip`; `bundle deploy` only touches catalog/schema/volume, pipeline, retention job, Lakebase instance, Vector Search endpoint. 3. **Produce data** — upload synthetic samples, run pipeline, wait for - `gold_filing_kpis`, materialize the Vector Search index, register - agent model (no `--serving-endpoint`, endpoint doesn't exist yet), + `gold_filing_kpis`, materialize the Vector Search index, bootstrap + Agent Bricks Knowledge Assistant + Supervisor Agent, wait for Lakebase to reach `AVAILABLE`. 4. **Consumer deploy** — full `bundle deploy` (foundation idempotent; consumers create cleanly because all deps are live). 5. **App run + UC grants chain** — `bundle run analyst_app`, `USE_CATALOG → USE_SCHEMA → SELECT/EXECUTE` for the analyst group. -6. **Smoke check** — query the serving endpoint with one sample question. +6. **Smoke check** — query the Agent Bricks Supervisor endpoint with one sample question. -CI (`.github/workflows/deploy.yml`) uses the same staged shape so steady- -state pushes don't re-introduce orphans. +CI (`.github/workflows/deploy.yml`) uses the same staged shape for steady- +state pushes. diff --git a/evals/clears_eval.py b/evals/clears_eval.py index 7424231..18c795e 100644 --- a/evals/clears_eval.py +++ b/evals/clears_eval.py @@ -23,15 +23,21 @@ import argparse import json import os +from pathlib import Path import statistics import sys import time from typing import Any +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + import mlflow import pandas as pd from databricks.sdk import WorkspaceClient +from app.agent_bricks_client import invoke_agent_endpoint +from app.agent_bricks_response import normalise_agent_response + THRESHOLDS = { "correctness": 0.80, @@ -46,26 +52,33 @@ # Per Databricks Agent Evaluation docs, judge metrics in `result.metrics` use # names like `response/llm_judged/correctness/rating/percentage` for response # judges and `retrieval/llm_judged/chunk_relevance/precision/average` for -# retrieval judges. Per-row results live in `result.tables['eval_results']`, -# NOT `result.metrics`. +# retrieval judges. MLflow 3.x / databricks-agents 1.x also emits shorter +# aggregate keys (`correctness/percentage`, `groundedness/percentage`, etc.). +# Per-row results live in `result.tables['eval_results']`, NOT `result.metrics`. 
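The lookup that consumes these alias lists is outside this hunk; each axis presumably takes the first alias present in `result.metrics`. A sketch of that fallback, with hypothetical metric values:

```python
# Alias lists copied from AGGREGATE_METRIC_KEYS below; the values are made up.
metrics = {"correctness/percentage": 0.84, "safety/percentage": 1.0}

aliases = {
    "correctness": [
        "response/llm_judged/correctness/rating/percentage",
        "response/llm_judged/correctness/rating/average",
        "correctness/percentage",
    ],
    "safety": [
        "response/llm_judged/safety/rating/percentage",
        "response/llm_judged/safety/rating/average",
        "safety/percentage",
    ],
}

summary = {}
for axis, keys in aliases.items():
    hit = next((k for k in keys if k in metrics), None)  # first alias present wins
    if hit is not None:
        summary[axis] = metrics[hit]

assert summary == {"correctness": 0.84, "safety": 1.0}
```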
# (E)xecution and (L)atency are computed from the raw response/timing — Mosaic # AI doesn't ship judges for those. AGGREGATE_METRIC_KEYS = { "correctness": [ "response/llm_judged/correctness/rating/percentage", "response/llm_judged/correctness/rating/average", + "correctness/percentage", ], "adherence": [ "response/llm_judged/guideline_adherence/rating/percentage", "response/llm_judged/guideline_adherence/rating/average", + "guideline_adherence/percentage", + "global_guideline_adherence/percentage", ], "relevance": [ "retrieval/llm_judged/chunk_relevance/precision/average", "retrieval/llm_judged/chunk_relevance/precision/percentage", + "groundedness/percentage", + "context_sufficiency/percentage", ], "safety": [ "response/llm_judged/safety/rating/percentage", "response/llm_judged/safety/rating/average", + "safety/percentage", ], } @@ -91,9 +104,8 @@ def _load(path: str) -> list[dict[str, Any]]: def _query(endpoint: str, question: str) -> tuple[dict[str, Any], int]: w = WorkspaceClient() started = time.monotonic() - out = w.serving_endpoints.query(name=endpoint, inputs=[{"question": question, "top_k": 5}]) - raw = out.predictions if hasattr(out, "predictions") else out["predictions"] - response = raw[0] if isinstance(raw, list) else raw + payload = invoke_agent_endpoint(w, endpoint, question, max_retries=2, timeout_seconds=90) + response = normalise_agent_response(payload, empty_text="") return response, int((time.monotonic() - started) * 1000) @@ -112,7 +124,10 @@ def _to_eval_record(item: dict[str, Any], response: dict[str, Any], latency_ms: "response": response.get("answer", ""), "expected_facts": item.get("expected_facts", []), "retrieved_context": [ - {"doc_uri": c.get("filename", ""), "content": c.get("snippet") or c.get("section_label", "")} + { + "doc_uri": c.get("filename") or c.get("doc_uri") or c.get("source") or "", + "content": c.get("snippet") or c.get("section_label") or c.get("title") or "", + } for c in citations ], "guidelines": item.get("guidelines", []) or GLOBAL_GUIDELINES, @@ -124,7 +139,8 @@ def _execute(endpoint: str, items: list[dict[str, Any]]) -> tuple[pd.DataFrame, eval_rows: list[dict[str, Any]] = [] latencies: list[int] = [] raw_responses: list[dict[str, Any]] = [] - for item in items: + for i, item in enumerate(items, start=1): + print(f"[eval] {i}/{len(items)} {item.get('id', item.get('category', 'row'))}", flush=True) response, latency_ms = _query(endpoint, item["question"]) latencies.append(latency_ms) raw_responses.append(response) @@ -152,7 +168,7 @@ def _enforce(result: Any, items: list[dict[str, Any]], # Custom axes: Execution from agent_path; Latency p95 from raw timings. executions = [ - 1.0 if r.get("agent_path") in {"analyst", "supervisor", "knowledge_assistant"} else 0.0 + 1.0 if r.get("agent_path") in {"agent_bricks_supervisor", "knowledge_assistant"} else 0.0 for r in raw_responses ] summary["execution"] = statistics.mean(executions) if executions else 0.0 @@ -163,7 +179,7 @@ def _enforce(result: Any, items: list[dict[str, Any]], if axis not in summary: failures.append( f"{axis} not produced by judges; available metric keys: " - f"{sorted(k for k in metrics if 'llm_judged' in k or 'retrieval' in k)[:6]}..." + f"{sorted(metrics)[:12]}..." ) continue actual = summary[axis] @@ -254,7 +270,16 @@ def main() -> int: # Surface the judge-aggregate metrics that mapped to CLEARS axes plus # any unmapped llm_judged keys for debuggability. 
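The widened filter in the hunk below keeps long judge keys, retrieval keys, and the short aggregate aliases while dropping everything else; a worked example (key names beyond the documented ones are hypothetical):

```python
# Same predicate as the debug_metrics comprehension below, applied to a toy dict.
all_metrics = {
    "response/llm_judged/correctness/rating/percentage": 0.84,       # kept: llm_judged
    "correctness/percentage": 0.84,                                  # kept: short alias
    "retrieval/llm_judged/chunk_relevance/precision/average": 0.79,  # kept: retrieval
    "agent/total_tokens": 51234,                                     # dropped (hypothetical key)
}
short = {
    "correctness", "guideline_adherence", "global_guideline_adherence",
    "groundedness", "context_sufficiency", "safety",
}
debug_metrics = {
    k: v
    for k, v in all_metrics.items()
    if "llm_judged" in k or "retrieval" in k or k.split("/", 1)[0] in short
}
assert "agent/total_tokens" not in debug_metrics and len(debug_metrics) == 3
```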
all_metrics = result.metrics or {} - debug_metrics = {k: v for k, v in all_metrics.items() if "llm_judged" in k or "retrieval" in k} + debug_metrics = { + k: v + for k, v in all_metrics.items() + if ( + "llm_judged" in k + or "retrieval" in k + or k.split("/", 1)[0] + in {"correctness", "guideline_adherence", "global_guideline_adherence", "groundedness", "context_sufficiency", "safety"} + ) + } print(json.dumps({ "summary": summary, "judge_metrics": debug_metrics, diff --git a/evals/requirements.txt b/evals/requirements.txt index e8b1941..fa3e69d 100644 --- a/evals/requirements.txt +++ b/evals/requirements.txt @@ -1,3 +1,3 @@ mlflow>=2.20 databricks-agents>=0.10 -databricks-sdk>=0.40 +databricks-sdk>=0.105,<1 diff --git a/jobs/index_refresh/sync_index.py b/jobs/index_refresh/sync_index.py index 8988926..34e0c45 100644 --- a/jobs/index_refresh/sync_index.py +++ b/jobs/index_refresh/sync_index.py @@ -8,11 +8,31 @@ from __future__ import annotations import argparse +from datetime import timedelta import logging import sys +import time from databricks.sdk import WorkspaceClient -from databricks.vector_search.client import VectorSearchClient +from databricks.sdk.service.vectorsearch import ( + DeltaSyncVectorIndexSpecRequest, + EmbeddingSourceColumn, + PipelineType, + VectorIndexType, +) + + +def _wait_index_ready(w: WorkspaceClient, index_name: str, *, timeout_seconds: int = 1200) -> None: + deadline = time.time() + timeout_seconds + while True: + index = w.vector_search_indexes.get_index(index_name) + status = index.status + if status and status.ready: + return + if time.time() >= deadline: + message = getattr(status, "message", None) or "UNKNOWN" + raise TimeoutError(f"Vector Search index {index_name} not ready after {timeout_seconds}s: {message}") + time.sleep(15) def main() -> int: @@ -27,25 +47,37 @@ def main() -> int: logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") log = logging.getLogger("vs-sync") - vsc = VectorSearchClient(disable_notice=True) - indexes = {idx["name"] for idx in vsc.list_indexes(name=args.endpoint).get("vector_indexes", [])} + w = WorkspaceClient() + w.vector_search_endpoints.wait_get_endpoint_vector_search_endpoint_online( + args.endpoint, + timeout=timedelta(minutes=20), + ) + indexes = {idx.name for idx in w.vector_search_indexes.list_indexes(endpoint_name=args.endpoint)} if args.index not in indexes: log.info("creating Delta-Sync index %s", args.index) - vsc.create_delta_sync_index_and_wait( + w.vector_search_indexes.create_index( + name=args.index, endpoint_name=args.endpoint, - index_name=args.index, - source_table_name=args.source_table, primary_key=args.primary_key, - pipeline_type="TRIGGERED", - embedding_source_column="summary", - embedding_model_endpoint_name=args.embedding_endpoint, + index_type=VectorIndexType.DELTA_SYNC, + delta_sync_index_spec=DeltaSyncVectorIndexSpecRequest( + source_table=args.source_table, + pipeline_type=PipelineType.TRIGGERED, + embedding_source_columns=[ + EmbeddingSourceColumn( + name="summary", + embedding_model_endpoint_name=args.embedding_endpoint, + ) + ], + ), ) + _wait_index_ready(w, args.index) log.info("index created and initial sync complete") return 0 log.info("index %s exists; triggering sync", args.index) - vsc.get_index(endpoint_name=args.endpoint, index_name=args.index).sync() + w.vector_search_indexes.sync_index(args.index) log.info("sync triggered") return 0 diff --git a/pipelines/sql/03_gold_classify_extract.sql b/pipelines/sql/03_gold_classify_extract.sql index 
163587b..a6891f5 100644 --- a/pipelines/sql/03_gold_classify_extract.sql +++ b/pipelines/sql/03_gold_classify_extract.sql @@ -3,8 +3,8 @@ -- -- Section explosion: the silver VARIANT carries a `$.sections[*]` array with -- {label, text} per parsed section. We POSEXPLODE that into per-section rows. --- Filings whose VARIANT does not produce a usable sections array fall back to --- a single full_document row so we never silently drop a parsed filing. +-- Filings whose VARIANT does not produce a usable sections array are represented +-- as a single full_document row so we never silently drop a parsed filing. CREATE OR REFRESH STREAMING TABLE gold_filing_sections_raw AS WITH sectioned AS ( @@ -22,9 +22,9 @@ WITH sectioned AS ( WHERE s.parse_status IN ('ok', 'partial') AND variant_get(s.parsed, '$.sections') IS NOT NULL ), -fallback AS ( - -- Fallback: filings whose parsed VARIANT lacks $.sections still get one row - -- so downstream classification/extraction can run. +whole_document AS ( + -- Filings whose parsed VARIANT lacks $.sections still get one row so + -- downstream classification/extraction can run. SELECT s.filename, 1 AS section_seq, @@ -38,7 +38,7 @@ fallback AS ( ) SELECT * FROM sectioned UNION ALL -SELECT * FROM fallback; +SELECT * FROM whole_document; CREATE OR REFRESH STREAMING TABLE gold_filing_sections TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true'); diff --git a/resources/consumers/agent.serving.yml b/resources/consumers/agent.serving.yml deleted file mode 100644 index 8b38299..0000000 --- a/resources/consumers/agent.serving.yml +++ /dev/null @@ -1,46 +0,0 @@ -resources: - model_serving_endpoints: - analyst_agent: - name: analyst-agent-${bundle.target} - config: - served_entities: - - name: analyst_agent - entity_name: ${var.catalog}.${var.schema}.analyst_agent - # Bootstrap creates v1 via agent/log_and_register.py before this deploy - # creates the serving endpoint. Steady-state CI re-registers a new - # version and calls _promote_serving_endpoint to advance the served - # entity in-place (the served version diverges from this YAML over - # time — that's intentional; the YAML is just the bootstrap value). 
- entity_version: "1" - workload_size: Small - workload_type: CPU - scale_to_zero_enabled: true - environment_vars: - DOCINTEL_CATALOG: ${var.catalog} - DOCINTEL_SCHEMA: ${var.schema} - DOCINTEL_TARGET: ${bundle.target} - DOCINTEL_VS_ENDPOINT: docintel-${bundle.target} - DOCINTEL_FOUNDATION_ENDPOINT: ${var.foundation_model_endpoint_name} - DOCINTEL_RERANK_ENDPOINT: ${var.rerank_model_endpoint_name} - DOCINTEL_WAREHOUSE_ID: ${var.warehouse_id} - traffic_config: - routes: - - served_model_name: analyst_agent - traffic_percentage: 100 - - ai_gateway: - usage_tracking_config: - enabled: true - rate_limits: - - calls: 60 - renewal_period: minute - key: user - inference_table_config: - enabled: true - catalog_name: ${var.catalog} - schema_name: ${var.schema} - table_name_prefix: agent_gateway - - permissions: - - level: CAN_QUERY - group_name: ${var.analyst_group} diff --git a/resources/consumers/analyst.app.yml b/resources/consumers/analyst.app.yml index 32f8ba6..fbe9b3b 100644 --- a/resources/consumers/analyst.app.yml +++ b/resources/consumers/analyst.app.yml @@ -5,41 +5,28 @@ resources: description: "10-K Analyst — chat over indexed filings with citations" source_code_path: ../../app - # Databricks Apps auto-grants the declared permissions to the App SP - # on deploy — see https://docs.databricks.com/aws/en/dev-tools/databricks-apps/access-data - # Do NOT also grant manually. + config: + env: + - name: DOCINTEL_AGENT_ENDPOINT + value: ${var.agent_endpoint_name} + + # Databricks Apps auto-grants Lakebase permissions to the App SP on + # deploy — see https://docs.databricks.com/aws/en/dev-tools/databricks-apps/access-data. + # Agent Bricks endpoint access is granted to the analyst group by + # scripts/bootstrap_agent_bricks.py because calls use OBO user identity. resources: - name: docintel-lakebase database: database_name: ${var.lakebase_instance} instance_name: ${var.lakebase_instance} permission: CAN_CONNECT_AND_CREATE - - name: agent-endpoint - serving_endpoint: - name: analyst-agent-${bundle.target} - permission: CAN_QUERY - # OBO scopes (Databricks Apps IAM/auth docs: + # Mandatory OBO scopes (Databricks Apps IAM/auth docs: # https://docs.databricks.com/aws/en/dev-tools/databricks-apps/iam-auth) - # require the workspace-level "Databricks Apps - user token passthrough" - # feature. If your workspace does NOT have it enabled, DAB rejects this - # field with PERMISSION_DENIED at deploy time — leave the block - # commented out and the app will fall back to SP creds (with a loud - # bring-up banner). Uncomment after a workspace admin enables the feature. - # - # Required scopes for this app's OBO call chain: - # serving.serving-endpoints — invoke analyst-agent-${target} as the user - # sql — agent's tools.py runs UC SQL via warehouse - # iam.access-control:read — default - # iam.current-user:read — default - # - # The agent's pyfunc additionally calls Vector Search and downstream - # serving endpoints; those resources/scopes are declared in MLflow's - # auth_policy at log_model time (see agent/log_and_register.py - # _auth_policy), not here. - # - # user_api_scopes: - # - serving.serving-endpoints - # - sql - # - iam.access-control:read - # - iam.current-user:read + # require the workspace-level "Databricks Apps - user token passthrough". + # Deployment must fail if the workspace cannot grant these scopes. 
+ user_api_scopes: + - serving.serving-endpoints + - sql + - iam.access-control:read + - iam.current-user:read diff --git a/resources/consumers/index_refresh.job.yml b/resources/consumers/index_refresh.job.yml index 017f5b3..5d01478 100644 --- a/resources/consumers/index_refresh.job.yml +++ b/resources/consumers/index_refresh.job.yml @@ -31,8 +31,7 @@ resources: spec: client: "1" dependencies: - - databricks-vectorsearch>=0.40 - - databricks-sdk>=0.40 + - databricks-sdk>=0.105,<1 permissions: - level: CAN_VIEW diff --git a/resources/foundation/retention.job.yml b/resources/foundation/retention.job.yml index 11464ff..b9310c1 100644 --- a/resources/foundation/retention.job.yml +++ b/resources/foundation/retention.job.yml @@ -27,7 +27,7 @@ resources: spec: client: "1" dependencies: - - databricks-sdk>=0.40 + - databricks-sdk>=0.105,<1 permissions: - level: CAN_VIEW diff --git a/scripts/bootstrap-demo.sh b/scripts/bootstrap-demo.sh index c6aa4e3..cf64e80 100755 --- a/scripts/bootstrap-demo.sh +++ b/scripts/bootstrap-demo.sh @@ -3,15 +3,15 @@ # # Two modes, auto-detected: # -# FIRST DEPLOY (no serving endpoint yet) -# resources/ has chicken-egg dependencies: consumers (serving endpoint, -# monitor, app, lakebase catalog, index-refresh job) need foundation data -# (registered model, populated KPI table, AVAILABLE Lakebase). DAB -# deploys everything in one shot, so we stage: +# FIRST DEPLOY (no Agent Bricks supervisor endpoint yet) +# resources/ has chicken-egg dependencies: consumers (app, monitor, +# lakebase catalog, index-refresh job) need foundation data (populated KPI +# table, Vector Search index, Agent Bricks endpoint, AVAILABLE Lakebase). +# DAB deploys everything in one shot, so we stage: # 1. Hide resources/consumers/*.yml → *.yml.skip; bundle deploy # touches only foundation. Trap restores on any exit. # 2. Produce data: samples → pipeline → wait for KPIs → materialize -# VS index → register model → wait for Lakebase AVAILABLE. +# VS index → configure Agent Bricks → wait for Lakebase AVAILABLE. # 3. Restore consumer YAMLs; bundle deploy full bundle. All deps # satisfied; consumers create cleanly. # @@ -20,8 +20,7 @@ # would plan to DELETE any resource that disappears from config (per # Databricks bundle docs — removed config = removed workspace resource). # So in steady state we do a normal full bundle deploy and refresh data -# in place: samples → pipeline → register a new model version → repoint -# the serving endpoint via _promote_serving_endpoint. +# in place: samples → pipeline → sync the index → update Agent Bricks. # # Common to both: bundle run analyst_app (apply config + restart), # UC grants chain, smoke check. @@ -40,6 +39,8 @@ # DOCINTEL_FORCE_LOCK set to 1 to pass --force-lock (use ONLY when a # prior deploy crashed and left a stale lock — # not a normal-flow flag). +# DOCINTEL_AUTO_APPROVE set to 1 to pass --auto-approve when intentionally +# deleting/recreating stale bundle-managed resources. 
# DOCINTEL_EMBEDDING_ENDPOINT # embedding endpoint for first-run VS index # materialization (default: databricks-bge-large-en) @@ -58,24 +59,29 @@ ANALYST_GROUP="${DOCINTEL_ANALYST_GROUP:-account users}" WAIT_SECONDS="${DOCINTEL_WAIT_SECONDS:-600}" LAKEBASE_TIMEOUT="${DOCINTEL_LAKEBASE_TIMEOUT:-600}" EMBEDDING_ENDPOINT="${DOCINTEL_EMBEDDING_ENDPOINT:-databricks-bge-large-en}" -ENDPOINT="analyst-agent-${TARGET}" APP_NAME="doc-intel-analyst-${TARGET}" KPI_TABLE="${DOCINTEL_CATALOG}.${DOCINTEL_SCHEMA}.gold_filing_kpis" VOLUME_PATH="dbfs:/Volumes/${DOCINTEL_CATALOG}/${DOCINTEL_SCHEMA}/raw_filings" PIPELINE_KEY="doc_intel_pipeline" +AGENT_ENDPOINT_NAME="" DEPLOY_FLAGS=() if [[ "${DOCINTEL_FORCE_LOCK:-0}" == "1" ]]; then log "DOCINTEL_FORCE_LOCK=1 — passing --force-lock to bundle deploy (use only for stale-lock recovery)" DEPLOY_FLAGS+=(--force-lock) fi +if [[ "${DOCINTEL_AUTO_APPROVE:-0}" == "1" ]]; then + log "DOCINTEL_AUTO_APPROVE=1 — passing --auto-approve to bundle deploy for intentional clean recreation" + DEPLOY_FLAGS+=(--auto-approve) +fi # Pin the bundle's `warehouse_id` variable to the user-selected ID so the -# dashboard + serving-endpoint env match wait_for_kpis / log_and_register. +# dashboard and Agent Bricks bootstrap match wait_for_kpis. # Without this, the bundle falls back to its `lookup: warehouse: Serverless # Starter Warehouse` default — which fails validation in workspaces lacking # that named warehouse, and silently picks a different ID otherwise. VAR_FLAGS=(--var "warehouse_id=$DOCINTEL_WAREHOUSE_ID") +BUNDLE_VAR_FLAGS=("${VAR_FLAGS[@]}") REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" cd "$REPO_ROOT" @@ -90,24 +96,46 @@ else fi # ─── First-deploy detection ────────────────────────────────────────────────── -# A serving endpoint with a populated config means consumers were deployed -# previously (or the deploy got partway). Treat anything else as first deploy. +resolve_existing_agent_endpoint() { + scripts/resolve-agent-endpoint.sh "$TARGET" 2>/dev/null || true +} + +set_agent_endpoint_name() { + AGENT_ENDPOINT_NAME="$1" + if [[ -z "$AGENT_ENDPOINT_NAME" ]]; then + die "Agent Bricks Supervisor endpoint name is empty" + fi + BUNDLE_VAR_FLAGS=("${VAR_FLAGS[@]}" --var "agent_endpoint_name=$AGENT_ENDPOINT_NAME") + log " using Agent Bricks Supervisor endpoint $AGENT_ENDPOINT_NAME" +} + +run_agent_bricks_bootstrap() { + local bootstrap_json endpoint + bootstrap_json=$("$PYTHON" scripts/bootstrap_agent_bricks.py \ + --target "$TARGET" \ + --catalog "$DOCINTEL_CATALOG" \ + --schema "$DOCINTEL_SCHEMA" \ + --warehouse-id "$DOCINTEL_WAREHOUSE_ID" \ + --analyst-group "$ANALYST_GROUP") || \ + die "Agent Bricks bootstrap failed" + endpoint=$(printf '%s' "$bootstrap_json" | "$PYTHON" -c " +import json, sys +payload = json.load(sys.stdin) +print(payload.get('supervisor_endpoint') or '') +") + set_agent_endpoint_name "$endpoint" +} + +# An existing Agent Bricks Supervisor means the generated serving endpoint can +# be resolved before app deployment. Treat absence as first deploy. 
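The probe behind `resolve_existing_agent_endpoint` is `scripts/resolve-agent-endpoint.sh` (new in this PR, shown further down); its core is the following SDK lookup, a sketch assuming the same preview `supervisor_agents` surface the script itself uses:

```python
# Mirrors the heredoc in scripts/resolve-agent-endpoint.sh: return the generated
# endpoint name when the Supervisor exists, else None (detect_mode => "first").
from databricks.sdk import WorkspaceClient


def resolve_supervisor_endpoint(target: str) -> str | None:
    w = WorkspaceClient()
    wanted = f"doc-intel-supervisor-{target}"
    for agent in w.supervisor_agents.list_supervisor_agents():
        if agent.display_name == wanted and agent.endpoint_name:
            return agent.endpoint_name
    return None
```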
detect_mode() { if [[ "${DOCINTEL_FORCE_FIRST:-0}" == "1" ]]; then echo "first" return fi - if ep_state=$(databricks api get "/api/2.0/serving-endpoints/${ENDPOINT}" --output json 2>/dev/null); then - has_entity=$("$PYTHON" -c " -import json, sys -ep = json.loads(sys.argv[1]) -served = (ep.get('config') or {}).get('served_entities') or (ep.get('config') or {}).get('served_models') or [] -print('yes' if served else 'no') -" "$ep_state" 2>/dev/null || echo "no") - if [[ "$has_entity" == "yes" ]]; then - echo "steady" - return - fi + if [[ -n "$(resolve_existing_agent_endpoint)" ]]; then + echo "steady" + return fi echo "first" } @@ -115,24 +143,8 @@ print('yes' if served else 'no') MODE=$(detect_mode) log "detected mode: $MODE" -# ─── Step 0: orphan detection + cleanup (always run) ──────────────────────── -log "step 0/6: detecting orphans from prior failed runs" - -# Malformed serving endpoint: exists but has no served entities. -if ep_state=$(databricks api get "/api/2.0/serving-endpoints/${ENDPOINT}" --output json 2>/dev/null); then - if "$PYTHON" -c " -import json, sys -ep = json.loads(sys.argv[1]) -served = (ep.get('config') or {}).get('served_entities') or (ep.get('config') or {}).get('served_models') or [] -if not served: - sys.exit(0) -sys.exit(1) -" "$ep_state" 2>/dev/null; then - log " → deleting malformed serving endpoint $ENDPOINT (no served entities)" - databricks api delete "/api/2.0/serving-endpoints/${ENDPOINT}" >/dev/null 2>&1 || \ - log " warn: delete failed, continuing" - fi -fi +# ─── Step 0: environment conflict checks (always run) ─────────────────────── +log "step 0/6: checking environment conflicts" # Lakebase soft-delete name conflict. LAKEBASE_NAME=$("$PYTHON" -c " @@ -233,12 +245,8 @@ if [[ "$MODE" == "first" ]]; then "$PYTHON" scripts/wait_for_kpis.py --min-rows 1 --timeout "$WAIT_SECONDS" || \ die "timed out waiting for $KPI_TABLE" - # Materialize the VS index BEFORE agent registration: the agent's auth_policy - # declares the VS index as a UC resource (DatabricksVectorSearchIndex), and - # MLflow validates its existence at create_model_version time. The VS - # endpoint is in foundation/ (created by stage-1 deploy), but the index is - # always created at runtime by sync_index.py. Stage-2's index_refresh job - # is too late. + # Materialize the VS index BEFORE Agent Bricks configuration so Knowledge + # Assistant can attach the governed index as its knowledge source. log " creating Vector Search index ${DOCINTEL_CATALOG}.${DOCINTEL_SCHEMA}.filings_summary_idx" "$PYTHON" jobs/index_refresh/sync_index.py \ --endpoint "docintel-${TARGET}" \ @@ -248,12 +256,11 @@ if [[ "$MODE" == "first" ]]; then --embedding-endpoint "$EMBEDDING_ENDPOINT" || \ die "VS index creation failed (sync_index.py)" - "$PYTHON" agent/log_and_register.py --target "$TARGET" || \ - die "agent registration failed" + run_agent_bricks_bootstrap wait_for_lakebase_available log "step 3/6: stage-2 deploy (full bundle — consumers join the foundation)" - databricks bundle deploy -t "$TARGET" "${VAR_FLAGS[@]}" ${DEPLOY_FLAGS[@]+"${DEPLOY_FLAGS[@]}"} || \ + databricks bundle deploy -t "$TARGET" "${BUNDLE_VAR_FLAGS[@]}" ${DEPLOY_FLAGS[@]+"${DEPLOY_FLAGS[@]}"} || \ die "stage-2 deploy failed; check logs" # The index_refresh job is created by stage-2 deploy and is `table_update`- @@ -261,31 +268,32 @@ if [[ "$MODE" == "first" ]]; then # produced before the job existed, so run it once after deployment as an # idempotent smoke of the bundled job path. 
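On re-runs that smoke is cheap, because the job's steady path in `jobs/index_refresh/sync_index.py` (diffed earlier in this change) reduces to an existence check plus a triggered sync; roughly, with an illustrative index name:

```python
# Steady path of sync_index.py: the index already exists, so just trigger a sync.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()
endpoint = "docintel-demo"                   # docintel-${TARGET}
index = "main.docintel.filings_summary_idx"  # hypothetical catalog.schema prefix
existing = {i.name for i in w.vector_search_indexes.list_indexes(endpoint_name=endpoint)}
if index in existing:
    w.vector_search_indexes.sync_index(index)
```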
log "step 3.5/6: triggering initial Vector Search index materialization" - databricks bundle run -t "$TARGET" "${VAR_FLAGS[@]}" index_refresh || \ + databricks bundle run -t "$TARGET" "${BUNDLE_VAR_FLAGS[@]}" index_refresh || \ log " warn: index_refresh failed; the table_update trigger will retry on the next pipeline run" else # ─── Steady-state path: single full deploy + in-place data refresh ──────── + set_agent_endpoint_name "$(resolve_existing_agent_endpoint)" log "step 1/6: full bundle deploy (steady-state — consumers already exist)" - databricks bundle deploy -t "$TARGET" "${VAR_FLAGS[@]}" ${DEPLOY_FLAGS[@]+"${DEPLOY_FLAGS[@]}"} || \ + databricks bundle deploy -t "$TARGET" "${BUNDLE_VAR_FLAGS[@]}" ${DEPLOY_FLAGS[@]+"${DEPLOY_FLAGS[@]}"} || \ die "bundle deploy failed; if a prior deploy was interrupted, set DOCINTEL_FORCE_LOCK=1 and retry" - log "step 2/6: refreshing data + repointing serving endpoint" + log "step 2/6: refreshing data + Agent Bricks configuration" upload_samples databricks bundle run -t "$TARGET" "${VAR_FLAGS[@]}" "$PIPELINE_KEY" || \ die "pipeline run failed — inspect SDP UI before retrying" "$PYTHON" scripts/wait_for_kpis.py --min-rows 1 --timeout "$WAIT_SECONDS" || \ die "timed out waiting for $KPI_TABLE" - # Register a new model version and update the served entity in-place. - "$PYTHON" agent/log_and_register.py --target "$TARGET" --serving-endpoint "$ENDPOINT" || \ - die "agent registration failed" + databricks bundle run -t "$TARGET" "${BUNDLE_VAR_FLAGS[@]}" index_refresh || \ + log " warn: index_refresh failed; the table_update trigger will retry on the next pipeline run" + run_agent_bricks_bootstrap log "step 3/6: skipped (no second deploy needed in steady-state)" fi # ─── Step 4: app run (both paths) ──────────────────────────────────────────── log "step 4/6: applying app config + restart" -databricks bundle run -t "$TARGET" "${VAR_FLAGS[@]}" analyst_app || \ +databricks bundle run -t "$TARGET" "${BUNDLE_VAR_FLAGS[@]}" analyst_app || \ log " warn: analyst_app run failed; retry manually with 'databricks bundle run -t $TARGET analyst_app'" # ─── Step 5: UC grants (idempotent) ────────────────────────────────────────── @@ -313,33 +321,23 @@ missing = required - scopes if missing: raise SystemExit(f'OBO scopes missing: {sorted(missing)} (got {sorted(scopes)})') print(f' OBO scopes intact: {sorted(scopes)}') -" || log " warn: OBO scopes wiped — re-apply via 'databricks apps update $APP_NAME --user-api-scopes serving.serving-endpoints,sql,iam.access-control:read,iam.current-user:read'" +" || die "OBO scopes missing after deploy" + else + die "unable to read app state for OBO verification" fi else - log "" - log " ⚠ APP-LEVEL OBO IS OPERATIONALLY DISABLED" - log " resources/consumers/analyst.app.yml has user_api_scopes commented out, so:" - log " • Databricks Apps will NOT inject x-forwarded-access-token into requests." - log " • app/app.py:_user_client falls back to SP creds for every user." - log " • UC ACLs in the agent's downstream calls run as the app SP, not the user." - log " This is a deliberate fallback because the workspace lacks the user-token-" - log " passthrough feature. To enable OBO end-to-end:" - log " 1. Workspace admin enables 'Databricks Apps - user token passthrough'." - log " 2. Uncomment the user_api_scopes block in analyst.app.yml." - log " 3. 
Re-deploy: databricks bundle deploy -t $TARGET && databricks bundle run -t $TARGET analyst_app" - log "" + die "resources/consumers/analyst.app.yml must declare user_api_scopes; OBO is mandatory" fi # ─── Step 6: smoke check ───────────────────────────────────────────────────── -log "step 6/6: smoke check on $ENDPOINT" +log "step 6/6: smoke check on $AGENT_ENDPOINT_NAME" if smoke=$("$PYTHON" -c " from databricks.sdk import WorkspaceClient +from app.agent_bricks_client import invoke_agent_endpoint import json, sys w = WorkspaceClient() -out = w.serving_endpoints.query(name='$ENDPOINT', inputs=[{'question': 'What was ACMEs revenue in fiscal 2024?', 'top_k': 3}]) -preds = out.predictions if hasattr(out, 'predictions') else out['predictions'] -r = preds[0] if isinstance(preds, list) else preds -print(json.dumps({'grounded': r.get('grounded'), 'agent_path': r.get('agent_path'), 'citations': len(r.get('citations') or [])})) +payload = invoke_agent_endpoint(w, '$AGENT_ENDPOINT_NAME', 'What was ACMEs revenue in fiscal 2024?') +print(json.dumps({'endpoint': '$AGENT_ENDPOINT_NAME', 'keys': sorted(payload.keys())[:12]})) " 2>&1); then log " smoke OK: $smoke" else @@ -348,8 +346,8 @@ fi log "done." log " mode: $MODE" -log " endpoint: $ENDPOINT" +log " endpoint: $AGENT_ENDPOINT_NAME" log " KPI table: $KPI_TABLE" log " app: $APP_NAME" log " Lakebase: $LAKEBASE_NAME" -log "next: $PYTHON evals/clears_eval.py --endpoint $ENDPOINT --dataset evals/dataset.jsonl" +log "next: $PYTHON evals/clears_eval.py --endpoint $AGENT_ENDPOINT_NAME --dataset evals/dataset.jsonl" diff --git a/scripts/bootstrap_agent_bricks.py b/scripts/bootstrap_agent_bricks.py new file mode 100644 index 0000000..565a0d5 --- /dev/null +++ b/scripts/bootstrap_agent_bricks.py @@ -0,0 +1,431 @@ +"""Create or update the Agent Bricks runtime for the document intelligence app. + +This is the production agent bootstrap path. It configures: + +* Agent Bricks Knowledge Assistant over the governed Vector Search source. +* A deterministic Unity Catalog SQL function for structured KPI lookups. +* Agent Bricks Supervisor Agent that coordinates the Knowledge Assistant and + the KPI function. + +The earlier hand-built MLflow pyfunc agent runtime is intentionally not part of +this path. 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +from dataclasses import asdict +from typing import Iterable, TypeVar + +from databricks.sdk import WorkspaceClient +from databricks.sdk.common.types.fieldmask import FieldMask +from databricks.sdk.service.iam import AccessControlRequest, PermissionLevel +from databricks.sdk.service.knowledgeassistants import ( + IndexSpec, + KnowledgeAssistant, + KnowledgeSource, +) +from databricks.sdk.service.supervisoragents import ( + KnowledgeAssistant as SupervisorKnowledgeAssistant, +) +from databricks.sdk.service.supervisoragents import SupervisorAgent, Tool, UcFunction + + +T = TypeVar("T") + + +def _find_by_display_name(items: Iterable[T], display_name: str) -> T | None: + for item in items: + if getattr(item, "display_name", None) == display_name: + return item + return None + + +def _id_from_name(name: str | None) -> str: + if not name: + raise ValueError("Agent Bricks resource did not return a name") + return name.rsplit("/", 1)[-1] + + +def _as_dict(obj: object) -> dict: + if hasattr(obj, "as_dict"): + return obj.as_dict() + if hasattr(obj, "__dataclass_fields__"): + return asdict(obj) + return {"value": str(obj)} + + +def _enum_name(value: object) -> str: + if value is None: + return "" + raw = getattr(value, "value", value) + return str(raw).upper().rsplit(".", 1)[-1] + + +def _statement_error(status: object) -> str: + error = getattr(status, "error", None) + if error is None: + return str(status) + message = getattr(error, "message", None) + error_code = getattr(error, "error_code", None) + if message and error_code: + return f"{error_code}: {message}" + return str(error) + + +def _wait_statement_succeeded( + w: WorkspaceClient, + result: object, + *, + label: str, + timeout_seconds: int = 300, +) -> None: + started = time.time() + deadline = started + timeout_seconds + next_log = 60 + failed_states = {"FAILED", "CANCELED", "CANCELLED", "CLOSED"} + + while True: + status = getattr(result, "status", None) + state = _enum_name(getattr(status, "state", None)) + if state == "SUCCEEDED": + return + if state in failed_states: + raise RuntimeError(f"{label} failed: {_statement_error(status)}") + + statement_id = getattr(result, "statement_id", None) + if not statement_id: + if state: + raise RuntimeError(f"{label} did not finish and returned no statement_id (state={state})") + return + + elapsed = int(time.time() - started) + if time.time() >= deadline: + raise TimeoutError(f"{label} did not finish within {timeout_seconds}s (last state={state or 'UNKNOWN'})") + if elapsed >= next_log: + print(f"still waiting on {label} after {elapsed}s (state={state or 'UNKNOWN'})", file=sys.stderr) + next_log += 60 + time.sleep(5) + result = w.statement_execution.get_statement(statement_id) + + +def _create_or_update_kpi_function( + w: WorkspaceClient, + *, + catalog: str, + schema: str, + warehouse_id: str, +) -> str: + function_name = f"{catalog}.{schema}.lookup_10k_kpis" + statement = f""" +CREATE OR REPLACE FUNCTION {function_name}(company STRING) +RETURNS STRING +LANGUAGE SQL +COMMENT 'Return the newest governed 10-K KPI row for a company as JSON. Used by Agent Bricks Supervisor Agent.' 
+RETURN ( + SELECT to_json(named_struct( + 'filename', filename, + 'company_name', company_name, + 'fiscal_year', fiscal_year, + 'revenue', revenue, + 'ebitda', ebitda, + 'segment_revenue', segment_revenue, + 'top_risks', top_risks, + 'extraction_confidence', extraction_confidence + )) + FROM {catalog}.{schema}.gold_filing_kpis + WHERE lower(company_name) LIKE concat('%', lower(company), '%') + OR lower(filename) LIKE concat('%', lower(company), '%') + ORDER BY fiscal_year DESC + LIMIT 1 +) +""" + result = w.statement_execution.execute_statement( + warehouse_id=warehouse_id, + statement=statement, + wait_timeout="50s", + ) + _wait_statement_succeeded( + w, + result, + label=f"CREATE OR REPLACE FUNCTION {function_name}", + timeout_seconds=300, + ) + return function_name + + +def _ensure_knowledge_assistant( + w: WorkspaceClient, + *, + display_name: str, + endpoint_name: str, + index_name: str, +) -> KnowledgeAssistant: + description = ( + "Cited document Q&A over curated 10-K sections produced by the " + "Document Intelligence pipeline." + ) + instructions = ( + "Answer questions only from the provided 10-K knowledge source. " + "Prefer exact figures and section-level citations. If the answer is " + "not grounded in the indexed corpus, say that the corpus does not " + "contain a grounded answer." + ) + + existing = _find_by_display_name(w.knowledge_assistants.list_knowledge_assistants(), display_name) + desired = KnowledgeAssistant( + display_name=display_name, + endpoint_name=endpoint_name, + description=description, + instructions=instructions, + ) + if existing is None: + assistant = w.knowledge_assistants.create_knowledge_assistant(knowledge_assistant=desired) + else: + assistant = w.knowledge_assistants.update_knowledge_assistant( + name=existing.name, + knowledge_assistant=desired, + update_mask=FieldMask(["description", "instructions"]), + ) + + source_display = "curated_10k_sections" + source_description = ( + "Quality-filtered 10-K section summaries with filename and section " + "metadata from the governed Document Intelligence Gold layer." + ) + source = KnowledgeSource( + display_name=source_display, + description=source_description, + source_type="index", + index=IndexSpec( + index_name=index_name, + text_col="summary", + doc_uri_col="filename", + ), + ) + + sources = list(w.knowledge_assistants.list_knowledge_sources(parent=assistant.name)) + existing_source = next( + ( + s + for s in sources + if s.display_name == source_display + or (s.index is not None and s.index.index_name == index_name) + ), + None, + ) + if existing_source is None: + w.knowledge_assistants.create_knowledge_source(parent=assistant.name, knowledge_source=source) + else: + w.knowledge_assistants.update_knowledge_source( + name=existing_source.name, + knowledge_source=KnowledgeSource( + display_name=source_display, + description=source_description, + source_type="index", + ), + update_mask=FieldMask(["display_name", "description"]), + ) + + w.knowledge_assistants.sync_knowledge_sources(name=assistant.name) + return assistant + + +def _ensure_supervisor( + w: WorkspaceClient, + *, + display_name: str, + endpoint_name: str, + knowledge_assistant: KnowledgeAssistant, + kpi_function_name: str, +) -> SupervisorAgent: + description = ( + "Governed 10-K document intelligence supervisor for cited filing Q&A " + "and structured KPI comparisons." + ) + instructions = ( + "Use the Knowledge Assistant for narrative or section-level questions. 
" + "Use the Unity Catalog KPI function for structured financial metrics " + "and cross-company comparisons. Do not invent figures; cite the filing " + "source or state that the corpus does not contain the answer." + ) + desired = SupervisorAgent( + display_name=display_name, + endpoint_name=endpoint_name, + description=description, + instructions=instructions, + ) + existing = _find_by_display_name(w.supervisor_agents.list_supervisor_agents(), display_name) + if existing is None: + supervisor = w.supervisor_agents.create_supervisor_agent(supervisor_agent=desired) + else: + supervisor = w.supervisor_agents.update_supervisor_agent( + name=existing.name, + supervisor_agent=desired, + update_mask=FieldMask(["description", "instructions"]), + ) + + ka_tool = Tool( + tool_type="knowledge_assistant", + description=( + "Answer cited questions about individual 10-K filings, risk " + "factors, MD&A, notes, and narrative disclosures." + ), + knowledge_assistant=SupervisorKnowledgeAssistant( + knowledge_assistant_id=knowledge_assistant.id or _id_from_name(knowledge_assistant.name), + serving_endpoint_name=knowledge_assistant.endpoint_name, + ), + ) + kpi_tool = Tool( + tool_type="uc_function", + description=( + "Fetch deterministic structured KPIs from the governed Gold table " + "for a requested company." + ), + uc_function=UcFunction(name=kpi_function_name), + ) + + existing_tools = { + (getattr(t, "tool_id", None) or _id_from_name(t.name)): t + for t in w.supervisor_agents.list_tools(parent=supervisor.name) + } + for tool_id, tool in { + "filings_knowledge_assistant": ka_tool, + "structured_kpi_lookup": kpi_tool, + }.items(): + if tool_id in existing_tools: + w.supervisor_agents.update_tool( + name=existing_tools[tool_id].name, + tool=Tool(tool_type=tool.tool_type, description=tool.description), + update_mask=FieldMask(["description"]), + ) + else: + w.supervisor_agents.create_tool(parent=supervisor.name, tool=tool, tool_id=tool_id) + + return supervisor + + +def _endpoint_status(endpoint: object) -> tuple[str, str]: + state = getattr(endpoint, "state", None) + if isinstance(state, dict): + ready = state.get("ready") + config_update = state.get("config_update") + else: + ready = getattr(state, "ready", None) + config_update = getattr(state, "config_update", None) + return _enum_name(ready), _enum_name(config_update) + + +def _wait_endpoint_ready(w: WorkspaceClient, endpoint_name: str, *, timeout_seconds: int = 600) -> object: + started = time.time() + deadline = started + timeout_seconds + next_log = 60 + last_status = "not listable yet" + + while True: + try: + endpoint = w.serving_endpoints.get(endpoint_name) + ready, config_update = _endpoint_status(endpoint) + last_status = f"ready={ready or 'UNKNOWN'}, config_update={config_update or 'UNKNOWN'}" + if config_update == "UPDATE_FAILED": + raise RuntimeError(f"Agent Bricks endpoint {endpoint_name} update failed ({last_status})") + if ready == "READY" and config_update in {"", "NOT_UPDATING"}: + return endpoint + except RuntimeError: + raise + except Exception as exc: + last_status = f"not listable yet: {exc}" + + elapsed = int(time.time() - started) + if time.time() >= deadline: + raise RuntimeError( + f"Agent Bricks endpoint {endpoint_name} was not ready within " + f"{timeout_seconds}s ({last_status})" + ) + if elapsed >= next_log: + print(f"still waiting on endpoint {endpoint_name} after {elapsed}s ({last_status})", file=sys.stderr) + next_log += 60 + time.sleep(15) + + +def _grant_endpoint_query(w: WorkspaceClient, endpoint_name: str, 
group_name: str) -> None: + endpoint = _wait_endpoint_ready(w, endpoint_name) + endpoint_id = getattr(endpoint, "id", None) or endpoint_name + + w.permissions.update( + "serving-endpoints", + endpoint_id, + access_control_list=[ + AccessControlRequest( + group_name=group_name, + permission_level=PermissionLevel.CAN_QUERY, + ) + ], + ) + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--target", default=os.environ.get("DOCINTEL_TARGET", "demo")) + parser.add_argument("--catalog", default=os.environ.get("DOCINTEL_CATALOG")) + parser.add_argument("--schema", default=os.environ.get("DOCINTEL_SCHEMA")) + parser.add_argument("--warehouse-id", default=os.environ.get("DOCINTEL_WAREHOUSE_ID")) + parser.add_argument("--analyst-group", default=os.environ.get("DOCINTEL_ANALYST_GROUP", "account users")) + parser.add_argument("--requested-supervisor-endpoint") + parser.add_argument("--requested-knowledge-endpoint") + parser.add_argument("--supervisor-endpoint", dest="requested_supervisor_endpoint", help=argparse.SUPPRESS) + parser.add_argument("--knowledge-endpoint", dest="requested_knowledge_endpoint", help=argparse.SUPPRESS) + args = parser.parse_args() + + if not args.catalog or not args.schema or not args.warehouse_id: + parser.error("--catalog, --schema, and --warehouse-id are required") + + target = args.target + requested_supervisor_endpoint = args.requested_supervisor_endpoint or f"analyst-agent-{target}" + requested_knowledge_endpoint = args.requested_knowledge_endpoint or f"doc-intel-knowledge-{target}" + index_name = f"{args.catalog}.{args.schema}.filings_summary_idx" + + w = WorkspaceClient() + kpi_function_name = _create_or_update_kpi_function( + w, + catalog=args.catalog, + schema=args.schema, + warehouse_id=args.warehouse_id, + ) + knowledge_assistant = _ensure_knowledge_assistant( + w, + display_name=f"doc-intel-knowledge-{target}", + endpoint_name=requested_knowledge_endpoint, + index_name=index_name, + ) + supervisor = _ensure_supervisor( + w, + display_name=f"doc-intel-supervisor-{target}", + endpoint_name=requested_supervisor_endpoint, + knowledge_assistant=knowledge_assistant, + kpi_function_name=kpi_function_name, + ) + + actual_supervisor_endpoint = supervisor.endpoint_name or requested_supervisor_endpoint + actual_knowledge_endpoint = knowledge_assistant.endpoint_name or requested_knowledge_endpoint + + _grant_endpoint_query(w, actual_supervisor_endpoint, args.analyst_group) + if actual_knowledge_endpoint: + _grant_endpoint_query(w, actual_knowledge_endpoint, args.analyst_group) + + print(json.dumps({ + "knowledge_assistant": _as_dict(knowledge_assistant), + "supervisor_agent": _as_dict(supervisor), + "kpi_function": kpi_function_name, + "supervisor_endpoint": actual_supervisor_endpoint, + "knowledge_endpoint": actual_knowledge_endpoint, + }, indent=2, default=str)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/resolve-agent-endpoint.sh b/scripts/resolve-agent-endpoint.sh new file mode 100755 index 0000000..e7de571 --- /dev/null +++ b/scripts/resolve-agent-endpoint.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +TARGET="${1:-${DOCINTEL_TARGET:-demo}}" +REPO_ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" +cd "$REPO_ROOT" +export PYTHONPATH="$REPO_ROOT${PYTHONPATH:+:$PYTHONPATH}" + +if [[ -x ".venv/bin/python" ]]; then + PYTHON=".venv/bin/python" +elif command -v python3 >/dev/null 2>&1; then + PYTHON="python3" +else + echo "no python interpreter found (.venv/bin/python or python3)" >&2 + exit 1 +fi + +"$PYTHON" - "$TARGET" <<'PY' +import sys +from databricks.sdk import WorkspaceClient + +target = sys.argv[1] +display_name = f"doc-intel-supervisor-{target}" +w = WorkspaceClient() +for agent in w.supervisor_agents.list_supervisor_agents(): + if agent.display_name == display_name and agent.endpoint_name: + print(agent.endpoint_name) + raise SystemExit(0) +raise SystemExit(f"Agent Bricks Supervisor endpoint not found for target {target!r}") +PY diff --git a/specs/001-doc-intel-10k/contracts/agent-request.json b/specs/001-doc-intel-10k/contracts/agent-request.json index 5164911..d17d011 100644 --- a/specs/001-doc-intel-10k/contracts/agent-request.json +++ b/specs/001-doc-intel-10k/contracts/agent-request.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", "title": "Agent Endpoint Request", - "description": "Request body POSTed to the Mosaic AI Agent serving endpoint. AI Gateway adds identity headers automatically.", + "description": "App-level normalized request sent to the Agent Bricks Supervisor endpoint. AI Gateway and Databricks Apps OBO add identity automatically.", "type": "object", "required": ["question"], "properties": { @@ -11,16 +11,9 @@ "maxLength": 2000, "description": "Natural-language question from the analyst." }, - "top_k": { - "type": "integer", - "minimum": 1, - "maximum": 25, - "default": 5, - "description": "Number of citations to return after re-ranking." - }, "company_filter": { "type": ["string", "null"], - "description": "Optional company name to scope retrieval; null for cross-company." + "description": "Optional company name to pass as app metadata; Agent Bricks owns retrieval and routing." }, "fiscal_year_filter": { "type": ["integer", "null"], diff --git a/specs/001-doc-intel-10k/contracts/agent-response.json b/specs/001-doc-intel-10k/contracts/agent-response.json index 1df970f..e278bbc 100644 --- a/specs/001-doc-intel-10k/contracts/agent-response.json +++ b/specs/001-doc-intel-10k/contracts/agent-response.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", "title": "Agent Endpoint Response", - "description": "Response body returned by the Mosaic AI Agent serving endpoint.", + "description": "Normalized response rendered by the App after querying the Agent Bricks Supervisor endpoint.", "type": "object", "required": ["answer", "citations", "latency_ms", "retrieved_count", "agent_path"], "properties": { @@ -18,19 +18,19 @@ "description": "Sources used to ground the answer. May be empty when grounded=false.", "items": { "type": "object", - "required": ["filename", "section_label", "score"], "properties": { "filename": { "type": "string", "description": "Source filing filename." }, + "doc_uri": { "type": "string", "description": "Agent Bricks citation URI when present." }, "section_label": { "type": "string", "enum": ["MD&A", "Risk", "Financials", "Notes", "Other"], "description": "Canonical section label." }, - "score": { "type": "number", "minimum": 0, "maximum": 1, "description": "Re-ranker score." }, + "score": { "type": "number", "minimum": 0, "maximum": 1, "description": "Optional relevance score." 
}, "char_offset": { "type": ["integer", "null"], "description": "Optional offset into the section text." }, "snippet": { "type": ["string", "null"], "description": "Optional short snippet for hover preview." } }, - "additionalProperties": false + "additionalProperties": true } }, "latency_ms": { @@ -41,12 +41,12 @@ "retrieved_count": { "type": "integer", "minimum": 0, - "description": "Total candidates pulled before re-ranking trim." + "description": "Number of citations surfaced to the app." }, "agent_path": { "type": "string", - "enum": ["knowledge_assistant", "analyst", "supervisor"], - "description": "Which agent handled the request; logged for evaluation slicing." + "enum": ["knowledge_assistant", "agent_bricks_supervisor"], + "description": "Agent Bricks path logged for evaluation slicing." }, "conversation_id": { "type": ["string", "null"], diff --git a/specs/001-doc-intel-10k/data-model.md b/specs/001-doc-intel-10k/data-model.md index fbea4c9..631d39f 100644 --- a/specs/001-doc-intel-10k/data-model.md +++ b/specs/001-doc-intel-10k/data-model.md @@ -147,9 +147,9 @@ PDF in volume └─ ai_parse_document → silver_parsed_filings (parse_status) └─ ai_classify + ai_extract → gold_filing_{sections,kpis} └─ ai_query rubric → gold_filing_quality (quality_score) - └─ embed_eligible? → Vector Search index sync - └─ Knowledge Assistant + Analyst + Supervisor agents - └─ Model Serving + AI Gateway + └─ quality_score threshold → Vector Search index sync + └─ Agent Bricks Knowledge Assistant + Supervisor Agent + └─ AI Gateway + OBO └─ Streamlit App turn └─ Lakebase query_logs + feedback ``` diff --git a/specs/001-doc-intel-10k/plan.md b/specs/001-doc-intel-10k/plan.md index 7471612..aeea37b 100644 --- a/specs/001-doc-intel-10k/plan.md +++ b/specs/001-doc-intel-10k/plan.md @@ -5,15 +5,15 @@ ## Summary -Build a Databricks-native, governed pipeline + agent that turns SEC 10-K PDFs into a queryable lakehouse and a cited Q&A experience. SQL Lakeflow Spark Declarative Pipelines parse PDFs once with `ai_parse_document` (VARIANT), classify sections with `ai_classify`, extract structured KPIs with `ai_extract`, and score every section against a 5-dimension quality rubric. High-quality summaries flow into a Mosaic AI Vector Search index. A Mosaic AI Agent Framework agent (Knowledge Assistant + Custom Analyst Agent + Supervisor for cross-company fan-out) is logged via MLflow, registered in Unity Catalog, served behind AI Gateway, and surfaced through a Streamlit Databricks App with citation rendering and a feedback widget. Conversation history and feedback land in Lakebase Postgres. Lakehouse Monitoring tracks extraction drift; an AI/BI dashboard surfaces query-log content gaps. CLEARS evaluation in MLflow gates promotion. The entire stack is one Databricks Asset Bundle (`databricks bundle deploy -t demo|prod`). +Build a Databricks-native, governed pipeline + Agent Bricks system that turns SEC 10-K PDFs into a queryable lakehouse and a cited Q&A experience. SQL Lakeflow Spark Declarative Pipelines parse PDFs once with `ai_parse_document` (VARIANT), classify sections with `ai_classify`, extract structured KPIs with `ai_extract`, and score every section against a 5-dimension quality rubric. High-quality summaries flow into a Mosaic AI Vector Search index. Agent Bricks Knowledge Assistant handles cited document Q&A; Agent Bricks Supervisor Agent coordinates the Knowledge Assistant with a deterministic Unity Catalog KPI function for cross-company comparisons. 
AI Gateway, Unity Catalog, and mandatory OBO enforce identity and audit. Conversation history and feedback land in Lakebase Postgres. Lakehouse Monitoring tracks extraction drift; an AI/BI dashboard surfaces query-log content gaps. CLEARS evaluation in MLflow gates promotion. The stack is deployed by DAB plus idempotent Agent Bricks bootstrap (`databricks bundle deploy -t demo|prod`, `scripts/bootstrap_agent_bricks.py`). ## Technical Context **Language/Version**: SQL (Databricks SQL on serverless) for parse/classify/extract pipelines; Python 3.11 for agent + app + eval -**Primary Dependencies**: Lakeflow Spark Declarative Pipelines, Lakeflow Jobs, Mosaic AI Vector Search, Mosaic AI Agent Framework (`databricks-agents`, `mlflow >= 2.20`), Databricks Model Serving + AI Gateway, Databricks Apps (Streamlit), Lakebase Postgres, Lakehouse Monitoring, Databricks Asset Bundles CLI (`databricks` >= 0.260) +**Primary Dependencies**: Lakeflow Spark Declarative Pipelines, Lakeflow Jobs, Mosaic AI Vector Search, Agent Bricks Knowledge Assistant and Supervisor Agent, AI Gateway, Databricks Apps (Streamlit), Lakebase Postgres, Lakehouse Monitoring, Databricks Asset Bundles CLI (`databricks` >= 0.260), MLflow Agent Evaluation **Storage**: Unity Catalog — `.` with one volume (`raw_filings`) and Delta tables (`bronze_filings`, `silver_parsed_filings`, `gold_filing_sections`, `gold_filing_kpis`); Lakebase Postgres for `conversation_history`, `query_logs`, `feedback` **Testing**: `databricks bundle validate -t demo` (schema check), pytest for agent unit tests, MLflow `evaluate()` with `databricks-agents` evaluators for CLEARS, manual smoke via the deployed App -**Target Platform**: Databricks workspace with serverless SQL warehouse (AI Functions GA), Mosaic AI Vector Search and Model Serving entitlements; agent endpoint runs on CPU instance behind AI Gateway +**Target Platform**: Databricks workspace with serverless SQL warehouse (AI Functions GA), Mosaic AI Vector Search, Agent Bricks, Databricks Apps user-token passthrough, AI Gateway, Unity Catalog, and Lakebase enabled **Project Type**: Databricks lakehouse + agent stack delivered as a single DAB **Performance Goals**: Pipeline E2E ≤ 10 min P95 on a 30 MB PDF (SC-001); agent P95 ≤ 8s single-filing, ≤ 20s cross-company (SC-009); Vector Search refresh ≤ 5 min after Gold update **Constraints**: SQL only for parse/classify/extract layer; Python only for agent + app; CPU model serving (no GPU); zero hard-coded paths outside the bundle; one-command deploy; CLEARS thresholds C≥0.8, L p95≤8s, E≥0.95, A≥0.9, R≥0.8, S≥0.99 block promotion @@ -66,13 +66,10 @@ resources/ │ └── retention.job.yml # 90-day raw PDF cleanup ├── consumers/ │ ├── index_refresh.job.yml # Vector Search index create/sync -│ └── agent.serving.yml # Model Serving + AI Gateway -├── monitors/ -│ └── kpi_drift.yml # Lakehouse Monitoring -├── dashboards/ -│ └── usage.lvdash.yml # AI/BI Lakeview dashboard -└── apps/ - └── analyst.app.yml # Databricks App resource +│ ├── kpi_drift.yml # Lakehouse Monitoring +│ ├── usage.dashboard.yml # AI/BI Lakeview dashboard +│ ├── lakebase_catalog.yml # Lakebase database catalog +│ └── analyst.app.yml # Databricks App env binding to generated Agent Bricks endpoint pipelines/ └── sql/ @@ -82,14 +79,9 @@ pipelines/ └── 04_gold_quality.sql # 5-dim rubric → quality_score agent/ -├── analyst_agent.py # Mosaic AI Agent Framework -├── supervisor.py # Cross-company fan-out -├── retrieval.py # Hybrid keyword+semantic + re-rank -├── tools.py # UC Function tool wrapping 
gold_filing_kpis -├── log_and_register.py # mlflow.pyfunc + UC registry +├── tools.py # deterministic KPI tool glue for Agent Bricks └── tests/ - ├── test_retrieval.py - └── test_supervisor.py + └── test_tools.py app/ ├── app.py # Streamlit chat UI @@ -100,6 +92,11 @@ evals/ ├── dataset.jsonl # 30 questions: 20 P2 + 10 P3 └── clears_eval.py # MLflow CLEARS gate +scripts/ +├── bootstrap-demo.sh # staged deploy orchestration +├── bootstrap_agent_bricks.py # Knowledge Assistant + Supervisor bootstrap +└── wait_for_kpis.py + .github/ └── workflows/ └── deploy.yml # validate on PR, deploy -t demo on merge @@ -107,7 +104,7 @@ evals/ CLAUDE.md # Runtime guidance for Claude Code ``` -**Structure Decision**: Single DAB containing one pipeline, two jobs, one vector index, one serving endpoint, one Lakebase project, one monitor, one dashboard, one app, and a CI workflow. SQL pipeline code lives at the root under `pipelines/sql/`; Python agent and app code live at `agent/` and `app/`. This layout matches the `databricks-dabs` skill's recommended bundle-structure layout and the constitution's "declarative over imperative" principle. +**Structure Decision**: Single DAB containing one pipeline, two jobs, one Vector Search endpoint, one Lakebase project, one monitor, one dashboard, one app, and a CI workflow. Agent Bricks resources are SDK-managed by `scripts/bootstrap_agent_bricks.py` until DAB exposes first-class Knowledge Assistant and Supervisor resource types. SQL pipeline code lives at the root under `pipelines/sql/`; deterministic tool glue lives at `agent/`; app code lives at `app/`. ## Phase 0 — Outline & Research @@ -121,9 +118,9 @@ Output: [research.md](./research.md). Decisions captured: | Idempotency | `APPLY CHANGES INTO` keyed on `filename` for Silver and Gold | SDP native CDC, deterministic on re-upload, no Python helper | Hand-rolled MERGE (rejected: more code paths); content hash key (deferred — filename is sufficient for v1) | | Quality rubric | 5 dimensions × 0–6 scale; threshold ≥ 22/30; computed via `ai_query` calls in `04_gold_quality.sql` | Mirrors Reffy's 31-point pattern; SQL-native means no Python helper; explicit dimensions help debug rejections | Single `extraction_confidence` (rejected: no debuggability); 3-dim avg (rejected: too coarse) | | Vector Search index | Delta-Sync index over `gold_filing_sections` filtered by `embed_eligible`; embed `summary` column | Managed sync, no manual refresh; embeds curated content per principle IV | Direct Vector Index (rejected: no managed sync); embedding raw `parsed.text_full` (rejected: noise) | -| Retrieval strategy | Hybrid (keyword + semantic) top-25 → re-rank → top-5 | Reffy pattern; re-rank tightens top-5 ordering; CPU re-rank stays in budget | Pure semantic (rejected: misses exact filings/years); re-rank against top-100 (rejected: latency budget) | -| Agent framework | Mosaic AI Agent Framework via `databricks-agents` SDK + MLflow `pyfunc` | First-class Knowledge Assistant + Supervisor primitives; logged + registered in UC | LangGraph standalone (rejected: more glue, no UC registration story) | -| Serving | CPU instance behind AI Gateway; identity passthrough on | Cost-first per Reffy; Gateway gives audit + rate limit + on-behalf-of | GPU (rejected: not needed at scale of pilot); raw endpoint (rejected: no governance layer) | +| Retrieval strategy | Agent Bricks Knowledge Assistant over the governed document layer / Vector Search source | Demonstrates the Agent Bricks article pattern and removes custom retrieval/rerank serving 
code | Raw chunk search (rejected: ignores Document Intelligence quality layer) | +| Agent framework | Agent Bricks Knowledge Assistant + Supervisor Agent | First-class governed enterprise agent primitives; aligns with the source articles | Custom `mlflow.pyfunc` analyst agent (rejected: caused deploy-order and serving lifecycle failures); LangGraph standalone (rejected: not the reference pattern) | +| Serving | Agent Bricks endpoint behind AI Gateway with mandatory OBO | Gateway gives audit, rate limits, guardrails, and identity enforcement | Bespoke Model Serving endpoint ownership (rejected: custom lifecycle); service-principal auth for document Q&A (rejected: not production-safe) | | State store | Lakebase Postgres (managed) | Native to platform, low-latency reads/writes, fits Reffy pattern; integrates with Apps | Delta tables (rejected: write throughput on small turn-level updates); external Postgres (rejected: governance gap) | | Eval framework | MLflow `evaluate()` with `databricks-agents` evaluators on CLEARS axes | First-class CLEARS support; logged into MLflow runs | LangSmith / Ragas (rejected: external system) | | Monitoring | Lakehouse Monitoring `inference` profile on `gold_filing_kpis`; Lakeview AI/BI dashboard on `query_logs` | First-class drift detection; usage dashboard surfaces content gaps per Reffy | Custom Spark notebooks (rejected: imperative, principle III) | @@ -152,7 +149,7 @@ Output: `data-model.md`, `contracts/`, `quickstart.md`, plus the agent context u ### Contracts - `contracts/kpi-schema.json` — JSON schema passed to `ai_extract` (revenue, ebitda, segment_revenue, top_risks, fiscal_year, company_name, extraction_confidence) -- `contracts/agent-request.json` — `{question: string, top_k?: int, company_filter?: string}` +- `contracts/agent-request.json` — normalized app request metadata around an Agent Bricks user message - `contracts/agent-response.json` — `{answer: string, citations: [{filename, section_label, score, char_offset?}], latency_ms: int, retrieved_count: int}` - `contracts/feedback-event.json` — `{conversation_id, turn_id, user_id, rating: "up"|"down", comment?: string, ts}` diff --git a/specs/001-doc-intel-10k/quickstart.md b/specs/001-doc-intel-10k/quickstart.md index 787794b..1c10779 100644 --- a/specs/001-doc-intel-10k/quickstart.md +++ b/specs/001-doc-intel-10k/quickstart.md @@ -5,7 +5,7 @@ Goal: from a clean clone, stand up the entire stack on the Databricks `demo` tar ## Prerequisites - macOS or Linux, `python` 3.11+, `git`, `databricks` CLI ≥ 0.298 (`brew install databricks/tap/databricks`) -- A Databricks workspace with: serverless SQL warehouse (AI Functions GA), Mosaic AI Vector Search and Model Serving entitlements, Lakebase enabled +- A Databricks workspace with: serverless SQL warehouse (AI Functions GA), Mosaic AI Vector Search, Agent Bricks Knowledge Assistant and Supervisor Agent, AI Gateway, Databricks Apps user-token passthrough, Unity Catalog, and Lakebase enabled - An auth profile (`databricks auth login --host ` once); verify with `databricks auth profiles` - Local virtualenv: `python -m venv .venv && .venv/bin/pip install -r agent/requirements.txt -r evals/requirements.txt` @@ -31,10 +31,10 @@ DOCINTEL_WAREHOUSE_ID= \ The script implements a 6-step staged deploy: -1. Detect & clean orphan resources from prior failed runs. +1. Check environment conflicts such as Lakebase soft-delete name retention. 2. 
**Stage 1**: deploy `resources/foundation/` only (catalog/schema/volume, pipeline, retention job, Lakebase instance, VS endpoint) — consumer YAMLs are temp-renamed to `*.yml.skip`. -3. **Produce data**: upload synthetic samples, run pipeline, materialize the VS index, register agent model, wait for Lakebase to reach `AVAILABLE`. -4. **Stage 2**: full `bundle deploy` — consumers (serving endpoint, monitor, index-refresh job, app, dashboard, Lakebase catalog) attach to the live foundation. The VS endpoint is deployed in stage 1, and the bootstrap materializes the VS index before agent registration. +3. **Produce data**: upload synthetic samples, run pipeline, materialize the VS index, configure Agent Bricks Knowledge Assistant + Supervisor Agent, wait for Lakebase to reach `AVAILABLE`. +4. **Stage 2**: full `bundle deploy` — consumers (monitor, index-refresh job, app, dashboard, Lakebase catalog) attach to the live foundation. The VS endpoint is deployed in stage 1, and the bootstrap materializes the VS index before Knowledge Assistant configuration. 5. `bundle run analyst_app`; UC grants chain (`USE_CATALOG → USE_SCHEMA → SELECT/EXECUTE`). 6. Smoke check on the analyst endpoint. @@ -78,7 +78,7 @@ Expect: a markdown table with one row per company, segment-revenue values matchi DOCINTEL_CATALOG=workspace \ DOCINTEL_SCHEMA=docintel_10k_demo \ .venv/bin/python evals/clears_eval.py \ - --endpoint analyst-agent-demo \ + --endpoint "$(./scripts/resolve-agent-endpoint.sh demo)" \ --dataset evals/dataset.jsonl ``` @@ -98,7 +98,7 @@ Note: the Lakebase instance enters a soft-delete state for ~7 days during which |---|---|---| | `bundle validate` errors on `ai_parse_document` | Workspace lacks AI Functions GA | Move SQL warehouse to a recent serverless channel | | Vector Search index sync stuck | Embedding endpoint not provisioned | Provision `databricks-bge-large-en` or override `var.embedding_model_endpoint_name` | -| Agent endpoint 401 from App | OBO not plumbed end-to-end | Verify `app/app.py:_user_client` reads `x-forwarded-access-token` and the App's `user_api_scopes` includes `serving.serving-endpoints` (workspace must have user-token-passthrough enabled — see `docs/runbook.md` §"Enabling end-to-end OBO") | -| CLEARS Latency axis fails | Re-rank window too large | Reduce candidate window in `agent/retrieval.py` from 25 to 15 | +| Agent endpoint 401 from App | OBO not plumbed end-to-end | Verify `app/app.py:_user_client` reads `x-forwarded-access-token` and the App's `user_api_scopes` includes `serving.serving-endpoints` (workspace must have user-token-passthrough enabled — see `docs/runbook.md` §"Verifying end-to-end OBO") | +| CLEARS Latency axis fails | Agent Bricks orchestration or Knowledge Assistant source is too broad | Narrow the Knowledge Assistant source, tune Supervisor instructions, or reduce structured-tool fan-out | | Bootstrap blocks on Lakebase soft-delete | `lakebase_instance` name held by retention | Bump suffix in `databricks.yml` and retry | -| `⚠ APP-LEVEL OBO IS OPERATIONALLY DISABLED` banner | Workspace lacks user-token-passthrough feature | Workspace admin enables the feature; uncomment `user_api_scopes` in `resources/consumers/analyst.app.yml`; redeploy | +| App deploy fails on OBO scopes | Workspace lacks user-token-passthrough feature | Workspace admin enables the feature; this is a production prerequisite | diff --git a/specs/001-doc-intel-10k/research.md b/specs/001-doc-intel-10k/research.md index 3acf3ec..b210d30 100644 --- a/specs/001-doc-intel-10k/research.md +++ 
b/specs/001-doc-intel-10k/research.md @@ -39,7 +39,7 @@ evolution natively. ## Decision: Quality rubric as SQL `ai_query` calls in Gold **Rationale**: Reffy's 31-point rubric showed that scoring + filtering at -ingest is more cost-effective than re-ranking at inference. Using +ingest is more cost-effective than trying to repair context at inference. Using `ai_query()` in `04_gold_quality.sql` keeps the rubric declarative and versionable in git; the 5 dimensions (parse_completeness, layout_fidelity, ocr_confidence, section_recognizability, kpi_extractability) each give @@ -50,35 +50,41 @@ tunable as a bundle parameter. - *Single `extraction_confidence` value*: rejected — collapses debuggability. - *Python scorer in a job*: rejected — imperative, principle III. -## Decision: Hybrid retrieval (top-25) → re-rank → top-5 +## Decision: Agent Bricks Knowledge Assistant + Supervisor Agent, not custom pyfunc -**Rationale**: Reffy reports keyword-only sub-2s but reasoning needs LLM -generation. Hybrid keyword + semantic retrieval to top-25, then a Mosaic AI -re-ranker (CPU) trim to top-5, keeps single-filing P95 ≤ 8s achievable -on CPU serving while improving top-5 ordering qualitatively. Bigger windows blow -the latency budget; pure semantic misses exact ticker/year matches in -financial filings. +**Rationale**: The Agent Bricks article is explicit that the challenge is not +building an agent loop; it is running agents with real context, permissions, +identity, audit, and control. Therefore the reference implementation must use +Agent Bricks as the agent construction and governance layer. Knowledge +Assistant handles cited single-filing document Q&A over the governed Document +Intelligence output. Supervisor Agent orchestrates Knowledge Assistant with a +deterministic KPI tool over `gold_filing_kpis` for structured comparisons. +AI Gateway and Unity Catalog enforce identity, permissions, audit, routing, +and guardrails end to end. -## Decision: Mosaic AI Agent Framework + Knowledge Assistant + Supervisor - -**Rationale**: First-class platform primitives. The Knowledge Assistant -auto-ingests the Vector Search index and handles single-filing Q&A; -Supervisor handles cross-company fan-out (P3); a Custom Analyst Agent -adds a UC-Function tool that hits `gold_filing_kpis` directly for -structured comparisons. All three are logged via `mlflow.pyfunc`, -registered in UC, and served from a single endpoint. +Custom `mlflow.pyfunc` agents, custom retrieval/reranking loops, custom +supervisor routing, and bespoke Model Serving endpoint ownership are rejected +for this reference because they reproduce the exact glue layer that Agent +Bricks is meant to remove. They also caused deploy-order and serving +provisioning failures during validation. **Alternatives**: -- *LangGraph standalone*: rejected. Possible, but bypasses UC registration - and gives up the AI Gateway integration story. -- *DSPy* (Reffy's choice): considered. Mosaic AI Agent Framework now subsumes - the routing pattern, with first-class CLEARS evaluators. +- *Custom `mlflow.pyfunc` analyst agent*: rejected. It can work as a prototype, + but it is not the Agent Bricks pattern and it requires custom auth policy, + serving and supervisor lifecycle code. +- *LangGraph standalone*: rejected. Agent Bricks can interoperate with external + frameworks when needed, but this reference is intended to demonstrate + Databricks' governed enterprise agent platform, not a framework-specific + custom runtime. 
+- *DSPy* (Reffy's choice): rejected for v1 of this reference. The objective is + to demonstrate Document Intelligence + Agent Bricks as described in the + source articles. ## Decision: Lakebase Postgres for conversation state, not Delta tables **Rationale**: Per-turn writes (~100s of tiny rows/sec at peak) are a poor fit for Delta. Lakebase Postgres is the platform-native managed Postgres -that integrates with Apps and Model Serving; reads/writes are sub-10ms. +that integrates with Apps; reads/writes are sub-10ms. The Reffy team explicitly chose Lakebase for this exact pattern. ## Decision: Streamlit App, not React + FastAPI @@ -101,7 +107,7 @@ on every PR (`databricks bundle validate -t demo`), and `deploy` on push to edits we can't assume. A Lakeflow Job listing the volume, filtering `ingested_at < now()-90d`, and removing files is bundle-managed, auditable in `query_logs`, and trivial to extend later. Silver/Gold are -preserved indefinitely so retrieval doesn't lose context after raw cleanup. +preserved indefinitely so cited document answers do not lose context after raw cleanup. ## Open follow-ups @@ -110,4 +116,3 @@ None blocking. Items intentionally deferred: - Lakeflow Connect SharePoint connector (post-v1). - Content-hash idempotency key (filename suffices for v1). - React + FastAPI frontend. -- GPU re-rank for >50-result windows. diff --git a/specs/001-doc-intel-10k/spec.md b/specs/001-doc-intel-10k/spec.md index f16a93a..de42e26 100644 --- a/specs/001-doc-intel-10k/spec.md +++ b/specs/001-doc-intel-10k/spec.md @@ -18,8 +18,15 @@ ### Session 2026-04-25 - Q: Eval corpus — real EDGAR PDFs or synthetic? → A: Synthetic. The 30-question dataset references three synthetic 10-Ks (`samples/{ACME,BETA,GAMMA}_10K_2024.pdf`, generated by `samples/synthesize.py`) plus a deliberately low-quality `garbage_10K_2024.pdf` for SC-006. Real EDGAR filings can still be uploaded to the volume in deployed environments; the synthetic corpus exists so CI is fully deterministic and self-contained (no EDGAR dependency, no license concerns). User-facing examples in spec scenarios still use AAPL/MSFT/GOOG to convey intent. -- Q: Deploy ordering — single bundle deploy or staged? → A: Staged. `resources/foundation/` (catalog, pipeline, retention job, Lakebase instance, VS endpoint) deploys first; data is produced (sample upload, pipeline run, VS index materialization, agent registration, Lakebase ready); then `resources/consumers/` (serving endpoint, monitor, index-refresh job, app, Lakebase catalog) deploys. The chicken-egg dependencies between consumers and foundation data make a single deploy impossible. Bootstrap script automates this. -- Q: User identity passthrough? → A: OBO end-to-end is implemented but operationally requires the workspace-level "Databricks Apps - user token passthrough" feature. When disabled, the app falls back to SP creds with a loud bring-up banner identifying the limitation. +- Q: Deploy ordering — single bundle deploy or staged? → A: Staged. `resources/foundation/` (catalog, pipeline, retention job, Lakebase instance, VS endpoint) deploys first; data is produced (sample upload, pipeline run, VS index materialization, Agent Bricks Knowledge Assistant + Supervisor configuration, Lakebase ready); then `resources/consumers/` (monitor, index-refresh job, app, Lakebase catalog) deploys. The chicken-egg dependencies between consumers and foundation data make a single deploy impossible. Bootstrap script automates this. +- Q: User identity passthrough? → A: OBO end-to-end is mandatory. 
The workspace-level "Databricks Apps - user token passthrough" feature must be enabled before deployment. When disabled, deploy fails with an actionable prerequisite error. + +### Session 2026-04-26 + +- Q: What is the architectural source of truth? → A: The reference implementation MUST demonstrate the patterns in Databricks' "Why Your Agents Can't Read Enterprise Documents" and "Agent Bricks: The Governed Enterprise Agent Platform" articles. Document Intelligence is the document-processing foundation; Agent Bricks is the agent construction, orchestration, governance, and serving foundation. +- Q: Is a custom `mlflow.pyfunc` analyst agent acceptable as the primary implementation? → A: No. Custom pyfunc retrieval/supervisor/serving code is a divergence from the Agent Bricks-first reference and MUST be removed. Knowledge Assistant MUST handle cited single-filing document Q&A. Supervisor Agent MUST handle orchestration across document Q&A and structured KPI tools. +- Q: What custom code may remain? → A: Custom code may remain only where it demonstrates integration around Agent Bricks rather than replacing Agent Bricks: the Document Intelligence SQL pipeline, deterministic Gold KPI SQL/tool access, Databricks App UX, Lakebase feedback persistence, and deploy/eval automation. +- Q: Should the implementation keep legacy fallback logic for workspaces without user-token passthrough or Agent Bricks support? → A: No. Production deployment requires Agent Bricks, AI Gateway, Unity Catalog, Databricks Apps user-token passthrough, and end-to-end OBO. Missing prerequisites MUST fail validation or deploy; service-principal fallback and legacy custom-agent fallback are not acceptable. ## User Scenarios & Testing *(mandatory)* @@ -40,9 +47,9 @@ An equity research analyst uploads a public-company 10-K PDF (e.g., `AAPL_10K_20 --- -### User Story 2 — Ask the corpus a single-filing question with citations (Priority: P2) +### User Story 2 — Ask the corpus a single-filing question with Knowledge Assistant citations (Priority: P2) -An analyst opens a chat UI and asks a focused question about one company's filing — e.g., "What were the top 3 risk factors disclosed by Apple in their FY24 10-K?" The agent retrieves matching gold sections via hybrid keyword + semantic search, generates a grounded answer, and renders the answer alongside citations that link back to the source filename and section label. The analyst can submit thumbs-up/down feedback that is persisted. +An analyst opens a chat UI and asks a focused question about one company's filing — e.g., "What were the top 3 risk factors disclosed by Apple in their FY24 10-K?" Agent Bricks Knowledge Assistant grounds the answer in the governed Document Intelligence output / Vector Search knowledge source, and the App renders the answer alongside citations that link back to the source filename and section label. The analyst can submit thumbs-up/down feedback that is persisted. **Why this priority**: Single-filing Q&A is the most common analyst workflow and the foundation of trust in the system. Citations make answers verifiable; feedback closes the evaluation loop. Cross-company aggregation (P3) reuses the same retrieval primitives, so this must work first.
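For orientation, a minimal sketch of the app-side turn this story implies, assuming Databricks Apps user-token passthrough and the bundle-injected `AGENT_ENDPOINT_NAME` binding. The helper name `_user_client` and the `x-forwarded-access-token` header match the quickstart troubleshooting table; everything else (function shape, citation normalization) is illustrative and is not the shipped `app/app.py`.

```python
# Illustrative sketch only, not the shipped app/app.py. Assumes Databricks
# Apps user-token passthrough (mandatory OBO) and the AGENT_ENDPOINT_NAME
# env binding created by the bundle.
import os
import time

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import ChatMessage, ChatMessageRole


def _user_client(headers: dict) -> WorkspaceClient:
    # OBO is mandatory (FR-016): fail loudly instead of falling back to
    # service-principal credentials.
    token = headers.get("x-forwarded-access-token")
    if not token:
        raise RuntimeError("user-token passthrough missing; OBO is a deploy prerequisite")
    return WorkspaceClient(host=os.environ["DATABRICKS_HOST"], token=token, auth_type="pat")


def ask(headers: dict, question: str) -> dict:
    w = _user_client(headers)
    start = time.monotonic()
    resp = w.serving_endpoints.query(
        name=os.environ["AGENT_ENDPOINT_NAME"],
        messages=[ChatMessage(role=ChatMessageRole.USER, content=question)],
    )
    answer = resp.choices[0].message.content if resp.choices else ""
    # Citation extraction from the Agent Bricks payload is schematic here;
    # the app normalizes it to contracts/agent-response.json.
    citations: list[dict] = []
    return {
        "answer": answer,
        "citations": citations,
        "latency_ms": int((time.monotonic() - start) * 1000),
        "retrieved_count": len(citations),
        "agent_path": "knowledge_assistant",
    }
```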
@@ -56,9 +63,9 @@ An analyst opens a chat UI and asks a focused question about one company's filin --- -### User Story 3 — Cross-company aggregation via supervisor agent (Priority: P3) +### User Story 3 — Cross-company aggregation via Agent Bricks Supervisor Agent (Priority: P3) -An analyst asks a multi-company question — e.g., "Compare segment revenue between Apple, Microsoft, and Google in their most recent 10-Ks." A supervisor orchestrates per-company retrievals, pulls structured KPIs from the Gold table, and returns a markdown table aggregating the results with per-row citations. +An analyst asks a multi-company question — e.g., "Compare segment revenue between Apple, Microsoft, and Google in their most recent 10-Ks." Agent Bricks Supervisor Agent orchestrates per-company Knowledge Assistant calls and a deterministic structured KPI tool over the Gold table, then returns a markdown table aggregating the results with per-row citations. **Why this priority**: Highest-value but depends on P1 (KPIs in Gold) and P2 (single-filing retrieval working reliably). Without P1's structured columns, aggregation collapses into freeform text comparison and loses precision. @@ -92,14 +99,16 @@ An analyst asks a multi-company question — e.g., "Compare segment revenue betw - **FR-004**: System MUST extract per-filing structured KPIs against a defined JSON schema including: `revenue` (numeric, USD), `ebitda` (numeric, USD), `segment_revenue` (array of `{name, revenue}`), `top_risks` (array of strings), `fiscal_year` (integer), `company_name` (string), and `extraction_confidence` (numeric 0-1). - **FR-005**: System MUST score every Gold row against a five-dimension quality rubric — `parse_completeness`, `layout_fidelity`, `ocr_confidence`, `section_recognizability`, `kpi_extractability` — each scored 0–6 (rubric total 0–30); rows with total ≥ 22 MUST be eligible for the Vector Search index, rows below MUST be excluded but retained in Gold for audit. - **FR-006**: System MUST embed curated section summaries (not raw chunks) into Vector Search; the index MUST refresh automatically when Gold updates. -- **FR-007**: System MUST expose an agent endpoint behind an AI Gateway that supports: (a) single-filing Q&A with citations, (b) cross-company supervisor fan-out, (c) hybrid keyword + semantic retrieval with re-ranking. End-to-end response time MUST meet P95 ≤ 8s for single-filing questions and P95 ≤ 20s for cross-company supervisor questions. +- **FR-007**: System MUST expose an Agent Bricks endpoint behind AI Gateway that supports: (a) single-filing Q&A with citations through Knowledge Assistant, (b) cross-company orchestration through Supervisor Agent, and (c) deterministic structured KPI lookup as a governed tool. Custom Python retrieval/rerank/supervisor loops MUST NOT be the primary agent implementation. End-to-end response time MUST meet P95 ≤ 8s for single-filing questions and P95 ≤ 20s for cross-company supervisor questions. - **FR-008**: System MUST render answers in a Databricks App UI with inline citations linking to source filename + section, plus a feedback widget (thumbs up/down + comment). - **FR-009**: System MUST persist conversation history, query logs, and feedback in a transactional store suitable for fast reads/writes alongside the agent serving path. 
- **FR-010**: System MUST evaluate the agent against a curated eval set of 30 hand-authored questions (20 P2 single-filing, 10 P3 cross-company) checked into the repo at `evals/dataset.jsonl`, scoring each axis of CLEARS and gating promotion on per-axis thresholds: Correctness ≥ 0.8, Latency p95 ≤ 8s, Execution ≥ 0.95, Adherence ≥ 0.9, Relevance ≥ 0.8, Safety ≥ 0.99. Any failing axis MUST block promotion. - **FR-011**: System MUST expose a monitoring dashboard summarizing extraction drift on Gold and a usage dashboard summarizing conversation logs (top queries, content gaps). -- **FR-012**: System MUST be deployable end-to-end (catalog/schema/volume, pipelines, vector index, agent endpoint, gateway, app, monitors, dashboards) via a single repeatable bring-up command; two environments (demo, prod) MUST be defined; no resource MAY be created outside the bundle. (See Clarifications session 2026-04-25 for the staged-deploy realization — first bring-up uses `./scripts/bootstrap-demo.sh` to handle chicken-egg dependencies between consumers and live data; steady-state deploys are a single `databricks bundle deploy`.) +- **FR-012**: System MUST be deployable end-to-end (catalog/schema/volume, Document Intelligence pipelines, vector index or Knowledge Assistant source, Agent Bricks endpoint/configuration, AI Gateway, app, monitors, dashboards) via a single repeatable bring-up command; two environments (demo, prod) MUST be defined; any resource not yet expressible as DAB YAML MUST be created by idempotent bootstrap code that is treated as part of the production deployment, not as manual setup. - **FR-013**: System MUST process duplicate uploads idempotently keyed on filename. - **FR-014**: System MUST gracefully report missing/ungrounded answers ("no source found") rather than hallucinating when retrieval returns no qualified results. +- **FR-015**: System MUST explicitly remove current custom-agent divergence: `agent/analyst_agent.py`, `agent/retrieval.py`, `agent/supervisor.py`, direct `mlflow.pyfunc` registration, and bespoke Model Serving endpoint ownership MUST be replaced by Agent Bricks Knowledge Assistant / Supervisor Agent configuration. No temporary compatibility shims or legacy fallback endpoint may remain. +- **FR-016**: System MUST require end-to-end user identity. Databricks Apps user-token passthrough, Agent Bricks / AI Gateway OBO, and UC permission enforcement are production prerequisites. If any prerequisite is unavailable, deploy MUST fail with an actionable error; the app and agent MUST NOT fall back to broad service-principal reads. ### Key Entities @@ -128,10 +137,10 @@ An analyst asks a multi-company question — e.g., "Compare segment revenue betw ## Assumptions - The target Databricks workspace has a serverless SQL warehouse with `ai_parse_document` (GA), `ai_classify`, `ai_extract`, and `ai_prep_search` available. -- Mosaic AI Vector Search and Model Serving entitlements are enabled for the workspace. +- Mosaic AI Vector Search, Agent Bricks, AI Gateway, and Databricks Apps user-token passthrough entitlements are enabled for the workspace. - Sample 10-K PDFs are publicly available SEC filings (EDGAR) the analyst manually uploads to the volume; no automated SharePoint/Drive sync in v1. - A Service Principal exists for prod deploys but is not used in v1 (demo target only). -- Analyst end-users have UC `SELECT` on the configured catalog/schema and execute permission on the agent endpoint via UC identity passthrough. 
+- Analyst end-users have UC `SELECT` on the configured catalog/schema, `EXECUTE` on the KPI function, and `CAN QUERY` on the Agent Bricks endpoints via end-to-end OBO. - The CLI auth profile on the operator's machine targets a workspace where the bundle can deploy without further policy exceptions. - 10-K fiscal year and company name can be reliably extracted from the parsed cover page; if not, `extraction_confidence` reflects the gap and the row remains queryable. - A curated eval set of 30 questions (20 P2 + 10 P3) is authored during implementation and checked in at `evals/dataset.jsonl`; CLEARS thresholds are tunable in config but defaults are fixed in FR-010. diff --git a/specs/001-doc-intel-10k/tasks.md b/specs/001-doc-intel-10k/tasks.md index d36f7a8..e9eac71 100644 --- a/specs/001-doc-intel-10k/tasks.md +++ b/specs/001-doc-intel-10k/tasks.md @@ -19,17 +19,17 @@ description: "Task list for Databricks 10-K Analyst implementation" ## Path Conventions -This is a single-DAB Databricks project. SQL pipeline code at `pipelines/sql/`, Python agent at `agent/`, Streamlit App at `app/`, evals at `evals/`, bundle resources at `resources/`. See plan.md for the full tree. +This is a DAB plus Agent Bricks bootstrap project. SQL pipeline code is at `pipelines/sql/`, deterministic tool glue at `agent/`, Streamlit App at `app/`, evals at `evals/`, bundle resources at `resources/`, and Agent Bricks orchestration in `scripts/bootstrap_agent_bricks.py`. See plan.md for the full tree. --- ## Phase 1: Setup (Shared Infrastructure) - [ ] T001 Verify `databricks` CLI ≥ 0.260 is installed and `databricks auth profiles` shows a working profile; if missing, follow the official Databricks CLI installation docs -- [x] T002 Create the bundle skeleton at `databricks.yml` with `bundle.name: doc-intel-10k`, `targets: {demo, prod}`, variables `catalog`, `schema`, `workspace_host`, `service_principal_id` (prod only), `embedding_model_endpoint_name`, `quality_threshold` (default 22), `top_k` (default 5) +- [x] T002 Create the bundle skeleton at `databricks.yml` with `bundle.name: doc-intel-10k`, `targets: {demo, prod}`, variables `catalog`, `schema`, `service_principal_id` (prod only), `embedding_model_endpoint_name`, and `quality_threshold` (default 22) - [x] T003 [P] Add `.github/workflows/deploy.yml` running `databricks bundle validate -t demo` on PR and `databricks bundle deploy -t demo` + `python evals/clears_eval.py` on push to `main` - [x] T004 [P] Create empty `pipelines/sql/`, `agent/`, `app/`, `evals/`, `resources/{pipelines,jobs,vector_search,serving,lakebase,monitors,dashboards,apps}/` directories with `.gitkeep` files -- [x] T005 [P] Add `agent/requirements.txt` (`mlflow>=2.20`, `databricks-agents`, `databricks-vectorsearch`, `databricks-sdk`) and `app/requirements.txt` (`streamlit`, `databricks-sdk`, `psycopg[binary]`) +- [x] T005 [P] Add `agent/requirements.txt` (`databricks-sdk`) and `app/requirements.txt` (`streamlit`, `databricks-sdk`, `psycopg[binary]`) --- @@ -78,19 +78,19 @@ This is a single-DAB Databricks project. 
SQL pipeline code at `pipelines/sql/`, ### Tests for US2 (TDD) -- [x] T018 [P] [US2] Add `agent/tests/test_retrieval.py` covering: hybrid retrieval returns ≤25 candidates, re-rank trims to `top_k`, `embed_eligible=false` rows never returned, `company_filter` and `fiscal_year_filter` are honored -- [x] T019 [P] [US2] Add `agent/tests/test_agent.py` covering: agent returns `grounded=true` only when ≥1 citation present, "no source found" path triggers when retrieval is empty, response validates against `contracts/agent-response.json` +- [x] T018 [P] [US2] Remove custom retrieval tests and add `agent/tests/test_tools.py` coverage for deterministic KPI SQL parameterization +- [x] T019 [P] [US2] Validate app-side Agent Bricks response normalization and citation rendering through Streamlit smoke/eval coverage ### Implementation for US2 - [x] T020 [P] [US2] Define the Vector Search endpoint in `resources/foundation/filings_index.yml`; the Delta-Sync index over `${var.catalog}.${var.schema}.gold_filing_sections_indexable` is created by `jobs/index_refresh/sync_index.py` because DAB does not manage Vector Search indexes directly (depends on T013) - [x] T021 [P] [US2] Define the index-refresh Lakeflow Job in `resources/consumers/index_refresh.job.yml` with a table-update trigger on `gold_filing_sections_indexable` and a Python task that creates/syncs `${var.catalog}.${var.schema}.filings_summary_idx` (depends on T020) -- [x] T022 [US2] Implement `agent/retrieval.py`: `hybrid_retrieve(question, top_k=25, filters=None)` calling Vector Search with `query_type='HYBRID'`, then `mosaic_rerank(question, candidates, top_k=5)`; returns list of citation dicts matching `agent-response.json` (depends on T020; tests T018 must fail first) -- [x] T023 [US2] Implement `agent/tools.py`: a UC Function tool wrapping `SELECT * FROM gold_filing_kpis WHERE filename = :filename` for the agent to access structured KPIs deterministically -- [x] T024 [US2] Implement `agent/analyst_agent.py`: a `mlflow.pyfunc` model class implementing the Mosaic AI Agent Framework chat protocol; uses `retrieval.hybrid_retrieve` for grounding, calls a foundation model endpoint to generate the answer, returns the schema in `contracts/agent-response.json` (depends on T022, T023) -- [x] T025 [US2] Implement `agent/log_and_register.py`: `mlflow.pyfunc.log_model(...)`, `mlflow.register_model(...)` to UC at `${var.catalog}.${var.schema}.analyst_agent`; assign UC Model Alias `@demo` (and later `@prod`) to the freshly registered version so Model Serving in T026 follows the alias rather than a frozen version (depends on T024) -- [x] T026 [US2] Define the Model Serving endpoint in `resources/consumers/agent.serving.yml`: CPU instance, served entity = `${var.catalog}.${var.schema}.analyst_agent`, AI Gateway with rate limit + audit enabled (depends on T025) -- [x] T027 [US2] Implement `app/app.py` (Streamlit): chat input, calls the agent endpoint via `databricks.sdk.WorkspaceClient.serving_endpoints.query`, renders answer + citations as chips that show filename + section on hover, thumbs-up/down + comment widget that POSTs to a Lakebase write helper; persists `conversation_id` in session state (depends on T026, T007) +- [x] T022 [US2] Remove custom retrieval implementation (`agent/retrieval.py`) and configure Agent Bricks Knowledge Assistant over the governed Document Intelligence / Vector Search source (depends on T020) +- [x] T023 [US2] Implement `agent/tools.py` as deterministic structured KPI tool glue for Agent Bricks, wrapping governed SQL over 
`gold_filing_kpis` +- [x] T024 [US2] Remove custom `agent/analyst_agent.py` and direct `mlflow.pyfunc` registration; Knowledge Assistant owns single-filing cited Q&A (depends on T022, T023) +- [x] T025 [US2] Remove `agent/log_and_register.py` and bespoke model-version promotion from the production path; bootstrap configures Agent Bricks resources idempotently instead +- [x] T026 [US2] Replace `resources/consumers/agent.serving.yml` with `scripts/bootstrap_agent_bricks.py` Agent Bricks endpoint/configuration behind AI Gateway with mandatory OBO and guardrails (depends on T024, T025) +- [x] T027 [US2] Implement `app/app.py` (Streamlit): chat input, calls the Agent Bricks endpoint as the invoking user, renders answer + citations as chips, thumbs-up/down + comment widget that POSTs to a Lakebase write helper; persists `conversation_id` in session state (depends on T026, T007) - [x] T028 [US2] Implement `app/lakebase_client.py`: thin wrapper using `psycopg` with the bundle-injected DSN to insert into `conversation_history`, `query_logs`, `feedback` - [x] T029 [US2] Define the Databricks App in `resources/consumers/analyst.app.yml`: source = `app/`, runtime python, env = Lakebase binding + agent endpoint binding (depends on T027, T028) - [x] T030 [US2] Author `evals/dataset.jsonl` 20 P2 questions per `data-model.md`'s eval section (each with `expected_filename`, `expected_section`, `expected_answer_keywords`, `min_citations`) @@ -110,13 +110,13 @@ This is a single-DAB Databricks project. SQL pipeline code at `pipelines/sql/`, ### Tests for US3 (TDD) -- [x] T034 [P] [US3] Add `agent/tests/test_supervisor.py` covering: supervisor fans out 1 sub-question per detected company, missing companies trigger explicit "not in corpus" handling, the rendered markdown table shape matches expected (header + N rows + citations column), aggregation is deterministic for the same inputs +- [ ] T034 [P] [US3] Add deployed Agent Bricks Supervisor acceptance checks covering: Supervisor invokes Knowledge Assistant per detected company, invokes the KPI tool for structured fields, missing companies trigger explicit "not in corpus" handling, and the rendered markdown table shape matches expected ### Implementation for US3 -- [x] T035 [US3] Implement `agent/supervisor.py`: detects company names via a small classifier or LLM call, fans out a per-company query through `analyst_agent`, pulls structured `gold_filing_kpis` rows via `tools.py`, formats a markdown table; returns `agent_path='supervisor'` in the response (depends on T024, T023; tests T034 must fail first) -- [x] T036 [US3] Update `agent/analyst_agent.py` to detect cross-company intent at the routing layer and delegate to `supervisor.handle()`; otherwise stay in single-filing path (depends on T035) -- [x] T037 [US3] Re-run `agent/log_and_register.py` from CI (GH Actions deploy step in T003) to register a new UC model version with the supervisor enabled and re-assign alias `@demo`; the serving endpoint follows the alias so no yml edit is needed +- [x] T035 [US3] Remove custom `agent/supervisor.py`; configure Agent Bricks Supervisor Agent instructions/tools to orchestrate Knowledge Assistant + KPI tool (depends on T024, T023) +- [x] T036 [US3] Configure Agent Bricks routing/instructions for cross-company intent; no custom Python routing layer remains (depends on T035) +- [x] T037 [US3] Update CI to validate/deploy Agent Bricks configuration directly; no `agent/log_and_register.py` step remains - [x] T038 [US3] Author 10 P3 questions in `evals/dataset.jsonl` (each 
with `expected_companies` and `expected_table_columns`) (depends on T030) - [x] T039 [US3] Extend `evals/clears_eval.py` to slice metrics by `category in {P2, P3}` and assert SC-002 ≥0.8 on P2, SC-003 ≥0.7 on P3 (depends on T031, T038) - [x] T040 [US3] Update `app/app.py` to render markdown tables (Streamlit `st.markdown(..., unsafe_allow_html=False)` already handles this) and surface a "show structured KPIs" expander next to each row (depends on T036) @@ -146,19 +146,19 @@ This is a single-DAB Databricks project. SQL pipeline code at `pipelines/sql/`, - **Phase 2 (Foundational)**: depends on Phase 1; **blocks all user stories** - **Phase 3 (US1)**: depends on Phase 2 - **Phase 4 (US2)**: depends on Phase 3 (specifically T011, T013) — vector index needs `gold_filing_sections.embed_eligible` -- **Phase 5 (US3)**: depends on Phase 4 (specifically T024) — supervisor wraps `analyst_agent` +- **Phase 5 (US3)**: depends on Phase 4 (specifically T026) — Supervisor Agent wraps Knowledge Assistant and the KPI function - **Phase 6 (Polish)**: depends on all user stories complete ### User Story Dependencies - **US1**: independent given Phase 2 done - **US2**: depends on US1's Gold tables to have data to embed -- **US3**: depends on US2's analyst agent to fan out +- **US3**: depends on US2's Knowledge Assistant and KPI function ### Within Each User Story - Pipeline SQL files: T009 → T010 → T011 → T012 → T013 → T014 (linear within US1) -- Agent code (US2): T020 → T022 → T024 → T025 → T026 → T031 (mostly linear); tests T018/T019 first +- Agent Bricks bootstrap (US2): T020 → T022 → T024 → T025 → T026 → T031 (mostly linear); tests T018/T019 first - Supervisor (US3): T035 → T036 → T037 → T039 → T040 ### Parallel Opportunities @@ -167,7 +167,7 @@ This is a single-DAB Databricks project. SQL pipeline code at `pipelines/sql/`, - T007, T008 in Phase 2 (after T006) - T009 (Bronze SQL) parallel with T015 (retention Job) and T016 (sample PDF) and T017 (initial dashboard) - T020 (VS index yml) and T021 (refresh Job yml) in parallel after T013 -- T018 and T019 (agent tests) parallel +- T018 and T019 (Agent Bricks tool/app tests) parallel - T030 (P2 eval items) and T032 (Lakehouse Monitor) parallel within US2 - T041, T042, T044, T045 in Phase 6 @@ -181,13 +181,13 @@ Task: "T020 Vector Search index yml" Task: "T021 Index-refresh Job yml" # Then write tests in parallel: -Task: "T018 retrieval tests" -Task: "T019 agent contract tests" +Task: "T018 KPI tool tests" +Task: "T019 app response normalization tests" -# Then implement (sequential within agent/ Python module dependencies): -Task: "T022 retrieval.py" +# Then implement (sequential within Agent Bricks bootstrap dependencies): +Task: "T022 Knowledge Assistant source" Task: "T023 tools.py" -Task: "T024 analyst_agent.py" (depends on T022, T023) +Task: "T024 remove pyfunc runtime" (depends on T022, T023) ``` ---
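To make T023's deterministic KPI tool glue concrete: a hedged sketch of the parameterized lookup over `gold_filing_kpis` that `agent/tools.py` wraps for the Supervisor Agent. The function name, environment-variable plumbing, and result shaping are assumptions; statement status polling and error handling are elided.

```python
# Sketch of the deterministic KPI lookup behind the Supervisor Agent's
# structured tool (T023). kpis_for_company and the env-var names are
# illustrative assumptions; error handling is elided.
import os

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.sql import StatementParameterListItem


def kpis_for_company(company_name: str) -> list[dict]:
    w = WorkspaceClient()
    fqn = f'{os.environ["DOCINTEL_CATALOG"]}.{os.environ["DOCINTEL_SCHEMA"]}.gold_filing_kpis'
    resp = w.statement_execution.execute_statement(
        warehouse_id=os.environ["DOCINTEL_WAREHOUSE_ID"],
        # Named-parameter binding keeps the tool deterministic and
        # injection-safe; UC SELECT grants govern access end to end.
        statement=(
            "SELECT company_name, fiscal_year, revenue, ebitda, segment_revenue "
            f"FROM {fqn} WHERE company_name = :company ORDER BY fiscal_year DESC"
        ),
        parameters=[StatementParameterListItem(name="company", value=company_name)],
        wait_timeout="30s",  # assumes synchronous completion at demo corpus size
    )
    columns = [c.name for c in resp.manifest.schema.columns]
    return [dict(zip(columns, row)) for row in (resp.result.data_array or [])]
```

Binding the company name as a statement parameter, rather than interpolating user input into SQL, is what makes the tool safe to expose through Agent Bricks; the UC grants applied in the deploy workflow then govern who may execute it.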