diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 5ca9ee2..8ccb478 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -72,25 +72,28 @@ jobs: # transition it back through provisioning; the catalog/app bindings # need it AVAILABLE before the next bundle run touches them. run: | - python -c " -import json, os, sys, time, subprocess -name = os.environ.get('LAKEBASE_NAME') or 'docintel-demo-state-v1' -deadline = time.time() + 600 -while True: - out = subprocess.run(['databricks','api','get','/api/2.0/database/instances','--output','json'], - capture_output=True, text=True) - try: - d = json.loads(out.stdout) - except Exception: - d = {} - state = next((i.get('state') for i in d.get('database_instances',[]) if i.get('name')==name), 'UNKNOWN') - print(f'lakebase state: {state}') - if state == 'AVAILABLE': - sys.exit(0) - if time.time() >= deadline: - sys.exit(f'Lakebase {name} did not reach AVAILABLE within 600s (state={state})') - time.sleep(15) -" + python - <<'PY' + import json, os, sys, time, subprocess + name = os.environ.get('LAKEBASE_NAME') or 'docintel-demo-state-v1' + deadline = time.time() + 600 + while True: + out = subprocess.run( + ['databricks', 'api', 'get', '/api/2.0/database/instances', '--output', 'json'], + capture_output=True, + text=True, + ) + try: + d = json.loads(out.stdout) + except Exception: + d = {} + state = next((i.get('state') for i in d.get('database_instances', []) if i.get('name') == name), 'UNKNOWN') + print(f'lakebase state: {state}') + if state == 'AVAILABLE': + sys.exit(0) + if time.time() >= deadline: + sys.exit(f'Lakebase {name} did not reach AVAILABLE within 600s (state={state})') + time.sleep(15) + PY env: LAKEBASE_NAME: ${{ vars.DOCINTEL_LAKEBASE_NAME || 'docintel-demo-state-v1' }} @@ -104,7 +107,7 @@ while True: databricks bundle run -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" doc_intel_pipeline python scripts/wait_for_kpis.py --min-rows 3 --timeout 900 databricks 
bundle run -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" --var "agent_endpoint_name=$AGENT_ENDPOINT_NAME" index_refresh - python scripts/bootstrap_agent_bricks.py \ + python -m agent.document_intelligence_agent \ --target demo \ --catalog "$DOCINTEL_CATALOG" \ --schema "$DOCINTEL_SCHEMA" \ @@ -130,19 +133,38 @@ while True: # `bundle deploy` alone uploads code but doesn't apply config/restart. run: databricks bundle run -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" --var "agent_endpoint_name=$AGENT_ENDPOINT_NAME" analyst_app - - name: Verify OBO scopes survived deploy - # `bundle run` may wipe user_api_scopes (documented destructive-update - # behavior). Fail loudly because user-token passthrough is mandatory. + - name: Verify app auth mode and endpoint grants run: | databricks apps get doc-intel-analyst-demo --output json > /tmp/app.json - python -c " -import json -app = json.load(open('/tmp/app.json')) -scopes = set(app.get('user_api_scopes') or []) -required = {'serving.serving-endpoints', 'sql'} -missing = required - scopes -assert not missing, f'OBO scopes missing: {sorted(missing)} (got {sorted(scopes)})' -" + app_obo_required="$(python -c "import yaml; d=yaml.safe_load(open('databricks.yml')); default=d.get('variables',{}).get('app_obo_required',{}).get('default','true'); value=d.get('targets',{}).get('demo',{}).get('variables',{}).get('app_obo_required', default); print(str(value).lower())")" + lakebase_name="$(python -c "import yaml; d=yaml.safe_load(open('databricks.yml')); print(d.get('targets',{}).get('demo',{}).get('variables',{}).get('lakebase_instance','docintel-demo-state-v1'))")" + python -c "import json; app=json.load(open('/tmp/app.json')); vals=[str(app.get(k)) for k in ('service_principal_client_id','service_principal_name','service_principal_id') if app.get(k) is not None]; print('\n'.join(dict.fromkeys(v for v in vals if v)))" > /tmp/app-sp-candidates.txt + db_granted=0 + while IFS= read -r principal; do + grant_json="$(python -c 
"import json, sys; print(json.dumps({'access_control_list':[{'service_principal_name':sys.argv[1],'permission_level':'CAN_USE'}]}))" "$principal")" + if databricks permissions update database-instances "$lakebase_name" --json "$grant_json"; then + db_granted=1 + break + fi + done < /tmp/app-sp-candidates.txt + test "$db_granted" = "1" + if [ "$app_obo_required" = "true" ]; then + # `bundle run` may wipe user_api_scopes (documented destructive-update + # behavior). Fail loudly if required user scopes are missing. + python -c "import json; app=json.load(open('/tmp/app.json')); scopes=set(app.get('user_api_scopes') or []); required={'serving.serving-endpoints','sql'}; missing=required-scopes; assert not missing, f'OBO scopes missing: {sorted(missing)} (got {sorted(scopes)})'" + else + python -c "import json; app=json.load(open('/tmp/app.json')); scopes=app.get('user_api_scopes'); assert not scopes, f'demo App-SP mode expected no user_api_scopes, got {scopes}'" + endpoint_id="$(databricks serving-endpoints get "$AGENT_ENDPOINT_NAME" --output json | python -c "import json, sys; e=json.load(sys.stdin); print(e.get('id') or e.get('name'))")" + granted=0 + while IFS= read -r principal; do + grant_json="$(python -c "import json, sys; print(json.dumps({'access_control_list':[{'service_principal_name':sys.argv[1],'permission_level':'CAN_QUERY'}]}))" "$principal")" + if databricks permissions update serving-endpoints "$endpoint_id" --json "$grant_json"; then + granted=1 + break + fi + done < /tmp/app-sp-candidates.txt + test "$granted" = "1" + fi - name: CLEARS evaluation gate run: python evals/clears_eval.py --endpoint "$AGENT_ENDPOINT_NAME" --dataset evals/dataset.jsonl diff --git a/CLAUDE.md b/CLAUDE.md index 66acf46..892f151 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,15 +16,15 @@ For an end-to-end overview written for humans, read [`README.md`](./README.md). 
The bundle has three chicken-egg dependencies that prevent a single `databricks bundle deploy -t demo` from succeeding on a fresh workspace: -1. **Databricks App resource binding** references the Agent Bricks Supervisor endpoint that `scripts/bootstrap_agent_bricks.py` creates after the Vector Search index exists. +1. **Databricks App config** needs the generated Agent Bricks Supervisor endpoint name from `agent/document_intelligence_agent.py`, which can only run after the Vector Search index exists. 2. **Lakehouse Monitor** (`resources/consumers/kpi_drift.yml`) attaches to `gold_filing_kpis`, which doesn't exist until the pipeline runs once. 3. **Lakebase database_catalog + Databricks App** race the `database_instance` provisioning. -**Canonical fix**: Run `./scripts/bootstrap-demo.sh` for fresh stand-ups; plain `databricks bundle deploy -t demo` for steady-state. The script does a **staged deploy** — `resources/` is split into `foundation/` (no data deps) and `consumers/` (need data). Stage 1 temporarily renames consumer YAMLs to `*.yml.skip` so the bundle glob skips them; stage 2 produces data and then runs full `bundle deploy`. Both deploys should succeed cleanly. +**Canonical fix**: Run `./scripts/bootstrap-demo.sh` for fresh stand-ups. For steady-state manual deploys, resolve the generated Supervisor endpoint and pass it as a bundle variable: `databricks bundle deploy -t demo --var "agent_endpoint_name=$(./scripts/resolve-agent-endpoint.sh demo)"`. The script does a **staged deploy** — `resources/` is split into `foundation/` (no data deps) and `consumers/` (need data). Stage 1 temporarily renames consumer YAMLs to `*.yml.skip` so the bundle glob skips them; stage 2 produces data and then runs full `bundle deploy`. Both deploys should succeed cleanly. **Do NOT try to "fix" these by:** - Adding `depends_on` between heterogeneous DAB resource types — DAB doesn't reliably honor it across instance↔catalog↔app. 
-- Reintroducing a custom MLflow pyfunc serving endpoint. Agent Bricks Knowledge Assistant + Supervisor Agent is the production path. +- Bypassing Agent Bricks Knowledge Assistant + Supervisor Agent for the production path. - Splitting monitors into a separate target overlay — adds complexity for a one-time concern. Full breakdown lives in [`docs/runbook.md`](./docs/runbook.md) §"Known deploy ordering gaps". @@ -33,13 +33,13 @@ Full breakdown lives in [`docs/runbook.md`](./docs/runbook.md) §"Known deploy o ``` pipelines/sql/ Lakeflow SDP — Bronze → Silver → Gold (SQL only, principle III) -agent/ Deterministic Agent Bricks tool glue only +agent/ Agent Bricks definition + deterministic tool glue app/ Streamlit on Databricks Apps + Lakebase psycopg client evals/ MLflow CLEARS gate (clears_eval.py + dataset.jsonl) jobs/ Lakeflow Jobs Python tasks (retention, index_refresh) resources/foundation/ DAB resources with no data deps: catalog/schema/volume, pipeline, retention job, Lakebase instance resources/consumers/ DAB resources that depend on foundation data: monitor, index-refresh job, app, dashboard, Lakebase catalog -scripts/ Operational scripts (bootstrap-demo.sh, bootstrap_agent_bricks.py, wait_for_kpis.py) +scripts/ Operational scripts (bootstrap-demo.sh, wait_for_kpis.py) samples/ Synthetic 10-K PDFs (regenerable via synthesize.py) specs/001-… Spec-Kit artifacts (spec, plan, tasks, research, data-model, contracts, quickstart) docs/runbook.md Day-2 ops + bring-up workflow @@ -51,6 +51,7 @@ docs/runbook.md Day-2 ops + bring-up workflow - Validate: `databricks bundle validate -t demo` - Fresh stand-up: `./scripts/bootstrap-demo.sh` (requires `DOCINTEL_CATALOG`, `DOCINTEL_SCHEMA`, `DOCINTEL_WAREHOUSE_ID`) - Steady-state deploy: `databricks bundle deploy -t demo --var "agent_endpoint_name=$(./scripts/resolve-agent-endpoint.sh demo)"` +- App config/restart: `databricks bundle run -t demo --var "agent_endpoint_name=$(./scripts/resolve-agent-endpoint.sh demo)" 
analyst_app` - Run pipeline: `databricks bundle run -t demo doc_intel_pipeline` - Run eval: `python evals/clears_eval.py --endpoint "$(./scripts/resolve-agent-endpoint.sh demo)" --dataset evals/dataset.jsonl` @@ -69,7 +70,9 @@ These were discovered the painful way during the 2026-04-25 bring-up. Future ses - **Section normalization**: `pipelines/sql/03_gold_classify_extract.sql` POSEXPLODES `parsed:sections[*]` and represents sectionless VARIANT output as one `full_document` row so we never lose a filing. - **`lakebase_stopped: true` is rejected on instance creation**: the API doesn't allow creating a database_instance directly into stopped state. Default is `false`; flip to `true` only after the instance exists. Reference: `databricks.yml` variable description. - **macOS doesn't ship `python`**: scripts must prefer `.venv/bin/python` then fall back to `python3`. Reference: `scripts/bootstrap-demo.sh`. -- **Agent Bricks resources are SDK-managed**: `scripts/bootstrap_agent_bricks.py` creates/updates the Knowledge Assistant, its Vector Search knowledge source, the UC KPI function, and the Supervisor Agent. DAB still manages the surrounding data/app/monitor resources. +- **Agent Bricks resources are SDK-managed**: `agent/document_intelligence_agent.py` creates/updates the Knowledge Assistant, its Vector Search knowledge source, the UC KPI function, and the Supervisor Agent. DAB still manages the surrounding data/app/monitor resources. +- **Agent Bricks generates endpoint names**: use `scripts/resolve-agent-endpoint.sh <target>` and pass the result as `--var agent_endpoint_name=...` for deploys and app runs. +- **Agent Bricks invocation uses the invocations path directly**: `app/agent_bricks_client.py` posts to `/serving-endpoints/{endpoint}/invocations` with the user's OBO token (demo uses the App service principal when `DOCINTEL_OBO_REQUIRED=false`) and an `X-Request-ID`. Do not swap this back to `WorkspaceClient.serving_endpoints.query()` without revalidating the Agent Bricks response shape. 
- **Streamlit on Databricks Apps requires CORS+XSRF off via env vars**: not flags. `STREAMLIT_SERVER_ENABLE_CORS=false` and `STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false` in `app/app.yaml`. Databricks Apps runtime config: https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime. - **`bundle deploy` doesn't apply app config / restart**: must follow with `databricks bundle run -t <target> analyst_app` (or use `databricks apps deploy`). Databricks Apps deploy docs: https://docs.databricks.com/aws/en/dev-tools/databricks-apps/deploy. - **`bundle run` may wipe `user_api_scopes`**: documented as a destructive-update behavior in the Databricks Apps deploy docs. Bootstrap step 5c re-asserts; CI verifies. If you change the App resource, double-check OBO scopes after. diff --git a/PRODUCTION_READINESS.md b/PRODUCTION_READINESS.md index dd0a594..27b0efd 100644 --- a/PRODUCTION_READINESS.md +++ b/PRODUCTION_READINESS.md @@ -10,6 +10,8 @@ This project is open-sourced as a Databricks reference implementation. Treat it | Pilot-ready | Real filings exercise document variability and cost/latency | Reference-ready plus a reviewed EDGAR pilot corpus | | Production-ready | Analysts can use it under governed identity and SLOs | Pilot-ready plus end-to-end OBO, dashboards, alerts, rollback, and runbook evidence | +Current demo status as of 2026-04-26: Agent Bricks bootstrap, Databricks App deploy, direct Supervisor endpoint smoke, Lakebase OAuth credential handling, and Vector Search index-refresh smoke passed. The project is not reference-ready yet because the latest synthetic CLEARS run failed the configured quality/latency gate. Prod readiness still requires user-token passthrough/OBO audit evidence. See [`VALIDATION.md`](./VALIDATION.md#latest-demo-snapshot). + ## Reference-Ready Checklist - `databricks bundle validate --strict -t demo` passes. @@ -17,7 +19,7 @@ This project is open-sourced as a Databricks reference implementation. 
Treat it - Synthetic PDFs in `samples/` produce at least ACME/BETA/GAMMA KPI rows. - Vector Search index sync completes and the Agent Bricks Supervisor endpoint answers a smoke question with citations. - `python evals/clears_eval.py --endpoint "$(./scripts/resolve-agent-endpoint.sh demo)" --dataset evals/dataset.jsonl` passes. -- App starts via `databricks bundle run -t demo analyst_app`. +- App starts via `databricks bundle run -t demo --var "agent_endpoint_name=$(./scripts/resolve-agent-endpoint.sh demo)" analyst_app` in the configured demo auth mode. ## Pilot-Ready Checklist @@ -31,7 +33,7 @@ This project is open-sourced as a Databricks reference implementation. Treat it ## Production-Ready Checklist - Databricks Apps user-token passthrough is enabled in the workspace. -- `resources/consumers/analyst.app.yml:user_api_scopes` is declared and survives `bundle run`. +- Prod target `user_api_scopes` in `databricks.yml` are declared and survive `bundle run`. - Audit logs prove app requests, Agent Bricks, Knowledge Assistant, Vector Search, and structured KPI SQL calls execute under the invoking user where required. - Service principal `run_as` is configured for prod via `--var service_principal_id=`. - Analyst group grants include `USE_CATALOG`, `USE_SCHEMA`, `SELECT`, `EXECUTE`, `READ_VOLUME`, and `WRITE_VOLUME` as appropriate. diff --git a/README.md b/README.md index 93fe5d2..fb759ff 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ A **Databricks-native document intelligence + agent** stack: parse PDFs once wit [2], regulation [3]…" ``` -For motivation, architecture diagrams, the Spec-Kit + Claude Code build workflow, and the chicken-egg deploy-ordering story, see [**`docs/design.md`**](./docs/design.md). For day-2 ops, see [**`docs/runbook.md`**](./docs/runbook.md). +For architecture and deploy ordering, see [**`docs/design.md`**](./docs/design.md). For operations, validation, and troubleshooting, see [**`docs/runbook.md`**](./docs/runbook.md). 
--- @@ -37,6 +37,7 @@ For motivation, architecture diagrams, the Spec-Kit + Claude Code build workflow - [Features](#features) - [Readiness levels](#readiness-levels) +- [How Agent Bricks is used](#how-agent-bricks-is-used) - [Prerequisites](#prerequisites) - [Getting started](#getting-started) - [CLEARS quality gate](#clears-quality-gate) @@ -55,7 +56,7 @@ For motivation, architecture diagrams, the Spec-Kit + Claude Code build workflow ## Features - **End-to-end document intelligence pipeline** — Auto Loader ingest → `ai_parse_document` → section explosion → `ai_classify` + `ai_extract` → 5-dim quality rubric → Vector Search Delta-Sync index (the endpoint is DAB-managed; the index is created/synced by `jobs/index_refresh/sync_index.py`). SQL-only pipeline (Lakeflow Spark Declarative Pipelines). -- **Cited-answer agent** — Agent Bricks-first runtime: Knowledge Assistant for cited document Q&A, Supervisor Agent for cross-company orchestration, and a deterministic KPI tool for structured comparisons. No custom pyfunc, retrieval loop, or supervisor runtime is retained. +- **Cited-answer agent** — Agent Bricks Knowledge Assistant for cited document Q&A, Supervisor Agent for cross-company orchestration, and a deterministic KPI tool for structured comparisons. - **Streamlit chat UI on Databricks Apps** — citation chips, thumbs feedback, conversation history persisted to Lakebase Postgres. - **Eval-gated promotion** — `mlflow.evaluate(model_type="databricks-agent")` against a 30-question set with thresholds for Correctness, Adherence, Relevance, Execution, Safety, Latency p95. - **Reproducible synthetic corpus** — `samples/synthesize.py` generates ACME / BETA / GAMMA 10-Ks plus a deliberately-low-quality `garbage_10K_2024.pdf` for the rubric-exclusion test (SC-006). No EDGAR dependency in CI. @@ -72,6 +73,31 @@ For motivation, architecture diagrams, the Spec-Kit + Claude Code build workflow Full checklists in [`PRODUCTION_READINESS.md`](./PRODUCTION_READINESS.md). 
+> Latest demo status, 2026-04-26: Agent Bricks bootstrap, Databricks App deploy, direct Supervisor endpoint smoke, and Vector Search index-refresh smoke passed. Reference-ready remains blocked by CLEARS thresholds. Prod readiness still requires user-token passthrough/OBO evidence. See [`VALIDATION.md`](./VALIDATION.md). + +--- + +## How Agent Bricks is used + +Databricks creation path: [Create an AI agent](https://docs.databricks.com/aws/en/generative-ai/agent-framework/create-agent) → Knowledge Assistant for document Q&A, with Supervisor Agent coordinating hosted tools. + +The Agent Bricks path is: + +1. `jobs/index_refresh/sync_index.py` creates/syncs the Mosaic AI Vector Search Delta-Sync index over `gold_filing_sections_indexable`. +2. `agent/document_intelligence_agent.py` creates or updates the Agent Bricks Knowledge Assistant with that Vector Search index as its knowledge source. The source uses `summary` as the searchable text column and `filename` as the document URI column. +3. `agent/document_intelligence_agent.py` creates or updates the UC SQL function `lookup_10k_kpis`. +4. `agent/document_intelligence_agent.py` creates or updates the Agent Bricks Supervisor Agent with two tools: the Knowledge Assistant for cited document Q&A and the UC function for deterministic KPI lookups. +5. Agent Bricks generates concrete serving endpoint names. Resolve the live Supervisor endpoint with `./scripts/resolve-agent-endpoint.sh <target>`. +6. The Databricks App receives the resolved endpoint through the `agent_endpoint_name` bundle variable as `DOCINTEL_AGENT_ENDPOINT`. +7. The app invokes `POST /serving-endpoints/{endpoint}/invocations` directly. Prod uses each user's OBO token. Demo uses the App service principal when `DOCINTEL_OBO_REQUIRED=false`. `WorkspaceClient.serving_endpoints.query()` is not used for Agent Bricks invocation because validation showed it did not preserve the needed Agent Bricks response shape. +8. 
Knowledge Assistant citations currently arrive as markdown footnotes in Agent Bricks output messages. `app/agent_bricks_response.py` normalizes the final answer and extracts citation chips from those footnotes. + +Useful Databricks references: + +- [Create an AI agent](https://docs.databricks.com/aws/en/generative-ai/agent-framework/create-agent) +- [Knowledge Assistant](https://docs.databricks.com/aws/en/generative-ai/agent-bricks/knowledge-assistant) +- [Supervisor Agent](https://docs.databricks.com/aws/en/generative-ai/agent-bricks/multi-agent-supervisor) + --- ## Prerequisites @@ -101,7 +127,7 @@ You need a workspace with **all** of the following enabled: - Serverless SQL warehouse (AI Functions GA — `ai_parse_document`, `ai_classify`, `ai_extract`, `ai_query`) - Mosaic AI Vector Search (endpoint + Delta-Sync index) -- Agent Bricks (Knowledge Assistant, Supervisor Agent, Custom Agents on Apps) +- Agent Bricks Knowledge Assistant and Supervisor Agent - AI Gateway with OBO / identity enforcement - Lakebase Postgres (preview / GA depending on region) - Databricks Apps (Streamlit runtime) @@ -110,7 +136,7 @@ You need a workspace with **all** of the following enabled: **Required for production identity:** -- Databricks Apps **user token passthrough** (workspace admin setting). The app must not fall back to broad service-principal reads — see [`SECURITY.md`](./SECURITY.md). +- Databricks Apps **user token passthrough** (workspace admin setting). Prod requires user-scoped Agent Bricks calls — see [`SECURITY.md`](./SECURITY.md). ### Free trial signup @@ -188,24 +214,34 @@ In the workspace UI: **Apps → `doc-intel-analyst-demo`**. Ask: You should see a grounded answer with citation chips linking to `ACME_10K_2024.pdf` / `Risk`. +Example deployed Databricks App validation: + +![Deployed 10-K Analyst app showing an ACME revenue answer with a structured KPI citation chip](./docs/databricks-app-dogfood.png) + ### 7. 
Steady-state deploys After the first bring-up, iteration depends on what changed: ```bash # YAML / pipeline / job / app config changes -databricks bundle deploy -t demo -databricks bundle run -t demo analyst_app # apply app config + restart +AGENT_ENDPOINT_NAME="$(./scripts/resolve-agent-endpoint.sh demo)" +databricks bundle deploy -t demo --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" +databricks bundle run -t demo --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" analyst_app # Agent Bricks configuration / tool glue changes -databricks bundle deploy -t demo -databricks bundle run -t demo analyst_app +DOCINTEL_CATALOG=workspace \ +DOCINTEL_SCHEMA=docintel_10k_demo \ +DOCINTEL_WAREHOUSE_ID=<warehouse-id> \ +python -m agent.document_intelligence_agent --target demo +AGENT_ENDPOINT_NAME="$(./scripts/resolve-agent-endpoint.sh demo)" +databricks bundle deploy -t demo --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" +databricks bundle run -t demo --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" analyst_app # Pipeline SQL changes that need to re-process existing filings databricks bundle run -t demo doc_intel_pipeline ``` -You can also re-run `./scripts/bootstrap-demo.sh` — it auto-detects steady-state and does the full cycle (deploy → refresh data → register/promote → app run → grants → smoke) in one command. +You can also re-run `./scripts/bootstrap-demo.sh` — it auto-detects steady-state and does the full cycle (deploy → refresh data → update Agent Bricks → app run → grants → smoke) in one command. For a guided 30-minute tour, see [`specs/001-doc-intel-10k/quickstart.md`](./specs/001-doc-intel-10k/quickstart.md). @@ -235,7 +271,7 @@ Before any deploy reaches production, an evaluation must pass (constitution prin The bar is hard-coded; changing it requires editing `.specify/memory/constitution.md`, which is its own small ceremony (PR + version bump + Sync Impact Report). 
-Implementation uses `mlflow.evaluate(model_type="databricks-agent")` for the four LLM-judged axes; Execution + Latency are computed from the raw response stream. Per-row Correctness is sliced from `result.tables['eval_results']` for the SC-002/SC-003 P2 vs P3 thresholds. +Implementation uses `mlflow.evaluate(model_type="databricks-agent")` for the LLM-judged axes; Execution and Latency are computed from the raw response stream. When the active MLflow/databricks-agents version exposes per-row correctness in `result.tables['eval_results']`, the runner also logs SC-002/SC-003 P2 vs P3 slices. Current 1.x aggregate outputs may omit those slice columns, so the aggregate CLEARS gate remains the required pass/fail signal. --- @@ -255,6 +291,8 @@ Implementation uses `mlflow.evaluate(model_type="databricks-agent")` for the fou | `quality_threshold` | `22` | Section quality cutoff (0-30) for index inclusion | | `max_pdf_bytes` | `52428800` (50 MB) | Reject filings larger than this | | `analyst_group` | `account users` | UC group granted SELECT/USE on schema, READ/WRITE on volume | +| `agent_endpoint_name` | `UNSET_AGENT_BRICKS_ENDPOINT` | Generated Agent Bricks Supervisor endpoint resolved by `scripts/resolve-agent-endpoint.sh`; pass it on deploy/app-run commands after bootstrap | +| `app_obo_required` | `true` (prod) / `false` (demo) | Controls Databricks Apps user-token passthrough. Demo can use the App SP when passthrough is unavailable; prod requires OBO. | Override via `--var name=value` on any `bundle` command. 
@@ -288,9 +326,9 @@ bash -n scripts/bootstrap-demo.sh # Compile checks for all modified Python .venv/bin/python -m py_compile \ - agent/tools.py \ - app/app.py app/lakebase_client.py \ - evals/clears_eval.py scripts/bootstrap_agent_bricks.py \ + agent/document_intelligence_agent.py agent/tools.py \ + app/app.py app/agent_bricks_client.py app/agent_bricks_response.py app/lakebase_client.py \ + evals/clears_eval.py \ scripts/wait_for_kpis.py samples/synthesize.py ``` @@ -303,9 +341,9 @@ End-to-end is exercised by [`./scripts/bootstrap-demo.sh`](./scripts/bootstrap-d | Path | When | |---|---| | `./scripts/bootstrap-demo.sh` | Fresh-workspace bring-up (or after `bundle destroy`). Auto-detects FIRST-DEPLOY vs STEADY-STATE; handles staged deploy + data production + UC grants in either mode. | -| `databricks bundle deploy -t demo` | YAML / pipeline / job / app config changes after the first bring-up. | -| `databricks bundle run -t demo analyst_app` | After any change to `app/` or `resources/consumers/analyst.app.yml` — required to apply runtime config + restart the app. | -| `databricks bundle deploy -t prod --var service_principal_id=` | Production deploy, run as the prod SP. | +| `databricks bundle deploy -t demo --var "agent_endpoint_name=$(./scripts/resolve-agent-endpoint.sh demo)"` | YAML / pipeline / job / app config changes after the first bring-up. | +| `databricks bundle run -t demo --var "agent_endpoint_name=$(./scripts/resolve-agent-endpoint.sh demo)" analyst_app` | After any change to `app/` or `resources/consumers/analyst.app.yml` — required to apply runtime config + restart the app. | +| `databricks bundle deploy -t prod --var service_principal_id= --var "agent_endpoint_name=$(./scripts/resolve-agent-endpoint.sh prod)"` | Production deploy, run as the prod SP after prod Agent Bricks bootstrap. 
| | GitHub Actions on push to `main` | Steady-state CI: full `bundle deploy` → wait for Lakebase AVAILABLE → upload samples + run pipeline → Agent Bricks / AI Gateway validation → UC grants → `bundle run analyst_app` → CLEARS eval gate. (The first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-demo.sh`.) | For day-2 ops (Agent Bricks configuration validation, debugging low quality scores, inspecting CLEARS metrics in MLflow), see [`docs/runbook.md`](./docs/runbook.md). For the production-readiness checklist, see [`PRODUCTION_READINESS.md`](./PRODUCTION_READINESS.md). @@ -349,7 +387,7 @@ This is a production-oriented reference implementation with conservative scale d | Compute | CPU only | constitution add'l constraints | | Languages | English filings | implicit (foundation model) | | Eval set size | 30 questions | spec clarification | -| OBO end-to-end | Requires workspace-level `Databricks Apps - user token passthrough` feature | [`SECURITY.md`](./SECURITY.md) | +| Prod OBO end-to-end | Requires workspace-level `Databricks Apps - user token passthrough` feature | [`SECURITY.md`](./SECURITY.md) | Latency SLOs: P95 ≤ 8s for single-filing, ≤ 20s for cross-company. End-to-end pipeline ≤ 10 min P95 on a 30 MB PDF. @@ -363,7 +401,7 @@ See [`CONTRIBUTING.md`](./CONTRIBUTING.md) for local setup, the spec-kit workflo ## Security -See [`SECURITY.md`](./SECURITY.md) for the mandatory end-to-end OBO identity model, required UC grants, secrets-handling guidance, and how to report security issues in a fork or deployment. +See [`SECURITY.md`](./SECURITY.md) for the target-specific identity model, required UC grants, secrets-handling guidance, and how to report security issues in a fork or deployment. 
## License diff --git a/SECURITY.md b/SECURITY.md index 476dd34..2af2350 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,32 +2,24 @@ ## Supported Security Posture -This reference is designed for Databricks workspaces using Unity Catalog, Agent Bricks, AI Gateway, Databricks Apps resource bindings, and mandatory end-to-end on-behalf-of (OBO) user identity. - -## Identity Modes - -| Mode | Use | Production row-level security | -|---|---|---| -| End-to-end OBO | Demo and production analyst use | Yes, after audit verification | - -Service-principal fallback is not supported for the agent path. If Databricks Apps user-token passthrough, Agent Bricks OBO, or AI Gateway identity enforcement is unavailable, deployment must fail with an actionable prerequisite error. +This reference is designed for Databricks workspaces using Unity Catalog, Agent Bricks, AI Gateway, Databricks Apps resource bindings, and end-to-end on-behalf-of (OBO) user identity in prod. Demo can run with `app_obo_required=false` when the workspace does not have Databricks Apps user-token passthrough enabled. In that mode the App service principal invokes Agent Bricks and is granted `CAN_QUERY` after deploy. ## Enabling End-To-End OBO -1. Workspace admin enables Databricks Apps user-token passthrough. -2. Declare the required `user_api_scopes` in `resources/consumers/analyst.app.yml`. -3. Redeploy and run the app resource. +1. Set `app_obo_required=true` for the target. Prod does this by default. +2. Workspace admin enables Databricks Apps user-token passthrough. +3. Redeploy and run the app resource. The deploy fails if the workspace cannot grant the declared `user_api_scopes`. 4. Verify `serving.serving-endpoints` and `sql` scopes are present after deployment. 5. Verify audit logs show downstream calls under the invoking user where required. -Agent Bricks / AI Gateway must enforce downstream access to document Q&A, SQL tools, models, and any external tools under the invoking user's identity. 
The previous custom MLflow auth-policy path has been removed from the production implementation. +With OBO enabled, Agent Bricks / AI Gateway enforce downstream access to document Q&A, SQL tools, models, and any external tools under the invoking user's identity. ## Secrets And Credentials - Do not commit Databricks tokens, service-principal secrets, Postgres passwords, or local app settings. - `.claude/settings.local.json`, `.databricks/`, `.venv/`, MLflow local artifacts, Python caches, and local skill bundles are ignored. - Use GitHub Actions secrets for `DATABRICKS_HOST` and `DATABRICKS_TOKEN`. -- Use Databricks resource bindings for app access to Lakebase and serving endpoints. +- Use Databricks resource bindings for Lakebase. Agent Bricks endpoint access is granted directly to users or the App service principal, depending on target auth mode. ## Required Grants diff --git a/VALIDATION.md b/VALIDATION.md index 97aa9b1..96f6e2d 100644 --- a/VALIDATION.md +++ b/VALIDATION.md @@ -7,8 +7,8 @@ Use this guide to prove the reference implementation works in a Databricks works ```bash python3 -m py_compile \ agent/tools.py \ - app/app.py app/lakebase_client.py \ - evals/clears_eval.py scripts/bootstrap_agent_bricks.py \ + app/app.py app/agent_bricks_client.py app/agent_bricks_response.py app/lakebase_client.py \ + evals/clears_eval.py agent/document_intelligence_agent.py \ scripts/wait_for_kpis.py samples/synthesize.py bash -n scripts/bootstrap-demo.sh @@ -41,10 +41,12 @@ Expected outcomes: - Pipeline creates Gold rows. - Agent Bricks Knowledge Assistant and Supervisor Agent are created or updated. - Consumer resources deploy cleanly. -- App config is applied with `bundle run analyst_app`. -- Bootstrap verifies mandatory OBO scopes. +- App config is applied with `bundle run analyst_app`, including `DOCINTEL_AGENT_ENDPOINT` set from the generated Supervisor endpoint name. +- Bootstrap verifies the target auth mode. 
Demo leaves `user_api_scopes` unset and grants the App service principal `CAN_QUERY`; prod requires OBO scopes. - Smoke query reaches the Agent Bricks supervisor endpoint. +If a prod app deploy fails with `Databricks Apps - user token passthrough feature is not enabled`, enable the workspace/org feature and rerun. Demo uses `app_obo_required=false` by default for workspaces where user-token passthrough is not enabled. + ## Data Checks ```sql @@ -73,20 +75,57 @@ python evals/clears_eval.py \ Expected: - Correctness, adherence, relevance, execution, safety, and latency thresholds pass. -- P2 and P3 correctness slices are logged. +- P2 and P3 correctness slices are logged when the active MLflow/databricks-agents metric output includes per-row correctness columns. Current 1.x aggregate outputs may not expose those slice columns; treat missing slices as validation evidence to record, not as a reason to bypass the aggregate gate. - No citations reference `garbage_10K_2024.pdf`. ## App Checks - Open `doc-intel-analyst-demo`. -- Ask: `What was ACME's revenue in fiscal year 2024?` -- Confirm the response has citations and the turn is written to Lakebase. +- Ask: `What were the top 3 risk factors disclosed by ACME in their FY24 10-K?` +- Confirm the response has citation chips and the turn is written to Lakebase. +- Ask: `What was ACME's revenue in fiscal year 2024?` to verify the structured KPI tool path. - Submit thumbs feedback and confirm a feedback row is written. -## OBO Verification - -- Confirm `resources/consumers/analyst.app.yml:user_api_scopes` is present. -- Run `databricks bundle deploy -t demo && databricks bundle run -t demo analyst_app`. -- Confirm bootstrap or CI verifies `serving.serving-endpoints` and `sql` scopes. -- Check audit logs for user-scoped downstream access through Agent Bricks, Knowledge Assistant, and the structured KPI SQL function. -- If the workspace cannot grant user-token passthrough, deployment is invalid and must fail. 
+## App Auth Verification + +- Demo: confirm `user_api_scopes` is unset and `DOCINTEL_OBO_REQUIRED=false`. +- Prod: confirm `user_api_scopes` is present and `DOCINTEL_OBO_REQUIRED=true`. +- Run for the target being verified: + ```bash + TARGET=demo + AGENT_ENDPOINT_NAME="$(./scripts/resolve-agent-endpoint.sh "$TARGET")" + databricks bundle deploy -t "$TARGET" --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" + databricks bundle run -t "$TARGET" --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" analyst_app + ``` +- Confirm bootstrap or CI verifies the target auth mode. Demo grants the App SP endpoint access; prod verifies `serving.serving-endpoints` and `sql` scopes. +- For prod, check audit logs for user-scoped downstream access through Agent Bricks, Knowledge Assistant, and the structured KPI SQL function. +- If prod cannot grant user-token passthrough, deployment is invalid and must fail. + +## Latest Demo Snapshot + +As of 2026-04-26, the demo workspace evidence is: + +- Bundle validation passed with the resolved Agent Bricks Supervisor endpoint. +- Document Intelligence Agent deployment succeeded: + - Knowledge Assistant display name: `doc-intel-knowledge-demo` + - Supervisor display name: `doc-intel-supervisor-demo` + - UC function: `workspace.docintel_10k_demo.lookup_10k_kpis` +- Direct Supervisor endpoint smoke passed. The ACME FY2024 revenue question returned `$94.2 billion` and referenced `ACME_10K_2024.pdf` through the structured KPI path. +- Databricks App deploy succeeded in demo App-SP mode: + - App: `doc-intel-analyst-demo` + - Endpoint: `mas-dc6aba10-endpoint` + - `DOCINTEL_OBO_REQUIRED=false` + - `user_api_scopes` unset + - App service principal granted `CAN_QUERY` on the generated Supervisor endpoint. +- Lakebase OAuth credential handling was validated without `PGPASSWORD`; the deployed app code mints the database password at connection time. +- Vector Search `index_refresh` job rerun terminated `SUCCESS`. 
+- CLEARS live eval completed but failed the configured gate: + - MLflow run ID: `772e902cab92459f9bf569296fc5f801` + - correctness: `0.323` + - adherence: `0.000` + - relevance/groundedness: `0.516` + - safety: `1.000` + - execution: `1.000` + - latency p95: `31711ms` + +Status: Agent Bricks deployment mechanics, demo App deploy, Lakebase OAuth credential handling, and direct serving smoke passed. Reference-ready quality remains open until CLEARS passes. Prod OBO readiness remains open until validated in a workspace with user-token passthrough enabled. diff --git a/scripts/bootstrap_agent_bricks.py b/agent/document_intelligence_agent.py similarity index 84% rename from scripts/bootstrap_agent_bricks.py rename to agent/document_intelligence_agent.py index 565a0d5..af825f2 100644 --- a/scripts/bootstrap_agent_bricks.py +++ b/agent/document_intelligence_agent.py @@ -1,15 +1,4 @@ -"""Create or update the Agent Bricks runtime for the document intelligence app. - -This is the production agent bootstrap path. It configures: - -* Agent Bricks Knowledge Assistant over the governed Vector Search source. -* A deterministic Unity Catalog SQL function for structured KPI lookups. -* Agent Bricks Supervisor Agent that coordinates the Knowledge Assistant and - the KPI function. - -The earlier hand-built MLflow pyfunc agent runtime is intentionally not part of -this path. 
-""" +"""Document Intelligence Agent definition and deployment logic.""" from __future__ import annotations @@ -18,7 +7,7 @@ import os import sys import time -from dataclasses import asdict +from dataclasses import asdict, dataclass from typing import Iterable, TypeVar from databricks.sdk import WorkspaceClient @@ -38,6 +27,24 @@ T = TypeVar("T") +@dataclass +class DocumentIntelligenceAgentRuntime: + knowledge_assistant: KnowledgeAssistant + supervisor_agent: SupervisorAgent + kpi_function: str + supervisor_endpoint: str + knowledge_endpoint: str + + def as_dict(self) -> dict: + return { + "knowledge_assistant": _as_dict(self.knowledge_assistant), + "supervisor_agent": _as_dict(self.supervisor_agent), + "kpi_function": self.kpi_function, + "supervisor_endpoint": self.supervisor_endpoint, + "knowledge_endpoint": self.knowledge_endpoint, + } + + def _find_by_display_name(items: Iterable[T], display_name: str) -> T | None: for item in items: if getattr(item, "display_name", None) == display_name: @@ -162,7 +169,6 @@ def _ensure_knowledge_assistant( w: WorkspaceClient, *, display_name: str, - endpoint_name: str, index_name: str, ) -> KnowledgeAssistant: description = ( @@ -179,7 +185,6 @@ def _ensure_knowledge_assistant( existing = _find_by_display_name(w.knowledge_assistants.list_knowledge_assistants(), display_name) desired = KnowledgeAssistant( display_name=display_name, - endpoint_name=endpoint_name, description=description, instructions=instructions, ) @@ -239,7 +244,6 @@ def _ensure_supervisor( w: WorkspaceClient, *, display_name: str, - endpoint_name: str, knowledge_assistant: KnowledgeAssistant, kpi_function_name: str, ) -> SupervisorAgent: @@ -250,12 +254,13 @@ def _ensure_supervisor( instructions = ( "Use the Knowledge Assistant for narrative or section-level questions. " "Use the Unity Catalog KPI function for structured financial metrics " - "and cross-company comparisons. 
Do not invent figures; cite the filing " - "source or state that the corpus does not contain the answer." + "and cross-company comparisons. For KPI function answers, include the " + "source filename and extraction confidence in the final answer. Do not " + "invent figures; cite the filing source or state that the corpus does " + "not contain the answer." ) desired = SupervisorAgent( display_name=display_name, - endpoint_name=endpoint_name, description=description, instructions=instructions, ) @@ -277,7 +282,6 @@ def _ensure_supervisor( ), knowledge_assistant=SupervisorKnowledgeAssistant( knowledge_assistant_id=knowledge_assistant.id or _id_from_name(knowledge_assistant.name), - serving_endpoint_name=knowledge_assistant.endpoint_name, ), ) kpi_tool = Tool( @@ -368,62 +372,77 @@ def _grant_endpoint_query(w: WorkspaceClient, endpoint_name: str, group_name: st ) -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--target", default=os.environ.get("DOCINTEL_TARGET", "demo")) - parser.add_argument("--catalog", default=os.environ.get("DOCINTEL_CATALOG")) - parser.add_argument("--schema", default=os.environ.get("DOCINTEL_SCHEMA")) - parser.add_argument("--warehouse-id", default=os.environ.get("DOCINTEL_WAREHOUSE_ID")) - parser.add_argument("--analyst-group", default=os.environ.get("DOCINTEL_ANALYST_GROUP", "account users")) - parser.add_argument("--requested-supervisor-endpoint") - parser.add_argument("--requested-knowledge-endpoint") - parser.add_argument("--supervisor-endpoint", dest="requested_supervisor_endpoint", help=argparse.SUPPRESS) - parser.add_argument("--knowledge-endpoint", dest="requested_knowledge_endpoint", help=argparse.SUPPRESS) - args = parser.parse_args() - - if not args.catalog or not args.schema or not args.warehouse_id: - parser.error("--catalog, --schema, and --warehouse-id are required") - - target = args.target - requested_supervisor_endpoint = args.requested_supervisor_endpoint or f"analyst-agent-{target}" - 
requested_knowledge_endpoint = args.requested_knowledge_endpoint or f"doc-intel-knowledge-{target}" - index_name = f"{args.catalog}.{args.schema}.filings_summary_idx" +def deploy_document_intelligence_agent( + w: WorkspaceClient, + *, + target: str, + catalog: str, + schema: str, + warehouse_id: str, + analyst_group: str, +) -> DocumentIntelligenceAgentRuntime: + index_name = f"{catalog}.{schema}.filings_summary_idx" - w = WorkspaceClient() kpi_function_name = _create_or_update_kpi_function( w, - catalog=args.catalog, - schema=args.schema, - warehouse_id=args.warehouse_id, + catalog=catalog, + schema=schema, + warehouse_id=warehouse_id, ) knowledge_assistant = _ensure_knowledge_assistant( w, display_name=f"doc-intel-knowledge-{target}", - endpoint_name=requested_knowledge_endpoint, index_name=index_name, ) supervisor = _ensure_supervisor( w, display_name=f"doc-intel-supervisor-{target}", - endpoint_name=requested_supervisor_endpoint, knowledge_assistant=knowledge_assistant, kpi_function_name=kpi_function_name, ) - actual_supervisor_endpoint = supervisor.endpoint_name or requested_supervisor_endpoint - actual_knowledge_endpoint = knowledge_assistant.endpoint_name or requested_knowledge_endpoint + if not supervisor.endpoint_name: + raise RuntimeError(f"Supervisor Agent doc-intel-supervisor-{target} did not return an endpoint_name") + if not knowledge_assistant.endpoint_name: + raise RuntimeError(f"Knowledge Assistant doc-intel-knowledge-{target} did not return an endpoint_name") + + actual_supervisor_endpoint = supervisor.endpoint_name + actual_knowledge_endpoint = knowledge_assistant.endpoint_name - _grant_endpoint_query(w, actual_supervisor_endpoint, args.analyst_group) + _grant_endpoint_query(w, actual_supervisor_endpoint, analyst_group) if actual_knowledge_endpoint: - _grant_endpoint_query(w, actual_knowledge_endpoint, args.analyst_group) - - print(json.dumps({ - "knowledge_assistant": _as_dict(knowledge_assistant), - "supervisor_agent": _as_dict(supervisor), - 
"kpi_function": kpi_function_name, - "supervisor_endpoint": actual_supervisor_endpoint, - "knowledge_endpoint": actual_knowledge_endpoint, - }, indent=2, default=str)) + _grant_endpoint_query(w, actual_knowledge_endpoint, analyst_group) + + return DocumentIntelligenceAgentRuntime( + knowledge_assistant=knowledge_assistant, + supervisor_agent=supervisor, + kpi_function=kpi_function_name, + supervisor_endpoint=actual_supervisor_endpoint, + knowledge_endpoint=actual_knowledge_endpoint, + ) + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--target", default=os.environ.get("DOCINTEL_TARGET", "demo")) + parser.add_argument("--catalog", default=os.environ.get("DOCINTEL_CATALOG")) + parser.add_argument("--schema", default=os.environ.get("DOCINTEL_SCHEMA")) + parser.add_argument("--warehouse-id", default=os.environ.get("DOCINTEL_WAREHOUSE_ID")) + parser.add_argument("--analyst-group", default=os.environ.get("DOCINTEL_ANALYST_GROUP", "account users")) + args = parser.parse_args() + + if not args.catalog or not args.schema or not args.warehouse_id: + parser.error("--catalog, --schema, and --warehouse-id are required") + + runtime = deploy_document_intelligence_agent( + WorkspaceClient(), + target=args.target, + catalog=args.catalog, + schema=args.schema, + warehouse_id=args.warehouse_id, + analyst_group=args.analyst_group, + ) + print(json.dumps(runtime.as_dict(), indent=2, default=str)) return 0 diff --git a/agent/tests/test_agent_bricks_response.py b/agent/tests/test_agent_bricks_response.py index 7d3d4f9..63b308b 100644 --- a/agent/tests/test_agent_bricks_response.py +++ b/agent/tests/test_agent_bricks_response.py @@ -69,6 +69,98 @@ def test_extract_citations_returns_empty_without_structured_sources_or_footnotes assert extract_citations(payload) == [] +def test_extract_citations_from_structured_kpi_answer() -> None: + payload = { + "output": [ + { + "type": "message", + "content": [ + { + "type": "output_text", + "text": ( + "ACME 
Corporation's revenue in fiscal year 2024 was $94.20 billion.\n\n" + "This information was extracted from ACME's official 10-K filing " + "(ACME_10K_2024.pdf) with high confidence (99.97%)." + ), + } + ], + } + ] + } + + citations = extract_citations(payload) + + assert citations == [ + { + "filename": "ACME_10K_2024.pdf", + "section_label": "Structured KPI extract", + "snippet": ( + "This information was extracted from ACME's official 10-K filing " + "(ACME_10K_2024.pdf) with high confidence (99.97%)." + ), + "score": 0.9997, + } + ] + + +def test_normalise_agent_response_marks_structured_kpi_answer_grounded() -> None: + response = normalise_agent_response({ + "output_text": ( + "Revenue was $94.20 billion, sourced from ACME_10K_2024.pdf " + "with extraction confidence 99.97%." + ) + }) + + assert response["grounded"] is True + assert response["retrieved_count"] == 1 + assert response["citations"][0]["filename"] == "ACME_10K_2024.pdf" + assert response["citations"][0]["score"] == 0.9997 + + +def test_extract_citations_keeps_unsupported_answer_ungrounded() -> None: + payload = { + "output_text": ( + "The corpus does not contain a grounded answer for this metric. " + "No source in ACME_10K_2024.pdf supports the requested value." + ) + } + + assert extract_citations(payload) == [] + + +def test_extract_citations_prefers_knowledge_footnotes_over_kpi_fallback() -> None: + payload = { + "output": [ + { + "type": "message", + "content": [ + { + "type": "output_text", + "text": "[^p1]: Revenue was $94.2B. _ACME_10K_2024.pdf_", + } + ], + }, + { + "type": "message", + "content": [ + { + "type": "output_text", + "text": ( + "Revenue was $94.20 billion, sourced from ACME_10K_2024.pdf " + "with confidence 99.97%." 
+ ), + } + ], + }, + ] + } + + citations = extract_citations(payload) + + assert len(citations) == 1 + assert citations[0]["section_label"] == "Knowledge Assistant citation" + + def test_normalise_agent_response_coerces_citations_and_latency() -> None: response = normalise_agent_response( { diff --git a/agent/tools.py b/agent/tools.py index 5385dfa..20759d7 100644 --- a/agent/tools.py +++ b/agent/tools.py @@ -1,9 +1,7 @@ """Deterministic KPI tool glue for Agent Bricks. The production tool is a Unity Catalog SQL function created by -`scripts/bootstrap_agent_bricks.py`. These helpers keep the SQL access pattern -testable and available for local validation without reintroducing a custom -agent runtime. +`agent.document_intelligence_agent`. These helpers keep the SQL access pattern testable. """ from __future__ import annotations diff --git a/app/README.md b/app/README.md index 20e328c..774d9ff 100644 --- a/app/README.md +++ b/app/README.md @@ -6,7 +6,7 @@ Source for the Databricks App `doc-intel-analyst-${target}`. Streamlit chat UI o | File | Purpose | |---|---| -| `app.py` | Streamlit entry point — chat loop, OBO client, citation rendering. | +| `app.py` | Streamlit entry point — chat loop, target auth client, citation rendering. | | `app.yaml` | Databricks Apps runtime config (port, address, CORS/XSRF env vars). | | `lakebase_client.py` | psycopg-based persistence to Lakebase Postgres. | | `requirements.txt` | Python deps installed by the Apps runtime. | @@ -14,8 +14,9 @@ Source for the Databricks App `doc-intel-analyst-${target}`. 
Streamlit chat UI o ## Running deployed (canonical) ```bash -databricks bundle deploy -t demo -databricks bundle run -t demo analyst_app +AGENT_ENDPOINT_NAME="$(./scripts/resolve-agent-endpoint.sh demo)" +databricks bundle deploy -t demo --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" +databricks bundle run -t demo --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" analyst_app # Open the App URL from the workspace UI ("Apps" → doc-intel-analyst-demo) ``` @@ -30,34 +31,41 @@ export DATABRICKS_HOST=https://.cloud.databricks.com export DATABRICKS_CLIENT_ID= export DATABRICKS_CLIENT_SECRET= -# Lakebase env vars (PGHOST/PGPORT/PGUSER/PGPASSWORD/PGDATABASE) come from -# the App resource binding when deployed. Locally, derive them with: -eval "$(databricks apps get doc-intel-analyst-demo \ - --output json | jq -r '.resources[] | select(.name=="docintel-lakebase") | .database | @sh " -export PGHOST=\(.host) PGPORT=\(.port) PGUSER=\(.username) PGPASSWORD=\(.password) PGDATABASE=\(.database)"')" +# Lakebase env vars come from the App resource binding when deployed. +# Locally, set the same connection fields and let lakebase_client.py mint the +# OAuth database password through the Databricks SDK. +export DOCINTEL_LAKEBASE_INSTANCE=docintel-demo-state-v1 +export DOCINTEL_LAKEBASE_SCHEMA=docintel_app +export PGDATABASE=docintel-demo-state-v1 +export PGUSER= +export PGPORT=5432 +export PGSSLMODE=require +export PGHOST="$(databricks database get-database-instance "${DOCINTEL_LAKEBASE_INSTANCE}" \ + --output json | jq -r '.read_write_dns')" export DOCINTEL_AGENT_ENDPOINT="$(./scripts/resolve-agent-endpoint.sh demo)" +export DOCINTEL_OBO_REQUIRED=false streamlit run app/app.py ``` -Local runs do not have the Databricks Apps `x-forwarded-access-token` header, so they cannot validate the Agent Bricks OBO path. Use the deployed App for agent validation. +Local runs do not have the Databricks Apps `x-forwarded-access-token` header, so they cannot validate the Agent Bricks OBO path. 
Use a deployed OBO-enabled target for prod identity validation. -If you accidentally run Lakebase schema initialization with user creds (`DATABRICKS_CLIENT_ID`/`SECRET` unset), `lakebase_client.init_schema()` logs a warning identifying the mismatch. The tables get created under your user account, not the App SP, and the deployed App will lose write access. Drop the user-owned tables and re-init under the App SP to recover: +If you accidentally run Lakebase schema initialization with user creds (`DATABRICKS_CLIENT_ID`/`SECRET` unset), `lakebase_client.init_schema()` logs a warning identifying the mismatch. The schema gets created under your user account, not the App SP, and the deployed App will lose write access. Drop the user-owned schema and re-init under the App SP to recover: ```sql -- connected as the App SP via the local-dev env above -DROP TABLE IF EXISTS feedback CASCADE; -DROP TABLE IF EXISTS query_logs CASCADE; -DROP TABLE IF EXISTS conversation_history CASCADE; +DROP SCHEMA IF EXISTS docintel_app CASCADE; -- next streamlit run will re-init under the App SP ``` ## OBO (on-behalf-of) flow -The app forwards each user's `x-forwarded-access-token` header to the Agent Bricks Supervisor endpoint via a `WorkspaceClient(token=...)` cache (`app.py:_user_client`). Agent Bricks, Knowledge Assistant, and the UC KPI function must run under the invoking user's identity, not broad App SP reads. +When `DOCINTEL_OBO_REQUIRED=true`, the app builds a `WorkspaceClient(token=...)` from each user's `x-forwarded-access-token` header (`app.py:_user_client`) and invokes the Agent Bricks Supervisor endpoint through `POST /serving-endpoints/{endpoint}/invocations` (`agent_bricks_client.py`). Agent Bricks, Knowledge Assistant, and the UC KPI function run under the invoking user's identity. -`user_api_scopes` declared in `resources/consumers/analyst.app.yml` (`serving.serving-endpoints`, `sql`, `iam.access-control:read`, `iam.current-user:read`) are required for app-level OBO. 
Deployment is invalid if these scopes are not granted. +When `DOCINTEL_OBO_REQUIRED=false`, the app uses its App service principal client. This is for demo workspaces that do not have Databricks Apps user-token passthrough enabled. -**Streamlit gotcha** (per the [Databricks Apps runtime docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime)): the OBO token is captured at the initial HTTP request; the connection then upgrades to WebSocket and the token never refreshes. If a user's UC permissions change mid-session, ask them to reload the page. +The endpoint name is generated by Agent Bricks, resolved with `scripts/resolve-agent-endpoint.sh`, and injected into the app as `DOCINTEL_AGENT_ENDPOINT` by `resources/consumers/analyst.app.yml`. + +`user_api_scopes` is declared only on the prod target in `databricks.yml` (`serving.serving-endpoints`, `sql`, `iam.access-control:read`, `iam.current-user:read`) and requires the workspace-level "Databricks Apps - user token passthrough" feature. Demo leaves scopes unset and grants the App service principal `CAN_QUERY` on the generated Supervisor endpoint after deploy. -**Local-dev caveat**: `st.context.headers` won't have `x-forwarded-access-token` when running `streamlit run` outside the Databricks Apps reverse proxy. The app raises a prerequisite error instead of using service-principal reads for agent calls. +**Streamlit gotcha** (per the [Databricks Apps runtime docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime)): the OBO token is captured at the initial HTTP request; the connection then upgrades to WebSocket and the token never refreshes. If a user's UC permissions change mid-session, ask them to reload the page. 
diff --git a/app/agent_bricks_client.py b/app/agent_bricks_client.py index 3539779..944f37d 100644 --- a/app/agent_bricks_client.py +++ b/app/agent_bricks_client.py @@ -25,7 +25,7 @@ def invoke_agent_endpoint( url = f"{host}/serving-endpoints/{endpoint_name}/invocations" body = json.dumps({"input": [{"role": "user", "content": question}]}).encode("utf-8") # For an OBO WorkspaceClient built with Config(token=), - # authenticate() emits that user token. There is no App SP fallback here. + # authenticate() emits that user token. headers = { "Content-Type": "application/json", "X-Request-ID": client_request_id or str(uuid.uuid4()), diff --git a/app/agent_bricks_response.py b/app/agent_bricks_response.py index be0fb35..4e2b282 100644 --- a/app/agent_bricks_response.py +++ b/app/agent_bricks_response.py @@ -2,8 +2,8 @@ from __future__ import annotations -import uuid import re +import uuid from collections.abc import Mapping from typing import Any @@ -16,6 +16,19 @@ # without a parseable filename and [] when no footnotes are present. APP_EMPTY_TEXT = "The Agent Bricks endpoint returned a response without displayable text." FILENAME_RE = re.compile(r"_([A-Za-z0-9][A-Za-z0-9_.-]*\.pdf)_") +PDF_FILENAME_RE = re.compile(r"\b([A-Za-z0-9][A-Za-z0-9_.-]*\.pdf)\b") +CONFIDENCE_PERCENT_RE = re.compile( + r"\b(?:confidence|extraction[_ -]?confidence)\b[^\d%]{0,40}(\d+(?:\.\d+)?)\s*%", + re.IGNORECASE, +) +UNGROUNDED_RE = re.compile( + r"\b(" + r"cannot determine|could not find|does not contain(?: a)? 
grounded answer|" + r"no grounded answer|no source|not available|not found|unable to determine|" + r"without a grounded source" + r")\b", + re.IGNORECASE, +) def _output_text_groups(payload: Mapping[str, Any]) -> list[str]: @@ -63,16 +76,61 @@ def extract_text(payload: Mapping[str, Any], *, empty_text: str = "") -> str: return empty_text +def _confidence_score(text: str) -> float | None: + match = CONFIDENCE_PERCENT_RE.search(text) + if not match: + return None + try: + percent = float(match.group(1)) + except ValueError: + return None + if percent < 0 or percent > 100: + return None + return percent / 100 + + +def _source_snippet(text: str, filename: str) -> str: + for line in text.splitlines(): + stripped = line.strip(" -*\t") + if filename in stripped: + return stripped + return f"Structured KPI answer cited {filename}." + + +def _structured_kpi_citation(payload: Mapping[str, Any]) -> list[dict[str, Any]]: + answer = extract_text(payload) + if not answer or UNGROUNDED_RE.search(answer): + return [] + + match = PDF_FILENAME_RE.search(answer) + if not match: + return [] + + filename = match.group(1) + citation: dict[str, Any] = { + "filename": filename, + "section_label": "Structured KPI extract", + "snippet": _source_snippet(answer, filename), + } + score = _confidence_score(answer) + if score is not None: + citation["score"] = score + return [citation] + + def extract_citations(payload: Mapping[str, Any]) -> list[dict[str, Any]]: citations = payload.get("citations") or payload.get("sources") or [] - if not isinstance(citations, list): - return [] normalized: list[dict[str, Any]] = [] - for citation in citations: - if isinstance(citation, Mapping): - normalized.append(dict(citation)) - elif citation is not None: - normalized.append({"source": str(citation)}) + if isinstance(citations, list): + for citation in citations: + if isinstance(citation, Mapping): + normalized.append(dict(citation)) + elif citation is not None: + normalized.append({"source": 
str(citation)}) + elif isinstance(citations, Mapping): + normalized.append(dict(citations)) + elif citations: + normalized.append({"source": str(citations)}) if normalized: return normalized @@ -93,7 +151,10 @@ def extract_citations(payload: Mapping[str, Any]) -> list[dict[str, Any]]: "section_label": "Knowledge Assistant citation", "snippet": snippet, }) - return normalized + if normalized: + return normalized + + return _structured_kpi_citation(payload) def normalise_agent_response( diff --git a/app/app.py b/app/app.py index 6ab1f04..384616f 100644 --- a/app/app.py +++ b/app/app.py @@ -14,12 +14,18 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.config import Config -from app.agent_bricks_client import invoke_agent_endpoint -from app.agent_bricks_response import normalise_agent_response -from app import lakebase_client +try: + from app.agent_bricks_client import invoke_agent_endpoint + from app.agent_bricks_response import normalise_agent_response + from app import lakebase_client +except ImportError: + from agent_bricks_client import invoke_agent_endpoint + from agent_bricks_response import normalise_agent_response + import lakebase_client AGENT_ENDPOINT = os.environ["DOCINTEL_AGENT_ENDPOINT"] # set by resources/consumers/analyst.app.yml +OBO_REQUIRED = os.environ.get("DOCINTEL_OBO_REQUIRED", "true").lower() == "true" @st.cache_resource(ttl=3600) @@ -32,9 +38,6 @@ def _user_client(token: str) -> WorkspaceClient: the initial HTTP request, then the connection switches to WebSocket — the token never refreshes. Long-lived sessions should reload the page after permission changes. - - Missing tokens are a deployment prerequisite failure. Production must run - through Databricks Apps user-token passthrough. 
""" return WorkspaceClient(config=Config( host=os.environ["DATABRICKS_HOST"], @@ -44,12 +47,15 @@ def _user_client(token: str) -> WorkspaceClient: def _agent_client() -> WorkspaceClient: token = st.context.headers.get("x-forwarded-access-token") + if token: + return _user_client(token) if not token: + if not OBO_REQUIRED: + return WorkspaceClient() raise RuntimeError( "Databricks Apps user-token passthrough is required; no " "x-forwarded-access-token header was present." ) - return _user_client(token) def _user_email() -> str: @@ -86,7 +92,7 @@ def _ensure_session() -> tuple[str, str]: def _render_citations(citations: list[dict]) -> None: if not citations: - st.caption("No citations — the agent did not find a grounded source.") + st.caption("No citation chips returned for this response.") return cols = st.columns(min(len(citations), 4)) for i, c in enumerate(citations[:4]): diff --git a/app/app.yaml b/app/app.yaml index 3f5537c..ffd1350 100644 --- a/app/app.yaml +++ b/app/app.yaml @@ -4,13 +4,9 @@ # injected port/host and disable Streamlit CORS/XSRF behind the Apps proxy. command: - - streamlit - - run - - app/app.py - - --server.port - - "${DATABRICKS_APP_PORT:-8000}" - - --server.address - - "0.0.0.0" + - sh + - -c + - exec streamlit run app.py --server.port "${DATABRICKS_APP_PORT:-8000}" --server.address 0.0.0.0 env: # Disable Streamlit's CORS + XSRF self-checks. The Apps reverse-proxy origin diff --git a/app/lakebase_client.py b/app/lakebase_client.py index caba5e5..286dfd5 100644 --- a/app/lakebase_client.py +++ b/app/lakebase_client.py @@ -2,8 +2,9 @@ Persists conversation history, query logs, and feedback per the contracts in `specs/001-doc-intel-10k/contracts/`. The Databricks App database resource -binding exposes standard Postgres env vars (PGHOST, PGPORT, PGUSER, -PGPASSWORD, PGDATABASE). +binding exposes Postgres connection env vars (PGHOST, PGPORT, PGUSER, +PGDATABASE, PGSSLMODE). Lakebase OAuth passwords are minted on demand with the +Databricks SDK. 
Databricks Apps + Lakebase docs (https://docs.databricks.com/aws/en/oltp/) — initialize schema at @@ -24,20 +25,35 @@ from typing import Iterator import psycopg +from psycopg import sql +from databricks.sdk import WorkspaceClient _log = logging.getLogger(__name__) -_SCHEMA = """ -CREATE TABLE IF NOT EXISTS conversation_history ( +def _lakebase_schema() -> str: + return os.environ.get("DOCINTEL_LAKEBASE_SCHEMA", "docintel_app") + + +def _table(name: str) -> sql.Identifier: + return sql.Identifier(_lakebase_schema(), name) + + +def _schema_ddl() -> sql.Composed: + conversation_history = _table("conversation_history") + query_logs = _table("query_logs") + feedback = _table("feedback") + return sql.SQL( + """ +CREATE TABLE IF NOT EXISTS {conversation_history} ( conversation_id UUID PRIMARY KEY, user_email TEXT NOT NULL, started_at TIMESTAMPTZ NOT NULL DEFAULT now(), last_turn_at TIMESTAMPTZ NOT NULL DEFAULT now() ); -CREATE TABLE IF NOT EXISTS query_logs ( +CREATE TABLE IF NOT EXISTS {query_logs} ( turn_id UUID PRIMARY KEY, - conversation_id UUID REFERENCES conversation_history(conversation_id), + conversation_id UUID REFERENCES {conversation_history}(conversation_id), question TEXT NOT NULL, answer TEXT NOT NULL, citations JSONB NOT NULL, @@ -45,15 +61,20 @@ agent_path TEXT NOT NULL, created_at TIMESTAMPTZ NOT NULL DEFAULT now() ); -CREATE TABLE IF NOT EXISTS feedback ( +CREATE TABLE IF NOT EXISTS {feedback} ( feedback_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - turn_id UUID REFERENCES query_logs(turn_id), + turn_id UUID REFERENCES {query_logs}(turn_id), user_email TEXT NOT NULL, rating TEXT NOT NULL CHECK (rating IN ('up','down')), comment TEXT, created_at TIMESTAMPTZ NOT NULL DEFAULT now() ); """ + ).format( + conversation_history=conversation_history, + query_logs=query_logs, + feedback=feedback, + ) @contextmanager @@ -63,24 +84,40 @@ def _conn() -> Iterator[psycopg.Connection]: conninfo = dsn kwargs = {} else: - required = ("PGHOST", "PGPORT", "PGUSER", 
"PGPASSWORD", "PGDATABASE") + required = ("PGHOST", "PGPORT", "PGUSER", "PGDATABASE") missing = [name for name in required if not os.environ.get(name)] if missing: raise RuntimeError(f"Lakebase binding missing Postgres env vars: {', '.join(missing)}") + password = os.environ.get("PGPASSWORD") or _generate_lakebase_password() conninfo = "" kwargs = { "host": os.environ["PGHOST"], "port": os.environ["PGPORT"], "user": os.environ["PGUSER"], - "password": os.environ["PGPASSWORD"], + "password": password, "dbname": os.environ["PGDATABASE"], + "sslmode": os.environ.get("PGSSLMODE", "require"), } with psycopg.connect(conninfo, autocommit=True, **kwargs) as c: yield c +def _generate_lakebase_password() -> str: + instance_name = os.environ.get("DOCINTEL_LAKEBASE_INSTANCE") or os.environ.get("PGDATABASE") + if not instance_name: + raise RuntimeError("Lakebase OAuth credential requires DOCINTEL_LAKEBASE_INSTANCE or PGDATABASE") + credential = WorkspaceClient().database.generate_database_credential( + request_id=str(uuid.uuid4()), + instance_names=[instance_name], + ) + token = getattr(credential, "token", None) + if not token: + raise RuntimeError("Lakebase OAuth credential response did not include a token") + return token + + def init_schema() -> None: - """Idempotent CREATE TABLE IF NOT EXISTS. Logs the connected role so + """Idempotent CREATE SCHEMA/TABLE IF NOT EXISTS. Logs the connected role so deployed-vs-local identity divergence is debuggable from app logs. 
""" with _conn() as c, c.cursor() as cur: @@ -97,14 +134,19 @@ def init_schema() -> None: ) else: _log.info("Lakebase init connected as %r", connected_user) - cur.execute(_SCHEMA) + cur.execute( + sql.SQL("CREATE SCHEMA IF NOT EXISTS {}").format(sql.Identifier(_lakebase_schema())) + ) + cur.execute(_schema_ddl()) def ensure_conversation(conversation_id: uuid.UUID, user_email: str) -> None: with _conn() as c, c.cursor() as cur: cur.execute( - "INSERT INTO conversation_history (conversation_id, user_email) VALUES (%s, %s) " - "ON CONFLICT (conversation_id) DO UPDATE SET last_turn_at = now()", + sql.SQL( + "INSERT INTO {table} (conversation_id, user_email) VALUES (%s, %s) " + "ON CONFLICT (conversation_id) DO UPDATE SET last_turn_at = now()" + ).format(table=_table("conversation_history")), (conversation_id, user_email), ) @@ -112,8 +154,10 @@ def ensure_conversation(conversation_id: uuid.UUID, user_email: str) -> None: def log_turn(*, turn_id: str, conversation_id: uuid.UUID, response: dict, question: str) -> None: with _conn() as c, c.cursor() as cur: cur.execute( - "INSERT INTO query_logs (turn_id, conversation_id, question, answer, citations, latency_ms, agent_path) " - "VALUES (%s, %s, %s, %s, %s::jsonb, %s, %s)", + sql.SQL( + "INSERT INTO {table} (turn_id, conversation_id, question, answer, citations, latency_ms, agent_path) " + "VALUES (%s, %s, %s, %s, %s::jsonb, %s, %s)" + ).format(table=_table("query_logs")), ( turn_id, conversation_id, @@ -129,6 +173,8 @@ def log_turn(*, turn_id: str, conversation_id: uuid.UUID, response: dict, questi def write_feedback(*, turn_id: str, user_email: str, rating: str, comment: str | None) -> None: with _conn() as c, c.cursor() as cur: cur.execute( - "INSERT INTO feedback (turn_id, user_email, rating, comment) VALUES (%s, %s, %s, %s)", + sql.SQL("INSERT INTO {table} (turn_id, user_email, rating, comment) VALUES (%s, %s, %s, %s)").format( + table=_table("feedback") + ), (turn_id, user_email, rating, comment), ) diff --git 
a/databricks.yml b/databricks.yml index 1ee6c38..badb9a2 100644 --- a/databricks.yml +++ b/databricks.yml @@ -37,8 +37,11 @@ variables: description: UC group granted SELECT/USE on the catalog/schema default: account users agent_endpoint_name: - description: Agent Bricks Supervisor serving endpoint name resolved by bootstrap_agent_bricks.py + description: Agent Bricks Supervisor serving endpoint name resolved by agent.document_intelligence_agent default: UNSET_AGENT_BRICKS_ENDPOINT + app_obo_required: + description: Whether the Databricks App requires user-token passthrough for Agent Bricks calls + default: "true" targets: demo: @@ -53,6 +56,7 @@ targets: catalog: workspace schema: docintel_10k_demo lakebase_instance: docintel-demo-state-v1 + app_obo_required: "false" resources: pipelines: doc_intel_pipeline: @@ -78,3 +82,12 @@ targets: schema: docintel_10k lakebase_instance: docintel-prod-state lakebase_stopped: false + app_obo_required: "true" + resources: + apps: + analyst_app: + user_api_scopes: + - serving.serving-endpoints + - sql + - iam.access-control:read + - iam.current-user:read diff --git a/docs/databricks-app-dogfood.png b/docs/databricks-app-dogfood.png new file mode 100644 index 0000000..811e5d1 Binary files /dev/null and b/docs/databricks-app-dogfood.png differ diff --git a/docs/design.md b/docs/design.md index 32b1220..0b2c5e7 100644 --- a/docs/design.md +++ b/docs/design.md @@ -8,7 +8,7 @@ This document covers the *why*, the architecture, and the build workflow behind - [Architecture](#architecture) - [Two halves: an offline pipeline, and an online agent](#two-halves-an-offline-pipeline-and-an-online-agent) - [Vector Search bridges data and agent](#vector-search-bridges-data-and-agent) - - [Agent has two paths, one endpoint](#agent-has-two-paths-one-endpoint) + - [Agent Bricks target runtime](#agent-bricks-target-runtime) - [Runtime stack](#runtime-stack) - [How it's built — three pillars](#how-its-built--three-pillars) - [Pillar 1 — 
Spec-Kit](#pillar-1--spec-kit-spec-driven-development) @@ -23,9 +23,7 @@ This document covers the *why*, the architecture, and the build workflow behind Databricks shipped a lot of new generative-AI surface area in 2025–2026: Document Intelligence (`ai_parse_document`, `ai_classify`, `ai_extract`), Agent Bricks, AI Gateway, Lakebase, and Databricks Apps. The two source articles for this reference are Databricks' Document Intelligence launch article ("Why Your Agents Can't Read Enterprise Documents") and the Agent Bricks platform article. The reference exists to demonstrate those patterns end to end: parse messy enterprise PDFs into a governed document data layer, then build a governed agent on that enriched layer through Agent Bricks. -This repo is that worked example. Drop a PDF into a governed UC volume; ten minutes later, an analyst can ask cited questions in plain English with end-to-end audit. The desired target architecture is **Agent Bricks-first**: Document Intelligence prepares the governed source of truth; Knowledge Assistant handles cited document Q&A; Supervisor Agent coordinates document Q&A with structured KPI tools; AI Gateway, Unity Catalog, OBO, Lakebase, and CLEARS provide the governance and operating layer. - -The earlier custom `mlflow.pyfunc` agent path diverged from that target by re-introducing custom serving lifecycle, auth-policy ordering, retrieval, and supervisor code that Agent Bricks is meant to absorb. The production path now uses Agent Bricks bootstrap instead of that custom runtime. +This repo is that worked example. Drop a PDF into a governed UC volume; ten minutes later, an analyst can ask cited questions in plain English with end-to-end audit. Document Intelligence prepares the governed source of truth; Knowledge Assistant handles cited document Q&A; Supervisor Agent coordinates document Q&A with structured KPI tools; AI Gateway, Unity Catalog, OBO, Lakebase, and CLEARS provide the governance and operating layer. 
It also demonstrates a development workflow: **Spec-Kit** for spec-driven design, **Claude Code** with Databricks skill bundles for AI-assisted implementation, six **non-negotiable constitution principles** that gate every plan. See [How it's built](#how-its-built--three-pillars). @@ -93,7 +91,7 @@ It also demonstrates a development workflow: **Spec-Kit** for spec-driven design "Quality before retrieval." ``` -**Ownership note**: DAB manages the Vector Search **endpoint** (`resources/foundation/filings_index.yml`) and the index-refresh **job** (`resources/consumers/index_refresh.job.yml`). The **index** itself isn't yet a DAB-managed resource type as of CLI 0.298 — `jobs/index_refresh/sync_index.py` creates the Delta-Sync index on first run and triggers a sync on subsequent runs. The endpoint lives in foundation so first-deploy bootstrap can materialize the index before `scripts/bootstrap_agent_bricks.py` attaches it to Knowledge Assistant. +**Ownership note**: DAB manages the Vector Search **endpoint** (`resources/foundation/filings_index.yml`) and the index-refresh **job** (`resources/consumers/index_refresh.job.yml`). The **index** itself isn't yet a DAB-managed resource type as of CLI 0.298 — `jobs/index_refresh/sync_index.py` creates the Delta-Sync index on first run and triggers a sync on subsequent runs. The endpoint lives in foundation so first-deploy bootstrap can materialize the index before `agent/document_intelligence_agent.py` attaches it to Knowledge Assistant. 
### Agent Bricks target runtime @@ -118,15 +116,25 @@ It also demonstrates a development workflow: **Spec-Kit** for spec-driven design └──────────┬──────────┘ ▼ ┌──────────────────────┐ - │ Response JSON / App │ - │ citations, feedback │ + │ Agent output → App │ + │ final answer, │ + │ citations, feedback, │ │ latency, audit │ └──────────────────────┘ ``` -Knowledge Assistant is the default single-filing Q&A path because the Agent Bricks article positions the hard part as governed context, identity, and observability rather than hand-building the agent loop. Supervisor Agent is the default cross-company orchestration path. Custom code is allowed only where it is business logic around Agent Bricks, such as a deterministic KPI table tool or the App-specific feedback UI. It must not replace Knowledge Assistant, Supervisor Agent, Agent Bricks serving, or Agent Bricks governance. +Databricks creation path: [Create an AI agent](https://docs.databricks.com/aws/en/generative-ai/agent-framework/create-agent) → Knowledge Assistant for document Q&A. Supervisor Agent coordinates the Knowledge Assistant and UC function tools. + +Repository code is limited to deterministic tool glue, app UI, evals, and deployment scripts. -**Removed divergence**: the custom `agent/analyst_agent.py`, `agent/retrieval.py`, `agent/supervisor.py`, `agent/log_and_register.py`, and `resources/consumers/agent.serving.yml` path has been removed. `scripts/bootstrap_agent_bricks.py` is now the production bootstrap for Knowledge Assistant, the UC KPI function, and Supervisor Agent configuration. +**Concrete Agent Bricks wiring**: + +- `agent/document_intelligence_agent.py` creates or updates `doc-intel-knowledge-${target}` as the Knowledge Assistant. Its source is the Vector Search index over `gold_filing_sections_indexable`, with `summary` as the text column and `filename` as the document URI column. +- `agent/document_intelligence_agent.py` creates or updates the UC SQL function `<catalog>.<schema>.lookup_10k_kpis`. 
+- `doc-intel-supervisor-${target}` is the Supervisor Agent. Its tools are the Knowledge Assistant and the UC SQL KPI function. Supervisor Agent owns tool routing. +- Agent Bricks generates concrete serving endpoint names for Knowledge Assistant and Supervisor Agent. The repo resolves the live Supervisor endpoint with `scripts/resolve-agent-endpoint.sh` and passes it into DAB as `agent_endpoint_name`. +- Serving endpoint permissions are granted by endpoint ID after the generated endpoint is ready. The Databricks App does not bind to the endpoint as a resource; it invokes the resolved endpoint directly. Prod uses each user's OBO token. Demo uses the App service principal when `DOCINTEL_OBO_REQUIRED=false`. +- Agent Bricks responses use an OpenAI Responses-style `output` message sequence in current validation. The app displays the last output text group as the answer. Knowledge Assistant citations have been observed as markdown footnotes in intermediate messages, so `app/agent_bricks_response.py` normalizes those footnotes into citation chips. ### Runtime stack @@ -158,15 +166,18 @@ Knowledge Assistant is the default single-filing Q&A path because the Agent Bric │ │ │ at row-by-row) │ └────────────────────────┘ └────────────────────────┘ - OBO (user identity end-to-end, mandatory): - ────────────────────────────── - App reads `x-forwarded-access-token` from the request and invokes the + Target auth modes: + ───────────────── + Prod reads `x-forwarded-access-token` from the request and invokes the Agent Bricks endpoint with the user's identity. AI Gateway and Unity Catalog enforce identity, permissions, audit, and routing across the agent, model, tools, and data. User token passthrough is a hard - prerequisite for production. If the workspace cannot provide end-to-end - OBO, deployment must fail rather than silently falling back to a service - principal identity. + prerequisite for production. 
+ + Demo can set `DOCINTEL_OBO_REQUIRED=false`; the App service principal then + invokes the generated Supervisor endpoint and receives `CAN_QUERY` after + deploy. This is for development workspaces without Apps user-token + passthrough, not for production. ``` **Why Postgres for state?** Delta tables are great for analytics but bad at "insert one tiny row per chat turn at high frequency." Lakebase is Databricks's managed Postgres — same governance, right tool for the job. @@ -216,7 +227,7 @@ When you read `specs/001-doc-intel-10k/plan.md` you'll see a "Constitution Check ### Pillar 2 — Databricks Asset Bundles + the Claude Code skill suite -[**Databricks Asset Bundles**](https://docs.databricks.com/aws/en/dev-tools/bundles/) (DABs) describe most of the workspace state as YAML. One root `databricks.yml` declares variables and targets (`demo`, `prod`); `resources/**/*.yml` declares each resource (pipeline, jobs, Vector Search endpoint, index-refresh job, Agent Bricks endpoint/configuration, app, monitor, dashboard, Lakebase instance + catalog). `databricks bundle deploy -t demo` reconciles workspace state to YAML. The Vector Search **index** is still created and synced by `jobs/index_refresh/sync_index.py` until DAB supports index resources directly. +[**Databricks Asset Bundles**](https://docs.databricks.com/aws/en/dev-tools/bundles/) (DABs) describe most of the workspace state as YAML. One root `databricks.yml` declares variables and targets (`demo`, `prod`); `resources/**/*.yml` declares each DAB-managed resource (pipeline, jobs, Vector Search endpoint, app, monitor, dashboard, Lakebase instance + catalog). `databricks bundle deploy -t demo` reconciles workspace state to YAML. The Vector Search **index** is still created and synced by `jobs/index_refresh/sync_index.py` until DAB supports index resources directly. 
Agent Bricks Knowledge Assistant and Supervisor Agent are SDK-managed by `agent/document_intelligence_agent.py`; DAB only passes the resolved generated Supervisor endpoint into the app through `agent_endpoint_name`. This repo was built with Databricks-specific Claude Code skill bundles. Those bundles are distributed by Databricks via the CLI / Claude Code plugin channel and **are not vendored in this open-source tree** — install them locally if you have access, or reference the canonical Databricks docs (mapping in [`../CONTRIBUTING.md`](../CONTRIBUTING.md)). @@ -260,13 +271,13 @@ DABs deploy *everything in one shot*. But our resources have a chicken-and-egg p │ ▸ Tables ────┼──── all need each other │ │ ▸ Vector idx ───┤ │ │ ▸ Agent Bricks ──┤ Monitor wants the │ - │ ▸ App ───┤ KPI table to exist │ + │ ▸ App config ───┤ KPI table to exist │ │ ▸ App ───┤ BEFORE it can attach │ │ ▸ Monitor ────┘ │ │ ▸ Lakebase ──── │ └────────────────────────────────────────────────┘ - App needs the Agent Bricks Supervisor endpoint. + App needs the generated Agent Bricks Supervisor endpoint name. Supervisor needs Knowledge Assistant + UC function tools. Knowledge Assistant needs the Vector Search index. Monitor needs the table populated. @@ -289,7 +300,7 @@ The fix is a **staged deploy** orchestrated by `scripts/bootstrap-demo.sh`. Reso └── consumers/ ← need foundation to be RUNNING and producing data ├── kpi_drift.yml (needs gold_filing_kpis table) ├── index_refresh.job.yml (needs source table) - ├── analyst.app.yml (needs Lakebase + generated agent endpoint) + ├── analyst.app.yml (needs Lakebase + generated agent endpoint name) ├── usage.dashboard.yml └── lakebase_catalog.yml (needs instance AVAILABLE) ``` @@ -332,7 +343,7 @@ The fix is a **staged deploy** orchestrated by `scripts/bootstrap-demo.sh`. 
Reso └──────────────────────────┘ ``` -**Why two modes?** DAB tracks resource state; if you run the temp-rename trick against an existing deployment, DAB sees the consumer YAMLs as removed and plans to delete the app, monitor, dashboard, etc. Appropriate on a fresh workspace; destructive in steady-state. The script detects mode and does the right thing. +**Why two modes?** DAB tracks resource state; if you run the temp-rename trick against an existing deployment, DAB sees the consumer YAMLs as removed and plans to delete the app, monitor, dashboard, etc. Use FIRST-DEPLOY only for a fresh workspace; use STEADY-STATE after resources exist. CI (`.github/workflows/deploy.yml`) assumes steady-state — the first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-demo.sh`. After that, every push to `main` runs the steady-state path: full `bundle deploy` → refresh data → sync index → update Agent Bricks → grants → CLEARS gate. @@ -344,7 +355,7 @@ For the per-step procedure and known failure modes, see [`runbook.md` § Known d - **Wiring `ai_parse_document` into Lakeflow SDP** — pattern for streaming-tables + `STREAM(...)` views + `APPLY CHANGES INTO` keyed on filename. - **Scoring document quality before retrieval** — five 0–6 dimensions in SQL, threshold filter on the index source. -- **Building on Agent Bricks instead of custom agent loops** — Knowledge Assistant for cited document Q&A, Supervisor Agent for orchestration, deterministic KPI tool glue for structured comparisons. +- **Agent Bricks orchestration** — Knowledge Assistant for cited document Q&A, Supervisor Agent for orchestration, deterministic KPI tool glue for structured comparisons. - **Grounding an agent with citations** — Document Intelligence output and the governed Vector Search / Knowledge Assistant source provide the citation-bearing context. 
- **Handling DAB deploy ordering** — chicken-egg dependencies between heterogeneous resources, solved with a 5-step bootstrap rather than `depends_on` (which DAB doesn't reliably honor across resource types). - **Gating deploys on MLflow eval** — `mlflow.evaluate(model_type="databricks-agent")` with documented metric keys, per-axis thresholds, exit-code gate in CI. diff --git a/docs/runbook.md b/docs/runbook.md index fd401ce..fb3c065 100644 --- a/docs/runbook.md +++ b/docs/runbook.md @@ -35,26 +35,42 @@ If a filing scores below threshold: ## Update Agent Bricks configuration -Agent Bricks resources are managed by `scripts/bootstrap_agent_bricks.py`. Run it after changes to Knowledge Assistant instructions, Supervisor instructions, or the KPI tool function: +Agent Bricks resources are defined and applied by `agent/document_intelligence_agent.py`. Run it after changes to Knowledge Assistant instructions, Supervisor instructions, or the KPI tool function: ```bash DOCINTEL_CATALOG= \ DOCINTEL_SCHEMA= \ DOCINTEL_WAREHOUSE_ID= \ -python scripts/bootstrap_agent_bricks.py --target demo +python -m agent.document_intelligence_agent --target demo ``` This creates or updates the Knowledge Assistant, syncs the Vector Search knowledge source, creates or updates the UC SQL KPI function, and wires both into the Supervisor Agent endpoint. +Agent Bricks generates concrete serving endpoint names. After applying the agent definition, always resolve the live Supervisor endpoint before deploying or restarting the app: + +```bash +AGENT_ENDPOINT_NAME="$(./scripts/resolve-agent-endpoint.sh demo)" +databricks bundle deploy -t demo --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" +databricks bundle run -t demo --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" analyst_app +``` + +The app receives the generated endpoint as `DOCINTEL_AGENT_ENDPOINT`. 
+ +## Agent Bricks invocation and citations + +The app and eval runner invoke the generated Supervisor endpoint through `POST /serving-endpoints/{endpoint}/invocations`. Prod uses the user's OBO token. Demo uses the App service principal when `DOCINTEL_OBO_REQUIRED=false`. They do not use `WorkspaceClient.serving_endpoints.query()` for Agent Bricks calls because workspace validation showed that path did not preserve the needed Agent Bricks response shape. + +Current Agent Bricks output is an OpenAI Responses-style `output` message sequence. `app/agent_bricks_response.py` displays the last output text group as the final answer. Knowledge Assistant citations were observed during 2026-04-26 validation as markdown footnotes in intermediate messages, such as `[^p1]: ... _ACME_10K_2024.pdf_`; the app extracts filenames from those footnotes for citation chips. If citation chips show only `source`, capture a live payload and grep for `[^` and `.pdf_` to confirm whether the Knowledge Assistant citation format changed. + ## Inspect CLEARS metrics in MLflow CI resolves the generated Agent Bricks Supervisor serving endpoint, then runs `python evals/clears_eval.py --endpoint "$AGENT_ENDPOINT_NAME"` after each `demo` deploy. Look for the experiment `/Shared/docintel-clears-`; each run logs: - Per-axis metrics: `correctness`, `adherence`, `relevance`, `execution`, `safety`, `latency_p95_ms` -- Per-category slices: `p2_correctness`, `p3_correctness` - Per-question latency: `latency_ms_` +- Per-category slices: `p2_correctness`, `p3_correctness`, only when the active MLflow/databricks-agents output includes per-row correctness columns -Failures are logged as a JSON list under the run tag `failures`. The script exit-code-fails the deploy if any threshold is missed (FR-010, SC-002, SC-003). +Metric key names can vary across MLflow/databricks-agents versions. 
The eval runner maps current aggregate keys such as `correctness/percentage`, `guideline_adherence/percentage`, `groundedness/percentage`, and `safety/percentage` to the CLEARS axes. Failures are logged as a JSON list under the run tag `failures`. The script exit-code-fails the deploy if any threshold is missed (FR-010, SC-002, SC-003). ## Common failure modes @@ -62,21 +78,21 @@ Failures are logged as a JSON list under the run tag `failures`. The script exit |---|---|---| | `bundle validate` fails on `ai_parse_document` | Workspace lacks AI Functions GA | Move SQL warehouse to a recent serverless channel | | Vector Search index sync stuck | Embedding endpoint not provisioned | Provision `databricks-bge-large-en` or override `var.embedding_model_endpoint_name` | -| Agent endpoint 401 | OBO not plumbed end-to-end | Verify `app/app.py:_user_client` reads `x-forwarded-access-token` and `resources/consumers/analyst.app.yml:user_api_scopes` includes `serving.serving-endpoints` and `sql` | -| Agent answers ignore user UC permissions | OBO scopes wiped by `bundle run` (documented destructive-update behavior — see [Databricks Apps deploy docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/deploy)) | Re-apply: `databricks apps update doc-intel-analyst-demo --user-api-scopes serving.serving-endpoints,sql,iam.access-control:read,iam.current-user:read` | +| `DOCINTEL_AGENT_ENDPOINT` is `UNSET_AGENT_BRICKS_ENDPOINT` | Bundle deploy/run omitted the generated endpoint variable | Re-run with `--var "agent_endpoint_name=$(./scripts/resolve-agent-endpoint.sh demo)"` | +| Agent endpoint 401 | Target auth mode does not have endpoint access | Demo: verify bootstrap/CI granted the App SP `CAN_QUERY` on the generated endpoint. 
Prod: verify `x-forwarded-access-token` is present and target `user_api_scopes` include `serving.serving-endpoints` and `sql` | +| App deploy fails with `Databricks Apps - user token passthrough feature is not enabled` | Prod target requires a workspace/org prerequisite | Enable Databricks Apps user-token passthrough and rerun. Demo should keep `app_obo_required=false` unless validating OBO | +| Agent answers ignore user UC permissions in prod | OBO scopes wiped by `bundle run` (documented destructive-update behavior — see [Databricks Apps deploy docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/deploy)) | Re-apply scopes to the target app: `databricks apps update <app-name> --user-api-scopes serving.serving-endpoints,sql,iam.access-control:read,iam.current-user:read` | +| Agent deployment cannot grant endpoint query permission | Permissions API was called with endpoint name instead of internal endpoint ID, or the generated endpoint is not ready | Use current `agent/document_intelligence_agent.py`; it waits for readiness and grants by serving endpoint ID | | Streamlit user sees stale UC permissions | OBO token captured at WebSocket open; never refreshes ([Databricks Apps runtime docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime)) | Reload the page after permission changes | -| Lakebase tables not writable from deployed App | Local-dev `streamlit run` initialised schema under user identity, not App SP | Connect as App SP and `DROP TABLE feedback, query_logs, conversation_history`; next App run re-creates them under SP. See `app/README.md` | +| Lakebase tables not writable from deployed App | Local-dev `streamlit run` initialised the `docintel_app` schema under user identity, not App SP | Connect as App SP and `DROP SCHEMA docintel_app CASCADE`; next App run re-creates it under SP. 
See `app/README.md` | | CLEARS Latency axis fails | Agent Bricks orchestration or Knowledge Assistant source is too broad | Narrow the Knowledge Assistant source, tune Supervisor instructions, or reduce structured-tool fan-out | -| App errors connecting to Lakebase | Database resource binding missing Postgres env vars | Check the `docintel-lakebase` resource binding and `PGHOST`/`PGPORT`/`PGUSER`/`PGPASSWORD`/`PGDATABASE` in the App runtime | +| Citation chips render but filenames show `source` | Knowledge Assistant footnote format changed or omitted filename markers | Capture the raw Agent Bricks payload and compare it with `app/agent_bricks_response.py`'s markdown-footnote parser | +| App errors connecting to Lakebase | Database resource binding missing connection fields, OAuth credential minting failed, or App SP lacks Lakebase instance `CAN_USE` | Check the `docintel-lakebase` resource binding plus `PGHOST`/`PGPORT`/`PGUSER`/`PGDATABASE`/`DOCINTEL_LAKEBASE_INSTANCE`/`DOCINTEL_LAKEBASE_SCHEMA` in the App runtime. `PGPASSWORD` is minted at connection time by `app/lakebase_client.py` | ## Verifying end-to-end OBO -Databricks Apps user-token passthrough, Agent Bricks OBO, AI Gateway identity enforcement, and UC grants are production prerequisites. Bootstrap must fail if any required scope or workspace feature is missing. - -To verify OBO end-to-end: - 1. **Workspace admin** enables the "Databricks Apps - user token passthrough" feature in workspace settings. -2. Confirm the `user_api_scopes` block in `resources/consumers/analyst.app.yml` is present. Required scopes for the analyst app's call chain: +2. Confirm the required scopes are declared on the prod target in `databricks.yml`: ```yaml user_api_scopes: - serving.serving-endpoints # invoke Agent Bricks endpoint as user @@ -84,7 +100,16 @@ To verify OBO end-to-end: - iam.access-control:read # default - iam.current-user:read # default ``` -3. 
Redeploy: `databricks bundle deploy -t demo && databricks bundle run -t demo analyst_app`. + +Demo uses `app_obo_required=false` unless overridden; the bootstrap grants the App service principal `CAN_QUERY` on the generated Supervisor endpoint. + +3. Redeploy the OBO-enabled target: + ```bash + TARGET=prod + AGENT_ENDPOINT_NAME="$(./scripts/resolve-agent-endpoint.sh "$TARGET")" + databricks bundle deploy -t "$TARGET" --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" + databricks bundle run -t "$TARGET" --var "agent_endpoint_name=${AGENT_ENDPOINT_NAME}" analyst_app + ``` 4. Verify: bootstrap scope checks assert required scopes. Visit the deployed app, ask a question, and confirm in audit logs that Agent Bricks, Knowledge Assistant, and structured KPI SQL calls run under the invoking user's identity. ## CLEARS thresholds @@ -106,25 +131,31 @@ Changing any threshold requires a constitution amendment per the Governance sect ## v1 baseline -(populate after the first successful `demo` deploy) +No passing v1 baseline has been recorded yet. Latest demo evidence as of 2026-04-26: ``` -MLflow run ID: -Deployed at: -P2 correctness: -P3 correctness: -Latency p95: +MLflow run ID: 772e902cab92459f9bf569296fc5f801 +Deployed at: 2026-04-26 +Correctness: 0.323 +Adherence: 0.000 +Relevance/grounding: 0.516 +Safety: 1.000 +Execution: 1.000 +Latency p95: 31711 ms +P2/P3 slices: unavailable in the current aggregate metric output ``` +Do not promote this as reference-ready until CLEARS passes. Do not promote to prod until Databricks Apps user-token passthrough is enabled and OBO audit evidence is captured in the target workspace. + ## Known deploy ordering gaps The bundle has three chicken-egg dependencies that a single `bundle deploy` cannot resolve on a fresh workspace. Each needs a phase-2 step after a prior side effect: -1. **Databricks App binds to an Agent Bricks endpoint** +1. 
**Databricks App needs the generated Agent Bricks endpoint name** - Agent Bricks generates concrete Knowledge Assistant and Supervisor serving endpoint names. - - `scripts/bootstrap_agent_bricks.py` returns the generated Supervisor + - `agent/document_intelligence_agent.py` returns the generated Supervisor endpoint, and `resources/consumers/analyst.app.yml` injects it into `DOCINTEL_AGENT_ENDPOINT` via the `agent_endpoint_name` bundle variable. - **Fix**: bootstrap creates data and Agent Bricks resources before the full diff --git a/jobs/index_refresh/sync_index.py b/jobs/index_refresh/sync_index.py index 34e0c45..7310889 100644 --- a/jobs/index_refresh/sync_index.py +++ b/jobs/index_refresh/sync_index.py @@ -10,7 +10,6 @@ import argparse from datetime import timedelta import logging -import sys import time from databricks.sdk import WorkspaceClient @@ -35,7 +34,33 @@ def _wait_index_ready(w: WorkspaceClient, index_name: str, *, timeout_seconds: i time.sleep(15) -def main() -> int: +def _sync_index_when_ready(w: WorkspaceClient, index_name: str, *, timeout_seconds: int = 1200) -> None: + deadline = time.time() + timeout_seconds + next_log = 60 + started = time.time() + while True: + try: + w.vector_search_indexes.sync_index(index_name) + return + except Exception as exc: + message = str(exc) + transient = "not ready to sync yet" in message or "needs to be in one of the following states" in message + if not transient or time.time() >= deadline: + raise + elapsed = int(time.time() - started) + if elapsed >= next_log: + log_message = message.splitlines()[0] if message else type(exc).__name__ + logging.getLogger("vs-sync").info( + "index %s is not syncable yet after %ss: %s", + index_name, + elapsed, + log_message, + ) + next_log += 60 + time.sleep(15) + + +def main() -> None: p = argparse.ArgumentParser() p.add_argument("--endpoint", required=True) p.add_argument("--index", required=True) @@ -74,13 +99,13 @@ def main() -> int: ) _wait_index_ready(w, args.index) 
log.info("index created and initial sync complete") - return 0 + return log.info("index %s exists; triggering sync", args.index) - w.vector_search_indexes.sync_index(args.index) + _wait_index_ready(w, args.index) + _sync_index_when_ready(w, args.index) log.info("sync triggered") - return 0 if __name__ == "__main__": - sys.exit(main()) + main() diff --git a/resources/consumers/analyst.app.yml b/resources/consumers/analyst.app.yml index fbe9b3b..7e124b5 100644 --- a/resources/consumers/analyst.app.yml +++ b/resources/consumers/analyst.app.yml @@ -9,11 +9,18 @@ resources: env: - name: DOCINTEL_AGENT_ENDPOINT value: ${var.agent_endpoint_name} + - name: DOCINTEL_OBO_REQUIRED + value: ${var.app_obo_required} + - name: DOCINTEL_LAKEBASE_INSTANCE + value: ${var.lakebase_instance} + - name: DOCINTEL_LAKEBASE_SCHEMA + value: docintel_app # Databricks Apps auto-grants Lakebase permissions to the App SP on # deploy — see https://docs.databricks.com/aws/en/dev-tools/databricks-apps/access-data. - # Agent Bricks endpoint access is granted to the analyst group by - # scripts/bootstrap_agent_bricks.py because calls use OBO user identity. + # Agent Bricks endpoint access is granted outside this binding: + # agent/document_intelligence_agent.py grants analyst users/groups, and + # bootstrap/CI grants the App SP when demo runs with OBO disabled. resources: - name: docintel-lakebase database: @@ -21,12 +28,5 @@ resources: instance_name: ${var.lakebase_instance} permission: CAN_CONNECT_AND_CREATE - # Mandatory OBO scopes (Databricks Apps IAM/auth docs: - # https://docs.databricks.com/aws/en/dev-tools/databricks-apps/iam-auth) - # require the workspace-level "Databricks Apps - user token passthrough". - # Deployment must fail if the workspace cannot grant these scopes. - user_api_scopes: - - serving.serving-endpoints - - sql - - iam.access-control:read - - iam.current-user:read + # Prod declares user_api_scopes in databricks.yml. 
Demo leaves them unset + # so workspaces without Apps user-token passthrough can use App-SP calls. diff --git a/resources/consumers/lakebase_catalog.yml b/resources/consumers/lakebase_catalog.yml index 07a5f63..42735b8 100644 --- a/resources/consumers/lakebase_catalog.yml +++ b/resources/consumers/lakebase_catalog.yml @@ -1,7 +1,7 @@ resources: database_catalogs: docintel_state_catalog: - name: ${var.lakebase_instance} + name: ${var.schema}_state database_instance_name: ${var.lakebase_instance} database_name: ${var.lakebase_instance} create_database_if_not_exists: true diff --git a/resources/foundation/filings_index.yml b/resources/foundation/filings_index.yml index 3865751..52b1864 100644 --- a/resources/foundation/filings_index.yml +++ b/resources/foundation/filings_index.yml @@ -5,7 +5,7 @@ resources: endpoint_type: STANDARD # Index `${var.catalog}.${var.schema}.filings_summary_idx` is created and synced by -# `jobs/index_refresh/sync_index.py` (see resources/jobs/index_refresh.job.yml). +# `jobs/index_refresh/sync_index.py` (see resources/consumers/index_refresh.job.yml). # Vector Search *indexes* are not a DAB-managed resource type as of CLI 0.298; # only endpoints are. The Python task creates the Delta-Sync index on first # run and triggers a sync on subsequent runs. 
diff --git a/scripts/bootstrap-demo.sh b/scripts/bootstrap-demo.sh index cf64e80..a517f05 100755 --- a/scripts/bootstrap-demo.sh +++ b/scripts/bootstrap-demo.sh @@ -109,16 +109,16 @@ set_agent_endpoint_name() { log " using Agent Bricks Supervisor endpoint $AGENT_ENDPOINT_NAME" } -run_agent_bricks_bootstrap() { - local bootstrap_json endpoint - bootstrap_json=$("$PYTHON" scripts/bootstrap_agent_bricks.py \ +deploy_document_intelligence_agent() { + local agent_json endpoint + agent_json=$("$PYTHON" -m agent.document_intelligence_agent \ --target "$TARGET" \ --catalog "$DOCINTEL_CATALOG" \ --schema "$DOCINTEL_SCHEMA" \ --warehouse-id "$DOCINTEL_WAREHOUSE_ID" \ --analyst-group "$ANALYST_GROUP") || \ - die "Agent Bricks bootstrap failed" - endpoint=$(printf '%s' "$bootstrap_json" | "$PYTHON" -c " + die "Document Intelligence Agent deployment failed" + endpoint=$(printf '%s' "$agent_json" | "$PYTHON" -c " import json, sys payload = json.load(sys.stdin) print(payload.get('supervisor_endpoint') or '') @@ -153,6 +153,15 @@ with open('databricks.yml') as f: d = yaml.safe_load(f) print(d['targets']['$TARGET']['variables']['lakebase_instance']) " 2>/dev/null || echo "") +APP_OBO_REQUIRED=$("$PYTHON" -c " +import yaml +with open('databricks.yml') as f: + d = yaml.safe_load(f) +default = d.get('variables', {}).get('app_obo_required', {}).get('default', 'true') +value = d.get('targets', {}).get('$TARGET', {}).get('variables', {}).get('app_obo_required', default) +print(str(value).lower()) +" 2>/dev/null || echo "true") + if [[ -n "$LAKEBASE_NAME" ]]; then if instances=$(databricks api get /api/2.0/database/instances --output json 2>/dev/null); then conflict=$("$PYTHON" -c " @@ -195,6 +204,85 @@ for i in d.get('database_instances', []): done } +app_sp_principals() { + local app_json="$1" + printf '%s' "$app_json" | "$PYTHON" -c " +import json, sys +app = json.load(sys.stdin) +seen = set() +for key in ('service_principal_client_id', 'service_principal_name', 
'service_principal_id'): + value = app.get(key) + if value is None: + continue + value = str(value) + if value and value not in seen: + seen.add(value) + print(value) +" +} + +grant_app_sp_lakebase_use() { + local app_json="$1" + local principals principal grant_json + principals=() + while IFS= read -r principal; do + [[ -n "$principal" ]] && principals+=("$principal") + done < <(app_sp_principals "$app_json") + if (( ${#principals[@]} == 0 )); then + die "app service principal was not returned by Databricks Apps API" + fi + for principal in "${principals[@]}"; do + grant_json=$("$PYTHON" -c " +import json, sys +print(json.dumps({ + 'access_control_list': [{ + 'service_principal_name': sys.argv[1], + 'permission_level': 'CAN_USE', + }] +})) +" "$principal") + if databricks permissions update database-instances "$LAKEBASE_NAME" --json "$grant_json" >/dev/null 2>&1; then + log " granted CAN_USE on Lakebase $LAKEBASE_NAME to App SP $principal" + return 0 + fi + done + die "failed to grant CAN_USE on Lakebase $LAKEBASE_NAME to the App service principal" +} + +grant_app_sp_endpoint_query() { + local app_json="$1" + local endpoint_json endpoint_id principals principal grant_json + endpoint_json=$(databricks serving-endpoints get "$AGENT_ENDPOINT_NAME" --output json) + endpoint_id=$(printf '%s' "$endpoint_json" | "$PYTHON" -c " +import json, sys +endpoint = json.load(sys.stdin) +print(endpoint.get('id') or endpoint.get('name') or '$AGENT_ENDPOINT_NAME') +") + principals=() + while IFS= read -r principal; do + [[ -n "$principal" ]] && principals+=("$principal") + done < <(app_sp_principals "$app_json") + if (( ${#principals[@]} == 0 )); then + die "app service principal was not returned by Databricks Apps API" + fi + for principal in "${principals[@]}"; do + grant_json=$("$PYTHON" -c " +import json, sys +print(json.dumps({ + 'access_control_list': [{ + 'service_principal_name': sys.argv[1], + 'permission_level': 'CAN_QUERY', + }] +})) +" "$principal") + if databricks 
permissions update serving-endpoints "$endpoint_id" --json "$grant_json" >/dev/null 2>&1; then + log " granted CAN_QUERY on $AGENT_ENDPOINT_NAME to App SP $principal" + return 0 + fi + done + die "failed to grant CAN_QUERY on $AGENT_ENDPOINT_NAME to the App service principal" +} + upload_samples() { log " uploading synthetic samples to $VOLUME_PATH" shopt -s nullglob @@ -256,7 +344,7 @@ if [[ "$MODE" == "first" ]]; then --embedding-endpoint "$EMBEDDING_ENDPOINT" || \ die "VS index creation failed (sync_index.py)" - run_agent_bricks_bootstrap + deploy_document_intelligence_agent wait_for_lakebase_available log "step 3/6: stage-2 deploy (full bundle — consumers join the foundation)" @@ -286,7 +374,7 @@ else die "timed out waiting for $KPI_TABLE" databricks bundle run -t "$TARGET" "${BUNDLE_VAR_FLAGS[@]}" index_refresh || \ log " warn: index_refresh failed; the table_update trigger will retry on the next pipeline run" - run_agent_bricks_bootstrap + deploy_document_intelligence_agent log "step 3/6: skipped (no second deploy needed in steady-state)" fi @@ -308,10 +396,10 @@ databricks api patch \ --json "{\"changes\":[{\"principal\":\"${ANALYST_GROUP}\",\"add\":[\"USE_SCHEMA\",\"SELECT\",\"EXECUTE\"]}]}" \ >/dev/null 2>&1 || log " warn: schema grants failed (may already be applied; UC dedupes)" -# OBO scope verification (only meaningful when user_api_scopes is declared). 
-if grep -q '^ user_api_scopes:' resources/consumers/analyst.app.yml 2>/dev/null; then - log " verifying OBO scopes on $APP_NAME" - if app_state=$(databricks apps get "$APP_NAME" --output json 2>/dev/null); then +log " verifying app auth mode on $APP_NAME" +if app_state=$(databricks apps get "$APP_NAME" --output json 2>/dev/null); then + grant_app_sp_lakebase_use "$app_state" + if [[ "$APP_OBO_REQUIRED" == "true" ]]; then "$PYTHON" -c " import json app = json.loads('''$app_state''') @@ -323,10 +411,18 @@ if missing: print(f' OBO scopes intact: {sorted(scopes)}') " || die "OBO scopes missing after deploy" else - die "unable to read app state for OBO verification" + "$PYTHON" -c " +import json +app = json.loads('''$app_state''') +scopes = app.get('user_api_scopes') +if scopes: + raise SystemExit(f'demo App-SP mode expected no user_api_scopes, got {scopes}') +print(' OBO disabled for demo; user_api_scopes unset') +" + grant_app_sp_endpoint_query "$app_state" fi else - die "resources/consumers/analyst.app.yml must declare user_api_scopes; OBO is mandatory" + die "unable to read app state for OBO verification" fi # ─── Step 6: smoke check ───────────────────────────────────────────────────── diff --git a/specs/001-doc-intel-10k/contracts/agent-request.json b/specs/001-doc-intel-10k/contracts/agent-request.json index d17d011..a654ce4 100644 --- a/specs/001-doc-intel-10k/contracts/agent-request.json +++ b/specs/001-doc-intel-10k/contracts/agent-request.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", "title": "Agent Endpoint Request", - "description": "App-level normalized request sent to the Agent Bricks Supervisor endpoint. AI Gateway and Databricks Apps OBO add identity automatically.", + "description": "App-level normalized request sent to the Agent Bricks Supervisor endpoint. 
Prod uses Databricks Apps OBO identity; demo may use the App service principal.", "type": "object", "required": ["question"], "properties": { diff --git a/specs/001-doc-intel-10k/data-model.md b/specs/001-doc-intel-10k/data-model.md index 631d39f..cc3e65c 100644 --- a/specs/001-doc-intel-10k/data-model.md +++ b/specs/001-doc-intel-10k/data-model.md @@ -1,6 +1,6 @@ # Phase 1 Data Model -All Delta tables live under the bundle-parameterized `${var.catalog}.${var.schema}`. Lakebase tables live in the bundle-managed Lakebase database `${var.catalog}_state`. +All Delta tables live under the bundle-parameterized `${var.catalog}.${var.schema}`. Lakebase tables live in schema `docintel_app` inside the bundle-managed Lakebase database instance `${var.lakebase_instance}`, exposed to SQL dashboards through the UC database catalog `${var.schema}_state`. ## Bronze @@ -99,7 +99,7 @@ A view `gold_filing_sections_with_quality` joins sections + quality and is the s | Column | Type | Notes | |---|---|---| | `conversation_id` | UUID PK | One row per session | -| `user_email` | STRING | From identity passthrough | +| `user_email` | STRING | From Databricks Apps identity headers; local runs may use `DOCINTEL_USER_EMAIL` | | `started_at` | TIMESTAMPTZ | | | `last_turn_at` | TIMESTAMPTZ | | @@ -149,7 +149,7 @@ PDF in volume └─ ai_query rubric → gold_filing_quality (quality_score) └─ quality_score threshold → Vector Search index sync └─ Agent Bricks Knowledge Assistant + Supervisor Agent - └─ AI Gateway + OBO + └─ AI Gateway + target auth mode └─ Streamlit App turn └─ Lakebase query_logs + feedback ``` diff --git a/specs/001-doc-intel-10k/plan.md b/specs/001-doc-intel-10k/plan.md index aeea37b..a833ece 100644 --- a/specs/001-doc-intel-10k/plan.md +++ b/specs/001-doc-intel-10k/plan.md @@ -5,7 +5,7 @@ ## Summary -Build a Databricks-native, governed pipeline + Agent Bricks system that turns SEC 10-K PDFs into a queryable lakehouse and a cited Q&A experience. 
SQL Lakeflow Spark Declarative Pipelines parse PDFs once with `ai_parse_document` (VARIANT), classify sections with `ai_classify`, extract structured KPIs with `ai_extract`, and score every section against a 5-dimension quality rubric. High-quality summaries flow into a Mosaic AI Vector Search index. Agent Bricks Knowledge Assistant handles cited document Q&A; Agent Bricks Supervisor Agent coordinates the Knowledge Assistant with a deterministic Unity Catalog KPI function for cross-company comparisons. AI Gateway, Unity Catalog, and mandatory OBO enforce identity and audit. Conversation history and feedback land in Lakebase Postgres. Lakehouse Monitoring tracks extraction drift; an AI/BI dashboard surfaces query-log content gaps. CLEARS evaluation in MLflow gates promotion. The stack is deployed by DAB plus idempotent Agent Bricks bootstrap (`databricks bundle deploy -t demo|prod`, `scripts/bootstrap_agent_bricks.py`). +Build a Databricks-native, governed pipeline + Agent Bricks system that turns SEC 10-K PDFs into a queryable lakehouse and a cited Q&A experience. SQL Lakeflow Spark Declarative Pipelines parse PDFs once with `ai_parse_document` (VARIANT), classify sections with `ai_classify`, extract structured KPIs with `ai_extract`, and score every section against a 5-dimension quality rubric. High-quality summaries flow into a Mosaic AI Vector Search index. Agent Bricks Knowledge Assistant handles cited document Q&A; Agent Bricks Supervisor Agent coordinates the Knowledge Assistant with a deterministic Unity Catalog KPI function for cross-company comparisons. AI Gateway, Unity Catalog, and prod OBO enforce identity and audit. Demo can use App-SP mode when Apps user-token passthrough is unavailable. Conversation history and feedback land in Lakebase Postgres. Lakehouse Monitoring tracks extraction drift; an AI/BI dashboard surfaces query-log content gaps. CLEARS evaluation in MLflow gates promotion. 
The stack is deployed by DAB plus idempotent Agent Bricks bootstrap (`databricks bundle deploy -t demo|prod`, `agent/document_intelligence_agent.py`). ## Technical Context @@ -13,7 +13,7 @@ Build a Databricks-native, governed pipeline + Agent Bricks system that turns SE **Primary Dependencies**: Lakeflow Spark Declarative Pipelines, Lakeflow Jobs, Mosaic AI Vector Search, Agent Bricks Knowledge Assistant and Supervisor Agent, AI Gateway, Databricks Apps (Streamlit), Lakebase Postgres, Lakehouse Monitoring, Databricks Asset Bundles CLI (`databricks` >= 0.260), MLflow Agent Evaluation **Storage**: Unity Catalog — `${var.catalog}.${var.schema}` with one volume (`raw_filings`) and Delta tables (`bronze_filings`, `silver_parsed_filings`, `gold_filing_sections`, `gold_filing_kpis`); Lakebase Postgres for `conversation_history`, `query_logs`, `feedback` **Testing**: `databricks bundle validate -t demo` (schema check), pytest for agent unit tests, MLflow `evaluate()` with `databricks-agents` evaluators for CLEARS, manual smoke via the deployed App -**Target Platform**: Databricks workspace with serverless SQL warehouse (AI Functions GA), Mosaic AI Vector Search, Agent Bricks, Databricks Apps user-token passthrough, AI Gateway, Unity Catalog, and Lakebase enabled +**Target Platform**: Databricks workspace with serverless SQL warehouse (AI Functions GA), Mosaic AI Vector Search, Agent Bricks, Databricks Apps, AI Gateway, Unity Catalog, and Lakebase enabled. Prod also requires Databricks Apps user-token passthrough. 
**Project Type**: Databricks lakehouse + agent stack delivered as a single DAB **Performance Goals**: Pipeline E2E ≤ 10 min P95 on a 30 MB PDF (SC-001); agent P95 ≤ 8s single-filing, ≤ 20s cross-company (SC-009); Vector Search refresh ≤ 5 min after Gold update **Constraints**: SQL only for parse/classify/extract layer; Python only for agent + app; CPU model serving (no GPU); zero hard-coded paths outside the bundle; one-command deploy; CLEARS thresholds C≥0.8, L p95≤8s, E≥0.95, A≥0.9, R≥0.8, S≥0.99 block promotion @@ -79,6 +79,7 @@ pipelines/ └── 04_gold_quality.sql # 5-dim rubric → quality_score agent/ +├── document_intelligence_agent.py # Knowledge Assistant + Supervisor definition ├── tools.py # deterministic KPI tool glue for Agent Bricks └── tests/ └── test_tools.py @@ -94,7 +95,6 @@ evals/ scripts/ ├── bootstrap-demo.sh # staged deploy orchestration -├── bootstrap_agent_bricks.py # Knowledge Assistant + Supervisor bootstrap └── wait_for_kpis.py .github/ @@ -104,7 +104,7 @@ scripts/ CLAUDE.md # Runtime guidance for Claude Code ``` -**Structure Decision**: Single DAB containing one pipeline, two jobs, one Vector Search endpoint, one Lakebase project, one monitor, one dashboard, one app, and a CI workflow. Agent Bricks resources are SDK-managed by `scripts/bootstrap_agent_bricks.py` until DAB exposes first-class Knowledge Assistant and Supervisor resource types. SQL pipeline code lives at the root under `pipelines/sql/`; deterministic tool glue lives at `agent/`; app code lives at `app/`. +**Structure Decision**: Single DAB containing one pipeline, two jobs, one Vector Search endpoint, one Lakebase instance/catalog, one monitor, one dashboard, one app, and a CI workflow. Agent Bricks resources are SDK-managed by `agent/document_intelligence_agent.py` until DAB exposes first-class Knowledge Assistant and Supervisor resource types. SQL pipeline code lives at the root under `pipelines/sql/`; deterministic tool glue lives at `agent/`; app code lives at `app/`. 
## Phase 0 — Outline & Research @@ -120,7 +120,7 @@ Output: [research.md](./research.md). Decisions captured: | Vector Search index | Delta-Sync index over `gold_filing_sections` filtered by `embed_eligible`; embed `summary` column | Managed sync, no manual refresh; embeds curated content per principle IV | Direct Vector Index (rejected: no managed sync); embedding raw `parsed.text_full` (rejected: noise) | | Retrieval strategy | Agent Bricks Knowledge Assistant over the governed document layer / Vector Search source | Demonstrates the Agent Bricks article pattern and removes custom retrieval/rerank serving code | Raw chunk search (rejected: ignores Document Intelligence quality layer) | | Agent framework | Agent Bricks Knowledge Assistant + Supervisor Agent | First-class governed enterprise agent primitives; aligns with the source articles | Custom `mlflow.pyfunc` analyst agent (rejected: caused deploy-order and serving lifecycle failures); LangGraph standalone (rejected: not the reference pattern) | -| Serving | Agent Bricks endpoint behind AI Gateway with mandatory OBO | Gateway gives audit, rate limits, guardrails, and identity enforcement | Bespoke custom endpoint ownership (rejected: custom lifecycle); service-principal auth for document Q&A (rejected: not production-safe) | +| Serving | Agent Bricks endpoint behind AI Gateway; prod uses OBO, demo can use App-SP mode | Gateway gives audit, rate limits, guardrails, and identity enforcement; demo remains deployable in workspaces without Apps user-token passthrough | Bespoke custom endpoint ownership (rejected: custom lifecycle); service-principal auth for production document Q&A (rejected: not production-safe) | | State store | Lakebase Postgres (managed) | Native to platform, low-latency reads/writes, fits Reffy pattern; integrates with Apps | Delta tables (rejected: write throughput on small turn-level updates); external Postgres (rejected: governance gap) | | Eval framework | MLflow `evaluate()` with 
`databricks-agents` evaluators on CLEARS axes | First-class CLEARS support; logged into MLflow runs | LangSmith / Ragas (rejected: external system) | | Monitoring | Lakehouse Monitoring `inference` profile on `gold_filing_kpis`; Lakeview AI/BI dashboard on `query_logs` | First-class drift detection; usage dashboard surfaces content gaps per Reffy | Custom Spark notebooks (rejected: imperative, principle III) | @@ -143,7 +143,7 @@ Output: `data-model.md`, `contracts/`, `quickstart.md`, plus the agent context u | Section | `gold_filing_sections` row | Gold | | KPI Record | `gold_filing_kpis` row (JSON-typed `ai_extract` output unpacked into columns) | Gold | | Citation | Returned in agent response payload (see `contracts/agent-response.json`) | Runtime | -| Conversation | `lakebase.conversation_history` + `lakebase.query_logs` rows | Lakebase | +| Conversation | `conversation_history` + `query_logs` rows in `${var.lakebase_instance}` / UC catalog `${var.schema}_state` | Lakebase | | Eval Item | `evals/dataset.jsonl` row | Repo | ### Contracts diff --git a/specs/001-doc-intel-10k/quickstart.md b/specs/001-doc-intel-10k/quickstart.md index 1c10779..b93613a 100644 --- a/specs/001-doc-intel-10k/quickstart.md +++ b/specs/001-doc-intel-10k/quickstart.md @@ -1,11 +1,11 @@ # Quickstart: Deploy and Test the 10-K Analyst -Goal: from a clean clone, stand up the entire stack on the Databricks `demo` target and verify P1, P2, P3 acceptance scenarios in 15–25 minutes. +Goal: from a clean clone, stand up the entire stack on the Databricks `demo` target and run the P1, P2, and P3 acceptance checks in 15–25 minutes. 
## Prerequisites - macOS or Linux, `python` 3.11+, `git`, `databricks` CLI ≥ 0.298 (`brew install databricks/tap/databricks`) -- A Databricks workspace with: serverless SQL warehouse (AI Functions GA), Mosaic AI Vector Search, Agent Bricks Knowledge Assistant and Supervisor Agent, AI Gateway, Databricks Apps user-token passthrough, Unity Catalog, and Lakebase enabled +- A Databricks workspace with: serverless SQL warehouse (AI Functions GA), Mosaic AI Vector Search, Agent Bricks Knowledge Assistant and Supervisor Agent, AI Gateway, Databricks Apps, Unity Catalog, and Lakebase enabled. Prod also requires Databricks Apps user-token passthrough. - An auth profile (`databricks auth login --host <workspace-url>` once); verify with `databricks auth profiles` - Local virtualenv: `python -m venv .venv && .venv/bin/pip install -r agent/requirements.txt -r evals/requirements.txt` @@ -98,7 +98,7 @@ Note: the Lakebase instance enters a soft-delete state for ~7 days during which |---|---|---| | `bundle validate` errors on `ai_parse_document` | Workspace lacks AI Functions GA | Move SQL warehouse to a recent serverless channel | | Vector Search index sync stuck | Embedding endpoint not provisioned | Provision `databricks-bge-large-en` or override `var.embedding_model_endpoint_name` | -| Agent endpoint 401 from App | OBO not plumbed end-to-end | Verify `app/app.py:_user_client` reads `x-forwarded-access-token` and the App's `user_api_scopes` includes `serving.serving-endpoints` (workspace must have user-token-passthrough enabled — see `docs/runbook.md` §"Verifying end-to-end OBO") | +| Agent endpoint 401 from App | Target auth mode does not have endpoint access | Demo: verify App SP `CAN_QUERY` was granted. 
Prod: verify `app/app.py:_user_client` reads `x-forwarded-access-token` and the target `user_api_scopes` include `serving.serving-endpoints` | | CLEARS Latency axis fails | Agent Bricks orchestration or Knowledge Assistant source is too broad | Narrow the Knowledge Assistant source, tune Supervisor instructions, or reduce structured-tool fan-out | | Bootstrap blocks on Lakebase soft-delete | `lakebase_instance` name held by retention | Bump suffix in `databricks.yml` and retry | -| App deploy fails on OBO scopes | Workspace lacks user-token-passthrough feature | Workspace admin enables the feature; this is a production prerequisite | +| App deploy fails on OBO scopes | Workspace lacks user-token-passthrough feature | Workspace admin enables the feature for prod. Demo should use `app_obo_required=false` unless validating OBO | diff --git a/specs/001-doc-intel-10k/spec.md b/specs/001-doc-intel-10k/spec.md index de42e26..c1eca8a 100644 --- a/specs/001-doc-intel-10k/spec.md +++ b/specs/001-doc-intel-10k/spec.md @@ -19,14 +19,14 @@ - Q: Eval corpus — real EDGAR PDFs or synthetic? → A: Synthetic. The 30-question dataset references three synthetic 10-Ks (`samples/{ACME,BETA,GAMMA}_10K_2024.pdf`, generated by `samples/synthesize.py`) plus a deliberately low-quality `garbage_10K_2024.pdf` for SC-006. Real EDGAR filings can still be uploaded to the volume in deployed environments; the synthetic corpus exists so CI is fully deterministic and self-contained (no EDGAR dependency, no license concerns). User-facing examples in spec scenarios still use AAPL/MSFT/GOOG to convey intent. - Q: Deploy ordering — single bundle deploy or staged? → A: Staged. 
`resources/foundation/` (catalog, pipeline, retention job, Lakebase instance, VS endpoint) deploys first; data is produced (sample upload, pipeline run, VS index materialization, Agent Bricks Knowledge Assistant + Supervisor configuration, Lakebase ready); then `resources/consumers/` (monitor, index-refresh job, app, Lakebase catalog) deploys. The chicken-egg dependencies between consumers and foundation data make a single deploy impossible. Bootstrap script automates this. -- Q: User identity passthrough? → A: OBO end-to-end is mandatory. The workspace-level "Databricks Apps - user token passthrough" feature must be enabled before deployment. When disabled, deploy fails with an actionable prerequisite error. +- Q: User identity passthrough? → A: Prod requires OBO end-to-end. The workspace-level "Databricks Apps - user token passthrough" feature must be enabled before prod deployment. Demo may run with `app_obo_required=false`, where the App service principal invokes Agent Bricks and is explicitly granted endpoint query access. ### Session 2026-04-26 - Q: What is the architectural source of truth? → A: The reference implementation MUST demonstrate the patterns in Databricks' "Why Your Agents Can't Read Enterprise Documents" and "Agent Bricks: The Governed Enterprise Agent Platform" articles. Document Intelligence is the document-processing foundation; Agent Bricks is the agent construction, orchestration, governance, and serving foundation. - Q: Is a custom `mlflow.pyfunc` analyst agent acceptable as the primary implementation? → A: No. Custom pyfunc retrieval/supervisor/serving code is a divergence from the Agent Bricks-first reference and MUST be removed. Knowledge Assistant MUST handle cited single-filing document Q&A. Supervisor Agent MUST handle orchestration across document Q&A and structured KPI tools. - Q: What custom code may remain? 
→ A: Custom code may remain only where it demonstrates integration around Agent Bricks rather than replacing Agent Bricks: the Document Intelligence SQL pipeline, deterministic Gold KPI SQL/tool access, Databricks App UX, Lakebase feedback persistence, and deploy/eval automation. -- Q: Should the implementation keep legacy fallback logic for workspaces without user-token passthrough or Agent Bricks support? → A: No. Production deployment requires Agent Bricks, AI Gateway, Unity Catalog, Databricks Apps user-token passthrough, and end-to-end OBO. Missing prerequisites MUST fail validation or deploy; service-principal fallback and legacy custom-agent fallback are not acceptable. +- Q: Should the implementation keep legacy compatibility logic for workspaces without Agent Bricks support? → A: No. Agent Bricks is required. Production deployment requires Agent Bricks, AI Gateway, Unity Catalog, Databricks Apps user-token passthrough, and end-to-end OBO. Demo App-SP mode is allowed only for development workspaces without Apps user-token passthrough; legacy custom-agent or bespoke serving paths are not acceptable. ## User Scenarios & Testing *(mandatory)* @@ -107,8 +107,8 @@ An analyst asks a multi-company question — e.g., "Compare segment revenue betw - **FR-012**: System MUST be deployable end-to-end (catalog/schema/volume, Document Intelligence pipelines, vector index or Knowledge Assistant source, Agent Bricks endpoint/configuration, AI Gateway, app, monitors, dashboards) via a single repeatable bring-up command; two environments (demo, prod) MUST be defined; any resource not yet expressible as DAB YAML MUST be created by idempotent bootstrap code that is treated as part of the production deployment, not as manual setup. - **FR-013**: System MUST process duplicate uploads idempotently keyed on filename. - **FR-014**: System MUST gracefully report missing/ungrounded answers ("no source found") rather than hallucinating when retrieval returns no qualified results. 
-- **FR-015**: System MUST explicitly remove current custom-agent divergence: `agent/analyst_agent.py`, `agent/retrieval.py`, `agent/supervisor.py`, direct `mlflow.pyfunc` registration, and bespoke Model Serving endpoint ownership MUST be replaced by Agent Bricks Knowledge Assistant / Supervisor Agent configuration. No temporary compatibility shims or legacy fallback endpoint may remain. -- **FR-016**: System MUST require end-to-end user identity. Databricks Apps user-token passthrough, Agent Bricks / AI Gateway OBO, and UC permission enforcement are production prerequisites. If any prerequisite is unavailable, deploy MUST fail with an actionable error; the app and agent MUST NOT fall back to broad service-principal reads. +- **FR-015**: System MUST explicitly remove current custom-agent divergence: `agent/analyst_agent.py`, `agent/retrieval.py`, `agent/supervisor.py`, direct `mlflow.pyfunc` registration, and bespoke Model Serving endpoint ownership MUST be replaced by Agent Bricks Knowledge Assistant / Supervisor Agent configuration. No temporary compatibility shims or legacy custom-agent endpoint may remain. +- **FR-016**: System MUST require end-to-end user identity for prod. Databricks Apps user-token passthrough, Agent Bricks / AI Gateway OBO, and UC permission enforcement are production prerequisites. If any prod prerequisite is unavailable, deploy MUST fail with an actionable error. Demo may set `app_obo_required=false` and grant the App service principal `CAN_QUERY` on the generated Supervisor endpoint. ### Key Entities @@ -137,10 +137,10 @@ An analyst asks a multi-company question — e.g., "Compare segment revenue betw ## Assumptions - The target Databricks workspace has a serverless SQL warehouse with `ai_parse_document` (GA), `ai_classify`, `ai_extract`, and `ai_prep_search` available. -- Mosaic AI Vector Search, Agent Bricks, AI Gateway, and Databricks Apps user-token passthrough entitlements are enabled for the workspace. 
+- Mosaic AI Vector Search, Agent Bricks, AI Gateway, and Databricks Apps are enabled for the workspace. Prod also has Databricks Apps user-token passthrough enabled. - Sample 10-K PDFs are publicly available SEC filings (EDGAR) the analyst manually uploads to the volume; no automated SharePoint/Drive sync in v1. -- A Service Principal exists for prod deploys but is not used in v1 (demo target only). -- Analyst end-users have UC `SELECT` on the configured catalog/schema, `EXECUTE` on the KPI function, and `CAN QUERY` on the Agent Bricks endpoints via end-to-end OBO. +- A Service Principal exists for prod deploys. Demo App-SP mode uses the Databricks App service principal when `app_obo_required=false`. +- Analyst end-users have UC `SELECT` on the configured catalog/schema and `EXECUTE` on the KPI function. Prod users also have `CAN QUERY` on the Agent Bricks endpoints via end-to-end OBO; demo App-SP mode grants `CAN_QUERY` to the App service principal. - The CLI auth profile on the operator's machine targets a workspace where the bundle can deploy without further policy exceptions. - 10-K fiscal year and company name can be reliably extracted from the parsed cover page; if not, `extraction_confidence` reflects the gap and the row remains queryable. - A curated eval set of 30 questions (20 P2 + 10 P3) is authored during implementation and checked in at `evals/dataset.jsonl`; CLEARS thresholds are tunable in config but defaults are fixed in FR-010. diff --git a/specs/001-doc-intel-10k/tasks.md b/specs/001-doc-intel-10k/tasks.md index e9eac71..c1bff42 100644 --- a/specs/001-doc-intel-10k/tasks.md +++ b/specs/001-doc-intel-10k/tasks.md @@ -19,7 +19,7 @@ description: "Task list for Databricks 10-K Analyst implementation" ## Path Conventions -This is a DAB plus Agent Bricks bootstrap project. 
SQL pipeline code is at `pipelines/sql/`, deterministic tool glue at `agent/`, Streamlit App at `app/`, evals at `evals/`, bundle resources at `resources/`, and Agent Bricks orchestration in `scripts/bootstrap_agent_bricks.py`. See plan.md for the full tree. +This is a DAB plus Agent Bricks deployment project. SQL pipeline code is at `pipelines/sql/`, deterministic tool glue at `agent/`, Streamlit App at `app/`, evals at `evals/`, bundle resources at `resources/`, and Agent Bricks orchestration in `agent/document_intelligence_agent.py`. See plan.md for the full tree. --- @@ -37,9 +37,9 @@ This is a DAB plus Agent Bricks bootstrap project. SQL pipeline code is at `pipe **⚠️ CRITICAL**: All user stories depend on these. -- [x] T006 Define UC catalog/schema/volume in `resources/dabs/catalog.yml` (or inline in `databricks.yml`): `${var.catalog}.${var.schema}` schema + `raw_filings` volume; grant `USE_CATALOG`, `USE_SCHEMA`, `READ_VOLUME` to a configurable analyst group -- [x] T007 [P] Define the Lakebase project + database in `resources/lakebase/state.yml` with three tables (`conversation_history`, `query_logs`, `feedback`) per `data-model.md`; expose connection vars to the Streamlit App -- [x] T008 [P] Add the agent JSON contracts to the bundle as inline strings or copy them to `agent/contracts/` so both the agent and the App reference one source: `agent-request.json`, `agent-response.json`, `feedback-event.json`, `kpi-schema.json` +- [x] T006 Define UC catalog/schema/volume in `resources/foundation/catalog.yml`: `${var.catalog}.${var.schema}` schema + `raw_filings` volume; grant `USE_CATALOG`, `USE_SCHEMA`, `READ_VOLUME` to a configurable analyst group +- [x] T007 [P] Define the Lakebase instance/catalog in `resources/foundation/lakebase_instance.yml` and `resources/consumers/lakebase_catalog.yml`; the UC catalog is `${var.schema}_state`, while `app/lakebase_client.py` creates `docintel_app.conversation_history`, `docintel_app.query_logs`, and `docintel_app.feedback` 
at runtime using App resource binding fields plus Databricks-minted Lakebase OAuth credentials +- [x] T008 [P] Add JSON contracts under `specs/001-doc-intel-10k/contracts/`: `agent-request.json`, `agent-response.json`, `feedback-event.json`, `kpi-schema.json` **Checkpoint**: catalog, schema, volume, Lakebase database exist; bundle validates. @@ -57,16 +57,16 @@ This is a DAB plus Agent Bricks bootstrap project. SQL pipeline code is at `pipe - [x] T010 [US1] Write `pipelines/sql/02_silver_parse.sql`: streaming table `silver_parsed_filings` using `APPLY CHANGES INTO` keyed on `filename`, computing `ai_parse_document(content)` once into `VARIANT` `parsed`, plus `parse_status`/`parse_error` derived from `try_cast` of the result (depends on T009) - [x] T011 [US1] Write `pipelines/sql/03_gold_classify_extract.sql`: - Streaming table `gold_filing_sections` exploding `parsed:sections[*]`, calling `ai_classify(section_text, ARRAY('MD&A','Risk','Financials','Notes','Other'))` to populate `section_label`, summarising via `ai_query` into the `summary` column - - Streaming table `gold_filing_kpis` calling `ai_extract` against the concatenated MD&A + Financials text using the JSON schema in `agent/contracts/kpi-schema.json`, then unpacking into typed columns + - Streaming table `gold_filing_kpis` calling `ai_extract` against the concatenated MD&A + Financials text using the JSON schema in `specs/001-doc-intel-10k/contracts/kpi-schema.json`, then unpacking into typed columns - Both tables use `APPLY CHANGES INTO` keyed on appropriate keys (depends on T010) - [x] T012 [US1] Write `pipelines/sql/04_gold_quality.sql`: materialized view `gold_filing_quality` invoking `ai_query` 5 times per section row to score parse_completeness, layout_fidelity, ocr_confidence, section_recognizability, kpi_extractability (each 0–6); compute `quality_score` and persist `quality_breakdown` STRUCT (depends on T011) - [x] T013 [US1] Update `gold_filing_sections` (in T011 or a follow-on view) to add 
`embed_eligible = (quality_score >= ${var.quality_threshold} AND parse_status = 'ok')` by joining with `gold_filing_quality` -- [x] T014 [US1] Define the Lakeflow SDP in `resources/pipelines/doc_intel.pipeline.yml`: serverless, libraries point at `pipelines/sql/*.sql`, target = `${var.catalog}.${var.schema}`, file-arrival event trigger on the `raw_filings` volume, retries=2 (depends on T009-T013) -- [x] T015 [US1] Define the retention Lakeflow Job in `resources/jobs/retention.job.yml`: daily schedule, single Python task that lists the volume via `WorkspaceClient.files`, removes files with `modificationTime < now()-90d`, logs deletions; uses Service Principal in prod only (depends on T006) +- [x] T014 [US1] Define the Lakeflow SDP in `resources/foundation/doc_intel.pipeline.yml`: serverless, libraries point at `pipelines/sql/*.sql`, target = `${var.catalog}.${var.schema}`, triggered in demo and continuous in prod (depends on T009-T013) +- [x] T015 [US1] Define the retention Lakeflow Job in `resources/foundation/retention.job.yml`: daily schedule, single Python task that lists the volume via `WorkspaceClient.files`, removes files with `modificationTime < now()-90d`, logs deletions; uses Service Principal in prod only (depends on T006) - [x] T016 [US1] Add synthetic samples (`samples/{ACME,BETA,GAMMA}_10K_2024.pdf` + `samples/garbage_10K_2024.pdf` for SC-006) reproducible from `samples/synthesize.py`; documented in `samples/README.md` -- [x] T017 [US1] Write a Lakeview `resources/dashboards/usage.lvdash.yml` containing one initial widget over `gold_filing_kpis` (count by company_name, count by fiscal_year); will be extended in US2/US3 (depends on T011) +- [x] T017 [US1] Write a Lakeview dashboard source at `src/dashboards/usage.lvdash.json`, managed by `resources/consumers/usage.dashboard.yml`, containing one initial widget over `gold_filing_kpis` (count by company_name, count by fiscal_year); will be extended in US2/US3 (depends on T011) -**Checkpoint**: P1 
acceptance scenarios 1–4 pass via the quickstart commands. +**Checkpoint target**: P1 acceptance scenarios 1–4 pass via the quickstart commands. Latest workspace evidence is tracked in `VALIDATION.md`. --- @@ -88,17 +88,17 @@ This is a DAB plus Agent Bricks bootstrap project. SQL pipeline code is at `pipe - [x] T022 [US2] Remove custom retrieval implementation (`agent/retrieval.py`) and configure Agent Bricks Knowledge Assistant over the governed Document Intelligence / Vector Search source (depends on T020) - [x] T023 [US2] Implement `agent/tools.py` as deterministic structured KPI tool glue for Agent Bricks, wrapping governed SQL over `gold_filing_kpis` - [x] T024 [US2] Remove custom `agent/analyst_agent.py` and direct `mlflow.pyfunc` registration; Knowledge Assistant owns single-filing cited Q&A (depends on T022, T023) -- [x] T025 [US2] Remove `agent/log_and_register.py` and bespoke model-version promotion from the production path; bootstrap configures Agent Bricks resources idempotently instead -- [x] T026 [US2] Replace `resources/consumers/agent.serving.yml` with `scripts/bootstrap_agent_bricks.py` Agent Bricks endpoint/configuration behind AI Gateway with mandatory OBO and guardrails (depends on T024, T025) -- [x] T027 [US2] Implement `app/app.py` (Streamlit): chat input, calls the Agent Bricks endpoint as the invoking user, renders answer + citations as chips, thumbs-up/down + comment widget that POSTs to a Lakebase write helper; persists `conversation_id` in session state (depends on T026, T007) -- [x] T028 [US2] Implement `app/lakebase_client.py`: thin wrapper using `psycopg` with the bundle-injected DSN to insert into `conversation_history`, `query_logs`, `feedback` +- [x] T025 [US2] Remove `agent/log_and_register.py` and bespoke model-version promotion from the production path; `agent/document_intelligence_agent.py` configures Agent Bricks resources idempotently instead +- [x] T026 [US2] Replace `resources/consumers/agent.serving.yml` with 
`agent/document_intelligence_agent.py` Agent Bricks endpoint/configuration behind AI Gateway with prod OBO and demo App-SP mode (depends on T024, T025) +- [x] T027 [US2] Implement `app/app.py` (Streamlit): chat input, calls the Agent Bricks endpoint using the target auth mode, renders answer + citations as chips, thumbs-up/down + comment widget that POSTs to a Lakebase write helper; persists `conversation_id` in session state (depends on T026, T007) +- [x] T028 [US2] Implement `app/lakebase_client.py`: thin wrapper using `psycopg` with App resource binding connection fields and Databricks-minted Lakebase OAuth credentials to insert into `conversation_history`, `query_logs`, `feedback` - [x] T029 [US2] Define the Databricks App in `resources/consumers/analyst.app.yml`: source = `app/`, runtime python, env = Lakebase binding + agent endpoint binding (depends on T027, T028) - [x] T030 [US2] Author `evals/dataset.jsonl` 20 P2 questions per `data-model.md`'s eval section (each with `expected_filename`, `expected_section`, `expected_answer_keywords`, `min_citations`) - [x] T031 [US2] Implement `evals/clears_eval.py`: connects to the demo endpoint, runs `mlflow.evaluate()` with `databricks-agents` evaluators on the dataset, asserts thresholds C≥0.8, L p95≤8s, E≥0.95, A≥0.9, R≥0.8, S≥0.99; exits non-zero on failure (depends on T026, T030) - [x] T032 [US2] Define Lakehouse Monitoring in `resources/consumers/kpi_drift.yml`: `inference` profile on `gold_filing_kpis`, slicing on `company_name`, `fiscal_year`; baselines computed from first 10 filings (depends on T011) -- [x] T033 [US2] Extend `resources/dashboards/usage.lvdash.yml` with widgets over `lakebase.query_logs`: top questions, daily active users, p95 latency, citation count distribution, ungrounded-answer rate (depends on T028, T017) +- [x] T033 [US2] Extend `src/dashboards/usage.lvdash.json` with widgets over Lakebase `query_logs`: top questions, daily active users, p95 latency, citation count distribution, 
ungrounded-answer rate (depends on T028, T017) -**Checkpoint**: P2 acceptance scenarios 1–3 pass via App; CLEARS gate passes for the P2 slice of the eval set. +**Checkpoint target**: P2 acceptance scenarios 1–3 pass via App; CLEARS gate passes for the P2 slice of the eval set. Latest workspace evidence is tracked in `VALIDATION.md`. --- @@ -121,7 +121,7 @@ This is a DAB plus Agent Bricks bootstrap project. SQL pipeline code is at `pipe - [x] T039 [US3] Extend `evals/clears_eval.py` to slice metrics by `category in {P2, P3}` and assert SC-002 ≥0.8 on P2, SC-003 ≥0.7 on P3 (depends on T031, T038) - [x] T040 [US3] Update `app/app.py` to render markdown tables (Streamlit `st.markdown(..., unsafe_allow_html=False)` already handles this) and surface a "show structured KPIs" expander next to each row (depends on T036) -**Checkpoint**: P3 acceptance scenarios 1–2 pass; CLEARS gate passes for both P2 and P3 slices. +**Checkpoint target**: P3 acceptance scenarios 1–2 pass; CLEARS gate passes for both P2 and P3 slices. Latest workspace evidence is tracked in `VALIDATION.md`. --- @@ -130,11 +130,11 @@ This is a DAB plus Agent Bricks bootstrap project. 
SQL pipeline code is at `pipe - [ ] T041 [P] Run `databricks bundle validate -t demo` and resolve any schema warnings - [ ] T042 [P] Run `databricks bundle validate -t prod` (no deploy) to confirm prod target compiles - [ ] T043 Walk through `quickstart.md` end-to-end on a clean workspace; capture timing for SC-005 -- [x] T044 [P] Add a Lakeview widget on `lakebase.query_logs` summarising "ungrounded answer rate by week" — content-gap signal per Reffy +- [x] T044 [P] Add a Lakeview widget on Lakebase `query_logs` summarising "ungrounded answer rate by week" — content-gap signal per Reffy - [x] T045 [P] Document operating runbook in `docs/runbook.md`: how to add a sample filing, how to debug a low quality_score, how to roll an agent endpoint version, how to inspect CLEARS metrics in MLflow - [ ] T046 Run `python evals/clears_eval.py` against the demo endpoint and store the MLflow run ID in `docs/runbook.md` as the v1 baseline - [x] T047 [P] Add an SC-006 verification assertion in `evals/clears_eval.py`: query Vector Search for a known-rejected filename and assert zero hits (verifies "100% rubric exclusion") -- [x] T048 [P] Add an SC-001 timing widget to `resources/dashboards/usage.lvdash.yml` over `gold_filing_kpis` joined to `bronze_filings.ingested_at`: P95 of `extracted_at - ingested_at` per company; alerts if > 10 minutes +- [x] T048 [P] Add an SC-001 timing widget to `src/dashboards/usage.lvdash.json` over `gold_filing_kpis` joined to `bronze_filings.ingested_at`: P95 of `extracted_at - ingested_at` per company; alerts if > 10 minutes --- diff --git a/src/dashboards/usage.lvdash.json b/src/dashboards/usage.lvdash.json index 346d728..45ae00d 100644 --- a/src/dashboards/usage.lvdash.json +++ b/src/dashboards/usage.lvdash.json @@ -26,7 +26,7 @@ "SELECT created_at::date AS day, agent_path, count(*) AS turns,", " percentile_approx(latency_ms, 0.95) AS latency_p95_ms,", " sum(CASE WHEN array_size(citations) = 0 THEN 1 ELSE 0 END) AS ungrounded", - "FROM 
`__dataset_catalog__`.`__dataset_schema__`_state.public.query_logs", + "FROM `__dataset_schema___state`.docintel_app.query_logs", "GROUP BY 1, 2 ORDER BY 1 DESC" ] }