From 01b7903993d9f9ae6aa09cc7e26f2807e5eee3bd Mon Sep 17 00:00:00 2001 From: Sathish Krishnan <10681383+SatyKrish@users.noreply.github.com> Date: Sun, 26 Apr 2026 09:14:07 -0400 Subject: [PATCH] Switch demo target bootstrap flow --- .github/workflows/deploy.yml | 24 ++++---- CLAUDE.md | 24 ++++---- CONTRIBUTING.md | 6 +- README.md | 56 +++++++++---------- VALIDATION.md | 18 +++--- agent/log_and_register.py | 2 +- agent/retrieval.py | 2 +- app/README.md | 14 ++--- app/app.yaml | 2 +- databricks.yml | 12 ++-- docs/design.md | 20 +++---- docs/runbook.md | 35 ++++++------ resources/foundation/catalog.yml | 2 +- resources/foundation/doc_intel.pipeline.yml | 2 +- .../filings_index.yml | 0 .../{bootstrap-dev.sh => bootstrap-demo.sh} | 47 +++++++++++----- scripts/wait_for_kpis.py | 4 +- specs/001-doc-intel-10k/plan.md | 33 +++++------ specs/001-doc-intel-10k/quickstart.md | 32 +++++------ specs/001-doc-intel-10k/research.md | 4 +- specs/001-doc-intel-10k/spec.md | 8 +-- specs/001-doc-intel-10k/tasks.md | 18 +++--- 22 files changed, 191 insertions(+), 174 deletions(-) rename resources/{consumers => foundation}/filings_index.yml (100%) rename scripts/{bootstrap-dev.sh => bootstrap-demo.sh} (87%) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 2609a2d..c36dddb 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -13,16 +13,16 @@ jobs: - uses: actions/checkout@v4 - name: Install Databricks CLI run: curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh - - name: Validate bundle (dev) + - name: Validate bundle (demo) env: DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }} DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} DOCINTEL_WAREHOUSE_ID: ${{ vars.DOCINTEL_WAREHOUSE_ID }} - run: databricks bundle validate --strict -t dev --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" + run: databricks bundle validate --strict -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" - deploy-dev: + deploy-demo: # CI assumes steady-state: the first-ever bring-up of a workspace must be - # done locally via `./scripts/bootstrap-dev.sh`, which handles the + # done locally via `./scripts/bootstrap-demo.sh`, which handles the # foundation/consumers staging and waits for Lakebase AVAILABLE. After # that initial bring-up, every push to main runs a full bundle deploy # against the now-existing resources — no temp-rename trick (DAB would @@ -52,7 +52,7 @@ jobs: # wait_for_kpis / log_and_register use. Without --var, the bundle # falls back to its `lookup: warehouse: Serverless Starter Warehouse` # default and silently picks a different ID. 
-        run: databricks bundle deploy -t dev --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID"
+        run: databricks bundle deploy -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID"

      - name: Wait for Lakebase instance to be AVAILABLE
        # Lakebase already exists in steady-state but a config change can
@@ -61,7 +61,7 @@
        run: |
          python -c "
 import json, os, sys, time, subprocess
-name = os.environ.get('LAKEBASE_NAME') or 'docintel-dev-state-v3'
+name = os.environ.get('LAKEBASE_NAME') or 'docintel-demo-state-v1'
 deadline = time.time() + 600
 while True:
     out = subprocess.run(['databricks','api','get','/api/2.0/database/instances','--output','json'],
@@ -79,7 +79,7 @@ while True:
             time.sleep(15)
 "
        env:
-          LAKEBASE_NAME: ${{ vars.DOCINTEL_LAKEBASE_NAME || 'docintel-dev-state-v3' }}
+          LAKEBASE_NAME: ${{ vars.DOCINTEL_LAKEBASE_NAME || 'docintel-demo-state-v1' }}

      - name: Refresh data — upload samples, run pipeline, register new model version
        run: |
          for f in samples/*.pdf; do
            databricks fs cp "$f" \
              "dbfs:/Volumes/${DOCINTEL_CATALOG}/${DOCINTEL_SCHEMA}/raw_filings/" \
              --overwrite
          done
-          databricks bundle run -t dev --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" doc_intel_pipeline
+          databricks bundle run -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" doc_intel_pipeline
          python scripts/wait_for_kpis.py --min-rows 3 --timeout 900
          # --serving-endpoint repoints the existing endpoint to the new
          # model version in-place (steady-state idempotent operation).
-          python agent/log_and_register.py --target dev --serving-endpoint analyst-agent-dev
+          python agent/log_and_register.py --target demo --serving-endpoint analyst-agent-demo

      - name: Apply UC grants (catalog + schema; not DAB-supported)
        # UC requires the full chain: USE_CATALOG → USE_SCHEMA → SELECT/EXECUTE.
@@ -112,7 +112,7 @@
        # Databricks Apps deploy docs:
        # https://docs.databricks.com/aws/en/dev-tools/databricks-apps/deploy
        # `bundle deploy` alone uploads code but doesn't apply config/restart.
-        run: databricks bundle run -t dev --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" analyst_app
+        run: databricks bundle run -t demo --var "warehouse_id=$DOCINTEL_WAREHOUSE_ID" analyst_app

      - name: Verify OBO scopes survived deploy
        # `bundle run` may wipe user_api_scopes (documented destructive-update
        # behavior), so re-assert that they survived; skipped when the scopes
        # are not declared (workspace feature off).
        run: |
          if grep -q '^  user_api_scopes:' resources/consumers/analyst.app.yml; then
-            databricks apps get doc-intel-analyst-dev --output json > /tmp/app.json
+            databricks apps get doc-intel-analyst-demo --output json > /tmp/app.json
            python -c "
 import json
 app = json.load(open('/tmp/app.json'))
 scopes = set(app.get('user_api_scopes') or [])
 missing = {'serving.serving-endpoints', 'sql'} - scopes
 assert not missing, f'OBO scopes missing: {sorted(missing)} (got {sorted(scopes)})'
 "
          fi

      - name: CLEARS evaluation gate
-        run: python evals/clears_eval.py --endpoint analyst-agent-dev --dataset evals/dataset.jsonl
+        run: python evals/clears_eval.py --endpoint analyst-agent-demo --dataset evals/dataset.jsonl
diff --git a/CLAUDE.md b/CLAUDE.md
index 585cecf..918750a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -14,17 +14,17 @@ For an end-to-end overview written for humans, read [`README.md`](./README.md).

 ## Critical: deploy ordering hazard (READ FIRST before touching deploys)

-The bundle has three chicken-egg dependencies that prevent a single `databricks bundle deploy -t dev` from succeeding on a fresh workspace:
+The bundle has three chicken-egg dependencies that prevent a single `databricks bundle deploy -t demo` from succeeding on a fresh workspace:

 1.
**Model Serving endpoint** references a registered model version that doesn't exist until `agent/log_and_register.py` runs. 2. **Lakehouse Monitor** (`resources/consumers/kpi_drift.yml`) attaches to `gold_filing_kpis`, which doesn't exist until the pipeline runs once. 3. **Lakebase database_catalog + Databricks App** race the `database_instance` provisioning. -**Canonical fix**: Run `./scripts/bootstrap-dev.sh` for fresh stand-ups; plain `databricks bundle deploy -t dev` for steady-state. The script does a **staged deploy** — `resources/` is split into `foundation/` (no data deps) and `consumers/` (need data). Stage 1 temporarily renames consumer YAMLs to `*.yml.skip` so the bundle glob skips them; stage 2 produces data and then runs full `bundle deploy`. **Both deploys succeed cleanly** — no "errors tolerated" hand-waving, no orphans to clean up on retry. +**Canonical fix**: Run `./scripts/bootstrap-demo.sh` for fresh stand-ups; plain `databricks bundle deploy -t demo` for steady-state. The script does a **staged deploy** — `resources/` is split into `foundation/` (no data deps) and `consumers/` (need data). Stage 1 temporarily renames consumer YAMLs to `*.yml.skip` so the bundle glob skips them; stage 2 produces data and then runs full `bundle deploy`. **Both deploys succeed cleanly** — no "errors tolerated" hand-waving, no orphans to clean up on retry. **Do NOT try to "fix" these by:** - Adding `depends_on` between heterogeneous DAB resource types — DAB doesn't reliably honor it across instance↔catalog↔app. -- Switching `resources/consumers/agent.serving.yml` to UC alias syntax (`@dev`) — DAB serving config may reject alias syntax; that's why `_promote_serving_endpoint` exists in `agent/log_and_register.py`. +- Switching `resources/consumers/agent.serving.yml` to UC alias syntax (`@demo`) — DAB serving config may reject alias syntax; that's why `_promote_serving_endpoint` exists in `agent/log_and_register.py`. - Splitting monitors into a separate target overlay — adds complexity for a one-time concern. Full breakdown lives in [`docs/runbook.md`](./docs/runbook.md) §"Known deploy ordering gaps". 
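A minimal sketch of the stage-1 skip-and-restore pattern just described, written in Python purely for illustration — the real implementation is the bash in `scripts/bootstrap-demo.sh`, with trap-based restore, lock handling, and mode detection:

```python
# Illustrative sketch only; scripts/bootstrap-demo.sh is the real implementation.
import atexit
import subprocess
from pathlib import Path

CONSUMERS = Path("resources/consumers")

def restore_consumers() -> None:
    # Undo the *.yml -> *.yml.skip rename on any exit path (mirrors the trap).
    for f in CONSUMERS.glob("*.yml.skip"):
        f.rename(f.with_suffix(""))  # "x.yml.skip" -> "x.yml"

atexit.register(restore_consumers)

# Stage 1: hide consumers so the bundle's resources/**/*.yml glob skips them.
for f in CONSUMERS.glob("*.yml"):
    f.rename(f.with_name(f.name + ".skip"))

subprocess.run(["databricks", "bundle", "deploy", "-t", "demo"], check=True)

# Stage 2 (only after data production in the real script): restore and
# run the full deploy with every consumer dependency satisfied.
restore_consumers()
subprocess.run(["databricks", "bundle", "deploy", "-t", "demo"], check=True)
```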
@@ -38,8 +38,8 @@ app/ Streamlit on Databricks Apps + Lakebase psycopg client evals/ MLflow CLEARS gate (clears_eval.py + dataset.jsonl) jobs/ Lakeflow Jobs Python tasks (retention, index_refresh) resources/foundation/ DAB resources with no data deps: catalog/schema/volume, pipeline, retention job, Lakebase instance -resources/consumers/ DAB resources that depend on foundation data: serving endpoint, monitor, VS endpoint, index-refresh job, app, dashboard, Lakebase catalog -scripts/ Operational scripts (bootstrap-dev.sh, wait_for_kpis.py) +resources/consumers/ DAB resources that depend on foundation data: serving endpoint, monitor, index-refresh job, app, dashboard, Lakebase catalog +scripts/ Operational scripts (bootstrap-demo.sh, wait_for_kpis.py) samples/ Synthetic 10-K PDFs (regenerable via synthesize.py) specs/001-… Spec-Kit artifacts (spec, plan, tasks, research, data-model, contracts, quickstart) docs/runbook.md Day-2 ops + bring-up workflow @@ -48,16 +48,16 @@ docs/runbook.md Day-2 ops + bring-up workflow ## Build & deploy -- Validate: `databricks bundle validate -t dev` -- Fresh stand-up: `./scripts/bootstrap-dev.sh` (requires `DOCINTEL_CATALOG`, `DOCINTEL_SCHEMA`, `DOCINTEL_WAREHOUSE_ID`) -- Steady-state deploy: `databricks bundle deploy -t dev` -- Run pipeline: `databricks bundle run -t dev doc_intel_pipeline` -- Run eval: `python evals/clears_eval.py --endpoint analyst-agent-dev --dataset evals/dataset.jsonl` +- Validate: `databricks bundle validate -t demo` +- Fresh stand-up: `./scripts/bootstrap-demo.sh` (requires `DOCINTEL_CATALOG`, `DOCINTEL_SCHEMA`, `DOCINTEL_WAREHOUSE_ID`) +- Steady-state deploy: `databricks bundle deploy -t demo` +- Run pipeline: `databricks bundle run -t demo doc_intel_pipeline` +- Run eval: `python evals/clears_eval.py --endpoint analyst-agent-demo --dataset evals/dataset.jsonl` ## Tests & validation - `pytest agent/tests/` — unit tests for retrieval, agent routing, supervisor -- `databricks bundle validate -t dev` and `-t prod` — schema check both targets before merging +- `databricks bundle validate -t demo` and `-t prod` — schema check both targets before merging - The CLEARS eval is the deploy gate; principle V says no agent ships without it passing ## Working with this codebase — gotchas Claude has learned @@ -69,7 +69,7 @@ These were discovered the painful way during the 2026-04-25 bring-up. Future ses - **Section explosion fallback**: `pipelines/sql/03_gold_classify_extract.sql` POSEXPLODES `parsed:sections[*]` and falls back to a single `full_document` row when the VARIANT lacks `$.sections` so we never lose a filing. - **MLflow + UC requires both inputs AND outputs in signatures**: an inputs-only signature is rejected at registration. For variable-shape fields like `citations` (array of dicts), use `mlflow.types.schema.AnyType()` to avoid serving-time truncation. Reference: `agent/log_and_register.py:_signature`. - **`lakebase_stopped: true` is rejected on instance creation**: the API doesn't allow creating a database_instance directly into stopped state. Default is `false`; flip to `true` only after the instance exists. Reference: `databricks.yml` variable description. -- **macOS doesn't ship `python`**: scripts must prefer `.venv/bin/python` then fall back to `python3`. Reference: `scripts/bootstrap-dev.sh`. +- **macOS doesn't ship `python`**: scripts must prefer `.venv/bin/python` then fall back to `python3`. Reference: `scripts/bootstrap-demo.sh`. 
- **`agent/log_and_register.py` needs `PYTHONPATH`**: it imports the `agent` package; run with `PYTHONPATH=$REPO_ROOT` or use the bootstrap script which exports it. - **Serving endpoint version drifts from YAML**: `resources/consumers/agent.serving.yml` pins `entity_version: "1"` as the bootstrap value. Steady-state CI re-registers new versions and uses `_promote_serving_endpoint` to update the served entity in-place. The YAML and the live endpoint diverge over time — that's intentional, not drift. - **Streamlit on Databricks Apps requires CORS+XSRF off via env vars**: not flags. `STREAMLIT_SERVER_ENABLE_CORS=false` and `STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false` in `app/app.yaml`. Databricks Apps runtime config: https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 91bde69..ad69e72 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,11 +21,11 @@ python -m venv .venv ```bash .venv/bin/python -m pytest agent/tests/ -q # 18 unit tests -databricks bundle validate --strict -t dev # YAML schema + interpolation -bash -n scripts/bootstrap-dev.sh # bash syntax +databricks bundle validate --strict -t demo # YAML schema + interpolation +bash -n scripts/bootstrap-demo.sh # bash syntax ``` -End-to-end is exercised by `./scripts/bootstrap-dev.sh` against a real Databricks workspace; see [`specs/001-doc-intel-10k/quickstart.md`](./specs/001-doc-intel-10k/quickstart.md). +End-to-end is exercised by `./scripts/bootstrap-demo.sh` against a real Databricks workspace; see [`specs/001-doc-intel-10k/quickstart.md`](./specs/001-doc-intel-10k/quickstart.md). ## Working with the spec-kit diff --git a/README.md b/README.md index 3eeaa37..a5933ef 100644 --- a/README.md +++ b/README.md @@ -59,14 +59,14 @@ For motivation, architecture diagrams, the Spec-Kit + Claude Code build workflow - **Streamlit chat UI on Databricks Apps** — citation chips, thumbs feedback, conversation history persisted to Lakebase Postgres. - **Eval-gated promotion** — `mlflow.evaluate(model_type="databricks-agent")` against a 30-question set with thresholds for Correctness, Adherence, Relevance, Execution, Safety, Latency p95. - **Reproducible synthetic corpus** — `samples/synthesize.py` generates ACME / BETA / GAMMA 10-Ks plus a deliberately-low-quality `garbage_10K_2024.pdf` for the rubric-exclusion test (SC-006). No EDGAR dependency in CI. -- **Staged deploy with chicken-egg resolution** — `scripts/bootstrap-dev.sh` orchestrates foundation → data production → consumers so a fresh workspace deploys cleanly with no "errors tolerated." +- **Staged deploy with chicken-egg resolution** — `scripts/bootstrap-demo.sh` orchestrates foundation → data production → consumers so a fresh workspace deploys cleanly with no "errors tolerated." - **Lakehouse Monitoring + AI/BI dashboard** — drift on extraction confidence, p95 latency by company, ungrounded-answer rate. 
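The "Eval-gated promotion" bullet above reduces to a small loop: score the agent's answers with the built-in judges, then fail the deploy if an axis misses its floor. A hedged sketch of that shape — the real gate is `evals/clears_eval.py`; the column conventions follow Databricks Agent Evaluation, and the metric key below is illustrative, not the repo's:

```python
# Hedged sketch of the CLEARS gate's shape (assumptions flagged inline).
import sys
import mlflow
import pandas as pd

# dataset.jsonl carries the questions and reference answers; the deployed
# endpoint's answers are collected into a `response` column beforehand.
eval_df = pd.read_json("evals/dataset.jsonl", lines=True)

results = mlflow.evaluate(
    data=eval_df,                   # needs request/response(/expected_response)
    model_type="databricks-agent",  # Databricks LLM-judge evaluators
)

# Promotion blocks when an axis misses its floor (0.80 is the repo's
# correctness threshold; the metric key name is an assumption).
if results.metrics.get("response/llm_judged/correctness/rating/percentage", 0) < 0.80:
    sys.exit("CLEARS gate failed: correctness below 0.80")
```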
## Readiness levels

 | Level | Meaning | Required evidence |
 |---|---|---|
-| Reference-ready | Synthetic corpus deploys and demonstrates the architecture end-to-end | Dev bundle validates, bootstrap succeeds, synthetic CLEARS passes |
+| Reference-ready | Synthetic corpus deploys and demonstrates the architecture end-to-end | Demo bundle validates, bootstrap succeeds, synthetic CLEARS passes |
 | Pilot-ready | Real 10-K filings validate parse/extract/retrieval behavior | Reference-ready + small real EDGAR corpus + reviewed costs/latency |
 | Production-ready | Analysts can use it under governed identity and operational SLOs | Pilot-ready + app-level OBO enabled, audit proof, alerts/dashboards, rollback tested |

@@ -153,7 +153,7 @@ Pick the ID of a serverless warehouse (state can be `STOPPED` — it auto-starts)

 ### 3. Validate the bundle

 ```bash
-databricks bundle validate --strict -t dev
+databricks bundle validate --strict -t demo
 ```

 If this prints `Validation OK!`, every YAML resource is schema-correct.

@@ -162,9 +162,9 @@

 ```bash
 DOCINTEL_CATALOG=workspace \
-DOCINTEL_SCHEMA=docintel_10k_dev \
+DOCINTEL_SCHEMA=docintel_10k_demo \
 DOCINTEL_WAREHOUSE_ID=<warehouse-id> \
-./scripts/bootstrap-dev.sh
+./scripts/bootstrap-demo.sh
 ```

 The script handles the chicken-egg ordering automatically — see [`docs/design.md` § Deploy ordering](./docs/design.md#deploy-ordering-foundation--consumers).

 ### 5. Run the eval gate

 ```bash
-DOCINTEL_CATALOG=workspace DOCINTEL_SCHEMA=docintel_10k_dev \
+DOCINTEL_CATALOG=workspace DOCINTEL_SCHEMA=docintel_10k_demo \
 .venv/bin/python evals/clears_eval.py \
-  --endpoint analyst-agent-dev \
+  --endpoint analyst-agent-demo \
   --dataset evals/dataset.jsonl
 ```

 Exit 0 means every CLEARS axis met its threshold.

 ### 6. Open the app

-In the workspace UI: **Apps → `doc-intel-analyst-dev`**. Ask:
+In the workspace UI: **Apps → `doc-intel-analyst-demo`**. Ask:

 > What were the top 3 risk factors disclosed by ACME in their FY24 10-K?

@@ -194,21 +194,21 @@ After the first bring-up, iteration depends on what changed:

 ```bash
 # YAML / pipeline / job / app config changes
-databricks bundle deploy -t dev
-databricks bundle run -t dev analyst_app   # apply app config + restart
+databricks bundle deploy -t demo
+databricks bundle run -t demo analyst_app  # apply app config + restart

 # Agent code changes (agent/*.py): register a new model version
 # and repoint the existing serving endpoint in-place.
 DOCINTEL_CATALOG=workspace \
-DOCINTEL_SCHEMA=docintel_10k_dev \
+DOCINTEL_SCHEMA=docintel_10k_demo \
 DOCINTEL_WAREHOUSE_ID=<warehouse-id> \
-  .venv/bin/python agent/log_and_register.py --target dev --serving-endpoint analyst-agent-dev
+  .venv/bin/python agent/log_and_register.py --target demo --serving-endpoint analyst-agent-demo

 # Pipeline SQL changes that need to re-process existing filings
-databricks bundle run -t dev doc_intel_pipeline
+databricks bundle run -t demo doc_intel_pipeline
 ```

-You can also re-run `./scripts/bootstrap-dev.sh` — it auto-detects steady-state and does the full cycle (deploy → refresh data → register/promote → app run → grants → smoke) in one command.
+You can also re-run `./scripts/bootstrap-demo.sh` — it auto-detects steady-state and does the full cycle (deploy → refresh data → register/promote → app run → grants → smoke) in one command.
For a guided 30-minute tour, see [`specs/001-doc-intel-10k/quickstart.md`](./specs/001-doc-intel-10k/quickstart.md). @@ -222,7 +222,7 @@ Before any deploy reaches production, an evaluation must pass (constitution prin evals/dataset.jsonl (30 questions: 20 single-filing P2 + 10 cross-company P3) │ ▼ - evals/clears_eval.py ──▶ hits the dev endpoint, scores 6 axes: + evals/clears_eval.py ──▶ hits the demo endpoint, scores 6 axes: ┌─────────────────────────────────────────────────────┐ │ C - Correctness ≥ 0.80 (factual accuracy) │ @@ -249,7 +249,7 @@ Implementation uses `mlflow.evaluate(model_type="databricks-agent")` for the fou | Variable | Default | Purpose | |---|---|---| | `catalog` | `workspace` | UC catalog for all resources | -| `schema` | `docintel_10k` (prod) / `docintel_10k_dev` (dev) | Schema under the catalog | +| `schema` | `docintel_10k` (prod) / `docintel_10k_demo` (demo) | Schema under the catalog | | `lakebase_instance` | per-target | Lakebase database instance name | | `lakebase_stopped` | `false` | Flip to `true` only after instance exists | | `service_principal_id` | `""` | **Required** for `-t prod`; `bundle validate -t prod` fails loudly without it | @@ -271,7 +271,7 @@ Override via `--var name=value` on any `bundle` command. | `DOCINTEL_CATALOG` | yes | Bootstrap, CI, eval | | `DOCINTEL_SCHEMA` | yes | Same | | `DOCINTEL_WAREHOUSE_ID` | yes | Bootstrap (passed to bundle as `--var warehouse_id`, used by kpi-poll + smoke); `agent/log_and_register.py` (auth-policy SQL warehouse resource); `agent/tools.py` UC Function tool | -| `DOCINTEL_TARGET` | no (default `dev`) | Bootstrap | +| `DOCINTEL_TARGET` | no (default `demo`) | Bootstrap | | `DOCINTEL_ANALYST_GROUP` | no (default `account users`) | UC grants in bootstrap + CI | | `DOCINTEL_WAIT_SECONDS` | no (default 600) | Bootstrap KPI-table poll timeout | | `DOCINTEL_LAKEBASE_TIMEOUT` | no (default 600) | Bootstrap Lakebase-AVAILABLE poll | @@ -286,11 +286,11 @@ Override via `--var name=value` on any `bundle` command. .venv/bin/python -m pytest agent/tests/ -q # Bundle schema + interpolation -databricks bundle validate --strict -t dev +databricks bundle validate --strict -t demo databricks bundle validate --strict -t prod # expected to FAIL without --var service_principal_id (intended safety) # Bash syntax -bash -n scripts/bootstrap-dev.sh +bash -n scripts/bootstrap-demo.sh # Compile checks for all modified Python .venv/bin/python -m py_compile \ @@ -300,7 +300,7 @@ bash -n scripts/bootstrap-dev.sh evals/clears_eval.py scripts/wait_for_kpis.py samples/synthesize.py ``` -End-to-end is exercised by [`./scripts/bootstrap-dev.sh`](./scripts/bootstrap-dev.sh) against a real workspace; see [`VALIDATION.md`](./VALIDATION.md) for the full procedure with expected outputs. +End-to-end is exercised by [`./scripts/bootstrap-demo.sh`](./scripts/bootstrap-demo.sh) against a real workspace; see [`VALIDATION.md`](./VALIDATION.md) for the full procedure with expected outputs. --- @@ -308,12 +308,12 @@ End-to-end is exercised by [`./scripts/bootstrap-dev.sh`](./scripts/bootstrap-de | Path | When | |---|---| -| `./scripts/bootstrap-dev.sh` | Fresh-workspace bring-up (or after `bundle destroy`). Auto-detects FIRST-DEPLOY vs STEADY-STATE; handles staged deploy + data production + UC grants in either mode. | -| `databricks bundle deploy -t dev` | YAML / pipeline / job / app config changes after the first bring-up. 
| -| `databricks bundle run -t dev analyst_app` | After any change to `app/` or `resources/consumers/analyst.app.yml` — required to apply runtime config + restart the app. | -| `python agent/log_and_register.py --target dev --serving-endpoint analyst-agent-dev` | After agent code changes (`agent/*.py`). Registers a new UC model version and repoints the existing serving endpoint in-place. | +| `./scripts/bootstrap-demo.sh` | Fresh-workspace bring-up (or after `bundle destroy`). Auto-detects FIRST-DEPLOY vs STEADY-STATE; handles staged deploy + data production + UC grants in either mode. | +| `databricks bundle deploy -t demo` | YAML / pipeline / job / app config changes after the first bring-up. | +| `databricks bundle run -t demo analyst_app` | After any change to `app/` or `resources/consumers/analyst.app.yml` — required to apply runtime config + restart the app. | +| `python agent/log_and_register.py --target demo --serving-endpoint analyst-agent-demo` | After agent code changes (`agent/*.py`). Registers a new UC model version and repoints the existing serving endpoint in-place. | | `databricks bundle deploy -t prod --var service_principal_id=` | Production deploy, run as the prod SP. | -| GitHub Actions on push to `main` | Steady-state CI: full `bundle deploy` → wait for Lakebase AVAILABLE → upload samples + run pipeline + register/promote agent → UC grants → `bundle run analyst_app` → CLEARS eval gate. (The first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-dev.sh`.) | +| GitHub Actions on push to `main` | Steady-state CI: full `bundle deploy` → wait for Lakebase AVAILABLE → upload samples + run pipeline + register/promote agent → UC grants → `bundle run analyst_app` → CLEARS eval gate. (The first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-demo.sh`.) | For day-2 ops (rolling agent versions, debugging low quality scores, inspecting CLEARS metrics in MLflow), see [`docs/runbook.md`](./docs/runbook.md). For the production-readiness checklist, see [`PRODUCTION_READINESS.md`](./PRODUCTION_READINESS.md). @@ -323,7 +323,7 @@ For day-2 ops (rolling agent versions, debugging low quality scores, inspecting ``` databricks/ -├── databricks.yml # Bundle root — variables + dev/prod targets +├── databricks.yml # Bundle root — variables + demo/prod targets ├── pipelines/sql/ # Lakeflow SDP — Bronze → Silver → Gold (SQL only) ├── agent/ # Mosaic AI Agent Framework — pyfunc, retrieval, OBO ├── app/ # Streamlit on Databricks Apps + Lakebase client @@ -331,7 +331,7 @@ databricks/ ├── jobs/ # Lakeflow Jobs (retention, index refresh) ├── resources/foundation/ # DAB resources with no data deps ├── resources/consumers/ # DAB resources that depend on foundation data -├── scripts/ # bootstrap-dev.sh + helpers +├── scripts/ # bootstrap-demo.sh + helpers ├── samples/ # Synthetic 10-K PDFs (regenerable) ├── specs/001-doc-intel-10k/ # Spec-Kit artifacts (spec, plan, tasks, etc.) 
├── docs/ # design.md (this repo's "why") + runbook.md (day-2 ops) @@ -348,7 +348,7 @@ This is a **pilot-scale** reference implementation, not a turnkey production dep | Limit | Value | Source | |---|---|---| -| Filings in dev | ~500 | spec.md scale | +| Filings in demo | ~500 | spec.md scale | | Filings in prod | ~5,000 | spec.md scale | | Concurrent app users | ~20 | spec.md scale | | PDF size cap | 50 MB | FR / `bronze_filings_rejected` | diff --git a/VALIDATION.md b/VALIDATION.md index 9a56a7a..5bf0acc 100644 --- a/VALIDATION.md +++ b/VALIDATION.md @@ -11,9 +11,9 @@ python3 -m py_compile \ app/app.py app/lakebase_client.py \ evals/clears_eval.py scripts/wait_for_kpis.py samples/synthesize.py -bash -n scripts/bootstrap-dev.sh +bash -n scripts/bootstrap-demo.sh pytest agent/tests -databricks bundle validate --strict -t dev +databricks bundle validate --strict -t demo ``` Expected prod safety check: @@ -24,14 +24,14 @@ databricks bundle validate --strict -t prod This should fail unless `service_principal_id` is provided. -## Fresh Dev Bring-Up +## Fresh Demo Bring-Up ```bash export DOCINTEL_CATALOG=workspace -export DOCINTEL_SCHEMA=docintel_10k_dev +export DOCINTEL_SCHEMA=docintel_10k_demo export DOCINTEL_WAREHOUSE_ID= -./scripts/bootstrap-dev.sh +./scripts/bootstrap-demo.sh ``` Expected outcomes: @@ -66,7 +66,7 @@ Expected: ```bash python evals/clears_eval.py \ - --endpoint analyst-agent-dev \ + --endpoint analyst-agent-demo \ --dataset evals/dataset.jsonl ``` @@ -78,7 +78,7 @@ Expected: ## App Checks -- Open `doc-intel-analyst-dev`. +- Open `doc-intel-analyst-demo`. - Ask: `What was ACME's revenue in fiscal year 2024?` - Confirm the response has citations and the turn is written to Lakebase. - Submit thumbs feedback and confirm a feedback row is written. @@ -88,11 +88,11 @@ Expected: If app-level OBO is enabled: - Confirm `resources/consumers/analyst.app.yml:user_api_scopes` is uncommented. -- Run `databricks bundle deploy -t dev && databricks bundle run -t dev analyst_app`. +- Run `databricks bundle deploy -t demo && databricks bundle run -t demo analyst_app`. - Confirm bootstrap or CI verifies `serving.serving-endpoints` and `sql` scopes. - Check audit logs for user-scoped downstream access. If app-level OBO is not enabled: -- Treat the deployment as reference/dev only. +- Treat the deployment as reference/demo only. - Do not claim user-level UC row/column enforcement. 
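One hedged way to make VALIDATION.md's "confirm a row is written" checks concrete, assuming the PG* variables from the App's Lakebase resource binding are exported. The table names are the repo's; the connection details and column-free counts are a sketch:

```python
import os
import psycopg  # the app's Lakebase client is psycopg-based

dsn = (
    f"host={os.environ['PGHOST']} port={os.environ['PGPORT']} "
    f"dbname={os.environ['PGDATABASE']} user={os.environ['PGUSER']} "
    f"password={os.environ['PGPASSWORD']} sslmode=require"
)
with psycopg.connect(dsn) as conn, conn.cursor() as cur:
    # The app's first request creates these tables; after one chat turn plus
    # a thumbs click, every count should be greater than zero.
    for table in ("conversation_history", "query_logs", "feedback"):
        cur.execute(f"SELECT count(*) FROM {table}")
        print(table, cur.fetchone()[0])
```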
diff --git a/agent/log_and_register.py b/agent/log_and_register.py
index 17bf427..d255b58 100644
--- a/agent/log_and_register.py
+++ b/agent/log_and_register.py
@@ -132,7 +132,7 @@ def _promote_serving_endpoint(endpoint_name: str, model_name: str, version: str)

 def main() -> int:
     p = argparse.ArgumentParser()
-    p.add_argument("--target", required=True, choices=["dev", "prod"])
+    p.add_argument("--target", required=True, choices=["demo", "prod"])
     p.add_argument("--serving-endpoint", help="Existing serving endpoint to update to the new model version")
     args = p.parse_args()
diff --git a/agent/retrieval.py b/agent/retrieval.py
index 2d49d78..04a79d2 100644
--- a/agent/retrieval.py
+++ b/agent/retrieval.py
@@ -22,7 +22,7 @@ CATALOG = os.environ["DOCINTEL_CATALOG"]
 SCHEMA = os.environ["DOCINTEL_SCHEMA"]
 INDEX_FQN = f"{CATALOG}.{SCHEMA}.filings_summary_idx"

-ENDPOINT = os.environ.get("DOCINTEL_VS_ENDPOINT", f"docintel-{os.environ.get('DOCINTEL_TARGET', 'dev')}")
+ENDPOINT = os.environ.get("DOCINTEL_VS_ENDPOINT", f"docintel-{os.environ.get('DOCINTEL_TARGET', 'demo')}")
 RERANK_ENDPOINT = os.environ.get("DOCINTEL_RERANK_ENDPOINT", "databricks-bge-rerank-v2")

 _RETURN_COLS = ["section_uid", "filename", "section_label", "original_label", "summary", "quality_score"]
diff --git a/app/README.md b/app/README.md
index 251035a..cc291ef 100644
--- a/app/README.md
+++ b/app/README.md
@@ -14,16 +14,16 @@ Source for the Databricks App `doc-intel-analyst-${target}`. Streamlit chat UI o

 ## Running deployed (canonical)

 ```bash
-databricks bundle deploy -t dev
-databricks bundle run -t dev analyst_app
-# Open the App URL from the workspace UI ("Apps" → doc-intel-analyst-dev)
+databricks bundle deploy -t demo
+databricks bundle run -t demo analyst_app
+# Open the App URL from the workspace UI ("Apps" → doc-intel-analyst-demo)
 ```

 The first request creates the `conversation_history`, `query_logs`, and `feedback` tables in Lakebase. Tables are owned by the App's bound service principal (auto-granted `CAN_CONNECT_AND_CREATE` per `resources/consumers/analyst.app.yml`).

 ## Running locally

-For iteration speed you may want to run the Streamlit app on your laptop against a deployed dev workspace. **Authenticate as the App's bound service principal** so Lakebase schema init produces the same ownership as the deployed App:
+For iteration speed you may want to run the Streamlit app on your laptop against a deployed demo workspace. **Authenticate as the App's bound service principal** so Lakebase schema init produces the same ownership as the deployed App:

 ```bash
 export DATABRICKS_HOST=https://<workspace>.cloud.databricks.com
 export DATABRICKS_CLIENT_SECRET=<app-sp-oauth-secret>

 # Lakebase env vars (PGHOST/PGPORT/PGUSER/PGPASSWORD/PGDATABASE) come from
 # the App resource binding when deployed.
Locally, derive them with: -eval "$(databricks apps get doc-intel-analyst-dev \ +eval "$(databricks apps get doc-intel-analyst-demo \ --output json | jq -r '.resources[] | select(.name=="docintel-lakebase") | .database | @sh " export PGHOST=\(.host) PGPORT=\(.port) PGUSER=\(.username) PGPASSWORD=\(.password) PGDATABASE=\(.database)"')" -export DOCINTEL_AGENT_ENDPOINT=analyst-agent-dev +export DOCINTEL_AGENT_ENDPOINT=analyst-agent-demo streamlit run app/app.py ``` @@ -58,4 +58,4 @@ The app forwards each user's `x-forwarded-access-token` header to the agent serv **Streamlit gotcha** (per the [Databricks Apps runtime docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime)): the OBO token is captured at the initial HTTP request; the connection then upgrades to WebSocket and the token never refreshes. If a user's UC permissions change mid-session, ask them to reload the page. -**Local-dev caveat**: `st.context.headers` won't have `x-forwarded-access-token` when running `streamlit run` outside the Databricks Apps reverse proxy, so the OBO helper falls back to the SP client. That's fine for development — UC ACLs in dev workspaces are usually permissive — but verify against deployed dev before assuming OBO works. +**Local-dev caveat**: `st.context.headers` won't have `x-forwarded-access-token` when running `streamlit run` outside the Databricks Apps reverse proxy, so the OBO helper falls back to the SP client. That's fine for development — UC ACLs in demo workspaces are usually permissive — but verify against deployed demo before assuming OBO works. diff --git a/app/app.yaml b/app/app.yaml index ff9bc36..75b3881 100644 --- a/app/app.yaml +++ b/app/app.yaml @@ -24,6 +24,6 @@ env: # Resolved at runtime from the resource bindings declared in # resources/consumers/analyst.app.yml. The bindings are target-aware - # (analyst-agent-${bundle.target}) so dev and prod stay isolated. + # (analyst-agent-${bundle.target}) so demo and prod stay isolated. - name: DOCINTEL_AGENT_ENDPOINT valueFrom: agent-endpoint diff --git a/databricks.yml b/databricks.yml index 8040270..af0bb79 100644 --- a/databricks.yml +++ b/databricks.yml @@ -47,23 +47,23 @@ variables: default: account users targets: - dev: + demo: default: true workspace: profile: DEFAULT - # Per-user dev path so two engineers' deploys don't stomp each other. + # Per-user demo path so two engineers' deploys don't stomp each other. # `${workspace.current_user.userName}` resolves to the auth profile's # email at deploy time. - root_path: /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/dev + root_path: /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/demo variables: catalog: workspace - schema: docintel_10k_dev - lakebase_instance: docintel-dev-state-v3 + schema: docintel_10k_demo + lakebase_instance: docintel-demo-state-v1 resources: pipelines: doc_intel_pipeline: development: true - # Dev override: triggered runs only, to avoid a continuously-running + # Demo override: triggered runs only, to avoid a continuously-running # serverless cluster during smoke iterations. Prod inherits # continuous: true from resources/foundation/doc_intel.pipeline.yml. continuous: false diff --git a/docs/design.md b/docs/design.md index f405159..e818682 100644 --- a/docs/design.md +++ b/docs/design.md @@ -71,7 +71,7 @@ It also demonstrates a development workflow: **Spec-Kit** for spec-driven design **Key idea — "parse once, extract many":** PDFs are expensive to parse. 
Silver runs `ai_parse_document` exactly once per file and stores the structured result as a `VARIANT`. Everything downstream — classification, KPI extraction, summarization, quality scoring — reads the parsed output, never the raw bytes. This is a non-negotiable constitution principle. -**Triggering**: prod runs the pipeline in `continuous: true` mode so Auto Loader (`read_files`) reacts to new PDFs in the volume automatically. Dev overrides to `continuous: false` to avoid a 24/7 cluster during smoke iterations. See `resources/foundation/doc_intel.pipeline.yml` and the dev override block in `databricks.yml`. +**Triggering**: prod runs the pipeline in `continuous: true` mode so Auto Loader (`read_files`) reacts to new PDFs in the volume automatically. Demo overrides to `continuous: false` to avoid a 24/7 cluster during smoke iterations. See `resources/foundation/doc_intel.pipeline.yml` and the demo override block in `databricks.yml`. ### Vector Search bridges data and agent @@ -91,7 +91,7 @@ It also demonstrates a development workflow: **Spec-Kit** for spec-driven design "Quality before retrieval." ``` -**Ownership note**: DAB manages the Vector Search **endpoint** (`resources/consumers/filings_index.yml`) and the index-refresh **job** (`resources/consumers/index_refresh.job.yml`). The **index** itself isn't yet a DAB-managed resource type as of CLI 0.298 — `jobs/index_refresh/sync_index.py` creates the Delta-Sync index on first run and triggers a sync on subsequent runs. That's why the bootstrap script's stage-2 deploy creates the endpoint + job, and the job's first execution materializes the actual index. +**Ownership note**: DAB manages the Vector Search **endpoint** (`resources/foundation/filings_index.yml`) and the index-refresh **job** (`resources/consumers/index_refresh.job.yml`). The **index** itself isn't yet a DAB-managed resource type as of CLI 0.298 — `jobs/index_refresh/sync_index.py` creates the Delta-Sync index on first run and triggers a sync on subsequent runs. The endpoint lives in foundation so first-deploy bootstrap can materialize the index before `agent/log_and_register.py` logs the model auth policy that references it. ### Agent has two paths, one endpoint @@ -129,7 +129,7 @@ It also demonstrates a development workflow: **Spec-Kit** for spec-driven design └──────────────────────┘ ``` -The agent is an `mlflow.pyfunc` model registered in Unity Catalog and served behind an **AI Gateway** (rate limiting per-user, usage tracking, inference-table audit). Identity passthrough is implemented at the *App layer* when the workspace has Databricks Apps user-token passthrough enabled: the Streamlit app extracts the user's `x-forwarded-access-token` header and constructs a user-scoped `WorkspaceClient`. The served model is OBO-ready via MLflow `auth_policy` and Model Serving user credentials. If app-level passthrough is not enabled, the app falls back to service-principal auth and the repo must be treated as a reference/dev deployment, not a production row-level-security deployment. See [`../SECURITY.md`](../SECURITY.md) and [`../app/README.md`](../app/README.md). +The agent is an `mlflow.pyfunc` model registered in Unity Catalog and served behind an **AI Gateway** (rate limiting per-user, usage tracking, inference-table audit). Identity passthrough is implemented at the *App layer* when the workspace has Databricks Apps user-token passthrough enabled: the Streamlit app extracts the user's `x-forwarded-access-token` header and constructs a user-scoped `WorkspaceClient`. 
The served model is OBO-ready via MLflow `auth_policy` and Model Serving user credentials. If app-level passthrough is not enabled, the app falls back to service-principal auth and the repo must be treated as a reference/demo deployment, not a production row-level-security deployment. See [`../SECURITY.md`](../SECURITY.md) and [`../app/README.md`](../app/README.md). ### Runtime stack @@ -150,7 +150,7 @@ The agent is an `mlflow.pyfunc` model registered in Unity Catalog and served beh ▼ ▼ ┌────────────────────────┐ ┌────────────────────────┐ │ Model Serving endpoint │ │ Lakebase Postgres │ - │ "analyst-agent-dev" │ │ ───────────────── │ + │ "analyst-agent-demo" │ │ ───────────────── │ │ (CPU, scales to 0) │ │ conversation_history │ │ │ │ query_logs │ │ + AI Gateway: │ │ feedback │ @@ -168,7 +168,7 @@ The agent is an `mlflow.pyfunc` model registered in Unity Catalog and served beh user's identity. The agent-side MLflow auth policy and Model Serving OBO credentials let downstream calls run as the user. If the app-side feature is unavailable, the bootstrap script prints an explicit warning - and the deployment remains reference/dev only. + and the deployment remains reference/demo only. ``` **Why Postgres for state?** Delta tables are great for analytics but bad at "insert one tiny row per chat turn at high frequency." Lakebase is Databricks's managed Postgres — same governance, right tool for the job. @@ -212,13 +212,13 @@ This repo combines three things: Spec-Kit for spec-driven design, Databricks Ass | III | **Declarative over imperative** | SDP SQL pipelines, Lakeflow Jobs, DAB resources — no production notebooks | | IV | **Quality before retrieval** | 5-dim rubric scores every section; only ≥22/30 reach the index. Embed `summary`, not raw text | | V | **Eval-gated agents** | MLflow CLEARS scores must clear thresholds before any deploy is considered complete | -| VI | **Reproducible deploys** | `databricks bundle deploy -t ` recreates the entire stack; `dev` and `prod` parity enforced | +| VI | **Reproducible deploys** | `databricks bundle deploy -t ` recreates the entire stack; `demo` and `prod` parity enforced | When you read `specs/001-doc-intel-10k/plan.md` you'll see a "Constitution Check" gate that maps each design decision back to the principle it satisfies. When you read `specs/001-doc-intel-10k/tasks.md` you'll see how each task derives from the plan, and how user-stories (P1, P2, P3) are independently demoable. ### Pillar 2 — Databricks Asset Bundles + the Claude Code skill suite -[**Databricks Asset Bundles**](https://docs.databricks.com/aws/en/dev-tools/bundles/) (DABs) describe most of the workspace state as YAML. One root `databricks.yml` declares variables and targets (`dev`, `prod`); `resources/**/*.yml` declares each resource (pipeline, jobs, Vector Search endpoint, index-refresh job, serving endpoint, app, monitor, dashboard, Lakebase instance + catalog). `databricks bundle deploy -t dev` reconciles workspace state to YAML. The two non-DAB-managed pieces — the Vector Search **index** itself and the registered **model version** — are produced at runtime by `jobs/index_refresh/sync_index.py` and `agent/log_and_register.py` respectively, which the bootstrap script orchestrates. +[**Databricks Asset Bundles**](https://docs.databricks.com/aws/en/dev-tools/bundles/) (DABs) describe most of the workspace state as YAML. 
One root `databricks.yml` declares variables and targets (`demo`, `prod`); `resources/**/*.yml` declares each resource (pipeline, jobs, Vector Search endpoint, index-refresh job, serving endpoint, app, monitor, dashboard, Lakebase instance + catalog). `databricks bundle deploy -t demo` reconciles workspace state to YAML. The two non-DAB-managed pieces — the Vector Search **index** itself and the registered **model version** — are produced at runtime by `jobs/index_refresh/sync_index.py` and `agent/log_and_register.py` respectively, which the bootstrap script orchestrates. This repo was built with Databricks-specific Claude Code skill bundles. Those bundles are distributed by Databricks via the CLI / Claude Code plugin channel and **are not vendored in this open-source tree** — install them locally if you have access, or reference the canonical Databricks docs (mapping in [`../CONTRIBUTING.md`](../CONTRIBUTING.md)). @@ -277,7 +277,7 @@ DABs deploy *everything in one shot*. But our resources have a chicken-and-egg p ▶ Single `bundle deploy` → 4+ errors on a fresh workspace. ``` -The fix is a **staged deploy** orchestrated by `scripts/bootstrap-dev.sh`. Resources are split into two directories by data dependency: +The fix is a **staged deploy** orchestrated by `scripts/bootstrap-demo.sh`. Resources are split into two directories by data dependency: ``` resources/ @@ -285,12 +285,12 @@ The fix is a **staged deploy** orchestrated by `scripts/bootstrap-dev.sh`. Resou │ ├── catalog.yml (schema + volume + grants) │ ├── doc_intel.pipeline.yml │ ├── retention.job.yml + │ ├── filings_index.yml (VS endpoint) │ └── lakebase_instance.yml │ └── consumers/ ← need foundation to be RUNNING and producing data ├── agent.serving.yml (needs registered model version) ├── kpi_drift.yml (needs gold_filing_kpis table) - ├── filings_index.yml (VS endpoint) ├── index_refresh.job.yml (needs source table) ├── analyst.app.yml (needs Lakebase + agent endpoint) ├── usage.dashboard.yml @@ -337,7 +337,7 @@ The fix is a **staged deploy** orchestrated by `scripts/bootstrap-dev.sh`. Resou **Why two modes?** DAB tracks resource state; if you run the temp-rename trick against an *existing* deployment, DAB sees the consumer YAMLs as removed and plans to **delete** the serving endpoint, app, monitor, etc. Safe-ish on a fresh workspace; destructive in steady-state. The script detects mode and does the right thing. -CI (`.github/workflows/deploy.yml`) assumes steady-state — the first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-dev.sh`. After that, every push to `main` runs the steady-state path: full `bundle deploy` → refresh data → repoint serving endpoint → grants → CLEARS gate. +CI (`.github/workflows/deploy.yml`) assumes steady-state — the first-ever bring-up of a workspace must be done locally with `./scripts/bootstrap-demo.sh`. After that, every push to `main` runs the steady-state path: full `bundle deploy` → refresh data → repoint serving endpoint → grants → CLEARS gate. For the per-step procedure and known failure modes, see [`runbook.md` § Known deploy ordering gaps](./runbook.md#known-deploy-ordering-gaps-discovered-in-the-2026-04-24-smoke-test). diff --git a/docs/runbook.md b/docs/runbook.md index 8328eee..327ab6d 100644 --- a/docs/runbook.md +++ b/docs/runbook.md @@ -1,11 +1,11 @@ # Operating Runbook — 10-K Analyst -This runbook covers day-2 operations for the deployed dev/prod stacks. 
For first-time setup follow [`specs/001-doc-intel-10k/quickstart.md`](../specs/001-doc-intel-10k/quickstart.md).
+This runbook covers day-2 operations for the deployed demo/prod stacks. For first-time setup follow [`specs/001-doc-intel-10k/quickstart.md`](../specs/001-doc-intel-10k/quickstart.md).

 ## Add a sample filing

 1. `databricks fs cp <local-dir>/<company>_10K_<year>.pdf dbfs:/Volumes/<catalog>/<schema>/raw_filings/`
-2. Watch the pipeline: `databricks bundle run -t dev doc_intel_pipeline`
+2. Watch the pipeline: `databricks bundle run -t demo doc_intel_pipeline`
 3. Verify:
    ```sql
    SELECT filename, company_name, fiscal_year, revenue
@@ -35,17 +35,17 @@ If a filing scores below threshold:

 ## Roll an agent endpoint version

-The Model Serving endpoint follows the UC Model Alias `@dev` (or `@prod`), not a pinned version. To roll forward:
+The Model Serving endpoint follows the UC Model Alias `@demo` (or `@prod`), not a pinned version. To roll forward:

 ```bash
-DOCINTEL_CATALOG=<catalog> DOCINTEL_SCHEMA=<schema> python agent/log_and_register.py --target dev
+DOCINTEL_CATALOG=<catalog> DOCINTEL_SCHEMA=<schema> python agent/log_and_register.py --target demo
 ```

-This registers a new version and reassigns `@dev`. The serving endpoint will pick the new version on its next traffic refresh (a few minutes). To roll back, use the UC Model Registry UI to re-point the alias to the prior version.
+This registers a new version and reassigns `@demo`. The serving endpoint will pick the new version on its next traffic refresh (a few minutes). To roll back, use the UC Model Registry UI to re-point the alias to the prior version.

 ## Inspect CLEARS metrics in MLflow

-CI runs `python evals/clears_eval.py --endpoint analyst-agent-dev` after each `dev` deploy. Look for the experiment `/Shared/docintel-clears-<target>`; each run logs:
+CI runs `python evals/clears_eval.py --endpoint analyst-agent-demo` after each `demo` deploy. Look for the experiment `/Shared/docintel-clears-<target>`; each run logs:

 - Per-axis metrics: `correctness`, `adherence`, `relevance`, `execution`, `safety`, `latency_p95_ms`
 - Per-category slices: `p2_correctness`, `p3_correctness`

Failures are logged as a JSON list under the run tag `failures`.
The script exit | `bundle validate` fails on `ai_parse_document` | Workspace lacks AI Functions GA | Move SQL warehouse to a recent serverless channel | | Vector Search index sync stuck | Embedding endpoint not provisioned | Provision `databricks-bge-large-en` or override `var.embedding_model_endpoint_name` | | Agent endpoint 401 | OBO not plumbed end-to-end | Verify `app/app.py:_user_client` reads `x-forwarded-access-token` and `resources/consumers/analyst.app.yml:user_api_scopes` includes `serving.serving-endpoints` and `sql` | -| Agent answers ignore user UC permissions | OBO scopes wiped by `bundle run` (documented destructive-update behavior — see [Databricks Apps deploy docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/deploy)) | Re-apply: `databricks apps update doc-intel-analyst-dev --user-api-scopes serving.serving-endpoints,sql,iam.access-control:read,iam.current-user:read` | +| Agent answers ignore user UC permissions | OBO scopes wiped by `bundle run` (documented destructive-update behavior — see [Databricks Apps deploy docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/deploy)) | Re-apply: `databricks apps update doc-intel-analyst-demo --user-api-scopes serving.serving-endpoints,sql,iam.access-control:read,iam.current-user:read` | | Streamlit user sees stale UC permissions | OBO token captured at WebSocket open; never refreshes ([Databricks Apps runtime docs](https://docs.databricks.com/aws/en/dev-tools/databricks-apps/app-runtime)) | Reload the page after permission changes | | Lakebase tables not writable from deployed App | Local-dev `streamlit run` initialised schema under user identity, not App SP | Connect as App SP and `DROP TABLE feedback, query_logs, conversation_history`; next App run re-creates them under SP. See `app/README.md` | | CLEARS Latency axis fails | Re-rank window too large | Reduce candidate window in `agent/retrieval.py` from 25 to 15 | @@ -83,7 +83,7 @@ To enable OBO end-to-end: - iam.access-control:read # default - iam.current-user:read # default ``` -3. Redeploy: `databricks bundle deploy -t dev && databricks bundle run -t dev analyst_app`. +3. Redeploy: `databricks bundle deploy -t demo && databricks bundle run -t demo analyst_app`. 4. Verify: bootstrap step 5's scope check now asserts (rather than skipping). Visit the deployed app, ask a question, confirm in audit logs that the agent's UC SQL runs under the user's identity (not the app SP). The agent-side `auth_policy` declared in `log_and_register.py` uses the **agent-side** scopes (`model-serving`, `vector-search`) per the Model Serving OBO docs — these are different from the App-side scopes above and need no workspace feature flag; they just plumb the per-request user token through the served pyfunc. @@ -107,7 +107,7 @@ Changing any threshold requires a constitution amendment per the Governance sect ## v1 baseline -(populate after the first successful `dev` deploy) +(populate after the first successful `demo` deploy) ``` MLflow run ID: @@ -126,7 +126,7 @@ resolve on a fresh workspace. Each needs a phase-2 step after a prior side effec - `resources/consumers/agent.serving.yml` must contain a numeric placeholder because DAB serving config may reject UC alias syntax. - CI registers a fresh model version and then calls - `agent/log_and_register.py --target dev --serving-endpoint analyst-agent-dev` + `agent/log_and_register.py --target demo --serving-endpoint analyst-agent-demo` to update the served entity to the new version. 
- **Fix**: for local deploys, run the same registration command after bundle deploy, or bootstrap the endpoint once and let the script advance it.
@@ -142,7 +142,7 @@ resolve on a fresh workspace. Each needs a phase-2 step after a prior side effec
   - The catalog and app attach to the instance before the instance has
     finished coming up. Re-running `bundle deploy` immediately after the
     first attempt usually succeeds since the instance is then ready.
-  - **Fix**: `bundle deploy -t dev` twice on first stand-up, or add a wait task.
+  - **Fix**: `bundle deploy -t demo` twice on first stand-up, or add a wait task.

 A clean fresh-workspace bring-up is a single command:

 ```bash
 DOCINTEL_CATALOG=<catalog> \
 DOCINTEL_SCHEMA=<schema> \
 DOCINTEL_WAREHOUSE_ID=<warehouse-id> \
-./scripts/bootstrap-dev.sh
+./scripts/bootstrap-demo.sh
 ```

 The script implements a **staged deploy**: resources are split into
 `resources/foundation/` (no data deps) and `resources/consumers/`
 (need data). Stage 1 temporarily renames consumer YAMLs to
 `*.yml.skip` so the bundle's `resources/**/*.yml` glob excludes
 them — foundation deploys
-cleanly. Stage 2 brings up data (sample upload, pipeline run, model
-register, Lakebase ready) and then runs full `bundle deploy`, with all
+cleanly. Stage 2 brings up data (sample upload, pipeline run, VS index
+materialization, model registration, Lakebase ready) and then runs full `bundle deploy`, with all
 consumer dependencies satisfied. The previous "errors tolerated on
 first deploy" workaround is gone — both deploys succeed cleanly.

@@ -170,10 +170,11 @@ Six-step flow:
    retention conflict — bump the suffix and retry).
 2. **Foundation deploy** — `resources/consumers/*.yml` renamed to
    `*.yml.skip`; `bundle deploy` only touches catalog/schema/volume,
-   pipeline, retention job, Lakebase instance.
+   pipeline, retention job, Lakebase instance, Vector Search endpoint.
 3. **Produce data** — upload synthetic samples, run pipeline, wait for
-   `gold_filing_kpis`, register agent model (no `--serving-endpoint`,
-   endpoint doesn't exist yet), wait for Lakebase to reach `AVAILABLE`.
+   `gold_filing_kpis`, materialize the Vector Search index, register
+   agent model (no `--serving-endpoint`, endpoint doesn't exist yet),
+   wait for Lakebase to reach `AVAILABLE`.
 4. **Consumer deploy** — full `bundle deploy` (foundation idempotent;
    consumers create cleanly because all deps are live).
 5. **App run + UC grants chain** — `bundle run analyst_app`,
diff --git a/resources/foundation/catalog.yml b/resources/foundation/catalog.yml
index 7f9a627..92f704e 100644
--- a/resources/foundation/catalog.yml
+++ b/resources/foundation/catalog.yml
@@ -29,5 +29,5 @@ resources:
 # `grants`. Both USE_CATALOG and (USE_SCHEMA, SELECT, EXECUTE) are required
 # for analysts to query UC tables; the catalog half is often forgotten because
 # it's not on the schema securable. Both are applied post-deploy by
-# scripts/bootstrap-dev.sh and the CI deploy workflow via the UC permissions
+# scripts/bootstrap-demo.sh and the CI deploy workflow via the UC permissions
 # API. Idempotent.
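The full grants chain that comment describes, sketched against the UC permissions API via the databricks-sdk. The repo drives the same API from `scripts/bootstrap-demo.sh` and CI; the catalog, schema, and group names below are the demo defaults and are assumptions here:

```python
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.catalog import PermissionsChange, Privilege, SecurableType

w = WorkspaceClient()
group = "account users"  # DOCINTEL_ANALYST_GROUP default

# Catalog half: easy to forget, but nothing under the catalog is visible without it.
w.grants.update(
    securable_type=SecurableType.CATALOG,
    full_name="workspace",
    changes=[PermissionsChange(principal=group, add=[Privilege.USE_CATALOG])],
)

# Schema half: USE_SCHEMA plus the object privileges analysts need.
w.grants.update(
    securable_type=SecurableType.SCHEMA,
    full_name="workspace.docintel_10k_demo",
    changes=[PermissionsChange(
        principal=group,
        add=[Privilege.USE_SCHEMA, Privilege.SELECT, Privilege.EXECUTE],
    )],
)
```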
diff --git a/resources/foundation/doc_intel.pipeline.yml b/resources/foundation/doc_intel.pipeline.yml index 7981560..27c1ee1 100644 --- a/resources/foundation/doc_intel.pipeline.yml +++ b/resources/foundation/doc_intel.pipeline.yml @@ -25,7 +25,7 @@ resources: serverless: true # File-arrival semantics (spec FR-001): continuous mode keeps the Auto # Loader source running so new PDFs in the raw_filings volume are picked - # up automatically. Dev target overrides this to false in databricks.yml + # up automatically. Demo target overrides this to false in databricks.yml # for cost control on smoke runs. continuous: true photon: true diff --git a/resources/consumers/filings_index.yml b/resources/foundation/filings_index.yml similarity index 100% rename from resources/consumers/filings_index.yml rename to resources/foundation/filings_index.yml diff --git a/scripts/bootstrap-dev.sh b/scripts/bootstrap-demo.sh similarity index 87% rename from scripts/bootstrap-dev.sh rename to scripts/bootstrap-demo.sh index 7ee3cf2..c6aa4e3 100755 --- a/scripts/bootstrap-dev.sh +++ b/scripts/bootstrap-demo.sh @@ -1,17 +1,17 @@ #!/usr/bin/env bash -# Bootstrap a dev workspace end-to-end. +# Bootstrap a demo workspace end-to-end. # # Two modes, auto-detected: # # FIRST DEPLOY (no serving endpoint yet) # resources/ has chicken-egg dependencies: consumers (serving endpoint, -# monitor, app, lakebase catalog, vs endpoint) need foundation data +# monitor, app, lakebase catalog, index-refresh job) need foundation data # (registered model, populated KPI table, AVAILABLE Lakebase). DAB # deploys everything in one shot, so we stage: # 1. Hide resources/consumers/*.yml → *.yml.skip; bundle deploy # touches only foundation. Trap restores on any exit. -# 2. Produce data: samples → pipeline → wait for KPIs → register -# model → wait for Lakebase AVAILABLE. +# 2. Produce data: samples → pipeline → wait for KPIs → materialize +# VS index → register model → wait for Lakebase AVAILABLE. # 3. Restore consumer YAMLs; bundle deploy full bundle. All deps # satisfied; consumers create cleanly. # @@ -28,11 +28,11 @@ # # Required env vars: # DOCINTEL_CATALOG e.g. workspace -# DOCINTEL_SCHEMA e.g. docintel_10k_dev +# DOCINTEL_SCHEMA e.g. docintel_10k_demo # DOCINTEL_WAREHOUSE_ID SQL warehouse id (used by wait_for_kpis + smoke) # # Optional: -# DOCINTEL_TARGET bundle target (default: dev) +# DOCINTEL_TARGET bundle target (default: demo) # DOCINTEL_ANALYST_GROUP UC group for grants (default: "account users") # DOCINTEL_WAIT_SECONDS poll timeout for KPI table (default: 600) # DOCINTEL_LAKEBASE_TIMEOUT poll timeout for Lakebase (default: 600) @@ -40,6 +40,9 @@ # DOCINTEL_FORCE_LOCK set to 1 to pass --force-lock (use ONLY when a # prior deploy crashed and left a stale lock — # not a normal-flow flag). +# DOCINTEL_EMBEDDING_ENDPOINT +# embedding endpoint for first-run VS index +# materialization (default: databricks-bge-large-en) set -euo pipefail @@ -47,13 +50,14 @@ log() { echo "[bootstrap] $*" >&2; } die() { log "error: $*"; exit 1; } : "${DOCINTEL_CATALOG:?must be set (e.g. workspace)}" -: "${DOCINTEL_SCHEMA:?must be set (e.g. docintel_10k_dev)}" +: "${DOCINTEL_SCHEMA:?must be set (e.g. 
docintel_10k_demo)}" : "${DOCINTEL_WAREHOUSE_ID:?must be set}" -TARGET="${DOCINTEL_TARGET:-dev}" +TARGET="${DOCINTEL_TARGET:-demo}" ANALYST_GROUP="${DOCINTEL_ANALYST_GROUP:-account users}" WAIT_SECONDS="${DOCINTEL_WAIT_SECONDS:-600}" LAKEBASE_TIMEOUT="${DOCINTEL_LAKEBASE_TIMEOUT:-600}" +EMBEDDING_ENDPOINT="${DOCINTEL_EMBEDDING_ENDPOINT:-databricks-bge-large-en}" ENDPOINT="analyst-agent-${TARGET}" APP_NAME="doc-intel-analyst-${TARGET}" KPI_TABLE="${DOCINTEL_CATALOG}.${DOCINTEL_SCHEMA}.gold_filing_kpis" @@ -216,7 +220,7 @@ if [[ "$MODE" == "first" ]]; then mv "$f" "$f.skip" done - databricks bundle deploy -t "$TARGET" "${VAR_FLAGS[@]}" "${DEPLOY_FLAGS[@]}" || \ + databricks bundle deploy -t "$TARGET" "${VAR_FLAGS[@]}" ${DEPLOY_FLAGS[@]+"${DEPLOY_FLAGS[@]}"} || \ die "stage-1 deploy failed (foundation should be self-contained — investigate)" restore_consumers @@ -228,19 +232,34 @@ if [[ "$MODE" == "first" ]]; then die "pipeline run failed — inspect SDP UI before retrying" "$PYTHON" scripts/wait_for_kpis.py --min-rows 1 --timeout "$WAIT_SECONDS" || \ die "timed out waiting for $KPI_TABLE" + + # Materialize the VS index BEFORE agent registration: the agent's auth_policy + # declares the VS index as a UC resource (DatabricksVectorSearchIndex), and + # MLflow validates its existence at create_model_version time. The VS + # endpoint is in foundation/ (created by stage-1 deploy), but the index is + # always created at runtime by sync_index.py. Stage-2's index_refresh job + # is too late. + log " creating Vector Search index ${DOCINTEL_CATALOG}.${DOCINTEL_SCHEMA}.filings_summary_idx" + "$PYTHON" jobs/index_refresh/sync_index.py \ + --endpoint "docintel-${TARGET}" \ + --index "${DOCINTEL_CATALOG}.${DOCINTEL_SCHEMA}.filings_summary_idx" \ + --source-table "${DOCINTEL_CATALOG}.${DOCINTEL_SCHEMA}.gold_filing_sections_indexable" \ + --primary-key section_uid \ + --embedding-endpoint "$EMBEDDING_ENDPOINT" || \ + die "VS index creation failed (sync_index.py)" + "$PYTHON" agent/log_and_register.py --target "$TARGET" || \ die "agent registration failed" wait_for_lakebase_available log "step 3/6: stage-2 deploy (full bundle — consumers join the foundation)" - databricks bundle deploy -t "$TARGET" "${VAR_FLAGS[@]}" "${DEPLOY_FLAGS[@]}" || \ + databricks bundle deploy -t "$TARGET" "${VAR_FLAGS[@]}" ${DEPLOY_FLAGS[@]+"${DEPLOY_FLAGS[@]}"} || \ die "stage-2 deploy failed; check logs" # The index_refresh job is created by stage-2 deploy and is `table_update`- # triggered. Triggers do not fire retroactively on the rows the pipeline - # produced in stage 2, so we have to materialize the Vector Search index - # explicitly the first time. sync_index.py is create-if-missing/sync-if- - # exists, so this is idempotent on subsequent runs. + # produced before the job existed, so run it once after deployment as an + # idempotent smoke of the bundled job path. 
log "step 3.5/6: triggering initial Vector Search index materialization" databricks bundle run -t "$TARGET" "${VAR_FLAGS[@]}" index_refresh || \ log " warn: index_refresh failed; the table_update trigger will retry on the next pipeline run" @@ -248,7 +267,7 @@ if [[ "$MODE" == "first" ]]; then else # ─── Steady-state path: single full deploy + in-place data refresh ──────── log "step 1/6: full bundle deploy (steady-state — consumers already exist)" - databricks bundle deploy -t "$TARGET" "${VAR_FLAGS[@]}" "${DEPLOY_FLAGS[@]}" || \ + databricks bundle deploy -t "$TARGET" "${VAR_FLAGS[@]}" ${DEPLOY_FLAGS[@]+"${DEPLOY_FLAGS[@]}"} || \ die "bundle deploy failed; if a prior deploy was interrupted, set DOCINTEL_FORCE_LOCK=1 and retry" log "step 2/6: refreshing data + repointing serving endpoint" diff --git a/scripts/wait_for_kpis.py b/scripts/wait_for_kpis.py index a4bb0ab..240ffc6 100644 --- a/scripts/wait_for_kpis.py +++ b/scripts/wait_for_kpis.py @@ -1,12 +1,12 @@ """Poll until gold_filing_kpis has at least N rows, or time out. -Used by both `scripts/bootstrap-dev.sh` (post-pipeline-trigger) and the GitHub +Used by both `scripts/bootstrap-demo.sh` (post-pipeline-trigger) and the GitHub Actions deploy workflow (post-sample-upload). Centralized here so both paths share the same SQL Statement Execution logic. Required env: DOCINTEL_CATALOG e.g. workspace - DOCINTEL_SCHEMA e.g. docintel_10k_dev + DOCINTEL_SCHEMA e.g. docintel_10k_demo DOCINTEL_WAREHOUSE_ID SQL warehouse to run the count query CLI: diff --git a/specs/001-doc-intel-10k/plan.md b/specs/001-doc-intel-10k/plan.md index 8b6e801..7471612 100644 --- a/specs/001-doc-intel-10k/plan.md +++ b/specs/001-doc-intel-10k/plan.md @@ -5,19 +5,19 @@ ## Summary -Build a Databricks-native, governed pipeline + agent that turns SEC 10-K PDFs into a queryable lakehouse and a cited Q&A experience. SQL Lakeflow Spark Declarative Pipelines parse PDFs once with `ai_parse_document` (VARIANT), classify sections with `ai_classify`, extract structured KPIs with `ai_extract`, and score every section against a 5-dimension quality rubric. High-quality summaries flow into a Mosaic AI Vector Search index. A Mosaic AI Agent Framework agent (Knowledge Assistant + Custom Analyst Agent + Supervisor for cross-company fan-out) is logged via MLflow, registered in Unity Catalog, served behind AI Gateway, and surfaced through a Streamlit Databricks App with citation rendering and a feedback widget. Conversation history and feedback land in Lakebase Postgres. Lakehouse Monitoring tracks extraction drift; an AI/BI dashboard surfaces query-log content gaps. CLEARS evaluation in MLflow gates promotion. The entire stack is one Databricks Asset Bundle (`databricks bundle deploy -t dev|prod`). +Build a Databricks-native, governed pipeline + agent that turns SEC 10-K PDFs into a queryable lakehouse and a cited Q&A experience. SQL Lakeflow Spark Declarative Pipelines parse PDFs once with `ai_parse_document` (VARIANT), classify sections with `ai_classify`, extract structured KPIs with `ai_extract`, and score every section against a 5-dimension quality rubric. High-quality summaries flow into a Mosaic AI Vector Search index. A Mosaic AI Agent Framework agent (Knowledge Assistant + Custom Analyst Agent + Supervisor for cross-company fan-out) is logged via MLflow, registered in Unity Catalog, served behind AI Gateway, and surfaced through a Streamlit Databricks App with citation rendering and a feedback widget. Conversation history and feedback land in Lakebase Postgres. 
Lakehouse Monitoring tracks extraction drift; an AI/BI dashboard surfaces query-log content gaps. CLEARS evaluation in MLflow gates promotion. The entire stack is one Databricks Asset Bundle (`databricks bundle deploy -t demo|prod`). ## Technical Context **Language/Version**: SQL (Databricks SQL on serverless) for parse/classify/extract pipelines; Python 3.11 for agent + app + eval **Primary Dependencies**: Lakeflow Spark Declarative Pipelines, Lakeflow Jobs, Mosaic AI Vector Search, Mosaic AI Agent Framework (`databricks-agents`, `mlflow >= 2.20`), Databricks Model Serving + AI Gateway, Databricks Apps (Streamlit), Lakebase Postgres, Lakehouse Monitoring, Databricks Asset Bundles CLI (`databricks` >= 0.260) **Storage**: Unity Catalog — `<catalog>.<schema>` with one volume (`raw_filings`) and Delta tables (`bronze_filings`, `silver_parsed_filings`, `gold_filing_sections`, `gold_filing_kpis`); Lakebase Postgres for `conversation_history`, `query_logs`, `feedback` -**Testing**: `databricks bundle validate -t dev` (schema check), pytest for agent unit tests, MLflow `evaluate()` with `databricks-agents` evaluators for CLEARS, manual smoke via the deployed App +**Testing**: `databricks bundle validate -t demo` (schema check), pytest for agent unit tests, MLflow `evaluate()` with `databricks-agents` evaluators for CLEARS, manual smoke via the deployed App **Target Platform**: Databricks workspace with serverless SQL warehouse (AI Functions GA), Mosaic AI Vector Search and Model Serving entitlements; agent endpoint runs on CPU instance behind AI Gateway **Project Type**: Databricks lakehouse + agent stack delivered as a single DAB **Performance Goals**: Pipeline E2E ≤ 10 min P95 on a 30 MB PDF (SC-001); agent P95 ≤ 8s single-filing, ≤ 20s cross-company (SC-009); Vector Search refresh ≤ 5 min after Gold update **Constraints**: SQL only for parse/classify/extract layer; Python only for agent + app; CPU model serving (no GPU); zero hard-coded paths outside the bundle; one-command deploy; CLEARS thresholds C≥0.8, L p95≤8s, E≥0.95, A≥0.9, R≥0.8, S≥0.99 block promotion -**Scale/Scope**: Pilot scale — up to ~500 filings in dev, ~5,000 in prod; ~20 concurrent App users; 30-question eval set +**Scale/Scope**: Pilot scale — up to ~500 filings in demo, ~5,000 in prod; ~20 concurrent App users; 30-question eval set ## Constitution Check @@ -29,8 +29,8 @@ Build a Databricks-native, governed pipeline + agent that turns SEC 10-K PDFs in | II. Parse Once, Extract Many | ✅ Pass | `ai_parse_document` runs once at Silver into VARIANT; classify/extract/prep_search iterate on Gold. | | III. Declarative over imperative | ✅ Pass | Lakeflow SDP (SQL) for pipelines; Lakeflow Jobs for orchestration; DAB for resources. No production notebooks. | | IV. Quality before retrieval | ✅ Pass | 5-dim rubric (parse_completeness, layout_fidelity, ocr_confidence, section_recognizability, kpi_extractability); `embed_eligible` boolean filter on the index. Summaries (not raw chunks) embedded. | -| V. Eval-gated agents | ✅ Pass | `evals/clears_eval.py` runs MLflow eval against the dev endpoint; promotion blocked on threshold failure.
Lakehouse Monitoring on `gold_filing_kpis`; AI/BI dashboard on `query_logs`. | +| VI. Reproducible deploys | ✅ Pass | `databricks bundle deploy -t demo` recreates the entire stack. Same Python code path runs locally and in Databricks Apps via unified CLI auth. | **Result**: All gates pass. No deviations to record. Complexity Tracking section intentionally omitted. @@ -57,19 +57,16 @@ specs/001-doc-intel-10k/ ### Source Code (repository root) ```text -databricks.yml # Bundle root, dev/prod targets +databricks.yml # Bundle root, demo/prod targets resources/ -├── pipelines/ -│ └── doc_intel.pipeline.yml # Lakeflow SDP definition -├── jobs/ -│ ├── index_refresh.job.yml # Vector Search refresh +├── foundation/ +│ ├── doc_intel.pipeline.yml # Lakeflow SDP definition +│ ├── filings_index.yml # VS endpoint +│ ├── lakebase_instance.yml # Postgres for state │ └── retention.job.yml # 90-day raw PDF cleanup -├── vector_search/ -│ └── filings_index.yml # VS endpoint + index -├── serving/ +├── consumers/ +│ ├── index_refresh.job.yml # Vector Search index create/sync │ └── agent.serving.yml # Model Serving + AI Gateway -├── lakebase/ -│ └── state.yml # Postgres for state ├── monitors/ │ └── kpi_drift.yml # Lakehouse Monitoring ├── dashboards/ @@ -105,7 +102,7 @@ evals/ .github/ └── workflows/ - └── deploy.yml # validate on PR, deploy -t dev on merge + └── deploy.yml # validate on PR, deploy -t demo on merge CLAUDE.md # Runtime guidance for Claude Code ``` @@ -131,7 +128,7 @@ Output: [research.md](./research.md). Decisions captured: | Eval framework | MLflow `evaluate()` with `databricks-agents` evaluators on CLEARS axes | First-class CLEARS support; logged into MLflow runs | LangSmith / Ragas (rejected: external system) | | Monitoring | Lakehouse Monitoring `inference` profile on `gold_filing_kpis`; Lakeview AI/BI dashboard on `query_logs` | First-class drift detection; usage dashboard surfaces content gaps per Reffy | Custom Spark notebooks (rejected: imperative, principle III) | | App framework | Streamlit | Fastest in-platform Python UI; Databricks Apps native | React + FastAPI (deferred — Reffy uses this but adds frontend build) | -| CI | GitHub Actions running `databricks bundle validate` (PR) + `bundle deploy -t dev` (merge to main) | Reffy pattern; minimal infra | GitLab/CircleCI (rejected: GitHub is the user's host) | +| CI | GitHub Actions running `databricks bundle validate` (PR) + `bundle deploy -t demo` (merge to main) | Reffy pattern; minimal infra | GitLab/CircleCI (rejected: GitHub is the user's host) | | Section labels | Canonical set: `MD&A`, `Risk`, `Financials`, `Notes`, `Other` (preserve `original_label`) | Matches FR-003; explicit, testable | Free-form labels (rejected: untestable) | | Retention | 90-day Lakeflow Job that lists volume, filters `ingested_at < now()-90d`, removes the file | Doesn't depend on workspace lifecycle policies; auditable | UC volume lifecycle rule (rejected: requires admin policy work that can't be assumed) | @@ -161,7 +158,7 @@ Output: `data-model.md`, `contracts/`, `quickstart.md`, plus the agent context u ### Quickstart -`quickstart.md` covers: install/auth `databricks` CLI → set bundle vars → `bundle validate -t dev` → `bundle deploy -t dev` → upload sample 10-K → query Gold → open App and ask the example question → run `evals/clears_eval.py`. 
+`quickstart.md` covers: install/auth `databricks` CLI → set bundle vars → `bundle validate -t demo` → `bundle deploy -t demo` → upload sample 10-K → query Gold → open App and ask the example question → run `evals/clears_eval.py`. ### Agent context update diff --git a/specs/001-doc-intel-10k/quickstart.md b/specs/001-doc-intel-10k/quickstart.md index e3152de..787794b 100644 --- a/specs/001-doc-intel-10k/quickstart.md +++ b/specs/001-doc-intel-10k/quickstart.md @@ -1,6 +1,6 @@ # Quickstart: Deploy and Test the 10-K Analyst -Goal: from a clean clone, stand up the entire stack on the Databricks `dev` target and verify P1, P2, P3 acceptance scenarios in 15–25 minutes. +Goal: from a clean clone, stand up the entire stack on the Databricks `demo` target and verify P1, P2, P3 acceptance scenarios in 15–25 minutes. ## Prerequisites @@ -11,30 +11,30 @@ Goal: from a clean clone, stand up the entire stack on the Databricks `dev` targ ## 1. Configure the bundle -The `dev` target's defaults (in `databricks.yml`) are `catalog=workspace`, `schema=docintel_10k_dev`. Override per the workspace via env vars or `--var`: +The `demo` target's defaults (in `databricks.yml`) are `catalog=workspace`, `schema=docintel_10k_demo`. Override per the workspace via env vars or `--var`: ```bash cd databricks -databricks bundle validate --strict -t dev +databricks bundle validate --strict -t demo ``` If validate prints no errors, every resource YAML is schema-correct. -## 2. Stand up dev (staged deploy) +## 2. Stand up demo (staged deploy) ```bash DOCINTEL_CATALOG=workspace \ -DOCINTEL_SCHEMA=docintel_10k_dev \ +DOCINTEL_SCHEMA=docintel_10k_demo \ DOCINTEL_WAREHOUSE_ID=<warehouse-id> \ -./scripts/bootstrap-dev.sh +./scripts/bootstrap-demo.sh ``` The script implements a 6-step staged deploy: 1. Detect & clean orphan resources from prior failed runs. -2. **Stage 1**: deploy `resources/foundation/` only (catalog/schema/volume, pipeline, retention job, Lakebase instance) — consumer YAMLs are temp-renamed to `*.yml.skip`. -3. **Produce data**: upload synthetic samples, run pipeline, register agent model, wait for Lakebase to reach `AVAILABLE`. -4. **Stage 2**: full `bundle deploy` — consumers (serving endpoint, monitor, VS endpoint, index-refresh job, app, dashboard, Lakebase catalog) attach to the live foundation. +2. **Stage 1**: deploy `resources/foundation/` only (catalog/schema/volume, pipeline, retention job, Lakebase instance, VS endpoint) — consumer YAMLs are temp-renamed to `*.yml.skip`. +3. **Produce data**: upload synthetic samples, run pipeline, materialize the VS index, register agent model, wait for Lakebase to reach `AVAILABLE`. +4. **Stage 2**: full `bundle deploy` — consumers (serving endpoint, monitor, index-refresh job, app, dashboard, Lakebase catalog) attach to the live foundation. The VS endpoint is deployed in stage 1, and the bootstrap materializes the VS index before agent registration. 5. `bundle run analyst_app`; UC grants chain (`USE_CATALOG → USE_SCHEMA → SELECT/EXECUTE`). 6. Smoke check on the analyst endpoint. @@ -48,7 +48,7 @@ The bootstrap script already uploaded `samples/{ACME,BETA,GAMMA,garbage}_10K_202 SELECT filename, company_name, fiscal_year, revenue, ebitda, size(top_risks) AS num_risks, size(segment_revenue) AS num_segments - FROM workspace.docintel_10k_dev.gold_filing_kpis + FROM workspace.docintel_10k_demo.gold_filing_kpis ORDER BY filename; ``` Expect 4 rows. ACME/BETA/GAMMA each show non-null revenue (`94.2`, `212.0`, `305 ## 4. 
Verify P2 — ask the corpus -Open the deployed App URL (workspace UI → Apps → `doc-intel-analyst-dev`). Ask: +Open the deployed App URL (workspace UI → Apps → `doc-intel-analyst-demo`). Ask: > What were the top 3 risk factors disclosed by ACME in their FY24 10-K? Expect: a grounded answer naming ≥3 risks (macroeconomic conditions, competitive pressure in AI, supply chain concentration), each with a citation chip linking back to `ACME_10K_2024.pdf` / `Risk`. Submit thumbs-up; refresh; the feedback row appears in `lakebase.feedback`. Confirms SC-002, SC-007. -To bring real EDGAR filings online instead of the synthetic samples, see `samples/README.md` — the volume accepts any `*_10K_*.pdf` and the pipeline reacts via Auto Loader (`continuous: true` in prod, `false` in dev). +To bring real EDGAR filings online instead of the synthetic samples, see `samples/README.md` — the volume accepts any `*_10K_*.pdf` and the pipeline reacts via Auto Loader (`continuous: true` in prod, `false` in demo). ## 5. Verify P3 — cross-company @@ -76,9 +76,9 @@ Expect: a markdown table with one row per company, segment-revenue values matchi ```bash DOCINTEL_CATALOG=workspace \ -DOCINTEL_SCHEMA=docintel_10k_dev \ +DOCINTEL_SCHEMA=docintel_10k_demo \ .venv/bin/python evals/clears_eval.py \ - --endpoint analyst-agent-dev \ + --endpoint analyst-agent-demo \ --dataset evals/dataset.jsonl ``` @@ -87,10 +87,10 @@ Exit 0 iff every CLEARS axis meets thresholds (C≥0.8, L p95≤8s, E≥0.95, A ## 7. Tear down ```bash -databricks bundle destroy -t dev --auto-approve +databricks bundle destroy -t demo --auto-approve ``` -Note: the Lakebase instance enters a soft-delete state for ~7 days during which its name is reserved. To redeploy quickly, bump `lakebase_instance` in `databricks.yml` (e.g., `docintel-dev-state-v4`) before re-running the bootstrap. +Note: the Lakebase instance enters a soft-delete state for ~7 days during which its name is reserved. To redeploy quickly, bump `lakebase_instance` in `databricks.yml` (e.g., `docintel-demo-state-v2`) before re-running the bootstrap. ## Troubleshooting diff --git a/specs/001-doc-intel-10k/research.md b/specs/001-doc-intel-10k/research.md index ef9d4c6..3acf3ec 100644 --- a/specs/001-doc-intel-10k/research.md +++ b/specs/001-doc-intel-10k/research.md @@ -91,8 +91,8 @@ under `databricks` CLI auth and inside the deployed App. React + FastAPI ## Decision: GitHub Actions for CI **Rationale**: User's existing host. The workflow has two jobs: `validate` -on every PR (`databricks bundle validate -t dev`), and `deploy` on push to -`main` (`databricks bundle deploy -t dev` → `python evals/clears_eval.py` +on every PR (`databricks bundle validate -t demo`), and `deploy` on push to +`main` (`databricks bundle deploy -t demo` → `python evals/clears_eval.py` → exit non-zero on threshold failure to block the deploy). ## Decision: 90-day retention via Lakeflow Job, not UC volume lifecycle diff --git a/specs/001-doc-intel-10k/spec.md b/specs/001-doc-intel-10k/spec.md index ac9d406..f16a93a 100644 --- a/specs/001-doc-intel-10k/spec.md +++ b/specs/001-doc-intel-10k/spec.md @@ -18,7 +18,7 @@ ### Session 2026-04-25 - Q: Eval corpus — real EDGAR PDFs or synthetic? → A: Synthetic. The 30-question dataset references three synthetic 10-Ks (`samples/{ACME,BETA,GAMMA}_10K_2024.pdf`, generated by `samples/synthesize.py`) plus a deliberately low-quality `garbage_10K_2024.pdf` for SC-006.
Real EDGAR filings can still be uploaded to the volume in deployed environments; the synthetic corpus exists so CI is fully deterministic and self-contained (no EDGAR dependency, no license concerns). User-facing examples in spec scenarios still use AAPL/MSFT/GOOG to convey intent. -- Q: Deploy ordering — single bundle deploy or staged? → A: Staged. `resources/foundation/` (catalog, pipeline, retention job, Lakebase instance) deploys first; data is produced (sample upload, pipeline run, agent registration, Lakebase ready); then `resources/consumers/` (serving endpoint, monitor, VS endpoint, app, Lakebase catalog) deploys. The chicken-egg dependencies between consumers and foundation data make a single deploy impossible. Bootstrap script automates this. +- Q: Deploy ordering — single bundle deploy or staged? → A: Staged. `resources/foundation/` (catalog, pipeline, retention job, Lakebase instance, VS endpoint) deploys first; data is produced (sample upload, pipeline run, VS index materialization, agent registration, Lakebase ready); then `resources/consumers/` (serving endpoint, monitor, index-refresh job, app, Lakebase catalog) deploys. The chicken-egg dependencies between consumers and foundation data make a single deploy impossible. Bootstrap script automates this. - Q: User identity passthrough? → A: OBO end-to-end is implemented but operationally requires the workspace-level "Databricks Apps - user token passthrough" feature. When disabled, the app falls back to SP creds with a loud bring-up banner identifying the limitation. ## User Scenarios & Testing *(mandatory)* @@ -97,7 +97,7 @@ An analyst asks a multi-company question — e.g., "Compare segment revenue betw - **FR-009**: System MUST persist conversation history, query logs, and feedback in a transactional store suitable for fast reads/writes alongside the agent serving path. - **FR-010**: System MUST evaluate the agent against a curated eval set of 30 hand-authored questions (20 P2 single-filing, 10 P3 cross-company) checked into the repo at `evals/dataset.jsonl`, scoring each axis of CLEARS and gating promotion on per-axis thresholds: Correctness ≥ 0.8, Latency p95 ≤ 8s, Execution ≥ 0.95, Adherence ≥ 0.9, Relevance ≥ 0.8, Safety ≥ 0.99. Any failing axis MUST block promotion. - **FR-011**: System MUST expose a monitoring dashboard summarizing extraction drift on Gold and a usage dashboard summarizing conversation logs (top queries, content gaps). -- **FR-012**: System MUST be deployable end-to-end (catalog/schema/volume, pipelines, vector index, agent endpoint, gateway, app, monitors, dashboards) via a single repeatable bring-up command; two environments (dev, prod) MUST be defined; no resource MAY be created outside the bundle. (See Clarifications session 2026-04-25 for the staged-deploy realization — first bring-up uses `./scripts/bootstrap-dev.sh` to handle chicken-egg dependencies between consumers and live data; steady-state deploys are a single `databricks bundle deploy`.) +- **FR-012**: System MUST be deployable end-to-end (catalog/schema/volume, pipelines, vector index, agent endpoint, gateway, app, monitors, dashboards) via a single repeatable bring-up command; two environments (demo, prod) MUST be defined; no resource MAY be created outside the bundle. (See Clarifications session 2026-04-25 for the staged-deploy realization — first bring-up uses `./scripts/bootstrap-demo.sh` to handle chicken-egg dependencies between consumers and live data; steady-state deploys are a single `databricks bundle deploy`.) 
- **FR-013**: System MUST process duplicate uploads idempotently keyed on filename. - **FR-014**: System MUST gracefully report missing/ungrounded answers ("no source found") rather than hallucinating when retrieval returns no qualified results. @@ -118,7 +118,7 @@ An analyst asks a multi-company question — e.g., "Compare segment revenue betw - **SC-002**: For curated single-filing eval questions (P2 category), the agent produces a cited, grounded answer that meets the per-axis evaluation thresholds on at least 80% of items. - **SC-003**: For curated cross-company eval questions (P3 category), the supervisor returns a consistent, cited aggregation that meets the per-axis evaluation thresholds on at least 70% of items. - **SC-004**: 100% of resources backing the deployed system are registered under the configured catalog/schema; zero resources are created outside the bundle (verified by `bundle validate` + workspace audit). -- **SC-005**: A clean workspace + new analyst can stand up the entire stack with one repeatable command (no manual UI clicks) in under 30 minutes. The command is `./scripts/bootstrap-dev.sh` (which orchestrates a staged `bundle deploy`) for first bring-up, and plain `databricks bundle deploy` for every steady-state iteration thereafter. +- **SC-005**: A clean workspace + new analyst can stand up the entire stack with one repeatable command (no manual UI clicks) in under 30 minutes. The command is `./scripts/bootstrap-demo.sh` (which orchestrates a staged `bundle deploy`) for first bring-up, and plain `databricks bundle deploy` for every steady-state iteration thereafter. - **SC-006**: Filings that fail the quality rubric are excluded from retrieval 100% of the time and are visible in an audit log with score breakdown. - **SC-007**: Every agent answer in the App renders at least one citation when retrieved sources exist; when none exist, the agent explicitly states no grounded source rather than fabricating one. - **SC-008**: Re-uploading the same filename produces no duplicate KPI rows on 100% of attempts. @@ -130,7 +130,7 @@ An analyst asks a multi-company question — e.g., "Compare segment revenue betw - The target Databricks workspace has a serverless SQL warehouse with `ai_parse_document` (GA), `ai_classify`, `ai_extract`, and `ai_prep_search` available. - Mosaic AI Vector Search and Model Serving entitlements are enabled for the workspace. - Sample 10-K PDFs are publicly available SEC filings (EDGAR) the analyst manually uploads to the volume; no automated SharePoint/Drive sync in v1. -- A Service Principal exists for prod deploys but is not used in v1 (dev target only). +- A Service Principal exists for prod deploys but is not used in v1 (demo target only). - Analyst end-users have UC `SELECT` on the configured catalog/schema and execute permission on the agent endpoint via UC identity passthrough. - The CLI auth profile on the operator's machine targets a workspace where the bundle can deploy without further policy exceptions. - 10-K fiscal year and company name can be reliably extracted from the parsed cover page; if not, `extraction_confidence` reflects the gap and the row remains queryable. diff --git a/specs/001-doc-intel-10k/tasks.md b/specs/001-doc-intel-10k/tasks.md index 3ffce29..d36f7a8 100644 --- a/specs/001-doc-intel-10k/tasks.md +++ b/specs/001-doc-intel-10k/tasks.md @@ -26,8 +26,8 @@ This is a single-DAB Databricks project. 
SQL pipeline code at `pipelines/sql/`, ## Phase 1: Setup (Shared Infrastructure) - [ ] T001 Verify `databricks` CLI ≥ 0.260 is installed and `databricks auth profiles` shows a working profile; if missing, follow the official Databricks CLI installation docs -- [x] T002 Create the bundle skeleton at `databricks.yml` with `bundle.name: doc-intel-10k`, `targets: {dev, prod}`, variables `catalog`, `schema`, `workspace_host`, `service_principal_id` (prod only), `embedding_model_endpoint_name`, `quality_threshold` (default 22), `top_k` (default 5) -- [x] T003 [P] Add `.github/workflows/deploy.yml` running `databricks bundle validate -t dev` on PR and `databricks bundle deploy -t dev` + `python evals/clears_eval.py` on push to `main` +- [x] T002 Create the bundle skeleton at `databricks.yml` with `bundle.name: doc-intel-10k`, `targets: {demo, prod}`, variables `catalog`, `schema`, `workspace_host`, `service_principal_id` (prod only), `embedding_model_endpoint_name`, `quality_threshold` (default 22), `top_k` (default 5) +- [x] T003 [P] Add `.github/workflows/deploy.yml` running `databricks bundle validate -t demo` on PR and `databricks bundle deploy -t demo` + `python evals/clears_eval.py` on push to `main` - [x] T004 [P] Create empty `pipelines/sql/`, `agent/`, `app/`, `evals/`, `resources/{pipelines,jobs,vector_search,serving,lakebase,monitors,dashboards,apps}/` directories with `.gitkeep` files - [x] T005 [P] Add `agent/requirements.txt` (`mlflow>=2.20`, `databricks-agents`, `databricks-vectorsearch`, `databricks-sdk`) and `app/requirements.txt` (`streamlit`, `databricks-sdk`, `psycopg[binary]`) @@ -83,18 +83,18 @@ This is a single-DAB Databricks project. SQL pipeline code at `pipelines/sql/`, ### Implementation for US2 -- [x] T020 [P] [US2] Define the Vector Search endpoint and Delta-Sync index in `resources/vector_search/filings_index.yml` over `${var.catalog}.${var.schema}.gold_filing_sections` filtered `WHERE embed_eligible = true`, embedding column `summary`, embedding model from `${var.embedding_model_endpoint_name}` (depends on T013) -- [x] T021 [P] [US2] Define the index-refresh Lakeflow Job in `resources/jobs/index_refresh.job.yml` with table-update trigger on `gold_filing_sections`, single SQL task running `SYNC INDEX ${var.catalog}.${var.schema}.filings_summary_idx` (depends on T020) +- [x] T020 [P] [US2] Define the Vector Search endpoint in `resources/foundation/filings_index.yml`; the Delta-Sync index over `${var.catalog}.${var.schema}.gold_filing_sections_indexable` is created by `jobs/index_refresh/sync_index.py` because DAB does not manage Vector Search indexes directly (depends on T013) +- [x] T021 [P] [US2] Define the index-refresh Lakeflow Job in `resources/consumers/index_refresh.job.yml` with a table-update trigger on `gold_filing_sections_indexable` and a Python task that creates/syncs `${var.catalog}.${var.schema}.filings_summary_idx` (depends on T020) - [x] T022 [US2] Implement `agent/retrieval.py`: `hybrid_retrieve(question, top_k=25, filters=None)` calling Vector Search with `query_type='HYBRID'`, then `mosaic_rerank(question, candidates, top_k=5)`; returns list of citation dicts matching `agent-response.json` (depends on T020; tests T018 must fail first) - [x] T023 [US2] Implement `agent/tools.py`: a UC Function tool wrapping `SELECT * FROM gold_filing_kpis WHERE filename = :filename` for the agent to access structured KPIs deterministically - [x] T024 [US2] Implement `agent/analyst_agent.py`: a `mlflow.pyfunc` model class implementing the Mosaic AI Agent Framework 
chat protocol; uses `retrieval.hybrid_retrieve` for grounding, calls a foundation model endpoint to generate the answer, returns the schema in `contracts/agent-response.json` (depends on T022, T023) -- [x] T025 [US2] Implement `agent/log_and_register.py`: `mlflow.pyfunc.log_model(...)`, `mlflow.register_model(...)` to UC at `${var.catalog}.${var.schema}.analyst_agent`; assign UC Model Alias `@dev` (and later `@prod`) to the freshly registered version so Model Serving in T026 follows the alias rather than a frozen version (depends on T024) +- [x] T025 [US2] Implement `agent/log_and_register.py`: `mlflow.pyfunc.log_model(...)`, `mlflow.register_model(...)` to UC at `${var.catalog}.${var.schema}.analyst_agent`; assign UC Model Alias `@demo` (and later `@prod`) to the freshly registered version so Model Serving in T026 follows the alias rather than a frozen version (depends on T024) - [x] T026 [US2] Define the Model Serving endpoint in `resources/consumers/agent.serving.yml`: CPU instance, served entity = `${var.catalog}.${var.schema}.analyst_agent`, AI Gateway with rate limit + audit enabled (depends on T025) - [x] T027 [US2] Implement `app/app.py` (Streamlit): chat input, calls the agent endpoint via `databricks.sdk.WorkspaceClient.serving_endpoints.query`, renders answer + citations as chips that show filename + section on hover, thumbs-up/down + comment widget that POSTs to a Lakebase write helper; persists `conversation_id` in session state (depends on T026, T007) - [x] T028 [US2] Implement `app/lakebase_client.py`: thin wrapper using `psycopg` with the bundle-injected DSN to insert into `conversation_history`, `query_logs`, `feedback` - [x] T029 [US2] Define the Databricks App in `resources/consumers/analyst.app.yml`: source = `app/`, runtime python, env = Lakebase binding + agent endpoint binding (depends on T027, T028) - [x] T030 [US2] Author `evals/dataset.jsonl` 20 P2 questions per `data-model.md`'s eval section (each with `expected_filename`, `expected_section`, `expected_answer_keywords`, `min_citations`) -- [x] T031 [US2] Implement `evals/clears_eval.py`: connects to the dev endpoint, runs `mlflow.evaluate()` with `databricks-agents` evaluators on the dataset, asserts thresholds C≥0.8, L p95≤8s, E≥0.95, A≥0.9, R≥0.8, S≥0.99; exits non-zero on failure (depends on T026, T030) +- [x] T031 [US2] Implement `evals/clears_eval.py`: connects to the demo endpoint, runs `mlflow.evaluate()` with `databricks-agents` evaluators on the dataset, asserts thresholds C≥0.8, L p95≤8s, E≥0.95, A≥0.9, R≥0.8, S≥0.99; exits non-zero on failure (depends on T026, T030) - [x] T032 [US2] Define Lakehouse Monitoring in `resources/consumers/kpi_drift.yml`: `inference` profile on `gold_filing_kpis`, slicing on `company_name`, `fiscal_year`; baselines computed from first 10 filings (depends on T011) - [x] T033 [US2] Extend `resources/dashboards/usage.lvdash.yml` with widgets over `lakebase.query_logs`: top questions, daily active users, p95 latency, citation count distribution, ungrounded-answer rate (depends on T028, T017) @@ -116,7 +116,7 @@ This is a single-DAB Databricks project. 
SQL pipeline code at `pipelines/sql/`, - [x] T035 [US3] Implement `agent/supervisor.py`: detects company names via a small classifier or LLM call, fans out a per-company query through `analyst_agent`, pulls structured `gold_filing_kpis` rows via `tools.py`, formats a markdown table; returns `agent_path='supervisor'` in the response (depends on T024, T023; tests T034 must fail first) - [x] T036 [US3] Update `agent/analyst_agent.py` to detect cross-company intent at the routing layer and delegate to `supervisor.handle()`; otherwise stay in single-filing path (depends on T035) -- [x] T037 [US3] Re-run `agent/log_and_register.py` from CI (GH Actions deploy step in T003) to register a new UC model version with the supervisor enabled and re-assign alias `@dev`; the serving endpoint follows the alias so no yml edit is needed +- [x] T037 [US3] Re-run `agent/log_and_register.py` from CI (GH Actions deploy step in T003) to register a new UC model version with the supervisor enabled and re-assign alias `@demo`; the serving endpoint follows the alias so no yml edit is needed - [x] T038 [US3] Author 10 P3 questions in `evals/dataset.jsonl` (each with `expected_companies` and `expected_table_columns`) (depends on T030) - [x] T039 [US3] Extend `evals/clears_eval.py` to slice metrics by `category in {P2, P3}` and assert SC-002 ≥0.8 on P2, SC-003 ≥0.7 on P3 (depends on T031, T038) - [x] T040 [US3] Update `app/app.py` to render markdown tables (Streamlit `st.markdown(..., unsafe_allow_html=False)` already handles this) and surface a "show structured KPIs" expander next to each row (depends on T036) @@ -127,12 +127,12 @@ This is a single-DAB Databricks project. SQL pipeline code at `pipelines/sql/`, ## Phase 6: Polish & Cross-Cutting Concerns -- [ ] T041 [P] Run `databricks bundle validate -t dev` and resolve any schema warnings +- [ ] T041 [P] Run `databricks bundle validate -t demo` and resolve any schema warnings - [ ] T042 [P] Run `databricks bundle validate -t prod` (no deploy) to confirm prod target compiles - [ ] T043 Walk through `quickstart.md` end-to-end on a clean workspace; capture timing for SC-005 - [x] T044 [P] Add a Lakeview widget on `lakebase.query_logs` summarising "ungrounded answer rate by week" — content-gap signal per Reffy - [x] T045 [P] Document operating runbook in `docs/runbook.md`: how to add a sample filing, how to debug a low quality_score, how to roll an agent endpoint version, how to inspect CLEARS metrics in MLflow -- [ ] T046 Run `python evals/clears_eval.py` against the dev endpoint and store the MLflow run ID in `docs/runbook.md` as the v1 baseline +- [ ] T046 Run `python evals/clears_eval.py` against the demo endpoint and store the MLflow run ID in `docs/runbook.md` as the v1 baseline - [x] T047 [P] Add an SC-006 verification assertion in `evals/clears_eval.py`: query Vector Search for a known-rejected filename and assert zero hits (verifies "100% rubric exclusion") - [x] T048 [P] Add an SC-001 timing widget to `resources/dashboards/usage.lvdash.yml` over `gold_filing_kpis` joined to `bronze_filings.ingested_at`: P95 of `extracted_at - ingested_at` per company; alerts if > 10 minutes
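
Note: the first-run materialization step added in `scripts/bootstrap-demo.sh` assumes `jobs/index_refresh/sync_index.py` is create-if-missing / sync-if-exists. That script is not part of this diff, so the following is only a minimal sketch of the contract it has to satisfy. It assumes the `databricks-vectorsearch` client; the `summary` embedding-source column is carried over from T020's original index definition, and the shipped script may differ in details:

```python
# Sketch only, not the shipped jobs/index_refresh/sync_index.py. Flag names
# mirror the bootstrap invocation; the embedding column is an assumption.
import argparse

from databricks.vector_search.client import VectorSearchClient


def index_exists(client: VectorSearchClient, endpoint: str, name: str) -> bool:
    try:
        client.get_index(endpoint_name=endpoint, index_name=name).describe()
        return True
    except Exception:
        return False  # treat any describe failure as "not created yet"


def main() -> None:
    p = argparse.ArgumentParser(description="Create or sync a Delta-Sync VS index")
    p.add_argument("--endpoint", required=True)
    p.add_argument("--index", required=True)
    p.add_argument("--source-table", required=True)
    p.add_argument("--primary-key", default="section_uid")
    p.add_argument("--embedding-endpoint", default="databricks-bge-large-en")
    args = p.parse_args()

    client = VectorSearchClient()  # unified CLI/workspace auth
    if index_exists(client, args.endpoint, args.index):
        # Steady state: trigger an incremental sync of rows added since last run.
        client.get_index(endpoint_name=args.endpoint, index_name=args.index).sync()
    else:
        # First run: creating a Delta-Sync index also kicks off its initial
        # build, so no explicit sync() is needed here.
        client.create_delta_sync_index(
            endpoint_name=args.endpoint,
            index_name=args.index,
            source_table_name=args.source_table,
            pipeline_type="TRIGGERED",
            primary_key=args.primary_key,
            embedding_source_column="summary",
            embedding_model_endpoint_name=args.embedding_endpoint,
        )


if __name__ == "__main__":
    main()
```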
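Likewise, the polling contract that `scripts/wait_for_kpis.py` centralizes (per its docstring) reduces to a count query in a loop over the SQL Statement Execution API. A minimal sketch, assuming the `databricks-sdk` Python client rather than the script's actual implementation:

```python
# Sketch only, not the shipped scripts/wait_for_kpis.py.
import os
import sys
import time

from databricks.sdk import WorkspaceClient


def wait_for_rows(min_rows: int, timeout: int) -> None:
    w = WorkspaceClient()  # same unified CLI auth the bootstrap script uses
    table = (
        f"{os.environ['DOCINTEL_CATALOG']}."
        f"{os.environ['DOCINTEL_SCHEMA']}.gold_filing_kpis"
    )
    deadline = time.time() + timeout
    while time.time() < deadline:
        resp = w.statement_execution.execute_statement(
            statement=f"SELECT COUNT(*) FROM {table}",
            warehouse_id=os.environ["DOCINTEL_WAREHOUSE_ID"],
            wait_timeout="30s",
        )
        # Early polls may fail while the pipeline is still materializing the
        # gold tables; treat any non-result as "not ready yet" and retry.
        if resp.result and resp.result.data_array:
            count = int(resp.result.data_array[0][0])
            if count >= min_rows:
                print(f"{table} has {count} rows (>= {min_rows})")
                return
        time.sleep(15)
    sys.exit(f"timed out after {timeout}s waiting for {min_rows} rows in {table}")


if __name__ == "__main__":
    wait_for_rows(min_rows=1, timeout=600)
```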