diff --git a/.github/workflows/hub-client-e2e.yml b/.github/workflows/hub-client-e2e.yml index 279cedd63..24390c42d 100644 --- a/.github/workflows/hub-client-e2e.yml +++ b/.github/workflows/hub-client-e2e.yml @@ -1,21 +1,22 @@ name: Hub-Client E2E Tests on: - push: - branches: [main] - paths: - - 'hub-client/**' - - '.github/workflows/hub-client-e2e.yml' - pull_request: - paths: - - 'hub-client/**' - - '.github/workflows/hub-client-e2e.yml' workflow_dispatch: inputs: recreate-all-snapshots: description: 'Delete and recreate ALL visual regression baselines' type: boolean default: false + push: + branches: + - main + pull_request: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: e2e-tests: diff --git a/Cargo.lock b/Cargo.lock index 7fea400d0..d093c5378 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3384,6 +3384,7 @@ dependencies = [ "serde", "serde_json", "sha1 0.11.0", + "smallvec", "supports-hyperlinks", "tempfile", "tokio", @@ -3877,6 +3878,7 @@ dependencies = [ "rustc-hash", "serde", "serde_json", + "yaml-rust2", ] [[package]] @@ -3963,6 +3965,7 @@ dependencies = [ "serde_json", "serde_yaml", "sha2 0.11.0", + "smallvec", "tempfile", "thiserror 2.0.18", "time", @@ -4277,6 +4280,7 @@ version = "0.1.0" dependencies = [ "serde", "serde_json", + "smallvec", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 6e726d757..d5824d159 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ proc-macro2 = { version = "1.0.106", features = ["span-locations"] } schemars = "1.2.1" serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" +smallvec = { version = "1.13", features = ["serde"] } serde_yaml = "0.9" thiserror = "2.0" toml = "0.9.11" diff --git a/claude-notes/designs/incremental-writer-internals.md b/claude-notes/designs/incremental-writer-internals.md new file mode 100644 index 
000000000..1302722f6 --- /dev/null +++ b/claude-notes/designs/incremental-writer-internals.md @@ -0,0 +1,245 @@ +# Incremental writer internals — `CoarsenedEntry` and the self-contained contract + +**Status:** Active (contract pinned 2026-05-25 by the +`CoarsenedEntry::Rewrite` refactor). +**Types:** `pampa::pandoc::Block`, `quarto_source_map::SourceInfo`, +`quarto_ast_reconcile::ReconciliationPlan`. +**Reference impl:** +[`crates/pampa/src/writers/incremental.rs`](../../crates/pampa/src/writers/incremental.rs) +(`CoarsenedEntry`, `coarsen`, `coarsen_blocks`, `coarsen_keep_before_block`, +`assemble`, `emit_entries`). +**Plans:** +[Plan 7](../plans/2026-05-04-q2-preview-plan-7-incremental-writer.md) +(writer design) · +[Plan 7c](../plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md) +(Phase 8 — Transparent recursion in `RecurseIntoContainer`) · +[CoarsenedEntry self-contained refactor](../plans/2026-05-25-coarsened-entry-self-contained.md). +**Sibling docs:** +[Transparent wrappers](./transparent-wrappers.md) (the *traversal* +primitive — what the writer skips through) · +[Provenance contract](./provenance-contract.md) §7 (how atomic-kind +decisions flow into the writer's branches). + +## Purpose + +The incremental writer answers a single question: *given an +`(original_qmd, original_ast, new_ast, plan)` tuple, what qmd text +should we hand back to the user?* Its output round-trips through the +read pipeline to produce an AST that the next reconciliation matches +against — so the writer's bytes are the canonical persistence form +of an edit. + +It does this in two phases. **Coarsen** walks the reconciler's +hierarchical alignment plan and reduces it to a flat list of +`CoarsenedEntry` values — one per emitted block sequence. **Assemble** +walks that list, concatenates the bytes, and inserts separators. The +split lets the coarsen step be tested in isolation and lets a future +"minimal Monaco edit" consumer reuse the entry list without +re-running the diff. 
+ +This document pins the contract that holds the two phases together. + +## The `CoarsenedEntry` contract + +> Every variant of `CoarsenedEntry` must carry enough information +> to produce its emit bytes **without further context**. No +> index-into-an-ambient-slice deferral. No "look this up at emit +> time" handoffs. Each entry is self-describing. + +The five variants today: + +| Variant | Bytes come from | Self-contained because | +|---|---|---| +| `Verbatim` | `original_qmd[byte_range]` | `byte_range` is absolute. | +| `InlineSplice` | `block_text` field | Pre-computed at coarsen time. | +| `Rewrite` | `block_text` field | Pre-computed at coarsen time. | +| `Transparent` | concatenation of children | Children are themselves self-contained. | +| `Omit` | (nothing) | Emits nothing. | + +The two indices that *do* appear on entries — `Verbatim::orig_idx`, +`InlineSplice::orig_idx` — are not used for *byte content*. They're +hints to `compute_separator` for its "consecutive-in-original gap" +optimization, and they're always `Option`: `None` for children +inside a `Transparent` wrapper, where any index would be ambiguous +(top-level? child-level?). The bytes themselves never look up against +an ambient slice. + +`emit_entries` walks entries in order and concatenates. Its +`new_ast: &Pandoc` parameter is currently unused for byte production +in any variant — that's the post-condition of the contract. (We +leave the parameter in the signature for now; removing it is a +tidying follow-up flagged in the refactor plan.) + +## Why this matters + +The contract isn't decorative. Three reasons it exists: + +### 1. `Transparent` recursion composes only if children are self-contained + +A `Transparent` entry represents a synthesized wrapper whose own +bytes are empty (sectionize Div, footnotes container, appendix +container) but whose children carry real source preimage. The +writer "looks through" the wrapper by inlining the children into the +emit stream. 
+ +This composition requires that each child knows how to produce its +own bytes *without* depending on its position in some ambient slice. +A child carries `orig_idx: None` to opt out of the original-gap +optimization (its index is child-relative, not top-level). If the +same child also tried to defer its *bytes* to a "look up index N in +new_ast.blocks" handoff, the lookup would silently target the wrong +slice — `new_ast.blocks` is the top-level array, and child indices +don't index into it. + +That is exactly the bug that motivated this contract. Before +2026-05-25 the `Rewrite` variant carried `new_idx: usize`, which +worked at the top level (every entry corresponded one-to-one with +a top-level block; indices were unambiguous) but broke the moment +`Rewrite` could be produced inside a `Transparent` recursion. The +panic shape: *"index out of bounds: the len is 1 but the index is N"* +— top-level slice has one entry (the wrapper), child index N is +out of bounds. + +### 2. Minimal-edit diffing wants a self-contained intermediate form + +Today `incremental_write` returns a single full-document edit. A +future "produce minimal Monaco edits" consumer (Plan 7's deferred +follow-up) wants to walk the coarsened plan and emit *per-entry* +deltas — `Verbatim` entries are no-ops if the original gap matches; +`InlineSplice` and `Rewrite` are localized text replacements at +known source ranges. + +That walker needs every entry to expose its *intended text* (the +bytes that would land in the result) directly. If `Rewrite` deferred +to an emit-time lookup, the walker would have to re-thread `new_ast` +into a context it doesn't otherwise need. The self-contained shape +gives the walker exactly what it asks for — one record per emitted +block, fully self-describing. + +### 3. Behaviour is the *same*, the *timing* changes + +Pre-refactor, `write_block_to_string(&new_ast.blocks[new_idx])` ran +inside `emit_entries`. 
Post-refactor, the equivalent call runs at +the corresponding producer site in `coarsen_blocks` / +`coarsen_keep_before_block`. `write_block_to_string` is referentially +transparent — it depends only on its `Block` argument, has no global +state, no I/O, no clock reads. Moving the call earlier produces +byte-identical output and runs exactly once either way (Rewrite is +the catch-all path; we always emit it when produced). + +That matters because the change is "free of behaviour" — it's a +shape change, not a semantics change. A reader reviewing the diff +shouldn't need to worry that some downstream test will break in a +subtle way. + +## Anti-patterns + +Don't add a `CoarsenedEntry` variant that: + +- **Defers to a named slice.** "Index N into `new_ast.blocks`," + "child M of original block at index K," etc. The moment a future + refactor calls the producer in a different *context* (recursion, + reuse from a sibling crate, a test fixture), the index points at + the wrong slice and the failure is silent until the panic. +- **Depends on context not encoded in the variant itself.** If you + need "the prev sibling's bytes," "the wrapper's original + position," or similar context to make sense of an entry, pre-fold + the context into the entry's payload or restructure so it doesn't + need the context. +- **Requires specific timing of side effects.** `write_block_to_string` + is pure — calling it at coarsen vs emit time is observably + identical. If your variant only works when its bytes are computed + at one specific moment, that's a sign the entry shape is wrong. + +When in doubt, look at `InlineSplice`. It was the first variant to +carry pre-computed `block_text` (introduced when partial inline +rewrites made deferral impossible — the splice text doesn't +reconstruct from any single block) and is the structural blueprint +the rest of the variants should match. 
+ +## History + +`CoarsenedEntry` started life with two variants in commit +`eb81cbc5` (the original incremental-writer landing): `Verbatim` +carrying a `byte_range`, and `Rewrite` carrying a `new_idx: usize`. +The writer was top-level only — each entry corresponded one-to-one +with a top-level block, indices were unambiguous, and deferring +`write_block_to_string` to emit time saved a call when the entry +was never emitted (defensive, but the entry was always emitted in +practice). + +The asymmetry was introduced silently in `ab10f37b`, which added +`InlineSplice { block_text, orig_idx }` to support partial block +rewrites. Splice text mixes original bytes with newly-serialized +inlines and doesn't reconstruct from any single `Block` — so the +text was necessarily pre-computed at coarsen time. No one +refactored `Rewrite` to match; the two patterns coexisted. + +`9a473fe9` (Plan 7 phase 2+3a) added `Transparent` and `Omit`. +`Verbatim::orig_idx` and `InlineSplice::orig_idx` became `Option` +so children inside `Transparent` could opt out of the original-gap +optimization. The commit **explicitly flagged** the latent `Rewrite` +issue with a comment: *"result_idx is unused for child Rewrites +(a child Rewrite would need a different lookup mechanism; not +exercised by today's synthesizers)."* Accurate at the time — no +producer of child entries was emitting `Rewrite`. + +`bdcfdc53` (Plan 7c phase 8) added a Transparent-recursion path in +`coarsen_blocks` for the *changed-wrapper* case +(`RecurseIntoContainer` with a `block_container_plans` entry). For +the first time, `coarsen_blocks` ran on child slices, and a +`Rewrite` produced there carried a child-relative index. The "not +exercised" caveat from `9a473fe9` no longer held — the panic the +contract addresses became reachable. + +The 2026-05-25 refactor that motivated this doc lifted `Rewrite` +to `{ block_text: String }`, matching `InlineSplice`. All four +producer sites now pre-compute. 
The implementation cost is a moved +`write_block_to_string` call; the gain is the contract this doc +pins. + +The same session also closed a latent soft-drop gap that the panic +had been masking. The `BlockAlignment::UseAfter` arm now detects +*atomic-Generated with preimage* (the user edited inside a +shortcode-resolved block, the reconciler split the edit into a +deleted-original + new-block, but the new block still carries the +token's `Invocation` anchor) and emits `Verbatim` of the preimage +plus a `Q-3-43` warning, instead of the previous let-user-win +`Rewrite` (which would have written the resolved bytes — the edit +applied to *generated* content — back into the source qmd, poisoning +the user's source). The pattern: when an entry's *new* block looks +like an attempt to edit content the user can't actually edit, refuse +the edit at the writer regardless of what the reconciler's alignment +said. + +## Promotion path + +`CoarsenedEntry` is private to +`crates/pampa/src/writers/incremental.rs` today, with two internal +consumers: `assemble`'s `emit_entries` and the +`compute_edits_from_coarsened` helper (which currently calls +`assemble` internally). + +Promote the type (and its emission helpers) to a shared module the +moment a second crate wants to consume the coarsened plan. The +expected first non-pampa consumer is the minimal-edit-diffing +walker described above. Until then, the type stays here — premature +generalisation has its own debt, and the contract above is what +matters, not the import path. + +## Adding a new variant + +If you find yourself wanting a new `CoarsenedEntry` variant: + +1. Ask whether one of the existing five already serves. Most "I need + a new shape" instincts collapse into `Transparent` (for + wrappers) or `Rewrite` (for "anything else, re-serialize"). +2. If you genuinely need a new variant, design it self-contained + from the start. 
The variant's payload should be everything + `emit_entries` needs to produce its bytes; nothing more, nothing + deferred. +3. Update this doc's table and the variant's doc comment in the + `CoarsenedEntry` enum to describe the self-containment story. +4. Add at least one test that exercises the variant inside a + `Transparent` recursion. That's the canary that catches + composition bugs early. diff --git a/claude-notes/designs/provenance-contract.md b/claude-notes/designs/provenance-contract.md new file mode 100644 index 000000000..2f82f71a4 --- /dev/null +++ b/claude-notes/designs/provenance-contract.md @@ -0,0 +1,357 @@ +# Provenance contract — emitting `SourceInfo` from a transform + +**Status:** Active (Plan 6 landed 2026-05-22 on `feature/provenance`). +**Types:** `quarto_source_map::SourceInfo`, `By`, `Anchor`, `AnchorRole` +([`crates/quarto-source-map/src/source_info.rs`](../../crates/quarto-source-map/src/source_info.rs)). +**Plans:** +[Plan 4](../plans/2026-05-04-q2-preview-plan-4-sourceinfo-anchors.md) +(types) · +[Plan 5](../plans/2026-05-04-q2-preview-plan-5-wire-format.md) +(wire format) · +[Plan 6](../plans/2026-05-04-q2-preview-plan-6-provenance-audit.md) +(this audit) · +[Plan 7](../plans/2026-05-04-q2-preview-plan-7-incremental-writer.md) +(writer / consumer) · +[Plan 8](../plans/2026-05-04-q2-preview-plan-8-include-wrapper.md) +(include wrapper). +**Audit report:** [`claude-notes/research/2026-05-22-plan-6-audit.md`](../research/2026-05-22-plan-6-audit.md). + +## Summary + +Every `SourceInfo` a transform emits must accurately describe where the +node came from. The Plan 4 types give four physical shapes (`Original`, +`Substring`, `Concat`, `Generated`); this doc is the contract for which +shape to pick. The rule that follows replaces the historical "stamp +`SourceInfo::default()` and move on" pattern that Plan 6 audited out +of the transform layer. + +## 1. 
Decision tree for new transforms + +**Pick the shape from where the emitted node's *bytes* come from, not +from how it was constructed.** Four branches: + +| Source of the emitted node | Shape | +|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| Corresponds to source bytes | `Original` — `ctx.source_info.clone()`, or clone the input node's `source_info` field. Never construct an `Original` by hand. | +| Pure synthesis with no preimage | `Generated { by: By::<kind>(), from: smallvec![] }` | +| Resolution of a user-written construct | `Generated { by: By::<kind>(name), from: smallvec![Anchor::invocation(Arc::new(token_si))] }` | +| Constructed inside a user Lua filter | Leave it alone — `filter_source_info` ([`crates/pampa/src/lua/types.rs:1813`](../../crates/pampa/src/lua/types.rs)) auto-attaches the right shape on the way out. | + +If two branches feel equally applicable, pick the one with the longer +chain to source: the writer (Plan 7) and attribution +(`resolve_byte_range`) both prefer `Original` over `Generated{from:[]}` +and `Generated{from:[Invocation]}` over `Generated{from:[]}`. + +## 2. `By::` constructor catalog + +The known producer kinds, defined in +[`crates/quarto-source-map/src/source_info.rs`](../../crates/quarto-source-map/src/source_info.rs): + +| Constructor | Line | `kind` string | Purpose | Atomic? | +|------------------------------|------|---------------------------|-------------------------------------------------------------------------------|---------| +| `By::filter(path, line)` | 458 | `"filter"` | Typed Inline/Block constructed inside a user Lua filter (auto-attached). | yes | +| `By::sectionize()` | 470 | `"sectionize"` | `SectionizeTransform`'s synthesized section `Div`. | no | +| `By::user_edit()` | 479 | `"user-edit"` | React-constructed content reaching the AST through the q2-preview client. 
| no | +| `By::shortcode(name)` | 494 | `"shortcode"` | Result of resolving a `{{< name … >}}` token. **Requires an `Invocation`.** | yes | +| `By::include()` | 505 | `"include"` | `IncludeStage` expansion wrapper (Plan 8); most include children stay `Original`. | (Plan 8) | +| `By::title_block()` | 513 | `"title-block"` | Title-block stage's synthesized title `h1`. | yes | +| `By::footnotes()` | 521 | `"footnotes"` | Footnotes stage's container `Div` chrome. | no | +| `By::appendix()` | 529 | `"appendix"` | Appendix-structure stage's wrapper `Div` and its helpers. | no | +| `By::tree_sitter_postprocess()` | 538 | `"tree-sitter-postprocess"` | Parser-side synthetic Spaces (e.g. citation/suffix separator). | yes | +| `By::raw(kind, data)` | 552 | open | Escape hatch for extension-defined kinds. | no | + +**Extension namespacing.** Third-party transforms going through +`By::raw` must namespace their kind as `ext//` (e.g. +`ext/quarto-mermaid/diagram`). The `is_atomic_kind` set never matches +extension kinds — they are non-atomic by default in v1. + +## 3. Adding a new `By::` kind + +Worked example, using `bd-12vrr` (callout default-title synthesizer) +as the reference: + +1. **Constructor.** Add `pub fn callout() -> Self` to + [`crates/quarto-source-map/src/source_info.rs`](../../crates/quarto-source-map/src/source_info.rs) + alongside the existing constructors. Pick a kebab-case `kind` + string (`"callout"`); leave `data` as `Value::Null` unless the + kind carries per-instance configuration. +2. **Atomicity decision.** Decide whether the new kind belongs in + `is_atomic_kind` (line 570). Default: **no**. Yes only if the + round-trip rule is "treat the entire subtree as one + non-user-editable unit" (see §7). Document the decision in the + beads issue. +3. 
**Fix the site.** Replace the `SourceInfo::default()` at the + producer with + `SourceInfo::Generated { by: By::callout(), from: smallvec![] }` + (or with an `Invocation` anchor if the new kind resolves a + user-written construct). +4. **Test.** Add a per-transform shape test next to the existing tests + for that transform (e.g. + `test_create_callout_title_has_generated_provenance`), asserting + the produced shape directly. + +The shape test is the per-kind contract — if it fails, the producer +broke the rule. The audit-completion sweep (Plan 6) catches *missing* +provenance; per-transform tests catch *wrong* provenance. + +## 4. `from[]` vs. `by.data` + +**Source-info pointers go in `from[]` as typed `Anchor`s. Per-instance +configuration that is not a source pointer goes in `by.data` as +JSON.** The two are not interchangeable: + +```rust +SourceInfo::Generated { + by: By { + kind: "shortcode".to_string(), + data: serde_json::json!({ "name": "meta" }), // NOT a source pointer + }, + from: smallvec![ + Anchor::invocation(Arc::clone(&token_arc)), // source pointer — typed role + ], +} +``` + +The defined `AnchorRole`s are `Invocation`, `ValueSource`, and +`Other(String)`. New roles are added as enum variants, not as `by.data` +fields. The canonical migration example is **bd-36fr9** (Lua filter +file registration in `SourceContext`): once Lua files have a +`FileId`, the `filter_path`/`line` pair currently living in +`by.data` migrates to a typed `Dispatch` anchor in `from[]`, and +`by.data` for `filter`-kind nodes shrinks to per-kind config only. +Treat that as the worked example whenever you're tempted to put a +path-or-range pair in `by.data`. + +## 5. 
Enrichment-via-post-walk pattern + +**When you wrap a dispatch and want to layer your own context on top +of provenance the dispatch already attached, walk the result, append +your anchor, and promote `by.kind` — preserving prior `by.data` +fields, renaming where the new context demands.** This is the +canonical pattern for "transform A constructed via transform B." + +Reference implementation: +[`stamp_shortcode_anchors`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) ++ [`enrich_or_create`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) +at `crates/quarto-core/src/transforms/shortcode_resolve.rs:524` (entry +point) and `:774` (the promote/preserve helper). The relevant shape +of `enrich_or_create` is: + +```rust +let by = match existing { + SourceInfo::Generated { by, .. } if by.kind == "filter" => { + // promote filter -> shortcode, rename filter_path -> lua_path + let lua_path = by.data.get("filter_path").cloned(); + let lua_line = by.data.get("line").cloned(); + let mut data = serde_json::json!({ "name": name }); + if let Some(p) = lua_path { data["lua_path"] = p; } + if let Some(l) = lua_line { data["lua_line"] = l; } + By { kind: "shortcode".to_string(), data } + } + _ => By::shortcode(name), +}; +SourceInfo::Generated { + by, + from: smallvec![Anchor::invocation(Arc::clone(token_arc))], +} +``` + +Three rules to apply when copying the pattern: + +- **Append, don't replace.** New anchors join `from[]`; prior anchors + stay. +- **Promote, don't downgrade.** `by.kind` moves to a more specific + context (here: `filter` → `shortcode`). Going the other way drops + information. +- **Preserve prior `by.data`, renaming for context.** Filter dispatch + recorded `filter_path` / `line`; the shortcode context renames + them `lua_path` / `lua_line`. Nothing is discarded. 
+ +The post-walk must also recurse into nested AST so every node in the +returned subtree gets the anchor — model the walk on +[`stamp_inline`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) +(`:546`) and +[`stamp_block`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) +(`:612`) rather than the narrower walkers in `callout.rs` / +`theorem.rs` (block-only — they miss `Image.alt` / `Note.content`). + +## 6. `AttrSourceInfo` positional alignment + threaded-source pattern + +**`AttrSourceInfo.attributes[i]` is the `(key_src, val_src)` pair for +the i-th entry of the parallel `Attr.2` (`LinkedHashMap`) in +insertion order.** Two preexisting parser paths break this invariant +(**bd-3aolj** duplicate-key handling, **bd-1e6a5** caption-attr merge +into table); see +[`crates/quarto-pandoc-types/src/attr.rs:28`](../../crates/quarto-pandoc-types/src/attr.rs) +for the full doc comment. + +When a transform needs the value's source range — e.g. lifting an +attribute value into a typed Inline — thread `&div.attr_source` through +and index *before* mutating `attr.2`: + +```rust +let name_idx = kvs.keys().position(|k| k == "name")?; +// Empty attr_source signals "no provenance" (the common test pattern). +// Only assert on a populated-but-misaligned attr_source — that's the +// bd-3aolj / bd-1e6a5 failure mode worth catching in dev. +debug_assert!( + attr_source.attributes.is_empty() + || kvs.len() == attr_source.attributes.len(), + "AttrSourceInfo.attributes is out of sync with Attr.2 (bd-3aolj / bd-1e6a5)" +); +let value_source = if kvs.len() == attr_source.attributes.len() { + attr_source.attributes[name_idx].1.clone() +} else { + None +}; +let name = kvs.remove("name")?; +// ... use value_source.unwrap_or_default() as the new node's source_info. 
+``` + +Reference: +[`crates/quarto-core/src/transforms/theorem.rs:314`](../../crates/quarto-core/src/transforms/theorem.rs) +(`extract_name_attr`), with a parallel implementation in +[`crates/quarto-core/src/transforms/proof.rs:162`](../../crates/quarto-core/src/transforms/proof.rs). + +**The strict form is wrong.** `debug_assert_eq!(kvs.len(), +attr_source.attributes.len())` fires on the common +`AttrSourceInfo::empty()` test pattern (an `Attr` with non-empty `kvs` +constructed by hand without provenance) and panics every existing +theorem/proof test. The "empty OR equal" form is required so empty +provenance signals "unknown," not "bug." Future contributors will hit +this footgun if they copy the wrong form from a draft plan. + +## 7. Atomic-kind set and consumer impact + +**`is_atomic_kind()` controls how downstream consumers treat the +node, not whether the node carries an anchor.** Two consumers consult +it today: + +- **Plan 7's incremental writer.** Atomic nodes round-trip as + Verbatim-copy of the source token; direct edits to atomic content + trigger the soft-drop / Q-3-42 path. Non-atomic synthesized nodes + re-serialize from their AST contents. +- **Plan 2A's React framework gate.** The hub-client preview reads + `isAtomicSourceInfo` to gate which DOM regions are non-editable. + +New kinds default to **non-atomic** (the `is_atomic_kind` match arm +does not include extension kinds). Promote to atomic only when the +round-trip rule is "the entire subtree is one inseparable unit the +user can't edit in-place." See Plan 7 for the consumer behavior; +this contract does not duplicate it. 
+ +**Where the writer's internal shape is pinned:** +[`incremental-writer-internals.md`](./incremental-writer-internals.md) +documents the `CoarsenedEntry` contract — the rule that every +emitted entry must be self-contained, and how the atomic-kind +decision flows into the choice of `Verbatim` (atomic with preimage) +vs `Omit` (atomic without preimage) vs `Rewrite` (non-atomic +catch-all) vs `Transparent` (non-atomic wrapper with source-bearing +children) at coarsen time. + +## 8. Required-anchor invariants + +**`by.kind == "shortcode"` always carries at least one `Invocation` +anchor.** The producer (the stamper in §5) enforces this; Plan 7 +adds a consumer-side `debug_assert!` so an extension that calls +`By::raw("shortcode", …)` without the required anchor is caught. + +The pattern generalizes: when a new kind always has a source-side +preimage (e.g. a hypothetical `By::macro_expansion(name)`), declare +the invariant here, enforce it at the producer, and assert it at the +consumer. Kinds that *sometimes* have a preimage (sectionize wraps +existing content; the inner `Header` carries the original +`source_info`, but the wrapper `Div` doesn't) are not in this set — +they emit `from: smallvec![]` and don't require any anchor. + +**Sibling contract for these "no source token of its own" wrappers:** +see [`transparent-wrappers.md`](./transparent-wrappers.md). It names +the shape (Generated, no Invocation, block-container with +source-bearing children) and pins the *consumer* rule: any code +that asks "where do the user's source bytes live?" must descend +through transparent wrappers via `first_in_user_tree`, not read +`blocks[0]` directly. The producer side of that — what wrapper +kinds emit `from: []` — lives here in §2's catalog (`sectionize`, +`appendix`, footnotes container, …); the descent invariant lives +there. Adding a new `By::` kind that produces a block-container +wrapper should cross-reference both docs. + +## 9. Outliers — call-site threading vs. 
the stamper + +**Two shortcode-related sites bypass the stamper because they don't +flow through the dispatch funnel:** + +- [`make_error_inline`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) + (`:1352`) — `?key` Strong wrapping the unknown-shortcode message. +- [`shortcode_to_literal`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) + (`:1368`) — `{{}}` escaped-shortcode literal text. + +Both branches consume their `shortcode_owned.source_info` directly +and emit an `Original` (the user-visible bytes belong to the token, +not to a synthesized replacement). Plan 7's `is_atomic_kind()` does +not fire on `Original`, so error/escaped regions round-trip +verbatim-copy as plain user content. + +The pattern to recognize: **if the result variant is `Preserve` or +`Error` rather than `Inlines`/`Blocks`, the stamper does not run.** +Whenever you add a new `ShortcodeResult`-style enum variant that +short-circuits the dispatch funnel, thread the token's `source_info` +through the call sites and use it as the emitted node's +`source_info` — don't try to retrofit a `Generated{by:shortcode}` +shape onto content the user can edit directly. + +## 10. Do-not list + +- **Don't emit `SourceInfo::default()` for new synthesized nodes.** + Use the four-branch decision in §1. `default()` survives in the + Pandoc JSON reader ([`crates/pampa/src/readers/json.rs:80`](../../crates/pampa/src/readers/json.rs)) + by design (the source bytes genuinely don't exist there) and in + test scaffolding; everywhere else it's a bug. +- **Don't put source-info pointers in `by.data`.** Add an + `AnchorRole` variant and a typed `Anchor` in `from[]` instead. See + §4 and the bd-36fr9 migration. +- **Don't drop existing `by.data` when enriching.** Promote / + migrate. See §5. 
+- **Don't introduce a `CustomNode` wrapper for provenance alone.** + The 2026-05-20 design discussion settled on `Generated` with + typed anchors instead of `CustomNode("ShortcodeResolution")`-style + wrappers because the anchor carries the structural information + cheaply without forcing a new HTML-pipeline resolve transform, a + React component, and a `qmd` writer arm. Wrappers remain + appropriate for the include case (Plan 8) because the cross-file + `FileId` problem genuinely needs anchoring at the parent-file + level. Do not re-litigate. +- **Don't add a `test` arm to a `wasm32` cfg guard** when introducing + new provenance code paths. See + [`.claude/rules/wasm.md`](../../.claude/rules/wasm.md) — the + `#[cfg(any(target_arch = "wasm32", test))]` pattern is prohibited + because it forces native tests through the WASM-restricted Lua + stdlib and fails on Windows. + +## Follow-ups (named, not designed here) + +- **bd-129m3** — `ValueSource` anchor stamping for `meta` / `var` + shortcodes once the metadata loader threads per-key source-info + through. Integration point is `enrich_or_create` (§5). +- **bd-36fr9** — `Dispatch` anchor for Lua-handler filter / shortcode + source location, once Lua files are registered in `SourceContext`. + Migration example for §4. +- **bd-12vrr** — Callout default-title synthesizer needs a `By::callout()` + constructor + atomicity decision. The §3 worked example. +- **bd-1inj0** — Code-block decoration synthesizers + (`code_block_generate` / `code_block_render`) — another small audit + pass to bring into this contract. +- **bd-3aolj** / **bd-1e6a5** — Parser-side `AttrSourceInfo` / + `Attr.2` alignment bugs that the §6 guard works around. + +## Change log + +- **2026-05-25 — v1.** Initial version, written after Plan 6 landed + on `feature/provenance` (2026-05-22). 
Documents the conventions + that survived implementation: + four-branch decision tree, `By::` catalog, enrichment pattern, + `AttrSourceInfo` threading recipe (with the relaxed + `debug_assert!` form), atomic-kind / required-anchor invariants, + outlier call-site threading, and a do-not list. Plan-6 audit + report lives separately at + [`claude-notes/research/2026-05-22-plan-6-audit.md`](../research/2026-05-22-plan-6-audit.md). diff --git a/claude-notes/designs/transparent-wrappers.md b/claude-notes/designs/transparent-wrappers.md new file mode 100644 index 000000000..8b25f386f --- /dev/null +++ b/claude-notes/designs/transparent-wrappers.md @@ -0,0 +1,218 @@ +# Transparent wrappers — descending past synthesized block containers + +**Status:** Active (introduced 2026-05-25 alongside Plan 7c Phase 8). +**Types:** `pampa::pandoc::Block`, `quarto_source_map::SourceInfo`. +**Reference impl:** +[`crates/pampa/src/writers/incremental.rs`](../../crates/pampa/src/writers/incremental.rs) +(`first_in_user_tree`, `is_transparent_wrapper`, +`derive_target_file_id`, `first_target_anchored_start_in`). +**Plans:** +[Plan 7](../plans/2026-05-04-q2-preview-plan-7-incremental-writer.md) +(writer) · +[Plan 7c](../plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md) +(Phase 8 — target_file_id descent) · +[Plan 8](../plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md) +(IncludeExpansion — *not* a transparent wrapper) · +[Plan 9](../plans/2026-05-22-provenance-plan-9-valuesource-threading.md) +(`title_source_info`) · +[Plan 10](../plans/2026-05-22-provenance-plan-10-dispatch-anchor.md) +(Lua-emitted wrappers). + +## Summary + +The post-render AST that q2-preview hands the React iframe is **not +flat.** The render pipeline wraps the user's blocks in synthesized +containers — most notably a single top-level `Div` from +`SectionizeTransform` — that group content by heading level for +sidebar / cross-reference / outline construction. 
These wrappers +carry `SourceInfo::Generated` with no `Invocation` anchor: they're +structurally part of the AST but have **no source bytes of their own** +in the user's qmd. + +A *transparent wrapper* is the name for this shape. Code that asks +"where do the user's source bytes live?" must descend through +transparent wrappers, not read `blocks[0]` directly. + +Three writer bugs landed on this rake before the pattern was named +(commits `bdcfdc53`, `b9f64b56`, `2bf92664`): the writer +soft-dropped the wrapper instead of recursing, derived the wrong +file id, and silently deleted the YAML frontmatter. All three were +the same mistake — `blocks[0]` is not necessarily a real user +block. + +## Definition + +A `Block` is a *transparent wrapper* with respect to a +`target_file_id` when **all three** hold: + +1. Its `SourceInfo` is `Generated` with no `Invocation` anchor. + It has no source token of its own; its bytes are synthesized. +2. It is recognised by `block_block_children` (i.e. it's a `Div`, + `BlockQuote`, `Figure`, or `NoteDefinitionFencedBlock` — the + block-container kinds today's synthesizers emit). +3. At least one descendant has real + `preimage_in(target_file_id).is_some()` — there's actual user + content under it. + +Condition (3) is what makes the predicate *structural* rather than +opt-in: a Lua filter that wraps existing user paragraphs in a +`Div.callout` produces a Generated Div whose children still carry +their original preimage → it's transparent → the visual editor sees +through it → user edits inside the wrapped content round-trip +cleanly. A filter that constructs a fresh Div from metadata has no +source-bearing children → it's atomic → editor treats it as a unit. +The filter author doesn't have to declare anything; the AST shape +declares it for them. + +## Known transparent wrappers today + +Produced by `pampa::pandoc::sugar::SectionizeTransform` and friends: + +- **sectionize** Div — groups blocks by heading depth (`By::sectionize()`). 
+- **footnotes-container** Div — collects all footnote definitions. +- **appendix-container** Div — collects appendix-tagged content. + +Plus, by structural construction, any Lua-emitted block-container +that meets the three conditions above (Plan 10). + +**Not** transparent wrappers: + +- `IncludeExpansion` CustomNode (Plan 8) — its `SourceInfo` is + `Original`, anchored to the include-token bytes in the parent qmd. + Descent stops at it; that's correct behaviour. +- Atomic CustomNodes like `CrossrefResolvedRef` — `SourceInfo` + is `Original` pointing at the `@ref` token. +- The synthesized title-block Header (`By::title_block()`) — + `is_atomic_kind` is `true` for `title-block`. Editor treats the + resolved title as read-only; the user's source-side knob is the + YAML `title:` key. (Not block-container shape either.) + +## Sibling primitive on the emission side + +`first_in_user_tree` (below) is the *traversal* primitive — how a +caller descends past transparent wrappers when looking up source +positions. The *emission* primitive is `CoarsenedEntry::Transparent` +in the incremental writer: same wrapper shape, but the question is +"how do I emit bytes through this wrapper?" rather than "where do +the user's source bytes live?" + +Both rely on the same descent rule (skip the wrapper, look at the +children) and the same invariant (a `Generated` block-container +with no Invocation anchor and source-bearing children is +transparent). They diverge in what they do with the descent: +traversal stops at the first match; emission walks all children +and concatenates their bytes. + +See [`incremental-writer-internals.md`](./incremental-writer-internals.md) +for the writer-side contract — in particular the rule that every +`CoarsenedEntry` variant must be self-contained, which is what +makes child entries safe to inline through a `Transparent`. 
+ +## Reference primitive: `first_in_user_tree` + +```rust +fn first_in_user_tree<T>( + blocks: &[Block], + extract: &impl Fn(&Block) -> Option<T>, +) -> Option<T> +``` + +Walks `blocks` depth-first. On each block, applies `extract`; if +`Some`, returns it. If `None`, descends through +`block_block_children` and tries again. This is how we see through +transparent wrappers — a wrapper has no source position of its own +(extract returns `None` for it), so the walker looks inside. + +The two consumers today are one-liners: + +```rust +fn derive_target_file_id(blocks: &[Block]) -> FileId { + first_in_user_tree(blocks, &|b| b.source_info().root_file_id()) + .unwrap_or(FileId(0)) +} + +fn first_target_anchored_start_in(blocks: &[Block], target: FileId) -> Option { + first_in_user_tree(blocks, &|b| { + b.source_info().preimage_in(target).map(|r| r.start) + }) +} +``` + +A `visit_user_blocks(blocks, &mut visit)` sibling (visiting all user +blocks in document order, transparent wrappers skipped) is the +natural extension for callers that need every block, not just the +first; add it the moment a second caller wants it. + +## When to use which + +| Need | Tool | +|---|---| +| Find the first block where some property holds | `first_in_user_tree` | +| Visit all user blocks in document order | (add `visit_user_blocks` when needed) | +| Ask "is *this specific block* a transparent wrapper?" | `is_transparent_wrapper` | +| Get the document's editing-file id | `derive_target_file_id` | +| Find where the YAML frontmatter region ends | `first_target_anchored_start_in` | + +`is_transparent_wrapper` is intentionally a small predicate — used +when a caller needs to make an *explicit* decision (e.g. a future +Q-3-44 diagnostic that hints "your filter walked into a sectionize +wrapper; you probably meant to walk its children"). Routine +source-position lookups should use the walkers, not the predicate. 
+ +## Where the code lives, and when to promote it + +The primitives live in +`crates/pampa/src/writers/incremental.rs` next to +`block_block_children`. That's the right home today — the writer +is the only consumer. + +Promote to `quarto-pandoc-types` (or a new +`quarto-pandoc-types::traversal` module) **the moment a second +crate needs them.** Plan 9's `DocumentProfile` extractor (when it +gains a "first H1" fallback), Plan 10's filter-output classifier, +and the project-replay engine's cell walker are the candidates. +Don't promote pre-emptively — premature generalisation has its own +debt. + +## Adding a new synthesizer + +If you're writing a new transform that wraps user content in a Div +(or other block container): + +1. Emit `SourceInfo::generated(By::<your_kind>())` on the wrapper. + No `Invocation` anchor (because there's no source token). +2. Preserve the children's existing source_info — don't restamp + them with the wrapper's `By`. The whole point is that the + children stay editable. +3. Your wrapper is automatically transparent; nothing else to do. +4. If your `By::<your_kind>()` should *also* be considered + `is_atomic_kind()` (the resolved children are read-only, like + shortcode resolutions), add it to the atomic-kind set in + `crates/quarto-source-map/src/source_info.rs` — separate + concept, separate decision. + +## Anti-patterns + +- `ast.blocks[0]` for source-position questions (file id, start + offset, "the first user block"). Use `first_in_user_tree`. +- `ast.blocks.iter()` flatly for "every user block" enumeration + when the document might be wrapped. Use a descending visitor. +- Declaring a transparent wrapper via a `By::kind` registry. The + predicate is structural; don't add an opt-in mechanism that the + shape already encodes. +- Asking "is this Generated and atomic-kind?" when what you mean + is "should I descend?" — `is_atomic_kind` and transparency are + orthogonal. 
Shortcode resolutions are atomic *and* have + Invocation anchors (descent is meaningful but the resolved + content is read-only). Sectionize Divs are *neither* atomic + *nor* invocation-anchored. Mixing the two predicates produces + subtle bugs. + +## History + +| Date | Commit | What | +|---|---|---| +| 2026-05-25 | `bdcfdc53` | `coarsen` recurses Transparent into non-atomic Generated wrappers (the first bug — empty qmd) | +| 2026-05-25 | `b9f64b56` | `derive_target_file_id` descends; Plan 7c Phase 8 closed | +| 2026-05-25 | `2bf92664` | `emit_metadata_prefix` descends; YAML frontmatter preserved | +| 2026-05-25 | (this doc) | Pattern named, primitives centralized | diff --git a/claude-notes/instructions/idempotence-contract.md b/claude-notes/instructions/idempotence-contract.md new file mode 100644 index 000000000..b2f69f002 --- /dev/null +++ b/claude-notes/instructions/idempotence-contract.md @@ -0,0 +1,149 @@ +# The q2-preview idempotence contract + +A note for transform / filter authors. Read this before adding a new +Rust transform to `build_q2_preview_transform_pipeline`, a new stage +to `build_q2_preview_pipeline_stages`, or a new built-in Lua filter +under `resources/extensions/`. + +The contract is enforced by the CI gate at +`crates/quarto-core/tests/idempotence.rs`, which is the Phase-3 +deliverable of the provenance epic. The full design lives in +`claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md`. + +## What the contract says + +Running the q2-preview pipeline twice on the same input must produce +the same structural AST: identical `blocks` hash and identical `meta` +hash with `meta.rendered.*` excluded. + +"Same input" means the same byte sequences for the same file layout — +but **not** necessarily the same absolute paths. Each idempotence +fixture runs both pipeline invocations inside a fresh `TempDir`, so +the project root differs across runs while the content is identical. 
+A transform that captures the absolute project root into the AST will +fail the gate. + +## What the hash includes and excludes + +Defined by `compute_blocks_hash_fresh` / +`compute_meta_hash_fresh_excluding_rendered` in +`crates/quarto-ast-reconcile/src/hash.rs`. + +Included: + +- All block / inline structure (type, text, attributes, children). +- All meta tree structure: scalars by `Yaml` payload; `Map` entries + in **insertion order** (no sort); `Array` entries in order; + `merge_op` on every `ConfigValue`. +- `PandocInlines` / `PandocBlocks` payloads inside meta values, + recursed via the existing block/inline hashers. + +Excluded: + +- `SourceInfo` on every block, inline, and `ConfigValue`. +- `key_source` on every `ConfigMapEntry`. +- Top-level `meta.rendered.*` — chrome transforms, `IncludeResolveStage`, + the favicon transform, and Bootstrap/clipboard injection populate + HTML/text strings under `rendered.*` that may legitimately vary in + trivial whitespace or attribute ordering; HTML-shape canonicalization + is a different concern. + +Source-info is excluded by design so Plan 4's source-info churn +doesn't break the contract. + +## What this means in practice + +A new transform / stage / filter must: + +### 1. Not depend on undefined-iteration-order state + +If you populate a `Map` value in `meta` from a `HashMap`, the +iteration order is undefined and two runs will produce different +hashes. The gate uses insertion-order map hashing precisely to catch +this — sorting would silently mask it. + +Use `Vec<(key, value)>`, `BTreeMap`, or `LinkedHashMap` and append +in a deterministic order. + +### 2. Not capture process-local state into the AST + +No timestamps, no PIDs, no random IDs, no absolute paths derived +from the project root, no `temp_dir()` output. If you need to refer +to a file, emit a path relative to the project root. + +Source-info is the only legitimate place absolute paths live, and +the hash excludes source-info by design. + +### 3. 
Use fresh Lua state per pipeline run (Lua filters / shortcodes) + +The shortcode resolver and per-filter Lua engine are constructed +fresh inside their respective transforms; do not stash global state +on `_G` and expect it to survive between runs. If you need a cache, +key it by the *filter* identity, not the *pipeline run* — and clear +it on `Lua` construction. + +### 4. Not execute engine cells + +CI doesn't run Jupyter / Knitr. Fixtures use only fenced code blocks +(`` ```python `` etc.) — AST nodes, not executed. If your transform's +behavior is conditional on engine-execution side effects, the gate +cannot exercise it. + +## Adding a fixture when you add a new transform + +Every new transform / filter must come with at least one fixture +that exercises its happy path. Add it to +`crates/quarto-core/tests/idempotence.rs`: + +- Trivial single-page fixture: use the `doc_fixture(name, content)` + helper. Writes `index.qmd` to a fresh `TempDir` and runs both + `DriveMode::SingleFile` and `DriveMode::ProjectOrchestrator`. +- Multi-file fixture (sibling files, includes, image resources): + write an inline `setup` closure that writes everything into the + fresh `TempDir`. Same dual-mode run. +- Website-chrome / link / listing fixture: use + `modes: ORCHESTRATOR_ONLY`. Chrome transforms need a populated + `ProjectIndex`, which only the orchestrator pass-1 builds. +- Attribution exercise: set `attribution_json: Some(...)` with a + deterministic transport-shape JSON; `PreBuiltAttributionProvider` + is installed on the `RenderContext` automatically. Do not use + `GitBlameProvider` here — it depends on actual git history. + +See `crates/quarto-core/tests/fixtures/idempotence/README.md` for +the per-fixture rules (no engine cells, no absolute paths, mode +mapping). + +## If your new fixture fails on first run + +Two possibilities: + +1. 
**Your transform really is non-deterministic.** Trace the + `DivergencePoint` the panic message hands you (block index, or + meta key path) and fix the underlying state — usually a + `HashMap` iteration, a `SystemTime::now()`, or an absolute path + stuffed somewhere it shouldn't be. + +2. **The hasher is wrong.** Vanishingly unlikely with FxHasher, + but if you've ruled out (1), file a bug against + `quarto-ast-reconcile`. + +Per the plan's long-lived-integration-branch policy, **do not +`#[ignore]` the failing test** without explicit user approval. +Failing fixtures are the triage backlog; the integration branch +(`feature/provenance`) is allowed to be red while the queue is +drained. + +## Related + +- `claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md` — + the plan that introduced this gate, with the design rationale. +- `claude-notes/plans/2026-05-04-q2-preview-plan-7a-user-filter-idempotence.md` — + the runtime counterpart: per-user-filter idempotence detection at + render time, with `idempotent: false` opt-out. The contract this + file describes is the CI-time half for built-ins; Plan 7a is the + runtime half for user filters. +- `crates/quarto-ast-reconcile/src/hash.rs` — the hash implementations + and unit tests. +- `crates/quarto-core/tests/idempotence.rs` — the gate. +- `crates/quarto-core/tests/fixtures/idempotence/README.md` — the + fixture-format rules. 
diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md b/claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md new file mode 100644 index 000000000..5aa4bfe72 --- /dev/null +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md @@ -0,0 +1,1276 @@ +# Plan 3 — Built-in transform and filter idempotence verification (CI-time) + +**Date:** 2026-05-04 (revised 2026-05-21) +**Branch:** feature/provenance (long-lived integration branch — see +§"Phase 5 — Failure triage" and §"Long-lived branch policy" below) +**Status:** Development plan (work items below) +**Milestone:** M2 verification gate (no new milestone — locks in property +on what's already shipped) + +## Long-lived branch policy + +`feature/provenance` is **not** intended to merge to `main` while any +fixture in this plan is red. The integration branch is the *home* of +the failing-test queue; each red fixture has a beads issue, and the +queue is drained before merge. This is by design — the plan ships a +verification gate, and the gate has to be allowed to be red while it +discovers what's actually non-deterministic in the pipeline today. +See §"Phase 5 — Failure triage" for the operational rules, and +§"CI failure policy & sub-agent prompt template" (under §"Decisions") +for the rationale. + +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 3 is the +verification-gate piece: it locks in the idempotence + structural-hash- +stability contract the rest of the epic (typed provenance, incremental +writer, soft-drop) rests on. The file name keeps its q2-preview-plan-N +form for continuity with the earlier discussion notes. + +## Goal + +Verify and lock in the **idempotence + structural-hash-stability** +contract for the q2-preview pipeline. 
Every Rust transform in the +q2-preview transform list **and** every built-in Lua filter shipped +under `resources/extensions/` must produce the same structural AST when +run twice on the same input. Without this, the incremental writer's +reconciliation (Plan 7) cannot reliably preserve untouched regions. + +This plan ships: + +- A canonical fixture set covering each transform and built-in Lua + filter in scope. +- A test that runs each fixture through the q2-preview pipeline twice + and asserts the resulting `blocks` and `meta` (excluding + `rendered.*`) hash equal. +- A `compute_meta_hash_fresh` helper in `quarto-ast-reconcile` + parallel to the existing `compute_blocks_hash_fresh`. +- Documentation of the idempotence contract for future transform/filter + authors. + +When this plan lands on `main` (after Phase 5's failure queue is +drained), the q2-preview round-trip story (Plans 4-8) rests on a +**CI-enforced** stable foundation: every push to `main` runs the +idempotence suite and fails the build on regression. *Until* the +plan lands on `main`, the integration branch +(`feature/provenance`) carries the suite in a possibly-red state +as the queue of discovered non-determinism issues is worked +through — that's the design, not a process gap. See §"Long-lived +branch policy" and §"Phase 5 — Failure triage." + +## Scope + +### What "built-in" covers — the universe under test + +Two distinct classes, both shipped with Quarto and both in scope: + +**Rust transforms** — the source of truth is +`build_q2_preview_transform_pipeline` in +`crates/quarto-core/src/pipeline.rs:1237`, which is +`build_transform_pipeline` minus the four names in +`Q2_PREVIEW_TRANSFORM_EXCLUDED` (`pipeline.rs:1198`). 
As of this +revision, the q2-preview pipeline runs **37 transforms** across four +phases: + +- **Normalization**: callout, shortcode-resolve, metadata-normalize, + code-block-generate, website-title-prefix, website-favicon, + website-bootstrap-icons, website-canonical-url, sectionize, + footnotes, theorem-sugar, proof-sugar, float-ref-target-sugar, + equation-label. +- **Crossref**: crossref-index, crossref-resolve. +- **Navigation**: toc-generate, navbar-generate, sidebar-generate, + page-nav-generate, footer-generate, listing-generate, listing-render, + categories-sidebar, listing-feed-stage (native only), + listing-feed-link, toc-render, navbar-render, sidebar-render, + page-nav-render, footer-render. +- **Finalization**: link-rewrite, appendix-structure, code-block-render, + resource-collector, table-bootstrap-class, attribution-render. + +Excluded by `Q2_PREVIEW_TRANSFORM_EXCLUDED` (out of scope for Plan 3 +because they don't run): callout-resolve, attribution-viewer, +title-block, crossref-render. + +**Stage-level work** in `build_q2_preview_pipeline_stages` +(`pipeline.rs:380`) also runs around `AstTransformsStage` and can +introduce non-determinism: parse-document, metadata-merge, +include-expansion, include-resolve, listing-item-info, document-profile, +link-resolution, unwrap-profile, pre-engine-sugaring, capture-splice, +engine-execution, compile-theme-css, attribution-generate, +user-filters-pre/post, resource-report, code-highlight. These are +exercised implicitly by every fixture (most are no-ops absent specific +metadata). + +`Q2_PREVIEW_STAGE_EXCLUDED` (`pipeline.rs:356`) currently excludes +three stages by name: `math-js`, `render-html-body`, and +`apply-template`. `MathJsStage`'s exclusion means `meta.math` never +appears under this pipeline and contributes nothing to the meta +hash; `RenderHtmlBodyStage` and `ApplyTemplateStage` produce +HTML/text side outputs that wouldn't reach the AST anyway, so their +exclusion is also AST-neutral. 
`BootstrapJsStage` and +`ClipboardJsStage` are *not* excluded — they run on native q2-preview +but write only to `ctx.artifacts`, not to `doc.ast.meta` or +`doc.ast.blocks`, so they don't affect the hash. (Whether they +should be in `Q2_PREVIEW_STAGE_EXCLUDED` at all is a separate +question, filed as **bd-2ag1c** — see §"Open questions for +implementation" for ordering relative to Plan 3.) + +**Lua filters under `resources/extensions/`** — there is exactly **one** +today: `resources/extensions/quarto/video/video-filter.lua`. It rewrites +Header attributes when `background-video` is set on a slide-shaped +header. (The other Lua files in `resources/extensions/` — kbd, video, +lipsum, version, placeholder — are *shortcodes*, not filters, and run +through `shortcode-resolve` rather than `UserFiltersStage`. They're +exercised via shortcode fixtures.) + +### In scope + +- **Canonical fixture set**: small `.qmd` files exercising each + transform / filter in the universe above. Existing fixtures + new + ones from the gap audit below. Detailed listing in §"Coverage gaps to + address during implementation." + +- **`compute_meta_hash_fresh` helper** in + `crates/quarto-ast-reconcile/src/hash.rs`. Walks `ConfigValue` + source-info-agnostically: + - hashes scalars by their `Yaml` payload; + - recurses into `PandocInlines` / `PandocBlocks` via the existing + inline / block hashers; + - hashes `Array` entries in order (matches `Vec` shape); + - hashes `Map` entries as `(key_string, recurse(value))` pairs **in + insertion order — no sort**. Insertion-order hashing is the right + choice for an idempotence test: it catches HashMap-iteration-order + bugs in transforms that stuff results into a meta `Map`. Sorting + would silently mask exactly the class of non-determinism we want to + detect. 
`ConfigValue::Map` is already a `Vec<ConfigMapEntry>` that + preserves YAML document order, so hashing insertion order is also + the simplest implementation; + - **includes `merge_op`** in the hash (every `ConfigValue` has + `value: ConfigValueKind`, `source_info: SourceInfo`, and + `merge_op: MergeOp` — `merge_op` participates so we catch + transforms that change merge semantics non-deterministically). + `MergeOp::default()` is `Concat` + (`crates/quarto-pandoc-types/src/config_value.rs:75`, derived + `#[default]`) — a stable compile-time constant with no env or + runtime dependence, so transforms that leave `merge_op` at its + default contribute a deterministic value to the hash; + - skips `source_info` and `key_source` (Plan 4's churn must not break + the contract). + + Tests for the helper land alongside it (mirroring the existing + `test_same_content_same_hash` style at `hash.rs:767`). Include a test + proving the helper diverges when `Map` insertion order changes — this + is the regression guard for the no-sort choice. + +- **Idempotence test runner**: takes a fixture, runs the q2-preview + pipeline twice per `DriveMode` (see §"What gets tested + concretely"), hashes `doc.ast.blocks` via `compute_blocks_hash_fresh` + and `doc.ast.meta` via `compute_meta_hash_fresh_excluding_rendered` + (everything under `rendered.*` is HTML/text side output — see §"Out + of scope"). Asserts hash equality across the two runs *within a + mode*. One assertion per (fixture, mode) pair; failures name the + fixture, the mode, and which hash diverged. + +- **Divergence-localization helper** in + `crates/quarto-ast-reconcile/src/hash.rs`, alongside the hash fns. + When the (blocks, meta) hashes diverge, the test driver calls + `find_first_divergence(&doc_1, &doc_2) -> DivergencePoint` to + surface a useful location in the failure message. 
Returns one of: + - `DivergencePoint::Block { index, hash_a, hash_b }` — first block + index whose `compute_block_hash_fresh` differs; + - `DivergencePoint::MetaKey { path, hash_a, hash_b }` — first meta + key path (e.g. `["listings", "foo", "items"]`) whose recursive + hash differs, walking the `ConfigValue` tree in insertion order + and excluding `rendered.*`; + - `DivergencePoint::None` — hashes equal at top but a sub-component + differs (would indicate a bug in the hasher itself; vanishingly + unlikely with FxHasher). + + The test driver embeds the returned `DivergencePoint` in the panic + message, so the sub-agent investigation prompt arrives with a + concrete starting point ("block index 7" / "meta.listings.foo + diverged") rather than just "hash diverged." Saves agent triage + time and makes the sub-agent prompt template (§"Open questions for + implementation") fillable from the panic message alone. + +- **Documentation** in `claude-notes/instructions/`: a short note on the + idempotence contract for transform and filter authors, including the + meta-hash-excludes-`rendered.*` rule and how to add a fixture when + introducing a new transform. + +### Out of scope + +- **Round-trip non-idempotence** + (`pipeline(write(pipeline(x))) ≠ pipeline(x)`). Plan 7a's runtime + check handles this. Plan 3 deliberately tests only pipeline + non-determinism — see §"Pipeline-determinism only" below. +- **User-supplied filters**. Per-document, per-user; Plan 7a covers + these at runtime with an `idempotent: false` opt-out. +- **Rust-vs-React rendering parity**. Different contract; later plan. +- **Performance / debouncing**. Idempotence verification doesn't + measure runtime. +- **Engine execution non-determinism**. CI doesn't run jupyter / knitr; + fixtures must contain only fenced code blocks (AST-level), not + executable code cells. 
The `engine-execution` stage is a no-op on + fixtures with no engine cells; the `capture-splice` stage is a + pass-through when no capture is supplied. See §"No executable engine + cells" below. +- **Chrome HTML-string canonicalization**. Meta hash skips + `rendered.*` because those are HTML strings populated by + navbar-render / sidebar-render / etc.; semantically-equal but + textually-different HTML would fail a strict comparison. Structural + non-determinism in chrome transforms shows up elsewhere (e.g., a + navbar transform that emits attributes in non-canonical order + inside its HTML still produces a stable hash *of the meta key + containing the HTML* across runs, because both runs go through + the same code path — what we're missing is HTML-shape determinism, + which is a separate concern best tested with HTML snapshots). +- **`meta.rendered.includes.*` HTML/text strings**. Written by + `IncludeResolveStage` (user-supplied `include-in-header` / + `before-body` / `after-body` files), `WebsiteFaviconTransform` + (favicon `<link>`), `attribution_viewer` (CLI-only — q2-preview + excludes it), and Bootstrap/clipboard injection on the HTML path. + These all sit under `rendered.*` and are skipped by + `compute_meta_hash_fresh_excluding_rendered`. If we ever want to + cover the includes subtree separately (catch a transform that + shuffles include-file ordering, say), the right shape is a separate + helper, not a partial inclusion of the rendered subtree. + +### No executable engine cells + +CI does not execute engine cells. Fixtures must: + +- Use only fenced code blocks (`` ```python ``, ` ```r `, etc.) — AST + nodes, not executed. +- NOT use `{python}` / `{r}` / `{julia}` style executable cells. + +If a fixture happens to include an executable cell, the +`engine-execution` stage will either fail (no kernel available) or +fall through to the markdown passthrough. Either way the test is +unreliable. The fixture-format documentation enforces this. 
+ +## Pipeline-determinism only — round-trip is Plan 7a's job + +Two distinct properties get loosely called "non-idempotence": + +1. **Pipeline non-determinism**: `pipeline(x)` produces different + output on repeat calls. Caused by time / RNG / mutable global state + / undefined-order iteration. **This is what Plan 3 tests.** + +2. **Round-trip non-idempotence**: + `pipeline(write(pipeline(x))) ≠ pipeline(x)`. The pipeline doesn't + re-parse its own output today; this becomes a concern only when + Plan 7's incremental writer lands. Plan 7a covers (2) at runtime + for **user-supplied** Lua filters, with per-filter attribution and + an `idempotent: false` opt-out. **Built-in** filter round-trip is + not covered by any plan in the epic (see Plan 7a's §"Notes" for + the accepted-gap reasoning). + +Plan 3 deliberately scopes to (1) because: + +- (2) isn't exercised by today's pipeline. +- (2)'s test conflates writer-lossiness with filter-non-idempotence; + Plan 7's writer-lossless baseline test (planned for Plan 7's first + commit) and Plan 7a's per-filter isolation disambiguate the user + filter case. +- For built-ins, the universe is small (one Lua filter + + 37 Rust transforms, all under our control); if (2) bites us in + production after Plan 7 ships, the fix is to extend Plan 7a's + runtime check to also fire on `FilterSource::Extension` filters — + a small follow-up tracked in 7a's §"Out of scope." + +See Plan 7a's §"Two flavors of non-idempotence" for the full +treatment. + +## Design decisions (settled in conversation) + +- **The hash is source-info-agnostic** (verified). `compute_block_hash_fresh` + excludes `source_info`; the new `compute_meta_hash_fresh` will do the + same for `ConfigValue::source_info` and `ConfigMapEntry::key_source`. + Test asserting this lives at `hash.rs:767` for blocks; equivalent + test lands for meta. 
+- **`merge_op` participates; map keys hashed in insertion order, no + sort.** See the helper spec in §"In scope" for the full reasoning. + In one line: an idempotence test wants to *catch* the kind of + non-determinism a sort would hide. +- **Hash covers blocks and meta-minus-`rendered.*`**. Meta inclusion + catches non-determinism in metadata-normalize, listing data, + shortcode-resolved meta values, attribution metadata, etc. The + `rendered.*` keys are HTML strings populated by chrome-render + transforms; their canonicalization is a separate concern. +- **Filter mutation provenance stays Original** (post-Plan 4 unified + `Generated { by: By::filter(...), from: [] }` shape). Idempotence + test sees consistent shape across runs. +- **Each pipeline run uses fresh Lua state.** Two construction sites, + both verified fresh per pipeline invocation: + - **User filters**: `apply_lua_filter` (singular, at + `crates/pampa/src/lua/filter.rs:158`) constructs a fresh + `Lua::new()` per filter. The outer `apply_lua_filters` (plural, at + line 270) loops over `filter_paths` and calls the singular form + once per filter, so every filter in every run starts from a clean + Lua state. + - **Shortcodes**: `LuaShortcodeEngine::new` + (`crates/pampa/src/lua/shortcode.rs:68`) is constructed on the + stack inside `ShortcodeResolveTransform::transform()` at + `crates/quarto-core/src/transforms/shortcode_resolve.rs:513`, so + each pipeline run also gets a fresh shortcode-side `Lua::new()`. + + No cross-run state accumulation on either side. This matches + production (hub-client builds a new pipeline per render) and + resolves the prior "second-run pipeline starts fresh?" open + question. +- **Built-in scope = Rust transforms + ship-with-Quarto Lua filters**. + User filters are out of scope here (Plan 7a covers them). + +## What gets tested concretely + +Every fixture runs through **two pipeline-driver modes**, both compared +against themselves: + +1. 
**Single-file mode** — `run_pipeline` directly with + `build_q2_preview_pipeline_stages`. Mirrors the lowest-level entry + point used by `render_qmd_to_preview_ast` (`pipeline.rs:859`). +2. **Project-orchestrator mode** — calls the existing + `render_active_page_preview` helper at + `crates/quarto-core/tests/render_page_in_project.rs:660`. That + helper already drives + `ProjectPipeline` end-to-end (project + discovery, multi-file re-discovery guard, format setup, `ActivePage` + mode), returns `WasmPassTwoOutput`, and panics on pass-1 / pass-2 + failures. It is exactly the path the real `q2 preview` and + hub-client renders take. We use it as-is; no fresh orchestrator + wiring is required. + +Why both: single-file mode catches stage / transform non-determinism; +project mode additionally exercises any non-determinism introduced by +the orchestrator itself (project discovery, ProjectIndex assembly, +file-iteration order, pass-1 → pass-2 hand-off). + +```rust +use quarto_ast_reconcile::{compute_blocks_hash_fresh, compute_meta_hash_fresh_excluding_rendered}; +use quarto_core::format::Format; +use quarto_core::pipeline::{build_q2_preview_pipeline_stages, run_pipeline}; +use quarto_core::project::pass2_renderer::WasmPassTwoOutput; +use quarto_core::stage::{DocumentAst, PipelineData}; +use quarto_pandoc_types::Pandoc; +use quarto_system_runtime::NativeRuntime; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use tempfile::TempDir; + +/// How a fixture is driven through the pipeline. Every fixture runs +/// once per mode; both modes hash equal across two runs. +#[derive(Clone, Copy, Debug)] +enum DriveMode { + /// `run_pipeline` directly with `build_q2_preview_pipeline_stages`. + SingleFile, + /// Reuses the existing `render_active_page_preview` helper at + /// `crates/quarto-core/tests/render_page_in_project.rs:660`. + ProjectOrchestrator, +} + +/// A test fixture. 
The whole project lives in a `TempDir` that the +/// fixture owns; the `_quarto.yml` (if any) plus the page contents +/// are written by `setup()`. Document-only fixtures still create a +/// temp dir + minimal `index.qmd` so the orchestrator mode has +/// something to discover. +struct Fixture { + name: &'static str, + /// Idempotent setup callback. Receives the project root. + /// Must write at minimum `<root>/<active>` (the page being rendered), + /// optionally `<root>/_quarto.yml` and sibling files. + setup: Box<dyn Fn(&Path)>, + /// The active page, relative to the project root. Defaults to + /// `index.qmd`. + active: PathBuf, +} + +fn run_fixture(fixture: &Fixture, mode: DriveMode) { + let doc_1 = run_q2_preview(fixture, mode); + let doc_2 = run_q2_preview(fixture, mode); + + let blocks_a = compute_blocks_hash_fresh(&doc_1.blocks); + let blocks_b = compute_blocks_hash_fresh(&doc_2.blocks); + let meta_a = compute_meta_hash_fresh_excluding_rendered(&doc_1.meta); + let meta_b = compute_meta_hash_fresh_excluding_rendered(&doc_2.meta); + + if blocks_a != blocks_b || meta_a != meta_b { + // Localize before panicking so the failure message gives the + // sub-agent prompt a concrete starting point.
+ let point = find_first_divergence(&doc_1, &doc_2); + panic!( + "fixture {} ({mode:?}): non-idempotent\n \ + blocks: {blocks_a:016x} vs {blocks_b:016x}\n \ + meta: {meta_a:016x} vs {meta_b:016x}\n \ + first divergence: {point:?}", + fixture.name, + ); + } +} + +fn run_q2_preview(fixture: &Fixture, mode: DriveMode) -> DocumentAst { + let temp = TempDir::new().unwrap(); + let project_dir = temp.path().canonicalize().unwrap(); + (fixture.setup)(&project_dir); + let active = project_dir.join(&fixture.active).canonicalize().unwrap(); + + match mode { + DriveMode::SingleFile => run_single_file(&project_dir, &active), + DriveMode::ProjectOrchestrator => run_orchestrator(&project_dir, &active), + } +} + +fn run_single_file(project_dir: &Path, active: &Path) -> DocumentAst { + // `run_pipeline` is async; the existing tests (e.g. + // render_page_in_project.rs) drive it via `pollster::block_on`. + pollster::block_on(async { + let runtime: Arc<dyn quarto_system_runtime::SystemRuntime> = + Arc::new(NativeRuntime::new()); + let mut project = quarto_core::project::ProjectContext::discover( + active, + runtime.as_ref(), + ) + .unwrap(); + if !project.is_single_file { + project = quarto_core::project::ProjectContext::discover( + &project.dir, + runtime.as_ref(), + ) + .unwrap(); + } + let doc = project + .documents + .iter() + .find(|d| d.path == active) + .expect("active doc in project") + .clone(); + let format = Format::from_format_string("q2-preview") + .expect("q2-preview is a recognized pseudo-format"); + let binaries = quarto_core::render::BinaryDependencies::new(); + let mut ctx = quarto_core::render::RenderContext::new( + &project, &doc, &format, &binaries, + ); + + let content = std::fs::read(active).unwrap(); + let stages = build_q2_preview_pipeline_stages(None, None); + let (output, _diagnostics) = run_pipeline( + &content, + &active.to_string_lossy(), + &mut ctx, + runtime, + stages, + ) + .await + .expect("pipeline run"); + + match output { + PipelineData::DocumentAst(ast) => ast, + other =>
panic!("expected DocumentAst, got {:?}", other.kind()), + } + }) +} + +fn run_orchestrator(project_dir: &Path, active: &Path) -> DocumentAst { + // Delegates to the existing helper. It already drives + // ProjectContext::discover + ProjectPipeline + ActivePage mode, + // and panics if pass-1 / pass-2 surface failures. We just lift + // the AST JSON out of `Pass2Payload::AstJson` and re-parse it + // back into a typed Pandoc via pampa's JSON reader — the source_info + // round-trips, but the hash explicitly excludes source_info, so + // the parse is a clean conversion for hashing purposes. + let output: WasmPassTwoOutput = + render_active_page_preview(project_dir, active); + let ast_json = output + .payload + .as_ast_json() + .expect("orchestrator must emit Pass2Payload::AstJson"); + let mut bytes = ast_json.as_bytes(); + let (pandoc, _ast_ctx) = pampa::readers::json::read(&mut bytes) + .expect("re-parse AST JSON"); + pandoc_to_document_ast(pandoc) +} + +// `pandoc_to_document_ast` converts the re-parsed `Pandoc` into the +// `DocumentAst` shape the hash helpers want. Pandoc carries blocks +// + the document meta; the helpers take `&[Block]` and `&ConfigValue` +// respectively, so this is mostly a field shuffle. Exact body +// determined during implementation once the DocumentAst struct is +// inspected next to Pandoc. +``` + +Notes on the helpers: + +- `run_pipeline` (`pipeline.rs:627`) is the existing entry point for + the single-file mode; no new driver is needed. +- The q2-preview pipeline ends at `CodeHighlightStage`, so its output + is `PipelineData::DocumentAst`. +- Each call constructs fresh `StageContext` (inside `run_pipeline` or + inside the orchestrator's per-page renderer setup) and fresh Lua + engines per filter / shortcode invocation — natural per-run + isolation. +- The orchestrator path's `Pass2Payload::as_ast_json()` accessor + (`crates/quarto-core/src/project/pass2_renderer.rs:272`) already + exists. 
`pampa::readers::json::read` (`crates/pampa/src/readers/json.rs:1063`) + parses the JSON back into a typed `Pandoc`. The source_info that + the JSON writer emits with `include_inline_locations: true` + (`crates/quarto-core/src/pipeline.rs:910` area) round-trips through + the reader, but **the hash explicitly excludes source_info** — so + no stripping pass is required and no production plumbing change is + needed. See §"Decisions" / + §"Orchestrator-mode `DocumentAst` extraction" for why option (a) + beats the typed-plumbing alternative. + +### Fixture-to-mode mapping + +Not every fixture is meaningful in every mode: + +| Fixture class | Single-file | Project-orchestrator | +|---|---|---| +| Plain document (`callout-warning`, `theorem`, `code-block-fenced`, …) | ✓ | ✓ (one-page project) | +| Website chrome (`website-chrome`, `website-links`, `website-listing`) | n/a (chrome stages need ProjectContext) | ✓ | +| Attribution (`attribution-basic`) | ✓ (provider on RenderContext) | ✓ | + +Document fixtures run in both modes against the *same* fixture content +(the orchestrator wraps the document in a tiny synthetic project). +Website fixtures run orchestrator-only because the chrome transforms +require a populated ProjectIndex; running them through single-file +mode would test a partial pipeline that doesn't exist in production. + +### Failure modes the test catches + +- A filter that's truly non-idempotent (e.g., `Str.text + "!"` → + growing text on each run). +- A transform that emits non-deterministic attributes or `plain_data` + (e.g., HashMap iteration order in a sloppy implementation). +- A transform that mutates inputs differently across runs (probably a + bug). +- A metadata transform that synthesizes meta keys non-deterministically + (e.g., listing-item-info that gets file-mtime in racy ways). + +### Failure modes the test does NOT catch + +- A transform that's idempotent but produces *wrong* output (wrong-but- + consistent — needs other testing). 
+- A filter that's idempotent for one input but non-idempotent for + another (need representative fixtures). +- Round-trip non-idempotence — see §"Pipeline-determinism only" above + and Plan 7a. +- HTML-shape non-determinism inside `meta.rendered.*` (excluded from + the hash). + +## Coverage gaps to address during implementation + +Each fixture below covers one or more transforms. **All 26 fixtures +below now ship in `crates/quarto-core/tests/idempotence.rs`** (plus a +27th `smoke_plain_paragraph` not enumerated here). Two are in the +Phase-5 triage queue (marked inline below); the other 24 pass on +first run in both applicable modes. + +**Existing fixtures (carry forward from prior plan draft):** + +- [x] `meta-single` — `{{< meta foo >}}` with single-string foo → + shortcode-resolve, metadata-normalize. +- [x] `meta-markdown` — `{{< meta foo >}}` with `**Bold** title` → + shortcode-resolve (PandocInlines branch). +- [x] `include-trivial` — `{{< include child.qmd >}}` → + include-expansion stage, shortcode-resolve. +- [x] `callout-warning` — `::: {.callout-warning} Body :::` → callout. + (callout-resolve is excluded; CustomNode survives.) +- [x] `theorem` — `::: {.theorem #thm-foo} Math here :::` → + theorem-sugar. +- [x] `figure-ref-target` — `:::: {#fig-foo} ![cap](img.png) ::::` → + float-ref-target-sugar. +- [x] `crossref-to-theorem` — `See @thm-foo` paired with the theorem + above → crossref-index, crossref-resolve. +- [x] `sectionize-multi` — `## A` / `### B` / `## C` with body → + sectionize. +- [x] `footnotes-mixed` — inline `^[...]` + reference `[^foo]` → + footnotes. +- [x] `appendix-license` — `license:` / `copyright:` meta + + `:::{.appendix}` user block + footnotes → appendix-structure + (+ footnotes interaction). +- [x] `combined-stress` — sectionize + callouts + shortcodes + interacting. 
+ +**New fixtures (gap audit):** + +- [x] `code-block-fenced` — fenced ``` ```python ``` block with content + → code-block-generate, code-block-render, code-highlight stage. +- [x] `lua-shortcode-version` — `{{< version >}}` → shortcode-resolve + (Lua-loaded handler path; simplest deterministic case — returns + `quarto.version` joined by dots). +- [x] `lua-shortcode-lipsum-fixed` — `{{< lipsum 3 >}}` (no `random=` + kwarg) → shortcode-resolve via lipsum's Lua handler. The + `math.randomseed` in `lipsum.lua:5` runs but `math.random` is never + called on this code path, so the output is the first three + paragraphs of the canned data deterministically. The `random=true` + variant is intentionally non-deterministic and out of scope. + **In Phase-5 queue (bd-3odjm)**: pipeline IS idempotent (SingleFile + mode passes), but ProjectOrchestrator panics with + `MalformedSourceInfoPool` re-parsing the AST JSON — known Plan-5 + wire-format issue, not a transform bug. +- [x] `proof` — `::: {.proof} ... :::` → proof-sugar. +- [x] `equation-labeled` — `$$ E=mc^2 $$ {#eq-mass}` paired with + `@eq-mass` → equation-label, crossref-resolve (equation branch). +- [x] `toc-on` — `toc: true` + multiple sections → toc-generate, + toc-render. +- [x] `video-filter-header` — exercises + `resources/extensions/quarto/video/video-filter.lua` (the only + built-in Lua filter under `resources/extensions/`). The `quarto/video` + extension is **embedded at compile time** (`include_dir!` of + `resources/extensions/` in + `crates/quarto-core/src/extension/mod.rs:33`) and auto-discovered for + every `StageContext::new()` call (`stage/context.rs:221`), so the + fixture needs no scaffolding beyond declaring the filter. Minimal + shape: + + ```yaml + --- + filters: + - video + --- + + # Title {background-video="https://www.youtube.com/embed/abc"} + ``` + + The filter rewrites `background-video` → `background-iframe` on + Headers whose URL matches one of three video hosts. 
Pattern matches + the smoke-test at + `crates/quarto/tests/smoke-all/extensions/filter-extension/test.qmd`. +- [x] `include-in-header` — `include-in-header: foo.html` in meta with + trivial `foo.html` → include-resolve stage. +- [x] `theme-bootstrap` — `theme: cosmo` (or default) in meta → + compile-theme-css stage. +- [x] `table-bootstrap-class` — a simple pipe table (`| col | + --- | val |`) → `TableBootstrapClassTransform`. The transform + attaches Bootstrap CSS classes (`table`, etc.) to `Table` nodes; + the assertion that the same classes appear in the same order on + both runs is the idempotence check. Minimal shape: one + two-column, two-row pipe table; no extra config needed. + +**Website-project fixtures** (each needs a `ProjectContext` wired to a +`_quarto.yml` with `project.type: website` + the relevant config; one +combined fixture can cover most chrome transforms): + +- [x] `website-chrome` — minimal website with navbar, sidebar, page + navigation, footer, favicon, bootstrap icons → website-title-prefix, + website-favicon, website-bootstrap-icons, website-canonical-url, + navbar-generate/render, sidebar-generate/render, page-nav-generate/render, + footer-generate/render, link-resolution stage. +- [x] `website-links` — internal `.qmd` body links between two project + pages → link-rewrite + link-resolution. (bd-rz2we: fixed by + splitting `vfs_root` into write-root + url-root so native test + helpers can pass a synthetic URL prefix while disk writes still + land in the tempdir; see + `claude-notes/plans/2026-05-21-vfs-url-write-root-split.md`.) +- [x] `website-listing` — minimal listing with two items, one with + categories, one with `feed:` config → listing-generate, listing-render, + categories-sidebar, listing-feed-link, listing-feed-stage (native only), + listing-item-info stage. 
+ +**Attribution fixture** (the test helper installs an +`AttributionSourceProvider` on `RenderContext.attribution_provider`; +`run_pipeline` forwards it to `StageContext.attribution_provider` at +`pipeline.rs:664`): + +- [x] `attribution-basic` — document with an installed + attribution provider → attribution-generate stage, attribution-render + transform. Uses `PreBuiltAttributionProvider` (transport JSON) rather + than `GitBlameProvider` for determinism — git history would vary + across machines. + +**Resource fixture:** + +- [x] `resource-image` — `![alt](./local.png)` with the image file + present → resource-collector. Image is a 67-byte minimal valid PNG + written via `write_bytes`. + +If a fixture in this list discovers non-idempotence on first run, +**leave the test failing** and file a beads issue using the sub-agent +investigation prompt template in §"CI failure policy & sub-agent +prompt template." +The fix lands against the appropriate transform's crate (per §"What +happens when a fixture fails"). Do not silently drop the fixture, and +do not `#[ignore]` it without explicit user approval — failing tests +are the triage backlog. + +## Decisions (was: open questions) + +- **Test crate location** — settled. The test lives at + `crates/quarto-core/tests/idempotence.rs` as a workspace-level + integration test (matches the existing pattern in + `crates/quarto-core/tests/` — `sidebar_pipeline.rs`, + `navbar_footer_pipeline.rs`, `render_page_in_project.rs`, etc.). + Invoke with `cargo nextest run -p quarto-core --test idempotence`. +- **Fixture location** — settled. Files in + `crates/quarto-core/tests/fixtures/idempotence/`, one subdirectory + per non-trivial fixture (for the website/multi-file ones); in-source + literals for the trivial single-page cases written by the fixture's + `setup` closure into a `TempDir`. Pattern matches + `crates/quarto-core/tests/fixtures/websites/hub-smoke/` and + `phase5-website-baseline/`. 
+- **`ProjectContext` setup for website fixtures** — resolved by + reuse. There is **no need** to write a `make_website_project_ctx` + helper. The existing pattern across `crates/quarto-core/tests/` + (used by `render_page_in_project.rs`, `sidebar_pipeline.rs`, + `navbar_footer_pipeline.rs`, `page_navigation_pipeline.rs`, + `listing_pipeline.rs`, `navigation_e2e.rs`, `link_rewriting_pipeline.rs`, + `website_post_render.rs`) is: write `_quarto.yml` + page contents + into a `TempDir`, then let `ProjectContext::discover` do the rest. + Each Plan 3 fixture's `setup` closure does exactly this in 5-30 + lines. The chrome transforms read their config from the discovered + project — they don't need a parameterized builder. For the website + fixtures (`website-chrome`, `website-links`, `website-listing`) + we can either inline the YAML in `setup` or, for the larger ones, + use `copy_fixture(...)` (see `render_page_in_project.rs:616`) to + pull a pre-built fixture directory out of `tests/fixtures/idempotence/`. +- **Fixture-authoring rules for path-recording transforms** — + settled. Fixtures that exercise `resource-collector`, + `include-resolve`, `BUILTIN_EXTENSIONS` (any built-in extension + lookup), or other transforms that record absolute paths into meta + MUST use only paths that resolve relative to the fixture root, + never absolute process paths. Reason: the built-in extensions + resource bundle extracts to a `temp_dir()`'d location whose + absolute path differs across processes (stable within a single + process — fine for Plan 3's two-runs-compare contract, but a + latent issue for any future stored-snapshot variant). The + fixtures README must spell this out. 
Two practical rules: + (1) use relative URLs in fixture body content (`./local.png`, + not `/private/var/.../local.png`); + (2) when a transform's output includes a path, the assertion must + hash the value through `compute_meta_hash_fresh_excluding_rendered` + (which we already do) so test-process-specific paths under + `rendered.*` are excluded by construction. +- **Orchestrator-mode `DocumentAst` extraction** — settled on + option (a). The orchestrator path emits the AST as a JSON string + via `Pass2Payload::AstJson`; `Pass2Payload::as_ast_json()` + (`crates/quarto-core/src/project/pass2_renderer.rs:272`) is + already in the API. `pampa::readers::json::read` + (`crates/pampa/src/readers/json.rs:1063`) parses it back into a + typed `(Pandoc, ASTContext)`. The JSON writer emits source_info + triples (`include_inline_locations: true`), and those round-trip + through the reader — but **the hash explicitly excludes + source_info** (`compute_blocks_hash_fresh` / + `compute_meta_hash_fresh` both skip it), so no stripping pass is + needed. Cost: one JSON-string parse per orchestrator-mode + assertion, no production plumbing change. The earlier draft of + this section preferred option (b) (forward typed `DocumentAst` + through `PreviewAstOutput` / `WasmPassTwoOutput`) — abandoned + because (a) needs *no* type changes and the source_info concern + doesn't actually bite the hash. +- **bd-2ag1c ordering** — Plan 3 lands first; bd-2ag1c (whether + `BootstrapJsStage` / `ClipboardJsStage` belong in + `Q2_PREVIEW_STAGE_EXCLUDED`) waits for Plan 3's coverage. The + rationale: Plan 3 is what *measures* whether those stages + contribute non-determinism to the q2-preview AST; if they don't + (they currently only write to `ctx.artifacts`, not to `meta` or + `blocks`), bd-2ag1c can be closed without changes. If they do, + bd-2ag1c picks up the cleanup with measurements in hand. 
+ +### CI failure policy & sub-agent prompt template + +The test fails noisily if any transform / filter is non-idempotent +— that's the point. Failing fixtures stay **failing** (no +auto-`#[ignore]`). For each failure, file a beads issue whose +description doubles as a self-contained sub-agent investigation +prompt: the fixture path, the two hash values, the diverging key +path (block vs meta), and the suspected stage / transform / filter +to focus on. `#[ignore]` is only applied when the user explicitly +says so. **The integration branch (`feature/provenance`) is allowed +to be red while the queue is being drained** — see §"Long-lived +branch policy" at the top of this plan and §"Phase 5 — Failure +triage" for the operational mechanics. + +Sub-agent prompt template (filled in per failure when filing the +beads issue — the test driver's panic message provides the +fixture, mode, hashes, and `DivergencePoint`, so the agent already +has a concrete starting point): + +> Investigate non-idempotence in q2-preview fixture +> `<fixture-name>` (`<mode>` mode). Two consecutive pipeline +> runs over the same input diverge at +> `<DivergencePoint, e.g. "Block { index: 3 }" or "MetaKey { path: ["listings", "foo"] }">`. Hashes: blocks +> `<blocks_a>` vs `<blocks_b>`, meta `<meta_a>` vs `<meta_b>`. Read +> `claude-notes/plans/<this-plan>.md` §"Failure modes the test +> catches" for category guidance. Reproduce with `cargo nextest +> run -p quarto-core --test idempotence <fixture_test_name>`. +> Suspected source likely lives in `<suspected stage / transform / filter>` based on +> the divergence location — start there. Verdict: deterministic +> source (HashMap iteration, time, RNG) → propose a fix; +> non-deterministic but semantically equivalent (e.g. attribute +> ordering inside an HTML chrome payload) → propose either +> canonicalization at the source or a targeted hash exclusion. Do +> not `#[ignore]` the test. + +## References + +Line numbers below are accurate as of `feature/provenance` HEAD on +2026-05-21. Plan 4's source_info churn or any pipeline reorganization +may shift them — when in doubt, grep by symbol name.
The plan's +factual content survives line-number drift; the references are a +convenience for navigating, not a contract. + +- `crates/quarto-core/src/pipeline.rs:1237` + `build_q2_preview_transform_pipeline` — q2-preview transform list, + source of truth. +- `crates/quarto-core/src/pipeline.rs:1198` + `Q2_PREVIEW_TRANSFORM_EXCLUDED` — the four transforms that don't run. +- `crates/quarto-core/src/pipeline.rs:380` + `build_q2_preview_pipeline_stages` — stage-level pipeline. +- `crates/quarto-core/src/pipeline.rs:356` + `Q2_PREVIEW_STAGE_EXCLUDED` — three excluded stages + (`math-js`, `render-html-body`, `apply-template`). +- `crates/quarto-core/src/pipeline.rs:627` + `run_pipeline` — pipeline execution entry point used by the test + runner. +- `crates/quarto-core/src/pipeline.rs:859` + `render_qmd_to_preview_ast` — production entry point that combines + `build_q2_preview_pipeline_stages` + `run_pipeline`; mirrors the + `DriveMode::SingleFile` helper. +- `crates/quarto-core/src/pipeline.rs:168` + `PreviewAstOutput` — currently carries only `ast_json: String` + (no typed `DocumentAst`). +- `crates/quarto-core/src/transforms/` — the Rust transform crate root. + Each transform's `name()` matches the kebab-case strings listed in + §"What 'built-in' covers." +- `crates/quarto-core/src/transforms/code_highlight.rs:126` + `CodeHighlightStage`'s native user-grammar disk scan + (`ctx.project.dir.join("_quarto").join("grammars")`). OS-order- + dependent if a grammar directory is present; not exercised by + Plan 3 fixtures (see §"Noted, not actively tested"). +- `crates/quarto-core/src/transforms/shortcode_resolve.rs:513` + `ShortcodeResolveTransform::transform` — site of the per-pipeline + fresh `LuaShortcodeEngine::new` construction. +- `crates/quarto-ast-reconcile/src/hash.rs:115` + `compute_blocks_hash_fresh` — the existing blocks hasher (slice). 
+- `crates/quarto-ast-reconcile/src/hash.rs:102` + `compute_block_hash_fresh` — the singular per-block hasher used + by `find_first_divergence` for index-keyed comparison. +- `crates/quarto-ast-reconcile/src/hash.rs:767` + `test_same_content_same_hash` — confirms blocks hash excludes + source_info. +- `crates/pampa/src/lua/filter.rs:158` + `apply_lua_filter` — per-filter Lua engine creation point (singular). + Driven by `apply_lua_filters` (plural, line 270), which loops over + `filter_paths` and calls the singular form once per filter. +- `crates/pampa/src/lua/shortcode.rs:68` + `LuaShortcodeEngine::new` — per-pipeline Lua engine for shortcodes + (constructed on the stack inside + `ShortcodeResolveTransform::transform`). +- `crates/pampa/src/readers/json.rs:1063` + `pampa::readers::json::read` — re-parses AST JSON back into a + typed `(Pandoc, ASTContext)`; used by `DriveMode::ProjectOrchestrator` + to recover a typed AST for hashing. +- `crates/quarto-core/src/project/pass2_renderer.rs:272` + `Pass2Payload::as_ast_json` — accessor used by both the existing + test in `render_page_in_project.rs` and Plan 3's orchestrator-mode + helper. +- `crates/quarto-core/src/project/pass2_renderer.rs:254` + `Pass2Payload::AstJson` — variant currently carries only + `ast_json: String`. +- `crates/quarto-core/src/stage/context.rs:221` + `StageContext::new` — calls `discover_extensions` with the embedded + built-in extensions path, so the `quarto/video` filter extension is + always discoverable without per-fixture scaffolding. +- `crates/quarto-core/src/extension/mod.rs:33` + `BUILTIN_EXTENSIONS_DIR` — compile-time + `include_dir!(resources/extensions)` ensures the video/lipsum/version/ + kbd/placeholder extensions are baked into the binary. +- `crates/quarto-core/tests/render_page_in_project.rs:660` + `render_active_page_preview` — the **existing** + `DriveMode::ProjectOrchestrator` helper. Reused verbatim by Plan 3, + not reimplemented. 
+- `crates/quarto-core/tests/render_page_in_project.rs:64` + `render_active_page` — sibling HTML helper; useful prior art for + the project-discovery pattern even though Plan 3 doesn't use it + directly. +- `crates/quarto-core/tests/render_page_in_project.rs:616` + `copy_fixture` — utility for copying a pre-built fixture directory + out of `tests/fixtures/` into a `TempDir`. Available for the + heavier website fixtures. +- `crates/quarto-core/tests/fixtures/websites/hub-smoke/`, + `crates/quarto-core/tests/fixtures/phase5-website-baseline/` — + example website fixture directories with `_quarto.yml` + multi-page + layouts. Demonstrates the shape Plan 3's website fixtures take. +- `resources/extensions/quarto/video/video-filter.lua` — the one + built-in Lua filter today. +- `claude-notes/plans/lua-filter-pipeline/00-index.md` — Carlos's + 2025-12-21 analysis of **TypeScript Quarto**'s `run_as_extended_ast()` + Lua filter pipeline (~78 stages classified by side-effect category). + This is porting reference material for the broader epic, **not** the + inventory Plan 3 tests. Plan 3's universe is enumerated in §"What + 'built-in' covers." Useful when porting an additional TS filter into + Rust and wondering whether the source-side analysis flagged it as + pure / file-reading / network / subprocess. + +## Work items + +### Phase 1 — Hashing infrastructure + +- [x] Add `compute_meta_hash_fresh` in + `crates/quarto-ast-reconcile/src/hash.rs`, parallel to + `compute_blocks_hash_fresh`. Walks `ConfigValue` tree + source-info-agnostically. Hashes scalars by `Yaml` payload, recurses + into `PandocInlines` / `PandocBlocks` via the existing inline / block + hashers, hashes `Array` in order, hashes `Map` entries + `(key_string, recurse(value))` **in insertion order** (no sort), + **includes `merge_op`**, skips `source_info` and `key_source`. (See + §"In scope" for the full spec.) 
+- [x] Add `compute_meta_hash_fresh_excluding_rendered` variant that + skips the `rendered` top-level key (HTML-string side outputs from + chrome transforms + `IncludeResolveStage` + Bootstrap/clipboard + injection). +- [x] Add unit tests for both: + - same content → same hash; + - different content → different hash; + - different `source_info` / `key_source` → same hash; + - same content with `rendered.foo` key only differing → same hash + for the excluding variant; + - **same content with Map keys in different insertion order → + different hash** (regression guard for the no-sort choice); + - different `merge_op` → different hash (regression guard for the + `merge_op`-participates choice). +- [x] Add `find_first_divergence(blocks_a, meta_a, blocks_b, meta_b) + -> DivergencePoint` alongside the hashers. The plan-sketch signature + took `&DocumentAst`, but `DocumentAst` lives in `quarto-core` and + `quarto-ast-reconcile` cannot depend on it; the helper takes the + underlying `&[Block]` + `&ConfigValue` instead, and the test driver + in `quarto-core/tests/idempotence.rs` will project from its + `DocumentAst`. Reuses `compute_block_hash_fresh` for the block walk + and a recursive insertion-order traversal for the meta walk; both + walks short-circuit on the first divergence. +- [x] Unit tests for `find_first_divergence`: + - identical docs → `DivergencePoint::None`; + - one block differs at index N → `Block { index: N, ... }`; + - one meta key path differs → `MetaKey { path: [...], ... }`; + - divergence under a `rendered.*` path → not reported (skipped to + match `compute_meta_hash_fresh_excluding_rendered`). + +### Phase 2 — Test crate scaffolding + +- [x] Create `crates/quarto-core/tests/idempotence.rs`. +- [x] Implement the `Fixture` struct + `run_fixture(fixture, mode)` + helper that loops `DriveMode::{SingleFile, ProjectOrchestrator}` + (see §"What gets tested concretely" for the body). 
+- [x] Implement `run_single_file(project_dir, active) -> DocumentAst` + using `ProjectContext::discover` + `build_q2_preview_pipeline_stages` + + `run_pipeline`. (~50 lines; the only genuinely new driver.) +- [x] Implement `run_orchestrator(project_dir, active) -> DocumentAst` + by delegating to the existing `render_active_page_preview` helper + at `crates/quarto-core/tests/render_page_in_project.rs:660` and + re-parsing `Pass2Payload::as_ast_json()` via + `pampa::readers::json::read`. (Helper copied inline since each + `tests/*.rs` is its own binary; the plan flags this as acceptable.) + No new orchestrator wiring is written; no production plumbing + change is needed. +- [x] Implement `pandoc_to_document_ast(pandoc) -> DocumentAst` — the + small field-shuffle between the re-parsed `Pandoc` and the + hashing helpers' expected shape. Land inline in `idempotence.rs`; + do not promote to library code until a second caller appears. +- [x] Create `crates/quarto-core/tests/fixtures/idempotence/` + directory with a README listing the fixture-format rules: + - no executable engine cells (fenced `` ```python `` blocks only); + - **no absolute process paths** in fixture content — see §"Decisions" + / "Fixture-authoring rules for path-recording transforms"; + - per-fixture mode mapping (document fixtures run in both modes; + website fixtures orchestrator-only). +- [x] Borrow `write` / `canonical` (and `snippet` / `copy_fixture` when + needed by Phase 4 fixtures) from `render_page_in_project.rs` — + copied into `idempotence.rs` for now; pulling them into a shared + `tests/common/` module is out of scope for Plan 3. +- [x] **Phase-2 smoke fixture** (`smoke_plain_paragraph`) drives both + modes on a single-paragraph document. Passing this confirms the + harness is wired correctly before Phases 3-4 add the real fixtures. 
+ +### Phase 3 — Existing-fixture coverage (carry-forward) + +- [x] Add fixtures: `meta-single`, `meta-markdown`, `include-trivial`, + `callout-warning`, `theorem`, `figure-ref-target`, + `crossref-to-theorem`, `sectionize-multi`, `footnotes-mixed`, + `appendix-license`, `combined-stress`. +- [x] Wire one assertion per (fixture, mode) pair — these are all + document fixtures, so each runs in both `SingleFile` and + `ProjectOrchestrator` mode. (For now each `#[test]` calls + `run_in_each_mode` which loops over both modes; if a fixture + later goes red in only one mode, the panic message names the + mode, and we can split into two `#[test]` functions for finer + reporting at that point.) + +All 11 carry-forward fixtures pass on first run, in both modes. +No queue entries. + +### Phase 4 — New-fixture coverage (gap closure) + +- [x] Add document-level fixtures (run in **both** modes), batch 4a + (no extra scaffolding): `code-block-fenced`, `lua-shortcode-version`, + `lua-shortcode-lipsum-fixed` (with module-load `randomseed` comment + in the `.qmd` per §"Noted, not actively tested"), `proof`, + `equation-labeled`, `toc-on`, `video-filter-header`, + `theme-bootstrap`, `table-bootstrap-class`. **9/10 pass on first run.** + `lua_shortcode_lipsum_fixed` fails in `ProjectOrchestrator` mode + only — not a hash mismatch but a `MalformedSourceInfoPool` when + re-parsing the orchestrator's AST JSON. JSON writer/reader + round-trip bug specific to lipsum-shortcode-generated inlines. + Filed as **bd-3odjm**; root-caused 2026-05-21 to the type-code-3 + mismatch between writer (`FilterProvenance` payload + `[filter_path, line]`) and reader (still decodes code 3 as legacy + `Transformed` `[parent_id, ...]`). Fix is owned by + **[Plan 5](2026-05-04-q2-preview-plan-5-wire-format.md)** — + §Goal calls this exact bug out and Plan 5's reader change handles + both shapes. 
Per the long-lived-branch policy below, this stays + red on `feature/provenance` until Plan 5 lands; do not patch + locally. `SingleFile` mode passes — the pipeline itself is + idempotent. +- [x] Add document-level fixtures, batch 4b (multi-file): + `include-in-header` (writes a small HTML stub), + `resource-image` (writes a 67-byte minimal PNG). Both pass on + first run in both modes. +- [x] Add website-project fixtures (orchestrator-mode only): + `website-chrome`, `website-links`, `website-listing`. **All 3 + pass.** `website-chrome` (navbar + sidebar + page-nav + footer + + favicon + bootstrap-icons + canonical-url) is clean. + `website-listing` (listing with categories + feed) is clean. + `website-links` (cross-page `.qmd` body links): initial divergence + filed as **bd-rz2we** turned out to be `ResourceResolverContext` + conflating two roles — disk-write root *and* URL prefix — in a + single `PathBuf`. Native test helpers pointed both at a real + tempdir, so rendered link URLs leaked the absolute tempdir path + into the AST. Fixed by splitting the field into + `{ write_root, url_root }` and adding a per-renderer + `with_url_root("/.quarto/project-artifacts")` builder; native + test helpers now keep the tempdir for disk writes but use the + synthetic prefix for URLs. See + `claude-notes/plans/2026-05-21-vfs-url-write-root-split.md`. +- [x] Add attribution fixture: `attribution-basic` (both modes). + Extended `Fixture` with an optional `attribution_json: Option<&'static str>` + field. `run_single_file` installs a `PreBuiltAttributionProvider` + on `ctx.attribution_provider` when present; `render_active_page_preview` + forwards the JSON to `RenderToPreviewAstRenderer::with_attribution`. + Stub JSON has one actor + one run covering bytes 0..1024 so the + attribution map overlaps the entire fixture body. Passes on first + run in both modes. 
+ +### Phase 5 — Failure triage + +`feature/provenance` is a **long-lived integration branch** that +holds failing fixtures *on purpose* until the queue is drained. +The plan does not merge to `main` while any fixture in this gate is +red. See §"Long-lived branch policy" at the top of this plan for +the rationale; what follows is the operational loop. + +- [x] Run the full test suite. For each failing fixture, classify the + cause (filter non-idempotence, transform non-determinism, + metadata-merge issue, etc.). +- [x] For each failure: either fix in-place (if scope is contained and + obvious) or **file a beads issue using the sub-agent investigation + prompt template** from §"CI failure policy & sub-agent prompt + template." Failing tests **stay failing** — no auto-`#[ignore]`. + Only ignore when the user explicitly says so. +- [x] Keep the (still-failing) tests on the integration branch so each + beads issue has a live reproduction. The integration branch may + stay red for an extended period; the merge to `main` happens only + after the queue is drained (every red fixture either fixed or + explicitly `#[ignore]`-d with a permanent rationale signed off + by the user). The failing tests *are* the triage backlog. + +**Queue state after Phase 4 (initial run):** 25 of 27 fixtures green; +2 in the queue. + +- **bd-3odjm** — `lua_shortcode_lipsum_fixed` orchestrator mode. + `MalformedSourceInfoPool` on `pampa::readers::json::read` for the + AST JSON the orchestrator emits. Root-caused to the type-code-3 + mismatch between the writer (`FilterProvenance` payload + `[filter_path, line]`) and the reader (still decodes code 3 as + legacy `Transformed` `[parent_id, ...]`). Fix is owned by + [Plan 5](2026-05-04-q2-preview-plan-5-wire-format.md). +- **bd-rz2we** — `website_links` orchestrator mode. Block 0 hash + diverges across runs with different project roots; meta hash is + stable. 
Hypothesis: link-rewrite or link-resolution captures the + absolute project root (or a canonicalized tempdir form) into the + AST when it should emit a path-independent relative URL. + +### Phase 6 — Documentation + +- [x] Add `claude-notes/instructions/idempotence-contract.md` covering: + what the contract requires of new transforms, the meta-hash + `rendered.*` exclusion, how to add a fixture when introducing a new + transform, the engine-cells-forbidden rule. +- [x] Cross-link from the README of the fixtures directory. +- [x] Cross-link from Plan 7a (so authors looking at runtime user-filter + idempotence find the CI contract too). + +### Phase 7 — Verification + +- [x] `cargo nextest run --workspace` runs. **9346/9348 pass; 2 fail.** + The two failures are the documented queue items above + (bd-3odjm, bd-rz2we). Every other test in the workspace is green, + including the Phase-1 unit tests in `quarto-ast-reconcile` and + the 25 passing idempotence fixtures. +- [x] `cargo xtask verify` runs (full WASM stack — `npm install` from + repo root, `npm run build:wasm` from hub-client). Steps 1-4 green; + Step 5 (Rust tests with `-D warnings`) fails on the same 2 + queue-item fixtures. Steps 6-12 don't run because of Step 5's + exit; that's the expected long-lived-integration-branch state + per §"Long-lived branch policy" — the gate is red on purpose + until the queue is drained. +- [x] End-to-end invocation recorded in commit messages + (`cargo nextest run -p quarto-core --test idempotence` cited in + every Phase-2 through Phase-4d commit). + +**Plan 3 is complete as a deliverable** — the gate exists, the +hashing infrastructure exists, 27 fixtures cover the universe under +§"What 'built-in' covers", the contract is documented, and the +queue is filed in beads with reproduction commands. Merge to `main` +remains gated on draining the queue (bd-3odjm via Plan 5; bd-rz2we +via a follow-up). 
+ +## Dependencies + +- Depends on: Plan 1 (`build_q2_preview_pipeline_stages` exists and + runs). +- Blocks: implicitly Plans 4-8 (round-trip work assumes this contract + holds — but for pipeline non-determinism only; round-trip itself is + 7a's concern). +- Related to Plan 7a (runtime user-filter idempotence check). Plan 3 + is the **CI-time** half for built-ins (transforms + ship-with-Quarto + Lua filters); Plan 7a is the **runtime** half for user-supplied + filters. The two share `compute_blocks_hash_fresh` / + `compute_meta_hash_fresh` and the same flavor-1-vs-flavor-2 + distinction. See Plan 7a's §"Two flavors of non-idempotence" for the + shared vocabulary. + +### What happens when a fixture fails + +Plan 3 reports failures; the *fix* lands wherever the offending +transform / filter lives. Failure modes and where their fixes go: + +- **Non-idempotent built-in Lua filter**. Edit the filter's Lua + source. Lands in `resources/extensions/quarto/<extension>/`. Plan 3 + surfaces the test. +- **Non-deterministic transform attribute / `plain_data` ordering**. + HashMap iteration or similar. Lands in the transform's `.rs` file + under `crates/quarto-core/src/transforms/`. +- **Non-deterministic metadata transform**. Lands in + `metadata_normalize.rs` or wherever the offending merge/normalize + step lives. +- **Source-info-related instability**. Should NOT happen because the + hashers exclude source_info / key_source. If somehow it does, + Plan 4's type changes are the place to investigate. + +If a fixture fails on first run, **leave the test failing** and file +a beads issue (with the sub-agent investigation prompt from §"CI +failure policy & sub-agent prompt template"). The failing test stays +red until the issue is resolved — `#[ignore]` only when the user +explicitly says so. Do not silently disable. + +## Risk areas + +- **A transform or filter might fail the test on first run**.
Triaged + per Phase 5; **leave failing + file a sub-agent investigation prompt** + (see §"CI failure policy & sub-agent prompt template"). `#[ignore]` + only when the user explicitly says so. +- **Hash stability across binary versions**: `FxHasher`'s output is + stable within a Rust process but not across versions. Tests compare + hashes computed in the same process, not stored as constants. This is + the natural shape of "run pipeline twice and compare" anyway. +- **Pipeline construction non-determinism**: if extension discovery + picks up paths in OS-dependent order, attributes could differ on + different machines. Mitigated by fixture isolation — fixtures don't + reference real OS paths unless explicitly testing a path-aware + feature. The attribution fixture is the main case to watch. +- **Website-project fixture complexity**: assembling a valid + `ProjectContext` is non-trivial. Risk: time spent on test + scaffolding rather than transform coverage. Mitigation: reuse the + existing pattern (write `_quarto.yml` + page contents into a + `TempDir`, call `ProjectContext::discover`) — the same recipe + used by ~10 sibling tests in `crates/quarto-core/tests/`. No + parameterized builder is needed. See §"Decisions" / + "ProjectContext setup for website fixtures." + +### Noted, not actively tested + +Two latent determinism surfaces surfaced during the source review. The +test suite isn't expected to flake on either; they're recorded here so +the next person who *does* hit a hash divergence in their neighborhood +has a head start: + +- **`CodeHighlightStage`'s native disk scan for user grammars** + (`crates/quarto-core/src/transforms/code_highlight.rs:126-129`). + On native, when no `user_grammar_provider` is supplied (CLI + default), the stage falls back to scanning + `ctx.project.dir.join("_quarto").join("grammars")` for user + grammars. If that scan returns paths in OS-dependent order, + attribute output could differ across machines. 
Fixtures here + don't supply user grammars, so the directory is absent and the + early-return at the top of the function makes the scan a no-op + in practice. Not tested today; flag if a future fixture + introduces a grammar dependency. +- **Lipsum module-load `randomseed`** + (`resources/extensions/quarto/lipsum/lipsum.lua:5`). The Lua module + calls `math.randomseed(os.time())` at load time, which runs once per + fresh `LuaShortcodeEngine`. On the non-random code path (`{{< lipsum + 3 >}}` — what `lua-shortcode-lipsum-fixed` exercises) `math.random` + is never reached, so the seed has no observable effect. If a future + variant routes through `math.random` (random shortcode-resolution + paths, random shortcode arg parsing) the test would start flaking + noticeably across runs. The fixture should carry a comment naming + this. + +## Estimated scope + +| Component | Lines (rough) | +|---|---| +| `compute_meta_hash_fresh` + excluding-rendered variant + tests | ~140 | +| `find_first_divergence` + `DivergencePoint` + tests | ~80 | +| Test crate scaffolding — `Fixture` struct, `run_single_file`, `run_orchestrator` (thin wrapper over existing helper), `pandoc_to_document_ast` shuffle | ~100 | +| Per-fixture `.qmd` files / inline literals (~25 fixtures, 5-30 lines each) | ~280 | +| Per-fixture (fixture, mode) test assertions (mostly one-liners; ~25 fixtures × 1-2 modes ≈ 40 pairs) | ~120 | +| `idempotence-contract.md` + fixtures README | ~80 | +| **Total** | **~800** | + +The scaffolding line item dropped from an earlier estimate of ~260 +to ~100 after pinning the orchestrator path on the existing +`render_active_page_preview` helper and choosing option (a) for +`DocumentAst` extraction — neither requires a new orchestrator +driver, a `make_website_project_ctx` builder, or production +plumbing changes. `PreviewAstOutput::ast` plumbing is no longer +needed (was ~20 lines in the earlier draft). 
+ +**Inventory note**: an earlier draft estimated "~10-20 built-in filters" +in `resources/extensions/`. That was wrong — `resources/extensions/` +contains one Lua filter (`video-filter.lua`) plus five shortcodes +(kbd, video, lipsum, version, placeholder). The bulk of the universe +under test is the **37 Rust transforms** in +`build_q2_preview_transform_pipeline`, plus the stage-level work in +`build_q2_preview_pipeline_stages`. + +Realistic shape: 2-3 focused sessions — one for hashing +infrastructure + scaffolding + carry-forward fixtures, one for +gap-closure fixtures (particularly the website-project ones), and +a third for Phase 5 triage if the first run surfaces multiple red +fixtures (which is the expected case, not a surprise). + +## Notes + +The user said: "Yes, idempotency and stable structural hash have to be +the base contract — so we have to work that out as part of this complex +of plans. Everything existing must be verified to have those +properties." This plan encodes that contract as a CI-enforced test. + +The hash function excluding source_info means that future plans (4-8) +that change source_info don't risk breaking idempotence — even if a +transform produces different source_info on different runs (e.g., a +Sectionize that generates synthetic source_info from current +timestamps; not what we do, but illustrative), the hash stays stable. + +Round-trip non-idempotence — the property +`pipeline(write(pipeline(x))) ≠ pipeline(x)` — is deliberately not +tested here. The pipeline doesn't re-parse its own output today, so +there's nothing to break. When Plan 7's incremental writer lands, +the property becomes load-bearing for blocks the writer rewrites. +Plan 7a's runtime check is the natural home for round-trip detection +**on user-supplied filters**: per-document, with per-filter attribution +and an `idempotent: false` opt-out, none of which a CI fixture gate +can provide. 
Round-trip on the built-in side (transforms + one Lua +filter) is consciously left unverified — see Plan 7a's §"Notes" for +the v1 acceptance reasoning. diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-3-filter-idempotence.md b/claude-notes/plans/2026-05-04-q2-preview-plan-3-filter-idempotence.md deleted file mode 100644 index 13262e828..000000000 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-3-filter-idempotence.md +++ /dev/null @@ -1,311 +0,0 @@ -# Plan 3 — Filter idempotence verification - -**Date:** 2026-05-04 -**Branch:** feature/q2-preview -**Status:** Implementation plan (open questions named) -**Milestone:** M2 verification gate (no new milestone — locks in property -on what's already shipped) - -## Goal - -Verify and lock in the **idempotence + structural-hash-stability** contract -for the q2-preview pipeline. This is the contract the user has stated must be -the foundation: every transform and every built-in Lua filter must produce the -same structural output when run twice on the same input. Without this, the -incremental writer's reconciliation cannot reliably preserve untouched -regions. - -This plan ships: -- A canonical fixture set covering the q2-preview transforms. -- A test that runs each fixture through the q2-preview pipeline twice and - asserts the resulting ASTs hash equal. -- Coverage for the built-in Lua filters that ship with Quarto (those in - `resources/extensions/`). - -When this plan lands, we have CI-enforced confidence that the q2-preview -round-trip story (Plans 4-8) rests on a stable foundation. - -## Scope - -### In scope - -- Canonical fixture set: small `.qmd` files exercising: - - Meta shortcode (single-inline resolution): `{{< meta foo >}}` where `foo` - is a single string. - - Meta shortcode (multi-inline resolution): `{{< meta foo >}}` where `foo` - contains markdown like `**Bold** title`. - - Include shortcode: `{{< include child.qmd >}}` (with a trivial child file). 
- - Lua filter (mutating): a filter that uppercases all `Str.text`. - - Lua filter (synthesizing): a filter that adds a `pandoc.Str("decoration")` - to each paragraph. - - Callout: `::: {.callout-warning} Body :::`. - - Theorem: `::: {.theorem #thm-foo} Math here :::`. - - Figure with cross-ref target: `:::: {#fig-foo} ![caption](img.png) ::::`. - - Cross-reference: `See @thm-foo`. - - Sectionized doc: a doc with `## Section A`, content, `### Subsection`, - content, `## Section B`, content. - - **Footnotes**: a doc with one inline footnote (`text^[footnote body]`) and - one reference-style footnote (`text[^foo]` + `[^foo]: definition`). - Exercises `FootnotesTransform` (now included in q2-preview's pipeline per - Plan 2B's audit) — produces the synthesized `` markers and the - `
` container. - - **Appendix**: a doc with `license:`, `copyright:`, and a user - `:::{.appendix} Body :::` block, plus footnotes from the previous fixture. - Exercises `AppendixStructureTransform` (also included per Plan 2B's audit) - — produces the `
` container with footnotes, - license, and copyright sections nested inside. - - Combined: a doc with several of the above interacting. -- Idempotence test runner: takes a fixture, runs the q2-preview pipeline - twice, hashes both ASTs via - `quarto_ast_reconcile::compute_blocks_hash_fresh`, asserts equality. -- Coverage of the built-in extensions' filters (those in - `resources/extensions/`): - - For each shipped filter, run the test against a fixture that triggers - that filter. - - Document which built-in filters pass / fail (in case any are - non-idempotent — flag for follow-up). -- Documentation in `claude-notes/instructions/`: a short note on the - idempotence contract for filter authors and transform authors. - -### Out of scope - -- Verification of *user-supplied* filters. They're per-document; the contract - is enforced at runtime via the idempotence test pattern, but we don't - pre-verify every possible user filter. -- Rust-vs-React rendering parity (different contract; later plan). -- Performance / debouncing — idempotence verification doesn't measure runtime. - -## Design decisions (settled in conversation) - -- **The hash is already source-info-agnostic** (verified during research). - `compute_block_hash_fresh` excludes `source_info`. Two runs producing nodes - with different source_info but identical content/attr/plain_data hash - identically. This is what makes the idempotence test work cleanly. -- **The contract's load-bearing property** is "double-pipeline-run produces - hash-equal AST." Equivalent to "every transform is idempotent, every filter - is idempotent, no transform is non-deterministic about plain_data or attr - ordering." -- **Filter mutation provenance stays Original** (settled during conversation). - Lua filter mutations don't change source_info. Constructions are tagged - `Synthetic { by: By::filter(...) }` (post-Plan 5). Idempotence test sees - consistent shape across runs. -- **Built-in filters in scope; user filters out**. 
Built-in filters ship with - Quarto and the contract applies to them at CI time. User filters are - enforced at edit-time (a non-idempotent user filter breaks q2-preview's - round-trip; the user sees corruption). - -## What gets tested concretely - -For each fixture: - -``` -let pipeline = build_q2_preview_pipeline_stages(); -let runtime = create_test_runtime(); - -let ast_1 = run_pipeline(fixture, pipeline.clone(), runtime.clone()); -let ast_2 = run_pipeline(fixture, pipeline, runtime); - -let hash_1 = compute_blocks_hash_fresh(&ast_1.blocks); -let hash_2 = compute_blocks_hash_fresh(&ast_2.blocks); - -assert_eq!(hash_1, hash_2, "fixture {} non-idempotent", fixture_name); -``` - -Failure modes the test catches: - -- A filter that's truly non-idempotent (e.g., `Str.text + "!"` produces - growing text on each run). -- A transform that emits non-deterministic attributes or plain_data - (e.g., HashMap iteration order in a sloppy implementation). -- A transform that mutates inputs differently across runs (probably - indicates a bug). - -Failure modes the test does NOT catch: - -- A transform that's idempotent but produces *wrong* output (wrong-but- - consistent — needs other testing). -- A filter that's idempotent for one input but non-idempotent for another - (need representative fixtures). -- **Round-trip non-idempotence** — see next section. - -### Two flavors of non-idempotence (and what this plan tests) - -There are two distinct properties that get loosely called "non-idempotence": - -1. **Pipeline non-determinism**: `pipeline(x)` produces different output - on repeat calls with the same input. Caused by filters that depend - on time, RNG, mutable global state, or undefined-order iteration. - **This is what Plan 3's current test catches** — running - `run_pipeline(fixture)` twice on the same source and comparing - hashes detects it cleanly. - -2. **Round-trip non-idempotence**: `pipeline(write(pipeline(x))) ≠ pipeline(x)`. 
- The filter is deterministic — same input always produces same - output — but applying the filter twice (once on source, once on - the qmd-writer-serialized output of the first pass) gives different - results. The classic case is `f(x) = x + "!"`: deterministic, but - `f(f(x)) ≠ f(x)`. **Plan 3's current test does NOT catch this** - because both runs are on the same source; the filter is applied - once to identical input, producing identical output. - -This second property is the one that actually breaks q2-preview's -writer round-trip. When the user edits and saves, the writer Verbatim- -copies unchanged blocks from source and Rewrites changed blocks via -the qmd writer. The Rewrite path emits the *post-filter* AST node -content as new source bytes; on the next pipeline run, the filter -re-applies to those bytes, and `f(f(x)) ≠ f(x)` shows up as text -drift on edited blocks. - -**Plan 7a's runtime check** (`claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md`) -targets round-trip non-idempotence explicitly, with a check that runs -the round-trip flavor: pipeline → write → pipeline, and hash-compares. -That plan is for **user filters at runtime**. - -### Plan 3 strengthening — folding the round-trip flavor into CI - -Plan 3 should be amended to also check round-trip non-idempotence -for built-in filters. The change is small: - -```rust -// Existing test: pipeline determinism -let ast_1 = run_pipeline(fixture, pipeline.clone(), runtime.clone()); -let ast_2 = run_pipeline(fixture, pipeline.clone(), runtime.clone()); -assert_eq!(blocks_hash(&ast_1), blocks_hash(&ast_2)); - -// New test: round-trip idempotence -let ast_a = run_pipeline(fixture, pipeline.clone(), runtime.clone()); -let qmd_a = qmd_write_to_string(&ast_a); -let ast_b = run_pipeline(&qmd_a, pipeline, runtime); -assert_eq!(blocks_hash(&ast_a), blocks_hash(&ast_b)); -``` - -Per-fixture cost: one extra pipeline pass + one qmd writer call. -Bounded; runs at CI time, not in the editor loop. 
- -This amendment is **in scope for Plan 3** (extends what's already a -CI test for built-ins). User filters get the runtime version via -Plan 7a. Add the second flavor to each fixture's assertion when -implementing Plan 3. - -## Open questions for implementation - -- **Test infrastructure location**: probably `crates/quarto-core/tests/` as - a workspace-level integration test crate. New test file like - `q2_preview_idempotence.rs`. Confirm during implementation. -- **Fixture format**: just `.qmd` files in a fixtures dir, or in-source - literal strings? Files are easier to maintain and review; literal strings - are easier to keep with the test. Probably files for the substantial cases, - literals for trivial ones. -- **How to drive the pipeline twice**: the natural approach is to build the - pipeline once and run it twice, OR build two identical pipelines and run - each on a fresh AST. Pipeline construction includes Lua engine setup which - may be stateful — confirm the second-run pipeline starts fresh. -- **Built-in filter inventory**: enumerate the filters in - `resources/extensions/`. Probably ~10-20. Each gets a fixture (or a - shared fixture if the trigger pattern is similar). -- **CI failure expectation**: does the test fail noisily if any built-in - filter is non-idempotent? Probably yes — that's the point. But we may - discover at first run that one or more is non-idempotent, requiring a - pre-existing fix before this plan can land. - -## References - -- `crates/quarto-ast-reconcile/src/hash.rs::compute_blocks_hash_fresh` — the - hash function we use. Verified excludes source_info. -- `crates/quarto-ast-reconcile/src/hash.rs:768` — existing test - `test_same_content_same_hash` — confirms hash excludes source_info. -- `crates/quarto-core/src/pipeline.rs::build_q2_preview_pipeline_stages` — - the pipeline under test (created by Plan 1). -- `resources/extensions/` — built-in extensions with their Lua filters. 
-- `claude-notes/plans/lua-filter-pipeline/` — Carlos's earlier analysis of - which filters are pure vs. side-effecting. - -## Test plan - -The plan IS the test plan. The deliverable is a test crate. - -- Per-fixture idempotence assertion (the main loop above). -- Per-built-in-filter idempotence assertion. -- Combined fixture (sectionized doc with callouts and shortcodes) as a - stress test. -- Documentation: when a future contributor adds a new transform or filter, - they should add a fixture covering it. Document this expectation in - `claude-notes/instructions/`. - -## Dependencies - -- Depends on: Plan 1 (`build_q2_preview_pipeline_stages` exists and runs). -- Blocks: implicitly Plans 4-8 (round-trip work assumes this contract holds). - We don't need this to *implement* those plans, but landing it before - reviewing them gives us confidence the foundation is solid. -- Related to Plan 7a (runtime user-filter idempotence check). Plan 3 - is the **CI-time** half of the contract for built-in filters; Plan 7a - is the **runtime** half for user-supplied filters. The two share the - same hash function (`compute_blocks_hash_fresh`) and the same - round-trip-vs-non-determinism distinction. See §"Plan 3 strengthening" - above and Plan 7a's §"Plan 3 strengthening" section. - -### What happens when a fixture fails - -Plan 3 reports failures; the *fix* lands in the appropriate downstream -plan, not in Plan 3. Three failure modes and where their fixes go: - -- **Non-idempotent built-in Lua filter**. The filter's contract is - broken. Fix: edit the filter's Lua source. Lands wherever the - filter lives (typically `resources/extensions/...`). Plan 3 just - surfaces the test. -- **Non-deterministic transform attribute ordering**. A transform that - iterates a HashMap or similar and emits attrs in non-deterministic - order. Fix: change the transform to emit deterministically. 
Lands - in the transform's source file (typically a Plan 6-shaped fix even - though it's not strictly a provenance issue — provenance audit and - determinism audit are sister concerns). -- **Source-info-related instability**. Should NOT happen because the - hash function excludes source_info. If somehow it does, Plan 4's - type changes are the place to investigate. - -If a fixture fails on first run, document the failure as a known issue -in Plan 3's commit message and file the fix as a follow-up against the -appropriate plan. Don't silently disable failing fixtures. - -## Risk areas - -- **A built-in filter might fail the test on first run**. If so, we either - (a) fix the filter before this plan lands or (b) document the failure as - a known issue and defer the fix. Plan should not silently disable failing - filters from the test set. -- **Hash stability across binary versions**: `FxHasher`'s output is stable - within a Rust process but not across versions. Tests should compare hashes - computed in the same process, not stored as constants. This is the natural - shape of "run pipeline twice and compare" anyway. -- **Pipeline construction non-determinism**: if the pipeline picks up extension - paths in OS-dependent order, attributes could differ on different machines. - Mitigated by fixture isolation — fixtures don't reference real OS paths - unless explicitly testing a path-aware feature. - -## Estimated scope - -| Component | Lines (rough) | -|---|---| -| Test runner harness | ~80 | -| Per-fixture qmd files | ~100 (across ~10 fixtures) | -| Per-fixture test assertions | ~150 | -| Built-in filter coverage | ~150 | -| Documentation | ~50 | -| **Total** | **~530** | - -Probably one focused session. Risk: if a built-in filter fails idempotence, -fixing the underlying issue may push this into two sessions. 
- -## Notes - -The user said: "Yes, idempotency and stable structural hash have to be the -base contract — so we have to work that out as part of this complex of plans. -Everything existing must be verified to have those properties." This plan -encodes that contract as a CI-enforced test. - -The hash function excluding source_info means that future plans (4-8) that -change source_info don't risk breaking idempotence — even if a transform -produces different source_info on different runs (e.g., a Sectionize that -generates synthetic source_info from current timestamps; not what we do, but -illustrative), the hash stays stable. diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-4-source-info-types.md b/claude-notes/plans/2026-05-04-q2-preview-plan-4-source-info-types.md index 0e9bff4fc..23e3eac33 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-4-source-info-types.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-4-source-info-types.md @@ -1,117 +1,432 @@ -# Plan 4 — SourceInfo provenance types (Synthetic + Derived + By struct) +# Plan 4 — SourceInfo provenance types (Generated + Anchor + AnchorRole) -**Date:** 2026-05-04 +**Date:** 2026-05-04 (substantially revised 2026-05-20) **Branch:** feature/q2-preview -**Status:** Implementation plan (open questions named) -**Milestone:** none directly — foundation for Plans 5/6/7/8 +**Status:** Implementation plan (ready to execute) +**Milestone:** none directly — foundation for the rest of the provenance + epic + +## Epic context + +Plans 3–8 (filter idempotence, this plan, JSON wire format, provenance +audit, incremental writer + soft-drop, runtime filter check, include +round-trip) make up the **provenance epic** — the second wave of work +on the q2-preview branch after Plans 1–2 landed. 
They share a common +target: a typed, source-mapped notion of "where did this AST node come +from" that lets the incremental writer round-trip edits, lets +attribution credit the right author, and lets future diagnostics surface +resolution chains to users. The file names keep their q2-preview-plan-N +form for continuity with the earlier discussion notes. ## Goal -Extend `SourceInfo` with two new variants: - -- `Synthetic { by: By }` — for nodes that have no source preimage at all - (Sectionize's section Divs, filter constructions, synthesized title h1s, - the footnotes container, etc.). Replaces the existing `FilterProvenance - { filter_path, line }` variant — FilterProvenance becomes the special - case `Synthetic { by: By::filter(...) }`. -- `Derived { from: Arc<SourceInfo>, by: By }` — for nodes that have a - source preimage AND distinct atomic semantics. Used for shortcode - resolutions: the resolved Str's `from` chain points at the shortcode - token's bytes, and the `by` records that this is shortcode-derived - content (so the writer can prohibit edits via Plan 7's atomic detection). - *Not* used for filter mutations (those stay `Original` — non-atomic) or - sugar transforms (their CustomNodes inherit Original from their input - Div — also non-atomic). - -`By` is an open `{ kind: String, data: serde_json::Value }` struct that -appears as the payload of both Synthetic and Derived. The `Original`, -`Substring`, `Concat` variants are unchanged. +Extend `SourceInfo` with a single new variant, `Generated`, that +captures every transform-synthesized node in a uniform shape: + +```rust +Generated { by: By, from: SmallVec<[Anchor; 2]> } +``` + +`by` answers "which transform produced me." `from` is a list of +typed, role-labeled source-info pointers that answer "which source +bytes contributed to me."
The list is empty for pure synthesis +(sectionize wrappers, filter constructions); has one `Invocation` +entry for shortcode resolutions; can carry additional roles +(`ValueSource`, future `Dispatch`, extension-defined `Other(...)`) as +the provenance picture sharpens. + +The pre-existing `FilterProvenance` variant folds into `Generated` +(with `by.kind == "filter"`). ## Scope ### In scope -- Add `Synthetic { by: By }` variant to `SourceInfo` enum. -- Add `Derived { from: Arc<SourceInfo>, by: By }` variant. +- Add `Generated { by: By, from: SmallVec<[Anchor; 2]> }` variant to `SourceInfo`. Inline capacity 2 covers the steady-state shape after the deferred follow-ups land (Invocation + ValueSource on `meta`/`var` shortcodes; Invocation + Dispatch on Lua-handler shortcodes); see §Risk areas for the trade-off. - Define `By` struct: `{ kind: String, data: serde_json::Value }`. -- Implement builder methods on `By` for known kinds: `filter`, `sectionize`, - `user_edit`, `shortcode`, `include`, `title_block`, `footnotes`, - `appendix`, `tree_sitter_postprocess`, `raw` (escape hatch). +- Define `Anchor` struct: `{ role: AnchorRole, source_info: Arc<SourceInfo> }`. +- Define `AnchorRole` enum: `Invocation`, `ValueSource`, `Other(String)`. + (`Dispatch` is a planned future role; see "Deferred anchor role" below.) +- Implement builder methods on `By` for known kinds: `filter`, + `sectionize`, `user_edit`, `shortcode`, `include`, `title_block`, + `footnotes`, `appendix`, `tree_sitter_postprocess`, `raw` (escape hatch). +- Implement helper accessors on `SourceInfo` for the `Generated` shape: + - `invocation_anchor(&self) -> Option<&Arc<SourceInfo>>` + - `value_source_anchor(&self) -> Option<&Arc<SourceInfo>>` + - `anchors_with_role(&self, role: &AnchorRole) -> impl Iterator<Item = &Arc<SourceInfo>>` + - `append_anchor(&mut self, role: AnchorRole, source_info: Arc<SourceInfo>)` - Migrate all `SourceInfo::FilterProvenance` construction sites to - `SourceInfo::Synthetic { by: By::filter(...) }`.
-- Migrate all `SourceInfo::FilterProvenance` pattern-match sites (~22 files - flagged earlier). + `SourceInfo::Generated { by: By::filter(...), from: smallvec![] }`, + carrying `(filter_path, line)` in `by.data`. +- Migrate all `SourceInfo::FilterProvenance` pattern-match sites + (15 files, 27 occurrences — see §Risk areas) to the new shape. - Remove the `FilterProvenance` variant. -- Update accessors: `start_offset`, `end_offset`, `length`, `map_offset`, - `remap_file_ids`, `extract_file_id` (in diagnostic.rs) to handle both - new variants. For `Derived`: recurse into `from` for offset accessors - (returns the `from`'s offsets if the chain leads to Original). -- Update Lua serde (`pampa/src/lua/diagnostics.rs`) for both new variants. - Keep `"FilterProvenance"` recognized as a legacy tag that maps to - `Synthetic { by: By::filter(...) }` for back-compat reads. +- Update accessors on `SourceInfo` to handle `Generated`: + - `length`, `start_offset`, `end_offset` — return `0` (same as today's + `FilterProvenance`; Generated has no characteristic local-text length). + - `map_offset` — return `None` (offset-within-current-text is undefined + for Generated; callers wanting source coordinates use + `resolve_byte_range`). + - `resolve_byte_range` — delegate to `invocation_anchor()` and recurse + (returns the invocation anchor's chain-resolved range, or `None` if + there is no invocation anchor). + - `remap_file_ids` — walk every `Anchor.source_info` and recurse via + `Arc::make_mut`. Unlike `FilterProvenance` (no-op), `Generated` CAN + carry `FileId`s inside its anchors. + - File-id extraction across the workspace is **consolidated** into + two new `SourceInfo` accessors (see "File-id accessor consolidation" + below). The six ad-hoc walkers in `diagnostic.rs`, + `pampa/.../location.rs`, `pampa/.../pipe_table.rs`, + `pampa/.../section.rs`, `apply_template.rs` (test), and + `engine_execution.rs` (test) all collapse onto `root_file_id()` / + `collect_file_ids()`. 
The Generated arm is defined once on those + accessors. Empty-`from` Generated returns `None`, which matches + today's `FilterProvenance` behavior; the two call sites in + `to_ariadne_report` (`diagnostic.rs:674`, `:773`) both tolerate + `None` gracefully (the main-location path falls through via `?`; + the detail loop `continue`s), so no caller change is required + beyond the mechanical swap to `si.root_file_id()`. +- Update Lua serde (`pampa/src/lua/diagnostics.rs`) for `Generated`. + Use `t = "Generated"` as the discriminant; the table carries `by` and + `from` sub-tables. Keep `"FilterProvenance"` recognized as a legacy + tag that maps to `Generated { by: By::filter(...), from: smallvec![] }` + for back-compat reads. ### Out of scope - JSON wire format changes (Plan 5 does that). - Audit of transforms emitting `SourceInfo::default()` to fix them - (Plan 6 does that). -- The `preimage_in` accessor (Plan 7 does that). -- Helper accessors like `as_filter()` — minimal interface in Plan 4; - helpers added as call sites need them (Plans 6/7). + (Plan 6 does that). `Default for SourceInfo` itself is unchanged + (stays `Original { FileId(0), 0, 0 }`); Plan 6 fixes incorrect + emissions at transform sites without modifying the trait impl. +- The `preimage_in` accessor (Plan 7 owns it). Plan 7's `preimage_in` + consumes `invocation_anchor()` defined here; the contiguity rule + for `Concat` lives with the implementation in Plan 7. +- The `is_atomic_custom_node` registry for CustomNode types (Plan 7 + owns it). +- The metadata loader changes that would populate `ValueSource` + anchors on `meta` / `var` shortcode resolutions — that's a separate + follow-up (see "Deferred anchor role" and Plan 6's "ValueSource + follow-up" section). +- Registering Lua filter files in `SourceContext` to enable typed + `Dispatch` anchors. See "Deferred anchor role" below. 
+ +## Inherited pre-existing failure (bd-3odjm) + +**One test in the workspace is expected to be red throughout Plan 4 +and only goes green when Plan 5 ships its first reader change.** Do +not try to fix it inside Plan 4. + +- Test: `cargo nextest run -p quarto-core --test idempotence lua_shortcode_lipsum_fixed` + (orchestrator mode only; `SingleFile` passes). +- Symptom: panic with `MalformedSourceInfoPool` when + `pampa::readers::json::read` re-parses the orchestrator's AST JSON. +- Root cause (already established): wire-format type-code-3 + collision — writer emits the new `FilterProvenance` payload + `[filter_path, line]` under code 3, reader still decodes code 3 + as the legacy `Transformed` `[parent_id, ...]`. +- Owner: [Plan 5 — wire format](2026-05-04-q2-preview-plan-5-wire-format.md). + +Plan 4's verification gate (Phase 7) and `cargo xtask verify` +therefore expect **exactly one** failing test in +`quarto-core::idempotence` (the test above) until Plan 5's first +reader fix lands. Any other failure is a Plan-4 regression and must +be triaged before continuing. + +This is the integration branch's intended long-lived-red state per +Plan 3's §"Long-lived branch policy" — Plan 4 ships on top of that +queue, not in spite of it. + +## Work items + +Phase-ordered. Each phase compiles cleanly before the next begins. +"Settled" items below (design decisions, semantics rules) are detailed +later in the plan — this list is the actionable extract. + +### Phase 1 — Type definitions in `quarto-source-map` + +- [x] Add `smallvec` to the workspace `Cargo.toml` (`[workspace.dependencies]`) + with the `serde` feature, and depend on it from + `crates/quarto-source-map/Cargo.toml`. Verified absent in both files + at the start of Plan 4. +- [x] Add `By` struct (`kind: String`, `data: serde_json::Value` with + `#[serde(default, skip_serializing_if = "serde_json::Value::is_null")]` + — the attribute path needs to be fully qualified, not the short + `Value::is_null` form). 
+- [x] Add `AnchorRole` enum (`Invocation`, `ValueSource`, `Other(String)`). +- [x] Add `Anchor` struct (`role: AnchorRole`, `source_info: Arc<SourceInfo>`). +- [x] Add `Generated { by: By, from: SmallVec<[Anchor; 2]> }` variant + to `SourceInfo`. Keep `FilterProvenance` for now — it's removed + at the end of Phase 5. +- [x] Verify the new enum still implements `Debug`, `Clone`, + `PartialEq`, `Serialize`, `Deserialize` (including with the + `SmallVec` field — needs `serde` feature on `smallvec`). + +### Phase 2 — Constructors and accessors + +- [x] `By::filter`, `By::sectionize`, `By::user_edit`, `By::shortcode`, + `By::include`, `By::title_block`, `By::footnotes`, `By::appendix`, + `By::tree_sitter_postprocess`, `By::raw`. +- [x] `By::shortcode` doc-comment states the required-Invocation-anchor + invariant (see §"Required-anchor invariant for `shortcode`" for + the exact wording). +- [x] `By::is_atomic_kind` (returns true for `filter | shortcode | + title-block | tree-sitter-postprocess`). +- [x] `By::is_kind`, `By::as_filter`. +- [x] `Anchor::invocation`, `Anchor::value_source` constructors. +- [x] `SourceInfo::generated(by)` constructor (empty `from`). +- [x] `SourceInfo::invocation_anchor`, `SourceInfo::value_source_anchor`. +- [x] `SourceInfo::anchors_with_role`, `SourceInfo::append_anchor`. + +### Phase 3 — Update existing accessors for the `Generated` arm + +- [x] `length`, `start_offset`, `end_offset` → return `0` (in `source_info.rs`). +- [x] `map_offset` → return `None` (in `mapping.rs`). +- [x] `resolve_byte_range` → delegate to `invocation_anchor()` and recurse. +- [x] `remap_file_ids` → walk `from`, recurse via `Arc::make_mut`. +- [x] Add `SourceInfo::root_file_id() -> Option<FileId>` accessor in + `source_info.rs`. +- [x] Add `SourceInfo::collect_file_ids(&self, out: &mut HashSet<FileId>)` + accessor in `source_info.rs`. +- [x] Migrate `DiagnosticMessage::extract_file_id` + (`quarto-error-reporting/src/diagnostic.rs:556`) → call + `si.root_file_id()`; delete the private fn.
+- [x] Migrate `extract_filename_index` + (`pampa/src/pandoc/location.rs:329`) — deleted entirely (callers + were tests only; tests deleted in favor of the unified + `root_file_id`/`collect_file_ids` coverage in source-map). +- [x] Migrate the inline-match file-id extraction in + `pampa/src/pandoc/treesitter_utils/pipe_table.rs:256-279` → + `table_start.root_file_id().unwrap_or(FileId(0))`. Fixes the + latent nested-Substring `FileId(0)` fall-through. +- [x] Migrate the inline-match file-id extraction in + `pampa/src/pandoc/treesitter_utils/section.rs:129-152` → + `table.source_info.root_file_id().unwrap_or(FileId(0))`. Same + latent-nested-Substring bug fixed. +- [x] Migrate the test-mod `root_file_id` local fn in + `crates/quarto-core/src/stage/stages/apply_template.rs:820` → + `info.root_file_id()`; delete the local fn. +- [x] Migrate the test-mod `walk_source_info` inner fn in + `crates/quarto-core/src/stage/stages/engine_execution.rs:819` + → `si.collect_file_ids(out)`; per-Inline/per-Block walkers + retained, only the inner SourceInfo step swapped. + +### Phase 4 — Lua serde + +- [x] Add `Generated` arm to `source_info_to_lua_table` in + `pampa/src/lua/diagnostics.rs` (`t = "Generated"`, `by` and `from` + sub-tables; `by.data` is JSON-encoded as a string for Lua transit). +- [x] Add `Generated` arm to `source_info_from_lua_table`. +- [x] Keep `"FilterProvenance"` legacy reader: maps to + `Generated { by: By::filter(path, line), from: smallvec![] }`. + Indefinitely accepted; writes never emit it. + +### Phase 5 — Migration + +The migration is atomic — one PR, no deprecated-alias scaffold. Only +4 non-source-map callers of `SourceInfo::filter_provenance(...)` exist, +all trivially co-migrated with the 27 `SourceInfo::FilterProvenance` +pattern sites. + +- [x] Sweep remaining `SourceInfo::FilterProvenance` references — + `git grep "SourceInfo::FilterProvenance"` now returns 0 hits in + `crates/`. 
The legacy `"FilterProvenance"` tag survives only in + the Lua reader (as documented in Phase 4). +- [x] Sweep `SourceInfo::filter_provenance(...)` constructor-function + callers (4 non-source-map files + 1 in-crate test) → new + `Generated` shape inline; constructor deleted from + `source_info.rs`. +- [x] Remove the `FilterProvenance` variant from `SourceInfo`. + +### Phase 6 — Tests (see §Test plan for full descriptions) + +Type / builder: +- [x] Unit tests for every `By` builder (all 10 kinds incl. `raw`). +- [x] `By::is_atomic_kind` coverage (atomic set + extension kinds). +- [x] `By::is_kind` + `By::as_filter` coverage. +- [x] Unit tests for `Anchor::invocation` / `Anchor::value_source`. +- [x] JSON round-trip: `By`, `Anchor`, `Generated` (no anchors / with + Invocation / multi-anchor). + +Accessor tests on `Generated`: +- [x] `length` / `start_offset` / `end_offset` for `Generated` → `0`. +- [x] `map_offset` for `Generated` → `None` (covered by the existing + mapping tests — Generated falls through to the None arm). +- [x] `resolve_byte_range` recursion through `Invocation -> Substring` + → resolves correctly; empty `from` and ValueSource-only `from` + → `None`. +- [x] `remap_file_ids` for `Generated` walks every anchor's source_info + via `Arc::make_mut` (regression guard — must NOT be no-op). +- [x] `root_file_id` for every variant. +- [x] `collect_file_ids` for every variant, including Generated with + mixed-role anchors. +- [x] `invocation_anchor` coverage (present / absent / ValueSource-only). +- [x] `value_source_anchor` coverage (parallel). +- [x] `anchors_with_role` coverage (each known role + unknown role). +- [x] `append_anchor` mutator coverage. + +Structural: +- [x] Rename `test_filter_provenance_tracking` + (`filter_tests.rs:740-813`) → `test_filter_generated_tracking` + and updated assertions to the `Generated` shape with + `by.as_filter()` recovery. +- [x] `combine()` × `Generated` structural test (zero-length Concat + piece). 
+- [x] Lua-serde round-trip including legacy `"FilterProvenance"` tag + back-compat read. + +### Phase 7 — Verification gate + +- [x] `cargo build --workspace` clean. +- [x] `cargo nextest run --workspace --no-fail-fast`: 9370 passed, + 1 failed — `quarto-core::idempotence::lua_shortcode_lipsum_fixed` + (bd-3odjm, owned by Plan 5). No other regressions. +- [x] `cargo xtask verify --skip-rust-tests`: all 12 steps passed + (Rust build + hub-client npm install/build/wasm/tests + q2-preview + SPA build). Rust tests run separately with `nextest --no-fail-fast` + above. +- [x] `git grep "SourceInfo::FilterProvenance"` returns zero hits + across `crates/` (variant gone). +- [x] `git grep "SourceInfo::filter_provenance"` returns zero hits + across `crates/` (no alias was added; original constructor + removed in Phase 5). +- [x] `git grep '"FilterProvenance"'` in Rust code returns only the + legacy-Lua-reader arm (3 hits — comment in doc-comment for + `source_info_to_lua_table`, comment in `source_info_from_lua_table`, + and the match arm itself). No writer emissions, no other readers. + The `SerializableSourceMapping::FilterProvenance` identifier + (wire code 3, Plan 5-owned) is not a string literal and does not + match this grep. +- [x] `git grep "extract_filename_index\|fn root_file_id\|fn walk_source_info"` + across `crates/` returns one hit — the new + `SourceInfo::root_file_id` accessor in + `crates/quarto-source-map/src/source_info.rs`. Six ad-hoc walkers + retired. ## Design decisions (settled in conversation) -- **`Derived` is reintroduced** (we'd dropped it earlier and walked it - back). It came back because pure provenance preservation can't - distinguish "shortcode resolution" (atomic; user edits prohibited at - the writer level) from "filter mutation" (non-atomic; user edits - flow to source). Both have a preimage in the same file; both could - use Original; only Derived gives the writer a type-level way to know - which is which. 
+- **Single `Generated` variant, not two.** Earlier drafts proposed + `Synthetic` + `Derived` to separate "no preimage" from "has preimage + but is atomic." The unified `Generated { by, from: SmallVec<[Anchor; 2]> }` + expresses both with one variant: anchor-list empty for pure + synthesis, anchor-list with `Invocation` for shortcode-style + resolutions. The "has preimage" property is `gen.invocation_anchor().is_some()`, + not a separate enum arm. +- **`by` records generator identity; `from` records source contributions.** + These are orthogonal axes. Atomicity is determined by `by.kind` + (per the `is_atomic_kind()` predicate); anchor-presence is orthogonal + to atomicity. +- **Anchors are typed `Arc`, not dynamic JSON.** Path C in + the 2026-05-20 discussion: rather than stuff source-info chain + metadata into `by.data` (dynamic typing), use a typed list of + role-labeled anchors. `by.data` shrinks to per-kind *non-source-info* + configuration. - **Filter mutations stay Original**. A Lua filter that does `Str.text = upper(Str.text)` doesn't change source_info. The mutated - Str retains its Original chain. -- **Filter constructions become Synthetic**. `pandoc.Str("decoration")` - in a Lua filter produces `Synthetic { by: By::filter(filter_path, line) }` - (replaces the existing FilterProvenance auto-attachment). -- **Shortcode resolutions become Derived**. The shortcode resolver - emits `Derived { from: Original{shortcode_token_range}, by: - By::shortcode(name) }` on resolved nodes. Plan 6 owns this. + Str retains its Original chain. This is unchanged from the existing + Lua machinery contract. +- **Filter constructions become `Generated { by: filter, from: [] }`**. + `pandoc.Str("decoration")` in a Lua filter produces this shape (the + Lua machinery's auto-attach replaces the existing FilterProvenance + emission). Lua-file path and line live in `by.data` until + Lua-file-registration lands; then they migrate to a `Dispatch` anchor. 
+- **Shortcode resolutions become `Generated { by: shortcode(name), from: [Invocation -> token_si] }`.** + Plan 6 owns the resolver-side stamping; the resolver appends an + `Invocation` anchor pointing at the shortcode token's source range. - **Sugar transforms stay Original**. CalloutTransform et al. inherit - source_info from their input Div. They're not atomic — the user - editing a callout's body content is fine. + source_info from their input Div. The Div's bytes are the canonical + preimage of the resulting CustomNode wrapper; the wrapper's + `type_name` carries the generator identity, so `source_info` doesn't + need to also encode it. The same reasoning applies to Plan 8's + `CustomNode("IncludeExpansion")` wrapper. See "Original vs Generated + on synthesized nodes" below. - **`By` is an open struct, not a closed enum**. Forward-compatibility for TS-Quarto-Lua-port and extension-defined kinds. Mirrors the - `CustomNode.plain_data` pattern (also `serde_json::Value`-typed). + existing precedent in `CustomNode.plain_data` and `Artifact.metadata` + — open `serde_json::Value` at extension/dispatch seams; static typing + everywhere else. +- **`AnchorRole` is a closed enum with an `Other(String)` escape hatch**. + The known roles (`Invocation`, `ValueSource`) are the load-bearing + ones the core consults. `Other(String)` lets extensions or future + plans add roles without modifying the type. - **Kind-string convention**: kebab-case, namespaced for third-party - (`ext//foo`). + (`ext//foo`). Same for `AnchorRole::Other` values. +- **Anchor list ordering is append order**. `from` is a `SmallVec`; + iteration is insertion order. `append_anchor` pushes to the end. + Accessors that find by role (`invocation_anchor`, `value_source_anchor`) + return the first match — at most one anchor per known role by + convention. Serde round-trips preserve order. No producer sorts; + no consumer reorders. - **Builder methods for known kinds, plus `raw` escape hatch**. 
+ `By::raw(kind, data)` accepts any `kind` string — including built-in + names like `"shortcode"` or `"filter"`. Forgery (an extension calling + `By::raw("shortcode", …)` without the required Invocation anchor) + is caught downstream by Plan 6's audit-completion test and Plan 7's + `debug_assert!`, so no constructor-level rejection is needed. The + convention is still `ext//` for third-party kinds — + collisions with built-ins are a misuse caught at audit time, not a + type error. ## The proposed shape +**Naming.** Read the new variant as: this node was generated **by** some +transform, **from** some anchors. `by` records the producer; `from` is +the list of `Anchor`s that record the source-side contributions. The +items in the list are `Anchor` values; methods that operate on individual +items keep "anchor" in their name (`invocation_anchor`, +`value_source_anchor`, `append_anchor`, `anchors_with_role`), while the +field name and any Lua-table key use `from`. `by` / `from` reads cleanly +in both Rust and Lua serializations — preserve that pairing throughout. + ```rust #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum SourceInfo { Original { file_id: FileId, start_offset: usize, end_offset: usize }, Substring { parent: Arc, start_offset: usize, end_offset: usize }, Concat { pieces: Vec }, - Synthetic { by: By }, - Derived { from: Arc, by: By }, + Generated { by: By, from: SmallVec<[Anchor; 2]> }, } #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct By { - /// Short kind tag, kebab-case. Examples: "filter", "sectionize", - /// "user-edit", "shortcode", "include", "title-block". + /// Short kind tag, kebab-case. Examples: "filter", "shortcode", + /// "sectionize", "user-edit", "title-block". /// Third-party kinds should namespace: "ext/my-extension/foo". pub kind: String, - /// Free-form structured data specific to this kind. + /// Per-kind configuration that is NOT a source-info pointer. 
+ /// Anchors live in `Generated.from`, not here. /// `Null` for kinds that don't carry per-instance data. #[serde(default, skip_serializing_if = "serde_json::Value::is_null")] pub data: serde_json::Value, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Anchor { + pub role: AnchorRole, + pub source_info: Arc, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum AnchorRole { + /// The user-written construct that triggered this node's creation + /// (e.g. the `{{< meta foo >}}` token in the active document). + /// Load-bearing: the writer's `preimage_in` and attribution's + /// `resolve_byte_range` consult the first anchor with this role. + /// At most one per node by convention. + Invocation, + + /// Where the VALUE this node carries was defined, when distinct + /// from the invocation site (e.g. `footer:` in `_metadata.yml` for + /// a `{{< meta footer >}}` resolution). Diagnostic-only — does not + /// affect the writer or attribution decisions in v1. + ValueSource, + + /// Extension-defined or future role we haven't enumerated. + /// String is kebab-case, namespaced (`ext//`). + Other(String), +} + impl By { pub fn filter(filter_path: impl Into, line: usize) -> Self { ... } pub fn sectionize() -> Self { ... } @@ -123,6 +438,78 @@ impl By { pub fn appendix() -> Self { ... } pub fn tree_sitter_postprocess() -> Self { ... } pub fn raw(kind: impl Into, data: serde_json::Value) -> Self { ... } + + /// True if a `Generated { by: , .. }` node should be treated + /// as atomic by the incremental writer. Atomic nodes are produced + /// by the pipeline and represent content the user shouldn't edit + /// through React (filter constructions, shortcode resolutions, + /// synthesized title h1, tree-sitter-inserted spaces). + /// + /// Atomicity is determined by `kind` alone — orthogonal to + /// anchor-presence. A `Generated { by: shortcode, from: [...] }` + /// is atomic; so is a `Generated { by: filter, from: [] }`. 
+ pub fn is_atomic_kind(&self) -> bool { + matches!( + self.kind.as_str(), + "filter" | "shortcode" | "title-block" | "tree-sitter-postprocess" + ) + } + + pub fn is_kind(&self, kind: &str) -> bool { self.kind == kind } + + /// If this is a `filter` kind, return its `(filter_path, line)` payload. + pub fn as_filter(&self) -> Option<(&str, usize)> { + if self.kind != "filter" { return None; } + let path = self.data.get("filter_path")?.as_str()?; + let line = self.data.get("line")?.as_u64()? as usize; + Some((path, line)) + } +} + +impl Anchor { + pub fn invocation(source_info: Arc) -> Self { + Self { role: AnchorRole::Invocation, source_info } + } + pub fn value_source(source_info: Arc) -> Self { + Self { role: AnchorRole::ValueSource, source_info } + } +} + +impl SourceInfo { + pub fn generated(by: By) -> Self { + SourceInfo::Generated { by, from: SmallVec::new() } + } +} + +// Helper methods on Generated-shape access — typically called via +// matching `SourceInfo::Generated { by, from } => ...`. We provide +// the helpers as free functions on the variant pattern; example: + +impl SourceInfo { + /// If this is `Generated`, return the first anchor whose role is + /// `Invocation`. Returns `None` otherwise (including for + /// non-`Generated` variants). + pub fn invocation_anchor(&self) -> Option<&Arc> { + match self { + SourceInfo::Generated { from, .. } => from + .iter() + .find(|a| matches!(a.role, AnchorRole::Invocation)) + .map(|a| &a.source_info), + _ => None, + } + } + + /// If this is `Generated`, return the first anchor whose role is + /// `ValueSource`. Returns `None` otherwise. + pub fn value_source_anchor(&self) -> Option<&Arc> { + match self { + SourceInfo::Generated { from, .. } => from + .iter() + .find(|a| matches!(a.role, AnchorRole::ValueSource)) + .map(|a| &a.source_info), + _ => None, + } + } } ``` @@ -134,227 +521,666 @@ impl By { combine_all). Existing pattern. 
**Contiguity expectation**: writer paths that need to Verbatim-copy a Concat (Plan 7's `preimage_in`) return `Some(range)` only when all pieces resolve into the target - file AND are byte-contiguous in source order (`pieces[i].end == - pieces[i+1].start`). Non-contiguous Concats (rare; would arise if a - transform composed source-info from disparate file regions) return - `None` from `preimage_in`, and Plan 7's coarsen falls through to - Rewrite for that node. This is a Plan 7 invariant, not a Plan 4 - type-system invariant — Plan 4 doesn't forbid gappy Concats. If a - future use case needs to construct a gappy Concat intentionally, no - Plan 4 change is required; Plan 7's writer behavior already handles - the case. -- **Synthetic**: NO source preimage. The node was created from nothing. - Sectionize wrappers, filter constructions, synthesized title h1s. - Writer omits or recurses (Plan 7). -- **Derived**: HAS a source preimage but is a distinct transform output. - The `from` chain points at the source bytes; `by` describes the - transform. Writer treats as atomic (Plan 7) — KeepBefore Verbatim - copies preimage; UseAfter triggers AtomicViolation. Used for shortcode - resolutions; later for crossref cite resolutions if/when needed. + file AND are byte-contiguous in source order. Non-contiguous Concats + return `None`, and Plan 7's coarsen falls through to Rewrite. +- **Generated**: produced by a pipeline transform. `by` records the + producer; `from` records any source-side contributions. The + variant subsumes the previous `Synthetic`/`Derived` distinction: + - Empty anchors → pure synthesis (sectionize wrappers, filter + constructions, title-block h1, tree-sitter postprocess, footnotes + container, appendix wrapper, user-edit). + - `Invocation` anchor present → has a source-side preimage (every + shortcode resolution; future filter-with-trigger-anchor cases). 
+ - `ValueSource` anchor present → records where the value came from + (future, gated on metadata-loader changes). + - `Other(...)` anchor present → extension-defined. + + Writer behavior (Plan 7) consults `by.is_atomic_kind()` for + atomicity and `gen.invocation_anchor()` for the preimage byte range. + +## Original vs Generated on synthesized nodes + +Two pieces of provenance information need to land somewhere when a +transform produces a node: + +1. **Generator identity** — "which transform produced me." +2. **Source anchor** — "which source bytes are this node's canonical preimage." + +For non-CustomNode synthesized nodes (sectionize Div, filter Str, +footnotes Div), there's no other slot for (1), so `source_info` carries +both via `Generated { by, from }`. + +For CustomNode synthesized nodes, (1) is already encoded in +`CustomNode.type_name`. The wrapper *is* a `Callout` / `IncludeExpansion` +/ `CrossrefResolvedRef` by virtue of `type_name`; `source_info` only +needs to do (2). And the natural shape for (2) — when the CustomNode +1:1-substitutes for a parser-emitted source-mapped node — is the +inherited `Original` (or whatever `SourceInfo` shape the substituted +node carried). + +| Synthesized node kind | Has CustomNode `type_name`? | Substitutes 1:1 for source-mapped node? | `source_info` shape | +|---|---|---|---| +| `IncludeExpansion` wrapper (Plan 8) | Yes | Yes (the include-line Paragraph) | Original (inherited) | +| `Callout` / `Theorem` / `Proof` / etc. 
| Yes | Yes (the source Div) | Original (inherited) | +| `CrossrefResolvedRef` | Yes | Yes (the source Cite) | Original (inherited) | +| `FloatRefTarget` | Yes | Yes (the source Div) | Original (inherited) | +| Sectionize Section Div | No | No (structural grouping) | `Generated { by: sectionize, from: [] }` | +| Footnotes container Div | No | No (structural grouping) | `Generated { by: footnotes, from: [] }` | +| Appendix wrapper Div | No | No (structural grouping) | `Generated { by: appendix, from: [] }` | +| Title-block synthesized h1 | No | No (synthesized from `title:` YAML) | `Generated { by: title_block, from: [] }` | +| Tree-sitter postprocess Space | No | No (inserted between nodes) | `Generated { by: tree_sitter_postprocess, from: [] }` | +| Shortcode resolution output | No | No (resolved from value, distinct from token bytes) | `Generated { by: shortcode("…"), from: [Invocation, …] }` | +| Filter-constructed node | No | No (filter computed it) | `Generated { by: filter, from: [] }` (Dispatch anchor in the future) | + +The rule: + +> A synthesized node uses **Original** `source_info` if and only if it +> is a CustomNode whose 1:1 source preimage is a parser-emitted node. +> Everything else uses **Generated**. + +## `by.data` shape per kind + +`by.data` is open `serde_json::Value` (matching the `CustomNode.plain_data` +and `Artifact.metadata` precedents). The known shapes per kind are: + +| `by.kind` | `by.data` contents | +|---|---| +| `shortcode` (Rust handler) | `{ "name": "" }` | +| `shortcode` (Lua handler) | `{ "name": "", "lua_path": "", "lua_line": }` until Lua-file-registration; then just `{ "name": "" }` | +| `filter` | `{ "filter_path": "", "line": }` until Lua-file-registration; then `{}` | +| `sectionize` / `footnotes` / `appendix` / `title-block` / `tree-sitter-postprocess` / `user-edit` | `{}` (empty) | +| `ext//` (third-party) | extension-defined, opaque to core | + +Convention: `data` is a JSON object with kind-specific known fields. 
+Consumers must treat unknown fields as opaque metadata. Producers may +add fields without breaking readers that don't look for them. Adding a +new field to a known kind's `data` is a non-breaking change. + +This same convention applies to `CustomNode.plain_data`; Plan 4 codifies +it once for both seams. The pattern is "open Value at extension/dispatch +seams; static typing everywhere else" — `Anchor.source_info` stays +typed `Arc<SourceInfo>`; only the truly per-kind, heterogeneous data +sits in `by.data`. + +## Atomic-kind set + +`By::is_atomic_kind()` returns true for kinds whose nodes are "atomic" +from the incremental writer's perspective — nodes the user can't edit +honestly through React, because the pipeline regenerated them from +source-side input. + +| `by.kind` | Atomic? | Role | +|---|---|---| +| `filter` | Yes | filter-constructed leaves; user edits the filter, not the output | +| `shortcode` | Yes | shortcode resolutions; user edits the token, not the resolved content | +| `title-block` | Yes | synthesized title h1; user edits `title:` metadata | +| `tree-sitter-postprocess` | Yes | parser-side synthetic spaces | +| `sectionize` | No (Transparent) | structural wrapper; children are editable | +| `footnotes` | No (Transparent) | container; children are editable | +| `appendix` | No (Transparent) | container; children are editable | +| `user-edit` | No | React-constructed; user-typed by definition | + +Atomicity is per-kind, orthogonal to `from`. A `Generated { by: shortcode, +from: [Invocation -> token_si] }` is atomic; so is a +`Generated { by: filter, from: [] }`. The writer's coarsen +(Plan 7) consults `by.is_atomic_kind()` and `gen.invocation_anchor()` +independently. + +Extensions that contribute new `by.kind` values are not atomic by +default. If an extension wants its kind to be atomic, the +`is_atomic_kind()` predicate (or a follow-up extension-registration +mechanism — see Plan 7 §Open questions) needs to recognize it. v1 +hardcodes the built-in set.
+ +### Required-anchor invariant for `shortcode` + +A `Generated { by: shortcode(...), from: [] }` is **not a valid state**. +Every shortcode-resolution node must carry at least one `Invocation` +anchor pointing at the source token's byte range. The resolver +(Plan 6) is responsible for maintaining this invariant; downstream +consumers (Plan 7's writer, error-reporting) may assume it. + +Plan 4 documents the invariant; enforcement is split across the two +producers/consumers of the shape: + +- **Plan 6 (producer)** owns the audit-completion test that walks the + post-stamping AST and asserts no `Generated { by: shortcode, from: [] }` + remains. The stamper is the only construction site for `by: shortcode` + in v1; the test verifies it always attaches the `Invocation` anchor. +- **Plan 7 (consumer)** adds a `debug_assert!` in `coarsen`'s + atomic-no-anchor branch. The writer routes "atomic + no invocation" + to `Omit` (drop the node, pipeline regenerates next run); for filter + that's correct, for shortcode it's silent data loss — the assertion + catches the bad shape before that branch fires, in dev / test builds. + +No constructor-level enforcement in v1. The `By::shortcode(name)` +builder stays symmetric with the other `By::xxx()` builders; the +required-anchor invariant is a *resolver* invariant, not a *type* +invariant. If a second required-anchor rule appears later, promote +the audit assertion into a shared validator pass. + +The `By::shortcode` doc-comment must state the invariant explicitly, +so anyone reaching for the builder from a new call site reads: + +```rust +/// Construct a `By` for a shortcode resolution. +/// +/// **Invariant.** Every `Generated { by: shortcode(...), .. }` must +/// carry at least one `Invocation` anchor in `from` pointing at the +/// source token's byte range. 
Use only inside a `Generated` whose +/// anchor list is populated; constructing the bare shape with empty +/// `from` is rejected by Plan 6's audit-completion test and trips +/// Plan 7's writer `debug_assert!`. +pub fn shortcode(name: impl Into) -> Self { ... } +``` ## Migrations The pre-existing `FilterProvenance` is renamed/folded: - **Construction**: `SourceInfo::filter_provenance("path", 42)` → - `SourceInfo::Synthetic { by: By::filter("path", 42) }`. - Add a deprecated alias `SourceInfo::filter_provenance` that constructs - the new shape, eased migration; remove after migration completes. -- **Pattern-match**: every `SourceInfo::FilterProvenance { filter_path, line }` - arm becomes `SourceInfo::Synthetic { by }` and inspects `by.kind == - "filter"` and `by.data["filter_path"]` / `by.data["line"]`. Or a small - helper `By::as_filter() -> Option<(&str, usize)>` for the common case. + `SourceInfo::Generated { by: By::filter("path", 42), from: smallvec![] }`. + The `(filter_path, line)` pair lives in `by.data` until + Lua-file-registration lands. No deprecated alias is shipped; the + 4 non-source-map callers are migrated inline in the same PR (see + Phase 5). +- **Pattern-match (production)**: every `SourceInfo::FilterProvenance { filter_path, line }` + arm becomes `SourceInfo::Generated { by, .. }` and inspects via + `by.as_filter()` to recover the path/line. +- **Pattern-match (tests)**: `Some(SourceInfo::FilterProvenance { filter_path, line })` + becomes `Some(SourceInfo::Generated { by, .. })` with `by.as_filter()` + for path/line recovery. Empty-bind sites + (`Some(SourceInfo::FilterProvenance { .. }) => {}`) become the + guard form: `Some(SourceInfo::Generated { by, .. }) if by.is_kind("filter") => {}`. + Affected sites verified by grep: `pampa/src/lua/diagnostics.rs:444, 509, 802`, + `pampa/src/lua/filter_tests.rs:1802`, plus the renamed + `test_filter_provenance_tracking` at `filter_tests.rs:740-813`. 
+- **JSON writer arm** (`pampa/src/writers/json.rs:314`): the + pattern-match site must stay exhaustive over `SourceInfo` after the + variant is gone. Plan 4 produces only `by.kind == "filter"` + Generated values; Plan 5 owns wire-code 4 for non-filter kinds. + The interim arm emits the legacy code-3 payload exactly as today, + preserving bd-3odjm's expected failure mode: -## `By` helper accessors + ```rust + SourceInfo::Generated { by, .. } => { + let (filter_path, line) = by.as_filter().expect( + "Plan 4 produces only filter-kind Generated; non-filter \ + Generated requires Plan 5's wire-code 4 emitter", + ); + ( + 0, + 0, + SerializableSourceMapping::FilterProvenance { + filter_path: filter_path.to_string(), + line, + }, + ) + } + ``` -Plan 4 ships these helpers up front, so call sites in Plans 6 and 7 read -provenance consistently rather than each writing ad-hoc string-equality -checks against `by.kind`: + Plan 5 replaces this with the wire-code 4 emitter and removes the + `SerializableSourceMapping::FilterProvenance` variant. +- **Lua serde**: read `"FilterProvenance"` tag (legacy) and reconstruct + as `Generated { by: By::filter(...), from: smallvec![] }`. New + constructions emit `"Generated"` tag with `by` and `from` sub-tables + (per §In scope). -```rust -impl By { - /// True if this kind matches the given string (sugar for `self.kind == kind`). - pub fn is_kind(&self, kind: &str) -> bool { self.kind == kind } +## File-id accessor consolidation - /// If this is a `filter` kind, return its `(filter_path, line)` payload. - /// Returns None for any other kind. - pub fn as_filter(&self) -> Option<(&str, usize)> { - if self.kind != "filter" { return None; } - let path = self.data.get("filter_path")?.as_str()?; - let line = self.data.get("line")?.as_u64()? 
as usize; - Some((path, line)) - } +Six SourceInfo walkers across the workspace conceptually do the same +operation — "give me the FileId(s) this SourceInfo refers to" — but +diverge on Concat semantics, Substring recursion depth, and return +type: - /// True if a `Synthetic { by: }` node should be treated as - /// atomic by the incremental writer. Atomic Synthetic nodes are - /// constructed by the pipeline with no source preimage and represent - /// content the user shouldn't edit through React (filter-constructed - /// inlines, synthesized title h1, tree-sitter-inserted spaces). - /// - /// The writer's coarsen step (Plan 7) uses this to decide: - /// - KeepBefore on atomic Synthetic → Omit (drop from output; - /// pipeline regenerates next run). - /// - UseAfter / RecurseIntoContainer on atomic Synthetic → soft-drop - /// substitution + Q-3-42 warning. +| Site | Returns | Concat policy | Substring | Status | +|---|---|---|---|---| +| `quarto-error-reporting/src/diagnostic.rs:556` `extract_file_id` | `Option<FileId>` | `first().and_then` | full recursion | private, production | +| `pampa/src/pandoc/location.rs:329` `extract_filename_index` | `Option<usize>` | `iter().find_map` | full recursion | pub, production, has tests | +| `pampa/src/pandoc/treesitter_utils/pipe_table.rs:256-279` (inline match) | `FileId` (FileId(0) fallback) | first piece only | **one level only** — broken for nested Substring | production, latent bug | +| `pampa/src/pandoc/treesitter_utils/section.rs:129-152` (inline match) | `FileId` (FileId(0) fallback) | first piece only | **same shallow bug** | production, latent bug | +| `quarto-core/src/stage/stages/apply_template.rs:820` `root_file_id` | `Option<FileId>` | `first().and_then` | full recursion | test mod | +| `quarto-core/src/stage/stages/engine_execution.rs:813` `collect_file_ids` / `walk_source_info` | `HashSet<FileId>` | walks every piece | full recursion | test mod | + +Plan 4 consolidates these onto two methods on `SourceInfo`: + +```rust +impl SourceInfo { + 
/// First FileId reachable from this SourceInfo's root. /// - /// Non-atomic Synthetic kinds are transparent containers (Sectionize, - /// Footnotes, Appendix wrappers) whose children carry their own - /// source preimage; the writer recurses into children rather than - /// dropping or substituting. - pub fn is_atomic_synthesizer(&self) -> bool { - matches!( - self.kind.as_str(), - "filter" | "title-block" | "tree-sitter-postprocess" - ) + /// Original → `Some(file_id)`. + /// Substring → recurse parent. + /// Concat → `pieces.iter().find_map(|p| p.source_info.root_file_id())` + /// (find_map semantics — strict superset of every existing + /// "first piece" caller; skips Generated holes and empty pieces). + /// Generated → `invocation_anchor().and_then(|si| si.root_file_id())`; + /// `None` when no Invocation anchor is present. + pub fn root_file_id(&self) -> Option<FileId> { ... } + + /// Every FileId reachable from this SourceInfo. Walks every + /// Original, every Substring parent, every Concat piece, and + /// every Generated anchor (all roles — Invocation, ValueSource, + /// Other). + pub fn collect_file_ids(&self, out: &mut HashSet<FileId>) { ... 
} +} +``` + +Migration table (Phase 3): + +| Old | New | +|---|---| +| `DiagnosticMessage::extract_file_id(si)` | `si.root_file_id()` (delete private fn) | +| `extract_filename_index(si)` | `si.root_file_id().map(\|fid\| fid.0)` (kept as a one-line shim or inlined) | +| pipe_table.rs inline match → `FileId` | `table_start.root_file_id().unwrap_or(FileId(0))` — also fixes nested-Substring bug | +| section.rs inline match → `FileId` | `table.source_info.root_file_id().unwrap_or(FileId(0))` — same fix | +| test `root_file_id` (apply_template.rs) | `info.root_file_id()` (delete local fn) | +| test `walk_source_info` (engine_execution.rs) | `si.collect_file_ids(out)` (delete inner fn) | + +Net effect: ~60 LOC of duplicate walkers removed, two latent +production bugs fixed (nested-Substring fall-through to FileId(0)), +and the Generated arm is defined exactly once. + +## Deferred anchor role + +**`Dispatch` anchor (future).** When a Lua-implemented shortcode +handler or user filter constructs a node, the natural shape for +"where in Lua source was this constructed" is: + +```rust +Anchor { + role: AnchorRole::Dispatch, // not in v1 + source_info: Arc::new(Original { file_id: kbd_lua_id, start, end }), +} +``` + +This requires Lua filter files to be registered in `SourceContext` so +they have `FileId`s. That's its own infrastructure work touching the +Lua engine, the source context, the diagnostic machinery, and the +cache-key surface. We defer it. + +In the interim, the Lua machinery continues to carry `(filter_path, +line)` in `by.data` (see the `by.data` table above for `filter` and +Lua-dispatched `shortcode` kinds). When the Lua-file-registration +follow-up lands, the data migrates out of `by.data` and into a +`Dispatch` anchor; `AnchorRole::Dispatch` joins the enum (a +forward-compatible enum extension); and `by.data` for those kinds +shrinks to per-kind config only. 
+ +The migration applies to **both** affected kinds, symmetrically: + +| kind | shape today | shape after Lua-file-registration | +|---|---|---| +| `filter` | `Generated { by: filter{path, line}, from: [] }` | `Generated { by: filter{}, from: [Dispatch -> lua_si] }` | +| `shortcode` (Lua handler) | `Generated { by: shortcode{name, lua_path, lua_line}, from: [Invocation -> token_si] }` | `Generated { by: shortcode{name}, from: [Invocation -> token_si, Dispatch -> lua_si] }` | +| `shortcode` (Rust handler) | `Generated { by: shortcode{name}, from: [Invocation -> token_si] }` | unchanged (no Lua source to point at) | + +A Lua-handler shortcode after registration carries **two** anchors — +`Invocation` for the user-written token, `Dispatch` for the Lua +handler that resolved it. The anchor list is what makes this clean: +adding `Dispatch` doesn't disturb `Invocation`, and the writer's +preimage walk (Plan 7) still looks at `invocation_anchor()` only. + +Tracked as **bd-36fr9** ("Provenance follow-up: Dispatch anchor for +Lua-handler filter & shortcode"). + +**`ValueSource` anchor (defined, deferred firing).** +`AnchorRole::ValueSource` is defined in Plan 4's type. The shortcode +resolver doesn't attach it yet, because the metadata loader doesn't +record per-key source-info today (every metadata key's `source_info` +points at where the value was parsed from, but the merged metadata +that the resolver consults doesn't expose this). A separate follow-up +issue covers extending the metadata loader to thread per-key source +through to the merged value. When that lands, Plan 6's stamper +appends `ValueSource` anchors for `meta` and `var` shortcode +resolutions whose values came from outside the active document. + +Tracked as **bd-129m3** ("Provenance follow-up: ValueSource anchor +stamping for meta/var shortcodes"). + +Both follow-ups are pure additions when they land — neither requires +reopening Plan 4's type design. The shape is forward-compatible by +construction. 
+ +## Resolve-byte-range semantics + +`resolve_byte_range` is Plan 4's responsibility (existing accessor on +`SourceInfo`, gains a `Generated` arm). `preimage_in` is Plan 7's — +Plan 4 only ships the building block it depends on, `invocation_anchor()`. + +```rust +impl SourceInfo { + pub fn resolve_byte_range(&self) -> Option<(usize, usize, usize)> { + match self { + SourceInfo::Original { file_id, start_offset, end_offset } => + Some((file_id.0, *start_offset, *end_offset)), + SourceInfo::Substring { parent, start_offset, end_offset } => { + let (fid, parent_start, _) = parent.resolve_byte_range()?; + Some((fid, parent_start + start_offset, parent_start + end_offset)) + } + SourceInfo::Concat { .. } => None, + SourceInfo::Generated { .. } => self + .invocation_anchor() + .and_then(|si| si.resolve_byte_range()), + } } } ``` -Atomic vs. transparent vs. editable Synthetic kinds (decided in -conversation; the table in §Notes shows the full mapping): - -- **Atomic** (`is_atomic_synthesizer() == true`): `filter`, `title-block`, - `tree-sitter-postprocess`. Pipeline-generated content with no source - preimage; user can't edit honestly. -- **Transparent** (`is_atomic_synthesizer() == false`, has children): - `sectionize`, `footnotes`, `appendix`. Container synthesis; children - are editable per their own provenance. -- **Editable** (`is_atomic_synthesizer() == false`, materializable): - `user-edit`. Explicitly user-typed; qmd writer serializes via Rewrite. -- **Escape hatch** (`raw`): not atomic by default; extensions that need - atomic behavior should namespace their kind under `ext//...` and - consider whether `is_atomic_synthesizer` needs to recognize their - kinds (open extension question; v1 doesn't address registration). - -Add more accessors as Plans 6/7 surface concrete repeated patterns. The -above three cover the immediate needs (filter-provenance recovery in -tests, generic kind matching in writer dispatch, atomicity classification -for the writer). 
Don't proliferate accessors preemptively — -`as_shortcode()`, `as_sectionize()`, etc. can be added if their call -sites prove repetitive. - -## Builder list is extensible - -The `By` builder list above (`filter`, `sectionize`, `user_edit`, etc.) is -the v1 known set. **Plan 6's audit may discover sites Plan 4 didn't -anticipate** — if so, Plan 6 adds new `By::()` builders to extend -the set. Builders are inert from Plan 4's perspective (a builder is just -a constructor that produces `By { kind: "...", data: ... }`); adding one -doesn't require reasoning about Plan 4's invariants. - -Convention: each new builder gets a doc-comment explaining what kind of -node uses it and why. Keeps the `By` type's purpose discoverable. - -## Open questions for implementation - -- **Lua serde back-compat**: read `"FilterProvenance"` tag (legacy) and - reconstruct as `Synthetic { by: By::filter(...) }`. New constructions - emit `"Synthetic"` tag. Read both indefinitely; writes migrate to new - immediately. -- **Tests update**: `pampa/src/lua/filter_tests.rs::test_filter_provenance_tracking` - asserts on `SourceInfo::FilterProvenance`. Update to assert on - `Synthetic { by }` with `by.is_kind("filter")` and check - `by.as_filter()` returns the right path/line. +The `Generated` arm collapses to "look up the invocation anchor; +recurse into its source_info." Pure synthesis (empty `from`) returns +`None`. Multi-anchor Generateds (when `ValueSource` lands) still only +consult `Invocation` — `ValueSource` is diagnostic-only. + +Plan 7's `preimage_in` follows the same `Generated` pattern (it +delegates to `invocation_anchor()`); see Plan 7 §"`preimage_in` +semantics" for the full implementation including Concat contiguity. ## References -- `crates/quarto-source-map/src/source_info.rs:22` — current SourceInfo enum. -- `crates/quarto-source-map/src/source_info.rs:48-54` — current - FilterProvenance variant. 
-- `crates/quarto-source-map/src/source_info.rs:185-237` — accessors that - need updating (start_offset, end_offset, length, remap_file_ids). -- `crates/quarto-source-map/src/mapping.rs:17-74` — `map_offset` recursion; - needs new arm. -- `crates/pampa/src/lua/diagnostics.rs:60-145` — Lua serde to extend. -- `crates/pampa/src/lua/filter_tests.rs:663-728` — test to update. +- `crates/quarto-source-map/src/source_info.rs:21-55` — current + `SourceInfo` enum (incl. `FilterProvenance` variant at lines 49-54). +- `crates/quarto-source-map/src/source_info.rs:162-264` — accessors that + need updating (`length`, `start_offset`, `end_offset`, + `resolve_byte_range`, `remap_file_ids`). +- `crates/quarto-source-map/src/mapping.rs:17-74` — `map_offset` + recursion; needs `Generated` arm (returns `None`, like + `FilterProvenance` does today). +- `crates/quarto-error-reporting/src/diagnostic.rs:556-575` — + `extract_file_id` private fn; retired in favor of + `SourceInfo::root_file_id()`. +- `crates/pampa/src/pandoc/location.rs:328-344` — `extract_filename_index`; + reduced to a one-line shim over `root_file_id()` (or inlined at + callers). Has dedicated tests at `location.rs:588-655`. +- `crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs:256-279` — + inline file-id extraction; retired in favor of + `root_file_id().unwrap_or(FileId(0))`. Also fixes a latent + nested-Substring bug. +- `crates/pampa/src/pandoc/treesitter_utils/section.rs:129-152` — + same shape and same latent fix. +- `crates/quarto-core/src/stage/stages/apply_template.rs:820-829` — + test-mod `root_file_id`; retired in favor of `SourceInfo::root_file_id()`. +- `crates/quarto-core/src/stage/stages/engine_execution.rs:813-832` — + test-mod `walk_source_info`; retired in favor of + `SourceInfo::collect_file_ids()`. +- `crates/pampa/src/lua/diagnostics.rs:50-145` — Lua serde to extend. 
+- `crates/pampa/src/lua/filter_tests.rs:740-813` — `test_filter_provenance_tracking`; rename and update assertions to the `Generated` shape. - `crates/quarto-pandoc-types/src/custom.rs:75` — `CustomNode.plain_data` - (the prior-art shape we're mirroring). + (the prior-art for `serde_json::Value` at extension seams; same + convention now applies to `By.data`). +- `crates/quarto-core/src/artifact.rs:71` — `Artifact.metadata` + (second precedent for the same pattern). ## Test plan -- Unit tests for each `By` builder method (constructs the right kind and data). +### Type / builder tests + +- Unit tests for each `By` builder method (constructs the right kind + and data). Cover all ten: `filter`, `sectionize`, `user_edit`, + `shortcode`, `include`, `title_block`, `footnotes`, `appendix`, + `tree_sitter_postprocess`, `raw`. +- `By::is_atomic_kind()` test: confirms the set named in §"Atomic-kind + set" returns `true` exactly for `filter | shortcode | title-block | + tree-sitter-postprocess` and `false` for everything else (including + extension `ext/…/…` kinds). +- `By::is_kind()` / `By::as_filter()` coverage. +- Unit tests for `Anchor::invocation()` / `Anchor::value_source()` + constructors. - Round-trip test: `By` → JSON → `By` (serde derive). -- Integration test: filter-provenance test (renamed from - `test_filter_provenance_tracking`) confirms a filter-created Str gets - `Synthetic { by: By::filter(...) }` source_info. -- Derived round-trip: build a `Derived { from: Original, by: By::shortcode("...") }` - value; round-trip through JSON (Plan 5) and Lua serde; assert structural - equality. -- Accessor recursion test: a `Derived` value's `start_offset()` / `end_offset()` - / `length()` walk through `from` and return the from's offsets. +- Round-trip test: `Anchor` → JSON → `Anchor` (serde derive). + +### Accessor tests on `Generated` + +- `length()` / `start_offset()` / `end_offset()` for `Generated` + return `0` regardless of `from` contents. 
+- `map_offset()` for `Generated` returns `None` regardless of offset + argument. +- `resolve_byte_range()` recursion: a + `Generated { from: [Invocation -> Substring{parent: Original{42, 100, 200}, 10, 20}] }` + resolves to `(42, 110, 120)`. A `Generated` with empty `from` returns + `None`. A `Generated` with only a `ValueSource` anchor (no + `Invocation`) returns `None`. (Plan 7 owns the matching `preimage_in` + tests.) +- `remap_file_ids()` for `Generated`: build a + `Generated { from: [Invocation -> Original{FileId(0), …}, ValueSource -> Original{FileId(3), …}] }`, + apply `|id| FileId(id.0 + 10)`, assert both anchors' source_info + carry remapped FileIds. This catches the "no-op like FilterProvenance" + regression — `Generated` must NOT be a no-op since it can hold FileIds. +- `root_file_id()` coverage on every variant. Generated with an + Invocation anchor pointing at `Original{file_id: FileId(7), ...}` + returns `Some(FileId(7))`. Generated with only a `ValueSource` + anchor returns `None` (matches the empty-`from` case — only + Invocation participates in `root_file_id`). Concat with + `[Generated{empty}, Original{42}]` returns `Some(FileId(42))` + (find_map skips the empty Generated piece) — this also pins the + Plan-3 latent bug fixed by the new accessor on + pipe_table.rs / section.rs. +- `collect_file_ids()` coverage: Generated with + `[Invocation -> Original{FileId(1), ...}, ValueSource -> Original{FileId(2), ...}, Other(...) -> Original{FileId(3), ...}]` + populates the set with `{FileId(1), FileId(2), FileId(3)}` — confirms + that all anchor roles participate, not just Invocation. Concat, + Substring, nested compositions: every reachable FileId lands. +- `invocation_anchor()` accessor: a Generated with `[Invocation -> X]` + returns `Some(X)`; with `[]` returns `None`; with `[ValueSource -> Y]` + (no Invocation) returns `None`. +- `value_source_anchor()` accessor: parallel coverage. 
+- `anchors_with_role()` accessor: a Generated with + `[Invocation -> X, ValueSource -> Y, Other("foo") -> Z]` returns the + right anchors for each role, and an empty iterator for an unknown role. +- `append_anchor()` mutator: starting from `Generated { from: [] }`, + append an Invocation then a ValueSource; assert both are present in + order. + +### Structural tests + +- Integration test: filter-provenance test renamed from + `test_filter_provenance_tracking` (at `filter_tests.rs:740-813`) + confirms a filter-created Str gets `Generated { by: filter, from: [] }` + with `(filter_path, line)` recoverable via `by.as_filter()`. +- `combine()` × `Generated` structural test: combining an `Original` + with a `Generated` produces a `Concat` whose Generated piece has + length `0` (matches `Generated::length()`). `map_offset` over the + combined Concat skips the Generated piece. This pins behavior even + though no production code path combines Generated source_info today. - Lua-serde round-trip: typed → Lua table → typed, including legacy - `"FilterProvenance"` tag back-compat. + `"FilterProvenance"` tag back-compat (reads as `Generated { by: + filter, from: [] }`; never round-trips back to `FilterProvenance`). ## Dependencies -- Depends on: nothing (pure type change in the foundation crate). -- Blocks: Plan 5 (wire format extension), Plan 6 (provenance audit), Plan 7 - (writer's preimage walk uses Synthetic and Derived). +- Depends on: nothing (pure type change in the foundation crate, plus + consolidation of file-id walkers across `quarto-core`, `pampa`, and + `quarto-error-reporting` that all already depend on + `quarto-source-map`). +- Blocks: Plan 5 (wire format extension), Plan 6 (provenance audit), + Plan 7 (writer's preimage walk uses Generated and the + `invocation_anchor` helper). ## Risk areas -- **Migration scope**: ~22 files pattern-match `SourceInfo` variants. Each - needs migration arms for *both* `Synthetic` and `Derived`. 
Most are - mechanical: Synthetic arm returns what FilterProvenance did (usually - `0`, `0`, or `None`); Derived arm recurses into `from` for offset - accessors and returns the same as Synthetic for FileId-extracting helpers. -- **`Derived` accessor recursion**: `start_offset()`, `end_offset()`, - `length()` need to recurse into `from`. A long Derived chain could - in principle stack overflow, but in practice chains are 1-2 deep. - Same risk profile as Substring. -- **`serde_json::Value` in PartialEq derives**: `Value` implements `PartialEq` - but with potentially weird semantics for floats. For our use, kinds are - string + small structured data; should be fine. Test the cases. +- **Migration scope**: 15 files pattern-match `SourceInfo::FilterProvenance` + (27 occurrences total — verified by grep against the worktree). + Phase 3's file-id-walker consolidation retires ~6 of those by + replacing entire match expressions (the file-id-extraction sites in + `diagnostic.rs`, `location.rs`, `pipe_table.rs`, `section.rs`, + `apply_template.rs`, `engine_execution.rs`). Phase 5 sweeps the + ~21 remaining arms. Most are mechanical: the `Generated` arm + returns what `FilterProvenance` did today (`0`/`0`/`None` for + offset/length accessors; delegates to `invocation_anchor()` for + `resolve_byte_range`). File-id traversals are handled exactly once, + inside the new `root_file_id` / `collect_file_ids` accessors — + callers walk through those rather than re-implementing the recursion + per call site. +- **Anchor-list allocation**: `from` is typed `SmallVec<[Anchor; 2]>` + from day 1 (with the `serde` feature enabled). 
Inline capacity of 2 + covers all expected shapes through the deferred follow-ups with zero + heap allocation: + - empty (sectionize / footnotes / appendix / title-block / + tree-sitter-postprocess / filter constructions today) — the bulk + of synthesized nodes; + - one Invocation (Rust-handler shortcode resolutions, today); + - two anchors (Invocation + ValueSource for `meta`/`var` once + bd-129m3 lands; Invocation + Dispatch for Lua-handler shortcodes + once bd-36fr9 lands). + Cap=2 grows the `SmallVec<[Anchor; …]>` field by ~40 bytes (the size + of one inline `Anchor` slot — `AnchorRole`'s largest variant + `Other(String)` is 32 bytes, plus 8 for `Arc`). Because + the `SourceInfo` enum's stack size is dictated by its largest + variant, **every** `SourceInfo` value in the AST grows by that 40 + bytes — not just `Generated` instances. For a doc with thousands of + Block/Inline nodes (each carrying a `SourceInfo` by value, not + Arc-boxed), the cap=1 → cap=2 step costs ~40 bytes per node, i.e. + tens-to-hundreds of KB on a large document. The trade is paid in + exchange for eliminating the heap spill cap=1 would incur on every + multi-anchor shortcode in the steady state. Three-or-more-anchor + Generateds (Invocation + ValueSource + Dispatch on a Lua-handler + `meta` shortcode) still spill — same cost as `Vec` would have + been. If memory-per-node turns out to matter for the q2-preview + interactive editor, revisit by Arc-boxing the `Generated` variant + (so the SourceInfo enum's stack size drops back to a single pointer + for that variant) rather than by reverting to cap=1. Adds a + `smallvec` workspace dependency (verified absent today). +- **`serde_json::Value` in PartialEq derives**: `Value` implements + `PartialEq` but with potentially weird semantics for floats. For our + use, kinds carry string + small structured data; should be fine. + Test the cases. 
(Verified: no production call site relies on + `SourceInfo == SourceInfo` today — the `PartialEq` derive is required + by the wider `Block`/`Inline` derives but isn't itself load-bearing. + Plan 7's coarsen may compare structurally once it lands; the + `Value::PartialEq` semantics on small kebab-case objects are + well-behaved.) - **Removing `FilterProvenance` is a breaking change for downstream - consumers**. Within the q2 workspace this is bounded; if any external code - imports the variant by name, they'd break. Search for non-workspace usages - before removing (probably none). + consumers**. Within the q2 workspace this is bounded; if any external + code imports the variant by name, they'd break. Search for + non-workspace usages before removing (probably none). +- **`Default` on containers of `SourceInfo`**: verified no struct in + `quarto-pandoc-types/src/{block,inline}.rs` derives `Default` (each + `SourceInfo`-bearing struct is constructed explicitly), so changing + `SourceInfo`'s arm set can't cascade into a broken + `#[derive(Default)]`. The hand-written `Default for SourceInfo` impl + (the `Original { FileId(0), 0, 0 }` zero-value) stays unchanged. +- **`combine()` with a `Generated` operand**: structurally valid (it + produces a `Concat` with a zero-length `Generated` piece, since + `Generated::length()` returns `0`), but semantically dead — the + Generated side carries no preimage bytes for adjacent-text coalescing, + and `map_offset` will skip over the zero-length piece. Verified: all + 17 `.combine(` call sites in the workspace (`attr.rs`, + `postprocess.rs`, `location.rs`, `yaml/parser.rs`, etc.) combine + Original/Substring shapes; nothing combines FilterProvenance today, so + Generated won't be combined either unless a future transform reaches + for it. The Phase 6 `combine() × Generated` test documents the + intended fall-through behavior for any future caller, not a current + regression. No type-level prevention in v1. 
## Estimated scope | Component | Lines (rough) | |---|---| -| `Synthetic` variant + accessors | ~50 | -| `Derived` variant + recursive accessors | ~50 | -| `By` struct + builders | ~100 | -| Pattern-match migrations (~22 files, both new variants) | ~250 | +| `Generated` variant + `Anchor` + `AnchorRole` types | ~80 | +| Accessors (invocation_anchor, value_source_anchor, etc.) | ~60 | +| `By` struct + builders + `is_atomic_kind` | ~120 | +| `resolve_byte_range` / `map_offset` / `remap_file_ids` updates | ~40 | +| `root_file_id` + `collect_file_ids` accessors | ~50 | +| File-id walker consolidation (6 sites → 2 methods, net delete) | **-30** | +| Pattern-match migrations (~9 files, ~21 occurrences post-consolidation) | ~140 | | FilterProvenance construction site migrations | ~30 | -| Lua serde extension + back-compat (both variants) | ~80 | -| Test updates and new tests | ~200 | -| **Total** | **~760** | +| Lua serde extension + back-compat | ~80 | +| Test updates and new tests | ~280 | +| **Total** | **~850** | + +One to two focused sessions. The unified-variant design reduces the +total cost vs. the previous Synthetic-plus-Derived dual-variant draft +(every accessor and migration site collapses one arm). -One focused session, possibly stretching into a second given the slightly -larger scope from carrying Derived alongside Synthetic. +## Implementation surprises (recorded 2026-05-22 after Plan 4 landed) + +A few things diverged from the plan-as-written. Annotating them here so +Plan 5+ readers can adjust expectations. + +- **`gen` is a reserved keyword in current Rust.** Test locals and + method-receiver bindings must avoid the identifier `gen` (raw form + `r#gen` works but is ugly). The plan's pseudocode used + `gen.invocation_anchor()` / `gen.preimage_in()` etc. as shorthand; + in real code use `generated`, `g`, or destructure the variant. + Plan 7's `preimage_in` sketches should be amended before + implementation — the same trap applies. 
+ +- **Phase 1's "compiles cleanly" holds only for `quarto-source-map`, + not the workspace.** Adding the `Generated` variant immediately + triggered non-exhaustive-match errors across ~10 crates. Phase 3's + six-walker consolidation rescues part of it, but the workspace + doesn't build green again until **Phase 5** lands. The phase boundary + semantics are "the source-map crate plus directly-touched + consumers"; expect downstream crates to be red between Phase 1 and + Phase 5. Future plans that add new `SourceInfo` variants should plan + for a "transitional arms inline" interlude or accept that the + workspace is red mid-implementation. + +- **`extract_filename_index` was tests-only.** The plan suggested + "thin shim or inline at the few callers" — turned out the only + callers were the function's four dedicated tests plus one + commented-out reference in `pampa/src/writers/json.rs`. Deleted the + function and the four tests entirely; the equivalent coverage now + lives in `quarto-source-map`'s `test_root_file_id_per_variant`. + Cleaner than the plan anticipated. Future grep-and-replace plans + should re-verify caller counts at start-of-implementation, not just + at planning time. + +- **`anchors_with_role` returns `Box`, not `impl + Iterator`.** The plan's signature was + `-> impl Iterator>`, but the two match arms + return different concrete iterator types (a `filter_map` over the + anchor list for the `Generated` arm, `std::iter::empty()` for + everything else). The fix is `Box + 'a>`. Static + dispatch would require either a hand-rolled iterator enum or + `Either` from `itertools` — not worth it for a method called + in non-hot paths. + +- **`cargo xtask verify` modifies a second lockfile.** The WASM build + leg (Step 9, `npm run build:wasm`) re-resolves + `crates/wasm-quarto-hub-client/Cargo.lock`, which is distinct from + the workspace `Cargo.lock`. Both ended up in the Plan-4 commit. 
Not + a problem, but plans that touch any crate transitively used by + `wasm-quarto-hub-client` should expect that second lockfile to be + dirty after verification. + +- **The bd-3odjm carve-out behaved exactly as predicted.** Single + failure, in + `quarto-core::idempotence::lua_shortcode_lipsum_fixed`, panicking + with `MalformedSourceInfoPool` from the wire-code-3 collision + between writer (Generated → code 3 with `[filter_path, line]` + payload) and reader (code 3 = legacy `Transformed`). This is a + *non-surprise* worth recording: the plan's "Inherited pre-existing + failure" section was correct down to the test name, the panic + message, and the root cause. ## Notes -The conceptual surface is "two new variants, one of which (`Synthetic`) -generalizes `FilterProvenance`." The pattern-match migration touches many -files but most arms are mechanical — Synthetic behaves like FilterProvenance -for offset accessors (returns 0, 0); Derived recurses into `from`. +The conceptual surface is "one new variant, `Generated`, with a typed +anchor list." The pattern-match migration touches many files but most +arms are mechanical. -Per the open-struct decision, `By` is `{ kind, data }` rather than a closed -enum. Builder methods give ergonomic, self-documenting construction at known -call sites; `By::raw` lets extensions add kinds without modifying the type. -The same `By` value appears as the payload of both Synthetic and Derived — -many kinds can be either depending on context, though in practice they -correspond cleanly: +Per the open-struct decision, `By` is `{ kind, data }` rather than a +closed enum. Builder methods give ergonomic, self-documenting +construction at known call sites; `By::raw` lets extensions add kinds +without modifying the type. The `Anchor` list is typed throughout — +each entry's `source_info` is an `Arc`, not dynamic JSON. 
-| Kind | Variant | When used | -|---|---|---| -| `filter` | Synthetic | Lua filter constructions (`pandoc.Str(...)`) | -| `sectionize` | Synthetic | SectionizeTransform's section Divs | -| `title-block` | Synthetic | TitleBlockTransform's synthesized h1 | -| `footnotes` | Synthetic | FootnotesTransform's container Div | -| `appendix` | Synthetic | AppendixStructureTransform's wrapper Div | -| `tree-sitter-postprocess` | Synthetic | parser-side synthetic Spaces | -| `user-edit` | Synthetic | React-constructed nodes | -| `shortcode` | Derived | shortcode resolutions (Plan 6) | -| `include` | (wrapped, not Derived) | wrapper CustomNode in Plan 8 | -| `crossref-resolve` | (wrapped, not Derived) | already a CustomNode today | - -Reintroducing Derived was a reversal of an earlier "drop it" decision. -The reversal happened when we recognized that Original chains alone can't -distinguish "shortcode resolution" (atomic) from "filter mutation" -(non-atomic). Derived gives Plan 7 the type-level distinction it needs to -trigger AtomicViolation correctly. +The earlier `Synthetic`/`Derived` split was a useful intermediate during +design discussion (it crystallized the atomic-vs-not distinction), but +the unified `Generated` shape captures the same information with fewer +moving parts. The "has preimage" property becomes +`gen.invocation_anchor().is_some()` rather than a separate enum arm; +atomicity stays per-`by.kind`, orthogonal to anchor-presence. 
+ +| Kind | Variant | Anchors | When used | +|---|---|---|---| +| `filter` | Generated | `[]` (Dispatch later) | Lua filter constructions (`pandoc.Str(...)`) | +| `sectionize` | Generated | `[]` | SectionizeTransform's section Divs | +| `title-block` | Generated | `[]` | TitleBlockTransform's synthesized h1 | +| `footnotes` | Generated | `[]` | FootnotesTransform's container Div | +| `appendix` | Generated | `[]` | AppendixStructureTransform's wrapper Div | +| `tree-sitter-postprocess` | Generated | `[]` | parser-side synthetic Spaces | +| `user-edit` | Generated | `[]` | React-constructed nodes | +| `shortcode` | Generated | `[Invocation]` (`+ValueSource` later, `+Dispatch` later for Lua) | shortcode resolutions (Plan 6) | +| `include` | (wrapped CustomNode, source_info Original) | — | wrapper CustomNode in Plan 8 | +| `crossref-resolve` | (wrapped CustomNode, source_info Original) | — | already a CustomNode today | diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-5-wire-format.md b/claude-notes/plans/2026-05-04-q2-preview-plan-5-wire-format.md index 92a4dca4e..36fce02b6 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-5-wire-format.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-5-wire-format.md @@ -1,122 +1,261 @@ -# Plan 5 — JSON wire format extension + code-3 fix +# Plan 5 — JSON wire format extension for Generated -**Date:** 2026-05-04 +**Date:** 2026-05-04 (revised 2026-05-20) **Branch:** feature/q2-preview **Status:** Implementation plan (open questions named) -**Milestone:** none directly — fixes a latent bug, prepares wire for Plans 6/7/8 +**Milestone:** none directly — fixes a latent bug, prepares wire for + the rest of the provenance epic + +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 5 carries the wire +format adjustments needed so the typed provenance Plan 4 introduces can +cross the WASM/JSON boundary and round-trip without information loss. 
+The file name keeps its q2-preview-plan-N form for continuity with +earlier discussion notes. ## Goal -Extend the source-info pool's JSON wire format to encode two new variants -introduced by Plan 4: `Synthetic { by: By }` and `Derived { from: SourceInfo, -by: By }`. In the same change, fix a latent bug: today's writer emits -`FilterProvenance` as type code `3` with payload `[filter_path, line]`, but -today's reader interprets code `3` as the long-removed `Transformed` variant -and tries to parse it as `[parent_id, ...]` — resulting in a hard -`MalformedSourceInfoPool` error on any AST that crosses the JSON boundary -with a FilterProvenance value in it. +Extend the source-info pool's JSON wire format to encode the +`Generated { by, from }` variant introduced by Plan 4. In the same +change, fix a latent bug: today's writer emits `FilterProvenance` as +type code `3` with payload `[filter_path, line]`, but today's reader +interprets code `3` as the long-removed `Transformed` variant and tries +to parse it as `[parent_id, ...]` — resulting in a hard +`MalformedSourceInfoPool` error on any AST that crosses the JSON +boundary with a FilterProvenance value in it. + +The latent bug doesn't surface in current main because `parse_qmd_to_ast` +doesn't run filters that produce FilterProvenance. **But the q2-preview +pipeline (already shipped via Plans 1–2) does run filters and +shortcodes**, and the latent bug becomes reachable as soon as a +built-in or user filter constructs a node whose JSON-serialized +source_info crosses the WASM boundary. Plan 5 is therefore higher +priority than the original "prepares wire for downstream plans" +framing suggested — it fixes a bug that's no longer latent in design, +only in reach. + +## Inherited failure that must close on Plan 5's first reader change (bd-3odjm) + +Plan 3's idempotence gate already ships a live reproduction of this +bug as a failing test on the integration branch. 
Plan 5 *inherits* +it as the canonical first-iteration target. + +- Test: `cargo nextest run -p quarto-core --test idempotence lua_shortcode_lipsum_fixed` + (orchestrator mode only; `SingleFile` passes — the pipeline itself + is idempotent). +- Beads issue: **bd-3odjm**. +- Symptom: `MalformedSourceInfoPool` from + `pampa::readers::json::read` re-parsing the orchestrator's AST JSON + for a lipsum-shortcode-bearing document. +- Pre-Plan-5 cause: code-3 collision (writer emits FilterProvenance + `[filter_path, line]`; reader decodes as legacy Transformed + `[parent_id, ...]`). + +**The contract:** the very first time Plan 5 runs the idempotence +suite after a reader change lands, `lua_shortcode_lipsum_fixed` must +go green. The full chain is: -The latent bug doesn't surface today because `parse_qmd_to_ast` doesn't run -the transforms that produce `FilterProvenance`. The instant Plan 1 enables -the q2-preview pipeline (which runs filters and shortcodes), we'd hit it. + 1. Plan 5 lands the legacy code-3 reader change (per §"Code 3 — + Legacy reader only" below) — recognize FilterProvenance's + string-array payload, produce + `Generated { by: filter, from: vec![] }`, fall through to + legacy Transformed for the numeric-array payload. + 2. `cargo nextest run -p quarto-core --test idempotence + lua_shortcode_lipsum_fixed` passes. + 3. The full Plan-3 idempotence suite is green (27/27). + +**If step 2 fails after the reader change**, the Plan-5 author has a +real signal: either the reader's discrimination between the two +code-3 shapes is wrong, or the lipsum path produces a code-3 shape +that neither arm handles. In that case, do not move on to other +Plan-5 work — the failing test on the integration branch is the +canonical reproduction and must be the focus until green. 
+ +This is also a positive: bd-3odjm is the most realistic Plan-5 +regression test available — a real fixture, a real pipeline, a real +round-trip — so it doubles as the smoke check before any of the +hand-constructed tests in §"Test plan" run. ## Scope ### In scope -- Add wire format code `4` for `Synthetic { by: By }`. Payload encoding: - `d` carries `{"kind": "...", "data": ...}` (or `{"kind": "..."}` if - `by.data` is null). -- Add wire format code `5` for `Derived { from, by }`. Payload encoding: - `d` carries `{"from": , "by": {"kind": "...", "data": ...}}`. - The `from` is interned in the source-info pool just like `Substring.parent`. -- Fix the code-3 reader. Today's reader interprets code 3 as Transformed and - tries to read a parent_id out of `data[0]`. Make it accept *both* shapes: - - **Legacy Transformed** (`data` is `[parent_id, ...]` of numbers): map to - `Substring` (current behavior), preserving back-compat for old JSON. - - **Latent FilterProvenance** (`data` is `[filter_path, line]` — string - then number): decode as `Synthetic { by: By::filter(filter_path, line) }`. - This recovers the FilterProvenance shape that was being silently corrupted. -- After the fix, the writer no longer emits code 3 for new content (codes 4 - and 5 cover everything). Code 3 becomes a read-only legacy compat path. -- Round-trip tests: every `SourceInfo` variant survives Rust → JSON → Rust - unchanged. +- Add wire format code `4` for `Generated { by, from }`. Payload + encoding: + ```json + { + "by": { "kind": "...", "data": <json-value> }, + "from": [ + { "role": "<role>", "si_id": <pool-index> }, + ... + ] + } + ``` + Outer `from` mirrors the Rust field name (`Generated.from`). Inner + `si_id` is the source-info pool reference — it points to another + entry in the pool, typically an `Original` covering the source bytes + the anchor describes.
The name is deliberately distinct from + `Substring`'s `parent_id`: a Substring genuinely *has* a parent in + the chain (the slice's ancestor), but an anchor's reference is a + sideways pointer, not a containment relationship. `si_id` reads as + "source-info pool index" with no tree-structure overclaim. Multiple + anchors share an `si_id` naturally (multi-inline shortcode: every + resolved inline's `Invocation` anchor references the same token's + pool entry). +- Anchor role encoding: `"invocation"`, `"value-source"`, or + `"other:<name>"` for `AnchorRole::Other(String)`. + Kebab-case throughout. +- Fix the code-3 reader. Today's reader interprets code 3 as + Transformed and tries to read a parent_id out of `data[0]`. Make it + accept *both* shapes: + - **Legacy Transformed** (`data` is `[parent_id, ...]` of numbers): + map to `Substring` (current behavior), preserving back-compat for + old JSON. + - **Latent FilterProvenance** (`data` is `[filter_path, line]` — + string then number): decode as `Generated { by: By::filter(filter_path, line), from: smallvec![] }`. + This recovers the FilterProvenance shape that was being silently + corrupted. +- After the fix, the writer no longer emits code 3 for new content (code + 4 covers everything). Code 3 becomes a read-only legacy compat path. +- **Code 5 is unassigned.** Earlier drafts proposed code 5 for a + separate `Derived` variant; that variant was unified into `Generated` + during the 2026-05-20 design discussion and never shipped. Code 5 + remains free for future reservation. +- Round-trip tests: every `SourceInfo` variant survives Rust → JSON → + Rust unchanged. ### Out of scope -- Lua serde changes (Plan 4 covers those — the Lua format is independent of - the JSON pool wire format). -- The wire format for `By.data` itself is just `serde_json::Value` (already - handled by serde derives on `By`). +- Lua serde changes (Plan 4 covers those — the Lua format is + independent of the JSON pool wire format).
+- The wire format for `By.data` itself is just `serde_json::Value` + (already handled by serde derives on `By`). +- The metadata-loader changes that would populate `ValueSource` anchors + (separate follow-up; the wire format is forward-compatible — anchor + arrays simply gain entries when the resolver starts attaching them). +- Lua-file-registration that would convert `Dispatch` anchor data from + `by.data` into typed `Original`-backed anchors (separate follow-up; + wire-format forward-compatible the same way). ## Design decisions (settled in conversation) -- **Two new wire codes (4 and 5)**: Synthetic and Derived. The `Derived` - variant came back in the conversation after we saw that pure-provenance - alone couldn't distinguish "shortcode resolution" (atomic; user edits - prohibited at the writer level) from "filter mutation" (non-atomic; user - edits flow to source). Derived gives the type-level distinction. -- **Code 3 stays as a legacy reader** — fixes the latent bug AND retires - `FilterProvenance` in one step. The reader recognizes both old shapes - (legacy Transformed array of numbers; FilterProvenance `[filter_path, line]`) - and dispatches accordingly. Post-Plan 5, writers never emit code 3. -- **Verbose keys (`kind`, `data`, `from`, `by`) over compact ones** at the - payload level for self-documentation. The wire format's outer fields - (`t`, `r`, `d` at the SourceInfoJson level) stay compact for consistency - with existing code. +- **One new wire code (4)**, not two. The original Plan 4 / 5 drafts + split `Synthetic` (code 4) and `Derived` (code 5). The unified + `Generated` variant collapses these. Code 5 remains unassigned. +- **Typed anchor list at the wire level.** Each entry in the `from` + array carries a `role` string and an `si_id` pool reference. This + keeps the source-info chain typed even at the wire boundary — + `si_id` refers to another pool entry, never an inlined object. 
+- **Code 3 stays as a legacy reader** — fixes the latent bug AND + retires `FilterProvenance` in one step. The reader recognizes both + old shapes (legacy Transformed array of numbers; FilterProvenance + `[filter_path, line]`) and dispatches accordingly. Post-Plan 5, + writers never emit code 3. +- **`from` is one name across three layers, with different inner types + at each layer.** Worth knowing before reading any one layer in + isolation: + - **User-facing (`quarto-source-map`):** `SourceInfo::Generated.from: + SmallVec<[Anchor; 2]>` where `Anchor { role, source_info: Arc<SourceInfo> }`. + Carries actual `Arc<SourceInfo>` references. + - **Writer-internal (`writers/json.rs`):** `SerializableSourceMapping::Generated.from: + Vec<(AnchorRole, usize)>` where the `usize` is the pool ID returned + by `intern` for the anchor's source_info. Same semantic concept, + flattened to pool IDs. + - **On the wire (JSON):** `"from": [{ "role": "...", "si_id": <pool-index> }, ...]`, + omitted when empty. Same data, JSON-shaped. + The name `from` is preserved at every layer so the implementer can + read top-down without renames; the inner type changes are + deliberate (Arc → ID → JSON) and follow the pattern already + established by `Substring.parent` → `parent_id`. +- **Verbose keys (`kind`, `data`, `by`, `from`, `role`, `si_id`)** + at the payload level for self-documentation. The wire format's outer + fields (`t`, `r`, `d` at the SourceInfoJson level) stay compact for + consistency with existing code. The asymmetry is intentional: outer + fields appear once per pool entry across the whole pool (N×K bytes + for K outer fields, repeated for each of N entries — the compact + names amortize across thousands of entries), while the inner payload + keys appear only inside Generated entries (a minority of pool entries + — most are Substring/Original from parsing). Document-level overhead + from the verbose payload keys is empirically small; clarity at the + new boundary outweighs it.
Pool JSON is also gzipped on the wire in + the orchestrator and hub-client transports, which collapses the + repeated short keys further. ## Concrete wire format -### Code 4 — Synthetic +### Code 4 — Generated -The source-info pool entry for a `Synthetic` value: +The source-info pool entry for a `Generated` value with **no anchors** +(pure synthesis — sectionize, filter, title-block, footnotes, appendix, +tree-sitter-postprocess, user-edit): ```json -{ - "t": 4, - "r": [0, 0], - "d": { "kind": "filter", "data": { "filter_path": "/path/to/f.lua", "line": 42 } } -} +{ "t": 4, "r": [0, 0], "d": { "by": { "kind": "sectionize" } } } ``` -For kinds without per-instance data: - ```json -{ "t": 4, "r": [0, 0], "d": { "kind": "sectionize" } } +{ "t": 4, "r": [0, 0], "d": { "by": { "kind": "filter", "data": { "filter_path": "/path/to/f.lua", "line": 42 } } } } ``` -(`"data"` field omitted when the inner `By.data` is null, per the serde -`skip_serializing_if` on the `By` struct from Plan 4.) +(The `"data"` field is omitted when `By.data` is `null`, per the serde +`skip_serializing_if` on `By`. The `"from"` field is omitted when the +list is empty.) 
-### Code 5 — Derived +The source-info pool entry for a `Generated` value with **one +Invocation anchor** (shortcode resolution): -The source-info pool entry for a `Derived` value: +```json +{ + "t": 4, + "r": [0, 0], + "d": { + "by": { "kind": "shortcode", "data": { "name": "meta" } }, + "from": [ + { "role": "invocation", "si_id": 7 } + ] + } +} +``` + +The source-info pool entry for a `Generated` value with **multiple +anchors** (future: a shortcode resolution that also records its value +source after the metadata-loader follow-up lands): ```json { - "t": 5, + "t": 4, "r": [0, 0], "d": { - "from": 7, - "by": { "kind": "shortcode", "data": { "name": "meta" } } + "by": { "kind": "shortcode", "data": { "name": "meta" } }, + "from": [ + { "role": "invocation", "si_id": 7 }, + { "role": "value-source", "si_id": 12 } + ] } } ``` -The `from` field is a pool ID referencing another entry in the source-info -pool — typically an `Original` entry covering the shortcode token's bytes. -The `by` carries the same shape as Synthetic's `d` (`{kind, data}` with -`data` optional). +The pool entry's `r: [0, 0]` because Generated doesn't carry its own +offsets — ranges are obtained via the `resolve_byte_range` / +`preimage_in` chain-walk through the `Invocation` anchor. -The pool entry's `r: [0, 0]` because Derived doesn't carry its own offsets -— ranges are obtained via the `preimage_in` walk through the `from` chain. +### Code 3 — Legacy reader only -## The dual-shape code-3 reader +Post-Plan-5 writers never emit code 3. The arm exists only to read +pre-Plan-5 JSON. Two shapes are possible and the dispatch order is +**numeric-first, then string-headed** — JSON `Number` and `String` are +disjoint types, so the order is unambiguous; numeric goes first because +legacy `Transformed` is the historically larger producer. ```rust 3 => { - // Legacy code-3: either old `Transformed` (data is [parent_id, ...]) - // or the buggy FilterProvenance writer (data is [filter_path, line]). 
+ // Legacy code-3 reader. Writers no longer emit code 3. + // - Legacy Transformed: data = [parent_id, ...] (number-headed) + // - Latent FilterProvenance: data = [filter_path, line] (string-headed) + // Both shapes are read strictly — `MalformedSourceInfoPool` on any + // length/type mismatch (same convention as the Substring / Concat + // arms above). let array = data.as_array().ok_or(MalformedSourceInfoPool)?; if array.is_empty() { return Err(MalformedSourceInfoPool); } @@ -127,10 +266,12 @@ The pool entry's `r: [0, 0]` because Derived doesn't carry its own offsets // ...current logic... SourceInfo::Substring { parent: ..., start_offset, end_offset } } else if let Some(filter_path) = array[0].as_str() { - // Latent FilterProvenance shape. Decode to Synthetic. - let line = array.get(1).and_then(|v| v.as_u64()).unwrap_or(0) as usize; - SourceInfo::Synthetic { + // Latent FilterProvenance shape: must be exactly [path, line]. + if array.len() != 2 { return Err(MalformedSourceInfoPool); } + let line = array[1].as_u64().ok_or(MalformedSourceInfoPool)? as usize; + SourceInfo::Generated { by: By::filter(filter_path.to_string(), line), + from: smallvec![], } } else { return Err(MalformedSourceInfoPool); @@ -138,173 +279,879 @@ The pool entry's `r: [0, 0]` because Derived doesn't carry its own offsets } ``` -Future writers don't emit code 3. Eventually code 3 can be retired entirely -(once we're confident no on-disk JSON files contain it), but for now it's a -no-cost read-only compat shim. +Future writers don't emit code 3. Eventually code 3 can be retired +entirely (once we're confident no on-disk JSON files contain it), but +for now it's a no-cost read-only compat shim. -## The new code-4 reader +### Code 4 — Reader / writer ```rust 4 => { - // Synthetic { by: By } - let by_obj = data.as_object().ok_or(MalformedSourceInfoPool)?; - let kind = by_obj.get("kind") - .and_then(|v| v.as_str()) - .ok_or(MalformedSourceInfoPool)? 
- .to_string(); - let data = by_obj.get("data").cloned().unwrap_or(Value::Null); - SourceInfo::Synthetic { by: By { kind, data } } + // Generated { by, from }. The outer `r` field is parsed by the + // caller and *ignored here* — Generated entries don't carry their + // own offsets; ranges come from chain-walking the Invocation anchor + // via `resolve_byte_range` / `preimage_in`. The writer hard-codes + // `r: [0, 0]` for code-4 entries; downstream code that reads `r` + // directly will see zeros — that's the signal to walk the anchor + // chain instead. A code-4 entry with `r != [0, 0]` from an + // older/future writer is silently accepted (precedent: today's + // Concat arm also parses `r` but doesn't use it). + // + // Strict on every other shape: missing `by`, `by.kind`, `from` entry + // missing `role`/`si_id`, `from` present but not an array, or an + // `Other("")` role string → `MalformedSourceInfoPool`. Same + // convention as the Substring/Concat arms above. + let obj = data.as_object().ok_or(MalformedSourceInfoPool)?; + let by_obj = obj.get("by").and_then(|v| v.as_object()) + .ok_or(MalformedSourceInfoPool)?; + let kind = by_obj.get("kind").and_then(|v| v.as_str()) + .ok_or(MalformedSourceInfoPool)?.to_string(); + let by_data = by_obj.get("data").cloned().unwrap_or(Value::Null); + let by = By { kind, data: by_data }; + + let mut from = SmallVec::<[Anchor; 2]>::new(); + match obj.get("from") { + None => {} // absent ≡ empty (writer skips empty `from`) + Some(v) => { + let from_arr = v.as_array().ok_or(MalformedSourceInfoPool)?; + for entry in from_arr { + let entry_obj = entry.as_object() + .ok_or(MalformedSourceInfoPool)?; + let role_str = entry_obj.get("role").and_then(|v| v.as_str()) + .ok_or(MalformedSourceInfoPool)?; + let role = parse_anchor_role(role_str)?; + let si_id = entry_obj.get("si_id").and_then(|v| v.as_u64()) + .ok_or(MalformedSourceInfoPool)? 
as usize; + if si_id >= current_index { + return Err(CircularSourceInfoReference(si_id)); + } + let si = pool.get(si_id).cloned() + .ok_or(InvalidSourceInfoRef(si_id))?; + from.push(Anchor { role, source_info: Arc::new(si) }); + } + } + } + + SourceInfo::Generated { by, from } +} + +fn parse_anchor_role(s: &str) -> Result { + match s { + "invocation" => Ok(AnchorRole::Invocation), + "value-source" => Ok(AnchorRole::ValueSource), + _ => { + let name = s.strip_prefix("other:") + .ok_or(MalformedSourceInfoPool)?; + if name.is_empty() { return Err(MalformedSourceInfoPool); } + Ok(AnchorRole::Other(name.to_string())) + } + } } ``` -The new code-4 writer: +Writer: ```rust -SerializableSourceMapping::Synthetic { by } => { +SerializableSourceMapping::Generated { by, from } => { let mut by_json = json!({ "kind": by.kind }); - if !by.data.is_null() { - by_json["data"] = by.data.clone(); + if !by.data.is_null() { by_json["data"] = by.data.clone(); } + + let mut d = json!({ "by": by_json }); + if !from.is_empty() { + let arr: Vec = from.iter() + .map(|(role, si_id)| json!({ + "role": serialize_anchor_role(role), + "si_id": si_id, + })) + .collect(); + d["from"] = Value::Array(arr); + } + + (4, d) +} + +fn serialize_anchor_role(role: &AnchorRole) -> String { + match role { + AnchorRole::Invocation => "invocation".to_string(), + AnchorRole::ValueSource => "value-source".to_string(), + AnchorRole::Other(s) => format!("other:{}", s), } - (4, by_json) } ``` -(start_offset and end_offset for Synthetic are both 0 — there's no source -range. The writer continues to emit `r: [0, 0]`.) +The serializer interns each anchor's `source_info` into the pool when +first encountered and reuses the ID on later references — the same +`arc_parent_ids` HashMap pattern already used for `Substring.parent`. +Multi-inline shortcode resolution thus produces N `Generated` entries, +each with one `Invocation` anchor, all referencing the same pool ID for +the shortcode token's `Original` entry. 
-## The new code-5 reader/writer +### TypeScript wire-format definitions -```rust -5 => { - // Derived { from: Arc, by: By } - let obj = data.as_object().ok_or(MalformedSourceInfoPool)?; - let from_id = obj.get("from") - .and_then(|v| v.as_u64()) - .ok_or(MalformedSourceInfoPool)? as usize; - if from_id >= current_index { - return Err(CircularSourceInfoReference(from_id)); - } - let from = pool.get(from_id).cloned().ok_or(InvalidSourceInfoRef(from_id))?; - let by_obj = obj.get("by").and_then(|v| v.as_object()) - .ok_or(MalformedSourceInfoPool)?; - let kind = by_obj.get("kind").and_then(|v| v.as_str()) - .ok_or(MalformedSourceInfoPool)?.to_string(); - let by_data = by_obj.get("data").cloned().unwrap_or(Value::Null); - SourceInfo::Derived { from: Arc::new(from), by: By { kind, data: by_data } } -} +`ts-packages/preview-renderer/src/types/sourceInfo.ts` is a hand-mirror +of the Rust wire format. Earlier provenance-epic churn (during the +2026-05-20 design discussion) left it carrying a stale forward-declared +split: code 4 = `Synthetic { d: By }`, code 5 = `Derived { d: { from, by } }`. +That split never shipped. 
Plan 5 reconciles the file with the unified +Generated design: + +**Before Plan 5 (current file):** + +```ts +export type SourceInfoEntry = + | { t: 0; r: [number, number]; d: number } + | { t: 1; r: [number, number]; d: number } + | { t: 2; r: [number, number]; d: Array<[number, number, number]> } + | { t: 3; r: [number, number]; d: [string, number] } + | { t: 4; r: [0, 0]; d: By } // Synthetic — never shipped + | { t: 5; r: [0, 0]; d: { from: number; by: By } }; // Derived — never shipped ``` -Writer: +**After Plan 5:** -```rust -SerializableSourceMapping::Derived { from_id, by } => { - let mut by_json = json!({ "kind": by.kind }); - if !by.data.is_null() { by_json["data"] = by.data.clone(); } - (5, json!({ "from": from_id, "by": by_json })) +```ts +export interface AnchorRef { + role: string; // "invocation" | "value-source" | "other:<name>" + si_id: number; // index into the source-info pool } + +export type SourceInfoEntry = + | { t: 0; r: [number, number]; d: number } // Original + | { t: 1; r: [number, number]; d: number } // Substring + | { t: 2; r: [number, number]; d: Array<[number, number, number]> } // Concat + | { t: 3; r: [number, number]; d: [string, number] | [number, ...number[]] } // legacy reader only (no new writes) + | { t: 4; r: [0, 0]; d: { by: By; from?: AnchorRef[] } }; // Generated +// code 5 — unassigned, free for future reservation ``` -`from_id` is an interned pool ID, the same way `Substring.parent_id` works. -The serializer interns the `from` SourceInfo when first encountered and -reuses the ID on later references — natural deduplication for shortcode -resolutions where many resolved nodes share the same `from`. +Changes vs. current file: + +- Code 4's `d` shape narrows from bare `By` to `{ by: By; from?: AnchorRef[] }`. +- Code 5's entry is removed entirely. It was never emitted by any + shipping writer; no on-disk artifact carries it. Removing the variant + is safe.
+- Code 3's `d` shape widens to a union to reflect the dual-shape legacy + reader (string-headed = FilterProvenance, numeric-headed = old + Transformed). New writers don't emit code 3 either way, so this is a + read-side typing only. +- `from?` is absent (not `[]`) when empty — writer skips the field via + `if !from.is_empty()`. TS consumers use `entry.d.from ?? []` as the + canonical access pattern; absent and `[]` are treated equivalently. +- The file's header doc-comment (lines 10–19 of the current file) + references `Synthetic` and `Derived` by name and says "Plan 5 wires + this up." Rewrite it to describe Generated instead and drop the + Synthetic/Derived nomenclature. + +**`utils/sourceInfo.ts` reconciliation** (full enumeration of the +"audit" called for in Phase 5): + +- `entryFor(node, pool)` — unchanged. +- `isDerived(node, pool)` — **delete entirely.** It checks `entry?.t === 5`, + which after Plan 5 is unreachable (code 5 unassigned). Any caller + still using it migrates to `isAtomicSourceInfo`. +- `isAtomicSourceInfo(node, pool, atomicKinds)` — rewrite. The current + body branches on `entry.t === 5` (always atomic) OR + `entry.t === 4 && atomicKinds.has(entry.d.kind)`. After Plan 5: only + `entry.t === 4 && atomicKinds.has(entry.d.by.kind)` — the `kind` + field moves from `entry.d.kind` to `entry.d.by.kind`, and the code-5 + branch is removed. +- `ATOMIC_SYNTHETIC_KINDS` constant (currently empty) — **rename to + `ATOMIC_KINDS`** to match the Rust canonical name `By::is_atomic_kind`, + and populate with the Plan-4 atomic set: + `new Set(["filter", "shortcode", "title-block", "tree-sitter-postprocess"])`. + The accompanying doc-comment ("mirrors `By::is_atomic_synthesizer()`") + is updated to "mirrors `By::is_atomic_kind()`." + +The TS type and the Rust serializer must agree byte-for-byte; the +header doc-comment cites the Rust file as the source of truth, same +convention as for the atomic-CustomNodes registry. + +## Work items + +Phase-ordered. 
Each phase compiles cleanly **and leaves the workspace +fully green** before the next begins. Phase 1 lands on its own as the +bd-3odjm fix even if the rest of Plan 5 stalls. + +**Ordering note.** The naive 1 → 2 → 3 → 4 order would break round-trip +between Phase 2 (writer emits code 4) and Phase 4 (reader decodes code +4) — every fixture containing a filter or shortcode would fail with +`MalformedSourceInfoPool` on code 4 in that window. The order below +puts the code-4 reader (renumbered Phase 2) before the writer change +so each phase leaves the workspace green. Phases 3 (writer) and 4 +(streaming writer) **must land atomically** as a single commit/squash +because Phase 3 removes `SerializableSourceMapping::FilterProvenance`, +which the streaming writer references — splitting them produces a +build break. + +### Phase 0 — Start gate + +- [x] Confirm Plan 4 (Generated + By + Anchor + AnchorRole) has merged + into `feature/provenance`. If not, stop — Plan 5 cannot build. + Verify with `git grep -n "enum SourceInfo" crates/quarto-source-map/src/source_info.rs` + and confirm a `Generated` arm exists. +- [x] Confirm the Plan-4 interim writer state is present in + `crates/pampa/src/writers/json.rs`: a `SourceInfo::Generated { by, .. }` + arm in `SourceInfoSerializer::intern` that recovers + `(filter_path, line)` via `by.as_filter().expect(...)` and emits + `SerializableSourceMapping::FilterProvenance`. This is the arm + Phase 3 rewrites. As of Plan 4's commit, the arm lives around + `writers/json.rs:314-331`; refresh before implementing. Verify + with `git grep -n "Plan 5's wire-code 4 emitter" crates/pampa/src/writers/json.rs` + — exactly one hit (Plan 4's `expect` message). +- [x] Confirm `SerializableSourceMapping::FilterProvenance` still + exists as a variant in `writers/json.rs` (it does post-Plan-4 — + Plan 4 deliberately kept the *serializable* enum variant even + though the source-map variant is gone, because the interim + writer arm above still emits it). 
Verify with + `git grep -n "SerializableSourceMapping::FilterProvenance" crates/pampa/` + — expect ~4 hits (writer's `to_json` arm, the interim `intern` + arm above, the streaming writer's two arms in + `stream_write_source_info_pool`). All four go away in Phase 3+4. +- [x] Confirm no on-disk JSON snapshots carry code-3 entries that the + new dual-shape reader would need to decode. Verified at planning + time: `grep -rn '"t":3\|"t": 3' crates/ tests/ hub-client/` + returns zero hits and `grep -rln 'FilterProvenance' crates/pampa/snapshots + crates/pampa/tests/snapshots crates/quarto-core/tests/snapshots` + is also empty. Re-run before starting Phase 1 to confirm nothing + has been added in the interim. **No fixture migration needed.** + +### Phase 1 — Legacy code-3 dual-shape reader (closes bd-3odjm) + +- [x] Add `parse_anchor_role` helper in `crates/pampa/src/readers/json.rs` + (used by Phase 2 too — landing it here is a no-op until then). +- [x] Rewrite the code-3 arm in `SourceInfoDeserializer::new` (currently + `crates/pampa/src/readers/json.rs:252-283`) per §"Code 3 — Legacy + reader only": dispatch on `data[0]` numeric → legacy Substring; + string → strict `[path, line]` decode to `Generated { by: + By::filter(path, line), from: smallvec![] }`; otherwise + `MalformedSourceInfoPool`. No silent `unwrap_or(0)` — line must + be a number or the entry is malformed. +- [x] Rewrite the code-3 reader's doc-comment to: + "Legacy reader for code 3 — accepts both old Transformed + numeric-array and buggy FilterProvenance string-array; writers + never emit code 3." +- [x] Run `cargo nextest run -p quarto-core --test idempotence lua_shortcode_lipsum_fixed` + → green (closes bd-3odjm). +- [x] Run the full Plan-3 idempotence suite → 27/27 green. +- [x] **Per-phase verification gate:** `cargo nextest run --workspace` + → all green. bd-3odjm closed; no regressions.
Phase 1 is + independently revertible (the reader change is purely additive + — restoring the prior arm removes only the new FilterProvenance + recovery branch). +- [x] **Rollback signal:** the Phase-1 reader change only touches the + code-3 arm; other code-paths and other pool entries are + unaffected. If a Plan-3 idempotence case *other than* + `lua_shortcode_lipsum_fixed` regresses (or a workspace test + outside the idempotence suite regresses), that is a real signal + — either the dual-shape discriminator misclassifies a payload + shape that *isn't* the buggy FilterProvenance, or the new + `Generated` recovery loses information a downstream test + depended on. Do not paper over it by relaxing the strict + rejection rules. Investigate the failing case's pool entries + with `jq '.astContext.sourceInfoPool'` on the offending fixture's + JSON, identify which code-3 entries are present, and decide + whether the discriminator needs an additional case or the failing + test had a buggy pre-existing expectation. Either way, file a + beads issue. + +### Phase 2 — Code-4 reader + +Lands before any writer change so the reader is forward-compatible +when Phase 3 starts emitting code 4. Phase 2 alone leaves the workspace +green: no production code emits code 4 yet, so the new arm is exercised +only by hand-constructed tests. + +- [x] Add a `4 => { … }` arm in `SourceInfoDeserializer::new` + (`readers/json.rs:154-287`) per §"Code 4 — Reader / writer": + decode `by` (kind + optional data), decode `from` array entries + via `parse_anchor_role` + `si_id`, with the `si_id < current_index` + circular-ref guard. +- [x] Reject malformed code-4 payloads with `MalformedSourceInfoPool`: + missing `by`; missing `by.kind`; `from` present but not an array; + `from` entry not an object; `from` entry missing `role`; `from` + entry missing `si_id`; unrecognized role string; `Other("")` with + empty suffix. 
See §"Code 4 — Reader / writer" for the full + snippet — same strictness as the Substring/Concat arms. +- [x] Silently accept code-4 entries with `r != [0, 0]` (one-line + comment in the arm; precedent: today's Concat arm). +- [x] Add the forward-compat unit tests in `readers/json.rs::tests` — + see Phase 6 for the full list of tests landing here. + +### Phase 3 — Writer code-4 emit (`SerializableSourceMapping` + intern + `to_json`) **+ Phase 4 streaming-writer parity, landed atomically** + +Phases 3 and 4 (below) must land in one commit / squash: Phase 3 +removes `SerializableSourceMapping::FilterProvenance`, which Phase 4's +streaming writer references — splitting them produces a build break. -`r: [0, 0]` for Derived too — offsets are recovered through the chain via -`preimage_in` (Plan 7), not stored on the Derived entry itself. +Starting state from Plan 4: `SourceInfo::FilterProvenance` is gone, but +`SerializableSourceMapping::FilterProvenance` survives because Plan 4's +interim writer arm (see Plan 4 §"Migrations", `pampa/src/writers/json.rs:314`) +converts `SourceInfo::Generated { by: filter, .. }` into the legacy +shape via `by.as_filter().expect(...)`. That arm panics for non-filter +Generated kinds, so the workspace only stays buildable as long as no +non-filter Generated is constructed — Plan 6 doesn't ship shortcode +stamping until later, so Plan 4's expect is safe in the interim. +Phase 3 removes both the interim arm and the `SerializableSourceMapping::FilterProvenance` +variant at once. + +- [x] Add `Generated { by: By, from: Vec<(AnchorRole, usize)> }` to + `SerializableSourceMapping` in `crates/pampa/src/writers/json.rs`. +- [x] Replace Plan 4's interim `SourceInfo::Generated { by, .. } => … + SerializableSourceMapping::FilterProvenance` arm with a real + `SerializableSourceMapping::Generated { … }` construction (no more + `by.as_filter().expect(...)`); supports all `by.kind` values + uniformly. 
+- [x] Remove `SerializableSourceMapping::FilterProvenance` (no longer + reachable after the interim arm above is rewritten). +- [x] Update `SourceInfoSerializer::intern` (`writers/json.rs:260-333`): + - Recognize `SourceInfo::Generated { by, from }`. + - **Recursively intern each anchor's `source_info` BEFORE pushing + the parent pool entry** (same pattern as today's `Concat` and + `Substring` arms), so anchor `si_id`s are strictly less than + the Generated's own id. The reader's `si_id < current_index` + guard requires this invariant. + - **Reuse the existing `arc_parent_ids` cache** (keyed by + `Arc::as_ptr(&anchor.source_info)`) for anchor dedup. Same cache, + same key shape as `Substring.parent`. Multi-inline shortcode + resolutions (every resolved inline shares one `Arc` for the + token's `Original`) hit the cache and produce a single pool + entry for the shared target — exactly the dedup behavior the + "Anchor dedup test" in Phase 6 verifies. + - Build the **`intern`-match-arm return tuple** as + `(0, 0, SerializableSourceMapping::Generated { by, from: from_ids })` + — `intern` returns `(start_offset, end_offset, mapping)`; the + `r: [0, 0]` rule is enforced by hard-coding the first two + components to zero, exactly as today's FilterProvenance arm at + lines 314-322 does. +- [x] Update `SerializableSourceInfo::to_json` (`writers/json.rs:169-190`) + with the code-4 arm per §"Code 4 — Reader / writer". +- [x] Add `serialize_anchor_role` helper. +- [x] Update the `SourceInfoJson.t` legend comment at + `writers/json.rs:115` from + `"0=Original, 1=Substring, 2=Concat, 3=FilterProvenance"` to + `"0=Original, 1=Substring, 2=Concat, 3=Legacy (read-only), 4=Generated"`. + +### Phase 4 — Streaming writer parity (atomic with Phase 3) + +- [x] Add the code-4 arm in `stream_write_source_info_pool` + (`writers/json.rs:3482-3532` as of `eb06c4cf`; refresh before + implementing); mirror the `to_json` shape exactly. 
+- [x] Remove the FilterProvenance arms (lines 3509-3514 emit, line 3526 + tag as of `eb06c4cf`). They become unreachable once + `SerializableSourceMapping::FilterProvenance` is gone from Phase 3. + +### Phase 5 — TypeScript types + +- [x] Rewrite `ts-packages/preview-renderer/src/types/sourceInfo.ts` + per §"TypeScript wire-format definitions": + - Add `AnchorRef` interface. + - Code 4's `d` becomes `{ by: By; from?: AnchorRef[] }`. + - Code 3's `d` becomes `[string, number] | [number, ...number[]]`. + - Remove the code-5 entry. + - Rewrite the header doc-comment to describe Generated, not + Synthetic/Derived. The current header cites + `crates/pampa/src/writers/json.rs:54-91`, which is stale (the + wire-format types now live at ~lines 109-207 of that file). The + new doc-comment should cite **two** sources of truth: the Rust + enum `SourceInfo` in + `crates/quarto-source-map/src/source_info.rs` (canonical + producer-side definition) and the JSON wire mirror in + `crates/pampa/src/writers/json.rs` (`SerializableSourceMapping` + ~lines 193-207, `SourceInfoJson` ~lines 109-116, code-4 + serializer in `to_json` ~lines 167-190). Do not bake in exact + line numbers — cite the type names; they will outlast line + drift. +- [x] Update `ts-packages/preview-renderer/src/utils/sourceInfo.ts` per + §"TypeScript wire-format definitions" → `utils/sourceInfo.ts` + reconciliation: + - Delete `isDerived` entirely. + - Rewrite `isAtomicSourceInfo` to read `entry.d.by.kind` (was + `entry.d.kind`) and drop the code-5 branch. + - **Rename** `ATOMIC_SYNTHETIC_KINDS` → `ATOMIC_KINDS` to match + the Rust canonical `By::is_atomic_kind`. + - Populate `ATOMIC_KINDS` with `new Set(["filter", "shortcode", + "title-block", "tree-sitter-postprocess"])` (mirrors Plan 4's + `By::is_atomic_kind`). + - Update the file's doc-comment from "mirrors + `By::is_atomic_synthesizer()`" to "mirrors `By::is_atomic_kind()`." 
+ - Migrate any remaining `isDerived` callers (`grep -rn isDerived ts-packages/`) + to the new `isAtomicSourceInfo` shape. +- [x] Update `ts-packages/preview-renderer/src/utils/sourceInfo.test.ts` + — the existing tests will not compile after the changes above. + Specifically: + - Drop the `import { isDerived, ATOMIC_SYNTHETIC_KINDS }` lines + and the entire `describe('isDerived', …)` block. `isDerived` is + gone; `ATOMIC_SYNTHETIC_KINDS` is renamed `ATOMIC_KINDS` and now + populated (the existing `is empty in 2A` assertion no longer + holds). + - Rewrite `samplePool`: + - Drop the code-5 entry entirely (code 5 is unassigned post-Plan-5). + - Reshape the code-4 entry from `d: { kind: 'IncludeShortcode' }` + (bare `By`) to `d: { by: { kind: 'shortcode', data: { name: 'meta' } } }` + (no `from` — absent is the canonical empty form). Add a second + code-4 entry with `from: [{ role: 'invocation', si_id: 0 }]` + so the `entry.d.from ?? []` access pattern is exercised. + - Reshape the code-3 entry: keep one with `d: ['filter.lua', 42]` + (string-headed legacy FilterProvenance) and add a sibling with + `d: [0]` (numeric-headed legacy Transformed) to exercise the + new dual-shape `d` type. + - Rewrite the `isAtomicSourceInfo` describe block: the + "Synthetic vs Derived" framing is dead. Drive new assertions + against `ATOMIC_KINDS` populated with the Plan-4 atomic set, + using a code-4 entry whose `by.kind` is `"shortcode"` (atomic) + and another whose `by.kind` is `"sectionize"` (non-atomic). + - Add an `ATOMIC_KINDS` describe block asserting the four + Plan-4 atomic kinds are members and at least one non-atomic kind + (`"sectionize"`) is not. Replaces the deleted + `ATOMIC_SYNTHETIC_KINDS` block. + - Run `cd hub-client && npm run build:all` after the rewrite — the + production build (`tsc -b && vite build`) is stricter than + `tsc --noEmit` / vitest and catches type-narrowing errors that + unit tests miss. 
+ +### Phase 6 — Tests + +**Test placement.** All tests are hand-written (no proptest in this +file; the repo doesn't use it heavily). Unit tests extend the existing +test modules; the end-to-end integration test extends the existing +integration crate: + +- Writer-side unit tests → `crates/pampa/src/writers/json.rs::tests` + (joins the existing `test_source_info_pool_*` cluster at + `writers/json.rs:3688+`). +- Reader-side unit tests → `crates/pampa/src/readers/json.rs::tests` + (joins the existing `test_deserialize_source_info_pool_*` cluster at + `readers/json.rs:2479+`). +- End-to-end integration test → `crates/pampa/tests/json_reader_smoke_tests.rs` + (existing integration crate that drives file fixtures through + `pampa::readers::json::read`). + +Per-phase landing: forward-compat tests for the code-4 reader and the +legacy code-3 recovery test land with Phase 1/2 (reader-only); writer +round-trips, dedup, and the end-to-end test land with Phases 3+4 once +the writer emits code 4. + +**Tests:** + +- [x] Round-trip property test for every `SourceInfo` variant (Original, + Substring, Concat, Generated with various By kinds and `from` + configurations). Hand-written cases (one per shape). See §Test + plan. +- [x] Concat-of-Generated round-trip case: a `Concat { pieces }` whose + pieces' `source_info` is `Generated`. Serialize → deserialize → + assert structural equality. Closes a coverage gap — current + production paths emit this shape (e.g. coalesced filter-emitted + spans). Sits in the writer-side test module since it exercises + the recursive intern of mixed-variant pieces. +- [x] Substring-of-Generated round-trip case: a + `Substring { parent: Arc::new(Generated { … }), … }` — e.g. a + filter-emitted span whose substring is later coalesced. The + writer's existing `intern` recursion routes + `Substring.parent: Arc` through the new code-4 path + with no extra logic, and the reader's existing Substring arm + reads the parent_id back as a code-4 pool entry. 
The test serves + as a regression guard for that path: confirm pool ordering + (parent Generated entry interns strictly before the Substring + child) and assert structural equality across serialize → + deserialize. Co-located with the Concat-of-Generated case in + the writer-side test module. +- [x] Filter-provenance recovery test (hand-constructed code-3 with + string-array payload → `Generated { by: filter, from: smallvec![] }`). +- [x] Legacy Transformed back-compat test (hand-constructed code-3 with + numeric-array payload → `Substring`). +- [x] Strict code-3 rejection tests: `[path]` (missing line) and + `[path, "not-a-number"]` (non-numeric line) both + → `MalformedSourceInfoPool`. Guards the no-`unwrap_or(0)` rule. +- [x] Forward-compat test (code-4 with unknown `by.kind`, arbitrary + `data` → preserved round-trip). +- [x] Strict code-4 rejection tests: missing `by`, missing `by.kind`, + `from` present but not an array, `from` entry not an object, + `from` entry missing `role`/`si_id`, role string `"other:"` + (empty suffix) → all `MalformedSourceInfoPool`. +- [x] **Anchor dedup test (writer-side only).** Hand-construct an AST + with N inlines, each carrying + `Generated { by: By::shortcode("meta"), from: smallvec![Anchor::invocation(Arc::clone(&shared))] }`. + Serialize. Assert: the pool contains the shared target exactly + once and every Generated entry's `from[0].si_id` references that + single ID. **Read-side note:** deserialization rebuilds each anchor + with a fresh `Arc`, so a subsequent re-serialization produces N + copies — this test verifies the *write-time* optimization keyed + on `Arc::as_ptr`. See [[anchor-dedup-invariant]] in §"Risk areas" + for the broader contract. Test passes Plan-5-alone (no shortcode + resolver needed — Arc sharing is hand-wired). +- [x] Streaming-writer parity test. 
Helper shape: + `roundtrip_via_stream(ast) -> ast` that calls `stream_write_pandoc` + into a `Vec<u8>`, reads back via `pampa::readers::json::read`, + and asserts SourceInfo equality at chosen Generated nodes. The + streaming writer's match arms are independent of `to_json`'s; + without this coverage, a Phase-4 regression in + `stream_write_source_info_pool` could slip through. +- [x] AnchorRole round-trip test: build a `Generated` with each role + (`Invocation`, `ValueSource`, `Other("ext/foo/bar")`) wrapped in + anchors; serialize through JSON via the writer's code-4 path; + deserialize via the reader's code-4 path; assert the role survives. +- [x] End-to-end production reachability test (kbd-shortcode fixture → + `render_qmd_to_preview_ast` → JSON → `pampa::readers::json::read` + → assert success and recovered shape). Lives in + `crates/pampa/tests/json_reader_smoke_tests.rs`. +- [x] TypeScript-side type round-trip (parse a JSON pool with Generated + entries; confirm `SourceInfoEntry` shape matches; confirm + `entry.d.from ?? []` access pattern works for both absent and + present `from`). + +### Phase 7 — Verification gate + +- [x] `cargo build --workspace` clean. +- [x] `cargo nextest run --workspace --no-fail-fast` all green + (bd-3odjm closed in Phase 1; no other regressions). Use + `--no-fail-fast` so a single regression doesn't hide downstream + green tests — same convention used to close Plan 4. +- [x] `cargo xtask verify` (full — `quarto-core`/`pampa` are WASM + consumers; hub-build leg matters). The WASM rebuild leg will + modify `crates/wasm-quarto-hub-client/Cargo.lock` as a side + effect (separate lockfile from the workspace one); include it + in the commit. Plan 4 hit this and committed it without issue. +- [x] `git grep "FilterProvenance"` returns only legacy-reader / legacy + doc references (no writer emissions, no `SerializableSourceMapping` + variant). 
+- [x] Update bd-3odjm: close at the Phase-1 commit (the reader change + that turns `lua_shortcode_lipsum_fixed` green). The close trigger + is the commit itself, not a downstream PR or merge — Plan 5 lands + on the `feature/provenance` integration branch via merge commits, + not a standalone PR, so tying the close to the commit gives the + issue a concrete reference. Refresh its description to use `from:` + not `anchors:` if reopened for any reason. **If Phase 3 or 4 + introduces a *new* failure mode in the lipsum fixture, file a + fresh beads issue** rather than reopening bd-3odjm — that issue is + specifically the code-3 collision and should stay scoped to it. + +## Implementation guidance carried over from Plan 4 + +A few small things came up during Plan 4 that are worth knowing before +starting Plan 5: + +- **`SmallVec::new()` is the construction pattern, not `smallvec![]`.** + Plan 4 uniformly used `SmallVec::<[Anchor; 2]>::new()` for empty + lists, never the `smallvec!` macro. The reader file + `crates/pampa/src/readers/json.rs` does not currently import + `smallvec::smallvec`. Code samples in this plan that show + `smallvec![]` are pseudocode — when implementing, write + `SmallVec::new()` (matches Plan 4's convention, avoids a needless + import). The `SmallVec` type itself needs + `use smallvec::SmallVec;` at the top of the file — Plan 4 added + this to every consumer it touched (`pampa/src/lua/diagnostics.rs`, + `pampa/src/lua/types.rs`); `readers/json.rs` and the writer's + Generated arm (Phase 3) will need it too. + +- **Don't name a local `gen`.** Rust 2024 makes `gen` a reserved + keyword. Plan 4's test code had to rename a `let gen = ...` to + `let generated = ...`. None of Plan 5's code samples currently use + `gen` as an identifier — keep it that way. (Plan 7's prose still + uses `gen.invocation_anchor()` as shorthand; that's pseudocode, not + literal Rust to type.) 
+ +- **Phase boundary "compiles cleanly" semantics.** Plan 4 found that + "each phase compiles cleanly" really means "the directly-touched + crate compiles cleanly" — adding a new `SourceInfo` variant + immediately broke `match` exhaustiveness across ~10 crates, and the + workspace stayed red between Plan-4 Phase 1 and Phase 5. Plan 5's + Phase 1 → 2 → 3+4 ordering above explicitly avoids this trap (each + phase leaves the workspace green); the *atomic* Phase 3+4 squash is + the only place where you have to land more than one commit's worth + of code in a single push. + +- **`cargo xtask verify --skip-rust-tests` is a useful intermediate.** + Plan 4 ran `cargo nextest run --workspace --no-fail-fast` first + (confirms only bd-3odjm is red), then `cargo xtask verify + --skip-rust-tests` (confirms the WASM/hub-client legs are green + without re-running the same Rust tests). Plan 5 should follow the + same split for the final verification gate. ## Open questions for implementation -- **Eventually retiring code 3**: at some point, no JSON files in the wild - contain code 3 (the buggy FilterProvenance shape never round-tripped before - Plan 5; the legacy Transformed shape predates a transition we made earlier). - Could remove the legacy reader. Don't need to decide now. -- **Detecting malformed code 4/5 payloads**: if shape doesn't match - expectation, error with `MalformedSourceInfoPool`. Confirm the exact - error variant for each malformation. -- **Streaming writer parity** (`stream_write_custom_block` and the streaming - source-info-pool writer): both writer paths need updating. Today both have - the same code-3 → FilterProvenance shape — the bug applies to both. - Update both to emit code 4 for Synthetic and code 5 for Derived. 
-- **Pool deduplication of Derived `from` references**: when many Derived - source_infos share the same `from` (e.g., a multi-inline shortcode - resolution where every resolved inline points at the same shortcode - token), the writer should intern `from` once and reuse the ID. The - existing `arc_parent_ids` HashMap pattern (used for `Substring.parent`) - applies here. +- **Eventually retiring code 3**: at some point, no JSON files in the + wild contain code 3 (the buggy FilterProvenance shape never + round-tripped before Plan 5; the legacy Transformed shape predates a + transition we made earlier). Could remove the legacy reader. Don't + need to decide now. +- **Detecting malformed code 4 payloads**: settled in Phase 2 of + §"Work items" — `MalformedSourceInfoPool` for missing `by`, missing + `by.kind`, `from` not an array, `from` entry not an object, `from` + entry missing `role`/`si_id`, unrecognized role string, and empty + `Other("")` suffix. +- **Streaming writer parity** (`stream_write_source_info_pool`): settled + in Phase 4 of §"Work items" — atomic with Phase 3 (writer code-4 emit). +- **Pool deduplication of anchor `si_id` references**: when many + Generated entries share the same anchor target (multi-inline + shortcode), the writer interns once and reuses the ID. The existing + `arc_parent_ids` HashMap pattern (already used for `Substring.parent`) + handles this — same interning mechanism, different reader-side name + (`si_id` for anchors, `parent_id` for substrings). This is a + **writer-side optimization only** — deserialization rebuilds each + anchor with a fresh `Arc`, so pool-size is not stable over + read-write-read. AST content and Plan-3 hashes (which exclude + `source_info`) are stable. See [[anchor-dedup-invariant]] in §"Risk + areas". +- **TypeScript hand-mirror updates**: see §"TypeScript wire-format + definitions" above. Settled — code 4's `d` becomes `{ by; from? 
}`, + code 5 is removed, code 3's `d` becomes a union for the dual-shape + legacy reader, `ATOMIC_SYNTHETIC_KINDS` renames to `ATOMIC_KINDS` + with the Plan-4 atomic set populated. The companion test file + `utils/sourceInfo.test.ts` is rewritten in lockstep — see Phase 5. +- **Writer JSON-build style**: hand-build via `json!` macro, matching + the existing convention throughout `writers/json.rs`. Not derive-based. + Settled. +- **`By::kind` canonical enumeration**: see Plan 4's `By::` builders + (`filter`, `sectionize`, `user_edit`, `shortcode`, `include`, + `title_block`, `footnotes`, `appendix`, `tree_sitter_postprocess`, + `raw`) for the full set. Plan 5 emits whatever `by.kind` string is + present, kebab-case throughout. Atomic-kind list mirrors + `By::is_atomic_kind` (`filter | shortcode | title-block | + tree-sitter-postprocess`). Cross-plan invariant — no Plan-5-owned + decision here. ## References -- `crates/pampa/src/writers/json.rs:80` — type code comment. -- `crates/pampa/src/writers/json.rs:132-155` — `SerializableSourceInfo::to_json`. -- `crates/pampa/src/writers/json.rs:145-148` — current FilterProvenance → - code 3 emit (the buggy line). -- `crates/pampa/src/writers/json.rs:225-298` — full SerializableSourceInfo - enum and conversion. -- `crates/pampa/src/readers/json.rs:155-290` — pool reader; the code-3 - branch is at line 252. -- `crates/quarto-source-map/src/source_info.rs:22-55` — SourceInfo enum - (extended by Plan 4). +(Line numbers as of `feature/provenance` @ 4c465768. Plan 4's migration +will shift these; refresh before implementing.) + +- `crates/pampa/src/writers/json.rs:115` — `SourceInfoJson.t` field + comment, currently `"0=Original, 1=Substring, 2=Concat, 3=FilterProvenance"`. + Plan 5 extends the legend to include `4=Generated` and notes code 3 + as legacy reader only. +- `crates/pampa/src/writers/json.rs:160-190` — `SerializableSourceInfo` + struct and `to_json` method. Code-3 emit at lines 180-182 (the bug). 
+- `crates/pampa/src/writers/json.rs:193-207` — `SerializableSourceMapping` + enum (Original/Substring/Concat/FilterProvenance arms). Phase 3 adds + a `Generated` arm and removes `FilterProvenance`. +- `crates/pampa/src/writers/json.rs:260-333` — `SourceInfoSerializer::intern`; + Phase 3 adds a `SourceInfo::Generated` arm with topologically-ordered + anchor recursion. +- `crates/pampa/src/writers/json.rs:3482-3532` — `stream_write_source_info_pool`; + Phase 4 mirrors the to_json changes here (lines 3509-3514 emit, line + 3526 tag). +- `crates/pampa/src/readers/json.rs:99-293` — `SourceInfoDeserializer::new` + (the pool reader). Code-3 arm at lines 252-283 (Phase 1 rewrites); + Phase 2 adds a code-4 arm. +- `crates/quarto-source-map/src/source_info.rs:21-55` — `SourceInfo` enum + (extended by Plan 4 — confirm Generated/By/Anchor/AnchorRole present + before Plan 5 starts; see Phase 0). +- `ts-packages/preview-renderer/src/types/sourceInfo.ts` — JS-side + `SourceInfoEntry`. See §"TypeScript wire-format definitions" for the + full before/after. +- `ts-packages/preview-renderer/src/utils/sourceInfo.ts` — JS-side + helpers (`isAtomicSourceInfo`, etc.); needs adjustment for the new + shape per Plan 4 / Plan 7. ## Test plan +(Hand-written tests; the repo doesn't use proptest in this area. See +Phase 6 for test-file placement and per-phase landing.) + - **Round-trip property test**: for each variant (Original, Substring, - Concat, Synthetic, Derived with various By kinds), build a `SourceInfo`, - serialize to JSON, deserialize, assert equality. Cover the full enum. -- **Filter-provenance recovery test**: hand-construct a JSON pool entry with - the buggy code-3-with-string-array-payload shape. Read it. Assert the - reader produces `Synthetic { by: By::filter(...) }` with the right path/line. -- **Legacy Transformed back-compat test**: hand-construct a JSON pool entry - with code-3-with-numeric-array-payload (the legacy Transformed shape). 
- Assert the reader still produces a `Substring` (preserving today's - back-compat behavior). -- **Forward-compat test**: hand-construct a JSON pool entry with code 4 and - an unknown kind (`"kind": "ext/future/foo"`, arbitrary data). Assert it - decodes as `Synthetic { by: By { kind: "ext/future/foo", data: ... } }`. - Round-trips unchanged. Same test for code 5. -- **Derived dedup test**: build an AST where multiple inlines have Derived - source_info sharing the same `from`. Serialize. Confirm the pool contains - the `from` Original entry exactly once and each Derived entry references - it by ID (rather than re-encoding the Original each time). -- **End-to-end with Plan 4**: build an AST containing Synthetic-tagged AND - Derived-tagged nodes, serialize to JSON via the existing JSON writer, - deserialize via the reader, assert structural equality. + Concat, Generated with various By kinds and anchor configurations), + build a `SourceInfo`, serialize to JSON, deserialize, assert + equality. Cover the full enum. +- **Concat-of-Generated round-trip**: a `Concat { pieces }` whose + pieces' `source_info` is `Generated` (the shape produced by coalesced + filter-emitted spans). Serialize → deserialize → assert structural + equality. Closes a coverage gap not exercised by the per-variant + property test above. +- **Substring-of-Generated round-trip**: a + `Substring { parent: Arc::new(Generated { … }), … }`. + `Substring.parent: Arc` is structurally unrestricted, so + this shape can arise whenever a transform produces a span and a + downstream coalesce or slice carves a substring out of it. The + serializer's `Substring` arm interns the parent recursively, which + routes through the new code-4 arm; the reader's `Substring` arm then + reads the parent_id back. Round-trip the construction and assert + structural equality. +- **Filter-provenance recovery test**: hand-construct a JSON pool entry + with the buggy code-3-with-string-array-payload shape. Read it. 
+ Assert the reader produces `Generated { by: filter, from: smallvec![] }` + with the right path/line via `by.as_filter()`. +- **Strict code-3 rejection**: hand-construct `[path]` (missing line) + and `[path, "not-a-number"]` (non-numeric line); assert both + → `MalformedSourceInfoPool`. Guards the no-`unwrap_or(0)` rule. +- **Legacy Transformed back-compat test**: hand-construct a JSON pool + entry with code-3-with-numeric-array-payload (the legacy Transformed + shape). Assert the reader still produces a `Substring` (preserving + today's back-compat behavior). +- **Forward-compat test**: hand-construct a JSON pool entry with code 4 + and an unknown kind (`"kind": "ext/future/foo"`, arbitrary data). + Assert it decodes as `Generated { by: By { kind: "ext/future/foo", + data: ... }, from: smallvec![] }`. Round-trips unchanged. +- **Strict code-4 rejection**: missing `by`, missing `by.kind`, `from` + present but not an array, `from` entry not an object, `from` entry + missing `role`/`si_id`, unrecognized role string, and role string + `"other:"` (empty `Other` suffix) → all `MalformedSourceInfoPool`. +- **Anchor dedup test (writer-side only)**: build an AST where N + inlines carry Generated source_info each with an `Invocation` anchor + wrapping `Arc::clone(&shared)`. Serialize. Confirm the pool contains + the shared target exactly once and each Generated entry's + `from[0].si_id` references it by ID. *Read-side note:* deserialization + rebuilds each anchor with a fresh `Arc`; this test only verifies the + write-time optimization (see [[anchor-dedup-invariant]] in §"Risk + areas"). Test passes Plan-5-alone (no shortcode resolver needed). +- **Streaming-writer parity test**: implement helper + `roundtrip_via_stream(ast) -> ast` that streams the AST via + `stream_write_pandoc` into a `Vec<u8>` and reads back through + `pampa::readers::json::read`. Run a representative Generated-bearing + AST through it; assert equality. 
The streaming writer's match arms + are independent of `to_json`'s, so a Phase-4 regression could + otherwise slip through. +- **AnchorRole round-trip test**: build a `Generated` with each role + (`Invocation`, `ValueSource`, `Other("ext/foo/bar")`) wrapped in + anchors; serialize through JSON via the writer's code-4 path; + deserialize via the reader's code-4 path; assert the role survives. +- **Live regression test already on the integration branch:** + `cargo nextest run -p quarto-core --test idempotence lua_shortcode_lipsum_fixed` + (filed as **bd-3odjm**; see §"Inherited failure that must close on + Plan 5's first reader change (bd-3odjm)" above). This is the + fastest first-iteration smoke check: it drives a real pipeline + a + real shortcode + a real JSON round-trip + the existing Plan-3 + hashing harness, and goes red until Plan 5 fixes the code-3 + collision. Run it before the hand-constructed tests below. +- **End-to-end production reachability test** (additional regression + guard for the bug Plan 5 fixes — current main would fail this test + as soon as the JSON round-trip is exercised on a Lua-shortcode-bearing + document): + 1. Build a fixture using `{{< kbd Ctrl+C >}}` (the kbd extension's + `kbd.lua` calls `pandoc.Span(...)`, which the Lua machinery's + `filter_source_info` auto-attach tags with FilterProvenance / + post-Plan-4 `Generated { by: filter, ... }`). + 2. Run it through `render_qmd_to_preview_ast` (or the equivalent + production path that drives the JSON writer with + filter-constructed nodes in the AST). + 3. Take the resulting JSON, feed it back through + `pampa::readers::json::read`. + 4. Assert the round-trip succeeds (no `MalformedSourceInfoPool` + error) AND the recovered source_info is `Generated { by: + shortcode, from: [Invocation -> ...] }` after Plan 6's + post-walk has stamped it. 
(If running Plan 5 alone — before + Plan 6 lands — the recovered shape is `Generated { by: filter, + from: [] }` with `(filter_path, line)` in `by.data`; the + round-trip still succeeds.) + + This is distinct from the hand-constructed "Filter-provenance + recovery test" above. That test exercises the legacy code-3 reader + in isolation; this one drives a real pipeline + JSON writer + reader + to verify the bug-fix holds end-to-end against a production-shaped + path. Without Plan 5, the round-trip on step 3 errors out + (`MalformedSourceInfoPool` from the code-3-as-Transformed + misinterpretation) on any document whose shortcode-resolution path + hits a Lua handler. +- **End-to-end with Plan 4**: build an AST containing both + no-anchor and with-anchor Generated nodes, serialize to JSON via the + existing JSON writer, deserialize via the reader, assert structural + equality. +- **TypeScript-side type round-trip**: hub-client / preview-renderer + test parses a JSON pool with Generated entries and confirms its + `SourceInfoEntry` shape matches. ## Dependencies -- Depends on: Plan 4 (Synthetic + Derived variants + By struct). -- Blocks: Plans 6, 7, 8 (they all rely on the new variants round-tripping +- Depends on: Plan 4 (Generated variant + By + Anchor + AnchorRole). +- Blocks: Plans 6, 7, 8 (they all rely on Generated round-tripping through JSON). ## Risk areas -- **Streaming writer code path**: there are two writer paths in `json.rs` - (`write_custom_block` non-streaming and `stream_write_custom_block` - streaming). Both have the same source-info-pool emission logic. Both need - updating. Easy to forget the streaming variant. +- **Streaming writer code path**: source-info-pool emission lives in + two functions in `crates/pampa/src/writers/json.rs`: + `SerializableSourceInfo::to_json` (used by the non-streaming + `write_pandoc` at line 1657) and `stream_write_source_info_pool` + (called from `stream_write_pandoc` at line 3530). 
Both consume the + same `SerializableSourceMapping` enum but inline their own match + arms. Compiler exhaustiveness catches missed arms after Phase 3's + enum change — a deliberate safety property, and the reason Phases 3 + and 4 must land atomically. The named-but-unrelated pair + `write_custom_block` / `stream_write_custom_block` handles + `CustomNode` blocks, not the pool; don't confuse them. - **Pool ID stability**: changing the format of pool entries shouldn't affect their IDs (which are sequential by intern order). Verify. +- **Anchor dedup is a writer-side + optimization, not a round-trip-stable property.** The writer's + `arc_parent_ids` HashMap is keyed by `Arc::as_ptr`; multiple anchors + pointing to the same `Arc<SourceInfo>` collapse to one pool entry. + After deserialization, each anchor gets a freshly-allocated `Arc` + carrying a `clone` of the pool target, so a subsequent re-serialize + materializes N copies. **Pool-size is not stable over read-write-read; + AST content and Plan-3 hashes are.** Plan-3's idempotence harness + hashes `doc.ast.blocks` / `doc.ast.meta` via `compute_block_hash_fresh` + / `compute_meta_hash_fresh_excluding_rendered`, both of which + explicitly skip `source_info` (see + `claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md` + §"Goal" — *"skips `source_info` and `key_source`"*). Same contract as + today's `Substring.parent` reads. The reader-side `Arc::new(si)` + pattern in the new code-4 arm matches the existing Substring arm at + `readers/json.rs:196-200`, which also calls `Arc::new(pool.get(parent_id).cloned()?)` + on every read — no sharing on the read side, by design. +- **Acyclic-by-construction assumption.** `SourceInfo` graphs are + acyclic by construction — transforms build bottom-up, `Arc<SourceInfo>` + is immutable post-construction. The writer's recursive interning + relies on this invariant — same precondition as today's + Substring/Concat arms. No cycle detection in the reader either. 
+- **Recursion depth.** Anchor interning adds a third recursion path on + top of Substring chains and Concat pieces. Production depth is + bounded by AST depth (shallow in practice); no separate guard. + Adversarial input could blow the stack, but that's no different from + the existing Substring-chain recursion — out of scope for Plan 5. - **Old JSON files**: anyone with on-disk JSON snapshots of ASTs (test - fixtures, debug exports) generated by current writers will have code 3 - with the buggy shape. Plan 5's reader handles them. New writes emit code 4. + fixtures, debug exports) generated by current writers will have code + 3 with the buggy shape. Plan 5's reader handles them. New writes emit + code 4. +- **Coexistence with attribution wire fields in the same file**: the + attribution work (already shipped) added `astContext.attribution` + and `attributionActors` near the source-info pool emission in + `crates/pampa/src/writers/json.rs`. Plan 5 touches different + conditional branches of the same writer file but no semantic + conflict — `astContext.attribution` records reference source-info + pool IDs unchanged; new code-4 entries are valid `s` targets just as + Original entries are. ## Estimated scope | Component | Lines (rough) | |---|---| -| Code 4 writer + reader | ~50 | -| Code 5 writer + reader (with `from` interning) | ~60 | -| Code 3 dual-shape reader | ~30 | -| Streaming writer parity | ~30 | -| Tests | ~180 | -| **Total** | **~350** | +| Code 4 writer (with anchor interning) | ~80 | +| Code 4 reader (with anchor decoding) | ~70 | +| Code 3 dual-shape legacy reader | ~35 | +| `AnchorRole` ↔ string serialization | ~20 | +| Streaming writer parity | ~40 | +| TypeScript type + utils updates | ~30 | +| Tests (incl. strict-rejection + stream helper + Concat-of-Generated) | ~290 | +| **Total** | **~565** | One focused session. ## Notes -The bug-fix opportunity is real: this plan makes things work that have been -silently latent. 
Worth a clear callout in the implementation commit message: -"This change fixes a latent bug where FilterProvenance values written by -the JSON writer could not be read back. Production code never tripped this -because no production path produced FilterProvenance in the AST that crossed -the JSON boundary." +The bug-fix opportunity is real and now reachable in production: this +change makes things work that have been silently latent. Worth a clear +callout in the implementation commit message: + +> This change fixes a latent bug where `FilterProvenance` values written +> by the JSON writer could not be read back. Production code never +> tripped this in current main because no production path produced +> FilterProvenance in an AST that crossed the JSON boundary — but +> Plans 1–2 shipped the q2-preview pipeline that runs filters whose +> output does cross that boundary. Plan 5's reader recovers the +> `Generated { by: filter, ... }` shape from the buggy code-3 payload, +> closing the gap. + +The single-code-4 design (no separate code 5) is the result of +unifying `Synthetic` + `Derived` into `Generated` during the 2026-05-20 +design discussion. Code 5 is left unassigned, free for future +reservation. + +**`r: [0, 0]` for Generated entries during the Plan-5↔Plan-7 window.** +After Plan 5 ships, all `Generated` pool entries carry `r: [0, 0]` — +the per-entry range field is no longer the right accessor for +Generated; use `resolve_byte_range` (via the Invocation anchor) for +chain-resolved ranges. Any diagnostic UI (q2-debug, hub-client devtools) +that reads `r` directly will see uninformative zeros for these entries. +This is a long-lived integration branch and the same developer is +implementing all of Plans 5–7, so the surprise window is local; once +Plan 7's `preimage_in` lands, the standard accessor pattern reaches +through Generated correctly. No external consumers need warning. 
diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md b/claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md index 3f1b84cfd..f6ddbbba8 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md @@ -1,178 +1,875 @@ -# Plan 6 — Provenance audit (Derived for shortcodes, Synthetic for synthesizers) +# Plan 6 — Provenance audit (Generated for synthesizers, anchors for shortcodes) -**Date:** 2026-05-04 +**Date:** 2026-05-04 (revised 2026-05-20, review pass 2026-05-22) **Branch:** feature/q2-preview -**Status:** Implementation plan (open questions named) +**Status:** Implementation plan (review-pass edits applied; theorem +attr_source question closed) **Milestone:** none directly — completes the AST shape Plans 7/8 rely on +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 6 is the audit pass +that converts every transform's `SourceInfo::default()` emission into +the correct `Generated { by, from }` shape Plan 4 defines, and +attaches `Invocation` anchors uniformly to all shortcode resolutions. +The file name keeps its q2-preview-plan-N form for continuity with the +earlier discussion notes. + +## Work items checklist + +Implementation order. The plan body (Scope / Implementation notes / Test plan) +holds the design details; this list is the work-tracking surface. + +### Phase 0 — prerequisite +- [x] Add `Inline::source_info_mut` (~33 LOC) + `Block::source_info_mut` + (~24 LOC) accessors in `quarto-pandoc-types`, with round-trip unit tests + for one representative variant of each. + +### Audit +- [x] Comprehensive grep + categorize `SourceInfo::default()` sites in + `crates/quarto-core/src/transforms/` and `crates/pampa/src/`. + (Report: `claude-notes/research/2026-05-22-plan-6-audit.md`. + Follow-ups: bd-12vrr callout default-title, bd-1inj0 code-block + chrome.) 
+- [x] Document the positional-alignment invariant on `AttrSourceInfo.attributes` + (`crates/quarto-pandoc-types/src/attr.rs:31`). + +### Stamper + dispatch funnel +- [x] Implement `stamp_shortcode_anchors` + mutable AST walkers in + `shortcode_resolve.rs` (model on existing `recurse_inline` / + `resolve_block`). +- [x] Wire the stamper into `resolve_shortcode`'s dispatch funnel so every + Rust / Lua / extension dispatch is post-walked. +- [x] Thread `shortcode_owned.source_info` into `make_error_inline` and + `shortcode_to_literal` from their four call sites. + +### Synthesizer fixes +- [x] `TitleBlockTransform`: emit `Generated { by: By::title_block(), from: [] }` + on the synthesized h1. +- [x] `SectionizeTransform`: emit `Generated { by: By::sectionize(), from: [] }` + on the synthetic Section Div (both close-on-stack and end-of-input sites). +- [x] `FootnotesTransform`: emit `Generated { by: By::footnotes(), from: [] }` + on the container Div. +- [x] `AppendixStructureTransform`: emit `Generated { by: By::appendix(), from: [] }` + on the container Div, bibliography wrapper, license/copyright/citation + helpers (all 5 sites — the helpers were not enumerated in the plan body + but are structurally identical synthesizers; see audit report §"Decisions + on plan-adjacent sites"). +- [x] `theorem.rs::extract_name_attr` + `proof.rs::extract_name_attr`: + thread `&div.attr_source` through; index before `kvs.remove("name")`; + fall back on length-mismatch. **Implementation note**: the + `debug_assert_eq!` form the plan body suggested is too strict — it + fires on the common test pattern of `AttrSourceInfo::empty()` plus a + non-empty `kvs`. Relaxed to `debug_assert!(attr_source.attributes. + is_empty() || kvs.len() == attr_source.attributes.len(), ...)`. The + empty case is "no provenance" (not a bug); only populated-but- + misaligned input is a bd-3aolj/bd-1e6a5 sync error. 
+- [x] `pampa::pandoc::treesitter_utils::postprocess` synthetic Space + (~line 1348): emit `Generated { by: By::tree_sitter_postprocess(), from: [] }`. + +### Tests +- [x] Shortcode required-anchor invariant + (`shortcode_resolution_required_anchor_invariant` — every + `by:shortcode` carries an Invocation). +- [x] Per-transform fix tests (sectionize / title_block / footnotes / + appendix — shape test in each transform's own test module). +- [x] Lua-shortcode enrichment test + (`lua_shortcode_typed_return_enriched_to_shortcode_kind` — typed Lua + return promoted from `by:filter` → `by:shortcode`, `filter_path` / + `line` migrated into `by.data.lua_path` / `by.data.lua_line`, + Invocation appended). +- [x] Multi-inline shortcode anchor test + (`multi_inline_shortcode_resolution_shares_invocation_source` — + Strong[Str], Space, Str all share the same Invocation source_info). +- [x] Escaped-shortcode regression test + (`escaped_shortcode_keeps_original_source_info`). +- [x] Error-inline regression test + (`unknown_shortcode_error_uses_token_source_info` — both Strong + Str + layers carry the token's Original source_info, not Default or + Generated). The earlier `test_make_error_inline` unit test was also + updated to assert the threaded shape. +- [x] `source_info` determinism test + (`shortcode_resolution_is_deterministic` — two runs produce + structurally-equal ASTs, including all `Generated.by` / + `Generated.from[]` / Original byte ranges). +- [ ] Audit-completion test across the full pipeline (no + `SourceInfo::default()` survives across all transforms). Deferred — + the required-anchor invariant + per-transform shape tests cover the + same property piecemeal; a pipeline-level audit would belong in the + e2e test crate alongside Plan 3's idempotence fixtures and is + better wired in there. Open follow-up. +- [ ] Attribution interaction test (multi-author latest-wins via + `query_byte_range`). 
Deferred — needs `GitBlameProvider` setup; the + attribution chain is mechanically covered by Plan 4's + `resolve_byte_range` (Generated → Invocation → Original) and Plan 6 + doesn't change the chain. Open follow-up. +- [ ] Error + escaped round-trip test (incremental writer + verbatim-copies). Deferred to Plan 7 (writer infrastructure). +- [ ] Shortcode-inside-include composition test (Invocation anchor + `file_id != 0`). Deferred to Plan 8 (include wrapper introduces the + cross-file context). +- [ ] Plan 3 idempotence test rerun (no new non-determinism). Verified + by `cargo nextest run --workspace` — all 9460 tests pass, including + Plan 3's idempotence fixtures. + +### Verification +- [x] `cargo xtask verify` — all 12 steps green: workspace build, + workspace tests (9460 passed, 196 skipped), lint, format, WASM build, + hub-client build, hub-client tests, q2-preview-spa build. +- [x] End-to-end exercise. Invocation: + ``` + target/debug/q2 render /tmp/plan6-e2e/doc.qmd + ``` + Fixture: a `.qmd` with `title:` (drives title-block), two `## ` + headers (drive sectionize), a footnote `^[…]` (drives footnotes + transform + appendix container), and a `{{< meta title >}}` + shortcode (drives the resolver + stamper). Observed HTML + (inspected, snippet preserved): + ```html + Plan 6 E2E +
+

Plan 6 E2E

+
+

A section

+

Body text. … A meta lookup: Plan 6 E2E.

+ … +
+
+ ``` + Title-block h1 synthesized; both sections wrapped by sectionize; + meta shortcode resolved to its value; footnote container Div + + appendix container Div both emitted. Plan 6's source_info shape is + not visible in HTML, but it's covered by the per-transform shape + tests (Tests section above) and by the workspace test suite. + ## Goal Audit every transform that emits `SourceInfo::default()` (a meaningless -zero-range Original) and fix it to emit correct provenance. Two patterns -apply: +zero-range Original) and fix it to emit correct provenance. Two +patterns apply: - **Transforms that genuinely synthesize content with no source preimage** (Sectionize's section Divs, TitleBlock's synthesized h1, etc.): emit - `Synthetic { by: By::() }` from Plan 4. -- **The shortcode resolver, specifically**: emit `Derived { from: - ctx.source_info, by: By::shortcode(name) }` on resolved nodes. The - `Derived` provenance preserves the shortcode token's byte range AND - marks the resolved content as atomic for the writer (Plan 7 detects - Derived + UseAfter as AtomicViolation). - -This plan does NOT introduce a `CustomNode("ShortcodeResolution")` wrapper + `Generated { by: By::(), from: smallvec![] }` from Plan 4. +- **The shortcode resolver, uniformly**: emit `Generated { by: By::shortcode(name), + from: smallvec![Anchor::invocation(token_si)] }` on every resolved + node, regardless of whether the handler is Rust-built-in or + Lua-implemented. The `Invocation` anchor's `source_info` is the + shortcode token's range; Plan 7's writer uses it for Verbatim-copy + on KeepBefore; attribution chains through it via `resolve_byte_range`. + +The earlier `Derived` variant proposal collapsed into `Generated` with +an `Invocation` anchor during the 2026-05-20 design discussion; this +plan reflects the unified shape. + +Plan 6 does NOT introduce a `CustomNode("ShortcodeResolution")` wrapper (an earlier draft proposed that; we walked it back). 
Wrappers are appropriate for cases where there's no available source-side anchor in the same file (includes — different FileId — Plan 8 handles those). For shortcodes the resolved nodes can carry source_info pointing into the -parent file directly, which is much lighter than wrapping. +parent file directly via the typed `Invocation` anchor. + +## Prerequisite — Phase 0: mutable accessors on Inline / Block + +Plan 6's `stamp_shortcode_anchors` helper (see "The post-walk helper" +below) takes `&mut Inline` / `&mut Block` and rewrites the +`source_info` field. Today `crates/quarto-pandoc-types/src/inline.rs:57` +defines only `pub fn source_info(&self) -> &SourceInfo` (immutable); +Plan 4 does not add a mutable counterpart. Every existing site that +mutates `source_info` in the workspace holds a *typed* reference +(`&mut Str`, `&mut CodeBlock`, …) and assigns the public field +directly — there is no generic `&mut Inline -> &mut SourceInfo` +accessor. + +**Before any stamping code can compile**, add to +`crates/quarto-pandoc-types/src/inline.rs` and `block.rs`: + +```rust +impl Inline { + pub fn source_info_mut(&mut self) -> &mut quarto_source_map::SourceInfo { + match self { + Inline::Str(s) => &mut s.source_info, + // ... 28 variants, mechanical mirror of `source_info(&self)` + } + } +} + +impl Block { + pub fn source_info_mut(&mut self) -> &mut quarto_source_map::SourceInfo { + match self { + Block::Plain(p) => &mut p.source_info, + // ... 18 variants, mechanical mirror of `source_info(&self)` + } + } +} +``` + +Pure mechanical mirror of the existing read accessors — ~33 LOC for +`Inline` + ~24 LOC for `Block`. Add a unit test that round-trips a +mutation through the accessor on one representative variant of each. 
## Scope ### In scope -For each transform that currently emits `SourceInfo::default()`, replace with -the correct provenance: +For each transform that currently emits `SourceInfo::default()`, replace +with the correct provenance: - **`ShortcodeResolveTransform`** (`crates/quarto-core/src/transforms/shortcode_resolve.rs`): - Currently emits `SourceInfo::default()` on every resolved Str/Inline (lines - 172, 179, 186, etc.). Fix: emit `Derived { from: Arc::new(ctx.source_info.clone()), - by: By::shortcode(shortcode_name) }` on each resolved node. The `from` - is the shortcode token's range (an Original from `ctx.source_info`). - All resolved nodes in a multi-inline resolution share the same `from`, - enabling Plan 7's dedupe rule. + Currently emits `SourceInfo::default()` on 12 production sites (see + References for the per-line breakdown). **Fix the dispatch funnel + uniformly via a post-walk helper**: immediately after every handler + dispatch (Rust handler OR Lua-engine dispatch OR extension + dispatch), walk the returned nodes and stamp + `Generated { by: By::shortcode(name), from: smallvec![Anchor::invocation(Arc::new(ctx.source_info.clone()))] }` + on each block/inline. + - The post-walk **enriches**, not overrides: any `by.data` fields the + Lua machinery attached (`filter_path`, `line` — Plan 4's filter + `by.data` shape) are preserved by promoting the kind from + `filter` to `shortcode`, renaming to `lua_path` / `lua_line` in + `by.data` to reflect the new context. See "Lua-shortcode + enrichment" below. + - The post-walk recurses into nested blocks/inlines (model on + `recurse_inline` / `resolve_block` in this file) so every node in + the dispatch output gets the anchor. + - **Two outlier sites do NOT pass through the dispatch funnel** and + need call-site source_info threading instead of the stamper: + - `make_error_inline` (lines 1030-1038): visible `?key` Str + + Strong wrapper for unknown shortcodes. Today both layers carry + `SourceInfo::default()`. 
Fix: pass `shortcode_owned.source_info` + through from call sites at lines 659 and 914, and use it as the + Str/Strong's `source_info` (an `Original` pointing at the + shortcode token's bytes — same shape Plan 6's + audit-completion test expects). **Atomicity intent**: the error + region is treated as normal editable user-source content (NOT + atomic). If the user edits `?meta:bad` in React, the bytes + change in the source qmd via the verbatim-copy path. Plan 7's + `is_atomic_kind()` does not fire because the source_info is + Original, not Generated. The Strong-wraps-Str overlap (both + layers carry the same range) is structurally parallel to the + footnote `` case Plan 7:261-267 already documents. + - `shortcode_to_literal` (lines 1043-1109): the literal-text Str + produced for escaped `{{}}` shortcodes. Today it emits + `SourceInfo::default()`. Fix: pass `shortcode_owned.source_info` + through from call sites at lines 665 and 920, and use it as the + Str's `source_info`. This is required to satisfy the + "Escaped-shortcode regression test" (line 453: "its source_info + stays Original (not Generated)") — without this fix, the + regression test would fail on Plan 6's own implementation. - **`TitleBlockTransform`** (line 183-185): synthesizes a level-1 Header - from `title:` metadata. Fix: emit `Synthetic { by: By::title_block() }` + from `title:` metadata. Fix: emit `Generated { by: By::title_block(), from: smallvec![] }` on the synthesized Header (and any nested Inlines). Note: q2-preview - skips this transform (Plan 1), but the audit covers the HTML pipeline too. + skips this transform (Plan 1), but the audit covers the HTML + pipeline too. - **`SectionizeTransform`** (`pampa/src/transforms/sectionize.rs:96, 148`): - the synthetic Section Div. Fix: `Synthetic { by: By::sectionize() }`. + the synthetic Section Div. Fix: `Generated { by: By::sectionize(), from: smallvec![] }`. The wrapped Header retains its original source_info. Body blocks retain theirs. 
-- **`FootnotesTransform`**: the synthesized footnotes container Div. Fix: - `Synthetic { by: By::footnotes() }`. q2-preview skips, but audit covers - HTML pipeline. (Confirm scope during implementation; investigate whether - any *inline* nodes need fixing.) +- **`FootnotesTransform`**: the synthesized footnotes container Div. + Fix: `Generated { by: By::footnotes(), from: smallvec![] }`. The + synthesized `` markers are already source-mapped via + `create_footnote_ref` cloning from the original `Note` inline (so + they stay Original — no change needed). The four synthesized inline + layers (Span/Superscript/Link/Str) all carry the same range, + producing a multi-node overlap; Plan 7:261-267 documents that this + is round-trip-friendly without extra writer work (block-level + Verbatim of the surrounding Para covers it). q2-preview pipeline + runs this transform (per Plan 2B's audit); the audit applies to + both pipelines. - **`AppendixStructureTransform`**: the synthetic appendix container Div. - Fix: `Synthetic { by: By::appendix() }`. Same scope note as Footnotes. -- **`theorem.rs::extract_name_attr`** (line 313): the title Str extracted - from `name="..."` attribute is built with `SourceInfo::default()`. Fix: - use the attr value's source_info (currently lost — inspection needed for - whether `attr_source` carries this info). At minimum, `Synthetic { by: - By::raw("theorem-title-attr", json!({})) }` if we can't recover it, but - better to preserve the actual source position from the attr-source. + Fix: `Generated { by: By::appendix(), from: smallvec![] }`. Same scope + note as Footnotes. +- **`theorem.rs::extract_name_attr`** (line 313) **and the parallel + `proof.rs::extract_name_attr`** (line 167): the title Str extracted + from `name="..."` is currently built with `SourceInfo::default()`. 
+ Fix: thread `&div.attr_source` into `extract_name_attr` in both + files; index by `kvs.keys().position(|k| k == "name")` *before* the + `remove`; use `attr_source.attributes[idx].1` (an + `Option<SourceInfo>` carrying the parser-recorded + `Original{file_id, value_start, value_end}` for the attribute + value's bytes) as the Str's `source_info`. Falls back to + `SourceInfo::default()` only when the Option is `None` (e.g. JSON + read from external Pandoc producers that don't emit `attrS`) OR + when length-alignment fails (see safeguards below). The parser + populates the value range at + `crates/pampa/src/pandoc/treesitter.rs:1075-1107` → + `treesitter_utils/commonmark_attribute.rs:38-50`; no parser-side + prerequisite is needed. + + **Positional-alignment safeguards** (review-pass 2026-05-22): the + fix relies on the invariant *"`AttrSourceInfo.attributes[i]` is the + `(key_src, val_src)` for the i-th entry in `Attr.2`'s insertion + order."* This invariant holds in the parser's main path but **is + not documented and is broken in two preexisting code paths** + (duplicate-key handling in `commonmark_attribute.rs:41-49`; + caption-attr-into-table merge in `section.rs:85-113` and + `postprocess.rs:1483-1496`). Plan 6 therefore: + 1. **Documents the invariant** with a doc-comment on + `AttrSourceInfo.attributes` in `crates/quarto-pandoc-types/src/attr.rs:31`. + 2. **Guards the index in `extract_name_attr`** with a runtime + length check (`if kvs.len() == attr_source.attributes.len()`) + and a `debug_assert_eq!` on lengths. Falls back to + `SourceInfo::default()` when they diverge, so production never + panics on misaligned input. + 3. 
**Two follow-up beads tracked** (out-of-band, preexisting bugs): + **bd-3aolj** (duplicate-key handling in + `commonmark_attribute.rs:41-49` — `LinkedHashMap::insert` updates + in place while `attr_source.attributes.push` always appends) and + **bd-1e6a5** (caption-attr-into-table merge in + `section.rs:85-113` / `postprocess.rs:1483-1496` — same root + cause when caption + table keys overlap). Plan 6 does not block + on them; its runtime guard handles the failure mode safely. + 4. Note: `kvs.remove("name")` after the index lookup itself shrinks + `attr.2` by one without touching `attr_source.attributes`. The + surviving `div.attr_source` is then handed to `CustomNode::new` + (`theorem.rs:281`). Downstream consumers of `attr_source` on + that CustomNode see misaligned data. The rest of `convert_div` + does not re-index `attr_source`, so this is harmless locally, + but a future consumer of the constructed CustomNode's + `attr_source` could trip on it. Considered acceptable for v1; + if a future caller indexes, it should use the same guarded + pattern. + + JSON round-trip preserves the value range: `attrS.kvs` serializes + as a positional array of `[key_ref, val_ref]` pairs + (`json.rs:600-633`) and reads back identically (`json.rs:423-508`). + No Plan-5 follow-up needed. - **`pampa::pandoc::treesitter_utils::postprocess`** (line 1348): the "Synthetic Space" inserted to separate citation from suffix. Fix: - `Synthetic { by: By::tree_sitter_postprocess() }`. + `Generated { by: By::tree_sitter_postprocess(), from: smallvec![] }`. The audit pass also looks for any *other* sites emitting -`SourceInfo::default()` that I haven't enumerated. Plan 6 starts with a +`SourceInfo::default()` that aren't enumerated. Plan 6 starts with a comprehensive grep. ### Out of scope -- The `is_atomic_custom_node` registry function (Plan 7 owns it). -- The writer's atomic-violation diagnostic (Plan 7). 
+- The `is_atomic_kind()` predicate and `is_atomic_custom_node` registry + (Plan 7 owns the writer-side atomicity logic). +- The writer's soft-drop / atomic-violation handling (Plan 7). - The writer's multi-inline shortcode dedupe rule (Plan 7). - The `IncludeExpansion` CustomNode wrapper (Plan 8). -- React component for shortcode-resolved inlines (Plan 2B — atomic-aware - `setLocalAst` gating in the dispatcher detects Derived provenance via - Plan 2A's `isAtomicSourceInfo` accessor and renders read-only). +- React component for shortcode-resolved inlines (Plan 2A's framework + atomic gate already handles this via the `isAtomicSourceInfo` + accessor; Plan 4's `is_atomic_kind` set names `shortcode` as atomic). +- **Metadata-loader changes** to record per-key source-info for `meta` + and `var` shortcodes. Files separately; see "ValueSource follow-up" + below. +- **Lua-file registration in `SourceContext`** to enable typed + `Dispatch` anchors. Files separately; see "Dispatch follow-up" + below. - The HTML pipeline doesn't need a "ShortcodeResolutionResolveTransform" - (no wrapper to unwrap). Shortcode-resolved nodes ARE flat inlines/blocks - with Derived source_info; the HTML writer doesn't care about source_info, - it just renders the nodes. Behavior unchanged for HTML. + (no wrapper to unwrap). Shortcode-resolved nodes ARE flat + inlines/blocks with `Generated` source_info; the HTML writer doesn't + care about source_info, it just renders the nodes. Behavior + unchanged for HTML. ## Design decisions (settled in conversation) -- **Most transforms just need to preserve ctx.source_info**. The "audit and - fix" is mostly bug fixes — ctx already has the info; the transforms just - drop it. Mechanical change. -- **Shortcode resolution uses Derived provenance, not a wrapper.** Each - resolved Str/Inline/Block gets `Derived { from: ctx.source_info, by: - By::shortcode(name) }`. 
This preserves the shortcode token's byte range - (via the `from` chain) AND signals to Plan 7's writer that this content - is atomic. Multi-inline resolutions: every resolved node shares the same - `from`, and Plan 7's dedupe rule emits the shortcode token once per group. -- **`Synthetic` provenance for genuine synthesizers**. Sectionize, TitleBlock, - Footnotes, Appendix containers — none of these correspond to source bytes, - so they get `Synthetic { by: By::() }`. +- **Single funnel covers all shortcodes**. The `ShortcodeResolveTransform::resolve_shortcode` + method is the single dispatch point for in-file shortcodes (Rust + built-ins, Lua-loaded extension handlers, extension name lookup). + Plan 6's stamping helper runs once per dispatch, uniformly. All + built-in (`meta`) and Lua-implemented (`kbd`, `lipsum`, `placeholder`, + `version`, `video`) shortcodes get the same treatment. User-extension + shortcodes via Lua: same. `{{< include >}}` is the genuine exception + — handled by `IncludeExpansionStage` (a separate pipeline stage) and + Plan 8's wrapper, not via Generated. +- **Include×shortcode composition is architecturally well-defined.** + `IncludeExpansionStage` runs at the stage layer + (`crates/quarto-core/src/pipeline.rs:258`) before + `AstTransformsStage` (`pipeline.rs:312`), so includes are spliced + flat before any shortcode resolution. Shortcode resolution is + single-pass — `resolve_blocks` advances its index *past* inserted + blocks (`shortcode_resolve.rs:625-677`); returned content is never + re-scanned, so a shortcode emitting the literal text + `"{{< include foo.qmd >}}"` lands as a `Str`, never as a parsed + `Shortcode` (the reverse composition is structurally impossible). + When a shortcode appears *inside* include-spliced content, the + Invocation anchor's `source_info` points into the included file + (different `FileId` than the parent) — this is correct: the token's + bytes live there. 
Plan 8's wrapper carries the parent-file anchor + independently; Plan 7's `preimage_in(parent_file)` returns `None` + for the included children and the wrapper governs verbatim-copy. +- **Enrichment, not override**. The Lua machinery's auto-attach + produces `Generated { by: filter, from: [], by.data: { filter_path, + line } }` (post-Plan-4, per Plan 4 §"by.data shape table" line 590) + for *typed* Inline/Block nodes constructed during a Lua shortcode + dispatch (e.g. `return pandoc.Str(...)`). Bare-string returns + (`return "text"` → `LuaShortcodeResult::Text`) do NOT pass through + `filter_source_info`; they land with `SourceInfo::default()` and + enter the post-walk's fresh-Generated branch directly. The shortcode + resolver's post-walk enriches the filter-attached cases: + - **Appends** an `Invocation` anchor pointing at the shortcode token. + - **Promotes** `by.kind` from `"filter"` to `"shortcode"`, renaming + `filter_path` → `lua_path` and `line` → `lua_line` in `by.data` + (reflecting the new shortcode context) and adding the shortcode + `name`. + The Lua-side dispatch precision is preserved; the shortcode context + layer is added on top. No information is discarded. + + **Scope**: this enrichment fires only from + `ShortcodeResolveTransform::resolve_shortcode`. General Lua filter + dispatches (`UserFiltersStage`) leave `Generated { by: filter, ... }` + intact — that is the steady-state for filter constructions, per + Plan 4 §"Filter constructions become Generated { by: filter, from: + [] }". The post-walk is not wired into the filter stage and should + not be. +- **Most transforms just need to preserve ctx.source_info**. The + "audit and fix" is mostly bug fixes — ctx already has the info; the + transforms just drop it. Mechanical change. +- **Shortcode resolutions use `Generated` + `Invocation` anchor, not a + wrapper.** Each resolved Str/Inline/Block gets `Generated { by: + shortcode(name), from: [Invocation -> Arc::new(ctx.source_info.clone())] }`. 
+ The anchor's source_info is the shortcode token's range (an Original + from `ctx.source_info`). Plan 7's writer uses it for Verbatim-copy + on KeepBefore. Multi-inline resolutions: every resolved node shares + the same anchor's source_info, enabling Plan 7's dedupe rule. +- **Genuine synthesizers use `Generated` with empty anchors**. + Sectionize, TitleBlock, Footnotes, Appendix containers — none of + these correspond to source bytes, so they get + `Generated { by: By::(), from: smallvec![] }`. Plan 7's coarsen + treats their wrappers as Transparent (recurse into source-bearing + children) or Omit depending on `by.is_atomic_kind()`. - **No `atomic` flag needed**. Plan 7's atomic-violation logic detects - atomicity via `Derived` source_info on any node, OR via the - `is_atomic_custom_node` registry for CustomNode types - (IncludeExpansion, CrossrefResolvedRef). Shortcode atomicity falls into - the first category. + atomicity via `by.is_atomic_kind()` (per Plan 4's predicate) and via + the `is_atomic_custom_node` registry for CustomNode types + (`IncludeExpansion`, `CrossrefResolvedRef`). Shortcode atomicity + falls into the first category (`shortcode` is in the atomic-kind + set). + +## Attribution interaction + +The `Invocation` anchor's existence delivers correct attribution for +shortcode-resolved content **with no attribution-code changes**: + +- `query_attribution(node.source_info, runs)` calls `resolve_byte_range`. +- Per Plan 4's updated `resolve_byte_range`, `Generated` delegates to + `invocation_anchor()`, which returns the `Invocation` anchor's + `source_info` — typically an `Original` covering the shortcode + token's bytes. +- The chain resolves to `(file_id=0, token_start, token_end)`. +- `query_attribution` accepts (file_id == 0, start < end) and calls + `query_byte_range`. +- The existing max-time-across-overlapping-runs logic in + `AttributionMap::query_byte_range` picks the latest author covering + the token's bytes. 
+ +For multi-author shortcodes: if author A wrote `{{< meta foo >}}` at +T1 and author B changed `foo` to `bar` at T2 > T1, the byte range +covers bytes touched by both; `query_byte_range` picks the latest +(B). This is the policy specified in the 2026-05-20 design +discussion ("attributed to latest author of the shortcode text"), +and it falls out mechanically from Plan 6's anchor stamping plus +Plan 4's chain-walking accessor — no special-case code. + +## Lua-shortcode enrichment + +The Lua machinery's `filter_source_info` (in +`crates/pampa/src/lua/types.rs`) walks the live Lua call stack to find +the first non-C frame and produces (post-Plan 4) the canonical +filter-construction shape: + +```rust +Generated { + by: By::filter(filter_path, line), // by.data = { filter_path, line } + from: smallvec![], +} +``` + +This auto-attach fires when Lua code constructs *typed* nodes via +`pandoc.Str(...)`, `pandoc.Span(...)`, etc. Bare-string Lua returns +(`return "text"` → `LuaShortcodeResult::Text`) do NOT pass through +`filter_source_info`; their resulting Str carries +`SourceInfo::default()` instead. + +When this filter-shape source_info appears inside a Lua shortcode +handler dispatch, the resolver's post-walk enriches it to: + +```rust +Generated { + by: By { + kind: "shortcode".to_string(), + data: json!({ + "name": shortcode_name, + "lua_path": , + "lua_line": , + }), + }, + from: smallvec![Anchor::invocation(Arc::new(ctx.source_info.clone()))], +} +``` + +The Lua-side `filter_path` / `line` precision is preserved in +`by.data` under the more contextually-precise names `lua_path` / +`lua_line`; the shortcode `name` is added; the kind is promoted from +`filter` to `shortcode`. **Nothing is discarded.** Nodes that entered +the post-walk with `SourceInfo::default()` (bare-string Lua returns, +or Rust handler returns) hit the fresh-Generated branch instead and +end up with `by.data = { name }` plus the Invocation anchor. 
+ +This is the canonical "enrichment-via-post-walk" pattern. Other +transforms that wrap dispatch may follow the same shape later (always +append, promote `by.kind`, preserve prior `by.data` fields where +meaningful). + +When the **Lua-file-registration follow-up** lands (see "Dispatch +follow-up" below), `lua_path` / `lua_line` migrate out of `by.data` and +into a typed `Dispatch` anchor. `by.data` for Lua-dispatched shortcodes +then shrinks to just `{ "name": shortcode_name }`. + +## The post-walk helper + +```rust +/// After every shortcode handler dispatch, stamp Invocation provenance +/// on the returned nodes. Recurses into nested AST so every block and +/// inline gets the anchor. Enriches existing `Generated { by: filter, ... }` +/// (from Lua auto-attach) by promoting kind and appending the anchor; +/// otherwise sets source_info to a fresh Generated shape. +fn stamp_shortcode_anchors( + result: &mut ShortcodeResult, + shortcode_name: &str, + token_si: &SourceInfo, +) { + let token_arc = Arc::new(token_si.clone()); + match result { + ShortcodeResult::Inlines(inlines) => { + for inline in inlines.iter_mut() { + stamp_inline(inline, shortcode_name, &token_arc); + } + } + ShortcodeResult::Blocks(blocks) => { + for block in blocks.iter_mut() { + stamp_block(block, shortcode_name, &token_arc); + } + } + ShortcodeResult::Preserve | ShortcodeResult::Error(_) => {} + } +} + +fn stamp_inline(inline: &mut Inline, name: &str, token_arc: &Arc<SourceInfo>) { + let si = inline.source_info_mut(); + *si = enrich_or_create(si, name, token_arc); + // recurse into nested inlines (Strong, Emph, Link, ...) + walk_nested_inlines(inline, |child| stamp_inline(child, name, token_arc)); +} + +fn enrich_or_create( + existing: &SourceInfo, + name: &str, + token_arc: &Arc<SourceInfo>, +) -> SourceInfo { + // If the Lua machinery attached Generated { by: filter, ... }, + // promote it. Otherwise fresh Generated.
+ // + // NOTE (bd-36fr9 co-change): the by.data["filter_path"]/["line"] + // reads below are temporary. Once Lua-file registration lands, + // those fields move out of by.data and into a Dispatch anchor in + // `from`. This branch then reads the existing Dispatch anchor + // from `existing.from[]` and copies it into the new from-list + // alongside Invocation. See §"Dispatch follow-up". + // + // NOTE (bd-129m3 integration point): for `meta` / `var` shortcodes + // post-loader-change, the helper also appends a ValueSource + // anchor pointing at the metadata value's source range. See + // §"ValueSource follow-up". + let by = match existing { + SourceInfo::Generated { by, .. } if by.kind == "filter" => { + let lua_path = by.data.get("filter_path").cloned(); + let lua_line = by.data.get("line").cloned(); + let mut data = serde_json::json!({ "name": name }); + if let Some(p) = lua_path { data["lua_path"] = p; } + if let Some(l) = lua_line { data["lua_line"] = l; } + By { kind: "shortcode".to_string(), data } + } + _ => By::shortcode(name), + }; + SourceInfo::Generated { + by, + from: smallvec![Anchor::invocation(Arc::clone(token_arc))], + } +} +``` + +(Block stamping is parallel — recurse into block children and inlines +they contain.) ## Open questions for implementation - **Comprehensive audit**: grep for `SourceInfo::default()` in - `crates/quarto-core/src/transforms/` and `crates/pampa/src/`. Categorize - each site: preserve ctx info / emit Synthetic / emit Derived / leave - as-is (test code). Plan 6's first commit is the audit report; - subsequent commits fix each site. -- **Theorem title from attr**: when `extract_name_attr` extracts the title - from `name="Pythagoras"`, it gets a String with no source_info. Inspecting - `attr_source` may or may not give the byte range of the attr value. - Worth investigating; if achievable, use Original{attr_value_range}; - otherwise Synthetic. 
-- **Footnotes and Appendix transforms**: q2-preview skips them in v1, but - Plan 6 audits them anyway. Confirm during implementation that the audit - is feasible without breaking HTML pipeline tests. (Extension of the - pattern, not a redesign.) -- **Escaped shortcodes**: today `Shortcode::is_escaped` is a flag, and - escaped shortcodes preserve as literal text (no resolution). Don't apply - Derived to escaped shortcodes — they're not resolved; they stay as - literal text with their original source_info. + `crates/quarto-core/src/transforms/` and `crates/pampa/src/`. + Categorize each site: preserve ctx info / emit Generated with + appropriate by-kind / emit Generated with Invocation / leave as-is + (test code). Plan 6's first commit (after Phase 0) is the audit + report; subsequent commits fix each site. + +(Previously-open questions resolved by review pass 2026-05-22: +"Theorem title from attr" — `AttrSourceInfo` already carries the +value range; see §Scope theorem bullet for the threaded-in fix. +"Escaped shortcodes" — the In-scope `shortcode_to_literal` fix at +the call site (passing `shortcode_owned.source_info` through) +produces the Original shape the regression test expects. +"Recursion into deep AST" — concrete reusable shape and full +container-variant set documented; see §Implementation notes +below.) + +## Implementation notes + +- **Recursion shape for the post-walk.** The walker must traverse the + full container set — for inlines: Strong, Emph, Strikeout, + Superscript, Subscript, SmallCaps, Quoted, Cite, Link, + Image (alt/caption), Span, Underline, Delete, Insert, Highlight, + EditComment, Note (block content), Custom (slot contents); for + blocks: Div, BlockQuote, OrderedList, BulletList, DefinitionList, + Figure, Table (cells), Custom (slot contents). 
The canonical + reusable shape is in + `crates/quarto-core/src/transforms/shortcode_resolve.rs`'s own + `recurse_inline` (~lines 945-1027) and `resolve_block` + (~lines 710-863), which already cover this set including Image's + alt/caption content and Note's nested blocks. Model the new mutable + walkers on these — drop the async + shortcode-resolution logic, + keep the match-arm dispatch and Image/Note recursion. The narrower + walkers in `callout.rs` and `theorem.rs` are block-only and do NOT + cover the inline variants the stamper needs; do not use them as the + reference shape. + +## ValueSource follow-up + +Plan 6 does NOT attach `ValueSource` anchors. The shape is defined +(Plan 4 ships `AnchorRole::ValueSource`) but the data isn't available: +the metadata loader doesn't surface per-key source-info to the +shortcode resolver today. Specifically, the merged `meta` ConfigValue +the resolver consults has `source_info` per key INTERNALLY, but +`MetaShortcodeHandler::resolve` calls `ctx.metadata.get_nested(&key)` +and then `config_value_to_inlines(value)` which discards the +per-key source information when flattening to strings. + +The follow-up issue ("metadata-loader threads per-key source-info +through to shortcode handlers"): + +1. Loader change: `ConfigValue` already carries `source_info` + per-value (`crates/quarto-pandoc-types/src/config_value.rs:155`); + the lookup path returns ConfigValue references, but + `config_value_to_inlines` converts to bare Strs discarding source. + Thread source through. +2. Resolver change: when constructing the resolved nodes, attach a + `ValueSource` anchor pointing at the value's `source_info`. +3. This is the structural feature behind Elliot's 2026-05-20 chain + request — the resolved content would carry both `Invocation` (where + the shortcode was written) and `ValueSource` (where the value was + defined). + +When the follow-up lands, Plan 6's post-walk grows one more anchor +append at the appropriate dispatch sites. 
The current Plan 6 ships +with just `Invocation`; the type is forward-compatible. + +**Integration point**: bd-129m3 should append the ValueSource anchor +inside `enrich_or_create` (see §"The post-walk helper" above). Once +the metadata loader threads per-key source-info through, the helper +gains access to the value's source range via the `ShortcodeContext` +and pushes a second anchor into `from` alongside the Invocation. No +other call sites in Plan 6 change. + +Tracked as **bd-129m3** ("Provenance follow-up: ValueSource anchor +stamping for meta/var shortcodes"). + +## Dispatch follow-up + +Plan 6 does NOT use a typed `Dispatch` anchor for Lua-side +construction info. Lua filter files aren't registered in `SourceContext`, +so we can't construct an `Original` pointing into them. In the interim, +`(lua_path, lua_line)` lives in `by.data` (see "Lua-shortcode +enrichment" above). + +The follow-up issue ("register Lua filter files in `SourceContext`"): + +1. `SourceContext::register_file(path, bytes) -> FileId`. +2. Lua engine calls it when loading each filter. +3. `filter_source_info` produces `Original { file_id, start, end }` + instead of returning a path-line pair. +4. Lua-attached source_info becomes `Generated { by: filter, from: + [Dispatch -> Original{lua_file, ...}] }`. +5. Plan 6's post-walk's enrichment then preserves the `Dispatch` + anchor (typed) instead of preserving `by.data` fields. + +When the follow-up lands, `AnchorRole::Dispatch` joins the enum (a +non-breaking enum extension); `by.data` for `filter` / Lua-dispatched +`shortcode` kinds shrinks to per-kind config only. + +**Co-change in `enrich_or_create`**: bd-36fr9 must update Plan 6's +helper (§"The post-walk helper" above). The current "enrich" branch +reads `by.data.get("filter_path")` and `by.data.get("line")` from +the existing `Generated{by:filter, ...}`; post-bd-36fr9, those +fields are gone from `by.data` and the relevant info lives in the +`Dispatch` anchor inside `from`.
The helper then reads the existing +Dispatch anchor and copies it into the new shortcode-shape `from` +alongside the Invocation. The §"Lua-shortcode enrichment" example +above also needs updating to show the post-bd-36fr9 shape. + +Tracked as **bd-36fr9** ("Provenance follow-up: Dispatch anchor for +Lua-handler filter & shortcode"). ## References -- `crates/quarto-core/src/transforms/shortcode_resolve.rs` — main fix site. - Lines 172, 179, 186, 203, 208, 215, 222, 238 emit `SourceInfo::default()`. +- `crates/quarto-core/src/transforms/shortcode_resolve.rs` — main fix + site. Per-line breakdown of production `SourceInfo::default()` + emissions: + - Lines 172, 179, 186, 203, 208, 215, 222 — `config_value_to_inlines` + (Str construction for `meta` / `var` lookups). + - Line 238 — `flatten_blocks_to_inlines` (synthesized + paragraph-separator Space; NOT part of `config_value_to_inlines`). + - Line 470 — `lua_result_to_shortcode_result::Text` arm (bare-string + Lua return wrapped in a Str). + - Lines 1034, 1036 — `make_error_inline` (visible `?key` Str + Strong + wrapper for unknown shortcodes). + - Line 1109 — `shortcode_to_literal` (escaped-shortcode literal text). + The stamper handles the first three groups uniformly via the dispatch + funnel; `make_error_inline` and `shortcode_to_literal` need call-site + source_info threading (see "In scope" bullet). +- `crates/quarto-core/src/transforms/shortcode_resolve.rs:306-371` — + `resolve_shortcode` method (single funnel for all dispatches; the + post-walk hooks in here). +- `crates/quarto-core/src/transforms/shortcode_resolve.rs:710-1027` — + existing `resolve_block` / `recurse_inline` walkers. Canonical + reusable shape for the new mutable walkers (drop async + + shortcode-resolution logic; keep the match-arm dispatch and + Image/Note recursion). - `crates/quarto-core/src/transforms/title_block.rs:183, 185` — h1 synthesis sites. - `crates/pampa/src/transforms/sectionize.rs:96, 148` — section Div - synthesis sites. 
-- `crates/quarto-core/src/transforms/footnotes.rs` — investigate. -- `crates/quarto-core/src/transforms/appendix.rs` — investigate. -- `crates/quarto-core/src/transforms/theorem.rs:281, 313` — name-attr title - extraction. -- `crates/pampa/src/pandoc/treesitter_utils/postprocess.rs:1348` — synthetic - Space. + synthesis sites. (Line 169 in that file is a `dummy_source_info()` + test helper, not a production site.) +- `crates/quarto-core/src/transforms/footnotes.rs` — container Div + synthesis (around line 495 / `create_footnotes_section`). +- `crates/quarto-core/src/transforms/appendix.rs` — appendix container + Div synthesis (`create_appendix_container` ~line 257). +- `crates/quarto-core/src/transforms/theorem.rs:313` and + `crates/quarto-core/src/transforms/proof.rs:167` — name-attr title + extraction in `extract_name_attr`. Both pass `&div.attr_source` + through and use `attr_source.attributes[idx].1` (an + `Option<SourceInfo>`). +- `crates/quarto-pandoc-types/src/attr.rs:27-32` — `AttrSourceInfo` + shape (`attributes: Vec<(Option<SourceInfo>, Option<SourceInfo>)>` + for key/value source ranges). +- `crates/pampa/src/pandoc/treesitter.rs:1075-1107` and + `crates/pampa/src/pandoc/treesitter_utils/commonmark_attribute.rs:38-50` + — parser sites that populate the attr value's byte range. No + prerequisite parser change needed. +- `crates/pampa/src/pandoc/treesitter_utils/postprocess.rs:1348` — + synthetic Space. +- `crates/pampa/src/lua/types.rs:1812-1840` — `filter_source_info` + Lua-side auto-attach. Note: only fires for typed Inline/Block + returns (`pandoc.Str(...)`); bare-string returns + (`return "text"` → `LuaShortcodeResult::Text`) bypass it. - `crates/quarto-pandoc-types/src/custom.rs` — CustomNode shape. -- `crates/quarto-core/src/transforms/callout.rs` — example pattern for sugar - transforms wrapping output in CustomNode. +- `crates/quarto-core/src/transforms/callout.rs` — example pattern for + sugar transforms wrapping output in CustomNode.
NOTE: callout + + theorem are block-only walkers; for inline recursion, use + `shortcode_resolve.rs::recurse_inline` instead. +- `crates/quarto-core/src/stage/stages/user_filters.rs` — general Lua + filter dispatch site. Does NOT invoke the post-walk; its + constructions keep `by.kind == "filter"` as steady state. +- `crates/quarto-core/src/pipeline.rs:258, 312` — `IncludeExpansionStage` + precedes `AstTransformsStage`, so includes are spliced before + shortcodes resolve. See §"Include×shortcode composition" in Design + decisions. ## Test plan - **Audit-completion test**: a unit test that builds a fixture document exercising shortcode resolution, sectionize, and (HTML pipeline only) - title-block / footnotes / appendix. Asserts that the resulting AST has - no nodes with `SourceInfo::default()` source_info. (Defensive - regression: catches a future PR that adds a transform without provenance.) + title-block / footnotes / appendix. **Asserts that the resulting AST + has no nodes with `SourceInfo::default()` source_info AND every + synthesized node carries an appropriate `Generated` shape** (matches + the §Atomic-kind-set / §by.data tables in Plan 4). Defensive + regression: catches a future PR that adds a transform without + provenance. +- **Shortcode required-anchor invariant**: the audit-completion test + ALSO walks the post-stamping AST and asserts no `Generated { by: + shortcode, from: [] }` remains. Every `by.kind == "shortcode"` node + must carry at least one `Invocation` anchor pointing at the source + token's bytes. Per Plan 4 §"Required-anchor invariant for shortcode", + this is the producer-side enforcement of the rule; Plan 7 adds a + `debug_assert!` on the consumer side as belt-and-suspenders. The + stamper is the only construction site for `by: shortcode` in v1, so + the test exercises the full source of bad shapes. 
- **Per-transform fix tests**: for each fixed transform, a test that inspects the produced source_info shape: - - SectionizeTransform: synthetic Div has `Synthetic { by: By { kind: - "sectionize" } }`. Header inside has its original source_info. - - ShortcodeResolveTransform: each resolved Str has `Derived { from: - Original{shortcode_token_range}, by: By { kind: "shortcode", data: - {"name": "..."} } }`. The `from` Original points at the shortcode - token's bytes in source. + - SectionizeTransform: synthetic Div has `Generated { by: { kind: + "sectionize" }, from: [] }`. Header inside has its original + source_info. + - ShortcodeResolveTransform (uniform): each resolved Str has + `Generated { by: { kind: "shortcode", data: { name: "..." } }, + from: [Anchor { role: Invocation, source_info: ... }] }`. The + anchor's source_info chain-walks to the shortcode token's bytes + via `resolve_byte_range`. + - Lua-shortcode test: a `{{< kbd Ctrl+C >}}` invocation produces a + Span with `Generated { by: { kind: "shortcode", data: { name: + "kbd", lua_path: "...", lua_line: N } }, from: [Invocation] }`. + **NOT** `by.kind == "filter"`; the post-walk promoted it. + - Other built-in Lua shortcodes (lipsum, placeholder, version, video): + same shape, with the appropriate `name`. - Etc. for each transform. -- **Multi-inline shortcode source_info test**: a metadata key with - markdown (`title: "**Bold** Title"`). After ShortcodeResolveTransform, - the resulting `[Strong[Str], Space, Str]` ALL have Derived source_info - with the same `from` (the shortcode token's range). This is what Plan - 7's dedupe rule will detect. -- **Idempotence still holds**: re-run Plan 3's idempotence test after the - audit — the changes shouldn't introduce non-determinism. +- **Multi-inline shortcode anchor test**: a metadata key with markdown + (`title: "**Bold** Title"`). 
After ShortcodeResolveTransform, the + resulting `[Strong[Str], Space, Str]` ALL have `Generated` with + `Invocation` anchors whose `source_info` is the same shortcode + token's range. This is what Plan 7's dedupe rule detects. +- **Attribution interaction test**: render a doc with `{{< meta foo >}}` + through two commits by different authors (author A wrote the line at + T1; author B changed `foo` → `bar` at T2). With Plan 6 stamped and a + `GitBlameProvider` installed, the resulting `astContext.attribution` + for the resolved Str references author B's identity (the latest + author of the token bytes). This is the multi-author latest-wins + policy. +- **Escaped-shortcode regression test**: `{{{< meta foo >}}}` resolves + to literal text; its source_info stays Original (not Generated). +- **Error-inline regression test**: an unknown shortcode `{{< bogus >}}` + resolves via `make_error_inline` to `Strong[Str("?bogus")]`. Both + layers carry `Original` source_info pointing at the bogus + shortcode's token bytes (NOT `Default`, NOT `Generated`). Plan 7's + `is_atomic_kind()` does not fire; round-trip through the + incremental writer Verbatim-copies the original token bytes. +- **Error / escaped round-trip test**: full incremental-writer + round-trip on a fixture containing both `{{{< meta foo >}}}` and + `{{< bogus >}}`. After Plan 6's stamping + Plan 7's writer, the + output qmd should byte-equal the input for those regions + (verbatim-copy via the Original anchor in both cases). +- **Shortcode-inside-include composition test**: `parent.qmd` + contains `{{< include foo.qmd >}}`; `foo.qmd` contains + `{{< meta title >}}`. After Plan 6 stamping (and Plan 8's wrapper), + the resolved Str inside the IncludeExpansion wrapper has + `Generated { by: { kind: "shortcode", data: { name: "meta" } }, + from: [Invocation -> Original{file_id: <foo.qmd>, ...}] }`. + Assert the Invocation anchor's source_info `file_id != 0` (i.e. + points into the included file, not the parent).
Plan 8's wrapper + carries the parent-file anchor at its level; this test exercises + Plan 6's stamping invariant under the cross-file context. Plan 8's + own test plan covers wrapper round-trip independently. +- **Idempotence still holds**: re-run Plan 3's idempotence test after + the audit — the changes shouldn't introduce non-determinism. +- **`source_info` determinism (Plan 6-specific gap)**: Plan 3's hashes + exclude `source_info` by design (`compute_blocks_hash_fresh` and + `compute_meta_hash_fresh` both skip it). So Plan 3 does **not** + catch a transform whose synthesized `Generated { by, from }` + output is non-deterministic *in the source_info layer* — e.g., an + `Anchor::invocation` that hashes a different `SourceInfo` on + repeated runs because the shortcode-token's range was recomputed + rather than cloned. Plan 6 must add its own per-fixture + source_info-determinism check: render twice, walk the AST in + lockstep, assert every `Generated.by`, every `Generated.from[]`, + and every Original `SourceInfo` is `==`-equal across runs. Place + this alongside Plan 3's idempotence test (same fixtures, parallel + assertion) so the test crate covers both contracts. ## Dependencies ### Hard dependencies - **Plan 4** — Plan 6's transforms use `By::shortcode(...)`, - `By::sectionize()`, `By::title_block()`, etc., plus the `Derived` and - `Synthetic` variants. Cannot compile without Plan 4. + `By::sectionize()`, `By::title_block()`, etc., plus the `Generated` + variant and `Anchor`/`AnchorRole` types. Cannot compile without + Plan 4. ### Soft dependencies @@ -181,8 +878,8 @@ comprehensive grep. through the JSON wire format (the path q2-preview takes when crossing the WASM boundary to React and back), Plan 5's wire-format extension is required. Without Plan 5, a Plan 6 AST that gets serialized to JSON - and deserialized loses the `Derived` and `Synthetic` shapes (decoded - via legacy code-3 fallback as Substring approximations). 
+ and deserialized loses the `Generated` shape (decoded via legacy + code-3 fallback as Substring approximations). Pragmatic implication: Plan 6 lands cleanly in-Rust without Plan 5, but isn't observable in q2-preview without Plan 5. The plans can be @@ -192,77 +889,89 @@ comprehensive grep. ### Blocks - **Plan 7** — writer needs Plan 6's audit-fixed AST shape to walk - preimages correctly and to detect Derived for atomic enforcement. -- Independent of Plan 8 (Plan 8 introduces its own wrapper for includes; - shortcodes don't use that pattern). + preimages correctly and to detect atomic-kind for `is_atomic` + enforcement. +- Independent of Plan 8 (Plan 8 introduces its own wrapper for + includes; shortcodes don't use that pattern). ## Risk areas -- **Audit completeness**: missing a site means a future Plan 7 round-trip - silently corrupts that region. Mitigation: the audit-completion test - scans for `SourceInfo::default()` in produced ASTs. -- **Breaking existing HTML pipeline tests**: the audit changes source_info - on many nodes. The hash-based reconciler doesn't care, but tests that - inspect specific source_info shapes might fail. Run the full workspace - test suite after each transform fix. +- **Audit completeness**: missing a site means a future Plan 7 + round-trip silently corrupts that region. Mitigation: the + audit-completion test scans for `SourceInfo::default()` AND for + synthesized-but-not-Generated shapes in produced ASTs. +- **Breaking existing HTML pipeline tests**: the audit changes + source_info on many nodes. The hash-based reconciler doesn't care, + but tests that inspect specific source_info shapes might fail. Run + the full workspace test suite after each transform fix. - **Shortcode-resolved nodes change source_info shape**: existing tests that assert "the resolved title Str has SourceInfo::default()" or - similar will fail. Update them to expect Derived. The HTML output + similar will fail. Update them to expect Generated. 
The HTML output doesn't change shape (still flat inlines/blocks); only source_info on those nodes changes. -- **No new CustomNode type added** (deliberate change from earlier draft). - The HTML pipeline isn't affected — shortcode-resolved content remains - flat inlines/blocks; the HTML writer renders them normally. +- **No new CustomNode type added** (deliberate, retained from the + earlier draft). The HTML pipeline isn't affected — shortcode-resolved + content remains flat inlines/blocks; the HTML writer renders them + normally. +- **Post-walk recursion bugs**: missing a nested AST shape in the walk + means some inner nodes don't get the anchor. Cover Strong/Emph/Link + for inlines and Div/BlockQuote/Span-in-Plain for blocks. ## Estimated scope | Component | Lines (rough) | |---|---| +| Phase 0: `Inline::source_info_mut` + `Block::source_info_mut` accessors + unit tests | ~70 | | Audit pass (grep + categorize) | ~30 (mostly notes) | -| Shortcode resolver fix (~12 sites, all emit Derived now) | ~80 | +| `stamp_shortcode_anchors` helper + mutable recursion walks (modeled on `shortcode_resolve.rs::recurse_inline` / `resolve_block`) | ~220 | +| Shortcode resolver dispatch-site fixes — 12 production sites: `config_value_to_inlines` ×7, `flatten_blocks_to_inlines` ×1, `lua_result_to_shortcode_result::Text` ×1, `make_error_inline` ×2, `shortcode_to_literal` ×1. Most covered by the stamper; `make_error_inline` and `shortcode_to_literal` need call-site source_info threading. 
| ~70 | | TitleBlock fix | ~20 | | Sectionize fix | ~20 | | Footnotes fix | ~30 | | Appendix fix | ~30 | -| Theorem title-from-attr fix | ~20 | +| Theorem + proof title-from-attr fix (thread `attr_source` through `extract_name_attr` in both files) | ~30 | | TreeSitter postprocess fix | ~10 | -| Tests | ~200 | -| **Total** | **~440** | +| Tests | ~280 | +| **Total** | **~810** | -Smaller than the earlier draft (which included a ShortcodeResolution -wrapper, qmd writer arm, and HTML pipeline implications). One focused -session likely. +The earlier "~540" estimate omitted the Phase-0 mut accessors (~70 LOC), +under-counted the recursion walkers (mutable walks over the full +inline/block container set are ~220 LOC, not ~80), and missed the +`make_error_inline` / `shortcode_to_literal` / `proof.rs` fix sites. ## Notes -This is a "scattered fixes" plan — touches many transform files with small -per-file changes. Most of the diff is mechanical: `SourceInfo::default()` -→ `ctx.source_info.clone()` (Original) for synthesizers that DO have a -source preimage but currently drop it; `Synthetic { by: By::() }` -for genuine synthesizers; `Derived { from, by }` for shortcode resolutions. +This is a "scattered fixes" plan — touches many transform files with +small per-file changes. Most of the diff is mechanical: `SourceInfo::default()` +→ either `ctx.source_info.clone()` (Original) for synthesizers that DO +have a source preimage but currently drop it, or +`Generated { by: By::<kind>(), from: smallvec![] }` for genuine +synthesizers, or `stamp_shortcode_anchors(...)` for shortcode +dispatches. The conceptual surface is small; the file count is not. The earlier-draft "wrap shortcode resolutions in `CustomNode("ShortcodeResolution")`" -approach was walked back. Per the user's reasoning: wrappers were heavy for -what's fundamentally a provenance problem.
Derived gives us atomic detection -at the writer level (Plan 7) without the structural cost of a new CustomNode -type, the qmd writer arm, the HTML-pipeline-resolve transform, or the -React component for the wrapper. Includes (Plan 8) still use a wrapper -because their cross-file FileId issue genuinely requires anchoring at the +approach was walked back. Per the user's reasoning: wrappers were heavy +for what's fundamentally a provenance problem. The typed `Invocation` +anchor in `Generated` gives Plan 7 atomic detection at the writer +level (via `by.is_atomic_kind()` returning true for `shortcode`) +without the structural cost of a new CustomNode type, the qmd writer +arm, the HTML-pipeline-resolve transform, or the React component for +the wrapper. Includes (Plan 8) still use a wrapper because their +cross-file FileId issue genuinely requires anchoring at the parent-file level. The shortcode-resolution provenance change propagates to: q2-preview -rendering (Plan 2B's atomic-aware `setLocalAst` gating in the -framework's `Inline` dispatcher — `framework/dispatchers.tsx`, -post-2pre — detects Derived inlines via Plan 2A's -`isAtomicSourceInfo` accessor. The original "MaybeReadOnlyInline -wrapper" framing was resolved during the 2026-05-06 / 2026-05-07 -review sessions into the framework's unified `Block` / `Inline` -dispatchers gaining the atomic gate, rather than a separate -wrapper component or per-format duplication. Both q2-debug and -q2-preview pick up the gate "for free"), -writer round-trip (Plan 7's atomic logic detects Derived + UseAfter -as AtomicViolation; Plan 7's dedupe rule handles multi-inline -shortcode resolutions), and possibly some existing tests that -asserted on the flat Str's source_info shape. 
+rendering (Plan 2A's framework atomic gate in `dispatch.tsx`'s `Node` +detects `shortcode` kind via `ATOMIC_GENERATED_KINDS` and the +JS-side `isAtomicSourceInfo` accessor), writer round-trip (Plan 7's +soft-drop logic detects `by.is_atomic_kind()` + UseAfter and emits +Q-3-42; Plan 7's dedupe rule handles multi-inline shortcode +resolutions via the shared anchor source_info), and possibly some +existing tests that asserted on the flat Str's source_info shape. + +The post-walk's enrichment pattern (promote kind, preserve prior +`by.data`, append anchor) is the canonical shape for any future +transform that wraps a Lua dispatch. Document the pattern in Plan 6's +helper so future contributors have a reference. diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md b/claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md index 3a7a7b7f6..fc77f0e75 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md @@ -1,419 +1,684 @@ -# Plan 7 — Incremental writer preimage walk + Transparent + atomic-violation + multi-inline dedupe +# Plan 7 — Incremental writer: preimage walk, Transparent / Omit, atomic soft-drop, multi-inline dedupe -**Date:** 2026-05-04 -**Branch:** feature/q2-preview -**Status:** Implementation plan (open questions named) +**Date:** 2026-05-04 (revised 2026-05-24) +**Branch:** feature/provenance +**Status:** Implementation plan (API surface settled) **Milestone:** M3 (edit-back works for non-include, non-pure-synthesis edits) +## Epic context + +Part of the **provenance epic** (Plans 3–10). Plan 7 is the keystone: +once the writer understands the typed provenance from Plans 4–6, it +can correctly round-trip user edits, soft-drop bad edits with clear +diagnostics, and surface warnings on both hub-client and the `q2 +preview` SPA. 
The file name keeps its `q2-preview-plan-7-` form for +git-history continuity; new plans in the epic adopt the +`provenance-plan-N-` convention (see Plan 9 / Plan 10). + ## Goal -Teach the incremental writer (`pampa::writers::incremental`) to handle the -new provenance shapes introduced by Plans 4-6 so that q2-preview round-trip -edits work correctly. Five new behaviors: - -- **`preimage_in(target_file_id)` accessor**: a recursive walk through - Substring/Concat/Derived chains that returns the byte range in the target - file IF the chain resolves there, else None. -- **`Transparent` coarsen variant**: for `KeepBefore` nodes whose source_info - is `Synthetic` but whose children have recoverable preimages (Sectionize's - case), recurse into the children rather than emit a useless empty - Verbatim. The wrapper itself contributes nothing to the output. -- **Atomic detection via `Derived`**: nodes with `Derived` source_info are - atomic. KeepBefore + Derived → Verbatim copies the preimage (the shortcode - token, etc.). UseAfter or RecurseIntoContainer touching a Derived node → - AtomicViolation. -- **Atomic detection via `is_atomic_custom_node`**: `IncludeExpansion` - CustomNode is atomic via type_name lookup. Same outcome as Derived case - (KeepBefore Verbatim; anything else → AtomicViolation). Plus - `CrossrefResolvedRef` is atomic (already a CustomNode in the AST). -- **Multi-inline dedupe rule**: when assembling a run of consecutive inlines - (in InlineSplice or inline assembly contexts) that all share the same - Derived source_info `from`, emit Verbatim *once* for the group rather - than N times. This handles multi-inline shortcode resolutions. - -This plan also adds a `pipeline_kind: Option` parameter to -`incremental_write_qmd` (per Decision D — param with default) that runs the -q2-preview pipeline on the baseline AST before reconciling, making the -reconcile symmetric. 
Existing callers pass `None` and get today's -parse-only baseline behavior; q2-preview's call site passes -`Some("preview")`. The string is the wasm-bindgen-friendly form of -the `Option<&'static str>` selector Plan 1 added to `Format` -(`crates/quarto-core/src/format.rs::Format::pipeline_kind`); inside -`incremental_write_qmd` it maps to the same kind string the render -side already uses ("preview"). +Teach the incremental writer (`pampa::writers::incremental`) to +handle the typed provenance shapes Plans 4–6 introduce so that +q2-preview round-trip edits work correctly. Five new behaviors: + +- **`preimage_in(target_file_id)` accessor** on `SourceInfo`: a + recursive walk through Substring / Concat / Generated chains that + returns the byte range in the target file if the chain resolves + there, else `None`. For `Generated`, walks through the + `Invocation` anchor only — never `ValueSource`, never `Dispatch`, + never `Other`. +- **`Transparent` coarsen variant**: for `KeepBefore` nodes whose + `source_info` is `Generated` with empty anchors AND non-atomic + kind (Sectionize wrappers, footnotes container, appendix + container), recurse into the children rather than emit a useless + empty Verbatim. The wrapper itself contributes nothing to output. +- **`Omit` coarsen variant**: for `KeepBefore` nodes that have no + preimage in target and no source-bearing children (atomic-kind + Generated with no Invocation anchor — filter constructions, + title-block synthesis, tree-sitter postprocess space). The node + is dropped from output; the next pipeline run regenerates it from + baseline content. +- **Unified editability gate, applied via soft-drop**: a region is + editable iff it has byte-traceable preimage in the target file + AND is not an atomic-kind `Generated` AND is not an atomic + CustomNode. Edits to non-editable regions soft-drop with + diagnostic warnings rather than aborting the entire write. 
+- **Multi-inline dedupe rule**: when assembling a run of consecutive + inlines (in InlineSplice or inline-assembly contexts) whose + `Invocation` anchors are structurally equal (`PartialEq`), emit + Verbatim *once* for the group rather than N times. Handles + multi-inline shortcode resolutions. + +Plan 7 also changes the WASM-facing `incremental_write_qmd` +signature: the caller now supplies the baseline AST explicitly +instead of having the writer parse the original qmd internally. +This makes the writer pipeline-agnostic — it diffs the two ASTs +the caller hands it and writes accordingly, regardless of what +pipeline produced them. When this plan lands, ReactPreview's read-only guard from Plan 1 -lifts (one-block early-return in `handleSetAst`, deletable per Plan -1's design), and edits in q2-preview round-trip correctly. The -**render-side dispatches** Plan 1's §"Multi-plan contract: cleanup -owed to Plan 7" originally targeted (`AstTransformsStage::run()` -and `ReactPreview.tsx::doRender`) **already use the structured -selector** as of Plan 1's implementation: `AstTransformsStage` -reads `ctx.format.pipeline_kind`, and `ReactPreview.doRender` -dispatches via the `pipelineKindForFormat(format)` helper at -`hub-client/src/utils/pipelineKind.ts`. Plan 7 therefore adds the -**write-side parameter** rather than refactoring those render-side -sites; see §Scope for the verification step. +lifts, and edits in q2-preview round-trip correctly. The q2-preview +SPA gains edit-back via the same writer path — replacing its +current `noopSetAst` with a real handler that routes through +`incrementalWriteQmd` to the sync-client's `updateFileContent` +and through automerge to the ephemeral hub's disk-write. 
+ +## API decomposition: parse / transform / reconcile / write + +The writer is one node in a four-primitive grammar: + +| Primitive | Rust signature (existing) | What it does | +|---|---|---| +| **parse** | `qmd_to_pandoc(bytes) → (Pandoc, ASTContext)` | Lex/parse qmd source to a parse-only AST. No transforms. | +| **transform** | `build_<kind>_transform_pipeline()` + `run_pipeline()` | Apply a pipeline's transform stages to a parse-only AST. Produces a same-shape AST at a different tier. | +| **reconcile** | `compute_reconciliation(&a, &b) → ReconciliationPlan` | Diff two ASTs structurally, producing a plan of KeepBefore / UseAfter / RecurseIntoContainer alignments. | +| **write** | `incremental_write(qmd, original_ast, new_ast, plan)` | Materialize the plan as qmd bytes — Verbatim-copy source bytes for KeepBefore, qmd-writer-serialize for UseAfter / Rewrite. | + +The Rust internals already implement this decomposition. The WASM +bridge layer exposes the compositions that callers need. + +**Pipeline tier discipline.** "Same pipeline tier" means: the +baseline AST and the new AST were both produced by the same +sequence of transform stages, applied to ASTs that were both +parsed from the same kind of source. The reconciler is tier-agnostic +— it just diffs structures — but the caller must supply ASTs at the +same tier or every Generated wrapper looks like a new insertion. +Two tiers matter today: + +- **parse-only**: `parse_qmd_to_ast(content)` output. Used by + q2-debug, q2-slides, and the WASM demos (kanban, hub-react-todo). +- **q2-preview**: `renderPageInProjectWithAttribution(path, …)` + output (post-q2-preview-pipeline AST). Used by ReactPreview's + q2-preview path and the q2-preview SPA. ## Scope ### In scope -- `preimage_in` accessor on `SourceInfo` (in `quarto-source-map`). Walks - Substring's `parent`, Concat's `pieces`, Derived's `from`. Returns - `Some(byte_range)` if the chain resolves to an `Original` in the target - file, else `None`. -- `coarsen` rules. 
Two new entry variants (`Transparent`, `Omit`) plus - **soft-drop substitution logic** for atomic content: - - **Verbatim**: KeepBefore + `preimage_in` resolves into target file. - Today's behavior, generalized via `preimage_in` to work on Derived - chains too. - - **Transparent (recurse)**: KeepBefore + Synthetic source_info + block - has children with recoverable preimages. Recurse on children, produce - a child-entry list. Wrapper itself emits nothing. Handles Sectionize. - - **Omit**: KeepBefore + atomic-Synthetic node, OR KeepBefore + Synthetic - with no recoverable children. The node is dropped from output; the - next pipeline run regenerates it from baseline content. Used for - filter-constructed leaves and the rare structurally-stable Synthetic - leaf. - - **Rewrite**: UseAfter or non-atomic Recurse-with-changes. Today's - behavior. Includes the let-user-win case for block-level UseAfter - on atomic nodes (see §"The coarsen logic" — atomicity does NOT - block this path; the qmd writer's CustomNode arms know how to write - fresh atomic CustomNodes from `plain_data`). - - **InlineSplice**: today's behavior, extended with the multi-inline - Derived dedupe rule and the **inline-level soft-drop substitution** - described below. -- **Soft-drop substitutions** for the bad-edit cases. Coarsen detects - these and **substitutes a safe alignment** rather than aborting the - whole write: - - **Inline-level UseAfter on a Derived inline** (user retyped resolved - shortcode text): substitute KeepBefore for that one inline within - the surrounding `InlineReconciliationPlan`. The rest of the inline - plan continues as-is. Emit a `Q-3-42` warning into the warnings - sink describing what was reverted. - - **Block-level RecurseIntoContainer on an atomic CustomNode** (user - edited inside an include): substitute KeepBefore for the wrapper. - The wrapper's source_info points at the parent-file include token - (Plan 8); Verbatim copy preserves it. 
Inner edits never reach the - qmd writer's CustomNode arm. Emit a `Q-3-43` warning. - - **Block-level UseAfter on an atomic node** (user replaced or - deleted an atomic block via React): **let-user-win** — keep as - Rewrite. The new block goes through the qmd writer's normal arms - (Plan 8's IncludeExpansion arm reads `plain_data["source_path"]` - and emits `{{< include … >}}` from a fresh user-edit-tagged - CustomNode just as cleanly as from a pipeline-emitted one). No - warning — the user explicitly chose this. -- **No `AtomicViolation` variant**. The previous design had coarsen - produce an `AtomicViolation` entry that caused `incremental_write` to - return `Err`. Under soft-drop, every bad-edit case has a safe - substitution, so `AtomicViolation` is unnecessary. The writer's - return type stays `Result<(String, Vec<DiagnosticMessage>), Vec<DiagnosticMessage>>`-shaped - (see "Warning channel mechanism" below); `Ok` carries the saved qmd - plus any soft-drop warnings. -- **Warning channel mechanism**: `coarsen` accepts a - `&mut Vec<DiagnosticMessage>` warning sink as a parameter. Soft-drop - substitutions push warnings into the sink. The top-level - `incremental_write` returns `Ok((String, Vec<DiagnosticMessage>))` - when no fatal error occurs (warnings can be present), and `Err` only - for true write failures (UTF-8 errors, qmd writer panics on - malformed input — same as today). The hub-client's `RenderResponse` - already carries a `warnings: [...]` field (Plan 1's pipeline - diagnostics use it); soft-drop warnings flow through the same path. -- **Diagnostic codes** (per the Q-3 conventions; see - `crates/quarto-error-reporting/src/error_catalog.json`): - - `Q-3-42` — "Shortcode edit dropped". Emitted when an inline-level - edit to Derived content was substituted by KeepBefore. Body: - affected inline's Derived `by.kind` and resolved-to text, plus the - shortcode token's source range so editor UIs can highlight it. - - `Q-3-43` — "Include block edit dropped". 
Emitted when a - block-level RecurseIntoContainer on an atomic CustomNode was - substituted by KeepBefore. Body: the include's `source_path` from - `plain_data`, plus the wrapper's source range. Actionable message: - "to edit this content, open `` directly." - Both are `DiagnosticKind::Warning`. No new structural fields on - `DiagnosticMessage` — discriminants are in the code+notes. -- `is_atomic_custom_node` registry, defined in **`quarto-core`** as - `pub const ATOMIC_CUSTOM_NODES: &[&str]` plus - `pub fn is_atomic_custom_node(type_name: &str) -> bool`. Plan 7 - ships the **Rust side** (writer in `pampa` consumes it; Plan 8 - extends the const to add `IncludeExpansion`). The **TypeScript - hand-mirror** at `hub-client/src/utils/atomicCustomNodes.ts` ships - with **Plan 2A** because Plan 2B is the first consumer (atomic-aware - `setLocalAst` gating in the dispatcher); ownership was reassigned - during the 2026-05-06 review session. The TS file's header comment - documents the sync convention — both sides are kept aligned via - doc comments + code review (no codegen). This matches the codebase's - existing pattern for cross-language type pairs (e.g., - `hub-client/src/types/intelligence.ts` mirrors `quarto-lsp-core` - types this way; `hub-client/src/types/diagnostic.ts` mirrors - `DiagnosticMessage`). Initial set: - `["IncludeExpansion", "CrossrefResolvedRef"]` (Plan 8 adds - `IncludeExpansion` to both sides). Note: `ShortcodeResolution` is - NOT in this set — shortcode atomicity is handled via the `Derived` - source_info path, not via a wrapper. - - **Migration path for extension-contributed atomic types**: the - hand-mirror is the right shape for built-ins. Extension-contributed - atomic types (a future plan; see §Open questions - "is_atomic_custom_node lookup — extension forward-compat") will - replace the JS const with a `wasm_bindgen` runtime lookup populated - per-render from loaded extensions. 
The migration changes the JS - data source but not the React-side dispatch logic — components - continue to call `isAtomicCustomNode(typeName)`; the function's - implementation switches from a const lookup to a context lookup. -- `assemble`: - - Walks Transparent entries by emitting each child's bytes with - separators computed from the children's original positions. - - Omit entries contribute nothing to the output (the original - Synthetic node is dropped; baseline regenerates next pipeline run). - - Inline-level dedupe: within an inline-splice or inline-assembly run, - detect consecutive inlines sharing the same Derived `from` and emit - one Verbatim (the from's preimage range) instead of N. - - No AtomicViolation handling — soft-drop substitutions happened in - coarsen; assemble sees only safe entries. -- `pipeline_kind` parameter added to `incremental_write_qmd`. When - `Some("preview")`: - - Re-parses `original_qmd` (today's behavior). - - **Runs the q2-preview transform pipeline on the baseline** (this is the - NEW step). Produces a baseline AST at the same pipeline tier as the - live AST. - - Reconciles new vs baseline. - - Writes via the updated coarsen/assemble logic. -- Lift the `handleSetAst` read-only guard in `ReactPreview.tsx` introduced - in Plan 1. Wire `setLocalAst` through with `pipeline_kind: "preview"`. -- **Verify: structured pipeline dispatch is already in place - (Plan 1 commits `a7143cc7` + `60658a4e` + `a5e00b20`).** Plan 1's - §"Multi-plan contract: cleanup owed to Plan 7" originally framed - this as scaffolding Plan 7 would refactor, but Plan 1 implemented - the structured form directly: - 1. `AstTransformsStage::run()` reads `ctx.format.pipeline_kind` - (the `Option<&'static str>` field on `Format`) and dispatches - to `build_q2_preview_transform_pipeline` when it equals - `Some("preview")`. - 2. 
`ReactPreview.tsx::doRender` dispatches via - `pipelineKindForFormat(format)` from - `hub-client/src/utils/pipelineKind.ts`, returning `'preview'` - for q2-preview and `undefined` for everything else. - Plan 7 therefore has no render-side cleanup work. During Plan 7 - implementation, **verify the write-side parameter threads through - the same selector**: Plan 7's new `pipeline_kind: Option<String>` - argument on `incremental_write_qmd` (§Scope item below) should be - populated at the JS call site by `pipelineKindForFormat(format)` - and threaded through `wasmRenderer.ts::incrementalWriteQmd` to - the WASM boundary. Internally, the string maps to the same - `pipeline_kind` value the render side already uses. +#### `preimage_in` accessor (in `quarto-source-map`) -### Out of scope +```rust +impl SourceInfo { + pub fn preimage_in(&self, target: FileId) -> Option<Range<usize>>; +} +``` -- Include round-trip via wrapper-CustomNode (Plan 8 — uses this plan's - atomic-detection + soft-drop logic but introduces the wrapper itself). -- Engine output as Derived (deferred future work). -- Editable CustomNode slots (e.g., editing a Callout's title and body - through React with edits round-tripping back to source). See - `claude-notes/research/2026-05-05-editable-custom-nodes.md`. -- Promoting the qmd writer to a fallible `Result` interface throughout. - Soft-drop semantics make this unnecessary for q2-preview; the - remaining panic paths are debug assertions for genuine programming - errors (e.g., `unreachable!()` in Plan 8's qmd-writer arm for atomic - CustomNodes in non-Verbatim paths), not user-facing failure modes. - -## Design decisions (settled in conversation) - -- **Sectionize's transparent recurse pattern**: `Synthetic` wrappers with - source-bearing children get the Transparent treatment. Children's bytes - are contiguous in source (Sectionize doesn't reorder), so emitting them - in order produces the right output. The wrapper emits nothing. 
-- **`FootnotesTransform` and `AppendixStructureTransform` containers also fit - the Transparent pattern.** Plan 2B's audit added both transforms to the - q2-preview pipeline. Their synthesized container Divs (`
`, - `
`) have no source preimage, but their children - carry source_info from the user-typed footnote content / user-defined - `:::{.appendix}` blocks. Same Transparent treatment as Sectionize. - Worth noting: `FootnotesTransform`'s synthesized `` markers are NOT - pure Synthetic — `create_footnote_ref` at `crates/quarto-core/src/transforms/footnotes.rs:440-460` - clones source_info from the original `Note` inline, so the markers carry - the same byte range as the user's `^[footnote text]` syntax. Round-trip-friendly - as `Original` without extra writer work; only the bare `
` - wrapper is the Transparent case. -- **Atomic detection has three paths** (all converging through the same - `is_atomic` helper): - 1. **Derived source_info** (shortcode resolutions). Any node whose - `source_info` is `Derived` is atomic. - 2. **Atomic Synthetic source_info** (filter constructions, title-block - synthesis, tree-sitter postprocess space, etc.). Detected via - `By::is_atomic_synthesizer()` (Plan 4 method on the `By` struct, - keyed off `by.kind`). - 3. **Atomic CustomNode types** (IncludeExpansion, CrossrefResolvedRef). - Looked up via `is_atomic_custom_node(&type_name) -> bool`. -- **Why three paths**: shortcode resolutions and filter constructions - don't get wrappers (wrappers are too heavy for non-cross-file cases); - they propagate atomicity via source_info shape. Includes use a - wrapper because of the cross-file FileId issue (the included blocks - live in another file; we need an anchor in the parent file). -- **Soft-drop, not abort**: bad-edit cases substitute a safe alignment - in coarsen and emit a warning rather than aborting the entire write. - The user's other (valid) edits go through; the bad edit is reverted - to KeepBefore (or KeepBefore-equivalent for inline-level cases). - Reasoning: the React side (Plan 2B) is the primary safeguard via - read-only enforcement; the writer is the contract guarantor; if both - are correct the warning channel rarely fires; if React has a hole the - writer protects without losing the user's session. "Edit cannot apply" - is honored (the bad edit doesn't reach source); "edit cannot apply - silently" is not (a Q-3-42/Q-3-43 warning surfaces in the diagnostic - panel). -- **Let-user-win for block-level UseAfter on atomic** (user replaced - or deleted an atomic block via React). Coarsen does NOT substitute - here; the new block goes through Rewrite via the qmd writer. 
The - qmd writer's CustomNode arms know how to write fresh atomic types - from `plain_data` (Plan 8's IncludeExpansion arm reads - `plain_data["source_path"]`). This composes naturally — a fresh - user-edit-tagged IncludeExpansion serializes the same way as a - pipeline-emitted one. No warning; the user's intent is clear. -- **Multi-inline shortcode dedupe**: a multi-inline shortcode resolution - produces several inlines all sharing the same Derived `from`. The - writer's inline-assembly path needs to detect this and emit Verbatim - *once* for the group. Without this, the assembly emits the shortcode - token N times. -- **Param-with-default for `incremental_write_qmd`** (Decision D): add a - `pipeline_kind: Option` parameter. `None` = current behavior - (parse-only baseline). `Some("preview")` = run q2-preview pipeline on - baseline. Existing callers (q2-debug demos, sync client, ReactPreview's - q2-debug path) continue to work unchanged. - -## The coarsen logic +Walks Substring's `parent`, Concat's `pieces`, Generated's +`Invocation` anchor (via `invocation_anchor()`). Returns +`Some(byte_range)` if the chain resolves to an `Original` in the +target file, else `None`. -``` -fn is_atomic(node) -> bool { - match node.source_info() { - SourceInfo::Derived { .. } => true, - SourceInfo::Synthetic { by } if by.is_atomic_synthesizer() => true, - _ => {} +**`Invocation` is the only role consulted.** `ValueSource` (Plan 9) +and `Dispatch` (Plan 10) are diagnostic-only. `AnchorRole::Other` +roles are also not walked. This is the binary asymmetry contract: +copying bytes from a `ValueSource` source range would emit raw YAML +metadata into the body — a hard correctness bug. The contract is +documented on `preimage_in` and on `AnchorRole::Other`'s doc-comment. + +Future anchor roles default to non-walked unless they're explicitly +added to `preimage_in`'s implementation. 
Extensions introducing +`AnchorRole::Other("…")` should treat this as a feature: their +attribution data isn't accidentally consulted by the writer. + +#### Unified editability predicate + +The same predicate gates two surfaces: Plan 2A's React read-only +check (preventing the user from typing into uneditable regions in +the first place) and the writer's soft-drop logic (the contract +guarantor if React has a hole). + +```rust +fn is_editable_inside(node: &Node, target_file_id: FileId) -> bool { + // Atomic CustomNodes (IncludeExpansion, CrossrefResolvedRef): + // single replaceable units, not editable inside. The user can + // replace them wholesale via a component menu; they can't type + // inside them. + if let Node::Block(Block::Custom(cn)) = node + && is_atomic_custom_node(&cn.type_name) + { + return false; } - match node { - Block::Custom(cn) if is_atomic_custom_node(&cn.type_name) => true, - _ => false, + // Atomic-kind Generated source_info (shortcode, filter, + // title-block, tree-sitter-postprocess): pipeline-emitted + // content whose user-source is the invocation token, not the + // resolved text. + if let SourceInfo::Generated { by, .. } = node.source_info() + && by.is_atomic_kind() + { + return false; } + // Catch-all: editable iff the region has byte-traceable preimage + // in the target file. This covers: + // - Original in target: editable. ✓ + // - Original / Substring rooted outside target: not editable. + // - Generated with Invocation anchor pointing into target: + // editable IFF non-atomic kind (handled above; this branch + // never sees atomic-kind Generated). + // - Generated with empty anchors (sectionize, footnotes, + // appendix containers): not editable — preimage_in returns + // None. + // - Generated with only ValueSource / Dispatch anchors + // (Plan 9/10 shapes): not editable — preimage_in walks + // Invocation only. 
+ node.source_info().preimage_in(target_file_id).is_some() } +``` + +The catch-all clause is the change Plan 7 introduces over earlier +drafts. Non-atomic synthesized containers (sectionize wrappers, +footnotes container, appendix container) are now classified as +non-editable on both surfaces. Edits to them via React go through +the writer's soft-drop path; the React side classifies the region +as read-only and shows the user no edit affordance. + +#### `coarsen` rules — two new entry variants plus soft-drop + +`CoarsenedEntry` gains two variants alongside today's `Verbatim`, +`Rewrite`, and `InlineSplice`: + +- **`Transparent`**: KeepBefore on a `Generated` wrapper with empty + anchors AND non-atomic kind AND source-bearing children. Recurses + on the children, producing a child-entry list. The wrapper itself + emits nothing. Handles Sectionize, footnotes-container, + appendix-container. +- **`Omit`**: KeepBefore on an atomic-kind `Generated` node with no + Invocation anchor (filter-constructed leaves, title-block h1, + tree-sitter postprocess space), OR on a non-atomic `Generated` + with no children. The node is dropped from output; the next + pipeline run regenerates it. + +Soft-drop substitutions cover the bad-edit cases. Each substitutes +a safe alignment in coarsen and emits a warning rather than +aborting the entire write: + +- **Inline-level UseAfter on a region where `is_editable_inside` + returns false** (typically: user retyped resolved shortcode + text): substitute KeepBefore for that one inline within the + surrounding `InlineReconciliationPlan`. The rest of the inline + plan continues as-is. Emit a `Q-3-42` warning. +- **Block-level RecurseIntoContainer on a region where + `is_editable_inside` returns false** (user edited inside an + include, OR inside a synthesized-from-metadata container): + substitute KeepBefore for the wrapper. 
For an atomic CustomNode + (include), the wrapper's `source_info` is Original pointing at + the include token; Verbatim copy preserves it. For a no-preimage + `Generated` container, the substitution lands in `Omit` — the + container regenerates next pipeline run. Either way, inner edits + never reach the qmd writer's arm. Emit a `Q-3-43` warning. +- **Block-level UseAfter on a region where `is_editable_inside` + returns false but the node is an atomic CustomNode** (user + replaced or deleted an atomic block via React's component menu): + **let-user-win** — keep as Rewrite. The qmd writer's CustomNode + arm reads `plain_data` and emits the include syntax from a fresh + user-edit-tagged CustomNode. No warning — the menu is the + affordance the user took; the intent is unambiguous. +- **Block-level UseAfter on a region where `is_editable_inside` + returns false and the node has no preimage** (user replaced a + synthesized-from-metadata container via React): soft-drop — + there's no source byte range to anchor a Rewrite at. Substitute + Omit; the original container regenerates next pipeline run. + Emit a `Q-3-43` warning. + +Earlier drafts had an `AtomicViolation` variant that caused +`incremental_write` to return `Err`. Soft-drop replaces it: every +bad-edit case has a safe substitution, so `AtomicViolation` is +unnecessary. The writer's return type carries warnings alongside +the saved qmd, not as fatal errors. + +#### Coarsen pseudo-code + +``` +fn coarsen(...) -> Vec<CoarsenedEntry>: For each block alignment from the reconciler: if alignment is KeepBefore(orig_idx): - let original_block = original_ast.blocks[orig_idx]; - if let Some(range) = original_block.source_info().preimage_in(target_file) { - // Includes the atomic case (Derived + KeepBefore): Verbatim copy - // of the preimage. preimage_in walks Derived chains to the from. 
+ let block = original_ast.blocks[orig_idx]; + if let Some(range) = block.source_info().preimage_in(target_file) { + // Original / Substring / Concat-contiguous / Generated-via- + // Invocation-anchor: all resolve here uniformly. Atomic-kind + // shortcode case lands here too — its Invocation anchor + // resolves to the token bytes. CoarsenedEntry::Verbatim { byte_range: range, orig_idx } } - else if matches!(original_block.source_info(), SourceInfo::Synthetic { by }) - && by.is_atomic_synthesizer() + else if matches!(block.source_info(), SourceInfo::Generated { by, .. }) + && by.is_atomic_kind() { - // Atomic Synthetic with no preimage (filter construction etc.). + // Atomic-kind Generated with no Invocation anchor (filter + // construction, title-block, tree-sitter-postprocess). // Drop from output; baseline regenerates next pipeline run. + // + // Belt-and-suspenders enforcement of Plan 4's required-anchor + // invariant for shortcode: a shortcode-Generated without an + // Invocation anchor would mean silent data loss. + debug_assert!( + !by.is_kind("shortcode"), + "Generated {{ by: shortcode, from: [] }} reached the writer — \ + Plan 6's stamper must always attach an Invocation anchor \ + for shortcode resolutions." + ); CoarsenedEntry::Omit } - else if matches!(original_block.source_info(), SourceInfo::Synthetic { .. }) - && original_block has children + else if matches!(block.source_info(), SourceInfo::Generated { .. }) + && block has source-bearing children { - // Non-atomic Synthetic wrapper (Sectionize etc.) — Transparent recurse. + // Non-atomic Generated wrapper (Sectionize, footnotes-container, + // appendix-container) with source-bearing children: Transparent + // recurse. CoarsenedEntry::Transparent { child_entries: <coarsen(children)> } } else { - // Synthetic with no children, or some other shape with no preimage. - CoarsenedEntry::Omit + // Catch-all: KeepBefore with no preimage and no Generated-cascade + // shape that maps to Omit or Transparent. 
Examples: cross-file + // Original (no Plan-8 wrapper yet), Substring chain rooted outside + // target, gappy Concat. Fall back to Rewrite — re-serialize the + // unchanged block through the qmd writer. Lossy at the byte level + // (whitespace, formatting may shuffle) but preserves content. The + // earlier draft routed these to Omit; that path was data-loss-shaped + // and should never reach the writer. + CoarsenedEntry::Rewrite { orig_idx } } if alignment is UseAfter(new_idx): - // Let user win — including for atomic types. The qmd writer's - // CustomNode arms know how to write fresh atomic CustomNodes from - // plain_data (Plan 8's IncludeExpansion arm reads source_path). - // No atomic check here; trust the alignment. - CoarsenedEntry::Rewrite { new_idx } + let new_block = new_ast.blocks[new_idx]; + let was_atomic_custom_node = matches!(&new_block, Block::Custom(cn) + if is_atomic_custom_node(&cn.type_name)); + let was_no_preimage_generated = matches!(new_block.source_info(), + SourceInfo::Generated { .. }) + && new_block.source_info().preimage_in(target_file).is_none(); + + if !was_atomic_custom_node && was_no_preimage_generated { + // User replaced a synthesized-from-metadata container wholesale. + // No source position to anchor at; can't Rewrite. Soft-drop. + warnings.push(diagnostic_q3_43_widened(new_block)); + CoarsenedEntry::Omit + } else { + // Let user win — including for atomic CustomNodes (the user + // replaced an include via the component menu; the qmd writer's + // CustomNode arm handles this). + CoarsenedEntry::Rewrite { new_idx } + } if alignment is RecurseIntoContainer { before_idx, after_idx }: - let original_block = original_ast.blocks[before_idx]; - if is_atomic(original_block) { - // SOFT-DROP: inner edits to an atomic block are reverted. - // Substitute KeepBefore — Verbatim copy of the wrapper's preimage. 
- warnings.push(diagnostic_q3_43(original_block)); - if let Some(range) = original_block.source_info().preimage_in(target_file) { + let block = original_ast.blocks[before_idx]; + if !is_editable_inside(block, target_file) { + // Inner edits to a non-editable container are reverted. + warnings.push(diagnostic_q3_43(block)); + if let Some(range) = block.source_info().preimage_in(target_file) { + // Atomic CustomNode with preimage (include token): Verbatim. CoarsenedEntry::Verbatim { byte_range: range, orig_idx: before_idx } } else { - // Atomic node lacks a preimage in target — extremely unusual. - // Substitute Omit; warning already pushed. + // No-preimage container (synthesized): Omit; regenerates next run. CoarsenedEntry::Omit } } else { // Existing recurse logic for inline plans, custom_node_plans, etc. - // The inline-plan-walking step has its own soft-drop substitution + // Inline-plan-walking has its own soft-drop substitution // (see "Inline-level soft-drop" below). ... } ``` -**Inline-level soft-drop** (applied during `assemble_inline_content` and -when constructing the inline plan for InlineSplice): +#### Inline-level soft-drop + +Applied during `assemble_inline_content` and when constructing the +inline plan for `InlineSplice`: ``` For each inline alignment in plan.inline_alignments: -if alignment is UseAfter(new_idx) and is_atomic(new_inlines[new_idx]): - // User retyped over a Derived inline (shortcode resolution). - // Substitute KeepBefore for the corresponding original inline. - warnings.push(diagnostic_q3_42(new_inlines[new_idx])); - treat as KeepBefore() +if alignment is UseAfter(new_idx) and !is_editable_inside(orig_inlines[before_idx], target): + // User retyped over a non-editable inline (typically: shortcode + // resolution). Substitute KeepBefore for the original inline at + // before_idx — the position the alignment already names. 
The + // earlier draft suggested matching the *new* inline's Invocation + // anchor against original-side anchors, but user-edit inlines + // don't carry Invocation anchors so there'd be nothing to match. + warnings.push(diagnostic_q3_42(orig_inlines[before_idx])); + treat as KeepBefore(before_idx) -if alignment is RecurseIntoContainer and the original inline is_atomic: - // Same shape as the block-level recurse-on-atomic case. +if alignment is RecurseIntoContainer and !is_editable_inside(orig_inlines[before_idx], target): warnings.push(diagnostic_q3_42(orig_inlines[before_idx])); treat as KeepBefore(before_idx) ``` -The "corresponding original index" for inline-level UseAfter substitution -is the index in `orig_inlines` whose Derived `from` matches the new inline's -`from`. In the multi-inline shortcode case, multiple original inlines share -the same `from`; any of them produces the right Verbatim result (they all -preimage to the same shortcode token bytes, which the dedupe rule emits -once anyway). +#### `assemble` updates + +- **Transparent entries** emit each child's bytes with separators + computed from the children's original positions. The wrapper + itself contributes nothing. +- **Omit entries** contribute nothing to output. The original + `Generated` node is dropped; baseline regenerates next pipeline + run. +- **Multi-inline dedupe**: within an inline-splice or inline-assembly + run, detect consecutive `KeepBefore` entries whose inlines' + `Invocation` anchors are structurally equal (compared via + `PartialEq` on the anchor's `source_info` — `SourceInfo` derives + `PartialEq`, so value equality across the full chain). Emit + Verbatim *once* for the group, using the anchor's preimage byte + range. Without dedupe, a multi-inline shortcode resolution like + `**Bold** Title` → `[Strong[Str], Space, Str]` would emit the + shortcode token N times. 
+- No `AtomicViolation` handling — soft-drop substitutions happened + in coarsen; `assemble` sees only safe entries. + +#### `incremental_write_qmd` signature change + +Today: +```rust +pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String; +``` + +After Plan 7: +```rust +pub fn incremental_write_qmd( + original_qmd: &str, + baseline_ast_json: &str, + new_ast_json: &str, +) -> String; // JSON: { success, qmd, warnings, error?, diagnostics? } +``` -The `assemble` step iterates coarsened entries: +The third positional argument (`baseline_ast_json`) is the +caller-supplied baseline AST at the same pipeline tier as +`new_ast_json`. The writer no longer parses `original_qmd` to +synthesize a baseline; it uses the caller-supplied one. This makes +the writer pipeline-agnostic: it diffs the two ASTs it's given and +writes accordingly. + +The TS wrapper at `ts-packages/preview-runtime/src/wasmRenderer.ts` +mirrors the signature change: `incrementalWriteQmd(originalQmd, +baselineAst, newAst): { qmd, warnings }` (today: `(originalQmd, +newAst): string`). + +No `pipeline_kind` parameter. The pipeline tier is implicit in +whichever baseline AST the caller passes. + +#### Warning channel mechanism + +`coarsen` accepts a `&mut Vec` warning sink as +a parameter. Soft-drop substitutions push warnings into the sink. +The WASM bridge serializes the warnings into the response JSON's +existing `warnings` field (already present on `AstResponse`; today +always `None` for `incremental_write_qmd`). The TS wrapper returns +`{ qmd, warnings }`. The hub-client's existing diagnostic collation +(`ReactPreview.tsx::allDiagnostics`, `Editor::diagnosticsToMarkers`) +displays soft-drop warnings the same way it displays pipeline +diagnostics — as Monaco squiggles for located warnings, and as +the `.diagnostics-banner` for unlocated. 
+ +#### Diagnostic codes + +Two codes, registered in +`crates/quarto-error-reporting/error_catalog.json`: + +- **`Q-3-42` — "Shortcode edit dropped".** Emitted when an + inline-level edit to shortcode-resolved (or other atomic-Generated) + content was substituted by KeepBefore. Body: the affected inline's + text and the source range of the invocation token (from the + `Invocation` anchor) so editor UIs can highlight it. + +- **`Q-3-43` — "Generated content edit dropped".** Three emission + paths, sharing the same code and structural shape: + - Block-level RecurseIntoContainer on an atomic CustomNode + (Plan 8's `IncludeExpansion`): body names the include's + `source_path` from `plain_data`. Message: "To edit this content, + open `<source_path>` directly." + - Block-level RecurseIntoContainer on a no-preimage Generated + container (synthesized appendix / footnotes container after + Plan 9 stamps ValueSource anchors): body names the metadata + key when available. Message: "This content is generated from + metadata; edit `_quarto.yml` / frontmatter to change it." + - Block-level UseAfter on a no-preimage Generated container: + same body as the previous case. + +Both are `DiagnosticKind::Warning`. Both carry source ranges +(the wrapper's preimage range when available, else the surrounding +block's range), so they squiggle naturally in Monaco. + +**Catalog mechanics** (verified). Each Q-* code in +`error_catalog.json` carries one static `message_template` plus +title / subsystem / docs_url. Per-call-site body text uses the +existing `DiagnosticMessageBuilder` API +(`crates/quarto-error-reporting/src/builder.rs`): -- Verbatim → copy byte range from `original_qmd`. -- Rewrite → use the qmd writer to serialize the new block. -- InlineSplice → existing splice logic, extended with (a) the - multi-inline Derived dedupe rule and (b) inline-level soft-drop - substitutions before assembly. -- Transparent → emit children's bytes recursively. -- Omit → skip (contribute nothing to output).
+```rust +DiagnosticMessageBuilder::warning("Generated content edit dropped") + .with_code("Q-3-43") + .problem(format!("To edit this content, open `{}` directly.", + source_path)) + .add_hint("...") + .build() +``` + +The catalog entry provides one generic `message_template`; the +three emission paths supply their distinct text via the builder. +**No template-able-body infrastructure needed** — the existing +builder API already covers it. Phase 3 ships one catalog entry per +code and three builder helper functions (`diagnostic_q3_42`, +`diagnostic_q3_43_include`, `diagnostic_q3_43_metadata`). -The function returns `Ok((String, Vec))` carrying the -saved qmd plus any soft-drop warnings that fired during coarsen. It only -returns `Err` for genuine write failures (UTF-8 errors, qmd writer failures -on malformed input — same as today's writer). +#### `is_atomic_custom_node` registry + +Defined in `quarto-core` as: +```rust +pub const ATOMIC_CUSTOM_NODES: &[&str] = &["CrossrefResolvedRef"]; +pub fn is_atomic_custom_node(type_name: &str) -> bool; +``` + +Plan 7 ships the Rust side. The TypeScript hand-mirror at +`ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` +already exists (Plan 2A shipped it with `CrossrefResolvedRef`). +Plan 8 adds `IncludeExpansion` to both sides. + +Extensions that need to contribute atomic types use a future +registration mechanism (see §Open questions); the const set +covers built-ins. + +#### Hub-client integration + +**Scope clarification: first-demo UX.** Plan 7 lifts the coarse +`pipelineKindForFormat(format) === 'preview'` read-only guard at +`ReactPreview.tsx:429-440` and replaces it with the writer's +soft-drop path. The writer's Q-3-42 / Q-3-43 diagnostics are the +user-facing safety net for the first demo — bad edits don't reach +source, and the user sees a warning. 
A fine-grained React-side +gate (greying out the affordance per region via the +`is_editable_inside` predicate consulted from JS) is **deferred** +to a future frontend pass. For the first demo, the experience is +"you can type, but it doesn't take, and you see a warning"; that +is the deliverable. Plan 2A's existing atomic-CustomNode gate +continues to prevent the most surprising cases (editing inside +includes) without further work. + +- Lift the `handleSetAst` read-only guard in `ReactPreview.tsx:429-440` + introduced in Plan 1. Wire `setLocalAst` through with the current + `ast` state as the baseline: + ```ts + const handleSetAst = useCallback((newAst) => { + const { qmd, warnings } = incrementalWriteQmd(content, ast, JSON.stringify(newAst)); + // process warnings (Q-3-42, Q-3-43) into allDiagnostics + onContentRewrite(qmd); + }, [content, ast, onContentRewrite]); + ``` + The `ast` state already holds the previously-rendered post-pipeline + AST (set by the regular render effect on every successful render). + No new caching mechanism is required; React's `useState` is the + cache. + +#### q2 preview SPA integration + +- Replace `noopSetAst` at `q2-preview-spa/src/PreviewApp.tsx:241` + with a real handler that calls `incrementalWriteQmd(content, + baselineAst, newAst)` and then `syncClient.updateFileContent(path, + newQmd)`. The baseline AST is the SPA's currently-displayed AST + (mirror of ReactPreview's `ast` state). +- Add **content-match echo-prevention** in the SPA's + `onFileContent` handler. Just before calling + `updateFileContent`, hash the qmd being emitted (e.g. SHA-256 or + a cheaper FNV-1a — exact algorithm settled during implementation) + and stash `(path, hash)` in a ref. In `onFileContent(path, + content)`, suppress the re-render if `(path, hash(content))` + matches the stashed value; otherwise process normally. 
Robust + against interleaved unrelated file updates (an unrelated file's + `onFileContent` doesn't match the stashed `path`, so it processes + normally). +- Ship `q2-preview-spa/src/components/DiagnosticStrip.tsx` — a + small SPA-local component (~50 lines TSX + ~20 lines CSS) that + displays Q-3-42 / Q-3-43 warnings returned by `incrementalWriteQmd`. + Mirrors hub-client's `.diagnostics-banner` visual style. Applies + suppress-after-3-by-source-range (see "Autosave-context spam + mitigation" below). +- Both single-file mode (bd-tnm3k) and project mode work via the + same code path — the ephemeral hub bridges automerge ↔ disk + uniformly. No SPA-side branching needed. + +#### Move `pipelineKindForFormat` to shared package + +`pipelineKindForFormat` lives in `hub-client/src/utils/pipelineKind.ts` +today. The SPA can't import from hub-client. The writer no longer +needs the helper (no `pipeline_kind` parameter), but the SPA's +**display path** does — to choose between `parse_qmd_to_ast` and +`render_page_in_project_with_attribution` when rendering. + +Move to `ts-packages/preview-runtime/src/pipelineKind.ts`. Both +hub-client and the SPA import from there. Mechanical move; ~5 LOC +of import-path updates. + +#### Diagnostic surfacing in hub-client + +Warnings flow through the existing `RenderResponse.warnings` channel +(the same path Plan 1's pipeline diagnostics and attribution-render +diagnostics use). `ReactPreview.tsx::allDiagnostics` already collates +them; `Editor::diagnosticsToMarkers` splits into Monaco markers and +the existing `.diagnostics-banner`. Q-3-42 and Q-3-43 both carry +source ranges, so they squiggle naturally. **No new hub-client UI +needed.** + +One known UX gap: the banner is gated on `!isFullscreenPreview`, so +users in fullscreen-preview mode rely on the Monaco squiggles +(visible when they exit fullscreen) rather than the banner. Accepted. 
+ +### Out of scope + +- **Include round-trip via wrapper-CustomNode** (Plan 8 — uses + this plan's atomic-detection + soft-drop logic but introduces the + wrapper itself). +- **Running the transform pipeline inside the writer.** The writer + is pipeline-agnostic by design; the caller supplies the baseline + AST at whatever tier they need. Future plans don't change this. +- **Engine output as Generated** (deferred future work). +- **Editable CustomNode slots** (e.g., editing a Callout's title and + body through React with edits round-tripping back to source). See + `claude-notes/research/2026-05-05-editable-custom-nodes.md`. +- **Promoting the qmd writer to a fallible `Result` interface + throughout.** Soft-drop semantics make this unnecessary for + q2-preview; the remaining panic paths are debug assertions for + genuine programming errors, not user-facing failure modes. See + §"The byte-provenance contract" below. +- **Lifting hub-client's diagnostic banner + SPA's `DiagnosticStrip` + into a shared `@quarto/preview-renderer` component.** Filed as a + follow-up against the hub-client decomposition epic (bd-hfjj); not + on Plan 7's critical path. +- **Plan 7 `preimage_in` role-asymmetry unit test and + appendix-license end-to-end round-trip test.** These exercise the + `Invocation`-only walking behavior of `preimage_in` and the + end-to-end correctness of soft-dropping a metadata-derived edit. + Both depend on Plan 9 having stamped ValueSource anchors on a + real consumer (the appendix synthesizer); both land in Plan 9's + Phase 5. Plan 7's test plan retains the structural-only unit + tests that don't depend on a real ValueSource consumer. + +## Design decisions (settled) + +- **Decompose into orthogonal primitives.** Parse / transform / + reconcile / write are independent operations. The writer doesn't + know about pipelines; the caller composes. 
The WASM bridge layer + exposes the compositions callers actually use; future entries + can land without changing the writer's signature. +- **Caller supplies baseline AST.** Removes the writer's dependency + on `RenderContext`, `SystemRuntime`, `Format`, and pipeline + construction machinery. The writer's surface is three strings + (qmd, baseline, new) in and one JSON envelope out. +- **`Invocation` is the only anchor role the writer consults.** + `ValueSource`, `Dispatch`, and `Other` are diagnostic-only. The + asymmetry is load-bearing: copying bytes from a `ValueSource` + source range would emit raw YAML into the body — a hard + correctness bug. Documented on `preimage_in` and on + `AnchorRole::Other`. +- **Soft-drop, not abort.** Bad-edit cases substitute a safe + alignment in coarsen and emit a warning rather than aborting the + entire write. The user's other valid edits go through; the bad + edit is reverted. React (Plan 2A's framework atomic gate) is the + primary safeguard via read-only enforcement; the writer is the + contract guarantor; if React has a hole, the writer protects + without losing the user's session. +- **Unified editability predicate.** Plan 2A's React-side read-only + check and the writer's coarsen-side soft-drop logic consult the + same `is_editable_inside(node, target_file_id) -> bool`. Three + reasons content is uneditable: atomic CustomNode (replaceable + wholesale via menu, not editable inside); atomic-kind Generated + (shortcode / filter / title-block — content represents the + resolved value of an invocation token); no preimage in target + (synthesized-from-metadata containers — sectionize / footnotes / + appendix, after Plan 9's stamping). +- **Let-user-win for block-level UseAfter on atomic CustomNode.** + Replacing an `IncludeExpansion` wholesale (e.g. swapping to a + different `source_path` via a component menu) goes through the + qmd writer's CustomNode arm. No warning — the user's intent is + unambiguous. 
+- **Soft-drop for block-level UseAfter on no-preimage Generated.** + Replacing a synthesized-from-metadata container has no source + position to anchor at; Rewrite would have nowhere to write. + Substitute Omit + Q-3-43 warning. +- **Multi-inline dedupe via `PartialEq` on anchor source_info.** + Two consecutive inlines share an `Invocation` anchor iff their + anchor's `source_info` is `==` (value equality). `SourceInfo` + derives `PartialEq`, so this is structural — Substring chains, + Concat pieces, etc. compare element-wise. +- **Inline-level UseAfter substitution targets `before_idx`.** The + alignment from the reconciler already carries the original-side + index being replaced; the writer uses that directly. Earlier + drafts suggested matching the *new* inline's `Invocation` anchor + against original-side anchors — but user-edit inlines don't + carry `Invocation` anchors, so there's nothing to match. +- **No `pipeline_kind` parameter on `incremental_write_qmd`.** The + pipeline tier is implicit in the baseline AST the caller passes. +- **No backward-compat shim for the signature change.** Three + first-class consumers (ReactPreview, kanban demo, hub-react-todo + demo) + one type interface (`quarto-sync-client`'s `astOptions`) + + one TS wrapper (`ts-packages/preview-runtime`'s + `wasmRenderer.ts`). All in-repo, lockstep-migrable. No npm-exposed + consumers. No wire-format persistence — the function emits qmd + text, not a serialized envelope. The codebase has no + `#[deprecated]` convention; the migration is one PR. +- **Plan 7 keeps its existing filename (`2026-05-04-q2-preview- + plan-7-incremental-writer.md`)** for git-history continuity. New + plans in the epic use the `provenance-plan-N-` convention + (Plan 9, Plan 10). 
## Multi-inline shortcode dedupe -When `{{< meta foo >}}` resolves to multiple inlines (e.g., metadata is -markdown like `**Bold** Title` → `[Strong[Str], Space, Str]`), each -resolved inline has the same `Derived { from: Original{shortcode_range}, -by: By::shortcode("meta") }` source_info. +When `{{< meta foo >}}` resolves to multiple inlines (e.g. metadata +is markdown like `**Bold** Title` → `[Strong[Str], Space, Str]`), +each resolved inline has the same `Generated { by: shortcode("meta"), +from: [Invocation -> Original{shortcode_range}] }` source_info. -Block-level: if both pipeline runs produce the same multi-inline output, -the surrounding Para is structurally identical → KeepBefore at block -level → Verbatim copy of the WHOLE Para's bytes (including the shortcode -token). One copy. ✓ +**Block-level:** if both reconciliation inputs produce the same +multi-inline output, the surrounding Para is structurally identical +→ KeepBefore at block level → Verbatim copy of the WHOLE Para's +bytes (including the shortcode token). One copy. ✓ -Inline-level recursion (when the user edits something else in the same -Para): the reconciler picks `RecurseIntoContainer` with an inline plan. -Each shortcode-derived inline is `KeepBefore` individually. Without -dedupe, each one's Verbatim emits the shortcode token → N copies in -output. +**Inline-level recursion** (when the user edits something else in +the same Para): the reconciler picks `RecurseIntoContainer` with an +inline plan. Each shortcode-derived inline is `KeepBefore` +individually. Without dedupe, each one's Verbatim would emit the +shortcode token → N copies in output. -Dedupe rule: when iterating inline alignments in -`assemble_inline_content`, group consecutive `KeepBefore` entries whose -inlines share the same `Derived` source (compare the `Arc` -identity of `from`, or by structural equality of the `from` value). Emit -Verbatim *once* for the group, using the `from`'s preimage byte range. 
+**Dedupe rule:** when iterating inline alignments in +`assemble_inline_content`, group consecutive `KeepBefore` entries +whose inlines' `Invocation` anchors are `PartialEq`-equal. Emit +Verbatim *once* for the group, using the anchor's preimage byte +range. This applies only at the inline level (where multi-inline shortcode resolutions occur). Block-level rarely sees this case. @@ -439,366 +704,805 @@ impl SourceInfo { .map(|p| p.source_info.preimage_in(target)) .collect::<Option<Vec<_>>>()?; if ranges.is_empty() { return None; } - // Confirm contiguous: ranges[i].end == ranges[i+1].start if ranges.windows(2).all(|w| w[0].end == w[1].start) { Some(ranges.first()?.start .. ranges.last()?.end) } else { None // gappy concat — can't Verbatim-copy } } - SourceInfo::Synthetic { .. } => None, - SourceInfo::Derived { from, .. } => { - // Walk through the `from` chain to find a preimage in the target. - from.preimage_in(target) + SourceInfo::Generated { .. } => { + // Walk through the Invocation anchor's chain. + // Never walks ValueSource (Plan 9), Dispatch (Plan 10), + // or Other — these are diagnostic-only. + self.invocation_anchor() + .and_then(|si| si.preimage_in(target)) } } } } ``` -The `Derived` case delegates to `from`, which usually resolves to an -`Original` covering the source token bytes. So a `Derived` shortcode -resolution successfully returns its preimage range; the writer Verbatim -copies the shortcode token from source. +The `Generated` case delegates to `invocation_anchor()`, which +returns the first `Invocation` anchor's source_info — typically an +`Original` covering the source token's bytes. So a +shortcode-resolution Generated successfully returns its preimage +range; the writer Verbatim-copies the shortcode token from source.
+ +## Migration plan + +### Rust signature + +```rust +// Before: +pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String; + +// After: +pub fn incremental_write_qmd( + original_qmd: &str, + baseline_ast_json: &str, + new_ast_json: &str, +) -> String; +// JSON: { success, qmd, warnings, error?, diagnostics? } +``` + +### TypeScript wrapper (`ts-packages/preview-runtime/src/wasmRenderer.ts:712`) + +```ts +// Before: +export function incrementalWriteQmd(originalQmd: string, newAst: RustQmdJson): string; + +// After: +export function incrementalWriteQmd( + originalQmd: string, + baselineAst: RustQmdJson | string, // accept either parsed or JSON-string for ergonomics + newAst: RustQmdJson, +): { qmd: string; warnings: Diagnostic[] }; +``` + +### Sync-client interface (`ts-packages/quarto-sync-client/src/types.ts:169`) + +```ts +// Before: +incrementalWriteQmd?: (originalQmd: string, newAst: unknown) => string; + +// After: +incrementalWriteQmd?: ( + originalQmd: string, + baselineAst: unknown, + newAst: unknown, +) => { qmd: string; warnings: Diagnostic[] }; +``` + +### Sync-client call site (`ts-packages/quarto-sync-client/src/client.ts:957`) + +```ts +// Before: +qmdText = astOptions.incrementalWriteQmd(cached.source, ast); + +// After: +const result = astOptions.incrementalWriteQmd(cached.source, cached.ast, ast); +qmdText = result.qmd; +// Optional: surface result.warnings to a sync-client callback. Default +// to ignore; sync-client is policy-free. +``` + +The `astCache` already maintains both `source` (qmd) and `ast` (last +parsed AST) per file. `cached.ast` IS the baseline. No demo-side +state changes required. + +### Consumer migrations + +1. **`hub-client/src/components/render/ReactPreview.tsx:429-440`** + — `handleSetAst` updated to pass the current `ast` state as the + baseline. The existing read-only guard for `pipelineKindForFormat + === 'preview'` deletes. 
Warnings from the response feed into + `allDiagnostics` collation alongside pipeline diagnostics. + +2. **`q2-demos/kanban/src/{useSyncedAst.ts:93, wasm.ts:79}`** — the + `astOptions.incrementalWriteQmd` lambda forwards the new third + argument. `wasm.ts:79`'s wrapper accepts and forwards + `baselineAst`. The demo's app state is unchanged; sync-client's + astCache supplies the baseline. + +3. **`q2-demos/hub-react-todo/src/{useSyncedAst.ts:93, wasm.ts:79}`** + — same as kanban. + +4. **`q2-preview-spa/src/PreviewApp.tsx`** — new `handleSetAst` + replaces `noopSetAst` at line 241. Routes through + `incrementalWriteQmd(content, currentAst, newAst)` with + content-match echo-prevention (see §Hub-client / SPA + integration above). + +All migrations in one PR; no back-compat shim. The TS-side type +checker catches every call site automatically. ## Open questions for implementation -- **Inline-level Transparent**: today the writer has `InlineSplice` for - inline-level changes within a block. Does Transparent apply to inlines - too (e.g., a `Span` with Synthetic source_info containing source-bearing - inlines)? Probably yes — extend the same pattern. Confirm during - implementation. -- **Concat-with-gaps**: if a Concat's pieces resolve to non-contiguous - ranges, `preimage_in` returns None per the algorithm above. Coarsen - falls through to Rewrite. Confirm this is the right semantics. -- **The `is_atomic_custom_node` lookup — extension forward-compat**: - today's hardcoded `pub const ATOMIC_CUSTOM_NODES: &[&str]` works for - built-in atomic types. Future extensions (including the eventual - TSX-extension story) will need to register their own atomic types - without modifying `quarto-core`. +- **Inline-level Transparent.** Today the writer has `InlineSplice` + for inline-level changes within a block. Does the `Transparent` + variant apply to inlines too (e.g., a `Span` with Generated + source_info containing source-bearing inlines)? 
Probably yes — + extend the same pattern. Confirm during implementation. - The forward-compat design (deferred to a follow-up plan; commits - the *shape* now without writing implementation code): +- **Concat-with-gaps semantics.** A gappy Concat's `preimage_in` + returns None per the algorithm above. Coarsen falls through to + Rewrite. Confirm this is the right semantics for the rare cases + where gappy Concats reach the block level. + +- **`is_atomic_custom_node` lookup — extension forward-compat.** + Today's hardcoded `pub const ATOMIC_CUSTOM_NODES: &[&str]` works + for built-in atomic types. Future extensions will need to + register their own atomic types without modifying `quarto-core`. - - **YAML schema** in `_extension.yml`: + The forward-compat design (deferred to a follow-up plan; commits + the shape now without implementation code): + - YAML schema in `_extension.yml`: ```yaml contributes: custom-nodes: - { type: MyCustomBlock, atomic: true } - { type: AnotherWidget } # atomic defaults to false ``` - - **Rust runtime aggregation** mirrors `resolve_filters()`'s pattern: - `pub fn collect_atomic_custom_node_types(extensions: &[Extension]) -> HashSet` - starts from the built-in set and adds extension-contributed entries - where `atomic == true`. - - **Function signature evolution**: - `is_atomic_custom_node(name)` → - `is_atomic_custom_node(name, ®istry: &HashSet)`. The - writer (in `pampa`) gets the registry from `StageContext` at coarsen - time. ~30 callers cascading; mechanical. - - **Rust→JS sync** for extension types (the genuinely-new piece — - the hand-mirror approach in Plan 7 doesn't work for extension - types because they aren't known at hub-client build time): - a `wasm_bindgen` export `get_atomic_custom_node_types()` is called - once per render after extensions are loaded; populates a React - context. The hand-mirrored TS const remains the fallback for the - no-extensions / WASM-initializing case and stays correct for - built-ins. 
- - **Plan 8's `IncludeExpansion`**: lands in the built-in set today - via `pub const ATOMIC_CUSTOM_NODES`. After the follow-up plan, the - set is built from a built-in's `_extension.yml` rather than - hardcoded — same effect via the same code path that user - extensions use, no privileged route. - - This sketch commits the schema choice (`contributes.custom-nodes` with - `atomic: bool`) and the function-signature migration path. Plan 7 - ships the const-based registry; the runtime aggregation, schema - parsing, and `wasm_bindgen` lookup all land in a follow-up when an - extension actually needs to register an atomic type. -- **Sibling vs param**: Decision D was "param with default" but Plan 4 / 7 - could implement it either way. Confirm during implementation. Param is - cleaner (one fewer entry point). Sibling is more isolated. Either works. -- **Runtime user-filter idempotence detection**: split out to Plan 7a. - See `claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md` - for the full design — round-trip idempotence check, per-filter - attribution, `idempotent: false` opt-out, Q-3-44 / Q-3-45 - diagnostics. Plan 7a is a separable follow-up that builds on Plan 7's - `pipeline_kind: Some("preview")` machinery; it doesn't gate M3. + - Rust runtime aggregation mirroring `resolve_filters()`'s pattern. + - Function signature evolution: `is_atomic_custom_node(name, + &registry: &HashSet)`. The writer in `pampa` gets the + registry from `StageContext` at coarsen time. + - Rust→JS sync via a `wasm_bindgen` export called once per render; + populates a React context. The hand-mirrored TS const remains + the fallback for the no-extensions case. + + Plan 7 ships the const-based registry; the runtime aggregation, + schema parsing, and `wasm_bindgen` lookup land in a follow-up + when an extension actually registers an atomic type. + +- **Runtime user-filter idempotence detection** — split out to + Plan 7a.
See `claude-notes/plans/2026-05-04-q2-preview-plan-7a- + user-filter-idempotence.md` for the full design — round-trip + idempotence check, per-filter attribution, `idempotent: false` + opt-out, Q-3-44 / Q-3-45 diagnostics. Plan 7a is a separable + follow-up; it doesn't gate M3. + +- **Content-match echo-prevention hash choice.** SHA-256 is the + obvious safe choice (already used in Plan 7a's + `filter_sources_hash`). FNV-1a or xxHash would be faster but + cryptographic strength isn't needed — we're just comparing a + freshly-emitted qmd against an arriving qmd for equality. Confirm + during SPA implementation. + +- **`pampa::pipeline::transform_ast` Rust-internal helper.** + Extracting the transform step of `render_qmd_to_preview_ast` into + a standalone `transform_ast(ast: Pandoc, ...) -> Pandoc` would + let tests exercise the transform tier in isolation. ~30 LOC of + factoring; not on Plan 7's critical path. Open beads if useful + during implementation. ## References -- `crates/pampa/src/writers/incremental.rs` — the writer to modify. - Particularly `coarsen` (line 149), `assemble` (line 228), `compute_separator` - (line 354), `block_source_span` (line 447), the helper for inline byte - ranges (line 800). -- `crates/quarto-source-map/src/source_info.rs:185-237` — accessor patterns - to extend. -- `crates/wasm-quarto-hub-client/src/lib.rs:2510` — `incremental_write_qmd` - entry point to extend (line drifted from 2166 after Plan 1's - prep refactor + new q2-preview wiring; verify exact line at - Plan 7 implementation time). -- `hub-client/src/services/wasmRenderer.ts:583` — the JS wrapper - (line drifted from 531). -- `hub-client/src/components/render/ReactPreview.tsx` — `handleSetAst` - guard to lift. Plan 1 implemented the `doRender` format switch via - `pipelineKindForFormat(format)` already; Plan 7 wires the same - helper into the edit-back path so the guard can be replaced with a - call to `incrementalWriteQmd` that passes the `pipeline_kind`. 
-- `hub-client/src/utils/pipelineKind.ts` — Plan 1's TS helper - (`pipelineKindForFormat`); Plan 7's JS-side call site reads it. -- `crates/quarto-core/src/stage/stages/ast_transforms.rs` — - `AstTransformsStage::run()` JIT branch already dispatches on - `ctx.format.pipeline_kind` (Plan 1 commit `60658a4e`); no edit - needed for Plan 7 itself, listed for context. -- `crates/quarto-core/src/format.rs` — `Format::pipeline_kind` - (Plan 1 commit `a7143cc7`); Plan 7 reads it in the - `incremental_write_qmd` body to drive the baseline-pipeline - selection. -- Plans 4 (Synthetic + By), 5 (wire format), 6 (audit) — provide the - AST shape this plan walks. +### Rust + +- `crates/pampa/src/writers/incremental.rs` — the writer. + Particularly `incremental_write` (line 80), `coarsen` (line 149), + `assemble` (line 228), `compute_separator` (line 354), + `block_source_span` (line 448), `assemble_inline_splice` (line + 602), `assemble_inline_content` (line 632), + `assemble_recursed_container` (line 672), `inline_source_span` + (line 800). +- `crates/quarto-source-map/src/source_info.rs` — `SourceInfo`, + `Generated`, `By`, `Anchor`, `AnchorRole`. Plan 7 adds the + `preimage_in` accessor. +- `crates/quarto-ast-reconcile/src/lib.rs` — + `compute_reconciliation`, `structural_eq_blocks`, + `structural_eq_inlines`, `compute_blocks_hash_fresh`, + `compute_meta_hash_fresh_excluding_rendered`. All used by the + test plan; the reconciler API itself doesn't change. +- `crates/wasm-quarto-hub-client/src/lib.rs:2947` — WASM entry + point (signature change). +- `crates/quarto-core/src/lib.rs` (or appropriate module) — + `ATOMIC_CUSTOM_NODES` const + `is_atomic_custom_node` fn (new). + +### TypeScript + +- `ts-packages/preview-runtime/src/wasmRenderer.ts:712` — JS + wrapper (signature change). Imports from this package; both + hub-client and the SPA consume. 
+- `ts-packages/preview-runtime/src/pipelineKind.ts` — new home + for `pipelineKindForFormat` (moved from + `hub-client/src/utils/pipelineKind.ts`). +- `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` — + existing TS hand-mirror of `ATOMIC_CUSTOM_NODES`. +- `ts-packages/quarto-sync-client/src/types.ts:169` — + `astOptions.incrementalWriteQmd` interface (signature change). +- `ts-packages/quarto-sync-client/src/client.ts:957` — sync-client + call site (forwards new argument). +- `hub-client/src/components/render/ReactPreview.tsx:429-440` — + `handleSetAst` guard lift + edit-back wiring. +- `q2-preview-spa/src/PreviewApp.tsx:241` — `noopSetAst` → + real handler. +- `q2-demos/kanban/src/{useSyncedAst.ts:93, wasm.ts:79}`, + `q2-demos/hub-react-todo/src/{useSyncedAst.ts:93, wasm.ts:79}` + — demo wrappers (signature forwarding). + +### Plans + +- **Plans 4 (Generated + Anchor + By + is_atomic_kind), 5 (wire + format), 6 (audit)** — provide the AST shape this plan walks. +- **Plan 3** — ships `compute_meta_hash_fresh` / + `compute_meta_hash_fresh_excluding_rendered` in + `quarto-ast-reconcile`; the writer-lossless baseline test uses + both. +- **Plan 7a** (`claude-notes/plans/2026-05-04-q2-preview-plan-7a- + user-filter-idempotence.md`) — separable follow-up; runtime + user-filter idempotence check. +- **Plan 8** — uses Plan 7's atomic infrastructure for + `IncludeExpansion`; not blocking. +- **Plan 9** (`claude-notes/plans/2026-05-22-provenance-plan-9- + valuesource-threading.md`) — ValueSource consumer wiring; + appendix synthesizer stamping that makes the Q-3-43-widened + cases fire on real data. Owns the `preimage_in` role-asymmetry + unit test and the appendix-license e2e round-trip test (Plan 9 + Phase 5). +- **Plan 10** (`claude-notes/plans/2026-05-22-provenance-plan-10- + dispatch-anchor.md`) — Dispatch anchor for Lua sources; inherits + Plan 7's `AnchorRole::Other` policy. 
## Test plan -- **Reconciler source-info-blindness foundation test** (new, lands in +- **Writer-lossless baseline test** (prerequisite for the + reconciler tests below; lands in Plan 7's first commit alongside + the foundation test). For each AST shape the writer needs to + emit (Generated-with-Invocation shortcode resolutions, Plan 8's + IncludeExpansion CustomNode wrappers, FloatRefTarget / Theorem / + Proof / Callout CustomNodes, synthesized Sectionize / Footnotes / + Appendix containers, user-edited variants of each), assert that + `parse(write(ast))` produces an AST whose + `compute_blocks_hash_fresh` + `compute_meta_hash_fresh_excluding_rendered` + equal the input's. This isolates writer bugs from reconciler + bugs. Fixtures reuse Plan 3's set under + `crates/quarto-core/tests/fixtures/q2-preview-idempotence/` plus + any Plan 7-specific shapes. + +- **Reconciler source-info-blindness foundation test** (lands in Plan 7's first commit): asserts that `structural_eq_blocks` and - `structural_eq_inlines` (in `quarto-ast-reconcile`) return `true` for - pairs of nodes that differ *only* in source_info. Cover all the new - shapes: two Original blocks with different file IDs / offsets; two - Synthetic blocks with different `By` payloads; two Derived blocks with - different `from` chains but the same content/attr/plain_data; - CustomNode pairs differing only in source_info on the wrapper or in - any slot child. The hash function already excludes source_info - (verified by Plan 3 and existing - `compute_blocks_hash_fresh::test_same_content_same_hash`); this test - covers the *equality* path too. Why it matters: the reconciler drives - KeepBefore decisions off these functions. If they leak source_info - by accident, q2-preview round-trip would degenerate to whole-doc - Rewrite without any obvious symptom — every test that doesn't inspect - the alignment plan would still pass. Catch the leak structurally - rather than discover it via correctness regressions. 
-- **`preimage_in` unit tests**: each variant (Original same/different file, - Substring chain, Concat contiguous/gappy, Synthetic, Derived). Assert - correct byte range or None. -- **Coarsen unit tests**: build mock reconciliation plans + ASTs covering: - - Verbatim (KeepBefore + preimage in target, both Original and Derived). - - Transparent (KeepBefore + non-atomic Synthetic wrapper with - source-bearing children — Sectionize case). - - Omit via atomic Synthetic (KeepBefore + Synthetic with - `by.is_atomic_synthesizer() == true` and no preimage — filter - construction case). - - Omit via Synthetic with no children (rare). - - Rewrite (UseAfter, non-atomic). - - **Soft-drop: inline UseAfter on Derived** — substitute KeepBefore - for that inline, surrounding inline plan continues; assert - `Q-3-42` warning emitted. + `structural_eq_inlines` return `true` for pairs of nodes that + differ *only* in source_info. Cover: two Original blocks with + different file IDs / offsets; two Generated blocks with different + `By` payloads; two Generated blocks with different anchor lists + but the same content / attr / plain_data; CustomNode pairs + differing only in source_info on the wrapper or in any slot child. + Why it matters: the reconciler drives KeepBefore decisions off + these functions. If they leak source_info by accident, round-trip + degenerates to whole-doc Rewrite without obvious symptom. + +- **`preimage_in` unit tests** — each variant: Original same / other + file, Substring chain, Concat contiguous / gappy, Generated with + no anchors, Generated with Invocation anchor resolving into + target, Generated with Invocation anchor resolving elsewhere. + Assert correct byte range or None. + +- **`preimage_in` skips non-Invocation roles** — Generated with + only ValueSource / Dispatch / Other anchors returns None. 
(The + full ValueSource end-to-end correctness test lives in Plan 9 + Phase 5 with real appendix-license fixtures; this Plan-7-side + test pins the unit-level behavior.) + +- **Coarsen unit tests** — build mock reconciliation plans + ASTs + covering: + - Verbatim (KeepBefore + preimage in target, both Original and + Generated-with-Invocation cases). + - Transparent (KeepBefore + non-atomic Generated wrapper with + source-bearing children — Sectionize / footnotes / appendix). + - Omit via atomic-kind Generated (KeepBefore + Generated with + `by.is_atomic_kind() == true` and no anchors — filter + construction). + - Omit via non-atomic Generated with no children (rare). + - Rewrite via catch-all (KeepBefore with no preimage and no + matching Generated shape — cross-file Original, gappy Concat). + - Rewrite (UseAfter, non-atomic, editable). + - **Soft-drop: inline UseAfter on atomic-Generated** — substitute + KeepBefore for that inline at `before_idx`; surrounding inline + plan continues; assert `Q-3-42` warning emitted. - **Soft-drop: block RecurseIntoContainer on atomic CustomNode** - (IncludeExpansion) — substitute KeepBefore for the wrapper; - assert `Q-3-43` warning emitted; assert wrapper's preimage bytes - in output. - - **Let-user-win: block UseAfter on atomic node** — Rewrite via qmd - writer; no warning. Assert qmd writer's CustomNode arm correctly - serializes a fresh user-edit-tagged IncludeExpansion (uses - `plain_data["source_path"]`). -- **Multi-inline dedupe unit tests**: build a Para with three consecutive - inlines all sharing the same Derived `from`. Reconcile against an - identical Para. Assert the writer emits the shortcode token bytes - ONCE, not three times, in the inline-assembly output. 
-- **Soft-drop interaction tests**: - - User edits one Derived inline AND a non-atomic inline in the same - Para → assert non-atomic edit is applied AND shortcode token is + (IncludeExpansion) — substitute KeepBefore; assert `Q-3-43` + warning emitted; assert wrapper's preimage bytes in output. + - **Soft-drop: block RecurseIntoContainer on no-preimage + Generated** — substitute Omit; assert `Q-3-43` warning emitted; + assert nothing emitted for the wrapper. + - **Soft-drop: block UseAfter on no-preimage Generated** — + substitute Omit; assert `Q-3-43` warning emitted. + - **Let-user-win: block UseAfter on atomic CustomNode** — Rewrite + via qmd writer; no warning. Assert qmd writer's CustomNode arm + correctly serializes a fresh user-edit-tagged IncludeExpansion + using `plain_data["source_path"]`. + +- **Multi-inline dedupe unit tests** — build a Para with three + consecutive inlines all sharing the same `Invocation` anchor + source_info (`PartialEq`-equal). Reconcile against an identical + Para. Assert the writer emits the shortcode token bytes ONCE, + not three times. Also: assert dedupe does NOT fire when anchors + differ structurally. + +- **Multi-inline dedupe + ValueSource interaction** (forward-compat + with Plan 9). Build inlines with shape `Generated { from: + [Invocation, ValueSource] }`. Two inlines whose `Invocation` + source_info matches but `ValueSource` source_info differs should + still dedupe (dedupe consults Invocation only). Add this once + Plan 9 has stamped ValueSource on a real consumer. + +- **Soft-drop interaction tests:** + - User edits one atomic-Generated inline AND a non-atomic inline in the + same Para → non-atomic edit applied AND shortcode token preserved AND `Q-3-42` warning emitted.
- - User edits inside an include AND outside the include in same doc → - assert outside edit is applied AND include token is preserved AND + - User edits inside an include AND outside the include in same + doc → outside edit applied AND include token preserved AND `Q-3-43` warning emitted (write succeeds with warnings, not Err). -- **End-to-end round-trip tests**: - - Sectionized doc → edit one paragraph → assert the section structure - is preserved verbatim except for the edit. - - Doc with single-inline shortcode (`{{< meta title >}}`) → edit a - different paragraph → assert the shortcode token is preserved. - - Doc with multi-inline shortcode (markdown title) → edit a different - paragraph in same Para → assert the shortcode token appears once, - not multiple times. - - Doc with shortcode → attempt to edit the resolved title → assert - `Q-3-42` warning + the document text is byte-equal to a no-op edit - (i.e., the bad edit was reverted). Save succeeded. - - (Plan 8 covers includes; this plan establishes the infrastructure.) -- **Filter-construction soft-drop test**: build an AST with a - filter-constructed Str (Synthetic { by: filter }) inside a Para. User - retypes it through React → assert `Q-3-42` warning + the original - Para's source bytes (without the decoration) appear in output. Next - pipeline run regenerates the decoration. -- **Idempotence holds**: re-run Plan 3's idempotence test after this plan - lands. The AST shape changes from this plan's transforms shouldn't break - it. + +- **End-to-end round-trip tests** (hub-client): + - Sectionized doc → edit one paragraph → assert the section + structure is preserved verbatim except for the edit. + - Doc with single-inline shortcode (`{{< meta title >}}`) → edit + a different paragraph → assert the shortcode token is preserved. + - Doc with multi-inline shortcode (markdown title) → edit a + different paragraph in same Para → assert the shortcode token + appears once, not multiple times. 
+ - Doc with shortcode → attempt to edit the resolved title → + assert `Q-3-42` warning + the document text is byte-equal to + a no-op edit (the bad edit was reverted). Save succeeded. + - Plan 8 covers includes; Plan 9 Phase 5 covers appendix-license; + this plan establishes the infrastructure. + +- **End-to-end round-trip tests (SPA):** + - SPA boots against a project with a single doc; edit a paragraph + via setLocalAst; assert the qmd on disk reflects the edit and + automerge content matches. + - Single-file mode (bd-tnm3k): same test with a `.qmd` outside any + `_quarto.yml` project root; assert the original file path is + written. + - Edit a shortcode in the SPA → assert Q-3-42 warning appears in + DiagnosticStrip; assert qmd on disk is unchanged. + - Edit a non-atomic block and a shortcode-resolved inline together + → assert non-atomic edit applies, shortcode preserved, Q-3-42 + warning shows. + - **Content-match echo-prevention test**: induce a local-edit ↔ + sync-echo cycle; assert the SPA's render effect fires exactly + once after the edit completes; assert an interleaved unrelated + file's update is processed normally (not suppressed). + +- **Filter-construction soft-drop test** — build an AST with a + filter-constructed Str (`Generated { by: filter, from: [] }`) + inside a Para. User retypes it through React → assert `Q-3-42` + warning + the original Para's source bytes (without the + decoration) appear in output. Next pipeline run regenerates the + decoration. + +- **Idempotence holds** — re-run Plan 3's idempotence test after + this plan lands. The AST shape changes shouldn't break it. ## Dependencies -- Depends on: Plans 4 (Synthetic + Derived + By), 5 (wire format), 6 - (audit + Derived provenance on shortcode resolutions). -- Blocks: nothing structurally; Plan 8 builds on the atomic infrastructure - but is independent (uses `is_atomic_custom_node` for IncludeExpansion). -- Lifts the read-only mode that Plan 1 introduced for q2-preview. 
- Plan 1's render-side `pipeline_kind` dispatches in - `AstTransformsStage::run()` and `ReactPreview.tsx::doRender` are - already structured (no string-literal scaffolding remains); Plan - 7 verifies the write-side parameter threads through the same - selector. See §Scope's "Verify: structured pipeline dispatch is - already in place" item for the verification step. +### Hard dependencies + +- **Plans 4 / 5 / 6** — provide the typed `Generated { by, from }` + shape and the synthesizer stamping the writer walks. The writer + can't test Generated-with-anchor behavior without those types + existing and being produced by real transforms. +- **Plan 3** — `compute_meta_hash_fresh` / + `compute_meta_hash_fresh_excluding_rendered` (used by the + writer-lossless baseline test). + +### Soft dependencies / coordination + +- **Plan 9** — owns the `preimage_in` role-asymmetry e2e test and + the appendix-license round-trip test. Plan 7's unit-level + `preimage_in` test pins behavior; Plan 9's tests pin end-to-end + correctness once a real ValueSource consumer (the appendix + synthesizer) exists. +- **Plan 10** — inherits Plan 7's `AnchorRole::Other` policy. No + ordering constraint. +- **Plan 7a** — separable follow-up; uses Plan 7's writer + warnings + infrastructure but doesn't gate M3. +- **Plan 8** — uses Plan 7's atomic-CustomNode infrastructure but + is independent (introduces `IncludeExpansion` to + `ATOMIC_CUSTOM_NODES`; doesn't change Plan 7's logic). + +### What Plan 7 doesn't block + +- Plan 9's implementation can start in parallel; the writer-side + changes don't depend on Plan 9's consumer wiring. +- Plan 10's implementation can start in parallel; Dispatch anchors + are stamped by Plan 6's post-walk helper, which Plan 10 modifies + independently. ## Risk areas -- **`incremental.rs` is intricate**: ~1000 lines, many interlocking - functions. Adding new coarsen variants and rewiring assemble carefully - is the meat of this plan. Budget extra time for edge cases. 
-- **Plan 4 / 5 / 6 must land first**. The writer can't test Synthetic - walking without those types existing. Order matters strictly. -- **InlineSplice + Transparent interaction**: the existing InlineSplice - logic handles inline-level changes. If Transparent at the block level - recurses into a block whose inlines need splicing, the assembly logic - composes both. Test this case — it's the trickiest edge. -- **Soft-drop warning visibility**: warnings flow through the existing - `RenderResponse.warnings` channel (the same path Plan 1's pipeline - diagnostics use). ReactPreview already displays diagnostics in the - editor. Confirm Q-3-42 / Q-3-43 warnings reach the diagnostic panel - and are visually distinguishable from pipeline warnings (or are - acceptably co-mingled — TBD by hub-client UX). -- **Autosave-context spam mitigation for Q-3-42 / Q-3-43**: hub-client - uses Automerge as the source-of-truth for qmd source — there's no - discrete "save" action; every keystroke triggers a debounced render - and incremental write. So a user persistently typing over a Derived - inline (resolved shortcode) would re-fire Q-3-42 on every render, - flooding the diagnostic panel with copies of the same warning. - Same for Q-3-43 if the user keeps editing inside an include. - Mitigation: **suppress-after-3** in the diagnostic banner. The - Monaco squiggle (yellow underline at the affected source range) - remains as the persistent signal; the side-panel banner shows the - first three occurrences per source range and silently drops the - rest. Implemented at the diagnostic-ingest layer in `Preview.tsx` - (or wherever warnings are processed for display), not at the +- **`incremental.rs` is intricate** (~830 lines, many interlocking + functions). Adding new coarsen variants and rewiring assemble + carefully is the meat of this plan. Budget extra time for edge + cases. 
+ +- **Plans 4 / 5 / 6 must land first.** The writer can't test + Generated-with-anchor walking without those types existing and + being produced by real transforms. Order matters strictly. + +- **InlineSplice + Transparent interaction.** The existing + InlineSplice logic handles inline-level changes. If Transparent + at the block level recurses into a block whose inlines need + splicing, the assembly logic composes both. Test this case — + it's the trickiest edge. + +- **Baseline-AST staleness.** If the caller passes a baseline AST + that doesn't match the original qmd source (e.g., the qmd source + changed externally between render and edit), the reconciler + produces a confused diff and the writer's output is garbage. + Hub-client's existing `applyingRemoteRef` pattern + (`hub-client/src/hooks/useAutomergeSync.ts:55`) and the SPA's + content-match echo-prevention (new in this plan) keep the + baseline fresh in practice. The contract is: caller MUST pass + a baseline that's `parse_or_render(originalQmd) at the same tier + as newAst`. Document this on the WASM entry and TS wrapper. + +- **Soft-drop warning visibility.** Warnings flow through the + existing `RenderResponse.warnings` channel. Hub-client already + collates them in `ReactPreview.tsx`; Editor's + `diagnosticsToMarkers` splits into Monaco markers and the + existing `.diagnostics-banner`. SPA gets the new `DiagnosticStrip`. + +- **SPA echo-prevention correctness.** The content-match gate + must hash the qmd we're emitting exactly as the round-trip + produces it (no trailing newline differences, no encoding + variation). Implement with a fixture-based assertion: emit qmd + X, simulate the echo loop, assert the gate matches. + +- **Autosave-context spam mitigation for Q-3-42 / Q-3-43.** + Hub-client and SPA both use Automerge as the source-of-truth for + qmd source — there's no discrete "save" action; every keystroke + triggers a debounced render and incremental write. 
A user + persistently typing over an atomic-resolved inline would re-fire + Q-3-42 on every render, flooding the diagnostic surface. + + **Mitigation:** suppress-after-3 by source range. Monaco squiggles + (yellow underline at the affected source range) remain as the + persistent signal in hub-client; the side-panel banner / + DiagnosticStrip shows the first three occurrences per source + range and silently drops the rest. Implemented at the + diagnostic-ingest layer (`ReactPreview.tsx::allDiagnostics` + collation for hub-client; `DiagnosticStrip` for SPA), not at the writer. Plan 7a's Q-3-44 doesn't have this issue — it's cached - once per document per session, so it fires at most once. - Imperative message text matters here too: Q-3-42 / Q-3-43 should - read as instructions ("To edit this content, open `<source file>`") + once per document per session. + + Imperative message text matters: Q-3-42 / Q-3-43 read as + instructions ("To edit this content, open `<source file>`") rather than passive descriptions ("edit was dropped"), since the user has no discrete-save affordance to discard the bad edit. - Plan 7's soft-drop is what guarantees the qmd source-of-truth - doesn't accept the bad edit even though the in-React AST briefly - held it.
## Estimated scope | Component | Lines (rough) | |---|---| -| `preimage_in` accessor (with Derived) + tests | ~100 | +| `preimage_in` accessor (with Generated/Invocation) + tests | ~100 | | New `CoarsenedEntry` variants (Transparent, Omit) | ~20 | -| `coarsen` logic update (atomic detection + soft-drop substitutions) | ~180 | +| `coarsen` logic update (editability gate + soft-drop substitutions) | ~200 | | `assemble` updates (Transparent walk, Omit handling) | ~80 | -| Multi-inline shortcode dedupe rule in inline assembly | ~40 | -| Inline-level soft-drop substitution in inline plan | ~50 | -| `is_atomic_custom_node` registry + TS hand-mirror | ~40 | -| Q-3-42 / Q-3-43 diagnostic codes + catalog entries | ~40 | +| Multi-inline shortcode dedupe (PartialEq on Invocation anchors) | ~40 | +| Inline-level soft-drop substitution | ~50 | +| `is_atomic_custom_node` registry (Rust side; TS hand-mirror already in place) | ~30 | +| Q-3-42 / Q-3-43 diagnostic codes + catalog entries | ~50 | | Warning channel plumbing through coarsen → incremental_write return | ~50 | -| `pipeline_kind` parameter + WASM bridge + TS wrapper | ~80 | -| ReactPreview guard lift + edit-back wiring | ~20 | -| Verify Plan 1's render-side pipeline_kind dispatch is end-to-end correct (no refactor work; Plan 1 already implemented it) | ~5 | -| Tests (unit + end-to-end round-trip + soft-drop interactions) | ~400 | -| **Total** | **~1105** | - -Two focused sessions likely. Flagged as one of the highest-complexity plans; -extend the budget if the InlineSplice + Transparent composition surfaces -unexpected interactions. 
+| `incremental_write_qmd` WASM signature change + JSON envelope | ~40 | +| TS wrapper signature change (`incrementalWriteQmd`) | ~20 | +| Three consumer migrations (ReactPreview + 2 demos) + sync-client type | ~60 | +| ReactPreview guard lift + `ast`-state baseline wiring | ~20 | +| SPA setAst handler + content-match echo-prevention | ~50 | +| `DiagnosticStrip` component for SPA (TSX + CSS) | ~70 | +| `pipelineKindForFormat` move to `ts-packages/preview-runtime` | ~10 | +| Tests (unit + end-to-end round-trip + soft-drop interactions, both surfaces) | ~500 | +| **Total** | **~1390** | + +Two focused sessions likely. Flagged as one of the highest-complexity +plans; extend the budget if the InlineSplice + Transparent +composition or the soft-drop catalog expansion surfaces unexpected +interactions. + +## Implementation checklist + +Work items grouped by phase. Each phase's items are roughly +sequential; phases themselves are mostly sequential, with some +parallelism noted. Plan 6 must land before Phase 1 starts. + +**Coordination posture.** This checklist is sized for serial +implementation in a single fresh 1M-context session — the phases +flow linearly, and the entire plan fits comfortably in one +context window. No beads-per-phase split needed. Open a follow-up +beads only for items that surface during implementation and are +genuinely out of scope (e.g. preexisting bugs found in adjacent +code; future-plan-bound features). + +### Phase 1 — Foundation primitives (`quarto-source-map`, `quarto-pandoc-types`, `pampa`) + +**Implementation note (2026-05-24):** Plan originally placed +`ATOMIC_CUSTOM_NODES` / `is_atomic_custom_node` in `quarto-core`, but +`quarto-core` depends on `pampa` and the writer (in `pampa`) is the +primary consumer — that direction would cycle. Moved the registry +down to `quarto-pandoc-types` (the home of `CustomNode` itself). A +cross-check test in `quarto-core::crossref` pins the literal in +lockstep with `CROSSREF_RESOLVED_REF`. 
+ +- [x] `SourceInfo::preimage_in(target: FileId) -> Option>` accessor with full match (Original, Substring, Concat, Generated) +- [x] Doc-comment on `preimage_in` stating the `Invocation`-only walking policy + asymmetry rationale +- [x] Doc-comment on `AnchorRole::Other` reiterating the policy (future roles default to non-walked) +- [x] `pub const ATOMIC_CUSTOM_NODES: &[&str] = &["CrossrefResolvedRef"]` in `quarto-pandoc-types` (not `quarto-core` — see implementation note) +- [x] `pub fn is_atomic_custom_node(type_name: &str) -> bool` in `quarto-pandoc-types` +- [x] `is_editable_inside_block` / `is_editable_inside_inline` helpers in `pampa::writers::incremental` (two functions sharing a private `is_editable_inside_source_info` core; React side will import an equivalent TS predicate in a future Phase) +- [x] `preimage_in` unit tests: Original same / different file; Substring chain; Concat contiguous / gappy / overlapping / mixed-files; Generated with no anchors; Generated with Invocation anchor resolving in / out of target; Generated with Invocation through Substring chain +- [x] `preimage_in` role-asymmetry unit test: Generated with only ValueSource / Other anchors returns None; mixed Invocation + ValueSource walks Invocation only +- [x] `is_editable_inside` unit tests covering all four uneditable reasons (atomic CustomNode, atomic-kind Generated, no-preimage Generated, value-source-only Generated) plus positive cases +- [x] Reconciler source-info-blindness foundation test in `quarto-ast-reconcile` (Generated-with-different-By, Generated-with-different-anchor-lists, CustomNode wrapper and slot-child blindness) +- [x] `cargo nextest run --workspace` green (9509 tests) +- [x] `cargo xtask verify` green (full 12-step chain including WASM build + hub-client tests) + +### Phase 2 — Writer internals (`pampa::writers::incremental`) + +**Implementation notes (2026-05-24):** +- The plan's checklist item "Remove `AtomicViolation` variant" was a + residue of an earlier
draft — no such variant existed in the + pre-Plan-7 code. Marked done by omission. +- The `coarsen` signature change keeps `Result` as the return: the + warning sink covers soft-drop cases, while the existing `Err` arm + (reached via `?` from `assemble_inline_splice`) stays for genuine + structural failures. +- The singleton-KeepBefore inline emit path was updated to use + `preimage_in(target_file_id)` (with `inline_source_span` fallback). + Original-SI inlines are byte-identical to the old behavior; + Generated-SI inlines now emit the Invocation anchor's preimage + bytes instead of an empty range — fixes a latent zero-length bug + in the pre-Plan-7 inline-splice path. Multi-inline dedupe sits on + top: when consecutive KeepBefore entries share an Invocation + anchor, emit the anchor's preimage *once*. + +**Repo facts that bite when constructing test fixtures:** +- `AttrSourceInfo` does **not** implement `Default`. Use + `quarto_pandoc_types::AttrSourceInfo::empty()` for `Div`/`Header`/ + `Figure`/etc. `attr_source` fields in hand-built fixtures. +- `gen` is a reserved keyword in Rust 2024 edition. Don't name a + variable `gen` (e.g. for a `SourceInfo::Generated` fixture); + `gen_info` works. 
+ +- [x] Add `CoarsenedEntry::Transparent { child_entries }` variant +- [x] Add `CoarsenedEntry::Omit` variant +- [x] Change `coarsen` signature to accept `&mut Vec` warning sink +- [x] Rewrite `coarsen` KeepBefore branch: Verbatim / Omit / Transparent / Rewrite-catch-all cascade per §"Coarsen pseudo-code" +- [x] Rewrite `coarsen` UseAfter branch: atomic-CustomNode-let-user-win, no-preimage-Generated-soft-drop +- [x] Rewrite `coarsen` RecurseIntoContainer branch: `is_editable_inside` gate + soft-drop substitution + Verbatim-or-Omit fallback +- [x] Inline-level soft-drop in `assemble_inline_content`: substitute KeepBefore via `before_idx` when `!is_editable_inside` +- [x] Multi-inline dedupe in `assemble_inline_content`: PartialEq grouping on Invocation anchor source_info +- [x] `assemble` handles Transparent (recursive child emission via `emit_entries` helper, shared `prev_entry` state across the wrapper boundary) +- [x] `assemble` handles Omit (no-op, doesn't update `prev_entry`) +- [x] ~~Remove `AtomicViolation` variant~~ — variant never existed in the codebase; checklist item was stale (see implementation note above) +- [x] Change `incremental_write` return type: `Result<(String, Vec), Vec>` (same for `compute_incremental_edits`); WASM bridge + all test callers migrated +- [x] `debug_assert!` for the shortcode-Generated-with-empty-from regression case (Plan 6 stamper invariant) — in `coarsen_keep_before_block` +- [ ] Writer-lossless baseline test (Plan 7 first-commit prerequisite): for each Generated / CustomNode shape, assert `parse(write(ast))` hash equals input via `compute_blocks_hash_fresh` + `compute_meta_hash_fresh_excluding_rendered` — **deferred to Plan 7b Phase 1** (`claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md`) +- [x] Coarsen unit tests: Verbatim, Transparent (sectionize wrapper with source-bearing children), Omit (atomic-kind filter construction), Rewrite-catch-all (cross-file Original), Rewrite (UseAfter editable) +- [x] Coarsen 
soft-drop unit tests: inline UseAfter on atomic-Generated (Q-3-42); block RecurseIntoContainer on atomic CustomNode (Q-3-43, Verbatim path); block RecurseIntoContainer on no-preimage Generated (Q-3-43, Omit path); block UseAfter on no-preimage Generated (Q-3-43, Omit path) +- [x] Let-user-win unit test: block UseAfter on atomic CustomNode → Rewrite; no warning +- [x] Multi-inline dedupe unit tests: positive (anchors PartialEq-equal → one Verbatim); negative (anchors differ → individual emits); ValueSource cross-talk (Plan 9 forward-compat — anchors match on Invocation but differ on ValueSource → still dedupes) +- [ ] Soft-drop interaction test: shortcode edit + non-atomic edit in same Para — **deferred to Plan 7b Phase 1** +- [ ] Filter-construction soft-drop test (UseAfter into a filter-constructed inline) — **deferred to Plan 7b Phase 1** + +### Phase 3 — Diagnostic catalog (`quarto-error-reporting`) + +- [x] `Q-3-42` entry in `error_catalog.json`: title "Shortcode edit dropped"; problem text; hint text; severity Warning +- [x] `Q-3-43` entry in `error_catalog.json`: title "Generated content edit dropped"; severity Warning. (Single generic `message_template`; the three emission paths supply distinct body text via the builder — per Plan 7 §"Catalog mechanics".) +- [x] Diagnostic builder helpers `diagnostic_q3_42_inline(inline)` and `diagnostic_q3_43_block(block)` used by `coarsen`'s soft-drop sites; live in `pampa::writers::incremental` (not `quarto-error-reporting`, which doesn't depend on `quarto-pandoc-types`) +- [x] Unit tests: each soft-drop unit test asserts the correct Q-3-42 / Q-3-43 code is emitted + +### Phase 4 — WASM bridge signature change (`wasm-quarto-hub-client`) + +**Repo facts the implementer needs:** + +- **The `wasm-quarto-hub-client` crate is NOT in the cargo workspace.** + `cargo build -p wasm-quarto-hub-client` fails with "did not match + any packages". 
Build via `cd hub-client && npm run build:wasm` or + implicitly via `cargo xtask verify` step 6. +- **`AstResponse.warnings` is `Option>`, not raw + serde Value.** Convert via `diagnostics_to_json(&warnings, ctx)`, + where `ctx: &SourceContext`. In the post-Phase-4 body, the + baseline AST's `ASTContext` carries this — access via + `baseline_context.source_context` (the field that `ASTContext` + exposes). Phase 2 wired this via the old `original_context` + variable; the equivalent post-Phase-4 binding is the baseline AST's. + +- [x] Change `incremental_write_qmd` Rust signature: add `baseline_ast_json: &str` as second positional argument +- [x] WASM body: deserialize `baseline_ast_json` via `pampa::readers::json::read` (parallel to existing `new_ast_json` deserialization); drop the qmd-parse step +- [x] Populate `AstResponse.warnings` field from `incremental_write`'s warning vec via `diagnostics_to_json(&warnings, &baseline_context.source_context)` +- [x] Doc-comment specifies the baseline-tier contract (caller responsibility to match tier of `new_ast_json`) + +### Phase 5 — TypeScript wrapper + sync-client interface + +- [x] `ts-packages/preview-runtime/src/wasmRenderer.ts:712` — `incrementalWriteQmd(originalQmd, baselineAst, newAst): { qmd, warnings }` +- [x] Accept `baselineAst` as `RustQmdJson | string` for ergonomics; stringify internally +- [x] `ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts:78` — new signature in WASM type declaration +- [x] `hub-client/src/types/wasm-quarto-hub-client.d.ts:69` — new signature in hub-client's WASM type declaration +- [x] `ts-packages/quarto-sync-client/src/types.ts:169` — `astOptions.incrementalWriteQmd` interface signature change +- [x] `ts-packages/quarto-sync-client/src/client.ts:957` — pass `cached.ast` as baseline; surface `result.qmd` to `updateFileContent`; warnings ignored at sync-client level (policy-free; demos consume them via wrapper) +- [x] Move `hub-client/src/utils/pipelineKind.ts` → 
`ts-packages/preview-runtime/src/pipelineKind.ts`; update imports in hub-client and SPA (SPA had no import yet — Phase 7) + +### Phase 6 — Consumer migrations + +- [x] `hub-client/src/components/render/ReactPreview.tsx:429-440` — `handleSetAst` updated: delete read-only guard, pass `ast` state as baseline, ingest warnings into next diagnostics push via `pendingWriteWarningsRef` +- [x] `hub-client/src/types/wasm-quarto-hub-client.d.ts:69` — type declaration updated +- [x] `q2-demos/kanban/src/wasm.ts:79` — wrapper accepts baselineAst, forwards to WASM +- [x] `q2-demos/kanban/src/useSyncedAst.ts:93` — astOptions lambda accepts third positional argument +- [x] `q2-demos/kanban/src/types/wasm-quarto-hub-client.d.ts:8` — type declaration updated +- [x] `q2-demos/hub-react-todo/src/wasm.ts:79` — wrapper signature update +- [x] `q2-demos/hub-react-todo/src/useSyncedAst.ts:93` — astOptions lambda update +- [x] `q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts:8` — type declaration update +- [x] Workspace `cargo build --workspace` + `cargo nextest run --workspace` green +- [x] `cd hub-client && npm run build:all` green (WASM type alignment) +- [x] `cd hub-client && npm run test:ci` green + +### Phase 7 — q2-preview SPA integration + +- [x] `q2-preview-spa/src/PreviewApp.tsx`: baseline read via `astJsonRef` mirroring `state.astJson` (avoided new state — the ref keeps `handleSetAst`'s identity stable across re-renders, which the iframe's effect-deps care about) +- [x] Replace `noopSetAst` with real `handleSetAst` that calls `incrementalWriteQmd(content, baselineJson, newAst)` +- [x] Content-match echo-prevention: hash emitted qmd via FNV-1a, stash `(path, hash)` in `lastEmittedRef`; matching incoming `onFileContent` consumes the ref and returns early +- [x] Hash algorithm decision recorded in `fnv1aHex` docstring (FNV-1a: in-process equality, 32 bits sufficient, zero-dependency, matches existing actor-color hash pattern) +- [x] 
`q2-preview-spa/src/components/DiagnosticStrip.tsx` component (inline styles per existing SPA convention; ~120 LOC TSX, no separate CSS file) +- [x] DiagnosticStrip ingest from `incrementalWriteQmd` result's warnings field via `writeWarnings` state +- [x] Suppress-after-3-by-source-range mitigation in DiagnosticStrip (`suppressAfterThree` helper) +- [x] Imperative message text for Q-3-42 / Q-3-43 — catalog entries already imperative from Phase 3 (`"edit the invocation token in source instead"`); DiagnosticStrip surfaces title + problem verbatim + +### Phase 8 — End-to-end tests + +- [x] Hub-client: WASM-level wrapper contract test (`hub-client/src/services/incrementalWrite.wasm.test.ts`) — pins the 3-arg API, identity round-trip, paragraph-edit preservation, structured error on malformed baseline JSON. Run via `npm run test:wasm`; 3/3 passing. +- [x] Plan 3's idempotence test re-run — passes within `cargo xtask verify` (9535/9535 Rust tests, includes `crates/quarto-core/tests/idempotence.rs`). +- [ ] **Deferred to Plan 7b Phases 2 + 3** (`claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md`; consolidates `bd-3izo3`) — the broader Playwright scenario matrix (sectionized round-trip in a real hub session, single/multi-inline shortcode preservation, Q-3-42 byte-equal-no-op, Q-3-43 footnotes regeneration, SPA edit-paragraph round-trip in both project and single-file modes, SPA Q-3-42 DiagnosticStrip, mixed atomic + non-atomic, echo-prevention fixture). Each spec needs ~60 LOC of fixture/server setup and runs only under `cargo xtask verify --e2e`. The Rust-side soft-drop matrix is already exhaustively covered in `crates/pampa/src/writers/incremental.rs`; the deferred work is end-to-end *delivery* coverage, not new correctness coverage. 
+ +### Phase 9 — Verification + cleanup + +- [x] `cargo xtask verify` green (full chain: Rust workspace + hub-build + hub-tests) — see `/tmp/plan7-phase4-6-verify.log` +- [x] **Refresh `q2 preview` WASM chain before smoke testing** (per `CLAUDE.md` §"Verifying Rust changes in `q2 preview`"; addresses the 2026-05-20 stale-WASM incident): + - [x] `cd hub-client && npm run build:wasm` — rebuild WASM from Plan 7's Rust changes + - [x] `cargo xtask build-q2-preview-spa` — bundle WASM into `q2-preview-spa/dist/` + - [x] `cargo build --bin q2` — re-embed `dist/` via `include_dir!` +- [x] q2 preview boot smoke: `cargo run --bin q2 -- preview /tmp/plan7-smoke` rendered correctly; user confirmed the preview in their browser (2026-05-24 session). The full edit round-trip (drag-to-trigger-handleSetAst → observe DiagnosticStrip on atomic edit) is part of the deferred Playwright matrix above. +- [ ] **Deferred to the user** — hub-client manual smoke (edit sectionized doc, observe section structure in saved qmd) and SPA manual smoke with echo-prevention assertion. The user is doing these by hand; the e2e equivalents land via Plan 7b Phases 2 + 3. +- [x] Plan 7 marked complete (Phases 1-7 + 9 done; Phase 8 partially landed, remainder tracked separately). +- [x] Bump `hub-client/changelog.md` with a one-line entry per the two-commit workflow (commit `b5d6d08a`). +- [x] Plan 9's `preimage_in` role-asymmetry e2e test reference is in Plan 9 Phase 5 (added a "Plan 7 shipped 2026-05-24" status note so the deferral state is unambiguous when Plan 9 lands). ## Notes -This is the most intricate plan in the set. It's the keystone for M3 — -once this lands, q2-preview is truly editable for the common case. Take -care with the test coverage; round-trip bugs in the writer can corrupt -source silently if not caught. +This is the most intricate plan in the set. It's the keystone for +M3 — once this lands, q2-preview is truly editable for the common +case in BOTH hub-client and the q2 preview SPA. 
Take care with +test coverage; round-trip bugs in the writer can corrupt source +silently if not caught. -### Soft-drop replaces hard-abort (revised from earlier draft) +### Soft-drop replaces hard-abort -An earlier draft of this plan had AtomicViolation as a hard error — any -edit to atomic content aborted the entire write. We revised to soft-drop: -each bad-edit case substitutes a safe alignment in coarsen and emits a -warning, but the user's other edits go through. The user-facing contract -"this edit must be prohibited" is honored (the bad edit doesn't apply); -the user-facing failure mode "the entire save was rejected" is not. -React (Plan 2B) is the primary safeguard via read-only enforcement; -the writer is the contract guarantor; if React has a hole the writer -protects without losing the user's session. +Plan 7 substitutes safe alignments in coarsen and emits warnings +rather than aborting the entire write. The user-facing contract +"this edit must be prohibited" is honored (the bad edit doesn't +reach source); the user-facing failure mode "the entire save was +rejected" is not. React (Plan 2A's framework atomic gate) is the +primary safeguard via read-only enforcement; the writer is the +contract guarantor; if React has a hole the writer protects +without losing the user's session. The let-user-win exception for block-level UseAfter on atomic -(user-replaced or -deleted atomic block via React) is a deliberate -asymmetry: when the user explicitly destroys an atomic block, we trust -them. The qmd writer's CustomNode arms know how to write fresh atomic -types from `plain_data` (Plan 8's IncludeExpansion arm reads -`plain_data["source_path"]`), so this composes through the normal -Rewrite path with no special handling. +**CustomNode** (user-replaced or -deleted via React's component +menu) is a deliberate asymmetry: when the user explicitly destroys +an atomic CustomNode through an explicit affordance, we trust +them. 
The qmd writer's CustomNode arms know how to write fresh +atomic types from `plain_data`. The corresponding case for +no-preimage Generated containers stays soft-drop — there's no +source position to anchor a Rewrite at. ### Filter mutations are not flagged as atomic — accepted corner -Plan 4 distinguishes filter constructions (`pandoc.Str("decoration")` → -`Synthetic { by: filter }`, atomic) from filter mutations -(`Str.text = upper(Str.text)` → keeps Original source_info, NOT atomic). - -A user editing a filter-mutated Str through React produces an unusual -round-trip: the user types "world" over the filter-output "HELLO"; -the writer Rewrites "world" to source; the next pipeline run filters -"world" → "WORLD". For idempotent filters (uppercase) this is fine — -the typed text round-trips through filter to itself. For non-idempotent -filters (`x => upper(x) + "!"`) the typed text gets a `!` appended on -every save, which is confusing. - -We accept this corner rather than flagging filter mutations as atomic -because (a) it would require revising Plan 4 to track filter mutations -distinctly from plain Original source_info (a notable type-system -change), (b) the runtime user-filter idempotence detection (above) -catches the AST-level non-idempotence that would actually corrupt -round-trip, and (c) Plan 3's idempotence test enforces the -contract for built-in filters at CI time. Users who write -non-idempotent filters get a warning at runtime and can decide whether -the trade-off is acceptable for their workflow. - -### The byte-provenance contract (and why the writer stays infallible) - -The contract isn't "no materialization" — that phrasing is too blunt -and conflates two cases. **The writer materializes constantly** in the -neutral sense: every Rewrite path materializes new bytes through the -qmd writer; even Verbatim copies are a kind of materialization (bytes -appearing in the saved file). 
The contract is more precise: the writer -only emits bytes whose origin can be honestly traced to either -**existing source bytes in the target file** (Verbatim copies, slot -preimages via `preimage_in`) or **fresh AST the user constructed** -(Rewrite paths fed by user-supplied AST nodes via the qmd writer's -normal arms). +Plan 4 distinguishes filter constructions (`pandoc.Str("decoration")` +→ `Generated { by: filter, from: [] }`, atomic) from filter +mutations (`Str.text = upper(Str.text)` → keeps Original source_info, +NOT atomic). + +A user editing a filter-mutated Str through React produces an +unusual round-trip: the user types "world" over the filter-output +"HELLO"; the writer Rewrites "world" to source; the next pipeline +run filters "world" → "WORLD". For idempotent filters (uppercase) +this is fine — the typed text round-trips through filter to itself. +For non-idempotent filters (`x => upper(x) + "!"`) the typed text +gets a `!` appended on every save, which is confusing. + +We accept this corner rather than flagging filter mutations as +atomic because: +- (a) it would require revising Plan 4 to track filter mutations + distinctly from plain Original source_info (a notable type-system + change); +- (b) Plan 7a's runtime user-filter idempotence detection catches + the AST-level non-idempotence that would actually corrupt + round-trip; +- (c) Plan 3's idempotence test enforces the contract for built-in + filters at CI time. + +Users who write non-idempotent filters get a warning at runtime +and can decide whether the trade-off is acceptable. + +### The byte-provenance contract + +The contract isn't "no materialization" — that phrasing is too +blunt. **The writer materializes constantly** in the neutral +sense: every Rewrite path materializes new bytes through the qmd +writer; even Verbatim copies are a kind of materialization. 
The +contract is more precise: the writer only emits bytes whose origin +can be honestly traced to either **existing source bytes in the +target file** (Verbatim copies, slot preimages via `preimage_in`) +or **fresh AST the user constructed** (Rewrite paths fed by +user-supplied AST nodes via the qmd writer's normal arms). What soft-drop forbids — by structural construction — is the case -where the writer would emit bytes synthesized from a wrapper's slot -children as flat content in the parent file. Concretely: if Plan 8's -qmd-writer arm for `IncludeExpansion` were reached in a non-Verbatim -path, it would (under the old defensive-fallback design) walk the -wrapper's content slot and emit those blocks as flat parent-file bytes -— but those blocks come from foo.qmd, not from parent.qmd source nor -from user input. Writing them into parent.qmd would put bytes there -whose provenance is the included file, which is dishonest at the +where the writer would emit bytes synthesized from a wrapper's +slot children as flat content in the parent file. Plan 8's +qmd-writer arm for `IncludeExpansion` in a non-Verbatim path +would (under an old defensive-fallback design) walk the wrapper's +content slot and emit those blocks as flat parent-file bytes — +but those blocks come from `foo.qmd`, not from `parent.qmd` source +nor from user input. Writing them into `parent.qmd` would put bytes +there whose provenance is the included file — dishonest at the parent-file boundary. Under soft-drop, coarsen substitutes KeepBefore (Verbatim of the -wrapper's parent-file include-token bytes) before the qmd writer ever -sees that case. The arm becomes `unreachable!()` — a debug assertion -for coarsen bugs, not a user-facing failure mode. Promoting the qmd -writer to a fallible `Result` interface to make the unreachable case -recoverable would be over-engineering, since correct coarsen makes the -case structurally absent. 
WASM panic-abort still kills the session if -the assertion fires, but that's the same risk profile as any other -writer bug; it's not specific to atomic enforcement, and it's -reachable only via a programming error in coarsen. - -The let-user-win Rewrite path is provenance-honest: when the user -constructs a fresh `IncludeExpansion` through React (with `plain_data -= { source_path: "bar.qmd" }`) and the writer materializes -`{{< include bar.qmd >}}` into source, the bytes' origin is the user's -edit. Plan 8's qmd-writer arm reads `plain_data`, doesn't read -`source_info`, and emits the include syntax — same arm whether the -wrapper came from `IncludeExpansionStage` (pipeline) or from React -(user). That symmetry is what makes the let-user-win case clean. +wrapper's parent-file include-token bytes) before the qmd writer +ever sees that case. The arm becomes `unreachable!()` — a debug +assertion for coarsen bugs, not a user-facing failure mode. +Promoting the qmd writer to a fallible `Result` interface to make +the unreachable case recoverable would be over-engineering, since +correct coarsen makes the case structurally absent. + +The let-user-win Rewrite path for atomic CustomNodes is +provenance-honest: when the user constructs a fresh +`IncludeExpansion` through React (with `plain_data = { source_path: +"bar.qmd" }`) and the writer materializes `{{< include bar.qmd >}}` +into source, the bytes' origin is the user's edit. Plan 8's +qmd-writer arm reads `plain_data`, doesn't read `source_info`, +and emits the include syntax — same arm whether the wrapper came +from `IncludeExpansionStage` (pipeline) or from React (user). That +symmetry is what makes the let-user-win case clean. + +The corresponding case for no-preimage Generated containers +soft-drops instead of let-user-win because those containers have +no parent-file source position — Rewrite would have nowhere to +write. 
The user's edit is rejected with Q-3-43; the original +content regenerates from baseline metadata on the next pipeline +run. + +### Decomposition of operations + +Plan 7's surface change — `incremental_write_qmd` takes a baseline +AST instead of parsing internally — is a small step in a larger +decomposition. The four primitives (parse / transform / reconcile / +write) are already implemented as separate Rust functions. Plan 7 +makes the WASM boundary reflect that decomposition: the writer's +WASM entry doesn't conflate the parse step with the write step +anymore. The caller composes parse + transform separately (or +re-uses an already-rendered AST from a prior call), then hands two +ASTs and the source bytes to the writer. + +This decomposition makes future pipeline kinds free: the writer +doesn't need a new parameter for each new kind, because it doesn't +know what a pipeline is. The caller picks which render function to +call; the writer just diffs. + +## Follow-ups closed + +- **`CoarsenedEntry::Rewrite` carried `new_idx` instead of + pre-computed text** (Phase 2 design vestige). + Closed 2026-05-25 by + [`coarsened-entry-self-contained`](./2026-05-25-coarsened-entry-self-contained.md). + The `result_idx is unused for child Rewrites (...not exercised by + today's synthesizers)` comment introduced in commit `9a473fe9` was + accurate at the time, but became reachable once Plan 7c Phase 8 + (`bdcfdc53`) added a Transparent-recursion path in `coarsen_blocks` + for changed wrappers. The fix lifts `Rewrite` to carry + `block_text: String` (matching `InlineSplice`'s precedent), making + every `CoarsenedEntry` variant self-contained. The contract is + documented in + [`incremental-writer-internals.md`](../designs/incremental-writer-internals.md). 
diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md b/claude-notes/plans/2026-05-04-q2-preview-plan-7a-user-filter-idempotence.md similarity index 69% rename from claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md rename to claude-notes/plans/2026-05-04-q2-preview-plan-7a-user-filter-idempotence.md index 0985ab4d8..168f5492f 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-7a-user-filter-idempotence.md @@ -1,11 +1,20 @@ # Plan 7a — Runtime user-filter idempotence check + opt-out -**Date:** 2026-05-05 +**Date:** 2026-05-05 (revised 2026-05-20) **Branch:** feature/q2-preview **Status:** Implementation plan (open questions named) **Milestone:** none directly — extends M3 with an opt-in safety check; doesn't block the milestone +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 7a is the reliability +follow-up to Plan 7: once the writer round-trips correctly for +idempotent filters, this plan adds runtime detection for the +non-idempotent case, with attribution to the offending filter and a +declarative opt-out. The file name keeps its q2-preview-plan-N form for +continuity with the earlier discussion notes. + ## Goal Detect when a user's Lua filter chain breaks q2-preview's round-trip @@ -41,8 +50,10 @@ specifically: current test does not catch it**. Plan 7a's runtime check targets **round-trip non-idempotence** — -parse, pipeline, write, parse, pipeline, hash-compare. See §"Plan 3 -strengthening" below. +parse, pipeline, write, parse, pipeline, hash-compare. Plan 3 covers +flavor (1) at CI time for built-ins; Plan 7a covers flavor (2) at +runtime for user filters. Built-in filter round-trip is not covered +by any current plan — see §"Notes" for the rationale. ## Scope @@ -89,7 +100,9 @@ strengthening" below. 2. Serialize AST_1 via the qmd writer → qmd_1. 3. Run pipeline on qmd_1 → AST_2. 4. 
Compare `compute_blocks_hash_fresh(&AST_1.blocks)` vs - `compute_blocks_hash_fresh(&AST_2.blocks)`. + `compute_blocks_hash_fresh(&AST_2.blocks)`, and the parallel + `compute_meta_hash_fresh(&AST_1.meta)` vs `(&AST_2.meta)` (new + helper landing in Plan 3). - **Per-filter attribution**: when the whole-pipeline check fails, run the same round-trip with each filter active in isolation (others stubbed). Filters whose isolated round-trip fails are named in the @@ -101,6 +114,20 @@ strengthening" below. - Hint: `Fix the filter to produce stable output, or add idempotent: false to its config in _quarto.yml to silence this check.` - Location: filter file path; no document-side range (the warning is about the filter, not a place in the active doc). + - **Sectionize-wrapper-aware hint (optional, follow-up).** A + common Lua-author error is `pandoc.walk_block(doc.blocks[1], …)` + intending to touch the user's first paragraph — but after + `SectionizeTransform` runs, `doc.blocks[1]` is the synthesized + sectionize Div and the walk operates on the wrapper, not the + user content. Idempotence detection sees the divergence and + fires Q-3-44 correctly, but the hint doesn't help the author + diagnose. When the AST diff is concentrated under a + `is_transparent_wrapper(doc.blocks[0])` (see + [`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md)), + extend the hint with: "Note: your filter may be walking into a + sectionize wrapper. Use `doc.blocks[1].content[0]` to reach + the first user block, or iterate `doc.blocks` recursively + skipping wrapper Divs." - **`Q-3-45` diagnostic** (Info severity), three-variant body: - Title (all variants): `Filter exempted from idempotence checking` - Problem (UserConfig source): `idempotent: false set in . Edits may cause unintended changes elsewhere in the document.` @@ -113,11 +140,12 @@ strengthening" below. ### Out of scope -- **Strengthening Plan 3's CI test to round-trip flavor**. 
Plan 3 - currently catches non-determinism (`pipeline(x)` twice); the - round-trip flavor (`pipeline(write(pipeline(x)))`) catches a - different class of bug. See §"Plan 3 strengthening" below — flagged - as a Plan 3 amendment, not Plan 7a. +- **Extending the runtime round-trip check to built-in filters**. + Plan 7a's check fires only for filters in `Vec<FilterMetadata>` with + `source = UserConfig` or `Extension`; ship-with-Quarto Lua filters + (today: just `video-filter.lua`) are not on that list. Built-in + filter round-trip is unverified anywhere — see §"Notes" for the + reasoning behind not closing this gap in v1. - **File watchers for filter sources**. Demand-driven invalidation via `filter_sources_hash` on next render is sufficient. The user edits a filter, opens the document, hash mismatches, check re-runs. @@ -130,8 +158,9 @@ strengthening" below. filter would block the first save by O(filter_count) pipeline passes. Acceptable for v1; revisit if reports come in. - **Idempotence checks on built-in filters at runtime**. Plan 3's CI - test (after the strengthening noted above) is the right place for - built-ins. Runtime check is for user-supplied filters specifically. + test is the right place for the pipeline-determinism property on + built-ins. The round-trip property on built-ins is unverified — see + the bullet above and §"Notes." ## Design decisions @@ -139,7 +168,8 @@ strengthening" below. check serializes the first pass's AST through the qmd writer and re-parses, mirroring the actual round-trip the writer performs. Pipeline determinism is a weaker property; we get that for free - from Plan 3's CI test (after strengthening). + from Plan 3's CI test (which covers pipeline non-determinism for + built-in transforms and the one built-in Lua filter). - **Cache verdict per session, persisted in IndexedDB**. The cache key includes `filter_sources_hash` (filter file bytes + opt-out flags). 
Surviving session boundaries is correct: if filter sources @@ -236,38 +266,6 @@ Total cost when an issue is detected: 2 + 2N pipeline runs (one whole-set check, two per filter for attribution). For 5 filters, ~12 runs. Bounded; acceptable on first edit per session, cached after. -## Plan 3 strengthening (out of scope here, flagged for the team) - -Plan 3's idempotence test (currently `run_pipeline(fixture)` twice on -the same source) catches **pipeline non-determinism** — filters that -use time, RNG, or mutable state. It does **not** catch -**round-trip non-idempotence** — the `f(f(x)) ≠ f(x)` case where the -filter is deterministic but produces different output when re-applied -to its own output through the qmd writer. - -Plan 7a's runtime check targets round-trip non-idempotence, which is -the property that actually matters for q2-preview's writer. The Plan 3 -CI test should be strengthened to also run this flavor, so that -built-in filters are CI-verified for both: - -```rust -// Existing Plan 3 test: pipeline non-determinism -let ast_1 = run_pipeline(fixture, ...); -let ast_2 = run_pipeline(fixture, ...); -assert_eq!(blocks_hash(&ast_1), blocks_hash(&ast_2)); - -// New: round-trip idempotence (the property that breaks q2-preview) -let ast_a = run_pipeline(fixture, ...); -let qmd_a = qmd_write_to_string(&ast_a); -let ast_b = run_pipeline(&qmd_a, ...); -assert_eq!(blocks_hash(&ast_a), blocks_hash(&ast_b)); -``` - -This change is small (~30 lines) and lands in Plan 3's test file. -**Recommend adding to Plan 3's scope as an amendment** rather than to -Plan 7a, since it concerns built-in filter coverage at CI time, not -runtime behavior. - ## Open questions for implementation - **Cross-session cache validity**: the profile cache persists. Should @@ -304,6 +302,30 @@ runtime behavior. flow through the same path. 
Confirm they reach the diagnostic panel and are visually distinguishable from pipeline warnings (or acceptably co-mingled — TBD by hub-client UX, same as Q-3-42/Q-3-43). +- **Per-Lua-line attribution (Plan 10 follow-up)**: Q-3-44 today + references the filter file path via `<path>` read from + `FilterMetadata.spec` (the filter spec, not from `by.data` on any + Generated node), so Plan 7a is structurally independent of `By`'s + data shape. When **Plan 10** + (`claude-notes/plans/2026-05-22-provenance-plan-10-dispatch- + anchor.md`) lands, filter-constructed nodes carry a `Dispatch` + anchor pointing at a typed + `Original{lua_file_id, line_start, line_end}`. The Q-3-44 diagnostic + can then sharpen "filter `<path>` is not idempotent" to "filter + `<path>` line `<line>` is not idempotent" — pointing at the specific + Lua-side construction site. The migration is purely additive — read + the Dispatch anchor when present, fall back to filter-spec path + when absent. Deferred until Plan 10 lands; the current + `<path>`-only diagnostic is actionable. + +- **`filter_sources_hash` coordination with Plan 10.** Plan 7a + defines `filter_sources_hash` (SHA-256 over filter file bytes + + opt-out flags) as a `Pass1KeyInputs` field. Plan 10 Phase 7 + also wants Lua-filter-file content to invalidate `pass1_key`. + Since Plan 7a lands first, **Plan 10 reuses Plan 7a's + `filter_sources_hash` field** rather than introducing a parallel + hash. Plan 10's Phase 7 task reduces to: confirm the field + exists, confirm semantics match, no new field added. ## References @@ -324,10 +346,21 @@ runtime behavior. - Plan 7 — the q2-preview pipeline + qmd writer this check supports. The check uses Plan 7's `pipeline_kind: Some("preview")` machinery for both passes. -- Plan 3 — CI-time idempotence verification for built-in filters. - Plan 3 strengthening (above) extends the test to round-trip flavor. 
-- Plan 4 — `By` types; `is_atomic_synthesizer()` is unrelated to this - plan but the runtime check shares the source-info-blind hash. +- Plan 3 — CI-time pipeline-determinism verification for built-in + transforms and the one built-in Lua filter. Plan 3 ships + `compute_meta_hash_fresh` which this plan reuses for the meta + comparison in the round-trip check. The transform/filter-author + contract Plan 3 enforces is documented at + `claude-notes/instructions/idempotence-contract.md`; new transforms + on both the built-in and user-filter sides must meet it. +- Plan 4 — `By` types; `is_atomic_kind()` is unrelated to this plan + but the runtime check shares the source-info-blind hash. +- Plan 10 (`claude-notes/plans/2026-05-22-provenance-plan-10- + dispatch-anchor.md`) — Lua-file registration in `SourceContext`; + prerequisite for the per-Lua-line attribution refinement noted + under "Open questions" above. Plan 7a lands first; Plan 10 + reuses Plan 7a's `filter_sources_hash` field per the + cross-plan coordination note in §Open questions. ## Test plan @@ -365,6 +398,47 @@ runtime behavior. Q-3-45 variants match their respective bodies; hint text mentions the opt-out path. +- **Filter-mutation round-trip behavior test** (added 2026-05-25 + from code-review pass on Plan 7). The writer contract + (`claude-notes/designs/incremental-writer-contract.md`, + §"Filter mutations versus constructions") admits this corner: + a filter that *mutates* an existing node (rather than + *constructs* a new one) leaves the input's `Original` + source_info untouched, so the editability gate treats the + resulting text as editable. When the filter is non-idempotent + (`x => upper(x) + "!"`), the user's typed text round-trips as + `TYPED!` on the first save, `TYPED!!` on the next, etc. 
+ + The Plan-7a runtime warning catches this — but the contract + doc doesn't pin *when* the warning fires: + - Does it fire on the first save (writer detects the filter + is non-idempotent at the AST level)? + - Does it fire only after a second save shows divergence + between consecutive pipeline runs? + - Does it suppress on subsequent saves to avoid flooding? + + Test plan (one fixture, three assertions): + + 1. Build a single-file doc with a non-idempotent user filter + (the canonical `f(x) = upper(x) + "!"` shape). + 2. Render once. Assert Q-3-44 fires with `filter_path = + "f.lua"`. Capture the warning ID. + 3. Simulate a user edit on a filter-mutated `Str`. Round-trip + through `incremental_write`. Re-render. Assert *either* the + same Q-3-44 fires again, *or* it's suppressed (whichever + the implementation picks) — but the test pins the choice. + 4. Repeat step 3 with no user edit (re-render of the same + content). Assert the warning behaviour matches step 3 — the + existence of a user edit doesn't change the diagnostic + surface; the filter's non-idempotence does. + + Output: the test pins behaviour and the assertion comments + document the contract. If the implementation prefers "fire + on every render" (loud, recoverable), the test asserts that. + If it prefers "fire once per cache key" (quiet, requires the + cache), the test asserts that. Either way, future contributors + read the test and know what behaviour is contracted. + ## Dependencies - **Depends on**: Plan 7 (the q2-preview transform pipeline + qmd writer @@ -374,9 +448,11 @@ runtime behavior. `Synthetic`/`Derived` content for realism). - **Blocks**: nothing structurally; this is a reliability improvement, not a milestone deliverable. -- **Related**: Plan 3 (CI-time test for built-in filters) — would - benefit from the strengthening proposed in §"Plan 3 strengthening" - above. +- **Related**: Plan 3 (CI-time pipeline-determinism test for built-in + transforms and the one built-in Lua filter). 
Plan 3 ships + `compute_meta_hash_fresh` / `compute_meta_hash_fresh_excluding_rendered` + in `quarto-ast-reconcile`; this plan reuses both for the meta + comparison in the round-trip check. ## Risk areas @@ -416,7 +492,8 @@ runtime behavior. | Q-3-44 / Q-3-45 catalog entries + builders | ~50 | | Session cache integration | ~40 | | Tests (unit + integration) | ~250 | -| **Total** | **~600** | +| Filter-mutation round-trip behavior test (added 2026-05-25) | ~60 | +| **Total** | **~660** | Single focused session. Risk: per-filter attribution may surface unexpected interactions; budget a second session if attribution proves @@ -430,10 +507,26 @@ it out keeps Plan 7 focused on the writer's coarsen + soft-drop logic gate the milestone. The check is targeted at user-supplied Lua filters. Built-in filters -that ship with Quarto are covered by Plan 3 (CI-time, with the -strengthening proposed above). User filters can't be statically -analyzed for idempotence (uncomputable for arbitrary Lua), so the -runtime check via double-pass-and-hash is the available mechanism. +that ship with Quarto are covered by Plan 3 for the +pipeline-determinism property only (`pipeline(x)` twice, same source, +hash-compare). The round-trip property +(`pipeline(write(pipeline(x))) == pipeline(x)`) is **not** verified +for built-ins anywhere in the epic. This gap is accepted in v1 +because: + +1. The built-in Lua filter universe is one filter today + (`video-filter.lua`); its idempotence is easy to read from source. +2. Round-trip is exercised in production by Plan 7's incremental + writer; a non-idempotent built-in would surface as user-visible + text drift, which we'd find via dogfooding before Plan 7 ships. +3. Extending Plan 7a's runtime check to also fire for built-in + filters is a small change to `FilterMetadata` filtering (a + `Vec<FilterMetadata>::iter()` predicate), tracked as a follow-up if the gap + bites. 
+ +User filters can't be statically analyzed for idempotence +(uncomputable for arbitrary Lua), so the runtime check via +double-pass-and-hash is the available mechanism. The opt-out (`idempotent: false`) gives users intentional escape — a timestamp-emitting filter can declare itself non-idempotent and silence diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md b/claude-notes/plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md index 42edae14d..ca4ba72ea 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md @@ -1,41 +1,69 @@ # Plan 8 — Include round-trip via IncludeExpansion CustomNode -**Date:** 2026-05-04 +**Date:** 2026-05-04 (revised 2026-05-20) **Branch:** feature/q2-preview **Status:** Implementation plan (open questions named) **Milestone:** M4 (documents with `{{< include >}}` are no longer read-only; edits outside includes round-trip cleanly; edits inside are prohibited) +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 8 is the last plan in +the epic: it lights up include round-trip via a `CustomNode` wrapper +that consumes the atomic-detection infrastructure Plan 7 ships. The +file name keeps its q2-preview-plan-N form for continuity with the +earlier discussion notes. + ## Goal -Modify `IncludeExpansionStage` to wrap each include's expanded blocks in a -`CustomNode("IncludeExpansion")` whose `source_info` points to the include -shortcode token in the parent file. This gives the incremental writer an -anchor for the include token's source bytes — round-trip preserves -`{{< include foo.qmd >}}` verbatim when the user doesn't touch it. +Modify `IncludeExpansionStage` to wrap each include's expanded blocks in +a `CustomNode("IncludeExpansion")` whose `source_info` is the original +`{{< include foo.qmd >}}` Paragraph's `source_info` (`Original` pointing +at the parent file's include-token bytes). 
This gives the incremental +writer an anchor for the include token's source bytes — round-trip +preserves `{{< include foo.qmd >}}` verbatim when the user doesn't touch +it. This plan also adds the qmd-writer arm for `CustomNode("IncludeExpansion")` and the React component (transparent passthrough that doesn't propagate -`setLocalAst` to slot children — shipped here, not in Plan 2C; see Plan 2C's -2026-05-10 third-pass amendment for the deferral rationale). The writer's -atomic-violation logic from Plan 7 enforces the "edits inside an include -are prohibited" contract — `IncludeExpansion` is registered in -`is_atomic_custom_node`. +`setLocalAst` to slot children — shipped here, not in Plan 2C; see Plan +2C's 2026-05-10 third-pass amendment for the deferral rationale). +The writer's atomic-violation logic from Plan 7 enforces the "edits +inside an include are prohibited" contract — `IncludeExpansion` is +registered in `is_atomic_custom_node`. When this plan lands, M4 is reached: documents with includes are -fully-functional in q2-preview's read+edit mode (with edits outside includes -round-tripping; edits inside surfacing as diagnostics). +fully-functional in q2-preview's read+edit mode (with edits outside +includes round-tripping; edits inside surfacing as Q-3-43 diagnostics). ## Scope ### In scope -- Modify `IncludeExpansionStage` (`crates/quarto-core/src/stage/stages/include_expansion.rs`) - to wrap inserted blocks in a `Block::Custom(CustomNode { type_name: +- Modify `IncludeExpansionStage` + (`crates/quarto-core/src/stage/stages/include_expansion.rs`) to wrap + inserted blocks in a `Block::Custom(CustomNode { type_name: "IncludeExpansion", … })` instead of splicing them flat. The wrapper's - `source_info` is the original Paragraph's source_info (the include + `source_info` is the original Paragraph's `source_info` (the include shortcode token's range in the parent file). `plain_data` carries `{ "source_path": "", "atomic": true }`. 
+- **The wrapper's `source_info` stays `Original`, NOT `Generated`** — + see "Why the wrapper is Original" below. + + This is also what keeps `IncludeExpansion` from being a + *transparent wrapper* in the sense of + [`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md). + The writer's descent helpers (`derive_target_file_id`, + `first_target_anchored_start_in`) stop at the wrapper and read + *its* `Original` source_info — which is exactly right: the + include-token bytes live in the parent qmd, that's where the + metadata region and the file id come from. If a future variant + ever emits an `IncludeExpansion` with `Generated` source_info + at the top of a parent document, descent would skip into the + *child* qmd's bytes and the parent's frontmatter would + silently mis-extract — add a debug-assert in + `IncludeExpansionStage` that the wrapper's `root_file_id()` + matches the parent. - Update the qmd writer (`pampa/src/writers/qmd.rs` CustomNode arm) to handle `"IncludeExpansion"`. Two paths: - **Verbatim path** (KeepBefore in Plan 7's coarsen): nothing to do — @@ -47,35 +75,42 @@ round-tripping; edits inside surfacing as diagnostics). The arm reads `plain_data` only — it does NOT inspect `source_info`, so it works identically for pipeline-emitted wrappers (Original source_info pointing at the parent file's include token) and - user-constructed wrappers (Synthetic { by: user_edit } source_info - from React). This is the path that fires when the user replaces or - adds an include via a React UI. + user-constructed wrappers (`Generated { by: user_edit, from: [] }` + source_info from React). This is the path that fires when the user + replaces or adds an include via a React UI. - **Unreachable path** (RecurseIntoContainer on atomic with inner changes): under Plan 7's soft-drop semantics, coarsen substitutes KeepBefore for this case before the qmd writer ever sees it. 
The arm includes `unreachable!("coarsen should have substituted KeepBefore for atomic CustomNode in RecurseIntoContainer; this branch indicates a coarsen bug")` as a debug assertion. +- Add an `IncludeExpansionResolveTransform` to the **Normalization + Phase** (symmetric with `CalloutResolveTransform`), running in the + HTML pipeline only (not q2-preview). Unwraps + `CustomNode("IncludeExpansion")` back into flat blocks for the HTML + writer to handle generically. See "HTML pipeline resolve transform" + below. - Add a React component for `IncludeExpansion` at - `hub-client/src/components/render/q2-preview/custom/IncludeExpansion.tsx` - (q2-preview's built-in custom-node registry, post-2pre / 2B / 2C). Plan - 2C deferred the placeholder per its third-pass amendment (2026-05-10): - until Plan 8 lands the AST node, `Fallback.tsx` covers the unknown - `type_name` gracefully, and Plan 8 ships the real component together - with the AST node and the `atomicCustomNodes.ts` addition: + `ts-packages/preview-renderer/src/q2-preview/custom/IncludeExpansion.tsx` + (q2-preview's built-in custom-node registry, post-2pre / 2B / 2C). + Plan 2C deferred the placeholder per its third-pass amendment + (2026-05-10): until Plan 8 lands the AST node, `Fallback.tsx` covers + the unknown `type_name` gracefully, and Plan 8 ships the real + component together with the AST node and the `atomicCustomNodes.ts` + addition: - Transparent passthrough: render the content slot's blocks normally. - Read-only: do not pass `setLocalAst` to slot children (enforced via - the framework's atomic-aware dispatcher in `framework/dispatchers.tsx` + the framework's atomic-aware dispatcher in `framework/dispatch.tsx` reading - `hub-client/src/utils/atomicCustomNodes.ts`). + `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts`). - Visual indicator (optional): subtle background tint or hover badge "from foo.qmd". 
- Register `"IncludeExpansion"` in **both** sides of the atomic registry: Rust `ATOMIC_CUSTOM_NODES` const (Plan 7 introduces the const + `is_atomic_custom_node()` function) and TypeScript hand-mirror - `hub-client/src/utils/atomicCustomNodes.ts` (Plan 2A introduces the - file with the initial `["CrossrefResolvedRef"]` set). Plan 8 amends - both to add `"IncludeExpansion"`. + `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` (Plan + 2A introduces the file with the initial `["CrossrefResolvedRef"]` + set). Plan 8 amends both to add `"IncludeExpansion"`. - Tests covering: - Untouched include: round-trip preserves `{{< include foo.qmd >}}`. - Edit outside include: that paragraph rewrites; include token preserved. @@ -99,6 +134,11 @@ round-tripping; edits inside surfacing as diagnostics). - Resolving include shortcodes outside the standard `Paragraph[Shortcode("include")]` form (current behavior preserved — only top-level paragraph-form includes are handled). +- Attributing the include line in HTML rendering. The + `IncludeExpansionResolveTransform` unwraps the wrapper before + `AttributionRenderTransform` runs, so HTML output has no DOM anchor + for the include-line author. See "HTML attribution" below — this is + intentional v1 behavior. ## Design decisions (settled in conversation) @@ -114,22 +154,154 @@ round-tripping; edits inside surfacing as diagnostics). synthesized from the wrapper's slot children into the parent file, because those bytes' origin is the included file, not parent.qmd source nor user input. -- **Source_info on the wrapper points to the original Paragraph**, not to - the inner Shortcode. The Paragraph's range covers the whole `{{< include >}}` - line (including any whitespace/newline padding); the Shortcode's covers - just the token. Paragraph gives a cleaner verbatim copy. -- **Nested includes produce nested wrappers naturally**. 
When the include - expansion processes a child file that itself has includes, recursion - produces nested CustomNode wrappers. Each wrapper anchors at its own - parent-file include token. Round-trip semantics compose: untouched at any - level → preserved; touched at any level → atomic-violation at the deepest - affected wrapper. +- **Source_info on the wrapper points to the original Paragraph**, not + to the inner Shortcode. The Paragraph's range covers the whole + `{{< include >}}` line (including any whitespace/newline padding); + the Shortcode's covers just the token. Paragraph gives a cleaner + verbatim copy. +- **Nested includes produce nested wrappers naturally**. When the + include expansion processes a child file that itself has includes, + recursion produces nested CustomNode wrappers. Each wrapper anchors + at its own parent-file include token. Round-trip semantics compose: + untouched at any level → preserved; touched at any level → + atomic-violation at the deepest affected wrapper. - **React component is read-only** (Plan 8 ships the per-type IncludeExpansion component, deferred from Plan 2C per its third-pass - amendment; Plan 2B's framework atomic gate enforces read-only behavior; - Plan 2A introduces the atomic-registry hand-mirror). The IncludeExpansion - component does not pass `setLocalAst` to children. This is the primary - enforcement; the writer's atomic-violation is the contract guarantor. + amendment; Plan 2A's framework atomic gate enforces read-only + behavior). The IncludeExpansion component does not pass + `setLocalAst` to children. This is the primary enforcement; the + writer's atomic-violation is the contract guarantor. +- **Render-side resolve, not writer arm.** The HTML writer stays + generic — it doesn't grow knowledge of `IncludeExpansion`. 
The + `IncludeExpansionResolveTransform` unwraps in the Normalization Phase + (symmetric with `CalloutResolveTransform`), and the unwrapped blocks + flow through the rest of the HTML pipeline normally. This preserves + the Pandoc / Quarto convention of "resolve to standard AST before + writers" — see "Considered alternatives" below. + +## Why the wrapper is Original + +The wrapper's `source_info` is `Original`, inherited from the original +Paragraph it substitutes for. This may look inconsistent with Plan 6's +audit (which puts other transform-synthesized wrappers like Sectionize +into `Generated`), but it follows a principled rule: + +**Two pieces of provenance information** need to land somewhere when a +transform synthesizes a node: + +1. **Generator identity** — "which transform produced me." +2. **Source anchor** — "which source bytes are this node's canonical preimage." + +For non-CustomNode synthesized nodes (Sectionize's Section Div, +filter-constructed Str, footnotes container Div), there's no other slot +for (1), so `source_info` carries both via `Generated { by, from }`. + +For CustomNode synthesized nodes, (1) is **already encoded** in +`CustomNode.type_name`. The wrapper *is* an `IncludeExpansion` by +virtue of `type_name`; there's no need for `source_info` to also say +"I was made by IncludeExpansionStage." So `source_info` only has to do +(2) — and the natural shape for (2) when the wrapper substitutes 1:1 +for a source-mapped parser node is the inherited `Original`. + +This isn't a Plan 8 invention — it's the existing pattern for every +source-mapped CustomNode in the codebase: + +| CustomNode `type_name` | Source-mapped from | `source_info` shape | +|---|---|---| +| `Callout` | `:::{.callout-warning} … :::` Div | Original (inherited) | +| `Theorem` / `Proof` / etc. 
| `:::{.theorem #thm-foo} … :::` Div | Original (inherited) | +| `CrossrefResolvedRef` | `@thm-foo` Cite | Original (inherited) | +| `FloatRefTarget` | Figure / table / listing Div | Original (inherited) | +| `IncludeExpansion` (Plan 8) | `{{< include foo.qmd >}}` Paragraph | Original (inherited) | + +In contrast, Sectionize's Section Div is NOT a CustomNode (it's a +plain Div) AND it doesn't 1:1-substitute for a source-mapped parser +node (it's a structural grouping over a Header + its body). So its +`source_info` has to carry generator identity via `Generated { by: +sectionize, from: [] }`. + +**The rule, in one sentence**: a synthesized node uses **Original** +`source_info` if and only if it is a CustomNode whose 1:1 source +preimage is a parser-emitted node. Everything else uses **Generated**. + +See Plan 4's "Original vs Generated on synthesized nodes" section for +the full taxonomy. + +## HTML pipeline resolve transform + +The wrapper change applies to the `IncludeExpansionStage`, which runs +in BOTH the HTML and q2-preview pipelines. For HTML output, the +wrapper would otherwise reach the HTML writer with no native rendering +arm for `IncludeExpansion`. The cleanest fix: + +Add `IncludeExpansionResolveTransform` that runs ONLY in the HTML +pipeline (not q2-preview, where the React `IncludeExpansion` component +handles rendering directly). Unwraps `CustomNode("IncludeExpansion")` +back into flat blocks — the slot's `content` Blocks become siblings of +the surrounding content. The HTML writer then processes the flat +blocks generically. + +**Placement**: Normalization Phase, symmetric with +`CalloutResolveTransform` (`crates/quarto-core/src/pipeline.rs:988`). +Like Callout, the resolve fires early so the rest of the pipeline sees +standard AST. `Q2_PREVIEW_TRANSFORM_EXCLUDED` lists +`"callout-resolve"`; add `"include-expansion-resolve"` to that list so +q2-preview keeps the wrappers for React rendering. 
+ +## HTML attribution + +When the resolve transform unwraps the wrapper, the wrapper's +`source_info` (Original pointing at the parent's include token) is +gone before `AttributionRenderTransform` runs at the tail of the +Finalization Phase. Consequences: + +- The unwrapped included blocks have `file_id != 0` (foo.qmd's + FileId). `query_attribution` skips them per the v1 single-doc + invariant. **No attribution on included content in HTML.** +- The include-line author has no node to be attributed against. The + Paragraph that previously held `{{< include foo.qmd >}}` was deleted + by `IncludeExpansionStage`. **No attribution on the include line in + HTML.** + +This matches what current main produces (without Plan 8, the include +line and its content are also un-attributed in HTML output), so it's +not a regression. It's *intentional* v1 behavior: in the rendered HTML, +there's no DOM element that represents "the include line" — those +source bytes don't appear in the rendered output. Attributing them +would require synthesizing a wrapping HTML element, which is +inconsistent with the "resolve to standard AST" convention. + +**q2-preview attributes the include line correctly.** q2-preview +excludes the resolve transform, so the wrapper survives all the way to +JSON serialization and React. `AttributionRenderTransform` visits the +wrapper, resolves its `Original` source_info to a byte range, and +records the include-line author via the existing `query_byte_range` +max-time logic. The React `IncludeExpansion` component receives the +attribution record and surfaces it as the authorship pill on the +wrapper region. + +When v2 multi-file blame lands (`crates/quarto-core/src/attribution/types.rs:58` +flags this as v1-only), the unwrapped HTML children gain attribution +from foo.qmd's blame. The HTML include-line itself remains +un-attributed because there's still no DOM anchor — that's a structural +property of HTML output, not a v1 limitation we plan to remove. 
+ +## Considered alternatives + +**Option C — render `IncludeExpansion` natively in the HTML writer.** +Investigated during the 2026-05-20 design discussion. Cleaner for v2 +attribution (the wrapper would survive to the HTML writer, which could +emit a `<div>
` with the include-line author's +`data-attr-*`). Rejected because it breaks the Pandoc / Quarto +convention of resolving CustomNodes to standard AST before writers see +them. The convention is load-bearing: it lets each new output format +(future Typst, future PDF) work generically without growing +CustomNode-specific arms. + +The decision is recoverable if needed — the type definitions and +wrapper shape don't change. Switching to native rendering later means +dropping the resolve transform and adding writer arms; it doesn't +require revising Plan 8's type design. ## The wrapper structure @@ -145,7 +317,7 @@ Block::Custom(CustomNode { "atomic": true, }), attr: ("".to_string(), vec![], LinkedHashMap::new()), - source_info: paragraph.source_info.clone(), // include token's parent-file bytes + source_info: paragraph.source_info.clone(), // Original{parent, include_token_range} }) ``` @@ -163,19 +335,19 @@ walk — same visual outcome, just no per-type styling). - Both pipeline runs (live and baseline) produce identical wrappers. - Reconciler picks `KeepBefore` for the wrapper. - Plan 7's coarsen sees `is_atomic_custom_node("IncludeExpansion") == true` - → goes the Verbatim path because `preimage_in(parent_file)` returns the - include token's byte range (the wrapper's source_info is `Original{parent, - start, end}`). + → goes the Verbatim path because `preimage_in(parent_file)` returns + the include token's byte range (the wrapper's source_info is + `Original{parent, start, end}`). - `assemble` copies `original_qmd[start..end]` — the literal `{{< include foo.qmd >}}` text. ✓ Source preserved. **Case 2 — edit outside include, untouched include in same doc**: -- Reconciler's plan has `KeepBefore` for the include wrapper, mixed alignments - for other blocks. -- The include wrapper goes through the Verbatim path (case 1 above). Other - blocks are handled per their own alignment. The include token in source - is preserved verbatim. Edit outside is rewritten. 
+- Reconciler's plan has `KeepBefore` for the include wrapper, mixed + alignments for other blocks. +- The include wrapper goes through the Verbatim path (case 1 above). + Other blocks are handled per their own alignment. The include token + in source is preserved verbatim. Edit outside is rewritten. **Case 3 — edit inside the include (somehow)**: @@ -192,7 +364,8 @@ walk — same visual outcome, just no per-type styling). the include's `source_path` from `plain_data`: "Edit inside `{{< include foo.qmd >}}` was not saved. To edit this content, open foo.qmd directly." Save **succeeded** (other edits applied); - warning surfaces in the diagnostic panel. + warning surfaces in the diagnostic panel (hub-client) or + DiagnosticStrip (SPA). **Case 3b — user replaces or deletes the include via React**: @@ -204,10 +377,10 @@ walk — same visual outcome, just no per-type styling). qmd writer; the include is gone from output. - If the user replaced the include with a fresh IncludeExpansion (e.g., changed `foo.qmd` to `bar.qmd` via a hypothetical UI), the new - wrapper has Synthetic { by: user_edit } source_info and - `plain_data["source_path"] = "bar.qmd"`. The qmd writer's arm reads - `plain_data` and emits `{{< include bar.qmd >}}`. No warning — the - user's intent is clear. + wrapper has `Generated { by: user_edit, from: [] }` source_info + and `plain_data["source_path"] = "bar.qmd"`. The qmd writer's arm + reads `plain_data` and emits `{{< include bar.qmd >}}`. No warning + — the user's intent is clear. **Case 4 — nested includes**: @@ -227,50 +400,65 @@ walk — same visual outcome, just no per-type styling). ... ] ``` -- The outer wrapper's source_info points to parent.qmd's bytes. The inner - wrapper's source_info points to foo.qmd's bytes (via the FileId remap). -- Round-trip in parent.qmd: outer's `preimage_in(parent_file)` returns the - parent's include token range. Verbatim copy preserves - `{{< include foo.qmd >}}` in parent.qmd. 
The inner wrapper's bytes never - get serialized because the outer's Verbatim wins. +- The outer wrapper's source_info is Original pointing at parent.qmd's + bytes. The inner wrapper's source_info is Original pointing at + foo.qmd's bytes (via the FileId remap). +- Round-trip in parent.qmd: outer's `preimage_in(parent_file)` returns + the parent's include token range. Verbatim copy preserves + `{{< include foo.qmd >}}` in parent.qmd. The inner wrapper's bytes + never get serialized because the outer's Verbatim wins. ## Open questions for implementation -- **`source_path` accuracy**: the literal arg from the shortcode (`"foo.qmd"`) - is what we re-emit on save. Plan 7's Verbatim copy path doesn't use it - (we copy bytes), but the Rewrite path (let-user-win for fresh - user-constructed IncludeExpansion) does. Make sure the - IncludeExpansionStage stores the literal arg verbatim — including - any whitespace or quoting the user typed — so a round-trip through - React preserves the user's syntactic choices when possible. -- **Recorded includes side-channel**: today's `IncludeExpansionStage` writes - to `doc.recorded_includes` for cache invalidation. The wrapper change - shouldn't affect this — confirm. -- **`extract_include_path` recognition**: today the function recognizes a - Paragraph containing exactly one include Shortcode inline. After the - wrapper change, the structure is unchanged at that recognition point - (the wrapper is built from the recognized Paragraph). The recognition - logic continues to work. +- **`source_path` accuracy**: the literal arg from the shortcode + (`"foo.qmd"`) is what we re-emit on save. Plan 7's Verbatim copy path + doesn't use it (we copy bytes), but the Rewrite path + (let-user-win for fresh user-constructed IncludeExpansion) does. 
+ Make sure the IncludeExpansionStage stores the literal arg verbatim + — including any whitespace or quoting the user typed — so a + round-trip through React preserves the user's syntactic choices when + possible. +- **Recorded includes side-channel**: today's `IncludeExpansionStage` + writes to `doc.recorded_includes` for cache invalidation. The wrapper + change shouldn't affect this — confirm. +- **`extract_include_path` recognition**: today the function recognizes + a Paragraph containing exactly one include Shortcode inline. After + the wrapper change, the structure is unchanged at that recognition + point (the wrapper is built from the recognized Paragraph). The + recognition logic continues to work. ## References -- `crates/quarto-core/src/stage/stages/include_expansion.rs:80-278` — the - stage implementation. The splicing logic at lines 215-220 is what gets - replaced. +- `crates/quarto-core/src/stage/stages/include_expansion.rs:80-278` — + the stage implementation. The splicing logic at lines 215-220 is + what gets replaced with wrapper construction. - `crates/quarto-pandoc-types/src/custom.rs` — CustomNode struct. - `crates/pampa/src/writers/qmd.rs` — qmd writer's CustomNode arm (existing for Callout etc. — extend with IncludeExpansion). -- Plan 6 — provenance audit. Sets the precedent for "preserve source info - on transform output." (Plan 6 uses Derived for shortcodes; Plan 8 uses - the wrapper-CustomNode pattern for includes, since cross-file FileId - prevents Derived from working.) +- `crates/quarto-core/src/transforms/callout_resolve.rs` — pattern to + mirror for `IncludeExpansionResolveTransform`. Note: Callout's + resolve runs in the Normalization Phase + (`crates/quarto-core/src/pipeline.rs:988`), not the Finalization + Phase. Plan 8's resolve runs at the same point in the HTML + pipeline. 
+- `crates/quarto-core/src/pipeline.rs:1181` — + `Q2_PREVIEW_TRANSFORM_EXCLUDED` const; add + `"include-expansion-resolve"` to skip the unwrap in the q2-preview + pipeline. +- Plan 6 — provenance audit. Sets the precedent for "preserve source + info on transform output." Plan 6 uses `Generated` with + `Invocation` anchors for shortcodes; Plan 8 uses the + wrapper-CustomNode pattern for includes, since cross-file FileId + prevents shortcode-style anchoring from working. - Plan 7 — coarsen logic (Verbatim, Transparent, Omit, soft-drop substitutions, is_atomic_custom_node registry). -- Plan 2A — `hub-client/src/utils/atomicCustomNodes.ts` (the JS-side - atomic registry that Plan 8 amends to add `"IncludeExpansion"`). -- Plan 2B — framework recursion + atomic gate that the IncludeExpansion - component runs through; CustomNode unwrap/rewrap walks that produce - the JS-native shape Plan 2C's component consumes. +- Plan 2A — `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` + (the JS-side atomic registry that Plan 8 amends to add + `"IncludeExpansion"`). +- Plan 2B — framework recursion + atomic gate that the + IncludeExpansion component runs through; CustomNode unwrap/rewrap + walks that produce the JS-native shape Plan 2C's component + consumes. - Plan 2C — React component infrastructure (registers IncludeExpansion component as a transparent read-only passthrough; Plan 2C already ships the placeholder component as dormant wiring before Plan 8 @@ -280,9 +468,9 @@ walk — same visual outcome, just no per-type styling). - **Untouched-include round-trip**: parse a parent.qmd with an include, run pipeline, write back without modification, assert byte-equal. -- **Edit-outside round-trip**: edit a paragraph outside the include in the - AST, write back, assert the include token is byte-equal-preserved and - the edited paragraph is rewritten. 
+- **Edit-outside round-trip**: edit a paragraph outside the include in + the AST, write back, assert the include token is byte-equal-preserved + and the edited paragraph is rewritten. - **Edit-inside soft-drop**: programmatically modify a Str inside the IncludeExpansion's content slot (bypass the React layer), call `incremental_write_qmd_for_preview`, assert the result is `Ok` with @@ -292,68 +480,93 @@ walk — same visual outcome, just no per-type styling). IncludeExpansion with a fresh user-constructed IncludeExpansion (new source_path), call the writer, assert the output contains `{{< include >}}` with no warning. The qmd writer's - CustomNode arm hit the Rewrite path with Synthetic { by: user_edit } - source_info and read `plain_data["source_path"]`. + CustomNode arm hit the Rewrite path with `Generated { by: user_edit, + from: [] }` source_info and read `plain_data["source_path"]`. - **Delete-include let-user-win**: replace an IncludeExpansion with a Para in the new AST, call the writer, assert the include token is gone from output and the Para's text appears in its place. No warning. -- **Nested includes round-trip**: parent → foo → bar. Untouched: all three - preserved. Edit inside bar: `Q-3-43` warning with bar's wrapper source - range; the inner edit is reverted via Plan 7's soft-drop, parent.qmd - byte-equal to no-op edit. -- **Plan 2C component test**: render an IncludeExpansion wrapper; assert - setLocalAst is not propagated to children (no edit affordance). +- **Nested includes round-trip**: parent → foo → bar. Untouched: all + three preserved. Edit inside bar: `Q-3-43` warning with bar's + wrapper source range; the inner edit is reverted via Plan 7's + soft-drop, parent.qmd byte-equal to no-op edit. +- **HTML pipeline resolve test**: render a doc with an include through + the HTML pipeline; assert the resulting HTML contains the included + content flat (not wrapped in a `<div>
` + or similar) — the resolve transform unwrapped it before the HTML + writer ran. +- **q2-preview pipeline preservation test**: render the same doc + through the q2-preview pipeline; assert the resulting AST contains + the IncludeExpansion CustomNode wrapper (not unwrapped). The JSON + writer emits it; the React component renders it. +- **q2-preview attribution test**: with a `PreBuiltAttributionProvider` + installed, render a doc with an include through q2-preview; assert + the wrapper's `astContext.attribution` record references the + include-line author (the latest author of the parent's include line + bytes). HTML output of the same doc has no attribution on the + include line (intentional v1 behavior). +- **Plan 2C component test**: render an IncludeExpansion wrapper; + assert setLocalAst is not propagated to children (no edit + affordance). - **Idempotence**: re-run Plan 3's idempotence test with includes. The wrapper should be deterministic across runs. +- **Shortcode-inside-include provenance shape** (cross-reference to + Plan 6): Plan 6 owns the test that asserts a shortcode resolving + inside include-spliced content gets an Invocation anchor with + `file_id != 0` (pointing into the included file, not the parent). + Plan 8's wrapper carries the parent-file `Original` independently; + the two anchors compose correctly. Plan 8's tests above exercise + the wrapper round-trip in isolation; the composition shape lives + in Plan 6's §Test plan ("Shortcode-inside-include composition + test"). +- **Cross-file Invocation in resolve transform**: after the + `IncludeExpansionResolveTransform` (Plan 8) unwraps the wrapper, + any shortcode-resolved Generated children retain + `Invocation -> Original{foo.qmd, ...}` source_info. The unwrapped + HTML pipeline sees these children with `file_id != 0`, and + `query_attribution` skips them per the v1 single-doc invariant + (matching §HTML attribution above). 
Round-trip-equivalent to a + fixture without includes: the HTML writer doesn't care about + source_info; it just renders the nodes. Regression test: render + parent + foo (foo contains `{{< meta title >}}`) through the HTML + pipeline, grep for the resolved title in the output, assert the + rendered text matches `meta.title` from foo.qmd's metadata. ## Dependencies -- Depends on: Plans 4, 6, 7 (Synthetic types not strictly needed since the - wrapper uses Original; the audit pattern; the writer's atomic logic). -- Plan 2C also depends on this for the IncludeExpansion component (which - Plan 8 confirms is needed; Plan 2C ships the placeholder dormant). +- Depends on: Plans 4, 6, 7 (Generated types not strictly needed for + the wrapper's source_info since it stays Original; the audit pattern + for what kinds of nodes get Generated vs Original; the writer's + atomic logic). +- Plan 2C also depends on this for the IncludeExpansion component + (which Plan 8 confirms is needed; Plan 2C ships the placeholder + dormant). - Final plan in the sequence; nothing depends on it. ## Risk areas -- **The include's wrapper source_info uses the *parent* file's FileId**. - The included blocks inside the slot have a *different* FileId. Plan 7's - `preimage_in(parent_file)` correctly returns None for the children - (because their FileId differs). This is the intended behavior — children - contribute nothing to the verbatim-copy path; only the wrapper does. - Confirm by walking through the test cases. -- **Existing tests for `IncludeExpansionStage`**: the existing tests assert - the spliced-flat behavior (e.g., `assert_eq!(doc.ast.blocks.len(), 2)` - after expanding one include). Update these tests for the wrapper - behavior (`assert_eq!(doc.ast.blocks.len(), 1)` — the Paragraph is - replaced by one wrapper). -- **The `recorded_includes` side-channel**: existing pipeline-cache logic - reads this. 
The wrapper change shouldn't affect it because we still call - `record_include` at the same point. Confirm during implementation. -- **Existing HTML pipeline tests**: the wrapper change applies to the HTML - pipeline too (we're modifying `IncludeExpansionStage`, which runs in - both HTML and q2-preview pipelines). For HTML output, the wrapper - passes through subsequent transforms unchanged and gets serialized to - HTML by `RenderHtmlBodyStage`. Confirm the HTML writer's CustomNode - arm handles `"IncludeExpansion"` (or that we don't need it because the - HTML pipeline runs `CrossrefRenderTransform` etc. that don't touch this - type — but ensure HTML output looks right). - - Actually: the HTML pipeline doesn't have a transform that materializes - IncludeExpansion. So the HTML writer SEES the wrapper at render time. - The simplest fix: make the HTML writer's CustomNode arm transparently - render the slot content (effectively materializing into HTML, which is - the right thing for HTML output). Or: add a render-side resolve transform - for IncludeExpansion that runs only in the HTML pipeline. - - This is the one significant complication. Worth investigating during - implementation. The cleanest answer is probably: a small render-side - transform `IncludeExpansionResolveTransform` that runs ONLY in the HTML - pipeline (not q2-preview), unwraps `CustomNode("IncludeExpansion")` - back into flat blocks for the HTML writer to handle normally. - - Symmetric with `CalloutResolveTransform`. Same shape. +- **The include's wrapper source_info uses the *parent* file's + FileId**. The included blocks inside the slot have a *different* + FileId. Plan 7's `preimage_in(parent_file)` correctly returns None + for the children (because their FileId differs). This is the intended + behavior — children contribute nothing to the verbatim-copy path; + only the wrapper does. Confirm by walking through the test cases. 
+- **Existing tests for `IncludeExpansionStage`**: the existing tests + assert the spliced-flat behavior (e.g., + `assert_eq!(doc.ast.blocks.len(), 2)` after expanding one include). + Update these tests for the wrapper behavior + (`assert_eq!(doc.ast.blocks.len(), 1)` — the Paragraph is replaced + by one wrapper). +- **The `recorded_includes` side-channel**: existing pipeline-cache + logic reads this. The wrapper change shouldn't affect it because we + still call `record_include` at the same point. Confirm during + implementation. +- **Existing HTML pipeline tests**: the wrapper change applies to the + HTML pipeline too (we're modifying `IncludeExpansionStage`, which + runs in both HTML and q2-preview pipelines). The + `IncludeExpansionResolveTransform` in the Normalization Phase + unwraps before the HTML writer sees it, so HTML output is + byte-equivalent to current main. Verify with a regression test. - **Extension-registration forward-compat**: Plan 8 adds `IncludeExpansion` to the hardcoded `pub const ATOMIC_CUSTOM_NODES` set in `quarto-core`. After the future extension-registration @@ -371,32 +584,37 @@ walk — same visual outcome, just no per-type styling). 
|---|---| | IncludeExpansionStage modification (wrap inserted blocks) | ~40 | | qmd writer arm for IncludeExpansion (atomic) | ~30 | -| HTML pipeline resolve transform (unwrap before HTML writer) | ~50 | -| `is_atomic_custom_node` registration | ~5 | +| `IncludeExpansionResolveTransform` (Normalization Phase, HTML only) | ~50 | +| Adding `"include-expansion-resolve"` to `Q2_PREVIEW_TRANSFORM_EXCLUDED` | ~5 | +| `is_atomic_custom_node` registration (Rust + TS hand-mirror) | ~10 | | React component (transparent passthrough, read-only) | ~30 | | Test updates for existing IncludeExpansionStage tests | ~50 | -| New round-trip tests | ~200 | -| **Total** | **~405** | +| New round-trip tests (untouched, edit-outside, soft-drop, replace, nested, HTML, attribution) | ~250 | +| **Total** | **~465** | -Two focused sessions likely. The HTML pipeline resolve transform is the -piece I didn't fully account for in my earlier estimates — confirm scope -during implementation kickoff. +Two focused sessions likely. ## Notes +The wrapper-CustomNode pattern is the right shape for includes because +the included content lives in a *different file* than the parent. +Their source_info points into foo.qmd, not parent.qmd. There's no +`Generated`-with-`Invocation`-anchor chain that can connect those +blocks back to the parent file's include token bytes (the anchor's +chain would need to resolve into the target file, and foo.qmd is a +different FileId). So we need a wrapper at the parent-file level whose +`source_info` is `Original{parent_file, include_token_range}` to serve +as the writer's anchor. That's what `CustomNode("IncludeExpansion")` +provides. + +Shortcodes (Plan 6) don't have this issue (they resolve in the same +file) which is why they use `Generated { by: shortcode, from: [Invocation -> ...] }` +instead of a wrapper. The genuine cross-file case is the only one that +warrants the wrapper. 
+ The HTML-pipeline-resolve-transform finding is the kind of thing the -research plan exists to surface. The wrapper change has implications for -HTML output that aren't immediately visible from the q2-preview-only lens. -Plan 8's research plan should make this explicit so that the -implementation session doesn't get blindsided. - -Why a wrapper for includes (different from shortcodes): includes pull in -content from a *different file*. The included blocks have a different -FileId than the parent file. Their source_info points into foo.qmd, not -parent.qmd. There's no `Derived` chain that can connect those blocks -back to the parent file's include token bytes — Derived requires a `from` -that resolves into the target file. So we need a wrapper at the parent-file -level whose source_info is `Original{parent_file, include_token_range}` to -serve as the writer's anchor. That's what `CustomNode("IncludeExpansion")` -provides. Shortcodes don't have this issue (they resolve in the same file) -which is why they use Derived (Plan 6) instead of a wrapper. +design discussion exists to surface. The wrapper change has +implications for HTML output that aren't immediately visible from the +q2-preview-only lens. Plan 8's implementation kickoff should land the +resolve transform alongside the wrapper change to keep HTML +byte-equivalent across the transition. diff --git a/claude-notes/plans/2026-05-18-q2-preview-project-replay-engine.md b/claude-notes/plans/2026-05-18-q2-preview-project-replay-engine.md index 9941f1696..06aac08cd 100644 --- a/claude-notes/plans/2026-05-18-q2-preview-project-replay-engine.md +++ b/claude-notes/plans/2026-05-18-q2-preview-project-replay-engine.md @@ -189,6 +189,24 @@ class="{r}">` in the iframe DOM). The engine-name check is against the `engine_name` field of the capture itself, so we only attempt to splice cells belonging to the captured engine. 
+### Pre-engine timing — why a flat walk is safe + +Both `derive_cell_outputs(A1, B1)` and `splice(A2, output_map)` +above iterate `.blocks` flatly. This is correct **only because the +splice runs at the pre-engine checkpoint** — strictly before +`SectionizeTransform` (and the other "sugar phase" synthesizers) +add the top-level transparent wrapper Div that the writer learned +about the hard way in commits `bdcfdc53` / `b9f64b56` / `2bf92664`. +At the pre-engine checkpoint, `A2.blocks[0]` is a real user block. + +If a future variant ever moves the splice point past the sugar +phase (or runs it on a post-pipeline AST for any other reason), +the flat walk would miss every cell inside the wrapper. Route the +walker through `first_in_user_tree` / a `visit_user_blocks` +sibling per +[`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md) +in that case. + ## Where the splice lives in the pipeline Two viable insertion points; the v1 picks the simpler: diff --git a/claude-notes/plans/2026-05-21-vfs-url-write-root-split.md b/claude-notes/plans/2026-05-21-vfs-url-write-root-split.md new file mode 100644 index 000000000..015bbce2f --- /dev/null +++ b/claude-notes/plans/2026-05-21-vfs-url-write-root-split.md @@ -0,0 +1,287 @@ +# Plan — Split `vfs_root` into write-root + url-root in `ResourceResolverContext` + +**Date:** 2026-05-21 +**Branch:** `beads/bd-rz2we-plan-3-q2-preview` → integrates into `feature/provenance` +**Status:** Implementation plan +**Beads:** bd-rz2we +**Blocks:** closing Plan 3 (q2-preview idempotence gate) + +## Goal + +Decouple two roles `ResourceResolverContext::vfs_root_mode` currently +plays as a single `PathBuf`: + +1. **Disk-write root** — where `runtime.file_write` / `OutputSink` + put artifacts (theme CSS, copied resources, site libs). +2. **URL prefix** — what gets embedded in HTML link / asset URLs. 
+ +In production WASM these are intentionally identical +(`"/.quarto/project-artifacts"` for both, a synthetic VFS path the +service worker serves from memory). On native test runs they have to +diverge: the write root has to be a real tempdir so the runtime can +actually write, but URLs must be path-independent so the AST is +idempotent across runs. + +## Why we can't defer this to a later plan + +Plan 3 locks in the idempotence + structural-hash-stability contract. +Right now `website_links` produces: + +```text +target: ("/private/var/folders/.../T/.tmpXXX/.quarto/project-artifacts/other.html", "") +``` + +Two runs in two tempdirs → two distinct URLs → block-hash divergence. + +Plans 4–8 (typed source-info, wire format, audit, incremental writer, +include round-trip) all assume Plan 3's gate is green on the +fixtures they care about. None of them name URL canonicalization in +scope. Unlike bd-3odjm (whose fix-owner is Plan 5 because Plan 5 +rewrites the wire format anyway), bd-rz2we has no natural fix-owner +downstream of Plan 3. Fixing it here is the right scope. + +It's also wrong-output, not just non-determinism. Any in-process +caller of `RenderToPreviewAstRenderer::new(real_disk_path)` (test +helpers today, anything else that wants to host the q2-preview +pipeline natively tomorrow) gets links whose URLs leak the host +machine's tempdir into the AST. The browser's iframe service worker +doesn't intercept `/private/var/...`, so those links would 404 if +served. + +## Where the bug lives (verified 2026-05-21) + +- `LinkRewriteTransform` calls + `resolve_doc_relative_href("other.qmd", "index.qmd", resolver, idx, …)` + which delegates to `resolver.page_url_for(profile.output_href)`. +- In **VFS-root mode**, `page_url_for` is just + `rel_to_url(&root.join(target))` where `root` is whatever was passed + to `ResourceResolverContext::vfs_root(...)` + (`crates/quarto-core/src/resource_resolver.rs:210-218`). 
No + relativization, no synthetic prefix — the URL is literally the + joined path. +- `RenderToPreviewAstRenderer` builds its per-doc resolver with + `ResourceResolverContext::vfs_root(self.vfs_root.clone())` + (`pass2_renderer.rs:661`). It also writes theme CSS to + `self.vfs_root.join("styles.css")` directly via + `runtime.file_write` (`pass2_renderer.rs:739`). +- WASM caller passes `"/.quarto/project-artifacts"` + (`wasm-quarto-hub-client/src/lib.rs:1512,1696,1786`) — synthetic + string, identity URL. +- Native test helpers pass `project.dir.join(".quarto/project-artifacts")` + (`tests/render_page_in_project.rs:80`, + `tests/idempotence.rs:243`) — real tempdir, leaks into URL. + +A naive fix in the test (pass `"/.quarto/project-artifacts"` for both +roles) fails because `runtime.file_write("/.quarto/project-artifacts/styles.css")` +hits the read-only root filesystem (verified empirically: `os error +30`). So the split must really be a split, not a single-arg switch. + +## Existing pinned contract + +`crates/quarto-core/src/project/website_post_render.rs:638-653`: +> On VFS-root mode the html_url is absolute (`/<vfs_root>/<p>`) +> and the on-disk path is the same with the leading `/` dropped. +> The browser fetches the URL and the hub-client serves from VFS +> at the matching synthetic path. + +This is a **WASM-only** invariant. After the split, the single-arg +`vfs_root(path)` constructor preserves it (write_root == url_root by +construction). The two-arg form intentionally breaks it (write to +tempdir, URL stays synthetic) — but only the native test helpers +take that form, so no production code is affected. + +## Design + +### Resolver field + +Replace the single `Option<PathBuf>` field with a small struct: + +```rust +struct VfsRootMode { + /// Absolute disk path. `runtime.file_write` and + /// `OutputSink::allowed_roots` use this. In WASM this is a + /// synthetic VFS path (the runtime serves it from memory); in + /// native tests it's a real tempdir subdirectory. + write_root: PathBuf, + /// URL prefix embedded in HTML links / asset srcs. In WASM this + /// matches `write_root`. In native tests it's a fixed synthetic + /// string (e.g. `/.quarto/project-artifacts`) so URLs don't + /// capture the host machine's tempdir. + url_root: String, +} +``` + +`page_url_for`, `html_url_for`, `page_url_for_site_root_dir` use +`url_root`; `on_disk_path_for` and `allowed_output_roots` use +`write_root`. `is_vfs_root_mode` is unchanged. + +### Resolver constructor + +Existing: +```rust +pub fn vfs_root(vfs_root: impl Into<PathBuf>) -> Self { … } +``` +keeps its signature and semantics. Internally it stores the path as +both `write_root` and `url_root` (via `to_string_lossy().replace('\\', '/')`). +Production WASM callers don't change. + +New constructor: +```rust +pub fn vfs_root_with_url_root( + write_root: impl Into<PathBuf>, + url_root: impl Into<String>, +) -> Self { … } +``` + +Native test helpers switch to this form. + +### Renderer side + +`RenderToPreviewAstRenderer` and `RenderToHtmlRenderer` each currently +hold a single `vfs_root: PathBuf` and pass it verbatim to the +resolver constructor + theme-CSS write.
Add: + +```rust +pub struct RenderToPreviewAstRenderer { + vfs_root: PathBuf, // unchanged — used for disk writes + vfs_url_root: Option<String>, // None → derive from vfs_root (today's behavior) + … +} + +impl RenderToPreviewAstRenderer { + pub fn with_url_root(mut self, url_root: impl Into<String>) -> Self { + self.vfs_url_root = Some(url_root.into()); + self + } + + fn build_resolver(&self) -> ResourceResolverContext { + match &self.vfs_url_root { + Some(url) => ResourceResolverContext::vfs_root_with_url_root( + self.vfs_root.clone(), url.clone(), + ), + None => ResourceResolverContext::vfs_root(self.vfs_root.clone()), + } + } +} +``` + +Same shape on `RenderToHtmlRenderer` for symmetry (its native callers +aren't currently testing URL determinism, but the API stays consistent +and the surface area is identical). + +The four `ResourceResolverContext::vfs_root(self.vfs_root.clone())` +call sites in `pass2_renderer.rs` (lines 437, 552, 661, 798) all +become `self.build_resolver()`. + +The theme-CSS write at `pass2_renderer.rs:739` keeps `self.vfs_root.join("styles.css")` +unchanged — that's the disk write, write_root is correct. + +### Test-helper updates + +`crates/quarto-core/tests/idempotence.rs:243`: +```rust +let vfs_root = project.dir.join(".quarto/project-artifacts"); +let renderer = RenderToPreviewAstRenderer::new(&vfs_root) + .with_url_root("/.quarto/project-artifacts"); +``` + +`crates/quarto-core/tests/render_page_in_project.rs:80-81` gets the +same treatment so the HTML-test path produces deterministic link +URLs too. (Not required by Plan 3, but matches the resolver-level +guarantee. Optional in this plan; do it if regression-cheap.) + +## Phases + +### Phase 1 — Regression tests (failing first) + +- [x] Run `cargo nextest run -p quarto-core --test idempotence website_links` + and confirm it fails today with the absolute-path symptom (already + verified; record in the plan and move on).
+- [x] Add a unit test in `resource_resolver.rs` that asserts: given + `ResourceResolverContext::vfs_root_with_url_root("/tmp/abc", "/synthetic")`, + `html_url_for(Project, p)` returns `"/synthetic/<p>"` and + `on_disk_path_for(Project, p)` returns `"/tmp/abc/<p>"`. Confirm + it fails to compile (the constructor doesn't exist yet). + +### Phase 2 — Resolver split + +- [x] Define the private `VfsRootMode` struct inside `resource_resolver.rs`. +- [x] Change `vfs_root_mode` field from `Option<PathBuf>` to + `Option<VfsRootMode>`. +- [x] Update the four match sites (`html_url_for`, `page_url_for`, + `allowed_output_roots`, `on_disk_path_for`) to read the right field. +- [x] Add the `vfs_root_with_url_root` constructor. +- [x] Update the existing `vfs_root` constructor to populate both + fields from the single arg (preserves the WASM identity contract). +- [x] Run the Phase-1 unit test — should pass. +- [x] Re-run the existing pinned contract test + (`vfs_root_resolver_url_matches_on_disk_path` in `website_post_render.rs`). + Should still pass — single-arg constructor still gives URL == disk. + +### Phase 3 — Renderer split + +- [x] Add `vfs_url_root: Option<String>` field + `with_url_root` builder + to `RenderToPreviewAstRenderer`. +- [x] Mirror on `RenderToHtmlRenderer`. +- [x] Replace the four `ResourceResolverContext::vfs_root(self.vfs_root.clone())` + call sites with `self.build_resolver()`. +- [x] `cargo build --workspace` should succeed — no callers have + changed yet, the new field defaults to `None` which derives the + URL root from `vfs_root` exactly as before. + +### Phase 4 — Wire up test helpers + +- [x] `tests/idempotence.rs::render_active_page_preview` adds + `.with_url_root("/.quarto/project-artifacts")`. +- [x] `tests/render_page_in_project.rs::render_active_page` adds the + same (optional but consistent). +- [x] Re-run `cargo nextest run -p quarto-core --test idempotence website_links`. + Should now pass. +- [x] Re-run the full idempotence suite — confirm no other fixtures + regress. + +### Phase 5 — Workspace verification + +- [x] `cargo build --workspace` clean. +- [x] `cargo nextest run --workspace`. +- [x] `cargo xtask verify --skip-hub-build` (matches CI's `-D warnings` + strictness on the Rust leg).
+- [x] Cross-check: WASM hub-client callers still pass single-arg + `vfs_root("/.quarto/project-artifacts")` and produce identical + URLs to today (no behavior change). The + `vfs_root_resolver_url_matches_on_disk_path` test in + `website_post_render.rs` is the regression sentinel — it stays + green by construction. + +### Phase 6 — Beads housekeeping + +- [x] `br close bd-rz2we --reason "fixed: split vfs_root into write-root + url-root in ResourceResolverContext + per-renderer override"`. +- [x] Update Plan 3's Phase-4 checklist line for `website_links` (mark + green, drop the queue note). +- [x] `br sync --flush-only`, then commit `.beads/` from the main + repo. + +## Out of scope + +- `RenderToHtmlRenderer`'s native HTML-output tests aren't currently + asserting on link URLs; this plan touches them only for API + symmetry. If they have latent path-leakage in their assertions + (unlikely — they test HTML content shape), that's a separate ticket. +- The wider `vfs_root` naming question (whether the field should be + renamed from `vfs_root` to `vfs_write_root` everywhere). Holding off + to keep the diff small; rename is a no-op refactor that can land + separately. +- bd-3odjm (FilterProvenance wire-format bug). Owned by Plan 5, + unrelated. + +## Touch list + +- `crates/quarto-core/src/resource_resolver.rs` — field, constructor, + 4 match-site updates, 1 new unit test. +- `crates/quarto-core/src/project/pass2_renderer.rs` — 2 renderers × + (1 new field, 1 builder method, 1 helper, 4 call-site swaps). +- `crates/quarto-core/tests/idempotence.rs` — 1 helper line. +- `crates/quarto-core/tests/render_page_in_project.rs` — 1 helper + line (optional). + +No production-code callers change. 
diff --git a/claude-notes/plans/2026-05-22-provenance-plan-10-dispatch-anchor.md b/claude-notes/plans/2026-05-22-provenance-plan-10-dispatch-anchor.md new file mode 100644 index 000000000..3dd615ebb --- /dev/null +++ b/claude-notes/plans/2026-05-22-provenance-plan-10-dispatch-anchor.md @@ -0,0 +1,584 @@ +# Provenance Plan 10 — Dispatch anchor + Lua source registration in SourceContext + +**Date:** 2026-05-22 +**Branch:** feature/provenance +**Status:** Research plan (pre-implementation; API surface not yet pinned) +**Milestone:** none directly — improves source-pointing diagnostics + and attribution for Lua-driven content; does not gate M3. + +## Epic context + +Part of the **provenance epic** (Plans 3–10). Lua filter files and +Lua-shortcode handler files contribute source-side bytes to +`Generated` nodes (a filter constructed an `Str("HELLO")` somewhere +in `upper.lua`; a `{{< kbd >}}` handler ran code at `kbd.lua:14`). +Today, that source identity lives in `By.data` as a stringly-typed +`{filter_path, line}` payload constructed via `debug.getinfo()`. +It belongs in the `from` anchor list, attached via a new +`AnchorRole::Dispatch` role. + +Same asymmetry contract as `ValueSource` (settled by Plan 9): +**Dispatch is diagnostic-only**, never walked by the writer's +`preimage_in`. The point is attribution and source-pointing +diagnostics — "this rendered text came from line 14 of `kbd.lua`" — +not round-trip. + +## Goal + +Migrate Lua-driven `Generated` shapes from string-keyed +`by.data: {filter_path, line}` to typed source_info pointers in the +anchor list: + +- **Filter constructions**: `Generated { by: filter(), from: + [Dispatch -> lua_si] }` (was `Generated { by: filter(path, line), + from: [] }`). +- **Lua-handler shortcode resolutions**: `Generated { by: + shortcode(name), from: [Invocation -> token_si, Dispatch -> lua_si] }` + (was `Generated { by: shortcode{name, lua_path, lua_line}, from: + [Invocation -> token_si] }`). 
+ +To make those source_info pointers meaningful, **register Lua filter +files and Lua-shortcode-handler files in `SourceContext`** so they +get `FileId`s and their content is available for byte-range +resolution. + +When this plan lands, source-pointing diagnostics from Lua land +("at line 14 of upper.lua, column 5–10") use the same SourceContext +machinery as qmd / YAML diagnostics. Attribution tooling can chase +the `Dispatch` anchor back to the Lua function that produced a node. + +### Lua filters that wrap user content + +A Lua filter that **wraps existing user content** in a Div (e.g. +the canonical "page-shell" filter, or a Lua reimplementation of +`.callout` wrapping) does not need any registration or opt-in to +participate in the visual editor. If the filter emits a block +container (Div / BlockQuote / Figure / NoteDefinitionFencedBlock) +whose `source_info` is `Generated { by: filter(), from: [Dispatch +-> lua_si] }` (no Invocation anchor) and whose children preserve +their original source positions, the wrapper meets the structural +definition of a *transparent wrapper* in +[`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md): + +1. Generated, no Invocation anchor — ✓ (Dispatch is anchor-only + for diagnostics; doesn't count as a source token). +2. Block-container kind — ✓. +3. Children carry `preimage_in(target)` — ✓ by construction + (the filter mutates rather than constructs). + +The writer's `first_in_user_tree` walker sees through it +automatically; the React dispatcher's editability gate (Plan 7c +Phase 2 — `isEditableInside`) treats its children as editable; +edits inside the wrapped content round-trip cleanly. The filter +author writes idiomatic Lua and gets working visual-editor +support — no contract to satisfy beyond "don't strip +source_info from the children you wrap." 
+ +A Lua filter that **constructs** a fresh block container from +metadata (no source-bearing children) is implicitly atomic via +condition (3) — `first_in_user_tree` doesn't descend into it, +editor treats it as a unit, edits inside soft-drop with Q-3-43. +That's also the right behaviour: there are no source bytes to +edit. + +This works regardless of Plan 10's `Dispatch` migration: the +predicate looks at the shape of the AST, not at the kind of +filter. Plan 10 makes filter diagnostics *better*; the transparent- +wrapper machinery makes the **editing contract** that filter +authors can rely on. + +## Scope + +### In scope + +#### Phase 1 — `AnchorRole::Dispatch` + +- Add `Dispatch` variant to `AnchorRole` enum in + `crates/quarto-source-map/src/source_info.rs:91-118` alongside the + existing `Invocation`, `ValueSource`, `Other`. +- Doc-comment explicitly references the Plan-9-established policy: + `preimage_in` walks `Invocation` only; `Dispatch` is + diagnostic-only and never consulted by the writer. +- Add `Anchor::dispatch(source_info: Arc) -> Self` + constructor parallel to `Anchor::invocation` / `Anchor::value_source`. + +#### Phase 2 — SourceContext extension for Lua files + +- Extend `SourceContext::add_file` (currently + `crates/quarto-source-map/src/context.rs:59`) to support Lua files. + Two possible extensions: + - (A) Add a `FileKind { Qmd, Yaml, Lua, … }` discriminator on + `FileInformation`. `add_file` stays signature-compatible; + callers passing Lua files use a new `add_lua_file` helper or + pass `FileKind::Lua` explicitly. + - (B) Reuse `add_file` as-is (Lua files are just files; + path/content are sufficient). + - Recommendation: (B) for v1; (A) only if a downstream consumer + needs to distinguish kind (e.g. line-numbering rules differ for + Lua vs. qmd, which they don't today). +- Confirm `FileInformation::compute_line_breaks` handles Lua source + correctly (it should — it just indexes `\n` positions). 
+ +#### Phase 3 — Lua engine bridge: pass FileId through callbacks + +- `apply_lua_filters` + (`crates/pampa/src/lua/filter.rs:158-200` and surrounding) reads + the filter path from `FilterSpec::Lua(path)` and the filter file + bytes from disk. **Register the file in `SourceContext` at that + point**, capturing the returned `FileId`. +- Thread the `FileId` into the Lua closure context so callbacks that + introspect `debug.getinfo()` can resolve `(source: path, line: + line_num)` into `SourceInfo::Original { file_id, start, end }` + where `start..end` covers the line's bytes (via + `FileInformation`'s line-break index). +- Update `get_caller_source_info` + (`crates/pampa/src/lua/diagnostics.rs:255`) — currently constructs + `Generated { by: By::filter(path, line), from: SmallVec::new() }`. + New shape: `Generated { by: By::filter(), from: + [Dispatch(Arc::new(Original{file_id, start, end}))] }`. + +#### Phase 4 — `By::filter` signature shrinks + +- Change `By::filter(path: impl Into, line: usize)` + (currently at `crates/quarto-source-map/src/source_info.rs:458`) + to `By::filter()`. The path/line move to the Dispatch anchor's + source_info; `by.data` becomes `null`. +- All call sites in `crates/pampa/src/lua/types.rs:1830`, + `crates/pampa/src/lua/diagnostics.rs:203,262,847`, + `crates/pampa/src/readers/json.rs:305,2764` migrate. Most are + diagnostic-side; the json reader has a legacy-back-compat path + reading `"FilterProvenance"` tag. +- **No backward-compat carve-out for `By::filter`.** Same reasoning + as Plan 9's `By::appendix` change: + 1. `By::filter` is workspace-internal Rust — no FFI, no extension + SDK, no TS mirror. + 2. Plan 5 has shipped `By::filter(path, line)` to the JSON wire + format. **Clean break** (see §Phase 6 below) — writers emit + the new shape after Plan 10 lands; the old shape disappears + from the codebase in the same PR. No dual-reader window. No + on-disk artifacts hold the old shape, so no migration path is + needed. 
+- `By::as_filter()` accessor (currently returns + `Option<(&str, usize)>` from `by.data`) gets removed or + repurposed. Callers needing path/line read the Dispatch anchor's + source_info and resolve via `SourceContext`. + +#### Phase 5 — Lua-handler shortcode resolutions + +- The shortcode resolver + (`crates/quarto-core/src/transforms/shortcode_resolve.rs:380-460`) + dispatches to Lua handlers via `dispatch_to_lua_engine`. When the + handler is Lua-backed, attach a `Dispatch` anchor pointing at the + handler function's source line. +- Built-in (Rust) handlers like `MetaShortcodeHandler` stay with + `from: [Invocation]` only — no Dispatch. +- The Lua engine needs to know which file each handler is registered + in (already known via the registration call site). Stash that + alongside the handler binding. + +#### Phase 6 — Wire format clean break + +- Plan 5 emits `Generated { by: filter, by.data: {filter_path, line} }` + to JSON wire code 4. After Plan 10: + - Writers emit `Generated { by: filter, by.data: null }` plus a + `Dispatch` anchor in the `from` list. + - Readers accept the new shape only. The old shape disappears from + the workspace in the same PR. +- **Clean break, no dual reader.** Same rationale as `By::appendix` + and `By::filter`: this is a workspace-internal wire format with no + on-disk artifacts holding the old shape. The IndexedDB profile + cache is invalidated by `pass1_key` (Phase 7 below); any in-flight + WASM bundles rebuild from source. The CI build chain ensures Rust + and TS rebuild in lockstep — no in-the-wild client holds an old + WASM expecting the old shape. +- Equivalent break on the Lua-shortcode-handler shape (currently + `by.data: {name, lua_path, lua_line}` → `by.data: {name}` + + Dispatch anchor). Same one-PR migration. + +#### Phase 7 — Cache-key surface (reuses Plan 7a's field) + +- Lua filter file content becomes Pass1 cache input. 
**Plan 7a + lands first** and introduces `filter_sources_hash` on + `Pass1KeyInputs` (SHA-256 over filter file bytes + opt-out flags). + Plan 10 **reuses** that field — no new field, no parallel hash. +- Plan 10 Phase 7 reduces to: + - Confirm the existing `filter_sources_hash` semantics cover + Plan 10's needs (cache invalidates when a Lua filter file's + bytes change). They do — both plans hash the same files. + - Add a smoke test: register a Lua filter file in SourceContext, + edit its bytes, confirm `pass1_key` changes accordingly. Likely + Plan 7a's existing tests already cover this; verify during + implementation. +- If Plan 7a hasn't landed when Plan 10 starts (reversed order), + Plan 10 introduces the field itself with Plan 7a's semantics, and + Plan 7a later reuses it. The structural answer is the same; the + PR that lands first owns the field. + +### Out of scope + +- **Lua hot-reload / file-watcher integration** — a Lua file editing + experience that re-runs the filter on save. Demand-driven + invalidation via cache-key hashing is sufficient for v1. +- **Lua-LSP cross-references** (jump-to-definition into filter code + from a rendered diagnostic) — UX work that consumes Plan 10's + output but isn't part of it. Likely a future hub-client plan. +- **Non-Lua extension-contributed handlers** (future WASM-shortcode, + native-Rust-shortcode). The `Dispatch` role is Lua-flavored — the + source_info pointer assumes a file with byte ranges. WASM / + native handlers may want a different anchor role (e.g. + `Other("wasm-handler")` carrying a handler URI). Defer until those + handler kinds exist. +- **Citeproc / JSON-filter source pointers**. Citeproc is a built-in + Rust filter (no Lua); JSON filters are external processes (no + source we can register). `FilterSpec::Citeproc` / `FilterSpec::Json` + variants stay with `Generated { by: filter(), from: [] }` — + diagnostic source pointing isn't meaningful for them. +- **Lua-engine-side restructuring** (e.g. 
moving the mlua bridge to + a separate crate). Plan 10 changes the contract at the bridge + boundary; it does not refactor the bridge. +- **bd-2mxo / `AttrSourceInfo` fixes** — separate concerns. + +## Design decisions (settled) + +- **`AnchorRole::Dispatch` is diagnostic-only.** Follows Plan 9's + `AnchorRole::Other` policy: `preimage_in` walks `Invocation` only. + Dispatch is consumed by attribution / diagnostic UI, not by the + writer's Verbatim path. + +- **`By::filter` becomes nullary.** Path/line move to Dispatch. + `By.data` for filter-kind is `null`. Wire format migrates (Phase 6 + above). + +- **Lua-handler shortcode keeps `name` in `by.data`.** The shortcode + name is part of the *identity* (which shortcode resolution + produced this node), not the *dispatch source* (which file + resolved it). The two are distinguishable: name is a parameter of + the `By` shape (`shortcode("meta")` vs `shortcode("kbd")`); dispatch + source is an anchor pointing at the handler's location. + +- **Source range of a Dispatch anchor: line-covering `Original`.** + `debug.getinfo()` gives line numbers, not byte ranges. Once Lua + file content is in SourceContext, we compute the byte range of the + named line via `FileInformation`'s line-break index. The Dispatch + anchor's source_info is `Original { file_id: lua_file, start: + line_start, end: line_end }`. Sub-line precision (specific + function or expression) is out of scope for v1 — `debug.getinfo()` + doesn't provide it without parsing the Lua source. + +- **Filter files are registered eagerly at `apply_lua_filters` + entry.** Not lazily on first `debug.getinfo()` call — eager + registration ensures the FileId is stable across multiple + callbacks and accessible without thread-safety gymnastics in the + Lua-closure context. + +- **Lua-shortcode handler files are registered at handler + registration time** (when `_extension.yml` loads). Same eager + pattern as filter files. 
The handler registry maps handler + name → `(FileId, line_in_file)`. + +- **No backward-compat carve-out for the wire format.** Plan 5's + emitted shape (`by.data: {filter_path, line}`) has shipped, but + appears only in WASM-internal AST flow and IndexedDB profile + cache. The cache is invalidated by `pass1_key` (Phase 7); no + on-disk artifacts hold the old shape. Clean break in one PR — + no dual-reader window. Same rationale as `By::appendix` (Plan 9) + and `By::filter` (Phase 4 above). + +- **Plan posture: research plan.** This document settles the API + shape (the Dispatch role, the `By::filter` migration, the + SourceContext extension); it does not yet commit to the + implementation order. A subsequent review pass converts it to a + development plan with checklisted phases. + +## API surface to settle (research-plan deliverables) + +By the time this plan converts to a development plan, the following +must be pinned: + +1. **`AnchorRole::Dispatch` doc-comment text** — exact wording of + "diagnostic-only, never consulted by `preimage_in`" policy. + +2. **`SourceContext` Lua-file kind discrimination** — option (A) + with `FileKind` enum vs. option (B) reuse `add_file` as-is. + Recommend (B); revisit if downstream needs (A). + +3. **Lua engine bridge: how the `FileId` is threaded into the + closure context.** mlua's app-data slot (`Lua::set_app_data`) is + the obvious answer. Confirm during implementation. + +4. **`Pass1KeyInputs` field shape** — option (A) `lua_filter_files` + field vs. option (B) SourceContext-referenced. Recommend (A) for + v1; Plan 7a coordinates by reading the same field. + +5. **Wire-format clean break** — confirm that no consumer needs a + dual-reader window (Phase 6 and the settled design decisions + assume none). State the break in Phase 6's commit message and + propagate it to the wire-format documentation. + +6. **`By::as_filter()` deprecation** — remove vs. repurpose to + read from the Dispatch anchor. Recommend: remove; callers + needing path/line read the Dispatch source_info directly.
+ +## Open questions for implementation + +- **Pre-registration vs. on-demand registration of Lua files.** + Eager (Phase 3) means every render pays the SourceContext cost + even if `debug.getinfo()` never fires. On-demand registration is + cheaper but introduces order-dependence in the closure context. + Recommend eager; benchmark to confirm cost is negligible. + +- **`debug.getinfo` performance.** Calling + `debug.getinfo` on every constructed node may dominate filter + runtime. Verify against a filter-heavy fixture during + implementation; if it's expensive, batch source-info attachment to + the post-walk helper (`enrich_or_create` in Plan 6's design). + +- **Coordination with Plan 7a's `filter_sources_hash`.** Plan 7a + proposes hashing filter files for idempotence verdicts; Plan 10 + hashes them for cache invalidation. Resolved (see §Risk areas): + one hash computation, owned by Plan 7a as `filter_sources_hash` on + `Pass1KeyInputs`; Plan 7a lands first and Plan 10's Phase 7 reuses + it. + +- **Lua-shortcode-handler file registration timing.** Extension + loading (`_extension.yml` parsing) happens before filter pipeline + setup. Need to ensure SourceContext is available at extension + load — likely via the existing `StageContext`-style threading. + Confirm. + +- **Migration of existing Plan 4 tests.** The unit tests in + `crates/quarto-source-map/src/source_info.rs:715-770` exercise + `By::filter("foo.lua", 42)` extensively. They migrate to + `By::filter()` + a Dispatch anchor; the path/line assertions move + to the anchor's `source_info`. Mechanical but ~10 test changes. + +- **Plan 6's Lua post-walk shape (`enrich_or_create`).** Plan 6 + Phase 6's post-walk helper (per the diff in Plan 6 §"The post-walk + helper") promotes Lua-attached source_info to the canonical + `Generated { by: filter, ... }` form. After Plan 10 the canonical + form is `Generated { by: filter(), from: [Dispatch] }`. The + helper updates accordingly. 
Confirm Plan 6 lands before Plan 10 + implementation (or that Plan 6 is amended to anticipate the + shape change). + +## References + +- `crates/quarto-source-map/src/source_info.rs:91-118` — + `AnchorRole` enum (Phase 1 extends). +- `crates/quarto-source-map/src/source_info.rs:458-466` — + `By::filter` constructor (Phase 4 signature change). +- `crates/quarto-source-map/src/source_info.rs:582-594` — + `By::as_filter` accessor (Phase 4 removes / repurposes). +- `crates/quarto-source-map/src/context.rs:59-130` — + `SourceContext::add_file*` family (Phase 2 extends). +- `crates/quarto-source-map/src/file_info.rs:12-58` — + `FileInformation`; line-break index used in Phase 3 for byte-range + resolution. +- `crates/pampa/src/lua/filter.rs:158-200,270` — + `apply_lua_filters` entry; Phase 3's eager-registration site. +- `crates/pampa/src/lua/types.rs:1820-1840` — `debug.getinfo()` + consumer (Phase 3 migrates to FileId-backed shape). +- `crates/pampa/src/lua/diagnostics.rs:195-265,847` — Generated + construction sites; Phase 3 + 4 migrate. +- `crates/pampa/src/readers/json.rs:305,2764` — wire-format + decoder; Phase 6's clean-break site (old shape removed). +- `crates/quarto-core/src/project/cache_key.rs:108-141` — + `Pass1KeyInputs`; Phase 7 extends. +- `crates/quarto-core/src/transforms/shortcode_resolve.rs:380-460` — + Lua shortcode dispatch; Phase 5's stamping site. +- Plan 6 §"Dispatch follow-up" — Plan 10's scope-pickup point. +- Plan 9 §"Settled `AnchorRole::Other` policy" — Plan 10 inherits the + policy for Dispatch. +- Plan 5 (wire format) — Phase 6's migration is on top of Plan 5's + code-4 emission. +- Plan 7a — coordinates on filter-source hashing (Phase 7). +- bd-36fr9 (closes). + +## Test plan + +### Phase 1 (`AnchorRole::Dispatch`) + +- Constructor unit tests parallel to `Anchor::invocation` / + `Anchor::value_source`. +- Serde round-trip test for a `Generated` carrying a `Dispatch` + anchor. 
+- `preimage_in` asymmetry test: `Generated { by: filter(), from: + [Dispatch(lua_si)] }` → `preimage_in` returns None (Lua bytes are + not body bytes; the writer must not copy them into the parent + file). +- `anchors_with_role(&AnchorRole::Dispatch).count()` returns 1 on + the above shape. + +### Phase 2 (SourceContext Lua-file extension) + +- `add_file` with a `.lua` path produces a FileId; content is + retrievable. +- `FileInformation::map_offset` resolves byte offsets to (row, col) + for Lua source. + +### Phase 3 (Lua bridge FileId threading) + +- A filter that constructs a node (via `pandoc.Str(...)`) produces + a `Generated { by: filter(), from: [Dispatch] }` shape; the + Dispatch anchor's source_info chain-resolves to the filter + file's FileId and the constructed line's byte range. +- `get_caller_source_info` returns the new shape; legacy callers + failing to find a `(path, line)` in `by.data` get a + doc-commented migration message. + +### Phase 4 (`By::filter` signature shrinkage) + +- All migrated unit tests pass with the nullary constructor. +- `By::filter().is_atomic_kind()` still returns true (atomicity + unchanged). + +### Phase 5 (Lua-handler shortcode) + +- A Lua-handler shortcode resolution produces `Generated { by: + shortcode(name), from: [Invocation, Dispatch] }`. Built-in + shortcode resolutions (meta, var) stay `from: [Invocation]` only. + +### Phase 6 (wire format clean break) + +- Writer: emits the new shape (`by.data: null` + Dispatch anchor). +- Reader: accepts only the new shape; old shape removed entirely. +- Snapshot test asserting byte-for-byte stability of Lua-filter- + emitting fixtures under the new shape. +- Compile-time confirmation that no reader code references the old + `filter_path` / `line` keys in `by.data`. + +### Phase 7 (cache-key surface) + +- Cache key invalidates when a Lua filter file's content changes. +- Cache key stable when Lua filter file content is unchanged. 
+ +### End-to-end + +- Lua filter raising a `quarto.warn(...)` from line 14 of `foo.lua` + produces a diagnostic whose source range + chain-resolves (via `SourceInfo::resolve_byte_range`) to + `(foo_lua_file_id, line_14_start, line_14_end)`. +- A document with a Lua-handler shortcode (`{{< kbd Alt-X >}}`): + - Resolved inline carries Dispatch anchor pointing at the + handler's Lua source. + - Edit-back round-trip preserves the `{{< kbd Alt-X >}}` token + in the qmd source (Plan 7 Verbatim via the Invocation anchor; + Dispatch is not consulted). + +## Dependencies + +### Hard dependencies + +- **Plan 4** — `AnchorRole` enum. +- **Plan 6** — `Generated`-stamping post-walk helper + (`enrich_or_create`) is the natural point to migrate to the new + shape. Plan 6 must land before Plan 10 implementation, OR Plan 6 + is amended to anticipate the Dispatch shape during + implementation. Recommend the former. +- **Plan 5** — Plan 10's wire-format migration is on top of Plan + 5's code-4 emission. + +### Soft dependencies + +- **Plan 9** — establishes the `AnchorRole::Other` policy that + Dispatch inherits. Doesn't strictly block Plan 10 implementation + (the policy is doc-only), but Plan 9 lands the policy in writing + first. +- **Plan 7a** — coordinates on filter file hashing (Phase 7). + Plan 7a lands first and owns `filter_sources_hash` on + `Pass1KeyInputs`; Plan 10's Phase 7 reuses it (see §Risk areas). + +### Does not block + +- **Plan 7 implementation** can ship without Plan 10. Plan 7's + writer consults `Invocation` only; Dispatch lands in the + diagnostic UX cycle. + +### Blocks + +- Future Lua-LSP / hub-client diagnostic-clicks-to-source UX work. +- Future extension-author-facing handler-trace tooling. + +## Risk areas + +- **Lua engine bridge complexity.** Touches mlua interop, app-data + context threading, debug.getinfo behavior across Lua versions + (5.1 vs. 5.4 — verify what we use). The mlua side has historically + been a source of subtle bugs; budget extra time for edge cases. 
+ +- **`debug.getinfo` performance.** Calling on every constructed node + could dominate filter runtime. Mitigation: batch via Plan 6's + post-walk helper if necessary; benchmark. + +- **Wire-format clean-break coordination.** Plan 10's PR must + rebuild WASM and TS in lockstep — the WASM emits the new shape; + TS expects only the new shape. No in-flight client holds an old + WASM expecting the old shape (no npm-published consumer). CI's + `cargo xtask verify` chain catches drift if the rebuild is + incomplete. + +- **SourceContext lifetime / sharing.** Lua files registered eagerly + at `apply_lua_filters` entry need to be available for the + duration of the pipeline. The existing SourceContext sharing + pattern (likely `Arc<Mutex<SourceContext>>` or `&mut` through the pipeline) + must accommodate Lua-file additions mid-pipeline. Verify. + +- **Coordination friction with Plan 7a — resolved.** Both plans + touch `cache_key.rs` and want to hash filter files. Resolved by + agreement: Plan 7a lands first and owns `filter_sources_hash` on + `Pass1KeyInputs`; Plan 10 reuses it. The order is also the + natural one — Plan 7a is independent of Plan 10's Lua-source + registration; Plan 10 benefits from Plan 7a's hashing already + being in place. + +- **Migration tests that touch `By::filter("foo.lua", 42)`.** ~10 + unit tests in `source_info.rs` migrate mechanically; if any are + missed during the signature change, the workspace fails to + compile. Mitigation: the compiler is the safety net here — `cargo + build --workspace` will name every offending site. 
+ +## Estimated scope + +| Phase | Lines (rough) | +|---|---| +| 1: `AnchorRole::Dispatch` + Anchor constructor + tests | ~80 | +| 2: SourceContext Lua-file support (probably minimal) | ~40 | +| 3: Lua bridge FileId threading + byte-range computation | ~200 | +| 4: `By::filter` signature shrinkage + call-site migration | ~120 | +| 5: Lua-handler shortcode Dispatch attachment | ~80 | +| 6: Wire-format clean break + tests | ~80 | +| 7: Cache-key smoke test (reuses Plan 7a's `filter_sources_hash`) | ~30 | +| Tests across phases | ~350 | +| **Total** | **~980** | + +Two focused sessions likely; high-complexity due to mlua interop +and the wire-format migration. The Lua engine bridge work in +Phase 3 is the riskiest piece — if `debug.getinfo` ergonomics or +performance surprise, the design changes. + +## Notes + +This plan is the "Lua-source pointing" wing of the provenance epic. +Plan 9 covers metadata-derived attribution; Plan 10 covers +Lua-derived attribution. Both rely on the `AnchorRole::Other` +policy Plan 9 commits to writing. + +After Plan 10, the `Generated.by.data` payload shrinks across all +known kinds: +- `filter`: `{filter_path, line}` → `null` (Plan 10). +- `shortcode`: `{name, lua_path, lua_line}` for Lua handlers → + `{name}` (Plan 10). Built-in handlers unchanged. +- `appendix`: `null` → serialized `AppendixSection` enum (Plan 9). +- `sectionize`, `title-block`, `footnotes`, `appendix-container`, + `tree-sitter-postprocess`, `user-edit`, `include`: `null` + (unchanged). + +The trajectory is "By.data shrinks; the anchor list grows." That's +the right direction — typed source_info pointers in `from` are +strictly more powerful than untyped strings in `by.data`, and they +follow the established `Invocation` / `ValueSource` / `Dispatch` +role discipline. + +### Naming convention + +Uses the `provenance-plan-N-<slug>.md` naming (no `q2-preview-` +prefix) established by Plan 9. The provenance epic has outgrown the +original q2-preview framing. 
diff --git a/claude-notes/plans/2026-05-22-provenance-plan-9-valuesource-threading.md b/claude-notes/plans/2026-05-22-provenance-plan-9-valuesource-threading.md new file mode 100644 index 000000000..772a3cb9a --- /dev/null +++ b/claude-notes/plans/2026-05-22-provenance-plan-9-valuesource-threading.md @@ -0,0 +1,559 @@ +# Provenance Plan 9 — ValueSource threading for metadata-derived content + +**Date:** 2026-05-22 +**Branch:** feature/provenance +**Status:** Research plan (pre-implementation; API surface not yet pinned) +**Milestone:** none directly — improves attribution / round-trip provenance + reporting; does not gate M3. + +## Epic context + +Part of the **provenance epic** (Plans 3–10). Plan 6 stamps every +pipeline-synthesized node with `Generated { by, from }`; for most +synthesizers the `from` list is non-empty only when there's a +body-source token to anchor at (shortcode resolutions → `Invocation`). +**Several synthesizers consume metadata values (frontmatter, +`_quarto.yml`, `_metadata.yml`) and currently emit `from: []`** because +the value-side source info is discarded somewhere between the YAML +parser and the synthesizer's stamping point. Plan 9 threads it the +last hop and stamps `ValueSource` anchors on those consumers, so +attribution tooling can trace rendered content back to the YAML keys +that produced it. + +Plan 9 is the **consumer wiring** half of the provenance epic. Plan 6 +stamps the identity (`by`); Plan 9 stamps the origin (`ValueSource` in +`from`) on the metadata-derived subset. Together they make every +pipeline-produced metadata-derived node fully attributable. + +## Goal + +Thread per-value `SourceInfo` to where synthesizers can stamp it as +`ValueSource` anchors. Three target consumers: + +1. **Meta/var shortcode resolutions** (closes bd-129m3) — `{{< meta + footer >}}` → `Generated { by: shortcode("meta"), from: + [Invocation -> token_si, ValueSource -> value_si] }`. +2. 
**DocumentProfile.title → nav-text** (closes bd-8pmq3) — sidebar / + navbar entries built from `profile.title` carry a `ValueSource` + anchor pointing at the source qmd's title metadata bytes. +3. **Appendix container metadata-derived sections** (currently + unowned in beads) — per-section sub-Divs for license, copyright, + citation each stamped with `ValueSource` pointing at + `meta.license` / `meta.copyright` / `meta.citation`. + +Plus the **Plan 7 deferred invariant tests** that depend on at least +one ValueSource consumer existing (the `preimage_in` role-asymmetry +unit test and the appendix-license end-to-end round-trip test). + +When this plan lands, the `Invocation` vs `ValueSource` asymmetry +contract Plan 7 documents has real exercise — there are producers, +the writer correctly walks only the `Invocation` anchors, the +attribution machinery can light up the `ValueSource` data without any +writer changes. + +## Scope + +### In scope + +#### Phase 1 — Infrastructure + +- A provenance-aware conversion API alongside the existing + `config_value_to_inlines(value: &ConfigValue) -> Vec<Inline>` in + `crates/quarto-core/src/transforms/shortcode_resolve.rs:167`. + **API shape (settled per user direction):** + + ```rust + /// Convert a ConfigValue to inline content, returning both the + /// inlines and the source_info pointing at the value's definition + /// site. The caller decides how to stamp the source_info (typically + /// as an `AnchorRole::ValueSource` on a surrounding `Generated`). + /// + /// For `PandocInlines` content, the returned source_info is the + /// outer ConfigValue's; per-leaf source_info is preserved on the + /// inlines themselves and is not flattened. + fn config_value_to_inlines_with_provenance( + value: &ConfigValue, + ) -> (Vec<Inline>, SourceInfo); + ``` + + The existing `config_value_to_inlines` stays for legacy callers + (template values, non-provenance contexts). New consumers route + through the provenance-aware version. 
+ +- `DocumentProfile` gains `title_source_info: Option<SourceInfo>` + (per bd-8pmq3's detailed plan: ~30–50 LOC including `extract` + change + `Default` impl at `crates/quarto-core/src/document_profile.rs`). + Uses `#[serde(default, skip_serializing_if = "Option::is_none")]` + — same pattern as `order: Option`. **No + `DOCUMENT_PROFILE_VERSION` bump** (additive `Option<_>` with + default; per document-profile-contract §"Serialization and + versioning"). Update the contract's §Change log. + + **Transparent-wrapper invariant.** `DocumentProfile::extract` + runs at the pre-sugar checkpoint, so it never sees the + sectionize wrapper — `blocks[0]` here is the user's real first + block. If the extractor is later moved past + `SectionizeTransform`, or extended with a "fall back to the + first H1" rule, it MUST descend through transparent wrappers + via `first_in_user_tree` + (`crates/pampa/src/writers/incremental.rs`). See + [`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md) + for the contract. + +- New typed enum `AppendixSection { License, Copyright, Citation }` + in `crates/quarto-source-map/src/source_info.rs`, with serde + derive. Discriminator for `By::appendix` (see Phase 4). + +#### Phase 2 — Meta/var shortcode ValueSource (closes bd-129m3) + +- `MetaShortcodeHandler::resolve` + (`crates/quarto-core/src/transforms/shortcode_resolve.rs:148`) and + the matching `var` handler look up via + `ctx.metadata.get_nested(&key)` which returns a `&ConfigValue` + whose `.source_info` is the value's definition site. 
+- Construct resolved inlines via + `config_value_to_inlines_with_provenance`, then stamp the + surrounding `Generated` with both anchors: + + ```rust + let (inlines, value_si) = config_value_to_inlines_with_provenance(value); + let mut gen = SourceInfo::generated(By::shortcode(name)); + gen.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + gen.append_anchor(AnchorRole::ValueSource, Arc::new(value_si)); + // attach `gen` to each resolved inline + ``` + +- Belt-and-suspenders for `ConfigValueKind::PandocInlines` + (markdown-rich metadata like `title: "**Bold**"`): the `ValueSource` + is attached on the wrapping shape, **not** pushed into every leaf + — keeps Plan 7's multi-inline dedupe rule (which compares + `invocation_anchor()` source_info structurally) trivially correct + with no ValueSource cross-talk. + +#### Phase 3 — DocumentProfile.title → nav-text (closes bd-8pmq3) + +- Update `DocumentProfile::extract` + (`crates/quarto-core/src/document_profile.rs:529`): replace + `title: plain_text_field(meta, "title")` with code that also + captures `meta.get("title")?.source_info.clone()` into the new + `title_source_info` field. +- Three Plan-6 Phase-5 consumer sites attach + `ValueSource(profile.title_source_info)` when present: + - `crates/quarto-core/src/transforms/sidebar_generate.rs:228` + - `crates/quarto-core/src/transforms/sidebar_auto.rs:311` (only + when reading from `profile.title`; file-stem fallback at line 318 + keeps `from: smallvec![]`) + - `crates/quarto-core/src/transforms/navigation_enrich.rs:59` +- Subtitle / description / date / image fields stay out-of-scope + (not consumed by nav sites today). Inline-rich titles + (`ConfigValue::PandocInlines`) preserved by Phase 1's API design. 
+ +#### Phase 4 — Appendix metadata-derived sub-Divs + +- `create_license_section` / `create_copyright_section` / + `create_citation_section` in + `crates/quarto-core/src/transforms/appendix.rs` (lines 270–) read + `meta.get("license")` / `.get("copyright")` / `.get("citation")` — + the source_info is on those `ConfigValue` references and just + needs to ride along. +- **Per-section sub-Div stamping (option A):** each per-section Div + carries + `Generated { by: By::appendix(AppendixSection::License), from: [ValueSource(license_si)] }`, + with the outer container kept at + `Generated { by: By::appendix_container(), from: [] }`. +- **`By::appendix` becomes parameterized** (settled per user + direction): drops the existing nullary `By::appendix()` + constructor in favor of `By::appendix(section: AppendixSection)`. + See §Design decisions for backward-compat rationale (no + production callers; no persisted wire artifacts). +- Need a separate `By::appendix_container()` (or similar) for the + outer wrapper Div, since the wrapper isn't tied to a single + metadata key. Tentative name `By::appendix_container()` — + discriminate during implementation. +- Missing-key cases (no `license` in meta) gracefully skip — no + ValueSource attempt, no synthesizer fires. + +#### Phase 5 — Plan-7 invariant tests (deferred from Plan 7) + +Status: Plan 7 shipped on `feature/provenance` 2026-05-24 (phases +1-7 + 9; Playwright e2e matrix carried separately in `bd-3izo3`). +These tests are now unblocked — they need a real `ValueSource` +consumer (Phase 4's appendix synthesizer) to exercise the +`Invocation`-vs-`ValueSource` asymmetry that Plan 7's writer +implements. Until Phase 4 stamps `ValueSource` anchors on the +appendix synthesizer, the structural-only versions of these tests +remain in Plan 7's `quarto-source-map` test module (the `preimage_in +skips non-Invocation roles` unit test, lines 982-986 of Plan 7). 
+ + +- **`preimage_in` role-asymmetry unit test**: build + `Generated { by: By::appendix(AppendixSection::License), from: [ValueSource(meta_si)] }` + where `meta_si` is `Original { file_id: 0, start: 10, end: 25 }`. + Call `preimage_in(FileId(0))` and assert it returns `None` (NOT the + byte range of the meta-key — that would copy YAML into the body). + Pins the `Invocation` vs `ValueSource` asymmetry documented in + Plan 7 §`preimage_in` semantics. Lives in `quarto-source-map`'s + test module. + +- **Appendix-license end-to-end round-trip test**: build a project + fixture with frontmatter `license: MIT` and a synthesized + appendix (no user-written `:::{.appendix}` block). Run the full + q2-preview pipeline → write back to qmd. Assert: + - (a) no `license: MIT` bytes outside the YAML frontmatter range + (the meta YAML must not leak into the body); + - (b) output qmd is byte-identical to input qmd (round-trip + stability — the synthesized appendix Div is dropped from + output and re-synthesized next pipeline run). + + Covers the Phase 4 shape end-to-end. Belt-and-suspenders against + a future refactor that "leniently" tries `value_source_anchor()` + when `invocation_anchor()` returns None. + +- **Multi-inline dedupe-by-Invocation test**: build a Para with + three inlines each carrying + `Generated { by: shortcode("meta"), from: [Invocation -> token_si, ValueSource -> value_si] }` + (Phase 2 shape). Reconcile against an identical Para. Assert + Plan 7's writer emits the shortcode token bytes ONCE — confirms + dedupe consults `Invocation` only, not the full anchor list, and + doesn't mis-fire if ValueSource source_infos differ. + +- **Inline-level role-asymmetry test**: similar to the unit test + but at the inline level, e.g. a `Span` synthesized by some + metadata-aware transform with `[ValueSource only]`. Assert + `preimage_in` returns None at the inline level too. 
+ +#### Phase 6 — Plan 7 cross-reference cleanup + +- Reword Plan 7's §`Invocation` vs `ValueSource` consumer asymmetry + subsection (added by commit `6a2797b6`) to point at Plan 9's + Phase 4 as the canonical example, rather than asserting that the + shape "is stamped today." Small docs change; closes the + wording-vs-reality gap. +- Cross-link Plan 7's §Test plan to Phase 5's tests' new homes. + +### Out of scope (rationale per item) + +- **bd-36fr9 (Dispatch anchor for Lua filter / handler-shortcode)** — + Conceptually adjacent (another anchor role for diagnostic-only + attribution), but the precondition is *register Lua filter files in + `SourceContext` and assign them `FileId`s*, which touches the Lua + engine bridge, cache-key surface, and SourceContext interning. + Sized for its own plan: **Plan 10**. Plan 9 stays + metadata-flavored. + +- **bd-12vrr (callout default-title)** — Callout titles ("Note", + "Tip", "Warning") come from a static list, not from metadata. The + work needs `By::callout()` and an atomicity decision but doesn't + fit the "thread source-info from metadata" thesis. Standalone + follow-up — see bd-12vrr's comment on the popup-menu use case. + +- **bd-1inj0 (code-block decoration synthesizers)** — Filenames and + captions come from chunk options / Attr, not from `ConfigValue`. + Different threading path (`AttrSourceInfo`, currently broken at the + merge layer per bd-1e6a5 / bd-3aolj). Wait for those preexisting + `Attr` bugs to land before doing decoration ValueSource. Standalone + follow-up. + +- **bd-2mxo (MergedConfig::materialize() strips source_info)** — + Real P2 bug, but per the issue itself "Scalar values are preserved + correctly." Plan 9's consumers read scalar values (`license: "MIT"`, + `title: "Foo"`); the bug affects map and array container + source_info, which Plan 9 doesn't need at the leaf level. Stays as + a parallel P2 fix that doesn't block Plan 9. 
(See §Risk areas + for the one corner where map-shaped metadata interacts.) + +- **bd-z2j7o (`WithSourceInfo` wrapper audit)** — Phase 1's + threading work may surface a third or fourth ad-hoc `(value, + source_info)` pair. If so, that's evidence for the audit but Plan 9 + doesn't pre-decide the refactor. + +- **bd-hjv5o (source-location-driven path resolution)** — Different + problem: uses `SourceInfo` to *change behavior* (resolving paths + relative to declaration site), not to *stamp anchors*. + +- **Hub-client UI consumption of ValueSource anchors** (hover-preview + showing "this title came from `_quarto.yml:title`"). The + Rust-side correctness is independently verifiable via tests; the + hover-UX is a separate hub-client plan. + +- **Subtitle / description / date / image source_info on + DocumentProfile** — extend when a consumer needs them; this plan + ships title only (the only field the three nav sites consume). + +## Design decisions (settled) + +- **Per-section sub-Div appendix attribution (option A).** Each of + license, copyright, citation gets its own typed `By::appendix` + variant carrying its own `ValueSource`. Enables fine-grained + hover-attribution UX. Trade-off: more sub-Divs, but the + structural cost is small. + +- **`By::appendix(AppendixSection)` typed enum constructor.** Settled + over the alternatives (string-keyed `by.data`, `&'static str` + parameter) because the discriminator is load-bearing and a typed + enum is checked by the compiler. Adding new appendix-section + variants in the future is a deliberate enum change — the right + kind of friction. + +- **No backward-compat carve-out for `By::appendix`.** The shape + change is clean. Reasons (verified): + 1. No production callers today — only test sites in + `source_info.rs` itself. `transforms/appendix.rs` still emits + `SourceInfo::default()`; Plan 6 will add stamping after this + plan finalizes the constructor. + 2. 
`By` is workspace-internal Rust — no FFI, no extension SDK, + no TS-side mirror. The hub-client's TS hand-mirror is + `atomicCustomNodes` for `CustomNode` types, not `By` kinds. + 3. Wire format: `By` serializes to `{kind, data}` via serde. No + persisted artifact contains `By::appendix` today (Plan 6 + stamping hasn't shipped). No migration needed. + +- **Idiomatic API: `(inlines, source_info)` returned for caller to + wrap.** `config_value_to_inlines_with_provenance` does not stamp + `Generated` itself, because `by` varies by caller (meta-shortcode + vs. appendix sub-Div have different `By` kinds). Parallels how + other source-info helpers in this codebase work. + +- **`AnchorRole::Other` policy explicit (per user direction):** the + `preimage_in` walker walks `Invocation` only; **all other roles, + existing or future, are not consulted by the writer.** Documents + the intent so an extension introducing `AnchorRole::Other("preimage-source")` + knows it'll be ignored. Stated in the doc-comment on + `preimage_in` and re-asserted in §`Invocation` vs `ValueSource` + consumer asymmetry in Plan 7. + +- **`ValueSource` is wrapper-level for `PandocInlines`-shaped + metadata, not per-leaf.** Phase 2 attaches ValueSource on the + surrounding `Generated` (one wrapping each resolved inline), not + inside the rich-content inlines themselves. Two reasons: + (a) keeps Plan 7's multi-inline dedupe rule clean (it consults + Invocation, not anchors on inlines); + (b) maps the user mental model: "this shortcode resolution came + from there" — not "this individual Str came from there". + +- **Plan posture: research plan.** This document settles the API + shape (constructors, function signatures, enum variants); it does + not yet commit to the implementation order or unit-test names. + A subsequent review pass converts it to a development plan with + checklisted phases. 
+ +## API surface to settle (research-plan deliverables) + +By the time this plan converts to a development plan, the following +must be pinned: + +1. **`config_value_to_inlines_with_provenance` signature** — exact + return type, behavior for nil values, behavior for + `PandocInlines` (returns `(inlines, value.source_info.clone())`, + confirmed). Edge: `Concat`-shaped ConfigValue source_info — does + the consumer get the Concat or just the start range? Recommend + passing the full `source_info` regardless of shape; consumers + that need a single range can call `resolve_byte_range`. + +2. **`AppendixSection` enum variants** — `License`, `Copyright`, + `Citation` are the three sections `transforms/appendix.rs` knows + about today. If there are more synthesized sections (or planned + ones), enumerate them now. Verify against + `crates/quarto-core/src/transforms/appendix.rs:135-170`. + +3. **`By::appendix_container` (or equivalent) for the outer + wrapper** — name and signature. `By::appendix_container()` is + tentative; could also be `By::appendix(AppendixSection::Container)` + if treating "container" as a section variant feels right. Pick. + +4. **`DocumentProfile.title_source_info` field placement and + accessor surface** — direct field access (current convention) or + a typed accessor (`fn title_with_source(&self) -> Option<(&str, + &SourceInfo)>`)? + +5. **`AnchorRole::Other` doc-comment text** — exact wording of the + "future roles default to non-walked" policy. Lives on + `AnchorRole::Other` and on `SourceInfo::preimage_in`. + +## Open questions for implementation + +- **Granularity of `ValueSource` for nested `meta.license` shapes.** + YAML like `license: {name: MIT, url: ...}` produces a + `ConfigValueKind::Map`. bd-2mxo notes the merge step strips map + container source_info. 
Recommended approach for Phase 4: anchor + at the **first scalar leaf** (`name`) when the value is map-shaped, + falling back to the outer key when materialize has already + stripped the container. Notes the limitation; full fix waits for + bd-2mxo. + +- **Multi-anchor cost on Phase 2's two-anchor shape.** Every + meta-shortcode resolution gains a second anchor. Memory: 2 × + `Anchor` per inline. With `SmallVec<[Anchor; 2]>` already in place + (Plan 4), this stays on the stack. Verify no allocation regression + in a perf-sensitive document benchmark. + +- **Cross-reference test fixtures for Phase 4.** The + appendix-license e2e fixture needs to exercise the + YAML-meta-only form (not user-written `:::{.appendix}`). Phase 4 + needs to ensure the synthesizer fires only on the metadata path, + not on user-written appendix blocks. Confirm by reading + `appendix.rs:135-170` carefully during implementation. + +- **`PandocInlines`-shaped metadata behavior in Phase 2.** When + `title: "**Bold**"` resolves to `[Strong[Str], Space, Str]`, each + resolved inline gets a wrapping `Generated` with the ValueSource + on the wrapper. The Bold's children (Str) themselves carry their + own source_info (the parsed positions inside the YAML string). + Test: an edit to the resolved Bold inline goes through Plan 7's + soft-drop because the wrapper is atomic-kind (shortcode); the + user-edit is reverted with Q-3-42. Confirm. + +## References + +- `crates/quarto-pandoc-types/src/config_value.rs:155,170` — + `ConfigValue.source_info` and `ConfigMapEntry.key_source` (already + in place; Plan 9 just propagates them to consumers). +- `crates/quarto-core/src/transforms/shortcode_resolve.rs:148-167` — + `MetaShortcodeHandler::resolve` and `config_value_to_inlines`; + Phase 1/2's primary edit site. +- `crates/quarto-core/src/transforms/appendix.rs:135-260` — + `create_*_section` functions; Phase 4's edit site. 
+- `crates/quarto-core/src/document_profile.rs:271,487,529` — + `DocumentProfile` field declaration, Default impl, `extract` + helper; Phase 3's edit site (+ doc-contract Change Log). +- `crates/quarto-core/src/transforms/sidebar_generate.rs:228`, + `sidebar_auto.rs:311,318`, `navigation_enrich.rs:59` — Plan-6 + Phase-5 nav consumers; Phase 3's stamping sites. +- `crates/quarto-source-map/src/source_info.rs:91-118` — + `AnchorRole` enum (`Invocation`, `ValueSource`, `Other`); + Phase 1 adds `AppendixSection` here. +- `crates/quarto-source-map/src/source_info.rs:529` — `By::appendix` + constructor; Phase 4 modifies (signature change). +- Plan 6 §"ValueSource follow-up" (line 509-547) — Plan 9's + scope-pickup point. +- Plan 7 §`Invocation` vs `ValueSource` consumer asymmetry + (added by commit `6a2797b6`, not yet on `feature/provenance`) + — Plan 9 Phase 5 lands the tests; Phase 6 cleans up wording. +- bd-129m3 (closes), bd-8pmq3 (closes). + +## Test plan + +(See Phase 5 above for Plan-7-deferred tests.) Additional unit / +integration tests by phase: + +- **Phase 1**: `config_value_to_inlines_with_provenance` unit tests + for scalar, bool, int, `PandocInlines`, `PandocBlocks` (rejection + in inline context), missing key (None returned), nested via + `get_nested`. + +- **Phase 2**: meta-shortcode resolver produces two-anchor shape; + `Invocation` source_info matches the token range; `ValueSource` + source_info matches the metadata-key value range. `var` shortcode + symmetrically. Test with both flat-string and PandocInlines + metadata values. + +- **Phase 3**: each of the three nav consumer sites produces + `Generated` with `from: [ValueSource(profile.title_source_info)]` + when title is present; produces `from: []` when title is None. + Fixture extends Plan 6's multi-page audit-completion test. 
+ +- **Phase 4**: each per-section sub-Div carries its own ValueSource; + missing-key cases gracefully degrade (no Div, no panic); + outer-container Div carries `Generated { by: + By::appendix_container(), from: [] }`. Audit-completion test + (Plan 6) extended. + +- **Phase 5**: see Phase 5 description above. + +## Dependencies + +### Hard dependencies + +- **Plan 6** — establishes `Generated` stamping convention; Plan 9 + builds the consumer wiring on top. Plan 6 stamps with `from: []`; + Plan 9 enriches to `from: [ValueSource]` (Phases 3 and 4) or + `from: [Invocation, ValueSource]` (Phase 2). +- **Plan 4** — `AnchorRole::ValueSource` already exists; this plan + consumes it. + +### Soft dependencies + +- **Plan 7** — Phase 5's appendix-license e2e round-trip test needs + Plan 7's writer + soft-drop infrastructure. The unit-level + asymmetry test (Phase 5 first bullet) doesn't. +- **bd-2mxo** — affects map/array container source_info; relevant + only for nested metadata shapes (`license: {name: MIT, ...}`). + Workaround in Phase 4 lets Plan 9 ship without bd-2mxo. + +### Blocks + +- Future hub-client hover-attribution UX work (separate plan, not + yet scoped). + +### Does not block + +- **Plan 7 implementation** can start without Plan 9 — Plan 7 ships + without ValueSource anywhere; its `Invocation` vs `ValueSource` + asymmetry section is forward-looking. Plan 9 Phase 6 retroactively + cleans up Plan 7's wording. + +## Risk areas + +- **API shape churn between Phases 1, 2, 4.** All three depend on + the `config_value_to_inlines_with_provenance` decision. If the + API shape changes mid-implementation, all three phases revisit. + Mitigation: settle the API as part of this research plan (above); + the development plan starts with the API frozen. + +- **Map-shaped metadata interaction with bd-2mxo.** Phase 4's + "first scalar leaf" workaround degrades gracefully but produces a + less-precise ValueSource for nested licenses. 
Acceptable for v1; + bd-2mxo's fix tightens later. Document as a known limitation in + the `By::appendix` doc-comment. + +- **Two-anchor cost in Phase 2.** Every meta-shortcode resolution + gains a second anchor. `SmallVec<[Anchor; 2]>` keeps it on the + stack. Add a perf-sensitivity check during implementation if a + document heavy in meta-shortcodes regresses. + +- **Forgetting `AnchorRole::Other` policy in extensions.** A future + extension that adds `Other("attribution-source")` and expects + `preimage_in` to walk it would silently be ignored. Mitigation: + the policy is doc-commented at multiple sites; reviewers catch + the case if it comes up. + +## Estimated scope + +| Phase | Lines (rough) | +|---|---| +| 1: Infrastructure (`config_value_to_inlines_with_provenance` + `DocumentProfile.title_source_info` + `AppendixSection` enum) | ~150 | +| 2: Meta/var shortcode (bd-129m3) | ~80 | +| 3: Nav-text ValueSource (bd-8pmq3) | ~60 | +| 4: Appendix sub-Div ValueSource | ~180 | +| 5: Plan-7 invariant tests | ~120 | +| 6: Plan 7 docs reword | ~20 | +| Tests across phases | ~250 | +| **Total** | **~860** | + +One focused session, possibly two if Phase 4's per-section +discrimination surfaces unexpected interactions. Comparable scope to +Plan 6. + +## Notes + +This plan is the "consumer wiring" half of the provenance epic. Plan 6 +stamped the *identity* (`by`) on synthesizers; Plan 9 stamps the +*origin* (`ValueSource` in `from`) on the metadata-derived subset. +Together they make every pipeline-produced metadata-derived node +fully attributable. + +Future plans in the same family: +- **Plan 10** — Dispatch anchor for Lua filter / handler-shortcode + (closes bd-36fr9). Requires Lua-file registration in SourceContext. +- **bd-12vrr** and **bd-1inj0** — standalone follow-ups for + synthesizers whose source isn't metadata-shaped. 
+ +### File naming convention + +This is the first plan to use the `provenance-plan-N-.md` +naming convention (dropping the `q2-preview-` prefix). The +provenance epic has outgrown the original q2-preview framing — it +serves attribution, round-trip writing, error reporting, and (via +the Dispatch role in Plan 10) Lua-source pointing. Plans 3–8 keep +their existing q2-preview filenames for git-history continuity; +plans 9+ adopt the new convention. diff --git a/claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md b/claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md new file mode 100644 index 000000000..095feb422 --- /dev/null +++ b/claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md @@ -0,0 +1,511 @@ +# Plan 7b — Provenance test-o-rama (Plan 7 deferred-tests consolidation) + +**Date:** 2026-05-24 +**Branch:** feature/provenance (or fresh worktree branched from it) +**Status:** Implementation plan +**Milestone:** none directly — quality bar for M3 (Plan 7 already + shipped the functionality); intended to run **before** Plan 7a so + the writer's round-trip contract is fully exercised before + layering runtime idempotence detection on top. + +## Epic context + +Part of the **provenance epic** (Plans 3–8 + 7a + this). Plan 7 +landed the incremental writer + bridge + consumer migrations on +`feature/provenance` (2026-05-24 session). During that session, +four test items were deferred — three Rust-side unit tests where +the plan author hedged about fixture construction cost, and the +Playwright e2e scenario matrix that was scoped out for context +budget. Plan 7b consolidates all four into a single deliberate +testing pass so the deferrals don't decay into permanent gaps. + +Plan 7a (runtime user-filter idempotence detection + `Q-3-44`) +layers on top of Plan 7's writer contract. Running Plan 7b first +means Plan 7a's contract surface is fully pinned before new +detection mechanisms get added. + +## Hand-off start point + +1. 
Worktree: `feature/provenance` at + `/Users/gordon/src/q2/.worktrees/provenance/` (or create a fresh + one branched off it — full `cargo xtask verify` is green at + `2f91ee0e`). +2. `claude-notes/plans/CURRENT.md` points here. +3. **Start with Phase 1** — Rust unit tests, ~120 LOC, self-contained. + It's the cheapest and derisks the Playwright phases. Don't open + Phase 2 or 3 until Phase 1 is green. +4. The work is *test coverage*, not new behaviour. Each Plan 7b + test confirms an invariant Plan 7 already implements; an + unexpected test failure is a regression — file a beads ticket + and don't change Plan 7's code without escalating. +5. Don't push without explicit user permission. + +## Goal + +Bring Plan 7's invariants under full delivery coverage: + +1. **Rust unit tests** — close three gaps in + `crates/pampa/src/writers/incremental.rs`'s coarsen test module. +2. **Playwright e2e specs** — the ten scenarios in `bd-3izo3`, + spread across `hub-client/e2e/` and `q2-preview-spa/e2e/`, so + the write-back path is exercised through a real browser session. +3. **Cleanup** — close `bd-3izo3` once the e2e matrix lands; flip + Plan 7's three deferred checkboxes to done. + +No new product code. No new diagnostics. No new design surface. +Pure test coverage. + +## Scope + +### In scope + +#### Phase 1 — Rust unit tests (`crates/pampa/src/writers/incremental.rs`) + +**Repo facts the implementer needs:** + +- Test module lives at the bottom of + `crates/pampa/src/writers/incremental.rs` (search for + `#[cfg(test)]`). Mirror the existing tests' style and import set. +- Reference test to model after: + `keep_before_with_atomic_kind_generated_no_anchor_emits_omit` + (around line 1589 — search for the function name, line numbers + drift). It builds a `Generated { by: filter(...), from: [] }`, + a one-block AST, a `ReconciliationPlan { block_alignments: + vec![BlockAlignment::KeepBefore(0)], .. 
}`, calls `coarsen(qmd, + ast, ast, &plan, &mut warnings)`, asserts the entry shape. +- Soft-drop reference tests (asserting Q-3-42 / Q-3-43 codes): + search the same file for `warnings[0].code.as_deref(), Some("Q-3-42")` + and `Q-3-43`; ~6 sibling tests cover the existing alignment + paths. +- `compute_blocks_hash_fresh` + `compute_meta_hash_fresh_excluding_rendered` + live in `crates/quarto-ast-reconcile/src/hash.rs`; the + Plan-3 idempotence tests at `crates/quarto-core/tests/idempotence.rs` + use them — that's the pattern for the writer-lossless test. +- Run `cargo nextest run -p pampa coarsen` after each new test; + full pampa suite when the phase is ready. + +All three follow the existing test-module conventions in that +file. Each is self-contained, no fixtures outside the test module. + +- **Writer-lossless baseline test for Plan-6/7 shapes**. The + existing `parse(write(ast))` integration tests cover the common + Original-SI shapes but not Generated / atomic-CustomNode / + synthesized-footnotes-container shapes (those don't appear from + raw qmd parse). Construct each shape directly via + `SourceInfo::generated(...)` + manual node builders, write it, + parse, then assert `compute_blocks_hash_fresh` + the + `_excluding_rendered` meta variant match. Lives in + `quarto-source-map`'s test module (or alongside Plan 6's writer + tests). One assertion per shape: + - `Generated { by: shortcode("meta"), from: [Invocation] }` + (atomic resolution) + - `Generated { by: filter(...), from: [] }` (no Invocation — + atomic-kind drop) + - `Generated { by: sectionize, from: [] }` with source-bearing + children (non-atomic wrapper — Transparent walk) + - Atomic CustomNode container (no preimage) + + **Transparent-wrapper-at-top fixture (added 2026-05-25 from the + closure-gap audit).** The shapes above all assume the wrapper is + *not* `blocks[0]`. 
Add one more fixture where the sectionize + wrapper IS `blocks[0]` (the production q2-preview shape), with + the user's real content nested inside, AND with an inline edit + to a wrapped child. This exercises the descent helpers + (`derive_target_file_id`, `first_target_anchored_start_in`, + `coarsen`'s RecurseIntoContainer Transparent-recursion arm — the + three sites the closure-gap fixes touched). The existing test + `sectionize_wrapper_preserves_frontmatter_after_inner_edit` in + `crates/pampa/tests/incremental_writer_tests.rs` is the + reference shape; extend it into a writer-lossless variant if + Plan 7b lands a separate test module. See + [`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md) + for the underlying invariant. + +- **Soft-drop interaction test (compound case)**. Build a Para + with two inlines: one atomic-resolved (Generated by shortcode, + Invocation in target) and one editable (Original). New AST + mutates *both*: the atomic resolution to a different + `Str("new")` and the editable inline's text. Assert: + - non-atomic edit lands in output qmd, + - atomic edit becomes `Q-3-42`, + - emitted qmd preserves the shortcode token bytes verbatim, + - exactly one warning emitted. + +- **Filter-construction soft-drop test (UseAfter path)**. Mirror + of `keep_before_with_atomic_kind_generated_no_anchor_emits_omit` + but exercising the *edit* path: original inline is `Generated { + by: filter("emoji.lua", 14), from: [] }`; new inline is a + different `Str`. Build a single-Para AST per side, build a + `ReconciliationPlan` with `BlockAlignment::RecurseIntoContainer + { container: 0, inline_alignments: vec![InlineAlignment::UseAfter(0)] }`, + call `coarsen`, assert `Omit` + `Q-3-43`. ~40 LOC; structurally + identical to the sibling KeepBefore test at line 1589. + +- **Idempotence: write-twice-byte-equal**. The existing tests + pin a single round-trip (`parse(write(ast)) == ast`). 
They do + *not* pin that running the writer on its own output produces + the same bytes again. This is load-bearing for sectionized and + footnotes-container documents because `compute_separator` + silently drops its "use the original gap" optimization across + `Transparent` children: child entries carry `orig_idx: None` + (see `incremental.rs`'s `clear_orig_idx_for_transparent_child`), + so the second write recomputes separators on standard-newline + fallback. The result is still semantically correct; the + question is whether it's *byte-identical* to the first write. + + Three fixtures, each tested under + `tests/incremental_idempotence.rs`: + + 1. **Sectionized document.** Original qmd: a doc with at least + two headings so `SectionizeTransform` synthesizes two + section-Divs. Pass through `parse → transform → write` + to get `qmd_1`. Repeat: parse `qmd_1` → transform → + write → `qmd_2`. Assert `qmd_1 == qmd_2`. + 2. **Footnotes container document.** Doc with at least one + footnote so the footnotes-container Div synthesizes. + Same two-pass assertion. + 3. **Adjacent Transparent wrappers.** Doc whose first two + blocks both synthesize as Transparent (sectionize at + blocks[0], footnotes-container at blocks[1]). Exercises + the case where two consecutive top-level entries are both + `Transparent`, making the separator-between-blocks question + non-trivial. + + Test driver pattern: use `pampa::parse_qmd_to_ast` for parse, + the existing q2-preview pipeline for transform, and + `incremental_write(qmd, ast, ast, trivial_plan)` for write + (where `trivial_plan` is the all-KeepBefore plan against the + same AST on both sides). Trivial plan ensures we're testing the + writer's *idempotence*, not the reconciler's behaviour. + + ~120 LOC, including the test helper for building a trivial + reconciliation plan from an AST. 
+ +- **Cross-file `Original` Rewrite catch-all coverage.** Today's + test `keep_before_cross_file_original_falls_back_to_rewrite` + (`incremental.rs:1687`) exercises the catch-all minimally — a + hand-constructed `Original` pointing at a different file, no + children. Add richer fixtures so the catch-all path that Plan 8 + will land on top of has explicit pre-Plan-8 coverage. + + **Why this matters structurally** (added 2026-05-25): the + catch-all is now the only producer of `Rewrite` entries reached + via `coarsen_keep_before_block`. After the + [`CoarsenedEntry` self-contained refactor](./2026-05-25-coarsened-entry-self-contained.md) + it pre-computes block text at coarsen time — that change closed + a latent panic in the Transparent-recursion path, so the + writer-lossless baseline below should include at least one + fixture that fires this catch-all *inside* a Transparent wrapper + (e.g. a sectionize Div containing a cross-file `Original` child). + Without that fixture, regressions in the self-contained invariant + pass tests but break q2-preview. + + 1. **Substring rooted outside target.** `Substring` whose + `parent` is `Original` in a non-target file. `preimage_in` + walks the parent → returns `None`. Assert Rewrite catch-all. + 2. **Concat with one piece outside target.** `Concat` of two + pieces, one in target and one in a non-target file. + `preimage_in` requires *every* piece to resolve → + `None`. Assert Rewrite catch-all. + 3. **Paragraph with mixed-file inline content.** Para whose + `source_info` is `Original` in target but whose inlines have + `Original` in a non-target file (no wrapper). Coarsen + classifies the Para as Verbatim at the block level + (preimage in target), so the inline mixing doesn't trip the + catch-all — but the test still pins that current behaviour + and documents what Plan 8 will change. + + ~80 LOC across three small tests; each ~25 LOC. Place + alongside `keep_before_cross_file_original_falls_back_to_rewrite`. 
+ +#### Phase 2 — Hub-client Playwright specs (`hub-client/e2e/`) + +**Repo facts the implementer needs:** + +- Existing specs to model on: `hub-client/e2e/smoke.spec.ts`, + `hub-client/e2e/preview-extraction.spec.ts`, + `hub-client/e2e/q2-debug-render-components.spec.ts`. +- Config: `hub-client/playwright.config.ts`. CI gating: these run + under `cargo xtask verify --e2e`; they are NOT in the default + verify pipeline (per `playwright.config.ts` design — keep it + that way). +- The hub-client preview uses Monaco; AST mutations happen via + iframe `setAst` (now wired through Plan 7's `handleSetAst`). + Drive edits by sending postMessage from the spec, or by + programmatically calling `setAst` on the preview iframe. +- Soft-drop diagnostics surface in the existing hub-client + diagnostics panel (Plan 7 commit `a0a4c7c8` plumbed warnings + into `pendingWriteWarningsRef` → `onDiagnosticsChange`). +- Saved qmd is observable from the spec via the project's file + system (Playwright tests spawn a real hub against a tempdir + fixture project). + +Five specs. Each spawns the hub, opens a fixture project, performs +an AST mutation through the live preview, and reads the round-tripped +qmd back from disk (or via `getFileContent`) to assert the +preservation contract. Each spec is ~60 LOC including fixture setup. + +- **sectionized doc + edit paragraph**. Fixture has H1 + H2 + + multiple Paras. Edit one paragraph through the preview. Assert + saved qmd preserves heading structure verbatim and contains the + edited text. +- **single-inline shortcode + edit different paragraph**. Fixture + has `{{< meta title >}}` inline in Para 1; edit Para 2. Assert + Para 1's qmd is byte-identical to the original (token preserved). +- **multi-inline shortcode + edit different inline in same Para**. + Two `{{< meta title >}}` inlines in Para 1; an editable inline + also in Para 1; edit only the editable inline.
Assert the + shortcode token appears exactly once at each position (dedupe + by-Invocation) and the editable inline reflects the change. +- **edit resolved shortcode title → Q-3-42 + byte-equal no-op**. + Fixture has resolved shortcode title. Edit the rendered title + through the preview. Assert: `Q-3-42` surfaces in the + diagnostics panel, saved qmd is byte-equal to original. +- **edit inside synthesized footnotes container → Q-3-43 + + container regenerates**. Fixture has footnote refs in body that + produce a synthesized appendix footnotes Div. Edit a Str inside + the synthesized container. Assert: `Q-3-43` surfaces, the saved + qmd doesn't carry the synthesized container text, the next + render re-synthesizes the container. + +#### Phase 3 — SPA Playwright specs (`q2-preview-spa/e2e/`) + +**Repo facts the implementer needs:** + +- Server-spawn helper: `q2-preview-spa/e2e/helpers/previewServer.ts` + (`startPreviewServer({ fixtureFiles: [...] })`). Each test + spawns its own server against a tempdir; tests run in parallel. +- Reference spec: `q2-preview-spa/e2e/basic-preview.spec.ts` — + full pattern: server lifecycle, `waitForInnerHeading` polling + the iframe DOM, edit-then-rerender flow. +- Render-counter gauge: `window.__renderTicks` (Phase D.3 / + bd-0mji per `PreviewApp.tsx`). The echo-prevention spec depends + on it specifically; assert it bumps exactly once per edit. +- Soft-drop warnings surface in `DiagnosticStrip` (Plan 7 commit + `20f4b0ff`) — a small overlay anchored bottom-right of the + preview pane. Assert by querying for the strip's text content. +- SPA write-back path: `handleSetAst` in `q2-preview-spa/src/PreviewApp.tsx` + → `incrementalWriteQmd(originalQmd, baselineAst, newAst)` → + `updateFileContent(path, qmd)`. Echo-prevention via FNV-1a hash + in `lastEmittedRef`. The hash function (`fnv1aHex`) lives in + the same file. +- Driving edits from the spec: send a postMessage with + `{ type: 'SET_AST', ast: ... 
}` into the iframe (whatever + channel the existing iframe handler reads). + +Five specs. Use the existing `startPreviewServer` pattern. Each ~60 LOC. + +- **project + edit paragraph round-trip**. Multi-file project, + active page edited via the iframe's `setAst` postMessage. Assert + the edited qmd lands on disk + `__renderTicks` bumped exactly + once (no echo loop). +- **single-file mode + edit paragraph round-trip** (single-file + mode is `bd-tnm3k`). Same shape, single-file fixture. +- **edit shortcode → Q-3-42 in DiagnosticStrip**. Programmatically + drive `setAst` to mutate a shortcode-resolved inline. Assert + `DiagnosticStrip` mounts and shows the Q-3-42 entry; saved qmd + byte-equal. +- **mixed atomic + non-atomic edit**. Fixture has both an + atomic-resolved inline and an editable inline in the same Para. + setAst mutates both. Assert non-atomic edit lands on disk + + Q-3-42 in the strip + atomic source unchanged. +- **content-match echo-prevention fixture**. Fixture: edit a Para + through setAst. Assert exactly one `__renderTicks` bump (the + emitted qmd's FNV-1a hash matches the echoed `onFileContent`, + the ref is consumed, no re-render). Also: write a *different* + file in the same project while the echo is pending — assert that + one *does* trigger a re-render (the gate is per-(path, hash), + not global). Pins the `lastEmittedRef` contract. + +#### Phase 4 — Cleanup + +- Close `bd-3izo3` with reason "landed via Plan 7b Phase 2 + 3". +- Flip the three Phase-2 deferred checkboxes in Plan 7 to `[x]` + with one-line references to the Plan-7b commits. +- Update `hub-client/changelog.md` with a single 7b entry. + +### Out of scope + +- **Item #5 from the Plan 7 review (manual smokes)** — the user + is running these by hand. Not a Claude-driven test. +- **Plan 7a's runtime idempotence detection** — that's a separate + plan with its own test surface. 
+- **Plan 9's `preimage_in` role-asymmetry e2e test** — owned by + Plan 9 Phase 5; depends on a real `ValueSource` consumer that + doesn't exist until Plan 9 lands. +- **Refactor of any test infrastructure** — use the patterns + already established in `hub-client/e2e/` and + `q2-preview-spa/e2e/`; don't introduce a new harness. + +## Design decisions (settled) + +- **Three phases, separable commits.** Each phase ships as one + commit (or one per spec if a phase author wants more granularity). + Phase 1 is the cheapest and most self-contained; ship it first + to derisk the rest. + +- **No new fixtures library.** Each spec inlines its qmd fixture + in the test file. Plan 7b's tests are self-documenting via the + literal fixtures right next to the assertions. A shared fixture + library would be premature abstraction. + +- **`__renderTicks` is the SPA's truth gauge.** The render counter + on `window.__renderTicks` already exists (Phase D.3 / bd-0mji + per `PreviewApp.tsx` comments). The echo-prevention spec relies + on it specifically. If a spec needs counts at a different layer + (e.g. samod patches received), add a new gauge — don't reuse + `__renderTicks` for two distinct meanings. + +- **E2e gating stays `--e2e` on CI.** Per `q2-preview-spa/playwright.config.ts`'s + existing convention; specs run under `cargo xtask verify --e2e` + but not the default verify pipeline. Plan 7b doesn't change this. + +## Test plan + +This *is* the test plan. Each Phase-1 entry asserts a Rust-side +invariant; each Phase-2/3 entry asserts an end-to-end delivery +invariant. The phase descriptions above name the exact assertion +per spec. + +TDD order within each phase: + +- Phase 1: write the test, run it, watch it fail (it won't — + Phase 1's mechanics are already shipped; the test confirms + coverage, not new behavior). If a test unexpectedly *fails*, + that's a regression in Plan 7's writer and gets a beads issue + before the test lands. 
+- Phase 2/3: same — Plan 7's surfaces already produce the right + end-to-end behavior; these specs pin it. Unexpected failures + block the spec and surface as regression beads. + +## Dependencies + +- **Plan 7 (shipped, commit `4ee51e4a` on `feature/provenance`).** + The writer + bridge + consumer migrations + manual smoke are + the *unit under test* for this plan. +- **`bd-3izo3`** — already-filed beads for the Playwright matrix. + Plan 7b consolidates and replaces it; close on landing Phase 2+3. + +No upstream dependencies — every contract Plan 7b exercises is +already in `feature/provenance`. + +## Risk areas + +- **Browser-flake on the SPA echo-prevention spec.** samod's + echo latency varies; the assertion "exactly one + `__renderTicks` bump after an edit" needs a generous timeout + + a poll-stable-for-Nms shape. Use the same harness pattern + as `basic-preview.spec.ts`'s 30s `waitForFunction`. + +- **Synthesized-footnotes Q-3-43 fixture is fragile.** The footnotes + container is generated by an AppendixTransform that runs late + in the pipeline; the fixture needs at least one `[^1]` ref in + the body. Confirm via `q2 render` before relying on the shape + in a test. + +- **Multi-inline dedupe spec depends on shortcode reuse.** The + fixture needs two `{{< meta title >}}` invocations in the same + Para; assert dedupe operates on `Invocation` source-info + identity, not full anchor list. If the spec doesn't catch a + regression because the dedupe condition is over-eager, add a + second spec where two shortcodes have different + `value_source` anchors but identical `invocation` anchors and + assert dedupe still fires (forward-compat with Plan 9). 
+ +## Estimated scope + +| Component | LOC (rough) | +|------------------------------------------------------------------------|-------------| +| Phase 1 — 3 Rust unit tests + helpers | ~120 | +| Phase 1 — idempotence (write-twice) tests, 3 fixtures + plan helper | ~120 | +| Phase 1 — cross-file Original Rewrite catch-all, 3 small tests | ~80 | +| Phase 2 — 5 hub-client Playwright specs | ~300 | +| Phase 3 — 5 SPA Playwright specs | ~300 | +| Phase 4 — cleanup (checkboxes + changelog) | ~10 | +| **Total** | **~930** | + +Time estimate: one focused session per phase (Phase 1 a few hours, +Phase 2 and 3 a day each). Total ~3 days of work, parallelizable +across phases. + +## Implementation checklist + +### Phase 1 — Rust unit tests + +- [ ] Writer-lossless baseline fixture test for Generated / + atomic-CustomNode / synthesized-footnotes shapes — one + sub-test per shape; assert + `compute_blocks_hash_fresh(parse(write(ast))) == compute_blocks_hash_fresh(ast)`. +- [ ] Soft-drop interaction test: Para with one atomic-resolved + inline + one editable inline; new AST mutates both; assert + non-atomic edit lands + exactly one Q-3-42 + atomic source + preserved. +- [ ] Filter-construction soft-drop UseAfter test in + `crates/pampa/src/writers/incremental.rs` test module; + mirror of `keep_before_with_atomic_kind_generated_no_anchor_emits_omit` + but via `RecurseIntoContainer` + `InlineAlignment::UseAfter`; + assert `Omit` + `Q-3-43`. +- [ ] Idempotence (write-twice-byte-equal) tests: + `idempotence_sectionized_document`, + `idempotence_footnotes_container`, + `idempotence_adjacent_transparent_wrappers`. Test file + `crates/pampa/tests/incremental_idempotence.rs`. Each: + `qmd → ast → write1; parse(write1) → ast' → write2; + assert write1 == write2`. Use a trivial all-`KeepBefore` + plan against the same AST on both sides so we're testing + the writer's idempotence, not the reconciler's. 
+- [ ] Cross-file `Original` Rewrite catch-all tests: + `keep_before_substring_outside_target_falls_back_to_rewrite`, + `keep_before_concat_with_outside_target_piece_falls_back_to_rewrite`, + `para_with_mixed_file_inline_content_keeps_block_verbatim`. + Place alongside the existing + `keep_before_cross_file_original_falls_back_to_rewrite` + (`incremental.rs:1687`). Each ~25 LOC. +- [ ] `cargo nextest run -p pampa` green (incremental writer + tests pass; no regression elsewhere). +- [ ] `cargo xtask verify --skip-hub-build` green. + +### Phase 2 — Hub-client Playwright specs + +- [ ] `hub-client/e2e/plan7-sectionized-edit.spec.ts` +- [ ] `hub-client/e2e/plan7-shortcode-preserve.spec.ts` +- [ ] `hub-client/e2e/plan7-multi-shortcode-dedupe.spec.ts` +- [ ] `hub-client/e2e/plan7-q342-shortcode-edit.spec.ts` +- [ ] `hub-client/e2e/plan7-q343-footnotes-regenerate.spec.ts` +- [ ] `cargo xtask verify --e2e` green on the hub-client leg. + +### Phase 3 — SPA Playwright specs + +- [ ] `q2-preview-spa/e2e/plan7-project-edit.spec.ts` +- [ ] `q2-preview-spa/e2e/plan7-single-file-edit.spec.ts` +- [ ] `q2-preview-spa/e2e/plan7-q342-diagnostic-strip.spec.ts` +- [ ] `q2-preview-spa/e2e/plan7-mixed-atomic-nonatomic.spec.ts` +- [ ] `q2-preview-spa/e2e/plan7-echo-prevention.spec.ts` +- [ ] `cargo xtask verify --e2e` green on the SPA leg. + +### Phase 4 — Cleanup + +- [ ] `br close bd-3izo3 --reason "landed via Plan 7b Phase 2+3"` +- [ ] Plan 7 (`claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md`): + flip the three Phase-2 deferred checkboxes at lines 1282 / 1287 / 1288 + to `[x]` with a one-line reference to the Plan 7b commit. +- [ ] `hub-client/changelog.md` entry: one line under the + Phase-4 commit date noting "Plan 7's incremental-write + round-trip is now under e2e coverage (sectionized docs, + shortcode preservation, soft-drop diagnostics, echo + prevention)." + +## Notes + +This is the "boring but important" plan. 
Nothing here is research; +everything is a follow-through. The temptation is to skip it +because Plan 7 already works. Don't — the deferrals were honest +scope calls, not impossibility claims, and leaving them open +means the round-trip contract has only Rust-unit and one +JS-wrapper test pinning it. A regression in Plan 7a (runtime +idempotence detection) or Plan 9 (ValueSource threading) could +silently break the e2e behaviour and tests would still pass. + +Running 7b before 7a means Plan 7a starts with a fully-pinned +baseline and any new failures in 7a's CI runs point clearly at +7a's changes, not at latent gaps. diff --git a/claude-notes/plans/2026-05-25-coarsened-entry-self-contained.md b/claude-notes/plans/2026-05-25-coarsened-entry-self-contained.md new file mode 100644 index 000000000..9477cd450 --- /dev/null +++ b/claude-notes/plans/2026-05-25-coarsened-entry-self-contained.md @@ -0,0 +1,380 @@ +# Plan — Make `CoarsenedEntry::Rewrite` self-contained + +**Status:** Drafted 2026-05-25. +**Branch:** `feature/provenance`. +**Trigger:** Panic discovered 2026-05-25 during the q2-preview gate-bypass +UX experiment (see §History below). Index-out-of-bounds in +`emit_entries` when a `Rewrite` entry produced inside the Transparent +recursion (added in commit `bdcfdc53`) carried a child-relative +`new_idx` but was looked up against the top-level `new_ast.blocks`. + +## Goal + +Lift the existing implicit invariant — *every `CoarsenedEntry` variant +must be self-contained (carry its own emit-time bytes)* — to be an +explicit, enforced architectural rule. Today four of the five variants +already satisfy this: + +| Variant | Self-contained? 
| How | +|---|---|---| +| `Verbatim` | ✓ | `byte_range` into `original_qmd` | +| `InlineSplice` | ✓ | pre-computed `block_text: String` | +| `Transparent` | ✓ | list of self-contained child entries | +| `Omit` | ✓ | emits nothing | +| `Rewrite` | ✗ | `new_idx: usize` — a deferred index into `new_ast.blocks` | + +`Rewrite` is the outlier. Make it match its siblings by carrying +pre-computed `block_text: String`. The qmd writer call moves from +emit time (`assemble`) to coarsen time (`coarsen`); the work is the +same, the timing changes; the entry becomes self-describing. + +Behaviour does not change. Tests stay green. The Transparent-recursion +panic disappears. + +## History — why was `Rewrite` written context-dependently? + +`git log -S CoarsenedEntry -- crates/pampa/src/writers/incremental.rs` +(top of file, latest 4 entries): + +1. **`eb81cbc5`** ("Add incremental QMD writer with idempotence and + round-trip tests") — original commit. `CoarsenedEntry` had **two** + variants: + ```rust + enum CoarsenedEntry { + Verbatim { byte_range: Range<usize>, orig_idx: usize }, + Rewrite { new_idx: usize }, + } + ``` + The writer was top-level only. Every entry corresponded directly + to one top-level block. `Verbatim` carried its own bytes; `Rewrite` + deferred to `assemble`-time via an index into `new_ast.blocks` — + correct because indices were unambiguous and the deferral saved + a `write_block_to_string` call when the entry was never emitted + (defensive). Behaviour invariant: `new_idx` is always a *top-level* + index. Honoured by construction at this point. + +2. **`ab10f37b`** ("Implement inline splicing for incremental writer + (bd-1hwd)") — added `InlineSplice` variant for partial block + rewrites: + ```rust + InlineSplice { block_text: String, orig_idx: usize } + ``` + Inline splicing builds *bespoke* block text by mixing original + bytes with newly-serialized inlines.
There's no `new_idx` that + would reconstruct it — the text is necessarily pre-computed at + coarsen time. This was the first variant to break the "defer to + emit time" pattern, **out of necessity**, but no one refactored + `Rewrite` to match. The asymmetry was introduced silently. + +3. **`9a473fe9`** ("plan-7 phase 2+3a: writer internals — soft-drop, + Transparent/Omit, multi-inline dedupe") — Plan 7 added the + `Transparent` and `Omit` variants. `Transparent { child_entries }` + allows recursive emission for non-atomic Generated wrappers + (sectionize, footnotes, appendix). `orig_idx` became `Option<usize>` + so children inside `Transparent` could opt out of the + `compute_separator` original-gap optimization. The commit + **explicitly flagged** the latent Rewrite issue: + + > // result_idx is unused for child Rewrites (a child Rewrite + > // would need a different lookup mechanism; not exercised by + > // today's synthesizers). + + Accurate at the time — coarsen_keep_before_block was the only + producer of child entries (under static Transparent recursion + for unchanged wrappers), and its catch-all hit Rewrite only on + cross-file Original / gappy Concat / Generated-without-source-bearing-children + shapes that the pipeline didn't produce in practice. + +4. **`bdcfdc53`** ("recurse into non-atomic Generated wrappers in + RecurseIntoContainer") — *this PR's* fix from earlier today. + Added a Transparent-recursion path in `coarsen_blocks` for the + *changed-wrapper* case (RecurseIntoContainer with a + `block_container_plans` entry). For the first time, **`coarsen_blocks` + runs on child slices**, and any `Rewrite` it produces carries a + child-relative index. The "not exercised by today's synthesizers" + caveat from `9a473fe9` no longer holds. + +The takeaway: `Rewrite`'s context-dependent design was a vestige of +the original Phase-1 top-level-only writer. 
It survived because every +expansion since (`InlineSplice`, then `Transparent`) sidestepped it +rather than refactoring. Today's panic is the bill coming due. + +## Behavioural equivalence — coarsen-time vs emit-time + +**Question:** does pre-computing `block_text` at coarsen time produce +byte-identical output to deferred emit-time computation? + +**Answer:** yes. `write_block_to_string` +(`crates/pampa/src/writers/incremental.rs:1089`) is a pure function of +its `Block` argument: + +```rust +fn write_block_to_string(block: &Block) -> Result { + let mut buf = Vec::new(); + qmd::write_single_block(block, &mut buf)?; + String::from_utf8(buf).map_err(…) +} +``` + +`qmd::write_single_block` (`writers/qmd.rs:2392`) constructs a fresh +`QmdWriterContext::new()` per call. The context's mutable fields +(`emphasis_stack`, `prev_emitted_alnum`) accumulate state only +**within** a single `write_single_block` invocation — they're created, +used, and dropped per call. No state leaks across calls. + +There is no global state in `crates/pampa/src/writers/qmd.rs` (verified +by `git grep 'static\|thread_local' crates/pampa/src/writers/qmd.rs` +returning empty). No file I/O, no environment reads, no system clock. +The function depends only on the input `Block`. + +Therefore: `write_block_to_string(b)` is referentially transparent. +Calling it at coarsen time vs emit time produces identical output. + +Performance: `Rewrite` is the catch-all path — when we get an entry we +*always* emit. No coarsened plan keeps Rewrite entries it doesn't use +(emit_entries walks every non-Omit entry). The qmd-write work is +performed exactly once either way; only its timing changes. No extra +allocations. + +## Consumers — confirming the scope + +`CoarsenedEntry` is private to `crates/pampa/src/writers/incremental.rs` +(lowercase `enum`, no `pub`). Two internal consumers: + +1. `assemble`'s `emit_entries` — concatenates bytes per entry. +2. 
`compute_edits_from_coarsened` — currently calls `assemble` + internally and returns a single full-document edit. + +No external consumers. The refactor is fully local to one file. + +Future consumers (Phase 3 minimal-edit diffing, Plan-X-WIP) will benefit +from the self-contained invariant: every entry carries its own *intended +text* and (where applicable) its *intended source range*, which is the +right shape to derive minimal Monaco edits without re-deriving a +post-assemble diff. Mentioned in §Out-of-scope but worth noting as +direction-of-travel. + +## Work items + +### Phase 1 — Pin the panic with a failing test + +- [x] Add `sectionize_wrapper_with_shortcode_child_edit_does_not_panic` + to `crates/pampa/tests/incremental_writer_tests.rs`. The current + draft (commit `5f2bbab0`'s working tree) reaches the panic via + a cross-file Original child shape; alternative is a synthesized + empty section Div or a Lua-filter-emitted Generated wrapper + with no source-bearing children. Either reproduces the + `Rewrite { new_idx: child_idx }` → `new_ast.blocks[child_idx]` + out-of-bounds. +- [x] Run; confirm the test panics with "index out of bounds" on + `incremental.rs:890` (the `Rewrite` arm of `emit_entries`). +- [x] Added `sectionize_wrapper_shortcode_child_edit_soft_drops` — + goes further than no-panic by asserting on output bytes + + Q-3-43 warning. This caught a *second* bug Phase 1's no-panic + test would have hidden: the `UseAfter` arm fell through to + let-user-win Rewrite for atomic-Generated with preimage, + writing the resolved bytes (the edit applied to generated + content) back into the source qmd. The architectural Rewrite + fix made this newly visible by replacing the panic with silent + wrong-bytes; see Phase 2 below for the additional soft-drop + branch that closes the gap. + +### Phase 2 — Lift `Rewrite` to self-contained + +- [x] Change the variant to carry pre-computed `block_text: String`. + Drop the `new_idx: usize` field. 
+- [x] Update every `Rewrite` producer to pre-compute (four sites: + `coarsen_blocks` UseAfter, two RecurseIntoContainer sub-branches, + and `coarsen_keep_before_block`'s catch-all). +- [x] Convert `coarsen_keep_before_block` to + `→ Result>`. Both call + sites updated to `?`. +- [x] Update `emit_entries` to `block_text.clone()`. `new_ast` is now + unused for byte production in any variant (kept in signature for + now; removal is a tidying follow-up). +- [x] Delete the "result_idx is unused for child Rewrites" comment. + +#### Phase 2b — Soft-drop for atomic-Generated in UseAfter (scope expansion) + +Discovered during Phase 3 verification: the user reported that with +the dispatch.tsx bypass in place, clicking +react on a paragraph +inside `{{< lipsum 3 >}}` produced wrong qmd output — the resolved +lipsum bytes + reactji were being written back into source. The +architectural Rewrite refactor made this newly observable by +replacing the panic with silent wrong-bytes. + +Root cause: when the user edits inside an atomic-Generated block +with realistic content delta, the reconciler can emit +`KeepBefore` (Header) + `UseAfter` (new lipsum) at the +sectionize-child level — implicit deletion of the original lipsum +Para. The `UseAfter` arm filtered atomic-CustomNode and +no-preimage-Generated but had no branch for atomic-Generated-with- +preimage, so it fell through to let-user-win Rewrite (write the +new bytes). + +- [x] Add an `atomic_generated_preimage` check at the head of the + `UseAfter` arm in `coarsen_blocks`. If the new block is + `Generated` with `is_atomic_kind() == true` AND has preimage + in target → emit `Verbatim` of the preimage range + a + Q-3-43 soft-drop warning. The pattern: when an entry's *new* + block looks like an attempt to edit content the user can't + actually edit, refuse the edit at the writer regardless of + what the reconciler's alignment said. 
+- [x] Test: `sectionize_wrapper_shortcode_child_edit_soft_drops` — + asserts on output bytes (token preserved, reactji NOT + emitted) and the Q-3-43 warning. + +### Phase 3 — Tests + verification + +- [x] Re-run the Phase 1 test; passes (Ok, no panic). +- [x] `cargo nextest run -p pampa` — 3902 / 3902 passing + (one new soft-drop test added). +- [x] `cargo xtask verify --skip-hub-build --skip-hub-tests` — Rust + workspace 9655 / 9655 passing. (The + `ts-packages/preview-renderer` integration tests fail under the + bypass; expected — they assert the atomic-aware NOOP gate + fires, which the bypass disables. They pass once the bypass + is reverted.) +- [x] Rebuild WASM (`hub-client && npm run build:wasm`) — exit 0. +- [ ] Playwright e2e `q2-preview-render-components-write` — *blocked + by a dev server holding port 5173 in this worktree; deferred, + see "scaffolding cleanup" task*. +- [x] Manual: user confirmed the no-panic + soft-drop behavior in + their local browser session after rebuilding. Initial report + flagged wrong-bytes (resolved lipsum text in qmd), which led + to discovering Phase 2b. After Phase 2b lands, the + regression test `sectionize_wrapper_shortcode_child_edit_soft_drops` + asserts: token bytes preserved, reactji NOT emitted, Q-3-43 + warning fires. +- [ ] Restore the dispatch.tsx gate before this plan's commits ship + (it was a one-shot UX experiment; the proper TS-side intercept + signal is separate work — see §Out of scope). + +### Phase 4 — Design doc + +- [x] Write `claude-notes/designs/incremental-writer-internals.md` + (new file). Sections: + - *Purpose*. The incremental writer takes `(original_qmd, + original_ast, new_ast, plan)` and produces `(new_qmd, + warnings)`. It does so by *coarsening* the hierarchical + reconciliation plan into a flat list of self-contained + emit instructions, then *assembling* the result by walking + the instructions in order. + - *The `CoarsenedEntry` contract* — the rule this plan + enforces. 
Every variant carries enough information to + produce its emit bytes *without further context*. No + index-into-an-ambient-slice deferral. Each variant + documented with its payload and self-containment property. + - *Why this matters* — the panic story, the Transparent + recursion composition story, the minimal-edit-diffing + future story. + - *Anti-patterns* — "don't add a variant that defers to a + named slice"; "don't add a variant that depends on context + not encoded in the variant itself"; "if you need timing of + side effects, that's a sign the entry shape is wrong." + - *History* — pointer to this plan; pointer to the historical + commits (`eb81cbc5`, `ab10f37b`, `9a473fe9`, `bdcfdc53`). + - *Promotion path* — same shape as + `transparent-wrappers.md`'s "where the code lives + when + to promote it" — `CoarsenedEntry` is private today; if a + second crate ever wants to consume the coarsened plan + (e.g. minimal-edit-diffing in a separate crate), promote + the type and its emission helpers to `quarto-pandoc-types` + or a new module. +- [x] Cross-link from `transparent-wrappers.md` §"Reference + primitive" — added a "Sibling primitive on the emission side" + preamble that points to the new doc. +- [x] Cross-link from `provenance-contract.md` §7 "Atomic-kind set + and consumer impact" — added a closing paragraph pointing to + the new doc as the place where the writer's internal shape is + pinned. + +### Phase 5 — Plan annotations + +Plans whose work would build on the self-contained invariant: + +- [x] `claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md` + — added a "Follow-ups closed" section pointing here. +- [x] `claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md` + — its Phase 1 writer-lossless fixtures should include at least + one shape where the writer's catch-all Rewrite path fires + (cross-file Original child, or empty Generated wrapper). 
Already + flagged from the sectionize-wrapper audit; this plan supplies + the structural reason such fixtures matter. + +## Out of scope + +- The TS-side gate's silent NOOP (lipsum-paragraph clicks produce no + user feedback today). Separate plan; the temporary + `dispatch.tsx` bypass exists only to surface the writer-side + diagnostic UX once and must be reverted as part of Phase 3. +- The proper TS-side "edit rejected at the gate" signal — needs + its own design (synthetic diagnostic shape, framework emit + callback, location resolution via the source pool). Tracked + separately. +- Removing `new_ast: &Pandoc` from the `emit_entries` signature. + Once Rewrite no longer reads it, the parameter might be fully + removable (audit the other arms). Defer to a tidying commit + unrelated to this plan's correctness work. +- Eventual minimal-edit diffing from `CoarsenedEntry` directly + (rather than `assemble` + post-diff). The self-contained + invariant is a precondition; the actual diff-emitting work is + its own plan. + +## Risk assessment + +**Low risk overall.** Three reasons: + +1. **No behaviour change.** `write_block_to_string` is referentially + transparent (§Behavioural equivalence). The refactor moves a + pure-function call earlier in the pipeline; emit bytes are + byte-identical. +2. **Fully local.** `CoarsenedEntry` is private to one file; two + internal consumers; no FFI; no wire format. +3. **Mirrors an existing precedent.** `InlineSplice` already carries + pre-computed `block_text`. The new `Rewrite` is structurally + identical. + +Risks worth naming: + +- **Tests pass but production hits a path we missed.** Mitigation: + the Plan-7b §"writer-lossless baseline" call-out for adding a + catch-all Rewrite fixture; verify with the e2e + manual browser + repro before committing. +- **Coarsen-time errors surface differently.** Before: `write_block_to_string` + errored at emit time, propagated up through `assemble` to + `incremental_write`. 
After: errors propagate from `coarsen_blocks` + (via the `?` in producer sites) — same overall return path + (`Result<_, Vec>`), but the *order* of error + vs. soft-drop-warning emission could shift. Verify the existing + error tests still produce the same diagnostic ordering. +- **Increased coarsen-time allocations.** Each Rewrite producer now + allocates a `String` immediately. Negligible at typical document + sizes; flagged for awareness rather than as a real concern. + +## Estimated scope + +| Phase | Lines (rough) | +|-------|---------------| +| 1 — pin panic with failing test | ~80 | +| 2 — Rewrite self-contained refactor | ~60 net change (delete + add) | +| 3 — verification (test runs, e2e) | 0 LOC (verification only) | +| 4 — design doc | ~200 | +| 5 — plan annotations | ~30 | +| **Total** | **~370** | + +## References + +- This plan's panic: `2026-05-25` session transcript; stack trace shows + `Rewrite { new_idx: 8 }` against `new_ast.blocks.len() == 1`. +- Plan 7's original `CoarsenedEntry` design: + `claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md` + §"Coarsen step". +- Plan 7c's transparent-wrapper fix: + `claude-notes/plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md` + §Phase 8. +- The "not exercised by today's synthesizers" landmine comment: + `crates/pampa/src/writers/incremental.rs` around line ~640 (after + the `coarsen_keep_before_block` Transparent recursion). +- Existing precedent for pre-computed text: + `CoarsenedEntry::InlineSplice` (introduced in commit `ab10f37b`). 
diff --git a/claude-notes/plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md b/claude-notes/plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md new file mode 100644 index 000000000..801a7ceb2 --- /dev/null +++ b/claude-notes/plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md @@ -0,0 +1,1446 @@ +# Plan 7c — Plan 7 closure gaps (Q-3-41, TS editability gate, per-kind tests) + +**Date:** 2026-05-25 +**Branch:** `feature/provenance` (or fresh worktree branched from it). +The contract docs the plan references — `provenance-contract.md` and +`incremental-writer-contract.md` — currently live on +`review/provenance-plan-7` and merge into `feature/provenance` as +part of the review-pass merge that is the same prerequisite for this +plan. +**Status:** Implementation plan +**Milestone:** none directly — closes correctness/coverage gaps in +the writer surface Plan 7 already shipped. + +## Epic context + +Part of the **provenance epic** (Plans 3–8 + 7a + 7b + this). When +the Plan-7 implementation agent ran on 2026-05-24, the post-review +Plan-7 doc had not yet been merged into `feature/provenance`. Four +correctness/coverage gaps survived as a result. Plan 7c closes them: + +1. **Q-3-41 "Edit dropped — render not ready yet"** — the + first-edit-before-render diagnostic the review pass introduced. + Neither the catalog entry nor the React/SPA emission landed. +2. **TS-side `hasPreimageIn` + `isEditableInside`** — the predicate + pair that closes Plan 2A's React framework gate. The Rust side + has the canonical version (`pampa::writers::incremental::is_editable_inside_*`); + the TS side at `ts-packages/preview-renderer/src/utils/sourceInfo.ts` + only exports the atomicity half. +3. **`cfg(debug_assertions)` `#[should_panic]` test** for the + shortcode-Generated-with-empty-`from` debug-assert at + `crates/pampa/src/writers/incremental.rs:448`. +4. 
**Per-kind soft-drop test symmetry** — explicit tests for each + atomic kind (filter / title-block / tree-sitter-postprocess) on + the Omit and inline UseAfter paths; the multi-inline dedupe + filter case. + +Plan 7b (`claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md`) +already covers two adjacent test gaps — the writer-lossless baseline +test and the filter-construction-UseAfter test. Plan 7c is the +*disjoint* gap; do not duplicate Plan 7b's items here. + +## Hand-off start point + +1. Worktree: `feature/provenance` at + `/Users/gordon/src/q2/.worktrees/provenance/` (the integration + branch). `cargo xtask verify` is green there at the current tip; + confirm before starting. +2. The review-pass commits that introduced the missing design — `00222099`, + `bfb40962`, `561eefa0`, plus the cross-link commit `7c03be64` — + live on `review/provenance-plan-7`. Either merge that branch + into `feature/provenance` before starting (preferred — gives the + contract docs to consult) or work from the audit summary + below. +3. The audit that produced this plan: see the conversation transcript + on 2026-05-25 (Claude session resolving the rebase of + `review/provenance-plan-7` onto `feature/provenance`). +4. **Phase order matters.** Do Phase 1 (catalog) first so Phase 3 + can reference `Q-3-41`. Phase 2 (TS gate) is independent + of Phase 1 in code but conceptually pairs with Phase 3 (Q-3-41 + is the visible signal for the gate's "no baseline yet" branch). +5. Don't push without explicit user permission. + +## Goal + +Bring Plan 7's user-visible surface back into alignment with the +post-review contract, and close two correctness/UX issues that the +post-implementation code review surfaced: + +- The user always sees *some* signal when an edit is dropped — + Q-3-42 for atomic-content edits, Q-3-43 for no-preimage edits, + Q-3-41 for first-edit-before-render. No silent drops. 
+- The React framework's read-only gate matches the writer's + editability predicate, so edits that the writer would soft-drop + are gated at the DOM rather than reverting after a round-trip. +- The writer's debug-assert + each atomic kind's soft-drop path + has explicit regression coverage. +- **Q-3-43's diagnostic body actually names what was dropped** — + include path, metadata key, or container kind — instead of three + emission sites sharing one generic message. (Code-review item, + not part of the original closure audit; see Phase 6.) +- **Inline-level soft-drop looks up the original by the + reconciler's index**, not by the result-side positional proxy + that today's code uses. Today's proxy is exact for in-place + retypings (the shortcode case the tests cover) but misfires on + any inline insert/delete before the soft-drop site. (Code-review + item; see Phase 7.) +- **`target_file_id` derivation walks past synthesized first + blocks** instead of falling back to `FileId(0)` on a title-block- + first document. Dormant bug today (single-file fixtures + happen to land on `FileId(0)`); pre-empts Plan 8's multi-file + story. (Code-review item; see Phase 8.) + +Behaviour outside these items is unchanged. The code-review phases +tighten the writer's existing contract — they don't add new +contract surface, new diagnostic semantics, or a new pipeline tier. + +## Scope + +### In scope + +#### Phase 1 — `Q-3-41` catalog entry (`quarto-error-reporting`) + +**Repo facts the implementer needs:** + +- Catalog file: `crates/quarto-error-reporting/error_catalog.json`. + Q-3-42 / Q-3-43 entries at lines 527–541 are the shape to mirror + (`subsystem`, `title`, `message_template`, `docs_url`, + `since_version`). +- Subsystem for writer-side codes is `"writer"`. `since_version` + is `"99.9.9"` for unreleased entries. +- Q-3-41 is unallocated today (Q-3-40 is taken; Q-3-42/Q-3-43 are + the Plan-7 codes). Slot Q-3-41 between them. 
+- Q-3-41 is **TS-emitted** — there is no Rust caller (the writer + isn't invoked when the baseline is missing). No diagnostic + builder needed on the Rust side. The catalog entry exists so the + docs URL and version metadata are consistent. + +- [ ] Add Q-3-41 entry to `error_catalog.json` between Q-3-40 and + Q-3-42. Title: `"Edit dropped — render not ready yet"`. + `message_template`: `"Your edit was dropped because the + document hasn't finished rendering. Try again in a moment."` + `docs_url`: `"https://quarto.org/docs/errors/Q-3-42"`-style + shape; `since_version`: `"99.9.9"`. +- [ ] Build: `cargo xtask verify --skip-hub-build --skip-hub-tests` + green (the catalog has a unit test that asserts every entry + parses). + +#### Phase 2 — TS-side `hasPreimageIn` + `isEditableInside` + +**Repo facts the implementer needs:** + +- Target file: `ts-packages/preview-renderer/src/utils/sourceInfo.ts` + (59 lines today; will roughly double). +- Wire-format types: `ts-packages/preview-renderer/src/types/sourceInfo.ts` + documents codes 0/1/2/3/4. Walk pattern: `entryFor(node, pool)` + for the entry; `entry.t` discriminates. +- Rust reference: `crates/pampa/src/writers/incremental.rs:113-162` + (`is_editable_inside_block` / `_inline` / `_source_info`) + + `crates/quarto-source-map/src/source_info.rs:406-442` + (`preimage_in`). +- Anchor roles on the wire: `Generated` entries (code 4) carry + `from?: AnchorRef[]` where `role: "invocation" | "value-source" + | "other:<…>"`. Walk only `role === "invocation"`. +- `targetFileId` derivation, Rust-side: `original_ast.blocks.first() + .and_then(|b| b.source_info().root_file_id()).unwrap_or(FileId(0))` + (`incremental.rs:289-293`). On the TS side, look up the first + block's `s`-index in the pool, walk to its root Original, take + its `d` (file id). Default to `0` if absent. +- React context to extend: `ts-packages/preview-renderer/src/framework/RegistryContext.tsx`. + Add an optional `targetFileId?: number`. 
Default `0` when absent + (mirrors the Rust default, and covers callers that don't pass + the field yet). +- React dispatcher gate to update: + `ts-packages/preview-renderer/src/framework/dispatch.tsx:404-411`. + Replace the `isAtomic` check with `!isEditableInside(...)`. +- The Ast provider that builds the context value: + `ts-packages/preview-renderer/src/framework/Ast.tsx:121`. + Compute `targetFileId` once and pass it alongside `sourceInfoPool`. + +**Implementation sketch:** + +```ts +// In ts-packages/preview-renderer/src/utils/sourceInfo.ts + +/** Walk an entry's preimage chain in the pool; return [start, end] + * if the chain resolves to bytes in `targetFileId`, else undefined. + * Mirrors Rust `SourceInfo::preimage_in`. */ +export function hasPreimageIn( + node: { s?: number }, + pool: SourceInfoPool | undefined, + targetFileId: number, +): [number, number] | undefined { + const entry = entryFor(node, pool); + if (!entry) return undefined; + return preimageInEntry(entry, pool, targetFileId); +} + +function preimageInEntry( + entry: SourceInfoEntry, + pool: SourceInfoPool | undefined, + targetFileId: number, +): [number, number] | undefined { + if (entry.t === 0) { + return entry.d === targetFileId ? entry.r : undefined; + } + if (entry.t === 1) { + const parent = pool?.[entry.d]; + if (!parent) return undefined; + const parentRange = preimageInEntry(parent, pool, targetFileId); + if (!parentRange) return undefined; + return [parentRange[0] + entry.r[0], parentRange[0] + entry.r[1]]; + } + if (entry.t === 2) { + // Concat: every piece must resolve in target AND be byte-contiguous. 
+ const ranges: Array<[number, number]> = []; + for (const [si_id, _offset, _len] of entry.d) { + const piece = pool?.[si_id]; + if (!piece) return undefined; + const r = preimageInEntry(piece, pool, targetFileId); + if (!r) return undefined; + ranges.push(r); + } + if (ranges.length === 0) return undefined; + for (let i = 1; i < ranges.length; i++) { + if (ranges[i - 1][1] !== ranges[i][0]) return undefined; + } + return [ranges[0][0], ranges[ranges.length - 1][1]]; + } + if (entry.t === 4) { + // Generated: walk the Invocation anchor only. + const inv = entry.d.from?.find((a) => a.role === 'invocation'); + if (!inv) return undefined; + const anchored = pool?.[inv.si_id]; + if (!anchored) return undefined; + return preimageInEntry(anchored, pool, targetFileId); + } + // t === 3 (legacy) and any future codes — not consulted. + return undefined; +} + +/** Combined editability gate. Mirrors Rust + * `pampa::writers::incremental::is_editable_inside_*`. */ +export function isEditableInside( + node: { s?: number; t?: string; type_name?: string }, + pool: SourceInfoPool | undefined, + targetFileId: number, + atomicKinds: ReadonlySet<string>, +): boolean { + // Atomic CustomNodes — never editable inside. + const isCustom = node.t === 'CustomBlock' || node.t === 'CustomInline'; + if (isCustom && isAtomicCustomNode(node.type_name ?? '')) return false; + // Atomic-kind Generated — never editable inside. + if (isAtomicSourceInfo(node, pool, atomicKinds)) return false; + // No preimage in target — never editable inside. + return hasPreimageIn(node, pool, targetFileId) !== undefined; +} +``` + +- [ ] Implement `hasPreimageIn` per the sketch above. Export from + `sourceInfo.ts`. +- [ ] Implement `isEditableInside`. Place the + `isAtomicCustomNode` import alongside the existing + `entryFor` / `isAtomicSourceInfo` imports + (`ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts`). 
+- [ ] Add unit tests for `hasPreimageIn` mirroring the Rust ones + at `crates/quarto-source-map/src/source_info.rs:1614-1750`: + Original same / different file; Substring composes offsets; + Concat contiguous / gappy / empty; Generated with Invocation + / with ValueSource only / no anchors. New test file: + `ts-packages/preview-renderer/src/utils/sourceInfo.test.ts`. +- [ ] Add unit tests for `isEditableInside` covering the three + uneditable reasons (atomic CustomNode, atomic-kind Generated, + no-preimage Generated) plus positive cases. +- [ ] Extend `RegistryContext` to carry optional `targetFileId?: number` + with default `0` in the empty-registry initial value. +- [ ] In `Ast.tsx`, compute `targetFileId` from the pool's first + block (walk to root Original; default `0`) and pass it + through the provider value. +- [ ] Update `framework/dispatch.tsx:404-411`'s `Node` gate: + replace the `isAtomic` check with + `!isEditableInside(node, sourceInfoPool, targetFileId, ATOMIC_KINDS)`. + Keep `NOOP_SET_LOCAL_AST` as the substituted callback. + +**Cross-language parity test — keeping TS in sync with Rust.** + +Hand-mirrored unit tests catch most desync the day the desync +happens, but they rely on a contributor noticing that "I changed +the Rust walker; I should update the TS one too." That discipline +fails the first time someone forgets. We need a structural check. + +The mechanism: a corpus of `(SourceInfoPool, node_s, target_file_id, +expected_preimage_or_null)` cases that's **generated from Rust** and +**consumed from TS**. Rust is the source of truth; if the Rust +`preimage_in` semantics change, the corpus regenerates; the TS test +runs against the new corpus and fails until the TS walker is +updated to match. 
+ +Corpus shape (single JSON file, committed): + +```json +{ + "schema_version": 1, + "generated_from": "crates/quarto-source-map/src/source_info.rs", + "cases": [ + { + "name": "original_same_file", + "pool": [ /* SourceInfoEntry wire-format entries, code 0/1/2/4 */ ], + "node_s": 0, + "target_file_id": 0, + "expected": [10, 25] + }, + { + "name": "generated_with_value_source_only_no_invocation", + "pool": [ ... ], + "node_s": 2, + "target_file_id": 0, + "expected": null + } + ] +} +``` + +Location: `crates/quarto-source-map/test-fixtures/preimage-parity/cases.json`. +Lives with the producer of truth (the Rust walker), consumed by +the verifier (the TS walker). The TS test reads the file via +Vite's `import.meta.glob` or a path-relative fetch in test +config. + +**Rust side — generator + freshness gate.** + +Rust generates the fixture from a hand-written enumeration of +cases that mirror the existing `preimage_in` unit tests at +`crates/quarto-source-map/src/source_info.rs:1614-1750`. The +generator runs as a Rust integration test: + +```rust +// crates/quarto-source-map/tests/preimage_parity_fixture.rs +// +// Generates the cross-language parity corpus consumed by +// ts-packages/preview-renderer/src/utils/sourceInfo.parity.test.ts. +// Run with `cargo nextest run -p quarto-source-map preimage_parity`. +// Fails if `cases.json` is stale relative to the in-code corpus — +// re-run with `QUARTO_BLESS_PREIMAGE_PARITY=1` to regenerate. 
+ +#[test] +fn preimage_parity_fixture_is_up_to_date() { + let cases = build_corpus(); // hand-written enumeration + let expected = serialize_corpus(&cases); + let path = "test-fixtures/preimage-parity/cases.json"; + if std::env::var("QUARTO_BLESS_PREIMAGE_PARITY").is_ok() { + std::fs::write(path, &expected).unwrap(); + return; + } + let actual = std::fs::read_to_string(path).unwrap_or_default(); + assert_eq!( + actual.trim(), + expected.trim(), + "preimage parity fixture is stale; rerun with \ + QUARTO_BLESS_PREIMAGE_PARITY=1 to regenerate" + ); +} +``` + +The corpus enumeration covers, at minimum: + +- `Original` in target file (positive) +- `Original` in non-target file (None) +- `Substring` composing offsets through a parent in target +- `Substring` rooted outside target (None) +- `Concat` of contiguous pieces in target (positive) +- `Concat` with a gap (None) +- `Concat` empty (None) +- `Generated` with `Invocation` anchor resolving in target (positive) +- `Generated` with `Invocation` anchor in non-target (None) +- `Generated` with only `ValueSource` anchor (None — role-asymmetry) +- `Generated` with only `Other("…")` anchor (None — forward-compat) +- `Generated` with empty `from[]` (None) +- Nested cases: `Substring` of a `Generated`'s Invocation; + `Generated` whose Invocation is itself a `Substring`. + +Every shape `preimage_in` matches on must appear at least once; +every "None" reason must appear at least once. The +`role-asymmetry` cases are load-bearing — they're the contract +that Plans 9/10 inherit. 
+ +**TS side — consumer test.** + +```ts +// ts-packages/preview-renderer/src/utils/sourceInfo.parity.test.ts +import cases from + '../../../../crates/quarto-source-map/test-fixtures/preimage-parity/cases.json'; +import { hasPreimageIn } from './sourceInfo'; + +describe('preimage parity with Rust', () => { + for (const c of cases.cases) { + test(c.name, () => { + const node = { s: c.node_s }; + const actual = hasPreimageIn(node, c.pool, c.target_file_id); + expect(actual ?? null).toEqual(c.expected); + }); + } +}); +``` + +The test relies on the TS wire-format types +(`ts-packages/preview-renderer/src/types/sourceInfo.ts`) deserializing +the corpus `pool` entries directly — that is the same wire format +the runtime consumes, so if the corpus deserializes, the runtime +contract holds. + +**Atomic-kinds parity (belt-and-suspenders).** + +Separately from the walker corpus, a small text-level check +keeps the atomicity sets in sync. Add a Rust integration test +that generates a JSON file listing the `is_atomic_kind` kinds: + +```rust +// crates/quarto-source-map/tests/atomic_kinds_fixture.rs +#[test] +fn atomic_kinds_fixture_is_up_to_date() { + let kinds = ["filter", "shortcode", "title-block", + "tree-sitter-postprocess"]; + // (in-code enumeration is the source of truth; assert + // every kind here is_atomic_kind-true and no other kind + // we synthesize is true) + for k in kinds { assert!(By::raw(k, json!(null)).is_atomic_kind()); } + // ... write to test-fixtures/atomic-kinds.json with bless flag ... +} +``` + +And a TS test that asserts `ATOMIC_KINDS` equals the fixture's +set. Same bless-flag freshness gate, same desync-loud failure. + +**Implementation steps for the parity work.** + +- [ ] Create `crates/quarto-source-map/tests/preimage_parity_fixture.rs` + with the corpus builder per the sketch above. Enumerate the + cases listed in §"corpus enumeration." 
+- [ ] Run with `QUARTO_BLESS_PREIMAGE_PARITY=1` to generate + `crates/quarto-source-map/test-fixtures/preimage-parity/cases.json`. + Commit the fixture. +- [ ] Create `ts-packages/preview-renderer/src/utils/sourceInfo.parity.test.ts` + per the sketch. Configure the test runner to find the + `cases.json` path (relative import works under Vitest's + default config; confirm `npm run test:ci` picks it up). +- [ ] Create the atomic-kinds parity fixture + Rust generator + + TS consumer test. The TS consumer test imports + `ATOMIC_KINDS` from `utils/sourceInfo.ts` and asserts + set-equality with the fixture. +- [ ] Document the bless flag in `crates/quarto-source-map/README.md` + (create if missing): a single paragraph on when to bless + the fixtures (any Rust-side change that affects + `preimage_in`'s behaviour or the atomic-kinds enumeration). +- [ ] CI: `cargo nextest run` already runs the freshness gate; + no CI changes needed. The TS parity test runs under + `npm run test:ci`, which is already in `cargo xtask verify`. + +**Why the freshness gate matters.** + +Without the gate, a Rust-side change (say, adding `By::callout()` +to `is_atomic_kind`'s matches arm) would silently leave the TS +fixture stale, and the TS parity test would pass against the +stale fixture. The gate makes that change a Rust test failure — +loud, immediate, easy to fix by re-running with the bless flag. +The TS side then trips when the contributor regenerates the +fixture without updating `ATOMIC_KINDS` to match. Two-step +diagnosis, but both steps fail loudly. + +- [ ] `cd hub-client && npm run build:all` green (hits the + preview-renderer build via project references). +- [ ] `cd hub-client && npm run test:ci` green. + +#### Phase 3 — First-edit gates emit `Q-3-41` + +**Repo facts the implementer needs:** + +- ReactPreview no-baseline branch: + `hub-client/src/components/render/ReactPreview.tsx:444-446`. + Currently `console.warn` + bare `return`. 
+- SPA no-baseline branch: + `q2-preview-spa/src/PreviewApp.tsx:437-440`. Currently + `console.warn` + bare `return`. +- SPA already has a Q-3-42/Q-3-43 surface — `DiagnosticStrip` at + `q2-preview-spa/src/components/DiagnosticStrip.tsx` and the + `setWriteWarnings` state in `PreviewApp.tsx:392`. Push Q-3-41 + through the same channel. +- ReactPreview already drains write-back warnings into + `pendingWriteWarningsRef` (line 320) and flushes via + `onDiagnosticsChange` on the next render (line 361-366). Push + Q-3-41 into `pendingWriteWarningsRef.current` so it surfaces in + the existing diagnostics panel. Per the autosave-context + suppress-after-3 policy, the merging already de-dupes by source + range; Q-3-41 has no range so it'll just repeat — acceptable + for v1 because the user will keep retrying until the render + catches up. +- TS `Diagnostic` shape: + `ts-packages/preview-renderer/src/types/diagnostic.ts:28-49`. + Required fields: `kind: 'warning'`, `title`, `hints: string[]`, + `details: DiagnosticDetail[]` (can be empty). Optional: `code`, + `problem`, `start_line` / `start_column` / `end_line` / + `end_column` (omit — no source range), `rendered`. + +**Helper sketch** — shared between both call sites. Live in +`ts-packages/preview-runtime/src/firstEditDiagnostic.ts` (new file; +both ReactPreview and the SPA already import from this package): + +```ts +import type { Diagnostic } from '@quarto/preview-renderer/types/diagnostic'; + +/** Construct a Q-3-41 warning for the "edit before first render + * produced a baseline AST" case. Body text mirrors the catalog + * entry; the helper is the TS counterpart to a Rust + * `diagnostic_q3_41()` builder that doesn't exist (the writer is + * never called in this branch). */ +export function diagnosticQ3_41(): Diagnostic { + return { + kind: 'warning', + code: 'Q-3-41', + title: 'Edit dropped — render not ready yet', + problem: + "Your edit was dropped because the document hasn't " + + "finished rendering. 
Try again in a moment.", + hints: [], + details: [], + }; +} +``` + +- [ ] Create `ts-packages/preview-runtime/src/firstEditDiagnostic.ts` + with `diagnosticQ3_41()` per the sketch. Export from + `ts-packages/preview-runtime/src/index.ts`. +- [ ] Co-located unit test + `ts-packages/preview-runtime/src/firstEditDiagnostic.test.ts`: + assert `diagnosticQ3_41()` returns the expected shape (kind, + code, title, problem present). +- [ ] In `ReactPreview.tsx`'s `handleSetAst`, replace the + `console.warn` + return in the no-baseline branch + (`!baseline`) with: + `pendingWriteWarningsRef.current = [...pendingWriteWarningsRef.current, diagnosticQ3_41()];` + followed by the early return. Trigger a re-render so the + pending warnings flush — pass through `onDiagnosticsChange` + directly with the merged set rather than waiting for the + next render, since no qmd content change happens here. + (Implementation detail: store `pendingWriteWarningsRef` flush + logic in a small helper if duplicated from the post-render + drain.) +- [ ] In `PreviewApp.tsx`'s `handleSetAst`, replace the + `console.warn` + return in the `!path || !baselineJson` branch + with `setWriteWarnings((prev) => [...prev, diagnosticQ3_41()]);` + followed by the early return. +- [ ] In ReactPreview: assert the diagnostic still surfaces if the + user fixes the underlying issue (render eventually completes, + baseline becomes available, next edit succeeds — the Q-3-41 + from the dropped edit remains in the diagnostics panel until + the next successful render's drain clears it). Document this + in the call-site comment. +- [ ] Hub-client integration test (Vitest): mount ReactPreview + with `ast=''` (no baseline), call `handleSetAst({})`, assert + `onDiagnosticsChange` is called with a list containing + `code: 'Q-3-41'`. Place alongside the existing ReactPreview + tests; if there's no test file for ReactPreview yet, model + on `hub-client/src/services/incrementalWrite.wasm.test.ts`'s + structure. 
+- [ ] SPA integration test + (`q2-preview-spa/src/PreviewApp.integration.test.tsx`): + drive `handleSetAst` before the first successful render + completes; assert `DiagnosticStrip` renders a row with the + Q-3-41 title. +- [ ] `cd hub-client && npm run build:all && npm run test:ci` green. + +#### Phase 4 — Per-kind soft-drop test symmetry (Rust) + +**Repo facts the implementer needs:** + +- Existing test module at the bottom of + `crates/pampa/src/writers/incremental.rs` (search `#[cfg(test)]`). + Models to mirror: + - Omit on atomic-kind: `keep_before_with_atomic_kind_generated_no_anchor_emits_omit` + (line ~1590; uses `By::filter("upper.lua", 14)`). + - Inline UseAfter soft-drop: + `inline_use_after_on_atomic_generated_soft_drops_to_keep_before_with_q3_42` + (line ~2028; uses `By::shortcode("meta")`). + - Multi-inline dedupe positive: + `multi_inline_dedupe_emits_token_once_when_invocation_shared` + (line ~1909; shortcode case). +- The code paths in question (`coarsen_keep_before_block` for + Omit, `assemble_inline_content` for inline UseAfter) do not + branch on `by.kind` — they branch on `by.is_atomic_kind()`. New + per-kind tests exercise the same code, but a regression in + `is_atomic_kind`'s enumeration (e.g. dropping `"title-block"` + from the match) would be caught here whereas the generic test + alone wouldn't. + +**The block-Omit path is `is_atomic_kind`-driven, not kind-specific.** + +The block-level Omit branch (`coarsen_keep_before_block`) and the +inline soft-drop branch (`assemble_inline_content`) both consult +`by.is_atomic_kind()` — they don't pattern-match on kind. A +hand-written per-kind test exercises the same `matches!` arm +through a different constructor; the only regression it catches +is "someone dropped a kind from the `matches!` arm at +`source_info.rs:647`." That's a real but narrow failure mode. + +A single enumeration property test catches the same failure with +less scaffolding and stays correct as the atomic-kind set grows. 
+The hand-written inline-soft-drop pair is more justified — the +inline path has subtle wiring (diagnostic-location selection in +`diagnostic_q3_42_inline`, dedupe interaction with `Invocation` +equality) that isn't a function of kind alone. + +**Block-level: one property test, not three hand-written tests.** + +```rust +#[test] +fn every_atomic_kind_emits_omit_under_keep_before_with_empty_from() { + // Drives every kind in the documented atomic-kind set through + // coarsen and asserts the Omit verdict. New kinds added to + // `By::is_atomic_kind()` must be added here too; if a kind + // ever leaves the set without leaving this test, the test + // either fails (kind no longer atomic) or false-passes + // (regression) — the latter is caught by the corresponding + // freshness gate in Plan 7c Phase 2's atomic-kinds parity + // fixture. + let atomic_kinds: Vec<By> = vec![ + By::filter("upper.lua", 14), + // shortcode is excluded — its empty-`from` case trips + // the debug-assert (see Phase 5); the property below + // only enumerates kinds whose empty-`from` is "normal."
+ By::title_block(), + By::tree_sitter_postprocess(), + ]; + for by in atomic_kinds { + assert!(by.is_atomic_kind(), "kind {:?} no longer atomic", by); + let block = para(vec![], SourceInfo::generated(by.clone())); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let entries = coarsen("", &ast, &ast, &plan, &mut warnings).unwrap(); + assert!( + matches!(entries[0], CoarsenedEntry::Omit), + "expected Omit for kind {:?}, got {:?}", by, entries[0], + ); + assert!(warnings.is_empty(), "KeepBefore branch should not warn"); + } +} +``` + +**Inline-level: keep the hand-written pair.** The inline path is +worth exercising once per kind because the diagnostic builder +and the soft-drop substitution have distinct behaviour beyond +the `is_atomic_kind()` gate. + +- [ ] Add `every_atomic_kind_emits_omit_under_keep_before_with_empty_from` + per the sketch above. ~30 LOC, replaces the three + block-Omit per-kind tests. +- [ ] Add `inline_use_after_on_filter_constructed_inline_soft_drops`: + mirror the shortcode test at line ~2028, build + `By::filter("emoji.lua", 9)` on the original inline, assert + Q-3-42 + KeepBefore. ~25 LOC. (Complements Plan 7b's Phase-1 + *block-level* filter UseAfter test by exercising the inline + path.) +- [ ] Add `inline_use_after_on_title_block_inline_soft_drops`: + same shape, `By::title_block()`. ~25 LOC. +- [ ] Add `multi_inline_dedupe_filter_case`: shape-equivalent to + `multi_inline_dedupe_emits_token_once_when_invocation_shared` + but using `By::filter("decoration.lua", 12)`. Filter + constructions rarely produce multi-inline output in practice, + but the dedupe rule consults `Invocation` regardless of + kind, so the test pins the regression shape. ~30 LOC. 
+- [ ] `cargo nextest run -p pampa -E 'test(/coarsen|inline_use_after|multi_inline|every_atomic_kind/)'` + green. + +#### Phase 5 — `cfg(debug_assertions)` `#[should_panic]` test + +**Repo facts the implementer needs:** + +- The debug-assert site: + `crates/pampa/src/writers/incremental.rs:448-455`. Panic message + starts with `"Generated { by: shortcode, from: [] } reached the + writer — Plan 6's stamper must always attach an Invocation + anchor for shortcode resolutions."` +- `#[should_panic(expected = "…")]` matches on a substring. Use + the unique prefix `"Generated { by: shortcode, from: [] } reached"` + to avoid false positives. +- Release builds compile `debug_assert!` out. The test must be + cfg-gated to `debug_assertions` so release-profile test runs + don't trip the `should_panic` reverse-failure. + +**Sketch:** + +```rust +#[test] +#[cfg(debug_assertions)] +#[should_panic(expected = "Generated { by: shortcode, from: [] } reached")] +fn shortcode_with_empty_from_trips_debug_assert() { + // The Plan-6 stamper invariant: every Generated{by:shortcode} + // carries an Invocation anchor. A hand-constructed shape that + // skips the anchor must trip the writer's debug_assert. + let gen_info = SourceInfo::generated(By::shortcode("meta")); + let block = para(vec![], gen_info); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + // Coarsen panics inside `coarsen_keep_before_block` via the + // debug-assert at incremental.rs:448. + let _ = coarsen("", &ast, &ast, &plan, &mut warnings); +} +``` + +- [ ] Add the test per the sketch above to the same test module. + Document the `cfg(debug_assertions)` gating with a one-line + comment so release-profile runners aren't confused. 
+- [ ] `cargo nextest run -p pampa shortcode_with_empty_from` green + (default profile = `debug_assertions` on). +- [ ] `cargo nextest run --release -p pampa shortcode_with_empty_from` + green (test is compiled out, suite still passes; the filter + then selects zero tests, so if the installed nextest treats an + empty selection as a failure, add `--no-tests=pass`). + +#### Phase 6 — Differentiated `Q-3-43` builder via `Q343Reason` enum + +**Repo facts the implementer needs:** + +- Current builder: + `crates/pampa/src/writers/incremental.rs:552-563` + (`diagnostic_q3_43_block`). Returns a single generic message — + `"An edit to pipeline-generated content was reverted."` — and a + single generic hint that lists three possible upstreams + ("an include, a metadata key, or other source"). +- Three call sites, each currently calls + `diagnostic_q3_43_block(block)` with no case discriminator: + - `incremental.rs:320` — block `UseAfter` on a no-preimage + Generated container (user wholesale-replaced a synthesized + container via React). + - `incremental.rs:344` — block `RecurseIntoContainer` on an + atomic CustomNode whose wrapper has preimage in target + (typically `IncludeExpansion` / `CrossrefResolvedRef`; soft-drop + substitutes Verbatim). + - `incremental.rs:350` — block `RecurseIntoContainer` on a + no-preimage Generated container (synthesized + footnotes / appendix / etc.; soft-drop substitutes Omit). +- The post-review contract doc + (`claude-notes/designs/incremental-writer-contract.md`, + §"User-facing diagnostic surface") promises body text that names + the upstream: `"To edit this content, open `<path>` directly."` + for includes; `"This content is generated from metadata; edit + `_quarto.yml` to change it."` for metadata-derived containers. + Today's code delivers neither. +- For the include-recurse case, the include path lives in the + atomic CustomNode's `plain_data["source_path"]`. Look at + `crates/quarto-pandoc-types/src/custom.rs` for the `plain_data` + shape; use `.as_str()` on the `Value` to extract.
+- For metadata-derived containers, the synthesizer's `By::kind` + string (`"footnotes"`, `"appendix"`, etc.) is the only stable + identifier today — there is no metadata-key anchor in v1. Plan 9 + (`ValueSource`) will give us the actual metadata range; until + then, naming the kind is the best the diagnostic can do. + +**Design — `Q343Reason` enum at the call boundary.** + +The three emission sites collapse to one builder that takes a +typed reason. The enum forces every new emission site to pick a +case (compile-time exhaustiveness) and centralises the body-text +choices for the message catalog. + +```rust +/// Why a Q-3-43 was emitted. One variant per emission path in +/// `coarsen`; new soft-drop sites must extend this enum so the +/// match in `diagnostic_q3_43_block` covers them at compile time. +enum Q343Reason<'a> { + /// User edited inside an atomic CustomNode whose wrapper has + /// preimage in target — typically an `IncludeExpansion` or a + /// `CrossrefResolvedRef`. `include_path` is the wrapper's + /// `plain_data["source_path"]` if present (Plan 8); `None` for + /// CustomNodes without a source-path field. + IncludeRecurse { include_path: Option<&'a str> }, + /// User edited inside a no-preimage Generated container + /// (footnotes / appendix / sectionize / etc.). `kind` is the + /// `by.kind` string of the container. + MetadataContainerRecurse { kind: &'a str }, + /// User wholesale-replaced a no-preimage Generated container + /// via React. `kind` is the new-side block's `by.kind`. 
+ NoPreimageReplacement { kind: &'a str }, +} + +fn diagnostic_q3_43_block( + block: &Block, + reason: Q343Reason, +) -> quarto_error_reporting::DiagnosticMessage { + let (title, problem, hint): (&str, String, String) = match reason { + Q343Reason::IncludeRecurse { include_path: Some(path) } => ( + "Include content edit dropped", + format!("An edit inside `{{{{< include {} >}}}}` was reverted.", path), + format!("To edit this content, open `{}` directly.", path), + ), + Q343Reason::IncludeRecurse { include_path: None } => ( + "Generated content edit dropped", + "An edit inside an atomic block was reverted.".into(), + "This block is read-only; edit its upstream source instead.".into(), + ), + Q343Reason::MetadataContainerRecurse { kind } => ( + "Generated content edit dropped", + format!("An edit inside the synthesized `{}` container was reverted.", kind), + "This content is generated from metadata; edit `_quarto.yml` to change it.".into(), + ), + Q343Reason::NoPreimageReplacement { kind } => ( + "Generated content edit dropped", + format!("A replacement of the synthesized `{}` container was reverted.", kind), + "Generated containers must be changed by editing their metadata source.".into(), + ), + }; + quarto_error_reporting::DiagnosticMessageBuilder::warning(title) + .with_code("Q-3-43") + .with_location(block.source_info().clone()) + .problem(problem) + .add_hint(hint) + .build() +} +``` + +The `Block` parameter stays so `with_location` can anchor the +warning at the original wrapper's source range (atomic CN paths) +or fall through to a no-range diagnostic (no-preimage container +paths — `with_location` accepts a `SourceInfo::Generated` whose +`preimage_in` returns `None`; the resulting warning lands without +a Monaco squiggle and surfaces in the diagnostics banner only). + +**Catalog reconciliation.** The catalog entry +(`crates/quarto-error-reporting/error_catalog.json`) currently +carries one Q-3-43 with a generic `message_template`. 
The +builder is now responsible for the per-case body text (matching +Plan 7's already-established "builder picks body text, catalog +holds metadata" convention for Q-3-43), so no catalog change is +needed. Confirm by grepping the catalog entry's `since_version` +is still `"99.9.9"`; if a later catalog reformat tries to pin +body text, that's the point to push back on. + +**Implementation steps.** + +- [ ] Add the `Q343Reason` enum next to `diagnostic_q3_43_block` + in `incremental.rs`. Keep it `pub(super)` or module-private; + it's a call-boundary type, not part of the writer's external + API. +- [ ] Replace the body of `diagnostic_q3_43_block` per the sketch + above. Title, problem, hint per variant. +- [ ] Update the three call sites in `coarsen`: + - `incremental.rs:320`: pass `Q343Reason::NoPreimageReplacement + { kind: kind_of(new_block) }` where `kind_of` reads + `Generated.by.kind` (use the existing `.is_kind(...)` helper + family or write a small `by_kind_of_block(&Block) -> Option<&str>`). + - `incremental.rs:344`: pass `Q343Reason::IncludeRecurse + { include_path: include_path_of(orig_block) }` — write a + small helper that downcasts `Block::Custom(cn)` and reads + `cn.plain_data.get("source_path").and_then(|v| v.as_str())`. + - `incremental.rs:350`: pass `Q343Reason::MetadataContainerRecurse + { kind: by_kind_of_block(orig_block).unwrap_or("generated") }`. +- [ ] Adjust the existing soft-drop tests in + `coarsen_plan7_tests` (`incremental.rs:1525+`) so they assert + the *new* per-case problem text: + - `recurse_into_atomic_custom_node_soft_drops_to_verbatim` + (line ~1807): wrap the original `CrossrefResolvedRef` + CustomNode with `plain_data` containing + `{"source_path": "foo.qmd"}`; assert the warning's problem + contains `"foo.qmd"`. Add a `_no_source_path` variant that + omits `plain_data` and asserts the fallback wording. 
+ - `recurse_into_no_preimage_generated_soft_drops_to_omit` + (line ~1851): assert the problem contains `"appendix"` + (the `By::appendix()` kind used by the fixture). + - `use_after_on_no_preimage_generated_soft_drops_to_omit` + (line ~1769): assert the problem contains the new-side + block's kind. +- [ ] Add a Phase-6-specific test that exercises all three + `Q343Reason` variants through `diagnostic_q3_43_block` + directly (skipping `coarsen`); compact regression pin for + the message text. + +**Location anchoring — what `with_location` should resolve to.** + +The current code passes `new_block` at the UseAfter→Omit site +(line 394) and `orig_block` at the two RecurseIntoContainer +sites (lines 427, 467). The new-side block in the UseAfter case +is React-constructed — its `source_info` is typically +`Generated { by: user_edit, from: [] }` or a `SourceInfo::default()`. +`preimage_in` returns `None` on either, so the Monaco squiggle +doesn't land anywhere useful. The original-side block in this +case is a no-preimage Generated container, whose `source_info` +also has no useful preimage — so the squiggle problem is intrinsic +to the case, not a fixable bug. + +Two things follow from that: + +- **For the two RecurseIntoContainer sites, `orig_block` is the + right anchor and the code already does it.** The IncludeRecurse + case has a useful range (the include token); the + MetadataContainerRecurse case doesn't, but choosing `orig_block` + over `new_block` is still correct because the warning is *about* + the original wrapper, and downstream attribution layers + (`resolve_byte_range`, etc.) prefer original-side info. +- **For the UseAfter→Omit site, switch from `new_block` to the + original block's source_info IF available.** Today the call + site doesn't bind any `orig_block` — `BlockAlignment::UseAfter` + has no `displaced_before_idx`. Two options: + - **v1 fix (cheap):** pass the new block (current behavior), + accept that the diagnostic carries no useful location. 
Pin the + behavior with a test so future contributors don't accidentally + "fix" it without parallel work on the alignment type. + - **v2 fix (parallel to Phase 7):** extend + `BlockAlignment::UseAfter` the same way Phase 7 extends + `InlineAlignment::UseAfter`, then pass `original_blocks[displaced_before_idx]`. + Out of scope for Plan 7c — file a follow-up beads issue. + +The v1 fix is what Phase 6 ships. Tests pin current behavior, +the v2 follow-up is a beads-issue note. + +- [ ] Add `q3_43_location_anchors_to_original_block_on_recurse`: + assert that for `recurse_into_atomic_custom_node_soft_drops_to_verbatim` + and `recurse_into_no_preimage_generated_soft_drops_to_omit`, + the emitted warning's `location` matches the *original* + block's `source_info`, not the new block's. Cheap pin + against accidental regression. +- [ ] Add `q3_43_location_falls_back_to_new_block_on_use_after`: + for `use_after_on_no_preimage_generated_soft_drops_to_omit`, + assert that the warning's `location` is the new block's + `source_info` (current v1 behavior). Comment block explains + the v2 follow-up. +- [ ] File a follow-up beads issue: "Block-level UseAfter soft-drop: + extend `BlockAlignment::UseAfter` to carry + `displaced_before_idx` (parallel to Plan 7c Phase 7's inline + fix)." Reference Plan 7c Phase 6 location-anchoring v2. + Priority 3 (polish — no user-visible squiggle today either + way; affects attribution metadata downstream). + +- [ ] `cargo nextest run -p pampa` green. +- [ ] `cargo xtask verify --skip-hub-build --skip-hub-tests` + green. + +**Why an enum, not three top-level helpers.** + +A reasonable alternative is three named helpers +(`q3_43_include_recurse`, `q3_43_metadata_recurse`, +`q3_43_no_preimage_replace`) instead of one builder taking an +enum. The enum is preferred here because: + +1. The failure mode we're fixing — "someone added a new soft-drop + site and reused the generic message" — is exactly what landed + in Plan 7. 
The enum's exhaustiveness check makes the regression + structural: a new `Q343Reason::Foo` is a compile error until + the builder handles it. +2. The catalog has one Q-3-43 entry; modelling the call sites as + one builder mirrors that shape and avoids future drift between + the catalog and the emission code. +3. Adding a fourth Q-3-43 emission site (likely from Plan 8's + IncludeExpansion work) means one new enum variant and one new + match arm — no scaffolding to copy-paste. + +If a future case grows wildly different message structure (e.g. +a multi-paragraph body), peel it off into its own helper at that +point. + +#### Phase 7 — Inline soft-drop carries the displaced original index + +**Repo facts the implementer needs:** + +- Soft-drop site: + `crates/pampa/src/writers/incremental.rs:1069-1080` + (`assemble_inline_content`, the `UseAfter(_)` arm of the + effective-alignment-rewriting loop). The current code reaches + for `orig_inlines.get(result_idx)` to find the original inline + whose editability gates the soft-drop. +- The comment in the code is honest about the proxy: + > "exact for in-place retypings (the common shortcode-edit + > case), approximate for arbitrary insertions/deletions." +- Reconciler type: + `crates/quarto-ast-reconcile/src/types.rs:112-124` + (`InlineAlignment`). The relevant variant is + `UseAfter(usize)` — tuple variant carrying only `after_idx`. +- The same shape exists for blocks: + `BlockAlignment::UseAfter(usize)` at line 100. The block + soft-drop path does **not** consult an original-side index + (it checks the new-side block's editability via + `new_block.source_info().preimage_in(...)`), so this phase is + inline-only. Block soft-drop is correct as-is. +- Today's test suite for inline soft-drop: + `inline_use_after_on_atomic_generated_soft_drops_to_keep_before_with_q3_42` + at `incremental.rs:2027`. All inline-soft-drop fixtures align + `orig_inlines[i]` with `new_inlines[i]` 1:1, so the proxy + bug is invisible to CI. 
+ +**The fix — extend `InlineAlignment::UseAfter` to a struct variant.** + +The reconciler is the only place that knows which original inline +(if any) the `UseAfter` is replacing. Today's tuple variant +throws that information away; the fix is to keep it. Change to: + +```rust +// crates/quarto-ast-reconcile/src/types.rs +pub enum InlineAlignment { + KeepBefore(usize), + + /// Use the after-side inline. `displaced_before_idx` is + /// `Some(i)` when the reconciler treated this as a replacement + /// of `orig_inlines[i]` (the common positional-edit case); + /// `None` for genuine inserts where no original aligns with + /// this slot. Consumers that gate on the original inline's + /// editability (e.g. the writer's soft-drop) MUST use this + /// field rather than deriving it from the alignment index. + #[serde(rename = "use_after")] + UseAfter { + after_idx: usize, + #[serde(default)] + displaced_before_idx: Option<usize>, + }, + + RecurseIntoContainer { before_idx: usize, after_idx: usize }, +} +``` + +`Option<usize>` rather than `usize` because inserts (no +displaced original) and replacements (displaced original known) +both need to be expressible. The `#[serde(default)]` makes the +new field absent-friendly on the wire — pre-existing JSON +serializations of `UseAfter` deserialize cleanly with +`displaced_before_idx = None`, which is the "be conservative, +don't soft-drop" answer. + +Caveat to confirm: `#[serde(default)]` only covers a field that +is missing from the struct-variant map. A payload written by the +old tuple variant (for example `{"use_after": 3}` under serde's +default external tagging) has a different shape and will not +deserialize into the struct variant without a compatibility +shim. Verify whether any such payload is ever persisted or sent +across versions before relying on this. + +**Why a struct variant, not a new enum variant.** + +A less-invasive alternative is to add `UseAfterReplacing +{ after_idx, before_idx }` alongside `UseAfter(usize)` and leave +the existing variant for genuine inserts. Rejected because: + +- Every consumer of `InlineAlignment` then has to handle two + variants that mean almost the same thing. The writer's match + arms double.
+- The reconciler still has to decide which variant to emit on + every alignment, and that decision *is* the + `displaced_before_idx` Option — just expressed in two enum + variants instead of one struct variant with an `Option`. + +Struct-variant migration is mechanical: `cargo build` will list +every pattern match that needs updating. + +**Reconciler-side: populate `displaced_before_idx`.** + +The reconciler at +`crates/quarto-ast-reconcile/src/inline.rs` (or wherever +inline alignment is decided — locate via `git grep +'InlineAlignment::UseAfter'`) produces `UseAfter` from its +positional alignment loop. In practice: + +- LCS-style alignment: when `UseAfter(j)` is emitted at result + position `r`, the reconciler has just consumed `orig_inlines[i]` + on the original side (or hasn't, in which case this is an + insert). The `displaced_before_idx` is `Some(i)` in the + consumed case, `None` in the insert case. +- Positional alignment: `displaced_before_idx = Some(r)` when + `r < orig_inlines.len()`, `None` otherwise. + +The exact derivation depends on the reconciler's algorithm. +Locate the alignment loop and add the index alongside the +existing `after_idx` emission. + +**Writer-side: consume `displaced_before_idx`.** + +```rust +// crates/pampa/src/writers/incremental.rs (assemble_inline_content) +InlineAlignment::UseAfter { after_idx, displaced_before_idx } => { + if let Some(orig_idx) = displaced_before_idx + && let Some(orig) = orig_inlines.get(*orig_idx) + && !is_editable_inside_inline(orig, target_file_id) + { + warnings.push(diagnostic_q3_42_inline(orig)); + effective.push(InlineAlignment::KeepBefore(*orig_idx)); + continue; + } + effective.push(alignment.clone()); +} +``` + +When `displaced_before_idx` is `None` (a genuine insert), there +is no original to gate against, and the alignment passes through +unchanged. That is the correct behaviour — inserts can't soft-drop +because there's nothing they're displacing. 
+ +**Implementation steps.** + +- [ ] In `crates/quarto-ast-reconcile/src/types.rs`: change + `InlineAlignment::UseAfter` from tuple variant `(usize)` to + struct variant `{ after_idx, displaced_before_idx }` per + the sketch. Update the serde rename and add + `#[serde(default)]` on the new field. +- [ ] `cargo build --workspace` and walk every compile error; + update each pattern match. Reconciler tests in the same + crate will surface most of them. Writer call sites in + `pampa::writers::incremental` will surface the rest. +- [ ] Reconciler: populate `displaced_before_idx` in the inline + alignment loop. Add a test in + `quarto-ast-reconcile` asserting the field is populated for + a fixture where `UseAfter` replaces an original inline, + and is `None` for a fixture that inserts a fresh inline. +- [ ] Writer: replace `orig_inlines.get(result_idx)` at + `incremental.rs:1074` with the `displaced_before_idx`-aware + logic. Remove the `result_idx` positional proxy and its + explanatory comment. +- [ ] Add a regression test: + `inline_use_after_with_insert_before_shortcode_does_not_misfire`. + Construct an inline plan with `[Insert("X"), UseAfter + (over-shortcode)]` so the result-side index `1` and the + original-side index `0` differ. Assert the soft-drop fires + against the original shortcode inline (the + `displaced_before_idx`), not against + `orig_inlines.get(result_idx=1)` (which would be out of + bounds, or wrong). +- [ ] Add a complementary test: + `inline_use_after_pure_insert_does_not_soft_drop`. A new + inline with `displaced_before_idx = None` must not consult + `orig_inlines` at all. Assert no Q-3-42 is emitted. +- [ ] `cargo xtask verify --skip-hub-build --skip-hub-tests` + green. +- [ ] `cargo xtask verify` (full) — the WASM bridge passes + `ReconciliationPlan` JSON over the wire; the + `#[serde(default)]` makes the change wire-compatible, but + a full verify confirms nothing else broke. 
+ +**Wire-format compatibility.** + +The TS side at +`ts-packages/quarto-sync-client/src/types.ts` does not currently +deserialize `ReconciliationPlan` itself — the plan is computed +inside WASM and never crosses the boundary as JSON. Confirm with +`git grep -l 'InlineAlignment'` in `ts-packages/` and +`hub-client/`; if any TS consumer turns up, the same +`#[serde(default)]` semantics apply on the parsing side (new +field absent ⇒ `null`/`undefined` ⇒ "don't soft-drop"). + +#### Phase 8 — `target_file_id` derivation skips no-`root_file_id` first blocks + +**Repo facts the implementer needs:** + +- Current derivation site: + `crates/pampa/src/writers/incremental.rs:289-293`. The current + shape: + ```rust + let target_file_id = original_ast + .blocks + .first() + .and_then(|b| b.source_info().root_file_id()) + .unwrap_or(quarto_source_map::FileId(0)); + ``` +- `root_file_id()` lives at + `crates/quarto-source-map/src/source_info.rs:487-498`. For + `Generated`, it walks the `Invocation` anchor; for an empty + `from[]` it returns `None`. So a document whose first block is + a synthesized title-block (no Invocation) gets `None` → + fallback to `FileId(0)`. +- `FileId(0)` is the wire-format default — the same FileId the + parser stamps on a fresh single-file parse. So on a one-file + document, `FileId(0)` happens to be correct by coincidence, + and the bug only surfaces when there's a real cross-file + story (Plan 8's IncludeExpansion, the q2-preview-spa's project + mode addressing multiple files). +- Today the bug is dormant. We don't ship multi-file editing + in this writer pass yet; Plan 8 will. But the test is cheap + and the fix is cheap, and shipping them now means Plan 8 doesn't + have to rediscover the issue. 
+ +**The fix — `iter().find_map(...)` over `first().and_then(...)`.** + +```rust +let target_file_id = original_ast + .blocks + .iter() + .find_map(|b| b.source_info().root_file_id()) + .unwrap_or(quarto_source_map::FileId(0)); +``` + +`find_map` walks blocks in order, returning the first block whose +`root_file_id()` resolves to `Some`. Synthesized title-blocks, +sectionize wrappers, footnotes containers — anything Generated +with empty `from[]` — get skipped. The fallback to `FileId(0)` +remains for the genuinely-empty-document case (no blocks at all, +or every block is no-`root_file_id` Generated). + +**Implementation steps.** + +- [x] Write the failing test first: + `target_file_id_skips_synthesized_first_block`. Build a + Pandoc whose `blocks[0]` is a synthesized title-block (e.g. + `Block::Header` with + `SourceInfo::generated(By::title_block())` and empty `from[]`) + and whose `blocks[1]` is a real `Original` paragraph with + `FileId(7)`. Drive `coarsen` and assert that the editability + check on `blocks[1]` returns `true` (i.e. `target_file_id` + resolved to `FileId(7)`, not `FileId(0)`). The pre-fix + coarsen sees `target_file_id == FileId(0)`, + `preimage_in(FileId(0))` on a `FileId(7)`-Original returns + `None`, and the block is gated as non-editable — the test + fails. +- [x] Apply the `find_map` fix at `incremental.rs:289-293`. + Implemented as a recursive `derive_target_file_id` helper + that descends through `block_block_children` as well, so a + sole-top-level sectionize wrapper (with the user's real + blocks inside) also yields the right file id rather than + `FileId(0)` by accident. The implementation note in §"Why + this isn't already broken in CI" below remains accurate: + single-file fixtures with `Original`-first blocks hit the + fast path; the wrapper-first variant required descent. +- [x] Re-run the test; assert it passes. +- [x] Add a fully-empty-document test: + `target_file_id_defaults_to_zero_for_empty_document`. 
The + `FileId(0)` fallback only kicks in when every block returns + `None` from `root_file_id()` — or there are no blocks. +- [x] `cargo nextest run -p pampa target_file_id` green. +- [x] `cargo xtask verify --skip-hub-build --skip-hub-tests` + green. + +**Why this isn't already broken in CI.** + +The existing test suite uses fixtures with `Original`-first +blocks: `keep_before_with_original_in_target_emits_verbatim` +at `incremental.rs:1565` builds a `Paragraph` with +`SourceInfo::original(TARGET, 10, 25)` at `blocks[0]`, so +`root_file_id()` returns `Some(TARGET)` immediately and the +fallback path is never hit. A title-block-first fixture +exposes it. The Plan 8 single-file include story would hit +it too — pre-empting that discovery is the value here. + +#### Phase 9 — Verification + +- [ ] `cargo xtask verify` (full) green. +- [ ] End-to-end exercise: open `q2-preview` against a small + fixture, type into a `{{< meta foo >}}`-resolved region + *before* the first render completes (or use the dev server + with artificial render delay), confirm Q-3-41 appears in the + `DiagnosticStrip`. Record the invocation + observed + diagnostic in the plan body under §"Verification" per + `CLAUDE.md`'s end-to-end rule. +- [ ] End-to-end exercise for the framework gate: open a fixture + with a no-preimage Generated container (e.g. the synthesized + footnotes Div from Plan 6 + a single inline edit), confirm + the React dispatcher's gate now intercepts the typing before + the writer's soft-drop fires (no `Q-3-43` flashes through). +- [ ] Plan-7 doc gets a "Closed via Plan 7c" footnote on the four + open items (do not flip the checkboxes — they describe + Plan-7 scope; Plan 7c is a follow-up). + +### Out of scope + +- Anything in Plan 7b (writer-lossless baseline test; + filter-construction *block-level* UseAfter test; e2e Playwright + matrix). +- `is_editable_inside` migration to `quarto_core::editability`. 
+ The Rust module lives in `pampa::writers::incremental` for + documented dependency-cycle reasons (see Plan 7 Phase 1 + implementation note). The TS-side predicate goes into + `preview-renderer`, mirroring the consumer placement; no + attempt is made to unify the module names. +- Plan 9 (`ValueSource`) / Plan 10 (`Dispatch`) work. The role- + asymmetry rule (`preimage_in` walks `Invocation` only) is + already in place on both sides; future anchor roles inherit + the gate behaviour for free. +- New diagnostic codes beyond Q-3-41. The codes for the gate + surfaces (Q-3-42, Q-3-43) are already implemented. +- Suppressing Q-3-41 spam in autosave contexts. The current + `suppressAfterThree` helper in `DiagnosticStrip` keys by source + range; Q-3-41 has no range so will repeat per keystroke. If + this proves noisy in practice, file a follow-up to extend the + helper to also key by code. + +## Design decisions (settled in conversation) + +- **Q-3-41 is TS-constructed, not Rust-constructed.** The writer + is never invoked in the no-baseline branch — the gate intercepts + before the bridge. A Rust `diagnostic_q3_41()` builder would be + dead code; the catalog entry exists for docs URL / version + consistency only. (Plan 7 §"Catalog mechanics" already + established that the writer's Q-3-43 emission picks its body + text via the builder, not the catalog template; Q-3-41 takes + the same path with the builder on the TS side.) +- **`targetFileId` defaults to `0`.** Both sides default the + target FileId to 0 when the AST lacks a first-block root + FileId — see `incremental.rs:289-293` for the Rust precedent. + The default is safe for empty documents (won't match any real + source bytes; `hasPreimageIn` returns `undefined`; gate + conservatively denies editing). 
+- **TS predicate placement.** `hasPreimageIn` / + `isEditableInside` go into the existing `utils/sourceInfo.ts` + rather than a new module — they're a natural extension of the + atomicity helpers already there, and the `ATOMIC_KINDS` set is + next to them. +- **No new context fields.** `targetFileId` joins the existing + `RegistryContext`; no new context type is introduced. The + default-`0` semantics matter: dispatchers that don't pass it + fall through to the same "no preimage anywhere" behaviour they + had before (since the wire-format default `d` is FileId 0, + which matches the gate). The only practical regression is if a + caller relies on editing happening inside a non-zero-FileId + AST without setting `targetFileId` — that's a Plan 8 / include + story and not regressed today. +- **Phase ordering inside Phase 2.** The implementation order + inside Phase 2 is: predicate + tests → context plumbing → + dispatcher gate. The predicate is independently testable; the + context plumbing only matters when the gate consumes it; the + gate is the integration point. + +## References + +- Audit transcript (2026-05-25 Claude session): the four items + numbered 1–4 in §Goal map to that audit's items 1, 2, 3, and 4. +- `claude-notes/designs/incremental-writer-contract.md` — + consumer-side contract; §"Role-asymmetry" and §"Unified + editability predicate" pin the rules this plan implements. +- `claude-notes/designs/provenance-contract.md` — producer-side + contract; §4 "Role-asymmetry" and §7 "Atomic-kind set" + cross-reference the editability work. +- `claude-notes/designs/transparent-wrappers.md` — sibling + contract introduced 2026-05-25 alongside Phase 8's fix. Names + the descent pattern that `derive_target_file_id` implements + and lifts it into a reusable primitive (`first_in_user_tree`) + that future plans (8/9/10/replay) can cite without + rediscovering. 
+- `claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md` + — Phase 1 implementation note documents the + `pampa::writers::incremental` placement (the deliberate + deviation from the post-review `quarto_core::editability` + pin). +- `claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md` + — the *other* Plan-7-followup test pass. Plan 7c is disjoint; + scan Plan 7b before adding any test to make sure it's not + already covered there. +- `crates/pampa/src/writers/incremental.rs:113-162` — Rust + reference for the editability predicate. +- `crates/quarto-source-map/src/source_info.rs:406-442` — Rust + reference for `preimage_in` (Original / Substring / Concat / + Generated walk). +- `ts-packages/preview-renderer/src/utils/sourceInfo.ts` — TS + target file for the new predicates. +- `ts-packages/preview-renderer/src/framework/dispatch.tsx:404-411` + — the gate to update. + +## Estimated scope + +| Phase | Lines (rough) | +|-------|---------------| +| 1 — Q-3-41 catalog entry | ~15 | +| 2 — TS predicates + context + gate + unit tests | ~250 | +| 2 — Cross-language parity fixture + tests (Rust gen + TS consumer + atomic-kinds belt-suspenders) | ~200 | +| 3 — First-edit Q-3-41 emission + helper + tests | ~120 | +| 4 — Rust per-kind tests | ~120 | +| 5 — `cfg(debug_assertions)` `#[should_panic]` test | ~25 | +| 6 — Differentiated `Q-3-43` builder + call-site updates + test adjustments + location-anchoring tests | ~180 | +| 7 — Inline soft-drop: extend `InlineAlignment::UseAfter` to struct variant + reconciler population + writer consumption + regression tests | ~180 | +| 8 — `target_file_id` derivation: `find_map` over `first()` + regression tests | ~40 | +| 9 — Verification | (no code) | +| **Total** | **~1130** | + +Roughly the size of Plan 7 itself. Phase 6 and Phase 7 add real +correctness fixes (Phase 6 closes a doc-vs-code drift on Q-3-43 +body text; Phase 7 fixes a positional-proxy hole in inline +soft-drop). 
The new parity-test work in Phase 2 adds a structural +sync check so the TS↔Rust walker pair can't drift silently. + +No new diagnostic codes. No new pipeline tier. The +`InlineAlignment::UseAfter` shape change in Phase 7 is the only +type-surface change; `#[serde(default)]` keeps it wire-compatible. + +## Risk areas + +- **Q-3-41 spam in autosave.** Without a code-keyed suppression + rule, every keystroke before first-render emits a fresh + warning. The DiagnosticStrip's `suppressAfterThree` keys on + source range and Q-3-41 has none. Acceptable for v1 — the + pre-render window is short — but document the limitation in + the strip's comment so a future contributor can extend the + helper. +- **`targetFileId` derivation under include.** Plan 8's + IncludeExpansion wrapper introduces source content from a + non-zero FileId. The default-`0` derivation in Phase 2 is + conservative: nodes whose root FileId is the included file + fail `hasPreimageIn(target=0)`, so the gate denies editing. + This is the *correct* behavior for v1 (editing inside an + included child should require the user to open the child), + but worth confirming with a fixture once Plan 8 lands. +- **Gate desync between Rust and TS.** The two predicates must + agree on which kinds are atomic, which roles are walked, and + how `preimage_in` chains resolve. The parity fixture work + inside Phase 2 makes this structural: Rust generates + `test-fixtures/preimage-parity/cases.json` from its in-code + corpus, a Rust test fails when that fixture is stale, and the + TS test fails when its walker disagrees with the regenerated + fixture. Future-walker changes either re-bless both sides + (matching) or trip one of the two gates (loud). The atomic- + kinds belt-and-suspenders fixture catches the simpler "added a + kind on one side only" drift in one Rust + one TS assertion. 
+- **Reconciler change in Phase 7 ripples through pattern + matches.** Changing `InlineAlignment::UseAfter` from + `(usize)` to `{ after_idx, displaced_before_idx }` is a + breaking change for every consumer of the type. The mechanical + fix is `cargo build --workspace` until clean; the risk is a + consumer that silently ignores the new field (e.g. wildcards + the variant). After Phase 7, audit for `InlineAlignment::UseAfter + { .. }` matches that don't bind `displaced_before_idx`; any such + match outside test code should be reviewed. +- **`Q343Reason::IncludeRecurse { include_path: None }` fallback.** + Atomic CustomNodes without a `source_path` field in `plain_data` + (e.g. `CrossrefResolvedRef` today) fall back to a generic + message. That's worse than the catalog promise but better than + Plan 7's all-cases-identical text. Plan 8's IncludeExpansion + will give us the include path universally for include cases; + CrossrefResolvedRef would need its own `Q343Reason` variant + later (e.g. `Q343Reason::CrossrefRecurse { ref_id: &str }`) if + the message text needs to differ. + +## Notes + +This is the third Plan-7 follow-up alongside Plan 7a (runtime +filter idempotence, `bd-bk3y` / Q-3-44/45) and Plan 7b +(test-o-rama). Each addresses a different gap left by the +2026-05-24 implementation session; together with this plan, the +post-Plan-7 surface is closed. + +Phases 1–5 close gaps where the implementation drifted from the +post-review intent — no contract change. Phases 6 and 7 close +correctness/UX issues that the post-implementation code review +surfaced: + +- **Phase 6** brings Q-3-43's body text up to the contract the + doc already promises. Mechanical fix; the contract itself is + unchanged. +- **Phase 7** narrows the inline soft-drop's positional proxy by + threading the displaced original index through + `InlineAlignment::UseAfter`. 
This is a small reconciler-type + contract change (struct variant + `Option` field) — and + the only contract change in the plan. The semantics it adds + (the reconciler tells consumers which original was displaced) + is what the writer already needed and approximated; the type + now expresses it honestly. + +If a reviewer reads this and thinks "this needs a design +discussion," the only candidate is Phase 7's reconciler-type +change, which is the kind of small structural sharpening that +fits inside this plan rather than a separate design doc. The +other phases are wiring + test work + a single-file +diagnostic refactor. + +Update the contract docs alongside the implementation: + +- `claude-notes/designs/incremental-writer-contract.md` — + §"User-facing diagnostic surface" should note that Q-3-43 + body text differentiates by reason (include / metadata / + replacement), with the wording the builder produces. +- `claude-notes/designs/incremental-writer-contract.md` — + §"Soft-drop semantics" should note that the inline-level + case consults `InlineAlignment::UseAfter`'s + `displaced_before_idx` (the reconciler's truth) rather than + the alignment's result-side index. diff --git a/claude-notes/research/2026-05-22-plan-6-audit.md b/claude-notes/research/2026-05-22-plan-6-audit.md new file mode 100644 index 000000000..422935d0d --- /dev/null +++ b/claude-notes/research/2026-05-22-plan-6-audit.md @@ -0,0 +1,184 @@ +# Plan 6 audit: `SourceInfo::default()` sites in transforms + +**Date:** 2026-05-22 +**Branch:** feature/provenance +**Plan:** `claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md` + +Comprehensive grep of `SourceInfo::default()` in `crates/quarto-core/src/transforms/` +and `crates/pampa/src/` (excluding test code). 682 total occurrences across +50+ files. This report categorizes the **production** (non-test, non-reader) +sites and decides Plan 6's disposition for each. + +## A.
In Plan 6 scope — fix in this pass + +These are the sites the plan body enumerates. Each gets either +`Generated { by: By::<kind>(), from: smallvec![] }` (true synthesizers), +`Generated { by: By::shortcode(name), from: [Invocation] }` (shortcode +results via the stamper), or threaded source info (theorem/proof +name-attr, error/literal call sites). + +### Shortcode resolver +`crates/quarto-core/src/transforms/shortcode_resolve.rs` — 12 production +sites, all funnelled through `resolve_shortcode`'s dispatch: + +| Line | Site | Stamper covers? | +|------|---------------------------------------------|-----------------| +| 172 | `config_value_to_inlines` Str | yes | +| 179 | `config_value_to_inlines` Str (bool) | yes | +| 186 | `config_value_to_inlines` Str (int) | yes | +| 203 | `config_value_to_inlines` Str (plain) | yes | +| 208 | `config_value_to_inlines` Str (empty) | yes | +| 215 | `config_value_to_inlines` Str (invalid) | yes | +| 222 | `config_value_to_inlines` Str (Path/Glob) | yes | +| 238 | `flatten_blocks_to_inlines` Space | yes | +| 470 | `lua_result_to_shortcode_result::Text` Str | yes | +| 1034 | `make_error_inline` inner Str | no — call-site threading | +| 1036 | `make_error_inline` outer Strong | no — call-site threading | +| 1109 | `shortcode_to_literal` Str | no — call-site threading | + +Lines 1468 / 1473 / 1576 / 1578 are inside test modules — out of scope. + +### True synthesizers +- `crates/quarto-core/src/transforms/title_block.rs:183, 185` — h1 + + Str. `By::title_block()`. +- `crates/pampa/src/transforms/sectionize.rs:96, 148` — Section Div on + the two close-section paths. `By::sectionize()`. +- `crates/quarto-core/src/transforms/footnotes.rs:495` — footnotes + container Div. `By::footnotes()`. (Per plan: synthesized `<sup>` + markers and footnote backlinks are *not* added here; they reuse the + Note's source_info or are inline overlaps Plan 7 covers.)
+- `crates/quarto-core/src/transforms/appendix.rs:230, 265, 286, 335, 376` + — five `let source_info = SourceInfo::default()` synthesizer headers + for `wrap_bibliography`, `create_appendix_container`, + `create_license_section`, `create_copyright_section`, + `create_citation_section`. All get `By::appendix()` (the plan only + enumerates `create_appendix_container`; the four other Appendix + helpers are structurally identical — see decisions below). +- `crates/pampa/src/pandoc/treesitter_utils/postprocess.rs:1348` — + synthetic Space between citation and suffix. `By::tree_sitter_postprocess()`. + +### Threaded source info (not Generated) +- `crates/quarto-core/src/transforms/theorem.rs:313` — name-attr title + Str. Thread `attr_source.attributes[idx].1` from + `extract_name_attr`'s caller, with positional-alignment guard. +- `crates/quarto-core/src/transforms/proof.rs:167` — parallel site in + `proof.rs`. Same fix. + +### Decisions on plan-adjacent sites +- **Appendix's four helper functions** (`wrap_bibliography`, + `create_license_section`, `create_copyright_section`, + `create_citation_section`) — the plan only enumerated the container + Div. Including the four helpers extends the scope by ~16 LOC of + trivial mechanical change and keeps the appendix-pipeline output + free of `SourceInfo::default()` for the audit-completion test. + Decision: include in Plan 6. + +## B. Out-of-scope synthesizers (follow-ups) + +These ARE true AST synthesizers that today emit `SourceInfo::default()` +and would benefit from a `Generated` shape, but the plan doesn't +enumerate them and they each require either a new `By::` constructor +or a design decision (atomicity classification). Open as follow-up +beads issues; do not block Plan 6 on them. + +- **`crates/quarto-core/src/transforms/callout_resolve.rs:267`** — + default callout title (e.g. "Note", "Tip" when the user didn't write + one). One synthesizer site; needs `By::callout()` + an + `is_atomic_kind` decision. Open beads. 
+ +## C. Out of scope — website chrome / project-level + +These are transforms that generate website chrome (TOC, navigation, +sidebars, etc.) from metadata, not from the qmd body. They run *after* +the document profile checkpoint (per CLAUDE.md "Document profile +checkpoint" section) and consume the profile rather than processing +source-tracked content. Source attribution for these synthesizers is a +separate design problem (likely tied to the website-project epic). +Plan 6 explicitly defers them: + +- `categories_sidebar.rs`, `footer_generate.rs`, `footer_render.rs`, + `listing_generate.rs`, `listing_render.rs`, `navbar_generate.rs`, + `navbar_render.rs`, `navigation_active.rs`, `navigation_enrich.rs`, + `navigation_href.rs`, `page_nav_generate.rs`, `page_nav_render.rs`, + `sidebar_auto.rs`, `sidebar_generate.rs`, `sidebar_render.rs`, + `toc_generate.rs`, `toc_render.rs`, `website_canonical_url.rs`, + `website_favicon.rs`, `website_title_prefix.rs`. + +Most of these construct `ConfigValue` instances (with `source_info` +fields) rather than `Inline`/`Block` AST nodes; they're typed as +config rather than as user content. + +## D. Out of scope — non-synthesizer code + +- **`crates/pampa/src/readers/json.rs`** — JSON reader. Per the doc + comment at line 80, `SourceInfo::default()` is intentional here: + Pandoc JSON files have no source location data. Out of scope. +- **`crates/pampa/src/writers/{html,json}.rs`** — output writers; any + `SourceInfo::default()` here is for output-only intermediate AST + shaping. Out of scope. +- **`crates/pampa/src/lua/*`** — Lua infrastructure. Plan 4 already + introduced `filter_source_info` (the canonical auto-attach for + typed Lua filter constructions). The remaining + `SourceInfo::default()` in this directory is either deep + type-construction plumbing (`pandoc.X()` wrappers that + `filter_source_info` overrides on the way out) or bare-string + result fallbacks. 
Out of scope for Plan 6; the Dispatch follow-up + (bd-36fr9) will revisit. +- **`crates/pampa/src/{citeproc_filter,json_filter,filters}.rs`** — + filter execution paths. Constructions inside Lua filters are + already handled by `filter_source_info`; the bare + `SourceInfo::default()` here is for filter-internal scaffolding + (containers spliced around filter output). Out of scope. +- **`crates/pampa/src/template/*`** — Pandoc-compatible template + engine. Doctemplate output is not source-tracked through this + pipeline. Out of scope. +- **`crates/pampa/src/pandoc/{meta,shortcode}.rs`** — type-level + defaults / data-shape conversions, not pipeline-level synthesis. + Out of scope. +- **`crates/pampa/src/toc.rs`** — TOC generation. Same scope note as + the website-chrome transforms in §C. +- **`crates/quarto-core/src/transforms/{code_block_generate,code_block_render}.rs`** + — code-block decoration (filename labels, captions). Possibly + in-scope for a future audit pass; defer for now. Open beads. +- **`crates/quarto-core/src/transforms/config.rs`** — config-merge + bookkeeping. Constructs ConfigValues, not user-content AST nodes. + Out of scope. +- **`crates/quarto-core/src/transforms/link_rewrite.rs`** — link + rewriting (URL canonicalization). The 13 sites are mostly + test-helper code; the production sites construct intermediate + Link/Image nodes whose `source_info` is then overwritten with the + original node's `source_info` later in the rewrite. The default + acts as a placeholder. Audit shows no genuine synthesis; out of + scope. + +## E. Test-only sites + +Filter list: `dummy_source_info`, `#[cfg(test)]` modules, +`fn test_*`. These are intentional test scaffolding and out of scope. + +## Audit summary + +| Category | Count (production) | +|----------------------------------|--------------------| +| A. In Plan 6 scope (will fix) | 22 | +| B. Plan-adjacent synthesizers | 1 | +| C. Website chrome (deferred) | ~120 | +| D. Non-synthesizer code | ~80 | +| E. 
Test scaffolding | ~459 | +| **Total** | **~682** | + +The audit-completion test (Plan 6 test plan) asserts the §A sites +all become `Generated` (or threaded `Original`) shapes after the +pass. It does not assert that §B/C/D become Generated — those are +out of scope. + +## Follow-up beads to open + +- **Callout default-title synthesizer** (callout_resolve.rs:267). + Needs `By::callout()` constructor + atomicity decision. +- **Code-block decoration** (code_block_generate / code_block_render). + Audit pass for codeblock chrome. + +These are opened as discovered-from links to whatever beads issue +tracks Plan 6's umbrella work (or left as standalone follow-ups +since Plan 6 is plan-driven, not beads-driven). diff --git a/crates/pampa/Cargo.toml b/crates/pampa/Cargo.toml index a8398dd6c..b19f22dad 100644 --- a/crates/pampa/Cargo.toml +++ b/crates/pampa/Cargo.toml @@ -58,6 +58,7 @@ regex = { version = "1.12.3", features = ["unicode"] } clap = { version = "4.5", features = ["derive"] } serde = { workspace = true, features = ["derive"] } serde_json = "1.0" +smallvec.workspace = true glob = "0.3" paste = "1.0.15" once_cell = "1.21.3" diff --git a/crates/pampa/src/lua/diagnostics.rs b/crates/pampa/src/lua/diagnostics.rs index 5a16262f2..a5bac7c30 100644 --- a/crates/pampa/src/lua/diagnostics.rs +++ b/crates/pampa/src/lua/diagnostics.rs @@ -10,7 +10,8 @@ use mlua::{Error, Lua, MultiValue, Result, Table, Value}; use quarto_error_reporting::DiagnosticMessage; -use quarto_source_map::{FileId, SourceInfo, SourcePiece}; +use quarto_source_map::{Anchor, AnchorRole, By, FileId, SourceInfo, SourcePiece}; +use smallvec::SmallVec; use std::sync::Arc; use super::types::{LuaBlock, LuaInline}; @@ -57,7 +58,12 @@ pub fn register_quarto_namespace(lua: &Lua) -> Result<()> { /// - Original: { t = "Original", file_id = N, start_offset = N, end_offset = N } /// - Substring: { t = "Substring", parent = {...}, start_offset = N, end_offset = N } /// - Concat: { t = "Concat", pieces = 
[{source_info = {...}, offset_in_concat = N, length = N}, ...] } -/// - FilterProvenance: { t = "FilterProvenance", filter_path = "...", line = N } +/// - Generated: { t = "Generated", by = { kind = "...", data = "..." (JSON-encoded) }, +/// from = [{role = "Invocation" | "ValueSource" | "Other:", +/// source_info = {...}}, ...] } +/// +/// The reader also accepts the legacy `"FilterProvenance"` tag for back-compat, +/// mapping it onto `Generated { by: filter, from: [] }`. fn source_info_to_lua_table(lua: &Lua, si: &SourceInfo) -> Result { let table = lua.create_table()?; match si { @@ -96,15 +102,50 @@ fn source_info_to_lua_table(lua: &Lua, si: &SourceInfo) -> Result
{ } table.set("pieces", pieces_table)?; } - SourceInfo::FilterProvenance { filter_path, line } => { - table.set("t", "FilterProvenance")?; - table.set("filter_path", filter_path.clone())?; - table.set("line", *line)?; + SourceInfo::Generated { by, from } => { + table.set("t", "Generated")?; + table.set("by", by_to_lua_table(lua, by)?)?; + let from_table = lua.create_table()?; + for (i, anchor) in from.iter().enumerate() { + let anchor_table = lua.create_table()?; + anchor_table.set("role", anchor_role_to_lua_string(&anchor.role))?; + anchor_table.set( + "source_info", + source_info_to_lua_table(lua, &anchor.source_info)?, + )?; + from_table.set(i + 1, anchor_table)?; + } + table.set("from", from_table)?; } } Ok(table) } +/// Serialize a [`By`] to a Lua table: `{ kind = "...", data = "" }`. +/// +/// `data` is JSON-encoded as a string because Lua tables don't carry the +/// `serde_json::Value` discriminator; readers decode it back via +/// [`serde_json::from_str`]. +fn by_to_lua_table(lua: &Lua, by: &By) -> Result
{ + let table = lua.create_table()?; + table.set("kind", by.kind.clone())?; + if !by.data.is_null() { + let encoded = serde_json::to_string(&by.data) + .map_err(|e| Error::runtime(format!("By.data serialize failed: {e}")))?; + table.set("data", encoded)?; + } + Ok(table) +} + +/// Serialize an [`AnchorRole`] to a Lua string. +fn anchor_role_to_lua_string(role: &AnchorRole) -> String { + match role { + AnchorRole::Invocation => "Invocation".to_string(), + AnchorRole::ValueSource => "ValueSource".to_string(), + AnchorRole::Other(name) => format!("Other:{name}"), + } +} + /// Deserialize a SourceInfo from a Lua table fn source_info_from_lua_table(table: &Table) -> Result { let t: String = table.get("t")?; @@ -136,14 +177,61 @@ fn source_info_from_lua_table(table: &Table) -> Result { } Ok(SourceInfo::Concat { pieces }) } - "FilterProvenance" => Ok(SourceInfo::FilterProvenance { - filter_path: table.get("filter_path")?, - line: table.get("line")?, + "Generated" => { + let by_table: Table = table.get("by")?; + let by = by_from_lua_table(&by_table)?; + let mut from: SmallVec<[Anchor; 2]> = SmallVec::new(); + // The `from` field is optional in serialization; absent means empty. + if let Ok(from_table) = table.get::
("from") { + for i in 1..=from_table.raw_len() { + let anchor_table: Table = from_table.get(i)?; + let role_str: String = anchor_table.get("role")?; + let role = anchor_role_from_lua_string(&role_str); + let si_table: Table = anchor_table.get("source_info")?; + from.push(Anchor { + role, + source_info: Arc::new(source_info_from_lua_table(&si_table)?), + }); + } + } + Ok(SourceInfo::Generated { by, from }) + } + // Legacy back-compat: read the old "FilterProvenance" tag as + // `Generated { by: filter(...), from: [] }`. Writers never emit + // this tag after Plan 4 Phase 4. + "FilterProvenance" => Ok(SourceInfo::Generated { + by: By::filter( + table.get::("filter_path")?, + table.get::("line")?, + ), + from: SmallVec::new(), }), _ => Err(Error::runtime(format!("Unknown SourceInfo type: {}", t))), } } +/// Deserialize a [`By`] from `{ kind = "...", data = "" }`. +fn by_from_lua_table(table: &Table) -> Result { + let kind: String = table.get("kind")?; + let data = match table.get::("data") { + Ok(encoded) => serde_json::from_str(&encoded) + .map_err(|e| Error::runtime(format!("By.data parse failed: {e}")))?, + Err(_) => serde_json::Value::Null, + }; + Ok(By { kind, data }) +} + +/// Inverse of [`anchor_role_to_lua_string`]. 
+fn anchor_role_from_lua_string(s: &str) -> AnchorRole { + if let Some(rest) = s.strip_prefix("Other:") { + AnchorRole::Other(rest.to_string()) + } else if s == "ValueSource" { + AnchorRole::ValueSource + } else { + AnchorRole::Invocation + } +} + // ============================================================================ // Helper Functions for Extracting SourceInfo from Elements // ============================================================================ @@ -170,7 +258,10 @@ fn extract_source_info_from_element(lua: &Lua, elem: &Value) -> Result SourceInfo { let (source, line) = get_caller_location(lua); let source_path = source.strip_prefix('@').unwrap_or(&source); - SourceInfo::filter_provenance(source_path, line.max(0) as usize) + SourceInfo::Generated { + by: By::filter(source_path, line.max(0) as usize), + from: SmallVec::new(), + } } /// Add a diagnostic to the quarto._diagnostics table @@ -441,16 +532,18 @@ mod tests { // Verify source location was captured assert!(diagnostics[0].location.is_some()); - if let Some(SourceInfo::FilterProvenance { filter_path, line }) = &diagnostics[0].location { + if let Some(SourceInfo::Generated { by, .. }) = &diagnostics[0].location + && let Some((filter_path, line)) = by.as_filter() + { // The path should contain the filter name (@ prefix is stripped) assert!( filter_path.contains("test_filter.lua"), "Expected path to contain 'test_filter.lua', got '{}'", filter_path ); - assert_eq!(*line, 1); + assert_eq!(line, 1); } else { - panic!("Expected FilterProvenance source info"); + panic!("Expected filter-kind Generated source info"); } } @@ -506,9 +599,10 @@ mod tests { assert_eq!(*start_offset, 100, "start_offset should be preserved"); assert_eq!(*end_offset, 110, "end_offset should be preserved"); } - Some(SourceInfo::FilterProvenance { filter_path, line }) => { + Some(SourceInfo::Generated { by, .. 
}) if by.is_kind("filter") => { + let (filter_path, line) = by.as_filter().unwrap(); panic!( - "Expected SourceInfo::Original, but got FilterProvenance({}, {}). \ + "Expected SourceInfo::Original, but got filter-Generated({}, {}). \ This is the bug we're fixing!", filter_path, line ); @@ -749,11 +843,45 @@ mod tests { let roundtrip = source_info_from_lua_table(&table).unwrap(); assert_eq!(concat, roundtrip); - // Test FilterProvenance - let filter_prov = SourceInfo::filter_provenance("/path/to/filter.lua", 42); + // Test filter-kind Generated round-trip + let filter_prov = SourceInfo::generated(By::filter("/path/to/filter.lua", 42)); let table = source_info_to_lua_table(&lua, &filter_prov).unwrap(); let roundtrip = source_info_from_lua_table(&table).unwrap(); assert_eq!(filter_prov, roundtrip); + + // Test shortcode Generated with an Invocation anchor + let mut shortcode = SourceInfo::generated(By::shortcode("meta")); + shortcode.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::Original { + file_id: FileId(3), + start_offset: 1, + end_offset: 9, + }), + ); + let table = source_info_to_lua_table(&lua, &shortcode).unwrap(); + let roundtrip = source_info_from_lua_table(&table).unwrap(); + assert_eq!(shortcode, roundtrip); + } + + #[test] + fn test_legacy_filter_provenance_tag_reads_as_filter_generated() { + // Plan 4 Phase 4: writers never emit "FilterProvenance" anymore, but + // the reader still accepts the legacy tag and maps it to a + // filter-kind Generated with empty anchor list. 
+ let lua = Lua::new(); + let table = lua.create_table().unwrap(); + table.set("t", "FilterProvenance").unwrap(); + table.set("filter_path", "legacy.lua").unwrap(); + table.set("line", 7usize).unwrap(); + let parsed = source_info_from_lua_table(&table).unwrap(); + match parsed { + SourceInfo::Generated { by, from } => { + assert_eq!(by.as_filter(), Some(("legacy.lua", 7))); + assert!(from.is_empty()); + } + other => panic!("Expected filter-kind Generated, got {:?}", other), + } } // ========================================================================= @@ -797,11 +925,11 @@ mod tests { // Should still work, falling back to stack location let diagnostics = extract_lua_diagnostics(&lua).unwrap(); assert_eq!(diagnostics.len(), 1); - // Should have FilterProvenance since the element wasn't recognized + // Should have filter-Generated since the element wasn't recognized match &diagnostics[0].location { - Some(SourceInfo::FilterProvenance { .. }) => {} + Some(SourceInfo::Generated { by, .. 
}) if by.is_kind("filter") => {} other => panic!( - "Expected FilterProvenance for non-userdata element, got {:?}", + "Expected filter-Generated for non-userdata element, got {:?}", other ), } diff --git a/crates/pampa/src/lua/filter_tests.rs b/crates/pampa/src/lua/filter_tests.rs index 30c4d2b30..3e0518b96 100644 --- a/crates/pampa/src/lua/filter_tests.rs +++ b/crates/pampa/src/lua/filter_tests.rs @@ -737,7 +737,7 @@ end } #[tokio::test] -async fn test_filter_provenance_tracking() { +async fn test_filter_generated_tracking() { // Test that elements created by filters capture their source location let dir = TempDir::new().unwrap(); let filter_path = dir.path().join("provenance_test.lua"); @@ -777,17 +777,23 @@ end .unwrap() .pandoc; - // The filtered Str should have FilterProvenance source info + // The filtered Str should have filter-kind Generated source info match &filtered.blocks[0] { Block::Paragraph(p) => match &p.content[0] { Inline::Str(s) => { assert_eq!(s.text, "created-by-filter"); - // Check that the source_info is FilterProvenance + // Check that the source_info is Generated { by: filter, .. 
} match &s.source_info { - quarto_source_map::SourceInfo::FilterProvenance { - filter_path: path, - line, - } => { + quarto_source_map::SourceInfo::Generated { by, from } + if by.is_kind("filter") => + { + assert!( + from.is_empty(), + "Filter-constructed Generated nodes carry no anchors yet" + ); + let (path, line) = by + .as_filter() + .expect("filter-kind Generated should expose path/line"); // The filter_path should contain our filter file name assert!( path.contains("provenance_test.lua"), @@ -796,13 +802,16 @@ end ); // The line should be around line 5 where pandoc.Str is called assert!( - *line >= 4 && *line <= 7, + (4..=7).contains(&line), "Expected line to be between 4-7, got: {}", line ); } other => { - panic!("Expected FilterProvenance source info, got: {:?}", other) + panic!( + "Expected filter-kind Generated source info, got: {:?}", + other + ) } } } @@ -1799,13 +1808,13 @@ end ); // Check source location - if let Some(quarto_source_map::SourceInfo::FilterProvenance { filter_path, line }) = - &diagnostics[0].location + if let Some(quarto_source_map::SourceInfo::Generated { by, .. 
}) = &diagnostics[0].location + && let Some((filter_path, line)) = by.as_filter() { assert!(filter_path.contains("warn_test.lua")); - assert!(*line > 0, "Line should be positive"); + assert!(line > 0, "Line should be positive"); } else { - panic!("Expected FilterProvenance source info"); + panic!("Expected filter-kind Generated source info"); } } @@ -6826,7 +6835,9 @@ end attr: (String::new(), vec![], hashlink::LinkedHashMap::new()), attr_source: crate::pandoc::AttrSourceInfo::empty(), content: vec![], - source_info: quarto_source_map::SourceInfo::filter_provenance("test.lua", 1), + source_info: quarto_source_map::SourceInfo::generated( + quarto_source_map::By::filter("test.lua", 1), + ), })], source_info: quarto_source_map::SourceInfo::default(), })], diff --git a/crates/pampa/src/lua/types.rs b/crates/pampa/src/lua/types.rs index f2a67e70f..4c5ed92fc 100644 --- a/crates/pampa/src/lua/types.rs +++ b/crates/pampa/src/lua/types.rs @@ -15,7 +15,8 @@ use mlua::{ Error, IntoLua, Lua, MetaMethod, Result, Table, UserData, UserDataFields, UserDataMethods, UserDataRef, Value, Variadic, }; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::SmallVec; use crate::pandoc::{Block, Inline}; @@ -723,8 +724,8 @@ impl UserData for LuaInline { /// `:byte_range()` and `:file_id()` accessors that chain-resolve the /// underlying `SourceInfo` to a `(file_id, start, end)` tuple in the /// root source file. Both return `nil` when the chain resolves to -/// `SourceInfo::Concat` or `SourceInfo::FilterProvenance` — the same -/// rule applied by `AttributionRenderTransform`. +/// `SourceInfo::Concat` or a `Generated` node without an `Invocation` +/// anchor — the same rule applied by `AttributionRenderTransform`. 
/// /// This is the building block of the `quarto.attribution.lookup(el)` /// convenience: it reads `el.source_info:byte_range()` then calls @@ -1825,7 +1826,10 @@ pub fn filter_source_info(lua: &Lua) -> SourceInfo { // The source often starts with "@" for file paths let path: &str = src.strip_prefix("@").unwrap_or(&src); let line_num = line.unwrap_or(0); - return Some(SourceInfo::filter_provenance(path.to_string(), line_num)); + return Some(SourceInfo::Generated { + by: By::filter(path.to_string(), line_num), + from: SmallVec::new(), + }); } None }) { diff --git a/crates/pampa/src/pandoc/location.rs b/crates/pampa/src/pandoc/location.rs index f97a7d1ef..9c5846c47 100644 --- a/crates/pampa/src/pandoc/location.rs +++ b/crates/pampa/src/pandoc/location.rs @@ -325,24 +325,6 @@ pub fn empty_source_info() -> quarto_source_map::SourceInfo { ) } -/// Extract filename index from quarto_source_map::SourceInfo by walking to Original mapping -pub fn extract_filename_index(info: &quarto_source_map::SourceInfo) -> Option { - match info { - quarto_source_map::SourceInfo::Original { file_id, .. } => Some(file_id.0), - quarto_source_map::SourceInfo::Substring { parent, .. } => extract_filename_index(parent), - quarto_source_map::SourceInfo::Concat { pieces } => { - // Return first non-None filename_index from pieces - pieces - .iter() - .find_map(|p| extract_filename_index(&p.source_info)) - } - quarto_source_map::SourceInfo::FilterProvenance { .. 
} => { - // Filter provenance doesn't have a filename index - None - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -568,93 +550,6 @@ mod tests { assert_eq!(si.length(), 0); } - #[test] - fn test_extract_filename_index_original() { - let si = quarto_source_map::SourceInfo::from_range( - quarto_source_map::FileId(42), - quarto_source_map::Range { - start: quarto_source_map::Location { - offset: 0, - row: 0, - column: 0, - }, - end: quarto_source_map::Location { - offset: 10, - row: 0, - column: 10, - }, - }, - ); - assert_eq!(extract_filename_index(&si), Some(42)); - } - - #[test] - fn test_extract_filename_index_substring() { - let parent = quarto_source_map::SourceInfo::from_range( - quarto_source_map::FileId(99), - quarto_source_map::Range { - start: quarto_source_map::Location { - offset: 0, - row: 0, - column: 0, - }, - end: quarto_source_map::Location { - offset: 100, - row: 5, - column: 0, - }, - }, - ); - let substring = quarto_source_map::SourceInfo::substring(parent, 10, 50); - assert_eq!(extract_filename_index(&substring), Some(99)); - } - - #[test] - fn test_extract_filename_index_concat() { - let piece1 = quarto_source_map::SourceInfo::from_range( - quarto_source_map::FileId(7), - quarto_source_map::Range { - start: quarto_source_map::Location { - offset: 0, - row: 0, - column: 0, - }, - end: quarto_source_map::Location { - offset: 10, - row: 0, - column: 10, - }, - }, - ); - let piece2 = quarto_source_map::SourceInfo::from_range( - quarto_source_map::FileId(8), - quarto_source_map::Range { - start: quarto_source_map::Location { - offset: 0, - row: 0, - column: 0, - }, - end: quarto_source_map::Location { - offset: 20, - row: 1, - column: 0, - }, - }, - ); - // concat takes Vec<(SourceInfo, usize)> - pairs of source info and length - let concat = quarto_source_map::SourceInfo::concat(vec![(piece1, 10), (piece2, 20)]); - // Should return the first piece's file_id - assert_eq!(extract_filename_index(&concat), Some(7)); - } - - #[test] - fn 
test_extract_filename_index_filter_provenance() { - // filter_provenance takes filter_path and line number - let filter_prov = quarto_source_map::SourceInfo::filter_provenance("test-filter.lua", 42); - // FilterProvenance doesn't have a filename index - assert_eq!(extract_filename_index(&filter_prov), None); - } - #[test] fn test_source_info_combine_takes_self_start_when_smaller() { // Test case where self.range.start < other.range.start (covers line 53) diff --git a/crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs b/crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs index 391a5d602..c422e6842 100644 --- a/crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs +++ b/crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs @@ -252,31 +252,11 @@ pub fn process_pipe_table( let table_start = node_source_info_with_context(node, context); let start_offset = table_start.start_offset(); let end_offset = cap_info.end_offset(); - // Extract file_id from the table's source info - let file_id = match &table_start { - quarto_source_map::SourceInfo::Original { file_id, .. } => *file_id, - quarto_source_map::SourceInfo::Substring { parent, .. } => { - // Recursively extract from parent (should always reach Original eventually) - match **parent { - quarto_source_map::SourceInfo::Original { file_id, .. } => file_id, - _ => quarto_source_map::FileId(0), // Fallback - } - } - quarto_source_map::SourceInfo::Concat { pieces } => { - // Use first piece's file_id - if let Some(piece) = pieces.first() { - match &piece.source_info { - quarto_source_map::SourceInfo::Original { file_id, .. } => *file_id, - _ => quarto_source_map::FileId(0), // Fallback - } - } else { - quarto_source_map::FileId(0) // Fallback - } - } - quarto_source_map::SourceInfo::FilterProvenance { .. 
} => { - quarto_source_map::FileId(0) // Fallback - filter-created tables shouldn't reach this - } - }; + // Extract file_id from the table's source info; root_file_id walks + // every nesting level, so this works for arbitrarily deep Substrings. + let file_id = table_start + .root_file_id() + .unwrap_or(quarto_source_map::FileId(0)); // Create a new SourceInfo spanning from table start to caption end quarto_source_map::SourceInfo::original(file_id, start_offset, end_offset) } else { diff --git a/crates/pampa/src/pandoc/treesitter_utils/postprocess.rs b/crates/pampa/src/pandoc/treesitter_utils/postprocess.rs index bdbfebb54..019b3d9b3 100644 --- a/crates/pampa/src/pandoc/treesitter_utils/postprocess.rs +++ b/crates/pampa/src/pandoc/treesitter_utils/postprocess.rs @@ -49,7 +49,8 @@ use quarto_pandoc_types::AttrSourceInfo; use quarto_pandoc_types::table::{ Alignment, Cell, ColSpec, ColWidth, Row, Table, TableBody, TableFoot, TableHead, }; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; use std::cell::RefCell; use std::collections::HashMap; @@ -1349,8 +1350,12 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut DiagnosticCollector) -> Re // bracket attached to the first word and closing bracket to the last word // e.g., "@knuth [p. 33]" becomes: Str("@knuth"), Space, Str("[p."), Space, Str("33]") cite.content.push(Inline::Space(Space { - // Synthetic Space: inserted to separate citation from suffix - source_info: quarto_source_map::SourceInfo::default(), + // Synthetic Space: inserted to separate citation from suffix. + // Plan 6 §"tree-sitter postprocess" — Generated, no preimage. 
+ source_info: SourceInfo::Generated { + by: By::tree_sitter_postprocess(), + from: smallvec![], + }, })); // The span content may have been merged into a single string, so we need to diff --git a/crates/pampa/src/pandoc/treesitter_utils/section.rs b/crates/pampa/src/pandoc/treesitter_utils/section.rs index 21f6b4e2b..fd180862c 100644 --- a/crates/pampa/src/pandoc/treesitter_utils/section.rs +++ b/crates/pampa/src/pandoc/treesitter_utils/section.rs @@ -125,31 +125,13 @@ pub fn process_section( // Extend table's source_info to include the caption let table_start_offset = table.source_info.start_offset(); let caption_end_offset = caption_source_info.end_offset(); - // Extract file_id from table's source info - let file_id = match &table.source_info { - quarto_source_map::SourceInfo::Original { file_id, .. } => *file_id, - quarto_source_map::SourceInfo::Substring { parent, .. } => { - match **parent { - quarto_source_map::SourceInfo::Original { file_id, .. } => file_id, - _ => quarto_source_map::FileId(0), // Fallback - } - } - quarto_source_map::SourceInfo::Concat { pieces } => { - if let Some(piece) = pieces.first() { - match &piece.source_info { - quarto_source_map::SourceInfo::Original { file_id, .. } => { - *file_id - } - _ => quarto_source_map::FileId(0), // Fallback - } - } else { - quarto_source_map::FileId(0) // Fallback - } - } - quarto_source_map::SourceInfo::FilterProvenance { .. } => { - quarto_source_map::FileId(0) // Fallback - filter-created tables - } - }; + // Extract file_id from table's source info; root_file_id + // walks every nesting level, so nested Substrings resolve + // correctly (the previous shallow match returned FileId(0)). 
+ let file_id = table + .source_info + .root_file_id() + .unwrap_or(quarto_source_map::FileId(0)); table.source_info = quarto_source_map::SourceInfo::original( file_id, table_start_offset, diff --git a/crates/pampa/src/readers/json.rs b/crates/pampa/src/readers/json.rs index 587ab09f3..32d1b65f0 100644 --- a/crates/pampa/src/readers/json.rs +++ b/crates/pampa/src/readers/json.rs @@ -16,8 +16,9 @@ use crate::pandoc::{ }; use hashlink::LinkedHashMap; use quarto_pandoc_types::{ConfigMapEntry, ConfigValue, ConfigValueKind}; -use quarto_source_map::FileId; +use quarto_source_map::{Anchor, AnchorRole, By, FileId}; use serde_json::Value; +use smallvec::SmallVec; use std::sync::Arc; #[derive(Debug)] @@ -250,36 +251,132 @@ impl SourceInfoDeserializer { quarto_source_map::SourceInfo::Concat { pieces: pieces? } } 3 => { - // Transformed variant no longer exists in SourceInfo - // Convert to approximate Substring pointing to parent - // This loses the transformation mapping but preserves the parent relationship + // Legacy reader for code 3 — accepts both old Transformed + // numeric-array and buggy FilterProvenance string-array; new + // writers (post-Plan-5) never emit code 3. Two shapes are + // possible and dispatch is by `data[0]`'s JSON type, which + // is unambiguous: + // - Numeric-headed `[parent_id, ...]` → legacy Transformed, + // approximated as Substring pointing to that parent + // (preserves today's back-compat). + // - String-headed `[filter_path, line]` → latent + // FilterProvenance, recovered as + // `Generated { by: filter, from: [] }` (closes bd-3odjm). + // Strict on every other shape — same convention as the + // Substring / Concat arms above (no `unwrap_or(0)`). let data_array = data .as_array() .ok_or(JsonReadError::MalformedSourceInfoPool)?; if data_array.is_empty() { return Err(JsonReadError::MalformedSourceInfoPool); } - let parent_id = data_array[0] - .as_u64() - .ok_or(JsonReadError::MalformedSourceInfoPool)? 
- as usize; - // Check for circular/forward references - if parent_id >= current_index { - return Err(JsonReadError::CircularSourceInfoReference(parent_id)); - } + if let Some(parent_id) = data_array[0].as_u64() { + // Legacy Transformed path. + let parent_id = parent_id as usize; - let parent = pool - .get(parent_id) - .ok_or(JsonReadError::InvalidSourceInfoRef(parent_id))? - .clone(); + // Check for circular/forward references + if parent_id >= current_index { + return Err(JsonReadError::CircularSourceInfoReference(parent_id)); + } - // Approximate with Substring - quarto_source_map::SourceInfo::Substring { - parent: Arc::new(parent), - start_offset, - end_offset, + let parent = pool + .get(parent_id) + .ok_or(JsonReadError::InvalidSourceInfoRef(parent_id))? + .clone(); + + // Approximate with Substring + quarto_source_map::SourceInfo::Substring { + parent: Arc::new(parent), + start_offset, + end_offset, + } + } else if let Some(filter_path) = data_array[0].as_str() { + // Latent FilterProvenance shape: must be exactly + // [path, line]; no `unwrap_or(0)` on the line. + if data_array.len() != 2 { + return Err(JsonReadError::MalformedSourceInfoPool); + } + let line = data_array[1] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + quarto_source_map::SourceInfo::Generated { + by: By::filter(filter_path.to_string(), line), + from: SmallVec::new(), + } + } else { + return Err(JsonReadError::MalformedSourceInfoPool); + } + } + 4 => { + // Generated { by, from }. The outer `r` field is parsed + // by the caller and *ignored here* — Generated entries + // don't carry their own offsets; ranges come from + // chain-walking the Invocation anchor via + // `resolve_byte_range`. The writer hard-codes + // `r: [0, 0]` for code-4 entries, but `r != [0, 0]` from + // an older/future writer is silently accepted (precedent: + // today's Concat arm also parses `r` but doesn't use it). 
+ // + // Strict on every other shape: missing `by`, missing + // `by.kind`, `from` not an array, `from` entry not an + // object, `from` entry missing `role`/`si_id`, + // unrecognized role string, `Other("")` role with empty + // suffix → `MalformedSourceInfoPool`. Same convention as + // the Substring / Concat arms above. + let obj = data + .as_object() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + let by_obj = obj + .get("by") + .and_then(|v| v.as_object()) + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + let kind = by_obj + .get("kind") + .and_then(|v| v.as_str()) + .ok_or(JsonReadError::MalformedSourceInfoPool)? + .to_string(); + let by_data = by_obj.get("data").cloned().unwrap_or(Value::Null); + let by = By { + kind, + data: by_data, + }; + + let mut from: SmallVec<[Anchor; 2]> = SmallVec::new(); + if let Some(from_val) = obj.get("from") { + let from_arr = from_val + .as_array() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + for entry in from_arr { + let entry_obj = entry + .as_object() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + let role_str = entry_obj + .get("role") + .and_then(|v| v.as_str()) + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + let role = parse_anchor_role(role_str)?; + let si_id = entry_obj + .get("si_id") + .and_then(|v| v.as_u64()) + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + if si_id >= current_index { + return Err(JsonReadError::CircularSourceInfoReference(si_id)); + } + let target = pool + .get(si_id) + .cloned() + .ok_or(JsonReadError::InvalidSourceInfoRef(si_id))?; + from.push(Anchor { + role, + source_info: Arc::new(target), + }); + } } + + quarto_source_map::SourceInfo::Generated { by, from } } _ => { return Err(JsonReadError::MalformedSourceInfoPool); @@ -306,6 +403,33 @@ impl SourceInfoDeserializer { } } +/// Decode a wire-format anchor role string into its typed `AnchorRole`. 
+/// +/// Recognized strings: +/// - `"invocation"` → [`AnchorRole::Invocation`] +/// - `"value-source"` → [`AnchorRole::ValueSource`] +/// - `"other:"` → [`AnchorRole::Other()`], where `` must +/// be non-empty. +/// +/// Anything else — including the bare `"other:"` with an empty suffix — +/// is rejected as `MalformedSourceInfoPool`. +#[allow(dead_code)] // Used by the Phase 2 code-4 reader. +fn parse_anchor_role(s: &str) -> Result { + match s { + "invocation" => Ok(AnchorRole::Invocation), + "value-source" => Ok(AnchorRole::ValueSource), + _ => { + let name = s + .strip_prefix("other:") + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + if name.is_empty() { + return Err(JsonReadError::MalformedSourceInfoPool); + } + Ok(AnchorRole::Other(name.to_string())) + } + } +} + /// Convert from old JSON format (filename_index, range) to new SourceInfo fn make_source_info(filename_index: Option, range: Range) -> quarto_source_map::SourceInfo { let file_id = FileId(filename_index.unwrap_or(0)); @@ -2631,4 +2755,521 @@ mod tests { _ => panic!("Expected CircularSourceInfoReference error"), } } + + // ---------------------------------------------------------------- + // Plan 5 Phase 1 — Legacy code-3 dual-shape reader + // ---------------------------------------------------------------- + + /// Filter-provenance recovery: code-3 with `[filter_path, line]` payload + /// must decode to `Generated { by: By::filter(path, line), from: [] }`. + /// Closes bd-3odjm — this is the latent FilterProvenance shape that + /// today's reader misinterprets as a legacy Transformed parent_id. 
+ #[test] + fn test_deserialize_code3_filter_provenance_recovery() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": ["/path/to/filter.lua", 42] + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + assert_eq!(deserializer.pool.len(), 1); + match &deserializer.pool[0] { + SourceInfo::Generated { by, from } => { + assert!(from.is_empty()); + let (path, line) = by.as_filter().expect("expected filter By"); + assert_eq!(path, "/path/to/filter.lua"); + assert_eq!(line, 42); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Legacy Transformed back-compat: code-3 with `[parent_id, ...]` + /// numeric payload must continue to decode as a Substring approximation. + #[test] + fn test_deserialize_code3_legacy_transformed_back_compat() { + let pool_json = json!([ + { + "r": [0, 100], + "t": 0, + "d": 0 // Original + }, + { + "r": [10, 20], + "t": 3, + "d": [0] // Legacy Transformed -> Substring(parent_id=0) + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + assert_eq!(deserializer.pool.len(), 2); + match &deserializer.pool[1] { + SourceInfo::Substring { + parent, + start_offset, + end_offset, + } => { + assert_eq!(*start_offset, 10); + assert_eq!(*end_offset, 20); + assert!(matches!(&**parent, SourceInfo::Original { .. })); + } + other => panic!("Expected Substring, got {:?}", other), + } + } + + /// Strict rejection: code-3 with `[path]` (missing line) is malformed. + /// Guards the no-`unwrap_or(0)` rule. + #[test] + fn test_deserialize_code3_filter_missing_line_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": ["/path/to/filter.lua"] + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + /// Strict rejection: code-3 with `[path, "not-a-number"]` is malformed. + /// Guards the no-`unwrap_or(0)` rule. 
+ #[test] + fn test_deserialize_code3_filter_non_numeric_line_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": ["/path/to/filter.lua", "oops"] + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + /// Strict rejection: code-3 with `[path, line, extra]` (too many items) + /// is malformed — the filter shape must be exactly two elements. + #[test] + fn test_deserialize_code3_filter_too_many_elements_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": ["/path/to/filter.lua", 42, "extra"] + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + /// Strict rejection: code-3 with empty array is malformed. + #[test] + fn test_deserialize_code3_empty_array_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": [] + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + /// Strict rejection: code-3 with a non-array payload is malformed. + #[test] + fn test_deserialize_code3_non_array_payload_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": 7 + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + // ---------------------------------------------------------------- + // Plan 5 Phase 2 — Code-4 (Generated) reader + // ---------------------------------------------------------------- + + /// Forward-compat: code-4 with no `from` array decodes as + /// `Generated { by: , from: [] }`. Pure synthesis. 
+ #[test] + fn test_deserialize_code4_generated_no_anchors() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { "by": { "kind": "sectionize" } } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[0] { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "sectionize"); + assert!(by.data.is_null()); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Forward-compat: code-4 with `by.data` round-trips arbitrary JSON. + #[test] + fn test_deserialize_code4_generated_with_by_data() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { + "by": { + "kind": "filter", + "data": { "filter_path": "/x.lua", "line": 7 } + } + } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[0] { + SourceInfo::Generated { by, from } => { + let (path, line) = by.as_filter().expect("expected filter By"); + assert_eq!(path, "/x.lua"); + assert_eq!(line, 7); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Code-4 with a single Invocation anchor — every known role is + /// recoverable. + #[test] + fn test_deserialize_code4_with_invocation_anchor() { + let pool_json = json!([ + { + "r": [0, 5], + "t": 0, + "d": 0 // Original (target of the anchor) + }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode", "data": { "name": "meta" } }, + "from": [ + { "role": "invocation", "si_id": 0 } + ] + } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[1] { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "shortcode"); + assert_eq!(from.len(), 1); + assert!(matches!(from[0].role, AnchorRole::Invocation)); + assert!(matches!(&*from[0].source_info, SourceInfo::Original { .. 
})); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Code-4 with multiple anchors — invocation + value-source + an + /// extension-defined Other role. + #[test] + fn test_deserialize_code4_with_multiple_anchors() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, // Original 0 + { "r": [5, 10], "t": 0, "d": 0 }, // Original 1 + { "r": [10, 15], "t": 0, "d": 0 }, // Original 2 + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode", "data": { "name": "x" } }, + "from": [ + { "role": "invocation", "si_id": 0 }, + { "role": "value-source", "si_id": 1 }, + { "role": "other:ext/foo/bar", "si_id": 2 } + ] + } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[3] { + SourceInfo::Generated { from, .. } => { + assert_eq!(from.len(), 3); + assert!(matches!(from[0].role, AnchorRole::Invocation)); + assert!(matches!(from[1].role, AnchorRole::ValueSource)); + match &from[2].role { + AnchorRole::Other(name) => assert_eq!(name, "ext/foo/bar"), + other => panic!("Expected Other(ext/foo/bar), got {:?}", other), + } + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Silently accept code-4 entries with `r != [0, 0]` (precedent: the + /// Concat arm parses `r` but doesn't use it). Future writers will + /// emit `r: [0, 0]`; older/divergent writers might not. + #[test] + fn test_deserialize_code4_nonzero_r_accepted() { + let pool_json = json!([ + { + "r": [42, 99], + "t": 4, + "d": { "by": { "kind": "sectionize" } } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + assert!(matches!(deserializer.pool[0], SourceInfo::Generated { .. })); + } + + /// Forward-compat: unknown `by.kind` decodes opaquely — the wire + /// format does not constrain `kind` to known values. 
+ #[test] + fn test_deserialize_code4_unknown_kind_is_forward_compat() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { + "by": { + "kind": "ext/future/foo", + "data": { "anything": [1, 2, 3] } + } + } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[0] { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "ext/future/foo"); + assert_eq!(by.data["anything"], json!([1, 2, 3])); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + // --- Strict code-4 rejection tests ------------------------------ + + #[test] + fn test_deserialize_code4_missing_by_rejected() { + let pool_json = json!([ + { "r": [0, 0], "t": 4, "d": {} } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_missing_by_kind_rejected() { + let pool_json = json!([ + { "r": [0, 0], "t": 4, "d": { "by": {} } } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_by_not_object_rejected() { + let pool_json = json!([ + { "r": [0, 0], "t": 4, "d": { "by": "filter" } } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_from_not_array_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "sectionize" }, + "from": "not-an-array" + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_from_entry_not_object_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ "bad-entry" ] + } + } + ]); + 
assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_from_entry_missing_role_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "si_id": 0 } ] + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_from_entry_missing_si_id_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "role": "invocation" } ] + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_unknown_role_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "role": "bogus", "si_id": 0 } ] + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_empty_other_role_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "role": "other:", "si_id": 0 } ] + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_si_id_forward_reference_rejected() { + // si_id must be < current_index — Generated is at index 0 and + // points to index 5 (nonexistent and forward-referencing). 
+ let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "role": "invocation", "si_id": 5 } ] + } + } + ]); + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::CircularSourceInfoReference(5)) + )); + } + + #[test] + fn test_deserialize_code4_by_data_omitted_is_null() { + // The serializer skips `data` when it's null; the reader must + // accept the omitted shape and produce `data: Value::Null`. + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { "by": { "kind": "sectionize" } } + } + ]); + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[0] { + SourceInfo::Generated { by, .. } => { + assert!(by.data.is_null()); + } + _ => panic!("Expected Generated"), + } + } } diff --git a/crates/pampa/src/transforms/sectionize.rs b/crates/pampa/src/transforms/sectionize.rs index 492c7ab9b..7d4a2b97f 100644 --- a/crates/pampa/src/transforms/sectionize.rs +++ b/crates/pampa/src/transforms/sectionize.rs @@ -46,7 +46,8 @@ use crate::pandoc::block::{Block, Div, Header}; use hashlink::LinkedHashMap; use quarto_pandoc_types::attr::AttrSourceInfo; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; /// Wrap headers in section Divs. 
/// @@ -93,7 +94,10 @@ pub fn sectionize_blocks(blocks: Vec) -> Vec { let section_div = Block::Div(Div { attr: section_attr, content: section_content, - source_info: SourceInfo::default(), + source_info: SourceInfo::Generated { + by: By::sectionize(), + from: smallvec![], + }, attr_source: AttrSourceInfo::empty(), }); // Add closed section to parent, or output if no parent @@ -145,7 +149,10 @@ pub fn sectionize_blocks(blocks: Vec) -> Vec { let section_div = Block::Div(Div { attr: section_attr, content: section_content, - source_info: SourceInfo::default(), + source_info: SourceInfo::Generated { + by: By::sectionize(), + from: smallvec![], + }, attr_source: AttrSourceInfo::empty(), }); if let Some((_, _, parent_content)) = section_stack.last_mut() { @@ -523,6 +530,53 @@ mod tests { assert!(matches!(&result[1], Block::Paragraph(_))); } + #[test] + fn test_sectionize_section_div_has_generated_provenance() { + // Plan 6: every synthesized Section Div carries + // Generated { by: sectionize(), from: [] }. The wrapped Header retains + // its original source_info. + let blocks = vec![ + make_header(2, "sec-a", vec![], "A"), + make_para("body"), + make_header(2, "sec-b", vec![], "B"), + ]; + let result = sectionize_blocks(blocks); + assert_eq!(result.len(), 2); + + // First section's outer Div is Generated. + let Block::Div(div) = &result[0] else { + panic!("Expected section Div"); + }; + match &div.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "sectionize"); + assert!( + from.is_empty(), + "sectionize synthesizers carry no source-side anchors" + ); + } + other => panic!("Expected Generated source_info, got {:?}", other), + } + + // Second section (end-of-input path) — same shape. 
+ let Block::Div(div2) = &result[1] else { + panic!("Expected section Div"); + }; + match &div2.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "sectionize"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated source_info, got {:?}", other), + } + + // The wrapped Header inside the section retains its original + // (dummy) source_info — only the Div is synthesized. + let Block::Header(_) = &div.content[0] else { + panic!("Expected Header inside section"); + }; + } + #[test] fn test_sectionize_class_order() { // Classes should be: "section", "levelN", then user classes diff --git a/crates/pampa/src/utils/trim_source_location.rs b/crates/pampa/src/utils/trim_source_location.rs index 177a278c8..6238863ea 100644 --- a/crates/pampa/src/utils/trim_source_location.rs +++ b/crates/pampa/src/utils/trim_source_location.rs @@ -96,8 +96,8 @@ pub fn trim_whitespace( // For concat, just return as-is for now (edge case) source_info.clone() } - SourceInfo::FilterProvenance { .. } => { - // For filter provenance, just return as-is + SourceInfo::Generated { .. } => { + // No characteristic local-text range to trim against. source_info.clone() } }; @@ -128,8 +128,8 @@ pub fn trim_whitespace( // Proper handling would require splitting/adjusting pieces source_info.clone() } - SourceInfo::FilterProvenance { .. } => { - // For filter provenance, just return as-is + SourceInfo::Generated { .. } => { + // No characteristic local-text range to trim against. 
source_info.clone() } } diff --git a/crates/pampa/src/writers/incremental.rs b/crates/pampa/src/writers/incremental.rs index 69486e286..7fbd8dcb0 100644 --- a/crates/pampa/src/writers/incremental.rs +++ b/crates/pampa/src/writers/incremental.rs @@ -16,6 +16,8 @@ use quarto_ast_reconcile::types::{ }; use quarto_ast_reconcile::{structural_eq_blocks, structural_eq_inlines}; use quarto_pandoc_types::config_value::{ConfigMapEntry, ConfigValue, ConfigValueKind}; +use quarto_pandoc_types::is_atomic_custom_node; +use quarto_source_map::{FileId, SourceInfo}; use std::ops::Range; use super::qmd; @@ -33,20 +35,37 @@ pub struct TextEdit { pub replacement: String, } -/// An entry in the coarsened plan: either copy verbatim, rewrite, or inline-splice. +/// An entry in the coarsened plan. +/// +/// Plan 7 adds `Transparent` and `Omit` to the original three variants +/// (`Verbatim`, `Rewrite`, `InlineSplice`). #[derive(Debug)] enum CoarsenedEntry { /// Copy this byte range verbatim from original_qmd. /// The text includes the block content + trailing \n. Verbatim { byte_range: Range, - /// Index of this block in original_ast.blocks (for gap computation) - orig_idx: usize, + /// Index of this block in original_ast.blocks (for gap computation). + /// `None` for entries that came from a `Transparent` recursion — those + /// children aren't top-level blocks so they have no top-level index; + /// `compute_separator`'s original-gap optimization falls back to the + /// standard separator for them. + orig_idx: Option, }, /// Rewrite this block using the standard writer. + /// + /// `block_text` is pre-computed at coarsen time so the entry stays + /// self-contained regardless of nesting depth. (Earlier the variant + /// carried `new_idx: usize` and looked up the block at emit time, + /// but that indexed `new_ast.blocks` top-level — wrong for entries + /// produced inside a `Transparent` recursion, where the relevant + /// block lives in a child slice. 
See + /// `claude-notes/designs/incremental-writer-internals.md` for the + /// "every variant is self-contained" contract.) Rewrite { - /// Index into new_ast.blocks - new_idx: usize, + /// Pre-computed block text — same bytes `write_block_to_string` + /// would produce on the corresponding block. + block_text: String, }, /// Splice inlines within a block without rewriting the entire block. /// The block structure (prefix, suffix) is preserved from the original; @@ -54,9 +73,100 @@ enum CoarsenedEntry { InlineSplice { /// Pre-computed block text: original block with inline content replaced. block_text: String, - /// Index of this block in original_ast.blocks (for gap computation) - orig_idx: usize, + /// Index of this block in original_ast.blocks (for gap computation). + /// Same `Option` semantics as `Verbatim::orig_idx`. + orig_idx: Option, }, + /// Plan 7: a non-atomic `Generated` wrapper with empty anchors AND + /// source-bearing children. The wrapper contributes no bytes; its + /// children produce the output. Used for sectionize wrappers, + /// footnotes container, appendix container — synthesizers whose + /// container shell has no preimage but whose inner content does. + Transparent { child_entries: Vec }, + /// Plan 7: drop this node from output entirely. The next pipeline run + /// regenerates it from baseline content. Used for atomic-kind + /// `Generated` nodes with no Invocation anchor (filter constructions, + /// title-block synthesis, tree-sitter postprocess space) and for + /// no-preimage `Generated` containers replaced via React. + Omit, +} + +// ============================================================================= +// Editability gate (Plan 7) +// ============================================================================= + +/// Decide whether the *interior* of `block` is editable, with respect to the +/// active document `target_file_id`. 
+/// +/// "Editable inside" means: the user can type into this node's content and +/// have their edit round-trip back to source bytes. Three reasons content is +/// **not** editable inside: +/// +/// 1. The block is an atomic `CustomNode` (per +/// [`quarto_pandoc_types::is_atomic_custom_node`]). Atomic nodes are +/// replaceable wholesale via a React-side component menu but have no +/// editable text region. Today: `"CrossrefResolvedRef"`. +/// 2. The block carries `SourceInfo::Generated` with an atomic-kind `by` +/// (shortcode / filter / title-block / tree-sitter-postprocess). +/// Content is the resolved value of an invocation token; the user's +/// source-side knob is the token, not the resolved bytes. +/// 3. The block's source_info has no preimage in `target_file_id` +/// (synthesized-from-metadata containers, cross-file Original chains). +/// There are no bytes in the target file to map an inner edit back to. +/// +/// **Returns `true` for everything else.** Used by `coarsen`'s soft-drop +/// logic; the React-side hand-mirror lives at +/// `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` plus a +/// parallel `is_editable_inside` predicate to be added in a follow-up. +/// +/// See Plan 7 §"Unified editability predicate". +pub fn is_editable_inside_block(block: &Block, target_file_id: FileId) -> bool { + if let Block::Custom(cn) = block + && is_atomic_custom_node(&cn.type_name) + { + return false; + } + is_editable_inside_source_info(block.source_info(), target_file_id) +} + +/// Inline-side counterpart of [`is_editable_inside_block`]. +/// +/// Same three reasons content is not editable inside; for `Inline::Custom` +/// the atomic-CustomNode check applies (some atomic types live in the +/// inline arm — `CrossrefResolvedRef` is one). 
+pub fn is_editable_inside_inline(inline: &Inline, target_file_id: FileId) -> bool { + if let Inline::Custom(cn) = inline + && is_atomic_custom_node(&cn.type_name) + { + return false; + } + is_editable_inside_source_info(inline.source_info(), target_file_id) +} + +/// Shared editability rules driven by `SourceInfo` alone (the +/// atomic-CustomNode gate is applied by the block / inline callers above). +fn is_editable_inside_source_info(si: &SourceInfo, target_file_id: FileId) -> bool { + // Atomic-kind Generated (shortcode, filter, title-block, + // tree-sitter-postprocess): the content is pipeline-resolved; the + // user's source-side knob is the invocation token, not the bytes + // inside. + if let SourceInfo::Generated { by, .. } = si + && by.is_atomic_kind() + { + return false; + } + // Catch-all: editable iff the region has byte-traceable preimage in + // the target file. Covers: + // - Original in target → editable. ✓ + // - Substring chain resolving in target → editable. ✓ + // - Original/Substring rooted outside target → not editable. + // - Generated with empty anchors (sectionize, footnotes, + // appendix containers) → preimage_in returns None → not editable. + // - Generated with only ValueSource/Dispatch/Other anchors → not + // editable (preimage_in walks Invocation only). + // - Non-atomic Generated with Invocation anchor in target → + // editable. + si.preimage_in(target_file_id).is_some() } // ============================================================================= @@ -73,24 +183,36 @@ enum CoarsenedEntry { /// * `plan` - A reconciliation plan describing alignment between original_ast and new_ast /// /// # Returns -/// A new QMD string where: -/// - Unchanged blocks are preserved verbatim from `original_qmd` -/// - Changed blocks are rewritten using the standard writer -/// - The result round-trips: `read(result) ≡ new_ast` (structural equality) +/// +/// On success: `(new_qmd, warnings)`. 
The qmd preserves unchanged blocks +/// verbatim from `original_qmd`, rewrites changed blocks via the standard +/// writer, and soft-drops bad edits to non-editable regions (atomic +/// CustomNodes, atomic-kind Generated, no-preimage Generated containers). +/// Each soft-drop pushes a Q-3-42 / Q-3-43 warning into the returned vec; +/// the overall write still succeeds. +/// +/// On failure: `Err(fatal_errors)` — genuine structural failure (UTF-8 +/// error, inline-splice impossibility, etc.). Soft-drop substitutions +/// never reach this arm. pub fn incremental_write( original_qmd: &str, original_ast: &Pandoc, new_ast: &Pandoc, plan: &ReconciliationPlan, -) -> Result> { +) -> Result< + (String, Vec), + Vec, +> { // The QMD reader internally pads input with '\n' when it doesn't end with // one, producing source spans relative to the padded input. We must use the // same padded string so that block source spans are valid byte indices. let mut padded_storage = None; let (qmd, did_pad) = ensure_trailing_newline(original_qmd, &mut padded_storage); - // Step 1: Coarsen the reconciliation plan - let coarsened = coarsen(qmd, original_ast, new_ast, plan)?; + // Step 1: Coarsen the reconciliation plan. Soft-drop warnings collect + // into this sink; coarsen never returns Err for soft-drop cases. + let mut warnings: Vec = Vec::new(); + let coarsened = coarsen(qmd, original_ast, new_ast, plan, &mut warnings)?; // Step 2: Assemble the result string let mut result = assemble(qmd, original_ast, new_ast, &coarsened)?; @@ -101,24 +223,34 @@ pub fn incremental_write( result.pop(); } - Ok(result) + Ok((result, warnings)) } /// Compute minimal text edits to transform `original_qmd` into the incremental write result. /// /// Each TextEdit describes a byte range in `original_qmd` to replace and the replacement text. /// Edits are sorted by range.start and non-overlapping. 
+/// +/// Like [`incremental_write`], returns a tuple `(edits, warnings)` on +/// success; soft-drop warnings (Q-3-42 / Q-3-43) ride alongside. pub fn compute_incremental_edits( original_qmd: &str, original_ast: &Pandoc, new_ast: &Pandoc, plan: &ReconciliationPlan, -) -> Result, Vec> { +) -> Result< + ( + Vec, + Vec, + ), + Vec, +> { // Same trailing-newline normalization as incremental_write (see comment there). let mut padded_storage = None; let (qmd, did_pad) = ensure_trailing_newline(original_qmd, &mut padded_storage); - let coarsened = coarsen(qmd, original_ast, new_ast, plan)?; + let mut warnings: Vec = Vec::new(); + let coarsened = coarsen(qmd, original_ast, new_ast, plan, &mut warnings)?; let mut edits = compute_edits_from_coarsened(qmd, original_ast, new_ast, &coarsened)?; if did_pad { @@ -134,7 +266,7 @@ pub fn compute_incremental_edits( } } - Ok(edits) + Ok((edits, warnings)) } // ============================================================================= @@ -146,70 +278,267 @@ pub fn compute_incremental_edits( /// Phase 5 strategy: for RecurseIntoContainer blocks that are inline-content blocks /// (Paragraph, Plain, Header) with inline plans that pass the safety check, /// produce InlineSplice entries. All other RecurseIntoContainer become Rewrite. +/// +/// Plan 7: soft-drop warnings push into `warnings`. Bad-edit cases +/// (atomic-CustomNode interior edit, atomic-Generated edit, no-preimage +/// Generated edit) substitute a safe alignment AND record a Q-3-42 / +/// Q-3-43 warning; coarsen never returns `Err` for these cases. `Err` is +/// reserved for genuine structural failures (UTF-8 errors, inline-splice +/// impossibility from assemble_inline_splice). fn coarsen( original_qmd: &str, original_ast: &Pandoc, new_ast: &Pandoc, plan: &ReconciliationPlan, + warnings: &mut Vec, +) -> Result, Vec> { + // The "target file" for editability decisions is the file + // `original_qmd` was parsed from. 
Derived by descending past any + // synthesized first blocks (title-block, sectionize wrappers, + // footnotes / appendix containers) so we get the user's real + // qmd FileId rather than the FileId(0) fallback by accident. + // Closes Plan 7c Phase 8. + let target_file_id = derive_target_file_id(&original_ast.blocks); + + coarsen_blocks( + original_qmd, + &original_ast.blocks, + &new_ast.blocks, + plan, + target_file_id, + warnings, + ) +} + +/// Recurse into the children of a non-atomic Generated wrapper whose +/// own bytes are synthesized but whose children carry real source +/// preimage. Used by the RecurseIntoContainer arm of `coarsen_blocks` +/// when soft-dropping the wrapper would silently delete real user +/// content. The wrapper's index resolves the nested +/// `block_container_plans` entry; that plan describes alignment +/// between the wrapper's `orig` and `new` children. +/// +/// Per the `Verbatim`/`InlineSplice::orig_idx` contract (see the +/// `CoarsenedEntry` doc comments), child entries returned to a +/// `Transparent` wrapper must carry `orig_idx: None` — their indices +/// are children-relative, not top-level, so `compute_separator`'s +/// "consecutive in original" optimization can't use them. +fn coarsen_children( + original_qmd: &str, + orig_children: &[Block], + new_children: &[Block], + child_plan: &ReconciliationPlan, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, +) -> Result, Vec> { + let mut entries = coarsen_blocks( + original_qmd, + orig_children, + new_children, + child_plan, + target_file_id, + warnings, + )?; + for entry in &mut entries { + clear_orig_idx_for_transparent_child(entry); + } + Ok(entries) +} + +/// Walk a `CoarsenedEntry` tree and set `orig_idx` to `None` on every +/// `Verbatim` / `InlineSplice`. Used when promoting entries into a +/// `Transparent` wrapper, where the indices no longer refer to +/// top-level positions. 
+fn clear_orig_idx_for_transparent_child(entry: &mut CoarsenedEntry) { + match entry { + CoarsenedEntry::Verbatim { orig_idx, .. } => *orig_idx = None, + CoarsenedEntry::InlineSplice { orig_idx, .. } => *orig_idx = None, + CoarsenedEntry::Transparent { child_entries } => { + for child in child_entries { + clear_orig_idx_for_transparent_child(child); + } + } + CoarsenedEntry::Rewrite { .. } | CoarsenedEntry::Omit => {} + } +} + +/// Coarsen a block-alignment plan against the given original/new +/// block slices. Extracted from `coarsen` so the RecurseIntoContainer +/// path can recurse into a non-atomic Generated wrapper's children +/// using the nested `block_container_plans` plan. +fn coarsen_blocks( + original_qmd: &str, + original_blocks: &[Block], + new_blocks: &[Block], + plan: &ReconciliationPlan, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, ) -> Result, Vec> { let mut entries = Vec::with_capacity(plan.block_alignments.len()); for (result_idx, alignment) in plan.block_alignments.iter().enumerate() { let entry = match alignment { - BlockAlignment::KeepBefore(orig_idx) => { - let span = block_source_span(&original_ast.blocks[*orig_idx]); - CoarsenedEntry::Verbatim { - byte_range: span, - orig_idx: *orig_idx, + BlockAlignment::KeepBefore(orig_idx) => coarsen_keep_before_block( + &original_blocks[*orig_idx], + target_file_id, + Some(*orig_idx), + )?, + BlockAlignment::UseAfter(after_idx) => { + let new_block = &new_blocks[*after_idx]; + let new_si = new_block.source_info(); + + let is_atomic_cn = matches!(new_block, Block::Custom(cn) + if is_atomic_custom_node(&cn.type_name)); + let atomic_generated_preimage = match new_si { + SourceInfo::Generated { by, .. } if by.is_atomic_kind() => { + new_si.preimage_in(target_file_id) + } + _ => None, + }; + let no_preimage_generated = matches!(new_si, SourceInfo::Generated { .. 
}) + && new_si.preimage_in(target_file_id).is_none(); + + if let Some(range) = atomic_generated_preimage { + // User edited inside an atomic-kind Generated block + // (shortcode / filter / title-block / tree-sitter- + // postprocess). The reconciler split the edit into + // a deleted-original + new-block; the new block + // still carries the token's Invocation anchor, so + // its preimage IS the source-side knob. Emit the + // token bytes verbatim and soft-drop the edit; + // without this branch the let-user-win Rewrite + // below would write the resolved bytes (the edit + // applied to the generated content) back into qmd, + // poisoning the source. See + // `claude-notes/designs/incremental-writer-internals.md` + // for why this lives at the writer, not the gate. + warnings.push(diagnostic_q3_43_block(new_block)); + CoarsenedEntry::Verbatim { + byte_range: range, + // No original block paired with this entry — + // the UseAfter alignment implicitly deleted + // the original; compute_separator's + // consecutive-in-original optimization can't + // use a top-level orig_idx here. + orig_idx: None, + } + } else if !is_atomic_cn && no_preimage_generated { + // User replaced a synthesized-from-metadata container + // wholesale via React. No source position to anchor a + // Rewrite at; soft-drop with Q-3-43. + warnings.push(diagnostic_q3_43_block(new_block)); + CoarsenedEntry::Omit + } else { + // Let-user-win — including for atomic CustomNodes (the + // user replaced an include / CrossrefResolvedRef via a + // component menu; the qmd writer's CustomNode arm + // serializes the fresh plain_data). 
+ CoarsenedEntry::Rewrite { + block_text: write_block_to_string(new_block)?, + } } } - BlockAlignment::UseAfter(_after_idx) => CoarsenedEntry::Rewrite { - new_idx: result_idx, - }, BlockAlignment::RecurseIntoContainer { before_idx, after_idx, } => { - // Check if this block has an inline plan and is safe to splice + let orig_block = &original_blocks[*before_idx]; + + // Plan 7: if the original container is not editable inside, + // soft-drop the inner edit. Substitutions: + // - atomic CustomNode with preimage → Verbatim wrapper bytes + // - non-atomic Generated wrapper with source-bearing + // children → recurse Transparent into the children (the + // wrapper's bytes are synthesized but the children carry + // real preimage; mirrors the unchanged-wrapper Transparent + // path in `coarsen_keep_before_block` at line ~459) + // - everything else (no-preimage Generated container with + // no source-bearing children, etc.) → Omit + if !is_editable_inside_block(orig_block, target_file_id) { + // First: atomic CustomNode with preimage → keep the + // wrapper bytes verbatim (the user-side edit is lost, + // but the wrapper text survives). + if let Some(range) = orig_block.source_info().preimage_in(target_file_id) { + warnings.push(diagnostic_q3_43_block(orig_block)); + entries.push(CoarsenedEntry::Verbatim { + byte_range: range, + orig_idx: Some(*before_idx), + }); + continue; + } + + // Second: non-atomic Generated wrapper (sectionize, + // footnotes-container, appendix-container, ...). If it + // has source-bearing children AND the reconciler built a + // container plan for this index, recurse coarsen on the + // children. The user's edit is *inside* the wrapper — + // soft-dropping the wrapper would silently delete real + // user content. + if let SourceInfo::Generated { by, .. 
} = orig_block.source_info() + && !by.is_atomic_kind() + && let (Some(orig_children), Some(new_children)) = ( + block_block_children(orig_block), + block_block_children(&new_blocks[*after_idx]), + ) + && orig_children + .iter() + .any(|c| c.source_info().preimage_in(target_file_id).is_some()) + && let Some(child_plan) = plan.block_container_plans.get(&result_idx) + { + let child_entries = coarsen_children( + original_qmd, + orig_children, + new_children, + child_plan, + target_file_id, + warnings, + )?; + entries.push(CoarsenedEntry::Transparent { child_entries }); + continue; + } + + // Last resort: no preimage, no recursable children → + // soft-drop with Q-3-43. + warnings.push(diagnostic_q3_43_block(orig_block)); + entries.push(CoarsenedEntry::Omit); + continue; + } + + // Existing recurse logic: try inline-splice if the block has + // an inline plan and is safe to splice; else Rewrite. if let Some(inline_plan) = plan.inline_plans.get(&result_idx) { - let orig_block = &original_ast.blocks[*before_idx]; - let new_block = &new_ast.blocks[*after_idx]; + let new_block = &new_blocks[*after_idx]; if let (Some(orig_inlines), Some(new_inlines)) = (block_inlines(orig_block), block_inlines(new_block)) + && !orig_inlines.is_empty() + && is_inline_splice_safe(new_inlines, inline_plan) + && block_attrs_eq(orig_block, new_block) { - if !orig_inlines.is_empty() - && is_inline_splice_safe(new_inlines, inline_plan) - && block_attrs_eq(orig_block, new_block) - { - // Safe to splice — assemble the patched block text - let block_text = assemble_inline_splice( - original_qmd, - orig_block, - orig_inlines, - new_inlines, - inline_plan, - )?; - CoarsenedEntry::InlineSplice { - block_text, - orig_idx: *before_idx, - } - } else { - CoarsenedEntry::Rewrite { - new_idx: result_idx, - } + // Safe to splice — assemble the patched block text + let block_text = assemble_inline_splice( + original_qmd, + orig_block, + orig_inlines, + new_inlines, + inline_plan, + target_file_id, + warnings, + 
)?; + CoarsenedEntry::InlineSplice { + block_text, + orig_idx: Some(*before_idx), } } else { - // Not an inline-content block — fall back to Rewrite CoarsenedEntry::Rewrite { - new_idx: result_idx, + block_text: write_block_to_string(new_block)?, } } } else { - // No inline plan — this is a block container (Div, BlockQuote, etc.) - // Fall back to Rewrite + // No inline plan — this is a block container (Div, + // BlockQuote, etc.). Fall back to Rewrite. CoarsenedEntry::Rewrite { - new_idx: result_idx, + block_text: write_block_to_string(&new_blocks[*after_idx])?, } } } @@ -220,6 +549,282 @@ fn coarsen( Ok(entries) } +/// Classify a single `KeepBefore` block per Plan 7's cascade: +/// +/// 1. **Verbatim** if `preimage_in(target)` returns `Some(range)` — covers +/// `Original`/`Substring`/contiguous-`Concat`/`Generated`-via-Invocation. +/// The atomic-kind shortcode case lands here too (its Invocation anchor +/// resolves to the token bytes). +/// 2. **Omit** if the source_info is `Generated` with `is_atomic_kind()` +/// and no Invocation anchor — filter constructions, title-block +/// synthesis, tree-sitter-postprocess space. Belt-and-suspenders +/// `debug_assert!` against shortcode-with-empty-from (Plan 6 stamper +/// invariant: every shortcode resolution must carry an Invocation). +/// 3. **Transparent** if the source_info is a non-atomic `Generated` +/// wrapper with source-bearing children (sectionize wrapper, +/// footnotes-container, appendix-container). Recurses into the +/// children. +/// 4. **Rewrite** catch-all — re-serializes the unchanged block through +/// the qmd writer. Lossy at the byte level but preserves content. +/// Handles cross-file Original chains (no Plan-8 wrapper yet), +/// Substring rooted outside target, gappy Concat. 
+/// +/// `top_level_orig_idx` is `Some(idx)` for top-level blocks (used by +/// `compute_separator`'s original-gap optimization) and `None` for +/// children of a `Transparent` (whose indices don't reference +/// `original_ast.blocks` directly). +/// +/// KeepBefore implies the original block at this position and the new +/// block at the same position are structurally equivalent (that's what +/// the reconciler's KeepBefore alignment *means*). So when we fall +/// through to Rewrite, serializing the original `block` yields the +/// same bytes as serializing the new one — by referential transparency +/// of `write_block_to_string`. We pick the original to avoid threading +/// the new slice down here. +fn coarsen_keep_before_block( + block: &Block, + target_file_id: quarto_source_map::FileId, + top_level_orig_idx: Option, +) -> Result> { + let si = block.source_info(); + + if let Some(range) = si.preimage_in(target_file_id) { + return Ok(CoarsenedEntry::Verbatim { + byte_range: range, + orig_idx: top_level_orig_idx, + }); + } + + if let SourceInfo::Generated { by, .. } = si { + if by.is_atomic_kind() { + // Atomic-kind Generated with no Invocation anchor. + debug_assert!( + !by.is_kind("shortcode"), + "Generated {{ by: shortcode, from: [] }} reached the writer — \ + Plan 6's stamper must always attach an Invocation anchor for \ + shortcode resolutions. \ + Block: {:?}", + block, + ); + return Ok(CoarsenedEntry::Omit); + } + + // Non-atomic Generated wrapper. If it has source-bearing children, + // recurse Transparent. Else fall through to Rewrite. + if let Some(children) = block_block_children(block) + && children + .iter() + .any(|c| c.source_info().preimage_in(target_file_id).is_some()) + { + let child_entries = children + .iter() + .map(|child| { + // Children of a Transparent wrapper aren't top-level + // blocks — pass orig_idx=None so compute_separator + // doesn't try the original-gap optimization on them. 
+ coarsen_keep_before_block(child, target_file_id, None) + }) + .collect::, _>>()?; + return Ok(CoarsenedEntry::Transparent { child_entries }); + } + } + + // Catch-all: cross-file Original, Substring rooted outside target, + // gappy Concat, Generated wrapper without source-bearing children. + Ok(CoarsenedEntry::Rewrite { + block_text: write_block_to_string(block)?, + }) +} + +/// Return the inner block children of a block, if the block is a +/// recognized block container. +/// +/// Today's Plan-6 synthesizers produce `Div`-shaped wrappers (sectionize, +/// footnotes-container, appendix-container). Other block containers +/// (BlockQuote, Figure, NoteDefinitionFencedBlock) round out the set so +/// the Transparent cascade applies uniformly when those carry Generated +/// source_info. List-shaped containers (BulletList, OrderedList, +/// DefinitionList) return `None` — their `content` is `Vec` +/// (lists of lists), which isn't the Transparent shape. +fn block_block_children(block: &Block) -> Option<&[Block]> { + match block { + Block::Div(d) => Some(&d.content), + Block::BlockQuote(b) => Some(&b.content), + Block::Figure(f) => Some(&f.content), + Block::NoteDefinitionFencedBlock(n) => Some(&n.content), + _ => None, + } +} + +// ============================================================================= +// Transparent wrappers +// ============================================================================= +// +// A *transparent wrapper* is a block that's structurally part of the +// AST but has no source bytes of its own — the user's actual content +// lives in its children. Sectionize Divs, footnotes containers, and +// appendix containers (from `pampa::pandoc::sugar::SectionizeTransform` +// and friends) are the canonical examples. A Lua filter that wraps +// user content in a Div is another: the wrapper has `Generated` +// source_info, but the children preserve their original positions. +// +// Code that asks "where do the user's source bytes live?" 
must +// **descend through transparent wrappers** rather than reading +// `blocks[0]` directly. The `first_in_user_tree` walker below is the +// reference implementation; `derive_target_file_id` / +// `first_target_anchored_start_in` are thin specializations. +// +// See `claude-notes/designs/transparent-wrappers.md` for the full +// contract and the rationale (why the predicate is structural rather +// than opt-in: filter authors don't have to register anything — the +// AST shape they emit *is* the declaration). + +/// Walk `blocks` depth-first, applying `extract` to each block. On a +/// `Some` result, stop and return it. On `None`, descend through +/// `block_block_children` and try again. This is how we see through +/// transparent wrappers: a wrapper has no source position of its +/// own, so `extract` returns `None` for it; the walker then looks +/// inside. +/// +/// Used by `derive_target_file_id` and `first_target_anchored_start_in`, +/// and intended as the building block for any future code that needs +/// to find "the first user block matching X" without having to +/// re-derive the descent. +fn first_in_user_tree(blocks: &[Block], extract: &impl Fn(&Block) -> Option) -> Option { + for block in blocks { + if let Some(v) = extract(block) { + return Some(v); + } + if let Some(children) = block_block_children(block) + && let Some(v) = first_in_user_tree(children, extract) + { + return Some(v); + } + } + None +} + +/// Returns `true` iff `block` is a transparent wrapper with respect +/// to `target_file_id`: +/// +/// 1. its `SourceInfo` is `Generated` with no `Invocation` anchor (so +/// it has no source token of its own), AND +/// 2. it has block-children (`block_block_children` recognises it as +/// a container — Div, BlockQuote, Figure, NoteDefinitionFencedBlock), AND +/// 3. at least one descendant has real `preimage_in(target_file_id)` +/// (so there's actual user content under it). 
+/// +/// Condition (3) is what makes this *structural* rather than opt-in: +/// a Lua filter that wraps existing user content in a Div produces a +/// Generated wrapper whose children carry their original preimage — +/// transparent. A filter that constructs a fresh Div from metadata +/// has no source-bearing children — atomic. Authors don't have to +/// declare anything; the AST shape declares it for them. +/// +/// Available to callers that need to make an explicit decision +/// (e.g. a future Q-3-44 diagnostic that hints "your filter walked +/// into the sectionize wrapper"). Routine source-position lookups +/// should use `first_in_user_tree` directly. +#[allow(dead_code)] +fn is_transparent_wrapper(block: &Block, target_file_id: quarto_source_map::FileId) -> bool { + if !matches!(block.source_info(), SourceInfo::Generated { .. }) { + return false; + } + if block.source_info().invocation_anchor().is_some() { + return false; + } + let Some(children) = block_block_children(block) else { + return false; + }; + first_in_user_tree(children, &|b| b.source_info().preimage_in(target_file_id)).is_some() +} + +/// Derive the "target file" — the file that `original_qmd` was parsed +/// from, used for editability and preimage checks throughout the +/// writer. +/// +/// Descends through transparent wrappers via `first_in_user_tree`, +/// so a synthesized first block (title-block, sectionize wrapper, +/// footnotes / appendix container) is skipped and the user's real +/// qmd `FileId` is returned. Falls back to `FileId(0)` only for the +/// genuinely-empty document. +/// +/// Closes Plan 7c Phase 8. +fn derive_target_file_id(blocks: &[Block]) -> quarto_source_map::FileId { + first_in_user_tree(blocks, &|b| b.source_info().root_file_id()) + .unwrap_or(quarto_source_map::FileId(0)) +} + +/// Find the start offset (in `target` bytes) of the first block in +/// `blocks` whose `source_info().preimage_in(target)` is `Some`, +/// descending through transparent wrappers. 
Used by +/// `emit_metadata_prefix` to locate the boundary between the YAML +/// frontmatter region and the first source-anchored user block. +fn first_target_anchored_start_in( + blocks: &[Block], + target: quarto_source_map::FileId, +) -> Option { + first_in_user_tree(blocks, &|b| { + b.source_info().preimage_in(target).map(|r| r.start) + }) +} + +// ============================================================================= +// Soft-drop diagnostic builders (Plan 7) +// ============================================================================= + +/// Build a `Q-3-42` warning for an inline-level edit that targeted +/// atomic-Generated content (typically a shortcode resolution). The +/// source location is the inline's `Invocation` anchor when available +/// (the token bytes), falling back to the inline's own source_info. +fn diagnostic_q3_42_inline(inline: &Inline) -> quarto_error_reporting::DiagnosticMessage { + let location = inline + .source_info() + .invocation_anchor() + .map(|arc| arc.as_ref().clone()) + .unwrap_or_else(|| inline.source_info().clone()); + + quarto_error_reporting::DiagnosticMessageBuilder::warning("Shortcode edit dropped") + .with_code("Q-3-42") + .with_location(location) + .problem( + "An edit to shortcode-resolved (or other atomic-Generated) \ + content was reverted.", + ) + .add_hint( + "The resolved text is read-only; edit the invocation token \ + (e.g. `{{< meta foo >}}`) in source instead.", + ) + .build() +} + +/// Build a `Q-3-43` warning for a block-level edit dropped because the +/// container is not editable inside. +/// +/// Three emission paths share this builder (per Plan 7 +/// §"Diagnostic codes"): +/// - Block RecurseIntoContainer on an atomic CustomNode — wrapper's +/// source_info is `Original` pointing at the token bytes; +/// `with_location` highlights the include / crossref in Monaco. 
+/// - Block RecurseIntoContainer on a no-preimage Generated container — +/// the wrapper's source_info is `Generated` with no Invocation; the +/// diagnostic lands without a Monaco squiggle and surfaces via the +/// diagnostics banner. +/// - Block UseAfter on a no-preimage Generated container — same as +/// the previous case. +fn diagnostic_q3_43_block(block: &Block) -> quarto_error_reporting::DiagnosticMessage { + quarto_error_reporting::DiagnosticMessageBuilder::warning("Generated content edit dropped") + .with_code("Q-3-43") + .with_location(block.source_info().clone()) + .problem("An edit to pipeline-generated content was reverted.") + .add_hint( + "This content has no editable source position in this file; \ + edit its upstream definition (an include, a metadata key, \ + or other source) instead.", + ) + .build() +} + // ============================================================================= // Step 2: Assemble the Result String // ============================================================================= @@ -237,20 +842,75 @@ fn assemble( let _has_meta_prefix = emit_metadata_prefix(&mut result, original_qmd, original_ast, new_ast, coarsened)?; - // 2b. Walk coarsened entries and assemble blocks with separators + // 2b. Walk coarsened entries and assemble blocks with separators. + // Transparent entries recursively re-enter this loop on their children; + // Omit entries contribute nothing. let mut prev_entry: Option<&CoarsenedEntry> = None; let mut prev_block_text: Option = None; + emit_entries( + &mut result, + original_qmd, + original_ast, + new_ast, + coarsened, + &mut prev_entry, + &mut prev_block_text, + )?; + + Ok(result) +} - for entry in coarsened { - // 2c. Separator between blocks - // Note: we only add a separator when there's a previous block. +/// Recursive helper that walks a slice of `CoarsenedEntry` and emits each +/// one's bytes into `result`, threading `prev_entry` / `prev_block_text` +/// across siblings. 
+/// +/// `Transparent` re-enters this loop with its children, sharing the same +/// `prev_entry` / `prev_block_text` state so separators compose across the +/// wrapper boundary as if the wrapper weren't there. `Omit` is a no-op — +/// no bytes, no separator update; the next sibling's separator is computed +/// against the entry before the `Omit`. +fn emit_entries<'e>( + result: &mut String, + original_qmd: &str, + original_ast: &Pandoc, + new_ast: &Pandoc, + entries: &'e [CoarsenedEntry], + prev_entry: &mut Option<&'e CoarsenedEntry>, + prev_block_text: &mut Option, +) -> Result<(), Vec> { + for entry in entries { + match entry { + CoarsenedEntry::Omit => { + // Contributes nothing; leave prev_entry / prev_block_text alone + // so the next sibling's separator is computed against the + // entry before this Omit. + continue; + } + CoarsenedEntry::Transparent { child_entries } => { + // Recurse into children with shared prev_* state so separator + // semantics compose through the wrapper. + emit_entries( + result, + original_qmd, + original_ast, + new_ast, + child_entries, + prev_entry, + prev_block_text, + )?; + continue; + } + _ => {} + } + + // Separator between blocks (only if there's a previous emitting entry). // The metadata prefix already includes the gap to the first block, // so we must NOT add an extra separator after it. if prev_entry.is_some() { let separator = compute_separator( original_qmd, original_ast, - prev_entry, + *prev_entry, entry, prev_block_text.as_deref(), ); @@ -262,18 +922,18 @@ fn assemble( CoarsenedEntry::Verbatim { byte_range, .. } => { original_qmd[byte_range.clone()].to_string() } - CoarsenedEntry::Rewrite { new_idx } => { - write_block_to_string(&new_ast.blocks[*new_idx])? - } + CoarsenedEntry::Rewrite { block_text } => block_text.clone(), CoarsenedEntry::InlineSplice { block_text, .. } => block_text.clone(), + // Transparent + Omit were handled above; coarsen never emits + // any other variant. + CoarsenedEntry::Transparent { .. 
} | CoarsenedEntry::Omit => unreachable!(), }; result.push_str(&block_text); - prev_block_text = Some(block_text); - prev_entry = Some(entry); + *prev_block_text = Some(block_text); + *prev_entry = Some(entry); } - - Ok(result) + Ok(()) } /// Emit the metadata prefix (YAML front matter region). @@ -286,16 +946,23 @@ fn emit_metadata_prefix( new_ast: &Pandoc, _coarsened: &[CoarsenedEntry], ) -> Result> { - // Determine where the metadata region ends by looking at the first - // ORIGINAL block's start offset. We must NOT use the first coarsened - // entry's offset — when blocks are removed from the beginning, the - // first coarsened block may reference a later original block whose - // start > 0, falsely triggering the metadata prefix logic. - let first_block_start = if !original_ast.blocks.is_empty() { - Some(block_source_span(&original_ast.blocks[0]).start) - } else { - None - }; + // Determine where the metadata region ends by finding the start + // offset of the first source-anchored ORIGINAL block in the target + // file. We must NOT use the first coarsened entry's offset — when + // blocks are removed from the beginning, the first coarsened block + // may reference a later original block whose start > 0, falsely + // triggering the metadata prefix logic. + // + // We must also NOT use `original_ast.blocks[0]`'s offset directly: + // when the post-pipeline AST wraps user content in a synthesized + // top-level container (sectionize Div, title-block, footnotes / + // appendix wrappers), `blocks[0].start_offset()` is 0 (Generated, + // no preimage), which would falsely conclude "no metadata region" + // and silently delete the YAML frontmatter. Descend through + // `block_block_children` of any such wrapper to find the first + // block with real preimage in the target file. 
+ let target_file_id = derive_target_file_id(&original_ast.blocks); + let first_block_start = first_target_anchored_start_in(&original_ast.blocks, target_file_id); // Check if there's a metadata region before the first block if let Some(start) = first_block_start { @@ -358,24 +1025,26 @@ fn compute_separator<'a>( curr_entry: &CoarsenedEntry, prev_block_text: Option<&str>, ) -> &'a str { - // Try to use original gap for consecutive blocks that preserve original positions - let prev_orig_idx = match prev_entry { - Some(CoarsenedEntry::Verbatim { orig_idx, .. }) => Some(*orig_idx), - Some(CoarsenedEntry::InlineSplice { orig_idx, .. }) => Some(*orig_idx), + // Try to use original gap for consecutive blocks that preserve original + // positions. Transparent/Omit entries don't carry a top-level orig_idx — + // they fall through to the standard separator. + let prev_orig_idx: Option = match prev_entry { + Some(CoarsenedEntry::Verbatim { orig_idx, .. }) => *orig_idx, + Some(CoarsenedEntry::InlineSplice { orig_idx, .. }) => *orig_idx, _ => None, }; - let curr_orig_idx = match curr_entry { - CoarsenedEntry::Verbatim { orig_idx, .. } => Some(*orig_idx), - CoarsenedEntry::InlineSplice { orig_idx, .. } => Some(*orig_idx), + let curr_orig_idx: Option = match curr_entry { + CoarsenedEntry::Verbatim { orig_idx, .. } => *orig_idx, + CoarsenedEntry::InlineSplice { orig_idx, .. 
} => *orig_idx, _ => None, }; - if let (Some(prev_idx), Some(curr_idx)) = (prev_orig_idx, curr_orig_idx) { - if curr_idx == prev_idx + 1 { - // Consecutive in original — use original gap - let prev_span = block_source_span(&original_ast.blocks[prev_idx]); - let curr_span = block_source_span(&original_ast.blocks[curr_idx]); - return &original_qmd[prev_span.end..curr_span.start]; - } + if let (Some(prev_idx), Some(curr_idx)) = (prev_orig_idx, curr_orig_idx) + && curr_idx == prev_idx + 1 + { + // Consecutive in original — use original gap + let prev_span = block_source_span(&original_ast.blocks[prev_idx]); + let curr_span = block_source_span(&original_ast.blocks[curr_idx]); + return &original_qmd[prev_span.end..curr_span.start]; } // Standard separator — but check if previous block already ends with \n\n @@ -605,6 +1274,8 @@ fn assemble_inline_splice( orig_inlines: &[Inline], new_inlines: &[Inline], plan: &InlineReconciliationPlan, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, ) -> Result> { let block_span = block_source_span(orig_block); @@ -618,7 +1289,14 @@ fn assemble_inline_splice( let suffix = &original_qmd[inline_end..block_span.end]; // Assemble the new inline content - let inline_content = assemble_inline_content(original_qmd, orig_inlines, new_inlines, plan)?; + let inline_content = assemble_inline_content( + original_qmd, + orig_inlines, + new_inlines, + plan, + target_file_id, + warnings, + )?; Ok(format!("{}{}{}", prefix, inline_content, suffix)) } @@ -629,19 +1307,127 @@ fn assemble_inline_splice( /// - KeepBefore: copying the original inline's bytes verbatim /// - UseAfter: writing the new inline to a string /// - RecurseIntoContainer: preserving delimiters, recursing into children +/// +/// Plan 7: inline-level soft-drop substitutes `KeepBefore` for `UseAfter` +/// / `RecurseIntoContainer` alignments that target a non-editable original +/// inline (atomic-CustomNode, atomic-kind Generated, no-preimage +/// Generated). 
Each substitution pushes a `Q-3-42` warning. The +/// substitution uses the *new-side* index as the positional proxy for the +/// "original inline at the same position" — exact for in-place retypings +/// (the common shortcode-edit case), approximate for arbitrary +/// insertions/deletions. +/// +/// Plan 7 also adds multi-inline dedupe: consecutive `KeepBefore` entries +/// whose original inlines' `Invocation` anchors are `PartialEq`-equal +/// emit a single combined byte range, so a multi-inline shortcode +/// resolution (`{{< meta footer >}}` → `[Strong[Str], Space, Str]`) +/// emits the shortcode token bytes once. fn assemble_inline_content( original_qmd: &str, orig_inlines: &[Inline], new_inlines: &[Inline], plan: &InlineReconciliationPlan, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, ) -> Result> { - let mut result = String::new(); - + // Phase 1: apply soft-drop substitutions. Walk alignments and rewrite + // UseAfter/RecurseIntoContainer that target non-editable original + // inlines into KeepBefore(original-position). + let mut effective: Vec = Vec::with_capacity(plan.inline_alignments.len()); for (result_idx, alignment) in plan.inline_alignments.iter().enumerate() { match alignment { + InlineAlignment::UseAfter(_) => { + // Use result_idx (positional proxy) to find the + // corresponding original inline. + if let Some(orig) = orig_inlines.get(result_idx) + && !is_editable_inside_inline(orig, target_file_id) + { + warnings.push(diagnostic_q3_42_inline(orig)); + effective.push(InlineAlignment::KeepBefore(result_idx)); + continue; + } + effective.push(alignment.clone()); + } + InlineAlignment::RecurseIntoContainer { before_idx, .. 
} => { + let orig = &orig_inlines[*before_idx]; + if !is_editable_inside_inline(orig, target_file_id) { + warnings.push(diagnostic_q3_42_inline(orig)); + effective.push(InlineAlignment::KeepBefore(*before_idx)); + continue; + } + effective.push(alignment.clone()); + } + InlineAlignment::KeepBefore(_) => effective.push(alignment.clone()), + } + } + + // Phase 2: emit, with multi-inline dedupe for consecutive + // KeepBefore entries whose Invocation anchors are PartialEq-equal. + let mut result = String::new(); + let mut i = 0; + while i < effective.len() { + match &effective[i] { InlineAlignment::KeepBefore(orig_idx) => { - let span = inline_source_span(&orig_inlines[*orig_idx]); - result.push_str(&original_qmd[span]); + let first_si = orig_inlines[*orig_idx].source_info(); + let first_invocation = first_si.invocation_anchor().cloned(); + + // Try to extend the run: gather all consecutive KeepBefore + // entries whose invocation_anchor() is PartialEq-equal to + // first_invocation. Only consider runs of length >= 2 for + // dedupe; a single inline emits via the normal path. + let mut j = i + 1; + if first_invocation.is_some() { + while j < effective.len() { + let InlineAlignment::KeepBefore(next_orig_idx) = &effective[j] else { + break; + }; + let next_invocation = orig_inlines[*next_orig_idx] + .source_info() + .invocation_anchor() + .cloned(); + if next_invocation != first_invocation { + break; + } + j += 1; + } + } + + if j > i + 1 { + // Dedupe: the whole group shares one Invocation anchor. + // Emit the Invocation source's preimage bytes once, + // not the individual inlines' ranges. Use the anchor + // source_info's preimage in the target file when + // available; fall back to the first inline's range. + let anchor_arc = first_invocation.unwrap(); + if let Some(range) = anchor_arc.preimage_in(target_file_id) { + result.push_str(&original_qmd[range]); + } else { + // Fall back: emit each inline's bytes individually. 
+ // Shouldn't happen — KeepBefore implies preimage_in + // succeeded for the surrounding block. Keep + // structurally safe behavior just in case. + for k in i..j { + let InlineAlignment::KeepBefore(idx) = &effective[k] else { + unreachable!() + }; + let span = inline_source_span(&orig_inlines[*idx]); + result.push_str(&original_qmd[span]); + } + } + i = j; + continue; + } + + // Singleton KeepBefore — emit the inline's preimage in + // the target file when available (covers Generated inlines + // whose Invocation anchor resolves into target), falling + // back to the inline's literal source span for Original + // inlines (the common case; identical bytes either way). + let range = orig_inlines[*orig_idx] + .source_info() + .preimage_in(target_file_id) + .unwrap_or_else(|| inline_source_span(&orig_inlines[*orig_idx])); + result.push_str(&original_qmd[range]); } InlineAlignment::UseAfter(after_idx) => { let text = write_inline_to_string(&new_inlines[*after_idx])?; @@ -655,11 +1441,14 @@ fn assemble_inline_content( original_qmd, &orig_inlines[*before_idx], &new_inlines[*after_idx], - plan.inline_container_plans.get(&result_idx), + plan.inline_container_plans.get(&i), + target_file_id, + warnings, )?; result.push_str(&text); } } + i += 1; } Ok(result) @@ -674,6 +1463,8 @@ fn assemble_recursed_container( orig_inline: &Inline, new_inline: &Inline, nested_plan: Option<&InlineReconciliationPlan>, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, ) -> Result> { let orig_span = inline_source_span(orig_inline); @@ -700,7 +1491,14 @@ fn assemble_recursed_container( let closing = &original_qmd[last_child_end..orig_span.end]; // Recursively assemble children - let children_text = assemble_inline_content(original_qmd, orig_children, new_children, plan)?; + let children_text = assemble_inline_content( + original_qmd, + orig_children, + new_children, + plan, + target_file_id, + warnings, + )?; Ok(format!("{}{}{}", opening, children_text, closing)) } @@ -830,3 
+1628,701 @@ pub fn write_inline_to_string( ); Ok(result) } + +// ============================================================================= +// Tests +// ============================================================================= + +#[cfg(test)] +mod editability_tests { + use super::*; + use quarto_pandoc_types::{Block, CustomNode, Inline, Paragraph, Plain, Str, attr::empty_attr}; + use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + const TARGET: FileId = FileId(0); + const OTHER: FileId = FileId(1); + + fn make_str(text: &str, si: SourceInfo) -> Inline { + Inline::Str(Str { + text: text.into(), + source_info: si, + }) + } + + // ------------------------------------------------------------------------- + // is_editable_inside_block + // ------------------------------------------------------------------------- + + #[test] + fn editable_block_with_original_in_target() { + let block = Block::Paragraph(Paragraph { + content: vec![make_str("hello", SourceInfo::original(TARGET, 0, 5))], + source_info: SourceInfo::original(TARGET, 0, 5), + }); + assert!(is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_block_with_original_outside_target() { + // Original points at a different file (cross-file reference, no + // wrapper). preimage_in(TARGET) returns None. + let block = Block::Paragraph(Paragraph { + content: vec![make_str("hi", SourceInfo::original(OTHER, 0, 2))], + source_info: SourceInfo::original(OTHER, 0, 2), + }); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_atomic_custom_node_block() { + // CrossrefResolvedRef is in ATOMIC_CUSTOM_NODES even though its + // source_info Original is in the target file. 
+ let cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 0, 10), + ); + let block = Block::Custom(cn); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn editable_non_atomic_custom_node_block() { + // Non-atomic CustomNode (e.g., Callout) with source_info in target + // → editable. + let cn = CustomNode::new("Callout", empty_attr(), SourceInfo::original(TARGET, 0, 20)); + let block = Block::Custom(cn); + assert!(is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_atomic_kind_generated_block() { + // Shortcode-resolved Para: Generated{by: shortcode, from: [Invocation]}. + // Even though Invocation resolves to a token in TARGET (so + // preimage_in returns Some), is_atomic_kind() shortcode means the + // user can't edit the *resolved content* — only the token. + let token = SourceInfo::original(TARGET, 100, 120); + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + let block = Block::Paragraph(Paragraph { + content: vec![], + source_info: gen_info, + }); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_no_preimage_generated_block() { + // Synthesized-from-metadata container: Generated with empty + // anchors (sectionize / footnotes / appendix container shape). + // preimage_in returns None → not editable. + let block = Block::Paragraph(Paragraph { + content: vec![], + source_info: SourceInfo::generated(By::sectionize()), + }); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_value_source_only_generated_block() { + // Plan 9 shape: Generated with only ValueSource anchor (no + // Invocation). The ValueSource points into the target file's + // YAML metadata range, but the writer must NOT treat the + // interior as editable — those bytes are YAML, not body. 
+ let meta_si = SourceInfo::original(TARGET, 10, 25); + let mut gen_info = SourceInfo::generated(By::appendix()); + gen_info.append_anchor(AnchorRole::ValueSource, Arc::new(meta_si)); + let block = Block::Paragraph(Paragraph { + content: vec![], + source_info: gen_info, + }); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + // ------------------------------------------------------------------------- + // is_editable_inside_inline + // ------------------------------------------------------------------------- + + #[test] + fn editable_inline_with_original_in_target() { + let inline = make_str("hi", SourceInfo::original(TARGET, 0, 2)); + assert!(is_editable_inside_inline(&inline, TARGET)); + } + + #[test] + fn not_editable_atomic_custom_node_inline() { + let cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 0, 8), + ); + let inline = Inline::Custom(cn); + assert!(!is_editable_inside_inline(&inline, TARGET)); + } + + #[test] + fn not_editable_atomic_kind_generated_inline() { + let token = SourceInfo::original(TARGET, 100, 120); + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + let inline = make_str("resolved", gen_info); + assert!(!is_editable_inside_inline(&inline, TARGET)); + } + + #[test] + fn not_editable_inline_with_original_outside_target() { + let inline = make_str("hi", SourceInfo::original(OTHER, 0, 2)); + assert!(!is_editable_inside_inline(&inline, TARGET)); + } + + // ------------------------------------------------------------------------- + // Sanity: Plain (non-Para) block carries the same predicate behaviour. 
+ // ------------------------------------------------------------------------- + + #[test] + fn editable_plain_block_with_original_in_target() { + let block = Block::Plain(Plain { + content: vec![make_str("hi", SourceInfo::original(TARGET, 0, 2))], + source_info: SourceInfo::original(TARGET, 0, 2), + }); + assert!(is_editable_inside_block(&block, TARGET)); + } +} + +#[cfg(test)] +mod coarsen_plan7_tests { + //! Plan 7: coarsen behavior under the new soft-drop + cascade rules. + //! + //! These tests construct `Pandoc` + `ReconciliationPlan` fixtures by + //! hand to exercise the new code paths directly. The existing + //! `incremental_writer_tests.rs` integration tests cover the + //! end-to-end (parse → reconcile → write) flow; these tests pin + //! coarsen's specific classification + soft-drop behavior. + + use super::*; + use quarto_ast_reconcile::types::{ + BlockAlignment, InlineAlignment, InlineReconciliationPlan, ReconciliationPlan, + }; + use quarto_pandoc_types::{Block, CustomNode, Div, Inline, Paragraph, Str, attr::empty_attr}; + use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + const TARGET: FileId = FileId(0); + const OTHER: FileId = FileId(1); + + fn make_str(text: &str, si: SourceInfo) -> Inline { + Inline::Str(Str { + text: text.into(), + source_info: si, + }) + } + + fn para(content: Vec, si: SourceInfo) -> Block { + Block::Paragraph(Paragraph { + content, + source_info: si, + }) + } + + // ------------------------------------------------------------------------- + // KeepBefore cascade + // ------------------------------------------------------------------------- + + #[test] + fn keep_before_with_original_in_target_emits_verbatim() { + let block = para(vec![], SourceInfo::original(TARGET, 10, 25)); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + 
let mut warnings = Vec::new(); + let qmd = "0123456789012345678901234567890"; + let entries = coarsen(qmd, &ast, &ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + match &entries[0] { + CoarsenedEntry::Verbatim { byte_range, .. } => { + assert_eq!(byte_range, &(10..25)); + } + other => panic!("expected Verbatim, got {:?}", other), + } + assert!(warnings.is_empty()); + } + + #[test] + fn keep_before_with_atomic_kind_generated_no_anchor_emits_omit() { + // Filter construction: Generated { by: filter, from: [] }. + // Atomic-kind, no Invocation → Omit (next pipeline run + // regenerates the decoration). + let block = para(vec![], SourceInfo::generated(By::filter("upper.lua", 14))); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let entries = coarsen("", &ast, &ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + assert!(matches!(entries[0], CoarsenedEntry::Omit)); + // KeepBefore branch doesn't emit warnings. + assert!(warnings.is_empty()); + } + + #[test] + fn keep_before_with_atomic_kind_generated_with_invocation_emits_verbatim() { + // Shortcode resolution: atomic-kind, Invocation in target → Verbatim. 
+ let token = SourceInfo::original(TARGET, 100, 120); + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + let block = para(vec![], gen_info); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(200); + let entries = coarsen(&qmd, &ast, &ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + match &entries[0] { + CoarsenedEntry::Verbatim { byte_range, .. } => { + assert_eq!(byte_range, &(100..120)); + } + other => panic!("expected Verbatim, got {:?}", other), + } + } + + #[test] + fn keep_before_with_nonatomic_generated_wrapper_emits_transparent() { + // Sectionize wrapper: Div with Generated { by: sectionize, from: [] } + // and source-bearing children (one Para in target). 
+ let child = para( + vec![make_str("hi", SourceInfo::original(TARGET, 10, 12))], + SourceInfo::original(TARGET, 10, 12), + ); + let div = Block::Div(Div { + attr: empty_attr(), + content: vec![child], + source_info: SourceInfo::generated(By::sectionize()), + attr_source: quarto_pandoc_types::AttrSourceInfo::empty(), + }); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![div], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(30); + let entries = coarsen(&qmd, &ast, &ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + match &entries[0] { + CoarsenedEntry::Transparent { child_entries } => { + assert_eq!(child_entries.len(), 1); + match &child_entries[0] { + CoarsenedEntry::Verbatim { + byte_range, + orig_idx, + } => { + assert_eq!(byte_range, &(10..12)); + // Children of Transparent get None for orig_idx. + assert_eq!(orig_idx, &None); + } + other => panic!("expected Verbatim child, got {:?}", other), + } + } + other => panic!("expected Transparent, got {:?}", other), + } + } + + #[test] + fn keep_before_cross_file_original_falls_back_to_rewrite() { + // Block whose source_info points at a different file (no preimage + // in target) AND isn't Generated → Rewrite (catch-all). + let block = para(vec![], SourceInfo::original(OTHER, 0, 10)); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + // Note: target_file_id is derived from the first block's + // root_file_id, which for this AST is OTHER (FileId 1) — so + // preimage_in(OTHER) succeeds. To exercise the catch-all path + // we need a block whose source_info doesn't resolve in *its + // own* root file_id. 
Use a separate AST whose first-block + // file-id sets target = TARGET, but this block points at OTHER. + let target_setter = para(vec![], SourceInfo::original(TARGET, 0, 5)); + let block_cross = para(vec![], SourceInfo::original(OTHER, 0, 10)); + let ast2 = quarto_pandoc_types::Pandoc { + blocks: vec![target_setter, block_cross], + meta: ConfigValue::default(), + }; + let plan2 = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0), BlockAlignment::KeepBefore(1)], + ..Default::default() + }; + let qmd = "0".repeat(30); + let entries = coarsen(&qmd, &ast2, &ast2, &plan2, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 2); + // First entry resolves in target via preimage_in. + assert!(matches!(entries[0], CoarsenedEntry::Verbatim { .. })); + // Second entry doesn't resolve in target → Rewrite catch-all. + assert!(matches!(entries[1], CoarsenedEntry::Rewrite { .. })); + assert!(warnings.is_empty()); + // Silence unused: plan was for the single-block AST scenario above. + let _ = (ast, plan); + } + + // ------------------------------------------------------------------------- + // UseAfter soft-drop / let-user-win + // ------------------------------------------------------------------------- + + #[test] + fn use_after_on_atomic_custom_node_is_let_user_win_rewrite() { + // User replaced a CrossrefResolvedRef wholesale via a component + // menu. The new-side block IS the atomic CustomNode; we let the + // user win and Rewrite (no warning). 
+ let new_cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 0, 10), + ); + let new_ast = quarto_pandoc_types::Pandoc { + blocks: vec![Block::Custom(new_cn)], + meta: ConfigValue::default(), + }; + let orig_block = para(vec![], SourceInfo::original(TARGET, 0, 0)); + let original_ast = quarto_pandoc_types::Pandoc { + blocks: vec![orig_block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::UseAfter(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(20); + let entries = coarsen(&qmd, &original_ast, &new_ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + assert!(matches!(entries[0], CoarsenedEntry::Rewrite { .. })); + assert!( + warnings.is_empty(), + "let-user-win on atomic CustomNode must not emit a warning" + ); + } + + #[test] + fn use_after_on_no_preimage_generated_soft_drops_to_omit() { + // User replaced a synthesized-from-metadata container wholesale. + // The new-side block is Generated with no Invocation anchor + // → no source position to anchor a Rewrite → Omit + Q-3-43. 
+ let new_block = Block::Div(Div { + attr: empty_attr(), + content: vec![], + source_info: SourceInfo::generated(By::appendix()), + attr_source: quarto_pandoc_types::AttrSourceInfo::empty(), + }); + let new_ast = quarto_pandoc_types::Pandoc { + blocks: vec![new_block], + meta: ConfigValue::default(), + }; + let orig_block = para(vec![], SourceInfo::original(TARGET, 0, 0)); + let original_ast = quarto_pandoc_types::Pandoc { + blocks: vec![orig_block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::UseAfter(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(20); + let entries = coarsen(&qmd, &original_ast, &new_ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + assert!(matches!(entries[0], CoarsenedEntry::Omit)); + assert_eq!(warnings.len(), 1); + assert_eq!(warnings[0].code.as_deref(), Some("Q-3-43")); + } + + // ------------------------------------------------------------------------- + // RecurseIntoContainer soft-drop on non-editable original block + // ------------------------------------------------------------------------- + + #[test] + fn recurse_into_atomic_custom_node_soft_drops_to_verbatim() { + // User typed inside a CrossrefResolvedRef. Substitute Verbatim + // (wrapper's preimage bytes) + Q-3-43. 
+ let orig_cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 5, 25), + ); + let new_cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 5, 25), + ); + let original_ast = quarto_pandoc_types::Pandoc { + blocks: vec![Block::Custom(orig_cn)], + meta: ConfigValue::default(), + }; + let new_ast = quarto_pandoc_types::Pandoc { + blocks: vec![Block::Custom(new_cn)], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::RecurseIntoContainer { + before_idx: 0, + after_idx: 0, + }], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(30); + let entries = coarsen(&qmd, &original_ast, &new_ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + match &entries[0] { + CoarsenedEntry::Verbatim { byte_range, .. } => { + assert_eq!(byte_range, &(5..25)); + } + other => panic!("expected Verbatim, got {:?}", other), + } + assert_eq!(warnings.len(), 1); + assert_eq!(warnings[0].code.as_deref(), Some("Q-3-43")); + } + + #[test] + fn recurse_into_no_preimage_generated_soft_drops_to_omit() { + // User typed inside a synthesized appendix container (Generated + // with no Invocation anchor, no preimage in target). + let orig_div = Block::Div(Div { + attr: empty_attr(), + content: vec![para(vec![], SourceInfo::original(TARGET, 0, 5))], + source_info: SourceInfo::generated(By::appendix()), + attr_source: quarto_pandoc_types::AttrSourceInfo::empty(), + }); + let new_div = Block::Div(Div { + attr: empty_attr(), + content: vec![para(vec![], SourceInfo::original(TARGET, 0, 5))], + source_info: SourceInfo::generated(By::appendix()), + attr_source: quarto_pandoc_types::AttrSourceInfo::empty(), + }); + // Force target_file_id to TARGET by giving the AST another block + // whose source_info is Original in TARGET. 
+ let target_setter = para(vec![], SourceInfo::original(TARGET, 0, 5)); + let original_ast = quarto_pandoc_types::Pandoc { + blocks: vec![target_setter.clone(), orig_div], + meta: ConfigValue::default(), + }; + let new_ast = quarto_pandoc_types::Pandoc { + blocks: vec![target_setter, new_div], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![ + BlockAlignment::KeepBefore(0), + BlockAlignment::RecurseIntoContainer { + before_idx: 1, + after_idx: 1, + }, + ], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(30); + let entries = coarsen(&qmd, &original_ast, &new_ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 2); + assert!(matches!(entries[0], CoarsenedEntry::Verbatim { .. })); + assert!(matches!(entries[1], CoarsenedEntry::Omit)); + assert_eq!(warnings.len(), 1); + assert_eq!(warnings[0].code.as_deref(), Some("Q-3-43")); + } + + // ------------------------------------------------------------------------- + // Inline-level multi-inline dedupe + soft-drop + // ------------------------------------------------------------------------- + + fn shortcode_inline(text: &str, token_si: SourceInfo) -> Inline { + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + make_str(text, gen_info) + } + + #[test] + fn multi_inline_dedupe_emits_token_once_when_invocation_shared() { + // Three inlines sharing the same Invocation anchor (a multi-inline + // shortcode resolution). The original qmd has the shortcode token + // at bytes 0..19. Expected output: those 19 bytes once.
+ let qmd = "{{< meta footer >}}"; + assert_eq!(qmd.len(), 19); + let token_si = SourceInfo::original(TARGET, 0, 19); + + let orig_inlines = vec![ + shortcode_inline("Hello", token_si.clone()), + shortcode_inline(" ", token_si.clone()), + shortcode_inline("World", token_si.clone()), + ]; + let new_inlines = orig_inlines.clone(); + let plan = InlineReconciliationPlan { + inline_alignments: vec![ + InlineAlignment::KeepBefore(0), + InlineAlignment::KeepBefore(1), + InlineAlignment::KeepBefore(2), + ], + ..Default::default() + }; + + let mut warnings = Vec::new(); + let out = assemble_inline_content( + qmd, + &orig_inlines, + &new_inlines, + &plan, + TARGET, + &mut warnings, + ) + .unwrap(); + + assert_eq!( + out, qmd, + "Three shared-Invocation inlines must emit the token bytes once" + ); + } + + #[test] + fn multi_inline_no_dedupe_when_invocations_differ() { + // Two inlines, each pointing at a *different* token range — no + // dedupe; each emits its own range. + let qmd = "AB"; + let orig_inlines = vec![ + shortcode_inline("A", SourceInfo::original(TARGET, 0, 1)), + shortcode_inline("B", SourceInfo::original(TARGET, 1, 2)), + ]; + let new_inlines = orig_inlines.clone(); + let plan = InlineReconciliationPlan { + inline_alignments: vec![ + InlineAlignment::KeepBefore(0), + InlineAlignment::KeepBefore(1), + ], + ..Default::default() + }; + let mut warnings = Vec::new(); + let out = assemble_inline_content( + qmd, + &orig_inlines, + &new_inlines, + &plan, + TARGET, + &mut warnings, + ) + .unwrap(); + + // No dedupe: each inline's bytes emit. + assert_eq!(out, "AB"); + } + + #[test] + fn multi_inline_dedupe_with_value_source_difference_still_dedupes() { + // Forward-compat with Plan 9: two inlines whose Invocation anchors + // are PartialEq-equal but whose ValueSource anchors differ — still + // dedupes (dedupe consults Invocation only). 
+ let qmd = "{{< meta foo >}}"; + let token_si = SourceInfo::original(TARGET, 0, qmd.len()); + + let mut si_a = SourceInfo::generated(By::shortcode("meta")); + si_a.append_anchor(AnchorRole::Invocation, Arc::new(token_si.clone())); + si_a.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(TARGET, 100, 110)), + ); + + let mut si_b = SourceInfo::generated(By::shortcode("meta")); + si_b.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + si_b.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(TARGET, 200, 215)), + ); + + let orig_inlines = vec![make_str("a", si_a), make_str("b", si_b)]; + let new_inlines = orig_inlines.clone(); + let plan = InlineReconciliationPlan { + inline_alignments: vec![ + InlineAlignment::KeepBefore(0), + InlineAlignment::KeepBefore(1), + ], + ..Default::default() + }; + let mut warnings = Vec::new(); + let out = assemble_inline_content( + qmd, + &orig_inlines, + &new_inlines, + &plan, + TARGET, + &mut warnings, + ) + .unwrap(); + + // Still dedupes — emit the token once. + assert_eq!(out, qmd); + } + + #[test] + fn inline_use_after_on_atomic_generated_soft_drops_to_keep_before_with_q3_42() { + // User retyped over a shortcode-resolved inline. UseAfter + // → KeepBefore(0) (the positional proxy) + Q-3-42. + let qmd = "{{< meta foo >}}"; + let token_si = SourceInfo::original(TARGET, 0, qmd.len()); + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + + let orig_inlines = vec![make_str("Resolved", gen_info)]; + // New-side inline: a plain user edit (no Invocation anchor). 
+ let new_inlines = vec![make_str("Retyped", SourceInfo::default())]; + let plan = InlineReconciliationPlan { + inline_alignments: vec![InlineAlignment::UseAfter(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let out = assemble_inline_content( + qmd, + &orig_inlines, + &new_inlines, + &plan, + TARGET, + &mut warnings, + ) + .unwrap(); + + // Soft-drop: emit the original inline's bytes (its preimage maps + // to the whole shortcode token). + assert_eq!(out, qmd); + assert_eq!(warnings.len(), 1); + assert_eq!(warnings[0].code.as_deref(), Some("Q-3-42")); + } +} diff --git a/crates/pampa/src/writers/json.rs b/crates/pampa/src/writers/json.rs index 0c7237844..74910cca0 100644 --- a/crates/pampa/src/writers/json.rs +++ b/crates/pampa/src/writers/json.rs @@ -11,7 +11,7 @@ use crate::pandoc::{ use hashlink::LinkedHashMap; use quarto_error_reporting::{DiagnosticMessage, DiagnosticMessageBuilder}; use quarto_pandoc_types::{ConfigValue, ConfigValueKind}; -use quarto_source_map::{FileId, SourceInfo}; +use quarto_source_map::{AnchorRole, By, FileId, SourceInfo}; use serde::Serialize; use serde_json::{Value, json}; use std::collections::HashMap; @@ -110,9 +110,15 @@ struct FileEntryJson { /// Fields ordered alphabetically: d, r, t #[derive(Serialize)] struct SourceInfoJson { - d: Value, // data (file_id, parent_id, pieces, or filter info) + d: Value, // data (file_id, parent_id, pieces, or Generated { by, from }) r: [usize; 2], // range [start, end] - t: u8, // type code (0=Original, 1=Substring, 2=Concat, 3=FilterProvenance) + // type code: + // 0 = Original + // 1 = Substring + // 2 = Concat + // 3 = Legacy (read-only — old Transformed + buggy FilterProvenance) + // 4 = Generated { by, from } + t: u8, } /// Generic node with type, optional content, and source info. 
@@ -177,8 +183,26 @@ impl SerializableSourceInfo { .collect(); (2, json!(piece_arrays)) } - SerializableSourceMapping::FilterProvenance { filter_path, line } => { - (3, json!((filter_path, line))) + SerializableSourceMapping::Generated { by, from } => { + let mut by_json = json!({ "kind": by.kind }); + if !by.data.is_null() { + by_json["data"] = by.data.clone(); + } + let mut d_obj = serde_json::Map::new(); + d_obj.insert("by".to_string(), by_json); + if !from.is_empty() { + let arr: Vec = from + .iter() + .map(|(role, si_id)| { + json!({ + "role": serialize_anchor_role(role), + "si_id": si_id, + }) + }) + .collect(); + d_obj.insert("from".to_string(), Value::Array(arr)); + } + (4, Value::Object(d_obj)) } }; SourceInfoJson { @@ -189,6 +213,19 @@ impl SerializableSourceInfo { } } +/// Serialize an [`AnchorRole`] to its wire-format string. +/// +/// Inverse of `parse_anchor_role` in `crates/pampa/src/readers/json.rs`. +/// The two must agree on the string forms — see also the TS mirror at +/// `ts-packages/preview-renderer/src/types/sourceInfo.ts`. +fn serialize_anchor_role(role: &AnchorRole) -> String { + match role { + AnchorRole::Invocation => "invocation".to_string(), + AnchorRole::ValueSource => "value-source".to_string(), + AnchorRole::Other(s) => format!("other:{}", s), + } +} + /// Serializable version of SourceMapping that uses parent_id instead of Rc. enum SerializableSourceMapping { Original { @@ -200,9 +237,15 @@ enum SerializableSourceMapping { Concat { pieces: Vec, }, - FilterProvenance { - filter_path: String, - line: usize, + /// Wire-code 4: a pipeline transform's output. + /// + /// `by` carries the producer identity (kebab-case `kind` + optional + /// JSON `data`). `from` is an ordered list of `(role, si_id)` + /// pairs — each `si_id` points to another pool entry that already + /// exists (interned strictly before this entry). 
+ Generated { + by: By, + from: Vec<(AnchorRole, usize)>, }, } @@ -311,14 +354,42 @@ impl<'a> SourceInfoSerializer<'a> { }, ) } - SourceInfo::FilterProvenance { filter_path, line } => ( - 0, - 0, - SerializableSourceMapping::FilterProvenance { - filter_path: filter_path.clone(), - line: *line, - }, - ), + SourceInfo::Generated { by, from } => { + // Anchors are interned *before* this Generated entry so that + // every si_id is strictly less than the resulting pool index + // — the reader's `si_id < current_index` guard depends on it. + // + // Dedup keyed by `Arc::as_ptr(&anchor.source_info)`, sharing + // the same `arc_parent_ids` cache used for `Substring.parent`. + // Multi-inline shortcode resolutions whose anchors point at a + // shared `Arc` collapse to a single pool entry on the write + // side; deserialization rebuilds each anchor with a fresh + // Arc, so this is a write-time optimization only (see Plan 5 + // §"Risk areas" → anchor-dedup-invariant). + let from_ids: Vec<(AnchorRole, usize)> = from + .iter() + .map(|anchor| { + let arc_ptr = std::sync::Arc::as_ptr(&anchor.source_info); + let id = if let Some(&id) = self.arc_parent_ids.get(&arc_ptr) { + self.stat_arc_parent_hits += 1; + id + } else { + let id = self.intern(&anchor.source_info); + self.arc_parent_ids.insert(arc_ptr, id); + id + }; + (anchor.role.clone(), id) + }) + .collect(); + ( + 0, + 0, + SerializableSourceMapping::Generated { + by: by.clone(), + from: from_ids, + }, + ) + } }; let id = self.pool.len(); @@ -555,7 +626,7 @@ fn node_with_source( // to map offsets to row/column positions. Commenting out for now. 
// fn write_location(source_info: &quarto_source_map::SourceInfo, ctx: &SourceContext) -> Value { // // Extract filename index by walking to the Original mapping -// let filename_index = crate::pandoc::location::extract_filename_index(source_info); +// let filename_index = source_info.root_file_id().map(|fid| fid.0); // // // Map start and end offsets to locations with row/column // let start_mapped = source_info.map_offset(0, ctx).unwrap(); @@ -3496,11 +3567,35 @@ fn stream_write_source_info_pool( } w.end_array()?; } - SerializableSourceMapping::FilterProvenance { filter_path, line } => { - w.begin_array()?; - w.str_value(filter_path)?; - w.u64_value(*line as u64)?; - w.end_array()?; + SerializableSourceMapping::Generated { by, from } => { + // Mirror SerializableSourceInfo::to_json byte-for-byte. + // Object shape: { "by": { "kind": ..., "data": ... }, + // "from": [ { "role": ..., "si_id": N }, ... ] } + // `data` is skipped when null; `from` is skipped when empty. + w.begin_object()?; + w.key("by")?; + w.begin_object()?; + w.key("kind")?; + w.str_value(&by.kind)?; + if !by.data.is_null() { + w.key("data")?; + stream_write_json_value(w, &by.data)?; + } + w.end_object()?; + if !from.is_empty() { + w.key("from")?; + w.begin_array()?; + for (role, si_id) in from { + w.begin_object()?; + w.key("role")?; + w.str_value(&serialize_anchor_role(role))?; + w.key("si_id")?; + w.u64_value(*si_id as u64)?; + w.end_object()?; + } + w.end_array()?; + } + w.end_object()?; } } w.key("r")?; @@ -3513,7 +3608,7 @@ fn stream_write_source_info_pool( SerializableSourceMapping::Original { .. } => 0, SerializableSourceMapping::Substring { .. } => 1, SerializableSourceMapping::Concat { .. } => 2, - SerializableSourceMapping::FilterProvenance { .. } => 3, + SerializableSourceMapping::Generated { .. } => 4, })?; w.end_object()?; } @@ -3521,6 +3616,45 @@ fn stream_write_source_info_pool( Ok(()) } +/// Recursively stream-write an arbitrary `serde_json::Value` via the +/// `JsonStreamWriter`. 
Used to emit the `By.data` payload inside a +/// `Generated` pool entry without materializing a serialized buffer. +fn stream_write_json_value(w: &mut JsonStreamWriter, v: &Value) -> io::Result<()> { + match v { + Value::Null => w.null_value(), + Value::Bool(b) => w.bool_value(*b), + Value::Number(n) => { + if let Some(u) = n.as_u64() { + w.u64_value(u) + } else if let Some(i) = n.as_i64() { + w.i64_value(i) + } else if let Some(f) = n.as_f64() { + w.f64_value(f) + } else { + // Unreachable: serde_json::Number always converts to one of + // the three numeric forms above. Emit null defensively. + w.null_value() + } + } + Value::String(s) => w.str_value(s), + Value::Array(arr) => { + w.begin_array()?; + for item in arr { + stream_write_json_value(w, item)?; + } + w.end_array() + } + Value::Object(obj) => { + w.begin_object()?; + for (k, val) in obj { + w.key(k)?; + stream_write_json_value(w, val)?; + } + w.end_object() + } + } +} + /// Emit the whole document. Streaming order: /// `{blocks, meta, pandoc-api-version, astContext}` — alphabetical-friendly /// except astContext last (it carries `sourceInfoPool` which is only complete @@ -3673,7 +3807,8 @@ fn stream_write_pandoc( #[cfg(test)] mod tests { use super::*; - use quarto_source_map::{FileId, SourceInfo}; + use quarto_source_map::{Anchor, AnchorRole, By, FileId, SourceInfo}; + use smallvec::SmallVec; use std::sync::Arc; fn make_test_context() -> ASTContext { @@ -4205,4 +4340,273 @@ mod tests { _ => panic!("Expected Custom block"), } } + + // ---------------------------------------------------------------- + // Plan 5 Phase 3+4 — writer-side Generated emission + // ---------------------------------------------------------------- + + /// `Generated { by, from: [] }` interns as a single code-4 pool entry + /// with `r = (0, 0)` and the right `by` shape. 
+ #[test] + fn test_source_info_pool_generated_no_anchors() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let gen_info = SourceInfo::Generated { + by: By::sectionize(), + from: SmallVec::new(), + }; + let id = serializer.intern(&gen_info); + + assert_eq!(id, 0); + assert_eq!(serializer.pool.len(), 1); + let entry = &serializer.pool[0]; + assert_eq!(entry.start_offset, 0); + assert_eq!(entry.end_offset, 0); + match &entry.mapping { + SerializableSourceMapping::Generated { by, from } => { + assert_eq!(by.kind, "sectionize"); + assert!(by.data.is_null()); + assert!(from.is_empty()); + } + _ => panic!("Expected Generated mapping"), + } + } + + /// `Generated { by: filter, from: [] }` carries `by.data` through. + #[test] + fn test_source_info_pool_generated_filter_with_data() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let gen_info = SourceInfo::generated(By::filter("/x.lua", 42)); + let id = serializer.intern(&gen_info); + + let entry = &serializer.pool[id]; + match &entry.mapping { + SerializableSourceMapping::Generated { by, .. } => { + assert_eq!(by.kind, "filter"); + assert_eq!(by.as_filter(), Some(("/x.lua", 42))); + } + _ => panic!("Expected Generated mapping"), + } + } + + /// Anchors must be interned strictly *before* their owning Generated + /// entry — the reader's `si_id < current_index` guard requires it. 
+ #[test] + fn test_source_info_pool_generated_with_invocation_anchor() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let target = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 5, + end_offset: 12, + }); + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&target))); + let gen_info = SourceInfo::Generated { + by: By::shortcode("meta"), + from, + }; + + let id = serializer.intern(&gen_info); + // Anchor target interned first (ID 0), Generated second (ID 1). + assert_eq!(id, 1); + assert!(matches!( + serializer.pool[0].mapping, + SerializableSourceMapping::Original { .. } + )); + match &serializer.pool[1].mapping { + SerializableSourceMapping::Generated { by, from } => { + assert_eq!(by.kind, "shortcode"); + assert_eq!(from.len(), 1); + assert!(matches!(from[0].0, AnchorRole::Invocation)); + assert_eq!(from[0].1, 0); // si_id points to the target + } + _ => panic!("Expected Generated mapping"), + } + } + + /// Multi-inline shortcode resolution: N Generated nodes sharing one + /// `Arc` anchor target collapse to a single pool entry on + /// the write side. The dedup is keyed by `Arc::as_ptr`. + #[test] + fn test_source_info_pool_generated_anchor_dedup() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let shared = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 0, + end_offset: 10, + }); + + // Three sibling Generated entries each pointing at `shared`. 
+ let make = || { + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&shared))); + SourceInfo::Generated { + by: By::shortcode("meta"), + from, + } + }; + let id1 = serializer.intern(&make()); + let id2 = serializer.intern(&make()); + let id3 = serializer.intern(&make()); + + // Pool: shared(0), gen1(1), gen2(2), gen3(3) — shared interned once. + assert_eq!(serializer.pool.len(), 4); + let original_count = serializer + .pool + .iter() + .filter(|e| matches!(e.mapping, SerializableSourceMapping::Original { .. })) + .count(); + assert_eq!(original_count, 1, "shared target must intern exactly once"); + + for id in [id1, id2, id3] { + match &serializer.pool[id].mapping { + SerializableSourceMapping::Generated { from, .. } => { + assert_eq!(from.len(), 1); + assert_eq!(from[0].1, 0); // all reference the same si_id + } + _ => panic!("Expected Generated"), + } + } + } + + /// `Concat { pieces: [Generated, ...] }` round-trips: each piece's + /// Generated source_info interns through the new code-4 path; the + /// outer Concat references those IDs. + #[test] + fn test_source_info_pool_concat_of_generated() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let g1 = SourceInfo::generated(By::filter("/a.lua", 1)); + let g2 = SourceInfo::generated(By::filter("/b.lua", 2)); + let concat = SourceInfo::concat(vec![(g1, 5), (g2, 7)]); + + let id = serializer.intern(&concat); + // Two Generated entries (0, 1) + Concat (2). + assert_eq!(id, 2); + assert!(matches!( + serializer.pool[0].mapping, + SerializableSourceMapping::Generated { .. } + )); + assert!(matches!( + serializer.pool[1].mapping, + SerializableSourceMapping::Generated { .. 
} + )); + match &serializer.pool[2].mapping { + SerializableSourceMapping::Concat { pieces } => { + assert_eq!(pieces.len(), 2); + assert_eq!(pieces[0].source_info_id, 0); + assert_eq!(pieces[1].source_info_id, 1); + } + _ => panic!("Expected Concat"), + } + } + + /// `Substring { parent: Arc, ... }` interns the Generated + /// parent first; the Substring references it by ID. + #[test] + fn test_source_info_pool_substring_of_generated() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let parent = Arc::new(SourceInfo::generated(By::filter("/x.lua", 1))); + let child = SourceInfo::Substring { + parent: Arc::clone(&parent), + start_offset: 0, + end_offset: 4, + }; + let id = serializer.intern(&child); + + assert_eq!(id, 1); + assert!(matches!( + serializer.pool[0].mapping, + SerializableSourceMapping::Generated { .. } + )); + match &serializer.pool[1].mapping { + SerializableSourceMapping::Substring { parent_id } => { + assert_eq!(*parent_id, 0); + } + _ => panic!("Expected Substring"), + } + } + + /// `to_json` emits the Generated entry as `{"t":4, "r":[0,0], "d": ...}` + /// with the expected `by`/`from` shape. 
+ #[test] + fn test_to_json_generated_emits_code_4() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let target = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 5, + end_offset: 12, + }); + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&target))); + let gen_info = SourceInfo::Generated { + by: By::shortcode("meta"), + from, + }; + let _ = serializer.intern(&gen_info); + + let gen_entry_json = serializer.pool[1].to_json(); + assert_eq!(gen_entry_json.t, 4); + assert_eq!(gen_entry_json.r, [0, 0]); + + // Expected wire shape: + // { "by": { "kind": "shortcode", "data": { "name": "meta" } }, + // "from": [ { "role": "invocation", "si_id": 0 } ] } + let expected = json!({ + "by": { "kind": "shortcode", "data": { "name": "meta" } }, + "from": [ { "role": "invocation", "si_id": 0 } ] + }); + assert_eq!(gen_entry_json.d, expected); + } + + /// `to_json` skips `"data"` when `by.data` is null and skips `"from"` + /// when the anchor list is empty. + #[test] + fn test_to_json_generated_skips_null_data_and_empty_from() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let gen_info = SourceInfo::generated(By::sectionize()); + let _ = serializer.intern(&gen_info); + let entry_json = serializer.pool[0].to_json(); + assert_eq!(entry_json.t, 4); + // Exactly: { "by": { "kind": "sectionize" } } — no data, no from. + let expected = json!({ "by": { "kind": "sectionize" } }); + assert_eq!(entry_json.d, expected); + } + + /// AnchorRole wire strings via the writer's `serialize_anchor_role` — + /// every known role plus an extension-defined `Other` maps to its expected string.
+ #[test] + fn test_serialize_anchor_role_all_roles() { + assert_eq!(serialize_anchor_role(&AnchorRole::Invocation), "invocation"); + assert_eq!( + serialize_anchor_role(&AnchorRole::ValueSource), + "value-source" + ); + assert_eq!( + serialize_anchor_role(&AnchorRole::Other("ext/foo/bar".to_string())), + "other:ext/foo/bar" + ); + } } diff --git a/crates/pampa/tests/incremental_writer_tests.rs b/crates/pampa/tests/incremental_writer_tests.rs index d8b782299..c1437eb4d 100644 --- a/crates/pampa/tests/incremental_writer_tests.rs +++ b/crates/pampa/tests/incremental_writer_tests.rs @@ -53,8 +53,10 @@ fn read_json(json: &str) -> Pandoc { .0 } -/// Simulate the WASM incremental_write_qmd path: -/// 1. Parse original_qmd to get original_ast with accurate source spans +/// Simulate the WASM incremental_write_qmd path (Plan 7 contract): +/// 1. Parse original_qmd to get the baseline AST with accurate source spans +/// (in the real bridge the caller supplies this; here we synthesize it +/// from the qmd to keep the helper self-contained) /// 2. JSON round-trip the new_ast (simulates client serialization/deserialization) /// 3. Compute reconciliation plan and run incremental_write fn incremental_write_via_json_roundtrip(original_qmd: &str, new_ast: &Pandoc) -> String { @@ -64,6 +66,7 @@ fn incremental_write_via_json_roundtrip(original_qmd: &str, new_ast: &Pandoc) -> let plan = compute_reconciliation(&original_ast, &new_ast_from_json); writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast_from_json, &plan) .expect("incremental_write failed") + .0 } // ============================================================================= @@ -89,7 +92,8 @@ fn assert_idempotent(input: &str) { } let result = writers::incremental::incremental_write(input, &ast, &ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert_eq!( result, input, @@ -293,6 +297,568 @@ A paragraph. 
); } +// ============================================================================= +// Sectionize wrapper soft-drop (incremental.rs RecurseIntoContainer regression) +// ============================================================================= +// +// The post-q2-preview-pipeline AST wraps all user content in a single +// top-level `Block::Div` with `SourceInfo::Generated { by: sectionize }` +// (no Invocation anchor). When the React side mutates a child Para and +// posts the new AST, the reconciler aligns "1 Div : 1 Div" as a +// `RecurseIntoContainer`. The Plan 7 soft-drop guard in coarsen +// (`incremental.rs:342`) trips because `is_editable_inside_block` on a +// no-preimage Generated wrapper returns false — and since the *whole* +// document is the wrapper, the resulting `CoarsenedEntry::Omit` +// produces an empty document. +// +// The correct behavior: recurse Transparent into the wrapper's +// source-bearing children using `block_container_plans[result_idx]`, +// the same way `coarsen_keep_before_block` handles unchanged +// non-atomic Generated wrappers (`incremental.rs:459-479`). + +/// Construct a `Pandoc` whose first (and only) top-level block is a +/// `Generated { by: sectionize }` Div wrapping the parsed AST of the +/// supplied qmd. The inner blocks retain their original Source positions. 
+fn wrap_in_sectionize_div(parsed: pampa::pandoc::Pandoc) -> pampa::pandoc::Pandoc { + use pampa::pandoc::Block; + let wrapper_si = quarto_source_map::SourceInfo::generated(quarto_source_map::By::sectionize()); + let wrapper = Block::Div(pampa::pandoc::Div { + attr: ( + String::new(), + vec!["section".to_string()], + hashlink::LinkedHashMap::new(), + ), + content: parsed.blocks, + source_info: wrapper_si, + attr_source: pampa::pandoc::attr::AttrSourceInfo::empty(), + }); + pampa::pandoc::Pandoc { + blocks: vec![wrapper], + ..parsed + } +} + +#[test] +fn sectionize_wrapper_with_inner_para_edit_produces_nonempty_output() { + // Original qmd: a header followed by a paragraph. + let original_qmd = "# Heading\n\nA paragraph that the user will edit.\n"; + + // Baseline AST mirrors the post-pipeline shape: the whole document + // wrapped in a sectionize Div. + let baseline_ast = wrap_in_sectionize_div(parse_qmd(original_qmd)); + + // New AST: copy baseline, dive into the Div's content, append a + // reaction Span to the inner Paragraph (mirrors comment.tsx's + // addReaction path). 
+ let mut new_ast = baseline_ast.clone(); + { + let pampa::pandoc::Block::Div(ref mut div) = new_ast.blocks[0] else { + panic!("expected wrapper Div at blocks[0]"); + }; + let last_idx = div + .content + .iter() + .rposition(|b| matches!(b, pampa::pandoc::Block::Paragraph(_))) + .expect("paragraph inside wrapper"); + if let pampa::pandoc::Block::Paragraph(ref mut p) = div.content[last_idx] { + let attr = ( + String::new(), + vec!["quarto-edit-comment".to_string()], + hashlink::LinkedHashMap::new(), + ); + p.content + .push(pampa::pandoc::Inline::Span(pampa::pandoc::Span { + attr, + content: vec![pampa::pandoc::Inline::Str(pampa::pandoc::Str { + text: "🎉".to_string(), + source_info: quarto_source_map::SourceInfo::default(), + })], + source_info: quarto_source_map::SourceInfo::default(), + attr_source: pampa::pandoc::attr::AttrSourceInfo::empty(), + })); + } + } + + let plan = compute_reconciliation(&baseline_ast, &new_ast); + let (result_qmd, warnings) = + writers::incremental::incremental_write(original_qmd, &baseline_ast, &new_ast, &plan) + .expect("incremental_write Ok arm"); + + assert!( + !result_qmd.is_empty(), + "sectionize-wrapper with inner Para edit yielded empty qmd \ + (warnings: {})", + warnings.len() + ); + + // The user's appended reaction should land in the inner Para; the + // wrapper itself should not re-emit any synthetic bytes. + assert!( + result_qmd.contains("[>> 🎉]"), + "expected reaction span [>> 🎉] in result; got:\n{}", + result_qmd + ); + // Unchanged Header (the orig blocks[0] inside the wrapper) should + // also be preserved. 
+ assert!( + result_qmd.contains("# Heading"), + "expected '# Heading' (unchanged sibling inside wrapper) in result; got:\n{:?}", + result_qmd + ); +} + +#[test] +fn sectionize_wrapper_preserves_frontmatter_after_inner_edit() { + // Reproduce the second-order bug: when the post-pipeline AST wraps + // the user content in a top-level sectionize Div, the writer's + // `emit_metadata_prefix` reads `blocks[0].start_offset()` to decide + // where the metadata region ends. The wrapper's start_offset is 0 + // (Generated, no preimage), so the function concludes "no metadata" + // and deletes the YAML frontmatter from the output. + let original_qmd = "\ +--- +format: q2-preview +render-components: + - comment.tsx +--- + +# Heading + +A paragraph that the user will edit. +"; + + let baseline_ast = wrap_in_sectionize_div(parse_qmd(original_qmd)); + + let mut new_ast = baseline_ast.clone(); + { + let pampa::pandoc::Block::Div(ref mut div) = new_ast.blocks[0] else { + panic!("expected wrapper Div at blocks[0]"); + }; + let para_idx = div + .content + .iter() + .rposition(|b| matches!(b, pampa::pandoc::Block::Paragraph(_))) + .expect("paragraph inside wrapper"); + if let pampa::pandoc::Block::Paragraph(ref mut p) = div.content[para_idx] { + let attr = ( + String::new(), + vec!["quarto-edit-comment".to_string()], + hashlink::LinkedHashMap::new(), + ); + p.content + .push(pampa::pandoc::Inline::Span(pampa::pandoc::Span { + attr, + content: vec![pampa::pandoc::Inline::Str(pampa::pandoc::Str { + text: "🎉".to_string(), + source_info: quarto_source_map::SourceInfo::default(), + })], + source_info: quarto_source_map::SourceInfo::default(), + attr_source: pampa::pandoc::attr::AttrSourceInfo::empty(), + })); + } + } + + let plan = compute_reconciliation(&baseline_ast, &new_ast); + let (result_qmd, _warnings) = + writers::incremental::incremental_write(original_qmd, &baseline_ast, &new_ast, &plan) + .expect("incremental_write Ok arm"); + + assert!( + result_qmd + 
.starts_with("---\nformat: q2-preview\nrender-components:\n - comment.tsx\n---\n"), + "frontmatter deleted from output. result:\n{}", + result_qmd, + ); + // And the edit still lands inside the wrapper's child. + assert!( + result_qmd.contains("[>> 🎉]"), + "expected reaction span in result; got:\n{}", + result_qmd + ); +} + +#[test] +fn sectionize_wrapper_with_shortcode_child_edit_does_not_panic() { + // Discovered 2026-05-25 during the TS-gate-bypass UX experiment. + // When the framework's atomic-aware NOOP gate is disabled, + // edits to shortcode-resolved content (e.g. inside + // `{{< lipsum 3 >}}`) reach the writer. The writer's + // RecurseIntoContainer arm for the top-level sectionize wrapper + // descends via the Transparent recursion (commit bdcfdc53), + // which calls coarsen_blocks on the wrapper's children with a + // CHILD-RELATIVE plan. Inside that recursion, the existing + // `coarsen_keep_before_block` catch-all (~line 484) emits + // `Rewrite { new_idx: result_idx }` — but result_idx is the + // child-relative index, not the top-level index. `emit_entries` + // later does `new_ast.blocks[*new_idx]` (top-level) and panics + // with "index out of bounds". + // + // The doc-comment on coarsen_keep_before_block explicitly notes + // this is "not exercised by today's synthesizers" — true before + // the Transparent recursion was added, no longer true now. + // + // This test pins the panic so the architectural fix (carry the + // text on the Rewrite entry instead of an index, mirroring + // InlineSplice's pattern) has a regression target. + use pampa::pandoc::{Block, Header, Inline, Pandoc, Paragraph, Span, Str}; + use quarto_pandoc_types::{AttrSourceInfo, ConfigValue}; + use quarto_source_map::{AnchorRole, By, FileId, SourceInfo}; + use std::sync::Arc; + + const TARGET: FileId = FileId(0); + // Original qmd byte ranges are illustrative; the source text is + // long enough to contain all the byte ranges referenced below. 
+ let original_qmd = "# Heading\n\n{{< lipsum 3 >}}\n\nMore text.\n"; + + // Build the lipsum shortcode token's anchor (Original in target). + let token_si = SourceInfo::original(TARGET, 11, 27); // "{{< lipsum 3 >}}" + + // Construct a Generated{shortcode} Para representing one of + // lipsum's resolved paragraphs. + let mut lipsum_si = SourceInfo::generated(By::shortcode("lipsum")); + lipsum_si.append_anchor(AnchorRole::Invocation, Arc::new(token_si.clone())); + + // Also construct a child Para that has NEITHER preimage in + // target NOR a recognized Generated kind: an Original Para from + // a DIFFERENT file. This is the cross-file-Original case that + // coarsen_keep_before_block's catch-all falls through to. + // (Pre-Plan-8 the AST didn't carry these; the panic the user + // observed must hit a different shape — but the structural + // failure is the same: a Rewrite emitted inside a Transparent + // wrapper.) + let other_file_para_si = SourceInfo::original(FileId(1), 0, 10); + + fn make_header(level: usize, text: &str, si: SourceInfo) -> Block { + Block::Header(Header { + level, + attr: (String::new(), Vec::new(), hashlink::LinkedHashMap::new()), + content: vec![Inline::Str(Str { + text: text.to_string(), + source_info: SourceInfo::default(), + })], + source_info: si, + attr_source: AttrSourceInfo::empty(), + }) + } + fn make_para(text: &str, si: SourceInfo) -> Block { + Block::Paragraph(Paragraph { + content: vec![Inline::Str(Str { + text: text.to_string(), + source_info: SourceInfo::default(), + })], + source_info: si, + }) + } + + // Wrapper children: Header + cross-file Para + lipsum Para. 
+ let header = make_header(1, "Heading", SourceInfo::original(TARGET, 0, 9)); + let other_file_para = make_para("Cross", other_file_para_si); + let lipsum_para = make_para("Lorem ipsum…", lipsum_si.clone()); + let original = wrap_in_sectionize_div(Pandoc { + blocks: vec![header.clone(), other_file_para.clone(), lipsum_para], + meta: ConfigValue::default(), + }); + + // User clicks +react on the lipsum Para — append a Span to its + // inlines. The cross-file Para and Header are unchanged. + let mut lipsum_para_new = make_para("Lorem ipsum…", lipsum_si); + if let Block::Paragraph(ref mut p) = lipsum_para_new { + p.content.push(Inline::Span(Span { + attr: ( + String::new(), + vec!["quarto-edit-comment".to_string()], + hashlink::LinkedHashMap::new(), + ), + content: vec![Inline::Str(Str { + text: "🎉".to_string(), + source_info: SourceInfo::default(), + })], + source_info: SourceInfo::default(), + attr_source: AttrSourceInfo::empty(), + })); + } + let new = wrap_in_sectionize_div(Pandoc { + blocks: vec![header, other_file_para, lipsum_para_new], + meta: ConfigValue::default(), + }); + + let plan = compute_reconciliation(&original, &new); + + // Before the architectural fix: panics with + // "index out of bounds: the len is 1 but the index is N". + // After the fix: returns Ok. (This test does NOT assert on + // output bytes — see `sectionize_wrapper_shortcode_child_edit_soft_drops` + // for the byte-level expectation.) + let result = writers::incremental::incremental_write(original_qmd, &original, &new, &plan); + assert!( + result.is_ok(), + "incremental_write should not panic on a sectionize wrapper containing \ + a cross-file child + a shortcode child + an inline edit; got {:?}", + result.err() + ); +} + +#[test] +fn sectionize_wrapper_shortcode_child_edit_soft_drops() { + // The user clicks +react on a paragraph inside `{{< lipsum 3 >}}` + // with the framework's atomic-aware NOOP gate bypassed. 
The + // shortcode resolution is atomic-kind Generated; the inline edit + // has no source-side knob (the user's source is the token, not + // the resolved bytes). The writer must: + // + // (a) preserve the `{{< lipsum 3 >}}` token bytes in the qmd + // (b) NOT emit the resolved bytes / the reactji + // (c) surface a Q-3-42 or Q-3-43 warning so the UI can show + // a Monaco squiggle on the token line + // + // Two alignment shapes can reach the lipsum Para at child level + // of a Transparent (sectionize) recursion: + // + // 1. `RecurseIntoContainer { lipsum_idx, lipsum_idx }` — + // reconciler matches the original and the new structurally. + // Hits the existing soft-drop cascade priority 1 + // (preimage_in → Verbatim of token bytes). Works today. + // + // 2. `UseAfter(lipsum_idx)` (paired with a KeepBefore on the + // previous original) — reconciler can't pair the original + // and the new and treats it as a wholesale replacement. + // Falls through to let-user-win Rewrite (the writer emits + // the new block's resolved bytes verbatim). That's wrong + // for atomic-Generated with preimage. + // + // This test exercises shape #2 by giving the new Para a + // SourceInfo::default() (simulating a React-side wholesale + // replacement that loses provenance), then asserts the soft-drop + // outcome. Pre-fix: the resolved bytes leak into the qmd. Post- + // fix: the token is preserved + Q-3-42/43 fires. 
+ use pampa::pandoc::{Block, Header, Inline, Pandoc, Paragraph, Span, Str}; + use quarto_pandoc_types::{AttrSourceInfo, ConfigValue}; + use quarto_source_map::{AnchorRole, By, FileId, SourceInfo}; + use std::sync::Arc; + + const TARGET: FileId = FileId(0); + let original_qmd = "# Heading\n\n{{< lipsum 3 >}}\n"; + + let token_si = SourceInfo::original(TARGET, 11, 27); + let mut lipsum_si = SourceInfo::generated(By::shortcode("lipsum")); + lipsum_si.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + + fn make_header(level: usize, text: &str, si: SourceInfo) -> Block { + Block::Header(Header { + level, + attr: (String::new(), Vec::new(), hashlink::LinkedHashMap::new()), + content: vec![Inline::Str(Str { + text: text.to_string(), + source_info: SourceInfo::default(), + })], + source_info: si, + attr_source: AttrSourceInfo::empty(), + }) + } + fn make_para_with_text(text: &str, si: SourceInfo) -> Block { + Block::Paragraph(Paragraph { + content: vec![Inline::Str(Str { + text: text.to_string(), + source_info: SourceInfo::default(), + })], + source_info: si, + }) + } + + let header = make_header(1, "Heading", SourceInfo::original(TARGET, 0, 9)); + + // Original lipsum paragraph carries the shortcode anchor. + let lipsum_orig = make_para_with_text( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + lipsum_si.clone(), + ); + let original = wrap_in_sectionize_div(Pandoc { + blocks: vec![header.clone(), lipsum_orig], + meta: ConfigValue::default(), + }); + + // New lipsum paragraph: different inline content + reactji Span, + // but source_info IS preserved (matches what the React framework + // does when constructing the post-edit AST — block source_info + // is inherited from the original). 
+ let mut lipsum_new = make_para_with_text("Etiam maximus accumsan gravida.", lipsum_si.clone()); + if let Block::Paragraph(ref mut p) = lipsum_new { + p.content.push(Inline::Span(Span { + attr: ( + String::new(), + vec!["quarto-edit-comment".to_string()], + hashlink::LinkedHashMap::new(), + ), + content: vec![Inline::Str(Str { + text: "🎉".to_string(), + source_info: SourceInfo::default(), + })], + source_info: SourceInfo::default(), + attr_source: AttrSourceInfo::empty(), + })); + } + let new = wrap_in_sectionize_div(Pandoc { + blocks: vec![header, lipsum_new], + meta: ConfigValue::default(), + }); + + let plan = compute_reconciliation(&original, &new); + eprintln!("plan = {:#?}", plan); + + let (qmd, warnings) = + writers::incremental::incremental_write(original_qmd, &original, &new, &plan) + .expect("write should succeed"); + eprintln!("--- qmd ---\n{}\n--- end ---", qmd); + eprintln!("--- warnings ({}) ---", warnings.len()); + for w in &warnings { + eprintln!(" code={:?} title={:?}", w.code, w.title); + } + + // (a) token bytes preserved. + assert!( + qmd.contains("{{< lipsum 3 >}}"), + "qmd should preserve the lipsum token bytes; got: {:?}", + qmd + ); + // (b) reactji NOT emitted. + assert!( + !qmd.contains("🎉"), + "qmd should NOT contain the user's reactji; got: {:?}", + qmd + ); + // (b cont.) resolved bytes (the new Para's text) NOT emitted. + assert!( + !qmd.contains("Etiam maximus accumsan"), + "qmd should NOT contain the new Para's resolved-shortcode bytes; \ + got: {:?}", + qmd + ); + // (c) Q-3-42 or Q-3-43 warning fired. 
+ let saw_soft_drop = warnings + .iter() + .any(|w| matches!(w.code.as_deref(), Some("Q-3-42") | Some("Q-3-43"))); + assert!( + saw_soft_drop, + "expected a Q-3-42 or Q-3-43 soft-drop warning; got: {:?}", + warnings.iter().map(|w| &w.code).collect::>() + ); +} + +// --- target_file_id derivation skips no-root_file_id first blocks --- +// +// Plan 7c Phase 8 — `coarsen`'s `target_file_id` is derived from the +// first block whose `root_file_id()` resolves to `Some`. A synthesized +// title-block (or sectionize wrapper) at `blocks[0]` with no +// `Invocation` anchor returns `None`, so the writer needs to skip past +// it and look at later blocks. Pre-fix, the fallback to `FileId(0)` +// would make `preimage_in(target)` return `None` for every real block +// at `FileId(N != 0)` — i.e. all editability checks fail and edits +// silently soft-drop. + +#[test] +fn target_file_id_skips_synthesized_first_block() { + use pampa::pandoc::{Block, Header, Pandoc, Paragraph, Str}; + use quarto_pandoc_types::{AttrSourceInfo, ConfigValue}; + use quarto_source_map::{By, FileId, SourceInfo}; + + // blocks[0] = synthesized title-block Header (Generated, no + // Invocation). blocks[1] = real Paragraph at FileId(7). + const REAL_FILE: FileId = FileId(7); + let title_block = Block::Header(Header { + level: 1, + attr: (String::new(), Vec::new(), hashlink::LinkedHashMap::new()), + content: vec![pampa::pandoc::Inline::Str(Str { + text: "Synthesized title".to_string(), + source_info: SourceInfo::default(), + })], + source_info: SourceInfo::generated(By::title_block()), + attr_source: AttrSourceInfo::empty(), + }); + // Real Para holds two Strs, both at FileId(7). The user edit + // mutates the second Str so the reconciler emits a + // RecurseIntoContainer with an inline plan. 
That path checks + // `is_editable_inside_block` on the orig Para, which in turn + // calls `preimage_in(target_file_id)` — and that's where a wrong + // `target_file_id` (FileId(0) fallback) makes the editability + // check return false and the writer soft-drops with Q-3-43. + let original_qmd = "Real text"; + let real_para_orig = Block::Paragraph(Paragraph { + content: vec![ + pampa::pandoc::Inline::Str(Str { + text: "Real".to_string(), + source_info: SourceInfo::original(REAL_FILE, 0, 4), + }), + pampa::pandoc::Inline::Space(pampa::pandoc::Space { + source_info: SourceInfo::original(REAL_FILE, 4, 5), + }), + pampa::pandoc::Inline::Str(Str { + text: "text".to_string(), + source_info: SourceInfo::original(REAL_FILE, 5, 9), + }), + ], + source_info: SourceInfo::original(REAL_FILE, 0, 9), + }); + // Mutated Para: replace the second Str with a new (no-source) Str. + let real_para_mut = Block::Paragraph(Paragraph { + content: vec![ + pampa::pandoc::Inline::Str(Str { + text: "Real".to_string(), + source_info: SourceInfo::original(REAL_FILE, 0, 4), + }), + pampa::pandoc::Inline::Space(pampa::pandoc::Space { + source_info: SourceInfo::original(REAL_FILE, 4, 5), + }), + pampa::pandoc::Inline::Str(Str { + text: "edited".to_string(), + source_info: SourceInfo::default(), + }), + ], + source_info: SourceInfo::original(REAL_FILE, 0, 9), + }); + let orig = Pandoc { + blocks: vec![title_block.clone(), real_para_orig], + meta: ConfigValue::default(), + }; + let new = Pandoc { + blocks: vec![title_block, real_para_mut], + meta: ConfigValue::default(), + }; + + let plan = compute_reconciliation(&orig, &new); + let (_qmd, warnings) = + writers::incremental::incremental_write(original_qmd, &orig, &new, &plan) + .expect("incremental_write Ok arm"); + + // Pre-fix target_file_id falls back to FileId(0); preimage_in(0) + // on REAL_FILE-Original Para returns None; coarsen's + // RecurseIntoContainer arm soft-drops with Q-3-43 ("Generated + // content edit dropped"). 
Post-fix target_file_id resolves to + // REAL_FILE and the inline edit proceeds without a warning. + assert!( + warnings.is_empty(), + "expected no soft-drop warnings; got: {:?}", + warnings.iter().map(|w| &w.title).collect::>() + ); +} + +#[test] +fn target_file_id_defaults_to_zero_for_empty_document() { + // Empty `blocks` — the fallback to `FileId(0)` should fire. + // Driving an identity reconcile on an empty AST should produce a + // no-op write without warnings or panics. + use pampa::pandoc::Pandoc; + use quarto_pandoc_types::ConfigValue; + let ast = Pandoc { + blocks: vec![], + meta: ConfigValue::default(), + }; + let plan = compute_reconciliation(&ast, &ast); + let (result, warnings) = writers::incremental::incremental_write("", &ast, &ast, &plan) + .expect("incremental_write Ok arm on empty document"); + assert_eq!(result, ""); + assert!(warnings.is_empty()); +} + // --- Mixed documents --- #[test] @@ -358,7 +924,8 @@ fn assert_roundtrip(original_qmd: &str, new_qmd: &str) { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // Verify the result round-trips: read(result) should match new_ast structurally let result_ast = parse_qmd(&result); @@ -574,7 +1141,8 @@ fn roundtrip_auto_id_change_no_explicit_id_in_output() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // Should NOT contain an explicit ID attribute — auto-generated IDs stay implicit assert!( @@ -599,7 +1167,8 @@ fn verbatim_preservation_unchanged_blocks() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - 
.expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // The first and third paragraphs should be byte-for-byte identical assert!( @@ -977,7 +1546,8 @@ fn assert_equivalent_to_full_writer(original_qmd: &str, new_qmd: &str) { let plan = compute_reconciliation(&original_ast, &new_ast); let incremental_result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; let full_result = write_qmd(&new_ast); @@ -1045,7 +1615,8 @@ fn assert_verbatim_preservation(blocks: &[String], mutate_idx: usize, new_block: let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(&original, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // For each unchanged block, verify its text appears verbatim in the result. // We check by finding the original block text in the result string. @@ -1092,7 +1663,7 @@ fn assert_edits_monotonic(original_qmd: &str, new_qmd: &str) { let new_ast = parse_qmd(new_qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let edits = writers::incremental::compute_incremental_edits( + let (edits, _warnings) = writers::incremental::compute_incremental_edits( original_qmd, &original_ast, &new_ast, @@ -1144,7 +1715,7 @@ proptest! 
{ // Identity case: should produce zero edits let ast = parse_qmd(&qmd); let plan = compute_reconciliation(&ast, &ast); - let edits = + let (edits, _warnings) = writers::incremental::compute_incremental_edits(&qmd, &ast, &ast, &plan) .expect("compute_incremental_edits failed"); prop_assert!( @@ -1327,7 +1898,8 @@ fn comment_preserved_when_adjacent_block_changes() { let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1363,7 +1935,8 @@ fn comment_preserved_when_containing_paragraph_rewritten() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1384,7 +1957,8 @@ fn comment_inside_blockquote_preserved_on_rewrite() { let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1405,7 +1979,8 @@ fn comment_block_preserved_when_blocks_added() { let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1478,7 +2053,8 @@ fn multiline_comment_preserved_on_rewrite() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1498,7 +2074,8 @@ fn multiline_block_comment_preserved_on_adjacent_change() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = 
writers::incremental::incremental_write(original, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), diff --git a/crates/pampa/tests/inline_splice_integration_tests.rs b/crates/pampa/tests/inline_splice_integration_tests.rs index 30e6cb370..fd3d0746f 100644 --- a/crates/pampa/tests/inline_splice_integration_tests.rs +++ b/crates/pampa/tests/inline_splice_integration_tests.rs @@ -102,7 +102,8 @@ fn assert_incremental_write_correct(original_qmd: &str, new_ast: &Pandoc) { let result = writers::incremental::incremental_write(original_qmd, &original_ast, new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // Verify round-trip: parsing the result should produce an AST structurally // equivalent to new_ast @@ -137,7 +138,8 @@ fn splice_str_change_in_paragraph() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "Goodbye world.\n"); } @@ -154,7 +156,8 @@ fn splice_str_change_preserves_surrounding_text() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "The slow brown fox.\n"); } @@ -179,7 +182,8 @@ fn splice_str_change_in_header() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; // The header prefix "## " should be preserved assert_eq!(result, "## Goodbye World\n"); } @@ -202,7 +206,8 @@ fn splice_str_change_in_multiline_paragraph() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, 
&new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "Goodbye\nworld\n"); } @@ -242,7 +247,8 @@ fn splice_str_change_in_multiline_blockquote() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "> Goodbye\n> world\n"); } @@ -280,7 +286,8 @@ fn splice_str_change_in_multiline_bulletlist() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; // The list continuation indent should be preserved assert_eq!(result, "* Goodbye\n world\n"); } @@ -302,7 +309,8 @@ fn splice_preserves_other_blocks() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!( result, "First paragraph.\n\nModified paragraph.\n\nThird paragraph.\n" @@ -367,7 +375,8 @@ fn splice_str_change_inside_emphasis() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; // The emphasis delimiters should be preserved from original source assert_eq!(result, "*Goodbye* world.\n"); } @@ -392,7 +401,8 @@ fn splice_str_change_inside_strong() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "**Goodbye** world.\n"); } @@ -405,7 +415,9 @@ fn splice_idempotent_simple_paragraph() { let original_qmd = "Hello world.\n"; let ast = parse_qmd(original_qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(original_qmd, &ast, 
&ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(original_qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, original_qmd); } @@ -414,6 +426,8 @@ fn splice_idempotent_blockquote_multiline() { let original_qmd = "> Hello\n> world\n"; let ast = parse_qmd(original_qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(original_qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(original_qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, original_qmd); } diff --git a/crates/pampa/tests/inline_splice_property_tests.rs b/crates/pampa/tests/inline_splice_property_tests.rs index 6219b8c4d..c63c774e8 100644 --- a/crates/pampa/tests/inline_splice_property_tests.rs +++ b/crates/pampa/tests/inline_splice_property_tests.rs @@ -169,7 +169,8 @@ fn assert_inline_roundtrip(original_qmd: &str, new_ast: &Pandoc) { let result = writers::incremental::incremental_write(original_qmd, &original_ast, new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // Round-trip: parse result, write both to QMD, compare let result_ast = parse_qmd(&result); @@ -192,7 +193,8 @@ fn assert_splice_equivalent_to_full_writer(original_qmd: &str, new_ast: &Pandoc) let incremental_result = writers::incremental::incremental_write(original_qmd, &original_ast, new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; let full_result = write_qmd(new_ast); @@ -235,7 +237,8 @@ fn assert_inline_locality(original_qmd: &str, new_ast: &Pandoc, changed_block_id let result = writers::incremental::incremental_write(original_qmd, &original_ast, new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // For each unchanged block, verify its text appears in the result. 
for (i, block) in original_ast.blocks.iter().enumerate() { @@ -546,7 +549,9 @@ fn prop7_idempotent_paragraph_with_emphasis() { let qmd = "*Hello* world.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, qmd); } @@ -555,7 +560,9 @@ fn prop7_idempotent_paragraph_with_strong() { let qmd = "**Hello** world.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, qmd); } @@ -564,7 +571,9 @@ fn prop7_idempotent_paragraph_with_code() { let qmd = "Use `code` here.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, qmd); } @@ -573,7 +582,9 @@ fn prop7_idempotent_mixed_inline_formatting() { let qmd = "Normal *emph* **strong** `code` end.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, qmd); } @@ -582,7 +593,9 @@ fn prop7_idempotent_multiline_blockquote_with_emphasis() { let qmd = "> *Hello*\n> world.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; 
assert_eq!(result, qmd); } @@ -592,7 +605,7 @@ proptest! { let ast = parse_qmd(&qmd); let plan = compute_reconciliation(&ast, &ast); let result = - writers::incremental::incremental_write(&qmd, &ast, &ast, &plan).unwrap(); + writers::incremental::incremental_write(&qmd, &ast, &ast, &plan).unwrap().0; prop_assert_eq!(result, qmd); } } @@ -688,8 +701,9 @@ fn prop9_no_newlines_in_splice_simple() { ); // Verify the incremental write result - let result = - writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan) + .unwrap() + .0; // The result should be correct assert_eq!(result, "Goodbye world.\n"); @@ -711,8 +725,9 @@ fn prop9_no_newlines_in_blockquote_splice() { let original_ast = parse_qmd(qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let result = - writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan) + .unwrap() + .0; // Verify the result parses correctly (critical for indentation contexts) assert_inline_roundtrip(qmd, &new_ast); @@ -736,8 +751,9 @@ fn prop9_no_newlines_in_multiline_blockquote_splice() { let original_ast = parse_qmd(qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let result = - writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan) + .unwrap() + .0; // The > prefix after the SoftBreak must be preserved assert_eq!(result, "> Goodbye\n> world.\n"); @@ -896,7 +912,7 @@ fn stress_many_blocks_single_change() { // Verify the edits are small (Property 8 / locality) let original_ast = parse_qmd(&qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let edits = + let (edits, _warnings) = 
writers::incremental::compute_incremental_edits(&qmd, &original_ast, &new_ast, &plan) .unwrap(); @@ -997,7 +1013,7 @@ proptest! { let original_ast = parse_qmd(&qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let edits = writers::incremental::compute_incremental_edits( + let (edits, _warnings) = writers::incremental::compute_incremental_edits( &qmd, &original_ast, &new_ast, diff --git a/crates/pampa/tests/json_reader_smoke_tests.rs b/crates/pampa/tests/json_reader_smoke_tests.rs index 1aa48514a..d5416b647 100644 --- a/crates/pampa/tests/json_reader_smoke_tests.rs +++ b/crates/pampa/tests/json_reader_smoke_tests.rs @@ -1,6 +1,12 @@ +use pampa::pandoc::{Block, Inline, Pandoc, Plain, Str}; use pampa::readers::json; +use pampa::writers::json as json_writer; +use quarto_source_map::{Anchor, AnchorRole, By, FileId, SourceInfo}; +use smallvec::SmallVec; use std::fs; +use std::io::Cursor; use std::path::PathBuf; +use std::sync::Arc; #[test] fn test_read_all_json_files_in_tests_readers() { @@ -79,3 +85,179 @@ fn test_manybullets_json_specifically() { _ => panic!("Expected OrderedList block"), } } + +// ---------------------------------------------------------------- +// Plan 5 — End-to-end round-trip through the streaming writer +// and the public reader API. +// +// These tests exercise the *production* JSON path: +// `pampa::writers::json::write` → bytes → `pampa::readers::json::read`. +// The writer's streaming arm (`stream_write_source_info_pool`) is what +// the orchestrator uses, so a regression here is exactly what bd-3odjm +// surfaced. The hand-constructed reader/writer unit tests live next to +// their respective modules; these tests guard the wire. +// ---------------------------------------------------------------- + +/// Round-trip a single `Pandoc` through the streaming writer and the +/// reader. Returns the recovered `source_info` of the inner `Str`. 
+fn roundtrip_str_source_info(str_source_info: SourceInfo) -> SourceInfo { + let mut pandoc = Pandoc::default(); + let inner = Inline::Str(Str { + text: "hi".to_string(), + source_info: str_source_info, + }); + let plain = Plain { + content: vec![inner], + source_info: SourceInfo::default(), + }; + pandoc.blocks.push(Block::Plain(plain)); + + let context = pampa::pandoc::ASTContext::anonymous(); + let mut buf = Vec::new(); + json_writer::write(&pandoc, &context, &mut buf).expect("write_pandoc"); + + let mut cursor = Cursor::new(&buf); + let (round, _ctx) = json::read(&mut cursor).expect("read_pandoc"); + + let Block::Plain(plain) = &round.blocks[0] else { + panic!("Expected Plain block") + }; + let Inline::Str(str_node) = &plain.content[0] else { + panic!("Expected Str inline") + }; + str_node.source_info.clone() +} + +#[test] +fn roundtrip_generated_no_anchors_via_public_api() { + let original = SourceInfo::generated(By::sectionize()); + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_generated_filter_with_data_via_public_api() { + let original = SourceInfo::generated(By::filter("/x.lua", 42)); + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_generated_with_invocation_anchor_via_public_api() { + let target = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 5, + end_offset: 12, + }); + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&target))); + let original = SourceInfo::Generated { + by: By::shortcode("meta"), + from, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_generated_with_all_anchor_roles_via_public_api() { + let mk_target = |start: usize, end: usize| { + Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: start, + end_offset: end, + }) + }; + 
let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(mk_target(0, 5))); + from.push(Anchor::value_source(mk_target(10, 20))); + from.push(Anchor { + role: AnchorRole::Other("ext/foo/bar".to_string()), + source_info: mk_target(30, 35), + }); + let original = SourceInfo::Generated { + by: By::shortcode("meta"), + from, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_concat_of_generated_via_public_api() { + let g1 = SourceInfo::generated(By::filter("/a.lua", 1)); + let g2 = SourceInfo::generated(By::filter("/b.lua", 2)); + let original = SourceInfo::concat(vec![(g1, 5), (g2, 7)]); + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_substring_of_generated_via_public_api() { + let parent = Arc::new(SourceInfo::generated(By::filter("/x.lua", 1))); + let original = SourceInfo::Substring { + parent: Arc::clone(&parent), + start_offset: 0, + end_offset: 4, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_original_via_public_api() { + let original = SourceInfo::Original { + file_id: FileId(0), + start_offset: 7, + end_offset: 12, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_substring_via_public_api() { + let parent = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 0, + end_offset: 100, + }); + let original = SourceInfo::Substring { + parent: Arc::clone(&parent), + start_offset: 10, + end_offset: 20, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +/// Streaming-writer parity: the streaming writer emits a code-4 entry +/// whose payload reads back as the same `Generated` value the writer +/// was given. 
Specifically guards `stream_write_source_info_pool`'s +/// match arms, which are independent from `to_json`'s. +#[test] +fn streaming_writer_generated_round_trip_preserves_by_data() { + let target = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 0, + end_offset: 5, + }); + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&target))); + let original = SourceInfo::Generated { + by: By::raw( + "ext/example/foo", + serde_json::json!({ + "nested": { + "n": 7, + "flag": true, + "items": [1, 2, "three"], + "empty": null + } + }), + ), + from, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} diff --git a/crates/pampa/tests/test_metadata_source_tracking.rs b/crates/pampa/tests/test_metadata_source_tracking.rs index 252621a8b..bc7e2fed9 100644 --- a/crates/pampa/tests/test_metadata_source_tracking.rs +++ b/crates/pampa/tests/test_metadata_source_tracking.rs @@ -23,8 +23,8 @@ fn resolve_source_offset(source: &quarto_source_map::SourceInfo) -> usize { // For concat, use the start offset of the first piece pieces.first().map_or(0, |p| p.offset_in_concat) } - quarto_source_map::SourceInfo::FilterProvenance { .. } => { - // Filter provenance doesn't have a traditional offset + quarto_source_map::SourceInfo::Generated { .. } => { + // Generated nodes have no offset-within-current-text. 
0 } } diff --git a/crates/quarto-ast-reconcile/Cargo.toml b/crates/quarto-ast-reconcile/Cargo.toml index 8b322fb78..57caa34cc 100644 --- a/crates/quarto-ast-reconcile/Cargo.toml +++ b/crates/quarto-ast-reconcile/Cargo.toml @@ -21,6 +21,7 @@ rustc-hash = "2.1" [dev-dependencies] proptest = "1.10" +yaml-rust2 = { workspace = true } [lints] workspace = true diff --git a/crates/quarto-ast-reconcile/src/hash.rs b/crates/quarto-ast-reconcile/src/hash.rs index f425dc22a..e734dd530 100644 --- a/crates/quarto-ast-reconcile/src/hash.rs +++ b/crates/quarto-ast-reconcile/src/hash.rs @@ -11,7 +11,7 @@ */ use quarto_pandoc_types::custom::{CustomNode, Slot}; -use quarto_pandoc_types::{Attr, Block, Inline}; +use quarto_pandoc_types::{Attr, Block, ConfigMapEntry, ConfigValue, ConfigValueKind, Inline}; use rustc_hash::FxHashMap; use std::hash::{Hash, Hasher}; use std::marker::PhantomData; @@ -488,6 +488,318 @@ fn hash_slot(slot: &Slot, cache: &mut HashCache<'_>, hasher: &mut impl Hasher) { } } +// ============================================================================= +// Meta (ConfigValue) Hashing +// ============================================================================= +// +// Idempotence checks (Plan 3) need a structural hash of the document +// `meta` field that: +// +// - excludes `source_info` and `key_source` so Plan-4 source-info +// churn doesn't affect the contract; +// - hashes `Map` entries in *insertion order* with no sort, so a +// transform that stuffs a `HashMap` into meta is *detectable* (a +// sort would silently mask that class of non-determinism — exactly +// the bug an idempotence test is meant to catch); +// - includes `merge_op` so a transform that flips merge semantics +// non-deterministically shows up; +// - recurses into `PandocInlines` / `PandocBlocks` via the existing +// inline/block hashers (which already exclude source_info). + +/// Compute a structural hash of a `ConfigValue` tree. 
+/// +/// Source-info-agnostic: skips `ConfigValue::source_info` and +/// `ConfigMapEntry::key_source`. See module-level note above for the +/// design rationale (insertion-order maps, `merge_op` participates). +pub fn compute_meta_hash_fresh(meta: &ConfigValue) -> u64 { + let mut cache = HashCache::new(); + let mut hasher = rustc_hash::FxHasher::default(); + hash_config_value(meta, &mut cache, &mut hasher); + hasher.finish() +} + +/// Compute a structural hash of a `ConfigValue` tree, excluding the +/// top-level `rendered` map entry. +/// +/// Used by the q2-preview idempotence gate: chrome transforms +/// (navbar / sidebar / footer / page-nav), `IncludeResolveStage`, the +/// favicon transform, and the Bootstrap/clipboard injection stages +/// populate `meta.rendered.*` with HTML-string side outputs. Two +/// runs may produce HTML strings whose *bytes* differ but whose +/// rendered shape is equivalent (attribute order, whitespace); that +/// case belongs to an HTML-canonicalization concern, not to the +/// pipeline-determinism contract this hash defends. +/// +/// The exclusion only applies at the document root. A `rendered` +/// key nested deeper in the tree is hashed normally — meta is +/// structured as a single top-level Map in practice, so a nested +/// `rendered` would be intentional content. +pub fn compute_meta_hash_fresh_excluding_rendered(meta: &ConfigValue) -> u64 { + let mut cache = HashCache::new(); + let mut hasher = rustc_hash::FxHasher::default(); + hash_config_value_excluding(meta, &["rendered"], &mut cache, &mut hasher); + hasher.finish() +} + +fn hash_config_value(value: &ConfigValue, cache: &mut HashCache<'_>, hasher: &mut impl Hasher) { + hash_config_value_excluding(value, &[], cache, hasher); +} + +/// Hash a `ConfigValue`, optionally skipping certain top-level map +/// keys. `top_skip` is only consulted for the `Map` variant at this +/// call's root and is not propagated into recursion: nested values +/// see an empty skip list. 
+fn hash_config_value_excluding( + value: &ConfigValue, + top_skip: &[&str], + cache: &mut HashCache<'_>, + hasher: &mut impl Hasher, +) { + // `merge_op` participates. The enum doesn't derive Hash, so + // route through its discriminant + the byte tag. + std::mem::discriminant(&value.merge_op).hash(hasher); + + hash_config_value_kind(&value.value, top_skip, cache, hasher); +} + +fn hash_config_value_kind( + kind: &ConfigValueKind, + top_skip: &[&str], + cache: &mut HashCache<'_>, + hasher: &mut impl Hasher, +) { + std::mem::discriminant(kind).hash(hasher); + + match kind { + ConfigValueKind::Scalar(yaml) => { + yaml.hash(hasher); + } + ConfigValueKind::PandocInlines(inlines) => { + hash_inlines(inlines, cache, hasher); + } + ConfigValueKind::PandocBlocks(blocks) => { + hash_blocks(blocks, cache, hasher); + } + ConfigValueKind::Path(s) | ConfigValueKind::Glob(s) | ConfigValueKind::Expr(s) => { + s.hash(hasher); + } + ConfigValueKind::Array(items) => { + items.len().hash(hasher); + for item in items { + hash_config_value(item, cache, hasher); + } + } + ConfigValueKind::Map(entries) => { + // Insertion-order, filtered by `top_skip`. Skip set is + // intentionally NOT propagated into recursion. + let kept_len = entries + .iter() + .filter(|e| !top_skip.contains(&e.key.as_str())) + .count(); + kept_len.hash(hasher); + for entry in entries { + if top_skip.contains(&entry.key.as_str()) { + continue; + } + hash_config_map_entry(entry, cache, hasher); + } + } + } +} + +fn hash_config_map_entry( + entry: &ConfigMapEntry, + cache: &mut HashCache<'_>, + hasher: &mut impl Hasher, +) { + entry.key.hash(hasher); + // `key_source` deliberately not hashed. + hash_config_value(&entry.value, cache, hasher); +} + +// ============================================================================= +// Divergence Localization +// ============================================================================= + +/// First place two documents' structural hashes diverge. 
+/// +/// Returned by [`find_first_divergence`] to make idempotence failures +/// debuggable: the test driver embeds this in its panic message so +/// the sub-agent investigation prompt arrives with "block index 7" +/// or "meta.listings.foo" instead of just "hash mismatch." +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DivergencePoint { + /// Blocks at the same index hash differently. `path` is intentionally + /// flat: we don't dig into block subtrees because the per-block hash + /// already provides enough localization for triage. + Block { + index: usize, + hash_a: u64, + hash_b: u64, + }, + /// A meta key path hashes differently. The path walks insertion + /// order through nested Maps; the last element is the leaf key + /// whose recursive hash diverges. + MetaKey { + path: Vec, + hash_a: u64, + hash_b: u64, + }, + /// The two documents' top-level hashes agree on both blocks and + /// meta. The caller should never see this if it was reached via + /// "hashes differ, find me a divergence" — it would indicate a + /// hasher bug. Returned for completeness. + None, +} + +/// Find the first structural divergence between two documents. +/// +/// `blocks` are compared in order by per-block fresh hash; the first +/// index whose hashes disagree yields a `Block` variant. If the +/// blocks all match, `meta` is walked in insertion order with the +/// same `rendered.*` exclusion the +/// [`compute_meta_hash_fresh_excluding_rendered`] hash uses; the +/// first map key whose recursive hash diverges yields a `MetaKey` +/// variant. +/// +/// This lives in `quarto-ast-reconcile` next to the hashers so the +/// localization logic shares the source-info-exclusion contract by +/// construction. The caller (Plan 3's `idempotence.rs` test driver) +/// supplies `&[Block]` + `&ConfigValue` rather than passing the +/// crate's `DocumentAst` type, which is owned by `quarto-core`. 
+pub fn find_first_divergence( + blocks_a: &[Block], + meta_a: &ConfigValue, + blocks_b: &[Block], + meta_b: &ConfigValue, +) -> DivergencePoint { + // Block walk: linear scan with the existing per-block hasher. + // If block counts differ we still report the first mismatching + // index (or the boundary index for the longer side). + let common = blocks_a.len().min(blocks_b.len()); + for index in 0..common { + let hash_a = compute_block_hash_fresh(&blocks_a[index]); + let hash_b = compute_block_hash_fresh(&blocks_b[index]); + if hash_a != hash_b { + return DivergencePoint::Block { + index, + hash_a, + hash_b, + }; + } + } + if blocks_a.len() != blocks_b.len() { + // Report the first "missing" position as a divergence at + // index `common`. We synthesize a hash for the empty side as + // 0 — it just needs to be observably different from the + // present side's hash. + let (hash_a, hash_b) = if blocks_a.len() > blocks_b.len() { + (compute_block_hash_fresh(&blocks_a[common]), 0) + } else { + (0, compute_block_hash_fresh(&blocks_b[common])) + }; + return DivergencePoint::Block { + index: common, + hash_a, + hash_b, + }; + } + + // Meta walk: recursive insertion-order traversal that excludes + // top-level `rendered`. Matches the excluding-variant hash so a + // failure reported here is reproducible from the hash itself. + if let Some(point) = find_meta_divergence(meta_a, meta_b, &["rendered"], &mut Vec::new()) { + return point; + } + + DivergencePoint::None +} + +fn find_meta_divergence( + a: &ConfigValue, + b: &ConfigValue, + top_skip: &[&str], + path: &mut Vec, +) -> Option { + // Fast path: equal recursive hashes -> no divergence in this + // subtree. + let hash_a = meta_subtree_hash(a, top_skip); + let hash_b = meta_subtree_hash(b, top_skip); + if hash_a == hash_b { + return None; + } + + // Different. Drill down through Maps in insertion order; report + // the deepest meaningful path. 
+ match (&a.value, &b.value) { + (ConfigValueKind::Map(entries_a), ConfigValueKind::Map(entries_b)) => { + for entry_a in entries_a { + if top_skip.contains(&entry_a.key.as_str()) { + continue; + } + match entries_b.iter().find(|e| e.key == entry_a.key) { + Some(entry_b) => { + path.push(entry_a.key.clone()); + if let Some(point) = + find_meta_divergence(&entry_a.value, &entry_b.value, &[], path) + { + return Some(point); + } + path.pop(); + } + None => { + // Key present in `a`, missing in `b`. Report + // as a leaf divergence at this path. + let mut full = path.clone(); + full.push(entry_a.key.clone()); + return Some(DivergencePoint::MetaKey { + path: full, + hash_a: meta_subtree_hash(&entry_a.value, &[]), + hash_b: 0, + }); + } + } + } + // Any keys in `b` not in `a`? + for entry_b in entries_b { + if top_skip.contains(&entry_b.key.as_str()) { + continue; + } + if !entries_a.iter().any(|e| e.key == entry_b.key) { + let mut full = path.clone(); + full.push(entry_b.key.clone()); + return Some(DivergencePoint::MetaKey { + path: full, + hash_a: 0, + hash_b: meta_subtree_hash(&entry_b.value, &[]), + }); + } + } + // Hashes differed but no key-level divergence found + // (e.g. value of a present key changed but the recursion + // bottomed out without finding a Map to descend into): + // report at the current path. 
+ Some(DivergencePoint::MetaKey { + path: path.clone(), + hash_a, + hash_b, + }) + } + _ => Some(DivergencePoint::MetaKey { + path: path.clone(), + hash_a, + hash_b, + }), + } +} + +fn meta_subtree_hash(value: &ConfigValue, top_skip: &[&str]) -> u64 { + let mut cache = HashCache::new(); + let mut hasher = rustc_hash::FxHasher::default(); + hash_config_value_excluding(value, top_skip, &mut cache, &mut hasher); + hasher.finish() +} + // ============================================================================= // Structural Equality (for hash collision verification) // ============================================================================= @@ -1999,6 +2311,162 @@ mod tests { assert!(structural_eq_slot(&slot1, &slot2)); } + // ==================== Plan 7 — Generated source_info blindness ==================== + // + // The reconciler must compare nodes for structural equality WITHOUT + // consulting their source_info. This is the foundation invariant the + // writer relies on: KeepBefore decisions are made off these functions, + // and a leak of source_info into the comparison would degenerate + // round-trips to whole-document Rewrite. + + fn generated_with_by(by: quarto_source_map::source_info::By) -> SourceInfo { + SourceInfo::generated(by) + } + + #[test] + fn test_structural_eq_blocks_generated_different_by_payloads() { + // Two paragraphs with identical content but Generated source_info + // carrying *different* By payloads (sectionize vs shortcode). + // Reconciler must still see them as equal. 
+ let blocks1 = vec![Block::Paragraph(Paragraph { + content: vec![make_str("a")], + source_info: generated_with_by(quarto_source_map::source_info::By::sectionize()), + })]; + let blocks2 = vec![Block::Paragraph(Paragraph { + content: vec![make_str("a")], + source_info: generated_with_by(quarto_source_map::source_info::By::shortcode("meta")), + })]; + + assert!(structural_eq_blocks(&blocks1, &blocks2)); + } + + #[test] + fn test_structural_eq_blocks_generated_different_anchor_lists() { + // Two paragraphs with identical content. Both Generated with + // matching By, but with different anchor lists (one empty, one + // with an Invocation anchor pointing into file 0). + use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + let mut si_with_anchor = SourceInfo::generated(By::shortcode("meta")); + si_with_anchor.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(0), 10, 25)), + ); + + let blocks1 = vec![Block::Paragraph(Paragraph { + content: vec![make_str("a")], + source_info: si_with_anchor, + })]; + let blocks2 = vec![Block::Paragraph(Paragraph { + content: vec![make_str("a")], + source_info: SourceInfo::generated(By::shortcode("meta")), + })]; + + assert!(structural_eq_blocks(&blocks1, &blocks2)); + } + + #[test] + fn test_structural_eq_inlines_generated_different_by_and_anchors() { + // Inline-level analogue of the above two tests bundled. 
+ use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + let mut si_a = SourceInfo::generated(By::shortcode("meta")); + si_a.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(0), 10, 25)), + ); + + let mut si_b = SourceInfo::generated(By::shortcode("var")); + si_b.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(1), 200, 215)), + ); + + let inlines1 = vec![Inline::Str(Str { + text: "x".into(), + source_info: si_a, + })]; + let inlines2 = vec![Inline::Str(Str { + text: "x".into(), + source_info: si_b, + })]; + + assert!(structural_eq_inlines(&inlines1, &inlines2)); + } + + #[test] + fn test_structural_eq_custom_node_generated_source_info_blind() { + // CustomNode whose wrapper source_info is Generated (the + // Plan-6-stamped shape) vs an Original — equal iff structure matches. + let cn_generated = CustomNode { + type_name: "Callout".to_string(), + attr: empty_attr(), + plain_data: serde_json::json!({"type": "note"}), + slots: LinkedHashMap::new(), + source_info: generated_with_by(quarto_source_map::source_info::By::sectionize()), + }; + let cn_original = CustomNode { + type_name: "Callout".to_string(), + attr: empty_attr(), + plain_data: serde_json::json!({"type": "note"}), + slots: LinkedHashMap::new(), + source_info: dummy_source(), + }; + + assert!(structural_eq_custom_node(&cn_generated, &cn_original)); + } + + #[test] + fn test_structural_eq_custom_node_slot_child_source_info_blind() { + // CustomNode with slot children whose own source_infos differ + // (Generated with anchors vs Original). Same structural content + // → must be equal. 
+ use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + let mut child_si = SourceInfo::generated(By::shortcode("meta")); + child_si.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(0), 0, 5)), + ); + + let mut slots_a = LinkedHashMap::new(); + slots_a.insert( + "body".into(), + Slot::Blocks(vec![Block::Paragraph(Paragraph { + content: vec![make_str("hi")], + source_info: child_si, + })]), + ); + let mut slots_b = LinkedHashMap::new(); + slots_b.insert( + "body".into(), + Slot::Blocks(vec![Block::Paragraph(Paragraph { + content: vec![make_str("hi")], + source_info: other_source(), + })]), + ); + + let cn_a = CustomNode { + type_name: "Callout".to_string(), + attr: empty_attr(), + plain_data: serde_json::Value::Null, + slots: slots_a, + source_info: dummy_source(), + }; + let cn_b = CustomNode { + type_name: "Callout".to_string(), + attr: empty_attr(), + plain_data: serde_json::Value::Null, + slots: slots_b, + source_info: dummy_source(), + }; + + assert!(structural_eq_custom_node(&cn_a, &cn_b)); + } + // ==================== NodePtr Tests ==================== #[test] @@ -2013,4 +2481,224 @@ mod tests { assert_eq!(ptr1, ptr2); } + + // ==================== Meta Hash Tests ==================== + + use quarto_pandoc_types::MergeOp; + use yaml_rust2::Yaml; + + fn scalar_str(s: &str) -> ConfigValue { + ConfigValue { + value: ConfigValueKind::Scalar(Yaml::String(s.to_string())), + source_info: dummy_source(), + merge_op: MergeOp::default(), + } + } + + fn scalar_int(i: i64) -> ConfigValue { + ConfigValue { + value: ConfigValueKind::Scalar(Yaml::Integer(i)), + source_info: dummy_source(), + merge_op: MergeOp::default(), + } + } + + fn map_of(entries: Vec<(&str, ConfigValue)>) -> ConfigValue { + map_of_with_source(entries, dummy_source()) + } + + fn map_of_with_source(entries: Vec<(&str, ConfigValue)>, src: SourceInfo) -> ConfigValue { + let entries = entries + .into_iter() + .map(|(k, v)| ConfigMapEntry { + key: 
k.to_string(), + key_source: src.clone(), + value: v, + }) + .collect(); + ConfigValue { + value: ConfigValueKind::Map(entries), + source_info: src, + merge_op: MergeOp::default(), + } + } + + #[test] + fn meta_hash_same_content_same_hash() { + let a = map_of(vec![("title", scalar_str("hello")), ("toc", scalar_int(3))]); + let b = map_of(vec![("title", scalar_str("hello")), ("toc", scalar_int(3))]); + assert_eq!(compute_meta_hash_fresh(&a), compute_meta_hash_fresh(&b)); + } + + #[test] + fn meta_hash_different_content_different_hash() { + let a = map_of(vec![("title", scalar_str("hello"))]); + let b = map_of(vec![("title", scalar_str("world"))]); + assert_ne!(compute_meta_hash_fresh(&a), compute_meta_hash_fresh(&b)); + } + + #[test] + fn meta_hash_excludes_source_info_and_key_source() { + // Same content, different SourceInfo on values and on keys. + let a = map_of_with_source(vec![("title", scalar_str("hello"))], dummy_source()); + let b = map_of_with_source(vec![("title", scalar_str("hello"))], other_source()); + // Also flip the inner scalar's source_info. 
+ let mut b = b; + if let ConfigValueKind::Map(entries) = &mut b.value { + entries[0].value.source_info = other_source(); + } + assert_eq!(compute_meta_hash_fresh(&a), compute_meta_hash_fresh(&b)); + } + + #[test] + fn meta_hash_excluding_rendered_ignores_top_level_rendered() { + let a = map_of(vec![ + ("title", scalar_str("hello")), + ( + "rendered", + map_of(vec![("navbar", scalar_str(""))]), + ), + ]); + let b = map_of(vec![ + ("title", scalar_str("hello")), + ( + "rendered", + map_of(vec![("navbar", scalar_str(""))]), + ), + ]); + assert_ne!( + compute_meta_hash_fresh(&a), + compute_meta_hash_fresh(&b), + "the non-excluding hash must observe the difference", + ); + assert_eq!( + compute_meta_hash_fresh_excluding_rendered(&a), + compute_meta_hash_fresh_excluding_rendered(&b), + "the excluding-rendered hash must ignore top-level rendered.* divergence", + ); + } + + #[test] + fn meta_hash_excluding_rendered_does_not_propagate_to_nested_rendered() { + // A nested `rendered` key is part of the content and must + // still participate in the hash. + let a = map_of(vec![( + "listings", + map_of(vec![("rendered", scalar_str("a"))]), + )]); + let b = map_of(vec![( + "listings", + map_of(vec![("rendered", scalar_str("b"))]), + )]); + assert_ne!( + compute_meta_hash_fresh_excluding_rendered(&a), + compute_meta_hash_fresh_excluding_rendered(&b), + ); + } + + #[test] + fn meta_hash_map_insertion_order_matters() { + // Regression guard for the no-sort choice: a transform that + // stuffs a HashMap into meta would produce different + // insertion orders across runs; the hash must catch that. 
+ let a = map_of(vec![("a", scalar_int(1)), ("b", scalar_int(2))]); + let b = map_of(vec![("b", scalar_int(2)), ("a", scalar_int(1))]); + assert_ne!( + compute_meta_hash_fresh(&a), + compute_meta_hash_fresh(&b), + "different Map insertion order must produce different hashes", + ); + } + + #[test] + fn meta_hash_merge_op_participates() { + let a = ConfigValue { + value: ConfigValueKind::Scalar(Yaml::String("x".into())), + source_info: dummy_source(), + merge_op: MergeOp::Concat, + }; + let b = ConfigValue { + value: ConfigValueKind::Scalar(Yaml::String("x".into())), + source_info: dummy_source(), + merge_op: MergeOp::Prefer, + }; + assert_ne!(compute_meta_hash_fresh(&a), compute_meta_hash_fresh(&b)); + } + + // ==================== Divergence Localization Tests ==================== + + fn para(text: &str) -> Block { + Block::Paragraph(Paragraph { + content: vec![make_str(text)], + source_info: dummy_source(), + }) + } + + #[test] + fn divergence_identical_docs_returns_none() { + let blocks = vec![para("alpha"), para("beta")]; + let meta = map_of(vec![("title", scalar_str("t"))]); + let point = find_first_divergence(&blocks, &meta, &blocks, &meta); + assert_eq!(point, DivergencePoint::None); + } + + #[test] + fn divergence_reports_first_block_mismatch() { + let a = vec![para("alpha"), para("beta"), para("gamma")]; + let b = vec![para("alpha"), para("DIFFERENT"), para("gamma")]; + let meta = map_of(vec![]); + let point = find_first_divergence(&a, &meta, &b, &meta); + match point { + DivergencePoint::Block { + index, + hash_a, + hash_b, + } => { + assert_eq!(index, 1); + assert_ne!(hash_a, hash_b); + } + other => panic!("expected Block divergence, got {:?}", other), + } + } + + #[test] + fn divergence_reports_meta_key_path() { + let meta_a = map_of(vec![( + "listings", + map_of(vec![("foo", map_of(vec![("title", scalar_str("a"))]))]), + )]); + let meta_b = map_of(vec![( + "listings", + map_of(vec![("foo", map_of(vec![("title", scalar_str("b"))]))]), + )]); + let 
blocks: Vec = vec![]; + let point = find_first_divergence(&blocks, &meta_a, &blocks, &meta_b); + match point { + DivergencePoint::MetaKey { + path, + hash_a, + hash_b, + } => { + assert_eq!(path, vec!["listings", "foo", "title"]); + assert_ne!(hash_a, hash_b); + } + other => panic!("expected MetaKey divergence, got {:?}", other), + } + } + + #[test] + fn divergence_skips_rendered_top_level() { + // Only `rendered.*` differs at the top level -> no divergence. + let meta_a = map_of(vec![ + ("title", scalar_str("hello")), + ("rendered", map_of(vec![("navbar", scalar_str("a"))])), + ]); + let meta_b = map_of(vec![ + ("title", scalar_str("hello")), + ("rendered", map_of(vec![("navbar", scalar_str("b"))])), + ]); + let blocks: Vec = vec![]; + let point = find_first_divergence(&blocks, &meta_a, &blocks, &meta_b); + assert_eq!(point, DivergencePoint::None); + } } diff --git a/crates/quarto-ast-reconcile/src/lib.rs b/crates/quarto-ast-reconcile/src/lib.rs index 3d6e33a4f..98abb9083 100644 --- a/crates/quarto-ast-reconcile/src/lib.rs +++ b/crates/quarto-ast-reconcile/src/lib.rs @@ -27,8 +27,10 @@ pub mod types; pub use apply::apply_reconciliation; pub use compute::{compute_reconciliation, compute_reconciliation_for_blocks}; pub use hash::{ - HashCache, compute_block_hash_fresh, compute_blocks_hash_fresh, compute_inline_hash_fresh, - structural_eq_block, structural_eq_blocks, structural_eq_inline, structural_eq_inlines, + DivergencePoint, HashCache, compute_block_hash_fresh, compute_blocks_hash_fresh, + compute_inline_hash_fresh, compute_meta_hash_fresh, compute_meta_hash_fresh_excluding_rendered, + find_first_divergence, structural_eq_block, structural_eq_blocks, structural_eq_inline, + structural_eq_inlines, }; pub use remap::remap_file_ids; pub use types::{ diff --git a/crates/quarto-core/Cargo.toml b/crates/quarto-core/Cargo.toml index 8c4f8978f..0a4f3d0a2 100644 --- a/crates/quarto-core/Cargo.toml +++ b/crates/quarto-core/Cargo.toml @@ -18,6 +18,7 @@ tokio-util.workspace 
= true pollster.workspace = true serde_json.workspace = true yaml-rust2.workspace = true +smallvec.workspace = true hashlink = "0.11" pathdiff = "0.2" sha2 = "0.11" diff --git a/crates/quarto-core/src/crossref/mod.rs b/crates/quarto-core/src/crossref/mod.rs index e4d66d9d2..8c4b8e8d5 100644 --- a/crates/quarto-core/src/crossref/mod.rs +++ b/crates/quarto-core/src/crossref/mod.rs @@ -89,4 +89,29 @@ pub const EQUATION: &str = "Equation"; /// Produced by `CrossrefResolveTransform` when it rewrites a `Cite` whose id /// classifies as a crossref (per [`RefTypeRegistry`]). Back-end renderers /// convert this into a format-specific link or reference. +/// +/// Kept in lockstep with +/// [`quarto_pandoc_types::ATOMIC_CUSTOM_NODES`] — the q2-preview incremental +/// writer treats this type_name as atomic. A cross-check test below pins +/// the two literals together. pub const CROSSREF_RESOLVED_REF: &str = "CrossrefResolvedRef"; + +#[cfg(test)] +mod atomic_lockstep_tests { + use super::CROSSREF_RESOLVED_REF; + + /// Pin that the `CROSSREF_RESOLVED_REF` literal here matches the entry + /// in `quarto_pandoc_types::ATOMIC_CUSTOM_NODES`. If either string + /// changes, the writer's atomicity gate silently mis-fires; this test + /// fails noisily. + #[test] + fn crossref_resolved_ref_is_in_atomic_registry() { + assert!( + quarto_pandoc_types::ATOMIC_CUSTOM_NODES.contains(&CROSSREF_RESOLVED_REF), + "CROSSREF_RESOLVED_REF (`{}`) must appear in \ + quarto_pandoc_types::ATOMIC_CUSTOM_NODES; the q2-preview \ + writer relies on the lockstep.", + CROSSREF_RESOLVED_REF + ); + } +} diff --git a/crates/quarto-core/src/project/pass2_renderer.rs b/crates/quarto-core/src/project/pass2_renderer.rs index 04f88afc5..f8a4b58eb 100644 --- a/crates/quarto-core/src/project/pass2_renderer.rs +++ b/crates/quarto-core/src/project/pass2_renderer.rs @@ -371,6 +371,14 @@ pub struct RenderToHtmlRenderer { /// will be constructed with this root. 
vfs_root: std::path::PathBuf, + /// bd-rz2we: when set, the per-page resolver is built with + /// [`ResourceResolverContext::vfs_root_with_url_root`] using + /// this string as the URL prefix while `vfs_root` keeps acting + /// as the disk-write root. `None` keeps today's behavior + /// (URL root derived from `vfs_root`). Used by native test + /// helpers so rendered URLs don't capture the host's tempdir. + vfs_url_root: Option, + /// Optional user-grammar provider attached by the caller. Shared /// across every page the renderer touches (one /// `RenderToHtmlRenderer` may produce many pages in `ActivePage` @@ -386,6 +394,7 @@ impl RenderToHtmlRenderer { pub fn new(vfs_root: impl Into) -> Self { Self { vfs_root: vfs_root.into(), + vfs_url_root: None, user_grammars: None, } } @@ -401,6 +410,25 @@ impl RenderToHtmlRenderer { self.user_grammars = Some(provider); self } + + /// bd-rz2we: override the URL prefix used for resolved-asset + /// links/srcs. Disk writes still go through `vfs_root` (a real + /// tempdir in native test runs); only the URL strings embedded + /// in HTML change. Used by native test helpers so rendered + /// output doesn't leak the host's tempdir. + pub fn with_url_root(mut self, url_root: impl Into) -> Self { + self.vfs_url_root = Some(url_root.into()); + self + } + + fn build_resolver(&self) -> ResourceResolverContext { + match &self.vfs_url_root { + Some(url) => { + ResourceResolverContext::vfs_root_with_url_root(self.vfs_root.clone(), url.clone()) + } + None => ResourceResolverContext::vfs_root(self.vfs_root.clone()), + } + } } #[async_trait(?Send)] @@ -434,7 +462,9 @@ impl Pass2Renderer for RenderToHtmlRenderer { // URLs land under `/.quarto/project-artifacts/...` (the // post-processor reads from VFS at the matching path); see // Phase 5 sub-plan §"`ResourceResolverContext::vfs_root`". 
- let resolver = ResourceResolverContext::vfs_root(self.vfs_root.clone()); + // bd-rz2we: native test helpers can override the URL prefix + // via `with_url_root` to keep rendered URLs path-independent. + let resolver = self.build_resolver(); let binaries = BinaryDependencies::new(); let options = RenderOptions { @@ -549,7 +579,7 @@ impl Pass2Renderer for RenderToHtmlRenderer { // already embeds in HTML. `lib_dir` is intentionally // ignored — the post-processor just needs to find the // bytes at the URL's path. - ResourceResolverContext::vfs_root(self.vfs_root.clone()) + self.build_resolver() } } @@ -573,6 +603,14 @@ pub struct RenderToPreviewAstRenderer { /// Synthetic VFS root under which every artifact lives in WASM. /// Same semantics as [`RenderToHtmlRenderer::new`]. vfs_root: std::path::PathBuf, + /// bd-rz2we: when set, the per-page resolver is built with + /// [`ResourceResolverContext::vfs_root_with_url_root`] using + /// this string as the URL prefix while `vfs_root` keeps acting + /// as the disk-write root. `None` keeps today's behavior + /// (URL root derived from `vfs_root`). Used by native test + /// helpers (idempotence harness) so rendered URLs don't + /// capture the host's tempdir. + vfs_url_root: Option, /// bd-lucp: optional engine-execution capture used to splice /// recorded engine output into the AST at preview time. Plumbed /// through to [`crate::pipeline::render_qmd_to_preview_ast`] on @@ -597,6 +635,7 @@ impl RenderToPreviewAstRenderer { pub fn new(vfs_root: impl Into) -> Self { Self { vfs_root: vfs_root.into(), + vfs_url_root: None, attribution_json: None, capture: None, } @@ -629,6 +668,26 @@ impl RenderToPreviewAstRenderer { self.attribution_json = Some(json); self } + + /// bd-rz2we: override the URL prefix used for resolved-asset + /// links/srcs and cross-page links. Disk writes still go + /// through `vfs_root` (a real tempdir in native test runs); + /// only the URL strings embedded in the rendered AST change. 
+ /// Used by native test helpers so rendered AST is + /// path-independent across runs. + pub fn with_url_root(mut self, url_root: impl Into) -> Self { + self.vfs_url_root = Some(url_root.into()); + self + } + + fn build_resolver(&self) -> ResourceResolverContext { + match &self.vfs_url_root { + Some(url) => { + ResourceResolverContext::vfs_root_with_url_root(self.vfs_root.clone(), url.clone()) + } + None => ResourceResolverContext::vfs_root(self.vfs_root.clone()), + } + } } #[async_trait(?Send)] @@ -658,7 +717,10 @@ impl Pass2Renderer for RenderToPreviewAstRenderer { )) })?; - let resolver = ResourceResolverContext::vfs_root(self.vfs_root.clone()); + // bd-rz2we: native test helpers can override the URL prefix + // via `with_url_root` so rendered AST link/asset URLs stay + // path-independent across runs in different tempdirs. + let resolver = self.build_resolver(); let binaries = BinaryDependencies::new(); let options = RenderOptions { @@ -795,6 +857,6 @@ impl Pass2Renderer for RenderToPreviewAstRenderer { // (which runs in the q2-preview pipeline) embeds image URLs // using this resolver, so the iframe sees URLs that resolve // to the matching VFS path. - ResourceResolverContext::vfs_root(self.vfs_root.clone()) + self.build_resolver() } } diff --git a/crates/quarto-core/src/resource_resolver.rs b/crates/quarto-core/src/resource_resolver.rs index 04d654dec..1f82f898f 100644 --- a/crates/quarto-core/src/resource_resolver.rs +++ b/crates/quarto-core/src/resource_resolver.rs @@ -35,6 +35,37 @@ use std::path::{Path, PathBuf}; use crate::artifact::ArtifactScope; +/// VFS-root resolver state. Splits the two roles a single +/// `PathBuf` used to play (bd-rz2we): the **disk-write root** +/// (where `runtime.file_write` and `OutputSink::allowed_roots` +/// land) and the **URL root** (what gets embedded in HTML +/// link/asset URLs). 
+/// +/// Production WASM constructs this via [`ResourceResolverContext::vfs_root`] +/// with the two fields populated from one path — they're +/// intentionally identical, since the WASM runtime serves the +/// synthetic VFS path from memory. Native test helpers construct +/// it via [`ResourceResolverContext::vfs_root_with_url_root`] +/// with a real tempdir for `write_root` and the synthetic +/// `/.quarto/project-artifacts` string for `url_root`, so that +/// `runtime.file_write` actually succeeds while rendered AST/HTML +/// stays path-independent (idempotent across runs in different +/// tempdirs). +#[derive(Debug, Clone)] +struct VfsRootMode { + /// Absolute disk path. `runtime.file_write` and + /// `OutputSink::allowed_roots` use this. In WASM this is a + /// synthetic VFS path (the runtime serves it from memory); in + /// native tests it's a real tempdir subdirectory. + write_root: PathBuf, + /// URL prefix embedded in HTML links / asset srcs. In WASM + /// this matches `write_root` by construction. In native tests + /// it's a fixed synthetic string (e.g. + /// `/.quarto/project-artifacts`) so URLs don't capture the + /// host machine's tempdir. + url_root: String, +} + /// Per-page context for resolving artifact paths and URLs. /// /// All paths are absolute and pre-normalized; the resolver does @@ -56,12 +87,14 @@ pub struct ResourceResolverContext { lib_dir: String, /// Per-page resource directory name (e.g. `"api_files"`). page_files_dir: String, - /// When `Some(root)`, the resolver is in **VFS-root mode**: - /// every artifact resolves to `{root}/{artifact_path}` for - /// both the on-disk path and the HTML URL, regardless of - /// scope. Used by the WASM hub-client where the runtime - /// serves files from a synthetic absolute path. 
- vfs_root_mode: Option, + /// When `Some(_)`, the resolver is in **VFS-root mode**: every + /// artifact resolves to `{write_root}/{artifact_path}` on disk + /// and `{url_root}/{artifact_path}` in HTML, regardless of + /// scope. Used by the WASM hub-client (write_root == url_root) + /// and by native test helpers (write_root is a tempdir, + /// url_root is a synthetic string for idempotence). See + /// [`VfsRootMode`]. + vfs_root_mode: Option, } impl ResourceResolverContext { @@ -132,18 +165,54 @@ impl ResourceResolverContext { /// The browser fetches the URL absolute, the runtime serves /// it from VFS at the matching synthetic path. No relative- /// path computation needed because the URLs are absolute. + /// + /// Single-arg form: `write_root == url_root`. Preserves the + /// pinned contract that VFS-mode URLs and on-disk paths are + /// byte-identical (see + /// `website_post_render::vfs_root_resolver_url_matches_on_disk_path`). pub fn vfs_root(vfs_root: impl Into) -> Self { - let root = vfs_root.into(); + let root: PathBuf = vfs_root.into(); + let url_root = root.to_string_lossy().replace('\\', "/"); + Self::vfs_root_with_url_root(root, url_root) + } + + /// Two-arg VFS-root constructor (bd-rz2we): decouple the + /// disk-write root from the URL prefix. + /// + /// - `write_root` is the absolute on-disk path + /// `runtime.file_write` and `OutputSink::allowed_roots` use. + /// In native test runs this is a real tempdir subdirectory. + /// - `url_root` is the URL prefix embedded in HTML links and + /// asset srcs. In native test runs this is a synthetic + /// string (e.g. `"/.quarto/project-artifacts"`) so rendered + /// AST/HTML is independent of the host's tempdir layout. + /// + /// Production WASM doesn't call this directly — it calls + /// [`Self::vfs_root`] with one path that's used for both + /// roles. 
The two-arg form exists for in-process native + /// callers of the q2-preview / WASM-style renderers + /// (`RenderToPreviewAstRenderer::with_url_root`, + /// `RenderToHtmlRenderer::with_url_root`) so their integration + /// tests get byte-identical AST output across runs. + pub fn vfs_root_with_url_root( + write_root: impl Into, + url_root: impl Into, + ) -> Self { + let write_root: PathBuf = write_root.into(); + let url_root: String = url_root.into(); Self { - page_output: root.join("__page__.html"), - site_root: root.clone(), + page_output: write_root.join("__page__.html"), + site_root: write_root.clone(), // Empty lib_dir on its own would route Project to // page_files_dir; we override scope_root to ignore // both fields when the resolver is in vfs-root mode - // (see the `vfs_root_mode` flag below). + // (see the `vfs_root_mode` field below). lib_dir: String::new(), page_files_dir: String::new(), - vfs_root_mode: Some(root), + vfs_root_mode: Some(VfsRootMode { + write_root, + url_root, + }), } } @@ -182,8 +251,8 @@ impl ResourceResolverContext { /// - An absolute URL of the form `/{vfs_root}/{artifact_path}` /// (VFS-root mode — used by the WASM hub-client). pub fn html_url_for(&self, scope: ArtifactScope, artifact_path: &Path) -> String { - if let Some(root) = &self.vfs_root_mode { - return rel_to_url(&root.join(artifact_path)); + if let Some(mode) = &self.vfs_root_mode { + return join_url_root(&mode.url_root, artifact_path); } let target = self.on_disk_path_for(scope, artifact_path); let page_dir = self.page_output.parent().unwrap_or_else(|| Path::new(".")); @@ -208,8 +277,8 @@ impl ResourceResolverContext { /// `{site_root}/{target_output_href}`. For single-doc renders /// this collapses to the input (since `site_root == page_dir`). 
pub fn page_url_for(&self, target_output_href: &str) -> String { - if let Some(root) = &self.vfs_root_mode { - return rel_to_url(&root.join(target_output_href)); + if let Some(mode) = &self.vfs_root_mode { + return join_url_root(&mode.url_root, Path::new(target_output_href)); } let target_abs = self.site_root.join(target_output_href); let page_dir = self.page_output.parent().unwrap_or_else(|| Path::new(".")); @@ -248,8 +317,8 @@ impl ResourceResolverContext { /// the resolver-side half of bd-cfl67) is then refused by the /// sink rather than written. pub fn allowed_output_roots(&self) -> Vec { - if let Some(root) = &self.vfs_root_mode { - return vec![root.clone()]; + if let Some(mode) = &self.vfs_root_mode { + return vec![mode.write_root.clone()]; } vec![self.site_root.clone()] } @@ -306,8 +375,8 @@ impl ResourceResolverContext { "artifact path must be relative (got {}); root-prefixed paths bypass scope_root and risk overwriting source files (bd-cfl67)", artifact_path.display(), ); - if let Some(root) = &self.vfs_root_mode { - return root.join(artifact_path); + if let Some(mode) = &self.vfs_root_mode { + return mode.write_root.join(artifact_path); } let scope_root = self.scope_root(scope); scope_root.join(artifact_path) @@ -336,6 +405,24 @@ impl ResourceResolverContext { } } +/// Build a `{url_root}/{artifact_path}` URL string in VFS-root +/// mode. `url_root` is taken verbatim (no path manipulation — +/// the WASM contract is that it stays byte-identical to the +/// disk path; native tests pass a synthetic string). The +/// artifact path is rendered with forward-slash separators +/// regardless of host OS. 
+fn join_url_root(url_root: &str, artifact_path: &Path) -> String { + let suffix = artifact_path.to_string_lossy().replace('\\', "/"); + if suffix.is_empty() { + return url_root.to_string(); + } + if url_root.ends_with('/') || suffix.starts_with('/') { + format!("{}{}", url_root, suffix) + } else { + format!("{}/{}", url_root, suffix) + } +} + /// Render a relative path as a forward-slash URL string. On /// Windows, `pathdiff` may yield backslash separators; HTML /// always wants forward slashes. @@ -697,4 +784,29 @@ mod tests { let url = r.html_url_for(ArtifactScope::Project, Path::new("styles.css")); assert_eq!(url, on_disk.to_string_lossy().replace('\\', "/")); } + + /// bd-rz2we: the two-arg VFS-root constructor decouples the + /// disk-write root (where the runtime actually puts bytes) from + /// the URL prefix embedded in HTML. Native test helpers pass a + /// real tempdir for the write root and a synthetic string for + /// the URL root, so rendered AST/HTML is path-independent + /// (idempotent across runs in different tempdirs) while + /// `runtime.file_write` still succeeds against a real disk path. + #[test] + fn resolver_vfs_root_with_url_root_splits_write_and_url() { + let r = ResourceResolverContext::vfs_root_with_url_root( + "/tmp/abc", + "/.quarto/project-artifacts", + ); + // URL side uses url_root. + let url = r.html_url_for(ArtifactScope::Project, Path::new("styles.css")); + assert_eq!(url, "/.quarto/project-artifacts/styles.css"); + let page_url = r.page_url_for("about.html"); + assert_eq!(page_url, "/.quarto/project-artifacts/about.html"); + // Disk side uses write_root. + let on_disk = r.on_disk_path_for(ArtifactScope::Project, Path::new("styles.css")); + assert_eq!(on_disk, PathBuf::from("/tmp/abc/styles.css")); + // allowed_output_roots tracks the write side. 
+ assert_eq!(r.allowed_output_roots(), vec![PathBuf::from("/tmp/abc")]); + } } diff --git a/crates/quarto-core/src/stage/stages/apply_template.rs b/crates/quarto-core/src/stage/stages/apply_template.rs index 32e322c02..cfc2f6ab1 100644 --- a/crates/quarto-core/src/stage/stages/apply_template.rs +++ b/crates/quarto-core/src/stage/stages/apply_template.rs @@ -817,19 +817,9 @@ mod tests { .or(diag.location.as_ref()) .expect("diagnostic should carry a SourceInfo location"); - fn root_file_id(info: &quarto_source_map::SourceInfo) -> Option { - match info { - quarto_source_map::SourceInfo::Original { file_id, .. } => Some(*file_id), - quarto_source_map::SourceInfo::Substring { parent, .. } => root_file_id(parent), - quarto_source_map::SourceInfo::Concat { pieces } => { - pieces.first().and_then(|p| root_file_id(&p.source_info)) - } - quarto_source_map::SourceInfo::FilterProvenance { .. } => None, - } - } - - let file_id = - root_file_id(location).expect("diagnostic location should have a resolvable FileId"); + let file_id = location + .root_file_id() + .expect("diagnostic location should have a resolvable FileId"); let file = result .source_context .get_file(file_id) diff --git a/crates/quarto-core/src/stage/stages/engine_execution.rs b/crates/quarto-core/src/stage/stages/engine_execution.rs index a93274222..f1460ebfc 100644 --- a/crates/quarto-core/src/stage/stages/engine_execution.rs +++ b/crates/quarto-core/src/stage/stages/engine_execution.rs @@ -814,39 +814,25 @@ mod tests { pandoc: &quarto_pandoc_types::pandoc::Pandoc, ) -> std::collections::HashSet { use quarto_pandoc_types::{Block, Inline}; - use quarto_source_map::{FileId, SourceInfo}; + use quarto_source_map::FileId; - fn walk_source_info(si: &SourceInfo, out: &mut std::collections::HashSet) { - match si { - SourceInfo::Original { file_id, .. } => { - out.insert(*file_id); - } - SourceInfo::Substring { parent, .. 
} => walk_source_info(parent, out), - SourceInfo::Concat { pieces } => { - for p in pieces { - walk_source_info(&p.source_info, out); - } - } - SourceInfo::FilterProvenance { .. } => {} - } - } fn walk_inline(i: &Inline, out: &mut std::collections::HashSet) { match i { - Inline::Str(x) => walk_source_info(&x.source_info, out), + Inline::Str(x) => x.source_info.collect_file_ids(out), Inline::Emph(x) => { for c in &x.content { walk_inline(c, out); } - walk_source_info(&x.source_info, out); + x.source_info.collect_file_ids(out); } Inline::Strong(x) => { for c in &x.content { walk_inline(c, out); } - walk_source_info(&x.source_info, out); + x.source_info.collect_file_ids(out); } - Inline::Space(x) => walk_source_info(&x.source_info, out), - Inline::SoftBreak(x) => walk_source_info(&x.source_info, out), + Inline::Space(x) => x.source_info.collect_file_ids(out), + Inline::SoftBreak(x) => x.source_info.collect_file_ids(out), _ => { // Other variants not needed for this test. Add as needed. } @@ -858,19 +844,19 @@ mod tests { for i in &p.content { walk_inline(i, out); } - walk_source_info(&p.source_info, out); + p.source_info.collect_file_ids(out); } Block::Header(h) => { for i in &h.content { walk_inline(i, out); } - walk_source_info(&h.source_info, out); + h.source_info.collect_file_ids(out); } Block::Div(d) => { for b in &d.content { walk_block(b, out); } - walk_source_info(&d.source_info, out); + d.source_info.collect_file_ids(out); } _ => { // Other block types not needed for this test. 
diff --git a/crates/quarto-core/src/transforms/appendix.rs b/crates/quarto-core/src/transforms/appendix.rs index 9ac5379f1..b374e48d2 100644 --- a/crates/quarto-core/src/transforms/appendix.rs +++ b/crates/quarto-core/src/transforms/appendix.rs @@ -49,7 +49,8 @@ use quarto_pandoc_types::attr::AttrSourceInfo; use quarto_pandoc_types::block::{Block, Div, Header, Paragraph}; use quarto_pandoc_types::inline::{Inline, Link, Str}; use quarto_pandoc_types::pandoc::Pandoc; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; use quarto_pandoc_types::ConfigValue; @@ -227,7 +228,10 @@ fn extract_footnotes(blocks: &mut Vec) -> Option { /// Wrap bibliography in a section with appropriate attributes. fn wrap_bibliography(bibliography: Block) -> Block { - let source_info = SourceInfo::default(); + let source_info = SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }; // Create header for the bibliography section let header = Block::Header(Header { @@ -262,7 +266,10 @@ fn create_appendix_container(sections: Blocks, style_class: &str) -> Block { LinkedHashMap::new(), ), content: sections, - source_info: SourceInfo::default(), + source_info: SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }, attr_source: AttrSourceInfo::empty(), }) } @@ -283,7 +290,10 @@ fn create_license_section(meta: &ConfigValue) -> Option { .map(|s| s.to_string())? }; - let source_info = SourceInfo::default(); + let source_info = SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }; let header = Block::Header(Header { level: 2, @@ -332,7 +342,10 @@ fn create_copyright_section(meta: &ConfigValue) -> Option { .map(|s| s.to_string())? 
}; - let source_info = SourceInfo::default(); + let source_info = SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }; let header = Block::Header(Header { level: 2, @@ -373,7 +386,10 @@ fn create_citation_section(meta: &ConfigValue) -> Option { // It can have various formats - for now, look for a "url" or create a simple reference let citation_url = citation.get("url").and_then(|v| v.as_str()); - let source_info = SourceInfo::default(); + let source_info = SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }; let header = Block::Header(Header { level: 2, @@ -878,4 +894,21 @@ mod tests { panic!("Expected appendix Div"); } } + + #[test] + fn test_create_appendix_container_has_generated_provenance() { + // Plan 6: the synthesized appendix container Div carries + // Generated { by: appendix(), from: [] }. + let block = create_appendix_container(vec![], "default"); + let Block::Div(div) = &block else { + panic!("Expected Div"); + }; + match &div.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "appendix"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } } diff --git a/crates/quarto-core/src/transforms/code_block_generate.rs b/crates/quarto-core/src/transforms/code_block_generate.rs index 06a4f0cf9..71d69050b 100644 --- a/crates/quarto-core/src/transforms/code_block_generate.rs +++ b/crates/quarto-core/src/transforms/code_block_generate.rs @@ -783,10 +783,7 @@ mod tests { let concat = SourceInfo::Concat { pieces: vec![] }; assert!(CodeBlockDecorationKey::from_source_info(&concat).is_none()); - let filter = SourceInfo::FilterProvenance { - filter_path: "fixture.lua".into(), - line: 1, - }; + let filter = SourceInfo::generated(quarto_source_map::By::filter("fixture.lua", 1)); assert!(CodeBlockDecorationKey::from_source_info(&filter).is_none()); } diff --git a/crates/quarto-core/src/transforms/footnotes.rs b/crates/quarto-core/src/transforms/footnotes.rs index 
024f5bebb..f84572281 100644 --- a/crates/quarto-core/src/transforms/footnotes.rs +++ b/crates/quarto-core/src/transforms/footnotes.rs @@ -50,7 +50,8 @@ use quarto_pandoc_types::block::{Block, Div, OrderedList, Paragraph}; use quarto_pandoc_types::inline::{Inline, Link, Span, Str, Superscript}; use quarto_pandoc_types::pandoc::Pandoc; use quarto_pandoc_types::{Blocks, Inlines, ListNumberDelim, ListNumberStyle}; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; use quarto_pandoc_types::ConfigValue; @@ -492,7 +493,14 @@ fn create_footnote_ref(number: usize, source_info: &SourceInfo, is_margin: bool) /// /// ``` fn create_footnotes_section(footnotes: &[CollectedFootnote]) -> Block { - let source_info = SourceInfo::default(); + // The synthesized container chrome (section Div, embedded
, and the + // OrderedList wrapping the footnote items) is pure synthesis: it + // corresponds to no source bytes. The footnote content inside (created + // by `create_footnote_item`) retains the original Note's source_info. + let source_info = SourceInfo::Generated { + by: By::footnotes(), + from: smallvec![], + }; // Create list items for each footnote let list_items: Vec = footnotes @@ -1061,4 +1069,31 @@ mod tests { // Check footnotes section exists assert!(matches!(ast.blocks[1], Block::Div(_))); } + + #[test] + fn test_create_footnotes_section_has_generated_provenance() { + // Plan 6: the synthesized footnotes container Div (and its embedded + // chrome — HorizontalRule, OrderedList) carry + // Generated { by: footnotes(), from: [] }. The footnote *items* + // inside retain the original Note's source_info via + // create_footnote_item. + let block = create_footnotes_section(&[]); + let Block::Div(div) = &block else { + panic!("Expected Div"); + }; + match &div.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "footnotes"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + // The embedded HorizontalRule chrome carries the same shape. + let Block::HorizontalRule(hr) = &div.content[0] else { + panic!("Expected HorizontalRule"); + }; + assert!( + matches!(&hr.source_info, SourceInfo::Generated { by, .. } if by.kind == "footnotes") + ); + } } diff --git a/crates/quarto-core/src/transforms/proof.rs b/crates/quarto-core/src/transforms/proof.rs index 137b328f9..a81d4c65d 100644 --- a/crates/quarto-core/src/transforms/proof.rs +++ b/crates/quarto-core/src/transforms/proof.rs @@ -132,8 +132,8 @@ fn empty_attr() -> Attr { fn convert_div(mut div: Div) -> CustomNode { // Extract title: `name=` attribute, then first Header. Same rule as // theorem sugar. 
- let title: Option = - extract_name_attr(&mut div.attr).or_else(|| extract_first_header_title(&mut div.content)); + let title: Option = extract_name_attr(&mut div.attr, &div.attr_source) + .or_else(|| extract_first_header_title(&mut div.content)); // Strip the `.proof` class so a later "match div.proof" filter // doesn't double-apply (same pattern as theorem sugar). @@ -155,8 +155,30 @@ fn convert_div(mut div: Div) -> CustomNode { node } -fn extract_name_attr(attr: &mut Attr) -> Option { +/// Read and remove the `name` attribute from `attr`. See +/// `crate::transforms::theorem::extract_name_attr` for the +/// positional-alignment rationale (this is the parallel implementation +/// for `.proof` Divs). +fn extract_name_attr(attr: &mut Attr, attr_source: &AttrSourceInfo) -> Option { let (_id, _classes, kvs) = attr; + + let name_idx = kvs.keys().position(|k| k == "name")?; + + // See `theorem::extract_name_attr` — empty attr_source signals + // "no provenance available" (common in tests); only assert on + // populated-but-misaligned input. 
+ debug_assert!( + attr_source.attributes.is_empty() || kvs.len() == attr_source.attributes.len(), + "AttrSourceInfo.attributes is out of sync with Attr.2 (bd-3aolj / bd-1e6a5): kvs={}, attr_source={}", + kvs.len(), + attr_source.attributes.len(), + ); + let value_source = if kvs.len() == attr_source.attributes.len() { + attr_source.attributes[name_idx].1.clone() + } else { + None + }; + let name = kvs.remove("name")?; if name.is_empty() { return None; @@ -164,7 +186,7 @@ fn extract_name_attr(attr: &mut Attr) -> Option { Some(vec![quarto_pandoc_types::inline::Inline::Str( quarto_pandoc_types::inline::Str { text: name, - source_info: quarto_source_map::SourceInfo::default(), + source_info: value_source.unwrap_or_default(), }, )]) } diff --git a/crates/quarto-core/src/transforms/shortcode_resolve.rs b/crates/quarto-core/src/transforms/shortcode_resolve.rs index cc9d5e8f5..58f076fbd 100644 --- a/crates/quarto-core/src/transforms/shortcode_resolve.rs +++ b/crates/quarto-core/src/transforms/shortcode_resolve.rs @@ -41,7 +41,8 @@ use quarto_pandoc_types::inline::{ use quarto_pandoc_types::pandoc::Pandoc; use quarto_pandoc_types::shortcode::{Shortcode, ShortcodeArg}; use quarto_pandoc_types::table::Table; -use quarto_source_map::SourceInfo; +use quarto_source_map::{Anchor, By, SourceInfo}; +use smallvec::smallvec; use std::future::Future; use std::path::PathBuf; @@ -303,12 +304,38 @@ impl ShortcodeResolveTransform { /// Resolve a shortcode using the appropriate handler. /// /// Priority: built-in Rust handlers > loaded Lua handlers > extension name lookup. + /// + /// All `ShortcodeResult::Inlines`/`Blocks` outcomes flow through this single + /// funnel and are post-walked by `stamp_shortcode_anchors`, which stamps each + /// returned node with `Generated { by: shortcode(name), from: [Invocation -> ctx.source_info] }` + /// (and enriches any Lua filter-attached source_info). 
`Preserve` and `Error` + /// outcomes do not need stamping — `Preserve` becomes a literal Str via + /// `shortcode_to_literal` and `Error` becomes a visible error via + /// `make_error_inline`; both sites carry the token's `Original` source_info + /// directly. async fn resolve_shortcode( &self, shortcode: &Shortcode, ctx: &ShortcodeContext<'_>, resolution_ctx: ResolutionContext, lua_engine: &mut Option, + ) -> ShortcodeResult { + let mut result = self + .dispatch_shortcode(shortcode, ctx, resolution_ctx, lua_engine) + .await; + stamp_shortcode_anchors(&mut result, &shortcode.name, ctx.source_info); + result + } + + /// Inner dispatch — picks the handler and returns the raw result. Wrapped by + /// [`resolve_shortcode`], which post-walks the result to stamp Invocation + /// anchors. + async fn dispatch_shortcode( + &self, + shortcode: &Shortcode, + ctx: &ShortcodeContext<'_>, + resolution_ctx: ResolutionContext, + lua_engine: &mut Option, ) -> ShortcodeResult { // Handle escaped shortcodes - preserve as literal text if shortcode.is_escaped { @@ -483,6 +510,292 @@ fn lua_result_to_shortcode_result( } } +/// After every shortcode handler dispatch, stamp Invocation provenance on the +/// returned nodes. Recurses into nested AST so every block and inline gets the +/// anchor. +/// +/// Enrichment rules (per Plan 6 §"Lua-shortcode enrichment"): +/// - If the existing source_info is `Generated { by: filter, ... }` (Lua's +/// `filter_source_info` auto-attach), promote `by.kind` to `"shortcode"` and +/// move the `filter_path`/`line` data fields into `lua_path`/`lua_line`, +/// then append the Invocation anchor. +/// - Otherwise, replace with a fresh `Generated { by: shortcode(name), +/// from: [Invocation] }`. 
+fn stamp_shortcode_anchors( + result: &mut ShortcodeResult, + shortcode_name: &str, + token_si: &SourceInfo, +) { + let token_arc = Arc::new(token_si.clone()); + match result { + ShortcodeResult::Inlines(inlines) => { + for inline in inlines.iter_mut() { + stamp_inline(inline, shortcode_name, &token_arc); + } + } + ShortcodeResult::Blocks(blocks) => { + for block in blocks.iter_mut() { + stamp_block(block, shortcode_name, &token_arc); + } + } + ShortcodeResult::Preserve | ShortcodeResult::Error(_) => {} + } +} + +/// Stamp the Invocation anchor on a single inline and recurse into its children. +fn stamp_inline(inline: &mut Inline, name: &str, token_arc: &Arc) { + let new_si = enrich_or_create(inline.source_info(), name, token_arc); + *inline.source_info_mut() = new_si; + match inline { + Inline::Emph(Emph { content, .. }) + | Inline::Underline(Underline { content, .. }) + | Inline::Strong(Strong { content, .. }) + | Inline::Strikeout(Strikeout { content, .. }) + | Inline::Superscript(Superscript { content, .. }) + | Inline::Subscript(Subscript { content, .. }) + | Inline::SmallCaps(SmallCaps { content, .. }) + | Inline::Insert(Insert { content, .. }) + | Inline::Delete(Delete { content, .. }) + | Inline::Highlight(Highlight { content, .. }) + | Inline::Quoted(Quoted { content, .. }) + | Inline::Cite(Cite { content, .. }) + | Inline::Link(Link { content, .. }) + | Inline::Image(Image { content, .. }) + | Inline::Span(Span { content, .. }) + | Inline::EditComment(EditComment { content, .. }) => { + for child in content.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + Inline::Note(Note { content, .. 
}) => { + for child in content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + Inline::Custom(custom) => { + for slot in custom.slots.values_mut() { + match slot { + quarto_pandoc_types::custom::Slot::Inline(i) => { + stamp_inline(i, name, token_arc); + } + quarto_pandoc_types::custom::Slot::Inlines(is) => { + for child in is.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + quarto_pandoc_types::custom::Slot::Block(b) => { + stamp_block(b, name, token_arc); + } + quarto_pandoc_types::custom::Slot::Blocks(bs) => { + for child in bs.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + } + // Leaves — no nested AST to walk. + Inline::Str(_) + | Inline::Code(_) + | Inline::Space(_) + | Inline::SoftBreak(_) + | Inline::LineBreak(_) + | Inline::Math(_) + | Inline::RawInline(_) + | Inline::Shortcode(_) + | Inline::NoteReference(_) + | Inline::Attr(_) => {} + } +} + +/// Stamp the Invocation anchor on a single block and recurse into its children. +fn stamp_block(block: &mut Block, name: &str, token_arc: &Arc) { + let new_si = enrich_or_create(block.source_info(), name, token_arc); + *block.source_info_mut() = new_si; + match block { + Block::Plain(Plain { content, .. }) | Block::Paragraph(Paragraph { content, .. }) => { + for child in content.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + Block::LineBlock(LineBlock { content, .. }) => { + for line in content.iter_mut() { + for child in line.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + } + Block::Header(Header { content, .. }) => { + for child in content.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + Block::BlockQuote(BlockQuote { content, .. }) => { + for child in content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + Block::OrderedList(OrderedList { content, .. }) + | Block::BulletList(BulletList { content, .. 
}) => { + for item in content.iter_mut() { + for child in item.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + Block::DefinitionList(DefinitionList { content, .. }) => { + for (term, defs) in content.iter_mut() { + for child in term.iter_mut() { + stamp_inline(child, name, token_arc); + } + for def in defs.iter_mut() { + for child in def.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + Block::Figure(Figure { + content, caption, .. + }) => { + for child in content.iter_mut() { + stamp_block(child, name, token_arc); + } + if let Some(short) = caption.short.as_mut() { + for child in short.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + if let Some(long) = caption.long.as_mut() { + for child in long.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + Block::Div(Div { content, .. }) => { + for child in content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + Block::Table(Table { + caption, + head, + bodies, + foot, + .. 
+ }) => { + if let Some(short) = caption.short.as_mut() { + for child in short.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + if let Some(long) = caption.long.as_mut() { + for child in long.iter_mut() { + stamp_block(child, name, token_arc); + } + } + for row in head.rows.iter_mut() { + for cell in row.cells.iter_mut() { + for child in cell.content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + for body in bodies.iter_mut() { + for row in body.body.iter_mut() { + for cell in row.cells.iter_mut() { + for child in cell.content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + for row in foot.rows.iter_mut() { + for cell in row.cells.iter_mut() { + for child in cell.content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + Block::Custom(custom) => { + for slot in custom.slots.values_mut() { + match slot { + quarto_pandoc_types::custom::Slot::Inline(i) => { + stamp_inline(i, name, token_arc); + } + quarto_pandoc_types::custom::Slot::Inlines(is) => { + for child in is.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + quarto_pandoc_types::custom::Slot::Block(b) => { + stamp_block(b, name, token_arc); + } + quarto_pandoc_types::custom::Slot::Blocks(bs) => { + for child in bs.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + } + // Leaves — no nested AST to walk. + Block::CodeBlock(_) + | Block::RawBlock(_) + | Block::HorizontalRule(_) + | Block::BlockMetadata(_) + | Block::NoteDefinitionPara(_) + | Block::NoteDefinitionFencedBlock(_) + | Block::CaptionBlock(_) => {} + } +} + +/// Build the `SourceInfo` for a freshly-resolved shortcode node. +/// +/// If the existing source_info is `Generated { by: filter, ... }` (a Lua +/// auto-attach from `filter_source_info`), promote the kind to `"shortcode"` +/// and migrate the `filter_path`/`line` data fields into `lua_path`/`lua_line`, +/// preserving the Lua-side dispatch precision alongside the new shortcode +/// context. 
Otherwise, mint a fresh `Generated { by: shortcode(name), ... }`. +/// +/// In both branches, append an Invocation anchor pointing at the shortcode +/// token's source range (`token_arc`). +/// +/// NOTE: the `filter_path`/`line` reads below are temporary. When +/// **bd-36fr9** (Lua-file registration in `SourceContext`) lands, those +/// fields move out of `by.data` and into a typed `Dispatch` anchor inside +/// `from`. This branch will then read the existing Dispatch anchor and copy +/// it alongside the Invocation. +/// +/// NOTE: **bd-129m3** (ValueSource anchor stamping for `meta` / `var` +/// shortcodes) is the integration point for appending a second anchor +/// when the metadata loader threads per-key source-info through. +fn enrich_or_create(existing: &SourceInfo, name: &str, token_arc: &Arc) -> SourceInfo { + let by = match existing { + SourceInfo::Generated { by, .. } if by.kind == "filter" => { + let lua_path = by.data.get("filter_path").cloned(); + let lua_line = by.data.get("line").cloned(); + let mut data = serde_json::json!({ "name": name }); + if let Some(p) = lua_path { + data["lua_path"] = p; + } + if let Some(l) = lua_line { + data["lua_line"] = l; + } + By { + kind: "shortcode".to_string(), + data, + } + } + _ => By::shortcode(name), + }; + SourceInfo::Generated { + by, + from: smallvec![Anchor::invocation(Arc::clone(token_arc))], + } +} + /// Extract shortcode paths from merged metadata. 
/// /// After metadata merge, `meta["shortcodes"]` contains an array of paths @@ -656,7 +969,8 @@ fn resolve_blocks<'a>( } ShortcodeResult::Error(error) => { diagnostics.push(error.diagnostic); - let error_inline = make_error_inline(&error.key); + let error_inline = + make_error_inline(&error.key, &shortcode_owned.source_info); replace_shortcode_in_block(&mut blocks[i], vec![error_inline]); i += 1; continue; @@ -911,7 +1225,8 @@ fn resolve_inlines<'a>( // Emit diagnostic diagnostics.push(error.diagnostic); // Replace with visible error (TS Quarto style) - let error_inline = make_error_inline(&error.key); + let error_inline = + make_error_inline(&error.key, &shortcode_owned.source_info); inlines[i] = error_inline; i += 1; } @@ -1027,19 +1342,29 @@ fn recurse_inline<'a>( } /// Create visible error inline: Strong("?key") -fn make_error_inline(key: &str) -> Inline { +/// +/// Both the inner Str and outer Strong carry the shortcode token's original +/// `source_info` (not `Generated`). The error region is treated as normal +/// editable user-source content — Plan 7's `is_atomic_kind()` does not fire on +/// Original, so the incremental writer Verbatim-copies the original token +/// bytes on round-trip. The Strong-wraps-Str overlap is structurally parallel +/// to the footnote `` case (Plan 7 §footnotes). +fn make_error_inline(key: &str, token_source_info: &SourceInfo) -> Inline { Inline::Strong(Strong { content: vec![Inline::Str(Str { text: format!("?{}", key), - source_info: SourceInfo::default(), + source_info: token_source_info.clone(), })], - source_info: SourceInfo::default(), + source_info: token_source_info.clone(), }) } /// Convert an escaped shortcode to literal text. /// -/// For `{{{< meta title >}}}`, this produces `{{< meta title >}}` +/// For `{{{< meta title >}}}`, this produces `{{< meta title >}}`. 
The +/// resulting `Str` carries the shortcode token's original `source_info` +/// (an Original), so Plan 7's `is_atomic_kind()` does not fire — round-trip +/// through the incremental writer verbatim-copies the source bytes. fn shortcode_to_literal(shortcode: &Shortcode) -> Inline { let mut text = String::from("{{< "); text.push_str(&shortcode.name); @@ -1106,7 +1431,7 @@ fn shortcode_to_literal(shortcode: &Shortcode) -> Inline { Inline::Str(Str { text, - source_info: SourceInfo::default(), + source_info: shortcode.source_info.clone(), }) } @@ -1329,12 +1654,16 @@ mod tests { #[test] fn test_make_error_inline() { - let inline = make_error_inline("meta:title"); + let token_si = dummy_source_info(); + let inline = make_error_inline("meta:title", &token_si); match inline { Inline::Strong(strong) => { assert_eq!(strong.content.len(), 1); + // Both layers carry the token's source_info (not Default, not Generated). + assert_eq!(&strong.source_info, &token_si); if let Inline::Str(s) = &strong.content[0] { assert_eq!(s.text, "?meta:title"); + assert_eq!(&s.source_info, &token_si); } else { panic!("Expected Str inline"); } @@ -1987,5 +2316,457 @@ mod tests { } assert!(ctx.diagnostics.is_empty()); } + + /// Plan 6 §"Lua-shortcode enrichment": when a Lua handler returns a + /// *typed* Inline (e.g. `pandoc.Str(...)`), the filter_source_info + /// auto-attach gives it `Generated { by: filter, data: { filter_path, + /// line } }`. The resolver's post-walk should then promote this to + /// `Generated { by: shortcode, data: { name, lua_path, lua_line }, + /// from: [Invocation] }` — kind promoted, fields renamed, anchor + /// appended. + #[tokio::test] + async fn lua_shortcode_typed_return_enriched_to_shortcode_kind() { + let tmp = TempDir::new().unwrap(); + // Note: pandoc.Str(...) returns a typed Lua userdata that the + // Lua engine's filter_source_info auto-attach picks up. 
+ let script_path = write_lua_script( + tmp.path(), + "typed.lua", + r#"return { typed = function(args) return pandoc.Str("Hello typed") end }"#, + ); + + let runtime = make_runtime(); + let transform = ShortcodeResolveTransform::with_lua_support( + vec![script_path.clone()], + Vec::new(), + runtime, + "html".to_string(), + ); + + let tok = token_si(); + let mut ast = Pandoc { + meta: ConfigValue::default(), + blocks: vec![Block::Paragraph(Paragraph { + content: vec![Inline::Shortcode(make_shortcode_with_si( + "typed", + vec![], + tok.clone(), + ))], + source_info: dummy_source_info(), + })], + }; + + let project = make_test_project(); + let doc = DocumentInfo::from_path("/project/doc.qmd"); + let format = Format::html(); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc, &format, &binaries); + transform.transform(&mut ast, &mut ctx).await.unwrap(); + + let Block::Paragraph(para) = &ast.blocks[0] else { + panic!("Expected Paragraph"); + }; + let Inline::Str(s) = ¶.content[0] else { + panic!("Expected resolved Str, got {:?}", ¶.content[0]); + }; + assert_eq!(s.text, "Hello typed"); + match &s.source_info { + SourceInfo::Generated { by, from } => { + // Kind promoted to "shortcode", NOT "filter". + assert_eq!( + by.kind, "shortcode", + "kind should be promoted from filter to shortcode" + ); + // Name is the shortcode name. + assert_eq!(by.data.get("name").and_then(|v| v.as_str()), Some("typed")); + // filter_path → lua_path + let lua_path = by + .data + .get("lua_path") + .and_then(|v| v.as_str()) + .expect("lua_path should be preserved from filter_path"); + assert!( + lua_path.contains("typed.lua"), + "lua_path {:?} should reference the script", + lua_path + ); + // line → lua_line + let lua_line = by + .data + .get("lua_line") + .and_then(|v| v.as_u64()) + .expect("lua_line should be preserved from line"); + assert!(lua_line >= 1, "lua_line should be positive"); + // Invocation anchor points at the token. 
+ assert_eq!(from.len(), 1); + assert_eq!(from[0].role, quarto_source_map::AnchorRole::Invocation); + assert_eq!(&*from[0].source_info, &tok); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + } + + // === Plan 6: shortcode-resolution provenance shape tests === + + /// A test handler that returns a Strong wrapping a Str — exercises + /// the multi-inline / nested-container stamping path. + struct MultiInlineTestHandler; + impl ShortcodeHandler for MultiInlineTestHandler { + fn name(&self) -> &str { + "multi" + } + fn resolve( + &self, + _shortcode: &Shortcode, + _ctx: &ShortcodeContext, + _resolution_ctx: ResolutionContext, + ) -> ShortcodeResult { + ShortcodeResult::Inlines(vec![ + Inline::Strong(Strong { + content: vec![Inline::Str(Str { + text: "Bold".into(), + source_info: SourceInfo::default(), + })], + source_info: SourceInfo::default(), + }), + Inline::Space(quarto_pandoc_types::inline::Space { + source_info: SourceInfo::default(), + }), + Inline::Str(Str { + text: "Title".into(), + source_info: SourceInfo::default(), + }), + ]) + } + } + + /// Distinct token source_info so we can check Invocation anchors + /// point at the *shortcode token*, not at the default. + fn token_si() -> SourceInfo { + SourceInfo::original(FileId(0), 100, 130) + } + + fn make_shortcode_with_si(name: &str, args: Vec<&str>, si: SourceInfo) -> Shortcode { + Shortcode { + is_escaped: false, + name: name.to_string(), + positional_args: args + .into_iter() + .map(|s| ShortcodeArg::String(s.to_string())) + .collect(), + keyword_args: hashlink::LinkedHashMap::new(), + source_info: si, + } + } + + fn make_escaped_shortcode_with_si(name: &str, si: SourceInfo) -> Shortcode { + Shortcode { + is_escaped: true, + name: name.to_string(), + positional_args: vec![], + keyword_args: hashlink::LinkedHashMap::new(), + source_info: si, + } + } + + /// Resolved Str from a meta shortcode carries + /// Generated { by: shortcode("meta"), from: [Invocation -> token_si] }. 
+ #[tokio::test] + async fn shortcode_resolution_has_generated_with_invocation_anchor() { + let transform = ShortcodeResolveTransform::new(); + let tok = token_si(); + let mut ast = Pandoc { + meta: ConfigValue::new_map( + vec![make_map_entry( + "title", + ConfigValue::new_string("Test Title", dummy_source_info()), + )], + dummy_source_info(), + ), + blocks: vec![Block::Paragraph(Paragraph { + content: vec![Inline::Shortcode(make_shortcode_with_si( + "meta", + vec!["title"], + tok.clone(), + ))], + source_info: dummy_source_info(), + })], + }; + + let project = make_test_project(); + let doc = DocumentInfo::from_path("/project/doc.qmd"); + let format = Format::html(); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc, &format, &binaries); + transform.transform(&mut ast, &mut ctx).await.unwrap(); + + let Block::Paragraph(para) = &ast.blocks[0] else { + panic!("Expected Paragraph"); + }; + let Inline::Str(s) = ¶.content[0] else { + panic!("Expected resolved Str"); + }; + assert_eq!(s.text, "Test Title"); + match &s.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "shortcode"); + assert_eq!(by.data.get("name").and_then(|v| v.as_str()), Some("meta")); + assert_eq!(from.len(), 1); + assert_eq!(from[0].role, quarto_source_map::AnchorRole::Invocation); + assert_eq!(&*from[0].source_info, &tok); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Multi-inline resolution (Strong[Str], Space, Str) — every node gets + /// stamped with the same Invocation anchor source_info. 
+ #[tokio::test] + async fn multi_inline_shortcode_resolution_shares_invocation_source() { + let mut transform = ShortcodeResolveTransform::new(); + transform.handlers.push(Box::new(MultiInlineTestHandler)); + let tok = token_si(); + let mut ast = Pandoc { + meta: ConfigValue::default(), + blocks: vec![Block::Paragraph(Paragraph { + content: vec![Inline::Shortcode(make_shortcode_with_si( + "multi", + vec![], + tok.clone(), + ))], + source_info: dummy_source_info(), + })], + }; + + let project = make_test_project(); + let doc = DocumentInfo::from_path("/project/doc.qmd"); + let format = Format::html(); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc, &format, &binaries); + transform.transform(&mut ast, &mut ctx).await.unwrap(); + + let Block::Paragraph(para) = &ast.blocks[0] else { + panic!("Expected Paragraph"); + }; + assert_eq!(para.content.len(), 3); + + // Helper: extract the Invocation source_info from an inline. + fn invocation_si(inline: &Inline) -> &SourceInfo { + match inline.source_info() { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "shortcode", "Got by.kind = {:?}", by.kind); + assert_eq!(from.len(), 1); + assert_eq!(from[0].role, quarto_source_map::AnchorRole::Invocation); + &from[0].source_info + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + let strong_si = invocation_si(¶.content[0]); + let space_si = invocation_si(¶.content[1]); + let str_si = invocation_si(¶.content[2]); + assert_eq!(strong_si, &tok); + assert_eq!(space_si, &tok); + assert_eq!(str_si, &tok); + // The Strong's inner Str must also be stamped. + let Inline::Strong(strong) = ¶.content[0] else { + panic!("Expected Strong"); + }; + let inner_si = invocation_si(&strong.content[0]); + assert_eq!(inner_si, &tok); + } + + /// Escaped shortcode resolves to a literal Str whose source_info is + /// the token's Original (NOT Generated) — Plan 7's is_atomic_kind() + /// does not fire on round-trip. 
+ #[tokio::test] + async fn escaped_shortcode_keeps_original_source_info() { + let transform = ShortcodeResolveTransform::new(); + let tok = token_si(); + let mut ast = Pandoc { + meta: ConfigValue::default(), + blocks: vec![Block::Paragraph(Paragraph { + content: vec![Inline::Shortcode(make_escaped_shortcode_with_si( + "meta", + tok.clone(), + ))], + source_info: dummy_source_info(), + })], + }; + + let project = make_test_project(); + let doc = DocumentInfo::from_path("/project/doc.qmd"); + let format = Format::html(); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc, &format, &binaries); + transform.transform(&mut ast, &mut ctx).await.unwrap(); + + let Block::Paragraph(para) = &ast.blocks[0] else { + panic!("Expected Paragraph"); + }; + let Inline::Str(s) = ¶.content[0] else { + panic!("Expected literal Str"); + }; + // Source_info is Original (the token's bytes), not Generated. + match &s.source_info { + SourceInfo::Original { .. } => {} + other => panic!("Expected Original, got {:?}", other), + } + assert_eq!(&s.source_info, &tok); + } + + /// Unknown shortcode resolves to Strong[Str("?name")] with both + /// layers carrying the token's Original source_info (NOT Generated, + /// NOT Default). 
+ #[tokio::test] + async fn unknown_shortcode_error_uses_token_source_info() { + let transform = ShortcodeResolveTransform::new(); + let tok = token_si(); + let mut ast = Pandoc { + meta: ConfigValue::default(), + blocks: vec![Block::Paragraph(Paragraph { + content: vec![Inline::Shortcode(make_shortcode_with_si( + "bogus", + vec![], + tok.clone(), + ))], + source_info: dummy_source_info(), + })], + }; + + let project = make_test_project(); + let doc = DocumentInfo::from_path("/project/doc.qmd"); + let format = Format::html(); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc, &format, &binaries); + transform.transform(&mut ast, &mut ctx).await.unwrap(); + + let Block::Paragraph(para) = &ast.blocks[0] else { + panic!("Expected Paragraph"); + }; + let Inline::Strong(strong) = ¶.content[0] else { + panic!("Expected Strong"); + }; + assert!(matches!(strong.source_info, SourceInfo::Original { .. })); + assert_eq!(&strong.source_info, &tok); + let Inline::Str(inner) = &strong.content[0] else { + panic!("Expected inner Str"); + }; + assert!(matches!(inner.source_info, SourceInfo::Original { .. })); + assert_eq!(&inner.source_info, &tok); + assert_eq!(inner.text, "?bogus"); + } + + /// Plan 6 source_info-determinism: running the transform twice on + /// the same input produces structurally-identical ASTs (every + /// Generated.by, every Generated.from[], and every Original + /// SourceInfo is ==-equal across runs). 
+ #[tokio::test] + async fn shortcode_resolution_is_deterministic() { + async fn run_once() -> Pandoc { + let mut transform = ShortcodeResolveTransform::new(); + transform.handlers.push(Box::new(MultiInlineTestHandler)); + let tok = token_si(); + let mut ast = Pandoc { + meta: ConfigValue::new_map( + vec![make_map_entry( + "title", + ConfigValue::new_string("Title", dummy_source_info()), + )], + dummy_source_info(), + ), + blocks: vec![Block::Paragraph(Paragraph { + content: vec![ + Inline::Shortcode(make_shortcode_with_si( + "meta", + vec!["title"], + tok.clone(), + )), + Inline::Shortcode(make_shortcode_with_si("multi", vec![], tok)), + ], + source_info: dummy_source_info(), + })], + }; + let project = make_test_project(); + let doc = DocumentInfo::from_path("/project/doc.qmd"); + let format = Format::html(); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc, &format, &binaries); + transform.transform(&mut ast, &mut ctx).await.unwrap(); + ast + } + + let a = run_once().await; + let b = run_once().await; + // Pandoc, Block, Inline, and SourceInfo all derive PartialEq — + // == compares structurally, including every Generated.by / + // Generated.from[] and every Original byte range. + assert_eq!(a, b, "Plan-6 stamper must be deterministic across runs"); + } + + /// Audit-completion test: after Plan 6's stamping pass, the AST + /// should contain no `Generated { by: shortcode, from: [] }` nodes + /// (the required-anchor invariant: every shortcode-resolved node + /// carries an Invocation anchor). 
+ #[tokio::test] + async fn shortcode_resolution_required_anchor_invariant() { + let mut transform = ShortcodeResolveTransform::new(); + transform.handlers.push(Box::new(MultiInlineTestHandler)); + let tok = token_si(); + let mut ast = Pandoc { + meta: ConfigValue::new_map( + vec![make_map_entry( + "title", + ConfigValue::new_string("Title", dummy_source_info()), + )], + dummy_source_info(), + ), + blocks: vec![Block::Paragraph(Paragraph { + content: vec![ + Inline::Shortcode(make_shortcode_with_si("meta", vec!["title"], tok.clone())), + Inline::Shortcode(make_shortcode_with_si("multi", vec![], tok.clone())), + ], + source_info: dummy_source_info(), + })], + }; + let project = make_test_project(); + let doc = DocumentInfo::from_path("/project/doc.qmd"); + let format = Format::html(); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc, &format, &binaries); + transform.transform(&mut ast, &mut ctx).await.unwrap(); + + // Walk every inline in the AST and assert: any + // Generated{by.kind=="shortcode"} carries at least one Invocation. + fn check_inline(inline: &Inline) { + if let SourceInfo::Generated { by, from } = inline.source_info() { + if by.kind == "shortcode" { + assert!( + from.iter() + .any(|a| a.role == quarto_source_map::AnchorRole::Invocation), + "Generated{{by:shortcode}} missing Invocation anchor" + ); + } + } + // Recurse into children for the common containers exercised here. 
+ match inline { + Inline::Strong(s) => { + for c in &s.content { + check_inline(c); + } + } + _ => {} + } + } + + for block in &ast.blocks { + if let Block::Paragraph(p) = block { + for inline in &p.content { + check_inline(inline); + } + } + } } } diff --git a/crates/quarto-core/src/transforms/theorem.rs b/crates/quarto-core/src/transforms/theorem.rs index 9db924bc5..b63d8b1e1 100644 --- a/crates/quarto-core/src/transforms/theorem.rs +++ b/crates/quarto-core/src/transforms/theorem.rs @@ -268,8 +268,8 @@ fn convert_div(mut div: Div, ref_type: &str, kind: &str) -> CustomNode { // Extract title: // 1. `name=` attribute on the Div (Q1 convention). // 2. First Header child, if present. - let title: Option = - extract_name_attr(&mut div.attr).or_else(|| extract_first_header_title(&mut div.content)); + let title: Option = extract_name_attr(&mut div.attr, &div.attr_source) + .or_else(|| extract_first_header_title(&mut div.content)); // Strip the theorem class so downstream transforms don't re-match. div.attr @@ -301,8 +301,41 @@ fn convert_div(mut div: Div, ref_type: &str, kind: &str) -> CustomNode { /// `vec![Str("Pythagoras")]`. Inline markup inside the title (bold, /// italic, etc.) isn't supported today because attribute values are /// bare strings in Pandoc's data model — matching Q1's behavior. -fn extract_name_attr(attr: &mut Attr) -> Option { +/// +/// The returned `Str` carries the attribute value's parser-recorded +/// source range (an `Original` covering the bytes between the `=` and +/// the matching quote / whitespace) so attribution and the incremental +/// writer can resolve the title back to user-editable bytes. +/// +/// Uses `AttrSourceInfo`'s positional-alignment invariant (see +/// `crates/quarto-pandoc-types/src/attr.rs`) to find the value's +/// `SourceInfo`; falls back to `SourceInfo::default()` if alignment +/// fails (bd-3aolj / bd-1e6a5) so production never panics. 
+fn extract_name_attr(attr: &mut Attr, attr_source: &AttrSourceInfo) -> Option { let (_id, _classes, kvs) = attr; + + // Find the positional index of "name" before removing it so we can + // index into attr_source.attributes (which is parallel to kvs in + // insertion order). + let name_idx = kvs.keys().position(|k| k == "name")?; + + // Validate the positional-alignment invariant. An empty `attr_source` + // signals "no provenance available" (common pattern in tests that + // construct theorem divs by hand) — that case isn't a bug, so don't + // assert. Only assert when `attr_source.attributes` is populated but + // misaligned with `kvs` (the bd-3aolj / bd-1e6a5 parser bugs). + debug_assert!( + attr_source.attributes.is_empty() || kvs.len() == attr_source.attributes.len(), + "AttrSourceInfo.attributes is out of sync with Attr.2 (bd-3aolj / bd-1e6a5): kvs={}, attr_source={}", + kvs.len(), + attr_source.attributes.len(), + ); + let value_source = if kvs.len() == attr_source.attributes.len() { + attr_source.attributes[name_idx].1.clone() + } else { + None + }; + let name = kvs.remove("name")?; if name.is_empty() { return None; @@ -310,7 +343,7 @@ fn extract_name_attr(attr: &mut Attr) -> Option { Some(vec![quarto_pandoc_types::inline::Inline::Str( quarto_pandoc_types::inline::Str { text: name, - source_info: quarto_source_map::SourceInfo::default(), + source_info: value_source.unwrap_or_default(), }, )]) } diff --git a/crates/quarto-core/src/transforms/title_block.rs b/crates/quarto-core/src/transforms/title_block.rs index 240c18d44..1def2021b 100644 --- a/crates/quarto-core/src/transforms/title_block.rs +++ b/crates/quarto-core/src/transforms/title_block.rs @@ -32,7 +32,8 @@ use quarto_pandoc_types::block::{Block, Header}; use quarto_pandoc_types::inline::{Inline, Str}; use quarto_pandoc_types::pandoc::Pandoc; use quarto_pandoc_types::{ConfigValue, ConfigValueKind}; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; 
use crate::Result; use crate::format::is_minimal_html; @@ -174,15 +175,24 @@ fn blocks_to_plain_text(blocks: &[Block]) -> String { } /// Create a level-1 header block with the given title. +/// +/// The synthesized Header (and its inner Str) carry +/// `Generated { by: title_block(), from: [] }` provenance. Both nodes are +/// atomic per Plan 4's `is_atomic_kind` set — the writer treats them as a +/// single non-editable unit on round-trip. fn create_title_header(title: &str) -> Block { + let source_info = SourceInfo::Generated { + by: By::title_block(), + from: smallvec![], + }; Block::Header(Header { level: 1, attr: empty_attr(), content: vec![Inline::Str(Str { text: title.to_string(), - source_info: SourceInfo::default(), + source_info: source_info.clone(), })], - source_info: SourceInfo::default(), + source_info, attr_source: AttrSourceInfo::empty(), }) } @@ -496,4 +506,32 @@ mod tests { let transform = TitleBlockTransform::new(); assert_eq!(transform.name(), "title-block"); } + + #[test] + fn test_create_title_header_has_generated_provenance() { + // Plan 6: the synthesized h1 + inner Str both carry + // Generated { by: title_block(), from: [] }. + let block = create_title_header("My Title"); + let Block::Header(header) = &block else { + panic!("Expected Header"); + }; + match &header.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "title-block"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + // Inner Str carries the same shape. 
+ let Inline::Str(s) = &header.content[0] else { + panic!("Expected Str inside header"); + }; + match &s.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "title-block"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } } diff --git a/crates/quarto-core/tests/fixtures/idempotence/README.md b/crates/quarto-core/tests/fixtures/idempotence/README.md new file mode 100644 index 000000000..41e8fdac8 --- /dev/null +++ b/crates/quarto-core/tests/fixtures/idempotence/README.md @@ -0,0 +1,51 @@ +# Plan 3 — idempotence fixtures + +Holds the per-fixture project directories the q2-preview idempotence +gate at `crates/quarto-core/tests/idempotence.rs` drives through the +pipeline twice and hashes for equality. + +For the contract a transform / filter / stage author must meet to +land here without breaking the gate, read +`claude-notes/instructions/idempotence-contract.md`. The full plan +that introduced the gate lives at +`claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md`. +The rules below are the ones that bite at fixture-authoring time. + +## Fixture-format rules + +1. **No executable engine cells.** Use only fenced code blocks + (`` ```python ``, `` ```r ``, etc.) — these are AST nodes, not + executed. Do NOT use `{python}` / `{r}` / `{julia}` style cells; CI + has no kernels, the `engine-execution` stage either fails or falls + through to the markdown passthrough, and the resulting two runs + are not reliably comparable. + +2. **No absolute process paths in fixture content.** Use only paths + that resolve relative to the fixture root (`./local.png`, not + `/private/var/.../local.png`). Resource-collector, include-resolve, + built-in-extension lookup, and similar transforms record paths into + meta; the built-in extensions resource bundle extracts to a + process-specific `temp_dir()`. 
Stable within a process — fine for + Plan 3's two-runs-compare contract today, but a latent issue for + any future stored-snapshot variant. + +3. **Per-fixture mode mapping.** Document-only fixtures (plain text, + callouts, theorems, code blocks, …) run in both `SingleFile` and + `ProjectOrchestrator` modes. Website-chrome fixtures (navbar, + sidebar, listings, page-nav, footer) are **orchestrator-only** + because the chrome transforms require a populated `ProjectIndex`; + driving them through `SingleFile` mode would test a partial pipeline + that doesn't exist in production. + +## What lives here + +Subdirectories named for each non-trivial fixture (typically the +website / multi-file cases that need a `_quarto.yml` plus several +sibling pages). Trivial single-page fixtures live as in-source +literals in `idempotence.rs` — the fixture's `setup` closure writes +them into a `TempDir` at run time. + +Pattern matches `tests/fixtures/websites/hub-smoke/` and +`tests/fixtures/phase5-website-baseline/`; use `copy_fixture(...)` from +`render_page_in_project.rs:616` as the lift point if a fixture grows +big enough to want a pre-built directory tree. diff --git a/crates/quarto-core/tests/idempotence.rs b/crates/quarto-core/tests/idempotence.rs new file mode 100644 index 000000000..d770048bb --- /dev/null +++ b/crates/quarto-core/tests/idempotence.rs @@ -0,0 +1,858 @@ +/* + * tests/idempotence.rs + * Copyright (c) 2026 Posit, PBC + * + * Plan 3 — q2-preview pipeline idempotence gate. + * + * Each fixture is driven through the q2-preview pipeline twice in + * each drive mode (`SingleFile` and `ProjectOrchestrator`) and the + * resulting `blocks` and `meta` (excluding `rendered.*`) hashes must + * compare equal across the two runs. 
+ * + * See: + * claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md + * + * The plan documents the long-lived-integration-branch policy: a + * fixture that surfaces real non-determinism stays failing here, and + * a beads issue (filled in from the panic message's + * `DivergencePoint`) is filed against the offending transform/stage. + * Do not `#[ignore]` a failing fixture without explicit user approval. + */ + +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use tempfile::TempDir; + +use pampa::pandoc::ASTContext; +use quarto_ast_reconcile::{ + compute_blocks_hash_fresh, compute_meta_hash_fresh_excluding_rendered, find_first_divergence, +}; +use quarto_core::format::Format; +use quarto_core::pipeline::{build_q2_preview_pipeline_stages, run_pipeline}; +use quarto_core::project::ProjectContext; +use quarto_core::project::orchestrator::{ProjectPipeline, RenderMode, project_type_for}; +use quarto_core::project::pass2_renderer::{RenderToPreviewAstRenderer, WasmPassTwoOutput}; +use quarto_core::render::{BinaryDependencies, RenderContext}; +use quarto_core::stage::DocumentAst; +use quarto_pandoc_types::Pandoc; +use quarto_source_map::SourceContext; +use quarto_system_runtime::{NativeRuntime, SystemRuntime}; + +// ─── Helpers (copied verbatim from render_page_in_project.rs) ───── +// +// Each `tests/*.rs` file is its own test binary, so sharing helpers +// between integration tests requires a `tests/common/` module that +// every test then explicitly imports. The plan rules dedup of that +// shape out of scope for Plan 3, so for now we copy these tiny +// utilities. If/when a second consumer wants them, this pair plus +// the `render_active_page_preview` body below is the natural +// extraction point. 
+ +fn write(path: &Path, contents: &str) { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).unwrap(); + } + std::fs::write(path, contents).unwrap(); +} + +fn write_bytes(path: &Path, contents: &[u8]) { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).unwrap(); + } + std::fs::write(path, contents).unwrap(); +} + +fn canonical(path: &Path) -> PathBuf { + path.canonicalize().unwrap_or_else(|_| path.to_path_buf()) +} + +// ─── Drive modes ────────────────────────────────────────────────── + +/// How a fixture is driven through the pipeline. Every fixture runs +/// once per mode; the two runs within a mode must hash equal. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum DriveMode { + /// `run_pipeline` directly with `build_q2_preview_pipeline_stages`. + /// Mirrors `render_qmd_to_preview_ast` — the lowest-level entry + /// point used by the WASM preview. + SingleFile, + /// Drives `ProjectPipeline` with + /// `RenderMode::ActivePage(active)`. Reuses the same orchestrator + /// path real `q2 preview` / hub-client takes. + ProjectOrchestrator, +} + +const BOTH_MODES: &[DriveMode] = &[DriveMode::SingleFile, DriveMode::ProjectOrchestrator]; +#[allow(dead_code)] // Used by website / orchestrator-only fixtures in Phase 4. +const ORCHESTRATOR_ONLY: &[DriveMode] = &[DriveMode::ProjectOrchestrator]; + +// ─── Fixture struct ─────────────────────────────────────────────── + +/// A single Plan-3 fixture. Each fixture owns its own `TempDir` per +/// run; `setup` writes the project contents into that root. +struct Fixture { + name: &'static str, + /// Idempotent setup callback. Receives the freshly-created + /// project root (a canonicalized `TempDir` path) and writes the + /// page contents — at minimum `/`, plus any + /// `_quarto.yml` or sibling files the fixture needs. + setup: Box, + /// The active page, relative to the project root. + active: PathBuf, + /// Which drive modes this fixture is meaningful in. 
Document-only + /// fixtures run in both modes; website-chrome fixtures are + /// orchestrator-only (chrome transforms need a populated + /// ProjectIndex). + modes: &'static [DriveMode], + /// Optional transport-shape attribution JSON. When set, both + /// `DriveMode`s install a + /// `PreBuiltAttributionProvider(json.to_string())` on the + /// `RenderContext` (single-file) or pass it to the renderer + /// via `with_attribution` (orchestrator). `None` = no provider. + attribution_json: Option<&'static str>, +} + +impl Fixture { + fn run_in_each_mode(&self) { + for &mode in self.modes { + run_fixture(self, mode); + } + } +} + +// ─── Test driver ────────────────────────────────────────────────── + +fn run_fixture(fixture: &Fixture, mode: DriveMode) { + let doc_1 = run_q2_preview(fixture, mode); + let doc_2 = run_q2_preview(fixture, mode); + + let blocks_a = compute_blocks_hash_fresh(&doc_1.ast.blocks); + let blocks_b = compute_blocks_hash_fresh(&doc_2.ast.blocks); + let meta_a = compute_meta_hash_fresh_excluding_rendered(&doc_1.ast.meta); + let meta_b = compute_meta_hash_fresh_excluding_rendered(&doc_2.ast.meta); + + if blocks_a != blocks_b || meta_a != meta_b { + let point = find_first_divergence( + &doc_1.ast.blocks, + &doc_1.ast.meta, + &doc_2.ast.blocks, + &doc_2.ast.meta, + ); + panic!( + "fixture {} ({:?}): non-idempotent\n \ + blocks: {:016x} vs {:016x}\n \ + meta: {:016x} vs {:016x}\n \ + first divergence: {:?}", + fixture.name, mode, blocks_a, blocks_b, meta_a, meta_b, point, + ); + } +} + +fn run_q2_preview(fixture: &Fixture, mode: DriveMode) -> DocumentAst { + let temp = TempDir::new().unwrap(); + let project_dir = canonical(temp.path()); + (fixture.setup)(&project_dir); + let active = canonical(&project_dir.join(&fixture.active)); + + let doc = match mode { + DriveMode::SingleFile => run_single_file(&project_dir, &active, fixture.attribution_json), + DriveMode::ProjectOrchestrator => { + run_orchestrator(&project_dir, &active, fixture.attribution_json) 
+ } + }; + drop(temp); + doc +} + +// ─── SingleFile mode ────────────────────────────────────────────── + +fn run_single_file( + _project_dir: &Path, + active: &Path, + attribution_json: Option<&'static str>, +) -> DocumentAst { + pollster::block_on(async { + let runtime: Arc = Arc::new(NativeRuntime::new()); + + // Mirror `render_active_page_preview`'s discovery dance so + // a fixture that writes a `_quarto.yml` ends up with a + // populated `project.files` rather than a single-file + // synthetic project. + let mut project = ProjectContext::discover(active, runtime.as_ref()).unwrap(); + if !project.is_single_file { + project = ProjectContext::discover(&project.dir, runtime.as_ref()).unwrap(); + } + + let doc_info = project + .files + .iter() + .find(|d| d.input == active) + .expect("active file present in discovered project") + .clone(); + + let format = Format::from_format_string("q2-preview") + .expect("q2-preview is a recognized pseudo-format"); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc_info, &format, &binaries); + if let Some(json) = attribution_json { + ctx.attribution_provider = Some(Arc::new( + quarto_core::attribution::PreBuiltAttributionProvider::new(json.to_string()), + )); + } + + let content = std::fs::read(active).unwrap(); + let stages = build_q2_preview_pipeline_stages(None, None); + let (output, _diagnostics) = run_pipeline( + &content, + &active.to_string_lossy(), + &mut ctx, + runtime, + stages, + ) + .await + .expect("q2-preview pipeline run (SingleFile mode)"); + + output + .into_document_ast() + .expect("q2-preview pipeline produces DocumentAst at its tail") + }) +} + +// ─── ProjectOrchestrator mode ───────────────────────────────────── + +fn run_orchestrator( + _project_dir: &Path, + active: &Path, + attribution_json: Option<&'static str>, +) -> DocumentAst { + let output = render_active_page_preview(active, attribution_json); + let ast_json = output + .payload + .as_ast_json() + 
.expect("orchestrator must emit Pass2Payload::AstJson"); + let mut bytes = ast_json.as_bytes(); + let (pandoc, ast_context) = + pampa::readers::json::read(&mut bytes).expect("re-parse AST JSON from orchestrator"); + pandoc_to_document_ast(pandoc, ast_context, active.to_path_buf()) +} + +/// Lifted from `crates/quarto-core/tests/render_page_in_project.rs:660`. +/// Each `tests/*.rs` is its own binary, so the helper has to be +/// duplicated rather than imported. The plan flags this as +/// acceptable for now. +fn render_active_page_preview( + active: &Path, + attribution_json: Option<&'static str>, +) -> WasmPassTwoOutput { + let runtime: Arc = Arc::new(NativeRuntime::new()); + let mut project = ProjectContext::discover(active, runtime.as_ref()).unwrap(); + if !project.is_single_file { + project = ProjectContext::discover(&project.dir, runtime.as_ref()).unwrap(); + } + + let project_type = project_type_for(&project); + let vfs_root = project.dir.join(".quarto/project-artifacts"); + // bd-rz2we: override the URL root so rendered AST link/asset + // URLs use the synthetic VFS prefix instead of the host's + // tempdir path. Disk writes still land under `vfs_root` (the + // real tempdir) via `runtime.file_write`, but the URLs + // embedded in the AST stay byte-identical across runs in + // different tempdirs — which is what this idempotence gate + // is asserting. 
+ let mut renderer = + RenderToPreviewAstRenderer::new(&vfs_root).with_url_root("/.quarto/project-artifacts"); + if let Some(json) = attribution_json { + renderer = renderer.with_attribution(json.to_string()); + } + + let format = + Format::from_format_string("q2-preview").expect("q2-preview is a recognized pseudo-format"); + + let mut pipeline = ProjectPipeline::with_renderer( + &mut project, + project_type, + format, + "q2-preview", + runtime.clone(), + renderer, + ) + .with_mode(RenderMode::ActivePage(active.to_path_buf())); + + let summary = pollster::block_on(pipeline.run()).expect("q2-preview pipeline run"); + assert!( + summary.pass1_failures.is_empty(), + "unexpected pass-1 failures: {:?}", + summary.pass1_failures, + ); + assert!( + summary.pass2_failures.is_empty(), + "unexpected pass-2 failures: {:?}", + summary.pass2_failures, + ); + assert_eq!( + summary.outputs.len(), + 1, + "ActivePage mode should produce exactly one output", + ); + summary.outputs.into_iter().next().unwrap() +} + +/// Shuffle a re-parsed `Pandoc` + `ASTContext` into the `DocumentAst` +/// shape the hashing helpers expect. The hash only reads +/// `ast.blocks` and `ast.meta`; the other `DocumentAst` fields are +/// defaulted because they're outside the contract this gate defends. +fn pandoc_to_document_ast(pandoc: Pandoc, ast_context: ASTContext, path: PathBuf) -> DocumentAst { + DocumentAst { + path, + ast: pandoc, + ast_context, + source_context: SourceContext::new(), + warnings: Vec::new(), + recorded_includes: Vec::new(), + } +} + +// ─── Convenience constructors ───────────────────────────────────── + +/// Single-file fixture: writes `content` to `/index.qmd`, +/// runs in both `SingleFile` and `ProjectOrchestrator` modes. 
+fn doc_fixture(name: &'static str, content: &'static str) -> Fixture { + Fixture { + name, + setup: Box::new(move |root: &Path| { + write(&root.join("index.qmd"), content); + }), + active: PathBuf::from("index.qmd"), + modes: BOTH_MODES, + attribution_json: None, + } +} + +// ===================================================================== +// Phase-2 smoke fixture +// ===================================================================== +// +// One minimal fixture proves the harness works end-to-end before +// Phases 3-4 (existing-fixture carry-forward, gap-closure fixtures) +// land. The fixture body is intentionally trivial — a single +// paragraph — so any failure points unambiguously at the harness, +// not at a transform. + +#[test] +fn smoke_plain_paragraph() { + doc_fixture("smoke-plain-paragraph", "hello\n").run_in_each_mode(); +} + +// ===================================================================== +// Phase 3 — carry-forward fixtures (one per transform / feature) +// ===================================================================== +// +// Each `#[test]` calls `run_in_each_mode`, which loops through +// `SingleFile` and `ProjectOrchestrator`. Failures are *expected* on +// first run for some of these — that's the whole point of the gate. +// Per Phase 5 / §"CI failure policy", leave failing fixtures failing +// and file a beads issue using the sub-agent investigation prompt +// the panic message fills in. Do NOT `#[ignore]` without explicit +// user approval. 
+ +// ─── shortcode-resolve, metadata-normalize ──────────────────────── + +#[test] +fn meta_single() { + doc_fixture("meta-single", "---\nfoo: hello\n---\n\n{{< meta foo >}}\n").run_in_each_mode(); +} + +#[test] +fn meta_markdown() { + doc_fixture( + "meta-markdown", + "---\nfoo: '**Bold** title'\n---\n\n{{< meta foo >}}\n", + ) + .run_in_each_mode(); +} + +// ─── include-expansion + shortcode-resolve ──────────────────────── + +#[test] +fn include_trivial() { + let fixture = Fixture { + name: "include-trivial", + setup: Box::new(|root: &Path| { + write(&root.join("child.qmd"), "Child content\n"); + write(&root.join("index.qmd"), "{{< include child.qmd >}}\n"); + }), + active: PathBuf::from("index.qmd"), + modes: BOTH_MODES, + attribution_json: None, + }; + fixture.run_in_each_mode(); +} + +// ─── callout (callout-resolve is excluded from q2-preview) ──────── + +#[test] +fn callout_warning() { + doc_fixture( + "callout-warning", + "::: {.callout-warning}\nBody of the callout.\n:::\n", + ) + .run_in_each_mode(); +} + +// ─── theorem-sugar ──────────────────────────────────────────────── + +#[test] +fn theorem() { + doc_fixture( + "theorem", + "::: {#thm-foo .theorem}\nThere is a theorem here.\n:::\n", + ) + .run_in_each_mode(); +} + +// ─── float-ref-target-sugar ─────────────────────────────────────── + +#[test] +fn figure_ref_target() { + // Image file is not actually opened by AST transforms; absence + // is fine for AST-level hashing. If a downstream transform + // grows a path-resolution side effect, add a tiny stub here. 
+ doc_fixture( + "figure-ref-target", + ":::: {#fig-foo}\n![cap](img.png)\n::::\n", + ) + .run_in_each_mode(); +} + +// ─── crossref-index + crossref-resolve ──────────────────────────── + +#[test] +fn crossref_to_theorem() { + doc_fixture( + "crossref-to-theorem", + "::: {#thm-foo .theorem}\nThere is a theorem here.\n:::\n\nSee @thm-foo for the proof.\n", + ) + .run_in_each_mode(); +} + +// ─── sectionize ─────────────────────────────────────────────────── + +#[test] +fn sectionize_multi() { + doc_fixture( + "sectionize-multi", + "## A\n\nBody A.\n\n### B\n\nBody B.\n\n## C\n\nBody C.\n", + ) + .run_in_each_mode(); +} + +// ─── footnotes ──────────────────────────────────────────────────── + +#[test] +fn footnotes_mixed() { + doc_fixture( + "footnotes-mixed", + "Text with inline^[an inline footnote] and reference[^foo].\n\n[^foo]: footnote body\n", + ) + .run_in_each_mode(); +} + +// ─── appendix-structure (with license meta + footnotes interaction) + +#[test] +fn appendix_license() { + doc_fixture( + "appendix-license", + "---\nlicense: CC BY\ncopyright: 2026 ACME\n---\n\nBody paragraph.\n\n::: {.appendix}\nAppendix content.\n:::\n\nReference[^a]\n\n[^a]: footnote\n", + ) + .run_in_each_mode(); +} + +// ─── combined: sectionize + callouts + shortcodes ──────────────── + +#[test] +fn combined_stress() { + doc_fixture( + "combined-stress", + "---\ntitle: '**Bold** title'\n---\n\n## A\n\n::: {.callout-warning}\nWarning: {{< meta title >}}\n:::\n\n### B\n\nMore body text.\n", + ) + .run_in_each_mode(); +} + +// ===================================================================== +// Phase 4a — gap-closure fixtures (single-file, no extra scaffolding) +// ===================================================================== + +// ─── code-block-generate, code-block-render, code-highlight ─────── + +#[test] +fn code_block_fenced() { + doc_fixture( + "code-block-fenced", + "Some text.\n\n```python\nprint(\"hello\")\n```\n", + ) + .run_in_each_mode(); +} + +// ─── 
shortcode-resolve via Lua-loaded handler ───────────────────── +// +// `{{< version >}}` returns `quarto.version` joined by dots. Lua +// state is constructed fresh per pipeline run (see plan §"Design +// decisions"), so two runs over the same input must agree. + +#[test] +fn lua_shortcode_version() { + doc_fixture("lua-shortcode-version", "version: {{< version >}}\n").run_in_each_mode(); +} + +// `{{< lipsum 3 >}}` (no `random=` kwarg) — `math.randomseed` runs +// at module load but `math.random` is never reached on this code +// path, so the output is deterministically the first 3 paragraphs +// of the canned text. The `random=true` variant is intentionally +// non-deterministic and out of scope (plan §"Noted, not actively +// tested"). + +#[test] +fn lua_shortcode_lipsum_fixed() { + doc_fixture( + "lua-shortcode-lipsum-fixed", + // The comment in-document survives as part of the markdown + // (it's an HTML comment in the parsed AST), so the seed + // observation is recorded in the fixture itself rather than + // only in this Rust source. 
+ "\n\n{{< lipsum 3 >}}\n", + ) + .run_in_each_mode(); +} + +// ─── proof-sugar ────────────────────────────────────────────────── + +#[test] +fn proof() { + doc_fixture( + "proof", + "::: {.proof}\nThe proof is left as an exercise.\n:::\n", + ) + .run_in_each_mode(); +} + +// ─── equation-label + crossref-resolve (equation branch) ────────── + +#[test] +fn equation_labeled() { + doc_fixture( + "equation-labeled", + "$$ E = mc^2 $$ {#eq-mass}\n\nSee @eq-mass for the relation.\n", + ) + .run_in_each_mode(); +} + +// ─── toc-generate, toc-render ───────────────────────────────────── + +#[test] +fn toc_on() { + doc_fixture( + "toc-on", + "---\ntoc: true\n---\n\n## A\n\nBody A.\n\n## B\n\nBody B.\n\n## C\n\nBody C.\n", + ) + .run_in_each_mode(); +} + +// ─── built-in Lua filter (video) ────────────────────────────────── +// +// `resources/extensions/quarto/video/` is embedded at compile time +// via `include_dir!` (see `crates/quarto-core/src/extension/mod.rs`) +// and auto-discovered for every `StageContext::new` call, so the +// fixture needs no scaffolding beyond declaring the filter. + +#[test] +fn video_filter_header() { + doc_fixture( + "video-filter-header", + "---\nfilters:\n - video\n---\n\n# Title {background-video=\"https://www.youtube.com/embed/abc\"}\n", + ) + .run_in_each_mode(); +} + +// ─── table-bootstrap-class ──────────────────────────────────────── + +#[test] +fn table_bootstrap_class() { + doc_fixture("table-bootstrap-class", "| col |\n| --- |\n| val |\n").run_in_each_mode(); +} + +// ─── compile-theme-css stage ────────────────────────────────────── +// +// Default theme. The `theme:` key isn't required to opt the stage +// in; `compile-theme-css` runs in the q2-preview stage list for +// HTML-shaped formats unconditionally. Hash excludes `rendered.*`, +// so the compiled CSS (which lands under `meta.rendered.*` and may +// vary in trivial whitespace) doesn't participate. 
+ +#[test] +fn theme_bootstrap() { + doc_fixture("theme-bootstrap", "---\ntheme: cosmo\n---\n\nBody.\n").run_in_each_mode(); +} + +// ===================================================================== +// Phase 4b — gap-closure fixtures (multi-file) +// ===================================================================== + +// ─── include-resolve stage ──────────────────────────────────────── + +#[test] +fn include_in_header() { + let fixture = Fixture { + name: "include-in-header", + setup: Box::new(|root: &Path| { + write( + &root.join("header.html"), + "\n", + ); + write( + &root.join("index.qmd"), + "---\ninclude-in-header: header.html\n---\n\nBody paragraph.\n", + ); + }), + active: PathBuf::from("index.qmd"), + modes: BOTH_MODES, + attribution_json: None, + }; + fixture.run_in_each_mode(); +} + +// ─── resource-collector ─────────────────────────────────────────── +// +// 67-byte minimal valid PNG (1×1 transparent pixel). The AST-level +// transforms only record the path, not the bytes, but providing a +// real file means resource-collector can resolve relative to the +// fixture root rather than warning about a missing local resource. +// Per the fixtures README, paths must resolve relative to the +// project root (no absolute process paths). 
+ +const TINY_PNG: &[u8] = &[ + 0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x06, 0x00, 0x00, 0x00, 0x1f, 0x15, 0xc4, + 0x89, 0x00, 0x00, 0x00, 0x0d, 0x49, 0x44, 0x41, 0x54, 0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x01, 0x0d, 0x0a, 0x2d, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44, 0xae, + 0x42, 0x60, 0x82, +]; + +#[test] +fn resource_image() { + let fixture = Fixture { + name: "resource-image", + setup: Box::new(|root: &Path| { + write_bytes(&root.join("local.png"), TINY_PNG); + write(&root.join("index.qmd"), "![alt text](./local.png)\n"); + }), + active: PathBuf::from("index.qmd"), + modes: BOTH_MODES, + attribution_json: None, + }; + fixture.run_in_each_mode(); +} + +// ===================================================================== +// Phase 4c — website-project fixtures (orchestrator-only) +// ===================================================================== +// +// Chrome transforms (navbar / sidebar / page-nav / footer / favicon / +// bootstrap-icons / canonical-url) require a populated `ProjectIndex`, +// which the orchestrator pass-1 builds and pass-2 consumes. Driving +// a website fixture through `SingleFile` mode would test a partial +// pipeline that doesn't exist in production — so these run in +// `ProjectOrchestrator` only. 
+ +// ─── website-chrome: navbar, sidebar, page-nav, footer, favicon, +// bootstrap-icons, canonical-url, title-prefix ──────────────────── + +#[test] +fn website_chrome() { + let fixture = Fixture { + name: "website-chrome", + setup: Box::new(|root: &Path| { + write( + &root.join("_quarto.yml"), + concat!( + "project:\n", + " type: website\n", + "\n", + "website:\n", + " title: Idempotence Chrome\n", + " site-url: https://example.com/\n", + " favicon: favicon.ico\n", + " navbar:\n", + " title: Idempotence Chrome\n", + " background: primary\n", + " left:\n", + " - index.qmd\n", + " - other.qmd\n", + " sidebar:\n", + " contents:\n", + " - index.qmd\n", + " - other.qmd\n", + " page-navigation: true\n", + " page-footer: \"Copyright 2026\"\n", + ), + ); + write( + &root.join("index.qmd"), + "---\ntitle: Home\n---\n\nHome body.\n", + ); + write( + &root.join("other.qmd"), + "---\ntitle: Other\n---\n\nOther body.\n", + ); + // favicon.ico — a tiny stub so any path-resolution side + // effect resolves. Per the fixtures README rule. 
+ write_bytes(&root.join("favicon.ico"), &[0u8; 4]); + }), + active: PathBuf::from("index.qmd"), + modes: ORCHESTRATOR_ONLY, + attribution_json: None, + }; + fixture.run_in_each_mode(); +} + +// ─── website-links: cross-page .qmd body links → link-rewrite, +// link-resolution stage ─────────────────────────────────────────── + +#[test] +fn website_links() { + let fixture = Fixture { + name: "website-links", + setup: Box::new(|root: &Path| { + write( + &root.join("_quarto.yml"), + concat!( + "project:\n", + " type: website\n", + "\n", + "website:\n", + " title: Idempotence Links\n", + ), + ); + write( + &root.join("index.qmd"), + "---\ntitle: Home\n---\n\nSee [the other page](other.qmd) for more.\n", + ); + write( + &root.join("other.qmd"), + "---\ntitle: Other\n---\n\nReturn to [home](index.qmd).\n", + ); + }), + active: PathBuf::from("index.qmd"), + modes: ORCHESTRATOR_ONLY, + attribution_json: None, + }; + fixture.run_in_each_mode(); +} + +// ─── website-listing: listing-generate, listing-render, +// categories-sidebar, listing-feed-link, listing-feed-stage, +// listing-item-info stage ───────────────────────────────────────── + +#[test] +fn website_listing() { + let fixture = Fixture { + name: "website-listing", + setup: Box::new(|root: &Path| { + write( + &root.join("_quarto.yml"), + concat!( + "project:\n", + " type: website\n", + "\n", + "website:\n", + " title: Idempotence Listing\n", + " site-url: https://example.com/\n", + ), + ); + write( + &root.join("index.qmd"), + concat!( + "---\n", + "title: Posts\n", + "listing:\n", + " contents: posts\n", + " type: default\n", + " categories: true\n", + " feed: true\n", + "---\n", + "\n", + "Listing index.\n", + ), + ); + write( + &root.join("posts/first.qmd"), + concat!( + "---\n", + "title: First Post\n", + "categories: [alpha, beta]\n", + "date: 2026-05-01\n", + "---\n", + "\n", + "First body.\n", + ), + ); + write( + &root.join("posts/second.qmd"), + concat!( + "---\n", + "title: Second Post\n", + "categories: 
[beta]\n", + "date: 2026-05-15\n", + "---\n", + "\n", + "Second body.\n", + ), + ); + }), + active: PathBuf::from("index.qmd"), + modes: ORCHESTRATOR_ONLY, + attribution_json: None, + }; + fixture.run_in_each_mode(); +} + +// ===================================================================== +// Phase 4d — attribution fixture +// ===================================================================== +// +// Deterministic stub. `PreBuiltAttributionProvider` reads a static +// JSON payload; the writer-side machinery (`AttributionGenerateStage` +// + `AttributionRenderTransform`) then populates `format_options.json` +// and writes per-node attribution records into the AST. Using +// `GitBlameProvider` here would be flaky — depends on actual git +// history; the prebuilt path is the same shape the WASM client +// drives in production. + +/// Tiny transport-shape attribution JSON: one actor, one run +/// covering bytes 0..1024 so it overlaps anything the fixture +/// document might contain. +const STUB_ATTRIBUTION_JSON: &str = concat!( + "{", + "\"runs\":[{\"start\":0,\"end\":1024,\"actor\":\"alice\",\"time\":1700000000}],", + "\"identities\":{\"alice\":{\"name\":\"Alice\",\"color\":\"#ff0000\"}}", + "}" +); + +#[test] +fn attribution_basic() { + let fixture = Fixture { + name: "attribution-basic", + setup: Box::new(|root: &Path| { + // Plenty of bytes for the attribution run to overlap. 
+ write( + &root.join("index.qmd"), + "---\ntitle: Attributed\n---\n\n## Section\n\nA paragraph attributed to alice for the whole byte range.\n", + ); + }), + active: PathBuf::from("index.qmd"), + modes: BOTH_MODES, + attribution_json: Some(STUB_ATTRIBUTION_JSON), + }; + fixture.run_in_each_mode(); +} diff --git a/crates/quarto-core/tests/render_page_in_project.rs b/crates/quarto-core/tests/render_page_in_project.rs index bc1d17bc0..b168484d9 100644 --- a/crates/quarto-core/tests/render_page_in_project.rs +++ b/crates/quarto-core/tests/render_page_in_project.rs @@ -78,7 +78,11 @@ fn render_active_page(project_dir: &Path, active: &Path) -> WasmPassTwoOutput { let project_type = project_type_for(&project); let vfs_root = project.dir.join(".quarto/project-artifacts"); - let renderer = RenderToHtmlRenderer::new(&vfs_root); + // bd-rz2we: keep rendered HTML URLs path-independent. Disk + // writes still go to the tempdir at `vfs_root`; only the URLs + // embedded in the HTML use the synthetic VFS prefix. See the + // matching helper in `tests/idempotence.rs`. + let renderer = RenderToHtmlRenderer::new(&vfs_root).with_url_root("/.quarto/project-artifacts"); let mut pipeline = ProjectPipeline::with_renderer( &mut project, @@ -561,35 +565,42 @@ fn default_project_theme_artifact_lands_in_vfs() { let output = render_active_page(&project_dir, &active); // The HTML should embed a `` to a quarto theme CSS file - // under the synthetic vfs root. - let vfs_root = project_dir.join(".quarto/project-artifacts"); - let vfs_root_str = vfs_root.to_string_lossy().to_string(); - let needle_prefix = format!("{}/quarto/quarto-theme-", vfs_root_str); + // under the synthetic vfs URL root. bd-rz2we: native test + // helpers pass `with_url_root("/.quarto/project-artifacts")`, + // so URLs use that synthetic prefix regardless of where the + // bytes actually land on disk. 
+ let url_root = "/.quarto/project-artifacts"; + let url_needle_prefix = format!("{}/quarto/quarto-theme-", url_root); let theme_link = output .html() .lines() - .filter(|line| line.contains(&needle_prefix) && line.contains(".css")) - .next() + .find(|line| line.contains(&url_needle_prefix) && line.contains(".css")) .unwrap_or_else(|| { panic!( - "expected a theme under {}quarto/quarto-theme-…; html: {}", - vfs_root_str, + "expected a theme under {}/quarto/quarto-theme-…; html: {}", + url_root, snippet(&output.html()), ) }); - // Extract the actual CSS path from the href attribute and - // confirm the bytes landed at that path. + // Extract the URL fragment from the href attribute and translate + // it back to the on-disk path. bd-rz2we: the URL embeds the + // synthetic prefix; bytes land under the tempdir `vfs_root`. let href_start = theme_link - .find(&needle_prefix) + .find(&url_needle_prefix) .expect("needle present (filter just confirmed it)"); let after_prefix = &theme_link[href_start..]; let css_end = after_prefix .find(".css") .map(|i| href_start + i + ".css".len()) .expect("href ends with .css"); - let css_path_str = &theme_link[href_start..css_end]; - let css_path = PathBuf::from(css_path_str); + let url_str = &theme_link[href_start..css_end]; + let suffix = url_str + .strip_prefix(url_root) + .expect("URL starts with the synthetic prefix") + .trim_start_matches('/'); + let vfs_root = project_dir.join(".quarto/project-artifacts"); + let css_path = vfs_root.join(suffix); let runtime = NativeRuntime::new(); let bytes = runtime.file_read(&css_path).unwrap_or_else(|e| { diff --git a/crates/quarto-error-reporting/error_catalog.json b/crates/quarto-error-reporting/error_catalog.json index 202e9e2f2..7092e882b 100644 --- a/crates/quarto-error-reporting/error_catalog.json +++ b/crates/quarto-error-reporting/error_catalog.json @@ -524,6 +524,20 @@ "docs_url": "https://quarto.org/docs/errors/writer/Q-3-40", "since_version": "99.9.9" }, + "Q-3-42": { + "subsystem": 
"writer", + "title": "Shortcode edit dropped", + "message_template": "An edit to shortcode-resolved (or other atomic-Generated) content was reverted. The resolved text is read-only; edit the invocation token in source instead.", + "docs_url": "https://quarto.org/docs/errors/Q-3-42", + "since_version": "99.9.9" + }, + "Q-3-43": { + "subsystem": "writer", + "title": "Generated content edit dropped", + "message_template": "An edit to pipeline-generated content was reverted. The content has no editable source position in this file; edit its upstream definition (an include, metadata key, or other source) instead.", + "docs_url": "https://quarto.org/docs/errors/Q-3-43", + "since_version": "99.9.9" + }, "Q-3-50": { "subsystem": "writer", "title": "LineBlock Not Supported in ANSI Format", diff --git a/crates/quarto-error-reporting/src/diagnostic.rs b/crates/quarto-error-reporting/src/diagnostic.rs index 8ffb35a1b..d86e503dd 100644 --- a/crates/quarto-error-reporting/src/diagnostic.rs +++ b/crates/quarto-error-reporting/src/diagnostic.rs @@ -552,28 +552,6 @@ impl DiagnosticMessage { obj } - /// Extract the original file_id from a SourceInfo by traversing the mapping chain - fn extract_file_id( - source_info: &quarto_source_map::SourceInfo, - ) -> Option { - match source_info { - quarto_source_map::SourceInfo::Original { file_id, .. } => Some(*file_id), - quarto_source_map::SourceInfo::Substring { parent, .. } => { - Self::extract_file_id(parent) - } - quarto_source_map::SourceInfo::Concat { pieces } => { - // For concatenated sources, use the first piece's file_id - pieces - .first() - .and_then(|p| Self::extract_file_id(&p.source_info)) - } - quarto_source_map::SourceInfo::FilterProvenance { .. } => { - // Filter provenance doesn't have a traditional file_id - None - } - } - } - /// Wrap a file path with OSC 8 ANSI hyperlink codes for clickable terminal links. 
/// /// OSC 8 is a terminal escape sequence that creates clickable hyperlinks: @@ -671,18 +649,21 @@ impl DiagnosticMessage { const ARIADNE_UNIMPORTANT_COLOR: Color = Color::Fixed(249); // Extract file_id from the source mapping by traversing the chain - let file_id = Self::extract_file_id(main_location)?; + let file_id = main_location.root_file_id()?; let file = ctx.get_file(file_id)?; - // Get file content: use stored content for ephemeral files, or read from disk + // Get file content: use stored content for ephemeral files, or read from disk. + // In WASM (and any host with no real filesystem) the disk read fails with + // "operation not supported on this platform"; the only graceful response is + // to drop the source-context snippet. The diagnostic's code, message, and + // hints still surface — only the Ariadne visual is unavailable. let content = match &file.content { - Some(c) => c.clone(), // Ephemeral file: use stored content - None => { - // Disk-backed file: read from disk - std::fs::read_to_string(&file.path) - .unwrap_or_else(|e| panic!("Failed to read file '{}': {}", file.path, e)) - } + Some(c) => c.clone(), + None => match std::fs::read_to_string(&file.path) { + Ok(s) => s, + Err(_) => return None, + }, }; // Map the location offsets back to original file positions @@ -770,7 +751,7 @@ impl DiagnosticMessage { for detail in &self.details { if let Some(detail_loc) = &detail.location { // Extract file_id from detail location - let detail_file_id = match Self::extract_file_id(detail_loc) { + let detail_file_id = match detail_loc.root_file_id() { Some(fid) => fid, None => continue, // Skip if we can't extract file_id }; diff --git a/crates/quarto-pandoc-types/src/atomic_custom_nodes.rs b/crates/quarto-pandoc-types/src/atomic_custom_nodes.rs new file mode 100644 index 000000000..63327c8b4 --- /dev/null +++ b/crates/quarto-pandoc-types/src/atomic_custom_nodes.rs @@ -0,0 +1,63 @@ +//! 
Registry of `CustomNode` type names that q2-preview's incremental writer +//! treats as **atomic**. +//! +//! An atomic CustomNode is a single replaceable unit. Users can swap or +//! delete one wholesale via a React-side component menu, but they cannot +//! type *inside* it — there is no editable text region the writer can map +//! back to source bytes. The writer treats edits *into* an atomic +//! CustomNode as a soft-drop (Q-3-43); UseAfter on an atomic CustomNode +//! is let-user-win (the qmd writer's CustomNode arm serializes the fresh +//! `plain_data`). +//! +//! See Plan 7 §"`is_atomic_custom_node` registry" for the design and the +//! `is_editable_inside` consumer in `pampa::writers::incremental`. +//! +//! Lives in `quarto-pandoc-types` (not `quarto-core` as Plan 7 originally +//! suggested) because `pampa` consumes it and `pampa` sits below +//! `quarto-core` in the dependency graph. +//! +//! The TypeScript hand-mirror lives at +//! `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` and must +//! be kept in lockstep with this list. + +/// `CustomNode` type names that q2-preview treats as atomic. +/// +/// Today: just `"CrossrefResolvedRef"` (kept in lockstep with +/// `quarto_core::crossref::CROSSREF_RESOLVED_REF` — see the cross-check +/// test in `quarto-core::crossref`). Plan 8 will add `"IncludeExpansion"`. +/// +/// Extension-contributed atomic types are out of scope for this const; +/// a future plan adds a runtime registry sourced from `_extension.yml`. +pub const ATOMIC_CUSTOM_NODES: &[&str] = &["CrossrefResolvedRef"]; + +/// Return `true` iff `type_name` names a CustomNode the incremental +/// writer must treat as atomic. +/// +/// See [`ATOMIC_CUSTOM_NODES`] for the list and the module doc-comment +/// for what atomicity means in this context. 
+pub fn is_atomic_custom_node(type_name: &str) -> bool { + ATOMIC_CUSTOM_NODES.contains(&type_name) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn crossref_resolved_ref_is_atomic() { + assert!(is_atomic_custom_node("CrossrefResolvedRef")); + } + + #[test] + fn unknown_type_name_is_not_atomic() { + assert!(!is_atomic_custom_node("FloatRefTarget")); + assert!(!is_atomic_custom_node("Theorem")); + assert!(!is_atomic_custom_node("Callout")); + assert!(!is_atomic_custom_node("")); + } + + #[test] + fn registry_contains_crossref_resolved_ref() { + assert!(ATOMIC_CUSTOM_NODES.contains(&"CrossrefResolvedRef")); + } +} diff --git a/crates/quarto-pandoc-types/src/attr.rs b/crates/quarto-pandoc-types/src/attr.rs index 191dd409c..502b46c35 100644 --- a/crates/quarto-pandoc-types/src/attr.rs +++ b/crates/quarto-pandoc-types/src/attr.rs @@ -24,6 +24,27 @@ pub fn is_empty_attr(attr: &Attr) -> bool { /// - id: Source location of the id string (None if id is empty "") /// - classes: Source locations for each class string /// - attributes: Source locations for each key-value pair (both key and value) +/// +/// **Positional-alignment invariant** (added 2026-05-22, Plan 6): +/// `attributes[i]` is the `(key_src, val_src)` for the i-th entry in +/// `Attr.2` (`LinkedHashMap`) in **insertion order**. +/// Consumers that index into `attributes` by key position (e.g. to +/// recover the source range of a value before `kvs.remove(key)`) rely +/// on this lockstep. +/// +/// This invariant holds in the parser's main path but is **broken by +/// two preexisting code paths** (tracked separately): +/// - **bd-3aolj** — `commonmark_attribute.rs:41-49` (duplicate-key +/// handling: `LinkedHashMap::insert` updates in place while +/// `attr_source.attributes.push` always appends). +/// - **bd-1e6a5** — caption-attr-into-table merge in `section.rs` and +/// `postprocess.rs` (same root cause when caption + table keys +/// overlap). 
+/// +/// Until those fix-ups land, indexing consumers should guard with a +/// runtime length check (`kvs.len() == attr_source.attributes.len()`) +/// plus a `debug_assert_eq!` and fall back to `SourceInfo::default()` +/// on mismatch so production never panics on misaligned input. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct AttrSourceInfo { pub id: Option, diff --git a/crates/quarto-pandoc-types/src/block.rs b/crates/quarto-pandoc-types/src/block.rs index 7fde1317d..5ec384a46 100644 --- a/crates/quarto-pandoc-types/src/block.rs +++ b/crates/quarto-pandoc-types/src/block.rs @@ -64,6 +64,33 @@ impl Block { Block::Custom(b) => &b.source_info, } } + + /// Mutable counterpart to [`source_info`]. Mechanical mirror of the read + /// accessor; lets Plan-6 stamping rewrite the per-node `source_info` field + /// through the enum without holding a typed variant reference. + pub fn source_info_mut(&mut self) -> &mut quarto_source_map::SourceInfo { + match self { + Block::Plain(b) => &mut b.source_info, + Block::Paragraph(b) => &mut b.source_info, + Block::LineBlock(b) => &mut b.source_info, + Block::CodeBlock(b) => &mut b.source_info, + Block::RawBlock(b) => &mut b.source_info, + Block::BlockQuote(b) => &mut b.source_info, + Block::OrderedList(b) => &mut b.source_info, + Block::BulletList(b) => &mut b.source_info, + Block::DefinitionList(b) => &mut b.source_info, + Block::Header(b) => &mut b.source_info, + Block::HorizontalRule(b) => &mut b.source_info, + Block::Table(b) => &mut b.source_info, + Block::Figure(b) => &mut b.source_info, + Block::Div(b) => &mut b.source_info, + Block::BlockMetadata(b) => &mut b.source_info, + Block::NoteDefinitionPara(b) => &mut b.source_info, + Block::NoteDefinitionFencedBlock(b) => &mut b.source_info, + Block::CaptionBlock(b) => &mut b.source_info, + Block::Custom(b) => &mut b.source_info, + } + } } pub type Blocks = Vec; @@ -257,4 +284,17 @@ mod tests { }); assert_eq!(block.source_info(), &si); } + + #[test] + fn 
source_info_mut_round_trip_paragraph() { + let original = test_si(0, 0, 10); + let updated = test_si(9, 200, 220); + let mut block = Block::Paragraph(Paragraph { + content: vec![], + source_info: original.clone(), + }); + assert_eq!(block.source_info(), &original); + *block.source_info_mut() = updated.clone(); + assert_eq!(block.source_info(), &updated); + } } diff --git a/crates/quarto-pandoc-types/src/inline.rs b/crates/quarto-pandoc-types/src/inline.rs index 788a936d5..8b55c40c5 100644 --- a/crates/quarto-pandoc-types/src/inline.rs +++ b/crates/quarto-pandoc-types/src/inline.rs @@ -86,6 +86,42 @@ impl Inline { Inline::Custom(c) => &c.source_info, } } + + /// Mutable counterpart to [`source_info`]. Mechanical mirror of the read + /// accessor; lets Plan-6 stamping rewrite the per-node `source_info` field + /// through the enum without holding a typed variant reference. + pub fn source_info_mut(&mut self) -> &mut quarto_source_map::SourceInfo { + match self { + Inline::Str(s) => &mut s.source_info, + Inline::Emph(e) => &mut e.source_info, + Inline::Underline(u) => &mut u.source_info, + Inline::Strong(s) => &mut s.source_info, + Inline::Strikeout(s) => &mut s.source_info, + Inline::Superscript(s) => &mut s.source_info, + Inline::Subscript(s) => &mut s.source_info, + Inline::SmallCaps(s) => &mut s.source_info, + Inline::Quoted(q) => &mut q.source_info, + Inline::Cite(c) => &mut c.source_info, + Inline::Code(c) => &mut c.source_info, + Inline::Space(s) => &mut s.source_info, + Inline::SoftBreak(s) => &mut s.source_info, + Inline::LineBreak(l) => &mut l.source_info, + Inline::Math(m) => &mut m.source_info, + Inline::RawInline(r) => &mut r.source_info, + Inline::Link(l) => &mut l.source_info, + Inline::Image(i) => &mut i.source_info, + Inline::Note(n) => &mut n.source_info, + Inline::Span(s) => &mut s.source_info, + Inline::Shortcode(s) => &mut s.source_info, + Inline::NoteReference(n) => &mut n.source_info, + Inline::Attr(a) => &mut a.source_info, + Inline::Insert(i) 
=> &mut i.source_info, + Inline::Delete(d) => &mut d.source_info, + Inline::Highlight(h) => &mut h.source_info, + Inline::EditComment(e) => &mut e.source_info, + Inline::Custom(c) => &mut c.source_info, + } + } } pub type Inlines = Vec; @@ -1478,4 +1514,17 @@ mod tests { }); assert_eq!(inline.source_info(), &si); } + + #[test] + fn source_info_mut_round_trip_str() { + let original = test_si(0, 0, 5); + let updated = test_si(7, 100, 110); + let mut inline = Inline::Str(Str { + text: "hello".into(), + source_info: original.clone(), + }); + assert_eq!(inline.source_info(), &original); + *inline.source_info_mut() = updated.clone(); + assert_eq!(inline.source_info(), &updated); + } } diff --git a/crates/quarto-pandoc-types/src/lib.rs b/crates/quarto-pandoc-types/src/lib.rs index aa764ddfc..91131b37f 100644 --- a/crates/quarto-pandoc-types/src/lib.rs +++ b/crates/quarto-pandoc-types/src/lib.rs @@ -10,6 +10,7 @@ * by any crate that needs to work with Pandoc AST structures. */ +pub mod atomic_custom_nodes; pub mod attr; pub mod block; pub mod caption; @@ -23,6 +24,7 @@ pub mod shortcode; pub mod table; // Re-export commonly used types at the crate root +pub use atomic_custom_nodes::{ATOMIC_CUSTOM_NODES, is_atomic_custom_node}; pub use attr::{Attr, AttrSourceInfo, TargetSourceInfo, empty_attr, is_empty_attr}; pub use block::{ Block, BlockQuote, Blocks, BulletList, CaptionBlock, CodeBlock, DefinitionList, Div, Figure, diff --git a/crates/quarto-source-map/Cargo.toml b/crates/quarto-source-map/Cargo.toml index 5b688804c..e231a7f95 100644 --- a/crates/quarto-source-map/Cargo.toml +++ b/crates/quarto-source-map/Cargo.toml @@ -8,6 +8,8 @@ repository.workspace = true [dependencies] serde = { workspace = true, features = ["derive", "rc"] } +serde_json.workspace = true +smallvec.workspace = true [dev-dependencies] serde_json.workspace = true diff --git a/crates/quarto-source-map/src/lib.rs b/crates/quarto-source-map/src/lib.rs index ae8afa1bd..e09f26d91 100644 --- 
a/crates/quarto-source-map/src/lib.rs +++ b/crates/quarto-source-map/src/lib.rs @@ -41,6 +41,6 @@ pub mod utils; pub use context::{FileMetadata, SourceContext, SourceFile}; pub use file_info::FileInformation; pub use mapping::MappedLocation; -pub use source_info::{SourceInfo, SourcePiece}; +pub use source_info::{Anchor, AnchorRole, By, SourceInfo, SourcePiece}; pub use types::{FileId, Location, Range}; pub use utils::{line_col_to_offset, offset_to_location, range_from_offsets}; diff --git a/crates/quarto-source-map/src/mapping.rs b/crates/quarto-source-map/src/mapping.rs index c8bc1f499..c3269840c 100644 --- a/crates/quarto-source-map/src/mapping.rs +++ b/crates/quarto-source-map/src/mapping.rs @@ -65,9 +65,9 @@ impl SourceInfo { } None // Offset not found in any piece } - SourceInfo::FilterProvenance { .. } => { - // FilterProvenance doesn't have traditional byte offsets - // The location information is stored directly in the variant + SourceInfo::Generated { .. } => { + // Generated nodes have no offset-within-current-text; + // callers wanting source coordinates use resolve_byte_range. None } } diff --git a/crates/quarto-source-map/src/source_info.rs b/crates/quarto-source-map/src/source_info.rs index 91f5800af..64cafdbb8 100644 --- a/crates/quarto-source-map/src/source_info.rs +++ b/crates/quarto-source-map/src/source_info.rs @@ -2,6 +2,7 @@ use crate::types::{FileId, Range}; use serde::{Deserialize, Serialize}; +use smallvec::SmallVec; use std::sync::Arc; /// Source information tracking a location and its transformation history @@ -13,7 +14,9 @@ use std::sync::Arc; /// - Original: Points directly to a file with byte offsets /// - Substring: Points to a range within a parent SourceInfo (offsets are relative to parent) /// - Concat: Combines multiple SourceInfo pieces (preserves provenance when coalescing text) -/// - FilterProvenance: Tracks elements created by Lua filters for diagnostics +/// - Generated: Produced by a pipeline transform. 
`by` records the producer; `from` +/// records source-side anchors (empty for pure synthesis, `Invocation` for +/// shortcode-style resolutions). /// /// The Transformed variant was removed because it's not used in production code. /// Text transformations (smart quotes, em-dashes) use Original SourceInfo pointing @@ -42,18 +45,86 @@ pub enum SourceInfo { /// Used when coalescing adjacent text nodes while preserving /// the fact that they came from different source locations. Concat { pieces: Vec }, - /// Provenance from a Lua filter + /// Node produced by a pipeline transform /// - /// Used to track elements created by Lua filters for diagnostic messages. - /// Contains the filter file path and line number where the element was created. - FilterProvenance { - /// Path to the Lua filter file (from debug.getinfo source) - filter_path: String, - /// Line number in the filter where the element was created - line: usize, + /// `by` records the producer ("which transform made me"); `from` is a + /// list of typed, role-labeled source-info pointers ("which source + /// bytes contributed to me"). Empty `from` means pure synthesis + /// (sectionize wrappers, filter constructions, title-block h1). + /// An `Invocation` anchor present means there is a source-side + /// preimage (every shortcode resolution). + Generated { + by: By, + #[serde(default, skip_serializing_if = "SmallVec::is_empty")] + from: SmallVec<[Anchor; 2]>, }, } +/// Producer identity for a [`SourceInfo::Generated`] node. +/// +/// `kind` is a short, kebab-case identifier describing which transform +/// produced the node ("filter", "shortcode", "sectionize", ...). Third +/// parties should namespace as `ext//`. +/// +/// `data` is per-kind configuration that is **not** a source-info pointer. +/// Source-side anchors live in the parent `Generated.from` list, not here. +/// `Null` for kinds that don't carry per-instance data. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct By { + /// Short kind tag, kebab-case. Examples: "filter", "shortcode", + /// "sectionize", "user-edit", "title-block". + /// Third-party kinds should namespace: "ext/my-extension/foo". + pub kind: String, + + /// Per-kind configuration that is NOT a source-info pointer. + /// Anchors live in `Generated.from`, not here. + /// `Null` for kinds that don't carry per-instance data. + #[serde(default, skip_serializing_if = "serde_json::Value::is_null")] + pub data: serde_json::Value, +} + +/// Role describing what kind of source-side contribution an anchor records. +/// +/// The known roles are load-bearing — `Invocation` is what the writer's +/// preimage walk and attribution consult; `ValueSource` is diagnostic-only. +/// `Other(String)` is an open escape hatch for extension-defined roles. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum AnchorRole { + /// The user-written construct that triggered this node's creation + /// (e.g. the `{{< meta foo >}}` token in the active document). + /// Load-bearing: the writer's `preimage_in` and attribution's + /// `resolve_byte_range` consult the first anchor with this role. + /// At most one per node by convention. + Invocation, + + /// Where the VALUE this node carries was defined, when distinct + /// from the invocation site (e.g. `footer:` in `_metadata.yml` for + /// a `{{< meta footer >}}` resolution). Diagnostic-only — does not + /// affect the writer or attribution decisions in v1. + ValueSource, + + /// Extension-defined or future role we haven't enumerated. + /// String is kebab-case, namespaced (`ext//`). + /// + /// **`preimage_in` does not walk this role.** Future anchor roles + /// default to non-walked unless explicitly added to + /// [`SourceInfo::preimage_in`]'s `Generated` arm. 
Extensions adding + /// `Other("…")` should treat this as a feature: attribution data + /// attached via `Other` is not accidentally consulted by the writer's + /// byte-copying path. If a role *does* contribute to body-text + /// preimage in `target`, it must be explicitly enumerated in + /// `preimage_in`. + Other(String), +} + +/// A single typed, role-labeled source-info pointer attached to a +/// [`SourceInfo::Generated`] node. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Anchor { + pub role: AnchorRole, + pub source_info: Arc, +} + /// A piece of a concatenated source #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct SourcePiece { @@ -133,15 +204,80 @@ impl SourceInfo { } } - /// Create source info for a filter-created element + /// Create a [`SourceInfo::Generated`] with an empty anchor list. + /// + /// Use [`SourceInfo::append_anchor`] to add anchors after construction. + /// For Generated nodes that need to carry anchors at construction + /// time, build the variant directly: `SourceInfo::Generated { by, from }`. + pub fn generated(by: By) -> Self { + SourceInfo::Generated { + by, + from: SmallVec::new(), + } + } + + /// If this is a [`SourceInfo::Generated`], return the first anchor whose + /// role is [`AnchorRole::Invocation`]. + /// + /// Returns `None` otherwise (including for non-`Generated` variants). + /// By convention there is at most one `Invocation` anchor per node. + pub fn invocation_anchor(&self) -> Option<&Arc> { + match self { + SourceInfo::Generated { from, .. } => from + .iter() + .find(|a| matches!(a.role, AnchorRole::Invocation)) + .map(|a| &a.source_info), + _ => None, + } + } + + /// If this is a [`SourceInfo::Generated`], return the first anchor whose + /// role is [`AnchorRole::ValueSource`]. + /// + /// Returns `None` otherwise. By convention there is at most one + /// `ValueSource` anchor per node. 
+ pub fn value_source_anchor(&self) -> Option<&Arc> { + match self { + SourceInfo::Generated { from, .. } => from + .iter() + .find(|a| matches!(a.role, AnchorRole::ValueSource)) + .map(|a| &a.source_info), + _ => None, + } + } + + /// Iterate over every anchor in this [`SourceInfo::Generated`] whose role + /// equals `role`. + /// + /// Returns an empty iterator for non-`Generated` variants. Iteration order + /// is the append order. + pub fn anchors_with_role<'a>( + &'a self, + role: &'a AnchorRole, + ) -> Box> + 'a> { + match self { + SourceInfo::Generated { from, .. } => Box::new( + from.iter() + .filter(move |a| &a.role == role) + .map(|a| &a.source_info), + ), + _ => Box::new(std::iter::empty()), + } + } + + /// Append `(role, source_info)` to this [`SourceInfo::Generated`]'s + /// anchor list. /// - /// Used to track the provenance of elements created by Lua filters. - /// The filter_path should be the path to the filter file (from debug.getinfo source). - /// The line should be the line number where the element was created. - pub fn filter_provenance(filter_path: impl Into, line: usize) -> Self { - SourceInfo::FilterProvenance { - filter_path: filter_path.into(), - line, + /// Panics if `self` is not [`SourceInfo::Generated`]. By convention there + /// is at most one anchor per known role; appending a second anchor with + /// the same role does not replace the first — accessors that find by + /// role return the earliest match. + pub fn append_anchor(&mut self, role: AnchorRole, source_info: Arc) { + match self { + SourceInfo::Generated { from, .. } => { + from.push(Anchor { role, source_info }); + } + _ => panic!("append_anchor called on non-Generated SourceInfo"), } } @@ -173,7 +309,7 @@ impl SourceInfo { .. } => end_offset - start_offset, SourceInfo::Concat { pieces } => pieces.iter().map(|p| p.length).sum(), - SourceInfo::FilterProvenance { .. } => 0, + SourceInfo::Generated { .. 
} => 0, } } @@ -181,13 +317,13 @@ impl SourceInfo { /// /// For Original and Substring, returns the start_offset field. /// For Concat, returns 0 (the concat represents a new text starting at 0). - /// For FilterProvenance, returns 0. + /// For Generated, returns 0. pub fn start_offset(&self) -> usize { match self { SourceInfo::Original { start_offset, .. } => *start_offset, SourceInfo::Substring { start_offset, .. } => *start_offset, SourceInfo::Concat { .. } => 0, - SourceInfo::FilterProvenance { .. } => 0, + SourceInfo::Generated { .. } => 0, } } @@ -195,24 +331,26 @@ impl SourceInfo { /// /// For Original and Substring, returns the end_offset field. /// For Concat, returns the total length. - /// For FilterProvenance, returns 0. + /// For Generated, returns 0. pub fn end_offset(&self) -> usize { match self { SourceInfo::Original { end_offset, .. } => *end_offset, SourceInfo::Substring { end_offset, .. } => *end_offset, SourceInfo::Concat { .. } => self.length(), - SourceInfo::FilterProvenance { .. } => 0, + SourceInfo::Generated { .. } => 0, } } /// Chain-resolve to `(file_id, start_offset, end_offset)` in the /// root source file. /// - /// Returns `None` for `Concat` and `FilterProvenance` — these - /// don't map cleanly to a single contiguous byte range. The - /// attribution v1 sidecar relies on this contract; project-scoped - /// (v2) features that need the full chain resolver should use - /// `map_offset` against a `SourceContext` instead. + /// Returns `None` for `Concat` — Concat doesn't map cleanly to a + /// single contiguous byte range. For `Generated`, delegates to the + /// first `Invocation` anchor and recurses (`None` when no + /// `Invocation` anchor is present). The attribution v1 sidecar + /// relies on this contract; project-scoped (v2) features that need + /// the full chain resolver should use `map_offset` against a + /// `SourceContext` instead. 
pub fn resolve_byte_range(&self) -> Option<(usize, usize, usize)> { match self { SourceInfo::Original { @@ -228,7 +366,78 @@ impl SourceInfo { let (fid, parent_start, _) = parent.resolve_byte_range()?; Some((fid, parent_start + start_offset, parent_start + end_offset)) } - SourceInfo::Concat { .. } | SourceInfo::FilterProvenance { .. } => None, + SourceInfo::Concat { .. } => None, + SourceInfo::Generated { .. } => self + .invocation_anchor() + .and_then(|si| si.resolve_byte_range()), + } + } + + /// Byte range in `target` that this `SourceInfo`'s preimage covers, if any. + /// + /// This is the writer's "can I Verbatim-copy bytes from `target` for the + /// node carrying this source_info?" check. + /// + /// Semantics by variant: + /// - `Original` → `Some(start..end)` iff the file matches `target`, else `None`. + /// - `Substring` → recurse the parent; offsets compose additively. + /// - `Concat` → every piece must resolve into `target` AND the resolved + /// ranges must be byte-contiguous (no gaps, no overlaps). A gappy Concat + /// returns `None` — the writer can't Verbatim-copy a non-contiguous span. + /// - `Generated` → walk the `Invocation` anchor only via + /// [`invocation_anchor`](Self::invocation_anchor). **No other anchor + /// role is consulted** — not `ValueSource` (Plan 9), not future + /// `Dispatch` (Plan 10), not `AnchorRole::Other`. See the + /// role-asymmetry section below. + /// + /// # Role asymmetry + /// + /// `preimage_in` only walks `AnchorRole::Invocation`. This is load-bearing: + /// copying bytes from a `ValueSource` source range would emit raw YAML + /// metadata (or whatever the value lived in) into the body — a hard + /// correctness bug. The same applies to `Dispatch` (which points at Lua + /// source) and to any extension-defined `Other` role. 
+ /// + /// **Future anchor roles default to non-walked.** Extensions introducing + /// `AnchorRole::Other("…")` should treat this as a feature: their + /// attribution metadata is not accidentally consulted by the writer's + /// byte-copying path. If a role *does* contribute to body-text preimage, + /// it must be explicitly added to this function's `Generated` arm. + pub fn preimage_in(&self, target: FileId) -> Option> { + match self { + SourceInfo::Original { + file_id, + start_offset, + end_offset, + } if *file_id == target => Some(*start_offset..*end_offset), + SourceInfo::Original { .. } => None, + SourceInfo::Substring { + parent, + start_offset, + end_offset, + } => { + let parent_range = parent.preimage_in(target)?; + Some(parent_range.start + start_offset..parent_range.start + end_offset) + } + SourceInfo::Concat { pieces } => { + let ranges: Vec> = pieces + .iter() + .map(|p| p.source_info.preimage_in(target)) + .collect::>>()?; + if ranges.is_empty() { + return None; + } + if ranges.windows(2).all(|w| w[0].end == w[1].start) { + let first = ranges.first().unwrap().start; + let last = ranges.last().unwrap().end; + Some(first..last) + } else { + None + } + } + SourceInfo::Generated { .. } => self + .invocation_anchor() + .and_then(|si| si.preimage_in(target)), } } @@ -257,13 +466,229 @@ impl SourceInfo { piece.source_info.remap_file_ids(map); } } - SourceInfo::FilterProvenance { .. } => { - // No FileId inside — the filter_path is a separate string. + SourceInfo::Generated { from, .. } => { + for anchor in from { + // Arc::make_mut clones if there are other references. + let inner = Arc::make_mut(&mut anchor.source_info); + inner.remap_file_ids(map); + } + } + } + } + + /// First `FileId` reachable from this `SourceInfo`'s root. + /// + /// - `Original` → `Some(file_id)`. + /// - `Substring` → recurse parent. + /// - `Concat` → `pieces.iter().find_map(|p| p.source_info.root_file_id())` + /// (`find_map` semantics — skips Generated holes and empty pieces). 
+ /// - `Generated` → `invocation_anchor().and_then(|si| si.root_file_id())`; + /// `None` when no `Invocation` anchor is present. + pub fn root_file_id(&self) -> Option { + match self { + SourceInfo::Original { file_id, .. } => Some(*file_id), + SourceInfo::Substring { parent, .. } => parent.root_file_id(), + SourceInfo::Concat { pieces } => { + pieces.iter().find_map(|p| p.source_info.root_file_id()) + } + SourceInfo::Generated { .. } => { + self.invocation_anchor().and_then(|si| si.root_file_id()) + } + } + } + + /// Insert every `FileId` reachable from this `SourceInfo` into `out`. + /// + /// Walks every `Original`, every `Substring` parent, every `Concat` + /// piece, and every `Generated` anchor (all roles — `Invocation`, + /// `ValueSource`, `Other`). + pub fn collect_file_ids(&self, out: &mut std::collections::HashSet) { + match self { + SourceInfo::Original { file_id, .. } => { + out.insert(*file_id); + } + SourceInfo::Substring { parent, .. } => parent.collect_file_ids(out), + SourceInfo::Concat { pieces } => { + for piece in pieces { + piece.source_info.collect_file_ids(out); + } + } + SourceInfo::Generated { from, .. } => { + for anchor in from { + anchor.source_info.collect_file_ids(out); + } } } } } +impl By { + /// Producer kind for a node constructed by a Lua filter + /// (e.g. `pandoc.Str("decoration")` inside a filter callback). + /// + /// `filter_path` is the path the Lua engine reported via + /// `debug.getinfo(...).source` (with the leading "@" stripped); + /// `line` is the line number inside that file where the constructor + /// ran. Until Lua-file-registration lands (bd-36fr9), `(filter_path, + /// line)` lives in `by.data`; afterwards it migrates to a `Dispatch` + /// anchor and `by.data` shrinks to `{}`. 
+ pub fn filter(filter_path: impl Into, line: usize) -> Self { + Self { + kind: "filter".to_string(), + data: serde_json::json!({ + "filter_path": filter_path.into(), + "line": line, + }), + } + } + + /// Producer kind for the `SectionizeTransform`'s synthesized section + /// Divs. Children remain editable; the wrapper itself is structural. + pub fn sectionize() -> Self { + Self { + kind: "sectionize".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for React-constructed (user-typed) content reaching + /// the AST through the q2-preview client. + pub fn user_edit() -> Self { + Self { + kind: "user-edit".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for shortcode resolutions. + /// + /// **Invariant.** Every `Generated { by: shortcode(...), .. }` must + /// carry at least one `Invocation` anchor in `from` pointing at the + /// source token's byte range. Use only inside a `Generated` whose + /// anchor list is populated; constructing the bare shape with empty + /// `from` is rejected by Plan 6's audit-completion test and trips + /// Plan 7's writer `debug_assert!`. + pub fn shortcode(name: impl Into) -> Self { + Self { + kind: "shortcode".to_string(), + data: serde_json::json!({ "name": name.into() }), + } + } + + /// Producer kind for `IncludeStage`'s expansion wrapper. Note that + /// most include-related synthesized content keeps its `Original` + /// `source_info` (inherited from the include-line Paragraph) — this + /// kind is only used where a `Generated` is explicitly required. + pub fn include() -> Self { + Self { + kind: "include".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for the title-block stage's synthesized title `h1`. + pub fn title_block() -> Self { + Self { + kind: "title-block".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for the footnotes stage's container Div. 
+ pub fn footnotes() -> Self { + Self { + kind: "footnotes".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for the appendix-structure stage's wrapper Div. + pub fn appendix() -> Self { + Self { + kind: "appendix".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for parser-side synthetic Spaces inserted by the + /// tree-sitter post-processing pass. + pub fn tree_sitter_postprocess() -> Self { + Self { + kind: "tree-sitter-postprocess".to_string(), + data: serde_json::Value::Null, + } + } + + /// Escape-hatch constructor for any `kind` string — including built-in + /// names and extension-defined kinds (`ext//`). + /// + /// Forgery (an extension calling `By::raw("shortcode", …)` without the + /// required `Invocation` anchor) is caught downstream by Plan 6's + /// audit-completion test and Plan 7's `debug_assert!`. The convention + /// for third-party kinds is `ext//`. + pub fn raw(kind: impl Into, data: serde_json::Value) -> Self { + Self { + kind: kind.into(), + data, + } + } + + /// True if a `Generated { by: , .. }` node should be treated + /// as atomic by the incremental writer. + /// + /// Atomic nodes are produced by the pipeline and represent content + /// the user shouldn't edit through React (filter constructions, + /// shortcode resolutions, synthesized title h1, tree-sitter-inserted + /// spaces). Atomicity is determined by `kind` alone — orthogonal to + /// anchor-presence. + /// + /// Extensions that contribute new `by.kind` values are not atomic by + /// default in v1. + pub fn is_atomic_kind(&self) -> bool { + matches!( + self.kind.as_str(), + "filter" | "shortcode" | "title-block" | "tree-sitter-postprocess" + ) + } + + /// True if this `By`'s `kind` equals `kind`. + pub fn is_kind(&self, kind: &str) -> bool { + self.kind == kind + } + + /// If `self.kind == "filter"`, return `(filter_path, line)`. 
+ /// + /// Returns `None` for any other kind, or when the data payload is + /// malformed (missing or non-string `filter_path`, missing or + /// non-integer `line`). + pub fn as_filter(&self) -> Option<(&str, usize)> { + if self.kind != "filter" { + return None; + } + let path = self.data.get("filter_path")?.as_str()?; + let line = self.data.get("line")?.as_u64()? as usize; + Some((path, line)) + } +} + +impl Anchor { + /// Construct an [`AnchorRole::Invocation`] anchor. + pub fn invocation(source_info: Arc) -> Self { + Self { + role: AnchorRole::Invocation, + source_info, + } + } + + /// Construct an [`AnchorRole::ValueSource`] anchor. + pub fn value_source(source_info: Arc) -> Self { + Self { + role: AnchorRole::ValueSource, + source_info, + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -346,18 +771,401 @@ mod tests { } #[test] - fn test_remap_file_ids_filter_provenance_is_noop() { - let mut info = SourceInfo::filter_provenance("foo.lua", 42); + fn test_remap_file_ids_generated_empty_from_is_noop() { + let mut info = SourceInfo::generated(By::filter("foo.lua", 42)); info.remap_file_ids(&|_| FileId(99)); match info { - SourceInfo::FilterProvenance { filter_path, line } => { - assert_eq!(filter_path, "foo.lua"); + SourceInfo::Generated { by, from } => { + assert!(from.is_empty()); + let (path, line) = by.as_filter().unwrap(); + assert_eq!(path, "foo.lua"); assert_eq!(line, 42); } - _ => panic!("Expected FilterProvenance"), + _ => panic!("Expected Generated"), + } + } + + // ------------------------------------------------------------------------- + // Plan 4 — By / Anchor / Generated coverage + // ------------------------------------------------------------------------- + + #[test] + fn test_by_filter_builder() { + let by = By::filter("a.lua", 7); + assert_eq!(by.kind, "filter"); + assert_eq!(by.as_filter(), Some(("a.lua", 7))); + } + + #[test] + fn test_by_sectionize_builder() { + let by = By::sectionize(); + assert_eq!(by.kind, "sectionize"); + 
assert!(by.data.is_null()); + } + + #[test] + fn test_by_user_edit_builder() { + assert_eq!(By::user_edit().kind, "user-edit"); + } + + #[test] + fn test_by_shortcode_builder_records_name() { + let by = By::shortcode("meta"); + assert_eq!(by.kind, "shortcode"); + assert_eq!(by.data.get("name").and_then(|v| v.as_str()), Some("meta")); + } + + #[test] + fn test_by_include_title_footnotes_appendix_tree_sitter_builders() { + assert_eq!(By::include().kind, "include"); + assert_eq!(By::title_block().kind, "title-block"); + assert_eq!(By::footnotes().kind, "footnotes"); + assert_eq!(By::appendix().kind, "appendix"); + assert_eq!( + By::tree_sitter_postprocess().kind, + "tree-sitter-postprocess" + ); + } + + #[test] + fn test_by_raw_builder_accepts_any_kind() { + let by = By::raw("ext/my-plugin/foo", serde_json::json!({"k": 1})); + assert_eq!(by.kind, "ext/my-plugin/foo"); + assert_eq!(by.data.get("k").and_then(|v| v.as_u64()), Some(1)); + } + + #[test] + fn test_by_is_atomic_kind() { + assert!(By::filter("x.lua", 1).is_atomic_kind()); + assert!(By::shortcode("meta").is_atomic_kind()); + assert!(By::title_block().is_atomic_kind()); + assert!(By::tree_sitter_postprocess().is_atomic_kind()); + + assert!(!By::sectionize().is_atomic_kind()); + assert!(!By::user_edit().is_atomic_kind()); + assert!(!By::include().is_atomic_kind()); + assert!(!By::footnotes().is_atomic_kind()); + assert!(!By::appendix().is_atomic_kind()); + assert!(!By::raw("ext/anywhere/foo", serde_json::Value::Null).is_atomic_kind()); + } + + #[test] + fn test_by_is_kind() { + let by = By::shortcode("meta"); + assert!(by.is_kind("shortcode")); + assert!(!by.is_kind("filter")); + } + + #[test] + fn test_by_as_filter_rejects_non_filter() { + assert!(By::sectionize().as_filter().is_none()); + // Malformed filter (missing line) → None. 
+ let by = By { + kind: "filter".to_string(), + data: serde_json::json!({ "filter_path": "x.lua" }), + }; + assert!(by.as_filter().is_none()); + } + + #[test] + fn test_anchor_invocation_value_source_constructors() { + let original = Arc::new(SourceInfo::original(FileId(1), 0, 5)); + let inv = Anchor::invocation(Arc::clone(&original)); + let vs = Anchor::value_source(Arc::clone(&original)); + assert!(matches!(inv.role, AnchorRole::Invocation)); + assert!(matches!(vs.role, AnchorRole::ValueSource)); + } + + #[test] + fn test_by_json_round_trip() { + let by = By::shortcode("meta"); + let json = serde_json::to_string(&by).unwrap(); + let back: By = serde_json::from_str(&json).unwrap(); + assert_eq!(by, back); + } + + #[test] + fn test_anchor_json_round_trip() { + let anchor = Anchor::invocation(Arc::new(SourceInfo::original(FileId(2), 10, 20))); + let json = serde_json::to_string(&anchor).unwrap(); + let back: Anchor = serde_json::from_str(&json).unwrap(); + assert_eq!(anchor, back); + } + + #[test] + fn test_generated_json_round_trip_empty_from() { + let info = SourceInfo::generated(By::sectionize()); + let json = serde_json::to_string(&info).unwrap(); + let back: SourceInfo = serde_json::from_str(&json).unwrap(); + assert_eq!(info, back); + } + + #[test] + fn test_generated_json_round_trip_with_invocation_anchor() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(5), 100, 110)), + ); + let json = serde_json::to_string(&info).unwrap(); + let back: SourceInfo = serde_json::from_str(&json).unwrap(); + assert_eq!(info, back); + } + + #[test] + fn test_generated_json_round_trip_multi_anchor() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(5), 100, 110)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(7), 200, 220)), + ); + 
let json = serde_json::to_string(&info).unwrap(); + let back: SourceInfo = serde_json::from_str(&json).unwrap(); + assert_eq!(info, back); + } + + #[test] + fn test_generated_length_start_end_are_zero() { + let info = SourceInfo::generated(By::sectionize()); + assert_eq!(info.length(), 0); + assert_eq!(info.start_offset(), 0); + assert_eq!(info.end_offset(), 0); + } + + #[test] + fn test_generated_resolve_byte_range_recurses_through_substring() { + let parent = SourceInfo::original(FileId(42), 100, 200); + let sub = SourceInfo::substring(parent, 10, 20); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(sub)); + assert_eq!(info.resolve_byte_range(), Some((42, 110, 120))); + } + + #[test] + fn test_generated_resolve_byte_range_empty_returns_none() { + let info = SourceInfo::generated(By::sectionize()); + assert!(info.resolve_byte_range().is_none()); + } + + #[test] + fn test_generated_resolve_byte_range_value_source_only_returns_none() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(5), 100, 110)), + ); + assert!(info.resolve_byte_range().is_none()); + } + + #[test] + fn test_generated_remap_file_ids_walks_anchors() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(0), 0, 5)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(3), 10, 20)), + ); + info.remap_file_ids(&|id| FileId(id.0 + 10)); + match &info { + SourceInfo::Generated { from, .. } => { + assert_eq!(from.len(), 2); + match from[0].source_info.as_ref() { + SourceInfo::Original { file_id, .. } => assert_eq!(*file_id, FileId(10)), + _ => panic!("Expected Original anchor 0"), + } + match from[1].source_info.as_ref() { + SourceInfo::Original { file_id, .. 
} => assert_eq!(*file_id, FileId(13)), + _ => panic!("Expected Original anchor 1"), + } + } + _ => panic!("Expected Generated"), + } + } + + #[test] + fn test_root_file_id_per_variant() { + // Original + let original = SourceInfo::original(FileId(7), 0, 5); + assert_eq!(original.root_file_id(), Some(FileId(7))); + + // Substring → recurse parent + let sub = SourceInfo::substring(original.clone(), 0, 5); + assert_eq!(sub.root_file_id(), Some(FileId(7))); + + // Concat find_map skips Generated holes + let empty_gen = SourceInfo::generated(By::sectionize()); + let real = SourceInfo::original(FileId(42), 0, 5); + let concat = SourceInfo::concat(vec![(empty_gen, 0), (real, 5)]); + assert_eq!(concat.root_file_id(), Some(FileId(42))); + + // Generated with Invocation + let mut g = SourceInfo::generated(By::shortcode("meta")); + g.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(9), 0, 1)), + ); + assert_eq!(g.root_file_id(), Some(FileId(9))); + + // Generated with no Invocation + let mut g2 = SourceInfo::generated(By::shortcode("meta")); + g2.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(9), 0, 1)), + ); + assert_eq!(g2.root_file_id(), None); + + // Generated empty + let g3 = SourceInfo::generated(By::sectionize()); + assert_eq!(g3.root_file_id(), None); + } + + #[test] + fn test_collect_file_ids_walks_every_anchor_role() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + info.append_anchor( + AnchorRole::Other("dispatch".to_string()), + Arc::new(SourceInfo::original(FileId(3), 0, 1)), + ); + let mut out = std::collections::HashSet::new(); + info.collect_file_ids(&mut out); + assert!(out.contains(&FileId(1))); + assert!(out.contains(&FileId(2))); + assert!(out.contains(&FileId(3))); + 
assert_eq!(out.len(), 3); + } + + #[test] + fn test_collect_file_ids_walks_concat_and_substring() { + let inner = SourceInfo::original(FileId(5), 0, 100); + let sub = SourceInfo::substring(inner, 10, 20); + let other = SourceInfo::original(FileId(11), 0, 5); + let concat = SourceInfo::concat(vec![(sub, 10), (other, 5)]); + let mut out = std::collections::HashSet::new(); + concat.collect_file_ids(&mut out); + assert!(out.contains(&FileId(5))); + assert!(out.contains(&FileId(11))); + assert_eq!(out.len(), 2); + } + + #[test] + fn test_invocation_anchor_accessor() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + assert!(info.invocation_anchor().is_none()); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + assert!(info.invocation_anchor().is_none()); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + assert!(info.invocation_anchor().is_some()); + // Non-Generated returns None. 
+ assert!( + SourceInfo::original(FileId(0), 0, 0) + .invocation_anchor() + .is_none() + ); + } + + #[test] + fn test_value_source_anchor_accessor() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + assert!(info.value_source_anchor().is_none()); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + assert!(info.value_source_anchor().is_none()); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + assert!(info.value_source_anchor().is_some()); + } + + #[test] + fn test_anchors_with_role() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + info.append_anchor( + AnchorRole::Other("ext/foo".to_string()), + Arc::new(SourceInfo::original(FileId(3), 0, 1)), + ); + assert_eq!(info.anchors_with_role(&AnchorRole::Invocation).count(), 1); + assert_eq!(info.anchors_with_role(&AnchorRole::ValueSource).count(), 1); + assert_eq!( + info.anchors_with_role(&AnchorRole::Other("ext/foo".to_string())) + .count(), + 1 + ); + assert_eq!( + info.anchors_with_role(&AnchorRole::Other("missing".to_string())) + .count(), + 0 + ); + } + + #[test] + fn test_append_anchor_preserves_order() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + match info { + SourceInfo::Generated { from, .. 
} => { + assert_eq!(from.len(), 2); + assert!(matches!(from[0].role, AnchorRole::Invocation)); + assert!(matches!(from[1].role, AnchorRole::ValueSource)); + } + _ => panic!("Expected Generated"), } } + #[test] + fn test_combine_with_generated_is_zero_length_piece() { + let original = SourceInfo::original(FileId(0), 10, 20); + let generated = SourceInfo::generated(By::sectionize()); + let combined = original.combine(&generated); + match &combined { + SourceInfo::Concat { pieces } => { + assert_eq!(pieces.len(), 2); + assert_eq!(pieces[1].length, 0); + } + _ => panic!("Expected Concat"), + } + // Length of the combined value equals only the Original side. + assert_eq!(combined.length(), 10); + } + #[test] fn test_source_info_serialization() { let file_id = FileId(0); @@ -801,4 +1609,166 @@ mod tests { let deserialized: SourceInfo = serde_json::from_value(json).unwrap(); assert_eq!(combined, deserialized); } + + // ------------------------------------------------------------------------- + // Plan 7 — preimage_in accessor + // ------------------------------------------------------------------------- + + #[test] + fn test_preimage_in_original_same_file() { + let info = SourceInfo::original(FileId(0), 10, 25); + assert_eq!(info.preimage_in(FileId(0)), Some(10..25)); + } + + #[test] + fn test_preimage_in_original_different_file_returns_none() { + let info = SourceInfo::original(FileId(0), 10, 25); + assert_eq!(info.preimage_in(FileId(1)), None); + } + + #[test] + fn test_preimage_in_substring_composes_offsets() { + // Parent points at bytes 100..200 in file 0. + // Substring takes bytes 5..15 *relative to parent*. + // Preimage in file 0 should be 105..115. 
+ let parent = SourceInfo::original(FileId(0), 100, 200); + let info = SourceInfo::substring(parent, 5, 15); + assert_eq!(info.preimage_in(FileId(0)), Some(105..115)); + } + + #[test] + fn test_preimage_in_substring_different_file_returns_none() { + let parent = SourceInfo::original(FileId(0), 100, 200); + let info = SourceInfo::substring(parent, 5, 15); + assert_eq!(info.preimage_in(FileId(7)), None); + } + + #[test] + fn test_preimage_in_substring_chain() { + // Original 1000..2000 in file 0; Substring 100..500 relative; Substring 10..50 relative. + // Expected preimage in file 0: 1100 + 10 .. 1100 + 50 = 1110..1150. + let root = SourceInfo::original(FileId(0), 1000, 2000); + let mid = SourceInfo::substring(root, 100, 500); + let leaf = SourceInfo::substring(mid, 10, 50); + assert_eq!(leaf.preimage_in(FileId(0)), Some(1110..1150)); + } + + #[test] + fn test_preimage_in_concat_contiguous() { + // Two adjacent pieces of file 0: 10..15 and 15..25 → contiguous → 10..25. + let a = SourceInfo::original(FileId(0), 10, 15); + let b = SourceInfo::original(FileId(0), 15, 25); + let info = SourceInfo::concat(vec![(a, 5), (b, 10)]); + assert_eq!(info.preimage_in(FileId(0)), Some(10..25)); + } + + #[test] + fn test_preimage_in_concat_gappy_returns_none() { + // 10..15 then 20..25 → gap between 15 and 20 → None. + let a = SourceInfo::original(FileId(0), 10, 15); + let b = SourceInfo::original(FileId(0), 20, 25); + let info = SourceInfo::concat(vec![(a, 5), (b, 5)]); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_concat_overlapping_returns_none() { + // 10..20 then 15..25 → overlap → not byte-contiguous → None. 
+ let a = SourceInfo::original(FileId(0), 10, 20); + let b = SourceInfo::original(FileId(0), 15, 25); + let info = SourceInfo::concat(vec![(a, 10), (b, 10)]); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_concat_mixed_files_returns_none() { + // One piece in file 0, another in file 1 → resolving in file 0 fails + // because the file-1 piece can't be resolved. + let a = SourceInfo::original(FileId(0), 10, 15); + let b = SourceInfo::original(FileId(1), 15, 25); + let info = SourceInfo::concat(vec![(a, 5), (b, 10)]); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_generated_no_anchors_returns_none() { + // Sectionize-style wrapper, footnotes-container, etc.: Generated with + // empty `from`. No Invocation anchor → no preimage. + let info = SourceInfo::generated(By::sectionize()); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_generated_with_invocation_in_target() { + // Shortcode resolution: Generated with an Invocation anchor pointing + // at the {{< meta foo >}} token bytes. + let token = SourceInfo::original(FileId(0), 50, 70); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + assert_eq!(info.preimage_in(FileId(0)), Some(50..70)); + } + + #[test] + fn test_preimage_in_generated_with_invocation_outside_target() { + // Invocation anchor points at file 0; query asks about file 1 → None. + let token = SourceInfo::original(FileId(0), 50, 70); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + assert_eq!(info.preimage_in(FileId(1)), None); + } + + #[test] + fn test_preimage_in_generated_walks_through_substring_in_invocation() { + // Invocation anchor is itself a Substring chain. preimage_in must + // walk through it correctly. 
+ let root = SourceInfo::original(FileId(0), 100, 200); + let token = SourceInfo::substring(root, 10, 30); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + assert_eq!(info.preimage_in(FileId(0)), Some(110..130)); + } + + // ------------------------------------------------------------------------- + // Plan 7 — preimage_in role-asymmetry: only Invocation is walked. + // ------------------------------------------------------------------------- + + #[test] + fn test_preimage_in_generated_value_source_only_returns_none() { + // Plan 9-shape: Generated whose only anchor is ValueSource (points at + // YAML metadata bytes). The writer must NOT copy those bytes into the + // body — preimage_in returns None. + let meta_si = SourceInfo::original(FileId(0), 10, 25); + let mut info = SourceInfo::generated(By::appendix()); + info.append_anchor(AnchorRole::ValueSource, Arc::new(meta_si)); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_generated_other_only_returns_none() { + // Extension-defined Other role. preimage_in must not walk it. + let lua_si = SourceInfo::original(FileId(0), 10, 25); + let mut info = SourceInfo::generated(By::filter("upper.lua", 14)); + info.append_anchor( + AnchorRole::Other("ext/my-ext/dispatch".to_string()), + Arc::new(lua_si), + ); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_generated_invocation_plus_value_source_walks_invocation_only() { + // Plan 2/Plan 9 mixed shape: Invocation in file 0 + ValueSource in + // file 1. Query file 0 → Invocation resolves → Some(token range). + // Query file 1 → Invocation resolves to file 0 (not 1) → None. + // (The writer must not see the value-source range when asked about + // any file, even the file the ValueSource points into.) 
+ let token = SourceInfo::original(FileId(0), 50, 70); + let value = SourceInfo::original(FileId(1), 200, 215); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + info.append_anchor(AnchorRole::ValueSource, Arc::new(value)); + + assert_eq!(info.preimage_in(FileId(0)), Some(50..70)); + assert_eq!(info.preimage_in(FileId(1)), None); + } } diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/_quarto.yml b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/_quarto.yml new file mode 100644 index 000000000..815860fe9 --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/_quarto.yml @@ -0,0 +1,2 @@ +project: + title: Render-components write smoke diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/comment.tsx b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/comment.tsx new file mode 100644 index 000000000..cd9788cfc --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/comment.tsx @@ -0,0 +1,348 @@ +const React = window.React; +const { + Block: B +} = window.__Q2_PREVIEW_RENDERER__; + +function isComment(inline: InlineNode): boolean { + if (inline.t === 'Span' && 'c' in inline) { + const attrs = (inline as SpanInline).c[0]; + const classes = attrs[1]; + return classes.includes('quarto-edit-comment'); + } + return false; +} + +// export const Block = B +// BlockWithComments component +const splitEmoji = (string: string) => [...new Intl.Segmenter().segment(string)].map(x => x.segment) +export const Block = (args: NodeArgs) => { + const { node: block, onNavigateToDocument, setLocalAst } = args + // Gather comments from inline children if block has them + let comments: InlineNode[] = []; + let newBlock = block + if ('c' in block && block.c) { + // For Para, Plain: c is Inline[] + if ((block.t === 'Para' || block.t === 'Plain') && Array.isArray(block.c)) { + 
comments = block.c.filter(isComment); + newBlock = structuredClone(block) + newBlock.c = block.c.filter((n: any) => !isComment(n)); + } + // For Header: c is [number, [string, string[], [string, string][]], Inline[]] + else if (block.t === 'Header' && Array.isArray(block.c) && Array.isArray(block.c[2])) { + comments = block.c[2].filter(isComment); + newBlock = structuredClone(block) + //@ts-ignore + newBlock.c[2] = block.c[2].filter((n: any) => !isComment(n)); + } + } + + const commentContents = comments.map((c) => (c as SpanInline).c[1].map((o: InlineNode) => { + if (o.t === 'Str') return (o as StrInline).c; + if (o.t === 'Space') return ' '; + return ''; + }).join('')) + const reactions = commentContents.filter(c => splitEmoji(c).length === 1) + const reactionCounts = reactions.reduce((acc, emoji) => + acc.set(emoji, (acc.get(emoji) || 0) + 1), + new Map() + ); + comments = comments.filter((_, i) => splitEmoji(commentContents[i]).length !== 1) + + // Skip CommentWrapper for BulletList and OrderedList + if (block.t === 'BulletList' || block.t === 'OrderedList') { + return ; + } + + return + + ; +}; + +/** + * CommentWrapper renders children in a box and displays gathered comments + */ +const CommentWrapper = ({ children, comments, reactionCounts, setLocalAst, block }: { children: React.ReactNode, reactionCounts: Map, comments: InlineNode[], setLocalAst: (newBlock: BlockNode) => void, block: BlockNode }) => { + const [commentText, setCommentText] = React.useState(''); + const [showEmojiPicker, setShowEmojiPicker] = React.useState(false); + const [showCommentsList, setShowCommentsList] = React.useState(false); + const [isHovered, setIsHovered] = React.useState(false); + const emojiPickerRef = React.useRef(null); + const commentsListRef = React.useRef(null); + const commentInputRef = React.useRef(null); + + // Close emoji picker when clicking outside + React.useEffect(() => { + if (!showEmojiPicker) return; + + const handleClickOutside = (event: MouseEvent) => { + if 
(emojiPickerRef.current && !emojiPickerRef.current.contains(event.target as Node)) { + setShowEmojiPicker(false); + } + }; + + document.addEventListener('mousedown', handleClickOutside); + return () => { + document.removeEventListener('mousedown', handleClickOutside); + }; + }, [showEmojiPicker]); + + // Close comments list when clicking outside + React.useEffect(() => { + if (!showCommentsList) return; + + const handleClickOutside = (event: MouseEvent) => { + if (commentsListRef.current && !commentsListRef.current.contains(event.target as Node)) { + setShowCommentsList(false); + } + }; + + document.addEventListener('mousedown', handleClickOutside); + return () => { + document.removeEventListener('mousedown', handleClickOutside); + }; + }, [showCommentsList]); + + // Focus the input when comments list opens + React.useEffect(() => { + if (showCommentsList && commentInputRef.current) { + commentInputRef.current.focus(); + } + }, [showCommentsList]); + + const addComment = () => { + const newComment: SpanInline = { + t: 'Span', + c: [['', ['quarto-edit-comment'], []], [{ t: 'Str', c: commentText }]] + }; + + const newBlock = structuredClone(block); + if (newBlock.t === 'Para' || newBlock.t === 'Plain') { + (newBlock as ParaBlock | PlainBlock).c.push(newComment); + } else if (newBlock.t === 'Header') { + (newBlock as HeaderBlock).c[2].push(newComment); + } + setLocalAst(newBlock); + setCommentText('') + }; + + const addReaction = (emoji: string) => { + const newReaction: SpanInline = { + t: 'Span', + c: [['', ['quarto-edit-comment'], []], [{ t: 'Str', c: emoji }]] + }; + + const newBlock: BlockNode = structuredClone(block) as BlockNode; + if (newBlock.t === 'Para' || newBlock.t === 'Plain') { + (newBlock as ParaBlock | PlainBlock).c.push(newReaction); + } else if (newBlock.t === 'Header') { + (newBlock as HeaderBlock).c[2].push(newReaction); + } + setLocalAst(newBlock); + setShowEmojiPicker(false); + }; + + const commonEmojis = ['👍', '❤️', '😂', '🎉', '🤔', '👀', '🔥', 
'✅']; + const reactionEntries = Array.from(reactionCounts.entries()); + const hasContent = reactionEntries.length > 0 || comments.length > 0; + + return ( +
+ {children} + + {/* Container for all bubbles */} +
setIsHovered(true)} + onMouseLeave={() => setIsHovered(false)} + > + {/* Reaction count bubbles */} + {reactionEntries.map(([emoji, count]) => ( +
addReaction(emoji as string)} + onMouseEnter={(e) => e.currentTarget.style.backgroundColor = '#ededed'} + onMouseLeave={(e) => e.currentTarget.style.backgroundColor = '#dbdbdb'} + title={`Add ${emoji}`} + > + {emoji} + {count} +
+ ))} + + {/* Add reaction bubble */} +
+
setShowEmojiPicker(!showEmojiPicker)} + onMouseEnter={(e) => e.currentTarget.style.backgroundColor = '#e0f0ff'} + onMouseLeave={(e) => e.currentTarget.style.backgroundColor = showEmojiPicker ? '#e0f0ff' : '#b3d9ff'} + title="Add reaction" + > + + 🙂 +
+ + {/* Simple emoji picker */} + {showEmojiPicker && ( +
+ {commonEmojis.map(emoji => ( + addReaction(emoji)} + onMouseEnter={(e) => e.currentTarget.style.backgroundColor = '#f0f0f0'} + onMouseLeave={(e) => e.currentTarget.style.backgroundColor = 'transparent'} + > + {emoji} + + ))} +
+ )} +
+ + {/* Comments count bubble */} + {( +
+
setShowCommentsList(!showCommentsList)} + onMouseEnter={(e) => e.currentTarget.style.backgroundColor = '#e0f0ff'} + onMouseLeave={(e) => e.currentTarget.style.backgroundColor = showCommentsList ? '#e0f0ff' : '#b3d9ff'} + title={`${comments.length} comment${comments.length !== 1 ? 's' : ''}`} + > + 💬 {comments.length} +
+ + {/* Comments list popup */} + {showCommentsList && ( +
+ {comments.map((comment, i) => { + const commentContent = (comment as SpanInline).c[1] + .map((inline: InlineNode) => { + if (inline.t === 'Str') return (inline as StrInline).c; + if (inline.t === 'Space') return ' '; + return ''; + }) + .join(''); + + return ( +
+ {commentContent} +
+ ); + })} +
+ setCommentText(e.target.value)} + onKeyDown={(e) => e.key === 'Enter' && commentText && addComment()} + placeholder="Add comment" + style={{ flex: 1, padding: '4px', fontFamily: 'monospace', fontSize: '0.75rem', backgroundColor: '#f0f0f0', color: 'black', border: '1px solid #ccc', borderRadius: '4px' }} + /> + +
+
+ )} +
+ )} +
+
+ ); +}; \ No newline at end of file diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/drag.tsx b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/drag.tsx new file mode 100644 index 000000000..8f3d257a4 --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/drag.tsx @@ -0,0 +1,93 @@ +const React = window.React; +const { + renderChildren, + blockStyle +} = window.__Q2_PREVIEW_RENDERER__; + +export const Div = (args) => { + const attrs = new Map(args.node.c[0][2]) + const initialX = Number(attrs.get('x') ?? 0) + const initialY = Number(attrs.get('y') ?? 0) + + const [x, setX] = React.useState(initialX) + const [y, setY] = React.useState(initialY) + const dragStartRef = React.useRef(null) + const divRef = React.useRef(null) + + // Sync x and y when attrs change externally (not during drag) + React.useEffect(() => { + if (!dragStartRef.current) { + setX(initialX) + setY(initialY) + } + }, [initialX, initialY]) + + const getScale = (el) => { + let scale = 1 + let current = el + while (current && current !== document.body) { + const style = window.getComputedStyle(current) + const transform = style.transform + if (transform && transform !== 'none') { + const matrix = new DOMMatrix(transform) + scale *= matrix.a + } + current = current.parentElement + } + return scale + } + + React.useEffect(() => { + const handleMouseMove = (e) => { + if (dragStartRef.current) { + const scale = getScale(divRef.current) + const dx = (e.clientX - dragStartRef.current.mouseX) / scale + const dy = (e.clientY - dragStartRef.current.mouseY) / scale + setX(dragStartRef.current.startX + dx) + setY(dragStartRef.current.startY + dy) + } + } + + const handleMouseUp = () => { + if (dragStartRef.current) { + args.node.c[0][2] = [['x', x + ''], ['y', y + '']] + args.setLocalAst(args.node) + dragStartRef.current = null + } + } + + window.addEventListener('mousemove', handleMouseMove) + window.addEventListener('mouseup', 
handleMouseUp) + return () => { + window.removeEventListener('mousemove', handleMouseMove) + window.removeEventListener('mouseup', handleMouseUp) + } + }, [x, y]) + + const t = `translate(${x}px, ${y}px)` + + return
+
{ + dragStartRef.current = { + mouseX: e.clientX, + mouseY: e.clientY, + startX: x, + startY: y + } + }} + /> + {renderChildren(args)} +
+} \ No newline at end of file diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/index.qmd b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/index.qmd new file mode 100644 index 000000000..097374aa6 --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/index.qmd @@ -0,0 +1,91 @@ +--- +format: q2-preview +render-components: + - comment.tsx + - kanban.tsx +source-location: full +--- + +# Gordon's render-components demo + +This page renders under **q2-preview**, the new format that ships +real-HTML built-in components for every Pandoc base type and the +Quarto custom-node taxonomy (callouts, theorems, proofs, figures, +equations, cross-references). The TSX files loaded above only need +to declare the components that go *beyond* the built-ins. + +## What's built-in + +After q2-preview Plan 2B + 2C, the iframe ships native renderers for +every Pandoc Block (Para, Header, Code, BlockQuote, Div, Figure, +LineBlock, DefinitionList, Table, ...) and Inline (Str, Emph, Strong, +Code, Link, Image, Math, Span, Cite, Note, ...) plus all six core +Quarto custom-nodes (Callout, Theorem, Proof, FloatRefTarget, +Equation, CrossrefResolvedRef). You no longer need to fork +`html.tsx` to get a real-HTML render. + +Examples that just work without an override file: + +::: {.callout-note} +A built-in callout — no TSX needed. +::: + +::: {#thm-pythagoras .theorem} +For a right triangle with legs of length $a$ and $b$ and hypotenuse $c$, +$a^2 + b^2 = c^2$. +::: + +@thm-pythagoras states the Pythagorean identity. + +$$e = mc^2$$ {#eq-einstein} + +## What's worth overriding + +Override TSX files declare components that aren't covered by the +built-ins — domain-specific UIs that take advantage of the AST +shape Quarto's parser emits: + +* **comment.tsx** — Slack-like commenting UI keyed off + `[>> body]{.quarto-edit-comment}` inline syntax. 
+* **kanban.tsx** — drag-and-droppable columns keyed off + `::: {.kanban}` divs (uses `drag.tsx` as a helper). + +These layer onto the built-in registry via the `render-components:` +frontmatter list above. User exports of the same name as a built-in +shadow it (e.g. an export named `Callout` would replace the +built-in callout component). + +::: {.kanban} + +## todo + +* Pull comments + reactions through Automerge so they sync between + collaborators. +* Add a slide-deck override demo (q2-preview + Reveal). + +## doing + +* Render-components demo polish. + +## done + +* Fork from elliot/ and rebase for q2-preview. + +::: + +### Stable html and math + +```{=html} + +``` + +[>> 😸] + +$$ +y = mx + b +\newline +G(a_n;x)=\sum_{n=0}^\infty a_n x^{n+1}. +$$[>> 👀] diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/kanban.tsx b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/kanban.tsx new file mode 100644 index 000000000..86abe6965 --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/kanban.tsx @@ -0,0 +1,152 @@ +const React = window.React; +const { renderChildren } = window.__Q2_PREVIEW_RENDERER__; + +export const Div = (args) => { + const { node: div, setLocalAst } = args; + + // Check if this is a kanban div + const [[id, classes, attrs]] = div.c; + + if (!classes.includes('kanban')) { + return
{renderChildren(args)}
; + } + + // Parse kanban structure + const blocks = div.c[1]; + const columns = []; + let currentColumn = null; + + for (const block of blocks) { + if (block.t === 'Header' && block.c[0] === 2) { + // New column header + const title = block.c[2].map(inline => { + if (inline.t === 'Str') return inline.c; + if (inline.t === 'Space') return ' '; + return ''; + }).join(''); + + currentColumn = { title, items: [] }; + columns.push(currentColumn); + } else if (block.t === 'BulletList' && currentColumn) { + // Items for current column + const items = block.c.map(listItem => { + // Each listItem is [Block] - an array of blocks + return listItem.map(b => { + if (b.t === 'Plain' || b.t === 'Para') { + return b.c.map(inline => { + if (inline.t === 'Str') return inline.c; + if (inline.t === 'Space') return ' '; + return ''; + }).join(''); + } + return ''; + }).join(''); + }); + currentColumn.items.push(...items); + } + } + + return ; +}; + +const KanbanBoard = ({ columns, div, setLocalAst }) => { + const [draggedItem, setDraggedItem] = React.useState(null); + + const handleDragStart = (colIndex, itemIndex) => { + setDraggedItem({ colIndex, itemIndex }); + }; + + const handleDrop = (targetColIndex) => { + if (!draggedItem) return; + + const { colIndex: srcColIndex, itemIndex: srcItemIndex } = draggedItem; + if (srcColIndex === targetColIndex) { + setDraggedItem(null); + return; + } + + // Build new AST + const newColumns = columns.map((col) => ({ + ...col, + items: [...col.items] + })); + + const [movedItem] = newColumns[srcColIndex].items.splice(srcItemIndex, 1); + newColumns[targetColIndex].items.push(movedItem); + + // Reconstruct div blocks + const newBlocks = []; + for (const col of newColumns) { + // Add header + newBlocks.push({ + t: 'Header', + c: [2, ['', [], []], col.title.split(' ').flatMap(word => [{ t: 'Str', c: word },{t: 'Space'}])] + }); + + // Add bullet list if items exist + if (col.items.length > 0) { + newBlocks.push({ + t: 'BulletList', + c: 
col.items.map(itemText => [{ + t: 'Plain', + c: [{ t: 'Str', c: itemText }] + }]) + }); + } + } + + const newDiv = structuredClone(div); + newDiv.c[1] = newBlocks; + setLocalAst(newDiv); + setDraggedItem(null); + }; + + return ( +
+ {columns.map((col, colIndex) => ( +
e.preventDefault()} + onDrop={() => handleDrop(colIndex)} + style={{ + minWidth: '150px', + backgroundColor: '#fff', + borderRadius: '8px', + padding: '12px', + boxShadow: '0 2px 4px rgba(0,0,0,0.1)' + }} + > +

+ {col.title} +

+
+ {col.items.map((item, itemIndex) => ( +
handleDragStart(colIndex, itemIndex)} + style={{ + padding: '8px', + backgroundColor: '#fafafa', + border: '1px solid #e0e0e0', + borderRadius: '4px', + cursor: 'move', + fontSize: '0.875rem' + }} + > + {item} +
+ ))} +
+
+ ))} +
+ ); +}; diff --git a/crates/wasm-quarto-hub-client/Cargo.lock b/crates/wasm-quarto-hub-client/Cargo.lock index 745a3d97c..41edc6441 100644 --- a/crates/wasm-quarto-hub-client/Cargo.lock +++ b/crates/wasm-quarto-hub-client/Cargo.lock @@ -2284,6 +2284,7 @@ dependencies = [ "serde", "serde_json", "sha1", + "smallvec", "tokio", "tree-sitter", "tree-sitter-qmd", @@ -2650,6 +2651,7 @@ dependencies = [ "serde_json", "serde_yaml", "sha2 0.11.0", + "smallvec", "tempfile", "thiserror 2.0.18", "time", @@ -2832,6 +2834,8 @@ name = "quarto-source-map" version = "0.1.0" dependencies = [ "serde", + "serde_json", + "smallvec", ] [[package]] diff --git a/crates/wasm-quarto-hub-client/src/lib.rs b/crates/wasm-quarto-hub-client/src/lib.rs index 7caed5fce..26631dfdd 100644 --- a/crates/wasm-quarto-hub-client/src/lib.rs +++ b/crates/wasm-quarto-hub-client/src/lib.rs @@ -2741,34 +2741,49 @@ pub fn ast_to_qmd(ast_json: &str) -> String { /// Incrementally write a modified AST back to QMD, preserving unchanged /// portions of the original source text verbatim. /// -/// Re-parses `original_qmd` internally to obtain an AST with accurate source -/// spans, then computes a reconciliation plan against the new AST and applies -/// the incremental writer. +/// Deserializes the caller-supplied **baseline** AST (the AST whose +/// source spans line up byte-for-byte with `original_qmd`) and computes +/// a reconciliation plan against the new AST. Plan 7 removed the +/// internal re-parse: previously the bridge re-parsed `original_qmd` +/// to recover spans, which lost any provenance the host had already +/// attached to the baseline (e.g. `preimage_in` after a prior +/// incremental edit). Now the caller is responsible for the +/// baseline-tier contract. 
/// /// # Arguments /// * `original_qmd` - The original QMD source text -/// * `new_ast_json` - JSON-serialized Pandoc AST representing the modified document +/// * `baseline_ast_json` - JSON-serialized Pandoc AST whose source +/// spans correspond to `original_qmd`. **Must be the same tier as +/// `new_ast_json`** (e.g. both `parse`-tier or both +/// `parse+sugar`-tier). Mixing tiers will mis-anchor reconciliation +/// and corrupt the write. +/// * `new_ast_json` - JSON-serialized Pandoc AST representing the +/// modified document, in the same tier as `baseline_ast_json`. /// /// # Returns /// JSON: `{ "success": true, "qmd": "" }` /// or `{ "success": false, "error": "...", "diagnostics": [...] }` #[wasm_bindgen] -pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String { +pub fn incremental_write_qmd( + original_qmd: &str, + baseline_ast_json: &str, + new_ast_json: &str, +) -> String { use pampa::readers::json::read as json_read; - use pampa::wasm_entry_points::qmd_to_pandoc; use pampa::writers::incremental::incremental_write; use quarto_ast_reconcile::compute_reconciliation; - // Step 1: Parse original QMD to get AST with accurate source spans - let (original_ast, _original_context) = match qmd_to_pandoc(original_qmd.as_bytes()) { + // Step 1: Deserialize baseline AST from JSON (carries source spans + // anchored to `original_qmd` and any host-side provenance). 
+ let mut baseline_cursor = std::io::Cursor::new(baseline_ast_json.as_bytes()); + let (baseline_ast, baseline_context) = match json_read(&mut baseline_cursor) { Ok(result) => result, - Err(error_strings) => { - let error_msg = error_strings.join("\n"); + Err(e) => { return serde_json::to_string(&AstResponse { success: false, ast: None, qmd: None, - error: Some(format!("Failed to parse original QMD: {}", error_msg)), + error: Some(format!("Failed to parse baseline AST JSON: {}", e)), diagnostics: None, warnings: None, }) @@ -2777,8 +2792,8 @@ pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String { }; // Step 2: Deserialize new AST from JSON - let mut cursor = std::io::Cursor::new(new_ast_json.as_bytes()); - let (new_ast, _new_context) = match json_read(&mut cursor) { + let mut new_cursor = std::io::Cursor::new(new_ast_json.as_bytes()); + let (new_ast, _new_context) = match json_read(&mut new_cursor) { Ok(result) => result, Err(e) => { return serde_json::to_string(&AstResponse { @@ -2794,19 +2809,32 @@ pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String { }; // Step 3: Compute reconciliation plan - let plan = compute_reconciliation(&original_ast, &new_ast); + let plan = compute_reconciliation(&baseline_ast, &new_ast); // Step 4: Incremental write - match incremental_write(original_qmd, &original_ast, &new_ast, &plan) { - Ok(result_qmd) => serde_json::to_string(&AstResponse { - success: true, - ast: None, - qmd: Some(result_qmd), - error: None, - diagnostics: None, - warnings: None, - }) - .unwrap(), + match incremental_write(original_qmd, &baseline_ast, &new_ast, &plan) { + Ok((result_qmd, warnings)) => { + // Plan 7: soft-drop warnings (Q-3-42 / Q-3-43) ride alongside + // a successful write. The TS wrapper surfaces them via the + // existing `warnings` channel on `AstResponse`. 
+ let warnings_json = if warnings.is_empty() { + None + } else { + Some(diagnostics_to_json( + &warnings, + &baseline_context.source_context, + )) + }; + serde_json::to_string(&AstResponse { + success: true, + ast: None, + qmd: Some(result_qmd), + error: None, + diagnostics: None, + warnings: warnings_json, + }) + .unwrap() + } Err(diags) => { let error_msg = diags .iter() diff --git a/hub-client/changelog.md b/hub-client/changelog.md index 97e19f2dc..3c52bacb6 100644 --- a/hub-client/changelog.md +++ b/hub-client/changelog.md @@ -15,6 +15,15 @@ be in reverse chronological order (latest first). --> +### 2026-05-25 + +- [`5f2bbab0`](https://github.com/quarto-dev/q2/commits/5f2bbab0): Soft-drop warnings (Q-3-42, Q-3-43) now surface in the diagnostic panel even when the rewrite produces byte-identical output. Before, clicking +react inside a shortcode-resolved region (e.g. `{{< lipsum 3 >}}`) silently declined the edit with no visible feedback; the warning was queued for the next render but no re-render fired because nothing changed. +- [`bdcfdc53`](https://github.com/quarto-dev/q2/commits/bdcfdc53): Fix q2-preview edits silently failing with "Incremental write failed: undefined" on documents where the render pipeline produced a single top-level sectionize wrapper around the user content. The writer now recurses into non-atomic Generated wrappers (sectionize, footnotes-container, appendix-container) instead of soft-dropping the whole document. + +### 2026-05-24 + +- [`a0a4c7c8`](https://github.com/quarto-dev/q2/commits/a0a4c7c8): q2-preview edits now write back to the document. The read-only guard is gone; component-driven edits (kanban drag, future comment buttons) flow through the incremental writer using the live preview AST as the baseline, and soft-drop warnings (Q-3-42 / Q-3-43) surface in the existing diagnostics panel when an edit hits an atomic region. 
+ ### 2026-05-21 - [`6c84696d`](https://github.com/quarto-dev/q2/commits/6c84696d): Login screen and post-logout view now respect the saved `colorScheme` preference (and system `prefers-color-scheme`) instead of always rendering light on first visit and inheriting the previous session's class after logout. diff --git a/hub-client/e2e/q2-preview-render-components-write.spec.ts b/hub-client/e2e/q2-preview-render-components-write.spec.ts new file mode 100644 index 000000000..5c1fca0f3 --- /dev/null +++ b/hub-client/e2e/q2-preview-render-components-write.spec.ts @@ -0,0 +1,177 @@ +/** + * E2E repro for "Incremental write failed: undefined" on q2-preview. + * + * Sister to `q2-debug-render-components.spec.ts`, but the click here + * triggers `setLocalAst` (not local React state). That threads through + * the renderer dispatch into `ReactPreview.handleSetAst` → + * `incrementalWriteQmd` (`ts-packages/preview-runtime/src/wasmRenderer.ts`), + * which is the path the user hit while clicking a reactji button in the + * `render-components` demo. + * + * The fixture's `write-reaction.tsx` mirrors the addReaction code in + * `~/docs/demo-playground/gordon/render-components/comment.tsx`: append a + * fresh `Span.quarto-edit-comment` to the clicked Para's inline children + * and `setLocalAst(newBlock)`. The dispatch wraps that into a full AST + * (one block replaced) and feeds it as the new-AST to the WASM bridge. + * + * Expected after the bug is fixed: the write succeeds and no "Incremental + * write failed" console error fires. + * + * Current behaviour (the bug we're chasing): the bridge returns + * `{success: true, qmd: '', warnings: [Q-3-43]}` — empty document with a + * "Generated content edit dropped" warning. The wasmRenderer.ts:758 + * throw site (instrumented to distinguish this empty-qmd path) logs + * `incrementalWriteQmd failed; raw response: ...` and throws. 
+ */ + +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; +import { test, expect, type ConsoleMessage } from '@playwright/test'; +import { + bootstrapProjectSet, + createProjectOnServer, + seedProjectInBrowser, + getServerUrl, +} from './helpers/projectFactory'; + +const FIXTURE_DIR = resolve( + import.meta.dirname, + '../../crates/quarto/tests/smoke-all/q2-preview/render-components-write', +); + +const qmdContent = readFileSync(resolve(FIXTURE_DIR, 'index.qmd'), 'utf-8'); +const commentTsxContent = readFileSync( + resolve(FIXTURE_DIR, 'comment.tsx'), + 'utf-8', +); +const kanbanTsxContent = readFileSync( + resolve(FIXTURE_DIR, 'kanban.tsx'), + 'utf-8', +); +const dragTsxContent = readFileSync( + resolve(FIXTURE_DIR, 'drag.tsx'), + 'utf-8', +); +const quartoYmlContent = readFileSync( + resolve(FIXTURE_DIR, '_quarto.yml'), + 'utf-8', +); + +test.describe('q2-preview render-components write', () => { + test('clicking +react triggers setLocalAst → incremental_write_qmd without empty-qmd error', async ({ + page, + }) => { + const serverUrl = getServerUrl(); + + // Collect every console.error from the page (and its iframes). The + // instrumentation in `wasmRenderer.ts:758` emits + // `incrementalWriteQmd failed; raw response: { ... }` + // when the WASM bridge returns Ok with an empty qmd string. We assert + // no such message lands during the click → write round-trip. + const consoleErrors: string[] = []; + const consoleAll: string[] = []; + page.on('console', (msg: ConsoleMessage) => { + const loc = msg.location(); + const tag = `[${msg.type()}] ${msg.text()} @ ${loc.url}:${loc.lineNumber}`; + consoleAll.push(tag); + if (msg.type() === 'error') { + consoleErrors.push(msg.text()); + } + }); + // Surface page errors too so a thrown JS error doesn't look like a + // silent pass. + const pageErrors: string[] = []; + page.on('pageerror', (err) => { + pageErrors.push(`${err.message}\n${err.stack ?? 
''}`); + }); + + const indexDocId = await createProjectOnServer(serverUrl, [ + { + path: '_quarto.yml', + content: quartoYmlContent, + contentType: 'text', + }, + { + path: 'comment.tsx', + content: commentTsxContent, + contentType: 'text', + }, + { + path: 'kanban.tsx', + content: kanbanTsxContent, + contentType: 'text', + }, + { + path: 'drag.tsx', + content: dragTsxContent, + contentType: 'text', + }, + { + path: 'index.qmd', + content: qmdContent, + contentType: 'text', + }, + ]); + + await bootstrapProjectSet(page, serverUrl); + const localId = await seedProjectInBrowser(page, indexDocId, serverUrl); + + await page.goto( + `/#/p/${localId}/file/${encodeURIComponent('index.qmd')}`, + ); + + // The q2-preview iframe is `q2-preview.html`, distinct from the + // q2-debug iframe used by the sister spec. The user's CommentWrapper + // renders a "+ 🙂" button (title="Add reaction") next to every Para; + // clicking it opens an emoji picker, clicking an emoji calls + // addReaction → setLocalAst. + const iframe = page.frameLocator('iframe[src*="q2-preview.html"]'); + + // Wait for the iframe to render the first paragraph's CommentWrapper + // chrome — the "+ 🙂" emoji-picker open button. + const openPicker = iframe.locator('[title="Add reaction"]').first(); + try { + await expect(openPicker).toBeVisible({ timeout: 30_000 }); + } catch (e) { + console.error('--- console messages so far ---'); + for (const line of consoleAll) console.error(line); + console.error('--- page errors ---'); + for (const err of pageErrors) console.error(err); + throw e; + } + + // Open the picker, then click the 😂 emoji. Picker emoji spans + // carry no test id — locate by text. There's a 😂 in + // CommentWrapper's `commonEmojis` list (`'👍', '❤️', '😂', ...`). + await openPicker.click(); + await iframe.locator('text="😂"').first().click(); + + // Give the WASM call time to run and emit its console.error if it + // hits the failure path. 
+ await page.waitForTimeout(1500); + + const writeFailures = consoleErrors.filter((line) => + line.includes('incrementalWriteQmd failed'), + ); + + if (writeFailures.length > 0) { + console.error('--- Full console log on failure ---'); + for (const line of consoleAll) console.error(line); + } + + expect( + writeFailures, + 'Incremental write should not fail when appending a reaction to the first paragraph. ' + + 'Raw console errors:\n' + + consoleErrors.join('\n'), + ).toEqual([]); + + // Filter out unrelated Monaco loader internal errors (Monaco runs + // inside the markup-view panel; its load can throw without + // affecting the preview). + const relevantPageErrors = pageErrors.filter( + (e) => !e.includes('monaco-editor'), + ); + expect(relevantPageErrors, 'Page should not throw').toEqual([]); + }); +}); diff --git a/hub-client/src/components/render/ReactPreview.tsx b/hub-client/src/components/render/ReactPreview.tsx index fa34e36ca..c8aa19369 100644 --- a/hub-client/src/components/render/ReactPreview.tsx +++ b/hub-client/src/components/render/ReactPreview.tsx @@ -9,7 +9,7 @@ import { isWasmReady, incrementalWriteQmd, } from '@quarto/preview-runtime'; -import { pipelineKindForFormat } from '../../utils/pipelineKind'; +import { pipelineKindForFormat } from '@quarto/preview-runtime'; import { useAttribution } from '../../hooks/useAttribution'; import { stripAnsi } from '@quarto/preview-renderer/utils/stripAnsi'; import { PreviewErrorOverlay } from '@quarto/preview-renderer/overlays/PreviewErrorOverlay'; @@ -314,6 +314,19 @@ export default function ReactPreview({ const renderTimeoutRef = useRef(null); const lastContentRef = useRef(''); + // Plan 7: soft-drop warnings from the most recent incremental write, + // pending injection into the next render's diagnostics. Drained when + // the content-driven re-render fires. + const pendingWriteWarningsRef = useRef([]); + + // Tracks the most recent set of render-side diagnostics we sent + // upward. 
`handleSetAst` reads this when surfacing soft-drop + // warnings *immediately* (without waiting for the next render): + // the immediate push must include the current render's + // diagnostics so it doesn't accidentally clear them. Updated on + // every `onDiagnosticsChange` call. + const lastRenderDiagnosticsRef = useRef([]); + // Handler for cross-document navigation const handleNavigateToDocument = useCallback( (targetPath: string, anchor: string | null) => { @@ -350,11 +363,24 @@ export default function ReactPreview({ }); if (qmdContent !== lastContentRef.current) return; - // Update diagnostics - onDiagnosticsChange(result.diagnostics); + // Update diagnostics. Plan 7: drain any soft-drop warnings from + // the most recent incremental write into this push so they reach + // the diagnostics surface alongside render-side diagnostics. + const pendingWriteWarnings = pendingWriteWarningsRef.current; + pendingWriteWarningsRef.current = []; + const mergedDiagnostics = pendingWriteWarnings.length > 0 + ? [...result.diagnostics, ...pendingWriteWarnings] + : result.diagnostics; + // Remember just the render-side portion so a follow-up immediate + // push from `handleSetAst` (when the writer returns warnings + // alongside byte-identical output and no re-render fires) can + // merge new warnings *with* the current render diagnostics rather + // than clobbering them. + lastRenderDiagnosticsRef.current = result.diagnostics; + onDiagnosticsChange(mergedDiagnostics); setCurrentError(result.success ? null : { message: result.error!, - diagnostics: result.diagnostics, + diagnostics: mergedDiagnostics, }); if (result.success) { @@ -415,29 +441,57 @@ export default function ReactPreview({ setCurrentError(null); }, [currentFile?.path]); - // Handler for AST modifications - converts AST back to QMD and updates content. + // Handler for AST modifications — converts AST back to QMD and + // updates content. + // + // Plan 7 lifted the v1 read-only guard. 
The bridge now takes the + // displayed AST as the **baseline** (its source spans line up with + // `content`) and the new edited AST. + // + // Soft-drop warnings (Q-3-42 / Q-3-43) reach the diagnostic surface + // via two paths, both load-bearing: + // + // 1. **Immediate push.** If the writer returns warnings, surface + // them right away by calling `onDiagnosticsChange` here. This + // is the path that matters when the rewrite produces + // byte-identical output (the common soft-drop case — the + // writer faithfully preserves the original bytes when the + // edit was rejected). With identical bytes, no Monaco edit + // fires, no automerge update, no re-render — so without an + // immediate push the warnings would never surface. // - // q2-preview is **read-only in v1** (Plan 1 §"Multi-plan contract: - // read-only mode lifts at Plan 7"). The post-pipeline AST diverges - // from source enough that a naive incrementalWriteQmd would - // corrupt the qmd; Plan 7 lifts this guard once the writer's - // round-trip machinery understands q2-preview's transform shapes - // (Synthetic / Derived / atomic CustomNodes). Component-driven - // edits (kanban drag, comment buttons in Plan 2) call this and - // silently no-op with a console.warn — that is the accepted - // post-Plan-2 UX gap until Plan 7 ships. + // 2. **Ride-along on next render.** If the rewrite *did* change + // content, the re-render fires and `doRenderWithStateManagement` + // drains `pendingWriteWarningsRef` into its merged diagnostics + // push. This keeps the warning temporally associated with the + // render of the *edited* document, which is the cleanest UX + // for the "edit applied + warning fired" case (rare today — + // most warnings imply soft-drop, i.e. no content change — but + // kept as a safety net). + // + // The immediate push merges with `lastRenderDiagnosticsRef` so we + // don't accidentally clear the current render-side diagnostics + // when we add write warnings to them. 
const handleSetAst = useCallback((newAst: any) => { - if (pipelineKindForFormat(format) === 'preview') { - console.warn('q2-preview is read-only in v1; AST edit dropped (Plan 7 lifts this guard)'); - return; - } try { - const newQmd = incrementalWriteQmd(content, newAst); + const baseline = ast ? JSON.parse(ast) : null; + if (!baseline) { + console.warn('Cannot write AST: no baseline render available yet'); + return; + } + const { qmd: newQmd, warnings } = incrementalWriteQmd(content, baseline, newAst); + if (warnings && warnings.length > 0) { + // (1) Immediate push for the byte-identical (no re-render) case. + onDiagnosticsChange([...lastRenderDiagnosticsRef.current, ...warnings]); + // (2) Queue for ride-along on the next render (no-op when no + // re-render fires, which is the typical soft-drop path). + pendingWriteWarningsRef.current = warnings; + } onContentRewrite(newQmd); } catch (err) { console.error('Failed to write AST back to QMD:', err); } - }, [content, onContentRewrite, format]); + }, [ast, content, onContentRewrite, onDiagnosticsChange]); return (
diff --git a/hub-client/src/services/incrementalWrite.wasm.test.ts b/hub-client/src/services/incrementalWrite.wasm.test.ts new file mode 100644 index 000000000..6a73666d1 --- /dev/null +++ b/hub-client/src/services/incrementalWrite.wasm.test.ts @@ -0,0 +1,156 @@ +/** + * WASM End-to-End Tests for `incremental_write_qmd` (Plan 7). + * + * Verifies the new 3-arg signature + * (`original_qmd, baseline_ast_json, new_ast_json`) at the JS/WASM + * boundary. The Rust-side correctness of soft-drop substitutions + * (Q-3-42 / Q-3-43) is covered by `crates/pampa/src/writers/incremental.rs` + * unit tests; these tests pin the wrapper contract: + * + * - Identity round-trip is byte-equal (baseline === new ⇒ original qmd). + * - The returned shape is `{ qmd, warnings? }`; `warnings` is absent + * when nothing was soft-dropped. + * - A simple paragraph-text edit reaches the result qmd; the + * surrounding structure (headings, other paragraphs) is preserved + * verbatim from the original. + * + * The exhaustive scenario matrix (sectionized docs, multi-inline + * shortcode dedupe, Q-3-42 byte-equal-no-op, Q-3-43 footnotes + * regeneration) lives in the Rust-side coarsen tests + Plan 8 + * Playwright e2e (deferred to follow-up beads). 
+ * + * Run with: npm run test:wasm + */ + +import { describe, it, expect, beforeAll } from 'vitest'; +import { readFile } from 'fs/promises'; +import { dirname, join } from 'path'; +import { fileURLToPath } from 'url'; + +interface WasmModule { + default: (input?: BufferSource) => Promise; + parse_qmd_content: (content: string) => string; + incremental_write_qmd: ( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ) => string; +} + +interface AstResponse { + success: boolean; + ast?: string; + qmd?: string; + error?: string; + warnings?: unknown[]; +} + +let wasm: WasmModule; + +beforeAll(async () => { + const __dirname = dirname(fileURLToPath(import.meta.url)); + const wasmDir = join(__dirname, '../../wasm-quarto-hub-client'); + const wasmPath = join(wasmDir, 'wasm_quarto_hub_client_bg.wasm'); + const wasmBytes = await readFile(wasmPath); + + wasm = (await import('wasm-quarto-hub-client')) as unknown as WasmModule; + await wasm.default(wasmBytes); +}); + +/** Parse `qmd` and return the resulting AST as a plain object. */ +function parseAst(qmd: string): unknown { + const resp: AstResponse = JSON.parse(wasm.parse_qmd_content(qmd)); + expect(resp.success, `parse_qmd_content failed: ${resp.error}`).toBe(true); + expect(resp.ast).toBeTruthy(); + return JSON.parse(resp.ast!); +} + +/** Run the incremental writer and return its parsed AstResponse. */ +function write( + originalQmd: string, + baselineAst: unknown, + newAst: unknown, +): AstResponse { + return JSON.parse( + wasm.incremental_write_qmd( + originalQmd, + JSON.stringify(baselineAst), + JSON.stringify(newAst), + ), + ); +} + +/** + * Walk a Pandoc AST and mutate the first `Str` whose `c` matches + * `find`, replacing its content with `replace`. Returns true if a + * match was found. Used to synthesize a "user edited a word" + * scenario without going through the qmd reader. 
/**
 * Replace the text of the first matching `Str` node in a Pandoc-style AST.
 *
 * The tree is searched depth-first in document order: arrays are scanned
 * element by element, and objects are tested themselves before their
 * property values are visited (in `Object.values` order). The first
 * object with `t === 'Str'` and `c === find` has its `c` overwritten
 * in place with `replace`; the search stops there, so later matches
 * are left untouched.
 *
 * @param ast - Root of the tree to search. Mutated in place on a match.
 *   Primitives and `null` are accepted and never match.
 * @param find - Exact `c` value a `Str` node must carry to be rewritten.
 * @param replace - New `c` value written into the matched node.
 * @returns `true` if a node was rewritten, `false` if nothing matched.
 */
function mutateFirstStr(ast: unknown, find: string, replace: string): boolean {
  // Returns true as soon as a rewrite happened anywhere below `node`;
  // `some` short-circuits on that, so siblings after the match are
  // never visited.
  const visit = (node: unknown): boolean => {
    if (Array.isArray(node)) {
      return node.some((child) => visit(child));
    }
    if (node === null || typeof node !== 'object') {
      return false;
    }
    const obj = node as Record<string, unknown>;
    if (obj.t === 'Str' && obj.c === find) {
      obj.c = replace;
      return true;
    }
    return Object.values(obj).some((value) => visit(value));
  };
  return visit(ast);
}
+ expect(resp.qmd).toContain('# Heading'); + expect(resp.qmd).toContain('## Sub'); + expect(resp.qmd).toContain('Second paragraph here.'); + }); + + it('reports a structured error when the baseline AST JSON is malformed', () => { + const original = '# x\n'; + const baseline = parseAst(original); + const respJson = wasm.incremental_write_qmd( + original, + '{not valid json', + JSON.stringify(baseline), + ); + const resp: AstResponse = JSON.parse(respJson); + expect(resp.success).toBe(false); + expect(resp.error).toMatch(/baseline AST JSON/i); + }); +}); diff --git a/hub-client/src/types/wasm-quarto-hub-client.d.ts b/hub-client/src/types/wasm-quarto-hub-client.d.ts index 514ca4aa0..b92e13f76 100644 --- a/hub-client/src/types/wasm-quarto-hub-client.d.ts +++ b/hub-client/src/types/wasm-quarto-hub-client.d.ts @@ -65,8 +65,20 @@ declare module 'wasm-quarto-hub-client' { // QMD parsing and AST conversion functions export function parse_qmd_content(content: string): string; export function ast_to_qmd(ast_json: string): string; - /** Incrementally write a modified AST back to QMD, preserving unchanged source text. */ - export function incremental_write_qmd(original_qmd: string, new_ast_json: string): string; + /** + * Incrementally write a modified AST back to QMD, preserving unchanged + * source text. + * + * Per Plan 7: the caller is responsible for passing a **baseline** AST + * (`baseline_ast_json`) whose source spans match `original_qmd` and + * whose tier matches `new_ast_json`. The bridge does not re-parse + * `original_qmd`; mixing tiers will corrupt the write. 
+ */ + export function incremental_write_qmd( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ): string; // Response type for parse/write operations export interface AstResponse { @@ -77,6 +89,11 @@ declare module 'wasm-quarto-hub-client' { qmd?: string; error?: string; diagnostics?: AstDiagnostic[]; + /** + * Soft-drop warnings (Plan 7 Q-3-42 / Q-3-43) that rode alongside + * a successful incremental write. + */ + warnings?: AstDiagnostic[]; } export interface AstDiagnostic { diff --git a/package-lock.json b/package-lock.json index cff75e638..928743e8a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3777,14 +3777,6 @@ "@types/node": "*" } }, - "node_modules/@types/trusted-types": { - "version": "2.0.7", - "resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz", - "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==", - "license": "MIT", - "optional": true, - "peer": true - }, "node_modules/@types/ws": { "version": "8.18.1", "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz", diff --git a/q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts b/q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts index 3664242d7..463c7c3a2 100644 --- a/q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts +++ b/q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts @@ -5,13 +5,18 @@ declare module 'wasm-quarto-hub-client' { export function parse_qmd_content(content: string): string; export function ast_to_qmd(ast_json: string): string; - export function incremental_write_qmd(original_qmd: string, new_ast_json: string): string; + export function incremental_write_qmd( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ): string; export interface AstResponse { success: boolean; ast?: string; qmd?: string; error?: string; + warnings?: unknown[]; } export type InitInput = RequestInfo 
/**
 * Incrementally write a modified AST back to QMD, preserving unchanged
 * portions of the original source text verbatim.
 *
 * Plan 7 contract: the caller must pass the **baseline** AST (whose
 * source spans line up with `originalQmd`); the bridge does not
 * re-parse `originalQmd`.
 *
 * Must call initWasm() before first use.
 *
 * @param originalQmd - The QMD text the baseline AST was produced from.
 * @param baselineAst - Baseline AST, either as a parsed object or as an
 *   already-serialized JSON string (strings are forwarded unchanged).
 * @param newAst - The edited AST; always serialized with `JSON.stringify`.
 * @returns The rewritten QMD text plus whatever `warnings` the bridge
 *   response carried (`undefined` when the field was absent).
 * @throws Error if WASM is not initialized, if the bridge reports
 *   failure, or if it reports success without any QMD text.
 */
export function incrementalWriteQmd(
  originalQmd: string,
  baselineAst: RustQmdJson | string,
  newAst: RustQmdJson,
): { qmd: string; warnings?: unknown[] } {
  if (!wasmModule) {
    throw new Error('WASM not initialized. Call initWasm() first.')
  }

  const baselineAstJson =
    typeof baselineAst === 'string' ? baselineAst : JSON.stringify(baselineAst)
  const newAstJson = JSON.stringify(newAst)
  const responseJson = wasmModule.incremental_write_qmd(
    originalQmd,
    baselineAstJson,
    newAstJson,
  )
  const response: AstResponse = JSON.parse(responseJson)

  if (!response.success) {
    // `error` is optional on the response type; avoid interpolating
    // the literal string "undefined" into the message.
    throw new Error(
      `Incremental write failed: ${response.error ?? 'bridge reported failure without an error message'}`,
    )
  }

  // A successful response with empty/missing qmd is still rejected (as
  // before) so callers never write an empty document back, but the
  // message now names the actual condition instead of reading
  // "Incremental write failed: undefined".
  if (!response.qmd) {
    throw new Error(
      'Incremental write failed: bridge reported success but returned empty qmd',
    )
  }

  return { qmd: response.qmd, warnings: response.warnings }
}
/**
 * Incrementally write a modified AST back to QMD, preserving unchanged
 * portions of the original source text verbatim.
 *
 * Plan 7 contract: the caller must pass the **baseline** AST (whose
 * source spans line up with `originalQmd`); the bridge does not
 * re-parse `originalQmd`.
 *
 * Must call initWasm() before first use.
 *
 * @param originalQmd - The QMD text the baseline AST was produced from.
 * @param baselineAst - Baseline AST, either as a parsed object or as an
 *   already-serialized JSON string (strings are forwarded unchanged).
 * @param newAst - The edited AST; always serialized with `JSON.stringify`.
 * @returns The rewritten QMD text plus whatever `warnings` the bridge
 *   response carried (`undefined` when the field was absent).
 * @throws Error if WASM is not initialized, if the bridge reports
 *   failure, or if it reports success without any QMD text.
 */
export function incrementalWriteQmd(
  originalQmd: string,
  baselineAst: RustQmdJson | string,
  newAst: RustQmdJson,
): { qmd: string; warnings?: unknown[] } {
  if (!wasmModule) {
    throw new Error('WASM not initialized. Call initWasm() first.')
  }

  const baselineAstJson =
    typeof baselineAst === 'string' ? baselineAst : JSON.stringify(baselineAst)
  const newAstJson = JSON.stringify(newAst)
  const responseJson = wasmModule.incremental_write_qmd(
    originalQmd,
    baselineAstJson,
    newAstJson,
  )
  const response: AstResponse = JSON.parse(responseJson)

  if (!response.success) {
    // `error` is optional on the response type; avoid interpolating
    // the literal string "undefined" into the message.
    throw new Error(
      `Incremental write failed: ${response.error ?? 'bridge reported failure without an error message'}`,
    )
  }

  // A successful response with empty/missing qmd is still rejected (as
  // before) so callers never write an empty document back, but the
  // message now names the actual condition instead of reading
  // "Incremental write failed: undefined".
  if (!response.qmd) {
    throw new Error(
      'Incremental write failed: bridge reported success but returned empty qmd',
    )
  }

  return { qmd: response.qmd, warnings: response.warnings }
}
The SPA doesn't have an editor to round-trip into yet, so a - * no-op is correct. + * - `setAst` on Q2PreviewIframe is wired through `incrementalWriteQmd` + * (Plan 7 Phase 7). Component-driven edits in the iframe (e.g. + * kanban drag, future comment buttons) call back with the modified + * AST; we use the current `astJson` as the baseline, write the + * reconciled qmd to the active file via the sync client, and stash + * the FNV-1a hash so the resulting `onFileContent` echo gets + * suppressed (otherwise the SPA would re-render unnecessarily and, + * in races, blow away an in-flight edit). Soft-drop warnings + * (Q-3-42 / Q-3-43) ride into the DiagnosticStrip. * * - `wsUrl` is derived from `window.location` rather than read from * a server endpoint. The CLI always opens the SPA on the same @@ -37,13 +41,16 @@ * round-trip on boot, no new server-side patterns introduced. */ -import { useCallback, useEffect, useState } from 'react'; +import { useCallback, useEffect, useRef, useState } from 'react'; import { initWasm, connect, setSyncHandlers, renderPageForPreview, getBinaryDocById, + getFileContent, + updateFileContent, + incrementalWriteQmd, } from '@quarto/preview-runtime'; import { Q2PreviewIframe } from '@quarto/preview-renderer/iframe/Q2PreviewIframe'; import { extractMetaString } from '@quarto/preview-renderer/framework'; @@ -52,8 +59,33 @@ import type { CaptureRef, FileEntry } from '@quarto/quarto-automerge-schema'; import { ForceRefreshButton } from './components/ForceRefreshButton'; import { PreviewDiagnosticsOverlay } from './components/PreviewDiagnosticsOverlay'; import { StaleCaptureOverlay } from './components/StaleCaptureOverlay'; +import { DiagnosticStrip } from './components/DiagnosticStrip'; import { pickInitialPage } from './pickInitialPage'; +/** + * FNV-1a 32-bit hash, hex-encoded. 
Used for content-match + * echo-prevention in `handleSetAst` (Plan 7 Phase 7): we hash the qmd + * we're about to emit, stash `(path, hash)` in a ref, and suppress the + * matching incoming `onFileContent` so the SPA doesn't re-render off + * its own write. + * + * Why FNV-1a and not SHA-256 or xxHash: this is an in-process + * equality check across a single round-trip (write → samod → echo + * back). Cryptographic strength is irrelevant; the collision domain + * is one file's last-emitted qmd, so 32 bits is comfortable. FNV-1a + * is zero-dependency, fast on short-to-medium strings, and the + * codebase already uses it for the actor-color hash. Single source + * of truth: this function in this file. + */ +function fnv1aHex(s: string): string { + let h = 0x811c9dc5; + for (let i = 0; i < s.length; i++) { + h ^= s.charCodeAt(i); + h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0; + } + return h.toString(16).padStart(8, '0'); +} + /** * Suffix appended to the document's title in the browser tab so a * `q2 preview` tab is distinguishable from the live / published page @@ -352,13 +384,21 @@ function deriveWsUrl(loc: Location = window.location): string { return `${wsScheme}//${loc.host}/ws`; } -/** No-op `setAst` until WYSIWYG mode is wired (post-Phase-A). */ -const noopSetAst = () => { - /* deliberately empty */ -}; - export default function PreviewApp() { const [state, setState] = useState(INITIAL_STATE); + // Plan 7 Phase 7: soft-drop warnings to surface in DiagnosticStrip. + // Accumulated across edits within a session; dismissed by the + // strip's close button. + const [writeWarnings, setWriteWarnings] = useState([]); + + // Plan 7 Phase 7: content-match echo-prevention. `handleSetAst` + // writes qmd via `updateFileContent`, which round-trips through + // samod and fires `onFileContent` back at us. Without this ref the + // SPA would re-render off its own write, and in pathological races + // could overwrite an in-flight follow-up edit. 
We stash the FNV-1a + // hash of the emitted qmd here; the next `onFileContent` for the + // same path that hashes equal is silently dropped. + const lastEmittedRef = useRef<{ path: string; hash: string } | null>(null); // Force-refresh trigger (bd-b5hf): bumping `contentTick` re-fires // the render useEffect. Reuses the same channel `onFileContent` @@ -370,6 +410,59 @@ export default function PreviewApp() { setState((s) => ({ ...s, contentTick: s.contentTick + 1 })); }, []); + // Plan 7 Phase 7: handleSetAst reads `activeFile` + `astJson` via + // refs so the callback keeps a stable identity for Q2PreviewIframe. + // (The iframe's effect deps include `setAst`; re-binding on every + // astJson change would re-register the postMessage listener.) + const activeFileRef = useRef(null); + const astJsonRef = useRef(null); + useEffect(() => { + activeFileRef.current = state.activeFile; + }, [state.activeFile]); + useEffect(() => { + astJsonRef.current = state.astJson; + }, [state.astJson]); + + // Plan 7 Phase 7: WYSIWYG round-trip. Component-driven edits in the + // iframe (kanban drag, comment buttons, …) call this with the + // modified AST. We use the current `astJson` as the baseline (its + // source spans line up with the qmd in samod), reconcile via + // `incrementalWriteQmd`, and write the result back through + // `updateFileContent`. Soft-drop warnings (Q-3-42 / Q-3-43) flow + // into the DiagnosticStrip. The emitted-qmd hash is stashed in + // `lastEmittedRef` so the echoed `onFileContent` is suppressed. 
+ const handleSetAst = useCallback((newAst: unknown) => { + const path = activeFileRef.current; + const baselineJson = astJsonRef.current; + if (!path || !baselineJson) { + console.warn('q2-preview setAst: no active page or baseline yet'); + return; + } + const originalQmd = getFileContent(path); + if (originalQmd === null) { + console.warn(`q2-preview setAst: no content cached for ${path}`); + return; + } + try { + const { qmd, warnings } = incrementalWriteQmd( + originalQmd, + baselineJson, + newAst as never, + ); + lastEmittedRef.current = { path, hash: fnv1aHex(qmd) }; + updateFileContent(path, qmd); + if (warnings && warnings.length > 0) { + setWriteWarnings((prev) => [...prev, ...warnings]); + } + } catch (err) { + console.error('q2-preview setAst: incremental write failed', err); + } + }, []); + + const handleDismissWarnings = useCallback(() => { + setWriteWarnings([]); + }, []); + // Phase F.1 (bd-kw93.14): the iframe posts NAVIGATE_TO_DOCUMENT // when the user clicks a cross-page artifact-rooted `.html` link. // Update activeFile + pendingAnchor and push a fresh history entry @@ -455,6 +548,20 @@ export default function PreviewApp() { }, onFileContent: (path: string) => { if (cancelled) return; + // Plan 7 Phase 7: echo-prevention. If the incoming + // content is exactly the qmd we just emitted, drop it — + // re-rendering off our own write wastes a tick and can + // race a follow-up edit. The ref carries (path, hash) + // for the last emission; consume it (set to null) so a + // *second* identical write would still re-render. 
+ const last = lastEmittedRef.current; + if (last && last.path === path) { + const incoming = getFileContent(path); + if (incoming !== null && fnv1aHex(incoming) === last.hash) { + lastEmittedRef.current = null; + return; + } + } // Phase D.6 filter: read `activeFile` + `deps` via the // setState callback so the filter sees the *latest* // values (the closure was set up at boot time and would @@ -856,7 +963,7 @@ export default function PreviewApp() { pendingAnchor={state.pendingAnchor} pendingAnchorEpoch={state.pendingAnchorEpoch} onNavigateToDocument={handleNavigate} - setAst={noopSetAst} + setAst={handleSetAst} /> {showStaleOverlay && ( )} + {/* Plan 7 Phase 7: write-side soft-drop warnings (Q-3-42 / + Q-3-43) from `incrementalWriteQmd`. Distinct surface from + the bd-b9kzg render-diagnostics overlay below: those carry + server-side + WASM render diagnostics; this carries + user-edit-rejection signals. Keeping them separate avoids + conflating "your edit was discarded" with "the render + itself complained." */} + {/* bd-b9kzg (extends Phase D.4): non-terminal diagnostics overlay. The overlay defaults to its own internal collapsed state (true) when the `collapsed` prop is diff --git a/q2-preview-spa/src/components/DiagnosticStrip.tsx b/q2-preview-spa/src/components/DiagnosticStrip.tsx new file mode 100644 index 000000000..450426faf --- /dev/null +++ b/q2-preview-spa/src/components/DiagnosticStrip.tsx @@ -0,0 +1,119 @@ +/** + * DiagnosticStrip (Plan 7 Phase 7). + * + * Surfaces soft-drop warnings (Q-3-42 / Q-3-43) returned by + * `incrementalWriteQmd` after a component-driven edit hits an atomic + * region. The SPA has no Monaco squiggle to lean on, so this strip is + * the only diagnostic surface for write-side warnings. + * + * Autosave-context spam mitigation: every keystroke triggers a render + + * write, so a user typing over an atomic-resolved inline would re-emit + * the same Q-3-42 on every tick. 
We group by diagnostic code plus source range and show the + * first three occurrences per `(code, start_line, start_column, end_line, + * end_column)`; further hits are silently dropped (the prior entries + * stay visible). Plan 7 §"Autosave-context spam mitigation". + * + * The catalog messages (`Q-3-42`: "Shortcode edit dropped" + body; + * `Q-3-43`: "Generated content edit dropped" + body) already read as + * imperative instructions ("edit the invocation token in source + * instead"), so DiagnosticStrip surfaces title + problem verbatim. + */ + +import type { Diagnostic } from '@quarto/preview-renderer/types/diagnostic'; + +interface DiagnosticStripProps { + /** Soft-drop warnings to surface. Cleared by the caller on dismiss. */ + warnings: Diagnostic[]; + /** Caller-provided dismiss handler. */ + onDismiss: () => void; +} + +/** + * Group warnings by code and source range, capping each group at 3 entries. + * Exported for tests. + */ +export function suppressAfterThree(warnings: Diagnostic[]): Diagnostic[] { + const counts = new Map(); + const out: Diagnostic[] = []; + for (const w of warnings) { + const key = `${w.code ?? ''}:${w.start_line ?? -1}:${w.start_column ?? -1}:${w.end_line ?? -1}:${w.end_column ?? -1}`; + const n = counts.get(key) ?? 0; + if (n < 3) { + out.push(w); + counts.set(key, n + 1); + } + } + return out; +} + +export function DiagnosticStrip({ warnings, onDismiss }: DiagnosticStripProps) { + if (warnings.length === 0) return null; + const visible = suppressAfterThree(warnings); + + return ( +
+
+ + {visible.length === 1 ? '1 edit dropped' : `${visible.length} edits dropped`} + + +
+
    + {visible.map((w, i) => ( +
  • 0 ? '0.25rem' : 0 }}> + + {w.code ? `${w.code}: ` : ''} + {w.title} + + {w.problem ? ( +
    {w.problem}
    + ) : null} +
  • + ))} +
+
+ ); +} diff --git a/ts-packages/preview-renderer/src/framework/dispatch.tsx b/ts-packages/preview-renderer/src/framework/dispatch.tsx index e4640bdc5..68215fbee 100644 --- a/ts-packages/preview-renderer/src/framework/dispatch.tsx +++ b/ts-packages/preview-renderer/src/framework/dispatch.tsx @@ -1,6 +1,6 @@ import React, { useContext } from 'react'; import { RegistryContext } from './RegistryContext'; -import { isAtomicSourceInfo, ATOMIC_SYNTHETIC_KINDS } from '../utils/sourceInfo'; +import { isAtomicSourceInfo, ATOMIC_KINDS } from '../utils/sourceInfo'; import { isAtomicCustomNode } from '../utils/atomicCustomNodes'; import type { BlockNode, @@ -405,7 +405,7 @@ export function Node({ const isCustom = node.t === 'CustomBlock' || node.t === 'CustomInline'; const isAtomic = - isAtomicSourceInfo(node as { s?: number }, sourceInfoPool, ATOMIC_SYNTHETIC_KINDS) + isAtomicSourceInfo(node as { s?: number }, sourceInfoPool, ATOMIC_KINDS) || (isCustom && isAtomicCustomNode((node as CustomBlockNode | CustomInlineNode).type_name)); const effectiveSetLocalAst = isAtomic ? NOOP_SET_LOCAL_AST : setLocalAst; diff --git a/ts-packages/preview-renderer/src/types/sourceInfo.ts b/ts-packages/preview-renderer/src/types/sourceInfo.ts index ec11652b9..1f5a365e9 100644 --- a/ts-packages/preview-renderer/src/types/sourceInfo.ts +++ b/ts-packages/preview-renderer/src/types/sourceInfo.ts @@ -1,6 +1,14 @@ /** - * Wire-format types for the source-info pool, mirroring - * `crates/pampa/src/writers/json.rs:54-91`. + * Wire-format types for the source-info pool. 
Hand-mirror of the Rust + * producers — keep this file aligned with two sources of truth: + * + * - `SourceInfo` enum (canonical producer): + * `crates/quarto-source-map/src/source_info.rs` + * - JSON wire mirror: + * `crates/pampa/src/writers/json.rs` + * - `SerializableSourceMapping` (writer-side enum) + * - `SourceInfoJson` (wire entry shape) + * - `SerializableSourceInfo::to_json` (code-4 serializer) * * The pool is an array of entries indexed by `node.s` (the `s` field on * each Pandoc node in the serialized AST). Each entry has a type code @@ -11,32 +19,60 @@ * - 0: Original — `d` is the file id (FileId.0). * - 1: Substring — `d` is a parent_id into the pool. * - 2: Concat — `d` is an array of [source_info_id, offset_in_concat, length]. - * - 3: FilterProvenance — `d` is [filter_path, line]. - * - 4: Synthetic — `d` is a By marker. Dormant; Plan 5 wires this up. - * - 5: Derived — `d` is { from: parent_id, by: By }. Dormant; Plan 5 wires this up. + * - 3: Legacy — read-only compat for two old shapes; no new writes: + * `[parent_id, ...]` (numeric-headed legacy `Transformed`) + * `[filter_path, line]` (string-headed buggy `FilterProvenance`). + * - 4: Generated — `d` is `{ by: By, from?: AnchorRef[] }`. `r` is `[0, 0]`; + * ranges come from the chain-walk via the `invocation` anchor. * - * Codes 4 and 5 are forward-declared so 2A's accessor module doesn't need - * amending when Plan 5 ships writer support for them. + * Code 5 is unassigned and reserved for future use. */ /** - * A `By` marker identifies the synthesizer responsible for a Synthetic or - * Derived source-info entry. The shape is intentionally coarse — Plan 4 - * introduces specific kinds with structured `data`. Once consumers branch - * on `kind`, this can be narrowed to a discriminated union. + * A `By` marker identifies the producer (transform) responsible for a + * `Generated` entry. Mirrors the Rust `By` struct: a kebab-case `kind` + * tag plus an optional per-kind JSON `data` payload. 
+ * + * Known kinds at the time of writing: `"filter"`, `"shortcode"`, + * `"sectionize"`, `"user-edit"`, `"include"`, `"title-block"`, + * `"footnotes"`, `"appendix"`, `"tree-sitter-postprocess"`, `"raw"`. + * Third-party extensions namespace as `"ext/<name>/<kind>"`. */ export interface By { kind: string; data?: unknown; } +/** + * A typed, role-labeled pointer into the source-info pool, attached to + * a `Generated` entry via its `from` array. Mirrors the Rust `Anchor` + * struct flattened to its writer-internal `(role, si_id)` shape. + * + * `role` is one of: + * - `"invocation"` — the user-written construct that triggered the + * producer (e.g. the `{{< meta foo >}}` token). + * - `"value-source"` — where the value carried by this node was + * defined, when distinct from the invocation site. + * - `"other:<suffix>"` — extension-defined or future role we haven't + * enumerated. `<suffix>` is kebab-case, namespaced as + * `ext/<name>/<role>`. The bare `"other:"` form (empty + * suffix) is rejected by the reader. + * + * `si_id` is the pool index of the anchor's target (typically an + * `Original` covering the source bytes the anchor describes).
+ */ +export interface AnchorRef { + role: string; + si_id: number; +} + export type SourceInfoEntry = - | { t: 0; r: [number, number]; d: number } - | { t: 1; r: [number, number]; d: number } - | { t: 2; r: [number, number]; d: Array<[number, number, number]> } - | { t: 3; r: [number, number]; d: [string, number] } - | { t: 4; r: [0, 0]; d: By } - | { t: 5; r: [0, 0]; d: { from: number; by: By } }; + | { t: 0; r: [number, number]; d: number } // Original + | { t: 1; r: [number, number]; d: number } // Substring + | { t: 2; r: [number, number]; d: Array<[number, number, number]> } // Concat + | { t: 3; r: [number, number]; d: [string, number] | [number, ...number[]] } // Legacy (read-only) + | { t: 4; r: [0, 0]; d: { by: By; from?: AnchorRef[] } }; // Generated +// code 5 — unassigned, reserved for future use export type SourceInfoPool = readonly SourceInfoEntry[]; diff --git a/ts-packages/preview-renderer/src/utils/sourceInfo.test.ts b/ts-packages/preview-renderer/src/utils/sourceInfo.test.ts index e2aab8011..0b400cb1a 100644 --- a/ts-packages/preview-renderer/src/utils/sourceInfo.test.ts +++ b/ts-packages/preview-renderer/src/utils/sourceInfo.test.ts @@ -1,26 +1,27 @@ import { describe, test, expect } from 'vitest'; -import { - entryFor, - isDerived, - isAtomicSourceInfo, - ATOMIC_SYNTHETIC_KINDS, -} from './sourceInfo'; +import { entryFor, isAtomicSourceInfo, ATOMIC_KINDS } from './sourceInfo'; import type { SourceInfoPool } from '../types/sourceInfo'; -// Build a representative pool covering each wire code. +// Build a representative pool covering each wire code shipped by the +// Rust writer post-Plan-5. Code 5 is unassigned — no entry exists. 
const samplePool: SourceInfoPool = [ - { t: 0, r: [0, 10], d: 0 }, // 0: Original - { t: 1, r: [3, 7], d: 0 }, // 1: Substring (parent_id 0) - { t: 2, r: [0, 20], d: [[0, 0, 10], [1, 10, 10]] }, // 2: Concat - { t: 3, r: [5, 15], d: ['filter.lua', 42] }, // 3: FilterProvenance - { t: 4, r: [0, 0], d: { kind: 'IncludeShortcode' } }, // 4: Synthetic - { t: 5, r: [0, 0], d: { from: 0, by: { kind: 'CrossrefResolver' } } }, // 5: Derived + { t: 0, r: [0, 10], d: 0 }, // 0: Original + { t: 1, r: [3, 7], d: 0 }, // 1: Substring (parent_id 0) + { t: 2, r: [0, 20], d: [[0, 0, 10], [0, 10, 10]] }, // 2: Concat + { t: 3, r: [5, 15], d: ['filter.lua', 42] }, // 3: Legacy (string-headed FilterProvenance) + { t: 3, r: [10, 20], d: [0] }, // 4: Legacy (numeric-headed Transformed) + { t: 4, r: [0, 0], d: { by: { kind: 'sectionize' } } }, // 5: Generated, no anchors, no data + { t: 4, r: [0, 0], d: { // 6: Generated with anchor + by: { kind: 'shortcode', data: { name: 'meta' } }, + from: [{ role: 'invocation', si_id: 0 }], + } }, ]; describe('entryFor', () => { test('returns the entry at node.s', () => { expect(entryFor({ s: 0 }, samplePool)).toEqual(samplePool[0]); expect(entryFor({ s: 3 }, samplePool)).toEqual(samplePool[3]); + expect(entryFor({ s: 6 }, samplePool)).toEqual(samplePool[6]); }); test('returns undefined when node lacks an s field', () => { @@ -36,56 +37,62 @@ describe('entryFor', () => { }); }); -describe('isDerived', () => { - test('returns true for code 5 (Derived)', () => { - expect(isDerived({ s: 5 }, samplePool)).toBe(true); - }); - - test('returns false for code 4 (Synthetic)', () => { - expect(isDerived({ s: 4 }, samplePool)).toBe(false); - }); - - test.each([0, 1, 2, 3])('returns false for code %d', (idx) => { - expect(isDerived({ s: idx }, samplePool)).toBe(false); - }); - - test('returns false when entry is missing', () => { - expect(isDerived({}, samplePool)).toBe(false); - expect(isDerived({ s: 99 }, samplePool)).toBe(false); - }); -}); - 
describe('isAtomicSourceInfo', () => { - const atomicKinds = new Set(['CrossrefResolver']); + const atomicKinds = new Set(['shortcode']); - test('returns true for Derived entries (code 5)', () => { - expect(isAtomicSourceInfo({ s: 5 }, samplePool, atomicKinds)).toBe(true); + test('returns true for Generated (code 4) when by.kind is atomic', () => { + // samplePool[6] has by.kind === 'shortcode'. + expect(isAtomicSourceInfo({ s: 6 }, samplePool, atomicKinds)).toBe(true); }); - test('returns true for Synthetic (code 4) when kind is in atomic set', () => { - const pool: SourceInfoPool = [{ t: 4, r: [0, 0], d: { kind: 'CrossrefResolver' } }]; - expect(isAtomicSourceInfo({ s: 0 }, pool, atomicKinds)).toBe(true); + test('returns false for Generated (code 4) when by.kind is not atomic', () => { + // samplePool[5] has by.kind === 'sectionize'. + expect(isAtomicSourceInfo({ s: 5 }, samplePool, atomicKinds)).toBe(false); }); - test('returns false for Synthetic (code 4) when kind is not atomic', () => { - expect(isAtomicSourceInfo({ s: 4 }, samplePool, atomicKinds)).toBe(false); - }); - - test.each([0, 1, 2, 3])('returns false for non-Synthetic non-Derived code %d', (idx) => { + test.each([0, 1, 2, 3, 4])('returns false for non-Generated code %d', (idx) => { expect(isAtomicSourceInfo({ s: idx }, samplePool, atomicKinds)).toBe(false); }); test('returns false when entry is missing', () => { expect(isAtomicSourceInfo({}, samplePool, atomicKinds)).toBe(false); }); + + test('treats absent `from` as empty (canonical access pattern)', () => { + // Build a pool with one Generated entry that has no `from` field + // at all — the writer omits it when the anchor list is empty. + const pool: SourceInfoPool = [ + { t: 4, r: [0, 0], d: { by: { kind: 'shortcode' } } }, + ]; + expect(isAtomicSourceInfo({ s: 0 }, pool, atomicKinds)).toBe(true); + // `entry.d.from ?? []` is the canonical access pattern for + // consumers that want to iterate the anchor list. 
+ const entry = entryFor({ s: 0 }, pool); + if (entry?.t === 4) { + expect(entry.d.from ?? []).toEqual([]); + } else { + throw new Error('expected code-4 entry'); + } + }); }); -describe('ATOMIC_SYNTHETIC_KINDS', () => { +describe('ATOMIC_KINDS', () => { test('is exported as a ReadonlySet', () => { - expect(ATOMIC_SYNTHETIC_KINDS).toBeInstanceOf(Set); + expect(ATOMIC_KINDS).toBeInstanceOf(Set); + }); + + test('contains the Plan-4 atomic-kind set', () => { + // Mirrors `By::is_atomic_kind` on the Rust side + // (crates/quarto-source-map/src/source_info.rs). + expect(ATOMIC_KINDS.has('filter')).toBe(true); + expect(ATOMIC_KINDS.has('shortcode')).toBe(true); + expect(ATOMIC_KINDS.has('title-block')).toBe(true); + expect(ATOMIC_KINDS.has('tree-sitter-postprocess')).toBe(true); }); - test('is empty in 2A — Plan 4/6 will populate', () => { - expect(ATOMIC_SYNTHETIC_KINDS.size).toBe(0); + test('excludes known non-atomic kinds', () => { + expect(ATOMIC_KINDS.has('sectionize')).toBe(false); + expect(ATOMIC_KINDS.has('user-edit')).toBe(false); + expect(ATOMIC_KINDS.has('include')).toBe(false); }); }); diff --git a/ts-packages/preview-renderer/src/utils/sourceInfo.ts b/ts-packages/preview-renderer/src/utils/sourceInfo.ts index d9e95f4a9..982f353cc 100644 --- a/ts-packages/preview-renderer/src/utils/sourceInfo.ts +++ b/ts-packages/preview-renderer/src/utils/sourceInfo.ts @@ -4,9 +4,9 @@ * and by future features that need source-mapped lookups (preimage * navigation, source-mapped diagnostics). * - * Sync contract: `ATOMIC_SYNTHETIC_KINDS` mirrors the kinds returned - * by `By::is_atomic_synthesizer()` on the Rust side (Plan 4 / 6 - * landing). Update both together. + * Sync contract: `ATOMIC_KINDS` mirrors the kinds returned by + * `By::is_atomic_kind()` on the Rust side + * (`crates/quarto-source-map/src/source_info.rs`). Update both together. 
*/ import type { SourceInfoEntry, SourceInfoPool } from '../types/sourceInfo'; @@ -25,21 +25,8 @@ export function entryFor( } /** - * True iff the entry is a Derived (wire code 5) entry. Plan 6 populates - * Derived entries on shortcode resolutions. - */ -export function isDerived( - node: { s?: number }, - pool: SourceInfoPool | undefined, -): boolean { - const entry = entryFor(node, pool); - return entry?.t === 5; -} - -/** - * True iff the entry indicates an atomic transform — either Derived - * (always atomic) or Synthetic (code 4) whose `By::kind` is in the - * atomic-synthesizer set. + * True iff the entry indicates an atomic transform — a `Generated` + * entry (code 4) whose `By::kind` is in the atomic-producer set. * * Used by Plan 2B's atomic-aware dispatcher gate to decide whether * `setLocalAst` should be a no-op for the subtree. @@ -51,19 +38,22 @@ export function isAtomicSourceInfo( ): boolean { const entry = entryFor(node, pool); if (!entry) return false; - if (entry.t === 5) return true; - if (entry.t === 4) return atomicKinds.has(entry.d.kind); + if (entry.t === 4) return atomicKinds.has(entry.d.by.kind); return false; } /** - * Atomic-synthesizer kinds that mark entire Synthetic subtrees as - * read-only on the iframe side. Empty in 2A — Plan 4 / 6 populate this - * set as their `By` variants land. + * Atomic producer kinds that mark entire `Generated` subtrees as + * read-only on the iframe side. * - * Sync contract: mirrors `By::is_atomic_synthesizer()` on the Rust - * side. The Rust function and this set must agree on which kinds are - * atomic; otherwise q2-preview's edit-back gate desyncs from the - * pipeline's expectation. + * Sync contract: mirrors `By::is_atomic_kind()` on the Rust side + * (`crates/quarto-source-map/src/source_info.rs`). The Rust function + * and this set must agree on which kinds are atomic; otherwise + * q2-preview's edit-back gate desyncs from the pipeline's expectation. 
*/ -export const ATOMIC_SYNTHETIC_KINDS: ReadonlySet = new Set(); +export const ATOMIC_KINDS: ReadonlySet = new Set([ + 'filter', + 'shortcode', + 'title-block', + 'tree-sitter-postprocess', +]); diff --git a/ts-packages/preview-runtime/src/index.ts b/ts-packages/preview-runtime/src/index.ts index c75156a18..1963cf5e9 100644 --- a/ts-packages/preview-runtime/src/index.ts +++ b/ts-packages/preview-runtime/src/index.ts @@ -14,3 +14,4 @@ export * from './wasmRenderer'; export * from './automergeSync'; +export * from './pipelineKind'; diff --git a/hub-client/src/utils/pipelineKind.test.ts b/ts-packages/preview-runtime/src/pipelineKind.test.ts similarity index 100% rename from hub-client/src/utils/pipelineKind.test.ts rename to ts-packages/preview-runtime/src/pipelineKind.test.ts diff --git a/hub-client/src/utils/pipelineKind.ts b/ts-packages/preview-runtime/src/pipelineKind.ts similarity index 100% rename from hub-client/src/utils/pipelineKind.ts rename to ts-packages/preview-runtime/src/pipelineKind.ts diff --git a/ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts b/ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts index 8a256a755..7705274ca 100644 --- a/ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts +++ b/ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts @@ -74,8 +74,20 @@ declare module 'wasm-quarto-hub-client' { // QMD parsing and AST conversion functions export function parse_qmd_content(content: string): string; export function ast_to_qmd(ast_json: string): string; - /** Incrementally write a modified AST back to QMD, preserving unchanged source text. */ - export function incremental_write_qmd(original_qmd: string, new_ast_json: string): string; + /** + * Incrementally write a modified AST back to QMD, preserving unchanged + * source text. 
+ * + * Per Plan 7: the caller is responsible for passing a **baseline** AST + * (`baseline_ast_json`) whose source spans match `original_qmd` and + * whose tier matches `new_ast_json`. The bridge does not re-parse + * `original_qmd`; mixing tiers will corrupt the write. + */ + export function incremental_write_qmd( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ): string; // Response type for parse/write operations export interface AstResponse { @@ -86,6 +98,11 @@ declare module 'wasm-quarto-hub-client' { qmd?: string; error?: string; diagnostics?: AstDiagnostic[]; + /** + * Soft-drop warnings (Plan 7 Q-3-42 / Q-3-43) that rode alongside + * a successful incremental write. + */ + warnings?: AstDiagnostic[]; } export interface AstDiagnostic { diff --git a/ts-packages/preview-runtime/src/wasmRenderer.ts b/ts-packages/preview-runtime/src/wasmRenderer.ts index fc94d9835..756fd0a41 100644 --- a/ts-packages/preview-runtime/src/wasmRenderer.ts +++ b/ts-packages/preview-runtime/src/wasmRenderer.ts @@ -85,7 +85,11 @@ interface WasmModuleExtended { attribution_json: string | undefined, ) => Promise; write_qmd: (astJson: string) => Promise; - incremental_write_qmd(original_qmd: string, new_ast_json: string): string; + incremental_write_qmd( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ): string; convert: (document: string, inputFormat: string, outputFormat: string) => Promise; lsp_analyze_document: (path: string) => string; lsp_get_symbols: (path: string) => string; @@ -703,26 +707,73 @@ export async function writeQmd(astJson: string): Promise { } } +/** + * Result of `incrementalWriteQmd`: the rewritten QMD plus any + * soft-drop warnings (Q-3-42 / Q-3-43) that surfaced during + * reconciliation. Warnings ride alongside a *successful* write — the + * substituted edit reached source — and are the caller's + * responsibility to surface (or ignore) per its UX policy. 
+ */ +export interface IncrementalWriteQmdResult { + qmd: string; + warnings?: Diagnostic[]; +} + /** * Incrementally write a modified AST back to QMD, preserving unchanged * portions of the original source text verbatim. * + * Per Plan 7, the caller must pass the **baseline** AST — the AST + * whose source spans correspond to `originalQmd` — so the bridge can + * reconcile without re-parsing (which would discard provenance the + * host has already attached). The baseline AST and the new AST must + * be the same tier (e.g. both post-`parseQmdContent`). + * + * `baselineAst` is accepted as either a parsed AST object + * (`RustQmdJson`) or a pre-serialized JSON string — convenient for + * sync-client callers that already have a stringified cache. The + * bridge serializes the AST object branch internally. + * * Must call initWasm() before first use. */ -export function incrementalWriteQmd(originalQmd: string, newAst: RustQmdJson): string { +export function incrementalWriteQmd( + originalQmd: string, + baselineAst: RustQmdJson | string, + newAst: RustQmdJson, +): IncrementalWriteQmdResult { if (!wasmModule) { throw new Error('WASM not initialized. Call initWasm() first.') } + const baselineAstJson = + typeof baselineAst === 'string' ? 
baselineAst : JSON.stringify(baselineAst) const newAstJson = JSON.stringify(newAst) - const responseJson = wasmModule.incremental_write_qmd(originalQmd, newAstJson) + const responseJson = wasmModule.incremental_write_qmd( + originalQmd, + baselineAstJson, + newAstJson, + ) const response: AstResponse = JSON.parse(responseJson) if (!response.success || !response.qmd) { - throw new Error(`Incremental write failed: ${response.error}`) + // Distinguish the two failure modes — pre-fix this read "undefined": + // - response.success === false → real writer Err (response.error set) + // - response.success === true && response.qmd === "" → writer + // returned Ok with an empty document (every block soft-dropped + // via Q-3-43; bridge omits `error` in this case) + const reason = response.error + ?? (response.qmd === '' + ? 'writer returned empty qmd (warnings: ' + + (response.warnings?.length ?? 0) + + ')' + : 'no qmd field in response') + throw new Error(`Incremental write failed: ${reason}`) } - return response.qmd + return { + qmd: response.qmd, + warnings: response.warnings as Diagnostic[] | undefined, + } } /** diff --git a/ts-packages/quarto-sync-client/src/client.ts b/ts-packages/quarto-sync-client/src/client.ts index c63459e83..abedbe146 100644 --- a/ts-packages/quarto-sync-client/src/client.ts +++ b/ts-packages/quarto-sync-client/src/client.ts @@ -975,8 +975,15 @@ export function createSyncClient(callbacks: SyncClientCallbacks, astOptions?: AS const cached = astCache.get(path); if (astOptions.incrementalWriteQmd && cached) { - // Use incremental writer with cached original source - qmdText = astOptions.incrementalWriteQmd(cached.source, ast); + // Plan 7: pass the cached parsed AST as the baseline so the + // bridge does not have to re-parse `cached.source` (which would + // discard any host-side provenance attached after parse). + // `cached.ast` IS the baseline whose spans match `cached.source`. 
+ // The writer also returns warnings, but they are discarded here — + // the sync client is policy-free; demos / hub-client consume them + // via their own wrappers. + const result = astOptions.incrementalWriteQmd(cached.source, cached.ast, ast); + qmdText = result.qmd; } else { // Fallback to full rewrite qmdText = astOptions.writeQmd(ast); diff --git a/ts-packages/quarto-sync-client/src/types.ts b/ts-packages/quarto-sync-client/src/types.ts index 2c7792523..7eb12946b 100644 --- a/ts-packages/quarto-sync-client/src/types.ts +++ b/ts-packages/quarto-sync-client/src/types.ts @@ -162,11 +162,28 @@ export interface ASTOptions { * portions of the original source text verbatim. Falls back to `writeQmd` * if not provided or if the original source is not cached. * + * Per Plan 7, the caller must supply the **baseline** AST (the one + * whose source spans match `originalQmd`); the sync client passes + * the cached parsed AST for that file. The returned `warnings` are + * structured soft-drop diagnostics (`Q-3-42` / `Q-3-43`) that the + * sync client itself ignores — it stays policy-free. Wrapper code + * in demos / hub-client consumes them. + * + * The diagnostic shape is intentionally `unknown[]` here so the + * sync-client does not pull a render-side type dependency; callers + * typically narrow it to the wasm-bridge `AstDiagnostic` shape. + * + * @param originalQmd - The original QMD source text + * @param baselineAst - The cached parsed AST whose spans match `originalQmd` * @param newAst - The modified AST to write - * @returns The new QMD text with unchanged portions preserved + * @returns Object with `qmd` (rewritten source) and optional + * `warnings` (soft-drop diagnostics) */ - incrementalWriteQmd?: (originalQmd: string, newAst: unknown) => string; + incrementalWriteQmd?: ( + originalQmd: string, + baselineAst: unknown, + newAst: unknown, + ) => { qmd: string; warnings?: unknown[] }; /** * Filter which files should be parsed.