From 74fb51345e62f0c477b6e73e0d8d0b79c15e4b92 Mon Sep 17 00:00:00 2001 From: Michael Gartner Date: Sun, 17 May 2026 17:01:46 -0600 Subject: [PATCH 1/3] docs --- docs/atjson-canonical-storage-plan.md | 290 ++++++++++++++ docs/atjson-canonical-storage-scope.md | 429 ++++++++++++++++++++ docs/atjson-port-plan.md | 519 +++++++++++++++++++++++++ 3 files changed, 1238 insertions(+) create mode 100644 docs/atjson-canonical-storage-plan.md create mode 100644 docs/atjson-canonical-storage-scope.md create mode 100644 docs/atjson-port-plan.md diff --git a/docs/atjson-canonical-storage-plan.md b/docs/atjson-canonical-storage-plan.md new file mode 100644 index 000000000..997fef81d --- /dev/null +++ b/docs/atjson-canonical-storage-plan.md @@ -0,0 +1,290 @@ +# ATJSON canonical storage plan + +## Summary + +Add ATJSON as the canonical stored content representation for Discourse Graphs without interrupting the current working Obsidian sink/import flow. + +The important storage decision is: + +- `variant` remains the semantic slice of a node, such as `direct`, `full`, or `direct_and_description`. +- `Content.content_type` becomes the representation format, such as `text/plain`, `text/markdown`, or `application/vnd.discourse-graph.atjson+json; version=1`. +- The first rollout writes ATJSON alongside the current text and Markdown rows. Existing readers keep using the current rows until the app renderers are ready. + +This plan builds on `docs/atjson-port-plan.md`, but focuses on the storage and rollout path. + +Use `docs/atjson-port-plan.md` as the architecture plan for the shared content-model package and adapters. Use this file as the database and rollout plan for storing canonical ATJSON without switching destination readers immediately. 
+ +## Reference source map + +SamePage reference code exists locally under: + +- `C:\Users\Michael\Desktop\Areas\RoamJS\SamePage Repos\samepage.network` +- `C:\Users\Michael\Desktop\Areas\RoamJS\SamePage Repos\roam-samepage` +- `C:\Users\Michael\Desktop\Areas\RoamJS\SamePage Repos\obsidian-samepage` + +Files to port or adapt: + +- `samepage.network/package/internal/types.ts` + - Source for the original SamePage annotation schema, `SamePageSchema`, and ATJSON-compatible content/annotation shape. +- `samepage.network/package/utils/atJsonParser.ts` + - Source for generic lexer/parser helpers, `createEmptyAtJson`, `createTextAtJson`, and `combineAtJsons`. +- `samepage.network/package/utils/renderAtJson.ts` + - Source for generic annotation rendering order and prefix/suffix application. +- `roam-samepage/src/utils/blockParser.ts` + - Roam block text to SamePage ATJSON parser. +- `roam-samepage/src/utils/encodeState.ts` + - Roam page/tree to SamePage state encoder. This is the strongest reference for Roam-native ATJSON creation. +- `roam-samepage/src/utils/atJsonToRoam.ts` + - SamePage ATJSON to Roam string renderer. Keep as later renderer reference, not part of the first write-only storage pass. +- `roam-samepage/src/utils/decodeState.ts` + - SamePage state to Roam page/block materializer. Keep as later destination-render reference. +- `obsidian-samepage/src/utils/leafParser.ts` + - Obsidian Markdown to SamePage ATJSON parser. +- `obsidian-samepage/src/utils/atJsonToObsidian.ts` + - SamePage ATJSON to Obsidian Markdown renderer. Keep as later renderer reference. +- `roam-samepage/tests/blockGrammar.test.ts` and `obsidian-samepage/tests/leafParser.test.ts` + - Source fixtures for parser and round-trip behavior. + +Current Discourse Graphs code paths to preserve: + +- `packages/database/supabase/schemas/content.sql` + - Defines `Content`, `ContentVariant`, `my_contents`, `content_local_input`, and `upsert_content`. 
+- `packages/database/schema.yaml` + - LinkML source, currently stale around `ContentVariant` because SQL/generated types already include `full`. +- `packages/database/src/dbTypes.ts` + - Generated Supabase types used by both apps. +- `apps/obsidian/src/utils/upsertNodesAsContentWithEmbeddings.ts` + - Current Obsidian writer. Writes `direct` title rows and `full` Markdown rows. +- `apps/obsidian/src/utils/importNodes.ts` + - Current Obsidian import reader. Requires `direct` and `full`; must remain Markdown-based during write-only rollout. +- `apps/roam/src/utils/upsertNodesAsContentWithEmbeddings.ts` + - Current Roam writer. Writes `direct` or `direct_and_description` text rows. +- `apps/roam/src/utils/pageToMarkdown.ts` + - Current Roam Markdown export reference. + +## Data model changes + +Add a representation discriminator to `Content`: + +```sql +content_type text not null default 'text/plain' +``` + +Use these initial content types: + +- `text/plain` + - Plain searchable title or combined text. +- `text/markdown` + - Native Markdown representation used by the current Obsidian import path. +- `application/vnd.discourse-graph.atjson+json; version=1` + - DG canonical ATJSON representation. The structured document is stored in `Content.metadata`, and `Content.text` stores a derived plain-text projection. + +For ATJSON rows, use this row shape: + +```ts +{ + variant: "full", + content_type: DG_ATJSON_CONTENT_TYPE, + text: derivePlainTextFromDgDocument(document), + metadata: { + content: document, + }, +} +``` + +Do not serialize the ATJSON document into `Content.text`. `Content.text` should remain useful for search, previews, duplicate detection, and any existing text-centered tooling. + +Update the uniqueness model: + +```sql +unique (space_id, source_local_id, variant, content_type) +``` + +This prevents the new `full` ATJSON row from replacing the existing `full` Markdown row. 
+ +`FileReference` currently points at `Content(space_id, source_local_id, variant)` for the `full` row. When the content uniqueness key changes, `FileReference` must also distinguish representation: + +- add a generated `content_type` column with value `text/markdown` +- update the foreign key to `Content(space_id, source_local_id, variant, content_type)` +- keep file references attached to the current Markdown `full` row until asset handling is intentionally moved to ATJSON + +Backfill existing rows: + +- Existing `variant = 'full'` rows become `content_type = 'text/markdown'`. +- All other existing rows become `content_type = 'text/plain'`. + +Update these database surfaces: + +- `Content` table +- `my_contents` view +- `my_contents_with_embedding_openai_text_embedding_3_small_1536` view +- `FileReference` foreign key +- `content_local_input` +- `_local_content_to_db_content` +- `upsert_content` +- generated database types + +Do not add a new `ContentVariant` for ATJSON. ATJSON is a representation, not a semantic slice. + +Define shared constants instead of repeating raw strings: + +```ts +export const TEXT_PLAIN_CONTENT_TYPE = "text/plain"; +export const TEXT_MARKDOWN_CONTENT_TYPE = "text/markdown"; +export const DG_ATJSON_CONTENT_TYPE = + "application/vnd.discourse-graph.atjson+json; version=1"; +``` + +## Canonical model package + +Create `packages/content-model` with package name `@repo/content-model`. 
+ +The package should own a DG-specific document model rather than adopting SamePage's schema exactly: + +```ts +type DgDocument = { + version: "dg-content-model/v1"; + title: TextDocument; + body: BodyDocument; + metadata?: JsonObject; +}; +``` + +The canonical model should include: + +- top-level `title` and `body` +- inline annotations for title +- body block annotations with explicit block identity and parent linkage +- typed references for Roam pages, Roam blocks, and Obsidian wikilinks +- `appAttributes` only as a fidelity escape hatch + +Port first: + +- shared parser helper ideas from SamePage +- Obsidian Markdown to DG document conversion +- Roam page/block tree to DG document conversion +- validators for spans, title/body rules, block parents, and reference attributes + +Defer until after write-only storage: + +- DG document to Obsidian Markdown rendering +- DG document to Roam page/block rendering +- destination import reads from ATJSON +- HTML rendering + +## Write-only rollout + +### Database first + +Implement the `content_type` schema change and make sure all current readers still see the same rows they expect. + +Migration order: + +1. Add `Content.content_type text not null default 'text/plain'`. +2. Backfill existing `variant = 'full'` rows to `content_type = 'text/markdown'`. +3. Add generated `FileReference.content_type = 'text/markdown'`. +4. Add a unique index over `(space_id, source_local_id, variant, content_type)` so the new four-column foreign key has a unique target to reference. +5. Replace `FileReference`'s three-column content foreign key with a four-column foreign key, then drop the old `content_space_local_id_variant_idx` unique index. +6. Add `content_type` to `content_local_input`. +7. Update `_local_content_to_db_content` and `upsert_content` to read, insert, update, and conflict on `content_type`. +8. Regenerate database types. +9. Update current app queries to filter by expected `content_type`. +10. Enable ATJSON writes. 
+ +Every current query that relies on `direct` or `full` should be explicit about representation: + +- discovery/title rows: `variant = 'direct'` and `content_type = 'text/plain'` +- Obsidian import body rows: `variant = 'full'` and `content_type = 'text/markdown'` +- ATJSON rows: `variant = 'full'` and `content_type = 'application/vnd.discourse-graph.atjson+json; version=1'` + +### Obsidian write path + +Update `apps/obsidian/src/utils/upsertNodesAsContentWithEmbeddings.ts` so content changes write: + +- `direct/text/plain` +- `full/text/markdown` +- `full/application/vnd.discourse-graph.atjson+json; version=1` + +For the ATJSON row, store the `DgDocument` in `metadata.content` and store a derived plain-text projection in `text`. + +Keep embeddings only on intentional searchable text rows. Do not embed serialized ATJSON. If ATJSON rows are embedded later, embed their derived `text` projection, not the JSON payload. + +### Roam write path + +Update Roam sync so it keeps current text rows and adds: + +- `full/application/vnd.discourse-graph.atjson+json; version=1` + +Use Roam-native page/block structure as the source, following SamePage `encodeState.ts` rather than deriving canonical ATJSON from Markdown. + +For the ATJSON row, store the `DgDocument` in `metadata.content` and store a derived plain-text projection in `text`. + +If cross-app Markdown import remains active before ATJSON renderers are ready, Roam should also emit `full/text/markdown` for shared nodes or route shared extraction through a source-neutral reader that can provide that Markdown row. Do not make Obsidian depend on Roam-origin ATJSON until the ATJSON-to-Obsidian renderer is active. + +### Readers stay stable + +During this phase: + +- Obsidian import continues to materialize Markdown from `full/text/markdown`. +- Obsidian publish and asset handling continue unchanged. +- Roam sync continues its five-minute local-to-remote process. 
+- No destination app should prefer ATJSON until renderer parity is tested. + +## Later conversion rollout + +After ATJSON write coverage is stable: + +1. Port `atJsonToObsidian` into a DG document renderer. +2. Add tests showing DG ATJSON renders to equivalent Obsidian Markdown for representative nodes. +3. Add an Obsidian importer fallback order: + - prefer ATJSON only when renderer tests pass and source row exists + - fall back to `full/text/markdown` +4. Port `atJsonToRoam` and `decodeState` concepts into a Roam renderer/materializer. +5. Add Roam destination import from DG ATJSON. +6. Only after both destination paths are stable, decide whether Markdown remains a durable native export or becomes derived output. + +## Test plan + +Database tests: + +- `content_type` defaults to `text/plain`. +- Backfill maps existing `full` rows to `text/markdown`. +- `upsert_content` allows two rows with the same `(space_id, source_local_id, variant)` when `content_type` differs. +- `my_contents` includes `content_type`. +- `FileReference` still cascades from the Markdown `full` content row. +- ATJSON rows store the canonical document in `metadata.content`. +- ATJSON rows store derived searchable text in `text`, not serialized JSON. +- Embedding views continue to work for text rows. + +Content-model package tests: + +- valid and invalid spans +- invalid block parents +- title rejects block annotations +- Roam page refs versus block refs +- Obsidian wikilinks and aliases +- parser fixtures ported from SamePage Roam and Obsidian tests + +App regression tests: + +- Obsidian import still fetches `direct/text/plain` and `full/text/markdown`. +- Obsidian sync writes ATJSON rows without changing current Markdown rows. +- Roam sync writes ATJSON rows without removing or replacing current text rows. +- Serialized ATJSON is never stored in `Content.text` or sent for embeddings. 
+ +Manual validation: + +- Existing Obsidian publish/import flow still works before and after ATJSON rows are present. +- Existing Roam local-to-remote sync still completes. +- Published Obsidian nodes remain importable by current Obsidian importer. + +## Assumptions + +- `variant` is the semantic slice, not the representation. +- `content_type` is the representation discriminator. +- ATJSON payloads live in `Content.metadata.content`. +- `Content.text` on ATJSON rows is a derived plain-text projection. +- DG canonical model should be `DgDocument`, not SamePage's exact `SamePageSchema`. +- Initial ATJSON rollout is write-only. +- The current database generated types must be regenerated after schema changes. +- `packages/database/schema.yaml` should be reconciled with SQL because it currently omits the already-existing `full` content variant. diff --git a/docs/atjson-canonical-storage-scope.md b/docs/atjson-canonical-storage-scope.md new file mode 100644 index 000000000..3ced187d7 --- /dev/null +++ b/docs/atjson-canonical-storage-scope.md @@ -0,0 +1,429 @@ +# ATJSON canonical storage and conversion rollout scope + +## How to use + +Use this scope as the project contract for adding ATJSON as the canonical stored content representation in Discourse Graphs. + +This document clarifies: + +- what the v0 rollout is trying to accomplish +- what must keep working during the rollout +- what is in scope now +- what is explicitly deferred +- which user and system behaviors matter +- what needs to be built +- what counts as done +- what remains unknown + +Keep the detailed implementation plans in the related notes. This scope should stay lightweight enough to guide Linear issue creation and review. + +## Metadata + +- **Status**: DRAFT +- **Owner**: TBD +- **Reviewers**: DB admin, app/plugin owner, content-model owner +- **Related notes**: + - `docs/atjson-canonical-storage-plan.md` + - `docs/atjson-port-plan.md` +- **Related Linear project/issues**: TBD + +## 1. 
Summary + +- **One-sentence summary**: Add DG ATJSON as the canonical stored content representation while preserving the current Obsidian Markdown sink/import flow. +- **Problem**: Discourse Graphs currently stores app-native text and Markdown rows, but does not yet persist a portable canonical content model that can later render cleanly to Obsidian, Roam, and website publishing surfaces. +- **Proposed solution**: Add `Content.content_type`, keep `variant` as the semantic content slice, and write canonical DG ATJSON rows alongside existing text and Markdown rows, with the structured document stored in `Content.metadata.content` and derived plain text stored in `Content.text`. +- **Expected outcome**: The database can store canonical ATJSON without interrupting current Obsidian behavior, and the later ATJSON-to-Obsidian, ATJSON-to-Roam, and HTML conversion work has a stable storage target. + +## 2. Goal + Non-Goals + +### Goal + +Add write-only canonical ATJSON storage for Discourse Graphs content in a way that keeps all current app flows working. + +The v0 goal is specifically to: + +- treat `variant` as the semantic slice, such as `direct`, `full`, or `direct_and_description` +- treat `content_type` as the representation, such as `text/plain`, `text/markdown`, or `application/vnd.discourse-graph.atjson+json; version=1` +- store canonical DG ATJSON in `Content.metadata.content` +- keep `Content.text` as a derived plain-text projection for search, previews, duplicate detection, and existing text-centered tooling +- keep existing Obsidian imports reading `direct/text/plain` and `full/text/markdown` +- prepare for later ATJSON-to-Obsidian and ATJSON-to-Roam renderers + +### Non-Goals + +- Do not replace the current Obsidian Markdown import path in v0. +- Do not make destination readers prefer ATJSON in v0. +- Do not port SamePage wholesale. +- Do not introduce SamePage sync, networking, Automerge, IPFS, or protocol runtime code. 
+- Do not serialize ATJSON into `Content.text`. +- Do not embed serialized ATJSON JSON. +- Do not introduce a new `ContentVariant` for ATJSON. +- Do not make Markdown a final cross-app canonical format. + +## 3. v0 Scope + +### In scope + +- Add `Content.content_type`. +- Backfill existing content rows into explicit content types. +- Update content uniqueness to include `content_type`. +- Update `FileReference` to continue pointing at the Markdown `full` content row. +- Update `content_local_input`, `_local_content_to_db_content`, `upsert_content`, views, and generated database types. +- Add shared content type constants: + - `text/plain` + - `text/markdown` + - `application/vnd.discourse-graph.atjson+json; version=1` +- Create or use a DG-owned `DgDocument` canonical model. +- Write ATJSON rows from Obsidian without changing the current Markdown rows. +- Write ATJSON rows from Roam without removing existing text rows. +- Store ATJSON payloads in `metadata.content`. +- Store derived plain text in `text`. +- Add tests and manual validation proving the existing Obsidian sink/import still works. + +### Out of scope + +- ATJSON-preferred import. +- ATJSON-to-Obsidian rendering in production app reads. +- ATJSON-to-Roam rendering or materialization in production app reads. +- API content negotiation. +- Native export as a stored canonical format. +- Replacing existing Markdown asset/file-reference behavior. +- Continuous sync or automatic background import behavior beyond what exists today. + +### Deferred to v1+ + +- Port DG ATJSON to Obsidian Markdown rendering. +- Port DG ATJSON to Roam rendering/materialization. +- Add renderer parity tests and then switch destination readers to prefer ATJSON. +- Add HTML rendering for website publishing. +- Decide whether native exports should also be stored as durable representations. +- Decide long-term content API representation negotiation. + +## 4. 
In-Scope Use Cases + +### UC1: Existing Obsidian user continues publishing and importing content + +- **Actor**: Discourse Graphs user in Obsidian. +- **Trigger**: User publishes or imports content using the current working Obsidian flow. +- **Happy path**: + 1. User publishes or imports content from Obsidian. + 2. Existing `direct/text/plain` title rows and `full/text/markdown` body rows remain available. + 3. Obsidian import continues to materialize Markdown from the same representation it uses today. +- **Frequency**: Regularly; this is the compatibility baseline. + +### UC2: Obsidian-authored content is stored as canonical ATJSON + +- **Actor**: Discourse Graphs user in Obsidian. +- **Trigger**: User syncs or publishes an Obsidian-authored DG node. +- **Happy path**: + 1. User creates or updates a DG node in Obsidian. + 2. The current text and Markdown content rows are written as before. + 3. A canonical ATJSON row is also written for the same content. + 4. The ATJSON document is available for later cross-app rendering work. +- **Frequency**: Every Obsidian write after v0 rollout is enabled. + +### UC3: Roam-authored content is stored as canonical ATJSON + +- **Actor**: Discourse Graphs user in Roam. +- **Trigger**: User syncs a Roam page or block tree into Discourse Graphs. +- **Happy path**: + 1. User updates content in Roam. + 2. Existing Roam text rows continue to be written. + 3. A canonical ATJSON row is also written from the Roam-native page/block structure. + 4. The ATJSON document preserves enough structure for later Roam and Obsidian renderers. +- **Frequency**: Every Roam write after v0 rollout is enabled. + +### UC4: Engineer validates canonical payloads before read-path rollout + +- **Actor**: Engineer or database admin. +- **Trigger**: Preparing to build ATJSON-to-Obsidian, ATJSON-to-Roam, or HTML rendering. +- **Happy path**: + 1. Engineer queries content rows by `variant` and `content_type`. + 2. 
Engineer inspects `Content.metadata.content` for canonical DG ATJSON. + 3. Engineer uses the derived `Content.text` projection for search, previews, or debugging. + 4. Renderer work can proceed against real stored canonical documents without changing current readers. +- **Frequency**: During migration validation and renderer development. + +## 5. Constraints / Assumptions / Dependencies + +### Constraints + +- The current Obsidian sink/import path must keep working throughout the rollout. +- ATJSON must be added alongside existing rows before any reader is switched over. +- Existing text-centered tooling should still be able to use `Content.text`. +- `FileReference` must continue to attach to the current Markdown `full` row until asset handling is intentionally moved to ATJSON. +- Database type generation must happen after schema changes. + +### Assumptions + +- `variant` is the semantic content slice, not the representation format. +- `content_type` is the representation discriminator. +- ATJSON payloads live in `Content.metadata.content`. +- `Content.text` on ATJSON rows is derived plain text. +- The canonical model is DG-owned and ATJSON-compatible, not SamePage's exact runtime schema. +- The first rollout is write-only for ATJSON. + +### Dependencies + +- Database migration support for `packages/database/supabase/schemas/content.sql`. +- Generated Supabase type updates in `packages/database/src/dbTypes.ts`. +- Current app writers in `apps/obsidian` and `apps/roam`. +- SamePage reference parser and renderer code listed in `docs/atjson-canonical-storage-plan.md`. +- Shared `@repo/content-model` package work from `docs/atjson-port-plan.md`. + +## 6. Requirements + +### Functional requirements + +#### F1: Add content representation discriminator + +- **Milestone**: Milestone 2 +- **Requirement**: Add `Content.content_type text not null default 'text/plain'` and expose it through relevant content inputs, views, functions, and generated types. 
+- **Acceptance criteria**: + - Existing non-`full` rows resolve to `text/plain`. + - Existing `full` rows resolve to `text/markdown`. + - `my_contents` exposes `content_type`. + - App queries can filter by both `variant` and `content_type`. +- **Notes**: `content_type` distinguishes representation; it does not replace `variant`. + +#### F2: Allow multiple representations for the same content slice + +- **Milestone**: Milestone 2 +- **Requirement**: Update content uniqueness and upsert behavior to use `(space_id, source_local_id, variant, content_type)`. +- **Acceptance criteria**: + - A `full/text/markdown` row and a `full/application/vnd.discourse-graph.atjson+json; version=1` row can coexist. + - Upserting one representation does not overwrite another representation for the same content slice. + - `upsert_content` conflicts on the four-column key. +- **Notes**: Do not add ATJSON as a `ContentVariant`. + +#### F3: Store canonical ATJSON in metadata + +- **Milestone**: Milestones 3 and 4 +- **Requirement**: Store canonical DG ATJSON in `Content.metadata.content` and store only derived plain text in `Content.text`. +- **Acceptance criteria**: + - ATJSON rows use `content_type = 'application/vnd.discourse-graph.atjson+json; version=1'`. + - ATJSON rows store a valid `DgDocument` in `metadata.content`. + - ATJSON rows do not store serialized JSON in `text`. + - Search and preview tooling can use the derived plain-text projection. +- **Notes**: This reflects the database guidance that the content itself should be in the metadata JSON blob. + +#### F4: Preserve current Obsidian import behavior + +- **Milestone**: Milestones 2 and 4 +- **Requirement**: Keep Obsidian import reading `direct/text/plain` and `full/text/markdown` until ATJSON renderer parity exists. +- **Acceptance criteria**: + - Existing Obsidian import still succeeds after `content_type` is introduced. + - Obsidian import does not prefer ATJSON in v0. + - Current Markdown body behavior remains unchanged. 
+- **Notes**: This is the key compatibility requirement. + +#### F5: Keep file references attached to Markdown rows + +- **Milestone**: Milestone 2 +- **Requirement**: Update `FileReference` so it still points to the Markdown `full` content row after content uniqueness includes `content_type`. +- **Acceptance criteria**: + - `FileReference` has or derives `content_type = 'text/markdown'`. + - The foreign key references `(space_id, source_local_id, variant, content_type)`. + - Existing asset/file-reference behavior continues to work. +- **Notes**: Moving file references into ATJSON is deferred. + +#### F6: Write ATJSON alongside existing Obsidian rows + +- **Milestone**: Milestone 4 +- **Requirement**: Update the Obsidian writer to keep writing current rows and additionally write `full/application/vnd.discourse-graph.atjson+json; version=1`. +- **Acceptance criteria**: + - Obsidian still writes `direct/text/plain`. + - Obsidian still writes `full/text/markdown`. + - Obsidian also writes `full/application/vnd.discourse-graph.atjson+json; version=1`. + - The ATJSON row uses `metadata.content` for the canonical document and `text` for derived plain text. +- **Notes**: This should not change the current import path. + +#### F7: Write ATJSON alongside existing Roam rows + +- **Milestone**: Milestone 4 +- **Requirement**: Update the Roam writer to keep current text rows and additionally write `full/application/vnd.discourse-graph.atjson+json; version=1`. +- **Acceptance criteria**: + - Existing Roam write behavior remains intact. + - Roam ATJSON is derived from Roam-native page/block structure, not from Markdown when native structure is available. + - The ATJSON row uses `metadata.content` for the canonical document and `text` for derived plain text. +- **Notes**: If cross-app Markdown import is needed before ATJSON renderers are ready, Roam may also need to emit `full/text/markdown` for shared nodes. 
+ +#### F8: Define shared content type constants + +- **Milestone**: Milestone 3 +- **Requirement**: Define shared constants for supported content types and use them from app writers/readers. +- **Acceptance criteria**: + - Constants exist for `text/plain`, `text/markdown`, and DG ATJSON. + - App code does not rely on repeated raw content-type strings where a shared constant is available. +- **Notes**: The constants should live where both app code and storage integration code can use them without circular dependencies. + +#### F9: Map later conversion rollout without activating it + +- **Milestone**: Milestone 5 +- **Requirement**: Document the follow-up path for ATJSON-to-Obsidian, ATJSON-to-Roam, HTML rendering, and API representation negotiation. +- **Acceptance criteria**: + - The follow-up work is captured as deferred scope. + - Destination readers remain on current text/Markdown rows in v0. + - Renderer parity tests are required before reader switch-over. +- **Notes**: The implementation plan is in `docs/atjson-port-plan.md`. + +### Non-functional requirements + +#### N1: Backward compatibility + +- **Requirement**: Existing Obsidian publish/import and Roam write behavior must continue to work during and after the v0 rollout. +- **Acceptance criteria**: + - Manual validation confirms current Obsidian import still works after ATJSON rows exist. + - Existing readers do not break when multiple representations exist for the same content slice. + +#### N2: Data clarity + +- **Requirement**: Stored rows must make semantic slice and representation format explicit. +- **Acceptance criteria**: + - Engineers can identify content by both `variant` and `content_type`. + - ATJSON is not hidden inside `variant` names or serialized into text fields. + +#### N3: Search and embedding safety + +- **Requirement**: Text search, previews, duplicate detection, and embeddings must operate on human-readable text projections, not serialized ATJSON. 
+- **Acceptance criteria**: + - ATJSON rows store derived plain text in `Content.text`. + - Serialized ATJSON is never sent to embedding generation. + +#### N4: Type safety and maintainability + +- **Requirement**: TypeScript code should use typed content constants and shared model types where practical. +- **Acceptance criteria**: + - Generated database types are updated after schema changes. + - App code can compile against the new `content_type` field. + - New conversion code avoids `any` where practical. + +#### N5: Explicit rollout + +- **Requirement**: v0 should not introduce new implicit sync or automatic read-path switching. +- **Acceptance criteria**: + - ATJSON rows are written explicitly by existing write flows. + - Destination readers switch to ATJSON only in a later planned phase. + +## 7. Milestones + +### Milestone 1: Scope and storage decision finalized + +- **Deliverable**: Approved scope plus linked architecture and storage plans. +- **Acceptance criteria**: + - The team agrees that `variant` is the semantic slice. + - The team agrees that `content_type` is the representation discriminator. + - The team agrees that ATJSON content lives in `metadata.content`. + - The team agrees that `text` remains a derived plain-text projection. +- **Dependencies**: DB admin review. +- **Estimate**: 0.5-1 day. + +### Milestone 2: Database supports multiple representations + +- **Deliverable**: Schema, function, view, FileReference, and generated type updates. +- **Acceptance criteria**: + - Existing rows are backfilled into explicit content types. + - Multiple content representations can coexist for the same semantic slice. + - Current Obsidian import queries can still find title and Markdown body rows. + - File references still attach to Markdown `full` rows. +- **Dependencies**: Migration review and database type regeneration. +- **Estimate**: 1-2 days. 
+ +### Milestone 3: Canonical content model can be produced + +- **Deliverable**: Minimal DG-owned ATJSON-compatible `DgDocument` model and source-to-canonical conversion needed for write-only storage. +- **Acceptance criteria**: + - Obsidian content can produce a canonical document. + - Roam page/block content can produce a canonical document. + - Derived plain text can be produced from the canonical document. + - Validation covers spans, title/body rules, block parents, and reference attributes. +- **Dependencies**: `@repo/content-model` package work. +- **Estimate**: 2-5 days, depending on adapter depth included in v0. + +### Milestone 4: Write-only ATJSON rollout + +- **Deliverable**: Obsidian and Roam writers add ATJSON rows while preserving existing rows. +- **Acceptance criteria**: + - Obsidian writes current rows plus ATJSON. + - Roam writes current rows plus ATJSON. + - ATJSON payloads are in `metadata.content`. + - `text` contains derived plain text. + - Existing Obsidian import still works. +- **Dependencies**: Milestones 2 and 3. +- **Estimate**: 1-3 days. + +### Milestone 5: Deferred conversion rollout mapped + +- **Deliverable**: Follow-up scope or issues for renderers and read-path switch-over. +- **Acceptance criteria**: + - ATJSON-to-Obsidian renderer work is ticketed. + - ATJSON-to-Roam renderer/materializer work is ticketed. + - HTML rendering work is ticketed. + - API representation negotiation is explicitly deferred or scoped. +- **Dependencies**: Stored ATJSON examples from Milestone 4. +- **Estimate**: 0.5-1 day for scoping; implementation estimated separately. + +## 8. Open Questions + +### OQ1: Where should shared content type constants live? + +- **Current leaning**: Put them in the shared content-model package if it can be consumed cleanly by app writers; otherwise use a small shared database/content constants module. + +### OQ2: How much of the Obsidian and Roam parser work is required for v0? 
+ +- **Current leaning**: Implement only enough source-to-canonical conversion for write-only ATJSON rows, then deepen fidelity during renderer parity work. + +### OQ3: Should Roam emit `full/text/markdown` before ATJSON readers exist? + +- **Current leaning**: Only if cross-app Markdown import from Roam-authored shared nodes must work before ATJSON-to-Obsidian rendering is active. + +### OQ4: Should native export formats become stored durable representations? + +- **Current leaning**: Defer until canonical ATJSON storage and app renderers are stable. + +### OQ5: What is the long-term source for embeddings? + +- **Current leaning**: Keep embeddings based on human-readable derived text, not serialized structured content. + +## 9. Risks + +### Risk: Existing Obsidian import breaks during schema migration + +- **Impact**: Users lose the current working sink/import path. +- **Mitigation**: Backfill before enforcing new uniqueness, update queries to filter explicit content types, and manually validate current Obsidian import before enabling ATJSON writes. + +### Risk: ATJSON rows overwrite Markdown rows + +- **Impact**: Existing Markdown import behavior breaks or data is lost. +- **Mitigation**: Use `(space_id, source_local_id, variant, content_type)` as the uniqueness key and update `upsert_content` conflict behavior. + +### Risk: Serialized JSON enters search or embedding paths + +- **Impact**: Search quality drops, embeddings become noisy, and text tooling becomes harder to reason about. +- **Mitigation**: Store canonical content in `metadata.content`; store only derived plain text in `text`; add regression tests. + +### Risk: File references lose their target row + +- **Impact**: Assets attached to Obsidian Markdown content break. +- **Mitigation**: Add or derive `FileReference.content_type = 'text/markdown'` and update the foreign key to the four-column content key. 
+ +### Risk: Canonical model overfits SamePage + +- **Impact**: DG inherits SamePage-specific assumptions that do not fit Roam, Obsidian, and website publishing. +- **Mitigation**: Use SamePage as a reference, but define a DG-owned `DgDocument` with explicit title/body split, typed references, and block identity. + +### Risk: Readers switch to ATJSON before renderer parity + +- **Impact**: Import behavior changes before ATJSON-to-native rendering is trustworthy. +- **Mitigation**: Keep v0 write-only and require renderer parity tests before read-path switch-over. + +## 10. Approval Checklist + +- {{[[TODO]]}} Goal is clear. +- {{[[TODO]]}} v0 scope is clear. +- {{[[TODO]]}} Out-of-scope items are explicit. +- {{[[TODO]]}} Use cases describe user/system workflows, not implementation debate. +- {{[[TODO]]}} Functional requirements can become Linear issues. +- {{[[TODO]]}} Non-functional requirements are concrete and testable. +- {{[[TODO]]}} Milestones have acceptance criteria. +- {{[[TODO]]}} Open questions and risks are captured. diff --git a/docs/atjson-port-plan.md b/docs/atjson-port-plan.md new file mode 100644 index 000000000..1bd567715 --- /dev/null +++ b/docs/atjson-port-plan.md @@ -0,0 +1,519 @@ +# SamePage to Discourse Graphs content-model port + +## Summary + +This document scopes the SamePage ATJSON port into a Discourse Graphs-owned shared package. + +The immediate goal is to make **one canonical content model in code** that: + +- accepts content from Roam +- accepts content from Obsidian +- stores portable semantics in a DG-owned ATJSON-compatible shape +- renders that canonical shape back to Roam +- renders that canonical shape back to Obsidian +- renders that canonical shape to HTML for future website publishing + +The goal is **not** to port SamePage wholesale. We are only porting the document model, parser and renderer patterns, and app adapters that are useful for Roam, Obsidian, and HTML. 
+ +Canonical storage rollout is tracked separately in `docs/atjson-canonical-storage-plan.md`. That storage plan records the later decision that `variant` is the semantic content slice and `Content.content_type` is the representation format. This file remains the architecture plan for the DG-owned content-model package and conversion adapters. + +## Prompt corrections + +The original prompt mixes two different initiatives. They should be treated separately. + +### Initiative A: canonical content model in code + +This is the main effort in this document. + +- Create a new shared package under `packages/content-model` +- Define the canonical schema and validators +- Add generic parser and renderer utilities +- Add Obsidian adapters +- Add Roam adapters +- Add direct HTML rendering from the canonical model +- Wire apps to the shared package with thin integration layers + +### Initiative B: canonical storage and content negotiation + +This is now split into two parts: + +- Initiative B1: write-only canonical ATJSON storage, documented in `docs/atjson-canonical-storage-plan.md` +- Initiative B2: later read-path/content negotiation, after app renderers have parity tests + +The decided B1 storage shape is: + +- `variant` remains the semantic slice of a node, such as `direct`, `full`, or `direct_and_description` +- `Content.content_type` becomes the representation format, such as `text/plain`, `text/markdown`, or `application/vnd.discourse-graph.atjson+json; version=1` +- ATJSON is initially written alongside existing rows, with the structured document stored in `Content.metadata` and a derived plain-text projection stored in `Content.text` +- Existing readers keep using the current text and Markdown rows + +This split is important because the current repo uses `Content.text` for text search, discovery, and embeddings, while app-native payloads are still organized by content variants. 
Write-only ATJSON storage can land before destination renderers are active, but API negotiation and ATJSON-preferred import should wait for renderer parity. + +## Estimated effort + +These estimates assume **LLM-assisted implementation** for the initial porting work. + +That means LLMs are used for: + +- package scaffolding +- first-pass translation of SamePage parser and renderer patterns +- generation of repetitive type definitions and tests +- drafting Roam and Obsidian adapter code from the reference repos + +The estimates still assume human engineering time for: + +- design decisions +- integration into this repo +- debugging edge cases +- validating round-trip fidelity +- writing and fixing tests +- code review and cleanup + +### Expected engineering hours + +#### Initiative A: canonical content model in code + +| Slice | Scope | Estimated hours | +| ----- | ------------------------------------------------ | --------------: | +| 1 | Package scaffold, schema, validators, core tests | 6-10 | +| 2 | Generic parser and renderer core | 8-14 | +| 3 | Obsidian adapter and tests | 12-20 | +| 4 | Roam adapter and tests | 14-24 | +| 5 | HTML renderer and tests | 4-8 | +| 6 | Integration pass in apps and migration notes | 8-16 | +| | **Subtotal** | **52-92** | + +#### Initiative B: storage and API tracks + +| Slice | Scope | Estimated hours | +| ----- | -------------------------------------------------------------------------------------------------- | --------------: | +| 7A | Write-only ATJSON storage with `Content.content_type`, migrations, and app writer updates | 12-24 | +| 7B | ATJSON-preferred destination reads, content negotiation, and Markdown/HTML representation handling | 12-24 | +| | **Subtotal** | **24-48** | + +#### Overall estimate + +- **Code-first implementation only**: about **52-92 engineer hours** +- **Including storage and API tracks**: about **76-140 engineer hours** + +### Planning assumptions behind the estimate + +- LLMs can likely remove about 
**25-40%** of the low-level porting and boilerplate effort. +- LLMs do **not** remove most of the cost of integration, review, debugging, and round-trip correctness testing. +- The estimate assumes we are **porting from existing SamePage code**, not inventing the parsers and renderer from scratch. +- The estimate also assumes we are porting only the relevant document-model pieces, not the excluded SamePage sync, transport, or Automerge layers. +- Roam and Obsidian edge cases will likely dominate the uncertainty. +- The estimate assumes one engineer driving the work with fast feedback, not a long review queue. +- If write-only storage is implemented in parallel with adapter work, keep it behind the current text and Markdown read paths and expect coordination overhead near the high end of the estimate. + +## Scope + +### In scope + +- A DG-owned shared package for the canonical content model +- Top-level `title` and `body` split +- Portable annotation definitions +- Generic parser and renderer helpers inspired by SamePage +- Roam to canonical content conversion +- Canonical content to Roam conversion +- Obsidian to canonical content conversion +- Canonical content to Obsidian conversion +- Canonical content to HTML rendering +- Unit tests and round-trip tests +- Migration notes for future contributors + +### Out of scope + +- SamePage networking +- websockets +- IPFS +- Automerge sync logic +- SamePage local database code +- SamePage protocol code +- SamePage runtime schema ownership +- a standalone general Markdown transport beyond what is required for the Obsidian adapter +- ATJSON-preferred import, API negotiation, and destination read-path changes in the first content-model package pass + +## Package decision + +The new package should be: + +- path: `packages/content-model` +- package name: `@repo/content-model` + +This name makes the package purpose clear while still allowing the package to be ATJSON-oriented internally. 
+ +## Target architecture + +`@repo/content-model` owns: + +- canonical schema and exported types +- annotation definitions +- title and body document split +- validators +- parser and renderer helpers +- Roam adapters +- Obsidian adapters +- HTML rendering + +`apps/roam` should only own: + +- thin Roam integration +- access to Roam APIs +- any unavoidable Roam-only wiring + +`apps/obsidian` should only own: + +- thin Obsidian integration +- access to Obsidian APIs +- any unavoidable Obsidian-only wiring + +`apps/website` should later consume canonical HTML output rather than building a separate Roam-specific or Obsidian-specific publishing path. + +## Canonical document design + +The canonical document should be a DG-owned ATJSON-compatible structure with a strict top-level split between title and body. + +### Top-level type + +```ts +type DgDocument = { + version: "dg-content-model/v1"; + title: TextDocument; + body: BodyDocument; + metadata?: JsonObject; +}; +``` + +### Text containers + +```ts +type TextDocument = { + text: string; + annotations: InlineAnnotation[]; +}; + +type BodyDocument = { + text: string; + annotations: BodyAnnotation[]; +}; +``` + +`title` should only contain inline annotations. + +`body` should contain: + +- block structure +- inline formatting +- links +- images +- references +- optional app-specific fidelity attributes + +### Annotation set + +The v1 canonical annotation set should include: + +- `block` +- `bold` +- `italics` +- `strikethrough` +- `code` +- `link` +- `image` +- `reference` + +Every annotation should include: + +```ts +type AnnotationBase = { + start: number; + end: number; + appAttributes?: Record<string, unknown>; +}; +``` + +### Block annotation + +Block annotations should carry explicit hierarchy information instead of relying only on indentation depth. 
+ +```ts +type BlockAnnotation = AnnotationBase & { + type: "block"; + attributes: { + blockId: string; + parentBlockId?: string; + depth: number; + viewType: "paragraph" | "bullet" | "numbered"; + }; +}; +``` + +This is an intentional divergence from SamePage. SamePage used `level` and `viewType`; Discourse Graphs should keep `depth` but also carry explicit block identity and parent linkage so block hierarchy is reconstructable without relying on positional inference alone. + +### Inline annotations + +```ts +type BoldAnnotation = AnnotationBase & { + type: "bold"; + attributes?: { + delimiter?: string; + open?: boolean; + }; +}; + +type ItalicsAnnotation = AnnotationBase & { + type: "italics"; + attributes?: { + delimiter?: string; + open?: boolean; + }; +}; + +type StrikethroughAnnotation = AnnotationBase & { + type: "strikethrough"; + attributes?: { + delimiter?: string; + open?: boolean; + }; +}; + +type CodeAnnotation = AnnotationBase & { + type: "code"; + attributes: { + language?: string; + ticks?: number; + display?: "inline" | "block"; + }; +}; + +type LinkAnnotation = AnnotationBase & { + type: "link"; + attributes: { + href: string; + title?: string; + }; +}; + +type ImageAnnotation = AnnotationBase & { + type: "image"; + attributes: { + src: string; + alt?: string; + title?: string; + }; +}; +``` + +### Reference annotation + +References should be explicit and typed. + +```ts +type ReferenceAnnotation = AnnotationBase & { + type: "reference"; + attributes: + | { + kind: "roam-page"; + pageTitle: string; + pageUid?: string; + } + | { + kind: "roam-block"; + blockUid: string; + } + | { + kind: "obsidian-wikilink"; + path: string; + subpath?: string; + alias?: string; + }; +}; +``` + +Generic URL links should remain `link` annotations, not `reference` annotations. 
+ +### Validation rules + +The shared package should provide validators that: + +- reject negative spans +- reject zero-length spans +- reject spans that exceed document length +- reject block parents that do not exist +- reject `title` documents that contain block annotations +- validate `reference.kind`-specific attributes + +## Module boundaries + +The package should expose these modules: + +- `@repo/content-model/schema` +- `@repo/content-model/validate` +- `@repo/content-model/core` +- `@repo/content-model/adapters/obsidian` +- `@repo/content-model/adapters/roam` +- `@repo/content-model/render/html` + +Recommended internal structure: + +- `src/schema.ts` +- `src/annotations.ts` +- `src/validate.ts` +- `src/core/parser.ts` +- `src/core/render.ts` +- `src/core/annotations.ts` +- `src/adapters/obsidian/fromObsidian.ts` +- `src/adapters/obsidian/toObsidian.ts` +- `src/adapters/roam/fromRoam.ts` +- `src/adapters/roam/toRoam.ts` +- `src/render/toHtml.ts` + +## Implementation slices + +### Slice 1: package scaffold and canonical schema + +Deliver: + +- package scaffold under `packages/content-model` +- exported document and annotation types +- validators +- tests for title and body documents +- tests for nested and overlapping annotations +- tests for block hierarchy representation +- tests for reference variants + +Notes: + +- Do not implement app adapters yet +- Do not add backend storage changes yet + +### Slice 2: generic parser and renderer core + +Deliver: + +- simplified DG-owned parser helpers +- annotation combination helpers +- renderer runtime for annotation application +- tests for annotation ordering and nested rendering + +Notes: + +- Port the reusable ideas from SamePage +- Do not carry over sync-specific or protocol-specific machinery +- Avoid overfitting to a single app + +### Slice 3: Obsidian adapter + +Deliver: + +- `fromObsidian` +- `toObsidian` +- tests for paragraphs, bullets, numbered lists, code fences, links, images, and wikilinks +- minimal 
integration points in `apps/obsidian` + +Notes: + +- Keep the title and body split explicit +- Use `appAttributes` only where portable attributes cannot preserve full fidelity + +### Slice 4: Roam adapter + +Deliver: + +- `fromRoam` +- `toRoam` +- tests for page title, block hierarchy, page refs, block refs, links, images, and code fences +- minimal integration points in `apps/roam` + +Notes: + +- Do not collapse page refs and block refs into the same vague type +- Preserve Roam-specific fidelity with `appAttributes` only where necessary + +### Slice 5: HTML renderer + +Deliver: + +- `toHtml` +- tests for representative title and body documents +- clear semantic mapping from canonical content to HTML + +Notes: + +- There should be one canonical HTML path +- Do not implement separate Roam-to-HTML and Obsidian-to-HTML renderers +- Next.js can consume the HTML output later + +### Slice 6: app integration pass + +Deliver: + +- thin app wiring in `apps/roam` +- thin app wiring in `apps/obsidian` +- removal or isolation of duplicated parsing and rendering logic where the shared package should be canonical +- migration notes + +Notes: + +- Keep the current backend upload model working during this phase +- Do not combine destination renderer rollout with a storage migration +- If write-only storage lands in parallel, follow `docs/atjson-canonical-storage-plan.md` and keep current readers on text and Markdown rows + +### Slice 7: storage and API tracks + +Deliver: + +- write-only canonical backend storage as a separate track, using `Content.content_type` +- later representation negotiation for future content APIs +- migration plan for existing plain text and Markdown content rows + +Notes: + +- Write-only storage has a concrete plan in `docs/atjson-canonical-storage-plan.md` +- ATJSON rows store the canonical structured document in `Content.metadata` +- `Content.text` stores a derived plain-text projection for search, discovery, and fallback display +- 
`Content.content_type` distinguishes `text/plain`, `text/markdown`, and DG ATJSON +- Do not move app import paths to ATJSON until renderer parity tests exist + +## Test plan + +The shared package should have package-local tests that cover: + +- valid `title` and `body` documents +- invalid spans and invalid block parents +- nested inline annotations +- overlapping annotations where ordering matters +- block hierarchy reconstruction +- explicit reference kind variants +- generic renderer ordering behavior +- Obsidian round-trips +- Roam round-trips +- HTML rendering for representative canonical documents + +## Intentional divergences from SamePage + +The DG implementation should intentionally diverge from SamePage in these places: + +- no `SamePageSchema` as the canonical runtime type +- no Automerge counters or Automerge-specific schema shape +- no SamePage content type version header +- no metadata annotation for the title +- explicit top-level `title` and `body` +- explicit block identity and parent linkage +- app-specific attributes are an escape hatch, not the primary semantics + +## Open questions for later phases + +These are intentionally not part of the first content-model package implementation pass: + +- whether `Content.text` remains the long-term search and embedding source or is always derived from canonical content +- how `GET` content APIs should negotiate ATJSON, HTML, and Markdown +- whether Markdown becomes a first-class standalone transport outside the Obsidian adapter + +Resolved storage decision: + +- canonical ATJSON initially lives in `Content.metadata`, not as serialized JSON in `Content.text` +- `Content.text` on ATJSON rows is a derived plain-text projection +- `Content.content_type` distinguishes ATJSON from text and Markdown +- `variant` remains the semantic slice, not the representation format From 4e27c72e11de902b7c59b2e2bf1d01b8a088a884 Mon Sep 17 00:00:00 2001 From: Michael Gartner Date: Sun, 17 May 2026 21:05:25 -0600 Subject: 
[PATCH 2/3] Implement ATJSON canonical content storage --- apps/obsidian/package.json | 2 + apps/obsidian/src/utils/importContentTypes.ts | 49 ++ apps/obsidian/src/utils/importNodes.ts | 29 +- apps/obsidian/src/utils/publishNode.ts | 2 + .../src/utils/syncDgNodesToSupabase.ts | 7 + .../upsertNodesAsContentWithEmbeddings.ts | 93 ++-- .../obsidian/tests/atjsonContentWrite.test.ts | 338 ++++++++++++++ apps/roam/package.json | 2 + apps/roam/src/utils/cleanupOrphanedNodes.ts | 2 + apps/roam/src/utils/contentEmbeddingSplit.ts | 24 + apps/roam/src/utils/syncDgNodesToSupabase.ts | 21 +- .../upsertNodesAsContentWithEmbeddings.ts | 110 ++++- apps/roam/tests/atjsonContentWrite.test.ts | 205 +++++++++ docs/atjson-canonical-storage-plan.md | 23 +- docs/atjson-canonical-storage-scope.md | 12 +- docs/atjson-port-plan.md | 4 +- packages/content-model/eslint.config.mjs | 13 + packages/content-model/package.json | 33 ++ .../content-model/src/adapters/obsidian.ts | 116 +++++ packages/content-model/src/adapters/roam.ts | 238 ++++++++++ packages/content-model/src/constants.ts | 8 + .../content-model/src/content-model.test.ts | 377 +++++++++++++++ packages/content-model/src/core/index.ts | 2 + packages/content-model/src/core/parse.ts | 431 ++++++++++++++++++ packages/content-model/src/core/render.ts | 303 ++++++++++++ packages/content-model/src/index.ts | 7 + packages/content-model/src/render/html.ts | 137 ++++++ packages/content-model/src/schema.ts | 134 ++++++ packages/content-model/src/text.ts | 28 ++ packages/content-model/src/validate.ts | 210 +++++++++ packages/content-model/tsconfig.json | 9 + packages/database/doc/upsert_content.md | 177 +++++-- .../features/atjsonContentType.feature | 76 +++ .../features/step-definitions/stepdefs.ts | 157 +++++-- packages/database/package.json | 4 +- packages/database/schema.yaml | 13 + packages/database/src/dbDotEnv.cjs | 105 +++++ packages/database/src/dbTypes.ts | 38 +- packages/database/src/lib/queries.ts | 97 ++-- 
.../20260517172000_atjson_content_type.sql | 289 ++++++++++++ packages/database/supabase/schemas/assets.sql | 7 +- .../database/supabase/schemas/content.sql | 45 +- .../database/supabase/schemas/embedding.sql | 3 +- .../database/tests/atjsonSqlContract.test.ts | 140 ++++++ packages/database/tsconfig.json | 1 + pnpm-lock.yaml | 28 ++ 46 files changed, 3935 insertions(+), 214 deletions(-) create mode 100644 apps/obsidian/src/utils/importContentTypes.ts create mode 100644 apps/obsidian/tests/atjsonContentWrite.test.ts create mode 100644 apps/roam/src/utils/contentEmbeddingSplit.ts create mode 100644 apps/roam/tests/atjsonContentWrite.test.ts create mode 100644 packages/content-model/eslint.config.mjs create mode 100644 packages/content-model/package.json create mode 100644 packages/content-model/src/adapters/obsidian.ts create mode 100644 packages/content-model/src/adapters/roam.ts create mode 100644 packages/content-model/src/constants.ts create mode 100644 packages/content-model/src/content-model.test.ts create mode 100644 packages/content-model/src/core/index.ts create mode 100644 packages/content-model/src/core/parse.ts create mode 100644 packages/content-model/src/core/render.ts create mode 100644 packages/content-model/src/index.ts create mode 100644 packages/content-model/src/render/html.ts create mode 100644 packages/content-model/src/schema.ts create mode 100644 packages/content-model/src/text.ts create mode 100644 packages/content-model/src/validate.ts create mode 100644 packages/content-model/tsconfig.json create mode 100644 packages/database/features/atjsonContentType.feature create mode 100644 packages/database/src/dbDotEnv.cjs create mode 100644 packages/database/supabase/migrations/20260517172000_atjson_content_type.sql create mode 100644 packages/database/tests/atjsonSqlContract.test.ts diff --git a/apps/obsidian/package.json b/apps/obsidian/package.json index 70537f7fc..215bc1011 100644 --- a/apps/obsidian/package.json +++ b/apps/obsidian/package.json 
@@ -9,6 +9,7 @@ "build": "tsx scripts/build.ts", "lint": "eslint .", "lint:fix": "eslint . --fix", + "test": "node --import tsx --test tests/atjsonContentWrite.test.ts", "publish": "tsx scripts/publish.ts --version 0.1.0", "check-types": "tsc --noEmit --skipLibCheck" }, @@ -39,6 +40,7 @@ }, "dependencies": { "@codemirror/view": "^6.38.8", + "@repo/content-model": "workspace:*", "@repo/database": "workspace:*", "@repo/utils": "workspace:*", "@supabase/supabase-js": "catalog:", diff --git a/apps/obsidian/src/utils/importContentTypes.ts b/apps/obsidian/src/utils/importContentTypes.ts new file mode 100644 index 000000000..ac7b3fa15 --- /dev/null +++ b/apps/obsidian/src/utils/importContentTypes.ts @@ -0,0 +1,49 @@ +import { + TEXT_MARKDOWN_CONTENT_TYPE, + TEXT_PLAIN_CONTENT_TYPE, +} from "@repo/content-model"; + +export type ObsidianImportContentVariant = "direct" | "full"; + +type ObsidianImportContentRow = { + variant: string | null; + // eslint-disable-next-line @typescript-eslint/naming-convention + content_type: string | null; +}; + +export const OBSIDIAN_IMPORT_CONTENT_TYPES = [ + TEXT_PLAIN_CONTENT_TYPE, + TEXT_MARKDOWN_CONTENT_TYPE, +] as const; + +export const getContentTypeForObsidianImportVariant = ( + variant: ObsidianImportContentVariant, +): (typeof OBSIDIAN_IMPORT_CONTENT_TYPES)[number] => + variant === "full" ? TEXT_MARKDOWN_CONTENT_TYPE : TEXT_PLAIN_CONTENT_TYPE; + +export const isObsidianImportDirectRow = ( + row: ObsidianImportContentRow, +): boolean => + row.variant === "direct" && + (row.content_type ?? TEXT_PLAIN_CONTENT_TYPE) === TEXT_PLAIN_CONTENT_TYPE; + +export const isObsidianImportFullRow = ( + row: ObsidianImportContentRow, +): boolean => + row.variant === "full" && + (row.content_type ?? 
TEXT_MARKDOWN_CONTENT_TYPE) === + TEXT_MARKDOWN_CONTENT_TYPE; + +export const selectObsidianImportContentRows = < + T extends ObsidianImportContentRow, +>( + rows: T[], +): { + direct: T | undefined; + full: T | undefined; +} => ({ + direct: rows.find(isObsidianImportDirectRow), + full: rows.find(isObsidianImportFullRow), +}); + +export { TEXT_MARKDOWN_CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE }; diff --git a/apps/obsidian/src/utils/importNodes.ts b/apps/obsidian/src/utils/importNodes.ts index 4ef6ec171..8e7d08912 100644 --- a/apps/obsidian/src/utils/importNodes.ts +++ b/apps/obsidian/src/utils/importNodes.ts @@ -1,5 +1,11 @@ /* eslint-disable @typescript-eslint/naming-convention */ import type { Json } from "@repo/database/dbTypes"; +import { + OBSIDIAN_IMPORT_CONTENT_TYPES, + TEXT_PLAIN_CONTENT_TYPE, + getContentTypeForObsidianImportVariant, + selectObsidianImportContentRows, +} from "./importContentTypes"; import matter from "gray-matter"; import { App, Notice, TFile } from "obsidian"; import type { DGSupabaseClient } from "@repo/database/lib/client"; @@ -66,9 +72,10 @@ export const getPublishedNodesForGroups = async ({ const { data, error } = await client .from("my_contents") .select( - "source_local_id, space_id, text, created, last_modified, variant, metadata, author_id", + "source_local_id, space_id, text, created, last_modified, variant, content_type, metadata, author_id", ) - .neq("space_id", currentSpaceId); + .neq("space_id", currentSpaceId) + .in("content_type", OBSIDIAN_IMPORT_CONTENT_TYPES); if (error) { console.error("Error fetching published nodes:", error); @@ -86,6 +93,7 @@ export const getPublishedNodesForGroups = async ({ created: string | null; last_modified: string | null; variant: string | null; + content_type: string | null; author_id: number | null; metadata: Json; }; @@ -109,7 +117,7 @@ export const getPublishedNodesForGroups = async ({ const latest = withDate.reduce((a, b) => (a.last_modified ?? "") >= (b.last_modified ?? "") ? 
a : b, ); - const direct = rows.find((r) => r.variant === "direct"); + const { direct } = selectObsidianImportContentRows(rows); const text = direct?.text ?? latest.text ?? ""; const createdAt = latest.created ? new Date(latest.created + "Z").valueOf() @@ -267,6 +275,7 @@ export const fetchNodeContent = async ({ .eq("source_local_id", nodeInstanceId) .eq("space_id", spaceId) .eq("variant", variant) + .eq("content_type", getContentTypeForObsidianImportVariant(variant)) .maybeSingle(); if (error || !data || data.text == null) { @@ -301,6 +310,7 @@ export const fetchNodeContentWithMetadata = async ({ .eq("source_local_id", nodeInstanceId) .eq("space_id", spaceId) .eq("variant", variant) + .eq("content_type", getContentTypeForObsidianImportVariant(variant)) .maybeSingle(); if (error || !data || data.text == null) { @@ -342,10 +352,13 @@ const fetchNodeContentForImport = async ({ } | null> => { const { data, error } = await client .from("my_contents") - .select("text, created, last_modified, variant, metadata, author_id") + .select( + "text, created, last_modified, variant, content_type, metadata, author_id", + ) .eq("source_local_id", nodeInstanceId) .eq("space_id", spaceId) - .in("variant", ["direct", "full"]); + .in("variant", ["direct", "full"]) + .in("content_type", OBSIDIAN_IMPORT_CONTENT_TYPES); if (error) { console.error("Error fetching node content for import:", error); @@ -358,10 +371,10 @@ const fetchNodeContentForImport = async ({ last_modified: string | null; author_id: number | null; variant: string | null; + content_type: string | null; metadata: Json; }>; - const direct = rows.find((r) => r.variant === "direct"); - const full = rows.find((r) => r.variant === "full"); + const { direct, full } = selectObsidianImportContentRows(rows); const authorId = full?.author_id ?? direct?.author_id ?? 
null; if ( @@ -412,6 +425,7 @@ export const getSourceContentDates = async ({ .eq("source_local_id", nodeInstanceId) .eq("space_id", spaceId) .eq("variant", "direct") + .eq("content_type", TEXT_PLAIN_CONTENT_TYPE) .maybeSingle(); if (error || !data) return null; return { @@ -1542,6 +1556,7 @@ export const refreshImportedFile = async ({ .eq("space_id", spaceId) .eq("source_local_id", frontmatter.nodeInstanceId) .eq("variant", "direct") + .eq("content_type", TEXT_PLAIN_CONTENT_TYPE) .maybeSingle(); const metadata = metadataResp.data?.metadata; const filePath: string | undefined = diff --git a/apps/obsidian/src/utils/publishNode.ts b/apps/obsidian/src/utils/publishNode.ts index ae6e359cd..7fc8645db 100644 --- a/apps/obsidian/src/utils/publishNode.ts +++ b/apps/obsidian/src/utils/publishNode.ts @@ -21,6 +21,7 @@ import { isProvisionalSchema } from "./typeUtils"; import type { DiscourseNodeInVault } from "./getDiscourseNodes"; import type { SupabaseContext } from "./supabaseContext"; import type { TablesInsert } from "@repo/database/dbTypes"; +import { TEXT_MARKDOWN_CONTENT_TYPE } from "@repo/content-model"; const publishSchema = async ({ client, @@ -445,6 +446,7 @@ export const publishNodeToGroup = async ({ .eq("source_local_id", nodeId) .eq("space_id", spaceId) .eq("variant", "full") + .eq("content_type", TEXT_MARKDOWN_CONTENT_TYPE) .maybeSingle(); if (idResponse.error || !idResponse.data) { throw idResponse.error || new Error("no data while fetching node"); diff --git a/apps/obsidian/src/utils/syncDgNodesToSupabase.ts b/apps/obsidian/src/utils/syncDgNodesToSupabase.ts index 17351f5a2..31869ff4d 100644 --- a/apps/obsidian/src/utils/syncDgNodesToSupabase.ts +++ b/apps/obsidian/src/utils/syncDgNodesToSupabase.ts @@ -23,6 +23,10 @@ import { } from "./conceptConversion"; import { loadRelations } from "~/utils/relationsStore"; import type { LocalConceptDataInput } from "@repo/database/inputTypes"; +import { + TEXT_MARKDOWN_CONTENT_TYPE, + TEXT_PLAIN_CONTENT_TYPE, +} from 
"@repo/content-model"; import { type DiscourseNodeInVault, collectDiscourseNodesFromVault, @@ -59,6 +63,7 @@ const getAllNodeInstanceIdsFromSupabase = async ( .select("source_local_id") .eq("space_id", spaceId) .eq("scale", "document") + .eq("content_type", TEXT_PLAIN_CONTENT_TYPE) .not("source_local_id", "is", null); if (error) { @@ -168,6 +173,7 @@ const getLastContentSyncTime = async ( .from("my_contents") .select("last_modified") .eq("space_id", spaceId) + .in("content_type", [TEXT_PLAIN_CONTENT_TYPE, TEXT_MARKDOWN_CONTENT_TYPE]) .order("last_modified", { ascending: false }) .limit(1) .maybeSingle(); @@ -273,6 +279,7 @@ const getExistingTitlesFromDatabase = async ( .select("source_local_id, text") .eq("space_id", spaceId) .eq("variant", "direct") + .eq("content_type", TEXT_PLAIN_CONTENT_TYPE) .in("source_local_id", nodeInstanceIds); if (directError) { diff --git a/apps/obsidian/src/utils/upsertNodesAsContentWithEmbeddings.ts b/apps/obsidian/src/utils/upsertNodesAsContentWithEmbeddings.ts index 50e3c5f76..d445280f1 100644 --- a/apps/obsidian/src/utils/upsertNodesAsContentWithEmbeddings.ts +++ b/apps/obsidian/src/utils/upsertNodesAsContentWithEmbeddings.ts @@ -1,15 +1,21 @@ /* eslint-disable @typescript-eslint/naming-convention */ import { nextApiRoot } from "@repo/utils/execContext"; -import { DGSupabaseClient } from "@repo/database/lib/client"; -import { Json, CompositeTypes } from "@repo/database/dbTypes"; -import { SupabaseContext } from "./supabaseContext"; -import { ObsidianDiscourseNodeData, ChangeType } from "./syncDgNodesToSupabase"; -import { default as DiscourseGraphPlugin } from "~/index"; +import type { DGSupabaseClient } from "@repo/database/lib/client"; +import type { Json, CompositeTypes } from "@repo/database/dbTypes"; +import { + DG_ATJSON_CONTENT_TYPE, + TEXT_MARKDOWN_CONTENT_TYPE, + TEXT_PLAIN_CONTENT_TYPE, + createDgAtJsonMetadata, + derivePlainTextFromDgDocument, + obsidianMarkdownToDgDocument, +} from "@repo/content-model"; +import type { 
SupabaseContext } from "./supabaseContext"; +import type { ObsidianDiscourseNodeData } from "./syncDgNodesToSupabase"; +import type DiscourseGraphPlugin from "~/index"; type LocalContentDataInput = Partial>; -type ContentVariant = "direct" | "full"; - const EMBEDDING_BATCH_SIZE = 200; const EMBEDDING_MODEL = "openai_text_embedding_3_small_1536"; @@ -19,31 +25,16 @@ type EmbeddingApiResponse = { }[]; }; -/** - * Determine which content variants to create based on change types - */ -const getVariantsToCreate = (changeTypes: ChangeType[]): ContentVariant[] => { - const variants: ContentVariant[] = []; - - if (changeTypes.includes("title")) { - variants.push("direct"); - } - - if (changeTypes.includes("content")) { - variants.push("full"); - } - - return variants; -}; - const createNodeContentEntries = async ( node: ObsidianDiscourseNodeData, accountLocalId: string, plugin: DiscourseGraphPlugin, ): Promise => { - const variantsToCreate = getVariantsToCreate(node.changeTypes); + const shouldWriteDirect = node.changeTypes.includes("title"); + const shouldWriteMarkdown = node.changeTypes.includes("content"); + const shouldWriteAtJson = shouldWriteDirect || shouldWriteMarkdown; - if (variantsToCreate.length === 0) { + if (!shouldWriteDirect && !shouldWriteMarkdown && !shouldWriteAtJson) { return []; } @@ -59,25 +50,45 @@ const createNodeContentEntries = async ( const entries: LocalContentDataInput[] = []; // Create direct entry (title) if needed - will get embeddings - if (variantsToCreate.includes("direct")) { + if (shouldWriteDirect) { entries.push({ ...baseEntry, text: node.file.basename, variant: "direct", + content_type: TEXT_PLAIN_CONTENT_TYPE, metadata: { filePath: node.file.path }, }); } - // Create full entry (content) if needed - no embeddings - if (variantsToCreate.includes("full")) { + if (shouldWriteMarkdown || shouldWriteAtJson) { try { const fullContent = await plugin.app.vault.read(node.file); - entries.push({ - ...baseEntry, - text: fullContent, - variant: 
"full", - metadata: node.frontmatter as Json, + if (shouldWriteMarkdown) { + entries.push({ + ...baseEntry, + text: fullContent, + variant: "full", + content_type: TEXT_MARKDOWN_CONTENT_TYPE, + metadata: node.frontmatter as Json, + }); + } + const document = obsidianMarkdownToDgDocument({ + title: node.file.basename, + markdown: fullContent, + metadata: { + filePath: node.file.path, + frontmatter: node.frontmatter as Json, + }, }); + if (shouldWriteAtJson) { + entries.push({ + ...baseEntry, + text: derivePlainTextFromDgDocument(document), + variant: "full", + content_type: DG_ATJSON_CONTENT_TYPE, + metadata: createDgAtJsonMetadata({ document }) as unknown as Json, + }); + } } catch (error) { console.error(`Error reading file content for ${node.file.path}:`, error); } @@ -202,7 +213,7 @@ export const upsertNodesToSupabaseAsContentWithEmbeddings = async ({ return; } - // Create two entries per node: one "direct" (title) and one "full" (content) + // Create representation rows based on the changed slice: title, Markdown body, and canonical ATJSON. const allContentEntries = await convertObsidianNodeToLocalContent({ nodes: obsidianNodes, accountLocalId, @@ -210,10 +221,16 @@ export const upsertNodesToSupabaseAsContentWithEmbeddings = async ({ }); const directVariantEntries = allContentEntries.filter( - (entry) => entry.variant === "direct", + (entry) => + entry.variant === "direct" && + (entry.content_type ?? TEXT_PLAIN_CONTENT_TYPE) === + TEXT_PLAIN_CONTENT_TYPE, ); const fullVariantEntries = allContentEntries.filter( - (entry) => entry.variant === "full", + (entry) => + entry.variant === "full" || + (entry.content_type ?? TEXT_PLAIN_CONTENT_TYPE) !== + TEXT_PLAIN_CONTENT_TYPE, ); let directEntriesWithEmbeddings: LocalContentDataInput[]; @@ -223,7 +240,7 @@ export const upsertNodesToSupabaseAsContentWithEmbeddings = async ({ } catch (error: unknown) { const errorMessage = error instanceof Error ? 
error.message : String(error); console.error( - `upsertNodesToSupabaseAsContentWithEmbeddings: Embedding service failed – ${errorMessage}`, + `upsertNodesToSupabaseAsContentWithEmbeddings: Embedding service failed - ${errorMessage}`, ); throw new Error(errorMessage); } diff --git a/apps/obsidian/tests/atjsonContentWrite.test.ts b/apps/obsidian/tests/atjsonContentWrite.test.ts new file mode 100644 index 000000000..efb7b7694 --- /dev/null +++ b/apps/obsidian/tests/atjsonContentWrite.test.ts @@ -0,0 +1,338 @@ +/* eslint-disable @typescript-eslint/naming-convention */ +import assert from "node:assert/strict"; +import { readFileSync } from "node:fs"; +import { join } from "node:path"; +import test from "node:test"; +import { + DG_ATJSON_CONTENT_TYPE, + TEXT_MARKDOWN_CONTENT_TYPE, + TEXT_PLAIN_CONTENT_TYPE, + derivePlainTextFromDgDocument, + type DgDocument, +} from "@repo/content-model"; +import { + OBSIDIAN_IMPORT_CONTENT_TYPES, + getContentTypeForObsidianImportVariant, + selectObsidianImportContentRows, +} from "../src/utils/importContentTypes"; +import { upsertNodesToSupabaseAsContentWithEmbeddings } from "../src/utils/upsertNodesAsContentWithEmbeddings"; + +const readAppSource = (relativePath: string): string => + readFileSync(join(process.cwd(), relativePath), "utf8"); + +const createEmbeddingResponse = (input: string[]): Response => + new Response( + JSON.stringify({ + data: input.map(() => ({ embedding: Array(1536).fill(0) })), + }), + { + status: 200, + headers: new Headers([["Content-Type", "application/json"]]), + }, + ); + +void test("Obsidian writer uploads ATJSON without embedding serialized content", async () => { + const originalFetch = globalThis.fetch; + const embeddingInputs: string[][] = []; + globalThis.fetch = ((url: string | URL | Request, init?: RequestInit) => { + void url; + const body = JSON.parse(String(init?.body ?? 
"{}")) as { input: string[] }; + embeddingInputs.push(body.input); + return Promise.resolve(createEmbeddingResponse(body.input)); + }) as typeof fetch; + + const uploadedRows: Array> = []; + const supabaseClient = { + rpc: (name: string, args: { data: Array> }) => { + void name; + uploadedRows.push(...args.data); + return Promise.resolve({ error: null }); + }, + }; + const file = { + basename: "Claim note", + path: "Claim note.md", + }; + const plugin = { + app: { + vault: { + read: () => + Promise.resolve( + "---\nnodeTypeId: claim\n---\n- **Human readable** [[Evidence]]", + ), + }, + }, + }; + + try { + await upsertNodesToSupabaseAsContentWithEmbeddings({ + obsidianNodes: [ + { + file, + frontmatter: { nodeTypeId: "claim" }, + nodeTypeId: "claim", + nodeInstanceId: "node-1", + created: "2026-01-01T00:00:00.000Z", + last_modified: "2026-01-02T00:00:00.000Z", + changeTypes: ["title", "content"], + }, + ] as never, + supabaseClient: supabaseClient as never, + context: { spaceId: 1, userId: 2 } as never, + accountLocalId: "user-1", + plugin: plugin as never, + }); + } finally { + globalThis.fetch = originalFetch; + } + + assert.deepEqual(embeddingInputs, [["Claim note"]]); + assert.equal(uploadedRows.length, 3); + + const direct = uploadedRows.find( + (row) => row.content_type === TEXT_PLAIN_CONTENT_TYPE, + ); + const markdown = uploadedRows.find( + (row) => row.content_type === TEXT_MARKDOWN_CONTENT_TYPE, + ); + const atjson = uploadedRows.find( + (row) => row.content_type === DG_ATJSON_CONTENT_TYPE, + ); + + assert.equal(direct?.variant, "direct"); + assert.equal(direct?.text, "Claim note"); + assert.ok(direct?.embedding_inline); + + assert.equal(markdown?.variant, "full"); + assert.equal(typeof markdown?.text, "string"); + assert.match(markdown.text as string, /Human readable/); + assert.equal(markdown.embedding_inline, undefined); + + assert.equal(atjson?.variant, "full"); + assert.equal(typeof atjson?.text, "string"); + assert.equal((atjson.text as 
string).includes("{"), false); + assert.equal(atjson.embedding_inline, undefined); + const content = (atjson.metadata as { content: DgDocument }).content; + assert.equal(content.version, "dg-content-model/v1"); + assert.equal(atjson.text, derivePlainTextFromDgDocument(content)); +}); + +void test("Obsidian title-only changes refresh ATJSON title", async () => { + const originalFetch = globalThis.fetch; + const embeddingInputs: string[][] = []; + globalThis.fetch = ((url: string | URL | Request, init?: RequestInit) => { + void url; + const body = JSON.parse(String(init?.body ?? "{}")) as { input: string[] }; + embeddingInputs.push(body.input); + return Promise.resolve(createEmbeddingResponse(body.input)); + }) as typeof fetch; + + const uploadedRows: Array> = []; + const supabaseClient = { + rpc: (name: string, args: { data: Array> }) => { + void name; + uploadedRows.push(...args.data); + return Promise.resolve({ error: null }); + }, + }; + const plugin = { + app: { + vault: { + read: () => Promise.resolve("Existing body"), + }, + }, + }; + + try { + await upsertNodesToSupabaseAsContentWithEmbeddings({ + obsidianNodes: [ + { + file: { + basename: "Renamed claim", + path: "Renamed claim.md", + }, + frontmatter: { nodeTypeId: "claim" }, + nodeTypeId: "claim", + nodeInstanceId: "node-1", + created: "2026-01-01T00:00:00.000Z", + last_modified: "2026-01-03T00:00:00.000Z", + changeTypes: ["title"], + }, + ] as never, + supabaseClient: supabaseClient as never, + context: { spaceId: 1, userId: 2 } as never, + accountLocalId: "user-1", + plugin: plugin as never, + }); + } finally { + globalThis.fetch = originalFetch; + } + + assert.deepEqual(embeddingInputs, [["Renamed claim"]]); + assert.equal(uploadedRows.length, 2); + assert.equal( + uploadedRows.some((row) => row.content_type === TEXT_MARKDOWN_CONTENT_TYPE), + false, + ); + + const atjson = uploadedRows.find( + (row) => row.content_type === DG_ATJSON_CONTENT_TYPE, + ); + assert.equal(atjson?.text, "Renamed 
claim\n\nExisting body"); + const content = (atjson?.metadata as { content: DgDocument }).content; + assert.equal(content.title.text, "Renamed claim"); + assert.equal(atjson?.text, derivePlainTextFromDgDocument(content)); +}); + +void test("Obsidian content-only changes write body rows without embeddings", async () => { + const originalFetch = globalThis.fetch; + let embeddingRequestCount = 0; + globalThis.fetch = ((url: string | URL | Request, init?: RequestInit) => { + void url; + void init; + embeddingRequestCount += 1; + return Promise.resolve(createEmbeddingResponse([])); + }) as typeof fetch; + + const uploadedRows: Array> = []; + const supabaseClient = { + rpc: (name: string, args: { data: Array> }) => { + void name; + uploadedRows.push(...args.data); + return Promise.resolve({ error: null }); + }, + }; + const plugin = { + app: { + vault: { + read: () => Promise.resolve("Updated **body**"), + }, + }, + }; + + try { + await upsertNodesToSupabaseAsContentWithEmbeddings({ + obsidianNodes: [ + { + file: { + basename: "Claim note", + path: "Claim note.md", + }, + frontmatter: { nodeTypeId: "claim" }, + nodeTypeId: "claim", + nodeInstanceId: "node-1", + created: "2026-01-01T00:00:00.000Z", + last_modified: "2026-01-04T00:00:00.000Z", + changeTypes: ["content"], + }, + ] as never, + supabaseClient: supabaseClient as never, + context: { spaceId: 1, userId: 2 } as never, + accountLocalId: "user-1", + plugin: plugin as never, + }); + } finally { + globalThis.fetch = originalFetch; + } + + assert.equal(embeddingRequestCount, 0); + assert.equal(uploadedRows.length, 2); + assert.equal( + uploadedRows.some((row) => row.content_type === TEXT_PLAIN_CONTENT_TYPE), + false, + ); + assert.ok( + uploadedRows.some( + (row) => + row.variant === "full" && + row.content_type === TEXT_MARKDOWN_CONTENT_TYPE && + row.text === "Updated **body**", + ), + ); + assert.ok( + uploadedRows.some( + (row) => + row.variant === "full" && + row.content_type === DG_ATJSON_CONTENT_TYPE && + 
row.embedding_inline === undefined, + ), + ); +}); + +void test("Obsidian import stays on plain title and Markdown body rows", () => { + assert.deepEqual(OBSIDIAN_IMPORT_CONTENT_TYPES, [ + TEXT_PLAIN_CONTENT_TYPE, + TEXT_MARKDOWN_CONTENT_TYPE, + ]); + assert.equal( + getContentTypeForObsidianImportVariant("direct"), + TEXT_PLAIN_CONTENT_TYPE, + ); + assert.equal( + getContentTypeForObsidianImportVariant("full"), + TEXT_MARKDOWN_CONTENT_TYPE, + ); + + const { direct, full } = selectObsidianImportContentRows([ + { + variant: "full", + content_type: DG_ATJSON_CONTENT_TYPE, + text: "Derived ATJSON text", + }, + { + variant: "direct", + content_type: TEXT_PLAIN_CONTENT_TYPE, + text: "Claim note", + }, + { + variant: "full", + content_type: TEXT_MARKDOWN_CONTENT_TYPE, + text: "# Markdown body", + }, + ]); + + assert.equal(direct?.text, "Claim note"); + assert.equal(full?.text, "# Markdown body"); + + const legacyRows = selectObsidianImportContentRows([ + { + variant: "direct", + content_type: null, + text: "Legacy claim", + }, + { + variant: "full", + content_type: null, + text: "Legacy Markdown body", + }, + ]); + + assert.equal(legacyRows.direct?.text, "Legacy claim"); + assert.equal(legacyRows.full?.text, "Legacy Markdown body"); +}); + +void test("Obsidian reader and publish paths stay on plain and Markdown rows", () => { + const importNodesSource = readAppSource("src/utils/importNodes.ts"); + const publishNodeSource = readAppSource("src/utils/publishNode.ts"); + const syncSource = readAppSource("src/utils/syncDgNodesToSupabase.ts"); + + assert.match( + importNodesSource, + /\.in\("content_type", OBSIDIAN_IMPORT_CONTENT_TYPES\)/, + ); + assert.match( + importNodesSource, + /\.eq\("content_type", getContentTypeForObsidianImportVariant\(variant\)\)/, + ); + assert.equal(importNodesSource.includes("DG_ATJSON_CONTENT_TYPE"), false); + + assert.match( + publishNodeSource, + /\.eq\("content_type", TEXT_MARKDOWN_CONTENT_TYPE\)/, + ); + assert.match( + syncSource, + 
/\.in\("content_type", \[TEXT_PLAIN_CONTENT_TYPE, TEXT_MARKDOWN_CONTENT_TYPE\]\)/, + ); + assert.match(syncSource, /\.eq\("content_type", TEXT_PLAIN_CONTENT_TYPE\)/); +}); diff --git a/apps/roam/package.json b/apps/roam/package.json index e911904ad..c091998c1 100644 --- a/apps/roam/package.json +++ b/apps/roam/package.json @@ -8,6 +8,7 @@ "deploy": "tsx scripts/deploy.ts", "lint": "eslint .", "lint:fix": "eslint . --fix", + "test": "node --import tsx --test tests/atjsonContentWrite.test.ts", "publish": "tsx scripts/publish.ts", "check-types": "tsc --noEmit --skipLibCheck" }, @@ -42,6 +43,7 @@ "@dnd-kit/utilities": "^3.2.2", "@octokit/auth-app": "^7.1.4", "@octokit/core": "^6.1.3", + "@repo/content-model": "workspace:*", "@repo/database": "workspace:*", "@repo/utils": "workspace:*", "@supabase/functions-js": "catalog:", diff --git a/apps/roam/src/utils/cleanupOrphanedNodes.ts b/apps/roam/src/utils/cleanupOrphanedNodes.ts index 882a6a1d2..4a76fa04e 100644 --- a/apps/roam/src/utils/cleanupOrphanedNodes.ts +++ b/apps/roam/src/utils/cleanupOrphanedNodes.ts @@ -3,6 +3,7 @@ import type { SupabaseContext } from "./supabaseContext"; import type { DGSupabaseClient } from "@repo/database/lib/client"; import type { Tables } from "@repo/database/dbTypes"; import internalError from "./internalError"; +import { TEXT_PLAIN_CONTENT_TYPE } from "@repo/content-model"; const getAllNodesFromSupabase = async ( supabaseClient: DGSupabaseClient, @@ -58,6 +59,7 @@ const getAllNodesFromSupabase = async ( .select("source_local_id") .eq("space_id", spaceId) .eq("scale", "block") + .eq("content_type", TEXT_PLAIN_CONTENT_TYPE) .not("source_local_id", "is", null); if (blockContentResponse.error) { diff --git a/apps/roam/src/utils/contentEmbeddingSplit.ts b/apps/roam/src/utils/contentEmbeddingSplit.ts new file mode 100644 index 000000000..5765165a9 --- /dev/null +++ b/apps/roam/src/utils/contentEmbeddingSplit.ts @@ -0,0 +1,24 @@ +import { TEXT_PLAIN_CONTENT_TYPE } from "@repo/content-model"; + 
+type ContentWithType = { + // eslint-disable-next-line @typescript-eslint/naming-convention + content_type?: string | null; +}; + +export const splitEmbeddableContentNodes = ( + contentNodes: T[], +): { + embeddableContentNodes: T[]; + nonEmbeddableContentNodes: T[]; +} => ({ + embeddableContentNodes: contentNodes.filter( + (node) => + (node.content_type ?? TEXT_PLAIN_CONTENT_TYPE) === + TEXT_PLAIN_CONTENT_TYPE, + ), + nonEmbeddableContentNodes: contentNodes.filter( + (node) => + (node.content_type ?? TEXT_PLAIN_CONTENT_TYPE) !== + TEXT_PLAIN_CONTENT_TYPE, + ), +}); diff --git a/apps/roam/src/utils/syncDgNodesToSupabase.ts b/apps/roam/src/utils/syncDgNodesToSupabase.ts index cd4390193..c659b6fb2 100644 --- a/apps/roam/src/utils/syncDgNodesToSupabase.ts +++ b/apps/roam/src/utils/syncDgNodesToSupabase.ts @@ -21,6 +21,7 @@ import { fetchEmbeddingsForNodes } from "./upsertNodesAsContentWithEmbeddings"; import { convertRoamNodeToLocalContent } from "./upsertNodesAsContentWithEmbeddings"; import type { DGSupabaseClient } from "@repo/database/lib/client"; import type { Json, CompositeTypes, Enums } from "@repo/database/dbTypes"; +import { splitEmbeddableContentNodes } from "./contentEmbeddingSplit"; import { render as renderToast } from "roamjs-components/components/Toast"; import internalError from "~/utils/internalError"; type LocalContentDataInput = Partial>; @@ -198,7 +199,7 @@ export const proposeSyncTask = async ( if (error) { console.error( - `proposeSyncTask: propose_sync_task failed – ${error.message}`, + `proposeSyncTask: propose_sync_task failed - ${error.message}`, ); return { shouldProceed: false }; } @@ -346,21 +347,23 @@ export const upsertNodesToSupabaseAsContentWithEmbeddings = async ( const allNodeInstancesAsLocalContent = convertRoamNodeToLocalContent({ nodes: roamNodes, }); + const { + embeddableContentNodes: embeddableContent, + nonEmbeddableContentNodes: nonEmbeddableContent, + } = splitEmbeddableContentNodes(allNodeInstancesAsLocalContent); let 
nodesWithEmbeddings: LocalContentDataInput[]; try { - nodesWithEmbeddings = await fetchEmbeddingsForNodes( - allNodeInstancesAsLocalContent, - ); + nodesWithEmbeddings = await fetchEmbeddingsForNodes(embeddableContent); } catch (error) { const message = error instanceof Error ? error.message : String(error); console.error( - `upsertNodesToSupabaseAsContentWithEmbeddings: Embedding service failed – ${message}`, + `upsertNodesToSupabaseAsContentWithEmbeddings: Embedding service failed - ${message}`, ); throw new Error(message); } - if (nodesWithEmbeddings.length !== allNodeInstancesAsLocalContent.length) { + if (nodesWithEmbeddings.length !== embeddableContent.length) { console.error( "upsertNodesToSupabaseAsContentWithEmbeddings: Mismatch between node and embedding counts.", ); @@ -386,7 +389,9 @@ export const upsertNodesToSupabaseAsContentWithEmbeddings = async ( } }; - await uploadBatches(chunk(nodesWithEmbeddings, BATCH_SIZE)); + await uploadBatches( + chunk([...nodesWithEmbeddings, ...nonEmbeddableContent], BATCH_SIZE), + ); }; const getAllUsers = async (): Promise => { @@ -538,7 +543,7 @@ export const createOrUpdateDiscourseEmbedding = async (showToast = false) => { doSync = false; return; } - const jitter = 0.9 + Math.random() * 0.2; // 0.9x–1.1x + const jitter = 0.9 + Math.random() * 0.2; // 0.9x-1.1x timeout *= 2 ** numFailures * jitter; } if (activeTimeout != null) { diff --git a/apps/roam/src/utils/upsertNodesAsContentWithEmbeddings.ts b/apps/roam/src/utils/upsertNodesAsContentWithEmbeddings.ts index 829e792c2..11703d4c4 100644 --- a/apps/roam/src/utils/upsertNodesAsContentWithEmbeddings.ts +++ b/apps/roam/src/utils/upsertNodesAsContentWithEmbeddings.ts @@ -2,8 +2,20 @@ import { type RoamDiscourseNodeData } from "./getAllDiscourseNodesSince"; import { type SupabaseContext } from "./supabaseContext"; import { nextApiRoot } from "@repo/utils/execContext"; +import { + DG_ATJSON_CONTENT_TYPE, + TEXT_PLAIN_CONTENT_TYPE, + createDgAtJsonMetadata, + 
derivePlainTextFromDgDocument, + roamTextToDgDocument, + roamTreeToDgDocument, + type DgDocument, + type RoamTreeNode, +} from "@repo/content-model"; import type { DGSupabaseClient } from "@repo/database/lib/client"; import type { Json, CompositeTypes } from "@repo/database/dbTypes"; +import getFullTreeByParentUid from "roamjs-components/queries/getFullTreeByParentUid"; +import { splitEmbeddableContentNodes } from "./contentEmbeddingSplit"; type LocalContentDataInput = Partial>; @@ -16,25 +28,101 @@ type EmbeddingApiResponse = { }[]; }; +type RoamTreeLike = { + uid?: string; + text?: string; + viewType?: "bullet" | "numbered" | "document"; + children?: RoamTreeLike[]; +}; + +const toRoamTreeNode = ( + node: RoamTreeLike, + fallbackUid: string, +): RoamTreeNode => ({ + uid: node.uid ?? fallbackUid, + text: node.text ?? "", + viewType: node.viewType, + children: (node.children ?? []).map((child, index) => + toRoamTreeNode(child, `${fallbackUid}-${index + 1}`), + ), +}); + +const createRoamAtJsonDocument = (node: RoamDiscourseNodeData): DgDocument => { + const title = node.node_title ?? node.text; + try { + const tree = getFullTreeByParentUid(node.source_local_id) as RoamTreeLike; + const treeChildren = node.node_title + ? [ + toRoamTreeNode( + { + ...tree, + uid: node.source_local_id, + text: node.text || tree.text, + }, + node.source_local_id, + ), + ] + : (tree.children ?? []).map((child, index) => + toRoamTreeNode(child, `${node.source_local_id}-${index + 1}`), + ); + return roamTreeToDgDocument({ + title, + pageUid: node.source_local_id, + children: treeChildren, + metadata: { + sourceLocalId: node.source_local_id, + nodeTypeId: node.type, + }, + }); + } catch (error) { + console.warn( + `Falling back to text-only Roam ATJSON for ${node.source_local_id}:`, + error, + ); + return roamTextToDgDocument({ + title, + text: node.node_title ? 
node.text : "", + sourceLocalId: node.source_local_id, + metadata: { + nodeTypeId: node.type, + }, + }); + } +}; + export const convertRoamNodeToLocalContent = ({ nodes, }: { nodes: RoamDiscourseNodeData[]; }): LocalContentDataInput[] => { - return nodes.map((node) => { + return nodes.flatMap((node) => { const variant = node.node_title ? "direct_and_description" : "direct"; const text = node.node_title ? `${node.node_title} ${node.text}` : node.text; - return { + const baseEntry = { author_local_id: node.author_local_id, source_local_id: node.source_local_id, created: new Date(node.created || Date.now()).toISOString(), last_modified: new Date(node.last_modified || Date.now()).toISOString(), - text: text, - variant: variant, - scale: "document", + scale: "document" as const, }; + const document = createRoamAtJsonDocument(node); + return [ + { + ...baseEntry, + text: text, + variant: variant, + content_type: TEXT_PLAIN_CONTENT_TYPE, + }, + { + ...baseEntry, + text: derivePlainTextFromDgDocument(document), + variant: "full", + content_type: DG_ATJSON_CONTENT_TYPE, + metadata: createDgAtJsonMetadata({ document }) as unknown as Json, + }, + ]; }); }; @@ -103,7 +191,7 @@ const uploadBatches = async ( batches: LocalContentDataInput[][], supabaseClient: DGSupabaseClient, context: SupabaseContext, -) => { +): Promise => { const { spaceId, userId } = context; for (let idx = 0; idx < batches.length; idx++) { const batch = batches[idx]; @@ -137,19 +225,21 @@ export const upsertNodesToSupabaseAsContentWithEmbeddings = async ( const localContentNodes = convertRoamNodeToLocalContent({ nodes: roamNodes, }); + const { embeddableContentNodes, nonEmbeddableContentNodes } = + splitEmbeddableContentNodes(localContentNodes); let nodesWithEmbeddings: LocalContentDataInput[]; try { - nodesWithEmbeddings = await fetchEmbeddingsForNodes(localContentNodes); + nodesWithEmbeddings = await fetchEmbeddingsForNodes(embeddableContentNodes); } catch (error: unknown) { const errorMessage = error 
instanceof Error ? error.message : String(error); console.error( - `upsertNodesToSupabaseAsContentWithEmbeddings: Embedding service failed – ${errorMessage}`, + `upsertNodesToSupabaseAsContentWithEmbeddings: Embedding service failed - ${errorMessage}`, ); return; } - if (nodesWithEmbeddings.length !== roamNodes.length) { + if (nodesWithEmbeddings.length !== embeddableContentNodes.length) { console.error( "upsertNodesToSupabaseAsContentWithEmbeddings: Mismatch between node and embedding counts.", ); @@ -167,7 +257,7 @@ export const upsertNodesToSupabaseAsContentWithEmbeddings = async ( }; await uploadBatches( - chunk(nodesWithEmbeddings, batchSize), + chunk([...nodesWithEmbeddings, ...nonEmbeddableContentNodes], batchSize), supabaseClient, context, ); diff --git a/apps/roam/tests/atjsonContentWrite.test.ts b/apps/roam/tests/atjsonContentWrite.test.ts new file mode 100644 index 000000000..598718cff --- /dev/null +++ b/apps/roam/tests/atjsonContentWrite.test.ts @@ -0,0 +1,205 @@ +/* eslint-disable @typescript-eslint/naming-convention */ +import assert from "node:assert/strict"; +import { readFileSync } from "node:fs"; +import { join } from "node:path"; +import test from "node:test"; +import { + DG_ATJSON_CONTENT_TYPE, + TEXT_MARKDOWN_CONTENT_TYPE, + TEXT_PLAIN_CONTENT_TYPE, + derivePlainTextFromDgDocument, + type DgDocument, +} from "@repo/content-model"; +import { splitEmbeddableContentNodes } from "../src/utils/contentEmbeddingSplit"; +import { + convertRoamNodeToLocalContent, + upsertNodesToSupabaseAsContentWithEmbeddings, +} from "../src/utils/upsertNodesAsContentWithEmbeddings"; + +const readAppSource = (relativePath: string): string => + readFileSync(join(process.cwd(), relativePath), "utf8"); + +const createEmbeddingResponse = (input: string[]): Response => + new Response( + JSON.stringify({ + data: input.map(() => ({ embedding: Array(1536).fill(0) })), + }), + { + status: 200, + headers: new Headers([["Content-Type", "application/json"]]), + }, + ); + +void 
test("Roam writer uploads ATJSON without embedding serialized content", async () => { + const originalFetch = globalThis.fetch; + const globals = globalThis as { window?: unknown }; + const originalWindow = globals.window; + const embeddingInputs: string[][] = []; + globalThis.fetch = ((url: string | URL | Request, init?: RequestInit) => { + void url; + const body = JSON.parse(String(init?.body ?? "{}")) as { input: string[] }; + embeddingInputs.push(body.input); + return Promise.resolve(createEmbeddingResponse(body.input)); + }) as typeof fetch; + globals.window = { + roamAlphaAPI: { + pull: () => ({ + ":block/string": "Human readable **claim**", + ":block/uid": "node-1", + ":block/order": 0, + ":block/children": [ + { + ":block/string": "Nested [[Evidence]]", + ":block/uid": "child-1", + ":block/order": 0, + }, + ], + }), + }, + }; + + const uploadedRows: Array> = []; + const supabaseClient = { + rpc: (name: string, args: { data: Array> }) => { + void name; + uploadedRows.push(...args.data); + return Promise.resolve({ error: null }); + }, + }; + + try { + await upsertNodesToSupabaseAsContentWithEmbeddings( + [ + { + author_local_id: "user-1", + author_name: "User One", + source_local_id: "node-1", + created: "2026-01-01T00:00:00.000Z", + last_modified: "2026-01-02T00:00:00.000Z", + node_title: "Claim note", + text: "Human readable **claim**", + type: "claim", + }, + ], + supabaseClient as never, + { spaceId: 1, userId: 2 } as never, + ); + } finally { + globalThis.fetch = originalFetch; + globals.window = originalWindow; + } + + assert.deepEqual(embeddingInputs, [["Claim note Human readable **claim**"]]); + assert.equal(uploadedRows.length, 2); + + const plain = uploadedRows.find( + (row) => row.content_type === TEXT_PLAIN_CONTENT_TYPE, + ); + const atjson = uploadedRows.find( + (row) => row.content_type === DG_ATJSON_CONTENT_TYPE, + ); + + assert.equal(plain?.variant, "direct_and_description"); + assert.equal(plain?.text, "Claim note Human readable **claim**"); 
+ assert.ok(plain?.embedding_inline); + + assert.equal(atjson?.variant, "full"); + assert.equal(typeof atjson?.text, "string"); + assert.match(atjson.text as string, /Nested/); + assert.equal((atjson.text as string).includes("{"), false); + assert.equal(atjson.embedding_inline, undefined); + const content = (atjson.metadata as { content: DgDocument }).content; + assert.equal(content.version, "dg-content-model/v1"); + assert.equal(atjson.text, derivePlainTextFromDgDocument(content)); + assert.ok( + content.body.annotations.some( + (annotation) => + annotation.type === "block" && + annotation.attributes.blockId === "child-1" && + annotation.attributes.depth === 1, + ), + ); +}); + +void test("Roam embedding split keeps non-plain rows out of embedding batches", () => { + const { embeddableContentNodes, nonEmbeddableContentNodes } = + splitEmbeddableContentNodes([ + { content_type: TEXT_PLAIN_CONTENT_TYPE, text: "plain" }, + { content_type: TEXT_MARKDOWN_CONTENT_TYPE, text: "markdown" }, + { content_type: DG_ATJSON_CONTENT_TYPE, text: "derived" }, + { text: "legacy plain default" }, + ]); + + assert.deepEqual( + embeddableContentNodes.map((node) => node.text), + ["plain", "legacy plain default"], + ); + assert.deepEqual( + nonEmbeddableContentNodes.map((node) => node.content_type), + [TEXT_MARKDOWN_CONTENT_TYPE, DG_ATJSON_CONTENT_TYPE], + ); +}); + +void test("Roam text fallback preserves existing plain row and derived ATJSON", () => { + const globals = globalThis as { window?: unknown }; + const originalWindow = globals.window; + const originalWarn = console.warn; + const warnings: unknown[][] = []; + globals.window = {}; + console.warn = (...args: unknown[]) => { + warnings.push(args); + }; + + try { + const rows = convertRoamNodeToLocalContent({ + nodes: [ + { + author_local_id: "user-1", + author_name: "User One", + source_local_id: "block-1", + created: "2026-01-01T00:00:00.000Z", + last_modified: "2026-01-02T00:00:00.000Z", + text: "Block-backed claim", + type: 
"claim", + }, + ], + }); + + assert.equal(warnings.length, 1); + assert.equal(rows.length, 2); + + const plain = rows.find( + (row) => row.content_type === TEXT_PLAIN_CONTENT_TYPE, + ); + const atjson = rows.find( + (row) => row.content_type === DG_ATJSON_CONTENT_TYPE, + ); + assert.equal(plain?.variant, "direct"); + assert.equal(plain?.text, "Block-backed claim"); + assert.equal(atjson?.variant, "full"); + assert.equal(atjson?.text, "Block-backed claim"); + assert.equal((atjson?.text ?? "").includes("{"), false); + const content = (atjson?.metadata as { content: DgDocument }).content; + assert.equal(content.title.text, "Block-backed claim"); + assert.equal(content.body.text, ""); + assert.equal(atjson?.text, derivePlainTextFromDgDocument(content)); + } finally { + console.warn = originalWarn; + globals.window = originalWindow; + } +}); + +void test("Roam sync and cleanup paths keep ATJSON out of plain-row behavior", () => { + const syncSource = readAppSource("src/utils/syncDgNodesToSupabase.ts"); + const cleanupSource = readAppSource("src/utils/cleanupOrphanedNodes.ts"); + + assert.match(syncSource, /splitEmbeddableContentNodes/); + assert.match( + syncSource, + /\.\.\.nodesWithEmbeddings, \.\.\.nonEmbeddableContent/, + ); + assert.match( + cleanupSource, + /\.eq\("content_type", TEXT_PLAIN_CONTENT_TYPE\)/, + ); +}); diff --git a/docs/atjson-canonical-storage-plan.md b/docs/atjson-canonical-storage-plan.md index 997fef81d..afe5bde4d 100644 --- a/docs/atjson-canonical-storage-plan.md +++ b/docs/atjson-canonical-storage-plan.md @@ -165,12 +165,15 @@ Port first: - Roam page/block tree to DG document conversion - validators for spans, title/body rules, block parents, and reference attributes -Defer until after write-only storage: +Implement package-level renderers in this PR-shaped pass, but keep production destination readers on the current text and Markdown rows until read-path parity is intentionally enabled: - DG document to Obsidian Markdown rendering - DG 
document to Roam page/block rendering +- DG document to HTML rendering + +Defer until after write-only storage and renderer parity hardening: + - destination import reads from ATJSON -- HTML rendering ## Write-only rollout @@ -207,7 +210,7 @@ Update `apps/obsidian/src/utils/upsertNodesAsContentWithEmbeddings.ts` so conten For the ATJSON row, store the `DgDocument` in `metadata.content` and store a derived plain-text projection in `text`. -Keep embeddings only on intentional searchable text rows. Do not embed serialized ATJSON. If ATJSON rows are embedded later, embed their derived `text` projection, not the JSON payload. +Keep embeddings only on intentional searchable text rows. Do not embed serialized ATJSON. The database `upsert_content` path should ignore `embedding_inline` for non-`text/plain` rows so Markdown and ATJSON representations cannot accidentally create embeddings. If ATJSON rows are embedded later, embed their derived `text` projection, not the JSON payload. ### Roam write path @@ -232,16 +235,14 @@ During this phase: ## Later conversion rollout -After ATJSON write coverage is stable: +After ATJSON write coverage is stable and package-level renderer fixtures are hardened: -1. Port `atJsonToObsidian` into a DG document renderer. -2. Add tests showing DG ATJSON renders to equivalent Obsidian Markdown for representative nodes. -3. Add an Obsidian importer fallback order: +1. Add an Obsidian importer fallback order: - prefer ATJSON only when renderer tests pass and source row exists - fall back to `full/text/markdown` -4. Port `atJsonToRoam` and `decodeState` concepts into a Roam renderer/materializer. -5. Add Roam destination import from DG ATJSON. -6. Only after both destination paths are stable, decide whether Markdown remains a durable native export or becomes derived output. +2. Add Roam destination import from DG ATJSON. +3. Wire website publishing to the shared HTML renderer. +4. 
Only after destination paths are stable, decide whether Markdown remains a durable native export or becomes derived output. ## Test plan @@ -255,6 +256,7 @@ Database tests: - ATJSON rows store the canonical document in `metadata.content`. - ATJSON rows store derived searchable text in `text`, not serialized JSON. - Embedding views continue to work for text rows. +- `upsert_content` only accepts inline embeddings for `text/plain` rows. Content-model package tests: @@ -271,6 +273,7 @@ App regression tests: - Obsidian sync writes ATJSON rows without changing current Markdown rows. - Roam sync writes ATJSON rows without removing or replacing current text rows. - Serialized ATJSON is never stored in `Content.text` or sent for embeddings. +- Storage-layer embedding guards reject accidental inline embeddings on Markdown and ATJSON rows. Manual validation: diff --git a/docs/atjson-canonical-storage-scope.md b/docs/atjson-canonical-storage-scope.md index 3ced187d7..aeca79507 100644 --- a/docs/atjson-canonical-storage-scope.md +++ b/docs/atjson-canonical-storage-scope.md @@ -34,6 +34,8 @@ Keep the detailed implementation plans in the related notes. This scope should s - **Proposed solution**: Add `Content.content_type`, keep `variant` as the semantic content slice, and write canonical DG ATJSON rows alongside existing text and Markdown rows, with the structured document stored in `Content.metadata.content` and derived plain text stored in `Content.text`. - **Expected outcome**: The database can store canonical ATJSON without interrupting current Obsidian behavior, and the later ATJSON-to-Obsidian, ATJSON-to-Roam, and HTML conversion work has a stable storage target. +**Implementation note**: The PR-shaped implementation includes package-level Obsidian, Roam, and HTML renderers earlier than the original v0 scope so parity tests can live with the shared content model. Production destination readers still do not prefer ATJSON in this rollout. + ## 2. 
Goal + Non-Goals ### Goal @@ -92,10 +94,9 @@ The v0 goal is specifically to: ### Deferred to v1+ -- Port DG ATJSON to Obsidian Markdown rendering. -- Port DG ATJSON to Roam rendering/materialization. -- Add renderer parity tests and then switch destination readers to prefer ATJSON. -- Add HTML rendering for website publishing. +- Switch production destination readers to prefer ATJSON. +- Harden renderer parity tests beyond the initial package-level fixtures. +- Add website publishing integration that consumes the shared HTML renderer. - Decide whether native exports should also be stored as durable representations. - Decide long-term content API representation negotiation. @@ -289,6 +290,7 @@ The v0 goal is specifically to: - **Acceptance criteria**: - ATJSON rows store derived plain text in `Content.text`. - Serialized ATJSON is never sent to embedding generation. + - `upsert_content` ignores inline embeddings on non-`text/plain` rows, including Markdown and ATJSON representations. #### N4: Type safety and maintainability @@ -400,7 +402,7 @@ The v0 goal is specifically to: ### Risk: Serialized JSON enters search or embedding paths - **Impact**: Search quality drops, embeddings become noisy, and text tooling becomes harder to reason about. -- **Mitigation**: Store canonical content in `metadata.content`; store only derived plain text in `text`; add regression tests. +- **Mitigation**: Store canonical content in `metadata.content`; store only derived plain text in `text`; keep app embedding requests on `text/plain`; make `upsert_content` ignore inline embeddings on Markdown and ATJSON rows; add regression tests. 
### Risk: File references lose their target row diff --git a/docs/atjson-port-plan.md b/docs/atjson-port-plan.md index 1bd567715..9dfa18d6e 100644 --- a/docs/atjson-port-plan.md +++ b/docs/atjson-port-plan.md @@ -44,7 +44,7 @@ The decided B1 storage shape is: - `variant` remains the semantic slice of a node, such as `direct`, `full`, or `direct_and_description` - `Content.content_type` becomes the representation format, such as `text/plain`, `text/markdown`, or `application/vnd.discourse-graph.atjson+json; version=1` -- ATJSON is initially written alongside existing rows, with the structured document stored in `Content.metadata` and a derived plain-text projection stored in `Content.text` +- ATJSON is initially written alongside existing rows, with the structured document stored in `Content.metadata.content` and a derived plain-text projection stored in `Content.text` - Existing readers keep using the current text and Markdown rows This split is important because the current repo uses `Content.text` for text search, discovery, and embeddings, while app-native payloads are still organized by content variants. Write-only ATJSON storage can land before destination renderers are active, but API negotiation and ATJSON-preferred import should wait for renderer parity. 
@@ -471,7 +471,7 @@ Deliver: Notes: - Write-only storage has a concrete plan in `docs/atjson-canonical-storage-plan.md` -- ATJSON rows store the canonical structured document in `Content.metadata` +- ATJSON rows store the canonical structured document in `Content.metadata.content` - `Content.text` stores a derived plain-text projection for search, discovery, and fallback display - `Content.content_type` distinguishes `text/plain`, `text/markdown`, and DG ATJSON - Do not move app import paths to ATJSON until renderer parity tests exist diff --git a/packages/content-model/eslint.config.mjs b/packages/content-model/eslint.config.mjs new file mode 100644 index 000000000..ec6846c04 --- /dev/null +++ b/packages/content-model/eslint.config.mjs @@ -0,0 +1,13 @@ +import { config as base } from "@repo/eslint-config/base"; + +export default [ + ...base, + { + languageOptions: { + parserOptions: { + project: true, + tsconfigRootDir: ".", + }, + }, + }, +]; diff --git a/packages/content-model/package.json b/packages/content-model/package.json new file mode 100644 index 000000000..9658b2cf3 --- /dev/null +++ b/packages/content-model/package.json @@ -0,0 +1,33 @@ +{ + "name": "@repo/content-model", + "version": "0.0.0", + "private": true, + "license": "Apache-2.0", + "exports": { + ".": "./src/index.ts", + "./constants": "./src/constants.ts", + "./schema": "./src/schema.ts", + "./validate": "./src/validate.ts", + "./text": "./src/text.ts", + "./core": "./src/core/index.ts", + "./adapters/obsidian": "./src/adapters/obsidian.ts", + "./adapters/roam": "./src/adapters/roam.ts", + "./render/html": "./src/render/html.ts" + }, + "scripts": { + "build": "tsc", + "check-types": "tsc --noEmit --skipLibCheck", + "lint": "eslint .", + "test": "pnpm run build && node --test dist/src/content-model.test.js" + }, + "devDependencies": { + "@repo/eslint-config": "workspace:*", + "@repo/typescript-config": "workspace:*", + "@types/node": "^20.11.24", + "eslint": "catalog:", + "typescript": "5.5.4" 
+ }, + "dependencies": { + "tslib": "2.5.1" + } +} diff --git a/packages/content-model/src/adapters/obsidian.ts b/packages/content-model/src/adapters/obsidian.ts new file mode 100644 index 000000000..d532a6161 --- /dev/null +++ b/packages/content-model/src/adapters/obsidian.ts @@ -0,0 +1,116 @@ +import { NULL_INLINE_CONTENT } from "../constants"; +import { + getInlineAnnotationsForRange, + renderInlineToObsidianMarkdown, +} from "../core/render"; +import { + createDgDocument, + parseInline, + parseMarkdownBody, + stripYamlFrontmatter, +} from "../core/parse"; +import type { + BlockAnnotation, + DgDocument, + InlineAnnotation, + JsonObject, +} from "../schema"; + +export const obsidianMarkdownToDgDocument = ({ + title, + markdown, + metadata = {}, +}: { + title: string; + markdown: string; + metadata?: JsonObject; +}): DgDocument => { + const { frontmatter, body } = stripYamlFrontmatter(markdown); + const parsedTitle = parseInline(title, { dialect: "obsidian" }); + const parsedBody = parseMarkdownBody({ + markdown: body, + dialect: "obsidian", + blockIdPrefix: "obsidian-block", + }); + return createDgDocument({ + title: parsedTitle.text, + titleAnnotations: parsedTitle.annotations, + body: parsedBody.text, + bodyAnnotations: parsedBody.annotations, + metadata: { + ...metadata, + source: "obsidian", + ...(frontmatter !== null ? 
{ frontmatter } : {}), + }, + }); +}; + +const renderObsidianBlock = ({ + document, + block, + isFirstBlock, + previousBlock, +}: { + document: DgDocument; + block: BlockAnnotation; + isFirstBlock: boolean; + previousBlock?: BlockAnnotation; +}): string => { + const rawBlockText = document.body.text + .slice(block.start, block.end) + .replace(/\n$/, ""); + const inlineAnnotations = getInlineAnnotationsForRange({ + annotations: document.body.annotations, + start: block.start, + end: block.end, + }); + const rendered = renderInlineToObsidianMarkdown({ + text: rawBlockText, + annotations: inlineAnnotations, + }).replaceAll(NULL_INLINE_CONTENT, ""); + const indent = "\t".repeat(block.attributes.depth); + const marker = + block.attributes.viewType === "bullet" + ? "- " + : block.attributes.viewType === "numbered" + ? "1. " + : ""; + const paragraphBreak = + !isFirstBlock && + (block.attributes.viewType === "paragraph" || + previousBlock?.attributes.viewType === "paragraph") + ? "\n" + : ""; + return `${paragraphBreak}${indent}${marker}${rendered}`; +}; + +const isInlineAnnotation = ( + annotation: DgDocument["body"]["annotations"][number], +): annotation is InlineAnnotation => annotation.type !== "block"; + +export const dgDocumentToObsidianMarkdown = (document: DgDocument): string => { + const blocks = document.body.annotations + .filter( + (annotation): annotation is BlockAnnotation => + annotation.type === "block", + ) + .sort((a, b) => a.start - b.start); + + if (blocks.length === 0) { + return renderInlineToObsidianMarkdown({ + text: document.body.text, + annotations: document.body.annotations.filter(isInlineAnnotation), + }).replaceAll(NULL_INLINE_CONTENT, ""); + } + + return blocks + .map((block, index) => + renderObsidianBlock({ + document, + block, + isFirstBlock: index === 0, + previousBlock: blocks[index - 1], + }), + ) + .join("\n"); +}; diff --git a/packages/content-model/src/adapters/roam.ts b/packages/content-model/src/adapters/roam.ts new file mode 
100644 index 000000000..b5dee443c --- /dev/null +++ b/packages/content-model/src/adapters/roam.ts @@ -0,0 +1,238 @@ +import { NULL_INLINE_CONTENT } from "../constants"; +import { + getInlineAnnotationsForRange, + renderInlineToRoam, +} from "../core/render"; +import { + createDgDocument, + parseInline, + parseMarkdownBody, +} from "../core/parse"; +import type { + BlockAnnotation, + BodyAnnotation, + DgDocument, + InlineAnnotation, + JsonObject, +} from "../schema"; + +export type RoamViewType = "bullet" | "numbered" | "document"; + +export type RoamTreeNode = { + uid: string; + text: string; + viewType?: RoamViewType; + children?: RoamTreeNode[]; +}; + +export type RoamRenderedBlock = { + uid?: string; + text: string; + viewType: RoamViewType; + children: RoamRenderedBlock[]; +}; + +const roamViewTypeToBlockViewType = ( + viewType?: RoamViewType, +): "paragraph" | "bullet" | "numbered" => { + if (viewType === "numbered") return "numbered"; + if (viewType === "document") return "paragraph"; + return "bullet"; +}; + +const blockViewTypeToRoamViewType = ( + viewType: "paragraph" | "bullet" | "numbered", +): RoamViewType => { + if (viewType === "paragraph") return "document"; + return viewType; +}; + +const appendRoamTreeNode = ({ + node, + depth, + parentBlockId, + state, +}: { + node: RoamTreeNode; + depth: number; + parentBlockId?: string; + state: { text: string; annotations: BodyAnnotation[] }; +}): void => { + const parsed = parseInline(node.text, { dialect: "roam" }); + const start = state.text.length; + state.text += `${parsed.text || NULL_INLINE_CONTENT}\n`; + const end = state.text.length; + state.annotations.push({ + type: "block", + start, + end, + attributes: { + blockId: node.uid, + parentBlockId, + depth, + viewType: roamViewTypeToBlockViewType(node.viewType), + }, + }); + state.annotations.push( + ...parsed.annotations.map((annotation) => ({ + ...annotation, + start: annotation.start + start, + end: annotation.end + start, + })), + ); + for (const 
child of node.children ?? []) { + appendRoamTreeNode({ + node: child, + depth: depth + 1, + parentBlockId: node.uid, + state, + }); + } +}; + +export const roamTreeToDgDocument = ({ + title, + pageUid, + children, + metadata = {}, +}: { + title: string; + pageUid?: string; + children: RoamTreeNode[]; + metadata?: JsonObject; +}): DgDocument => { + const parsedTitle = parseInline(title, { dialect: "roam" }); + const state: { text: string; annotations: BodyAnnotation[] } = { + text: "", + annotations: [], + }; + for (const child of children) { + appendRoamTreeNode({ node: child, depth: 0, state }); + } + return createDgDocument({ + title: parsedTitle.text, + titleAnnotations: parsedTitle.annotations, + body: state.text, + bodyAnnotations: state.annotations, + metadata: { + ...metadata, + source: "roam", + ...(pageUid ? { pageUid } : {}), + }, + }); +}; + +export const roamTextToDgDocument = ({ + title, + text, + sourceLocalId, + metadata = {}, +}: { + title: string; + text: string; + sourceLocalId?: string; + metadata?: JsonObject; +}): DgDocument => { + const parsedTitle = parseInline(title, { dialect: "roam" }); + const parsedBody = parseMarkdownBody({ + markdown: text, + dialect: "roam", + blockIdPrefix: sourceLocalId ?? "roam-block", + }); + return createDgDocument({ + title: parsedTitle.text, + titleAnnotations: parsedTitle.annotations, + body: parsedBody.text, + bodyAnnotations: parsedBody.annotations, + metadata: { + ...metadata, + source: "roam", + ...(sourceLocalId ? 
{ sourceLocalId } : {}), + }, + }); +}; + +const renderRoamBlockText = ({ + document, + block, +}: { + document: DgDocument; + block: BlockAnnotation; +}): string => { + const rawBlockText = document.body.text + .slice(block.start, block.end) + .replace(/\n$/, ""); + return renderInlineToRoam({ + text: rawBlockText, + annotations: getInlineAnnotationsForRange({ + annotations: document.body.annotations, + start: block.start, + end: block.end, + }), + }).replaceAll(NULL_INLINE_CONTENT, ""); +}; + +export const dgDocumentToRoamMarkdown = (document: DgDocument): string => { + const blocks = document.body.annotations + .filter( + (annotation): annotation is BlockAnnotation => + annotation.type === "block", + ) + .sort((a, b) => a.start - b.start); + + if (blocks.length === 0) { + const inlineAnnotations = document.body.annotations.filter( + (annotation): annotation is InlineAnnotation => + annotation.type !== "block", + ); + return renderInlineToRoam({ + text: document.body.text, + annotations: inlineAnnotations, + }).replaceAll(NULL_INLINE_CONTENT, ""); + } + + return blocks + .map((block) => { + const indent = " ".repeat(block.attributes.depth); + const marker = + block.attributes.viewType === "numbered" + ? "1. " + : block.attributes.viewType === "bullet" + ? 
"- " + : ""; + return `${indent}${marker}${renderRoamBlockText({ document, block })}`; + }) + .join("\n"); +}; + +export const dgDocumentToRoamBlocks = ( + document: DgDocument, +): RoamRenderedBlock[] => { + const blocks = document.body.annotations + .filter( + (annotation): annotation is BlockAnnotation => + annotation.type === "block", + ) + .sort((a, b) => a.start - b.start); + const byId = new Map(); + const roots: RoamRenderedBlock[] = []; + + for (const block of blocks) { + const renderedBlock: RoamRenderedBlock = { + uid: block.attributes.blockId, + text: renderRoamBlockText({ document, block }), + viewType: blockViewTypeToRoamViewType(block.attributes.viewType), + children: [], + }; + byId.set(block.attributes.blockId, renderedBlock); + const parentId = block.attributes.parentBlockId; + const parent = parentId ? byId.get(parentId) : undefined; + if (parent) { + parent.children.push(renderedBlock); + } else { + roots.push(renderedBlock); + } + } + + return roots; +}; diff --git a/packages/content-model/src/constants.ts b/packages/content-model/src/constants.ts new file mode 100644 index 000000000..51985e3f5 --- /dev/null +++ b/packages/content-model/src/constants.ts @@ -0,0 +1,8 @@ +export const TEXT_PLAIN_CONTENT_TYPE = "text/plain"; +export const TEXT_MARKDOWN_CONTENT_TYPE = "text/markdown"; +export const DG_ATJSON_CONTENT_TYPE = + "application/vnd.discourse-graph.atjson+json; version=1"; + +export const DG_DOCUMENT_VERSION = "dg-content-model/v1"; + +export const NULL_INLINE_CONTENT = "\0"; diff --git a/packages/content-model/src/content-model.test.ts b/packages/content-model/src/content-model.test.ts new file mode 100644 index 000000000..c71def632 --- /dev/null +++ b/packages/content-model/src/content-model.test.ts @@ -0,0 +1,377 @@ +import assert from "node:assert/strict"; +import { readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import test from "node:test"; +import { + DG_ATJSON_CONTENT_TYPE, + createDgAtJsonMetadata, 
+ derivePlainTextFromDgDocument, + dgDocumentToHtml, + dgDocumentToObsidianMarkdown, + dgDocumentToRoamBlocks, + dgDocumentToRoamMarkdown, + obsidianMarkdownToDgDocument, + roamTreeToDgDocument, + validateDgDocument, +} from "./index"; + +const readPackageSourceFiles = (directory: string): string[] => + readdirSync(directory, { withFileTypes: true }).flatMap((entry) => { + const path = join(directory, entry.name); + if (entry.isDirectory()) return readPackageSourceFiles(path); + if (!entry.name.endsWith(".ts") || entry.name.endsWith(".test.ts")) { + return []; + } + return [path]; + }); + +void test("parses and renders representative Obsidian markdown", () => { + const sourceMarkdown = + "---\nnodeTypeId: claim\n---\n- **Bold claim** with [source](https://example.com)\n- Link to [[Other note|other]]\n"; + const document = obsidianMarkdownToDgDocument({ + title: "Claim note", + markdown: sourceMarkdown, + }); + + const validation = validateDgDocument(document); + assert.deepEqual(validation.errors, []); + assert.equal(validation.valid, true); + assert.equal(document.metadata?.source, "obsidian"); + const renderedMarkdown = dgDocumentToObsidianMarkdown(document); + assert.equal( + renderedMarkdown, + "- **Bold claim** with [source](https://example.com)\n- Link to [[Other note|other]]", + ); + const reparsed = obsidianMarkdownToDgDocument({ + title: "Claim note", + markdown: renderedMarkdown, + }); + assert.equal( + derivePlainTextFromDgDocument(reparsed), + derivePlainTextFromDgDocument(document), + ); +}); + +void test("round-trips richer Obsidian markdown fixtures", () => { + const sourceMarkdown = [ + "First paragraph with ![diagram](attachments/diagram.png)", + "", + "1. Numbered [[Target#Heading|alias]]", + "\t1. 
Nested `inline code`", + "", + "```ts", + "const answer = 42;", + "```", + ].join("\n"); + const document = obsidianMarkdownToDgDocument({ + title: "Fixture note", + markdown: sourceMarkdown, + }); + + const validation = validateDgDocument(document); + assert.deepEqual(validation.errors, []); + assert.equal( + dgDocumentToObsidianMarkdown(document), + [ + "First paragraph with ![diagram](attachments/diagram.png)", + "", + "1. Numbered [[Target#Heading|alias]]", + "\t1. Nested `inline code`", + "", + "```ts", + "const answer = 42;", + "```", + ].join("\n"), + ); + assert.equal( + derivePlainTextFromDgDocument(document), + "Fixture note\n\nFirst paragraph with diagram\nNumbered alias\nNested inline code\nconst answer = 42;", + ); +}); + +void test("parses Roam tree, renders Roam markdown, and materializes blocks", () => { + const document = roamTreeToDgDocument({ + title: "Roam page", + pageUid: "page-uid", + children: [ + { + uid: "block-a", + text: "A **claim** about [[Evidence]]", + viewType: "bullet", + children: [ + { + uid: "block-b", + text: "nested ((abc123def))", + viewType: "numbered", + }, + ], + }, + ], + }); + + const validation = validateDgDocument(document); + assert.equal(validation.valid, true); + assert.equal( + dgDocumentToRoamMarkdown(document), + "- A **claim** about [[Evidence]]\n 1. 
nested ((abc123def))", + ); + const blocks = dgDocumentToRoamBlocks(document); + assert.equal(blocks[0]?.uid, "block-a"); + assert.equal(blocks[0]?.text, "A **claim** about [[Evidence]]"); + assert.equal(blocks[0]?.children[0]?.uid, "block-b"); + assert.equal(blocks[0]?.children[0]?.text, "nested ((abc123def))"); +}); + +void test("renders richer Roam native tree fixtures", () => { + const document = roamTreeToDgDocument({ + title: "Roam fixture", + pageUid: "page-uid", + children: [ + { + uid: "block-a", + text: "Image ![diagram](https://example.com/diagram.png) #Evidence", + viewType: "bullet", + }, + { + uid: "block-b", + text: "Use `inline code` and ((abc123def))", + viewType: "document", + }, + ], + }); + + assert.equal(validateDgDocument(document).valid, true); + assert.equal( + dgDocumentToRoamMarkdown(document), + [ + "- Image ![diagram](https://example.com/diagram.png) [[Evidence]]", + "Use `inline code` and ((abc123def))", + ].join("\n"), + ); + const blocks = dgDocumentToRoamBlocks(document); + assert.deepEqual(blocks, [ + { + uid: "block-a", + text: "Image ![diagram](https://example.com/diagram.png) [[Evidence]]", + viewType: "bullet", + children: [], + }, + { + uid: "block-b", + text: "Use `inline code` and ((abc123def))", + viewType: "document", + children: [], + }, + ]); +}); + +void test("validates invalid spans and block parents", () => { + const document = obsidianMarkdownToDgDocument({ + title: "Invalid", + markdown: "- body", + }); + document.body.annotations.push({ + type: "bold", + start: 10, + end: 20, + }); + document.body.annotations.push({ + type: "block", + start: 0, + end: 1, + attributes: { + blockId: "orphan", + parentBlockId: "missing", + depth: 1, + viewType: "bullet", + }, + }); + + const validation = validateDgDocument(document); + assert.equal(validation.valid, false); + assert.ok(validation.errors.some((error) => error.includes("exceeds"))); + assert.ok( + validation.errors.some((error) => error.includes("does not exist")), + ); 
+}); + +void test("validates runtime-invalid annotation shapes from JSON", () => { + const document = obsidianMarkdownToDgDocument({ + title: "Invalid JSON", + markdown: "- body", + }); + ( + document.title.annotations as unknown as Array> + ).push({ + type: "block", + start: 0, + end: 1, + attributes: { + blockId: "title-block", + depth: 0, + viewType: "bullet", + }, + }); + document.body.annotations.push({ + type: "reference", + start: 0, + end: 1, + attributes: { + kind: "roam-block", + blockUid: "", + }, + }); + (document.body.annotations as unknown as Array>).push( + { + type: "block", + start: 0, + end: 1, + attributes: { + blockId: "bad-view", + depth: -1, + viewType: "kanban", + }, + }, + ); + + const validation = validateDgDocument(document); + assert.equal(validation.valid, false); + assert.ok( + validation.errors.some((error) => + error.includes("title cannot contain block annotations"), + ), + ); + assert.ok( + validation.errors.some((error) => error.includes("missing blockUid")), + ); + assert.ok(validation.errors.some((error) => error.includes("invalid depth"))); + assert.ok( + validation.errors.some((error) => error.includes("invalid viewType")), + ); +}); + +void test("validates negative and zero spans, duplicate block ids, and unknown references", () => { + const document = obsidianMarkdownToDgDocument({ + title: "Invalid runtime shapes", + markdown: "- body", + }); + (document.body.annotations as unknown as Array>).push( + { + type: "bold", + start: -1, + end: 1, + }, + { + type: "italics", + start: 0, + end: 0, + }, + { + type: "reference", + start: 0, + end: 1, + attributes: { + kind: "external-note", + }, + }, + { + type: "block", + start: 0, + end: 1, + attributes: { + blockId: "obsidian-block-1", + depth: 0, + viewType: "bullet", + }, + }, + ); + + const validation = validateDgDocument(document); + assert.equal(validation.valid, false); + assert.ok( + validation.errors.some((error) => error.includes("negative start")), + ); + assert.ok( + 
validation.errors.some((error) => + error.includes("zero or negative length"), + ), + ); + assert.ok(validation.errors.some((error) => error.includes("unknown kind"))); + assert.ok( + validation.errors.some((error) => error.includes("duplicate blockId")), + ); +}); + +void test("stores canonical content in metadata and derives plain text", () => { + const document = obsidianMarkdownToDgDocument({ + title: "Storage", + markdown: "- **Human readable**", + }); + const text = derivePlainTextFromDgDocument(document); + const metadata = createDgAtJsonMetadata({ document }); + + assert.equal(DG_ATJSON_CONTENT_TYPE.includes("atjson"), true); + assert.equal(text.includes("{"), false); + assert.equal(metadata.content.version, "dg-content-model/v1"); +}); + +void test("refuses to store invalid canonical content in metadata", () => { + const document = obsidianMarkdownToDgDocument({ + title: "Invalid storage", + markdown: "- body", + }); + document.body.annotations.push({ + type: "bold", + start: 0, + end: 50, + }); + + assert.throws( + () => createDgAtJsonMetadata({ document }), + /Invalid DG document/, + ); +}); + +void test("renders HTML from the canonical document", () => { + const document = obsidianMarkdownToDgDocument({ + title: "HTML & title", + markdown: "- A & **B < C** [link](https://example.com)", + }); + + const html = dgDocumentToHtml(document); + assert.match(html, /
/); + assert.match(html, /HTML & title/); + assert.match(html, /A & B < C<\/strong>/); + assert.match(html, /href="https:\/\/example.com"/); +}); + +void test("sanitizes unsafe HTML link and image URLs", () => { + const document = obsidianMarkdownToDgDocument({ + title: "HTML safety", + markdown: [ + "[unsafe link](javascript:alert(1))", + "", + "![unsafe image](javascript:alert(2))", + ].join("\n"), + }); + + const html = dgDocumentToHtml(document); + assert.equal(html.includes("javascript:"), false); + assert.match(html, /unsafe link<\/a>/); + assert.match(html, /unsafe image/); +}); + +void test("content model stays independent from SamePage runtime dependencies", () => { + const forbiddenRuntimeTerms = ["samepage", "automerge", "ipfs", "websocket"]; + + const matches = readPackageSourceFiles(join(process.cwd(), "src")).flatMap( + (path) => { + const content = readFileSync(path, "utf8").toLowerCase(); + return forbiddenRuntimeTerms + .filter((term) => content.includes(term)) + .map((term) => `${path}: ${term}`); + }, + ); + + assert.deepEqual(matches, []); +}); diff --git a/packages/content-model/src/core/index.ts b/packages/content-model/src/core/index.ts new file mode 100644 index 000000000..b0f835512 --- /dev/null +++ b/packages/content-model/src/core/index.ts @@ -0,0 +1,2 @@ +export * from "./parse"; +export * from "./render"; diff --git a/packages/content-model/src/core/parse.ts b/packages/content-model/src/core/parse.ts new file mode 100644 index 000000000..deae76463 --- /dev/null +++ b/packages/content-model/src/core/parse.ts @@ -0,0 +1,431 @@ +import { DG_DOCUMENT_VERSION, NULL_INLINE_CONTENT } from "../constants"; +import type { + BodyAnnotation, + DgDocument, + InlineAnnotation, + JsonObject, +} from "../schema"; + +type ParseDialect = "obsidian" | "roam"; + +type ParsedInline = { + text: string; + annotations: InlineAnnotation[]; +}; + +type BlockBuildState = { + text: string; + annotations: BodyAnnotation[]; +}; + +const shiftInlineAnnotations = ( + 
annotations: InlineAnnotation[], + offset: number, +): InlineAnnotation[] => + annotations.map((annotation) => ({ + ...annotation, + start: annotation.start + offset, + end: annotation.end + offset, + })); + +const findClosing = ( + input: string, + delimiter: string, + start: number, +): number => { + const index = input.indexOf(delimiter, start + delimiter.length); + return index > start + delimiter.length ? index : -1; +}; + +const appendReference = ({ + current, + content, + annotation, +}: { + current: ParsedInline; + content: string; + annotation: Omit; +}): void => { + const start = current.text.length; + current.text += content || NULL_INLINE_CONTENT; + const end = current.text.length; + current.annotations.push({ ...annotation, start, end } as InlineAnnotation); +}; + +const parseFormatting = ({ + input, + index, + delimiter, + type, + dialect, +}: { + input: string; + index: number; + delimiter: string; + type: "bold" | "italics" | "strikethrough"; + dialect: ParseDialect; +}): { parsed: ParsedInline; nextIndex: number } | null => { + const closing = findClosing(input, delimiter, index); + if (closing < 0) return null; + const inner = parseInline(input.slice(index + delimiter.length, closing), { + dialect, + }); + const attributes = { delimiter }; + return { + parsed: { + text: inner.text, + annotations: [ + ...inner.annotations, + { + type, + start: 0, + end: inner.text.length || 1, + attributes, + } as InlineAnnotation, + ], + }, + nextIndex: closing + delimiter.length, + }; +}; + +export const parseInline = ( + input: string, + { dialect }: { dialect: ParseDialect }, +): ParsedInline => { + const current: ParsedInline = { text: "", annotations: [] }; + let index = 0; + + while (index < input.length) { + const rest = input.slice(index); + + const imageMatch = /^!\[([^\]]*)\]\(([^)]+)\)/.exec(rest); + if (imageMatch) { + appendReference({ + current, + content: imageMatch[1] || NULL_INLINE_CONTENT, + annotation: { + type: "image", + attributes: { + alt: 
imageMatch[1], + src: imageMatch[2] ?? "", + }, + }, + }); + index += imageMatch[0].length; + continue; + } + + const linkMatch = /^\[([^\]]*)\]\(([^)]+)\)/.exec(rest); + if (linkMatch) { + appendReference({ + current, + content: linkMatch[1] || NULL_INLINE_CONTENT, + annotation: { + type: "link", + attributes: { + href: linkMatch[2] ?? "", + }, + }, + }); + index += linkMatch[0].length; + continue; + } + + const wikiMatch = /^\[\[([^\]]+)\]\]/.exec(rest); + if (wikiMatch) { + const rawTarget = wikiMatch[1] ?? ""; + const [pathWithSubpath = "", alias] = rawTarget.split("|"); + const subpathIndex = pathWithSubpath.search(/[#^]/); + const path = + subpathIndex >= 0 + ? pathWithSubpath.slice(0, subpathIndex) + : pathWithSubpath; + const subpath = + subpathIndex >= 0 ? pathWithSubpath.slice(subpathIndex) : undefined; + appendReference({ + current, + content: + dialect === "obsidian" + ? alias || pathWithSubpath || NULL_INLINE_CONTENT + : NULL_INLINE_CONTENT, + annotation: + dialect === "obsidian" + ? { + type: "reference", + attributes: { + kind: "obsidian-wikilink", + path, + subpath, + alias, + }, + } + : { + type: "reference", + attributes: { + kind: "roam-page", + pageTitle: rawTarget, + }, + }, + }); + index += wikiMatch[0].length; + continue; + } + + if (dialect === "roam") { + const blockRefMatch = /^\(\(([^)]+)\)\)/.exec(rest); + if (blockRefMatch) { + appendReference({ + current, + content: NULL_INLINE_CONTENT, + annotation: { + type: "reference", + attributes: { + kind: "roam-block", + blockUid: blockRefMatch[1] ?? "", + }, + }, + }); + index += blockRefMatch[0].length; + continue; + } + + const hashMatch = /^#([a-zA-Z0-9_.-]+)/.exec(rest); + if (hashMatch) { + appendReference({ + current, + content: NULL_INLINE_CONTENT, + annotation: { + type: "reference", + attributes: { + kind: "roam-page", + pageTitle: hashMatch[1] ?? 
"", + }, + appAttributes: { + roam: { kind: "hash" }, + }, + }, + }); + index += hashMatch[0].length; + continue; + } + } + + const inlineCodeMatch = /^`([^`\n]+)`/.exec(rest); + if (inlineCodeMatch) { + appendReference({ + current, + content: inlineCodeMatch[1] ?? "", + annotation: { + type: "code", + attributes: { + display: "inline", + }, + }, + }); + index += inlineCodeMatch[0].length; + continue; + } + + const formattingCandidates: Array<{ + delimiter: string; + type: "bold" | "italics" | "strikethrough"; + }> = + dialect === "roam" + ? [ + { delimiter: "**", type: "bold" }, + { delimiter: "__", type: "italics" }, + { delimiter: "~~", type: "strikethrough" }, + ] + : [ + { delimiter: "**", type: "bold" }, + { delimiter: "__", type: "bold" }, + { delimiter: "~~", type: "strikethrough" }, + { delimiter: "_", type: "italics" }, + { delimiter: "*", type: "italics" }, + ]; + + const parsedFormatting = formattingCandidates + .filter(({ delimiter }) => rest.startsWith(delimiter)) + .map(({ delimiter, type }) => + parseFormatting({ input, index, delimiter, type, dialect }), + ) + .find((result): result is NonNullable => result !== null); + + if (parsedFormatting) { + current.text += parsedFormatting.parsed.text; + current.annotations.push( + ...shiftInlineAnnotations( + parsedFormatting.parsed.annotations, + current.text.length - parsedFormatting.parsed.text.length, + ), + ); + index = parsedFormatting.nextIndex; + continue; + } + + current.text += input[index]; + index++; + } + + return current; +}; + +export const stripYamlFrontmatter = ( + markdown: string, +): { frontmatter: string | null; body: string } => { + if (!markdown.startsWith("---\n") && !markdown.startsWith("---\r\n")) { + return { frontmatter: null, body: markdown }; + } + const normalized = markdown.replace(/\r\n/g, "\n"); + const closing = normalized.indexOf("\n---\n", 4); + if (closing < 0) { + return { frontmatter: null, body: markdown }; + } + return { + frontmatter: normalized.slice(4, closing), 
+ body: normalized.slice(closing + "\n---\n".length), + }; +}; + +const getBlockLineParts = ( + line: string, +): { + depth: number; + viewType: "paragraph" | "bullet" | "numbered"; + text: string; +} => { + const match = /^((?:\t| {2,4})*)(?:(- )|(\d+\. ))?(.*)$/.exec(line); + const indent = match?.[1] ?? ""; + const depth = + Array.from(indent.matchAll(/\t| {2,4}/g)).length > 0 + ? Array.from(indent.matchAll(/\t| {2,4}/g)).length + : 0; + const viewType = match?.[2] + ? "bullet" + : match?.[3] + ? "numbered" + : "paragraph"; + return { + depth, + viewType, + text: match?.[4] ?? line, + }; +}; + +export const parseMarkdownBody = ({ + markdown, + dialect, + blockIdPrefix, +}: { + markdown: string; + dialect: ParseDialect; + blockIdPrefix: string; +}): BlockBuildState => { + const state: BlockBuildState = { text: "", annotations: [] }; + const parentByDepth = new Map(); + const lines = markdown.replace(/\r\n/g, "\n").split("\n"); + let lineIndex = 0; + + while (lineIndex < lines.length) { + const line = lines[lineIndex] ?? ""; + if (line.trim() === "") { + lineIndex++; + continue; + } + + const codeFenceMatch = /^```([\w -]*)$/.exec(line.trim()); + if (codeFenceMatch) { + const codeStartLine = lineIndex + 1; + const codeLines: string[] = []; + lineIndex++; + while (lineIndex < lines.length && lines[lineIndex]?.trim() !== "```") { + codeLines.push(lines[lineIndex] ?? 
""); + lineIndex++; + } + if (lineIndex < lines.length) lineIndex++; + const blockId = `${blockIdPrefix}-${codeStartLine}`; + const start = state.text.length; + const codeText = `${codeLines.join("\n")}\n`; + state.text += codeText; + const end = state.text.length; + state.annotations.push({ + type: "block", + start, + end, + attributes: { + blockId, + depth: 0, + viewType: "paragraph", + }, + }); + state.annotations.push({ + type: "code", + start, + end: Math.max(start + 1, end - 1), + attributes: { + display: "block", + language: codeFenceMatch[1] || undefined, + ticks: 3, + }, + }); + continue; + } + + const blockParts = getBlockLineParts(line); + const parsedInline = parseInline(blockParts.text, { dialect }); + const blockId = `${blockIdPrefix}-${lineIndex + 1}`; + const parentBlockId = + blockParts.depth > 0 + ? parentByDepth.get(blockParts.depth - 1) + : undefined; + const start = state.text.length; + state.text += `${parsedInline.text}\n`; + const end = state.text.length; + state.annotations.push({ + type: "block", + start, + end, + attributes: { + blockId, + parentBlockId, + depth: blockParts.depth, + viewType: blockParts.viewType, + }, + }); + state.annotations.push( + ...shiftInlineAnnotations(parsedInline.annotations, start), + ); + parentByDepth.set(blockParts.depth, blockId); + for (const depth of Array.from(parentByDepth.keys())) { + if (depth > blockParts.depth) parentByDepth.delete(depth); + } + lineIndex++; + } + + return state; +}; + +export const createDgDocument = ({ + title, + body, + titleAnnotations = [], + bodyAnnotations, + metadata, +}: { + title: string; + body: string; + titleAnnotations?: InlineAnnotation[]; + bodyAnnotations: BodyAnnotation[]; + metadata?: JsonObject; +}): DgDocument => ({ + version: DG_DOCUMENT_VERSION, + title: { + text: title, + annotations: titleAnnotations, + }, + body: { + text: body, + annotations: bodyAnnotations, + }, + metadata, +}); diff --git a/packages/content-model/src/core/render.ts 
b/packages/content-model/src/core/render.ts new file mode 100644 index 000000000..29026b9b4 --- /dev/null +++ b/packages/content-model/src/core/render.ts @@ -0,0 +1,303 @@ +import { NULL_INLINE_CONTENT } from "../constants"; +import type { + BodyAnnotation, + ImageAnnotation, + InlineAnnotation, + LinkAnnotation, + ReferenceAnnotation, +} from "../schema"; + +type AppliedAnnotation = { + prefix: string; + suffix: string; + replace?: boolean; +}; + +type AnnotationRenderer = (args: { + annotation: T; + content: string; + index: number; +}) => AppliedAnnotation; + +type RendererMap = { + [K in InlineAnnotation["type"]]?: AnnotationRenderer< + Extract + >; +}; + +const shiftRemainingAnnotations = ({ + annotations, + startIndex, + start, + end, + prefixLength, + suffixLength, + replacedLength, +}: { + annotations: Array<{ annotation: InlineAnnotation; index: number }>; + startIndex: number; + start: number; + end: number; + prefixLength: number; + suffixLength: number; + replacedLength: number; +}): void => { + for (const item of annotations.slice(startIndex + 1)) { + if (item.annotation.start >= start) item.annotation.start += prefixLength; + if (item.annotation.start >= end) { + item.annotation.start += suffixLength - replacedLength; + } + if (item.annotation.end > start) item.annotation.end += prefixLength; + if (item.annotation.end > end) { + item.annotation.end += suffixLength - replacedLength; + } + } +}; + +export const renderAnnotatedText = ({ + text, + annotations, + renderers, +}: { + text: string; + annotations: InlineAnnotation[]; + renderers: RendererMap; +}): string => { + const sorted = annotations + .map((annotation, index) => ({ annotation: { ...annotation }, index })) + .sort((a, b) => { + const aSize = a.annotation.end - a.annotation.start; + const bSize = b.annotation.end - b.annotation.start; + return bSize - aSize || a.index - b.index; + }); + + return sorted.reduce((content, item, index) => { + const renderer = renderers[item.annotation.type] as 
+ | AnnotationRenderer + | undefined; + if (!renderer) return content; + + const annotatedContent = content.slice( + item.annotation.start, + item.annotation.end, + ); + const applied = renderer({ + annotation: item.annotation, + content: annotatedContent, + index: item.index, + }); + const replacementLength = applied.replace ? annotatedContent.length : 0; + shiftRemainingAnnotations({ + annotations: sorted, + startIndex: index, + start: item.annotation.start, + end: item.annotation.end, + prefixLength: applied.prefix.length, + suffixLength: applied.suffix.length, + replacedLength: replacementLength, + }); + return `${content.slice(0, item.annotation.start)}${applied.prefix}${ + applied.replace ? "" : annotatedContent + }${applied.suffix}${content.slice(item.annotation.end)}`; + }, text); +}; + +const shouldReplaceInline = (content: string): boolean => + content === NULL_INLINE_CONTENT; + +const renderLinkMarkdown = ( + annotation: LinkAnnotation, + content: string, +): AppliedAnnotation => ({ + prefix: "[", + suffix: `](${annotation.attributes.href})`, + replace: shouldReplaceInline(content), +}); + +const renderImageMarkdown = ( + annotation: ImageAnnotation, + content: string, +): AppliedAnnotation => ({ + prefix: "![", + suffix: `](${annotation.attributes.src})`, + replace: shouldReplaceInline(content), +}); + +const renderObsidianReference = ({ + annotation, + content, +}: { + annotation: ReferenceAnnotation; + content: string; +}): AppliedAnnotation => { + const attributes = annotation.attributes; + if (attributes.kind === "obsidian-wikilink") { + const target = `${attributes.path}${attributes.subpath ?? ""}`; + const alias = attributes.alias ? 
`|${attributes.alias}` : ""; + return { + prefix: "[[", + suffix: `${target}${alias}]]`, + replace: true, + }; + } + if (attributes.kind === "roam-block") { + return { + prefix: "", + suffix: `((${attributes.blockUid}))`, + replace: shouldReplaceInline(content), + }; + } + return { + prefix: "[[", + suffix: `${attributes.pageTitle}]]`, + replace: shouldReplaceInline(content), + }; +}; + +const renderRoamReference = ({ + annotation, + content, +}: { + annotation: ReferenceAnnotation; + content: string; +}): AppliedAnnotation => { + const attributes = annotation.attributes; + if (attributes.kind === "roam-block") { + return { + prefix: "", + suffix: `((${attributes.blockUid}))`, + replace: shouldReplaceInline(content), + }; + } + if (attributes.kind === "obsidian-wikilink") { + return { + prefix: "[[", + suffix: `${attributes.alias ?? attributes.path}]]`, + replace: true, + }; + } + return { + prefix: "", + suffix: `[[${attributes.pageTitle}]]`, + replace: shouldReplaceInline(content), + }; +}; + +export const renderInlineToObsidianMarkdown = ({ + text, + annotations, +}: { + text: string; + annotations: InlineAnnotation[]; +}): string => + renderAnnotatedText({ + text, + annotations, + renderers: { + bold: ({ annotation, content }) => { + const delimiter = annotation.attributes?.delimiter ?? "**"; + const safeDelimiter = delimiter === "__" ? "__" : "**"; + return { + prefix: safeDelimiter, + suffix: annotation.attributes?.open ? "" : safeDelimiter, + replace: shouldReplaceInline(content), + }; + }, + italics: ({ annotation, content }) => { + const delimiter = annotation.attributes?.delimiter ?? "_"; + const safeDelimiter = delimiter === "*" ? "*" : "_"; + return { + prefix: safeDelimiter, + suffix: annotation.attributes?.open ? "" : safeDelimiter, + replace: shouldReplaceInline(content), + }; + }, + strikethrough: ({ annotation, content }) => ({ + prefix: "~~", + suffix: annotation.attributes?.open ? 
"" : "~~", + replace: shouldReplaceInline(content), + }), + code: ({ annotation }) => { + if (annotation.attributes.display === "block") { + const ticks = "`".repeat(annotation.attributes.ticks ?? 3); + return { + prefix: `${ticks}${annotation.attributes.language ?? ""}\n`, + suffix: `\n${ticks}`, + }; + } + return { prefix: "`", suffix: "`" }; + }, + link: ({ annotation, content }) => + renderLinkMarkdown(annotation, content), + image: ({ annotation, content }) => + renderImageMarkdown(annotation, content), + reference: ({ annotation, content }) => + renderObsidianReference({ annotation, content }), + }, + }); + +export const renderInlineToRoam = ({ + text, + annotations, +}: { + text: string; + annotations: InlineAnnotation[]; +}): string => + renderAnnotatedText({ + text, + annotations, + renderers: { + bold: ({ content }) => ({ + prefix: "**", + suffix: "**", + replace: shouldReplaceInline(content), + }), + italics: ({ content }) => ({ + prefix: "__", + suffix: "__", + replace: shouldReplaceInline(content), + }), + strikethrough: ({ content }) => ({ + prefix: "~~", + suffix: "~~", + replace: shouldReplaceInline(content), + }), + code: ({ annotation }) => { + if (annotation.attributes.display === "block") { + return { + prefix: `\`\`\`${annotation.attributes.language ?? 
""}\n`, + suffix: "\n```", + }; + } + return { prefix: "`", suffix: "`" }; + }, + link: ({ annotation, content }) => + renderLinkMarkdown(annotation, content), + image: ({ annotation, content }) => + renderImageMarkdown(annotation, content), + reference: ({ annotation, content }) => + renderRoamReference({ annotation, content }), + }, + }); + +export const getInlineAnnotationsForRange = ({ + annotations, + start, + end, +}: { + annotations: BodyAnnotation[]; + start: number; + end: number; +}): InlineAnnotation[] => + annotations + .filter( + (annotation): annotation is InlineAnnotation => + annotation.type !== "block" && + annotation.start >= start && + annotation.end <= end, + ) + .map((annotation) => ({ + ...annotation, + start: annotation.start - start, + end: annotation.end - start, + })); diff --git a/packages/content-model/src/index.ts b/packages/content-model/src/index.ts new file mode 100644 index 000000000..79052d4c3 --- /dev/null +++ b/packages/content-model/src/index.ts @@ -0,0 +1,7 @@ +export * from "./constants"; +export * from "./schema"; +export * from "./validate"; +export * from "./text"; +export * from "./adapters/obsidian"; +export * from "./adapters/roam"; +export * from "./render/html"; diff --git a/packages/content-model/src/render/html.ts b/packages/content-model/src/render/html.ts new file mode 100644 index 000000000..3b502afeb --- /dev/null +++ b/packages/content-model/src/render/html.ts @@ -0,0 +1,137 @@ +import { NULL_INLINE_CONTENT } from "../constants"; +import { + getInlineAnnotationsForRange, + renderAnnotatedText, +} from "../core/render"; +import type { BlockAnnotation, DgDocument, InlineAnnotation } from "../schema"; + +const escapeHtml = (value: string): string => + value + .replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """); + +const sanitizeUrl = (value: string): string => { + const trimmed = value.trim(); + const schemeMatch = /^([a-zA-Z][a-zA-Z\d+.-]*):/.exec(trimmed); + if (!schemeMatch) return trimmed; + const 
scheme = schemeMatch[1]?.toLowerCase(); + return scheme === "http" || scheme === "https" || scheme === "mailto" + ? trimmed + : "#"; +}; + +const escapeHtmlWithIndexMap = ( + value: string, +): { text: string; indexMap: number[] } => { + let text = ""; + const indexMap = [0]; + for (let index = 0; index < value.length; index++) { + const character = value[index] ?? ""; + text += escapeHtml(character); + indexMap.push(text.length); + } + return { text, indexMap }; +}; + +const mapAnnotationsToEscapedText = ({ + annotations, + indexMap, +}: { + annotations: InlineAnnotation[]; + indexMap: number[]; +}): InlineAnnotation[] => + annotations.map((annotation) => ({ + ...annotation, + start: indexMap[annotation.start] ?? annotation.start, + end: indexMap[annotation.end] ?? annotation.end, + })); + +const renderInlineHtml = ({ + text, + annotations, +}: { + text: string; + annotations: InlineAnnotation[]; +}): string => { + const escaped = escapeHtmlWithIndexMap(text); + return renderAnnotatedText({ + text: escaped.text, + annotations: mapAnnotationsToEscapedText({ + annotations, + indexMap: escaped.indexMap, + }), + renderers: { + bold: () => ({ prefix: "", suffix: "" }), + italics: () => ({ prefix: "", suffix: "" }), + strikethrough: () => ({ prefix: "", suffix: "" }), + code: () => ({ prefix: "", suffix: "" }), + link: ({ annotation }) => ({ + prefix: ``, + suffix: "", + }), + image: ({ annotation }) => ({ + prefix: `${escapeHtml(annotation.attributes.alt ?? `, + suffix: "", + replace: true, + }), + reference: ({ annotation, content }) => { + const attributes = annotation.attributes; + const target = + attributes.kind === "roam-block" + ? attributes.blockUid + : attributes.kind === "roam-page" + ? 
attributes.pageTitle + : attributes.path; + return { + prefix: ``, + suffix: "", + replace: content === NULL_INLINE_CONTENT, + }; + }, + }, + }).replaceAll(NULL_INLINE_CONTENT, ""); +}; + +export const dgDocumentToHtml = (document: DgDocument): string => { + const title = renderInlineHtml({ + text: document.title.text, + annotations: document.title.annotations, + }); + const blocks = document.body.annotations + .filter( + (annotation): annotation is BlockAnnotation => + annotation.type === "block", + ) + .sort((a, b) => a.start - b.start); + + const body = + blocks.length > 0 + ? blocks + .map((block) => { + const text = document.body.text + .slice(block.start, block.end) + .replace(/\n$/, ""); + const inline = renderInlineHtml({ + text, + annotations: getInlineAnnotationsForRange({ + annotations: document.body.annotations, + start: block.start, + end: block.end, + }), + }); + const tag = block.attributes.viewType === "paragraph" ? "p" : "li"; + return `<${tag} data-block-id="${escapeHtml(block.attributes.blockId)}">${inline}`; + }) + .join("\n") + : renderInlineHtml({ + text: document.body.text, + annotations: document.body.annotations.filter( + (annotation): annotation is InlineAnnotation => + annotation.type !== "block", + ), + }); + + return `

${title}

\n${body}
`; +}; diff --git a/packages/content-model/src/schema.ts b/packages/content-model/src/schema.ts new file mode 100644 index 000000000..b6f4bc74b --- /dev/null +++ b/packages/content-model/src/schema.ts @@ -0,0 +1,134 @@ +import { DG_DOCUMENT_VERSION } from "./constants"; + +export type Json = + | string + | number + | boolean + | null + | Json[] + | { [key: string]: Json | undefined }; + +export type JsonObject = { [key: string]: Json | undefined }; + +export type AnnotationBase = { + start: number; + end: number; + appAttributes?: Record; +}; + +export type BlockAnnotation = AnnotationBase & { + type: "block"; + attributes: { + blockId: string; + parentBlockId?: string; + depth: number; + viewType: "paragraph" | "bullet" | "numbered"; + }; +}; + +export type BoldAnnotation = AnnotationBase & { + type: "bold"; + attributes?: { + delimiter?: string; + open?: boolean; + }; +}; + +export type ItalicsAnnotation = AnnotationBase & { + type: "italics"; + attributes?: { + delimiter?: string; + open?: boolean; + }; +}; + +export type StrikethroughAnnotation = AnnotationBase & { + type: "strikethrough"; + attributes?: { + delimiter?: string; + open?: boolean; + }; +}; + +export type CodeAnnotation = AnnotationBase & { + type: "code"; + attributes: { + language?: string; + ticks?: number; + display?: "inline" | "block"; + }; +}; + +export type LinkAnnotation = AnnotationBase & { + type: "link"; + attributes: { + href: string; + title?: string; + }; +}; + +export type ImageAnnotation = AnnotationBase & { + type: "image"; + attributes: { + src: string; + alt?: string; + title?: string; + }; +}; + +export type ReferenceAnnotation = AnnotationBase & { + type: "reference"; + attributes: + | { + kind: "roam-page"; + pageTitle: string; + pageUid?: string; + } + | { + kind: "roam-block"; + blockUid: string; + } + | { + kind: "obsidian-wikilink"; + path: string; + subpath?: string; + alias?: string; + }; +}; + +export type InlineAnnotation = + | BoldAnnotation + | ItalicsAnnotation + 
| StrikethroughAnnotation + | CodeAnnotation + | LinkAnnotation + | ImageAnnotation + | ReferenceAnnotation; + +export type BodyAnnotation = InlineAnnotation | BlockAnnotation; + +export type TextDocument = { + text: string; + annotations: InlineAnnotation[]; +}; + +export type BodyDocument = { + text: string; + annotations: BodyAnnotation[]; +}; + +export type DgDocument = { + version: typeof DG_DOCUMENT_VERSION; + title: TextDocument; + body: BodyDocument; + metadata?: JsonObject; +}; + +export type ContentType = + | "text/plain" + | "text/markdown" + | "application/vnd.discourse-graph.atjson+json; version=1"; + +export type DgAtJsonMetadata = JsonObject & { + content: DgDocument; +}; diff --git a/packages/content-model/src/text.ts b/packages/content-model/src/text.ts new file mode 100644 index 000000000..0a4e9f163 --- /dev/null +++ b/packages/content-model/src/text.ts @@ -0,0 +1,28 @@ +import type { DgAtJsonMetadata, DgDocument, JsonObject } from "./schema"; +import { assertValidDgDocument } from "./validate"; + +const normalizeDerivedText = (text: string): string => + text + .replace(/\0/g, "") + .replace(/[ \t]+\n/g, "\n") + .trim(); + +export const derivePlainTextFromDgDocument = (document: DgDocument): string => { + const title = normalizeDerivedText(document.title.text); + const body = normalizeDerivedText(document.body.text); + return [title, body].filter((part) => part.length > 0).join("\n\n"); +}; + +export const createDgAtJsonMetadata = ({ + document, + metadata = {}, +}: { + document: DgDocument; + metadata?: JsonObject; +}): DgAtJsonMetadata => { + assertValidDgDocument(document); + return { + ...metadata, + content: document, + }; +}; diff --git a/packages/content-model/src/validate.ts b/packages/content-model/src/validate.ts new file mode 100644 index 000000000..9b307dffe --- /dev/null +++ b/packages/content-model/src/validate.ts @@ -0,0 +1,210 @@ +import { DG_DOCUMENT_VERSION, NULL_INLINE_CONTENT } from "./constants"; +import type { + 
BodyAnnotation, + BlockAnnotation, + DgDocument, + InlineAnnotation, + ReferenceAnnotation, +} from "./schema"; + +export type ValidationResult = { + valid: boolean; + errors: string[]; +}; + +const validateSpan = ({ + annotation, + length, + path, +}: { + annotation: BodyAnnotation | InlineAnnotation; + length: number; + path: string; +}): string[] => { + const errors: string[] = []; + if (!Number.isFinite(annotation.start)) { + errors.push(`${path} has invalid start`); + } + if (!Number.isFinite(annotation.end)) { + errors.push(`${path} has invalid end`); + } + if (errors.length > 0) { + return errors; + } + if (annotation.start < 0) { + errors.push(`${path} has negative start`); + } + if (annotation.end <= annotation.start) { + errors.push(`${path} has zero or negative length`); + } + if (annotation.end > length) { + errors.push(`${path} exceeds document length`); + } + return errors; +}; + +const validateReference = ( + annotation: ReferenceAnnotation, + path: string, +): string[] => { + const attributes = annotation.attributes as + | { + kind?: unknown; + pageTitle?: unknown; + blockUid?: unknown; + path?: unknown; + } + | undefined; + if (!attributes) { + return [`${path} reference is missing attributes`]; + } + if ( + attributes.kind === "roam-page" && + (typeof attributes.pageTitle !== "string" || + attributes.pageTitle.length === 0) + ) { + return [`${path} roam-page reference is missing pageTitle`]; + } + if ( + attributes.kind === "roam-block" && + (typeof attributes.blockUid !== "string" || + attributes.blockUid.length === 0) + ) { + return [`${path} roam-block reference is missing blockUid`]; + } + if ( + attributes.kind === "obsidian-wikilink" && + (typeof attributes.path !== "string" || attributes.path.length === 0) + ) { + return [`${path} obsidian-wikilink reference is missing path`]; + } + if ( + attributes.kind !== "roam-page" && + attributes.kind !== "roam-block" && + attributes.kind !== "obsidian-wikilink" + ) { + return [`${path} reference has 
unknown kind`]; + } + return []; +}; + +const validateBlockAnnotation = ( + annotation: BlockAnnotation, + path: string, +): string[] => { + const errors: string[] = []; + const attributes = annotation.attributes; + if (!attributes) { + return [`${path} block is missing attributes`]; + } + if (!attributes.blockId) { + errors.push(`${path} block is missing blockId`); + } + if (!Number.isInteger(attributes.depth) || attributes.depth < 0) { + errors.push(`${path} block has invalid depth`); + } + if ( + attributes.viewType !== "paragraph" && + attributes.viewType !== "bullet" && + attributes.viewType !== "numbered" + ) { + errors.push(`${path} block has invalid viewType`); + } + return errors; +}; + +const validateInlineAnnotations = ({ + annotations, + length, + path, +}: { + annotations: Array; + length: number; + path: string; +}): string[] => { + return annotations.flatMap((annotation, index) => { + const annotationPath = `${path}.annotations[${index}]`; + return [ + ...validateSpan({ annotation, length, path: annotationPath }), + ...(annotation.type === "block" + ? [`${annotationPath} title cannot contain block annotations`] + : []), + ...(annotation.type === "reference" + ? 
validateReference(annotation, annotationPath) + : []), + ]; + }); +}; + +export const validateDgDocument = (document: DgDocument): ValidationResult => { + const errors: string[] = []; + + if (document.version !== DG_DOCUMENT_VERSION) { + errors.push("version must be dg-content-model/v1"); + } + + errors.push( + ...validateInlineAnnotations({ + annotations: document.title.annotations, + length: document.title.text.length, + path: "title", + }), + ); + + const blockIds = new Set(); + for (const [index, annotation] of document.body.annotations.entries()) { + const path = `body.annotations[${index}]`; + errors.push( + ...validateSpan({ + annotation, + length: document.body.text.length, + path, + }), + ); + + if (annotation.type === "reference") { + errors.push(...validateReference(annotation, path)); + } + + if (annotation.type === "block") { + errors.push(...validateBlockAnnotation(annotation, path)); + if ( + annotation.attributes?.blockId && + blockIds.has(annotation.attributes.blockId) + ) { + errors.push( + `${path} duplicate blockId ${annotation.attributes.blockId}`, + ); + } + if (annotation.attributes?.blockId) { + blockIds.add(annotation.attributes.blockId); + } + } + } + + for (const [index, annotation] of document.body.annotations.entries()) { + if (annotation.type !== "block") continue; + const parentBlockId = annotation.attributes?.parentBlockId; + if (parentBlockId && !blockIds.has(parentBlockId)) { + errors.push( + `body.annotations[${index}] parentBlockId ${parentBlockId} does not exist`, + ); + } + } + + if (document.title.text.includes(NULL_INLINE_CONTENT)) { + errors.push("title text cannot contain null inline placeholder"); + } + + return { + valid: errors.length === 0, + errors, + }; +}; + +export const assertValidDgDocument = (document: DgDocument): void => { + const result = validateDgDocument(document); + if (!result.valid) { + throw new Error(`Invalid DG document:\n${result.errors.join("\n")}`); + } +}; diff --git 
a/packages/content-model/tsconfig.json b/packages/content-model/tsconfig.json new file mode 100644 index 000000000..45823fd82 --- /dev/null +++ b/packages/content-model/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "@repo/typescript-config/base.json", + "include": ["src"], + "compilerOptions": { + "baseUrl": ".", + "outDir": "dist", + "rootDir": "." + } +} diff --git a/packages/database/doc/upsert_content.md b/packages/database/doc/upsert_content.md index d4c790e74..11aec98e2 100644 --- a/packages/database/doc/upsert_content.md +++ b/packages/database/doc/upsert_content.md @@ -2,68 +2,189 @@ In general, for external references, you can either embed it inline, reference it by `X_local_id` (where local means the platform) or use the db_ids as in the tables. +`content_type` identifies the representation for a content row. It defaults to `text/plain`, except `variant: "full"` defaults to `text/markdown` for backward compatibility. Pass `content_type: "application/vnd.discourse-graph.atjson+json; version=1"` for canonical DG ATJSON rows, storing the structured document in `metadata.content` and only derived plain text in `text`. `embedding_inline` is accepted only for `text/plain` rows. 
+ ```typescript -import type { LocalAccountDataInput, LocalDocumentDataInput, LocalContentDataInput } from '@repo/database/inputTypes'; +import type { + LocalAccountDataInput, + LocalDocumentDataInput, + LocalContentDataInput, +} from "@repo/database/inputTypes"; -const accounts: LocalAccountDataInput[] = [{ - "account_local_id": "sR22zZ470dNPkIf9PpjQXXdTBjG2", "name": "maparent" }]; -const docs: LocalDocumentDataInput[] = [{ "source_local_id": "page1_uid", "created": "2000/01/01", "last_modified": "2001/01/02", author_local_id:"sR22zZ470dNPkIf9PpjQXXdTBjG2"}]; +const accounts: LocalAccountDataInput[] = [ + { + account_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2", + name: "maparent", + }, +]; +const docs: LocalDocumentDataInput[] = [ + { + source_local_id: "page1_uid", + created: "2000/01/01", + last_modified: "2001/01/02", + author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2", + }, +]; const contents: LocalContentDataInput[] = [ { - "author_local_id": "sR22zZ470dNPkIf9PpjQXXdTBjG2", - "author_inline": { - "account_local_id": "sR22zZ470dNPkIf9PpjQXXdTBjG2", - "name": "maparent" + author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2", + author_inline: { + account_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2", + name: "maparent", }, - "document_inline": { - "source_local_id": "page1_uid", - "created": "2000/01/01", - "last_modified": "2001/01/02", - "author_local_id": "sR22zZ470dNPkIf9PpjQXXdTBjG2" + document_inline: { + source_local_id: "page1_uid", + created: "2000/01/01", + last_modified: "2001/01/02", + author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2", }, - "source_local_id": "a_roam_uid", - "scale": "document", - "created": "2000/01/01", - "last_modified": "2001/01/02", - "text": "Some text" + source_local_id: "a_roam_uid", + scale: "document", + created: "2000/01/01", + last_modified: "2001/01/02", + text: "Some text", }, - { "author_local_id": "sR22zZ470dNPkIf9PpjQXXdTBjG2", "document_local_id":"page1_uid", "source_local_id": "a_roam_uid2", "scale":"document", "created": 
"2000/01/02", "last_modified": "2001/01/03", "part_of_local_id":"a_roam_uid", "text": "Some subtext" }, { - "author_local_id": "sR22zZ470dNPkIf9PpjQXXdTBjG2", "document_inline": docs[0], "source_local_id": "a_roam_uid3", "scale": "document", "created": "2000/01/02", "last_modified": "2001/01/03", "part_of_local_id": "a_roam_uid2", "text": "Some subsubtext", "embedding_inline": { - "model":"openai_text_embedding_3_small_1536", "vector":[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - } } + author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2", + document_local_id: "page1_uid", + source_local_id: "a_roam_uid2", + scale: "document", + created: "2000/01/02", + last_modified: "2001/01/03", + part_of_local_id: "a_roam_uid", + text: "Some subtext", + }, + { + author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2", + document_inline: docs[0], + source_local_id: "a_roam_uid3", + scale: "document", + created: "2000/01/02", + last_modified: "2001/01/03", + part_of_local_id: "a_roam_uid2", + text: "Some subsubtext", + embedding_inline: { + model: "openai_text_embedding_3_small_1536", + vector: [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + }, + }, ]; // optional preliminary step: upsert accounts. { - const { data, error } = await client.rpc("upsert_accounts_in_space", { space_id_: 12, data: accounts as Json }); + const { data, error } = await client.rpc("upsert_accounts_in_space", { + space_id_: 12, + data: accounts as Json, + }); if (error) console.error(error); console.log(data); } // optional preliminary step: upsert documents. { - const { data, error } = await client.rpc("upsert_documents", { v_space_id: 12, data: docs as Json }); + const { data, error } = await client.rpc("upsert_documents", { + v_space_id: 12, + data: docs as Json, + }); if (error) console.error(error); console.log(data); } // upsert content { - const { data, error } = await client.rpc("upsert_content", { v_space_id: 12, data: contents as Json, v_creator_id: 63 }); + const { data, error } = await client.rpc("upsert_content", { + v_space_id: 12, + data: contents as Json, + v_creator_id: 63, + }); if (error) console.error(error); console.log(data); } // more compact version if all content is known to also be a page const page_contents: LocalContentDataInput[] = [ - { "author_local_id": "sR22zZ470dNPkIf9PpjQXXdTBjG2", "source_local_id": "a_page_uid2", "created": "2000/01/02", "last_modified": "2001/01/03", "text": "Some other page" }, + { + author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2", + source_local_id: "a_page_uid2", + created: "2000/01/02", + last_modified: "2001/01/03", + text: "Some other page", + }, ]; { - const { data, error } = await client.rpc("upsert_content", { v_space_id: 12, data: page_contents as Json, v_creator_id: 63, 
content_as_document: true }); + const { data, error } = await client.rpc("upsert_content", { + v_space_id: 12, + data: page_contents as Json, + v_creator_id: 63, + content_as_document: true, + }); if (error) console.error(error); console.log(data); } - ``` diff --git a/packages/database/features/atjsonContentType.feature b/packages/database/features/atjsonContentType.feature new file mode 100644 index 000000000..da958b52a --- /dev/null +++ b/packages/database/features/atjsonContentType.feature @@ -0,0 +1,76 @@ +Feature: ATJSON content type storage + User story: + * As a Discourse Graphs app writer + * I want to store multiple representations of the same content slice + * So canonical ATJSON can be written without replacing Markdown + + Background: + Given the database is blank + And the user user1 opens the Roam plugin in space s1 + + Scenario: Markdown and ATJSON rows coexist for one full content slice + When user user1 upserts this content to space s1: + """json + [ + { + "author_local_id": "user1", + "document_inline": { + "source_local_id": "node1", + "created": "2026/01/01", + "last_modified": "2026/01/02", + "author_local_id": "user1" + }, + "source_local_id": "node1", + "scale": "document", + "created": "2026/01/01", + "last_modified": "2026/01/02", + "variant": "full", + "text": "# Markdown content" + }, + { + "author_local_id": "user1", + "document_inline": { + "source_local_id": "node1", + "created": "2026/01/01", + "last_modified": "2026/01/02", + "author_local_id": "user1" + }, + "source_local_id": "node1", + "scale": "document", + "created": "2026/01/01", + "last_modified": "2026/01/02", + "variant": "full", + "content_type": "application/vnd.discourse-graph.atjson+json; version=1", + "text": "Canonical text", + "metadata": { + "content": { + "version": "dg-content-model/v1", + "title": { + "text": "Node 1", + "annotations": [] + }, + "body": { + "text": "Canonical text\n", + "annotations": [ + { + "type": "block", + "start": 0, + "end": 15, + 
"attributes": { + "blockId": "block-1", + "depth": 0, + "viewType": "paragraph" + } + } + ] + } + } + } + } + ] + """ + Then a user logged in space s1 should see 2 Content in the database + And a user logged in space s1 should see these content representation rows: + | source_local_id | variant | content_type | text | + | node1 | full | application/vnd.discourse-graph.atjson+json; version=1 | Canonical text | + | node1 | full | text/markdown | # Markdown content | diff --git a/packages/database/features/step-definitions/stepdefs.ts b/packages/database/features/step-definitions/stepdefs.ts index 9791dbc42..2549a9b5b 100644 --- a/packages/database/features/step-definitions/stepdefs.ts +++ b/packages/database/features/step-definitions/stepdefs.ts @@ -67,14 +67,17 @@ Given("the database is blank", async () => { assert.equal(r.error, null); const r3 = await client.from("group_membership").select("group_id"); assert.equal(r3.error, null); - const groupIds = new Set((r3.data || []).map(({group_id})=>group_id)); + const groupIds = new Set((r3.data || []).map(({ group_id }) => group_id)); for (const id of groupIds) { const ur = await client.auth.admin.deleteUser(id); assert.equal(ur.error, null); } - const r2 = await client.from("PlatformAccount").select("dg_account").not('dg_account', 'is', null); + const r2 = await client + .from("PlatformAccount") + .select("dg_account") + .not("dg_account", "is", null); assert.equal(r2.error, null); - for (const {dg_account} of r2.data || []) { + for (const { dg_account } of r2.data || []) { const r = await client.auth.admin.deleteUser(dg_account!); assert.equal(r.error, null); } @@ -404,47 +407,117 @@ Then("query results should look like this", (table: DataTable) => { } }); -When("user of space {word} creates group {word}", async (spaceName: string, name: string) => { - const localRefs = (world.localRefs || {}) as LocalRefsType; - const spaceId = localRefs[spaceName]; - if (typeof spaceId !== "number") assert.fail("spaceId not a 
number"); - const client = await getLoggedinDatabase(spaceId); - try{ - // eslint-disable-next-line @typescript-eslint/naming-convention - const response = await client.functions.invoke<{group_id: string}>("create-group", {body:{name}}); +Then( + "a user logged in space {word} should see these content representation rows:", + async (spaceName: string, table: DataTable) => { + const localRefs = (world.localRefs || {}) as LocalRefsType; + const spaceId = localRefs[spaceName]; + if (typeof spaceId !== "number") assert.fail("spaceId not a number"); + const client = await getLoggedinDatabase(spaceId); + const response = await client + .from("my_contents") + .select("source_local_id, variant, content_type, text, metadata") + .eq("space_id", spaceId); assert.equal(response.error, null); - assert.ok(response.data?.group_id, "create-group response missing group_id"); - localRefs[name] = response.data.group_id; - world.localRefs = localRefs; - } catch (error) { - console.error((error as Record).actual); - throw error; - } -}) + const actual = (response.data || []) + .map( + ({ + source_local_id: sourceLocalId, + variant, + content_type: contentType, + text, + metadata, + }) => ({ + sourceLocalId, + variant, + contentType, + text, + hasMetadataContent: + typeof metadata === "object" && + metadata !== null && + "content" in metadata, + }), + ) + .sort((a, b) => + `${a.sourceLocalId}:${a.contentType}`.localeCompare( + `${b.sourceLocalId}:${b.contentType}`, + ), + ); + const expected = table + .hashes() + .map((row) => ({ + sourceLocalId: row.source_local_id, + variant: row.variant, + contentType: row.content_type, + text: row.text, + hasMetadataContent: row.content_type?.includes("atjson") ?? 
false, + })) + .sort((a, b) => + `${a.sourceLocalId}:${a.contentType}`.localeCompare( + `${b.sourceLocalId}:${b.contentType}`, + ), + ); + assert.deepEqual(actual, expected); + }, +); -When("user of space {word} adds space {word} to group {word}", - async (space1Name: string, space2Name:string, groupName: string): Promise =>{ - const localRefs = (world.localRefs || {}) as LocalRefsType; - const space1Id = localRefs[space1Name]; - const space2Id = localRefs[space2Name]; - const groupId = localRefs[groupName]; - if (typeof space1Id !== 'number') assert.fail("space1Id not a number"); - if (typeof space2Id !== 'number') assert.fail("space2Id not a number"); - if (typeof groupId !== 'string') assert.fail("groupId not a string"); - const client2 = await getLoggedinDatabase(space2Id); - const r1 = await client2.from("PlatformAccount") +When( + "user of space {word} creates group {word}", + async (spaceName: string, name: string) => { + const localRefs = (world.localRefs || {}) as LocalRefsType; + const spaceId = localRefs[spaceName]; + if (typeof spaceId !== "number") assert.fail("spaceId not a number"); + const client = await getLoggedinDatabase(spaceId); + try { + // eslint-disable-next-line @typescript-eslint/naming-convention + const response = await client.functions.invoke<{ group_id: string }>( + "create-group", + { body: { name } }, + ); + assert.equal(response.error, null); + assert.ok( + response.data?.group_id, + "create-group response missing group_id", + ); + localRefs[name] = response.data.group_id; + world.localRefs = localRefs; + } catch (error) { + console.error((error as Record).actual); + throw error; + } + }, +); + +When( + "user of space {word} adds space {word} to group {word}", + async ( + space1Name: string, + space2Name: string, + groupName: string, + ): Promise => { + const localRefs = (world.localRefs || {}) as LocalRefsType; + const space1Id = localRefs[space1Name]; + const space2Id = localRefs[space2Name]; + const groupId = 
localRefs[groupName]; + if (typeof space1Id !== "number") assert.fail("space1Id not a number"); + if (typeof space2Id !== "number") assert.fail("space2Id not a number"); + if (typeof groupId !== "string") assert.fail("groupId not a string"); + const client2 = await getLoggedinDatabase(space2Id); + const r1 = await client2 + .from("PlatformAccount") .select("dg_account") .eq("account_local_id", spaceAnonUserEmail("Roam", space2Id)) .maybeSingle(); - assert.equal(r1.error, null); - const memberId = r1.data?.dg_account; - assert.ok(memberId, "memberId not found for space2"); - const client1 = await getLoggedinDatabase(space1Id); - const r2 = await client1.from("group_membership").insert({ - /* eslint-disable @typescript-eslint/naming-convention */ - group_id: groupId, - member_id: memberId - /* eslint-enable @typescript-eslint/naming-convention */ - }); - assert.equal(r2.error, null); -}) + assert.equal(r1.error, null); + const memberId = r1.data?.dg_account; + assert.ok(memberId, "memberId not found for space2"); + const client1 = await getLoggedinDatabase(space1Id); + const r2 = await client1.from("group_membership").insert({ + /* eslint-disable @typescript-eslint/naming-convention */ + group_id: groupId, + member_id: memberId, + /* eslint-enable @typescript-eslint/naming-convention */ + }); + assert.equal(r2.error, null); + }, +); diff --git a/packages/database/package.json b/packages/database/package.json index b2c0de431..cb580d6ad 100644 --- a/packages/database/package.json +++ b/packages/database/package.json @@ -8,6 +8,7 @@ "./dbDotEnv": { "types": "./types/dbDotEnv.d.ts", "import": "./src/dbDotEnv.mjs", + "require": "./src/dbDotEnv.cjs", "default": "./src/dbDotEnv.mjs" }, "./dbTypes": "./src/dbTypes.ts", @@ -32,7 +33,8 @@ "lint": "eslint . && tsx scripts/lintSchemas.ts && tsx scripts/lintFunctions.ts", "lint:fix": "eslint --fix . 
&& tsx scripts/lintSchemas.ts -f && tsx scripts/lintFunctions.ts", "migrate": "tsx scripts/migrate.ts", - "test": "pnpm run build && cucumber-js", + "test": "pnpm run build && pnpm run test:atjson-contract && cucumber-js", + "test:atjson-contract": "node --import tsx --test tests/atjsonSqlContract.test.ts", "test:withserve": "pnpm run build && tsx scripts/serveAndTest.ts", "genenv": "tsx scripts/createEnv.mts", "gentypes": "tsx scripts/genTypes.ts", diff --git a/packages/database/schema.yaml b/packages/database/schema.yaml index 5f44eef21..760228186 100644 --- a/packages/database/schema.yaml +++ b/packages/database/schema.yaml @@ -28,6 +28,7 @@ enums: direct: direct_and_children: direct_and_description: + full: Validation: description: Whether a given value was given by a person, or suggested by an automated agent (and then possibly infirmed.) permissible_values: @@ -229,6 +230,8 @@ classes: - creator - created - text + - variant + - content_type - metadata - scale # - position @@ -245,6 +248,8 @@ classes: unique_key_slots: - space - source_local_id + - variant + - content_type # ContentDerivation: # description: A derivation relation between content units @@ -428,6 +433,14 @@ slots: range: PlatformAccount text: required: true + variant: + range: ContentVariant + required: true + ifabsent: ContentVariant(direct) + content_type: + range: string + required: true + ifabsent: string(text/plain) description: created: range: datetime diff --git a/packages/database/src/dbDotEnv.cjs b/packages/database/src/dbDotEnv.cjs new file mode 100644 index 000000000..48b666871 --- /dev/null +++ b/packages/database/src/dbDotEnv.cjs @@ -0,0 +1,105 @@ +/* eslint-env node */ +/* global require, module, __dirname */ +/* eslint-disable @typescript-eslint/no-var-requires */ +const fs = require("node:fs"); +const path = require("node:path"); +const process = require("node:process"); +const console = require("node:console"); +const dotenv = require("dotenv"); + +const findRoot = () => { + let 
dir = __dirname; + while (path.basename(dir) !== "database") { + dir = path.dirname(dir); + } + return dir; +}; + +const getVariant = () => { + const useDbArgPos = (process.argv || []).indexOf("--use-db"); + let variant = + useDbArgPos > 0 + ? process.argv[useDbArgPos + 1] + : process.env["SUPABASE_USE_DB"]; + if (variant === undefined) { + dotenv.config(); + const dbGlobalEnv = path.join(findRoot(), ".env"); + if (fs.existsSync(dbGlobalEnv)) dotenv.config({ path: dbGlobalEnv }); + variant = process.env["SUPABASE_USE_DB"]; + } + const processHasVars = + !!process.env["SUPABASE_URL"] && !!process.env["SUPABASE_PUBLISHABLE_KEY"]; + + if ( + ["local", "branch", "production", "none", "implicit", undefined].indexOf( + variant, + ) === -1 + ) { + throw new Error("Invalid variant: " + variant); + } + + if ( + process.env.HOME === "/vercel" || + (process.env.GITHUB_ACTIONS === "true" && + process.env.GITHUB_TEST !== "test") + ) { + if (!processHasVars) { + console.error("Missing SUPABASE variables in deployment"); + variant = "none"; + } else { + variant = "implicit"; + } + } + if (variant === undefined) { + if (processHasVars) { + console.warn( + "please define explicitly which database to use (set SUPABASE_USE_DB)", + ); + variant = "implicit"; + } else { + console.warn("Not using the database"); + variant = "none"; + } + } + process.env["SUPABASE_USE_DB"] = variant; + return variant; +}; + +const envFilePath = () => { + const variant = getVariant(); + if (variant === "implicit" || variant === "none") return null; + const name = path.join(findRoot(), `.env.${variant}`); + return fs.existsSync(name) ? 
name : null; +}; + +const envContents = () => { + const path = envFilePath(); + if (!path) { + /* eslint-disable @typescript-eslint/naming-convention */ + const raw = { + SUPABASE_URL: process.env.SUPABASE_URL, + SUPABASE_PUBLISHABLE_KEY: process.env.SUPABASE_PUBLISHABLE_KEY, + NEXT_API_ROOT: process.env.NEXT_API_ROOT, + }; + /* eslint-enable @typescript-eslint/naming-convention */ + return Object.fromEntries(Object.entries(raw).filter(([, v]) => !!v)); + } + const data = fs.readFileSync(path, "utf8"); + return dotenv.parse(data); +}; + +let configDone = false; + +const config = () => { + if (configDone) return; + const path = envFilePath(); + if (path) dotenv.config({ path }); + configDone = true; +}; + +module.exports = { + getVariant, + envFilePath, + envContents, + config, +}; diff --git a/packages/database/src/dbTypes.ts b/packages/database/src/dbTypes.ts index 20cdbe2b1..2e16bf614 100644 --- a/packages/database/src/dbTypes.ts +++ b/packages/database/src/dbTypes.ts @@ -229,6 +229,7 @@ export type Database = { Content: { Row: { author_id: number | null + content_type: string created: string creator_id: number | null document_id: number @@ -244,6 +245,7 @@ export type Database = { } Insert: { author_id?: number | null + content_type?: string created: string creator_id?: number | null document_id: number @@ -259,6 +261,7 @@ export type Database = { } Update: { author_id?: number | null + content_type?: string created?: string creator_id?: number | null document_id?: number @@ -525,6 +528,7 @@ export type Database = { } FileReference: { Row: { + content_type: string | null created: string filehash: string filepath: string @@ -534,6 +538,7 @@ export type Database = { variant: Database["public"]["Enums"]["ContentVariant"] | null } Insert: { + content_type?: string | null created: string filehash: string filepath: string @@ -543,6 +548,7 @@ export type Database = { variant?: Database["public"]["Enums"]["ContentVariant"] | null } Update: { + content_type?: string | 
null created?: string filehash?: string filepath?: string @@ -554,24 +560,39 @@ export type Database = { Relationships: [ { foreignKeyName: "FileReference_content_fkey" - columns: ["space_id", "source_local_id", "variant"] + columns: ["space_id", "source_local_id", "variant", "content_type"] isOneToOne: false referencedRelation: "Content" - referencedColumns: ["space_id", "source_local_id", "variant"] + referencedColumns: [ + "space_id", + "source_local_id", + "variant", + "content_type", + ] }, { foreignKeyName: "FileReference_content_fkey" - columns: ["space_id", "source_local_id", "variant"] + columns: ["space_id", "source_local_id", "variant", "content_type"] isOneToOne: false referencedRelation: "my_contents" - referencedColumns: ["space_id", "source_local_id", "variant"] + referencedColumns: [ + "space_id", + "source_local_id", + "variant", + "content_type", + ] }, { foreignKeyName: "FileReference_content_fkey" - columns: ["space_id", "source_local_id", "variant"] + columns: ["space_id", "source_local_id", "variant", "content_type"] isOneToOne: false referencedRelation: "my_contents_with_embedding_openai_text_embedding_3_small_1536" - referencedColumns: ["space_id", "source_local_id", "variant"] + referencedColumns: [ + "space_id", + "source_local_id", + "variant", + "content_type", + ] }, ] } @@ -963,6 +984,7 @@ export type Database = { my_contents: { Row: { author_id: number | null + content_type: string | null created: string | null creator_id: number | null document_id: number | null @@ -1059,6 +1081,7 @@ export type Database = { my_contents_with_embedding_openai_text_embedding_3_small_1536: { Row: { author_id: number | null + content_type: string | null created: string | null creator_id: number | null document_id: number | null @@ -1272,6 +1295,7 @@ export type Database = { } Returns: { author_id: number | null + content_type: string created: string creator_id: number | null document_id: number @@ -1441,6 +1465,7 @@ export type Database = { Args: { 
concept: Database["public"]["Views"]["my_concepts"]["Row"] } Returns: { author_id: number | null + content_type: string | null created: string | null creator_id: number | null document_id: number | null @@ -1816,6 +1841,7 @@ export type Database = { | Database["public"]["CompositeTypes"]["inline_embedding_input"] | null variant: Database["public"]["Enums"]["ContentVariant"] | null + content_type: string | null } document_local_input: { space_id: number | null diff --git a/packages/database/src/lib/queries.ts b/packages/database/src/lib/queries.ts index 9e527a015..49fee40e3 100644 --- a/packages/database/src/lib/queries.ts +++ b/packages/database/src/lib/queries.ts @@ -137,8 +137,7 @@ export type NodeFilters = { author?: string; }; -type NodeFiltersDb = Omit & { ofTypes?: number[]}; - +type NodeFiltersDb = Omit & { ofTypes?: number[] }; /** * Filters for querying concepts based on their relationships. @@ -171,7 +170,10 @@ export type RelationFilters = { author?: string; }; -export type RelationFiltersDb = Omit&{ofTypes?: number[], toNodeTypes?: number[]}; +export type RelationFiltersDb = Omit< + RelationFilters, + "ofTypes" | "toNodeTypes" +> & { ofTypes?: number[]; toNodeTypes?: number[] }; /** * Controls which fields are returned in the response. @@ -254,7 +256,10 @@ export type GetConceptsParams = { pagination?: PaginationOptions; }; -type GetConceptsParamsDb = Omit&{scope?: NodeFiltersDb, relations?: RelationFiltersDb}; +type GetConceptsParamsDb = Omit & { + scope?: NodeFiltersDb; + relations?: RelationFiltersDb; +}; // Utility function to compose a generic query to fetch concepts, content and document. 
// Arguments are as in getConcepts, except we use numeric db ids of concepts for schemas instead @@ -262,15 +267,15 @@ type GetConceptsParamsDb = Omit&{scope?: const composeConceptQuery = ({ supabase, spaceId, - scope= { + scope = { type: "nodes", }, - relations= {}, - fields= { + relations = {}, + fields = { concepts: ["id", "name", "space_id"], content: ["source_local_id"], }, - pagination= { + pagination = { offset: 0, limit: 100, }, @@ -290,7 +295,9 @@ const composeConceptQuery = ({ if (ctArgs.length > 0) { const documentFields = fields.documents || []; if (documentFields.length > 0) { - ctArgs.push(`Document:my_documents!document_id${innerContent ? "!inner" : ""} (\n ${documentFields.join(",\n")} )`); + ctArgs.push( + `Document:my_documents!document_id${innerContent ? "!inner" : ""} (\n ${documentFields.join(",\n")} )`, + ); } q += `,\nContent:content_of_concept${innerContent ? "!inner" : ""} (\n${ctArgs.join(",\n")})`; } @@ -327,7 +334,7 @@ const composeConceptQuery = ({ let query = supabase.from("my_concepts").select(q); if (scope.type === "nodes") { query = query.eq("arity", 0); - } else if (scope.type === 'relations') { + } else if (scope.type === "relations") { query = query.gt("arity", 0); } // else fetch both @@ -344,28 +351,24 @@ const composeConceptQuery = ({ if (schemaDbIds.length > 0) { if (schemaDbIds.length === 1) query = query.eq("schema_id", schemaDbIds[0]!); - else - query = query.in("schema_id", schemaDbIds); + else query = query.in("schema_id", schemaDbIds); } // else we'll get all nodes } if (baseNodeLocalIds.length > 0) { if (baseNodeLocalIds.length === 1) query = query.eq("Content.source_local_id", baseNodeLocalIds[0]!); - else - query = query.in("Content.source_local_id", baseNodeLocalIds); + else query = query.in("Content.source_local_id", baseNodeLocalIds); } if (inRelsOfType !== undefined && inRelsOfType.length > 0) { if (inRelsOfType.length === 1) query = query.eq("relations.schema_id", inRelsOfType[0]!); - else - query = 
query.in("relations.schema_id", inRelsOfType); + else query = query.in("relations.schema_id", inRelsOfType); } if (inRelsToNodesOfType !== undefined && inRelsToNodesOfType.length > 0) { if (inRelsToNodesOfType.length === 1) query = query.eq("relations.subnodes.schema_id", inRelsToNodesOfType[0]!); - else - query = query.in("relations.subnodes.schema_id", inRelsToNodesOfType); + else query = query.in("relations.subnodes.schema_id", inRelsToNodesOfType); } if (inRelsToNodesOfAuthor !== undefined) { query = query.eq( @@ -410,7 +413,11 @@ export const getSchemaConcepts = async ( .filter((x) => typeof x === "object") .filter((x) => x.spaceId === spaceId || x.spaceId === 0); if (forceCacheReload || result.length === 1) { - const q = composeConceptQuery({ supabase, spaceId, scope: {type:"all", schemas: true} }); + const q = composeConceptQuery({ + supabase, + spaceId, + scope: { type: "all", schemas: true }, + }); const res = (await q) as PostgrestResponse; if (res.error) { console.error("getSchemaConcepts failed", res.error); @@ -467,7 +474,11 @@ const getLocalToDbIdMapping = async ( console.warn("Cannot populate cache without spaceId"); return dbIds; } - let q = composeConceptQuery({ supabase, spaceId, scope: {type:"all", schemas: true} }); + let q = composeConceptQuery({ + supabase, + spaceId, + scope: { type: "all", schemas: true }, + }); if (Object.keys(NODE_SCHEMA_CACHE).length > 1) { // Non-empty cache, query selectively q = q @@ -547,6 +558,7 @@ export const CONTENT_FIELDS: (keyof Content)[] = [ "id", "source_local_id", "variant", + "content_type", "author_id", "creator_id", "created", @@ -768,7 +780,7 @@ export const getNodesOfTypeWithRelations = async ({ return getConcepts({ supabase, spaceId, - scope: { type:"nodes", ofTypes, }, // we still start from the node + scope: { type: "nodes", ofTypes }, // we still start from the node relations: { ofTypes: relationTypes, author: nodeAuthoredBy, @@ -832,7 +844,6 @@ export const getDiscourseContext = async ({ }); }; - 
// instrumentation for benchmarking export const LAST_QUERY_DATA = { duration: 0 }; @@ -879,25 +890,23 @@ export const LAST_QUERY_DATA = { duration: 0 }; * }); * ``` */ -export const getConcepts = async ( - { - supabase, - spaceId, - scope= { - type: "nodes", - }, - relations= {}, - fields= { - concepts: CONCEPT_FIELDS, - content: CONTENT_FIELDS, - documents: DOCUMENT_FIELDS - }, - pagination= { - offset: 0, - limit: 100, - }, - }: GetConceptsParams -): Promise => { +export const getConcepts = async ({ + supabase, + spaceId, + scope = { + type: "nodes", + }, + relations = {}, + fields = { + concepts: CONCEPT_FIELDS, + content: CONTENT_FIELDS, + documents: DOCUMENT_FIELDS, + }, + pagination = { + offset: 0, + limit: 100, + }, +}: GetConceptsParams): Promise => { // translate schema local content Ids to concept database Ids. const localSchemaIds = new Set(); (scope.ofTypes || []).map((k) => localSchemaIds.add(k)); @@ -926,15 +935,15 @@ export const getConcepts = async ( spaceId, scope: { ...scope, - ofTypes: localToDbArray(scope.ofTypes) + ofTypes: localToDbArray(scope.ofTypes), }, relations: { ...relations, ofTypes: localToDbArray(relations.ofTypes), - toNodeTypes: localToDbArray(relations.toNodeTypes) + toNodeTypes: localToDbArray(relations.toNodeTypes), }, fields, - pagination + pagination, }); const before = Date.now(); const { error, data } = (await q) as PostgrestResponse; diff --git a/packages/database/supabase/migrations/20260517172000_atjson_content_type.sql b/packages/database/supabase/migrations/20260517172000_atjson_content_type.sql new file mode 100644 index 000000000..cf5c0d466 --- /dev/null +++ b/packages/database/supabase/migrations/20260517172000_atjson_content_type.sql @@ -0,0 +1,289 @@ +ALTER TABLE public."Content" +ADD COLUMN IF NOT EXISTS content_type text NOT NULL DEFAULT 'text/plain'; + +UPDATE public."Content" +SET content_type = 'text/markdown' +WHERE variant = 'full'::public."ContentVariant" +AND content_type = 'text/plain'; + +ALTER TABLE 
ONLY public."FileReference" +DROP CONSTRAINT IF EXISTS "FileReference_content_fkey"; + +DROP INDEX IF EXISTS public.content_space_local_id_variant_idx; + +CREATE UNIQUE INDEX content_space_local_id_variant_idx ON public."Content" USING btree ( + space_id, source_local_id, variant, content_type +) NULLS DISTINCT; + +ALTER TABLE public."FileReference" +ADD COLUMN IF NOT EXISTS content_type text GENERATED ALWAYS AS ('text/markdown') STORED; + +ALTER TABLE ONLY public."FileReference" +ADD CONSTRAINT "FileReference_content_fkey" FOREIGN KEY ( + space_id, source_local_id, variant, content_type +) REFERENCES public."Content" (space_id, source_local_id, variant, content_type) ON DELETE CASCADE; + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 + FROM pg_attribute + WHERE attrelid = ( + SELECT typrelid + FROM pg_type + WHERE typnamespace = 'public'::regnamespace + AND typname = 'content_local_input' + ) + AND attname = 'content_type' + AND NOT attisdropped + ) THEN + ALTER TYPE public.content_local_input ADD ATTRIBUTE content_type text; + END IF; +END; +$$; + +CREATE OR REPLACE VIEW public.my_contents AS +SELECT + id, + document_id, + source_local_id, + variant, + author_id, + creator_id, + created, + text, + metadata, + scale, + space_id, + last_modified, + part_of_id, + content_type +FROM public."Content" + LEFT OUTER JOIN public.my_accessible_resources() AS ra USING (space_id, source_local_id) +WHERE ( + space_id = any(public.my_space_ids('reader')) + OR (space_id = any(public.my_space_ids('partial')) AND ra.space_id IS NOT NULL) +); + +CREATE OR REPLACE VIEW public.my_contents_with_embedding_openai_text_embedding_3_small_1536 AS +SELECT + ct.id, + ct.document_id, + ct.source_local_id, + ct.variant, + ct.author_id, + ct.creator_id, + ct.created, + ct.text, + ct.metadata, + ct.scale, + ct.space_id, + ct.last_modified, + ct.part_of_id, + emb.model, + emb.vector, + ct.content_type +FROM public."Content" AS ct + JOIN public."ContentEmbedding_openai_text_embedding_3_small_1536" AS 
emb ON (ct.id = emb.target_id) + LEFT OUTER JOIN public.my_accessible_resources() AS ra USING (space_id, source_local_id) +WHERE ( + ct.space_id = any(public.my_space_ids('reader')) + OR (ct.space_id = any(public.my_space_ids('partial')) AND ra.space_id IS NOT NULL) +) +AND NOT emb.obsolete; + +CREATE OR REPLACE FUNCTION public._local_content_to_db_content(data public.content_local_input) +RETURNS public."Content" STABLE +SET search_path = '' +LANGUAGE plpgsql AS $$ +DECLARE + content public."Content"%ROWTYPE; + reference_content JSONB := jsonb_build_object(); + key varchar; + value JSONB; + ref_single_val BIGINT; + ref_array_val BIGINT[]; +BEGIN + content := jsonb_populate_record(NULL::public."Content", to_jsonb(data)); + IF data.document_local_id IS NOT NULL THEN + SELECT id FROM public."Document" + WHERE source_local_id = data.document_local_id INTO content.document_id; + END IF; + IF data.creator_local_id IS NOT NULL THEN + SELECT id FROM public."PlatformAccount" + WHERE account_local_id = data.creator_local_id INTO content.creator_id; + ELSIF account_local_id(creator_inline(data)) IS NOT NULL THEN + SELECT id FROM public."PlatformAccount" + WHERE account_local_id = account_local_id(creator_inline(data)) INTO content.creator_id; + END IF; + IF data.author_local_id IS NOT NULL THEN + SELECT id FROM public."PlatformAccount" + WHERE account_local_id = data.author_local_id INTO content.author_id; + ELSIF account_local_id(author_inline(data)) IS NOT NULL THEN + SELECT id FROM public."PlatformAccount" + WHERE account_local_id = account_local_id(author_inline(data)) INTO content.author_id; + END IF; + IF data.space_url IS NOT NULL THEN + SELECT id FROM public."Space" + WHERE url = data.space_url INTO content.space_id; + END IF; + IF data.part_of_local_id IS NOT NULL THEN + SELECT parent_content.id INTO content.part_of_id FROM public."Content" AS parent_content + WHERE parent_content.source_local_id = data.part_of_local_id + AND (content.space_id IS NULL OR 
parent_content.space_id = content.space_id) + ORDER BY + CASE + WHEN parent_content.variant = 'direct'::public."ContentVariant" AND parent_content.content_type = 'text/plain' THEN 0 + WHEN parent_content.content_type = 'text/plain' THEN 1 + WHEN parent_content.content_type = 'text/markdown' THEN 2 + ELSE 3 + END, + parent_content.id + LIMIT 1; + END IF; + IF content.metadata IS NULL then + content.metadata := '{}'; + END IF; + IF content.content_type IS NULL then + content.content_type := CASE + WHEN content.variant = 'full'::public."ContentVariant" THEN 'text/markdown' + ELSE 'text/plain' + END; + END IF; + RETURN content; +END; +$$; + +CREATE OR REPLACE FUNCTION public.upsert_content(v_space_id bigint, data jsonb, v_creator_id BIGINT, content_as_document boolean DEFAULT TRUE) +RETURNS SETOF BIGINT +SET search_path = '' +LANGUAGE plpgsql +AS $$ +DECLARE + v_platform public."Platform"; + db_document public."Document"%ROWTYPE; + document_id BIGINT; + local_content public.content_local_input; + db_content public."Content"%ROWTYPE; + content_row JSONB; + upsert_id BIGINT; +BEGIN + SELECT platform INTO STRICT v_platform FROM public."Space" WHERE id=v_space_id; + FOR content_row IN SELECT * FROM jsonb_array_elements(data) + LOOP + local_content := jsonb_populate_record(NULL::public.content_local_input, content_row); + local_content.space_id := v_space_id; + db_content := public._local_content_to_db_content(local_content); + IF account_local_id(author_inline(local_content)) IS NOT NULL THEN + SELECT public.create_account_in_space( + v_space_id, + account_local_id(author_inline(local_content)), + name(author_inline(local_content)) + ) INTO STRICT upsert_id; + db_content.author_id := upsert_id; + END IF; + IF account_local_id(creator_inline(local_content)) IS NOT NULL THEN + SELECT public.create_account_in_space( + v_space_id, + account_local_id(creator_inline(local_content)), + name(creator_inline(local_content)) + ) INTO STRICT upsert_id; + db_content.creator_id := 
upsert_id; + END IF; + IF content_as_document THEN + db_content.scale = 'document'; + END IF; + IF content_as_document AND document_id(db_content) IS NULL AND source_local_id(document_inline(local_content)) IS NULL THEN + local_content.document_inline.space_id := v_space_id; + local_content.document_inline.source_local_id := db_content.source_local_id; + local_content.document_inline.last_modified := db_content.last_modified; + local_content.document_inline.created := db_content.created; + local_content.document_inline.author_id := db_content.author_id; + END IF; + IF source_local_id(document_inline(local_content)) IS NOT NULL THEN + db_document := public._local_document_to_db_document(document_inline(local_content)); + IF (db_document.author_id IS NULL AND author_inline(local_content) IS NOT NULL) THEN + db_document.author_id := upsert_account_in_space(v_space_id, author_inline(local_content)); + END IF; + INSERT INTO public."Document" ( + space_id, + source_local_id, + url, + created, + metadata, + last_modified, + author_id, + contents + ) VALUES ( + COALESCE(db_document.space_id, v_space_id), + db_document.source_local_id, + db_document.url, + db_document.created, + COALESCE(db_document.metadata, '{}'::jsonb), + db_document.last_modified, + db_document.author_id, + db_document.contents + ) + ON CONFLICT (space_id, source_local_id) DO UPDATE SET + url = COALESCE(db_document.url, EXCLUDED.url), + created = COALESCE(db_document.created, EXCLUDED.created), + metadata = COALESCE(db_document.metadata, EXCLUDED.metadata), + last_modified = COALESCE(db_document.last_modified, EXCLUDED.last_modified), + author_id = COALESCE(db_document.author_id, EXCLUDED.author_id), + contents = COALESCE(db_document.contents, EXCLUDED.contents) + RETURNING id INTO STRICT document_id; + db_content.document_id := document_id; + END IF; + INSERT INTO public."Content" ( + document_id, + source_local_id, + variant, + content_type, + author_id, + creator_id, + created, + text, + metadata, + 
scale, + space_id, + last_modified, + part_of_id + ) VALUES ( + db_content.document_id, + db_content.source_local_id, + COALESCE(db_content.variant, 'direct'::public."ContentVariant"), + COALESCE( + db_content.content_type, + CASE + WHEN db_content.variant = 'full'::public."ContentVariant" THEN 'text/markdown' + ELSE 'text/plain' + END + ), + db_content.author_id, + db_content.creator_id, + db_content.created, + db_content.text, + COALESCE(db_content.metadata, '{}'::jsonb), + db_content.scale, + db_content.space_id, + db_content.last_modified, + db_content.part_of_id + ) + ON CONFLICT (space_id, source_local_id, variant, content_type) DO UPDATE SET + document_id = COALESCE(db_content.document_id, EXCLUDED.document_id), + author_id = COALESCE(db_content.author_id, EXCLUDED.author_id), + creator_id = COALESCE(db_content.creator_id, EXCLUDED.creator_id), + created = COALESCE(db_content.created, EXCLUDED.created), + text = COALESCE(db_content.text, EXCLUDED.text), + metadata = COALESCE(db_content.metadata, EXCLUDED.metadata), + scale = COALESCE(db_content.scale, EXCLUDED.scale), + last_modified = COALESCE(db_content.last_modified, EXCLUDED.last_modified), + part_of_id = COALESCE(db_content.part_of_id, EXCLUDED.part_of_id) + RETURNING id INTO STRICT upsert_id; + IF model(embedding_inline(local_content)) IS NOT NULL AND db_content.content_type = 'text/plain' THEN + PERFORM public.upsert_content_embedding(upsert_id, model(embedding_inline(local_content)), vector(embedding_inline(local_content))); + END IF; + RETURN NEXT upsert_id; + END LOOP; +END; +$$; diff --git a/packages/database/supabase/schemas/assets.sql b/packages/database/supabase/schemas/assets.sql index f953d4419..4af6dc4f1 100644 --- a/packages/database/supabase/schemas/assets.sql +++ b/packages/database/supabase/schemas/assets.sql @@ -6,15 +6,16 @@ CREATE TABLE IF NOT EXISTS public."FileReference" ( "created" timestamp without time zone NOT NULL, last_modified timestamp without time zone NOT NULL, -- not 
allowed virtual with user types - variant public."ContentVariant" GENERATED ALWAYS AS ('full') STORED + variant public."ContentVariant" GENERATED ALWAYS AS ('full') STORED, + content_type text GENERATED ALWAYS AS ('text/markdown') STORED ); ALTER TABLE ONLY public."FileReference" ADD CONSTRAINT "FileReference_pkey" PRIMARY KEY (source_local_id, space_id, filepath); ALTER TABLE ONLY public."FileReference" ADD CONSTRAINT "FileReference_content_fkey" FOREIGN KEY ( - space_id, source_local_id, variant -) REFERENCES public."Content" (space_id, source_local_id, variant) ON DELETE CASCADE; + space_id, source_local_id, variant, content_type +) REFERENCES public."Content" (space_id, source_local_id, variant, content_type) ON DELETE CASCADE; -- note the absence of on update ; the generated column forbids cascade, so it will error -- However, update on those columns should never happen. diff --git a/packages/database/supabase/schemas/content.sql b/packages/database/supabase/schemas/content.sql index 98f18f179..ce37af3cc 100644 --- a/packages/database/supabase/schemas/content.sql +++ b/packages/database/supabase/schemas/content.sql @@ -78,6 +78,7 @@ CREATE TABLE IF NOT EXISTS public."Content" ( document_id bigint NOT NULL, source_local_id character varying, variant public."ContentVariant" NOT NULL DEFAULT 'direct', + content_type text NOT NULL DEFAULT 'text/plain', author_id bigint, creator_id bigint, created timestamp without time zone NOT NULL, @@ -130,7 +131,7 @@ CREATE INDEX "Content_part_of" ON public."Content" USING btree ( CREATE INDEX "Content_space" ON public."Content" USING btree (space_id); CREATE UNIQUE INDEX content_space_local_id_variant_idx ON public."Content" USING btree ( - space_id, source_local_id, variant + space_id, source_local_id, variant, content_type ) NULLS DISTINCT; CREATE INDEX "Content_text" ON public."Content" USING pgroonga (text); @@ -261,7 +262,8 @@ SELECT scale, space_id, last_modified, - part_of_id + part_of_id, + content_type FROM 
public."Content" LEFT OUTER JOIN public.my_accessible_resources() AS ra USING (space_id, source_local_id) WHERE ( @@ -337,7 +339,8 @@ CREATE TYPE public.content_local_input AS ( author_inline public.account_local_input, creator_inline public.account_local_input, embedding_inline public.inline_embedding_input, - variant public."ContentVariant" + variant public."ContentVariant", + content_type text ); @@ -408,18 +411,34 @@ BEGIN SELECT id FROM public."PlatformAccount" WHERE account_local_id = account_local_id(author_inline(data)) INTO content.author_id; END IF; - IF data.part_of_local_id IS NOT NULL THEN - SELECT id FROM public."Content" - WHERE source_local_id = data.part_of_local_id INTO content.part_of_id; - END IF; IF data.space_url IS NOT NULL THEN SELECT id FROM public."Space" WHERE url = data.space_url INTO content.space_id; END IF; + IF data.part_of_local_id IS NOT NULL THEN + SELECT parent_content.id INTO content.part_of_id FROM public."Content" AS parent_content + WHERE parent_content.source_local_id = data.part_of_local_id + AND (content.space_id IS NULL OR parent_content.space_id = content.space_id) + ORDER BY + CASE + WHEN parent_content.variant = 'direct'::public."ContentVariant" AND parent_content.content_type = 'text/plain' THEN 0 + WHEN parent_content.content_type = 'text/plain' THEN 1 + WHEN parent_content.content_type = 'text/markdown' THEN 2 + ELSE 3 + END, + parent_content.id + LIMIT 1; + END IF; -- now avoid null defaults IF content.metadata IS NULL then content.metadata := '{}'; END IF; + IF content.content_type IS NULL then + content.content_type := CASE + WHEN content.variant = 'full'::public."ContentVariant" THEN 'text/markdown' + ELSE 'text/plain' + END; + END IF; RETURN content; END; $$; @@ -598,6 +617,7 @@ BEGIN document_id, source_local_id, variant, + content_type, author_id, creator_id, created, @@ -611,6 +631,13 @@ BEGIN db_content.document_id, db_content.source_local_id, COALESCE(db_content.variant, 'direct'::public."ContentVariant"), 
+ COALESCE( + db_content.content_type, + CASE + WHEN db_content.variant = 'full'::public."ContentVariant" THEN 'text/markdown' + ELSE 'text/plain' + END + ), db_content.author_id, db_content.creator_id, db_content.created, @@ -621,7 +648,7 @@ BEGIN db_content.last_modified, db_content.part_of_id ) - ON CONFLICT (space_id, source_local_id, variant) DO UPDATE SET + ON CONFLICT (space_id, source_local_id, variant, content_type) DO UPDATE SET document_id = COALESCE(db_content.document_id, EXCLUDED.document_id), author_id = COALESCE(db_content.author_id, EXCLUDED.author_id), creator_id = COALESCE(db_content.creator_id, EXCLUDED.creator_id), @@ -632,7 +659,7 @@ BEGIN last_modified = COALESCE(db_content.last_modified, EXCLUDED.last_modified), part_of_id = COALESCE(db_content.part_of_id, EXCLUDED.part_of_id) RETURNING id INTO STRICT upsert_id; - IF model(embedding_inline(local_content)) IS NOT NULL THEN + IF model(embedding_inline(local_content)) IS NOT NULL AND db_content.content_type = 'text/plain' THEN PERFORM public.upsert_content_embedding(upsert_id, model(embedding_inline(local_content)), vector(embedding_inline(local_content))); END IF; RETURN NEXT upsert_id; diff --git a/packages/database/supabase/schemas/embedding.sql b/packages/database/supabase/schemas/embedding.sql index 273fd6eab..f80c3ecfa 100644 --- a/packages/database/supabase/schemas/embedding.sql +++ b/packages/database/supabase/schemas/embedding.sql @@ -44,7 +44,8 @@ ct.space_id, ct.last_modified, ct.part_of_id, emb.model, -emb.vector +emb.vector, +ct.content_type FROM public."Content" AS ct JOIN public."ContentEmbedding_openai_text_embedding_3_small_1536" AS emb ON (ct.id = emb.target_id) LEFT OUTER JOIN public.my_accessible_resources () AS ra USING (space_id, source_local_id) diff --git a/packages/database/tests/atjsonSqlContract.test.ts b/packages/database/tests/atjsonSqlContract.test.ts new file mode 100644 index 000000000..ff3ca79e4 --- /dev/null +++ 
b/packages/database/tests/atjsonSqlContract.test.ts @@ -0,0 +1,140 @@ +import assert from "node:assert/strict"; +import { readFileSync } from "node:fs"; +import { join } from "node:path"; +import test from "node:test"; + +const repoRoot = join(process.cwd(), "..", ".."); + +const readRepoFile = (path: string): string => + readFileSync(join(repoRoot, path), "utf8"); + +const compactSql = (sql: string): string => + sql.replace(/--.*$/gm, "").replace(/\s+/g, " ").trim(); + +const contentSchema = compactSql( + readRepoFile("packages/database/supabase/schemas/content.sql"), +); +const assetsSchema = compactSql( + readRepoFile("packages/database/supabase/schemas/assets.sql"), +); +const embeddingSchema = compactSql( + readRepoFile("packages/database/supabase/schemas/embedding.sql"), +); +const migration = compactSql( + readRepoFile( + "packages/database/supabase/migrations/20260517172000_atjson_content_type.sql", + ), +); +const schemaYaml = readRepoFile("packages/database/schema.yaml"); +const dbTypes = readRepoFile("packages/database/src/dbTypes.ts"); + +void test("Content SQL models content_type as the representation discriminator", () => { + assert.match( + contentSchema, + /content_type text NOT NULL DEFAULT 'text\/plain'/, + ); + assert.match( + contentSchema, + /CREATE UNIQUE INDEX content_space_local_id_variant_idx ON public\."Content" USING btree \( space_id, source_local_id, variant, content_type \)/, + ); + assert.match( + contentSchema, + /CREATE TYPE public\.content_local_input AS \(.*variant public\."ContentVariant", content_type text \)/, + ); + assert.match(contentSchema, /content_type FROM public\."Content"/); + assert.match( + contentSchema, + /WHEN content\.variant = 'full'::public\."ContentVariant" THEN 'text\/markdown' ELSE 'text\/plain'/, + ); + assert.match( + contentSchema, + /ON CONFLICT \(space_id, source_local_id, variant, content_type\) DO UPDATE SET/, + ); +}); + +void test("ATJSON migration backfills and preserves multiple representations", 
() => { + assert.match( + migration, + /ADD COLUMN IF NOT EXISTS content_type text NOT NULL DEFAULT 'text\/plain'/, + ); + assert.match( + migration, + /UPDATE public\."Content" SET content_type = 'text\/markdown' WHERE variant = 'full'::public\."ContentVariant" AND content_type = 'text\/plain'/, + ); + assert.match( + migration, + /CREATE UNIQUE INDEX content_space_local_id_variant_idx ON public\."Content" USING btree \( space_id, source_local_id, variant, content_type \)/, + ); + assert.match( + migration, + /ON CONFLICT \(space_id, source_local_id, variant, content_type\) DO UPDATE SET/, + ); + assert.match( + migration, + /ALTER TYPE public\.content_local_input ADD ATTRIBUTE content_type text/, + ); + assert.match( + migration, + /DO \$\$ BEGIN IF NOT EXISTS .* THEN ALTER TYPE public\.content_local_input ADD ATTRIBUTE content_type text; END IF; END; \$\$;/, + ); + assert.match( + migration, + /CREATE OR REPLACE VIEW public\.my_contents AS SELECT.*content_type FROM public\."Content"/, + ); + assert.match( + migration, + /CREATE OR REPLACE VIEW public\.my_contents_with_embedding_openai_text_embedding_3_small_1536 AS SELECT.*ct\.content_type/, + ); +}); + +void test("content parent lookup stays deterministic with multiple representations", () => { + const parentLookupPattern = + /SELECT parent_content\.id INTO content\.part_of_id FROM public\."Content" AS parent_content WHERE parent_content\.source_local_id = data\.part_of_local_id AND \(content\.space_id IS NULL OR parent_content\.space_id = content\.space_id\) ORDER BY CASE WHEN parent_content\.variant = 'direct'::public\."ContentVariant" AND parent_content\.content_type = 'text\/plain' THEN 0 WHEN parent_content\.content_type = 'text\/plain' THEN 1 WHEN parent_content\.content_type = 'text\/markdown' THEN 2 ELSE 3 END, parent_content\.id LIMIT 1/; + assert.match(contentSchema, parentLookupPattern); + assert.match(migration, parentLookupPattern); +}); + +void test("FileReference still targets Markdown full 
content rows", () => { + assert.match( + assetsSchema, + /variant public\."ContentVariant" GENERATED ALWAYS AS \('full'\) STORED, content_type text GENERATED ALWAYS AS \('text\/markdown'\) STORED/, + ); + assert.match( + assetsSchema, + /FOREIGN KEY \( space_id, source_local_id, variant, content_type \) REFERENCES public\."Content" \(space_id, source_local_id, variant, content_type\)/, + ); + assert.match( + migration, + /ADD COLUMN IF NOT EXISTS content_type text GENERATED ALWAYS AS \('text\/markdown'\) STORED/, + ); + assert.match( + migration, + /FOREIGN KEY \( space_id, source_local_id, variant, content_type \) REFERENCES public\."Content" \(space_id, source_local_id, variant, content_type\)/, + ); +}); + +void test("embedding view and generated types expose content_type", () => { + assert.match( + embeddingSchema, + /CREATE OR REPLACE VIEW public\.my_contents_with_embedding_openai_text_embedding_3_small_1536 AS SELECT.*ct\.content_type/, + ); + assert.match(dbTypes, /content_type: string/); + assert.match(dbTypes, /content_type\?: string/); + assert.match(dbTypes, /content_type: string \| null/); +}); + +void test("upsert_content only accepts inline embeddings for plain text rows", () => { + const embeddingGuardPattern = + /IF model\(embedding_inline\(local_content\)\) IS NOT NULL AND db_content\.content_type = 'text\/plain' THEN PERFORM public\.upsert_content_embedding/; + assert.match(contentSchema, embeddingGuardPattern); + assert.match(migration, embeddingGuardPattern); +}); + +void test("LinkML schema keeps variant semantic and content_type representational", () => { + assert.match(schemaYaml, /full:/); + assert.match(schemaYaml, /content_type:/); + assert.match( + schemaYaml, + /unique_keys:[\s\S]*content_space_and_local_id:[\s\S]*unique_key_slots:[\s\S]*- space[\s\S]*- source_local_id[\s\S]*- variant[\s\S]*- content_type/, + ); +}); diff --git a/packages/database/tsconfig.json b/packages/database/tsconfig.json index 3db4efa48..f284fbf9c 100644 --- 
a/packages/database/tsconfig.json +++ b/packages/database/tsconfig.json @@ -8,6 +8,7 @@ "include": [ "./src/**/*", "./scripts/**/*", + "./tests/**/*", "./features/step-definitions/**/*", "./features/support/**/*" ], diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3499ac86a..eecf2ad86 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -118,6 +118,9 @@ importers: '@codemirror/view': specifier: ^6.38.8 version: 6.38.8 + '@repo/content-model': + specifier: workspace:* + version: link:../../packages/content-model '@repo/database': specifier: workspace:* version: link:../../packages/database @@ -239,6 +242,9 @@ importers: '@octokit/core': specifier: ^6.1.3 version: 6.1.6 + '@repo/content-model': + specifier: workspace:* + version: link:../../packages/content-model '@repo/database': specifier: workspace:* version: link:../../packages/database @@ -531,6 +537,28 @@ importers: specifier: ^5 version: 5.5.4 + packages/content-model: + dependencies: + tslib: + specifier: 2.5.1 + version: 2.5.1 + devDependencies: + '@repo/eslint-config': + specifier: workspace:* + version: link:../eslint-config + '@repo/typescript-config': + specifier: workspace:* + version: link:../typescript-config + '@types/node': + specifier: ^20.11.24 + version: 20.19.13 + eslint: + specifier: 'catalog:' + version: 8.57.1 + typescript: + specifier: 5.5.4 + version: 5.5.4 + packages/database: dependencies: '@repo/utils': From c057ffa52fdbaf69881bc79d8fb41594cdcf2241 Mon Sep 17 00:00:00 2001 From: Michael Gartner Date: Sun, 17 May 2026 21:32:01 -0600 Subject: [PATCH 3/3] Fix changed-file lint warnings --- packages/database/features/step-definitions/stepdefs.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/packages/database/features/step-definitions/stepdefs.ts b/packages/database/features/step-definitions/stepdefs.ts index 2549a9b5b..8b2322233 100644 --- a/packages/database/features/step-definitions/stepdefs.ts +++ b/packages/database/features/step-definitions/stepdefs.ts @@ 
-67,7 +67,9 @@ Given("the database is blank", async () => { assert.equal(r.error, null); const r3 = await client.from("group_membership").select("group_id"); assert.equal(r3.error, null); - const groupIds = new Set((r3.data || []).map(({ group_id }) => group_id)); + const groupIds = new Set( + (r3.data || []).map(({ group_id: groupId }) => groupId), + ); for (const id of groupIds) { const ur = await client.auth.admin.deleteUser(id); assert.equal(ur.error, null); @@ -77,8 +79,8 @@ Given("the database is blank", async () => { .select("dg_account") .not("dg_account", "is", null); assert.equal(r2.error, null); - for (const { dg_account } of r2.data || []) { - const r = await client.auth.admin.deleteUser(dg_account!); + for (const { dg_account: dgAccount } of r2.data || []) { + const r = await client.auth.admin.deleteUser(dgAccount); assert.equal(r.error, null); } r = await client.from("PlatformAccount").delete().neq("id", -1);