From fdf2d0328b2c817e53327f2519ce17971a6088b7 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Sun, 28 Jun 2026 20:22:34 +0200 Subject: [PATCH 01/35] feat(search)!: unify the field model and add query IR, engine port and result types - replace FieldSpec and Projection with one SearchField/SearchSchema model - add SearchQuery, Filter, Sort and the filter-operator semantics - add the SearchEngine port and result types (SearchResult/SearchHit/ResultDocument/Reference) - add physicalFields (the shared fanout convention) and schema selectors - rewrite projectDocument and projectGraph onto the unified model; projection output unchanged - remove FieldSpec, Projection and the discriminated FieldKind (breaking) --- .../0003-search-api-core-query-model.md | 112 +++++++-- packages/search/README.md | 235 ++++++++---------- packages/search/package.json | 2 +- packages/search/src/engine.ts | 140 +++++++++++ packages/search/src/index.ts | 50 +++- packages/search/src/project.ts | 204 ++++++--------- packages/search/src/query.ts | 95 +++++++ packages/search/src/schema.ts | 184 ++++++++++++++ packages/search/test/engine.test.ts | 110 ++++++++ packages/search/test/project.test.ts | 100 +++++--- packages/search/test/query.test.ts | 78 ++++++ packages/search/test/schema.test.ts | 209 ++++++++++++++++ 12 files changed, 1195 insertions(+), 324 deletions(-) create mode 100644 packages/search/src/engine.ts create mode 100644 packages/search/src/query.ts create mode 100644 packages/search/src/schema.ts create mode 100644 packages/search/test/engine.test.ts create mode 100644 packages/search/test/query.test.ts create mode 100644 packages/search/test/schema.test.ts diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 8189cda5..57521fad 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -32,7 +32,7 @@ Two tiers: `search-*` is backend you compose; 
`search-api-*` is the surface you | Tier | Package | Responsibility | | ----------- | ------------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| backend | `@lde/search` | field model · `SearchQuery` · filter semantics · adapter port | +| backend | `@lde/search` | field model · `SearchQuery` · filter semantics · engine port | | backend | `@lde/search-typesense` | engine adapter: collection schema · query/filter compiler · `search()` | | API surface | `@lde/search-api-graphql` | field model + `SearchQuery` → GraphQL schema (runtime configuration; see [ADR 4](./0004-search-api-graphql-surface.md)) | | API surface | `@lde/search-api-rest` | OpenAPI + route handlers (later, thin over the core) | @@ -46,12 +46,22 @@ The **API contract** (the SDL shape consumers couple to) is breaking to change a right in v1. The **IR / stored document** (framed JSON-LD vs a flat engine doc) lives behind the adapter and is swappable with no consumer impact. Nothing engine-specific (companion fields, `int32`, the engine query language) and nothing RDF-specific -(`@context`, `@id`, IRI-keyed predicates) leaks past the adapter port. +(`@context`, `@id`, IRI-keyed predicates) leaks past the engine port. ### Field model The engine-neutral description of a queryable field – the runtime form of one SHACL -NodeShape + its `search:` annotations: +NodeShape + its `search:` annotations. **One `SearchField` declaration drives four +consumers** – projection (RDF→flat document), the engine collection schema, the query +semantics, and the GraphQL surface – so they cannot drift. + +> Updated 2026-06-26 (during implementation): this is the **unified** field model. It +> folds the three previously separate declarations into one – the projection-side +> `FieldSpec`/`FieldKind` (RDF→doc), the deployment’s Typesense `SEARCH_FIELDS` (collection +> schema + weights), and the query model below. 
The original ADR deferred this unification; +> it is now adopted (option “c”). The `kind` + capability flags replace the old discriminated +> projection kinds, derived fields become first-class, and the Typesense-vocabulary types are +> _derived_ from `kind` rather than re-declared. ```ts type FieldKind = @@ -64,31 +74,44 @@ type FieldKind = | 'reference'; interface SearchField { - readonly name: string; // logical API name + readonly name: string; // logical API name; the physical fanout derives from it readonly kind: FieldKind; - readonly array?: boolean; - readonly localized?: boolean; + readonly path?: string; // sh:path to project from; omit for a derivation-populated field + readonly array?: boolean; // sh:maxCount + readonly localized?: boolean; // rdf:langString / sh:languageIn (text only) + readonly locales?: readonly string[]; // when localized: which languages to emit readonly output?: boolean; // appears in the schema output type - readonly searchable?: { weight: number }; // free-text inclusion + weight + readonly searchable?: { weight: number }; // free-text inclusion + weight (per-locale when localized) readonly filterable?: boolean; // usable in `where` readonly facetable?: boolean; readonly sortable?: boolean; - readonly nestedStrategy?: 'labelOnly' | 'idOnly' | 'inline'; // for `reference` + readonly ref?: { type: string; strategy: 'labelOnly' | 'idOnly' | 'inline' }; // kind: 'reference' + readonly transform?: (value: string) => string; // projection-time value transform readonly group?: { readonly name: string; readonly prefix: string }; // deployment delta } +type Derivation = (document: SearchDocument, node: FramedNode) => void; + interface SearchSchema { + readonly type: string; // sh:targetClass readonly fields: readonly SearchField[]; + readonly derivations?: readonly Derivation[]; // computed fields: status, *_group, booleans } ``` -Maps onto SHACL + `search:` (`kind`←`sh:datatype`, `array`←`sh:maxCount`, -`localized`←`sh:languageIn`, 
`facetable`←`search:facetable`, `sortable`←`search:sortable`, -`nestedStrategy`←`sh:node`/`sh:class` + `search:nestedStrategy`) so an eventual generator -emits it unchanged. The `group` companion (coarse grouped facets, e.g. `format_group`) and -the `status_rank` tie-break sort are **deployment-specific deltas**, never in `@lde/search`. -`relevance` is _not_ a delta: every full-text engine ranks by match score, so it is a -generic reserved sort the adapter understands. +Maps onto SHACL + `search:` (`kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, +`array`←`sh:maxCount`, `localized`←`sh:languageIn`, `facetable`←`search:facetable`, +`sortable`←`search:sortable`, `ref`←`sh:node`/`sh:class` + `search:nestedStrategy`) so an +eventual generator emits it unchanged. A field with **no `path`** is a derived field – +populated by a `Derivation` rather than projected from the IR – yet it still carries full +query/schema/output behavior, which is how the former separate projection `FieldSpec` is +subsumed. The physical field names a declaration fans out to (`${name}_search_${locale}`, +`${name}_sort_${locale}`, `${name}_search`, `${name}_group`) follow one convention owned by +`@lde/search`, so projection, collection schema and query compiler agree. The `group` +companion (coarse grouped facets, e.g. `format_group`) and the `status_rank` tie-break sort +are **deployment-specific deltas**, never in `@lde/search`. `relevance` is _not_ a delta: +every full-text engine ranks by match score, so it is a generic reserved sort the adapter +understands. ### `SearchQuery` – the neutral query IR @@ -150,22 +173,61 @@ matches Typesense’s native inclusive range, covers every DR case, additively r Grouped facets need no special shape – `group:`-prefixed tokens travel as ordinary `in` strings and the adapter splits/unions them. 
-### Adapter port and result +### Engine port and result + +The **port** is the interface the core defines; a concrete engine **adapter** +(`@lde/search-typesense`’s `TypesenseSearchEngine`) implements it. Naming the port for the +capability (`SearchEngine`), not the pattern piece, keeps `TypesenseSearchEngine implements +SearchEngine` readable. ```ts -interface SearchAdapter { - search(query: SearchQuery, schema: SearchSchema): Promise; +// FacetField / OutputField default to `string` (ergonomic) and a deployment narrows them +// to its schema’s facetable / output field names for typo-safe facet and document access +// (helpers FacetFieldsOf / OutputFieldsOf, or the EngineFor alias). +interface SearchEngine< + FacetField extends string = string, + OutputField extends string = string, +> { + search( + query: SearchQuery, + schema: SearchSchema, + ): Promise>; } -interface SearchResult { - readonly hits: readonly { id: string; document: SearchDocument }[]; +interface SearchResult< + FacetField extends string = string, + OutputField extends string = string, +> { + readonly hits: readonly SearchHit[]; readonly total: number; + // Keyed by facet field name; `Partial` because only the queried facets are present. + // A bucket’s `label` (a LocalizedValue) is the engine-resolved canonical data label, + // present only for reference (IRI-keyed) facets; absent for token/free-string facets, + // whose display the consumer owns (its own i18n, or the value itself). readonly facets: Readonly< - Record + Partial< + Record< + FacetField, + readonly { value: string; count: number; label?: LocalizedValue }[] + > + > >; } -type SearchDocument = Record; +// `id` (the stable document key, an IRI) stays out of the document: it is the hit’s +// identity, always present, a different contract from the optional logical field values, +// and maps straight onto the GraphQL output’s `id: String!`. 
+interface SearchHit { + readonly id: string; + readonly document: ResultDocument; +} + +// The logical result document. Named distinctly from the flat, fanned-out projection +// `SearchDocument` that lives index-side: this carries logical fields (language maps, +// references) ready for a surface to shape. +type ResultDocument = Readonly< + Partial> +>; type SearchValue = | string | number @@ -247,7 +309,9 @@ not enabled for DR v1, more relevant for B/C. - Deviations to reconcile into the platform draft: numbered pagination; sidecar labels; logical result doc (framed JSON-LD scoped to index-side); `min`/`max` filter ranges; the `@lde/search*` naming and a core package row. +- Adopted during implementation (2026-06-26): the **unified** field model – the projection + `FieldSpec` (RDF→doc) and the deployment’s Typesense `SEARCH_FIELDS` are folded into this + one `SearchField` (see the Field model note above). - Deferred: REST surface; framed-JSON-LD materialised view (nested storage, index-time label inlining, detail-page-on-index, terms-collection split); semantic/hybrid (vector) - search; unifying the projection `FieldSpec` (RDF→doc) with this `SearchField` - (query/output) into one field declaration. + search. diff --git a/packages/search/README.md b/packages/search/README.md index 5672881e..476170d9 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -1,170 +1,155 @@ # @lde/search -Engine-agnostic search projection for RDF-backed pipelines. **`projectGraph`** -streams the result of a SPARQL `CONSTRUCT` into flat search documents, with no -engine and no vocabulary baked in. Internally it does two things per subject of -a root type: frame its one-hop subgraph into a JSON-LD IR node, then project -that node into a flat document from a **declarative field spec**. +The **engine- and domain-agnostic core** for RDF-backed search. 
It bakes in no +search engine, no API protocol, and no domain vocabulary: you supply a +declarative `SearchSchema`, and engine adapters and API surfaces sit on the ports +defined here. The library never names your domain — the same core drives a +`Dataset`, `Person`, or `CreativeWork` search. + +It provides four things: + +- **the unified field model** — `SearchField` / `SearchSchema`: one declaration + per field that drives all four consumers below, so they cannot drift; +- **the neutral query IR** — `SearchQuery` / `Filter` / `Sort` + filter + semantics, the shared compiler target every API surface parses into; +- **the engine port** — `SearchEngine` and the logical result types + (`SearchResult` / `SearchHit` / `ResultDocument` / `Reference` / …); +- **a streaming projection** — `projectGraph`, RDF `CONSTRUCT` quads → flat + search documents. -An engine adapter (e.g. [`@lde/search-typesense`](../search-typesense)) then -writes those documents to a search backend. - -```ts -import { projectGraph, type Projection } from '@lde/search'; - -const projection: Projection = { - /* type + field spec — see below */ -}; - -for await (const document of projectGraph(quads, [projection])) { - // one flat search document per matching subject, streamed -} +``` +SearchSchema ─┬─► projection (projectGraph → flat documents) [here] + ├─► engine adapter (collection schema + query compiler) e.g. @lde/search-typesense + ├─► query semantics (SearchQuery, filter/sort/facet) [here] + └─► API surface (GraphQL / REST) e.g. @lde/search-api-graphql ``` -`projectGraph` is fully streaming: subjects are grouped and framed one at a time -and documents are yielded as they are produced, so beyond a subject index memory -stays flat at scale (framing the whole graph at once is roughly O(N²)). Duplicate -triples are collapsed first, because some SPARQL engines (e.g. QLever) do not -deduplicate `CONSTRUCT` output. 
The IR carries no `@context`, so a `derivation` -reading it sees full predicate IRIs with language tags preserved. +One field, four consumers — that is why the model is unified: a field’s `kind` +plus capability flags (`searchable` / `filterable` / `facetable` / `sortable` / +`output`) describe projection, the engine collection schema, the query semantics, +and the API output in a single place. -## Projection +## Field model -The mapping is data, not code. Each field declares the IR `path` to read and a -`kind`; the conventions (per-locale split, diacritic folding via -[`@lde/text-normalization`](../text-normalization), facet arrays, numeric -coercion) are applied for you. Computed fields are `derivations` — hooks that -read the node and set fields the kinds can't. +The mapping is data, not code. Each field declares its `kind`, the IR `path` to +read (omit it for a **derived** field, populated by a `derivation`), and the +capabilities it opts into. The physical field names a declaration fans out to +(per-locale search/sort keys, the grouped-facet companion) come from +`physicalFields`, the single convention projection, the collection schema and the +query compiler all share. 
```ts -import { projectGraph, irisOf, type Projection } from '@lde/search'; +import { projectGraph, irisOf, type SearchSchema } from '@lde/search'; -const projection: Projection = { +const DATASET = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ - // → title_nl, title_en, title_search_nl, title_search_en, title_sort_nl, title_sort_en + // → title_nl, title_en, title_search_nl/_en, title_sort_nl/_en { name: 'title', path: 'http://purl.org/dc/terms/title', - kind: { - type: 'langText', - locales: ['nl', 'en'], - display: true, - search: true, - sort: true, - }, + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, }, - // → publisher (IRI facet) + // → publisher (IRI facet, resolved to a labelled reference at the surface) { name: 'publisher', path: 'http://purl.org/dc/terms/publisher', - kind: { type: 'facet', iri: true }, + kind: 'reference', + facetable: true, + output: true, + ref: { type: 'Organization', strategy: 'labelOnly' }, }, // → size (int) - { name: 'size', path: 'urn:dr:size', kind: { type: 'number' } }, + { name: 'size', path: 'urn:dr:size', kind: 'integer', sortable: true }, + // derived field (no path): populated by the derivation below + { name: 'classCount', kind: 'integer', sortable: true }, ], derivations: [ (document, node) => { - document.class_count = irisOf(node, 'urn:dr:class').length; + document.classCount = irisOf(node, 'urn:dr:class').length; }, ], -}; +} as const satisfies SearchSchema; -for await (const document of projectGraph(quads, [projection])) { - // … +for await (const document of projectGraph(quads, [DATASET])) { + // one flat search document per matching subject, streamed } ``` -**Kinds** +Capturing the schema with `as const satisfies SearchSchema` keeps the field +literals, so the API surface can derive typed facet/output keys from it (see +`@lde/search-api-graphql`). 
+ +**Kinds** (`FieldKind`): `text`, `keyword`, `integer`, `number`, `boolean`, +`date`, `reference`. The Typesense/engine vocabulary and the GraphQL types are +_derived_ from the kind by the adapter and the surface — never declared here. -| kind | emits | -| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `langText` | per locale (see below), each opt-in: `_${locale}` display with `display`, `_search_${locale}` folded with `search`, `_sort_${locale}` folded with `sort` | -| `facet` | the field as a deduped array; `iri` reads `@id`; `search` adds a folded `_search`; `transform` rewrites values | -| `number` | a numeric scalar; `date` parses an ISO date-time to unix seconds | +| kind | `where` | facet | sort | output | +| -------------------- | -------------------- | ----- | ---------------- | ------------------------------- | +| `text` (`localized`) | – (feeds free text) | – | yes (per-locale) | best-first language list | +| `keyword` | `in` (membership) | yes | – | string / `string[]` | +| `reference` | `in` (membership) | yes | – | labelled reference (id + label) | +| `integer` / `number` | `range { min, max }` | yes | yes | number | +| `date` | `range` (inclusive) | yes | yes | ISO 8601 string (surface) | +| `boolean` | `is` | yes | – | boolean (absent = false) | + +## Projection + +`projectGraph` is fully streaming: subjects are grouped and framed one at a time +and documents are yielded as produced, so beyond a subject index memory stays +flat at scale (framing the whole graph at once is roughly O(N²)). Duplicate +triples are collapsed first, because some SPARQL engines (e.g. QLever) do not +deduplicate `CONSTRUCT` output. The IR carries no `@context`, so a `derivation` +reading it sees full predicate IRIs with language tags preserved. 
## Locales -`locales` is the **single** list of languages a `langText` field projects; -`display`, `search` and `sort` are independent opt-in families that each fan out +`locales` is the **single** list of languages a localized `text` field projects; +`output`, `searchable` and `sortable` are independent opt-ins that each fan out over it (so a field emits exactly what it opts into): -- `display` → `title_nl`/`title_en` (accents preserved); -- `search` → `title_search_nl`/`title_search_en` (folded; one field per locale - lets a query `query_by` them and rank the user’s language higher via - `query_by_weights`, and lets a language that needs a dedicated tokenizer set - its own `locale` in the schema); -- `sort` → `title_sort_nl`/`title_sort_en` (folded, so a locale-switching UI +- `output` → `title_nl`/`title_en` (accents preserved); +- `searchable` → `title_search_nl`/`title_search_en` (folded; one field per locale + lets a query `query_by` them and rank the user’s language higher, and lets a + language that needs a dedicated tokenizer set its own stemming `locale` in the + engine schema); +- `sortable` → `title_sort_nl`/`title_sort_en` (folded, so a locale-switching UI sorts on the active language). -A field with `search` but no `display` is **search-only** — folded and stemmed -for retrieval but never rendered (e.g. a `publisher` searched here but shown via -a separate label). +A field with `searchable` but no `output` is **search-only** — folded and stemmed +for retrieval but never rendered (e.g. a creator searched here but shown via a +separate label). **Only listed locales are indexed**; a literal whose language tag +is not in `locales` (or has no tag) is not projected at all. Per-locale fields are +**omitted, never empty**, when a document lacks that language, so declare them +optional in the engine schema and sort with `missing_values: last`. Folding the search fields is what lets diacritic-insensitive matching and stemming coexist. 
A search engine on its **default** locale typically folds case -and diacritics for you (Typesense v30, verified, even folds ø/æ/ß) — so there the -folding here is belt-and-suspenders. But enabling a language’s **stemming** -requires setting that language’s `locale` (e.g. `locale: 'nl'` + `stem: true` so -`huizen` matches `huis`), and a non-default locale switches the engine to ICU -tokenization, which **preserves** diacritics. At that point the engine no longer -folds them, and `fold()` is what keeps matching diacritic-insensitive. Stemming -is a per-field engine-schema choice (the consumer’s), and being rules-based it -can mangle proper nouns and place names — e.g. the Dutch stemmer reduces the city -`Bergen` to `berg`, colliding it with “mountain”. - -Recommended split: enable stemming on the **free-text** search fields -(`*_search_${locale}`, descriptions, keywords) where morphological recall helps -(`verhaal` ↔ `verhalen`), and keep **place names and other proper-noun facets on -a separate, unstemmed field** (facets are exact-match anyway). That captures the -recall without the `Bergen`/`berg` collision in the facet. A `stem_dictionary` -can pin specific names if you need stemmed free-text without given collisions. - -**Only listed locales are indexed.** A literal whose language tag is not in -`locales` is not projected at all — no display, no search, no sort field — so it -is invisible to the index. To index a language, add it to `locales`. - -Per-locale fields are **omitted, never empty**, when a document lacks that -language, so declare them `optional: true` in the engine schema. At query time, -sort with `missing_values: last` to push documents lacking the active locale to -the end, and `query_by` all the per-locale search fields (weighting the user’s -locale higher) to keep cross-language recall. - -A literal with no `@language` tag matches no locale, so it is not projected. Tag -your source literals (or pre-process them) for the languages you index. 
+and diacritics for you; enabling a language’s **stemming** switches it to ICU +tokenization, which **preserves** diacritics — at which point `fold()` (from +[`@lde/text-normalization`](../text-normalization)) is what keeps matching +diacritic-insensitive. Stemming is rules-based and can mangle proper nouns (the +Dutch stemmer reduces the city `Bergen` to `berg`), so enable it on free-text +fields and keep proper-noun facets on a separate, unstemmed field. ## Querying The search fields are stored already case- and diacritic-folded, so **the query -must be folded the same way** with the same `fold()` from -[`@lde/text-normalization`](../text-normalization) before it reaches the engine. -Otherwise index and query are normalized differently and matches silently miss -(the user sees no results, with no error). An engine on its default locale would -fold a raw query for you, but one set to a stemming locale (which preserves -diacritics) or a non-folding backend will not — so always fold, and matching -stays correct on any engine. - -```ts -import { fold } from '@lde/text-normalization'; - -await client - .collections(collection) - .documents() - .search({ - q: fold(userQuery), - query_by: 'title_search_nl,title_search_en', - query_by_weights: '2,1', // rank the user’s locale higher - }); -``` - -This contract holds for **any** consumer, including a search API built on top of -this package: index-time and query-time folding must use the same `fold()`, or -non-decomposing terms silently miss. - -## Why a spec - -The field spec's vocabulary mirrors SHACL on purpose: `path` is `sh:path`, and -the kind is derivable from `sh:datatype` / `sh:nodeKind` / `sh:maxCount` plus -search annotations. So the same projection engine that runs a hand-written spec -today will run a **SHACL-generated** spec tomorrow — the engine and the IR stay; -only spec-authoring gets automated. Nothing is thrown away. 
+must be folded the same way** with the same `fold()` before it reaches the engine, +or index and query normalize differently and matches silently miss. This contract +holds for **any** consumer, including an API built on this package — which is why +engine adapters and surfaces compile through the shared `SearchQuery` IR and the +`physicalFields` convention rather than re-deriving field names. + +## Why a declarative model + +The vocabulary mirrors SHACL on purpose: `path` is `sh:path`, `array` is +`sh:maxCount`, `required` is `sh:minCount`, `localized` is `sh:languageIn`, `ref` +is `sh:class`/`sh:node`. So the same core that runs a hand-written `SearchSchema` +today will run a **SHACL-generated** one tomorrow — the model, the ports and the +IR stay; only schema-authoring gets automated. diff --git a/packages/search/package.json b/packages/search/package.json index 61657f95..e81f647f 100644 --- a/packages/search/package.json +++ b/packages/search/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search", "version": "0.1.2", - "description": "Engine-agnostic search projection for RDF-backed pipelines: frame CONSTRUCT quads into a JSON-LD IR, then project that IR into flat search documents from a declarative field spec (the artifact a SHACL generator would emit)", + "description": "Engine- and domain-agnostic core for RDF-backed search: a unified declarative field model (SearchField/SearchSchema), a neutral query IR, the SearchEngine port with logical result types, and a streaming CONSTRUCT-to-document projection. 
Bakes in no engine, protocol, or domain.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search" diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts new file mode 100644 index 00000000..59284d7f --- /dev/null +++ b/packages/search/src/engine.ts @@ -0,0 +1,140 @@ +import type { SearchQuery } from './query.js'; +import type { SearchSchema } from './schema.js'; + +/** + * The engine port — the boundary a concrete engine adapter (e.g. + * `@lde/search-typesense`’s `TypesenseSearchEngine`) implements. The adapter + * owns every engine specific (companion-field expansion, `query_by`/weights, the + * filter compiler, `sort_by`, folding, `facet_by`) and returns only logical + * documents, so a deployment can swap engines without any consumer noticing. + * Nothing engine-specific and nothing RDF-specific leaks past this port. + * + * `FacetField` keys the returned facet map; it defaults to `string` so an engine + * stays ergonomic, and a deployment can narrow it to its own facet-field union + * (see {@link FacetFieldsOf}) for typo-safe facet access. + */ +export interface SearchEngine< + FacetField extends string = string, + OutputField extends string = string, +> { + search( + query: SearchQuery, + schema: SearchSchema, + ): Promise<SearchResult<FacetField, OutputField>>; +} + +/** What an engine returns: logical hits, a total, and the requested facets. */ +export interface SearchResult< + FacetField extends string = string, + OutputField extends string = string, +> { + readonly hits: readonly SearchHit<OutputField>[]; + readonly total: number; + readonly facets: FacetMap<FacetField>; +} + +/** + * Facet buckets keyed by facet field name. `Partial` because a result carries + * buckets only for the fields the query asked for, not every facetable field. + */ +export type FacetMap<FacetField extends string = string> = Readonly< + Partial<Record<FacetField, readonly FacetBucket[]>> +>; + +/** + * The facet-field-name union of a schema — the keys a {@link SearchResult}’s + * `facets` can hold. 
Requires the schema be captured as a literal type + * (`as const satisfies SearchSchema`), so the `facetable: true` flags survive as + * literals; a plain `: SearchSchema` annotation widens them and yields `never`. + */ +export type FacetFieldsOf<Schema extends SearchSchema> = Extract< + Schema['fields'][number], + { readonly facetable: true } +>['name']; + +/** + * The output-field-name union of a schema — the keys a {@link ResultDocument} + * can hold. Like {@link FacetFieldsOf}, requires the schema captured as a literal + * (`as const satisfies SearchSchema`). + */ +export type OutputFieldsOf<Schema extends SearchSchema> = Extract< + Schema['fields'][number], + { readonly output: true } +>['name']; + +/** A {@link SearchEngine} narrowed to one schema: facet keys and document keys + * fixed to that schema’s facetable / output field names. The schema must be + * captured as `as const satisfies SearchSchema`. */ +export type EngineFor<Schema extends SearchSchema> = SearchEngine< + FacetFieldsOf<Schema>, + OutputFieldsOf<Schema> +>; + +/** A {@link SearchResult} narrowed to one schema (see {@link EngineFor}). */ +export type ResultFor<Schema extends SearchSchema> = SearchResult< + FacetFieldsOf<Schema>, + OutputFieldsOf<Schema> +>; + +/** + * One result row. `id` (the stable document key, an IRI) is kept *out* of + * {@link ResultDocument}: it is always present and is the hit’s identity, a + * different contract from the optional, typed logical field values — and it maps + * straight onto the GraphQL output’s guaranteed `id: String!`. The document + * holds only the selectable fields. + */ +export interface SearchHit<OutputField extends string = string> { + readonly id: string; + readonly document: ResultDocument<OutputField>; +} + +/** + * The logical result document at the query seam — engine- and RDF-neutral. + * Distinct from the flat, fanned-out projection `SearchDocument` that lives + * index-side: this carries logical fields with language maps and references, + * ready for a surface to shape. Keyed by output field name; `Partial` because a + * document omits absent optional fields. 
`OutputField` defaults to `string`; a + * deployment narrows it via {@link OutputFieldsOf} for typo-safe field access. + */ +export type ResultDocument<OutputField extends string = string> = Readonly< + Partial<Record<OutputField, SearchValue>> +>; + +/** A logical field value. */ +export type SearchValue = + | string + | number + | boolean + | readonly string[] + | LocalizedValue + | Reference + | readonly Reference[]; + +/** + * A JSON-LD-style language map (`@container: @language`, `@set` arrays); the key + * `und` carries untagged (`@none`) values. The surface flattens it to a + * best-first `Accept-Language`-ordered list. + */ +export type LocalizedValue = Readonly<Record<string, readonly string[]>>; + +/** + * The generic internal carrier for a referenced entity. The GraphQL surface maps + * it to a named per-shape type (e.g. `Organization`, `Term`) with `label` + * exposed as `name`. + */ +export interface Reference { + readonly id: string; + readonly label?: LocalizedValue; +} + +/** + * One facet bucket: a value and how many documents carry it. `label` is the + * engine-resolved canonical **data** label, present only for reference facets + * (IRI-keyed); it is absent for facets whose value is a token or free string + * whose display the consumer owns (its own i18n, or the value itself). + */ +export interface FacetBucket { + readonly value: string; + readonly count: number; + readonly label?: LocalizedValue; +} diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index 10c2b32f..cb02290e 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -1,13 +1,47 @@ +// Projection: RDF CONSTRUCT quads → flat search documents, driven by the unified +// SearchField/SearchSchema model below (one declaration; the fanout names come +// from `physicalFields`). export { projectGraph, irisOf, literalsOf, firstLiteralOf } from './project.js'; +export type { SearchDocument } from './project.js'; + +// Unified field model: one declaration drives projection, engine collection +// schema, query semantics and the GraphQL surface. 
Plus the schema selectors and +// the physical field-name convention they all share. +export { + physicalFields, + searchableFields, + facetableFields, + filterableFields, + sortableFields, + outputFields, +} from './schema.js'; export type { - SearchDocument, - Projection, - FieldSpec, FieldKind, - LangTextKind, - FacetKind, - NumberKind, - DateKind, + SearchField, + SearchSchema, Derivation, -} from './project.js'; + PhysicalFields, +} from './schema.js'; + +// Engine- and protocol-neutral query IR + filter semantics. +export { filterOperatorFor, filterOperator, acceptsFilter } from './query.js'; +export type { SearchQuery, Filter, Sort, FilterOperator } from './query.js'; + +// Engine port + the logical result document returned across it. +export type { + SearchEngine, + SearchResult, + SearchHit, + ResultDocument, + SearchValue, + LocalizedValue, + Reference, + FacetBucket, + FacetMap, + FacetFieldsOf, + OutputFieldsOf, + EngineFor, + ResultFor, +} from './engine.js'; + export type { FramedNode } from './frame-by-type.js'; diff --git a/packages/search/src/project.ts b/packages/search/src/project.ts index c181978f..284c3183 100644 --- a/packages/search/src/project.ts +++ b/packages/search/src/project.ts @@ -1,135 +1,56 @@ import type { Quad } from '@rdfjs/types'; import { fold } from '@lde/text-normalization'; import { frameByType, type FramedNode } from './frame-by-type.js'; +import { + physicalFields, + type SearchField, + type SearchSchema, +} from './schema.js'; /** A flat search document. `id` is the engine document key. */ export type SearchDocument = { id: string } & Record; -/** - * How one framed-IR property projects into search fields. The vocabulary mirrors - * SHACL so a generator can later emit it from shapes + search annotations: - * `path` is `sh:path`, and the kind is derivable from `sh:datatype`/`sh:nodeKind` - * /`sh:maxCount` plus the search annotations. 
- */ -export type FieldKind = LangTextKind | FacetKind | NumberKind | DateKind; - -/** - * Language-tagged text, projected per locale. `locales` is the single source of - * truth for which languages this field emits; `display`, `search` and `sort` are - * three independent opt-in families that each fan out over it: - * - `display` → `${name}_${locale}` display label, accents preserved; - * - `search` → `${name}_search_${locale}` folded match field (one per locale so - * the engine can tokenize/stem each language and the query can rank the user’s - * locale higher); - * - `sort` → `${name}_sort_${locale}` folded sort key (one per locale so a - * locale-switching UI sorts on the active language). - * - * All three default off — a field emits exactly the families it opts into (e.g. - * `search` alone is a search-only field, shown via a separate label). Only listed - * locales are projected: a value whose language tag is not in `locales` (and is - * not mapped in by `untaggedLanguage`) is not indexed at all. - */ -export interface LangTextKind { - readonly type: 'langText'; - /** The languages to project; drives whichever of the families are enabled. */ - readonly locales: readonly string[]; - /** Emit the per-locale display labels `${name}_${locale}` (accents preserved). */ - readonly display?: boolean; - /** Emit a folded `${name}_search_${locale}` per locale (matchable). */ - readonly search?: boolean; - /** Emit a folded `${name}_sort_${locale}` per locale (sortable). */ - readonly sort?: boolean; -} - -/** A faceted multi-value field, optionally also folded for search. */ -export interface FacetKind { - readonly type: 'facet'; - /** Read IRI references (`@id`) rather than literal values. */ - readonly iri?: boolean; - /** Also emit a folded `${name}_search` array. */ - readonly search?: boolean; - /** Transform each value before faceting (e.g. strip a media-type prefix). */ - readonly transform?: (value: string) => string; -} - -/** A numeric scalar. 
*/ -export interface NumberKind { - readonly type: 'number'; -} - -/** An ISO date-time, parsed into Unix seconds. */ -export interface DateKind { - readonly type: 'date'; -} - -/** - * One field of a projection: an output `name`, the framed-IR predicate `path` to - * read (the SHACL `sh:path`), and the kind-specific config discriminated by - * `type`. - */ -export type FieldSpec = { - /** Output field base name; per-kind suffixes are appended. */ - readonly name: string; - /** Framed-IR predicate IRI to read (the SHACL `sh:path`). */ - readonly path: string; -} & FieldKind; - -/** A computed field that is not a direct projection of a single path - * (e.g. a status rank, or a group derived from a code table). */ -export type Derivation = (document: SearchDocument, node: FramedNode) => void; - -/** - * One root type’s complete projection — the runtime form of a single SHACL - * NodeShape: `type` is its `sh:targetClass` (and the framed node’s `@type`), - * `fields` are its property shapes, and `derivations` are its `sh:rule`-shaped - * computed fields. A generator emits one of these per NodeShape. - */ -export interface Projection { - readonly type: string; - readonly fields: readonly FieldSpec[]; - readonly derivations?: readonly Derivation[]; -} - /** * Project one framed JSON-LD node into a flat search document: apply each field - * spec, then run the derivations (which may read fields the specs already set). + * of the schema, then run the derivations (which may read fields the field specs + * already set). The physical field names a field fans out to come from + * {@link physicalFields}, the single source shared with the engine collection + * schema and the query compiler. 
*/ export function projectDocument( node: FramedNode, - projection: Projection, + schema: SearchSchema, ): SearchDocument { const id = node['@id']; if (typeof id !== 'string') { throw new Error( - `Cannot project a ${projection.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, + `Cannot project a ${schema.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, ); } const document: SearchDocument = { id }; - for (const field of projection.fields) { + for (const field of schema.fields) { applyField(document, node, field); } - for (const derive of projection.derivations ?? []) { + for (const derive of schema.derivations ?? []) { derive(document, node); } return document; } /** - * Frame `quads` for every projection’s root type and project each node with its - * type’s projection — the multi-shape pipeline. Streams one document at a time - * so memory stays flat. The IR maps to a projection by type, so adding a shape - * is adding a `Projection` (no engine change). + * Frame `quads` for every schema’s root type and project each node with its + * type’s schema — the multi-shape pipeline. Streams one document at a time so + * memory stays flat. The IR maps to a schema by type, so adding a shape is + * adding a `SearchSchema` (no engine change). 
*/ export async function* projectGraph( quads: readonly Quad[], - projections: readonly Projection[], + schemas: readonly SearchSchema[], ): AsyncIterable { - const byType = new Map( - projections.map((projection) => [projection.type, projection]), - ); - for (const projection of byType.values()) { - for await (const node of frameByType(quads, projection.type)) { - yield projectDocument(node, projection); + const byType = new Map(schemas.map((schema) => [schema.type, schema])); + for (const schema of byType.values()) { + for await (const node of frameByType(quads, schema.type)) { + yield projectDocument(node, schema); } } } @@ -137,77 +58,96 @@ export async function* projectGraph( function applyField( document: SearchDocument, node: FramedNode, - field: FieldSpec, + field: SearchField, ): void { - switch (field.type) { - case 'langText': - return applyLangText(document, langValuesOf(node, field.path), field); - case 'facet': - return applyFacet(document, node, field); - case 'number': + const path = field.path; + if (path === undefined) { + // A derived field — populated by a derivation, not projected from a path. + return; + } + switch (field.kind) { + case 'text': + return applyLocalizedText(document, langValuesOf(node, path), field); + case 'keyword': + return applyFacet(document, literalsOf(node, path), field); + case 'reference': + return applyFacet(document, irisOf(node, path), field); + case 'integer': return setNumber( document, field.name, - toInteger(firstLiteralOf(node, field.path)), + toInteger(firstLiteralOf(node, path)), ); case 'date': return setNumber( document, field.name, - isoToUnix(firstLiteralOf(node, field.path)), + isoToUnix(firstLiteralOf(node, path)), ); } + // `number` and `boolean` are not projected from a path in current schemas + // (booleans are derivation-populated, e.g. the compatibility vinkjes). } -function applyLangText( +/** + * Project a language-tagged text field per locale. 
Display shows one label + * (accents preserved) when the field is `output`; sort keys off that same + * primary value (folded) when `sortable`; search folds every value of the locale + * when `searchable`, so all are matchable. Absent locales emit nothing. + */ +function applyLocalizedText( document: SearchDocument, values: readonly LangValue[], - { name, locales, display, search, sort }: Extract, + field: SearchField, ): void { + const locales = field.locales ?? []; if (locales.length === 0) { throw new Error( - `langText field “${name}” must declare at least one locale; nothing would be projected otherwise.`, + `Localized text field “${field.name}” must declare at least one locale; nothing would be projected otherwise.`, ); } - for (const locale of locales) { + const names = physicalFields(field); + locales.forEach((locale, index) => { const localeValues = values .filter((value) => value.lang === locale) .map((value) => value.value); if (localeValues.length === 0) { - continue; + return; } - // Display shows one label (accents preserved); sort keys off that same - // primary value (folded); search folds every value of the locale so all - // are matchable. Absent locales emit nothing (the field stays optional). const [primary] = localeValues; - if (display) { - setString(document, `${name}_${locale}`, primary); + if (field.output) { + setString(document, names.display[index], primary); } - if (search) { + if (field.searchable) { setString( document, - `${name}_search_${locale}`, + names.search[index], fold(localeValues.join(' ')).trim(), ); } - if (sort) { - setString(document, `${name}_sort_${locale}`, fold(primary)); + if (field.sortable) { + setString(document, names.sort[index], fold(primary)); } - } + }); } +/** + * Project a faceted multi-value field: dedupe (after the optional transform), + * write the value field, and — when `searchable` — a folded `${name}_search` + * array. 
`keyword` reads literals; `reference` reads IRIs (the caller passes the + * already-read raw values). + */ function applyFacet( document: SearchDocument, - node: FramedNode, - { name, path, iri, search, transform }: Extract, + raw: readonly string[], + field: SearchField, ): void { - const raw = iri ? irisOf(node, path) : literalsOf(node, path); - const values = dedupe(transform ? raw.map(transform) : raw); - setArray(document, name, values); - if (search) { + const values = dedupe(field.transform ? raw.map(field.transform) : raw); + setArray(document, field.name, values); + if (field.searchable) { setArray( document, - `${name}_search`, + physicalFields(field).search[0], dedupe(values.map((value) => fold(value))), ); } diff --git a/packages/search/src/query.ts b/packages/search/src/query.ts new file mode 100644 index 00000000..d009ea75 --- /dev/null +++ b/packages/search/src/query.ts @@ -0,0 +1,95 @@ +import type { FieldKind, SearchField } from './schema.js'; + +/** + * The engine- and protocol-neutral query IR. Every API surface parses its input + * into this; the engine adapter consumes it. It is the shared compiler target + * that keeps the GraphQL surface, a later REST surface and the adapter from + * drifting. + */ +export interface SearchQuery { + /** Free-text query; `undefined`/`''` means browse (no text ranking). */ + readonly text?: string; + /** AND across fields. */ + readonly where: readonly Filter[]; + /** Primary public sort plus any server tie-breaks, in precedence order. */ + readonly orderBy: readonly Sort[]; + /** Numbered pagination. */ + readonly limit: number; + readonly offset: number; + /** Logical field names to return facet buckets for. */ + readonly facets: readonly string[]; + /** Selects the per-locale fields to query/sort on (from `Accept-Language`). */ + readonly locale: string; +} + +/** + * One `where` clause. 
The operator is fixed by the target field’s {@link FieldKind} + * ({@link filterOperatorFor}): keyword/reference use `in` (OR within the field), + * the numeric/date kinds use an inclusive `range`, boolean uses `is`. Bounds are + * inclusive only — no `gt`/`gte`/`lt`/`lte`. + */ +export type Filter = + | { readonly field: string; readonly in: readonly string[] } + | { + readonly field: string; + readonly range: { + readonly min?: number | string; + readonly max?: number | string; + }; + } + | { readonly field: string; readonly is: boolean }; + +/** A single sort dimension. */ +export interface Sort { + readonly field: string; + readonly direction: 'asc' | 'desc'; +} + +/** The `where` operator a kind accepts, or `undefined` when it is not filterable + * through `where` (`text` feeds the free-text `query` instead). */ +export type FilterOperator = 'in' | 'range' | 'is'; + +const OPERATOR_BY_KIND: Readonly< + Record +> = { + text: undefined, + keyword: 'in', + reference: 'in', + integer: 'range', + number: 'range', + date: 'range', + boolean: 'is', +}; + +/** + * The `where` operator a field of this kind accepts (per the ADR filter-semantics + * table), or `undefined` for `text` — which feeds the free-text `query` rather + * than `where`. Drives both the surface’s `where` input type and the adapter’s + * filter compiler from one rule. + */ +export function filterOperatorFor(kind: FieldKind): FilterOperator | undefined { + return OPERATOR_BY_KIND[kind]; +} + +/** The operator a concrete {@link Filter} carries, from its shape. */ +export function filterOperator(filter: Filter): FilterOperator { + if ('in' in filter) { + return 'in'; + } + if ('range' in filter) { + return 'range'; + } + return 'is'; +} + +/** + * Whether `field` can be filtered by `filter`: the field must be `filterable` + * and the filter’s shape must be the operator its kind accepts. Surfaces use it + * to reject malformed `where` input before it reaches the adapter. 
+ */ +export function acceptsFilter(field: SearchField, filter: Filter): boolean { + return ( + field.filterable === true && + filterOperator(filter) === filterOperatorFor(field.kind) + ); +} diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts new file mode 100644 index 00000000..2873d99c --- /dev/null +++ b/packages/search/src/schema.ts @@ -0,0 +1,184 @@ +import type { FramedNode } from './frame-by-type.js'; +import type { SearchDocument } from './project.js'; + +/** + * The engine-neutral kind of a queryable field — the runtime form of one SHACL + * property shape’s datatype/nodeKind. It drives every downstream behavior: + * which physical fields the projection emits, the engine collection-schema + * type, the `where`/facet/sort semantics, and the GraphQL output/input type. + * The Typesense-vocabulary types (`string`, `int32`, …) are *derived* from this + * by the engine adapter, never declared here. + */ +export type FieldKind = + | 'text' + | 'keyword' + | 'integer' + | 'number' + | 'boolean' + | 'date' + | 'reference'; + +/** + * One queryable field — the single declarative source that drives all four + * consumers (projection, engine collection schema, query semantics, and the + * GraphQL surface). The vocabulary mirrors SHACL + the `search:` annotations so + * a generator can later emit it unchanged from shapes: + * `kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, `array`←`sh:maxCount`, + * `localized`←`rdf:langString`/`sh:languageIn`, `ref`←`sh:node`/`sh:class`. + * + * Capability flags (`searchable`/`filterable`/`facetable`/`sortable`/`output`) + * are independent opt-ins: a field exposes exactly the roles it declares. A + * field with no `path` is a **derived field** — populated by a + * {@link Derivation} rather than projected from the IR — yet it still carries + * full query/schema/output behavior (e.g. `status`, the `*_group` companions, + * the compatibility booleans). 
+ * + * The physical field names a declaration fans out to (per-locale search/sort + * keys, the grouped-facet companion, …) follow one convention, owned by + * {@link physicalFields} so projection, collection-schema and query compiler + * cannot disagree. + */ +export interface SearchField { + /** Logical API name; the physical fanout derives from it. Declare camelCase + * where it surfaces in GraphQL. */ + readonly name: string; + readonly kind: FieldKind; + /** Framed-IR predicate IRI to project from (the SHACL `sh:path`). Omit for a + * derivation-populated field. */ + readonly path?: string; + /** Multi-valued (`sh:maxCount > 1`). */ + readonly array?: boolean; + /** Always present (`sh:minCount ≥ 1`): a non-null scalar in the API output and + * a non-optional field in the engine index. Moot for arrays/booleans/`id`, + * which are non-null regardless. */ + readonly required?: boolean; + /** Language-tagged text (`rdf:langString`); projected per locale. `text` only. */ + readonly localized?: boolean; + /** When `localized`, the languages to emit (the per-locale fanout). */ + readonly locales?: readonly string[]; + /** Appears in the API output type / carries a display label. */ + readonly output?: boolean; + /** Full-text inclusion with a `query_by` weight (folded; per-locale when + * `localized`). Presence is what makes a field searchable. */ + readonly searchable?: { readonly weight: number }; + /** Usable in `where`. */ + readonly filterable?: boolean; + /** Returned as facet buckets. */ + readonly facetable?: boolean; + /** Publicly selectable in `orderBy`; localized text also emits a folded sort key. */ + readonly sortable?: boolean; + /** For `kind: 'reference'`: the referenced shape and how much of it to carry. */ + readonly ref?: { + readonly type: string; + readonly strategy: 'labelOnly' | 'idOnly' | 'inline'; + }; + /** Projection-time value transform (e.g. strip a media-type prefix). 
*/ + readonly transform?: (value: string) => string; + /** Grouped-facet companion (a coarse `${name}_group`; deployment delta). */ + readonly group?: { readonly name: string; readonly prefix: string }; +} + +/** + * A computed field that is not a direct projection of a single path — a status + * rank, a `*_group` derived from a code table, a compatibility boolean. Reads + * the framed node and writes onto the flat document the field specs already + * populated. + */ +export type Derivation = (document: SearchDocument, node: FramedNode) => void; + +/** + * One root type’s complete search declaration — the runtime form of a single + * SHACL NodeShape: `type` is its `sh:targetClass`, `fields` are its property + * shapes (and derived fields), `derivations` are its `sh:rule`-shaped computed + * fields. A generator emits one of these per NodeShape. + */ +export interface SearchSchema { + readonly type: string; + readonly fields: readonly SearchField[]; + readonly derivations?: readonly Derivation[]; +} + +/** + * The physical engine fields one {@link SearchField} fans out into, grouped by + * the role each plays. The single source of truth for the naming convention, so + * the projection (writes them), the collection schema (declares them) and the + * query compiler (reads them) cannot disagree. + */ +export interface PhysicalFields { + /** The lone stored field for a non-localized kind — faceted, filtered, sorted + * and output directly. Absent for localized text (its value lives per locale). */ + readonly value?: string; + /** Per-locale output labels `${name}_${locale}` (localized text, `output`). */ + readonly display: readonly string[]; + /** Folded match fields: `${name}_search_${locale}` per locale (localized) or a + * single `${name}_search` (non-localized), when `searchable`. */ + readonly search: readonly string[]; + /** Per-locale folded sort keys `${name}_sort_${locale}` (localized text, + * `sortable`); a non-localized field sorts on its `value`. 
*/ + readonly sort: readonly string[]; + /** The grouped-facet companion `${name}_group`, when `group` is declared. */ + readonly group?: string; +} + +/** + * Full-text searchable fields, highest `query_by` weight first — the order the + * engine adapter weights `query_by` in. A field is searchable iff it carries a + * `searchable` weight. + */ +export function searchableFields( + schema: SearchSchema, +): readonly (SearchField & { + readonly searchable: { readonly weight: number }; +})[] { + return schema.fields + .filter( + (field): field is SearchField & { searchable: { weight: number } } => + field.searchable !== undefined, + ) + .sort((left, right) => right.searchable.weight - left.searchable.weight); +} + +/** Fields returned as facet buckets, in declaration order. */ +export function facetableFields(schema: SearchSchema): readonly SearchField[] { + return schema.fields.filter((field) => field.facetable === true); +} + +/** Fields usable in `where`, in declaration order. */ +export function filterableFields(schema: SearchSchema): readonly SearchField[] { + return schema.fields.filter((field) => field.filterable === true); +} + +/** Fields publicly selectable in `orderBy`, in declaration order. */ +export function sortableFields(schema: SearchSchema): readonly SearchField[] { + return schema.fields.filter((field) => field.sortable === true); +} + +/** Fields that appear in the API output type, in declaration order. */ +export function outputFields(schema: SearchSchema): readonly SearchField[] { + return schema.fields.filter((field) => field.output === true); +} + +/** Derive the physical engine field names a declaration produces. */ +export function physicalFields(field: SearchField): PhysicalFields { + const localized = field.kind === 'text' && field.localized === true; + const locales = localized ? (field.locales ?? 
[]) : []; + return { + // Localized text has no single value field — its values live in the + // per-locale fields; every other kind stores into one `${name}` field. + value: localized ? undefined : field.name, + display: + localized && field.output + ? locales.map((locale) => `${field.name}_${locale}`) + : [], + search: field.searchable + ? localized + ? locales.map((locale) => `${field.name}_search_${locale}`) + : [`${field.name}_search`] + : [], + sort: + localized && field.sortable + ? locales.map((locale) => `${field.name}_sort_${locale}`) + : [], + group: field.group ? `${field.name}_group` : undefined, + }; +} diff --git a/packages/search/test/engine.test.ts b/packages/search/test/engine.test.ts new file mode 100644 index 00000000..54ad819d --- /dev/null +++ b/packages/search/test/engine.test.ts @@ -0,0 +1,110 @@ +import { describe, expect, it } from 'vitest'; +import type { EngineFor, SearchEngine, SearchResult } from '../src/engine.js'; +import type { SearchQuery } from '../src/query.js'; +import type { SearchSchema } from '../src/schema.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [{ name: 'title', kind: 'text', localized: true, locales: ['nl'] }], +}; + +// A fake engine: the port is implementable and the result types compose into a +// logical document (language map + reference) the way a real engine returns. +const fake: SearchEngine = { + async search(query: SearchQuery): Promise { + return { + total: 1, + hits: [ + { + id: 'https://example/dataset/1', + document: { + title: { nl: ['Erfgoed'], und: [query.text ?? 
''] }, + publisher: { + id: 'https://example/org/1', + label: { nl: ['Archief'] }, + }, + keyword: ['kaarten', 'atlas'], + }, + }, + ], + facets: { keyword: [{ value: 'kaarten', count: 3 }] }, + }; + }, +}; + +describe('SearchEngine port', () => { + it('returns logical hits, total and facets through the port', async () => { + const query: SearchQuery = { + text: 'kaart', + where: [], + orderBy: [{ field: 'relevance', direction: 'desc' }], + limit: 20, + offset: 0, + facets: ['keyword'], + locale: 'nl', + }; + + const result = await fake.search(query, schema); + + expect(result.total).toBe(1); + expect(result.hits[0].id).toBe('https://example/dataset/1'); + expect(result.hits[0].document.title).toEqual({ + nl: ['Erfgoed'], + und: ['kaart'], + }); + expect(result.facets.keyword).toEqual([{ value: 'kaarten', count: 3 }]); + }); +}); + +describe('typed facet and document keys', () => { + it('keys facets and the result document by the schema’s field names', async () => { + // Captured as a literal (`as const satisfies`) so the `facetable`/`output` + // flags survive and the `…Of` helpers can read the field names off the type. + const datasetSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + }, + { name: 'format', kind: 'keyword', array: true, facetable: true }, + { name: 'status', kind: 'keyword', facetable: true }, + ], + } as const satisfies SearchSchema; + + // facets ⊂ { format, status }, document keys ⊂ { title }. These object + // literals would not compile if the helpers widened to `string`/`never`. 
+ const engine: EngineFor = { + async search() { + return { + total: 1, + hits: [ + { + id: 'https://example/d/1', + document: { title: { nl: ['Titel'] } }, + }, + ], + facets: { format: [{ value: 'text/turtle', count: 2 }] }, + }; + }, + }; + + const result = await engine.search( + { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: ['format'], + locale: 'nl', + }, + datasetSchema, + ); + + expect(result.facets.format).toEqual([{ value: 'text/turtle', count: 2 }]); + expect(result.hits[0].document.title).toEqual({ nl: ['Titel'] }); + }); +}); diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index 60c42f71..8f513baa 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -5,11 +5,9 @@ import { projectDocument, projectGraph, irisOf, - type FieldSpec, - type Derivation, - type Projection, type SearchDocument, } from '../src/project.js'; +import type { SearchField, SearchSchema, Derivation } from '../src/schema.js'; const DR = 'urn:dr:'; const IANA = 'https://www.iana.org/assignments/media-types/'; @@ -30,49 +28,50 @@ const node = { [`${DR}size`]: { '@type': xsd.integer.value, '@value': '1234' }, }; -const fields: FieldSpec[] = [ +const fields: SearchField[] = [ { name: 'title', path: dcterms.title.value, - type: 'langText', + kind: 'text', + localized: true, locales: ['nl', 'en'], - display: true, - search: true, - sort: true, + output: true, + searchable: { weight: 1 }, + sortable: true, }, { name: 'publisher', path: `${DR}publisherName`, - type: 'langText', + kind: 'text', + localized: true, locales: ['nl', 'en'], - display: true, - search: true, + output: true, + searchable: { weight: 1 }, }, { name: 'publisher', path: dcterms.publisher.value, - type: 'facet', - iri: true, + kind: 'reference', }, { name: 'keyword', path: dcat.keyword.value, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, }, { name: 'format', path: `${DR}format`, - type: 
'facet', + kind: 'keyword', transform: (value) => value.replace(IANA, ''), }, - { name: 'class', path: `${DR}class`, type: 'facet', iri: true }, + { name: 'class', path: `${DR}class`, kind: 'reference' }, { name: 'date_posted', path: `${DR}datePosted`, - type: 'date', + kind: 'date', }, - { name: 'size', path: `${DR}size`, type: 'number' }, + { name: 'size', path: `${DR}size`, kind: 'integer' }, ]; const derivations: Derivation[] = [ @@ -81,11 +80,11 @@ const derivations: Derivation[] = [ }, ]; -const projection: Projection = { type: DATASET, fields, derivations }; +const schema: SearchSchema = { type: DATASET, fields, derivations }; describe('projectDocument', () => { it('projects every field kind and runs derivations', () => { - const document = projectDocument(node, projection); + const document = projectDocument(node, schema); expect(document.id).toBe('https://ex/d/1'); expect(document.title_nl).toBe('Titel'); @@ -121,23 +120,22 @@ describe('projectDocument', () => { { type: DATASET, fields: [ - { name: 'size', path: `${DR}size`, type: 'number' }, + { name: 'size', path: `${DR}size`, kind: 'integer' }, { name: 'language', path: dcterms.language.value, - type: 'facet', + kind: 'keyword', }, { name: 'keyword', path: dcat.keyword.value, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, }, { name: 'class', path: `${DR}class`, - type: 'facet', - iri: true, + kind: 'reference', }, ], }, @@ -157,8 +155,8 @@ describe('projectDocument', () => { { name: 'format', path: `${DR}format`, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, transform: (value) => value.replace(IANA, ''), }, ], @@ -232,10 +230,11 @@ describe('projectDocument', () => { { name: 'title', path: dcterms.title.value, - // search only — display and sort not opted into. - type: 'langText', + // search only — display (output) and sort not opted into. 
+ kind: 'text', + localized: true, locales: ['nl', 'en'], - search: true, + searchable: { weight: 1 }, }, ], }, @@ -262,6 +261,38 @@ describe('projectDocument', () => { expect(document.title_search_nl).toBe('titel ondertitel'); }); + it('skips a field with no path, leaving it to a derivation (derived field)', () => { + const document = projectDocument( + { + '@id': 'https://ex/d/11', + [dcterms.title.value]: { '@language': 'nl', '@value': 'Titel' }, + }, + { + type: DATASET, + fields: [ + { + name: 'title', + path: dcterms.title.value, + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + }, + // No `path`: a derived field — its value comes from a derivation, + // never from projection. + { name: 'status', kind: 'keyword', facetable: true }, + ], + derivations: [ + (derived) => { + derived.status = 'valid'; + }, + ], + }, + ); + expect(document.title_nl).toBe('Titel'); + expect(document.status).toBe('valid'); + }); + it('throws when the framed node has no @id', () => { expect(() => projectDocument( @@ -284,7 +315,8 @@ describe('projectDocument', () => { { name: 'title', path: dcterms.title.value, - type: 'langText', + kind: 'text', + localized: true, locales: [], }, ], @@ -295,7 +327,7 @@ describe('projectDocument', () => { }); describe('projectGraph', () => { - it('frames each projection’s type and projects matching nodes', async () => { + it('frames each schema’s type and projects matching nodes', async () => { const quads = new Parser({ format: 'N-Triples' }).parse(` <${rdf.type.value}> <${DATASET}> . <${dcterms.title.value}> "Titel"@nl . 
diff --git a/packages/search/test/query.test.ts b/packages/search/test/query.test.ts new file mode 100644 index 00000000..b82042f5 --- /dev/null +++ b/packages/search/test/query.test.ts @@ -0,0 +1,78 @@ +import { describe, expect, it } from 'vitest'; +import { acceptsFilter, filterOperatorFor } from '../src/query.js'; +import type { SearchField } from '../src/schema.js'; + +const keyword: SearchField = { + name: 'format', + kind: 'keyword', + array: true, + filterable: true, +}; +const datePosted: SearchField = { + name: 'datePosted', + kind: 'date', + filterable: true, +}; +const status: SearchField = { + name: 'status', + kind: 'keyword', + facetable: true, +}; +const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl'], + filterable: true, +}; + +describe('filterOperatorFor', () => { + it('maps each field kind to its `where` operator', () => { + expect(filterOperatorFor('text')).toBeUndefined(); + expect(filterOperatorFor('keyword')).toBe('in'); + expect(filterOperatorFor('reference')).toBe('in'); + expect(filterOperatorFor('integer')).toBe('range'); + expect(filterOperatorFor('number')).toBe('range'); + expect(filterOperatorFor('date')).toBe('range'); + expect(filterOperatorFor('boolean')).toBe('is'); + }); +}); + +describe('acceptsFilter', () => { + it('accepts a filter whose shape matches the field’s operator', () => { + expect( + acceptsFilter(keyword, { field: 'format', in: ['text/turtle'] }), + ).toBe(true); + expect( + acceptsFilter(datePosted, { + field: 'datePosted', + range: { min: '2024' }, + }), + ).toBe(true); + }); + + it('rejects a filter whose shape does not match the field’s operator', () => { + expect(acceptsFilter(keyword, { field: 'format', range: { min: 1 } })).toBe( + false, + ); + }); + + it('rejects a filter on a non-filterable field', () => { + expect(acceptsFilter(status, { field: 'status', in: ['valid'] })).toBe( + false, + ); + }); + + it('rejects any filter on a text field (it feeds the 
free-text query)', () => { + expect(acceptsFilter(title, { field: 'title', in: ['x'] })).toBe(false); + }); + + it('accepts an `is` filter on a filterable boolean field', () => { + const iiif: SearchField = { + name: 'iiif', + kind: 'boolean', + filterable: true, + }; + expect(acceptsFilter(iiif, { field: 'iiif', is: true })).toBe(true); + }); +}); diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts new file mode 100644 index 00000000..bd52d449 --- /dev/null +++ b/packages/search/test/schema.test.ts @@ -0,0 +1,209 @@ +import { describe, expect, it } from 'vitest'; +import { + facetableFields, + filterableFields, + outputFields, + physicalFields, + searchableFields, + sortableFields, + type SearchField, + type SearchSchema, +} from '../src/schema.js'; + +const DATASET = 'http://www.w3.org/ns/dcat#Dataset'; + +const schema: SearchSchema = { + type: DATASET, + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'description', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 2 }, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + }, + { + name: 'datePosted', + kind: 'date', + output: true, + filterable: true, + sortable: true, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +describe('physicalFields', () => { + it('fans a localized text field out into per-locale display, search and sort keys', () => { + const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }; + + expect(physicalFields(title)).toEqual({ + 
display: ['title_nl', 'title_en'], + search: ['title_search_nl', 'title_search_en'], + sort: ['title_sort_nl', 'title_sort_en'], + }); + }); + + it('gives a searchable keyword facet one value field and one folded search field', () => { + const keyword: SearchField = { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }; + + expect(physicalFields(keyword)).toEqual({ + value: 'keyword', + display: [], + search: ['keyword_search'], + sort: [], + }); + }); + + it('adds the `${name}_group` companion when a field declares a group', () => { + const format: SearchField = { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + group: { + name: 'format_group', + prefix: 'https://www.iana.org/assignments/media-types/', + }, + }; + + expect(physicalFields(format)).toEqual({ + value: 'format', + display: [], + search: [], + sort: [], + group: 'format_group', + }); + }); + + it('emits only the search keys for a search-only localized field (no display, no sort)', () => { + const creator: SearchField = { + name: 'creator', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + searchable: { weight: 2 }, + }; + + expect(physicalFields(creator)).toEqual({ + display: [], + search: ['creator_search_nl', 'creator_search_en'], + sort: [], + }); + }); + + it('emits no per-locale fields when a localized field declares no locales', () => { + const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + output: true, + searchable: { weight: 5 }, + sortable: true, + }; + + expect(physicalFields(title)).toEqual({ + display: [], + search: [], + sort: [], + }); + }); + + it('stores a reference field in one value field', () => { + const publisher: SearchField = { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }; + + 
expect(physicalFields(publisher)).toEqual({ + value: 'publisher', + display: [], + search: [], + sort: [], + }); + }); +}); + +describe('schema selectors', () => { + it('orders searchable fields by descending weight', () => { + expect(searchableFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'description', + 'keyword', + ]); + }); + + it('selects facetable, filterable, sortable and output fields by capability', () => { + expect(facetableFields(schema).map((field) => field.name)).toEqual([ + 'keyword', + 'format', + 'status', + ]); + expect(filterableFields(schema).map((field) => field.name)).toEqual([ + 'keyword', + 'format', + 'datePosted', + 'status', + ]); + expect(sortableFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'datePosted', + ]); + expect(outputFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'description', + 'datePosted', + 'status', + ]); + }); +}); From 11df992f169649e1d50abbf562407ceb1db549e8 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Sun, 28 Jun 2026 20:22:54 +0200 Subject: [PATCH 02/35] feat(search-typesense): add collection-schema builder, query compiler and SearchEngine - buildCollectionSchema derives a Typesense collection from the unified SearchField model - buildSearchParams compiles SearchQuery into Typesense params (filter_by/sort_by/facet_by/query_by) - createTypesenseSearchEngine implements the SearchEngine port: compile, search, reconstruct - resolve reference and reference-facet labels from the sidecar labels collection in one lookup - add a testcontainer integration test and a generator-stability snapshot --- packages/search-typesense/README.md | 30 +- packages/search-typesense/package.json | 4 +- .../search-typesense/src/collection-schema.ts | 144 ++++++++++ packages/search-typesense/src/index.ts | 8 + .../search-typesense/src/query-compiler.ts | 202 +++++++++++++ packages/search-typesense/src/search.ts | 265 ++++++++++++++++++ .../generator-stability.test.ts.snap | 120 
++++++++ .../test/collection-schema.test.ts | 202 +++++++++++++ .../test/generator-stability.test.ts | 66 +++++ .../test/parse-response.test.ts | 145 ++++++++++ .../test/query-compiler.test.ts | 156 +++++++++++ .../test/search-engine.test.ts | 226 +++++++++++++++ packages/search-typesense/tsconfig.lib.json | 5 +- 13 files changed, 1563 insertions(+), 10 deletions(-) create mode 100644 packages/search-typesense/src/collection-schema.ts create mode 100644 packages/search-typesense/src/query-compiler.ts create mode 100644 packages/search-typesense/src/search.ts create mode 100644 packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap create mode 100644 packages/search-typesense/test/collection-schema.test.ts create mode 100644 packages/search-typesense/test/generator-stability.test.ts create mode 100644 packages/search-typesense/test/parse-response.test.ts create mode 100644 packages/search-typesense/test/query-compiler.test.ts create mode 100644 packages/search-typesense/test/search-engine.test.ts diff --git a/packages/search-typesense/README.md b/packages/search-typesense/README.md index b5d62bb9..ea681cae 100644 --- a/packages/search-typesense/README.md +++ b/packages/search-typesense/README.md @@ -1,13 +1,27 @@ # @lde/search-typesense -[Typesense](https://typesense.org/) engine adapter for RDF-backed search -pipelines. Engine-specific (Typesense) but domain-agnostic – the caller supplies -the collection schema and documents. - -The engine-agnostic half of the pipeline – framing `CONSTRUCT` quads into a -JSON-LD IR and projecting that IR into flat documents from a declarative field -spec – lives in [`@lde/search`](../search). This package consumes those -documents and writes them to Typesense. +[Typesense](https://typesense.org/) engine adapter for the engine- and +domain-agnostic [`@lde/search`](../search) core. **Engine-specific (Typesense) but +domain-agnostic** – you supply a `SearchSchema`; this package never names your +domain. 
It is the Typesense implementation of the `SearchEngine` port: it derives +a collection schema from the field model, compiles the neutral `SearchQuery` into +Typesense search params, runs it, reconstructs the engine-neutral `SearchResult`, +and manages the index lifecycle (blue/green rebuild). + +## Collection schema and engine + +`buildCollectionSchema(schema, { name, defaultSortingField, … })` derives a +Typesense collection from the unified `SearchField` model — the Typesense field +type comes from each field’s `kind`, and the physical fanout (per-locale +search/sort keys, the `_group` companion) matches what the projection writes, via +`@lde/search`’s `physicalFields`, so the index and the documents cannot drift. + +`createTypesenseSearchEngine(client, { collection, labelsCollection })` is the +`SearchEngine` implementation: it compiles the query, runs the search, resolves +reference (and reference-facet) labels from the sidecar `labels` collection in a +single lookup, and reconstructs the logical `SearchResult` — language maps, +labelled references, labelled facet buckets. The pure halves `buildSearchParams` +and `parseSearchResponse` are exported for direct use and testing. ## Indexing diff --git a/packages/search-typesense/package.json b/packages/search-typesense/package.json index b1dde852..445624fb 100644 --- a/packages/search-typesense/package.json +++ b/packages/search-typesense/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search-typesense", "version": "0.1.1", - "description": "Generic Typesense engine adapter for RDF-backed search pipelines: collection lifecycle, bulk upsert and blue/green alias swap", + "description": "Typesense implementation of the @lde/search SearchEngine port: collection-schema builder, query compiler, label-resolving result reconstruction, and blue/green index lifecycle. 
Engine-specific (Typesense) but domain-agnostic.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search-typesense" @@ -25,6 +25,8 @@ "!**/*.tsbuildinfo" ], "dependencies": { + "@lde/search": "^0.1.2", + "@lde/text-normalization": "^0.1.1", "tslib": "^2.3.0", "typesense": "^3.0.6" }, diff --git a/packages/search-typesense/src/collection-schema.ts b/packages/search-typesense/src/collection-schema.ts new file mode 100644 index 00000000..5141f634 --- /dev/null +++ b/packages/search-typesense/src/collection-schema.ts @@ -0,0 +1,144 @@ +import type { CollectionCreateSchema } from 'typesense'; +import type { CollectionFieldSchema } from 'typesense/lib/Typesense/Collection.js'; +import { + physicalFields, + type SearchField, + type SearchSchema, +} from '@lde/search'; + +/** Deployment-specific options the generic field model does not carry. */ +export interface CollectionSchemaOptions { + /** The Typesense collection (or alias) name. */ + readonly name: string; + /** Snowball stemming locale for non-localized searchable fields (default `nl`). + * Localized text search fields stem in their own locale. */ + readonly defaultLocale?: string; + /** The field Typesense sorts by when a query imposes no order. */ + readonly defaultSortingField?: string; + /** Synonym sets the collection references (synced separately). */ + readonly synonymSets?: readonly string[]; +} + +/** + * Build a Typesense collection schema from the unified {@link SearchSchema}, so + * the index and the projection are driven by one declarative source and cannot + * drift. Each field fans out into the same physical fields the projection writes + * ({@link physicalFields}); the Typesense field type is derived from the field + * `kind`, never re-declared. + * + * Stemming is enabled on every folded `*_search` field: localized text stems + * each `*_search_${locale}` in its own language, and a non-localized searchable + * field stems in `defaultLocale`. 
+ */ +export function buildCollectionSchema( + schema: SearchSchema, + options: CollectionSchemaOptions, +): CollectionCreateSchema { + const defaultLocale = options.defaultLocale ?? 'nl'; + const collection: CollectionCreateSchema = { + name: options.name, + fields: schema.fields.flatMap((field) => + typesenseFields(field, defaultLocale, options.defaultSortingField), + ), + }; + if (options.defaultSortingField !== undefined) { + collection.default_sorting_field = options.defaultSortingField; + } + if (options.synonymSets !== undefined) { + collection.synonym_sets = [...options.synonymSets]; + } + return collection; +} + +/** The physical Typesense fields one declaration produces. */ +function typesenseFields( + field: SearchField, + defaultLocale: string, + defaultSortingField: string | undefined, +): CollectionFieldSchema[] { + const names = physicalFields(field); + if (field.kind === 'text' && field.localized === true) { + const locales = field.locales ?? []; + return [ + // Display labels: stored, not indexed for search (search uses the folded + // companions), accents preserved. + ...names.display.map( + (name): CollectionFieldSchema => ({ + name, + type: 'string', + index: false, + optional: true, + }), + ), + // One folded search field per locale, each stemmed in its own language. + ...names.search.map( + (name, index): CollectionFieldSchema => ({ + name, + type: 'string', + optional: true, + stem: true, + locale: locales[index], + }), + ), + ...names.sort.map( + (name): CollectionFieldSchema => ({ + name, + type: 'string', + sort: true, + optional: true, + }), + ), + ]; + } + + const valueType = typesenseValueType(field); + const fields: CollectionFieldSchema[] = [ + { + name: field.name, + type: valueType, + facet: field.facetable ?? false, + sort: field.sortable ?? false, + // A `required` field is non-optional; so is the `default_sorting_field`, + // which Typesense requires to be present. Everything else may be absent. 
+ optional: field.required !== true && field.name !== defaultSortingField, + }, + ]; + if (field.searchable) { + for (const name of names.search) { + fields.push({ + name, + type: valueType, + optional: true, + stem: true, + locale: defaultLocale, + }); + } + } + if (names.group !== undefined) { + fields.push({ + name: names.group, + type: valueType, + facet: true, + optional: true, + }); + } + return fields; +} + +/** The Typesense field type for a non-localized field, from its `kind`. 64-bit + * integers (and dates, stored as Unix seconds) so large counts never overflow. */ +function typesenseValueType(field: SearchField): CollectionFieldSchema['type'] { + switch (field.kind) { + case 'integer': + case 'date': + return 'int64'; + case 'number': + return 'float'; + case 'boolean': + return 'bool'; + case 'keyword': + case 'reference': + case 'text': + return field.array === true ? 'string[]' : 'string'; + } +} diff --git a/packages/search-typesense/src/index.ts b/packages/search-typesense/src/index.ts index 6514638d..66247957 100644 --- a/packages/search-typesense/src/index.ts +++ b/packages/search-typesense/src/index.ts @@ -1 +1,9 @@ export { rebuild } from './adapter.js'; +export { buildCollectionSchema } from './collection-schema.js'; +export type { CollectionSchemaOptions } from './collection-schema.js'; +export { buildSearchParams } from './query-compiler.js'; +export { createTypesenseSearchEngine, parseSearchResponse } from './search.js'; +export type { + TypesenseSearchEngineOptions, + TypesenseSearchResponse, +} from './search.js'; diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts new file mode 100644 index 00000000..fc9d4950 --- /dev/null +++ b/packages/search-typesense/src/query-compiler.ts @@ -0,0 +1,202 @@ +import type { SearchParams } from 'typesense/lib/Typesense/Documents.js'; +import { fold } from '@lde/text-normalization'; +import { + physicalFields, + searchableFields, + type Filter, + 
type SearchField, + type SearchQuery, + type SearchSchema, + type Sort, +} from '@lde/search'; + +/** + * Compile the engine-neutral {@link SearchQuery} into Typesense search + * parameters — the query half of the engine adapter. Pure (no client, no env), + * so the mapping is asserted directly in unit tests. Field names come from + * {@link physicalFields}, the same convention the projection and the collection + * schema use, so a query can never reference a field the index does not carry. + */ +export function buildSearchParams( + query: SearchQuery, + schema: SearchSchema, +): SearchParams { + const folded = + query.text !== undefined && query.text.length > 0 + ? fold(query.text) + : undefined; + const { names, weights } = queryFields(schema, query.locale); + const filterBy = compileFilterBy(query.where, schema); + const sortBy = query.orderBy + .map((sort) => compileSort(sort, schema, query.locale)) + .join(','); + const params: SearchParams = { + q: folded ?? '*', + query_by: names.join(','), + query_by_weights: weights.join(','), + per_page: query.limit, + page: Math.floor(query.offset / query.limit) + 1, + }; + if (filterBy.length > 0) { + params.filter_by = filterBy; + } + if (sortBy.length > 0) { + params.sort_by = sortBy; + } + if (query.facets.length > 0) { + params.facet_by = query.facets.join(','); + } + return params; +} + +/** + * The `query_by` fields and aligned weights. Each searchable field expands to its + * folded `*_search` companion(s); a localized field’s active-locale companion + * keeps its full weight while the other locale is gently demoted (−1, floored at + * 1), so a match in the user’s language ranks higher while cross-language matches + * still surface. 
+ */ +function queryFields( + schema: SearchSchema, + locale: string, +): { readonly names: string[]; readonly weights: number[] } { + const names: string[] = []; + const weights: number[] = []; + for (const field of searchableFields(schema)) { + const search = physicalFields(field).search; + const baseWeight = field.searchable.weight; + if (field.kind === 'text' && field.localized === true) { + const locales = field.locales ?? []; + search.forEach((name, index) => { + names.push(name); + weights.push( + locales[index] === locale ? baseWeight : Math.max(1, baseWeight - 1), + ); + }); + } else { + for (const name of search) { + names.push(name); + weights.push(baseWeight); + } + } + } + return { names, weights }; +} + +/** AND-join the compiled `where` clauses; skips unknown fields and empty clauses. */ +function compileFilterBy( + where: readonly Filter[], + schema: SearchSchema, +): string { + return where + .map((filter) => compileFilter(filter, schema)) + .filter((clause): clause is string => clause !== undefined) + .join(' && '); +} + +function compileFilter( + filter: Filter, + schema: SearchSchema, +): string | undefined { + const field = schema.fields.find( + (candidate) => candidate.name === filter.field, + ); + if (field === undefined) { + return undefined; + } + if ('in' in filter) { + return filter.in.length > 0 + ? compileMembership(field, filter.in) + : undefined; + } + if ('range' in filter) { + return compileRange(field.name, filter.range); + } + return `${field.name}:=${filter.is}`; +} + +/** + * A membership clause. A grouped field splits its values into `prefix`-tagged + * group tokens (matched against the `_group` companion) and granular values, and + * ORs the two so selecting a value and a group within one facet unions instead of + * intersecting. A non-facet (tokenized) field uses the exact `:=` operator so an + * IRI cannot partial-match on a shared path segment. 
+ */ +function compileMembership( + field: SearchField, + values: readonly string[], +): string { + const exact = field.facetable !== true; + if (field.group !== undefined) { + const prefix = field.group.prefix; + const groups = values.filter((value) => value.startsWith(prefix)); + const granular = values.filter((value) => !value.startsWith(prefix)); + const parts: string[] = []; + if (granular.length > 0) { + parts.push(membership(field.name, granular, exact)); + } + if (groups.length > 0) { + parts.push(membership(field.group.name, groups, false)); + } + return parts.length > 1 ? `(${parts.join(' || ')})` : parts[0]; + } + return membership(field.name, values, exact); +} + +function membership( + name: string, + values: readonly string[], + exact: boolean, +): string { + const list = `[${values.map(escapeFilterValue).join(',')}]`; + return exact ? `${name}:=${list}` : `${name}:${list}`; +} + +/** An inclusive Typesense range clause, or `undefined` when neither bound is set. */ +function compileRange( + name: string, + range: { readonly min?: number | string; readonly max?: number | string }, +): string | undefined { + const { min, max } = range; + if (min !== undefined && max !== undefined) { + return `${name}:[${min}..${max}]`; + } + if (min !== undefined) { + return `${name}:>=${min}`; + } + if (max !== undefined) { + return `${name}:<=${max}`; + } + return undefined; +} + +/** + * One `sort_by` term. `relevance` maps to Typesense’s `_text_match`; a localized + * text field sorts on its active-locale folded key; any other field (including a + * deployment tie-break like `status_rank`) sorts on its own name. 
+ */ +function compileSort(sort: Sort, schema: SearchSchema, locale: string): string { + if (sort.field === 'relevance') { + return `_text_match:${sort.direction}`; + } + const field = schema.fields.find( + (candidate) => candidate.name === sort.field, + ); + if ( + field !== undefined && + field.kind === 'text' && + field.localized === true + ) { + return `${field.name}_sort_${locale}:${sort.direction}`; + } + return `${sort.field}:${sort.direction}`; +} + +/** + * Backtick-wrap a filter value so reserved characters in IRIs and media types + * (`:`, `/`, `&`, `,`, …) are taken literally instead of parsed as filter syntax. + * An embedded backtick is escaped. + */ +function escapeFilterValue(value: string): string { + return `\`${value.replace(/`/g, '\\`')}\``; +} diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts new file mode 100644 index 00000000..e9d792c7 --- /dev/null +++ b/packages/search-typesense/src/search.ts @@ -0,0 +1,265 @@ +import type { Client } from 'typesense'; +import { + outputFields, + type FacetBucket, + type LocalizedValue, + type Reference, + type ResultDocument, + type SearchEngine, + type SearchField, + type SearchHit, + type SearchQuery, + type SearchResult, + type SearchSchema, + type SearchValue, +} from '@lde/search'; +import { buildSearchParams } from './query-compiler.js'; + +/** Where the engine reads documents and (optionally) reference labels. */ +export interface TypesenseSearchEngineOptions { + /** The dataset collection or alias to query. */ + readonly collection: string; + /** The sidecar `labels` collection (IRI → label); omit for id-only references. */ + readonly labelsCollection?: string; +} + +/** + * A Typesense-backed {@link SearchEngine}. 
`search` compiles the query + * ({@link buildSearchParams}), runs it, resolves the reference labels for the + * page of hits from the sidecar `labels` collection in one lookup, and + * reconstructs the engine-neutral {@link SearchResult} ({@link parseSearchResponse}). + * Every engine specific stays here; consumers see only logical documents. + */ +export function createTypesenseSearchEngine( + client: Client, + options: TypesenseSearchEngineOptions, +): SearchEngine { + return { + async search( + query: SearchQuery, + schema: SearchSchema, + ): Promise { + const params = buildSearchParams(query, schema); + const response = (await client + .collections(options.collection) + .documents() + .search(params)) as TypesenseSearchResponse; + const labels = + options.labelsCollection !== undefined + ? await fetchLabels( + client, + options.labelsCollection, + referenceIris(response, schema), + ) + : new Map(); + return parseSearchResponse(response, schema, labels); + }, + }; +} + +/** Every distinct reference IRI across the page of hits. */ +function referenceIris( + response: TypesenseSearchResponse, + schema: SearchSchema, +): string[] { + const referenceFields = schema.fields + .filter((field) => field.kind === 'reference') + .map((field) => field.name); + const referenceFieldSet = new Set(referenceFields); + const iris = new Set(); + for (const hit of response.hits ?? []) { + for (const name of referenceFields) { + const raw = hit.document[name]; + if (Array.isArray(raw)) { + for (const value of raw) { + iris.add(String(value)); + } + } else if (typeof raw === 'string') { + iris.add(raw); + } + } + } + // Reference-facet bucket values are IRIs too; resolve them in the same lookup. + for (const facet of response.facet_counts ?? 
[]) { + if (referenceFieldSet.has(facet.field_name)) { + for (const bucket of facet.counts) { + iris.add(bucket.value); + } + } + } + return [...iris]; +} + +/** + * Resolve labels for `iris` from the sidecar `labels` collection in a single + * `filter_by: id:[…]` lookup. Each `label_${locale}` becomes a language-map + * entry; the default `label` is the untagged (`und`) fallback when no locale + * variant exists. + */ +async function fetchLabels( + client: Client, + collection: string, + iris: readonly string[], +): Promise> { + const labels = new Map(); + if (iris.length === 0) { + return labels; + } + const filter = `id:[${iris.map((iri) => `\`${iri.replace(/`/g, '\\`')}\``).join(',')}]`; + const response = (await client.collections(collection).documents().search({ + q: '*', + query_by: 'label', + filter_by: filter, + per_page: iris.length, + })) as TypesenseSearchResponse; + for (const hit of response.hits ?? []) { + labels.set(String(hit.document.id), labelToLocalizedValue(hit.document)); + } + return labels; +} + +/** Turn a `labels` document into a language map (`label_${locale}` → locale). */ +function labelToLocalizedValue( + document: Record, +): LocalizedValue { + const map: Record = {}; + for (const [key, value] of Object.entries(document)) { + if (key.startsWith('label_') && typeof value === 'string') { + map[key.slice('label_'.length)] = [value]; + } + } + if (Object.keys(map).length === 0 && typeof document.label === 'string') { + map.und = [document.label]; + } + return map; +} + +/** The subset of a Typesense search response this adapter reads. 
*/ +export interface TypesenseSearchResponse { + readonly found: number; + readonly hits?: readonly { readonly document: Record }[]; + readonly facet_counts?: readonly { + readonly field_name: string; + readonly counts: readonly { + readonly value: string; + readonly count: number; + }[]; + }[]; +} + +/** + * Reconstruct a Typesense response into the engine-neutral {@link SearchResult}: + * the flat, fanned-out document is turned back into a logical one (per-locale + * display fields → a language map, reference IRIs → labelled references via the + * sidecar `labels` lookup, scalars passed through). `labels` maps a reference IRI + * to its resolved label; an IRI absent from it yields an id-only reference. + */ +export function parseSearchResponse( + response: TypesenseSearchResponse, + schema: SearchSchema, + labels: ReadonlyMap, +): SearchResult { + const hits: SearchHit[] = (response.hits ?? []).map((hit) => ({ + id: String(hit.document.id), + document: reconstructDocument(hit.document, schema, labels), + })); + // Reference facets are IRI-keyed; their buckets carry a resolved data label. + // Plain facets (tokens, free strings) carry no label — the consumer owns display. + const referenceFacets = new Set( + schema.fields + .filter((field) => field.kind === 'reference') + .map((field) => field.name), + ); + const facets: Record = {}; + for (const facet of response.facet_counts ?? []) { + const labelled = referenceFacets.has(facet.field_name); + facets[facet.field_name] = facet.counts.map((bucket) => { + const label = labelled ? labels.get(bucket.value) : undefined; + return label === undefined + ? { value: bucket.value, count: bucket.count } + : { value: bucket.value, count: bucket.count, label }; + }); + } + return { hits, total: response.found, facets }; +} + +/** Rebuild one logical document from a flat Typesense document. 
*/ +function reconstructDocument( + flat: Record, + schema: SearchSchema, + labels: ReadonlyMap, +): ResultDocument { + const document: Record = {}; + for (const field of outputFields(schema)) { + if (field.kind === 'boolean') { + // A boolean is always present; an absent value means false. + document[field.name] = flat[field.name] === true; + continue; + } + const value = logicalValue(flat, field, labels); + if (value !== undefined) { + document[field.name] = value; + } + } + return document; +} + +function logicalValue( + flat: Record, + field: SearchField, + labels: ReadonlyMap, +): SearchValue | undefined { + switch (field.kind) { + case 'text': + return localizedValue(flat, field); + case 'reference': + return referenceValue(flat, field, labels); + case 'keyword': { + const value = flat[field.name]; + return Array.isArray(value) || typeof value === 'string' + ? (value as SearchValue) + : undefined; + } + case 'integer': + case 'number': + case 'date': { + const value = flat[field.name]; + return typeof value === 'number' ? value : undefined; + } + case 'boolean': + return flat[field.name] === true; + } +} + +/** Gather the per-locale display fields back into a language map. */ +function localizedValue( + flat: Record, + field: SearchField, +): LocalizedValue | undefined { + const map: Record = {}; + for (const locale of field.locales ?? []) { + const value = flat[`${field.name}_${locale}`]; + if (typeof value === 'string') { + map[locale] = [value]; + } + } + return Object.keys(map).length > 0 ? map : undefined; +} + +/** Map stored reference IRIs to labelled references; id-only when no label. */ +function referenceValue( + flat: Record, + field: SearchField, + labels: ReadonlyMap, +): SearchValue | undefined { + const raw = flat[field.name]; + if (raw === undefined) { + return undefined; + } + const iris = Array.isArray(raw) ? 
(raw as string[]) : [String(raw)]; + const references: Reference[] = iris.map((iri) => { + const label = labels.get(iri); + return label === undefined ? { id: iri } : { id: iri, label }; + }); + return field.array === true ? references : references[0]; +} diff --git a/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap new file mode 100644 index 00000000..201512f7 --- /dev/null +++ b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap @@ -0,0 +1,120 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`collection-schema generator stability > derives a stable Typesense collection for a representative schema 1`] = ` +{ + "default_sorting_field": "size", + "fields": [ + { + "index": false, + "name": "title_nl", + "optional": true, + "type": "string", + }, + { + "index": false, + "name": "title_en", + "optional": true, + "type": "string", + }, + { + "locale": "nl", + "name": "title_search_nl", + "optional": true, + "stem": true, + "type": "string", + }, + { + "locale": "en", + "name": "title_search_en", + "optional": true, + "stem": true, + "type": "string", + }, + { + "name": "title_sort_nl", + "optional": true, + "sort": true, + "type": "string", + }, + { + "name": "title_sort_en", + "optional": true, + "sort": true, + "type": "string", + }, + { + "facet": true, + "name": "keyword", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "locale": "nl", + "name": "keyword_search", + "optional": true, + "stem": true, + "type": "string[]", + }, + { + "facet": true, + "name": "format", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "facet": true, + "name": "format_group", + "optional": true, + "type": "string[]", + }, + { + "facet": true, + "name": "creator", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "facet": true, + "name": "status", + "optional": 
false, + "sort": false, + "type": "string", + }, + { + "facet": true, + "name": "size", + "optional": false, + "sort": true, + "type": "int64", + }, + { + "facet": true, + "name": "score", + "optional": true, + "sort": false, + "type": "float", + }, + { + "facet": false, + "name": "created", + "optional": true, + "sort": true, + "type": "int64", + }, + { + "facet": true, + "name": "open", + "optional": true, + "sort": false, + "type": "bool", + }, + ], + "name": "things", + "synonym_sets": [ + "things-synonyms", + ], +} +`; diff --git a/packages/search-typesense/test/collection-schema.test.ts b/packages/search-typesense/test/collection-schema.test.ts new file mode 100644 index 00000000..51511122 --- /dev/null +++ b/packages/search-typesense/test/collection-schema.test.ts @@ -0,0 +1,202 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchSchema } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + path: 'http://purl.org/dc/terms/title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + path: 'http://www.w3.org/ns/dcat#keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + path: 'https://def.nde.nl/format', + kind: 'keyword', + array: true, + facetable: true, + group: { name: 'format_group', prefix: 'group:' }, + }, + // Derived fields (no path) still get collection fields — populated at index + // time by derivations, not projected. 
+ { name: 'status', kind: 'keyword', facetable: true, required: true }, + { name: 'statusRank', kind: 'integer', sortable: true }, + { + name: 'size', + kind: 'integer', + facetable: true, + sortable: true, + }, + { name: 'iiif', kind: 'boolean', facetable: true }, + { + name: 'publisher', + path: 'http://purl.org/dc/terms/publisher', + kind: 'reference', + array: true, + facetable: true, + }, + { + name: 'datePosted', + path: 'https://def.nde.nl/datePosted', + kind: 'date', + sortable: true, + }, + { + name: 'score', + kind: 'number', + facetable: true, + }, + ], +}; + +describe('buildCollectionSchema', () => { + const collection = buildCollectionSchema(schema, { + name: 'datasets', + defaultLocale: 'nl', + defaultSortingField: 'statusRank', + synonymSets: ['dataset-synonyms'], + }); + + it('carries the collection name, default sorting field and synonym sets', () => { + expect(collection.name).toBe('datasets'); + expect(collection.default_sorting_field).toBe('statusRank'); + expect(collection.synonym_sets).toEqual(['dataset-synonyms']); + }); + + it('fans a localized text field into display, per-locale stemmed search and sort keys', () => { + expect(collection.fields).toContainEqual({ + name: 'title_nl', + type: 'string', + index: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_en', + type: 'string', + index: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_search_nl', + type: 'string', + optional: true, + stem: true, + locale: 'nl', + }); + expect(collection.fields).toContainEqual({ + name: 'title_search_en', + type: 'string', + optional: true, + stem: true, + locale: 'en', + }); + expect(collection.fields).toContainEqual({ + name: 'title_sort_nl', + type: 'string', + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_sort_en', + type: 'string', + sort: true, + optional: true, + }); + }); + + it('maps keyword/reference/integer/boolean 
kinds to Typesense value fields', () => { + expect(collection.fields).toContainEqual({ + name: 'keyword', + type: 'string[]', + facet: true, + sort: false, + optional: true, + }); + // `status` is required → non-optional, like the default sorting field. + expect(collection.fields).toContainEqual({ + name: 'status', + type: 'string', + facet: true, + sort: false, + optional: false, + }); + // statusRank is the default_sorting_field, which Typesense requires to be + // non-optional. + expect(collection.fields).toContainEqual({ + name: 'statusRank', + type: 'int64', + facet: false, + sort: true, + optional: false, + }); + expect(collection.fields).toContainEqual({ + name: 'size', + type: 'int64', + facet: true, + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'iiif', + type: 'bool', + facet: true, + sort: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'publisher', + type: 'string[]', + facet: true, + sort: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'datePosted', + type: 'int64', + facet: false, + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'score', + type: 'float', + facet: true, + sort: false, + optional: true, + }); + }); + + it('emits a folded, stemmed search companion for a searchable keyword field', () => { + expect(collection.fields).toContainEqual({ + name: 'keyword_search', + type: 'string[]', + optional: true, + stem: true, + locale: 'nl', + }); + }); + + it('emits the grouped-facet companion for a field that declares a group', () => { + expect(collection.fields).toContainEqual({ + name: 'format_group', + type: 'string[]', + facet: true, + optional: true, + }); + }); +}); diff --git a/packages/search-typesense/test/generator-stability.test.ts b/packages/search-typesense/test/generator-stability.test.ts new file mode 100644 index 00000000..2383ecde --- /dev/null +++ 
b/packages/search-typesense/test/generator-stability.test.ts @@ -0,0 +1,66 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchSchema } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; + +/** + * A neutral fixture exercising every kind + capability — NOT a real domain. The + * derived Typesense collection is snapshotted purely to pin the **generator**: + * any change to how `buildCollectionSchema` maps the field model (Typesense field + * types, the physical fanout, stem/locale, optional/default-sorting-field, group + * companions) surfaces as a snapshot diff before this library is published. + */ +const THING: SearchSchema = { + type: 'https://example.org/Thing', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + group: { name: 'format_group', prefix: 'group:' }, + }, + { + name: 'creator', + kind: 'reference', + array: true, + facetable: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { name: 'status', kind: 'keyword', facetable: true, required: true }, + { name: 'size', kind: 'integer', facetable: true, sortable: true }, + { name: 'score', kind: 'number', facetable: true }, + { name: 'created', kind: 'date', sortable: true }, + { name: 'open', kind: 'boolean', facetable: true }, + ], +}; + +describe('collection-schema generator stability', () => { + it('derives a stable Typesense collection for a representative schema', () => { + expect( + buildCollectionSchema(THING, { + name: 'things', + defaultSortingField: 'size', + defaultLocale: 'nl', + synonymSets: ['things-synonyms'], + }), + ).toMatchSnapshot(); + }); +}); diff --git 
a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts new file mode 100644 index 00000000..50e601a4 --- /dev/null +++ b/packages/search-typesense/test/parse-response.test.ts @@ -0,0 +1,145 @@ +import { describe, expect, it } from 'vitest'; +import type { LocalizedValue, SearchSchema } from '@lde/search'; +import { parseSearchResponse } from '../src/search.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }, + { name: 'size', kind: 'integer', output: true }, + { name: 'datePosted', kind: 'date', output: true }, + { name: 'iiif', kind: 'boolean', facetable: true, output: true }, + // A non-output field is never reconstructed into the logical document. 
+ { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + ], +}; + +const labels = new Map([ + ['https://org/1', { nl: ['Het Utrechts Archief'] }], + ['https://org/2', { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }], +]); + +const response = { + found: 2, + hits: [ + { + document: { + id: 'https://d/1', + title_nl: 'Titel', + title_en: 'Title', + keyword: ['kaarten'], + publisher: ['https://org/1'], + size: 1234, + datePosted: 1_700_000_000, + iiif: true, + status: 'valid', + }, + }, + { + document: { + id: 'https://d/2', + title_nl: 'Andere', + keyword: ['atlas', 'kaart'], + publisher: ['https://org/2', 'https://org/3'], + }, + }, + ], + facet_counts: [ + { + field_name: 'keyword', + counts: [ + { value: 'kaarten', count: 3 }, + { value: 'atlas', count: 1 }, + ], + }, + { + // A reference facet: buckets are keyed by IRI and carry resolved labels. + field_name: 'publisher', + counts: [ + { value: 'https://org/1', count: 2 }, + { value: 'https://org/3', count: 1 }, + ], + }, + ], +}; + +describe('parseSearchResponse', () => { + const result = parseSearchResponse(response, schema, labels); + + it('carries the total and the facet buckets keyed by field name', () => { + expect(result.total).toBe(2); + // A plain facet: buckets carry no label. + expect(result.facets.keyword).toEqual([ + { value: 'kaarten', count: 3 }, + { value: 'atlas', count: 1 }, + ]); + }); + + it('attaches resolved labels to reference-facet buckets, id-only when unlabelled', () => { + expect(result.facets.publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + { value: 'https://org/3', count: 1 }, + ]); + }); + + it('reconstructs localized text into a best-available language map', () => { + expect(result.hits[0].id).toBe('https://d/1'); + expect(result.hits[0].document.title).toEqual({ + nl: ['Titel'], + en: ['Title'], + }); + // Only the present locale is emitted. 
+ expect(result.hits[1].document.title).toEqual({ nl: ['Andere'] }); + }); + + it('resolves reference IRIs to labelled references, id-only when unlabelled', () => { + expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + expect(result.hits[1].document.publisher).toEqual([ + { + id: 'https://org/2', + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + { id: 'https://org/3' }, + ]); + }); + + it('passes keyword arrays and numeric scalars through, and omits absent fields', () => { + expect(result.hits[0].document.keyword).toEqual(['kaarten']); + expect(result.hits[0].document.size).toBe(1234); + expect(result.hits[0].document.datePosted).toBe(1_700_000_000); + expect(result.hits[1].document.size).toBeUndefined(); + }); + + it('defaults an absent boolean to false and never reconstructs non-output fields', () => { + expect(result.hits[0].document.iiif).toBe(true); + expect(result.hits[1].document.iiif).toBe(false); + expect(result.hits[0].document.status).toBeUndefined(); + }); +}); diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts new file mode 100644 index 00000000..acdd9f7a --- /dev/null +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -0,0 +1,156 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchQuery, SearchSchema } from '@lde/search'; +import { buildSearchParams } from '../src/query-compiler.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + path: 'http://purl.org/dc/terms/title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + path: 'http://www.w3.org/ns/dcat#keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 
'keyword', + array: true, + facetable: true, + filterable: true, + group: { name: 'format_group', prefix: 'group:' }, + }, + // Filter-only, non-facet (tokenized) → exact `:=` membership. + { name: 'catalog', kind: 'keyword', array: true, filterable: true }, + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { name: 'size', kind: 'integer', filterable: true, sortable: true }, + { name: 'iiif', kind: 'boolean', filterable: true, facetable: true }, + ], +}; + +const base: SearchQuery = { + where: [], + orderBy: [], + limit: 20, + offset: 0, + facets: [], + locale: 'nl', +}; + +describe('buildSearchParams', () => { + it('browses with a match-all q and the weighted query_by fields', () => { + const params = buildSearchParams(base, schema); + expect(params.q).toBe('*'); + expect(params.query_by).toBe( + 'title_search_nl,title_search_en,keyword_search', + ); + expect(params.per_page).toBe(20); + expect(params.page).toBe(1); + expect(params.filter_by).toBeUndefined(); + expect(params.sort_by).toBeUndefined(); + }); + + it('folds the query text and boosts the active locale in query_by_weights', () => { + expect( + buildSearchParams({ ...base, text: 'Kaart', locale: 'nl' }, schema), + ).toMatchObject({ q: 'kaart', query_by_weights: '5,4,1' }); + expect( + buildSearchParams({ ...base, text: 'Kaart', locale: 'en' }, schema) + .query_by_weights, + ).toBe('4,5,1'); + }); + + it('maps offset/limit to numbered pages', () => { + expect( + buildSearchParams({ ...base, offset: 40, limit: 20 }, schema).page, + ).toBe(3); + }); + + it('compiles where clauses, with exact membership for non-facet fields and grouped OR', () => { + const params = buildSearchParams( + { + ...base, + where: [ + { field: 'status', in: ['valid'] }, + { field: 'keyword', in: ['kaarten', 'atlas'] }, + { field: 'catalog', in: ['urn:cat'] }, + { field: 'format', in: ['text/turtle', 'group:rdf'] }, + { field: 'size', range: { min: 1, max: 10 } }, + { field: 'iiif', is: true }, + ], + }, + 
schema, + ); + expect(params.filter_by).toBe( + 'status:[`valid`] && ' + + 'keyword:[`kaarten`,`atlas`] && ' + + 'catalog:=[`urn:cat`] && ' + + '(format:[`text/turtle`] || format_group:[`group:rdf`]) && ' + + 'size:[1..10] && ' + + 'iiif:=true', + ); + }); + + it('compiles a one-sided range bound', () => { + expect( + buildSearchParams( + { ...base, where: [{ field: 'size', range: { min: 5 } }] }, + schema, + ).filter_by, + ).toBe('size:>=5'); + expect( + buildSearchParams( + { ...base, where: [{ field: 'size', range: { max: 9 } }] }, + schema, + ).filter_by, + ).toBe('size:<=9'); + }); + + it('compiles orderBy: RELEVANCE → _text_match and a localized field → its sort key', () => { + expect( + buildSearchParams( + { + ...base, + orderBy: [ + { field: 'relevance', direction: 'desc' }, + { field: 'status_rank', direction: 'asc' }, + ], + }, + schema, + ).sort_by, + ).toBe('_text_match:desc,status_rank:asc'); + + expect( + buildSearchParams( + { + ...base, + locale: 'nl', + orderBy: [ + { field: 'title', direction: 'asc' }, + { field: 'status_rank', direction: 'asc' }, + ], + }, + schema, + ).sort_by, + ).toBe('title_sort_nl:asc,status_rank:asc'); + }); + + it('requests facets by their logical field name', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword', 'format'] }, schema) + .facet_by, + ).toBe('keyword,format'); + }); +}); diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts new file mode 100644 index 00000000..3a392f8a --- /dev/null +++ b/packages/search-typesense/test/search-engine.test.ts @@ -0,0 +1,226 @@ +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import type { Client } from 'typesense'; +import type { SearchEngine, SearchQuery, SearchSchema } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; +import { createTypesenseSearchEngine } from '../src/search.js'; +import { TypesenseContainer } from 
'./typesense-container.js'; + +const datasetSchema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }, + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { name: 'statusRank', kind: 'integer', sortable: true }, + ], +}; + +// Flat documents, as the projection would emit them (physical field names). +const documents = [ + { + id: 'd1', + title_nl: 'Kaart van Utrecht', + title_en: 'Map of Utrecht', + title_search_nl: 'kaart van utrecht', + title_search_en: 'map of utrecht', + title_sort_nl: 'kaart van utrecht', + title_sort_en: 'map of utrecht', + keyword: ['kaarten'], + keyword_search: ['kaarten'], + publisher: ['https://org/1'], + status: 'valid', + statusRank: 0, + }, + { + id: 'd2', + title_nl: 'Atlas der Nederlanden', + title_search_nl: 'atlas der nederlanden', + title_sort_nl: 'atlas der nederlanden', + keyword: ['atlas'], + keyword_search: ['atlas'], + publisher: ['https://org/2'], + status: 'valid', + statusRank: 0, + }, + { + id: 'd3', + title_nl: 'Verouderde kaart', + title_search_nl: 'verouderde kaart', + title_sort_nl: 'verouderde kaart', + keyword: ['kaarten'], + keyword_search: ['kaarten'], + publisher: ['https://org/1'], + status: 'invalid', + statusRank: 3, + }, +]; + +const labelDocuments = [ + { + id: 'https://org/1', + label: 'Het Utrechts Archief', + label_nl: 'Het Utrechts Archief', + type: 'organization', + }, + { + id: 'https://org/2', + label: 'Rijksmuseum', + label_nl: 'Rijksmuseum', + label_en: 'Rijksmuseum', + type: 'organization', 
+ }, +]; + +const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', +}; + +describe('createTypesenseSearchEngine (integration)', () => { + const container = new TypesenseContainer(); + let client: Client; + let engine: SearchEngine; + + beforeAll(async () => { + client = await container.start(); + // Typesense accepts the generated schema (stemming, locales, int64, …). + await client.collections().create( + buildCollectionSchema(datasetSchema, { + name: 'datasets', + defaultSortingField: 'statusRank', + defaultLocale: 'nl', + }), + ); + await client.collections().create({ + name: 'labels', + fields: [ + { name: 'label', type: 'string' }, + { name: 'label_nl', type: 'string', optional: true, index: false }, + { name: 'label_en', type: 'string', optional: true, index: false }, + { name: 'type', type: 'string', facet: true }, + ], + }); + await client + .collections('datasets') + .documents() + .import(documents, { action: 'create' }); + await client + .collections('labels') + .documents() + .import(labelDocuments, { action: 'create' }); + + engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + }); + }, 120_000); + + afterAll(async () => { + await container.stop(); + }); + + it('filters by status, sorts by the localized title key, and resolves reference labels', async () => { + const result = await engine.search( + { + ...baseQuery, + where: [{ field: 'status', in: ['valid'] }], + orderBy: [ + { field: 'title', direction: 'asc' }, + { field: 'statusRank', direction: 'asc' }, + ], + }, + datasetSchema, + ); + + // d3 is invalid → filtered out; remaining two sorted by folded title. 
+ expect(result.total).toBe(2); + expect(result.hits.map((hit) => hit.id)).toEqual(['d2', 'd1']); + expect(result.hits[0].document.title).toEqual({ + nl: ['Atlas der Nederlanden'], + }); + expect(result.hits[0].document.publisher).toEqual([ + { + id: 'https://org/2', + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + ]); + expect(result.hits[1].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + }); + + it('ranks a full-text query through the weighted query_by fields', async () => { + const result = await engine.search( + { + ...baseQuery, + text: 'Utrecht', + orderBy: [{ field: 'relevance', direction: 'desc' }], + }, + datasetSchema, + ); + + expect(result.hits[0].id).toBe('d1'); + expect(result.hits.map((hit) => hit.id)).not.toContain('d2'); + }); + + it('returns facet buckets with counts, labelling reference facets', async () => { + const result = await engine.search( + { ...baseQuery, facets: ['keyword', 'publisher'] }, + datasetSchema, + ); + + // Plain facet: value + count, no label. + const keyword = [...result.facets.keyword].sort( + (a, b) => b.count - a.count, + ); + expect(keyword).toEqual([ + { value: 'kaarten', count: 2 }, + { value: 'atlas', count: 1 }, + ]); + + // Reference facet: IRI-keyed buckets carry the resolved data label. 
+ const publisher = [...result.facets.publisher].sort( + (a, b) => b.count - a.count, + ); + expect(publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + { + value: 'https://org/2', + count: 1, + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + ]); + }); +}); diff --git a/packages/search-typesense/tsconfig.lib.json b/packages/search-typesense/tsconfig.lib.json index e7c2ce37..52ca4bb7 100644 --- a/packages/search-typesense/tsconfig.lib.json +++ b/packages/search-typesense/tsconfig.lib.json @@ -8,7 +8,10 @@ "types": ["node"] }, "include": ["src/**/*.ts"], - "references": [], + "references": [ + { "path": "../search/tsconfig.lib.json" }, + { "path": "../text-normalization/tsconfig.lib.json" } + ], "exclude": [ "vite.config.ts", "vite.config.mts", From c64c90948e50dfb48e958ea8cd1a7bae7ba158ec Mon Sep 17 00:00:00 2001 From: David de Boer Date: Sun, 28 Jun 2026 20:23:14 +0200 Subject: [PATCH 03/35] feat(search-api-graphql): add the runtime-configured GraphQL surface - buildSearchSchema builds an executable GraphQLSchema from any SearchSchema at runtime (no codegen) - one generic resolver maps args to SearchQuery, calls the engine, and maps the result back - derive output, where, orderBy and facet types plus nullability from the field model - best-first Accept-Language output ordering; nullable facet label for reference facets - add printSearchSchema for a consumer SDL snapshot, plus a generator-stability snapshot --- .../0004-search-api-graphql-surface.md | 82 +++- packages/search-api-graphql/README.md | 55 +++ packages/search-api-graphql/eslint.config.mjs | 22 + packages/search-api-graphql/package.json | 32 ++ .../search-api-graphql/src/build-schema.ts | 445 ++++++++++++++++++ packages/search-api-graphql/src/index.ts | 7 + packages/search-api-graphql/src/language.ts | 47 ++ .../generator-stability.test.ts.snap | 106 +++++ .../test/build-schema.test.ts | 349 ++++++++++++++ 
.../test/generator-stability.test.ts | 97 ++++ packages/search-api-graphql/tsconfig.json | 13 + packages/search-api-graphql/tsconfig.lib.json | 26 + .../search-api-graphql/tsconfig.spec.json | 29 ++ packages/search-api-graphql/vite.config.ts | 21 + tsconfig.json | 3 + 15 files changed, 1322 insertions(+), 12 deletions(-) create mode 100644 packages/search-api-graphql/README.md create mode 100644 packages/search-api-graphql/eslint.config.mjs create mode 100644 packages/search-api-graphql/package.json create mode 100644 packages/search-api-graphql/src/build-schema.ts create mode 100644 packages/search-api-graphql/src/index.ts create mode 100644 packages/search-api-graphql/src/language.ts create mode 100644 packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap create mode 100644 packages/search-api-graphql/test/build-schema.test.ts create mode 100644 packages/search-api-graphql/test/generator-stability.test.ts create mode 100644 packages/search-api-graphql/tsconfig.json create mode 100644 packages/search-api-graphql/tsconfig.lib.json create mode 100644 packages/search-api-graphql/tsconfig.spec.json create mode 100644 packages/search-api-graphql/vite.config.ts diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index d6aff824..54c34000 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -29,7 +29,7 @@ that schema. A better name for the draft’s “generation” step, at least for **runtime configuration**. This matters because the resolvers are inherently generic – there is essentially one root -resolver that maps args to a `SearchQuery`, calls the adapter, and maps the result back; +resolver that maps args to a `SearchQuery`, calls the engine, and maps the result back; the field model only parameterises data. 
Codegen would emit N near-identical resolver stubs that all delegate to the same logic, plus a build step and staleness risk, for no benefit. @@ -45,8 +45,10 @@ accidental breaking changes to the frozen contract – not a shipped artifact. ### The schema-building function ```ts -function buildSearchSchema( - schema: SearchSchema, +// Generic over the config *value’s* type (capture it `as const satisfies SearchSchema`), so +// one declaration drives both the runtime schema and the static TS types below. +function buildSearchSchema<S extends SearchSchema>( + schema: S, options: { typeName: string; // 'Dataset' – drives all derived type names queryField?: string; // root field; default lowercased plural of typeName @@ -60,6 +62,13 @@ function buildSearchSchema( }, ): GraphQLSchema; // executable schema: types + generic resolvers attached +// Static types derived from the SAME config value’s type (compile-time only, erased at +// runtime); one source, no codegen, no drift. Exported for typed in-process callers/tests. +type OutputOf<S>; // { id: string; title: LanguageString[]; size: number | null; … } +type WhereOf<S>; // { format?: StringFilter; size?: FloatRange; … } +type OrderByOf<S>; // { field: 'RELEVANCE' | 'TITLE' | …; direction: 'ASC' | 'DESC' } +type FacetOf<S>; // the facetable-field-name union + // also exported for manual composition / non-default servers: function buildSearchTypeDefsAndResolvers( schema, @@ -74,6 +83,38 @@ function printSearchSchema(schema, options): string; // SDL, for a snapshot/brea `extendResolvers` (merged before `makeExecutableSchema`, since Mercurius registers once) or composes the exported typeDefs/resolvers by hand.
+### A typed surface the contract does not depend on + +Because `buildSearchSchema` is generic over the config _value_ (`<S extends SearchSchema>`), one +`as const satisfies SearchSchema` declaration drives two **independent** projections: + +- **the runtime contract** – the `GraphQLSchema`, built at startup by reading the value + (`field.kind`, `output`, `facetable`, …); and +- **a static TS mirror** – `OutputOf<S>` / `WhereOf<S>` / `OrderByOf<S>` / `FacetOf<S>`, + computed from `typeof schema` via mapped types. + +The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time +only and TS types are erased, so the served schema is byte-identical whether or not the +mirror types exist – they are a developer-experience overlay, never the source. The two are +parallel derivations of one value: the runtime kind→GraphQL-type mapping lives in +`buildSearchSchema`; the type-level mapping in `OutputOf<S>` duplicates it. They can drift, +so the **contract** is guarded by the optional `printSearchSchema()` SDL snapshot test (the +real artifact), while the TS mirror only catches our own coding mistakes against it.
+ +Values are typed at both ends, with the resolver as the typed transform between them: + +| layer | localized text | reference | int64 | keyword (array) | boolean | +| ----------------------- | ------------------------------------ | --------------------------- | ---------------- | ----------------------- | -------------------- | +| IR (`ResultDocument`) | `LocalizedValue` (lang map) | `Reference` | `number` | `readonly string[]` | `boolean` | +| GraphQL (`OutputOf`) | `LanguageString[]` (best-first list) | named type (`Organization`) | `Float`/`number` | `[String!]!`/`string[]` | `Boolean!`/`boolean` | + +What stays unchecked is only the **generic resolver’s dynamic middle**: it loops over the +field model with runtime-string names, so TS cannot prove the object it builds matches +`OutputOf` – it casts at that boundary, and graphql-js’s executor (not TS) enforces the +output types at runtime (a wrong-typed return raises a field error). This is the same +“typed boundaries, dynamic middle” shape as the engine port and the projection: type the +edges where it is honest, accept a cast where iteration is inherently dynamic. + ### Construction rules (field model → schema) Type names derive from `typeName`; shared types (`LanguageString`, `Facet`, `FacetBucket`, @@ -81,9 +122,12 @@ Type names derive from `typeName`; shared types (`LanguageString`, `Facet`, `Fac GraphQL field names are the field model `name` verbatim (declare camelCase). - **Output type** – one field per `output` field: `text`+`localized` → `[LanguageString!]!` (best-first; `[0].language` = served language, the per-field `Content-Language`); - `keyword` array → `[String!]!`, scalar → `String`; `integer` → `Int`; `number` → `Float`; - `date` → `String` (ISO 8601); `boolean` → `Boolean!` (absent = false); `reference` → - see below. Nullability from `array` / required / optional; `id` is `String!`. 
+ `keyword` array → `[String!]!`, scalar → `String`; `integer` → `Int` (signed 32-bit); + `number` → `Float` (exact integers to 2^53); `date` → `String` (ISO 8601); `boolean` → + `Boolean!` (absent = false); `reference` → see below. Nullability from `array` / required / + optional; `id` is `String!`. A field whose magnitude can exceed 32 bits (a 64-bit count or + byte size – e.g. DR’s `size`) is modelled as `number` → `Float`, since GraphQL’s `Int` + would overflow; a `Long`/`BigInt` custom scalar is the deferred alternative. - **Reference types** – a `reference` field is typed by the **referenced shape** (`sh:class`/`sh:node`), emitted once and reused by every field referencing the same shape. Its fields follow `nestedStrategy`: @@ -113,6 +157,14 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). is rejected where `[DatasetOrderBy!]` is expected), so a future array is a deliberate, potentially breaking change – not a free one. - **Facets** – an enum of every `facetable` field; requested per query, returned with counts. + A bucket’s `value` is its selection key; `label` is the **nullable** display label. + The engine resolves `label` only for **reference** facets — IRI-keyed buckets whose + canonical multilingual label is _data_, fetched from the sidecar `labels` collection in the + same lookup as hit references. It is `null` for token facets (e.g. `status`) and + free-string facets (e.g. `keyword`): those carry no data label, and the consumer owns their + display — its own i18n catalog for controlled tokens (`valid` → “Geldig”/“Valid”, which the + engine cannot and must not fabricate), or the `value` itself for free strings. The null is + load-bearing: it tells a client whether a server-resolved label exists or display is theirs. ### Resulting schema (DR example, abridged) @@ -138,7 +190,7 @@ type Dataset { terminologySource: [Term!]! format: [String!]! class: [String!]! 
- size: Int + size: Float # int64 magnitude → Float, not Int (32-bit); see note below datePosted: String status: String iiif: Boolean! @@ -152,6 +204,10 @@ input IntRange { min: Int max: Int } +input FloatRange { + min: Float + max: Float +} input DateRange { min: String max: String @@ -162,7 +218,7 @@ input DatasetWhere { format: StringFilter class: StringFilter status: StringFilter - size: IntRange + size: FloatRange datePosted: DateRange iiif: Boolean # … keyword, language, terminologySource, catalog, ndeSchemaAp, linkedData, terms, persistentUris @@ -198,8 +254,9 @@ enum DatasetFacetField { PERSISTENT_URIS } type FacetBucket { - value: String! + value: String! # the selection key (an IRI for reference facets, else a token/string) count: Int! + label: [LanguageString!] # nullable — see below } type Facet { field: DatasetFacetField! @@ -244,7 +301,7 @@ The single, generic root resolver (shipped in the package, not emitted): 2. **Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; DR injects its policy here: default `status:=valid`; default sort `relevance` when a `query` is present else `title`; and the `status_rank` tie-break appended to either. -3. **`context.adapter.search(query, schema)` → `SearchResult`.** +3. **`context.engine.search(query, schema)` → `SearchResult`.** 4. **`SearchResult` → output** – scalars pass through; a `LocalizedValue` map → `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; reference values likewise; facets keyed logical→enum. GraphQL field selection prunes. @@ -271,7 +328,7 @@ then untagged (`und`) last – so `[0]` is always the best available value. 
```ts interface SearchContext { - adapter: SearchAdapter; // any engine + engine: SearchEngine; // the port; any engine adapter acceptLanguage: readonly string[]; // parsed, ordered; drives locale + output ordering } ``` @@ -296,6 +353,7 @@ Each transport populates it per request; no framework type appears in the packag additive `inline` growth. - Deferred: a `dataset(id)` single-resource query (detail-page-on-index direction; DR detail stays on SPARQL); cross-collection `@reference` joins beyond inline labels; cursor - pagination; a `Date` scalar (kept ISO `String`); transport-layer persisted queries / cost + pagination; a `Date` scalar (kept ISO `String`) and a `Long`/`BigInt` scalar for 64-bit + integers (kept `Float`); transport-layer persisted queries / cost limits; a root or per-field language argument (Accept-Language is the sole preference mechanism); metadata-language-availability filtering (a facetable dimension, not v1). diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md new file mode 100644 index 00000000..88f8cdb3 --- /dev/null +++ b/packages/search-api-graphql/README.md @@ -0,0 +1,55 @@ +# @lde/search-api-graphql + +The GraphQL surface for the [`@lde/search`](../search) core. **Both engine- and +domain-agnostic:** it builds an executable `GraphQLSchema` from any `SearchSchema` +at runtime, and serves it with one generic resolver over any `SearchEngine`. It +names neither your **domain** (you pass `typeName` — `Dataset`, `Person`, +`CreativeWork`, …) nor your **engine** (the resolver calls `context.engine`, be it +[`@lde/search-typesense`](../search-typesense) or another adapter). + +## Runtime configuration, not codegen + +`buildSearchSchema(schema, { typeName })` constructs the schema once at startup +from the field model — no SDL artifact, no generated resolver stubs. The field +model is the single source; the GraphQL contract is whatever it produces. 
Output +types, the `where`/`orderBy`/facet inputs, reference types and nullability are all +derived from each field’s `kind` and capability flags. + +```ts +import { buildSearchSchema } from '@lde/search-api-graphql'; + +const gqlSchema = buildSearchSchema(DATASET, { + typeName: 'Dataset', + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + }), +}); + +// Hand `gqlSchema` to any graphql-js server; populate the per-request context: +// { engine: SearchEngine, acceptLanguage: string[] } +``` + +## What it builds + +- **Output type** (`typeName`) — localized text → best-first `[LanguageString!]!` + (`[0].language` is the language actually served); references → named per-shape + types (`Organization`, `Term`) with a `name`; scalars/booleans per kind; `date` + → ISO 8601 string; nullability from `required` / `array` / `kind`. +- **`where`** — one input per `filterable` field (`StringFilter`, `IntRange` / + `FloatRange` / `DateRange`, or `Boolean`). +- **`orderBy`** — `RELEVANCE` plus every `sortable` field, as an enum. +- **Facets** — an enum of every `facetable` field; a bucket carries `value` + + `count` + a nullable `label` — the resolved data label for **reference** facets, + `null` for token/free-string facets whose display the consumer owns (its own + i18n, or the value itself). + +## Why it can’t drift + +The surface reads the same field model the index is built from, and compiles into +the same neutral `SearchQuery` the engine consumes — so the API, the index and a +future REST surface stay in lockstep. The contract is **frozen** (breaking to +change), and because it is generated rather than hand-written, a _consumer_ guards +it with a `printSearchSchema(schema, options)` SDL snapshot over its **own** +schema and `typeName` — that snapshot also catches a `buildSearchSchema` change in +a future version of this library silently altering the consumer’s contract. 
diff --git a/packages/search-api-graphql/eslint.config.mjs b/packages/search-api-graphql/eslint.config.mjs new file mode 100644 index 00000000..2dcaf60c --- /dev/null +++ b/packages/search-api-graphql/eslint.config.mjs @@ -0,0 +1,22 @@ +import baseConfig from '../../eslint.config.mjs'; + +export default [ + ...baseConfig, + { + files: ['**/*.json'], + rules: { + '@nx/dependency-checks': [ + 'error', + { + ignoredFiles: [ + '{projectRoot}/eslint.config.{js,cjs,mjs}', + '{projectRoot}/vite.config.{js,ts,mjs,mts}', + ], + }, + ], + }, + languageOptions: { + parser: await import('jsonc-eslint-parser'), + }, + }, +]; diff --git a/packages/search-api-graphql/package.json b/packages/search-api-graphql/package.json new file mode 100644 index 00000000..ea761b48 --- /dev/null +++ b/packages/search-api-graphql/package.json @@ -0,0 +1,32 @@ +{ + "name": "@lde/search-api-graphql", + "version": "0.1.0", + "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from any SearchSchema at runtime (no codegen), served by one generic resolver over any SearchEngine. 
You supply the schema and typeName; it names neither your domain nor your engine.", + "repository": { + "url": "git+https://github.com/ldelements/lde.git", + "directory": "packages/search-api-graphql" + }, + "license": "MIT", + "type": "module", + "exports": { + "./package.json": "./package.json", + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "development": "./src/index.ts", + "default": "./dist/index.js" + } + }, + "main": "./dist/index.js", + "module": "./dist/index.js", + "types": "./dist/index.d.ts", + "files": [ + "dist", + "!**/*.tsbuildinfo" + ], + "dependencies": { + "@lde/search": "^0.1.2", + "graphql": "^15.8.0", + "tslib": "^2.3.0" + } +} diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts new file mode 100644 index 00000000..fdfccf09 --- /dev/null +++ b/packages/search-api-graphql/src/build-schema.ts @@ -0,0 +1,445 @@ +import { + GraphQLBoolean, + GraphQLEnumType, + GraphQLFloat, + GraphQLInputObjectType, + GraphQLInt, + GraphQLList, + GraphQLNonNull, + GraphQLObjectType, + GraphQLSchema, + GraphQLString, + printSchema, + type GraphQLEnumValueConfigMap, + type GraphQLFieldConfig, + type GraphQLInputFieldConfig, + type GraphQLInputType, + type GraphQLOutputType, +} from 'graphql'; +import { + facetableFields, + filterableFields, + filterOperatorFor, + outputFields, + sortableFields, + type Filter, + type LocalizedValue, + type Reference, + type SearchEngine, + type SearchField, + type SearchQuery, + type SearchSchema, + type Sort, +} from '@lde/search'; +import { + defaultLanguageOrder, + toLanguageStrings, + type LanguageOrder, +} from './language.js'; + +/** Populated per request by the transport; no framework type appears here. */ +export interface SearchContext { + readonly engine: SearchEngine; + /** Parsed, ordered `Accept-Language`; drives locale selection and output order. 
*/ + readonly acceptLanguage: readonly string[]; +} + +export interface BuildSearchSchemaOptions { + /** Drives all derived type names, e.g. `Dataset`. */ + readonly typeName: string; + /** Root query field; defaults to the lowercased plural of `typeName`. */ + readonly queryField?: string; + /** Consumer policy applied to every query (default status, sort, tie-breaks). */ + readonly queryDefaults?: ( + query: SearchQuery, + context: SearchContext, + ) => SearchQuery; + /** Output-language ordering; defaults to Accept-Language-first, `und` last. */ + readonly languageOrder?: LanguageOrder; +} + +type Source = Record; + +const nonNullListOf = (type: GraphQLOutputType): GraphQLOutputType => + new GraphQLNonNull(new GraphQLList(new GraphQLNonNull(type))); + +const scalarOutput = ( + scalar: GraphQLOutputType, + field: SearchField, +): GraphQLOutputType => + field.required === true ? new GraphQLNonNull(scalar) : scalar; + +/** SCREAMING_SNAKE_CASE for an enum value name, e.g. `datePosted` → `DATE_POSTED`. */ +function screamingSnake(name: string): string { + return name.replace(/([a-z0-9])([A-Z])/g, '$1_$2').toUpperCase(); +} + +/** + * Construct an executable GraphQL schema from the unified {@link SearchField} + * model at runtime — no codegen, no SDL artifact. One generic resolver maps the + * arguments to a {@link SearchQuery}, calls `context.engine`, and maps the result + * back; the field model only parameterises data. + */ +export function buildSearchSchema( + schema: SearchSchema, + options: BuildSearchSchemaOptions, +): GraphQLSchema { + const { typeName } = options; + const languageOrder = options.languageOrder ?? defaultLanguageOrder; + const queryField = + options.queryField ?? 
+ `${typeName.charAt(0).toLowerCase()}${typeName.slice(1)}s`; + + // --- Shared types --- + const languageString = new GraphQLObjectType({ + name: 'LanguageString', + fields: { + language: { type: GraphQLString }, + value: { type: new GraphQLNonNull(GraphQLString) }, + }, + }); + const facetBucket = new GraphQLObjectType({ + name: 'FacetBucket', + fields: { + value: { type: new GraphQLNonNull(GraphQLString) }, + count: { type: new GraphQLNonNull(GraphQLInt) }, + // Nullable: the resolved data label for a reference facet, else null — + // the consumer owns display for token/free-string facets (its i18n or the + // value itself). + label: { + type: new GraphQLList(new GraphQLNonNull(languageString)), + resolve: (bucket: Source, _args: unknown, context: SearchContext) => { + const label = bucket.label as LocalizedValue | undefined; + return label + ? toLanguageStrings(label, context.acceptLanguage, languageOrder) + : null; + }, + }, + }, + }); + const sortDirection = new GraphQLEnumType({ + name: 'SortDirection', + values: { ASC: { value: 'asc' }, DESC: { value: 'desc' } }, + }); + const stringFilter = new GraphQLInputObjectType({ + name: 'StringFilter', + fields: { + in: { type: new GraphQLList(new GraphQLNonNull(GraphQLString)) }, + }, + }); + const intRange = rangeInput('IntRange', GraphQLInt); + const floatRange = rangeInput('FloatRange', GraphQLFloat); + const dateRange = rangeInput('DateRange', GraphQLString); + + const labelList = ( + resolveLabel: (source: Source) => LocalizedValue | undefined, + ) => ({ + type: nonNullListOf(languageString), + resolve: (source: Source, _args: unknown, context: SearchContext) => { + const value = resolveLabel(source); + return value + ? toLanguageStrings(value, context.acceptLanguage, languageOrder) + : []; + }, + }); + + // --- Reference types, one per referenced shape, reused by every field. 
--- + const referenceTypes = new Map(); + for (const field of outputFields(schema)) { + if ( + field.kind === 'reference' && + field.ref && + !referenceTypes.has(field.ref.type) + ) { + referenceTypes.set( + field.ref.type, + new GraphQLObjectType({ + name: field.ref.type, + fields: { + id: { + type: new GraphQLNonNull(GraphQLString), + resolve: (source: Source) => (source as unknown as Reference).id, + }, + name: labelList((source) => (source as unknown as Reference).label), + }, + }), + ); + } + } + + // --- Output type --- + const outputType = new GraphQLObjectType({ + name: typeName, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = { + id: { type: new GraphQLNonNull(GraphQLString) }, + }; + for (const field of outputFields(schema)) { + fields[field.name] = outputFieldConfig(field); + } + return fields; + }, + }); + + function outputFieldConfig( + field: SearchField, + ): GraphQLFieldConfig { + const passthrough = (source: Source) => source[field.name] ?? null; + switch (field.kind) { + case 'text': + return labelList( + (source) => source[field.name] as LocalizedValue | undefined, + ); + case 'keyword': + return field.array === true + ? { + type: nonNullListOf(GraphQLString), + resolve: (s) => s[field.name] ?? [], + } + : { type: scalarOutput(GraphQLString, field), resolve: passthrough }; + case 'reference': { + const referenceType = referenceTypes.get(field.ref?.type ?? '')!; + return field.array === true + ? { + type: nonNullListOf(referenceType), + resolve: (s) => s[field.name] ?? [], + } + : { + type: + field.required === true + ? new GraphQLNonNull(referenceType) + : referenceType, + resolve: passthrough, + }; + } + case 'integer': + return { type: scalarOutput(GraphQLInt, field), resolve: passthrough }; + case 'number': + return { + type: scalarOutput(GraphQLFloat, field), + resolve: passthrough, + }; + case 'date': + // Stored as Unix seconds (int64); the surface serves ISO 8601 (ADR 4). 
+ return { + type: scalarOutput(GraphQLString, field), + resolve: (source) => { + const value = source[field.name]; + return typeof value === 'number' + ? new Date(value * 1000).toISOString() + : (value ?? null); + }, + }; + case 'boolean': + return { + type: new GraphQLNonNull(GraphQLBoolean), + resolve: (source) => source[field.name] === true, + }; + } + } + + // --- where / orderBy / facets --- + const whereInput = new GraphQLInputObjectType({ + name: `${typeName}Where`, + fields: () => { + const fields: Record = {}; + for (const field of filterableFields(schema)) { + fields[field.name] = { type: whereFieldType(field) }; + } + return fields; + }, + }); + + function whereFieldType(field: SearchField): GraphQLInputType { + switch (filterOperatorFor(field.kind)) { + case 'in': + return stringFilter; + case 'range': + return field.kind === 'integer' + ? intRange + : field.kind === 'number' + ? floatRange + : dateRange; + default: + return GraphQLBoolean; + } + } + + const sortValues: GraphQLEnumValueConfigMap = { + RELEVANCE: { value: 'relevance' }, + }; + for (const field of sortableFields(schema)) { + sortValues[screamingSnake(field.name)] = { value: field.name }; + } + const sortField = new GraphQLEnumType({ + name: `${typeName}SortField`, + values: sortValues, + }); + const orderByInput = new GraphQLInputObjectType({ + name: `${typeName}OrderBy`, + fields: { + field: { type: new GraphQLNonNull(sortField) }, + direction: { + type: new GraphQLNonNull(sortDirection), + defaultValue: 'desc', + }, + }, + }); + + const facetValues: GraphQLEnumValueConfigMap = {}; + for (const field of facetableFields(schema)) { + facetValues[screamingSnake(field.name)] = { value: field.name }; + } + const facetField = new GraphQLEnumType({ + name: `${typeName}FacetField`, + values: facetValues, + }); + const facet = new GraphQLObjectType({ + name: 'Facet', + fields: { + field: { type: new GraphQLNonNull(facetField) }, + buckets: { type: nonNullListOf(facetBucket) }, + }, + }); + + 
const resultType = new GraphQLObjectType({ + name: `${typeName}SearchResult`, + fields: { + items: { type: nonNullListOf(outputType) }, + total: { type: new GraphQLNonNull(GraphQLInt) }, + page: { type: new GraphQLNonNull(GraphQLInt) }, + perPage: { type: new GraphQLNonNull(GraphQLInt) }, + facets: { type: nonNullListOf(facet) }, + }, + }); + + const query = new GraphQLObjectType({ + name: 'Query', + fields: { + [queryField]: { + type: new GraphQLNonNull(resultType), + args: { + query: { type: GraphQLString }, + where: { type: whereInput }, + orderBy: { type: orderByInput }, + page: { type: GraphQLInt, defaultValue: 1 }, + perPage: { type: GraphQLInt, defaultValue: 20 }, + facets: { type: new GraphQLList(new GraphQLNonNull(facetField)) }, + }, + resolve: async (_source, args, context: SearchContext) => { + const built = argsToQuery(args as QueryArgs, context, schema); + const finalQuery = options.queryDefaults + ? options.queryDefaults(built, context) + : built; + const result = await context.engine.search(finalQuery, schema); + return { + items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), + total: result.total, + page: Math.floor(finalQuery.offset / finalQuery.limit) + 1, + perPage: finalQuery.limit, + facets: Object.entries(result.facets).map(([field, buckets]) => ({ + field, + buckets, + })), + }; + }, + }, + }, + }); + + return new GraphQLSchema({ query }); +} + +/** + * The SDL of the built schema. Not a shipped artifact — a consumer uses it for an + * optional CI snapshot test over its own schema, catching accidental breaking + * changes to its frozen contract (including a `buildSearchSchema` change in a + * future version of this library silently altering it). 
+ */ +export function printSearchSchema( + schema: SearchSchema, + options: BuildSearchSchemaOptions, +): string { + return printSchema(buildSearchSchema(schema, options)); +} + +interface QueryArgs { + readonly query?: string; + readonly where?: Record; + readonly orderBy?: { field: string; direction: 'asc' | 'desc' }; + readonly page?: number; + readonly perPage?: number; + readonly facets?: readonly string[]; +} + +/** Pure args → {@link SearchQuery} mapping. */ +function argsToQuery( + args: QueryArgs, + context: SearchContext, + schema: SearchSchema, +): SearchQuery { + const perPage = args.perPage ?? 20; + const page = args.page ?? 1; + return { + text: args.query, + where: whereToFilters(args.where, schema), + orderBy: args.orderBy + ? [{ field: args.orderBy.field, direction: args.orderBy.direction }] + : [], + limit: perPage, + offset: (page - 1) * perPage, + facets: args.facets ?? [], + locale: context.acceptLanguage[0] ?? 'und', + }; +} + +function whereToFilters( + where: Record | undefined, + schema: SearchSchema, +): Filter[] { + if (where === undefined) { + return []; + } + const filters: Filter[] = []; + for (const field of filterableFields(schema)) { + const value = where[field.name]; + if (value === undefined || value === null) { + continue; + } + switch (filterOperatorFor(field.kind)) { + case 'in': + filters.push({ + field: field.name, + in: (value as { in?: string[] }).in ?? 
[], + }); + break; + case 'range': { + const range = value as { min?: number | string; max?: number | string }; + filters.push({ + field: field.name, + range: { min: range.min, max: range.max }, + }); + break; + } + default: + filters.push({ field: field.name, is: value as boolean }); + } + } + return filters; +} + +function rangeInput( + name: string, + bound: typeof GraphQLInt | typeof GraphQLFloat | typeof GraphQLString, +): GraphQLInputObjectType { + return new GraphQLInputObjectType({ + name, + fields: { min: { type: bound }, max: { type: bound } }, + }); +} + +// Re-exported for callers that compose a sort manually. +export type { Sort }; diff --git a/packages/search-api-graphql/src/index.ts b/packages/search-api-graphql/src/index.ts new file mode 100644 index 00000000..2fe7db46 --- /dev/null +++ b/packages/search-api-graphql/src/index.ts @@ -0,0 +1,7 @@ +export { buildSearchSchema, printSearchSchema } from './build-schema.js'; +export type { + SearchContext, + BuildSearchSchemaOptions, +} from './build-schema.js'; +export { defaultLanguageOrder, toLanguageStrings } from './language.js'; +export type { LanguageString, LanguageOrder } from './language.js'; diff --git a/packages/search-api-graphql/src/language.ts b/packages/search-api-graphql/src/language.ts new file mode 100644 index 00000000..96826f65 --- /dev/null +++ b/packages/search-api-graphql/src/language.ts @@ -0,0 +1,47 @@ +import type { LocalizedValue } from '@lde/search'; + +/** One entry of the surface’s best-first `[LanguageString!]!`. `language` is null + * for untagged (`und`) values; `[0]` is the value to display and `[0].language` + * is the language actually served (the per-field `Content-Language`). */ +export interface LanguageString { + readonly language: string | null; + readonly value: string; +} + +/** Orders a localized value’s available languages against the request. 
*/ +export type LanguageOrder = ( + available: readonly string[], + accept: readonly string[], +) => readonly string[]; + +/** + * Default ordering: requested languages first (in request order), then the + * remaining tagged languages, then untagged (`und`) last — so `[0]` is always the + * best available value. + */ +export const defaultLanguageOrder: LanguageOrder = (available, accept) => { + const requested = accept.filter((language) => available.includes(language)); + const rest = available.filter( + (language) => language !== 'und' && !requested.includes(language), + ); + const untagged = available.includes('und') ? ['und'] : []; + return [...requested, ...rest, ...untagged]; +}; + +/** Flatten a language map into a best-first `LanguageString` list. */ +export function toLanguageStrings( + value: LocalizedValue, + accept: readonly string[], + order: LanguageOrder, +): LanguageString[] { + const result: LanguageString[] = []; + for (const language of order(Object.keys(value), accept)) { + for (const text of value[language] ?? []) { + result.push({ + language: language === 'und' ? null : language, + value: text, + }); + } + } + return result; +} diff --git a/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap new file mode 100644 index 00000000..d1741f4e --- /dev/null +++ b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap @@ -0,0 +1,106 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`GraphQL generator stability > emits a stable SDL for a representative schema 1`] = ` +"type Query { + things(query: String, where: ThingWhere, orderBy: ThingOrderBy, page: Int = 1, perPage: Int = 20, facets: [ThingFacetField!]): ThingSearchResult! +} + +type ThingSearchResult { + items: [Thing!]! + total: Int! + page: Int! + perPage: Int! + facets: [Facet!]! +} + +type Thing { + id: String! 
+ title: [LanguageString!]! + description: [LanguageString!]! + keyword: [String!]! + creator: [Agent!]! + publisher: Agent + size: Int + score: Float + created: String + status: String! + open: Boolean! +} + +type LanguageString { + language: String + value: String! +} + +type Agent { + id: String! + name: [LanguageString!]! +} + +type Facet { + field: ThingFacetField! + buckets: [FacetBucket!]! +} + +enum ThingFacetField { + KEYWORD + CREATOR + PUBLISHER + STATUS + OPEN +} + +type FacetBucket { + value: String! + count: Int! + label: [LanguageString!] +} + +input ThingWhere { + keyword: StringFilter + creator: StringFilter + publisher: StringFilter + size: IntRange + score: FloatRange + created: DateRange + status: StringFilter + open: Boolean +} + +input StringFilter { + in: [String!] +} + +input IntRange { + min: Int + max: Int +} + +input FloatRange { + min: Float + max: Float +} + +input DateRange { + min: String + max: String +} + +input ThingOrderBy { + field: ThingSortField! + direction: SortDirection! 
= DESC +} + +enum ThingSortField { + RELEVANCE + TITLE + SIZE + CREATED +} + +enum SortDirection { + ASC + DESC +} +" +`; diff --git a/packages/search-api-graphql/test/build-schema.test.ts b/packages/search-api-graphql/test/build-schema.test.ts new file mode 100644 index 00000000..b61ba240 --- /dev/null +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -0,0 +1,349 @@ +import { describe, expect, it } from 'vitest'; +import { graphql, printSchema } from 'graphql'; +import type { + SearchEngine, + SearchQuery, + SearchResult, + SearchSchema, +} from '@lde/search'; +import { buildSearchSchema, type SearchContext } from '../src/build-schema.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'Organization', strategy: 'labelOnly' }, + }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + output: true, + }, + { name: 'datePosted', kind: 'date', sortable: true, output: true }, + { name: 'score', kind: 'number', output: true }, + { + name: 'terminologySource', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'Term', strategy: 'labelOnly' }, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + required: true, + output: true, + }, + { + name: 'iiif', + kind: 'boolean', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +/** A fake engine that records the query it received and returns a canned result. 
*/ +function fakeEngine(result: SearchResult): { + engine: SearchEngine; + received: () => SearchQuery; +} { + let captured: SearchQuery; + return { + engine: { + async search(query) { + captured = query; + return result; + }, + }, + received: () => captured, + }; +} + +const canned: SearchResult = { + total: 1, + hits: [ + { + id: 'https://d/1', + document: { + title: { nl: ['Titel'], en: ['Title'] }, + keyword: ['kaarten'], + publisher: { + id: 'https://org/1', + label: { nl: ['Het Utrechts Archief'] }, + }, + size: 1234, + datePosted: 1_700_000_000, + score: 4.5, + terminologySource: [ + { id: 'https://term/1', label: { nl: ['Kaarten'] } }, + ], + status: 'valid', + iiif: true, + }, + }, + ], + facets: { keyword: [{ value: 'kaarten', count: 3 }] }, +}; + +async function run( + source: string, + context: SearchContext, + variables?: Record, +) { + return graphql({ + schema: buildSearchSchema(schema, { typeName: 'Dataset' }), + source, + contextValue: context, + variableValues: variables, + }); +} + +describe('buildSearchSchema', () => { + it('resolves a query, mapping the result to the typed output', async () => { + const { engine, received } = fakeEngine(canned); + const result = await run( + `{ + datasets(query: "kaart") { + total + page + perPage + items { + id + title { language value } + keyword + publisher { id name { language value } } + terminologySource { id name { language value } } + size + datePosted + score + status + iiif + } + facets { field buckets { value count } } + } + }`, + { engine, acceptLanguage: ['nl'] }, + ); + + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.total).toBe(1); + expect(data.page).toBe(1); + const item = (data.items as Record[])[0]; + expect(item.id).toBe('https://d/1'); + expect(item.title).toEqual([ + { language: 'nl', value: 'Titel' }, + { language: 'en', value: 'Title' }, + ]); + expect(item.keyword).toEqual(['kaarten']); + expect(item.publisher).toEqual({ + id: 
'https://org/1', + name: [{ language: 'nl', value: 'Het Utrechts Archief' }], + }); + expect(item.size).toBe(1234); + expect(item.datePosted).toBe('2023-11-14T22:13:20.000Z'); + expect(item.score).toBe(4.5); + expect(item.terminologySource).toEqual([ + { id: 'https://term/1', name: [{ language: 'nl', value: 'Kaarten' }] }, + ]); + expect(item.iiif).toBe(true); + expect(data.facets).toEqual([ + { field: 'KEYWORD', buckets: [{ value: 'kaarten', count: 3 }] }, + ]); + // The free-text arg became the query text. + expect(received().text).toBe('kaart'); + }); + + it('orders the output list best-first for the requested language', async () => { + const { engine } = fakeEngine(canned); + const result = await run( + `{ datasets { items { title { language value } } } }`, + { engine, acceptLanguage: ['en'] }, + ); + const item = ( + (result.data?.datasets as Record).items as Record< + string, + unknown + >[] + )[0]; + expect(item.title).toEqual([ + { language: 'en', value: 'Title' }, + { language: 'nl', value: 'Titel' }, + ]); + }); + + it('places untagged (und) values last with a null language', async () => { + const { engine } = fakeEngine({ + total: 1, + facets: {}, + hits: [ + { + id: 'x', + document: { title: { nl: ['Titel'], und: ['Naamloos'] } }, + }, + ], + }); + const result = await run( + `{ datasets { items { title { language value } } } }`, + { engine, acceptLanguage: ['en'] }, + ); + const item = ( + (result.data?.datasets as Record).items as Record< + string, + unknown + >[] + )[0]; + expect(item.title).toEqual([ + { language: 'nl', value: 'Titel' }, + { language: null, value: 'Naamloos' }, + ]); + }); + + it('labels reference-facet buckets, leaving plain-facet buckets null', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { + publisher: [ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + ], + keyword: [{ value: 'kaarten', count: 3 }], + }, + }); + const result = await run( + `{ datasets 
{ facets { field buckets { value count label { language value } } } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as { field: string; buckets: unknown[] }[]; + const publisher = facets.find((facet) => facet.field === 'PUBLISHER'); + const keyword = facets.find((facet) => facet.field === 'KEYWORD'); + expect(publisher?.buckets).toEqual([ + { + value: 'https://org/1', + count: 2, + label: [{ language: 'nl', value: 'Het Utrechts Archief' }], + }, + ]); + expect(keyword?.buckets).toEqual([ + { value: 'kaarten', count: 3, label: null }, + ]); + }); + + it('maps where, orderBy, facets and pagination into the SearchQuery', async () => { + const { engine, received } = fakeEngine(canned); + await run( + `{ + datasets( + where: { status: { in: ["valid"] }, keyword: {}, size: { min: 1, max: 9 }, iiif: true } + orderBy: { field: SIZE, direction: ASC } + page: 3 + perPage: 10 + facets: [KEYWORD, PUBLISHER] + ) { total } + }`, + { engine, acceptLanguage: ['nl'] }, + ); + + const query = received(); + expect(query.where).toContainEqual({ field: 'status', in: ['valid'] }); + // An empty StringFilter compiles to an empty membership. 
+ expect(query.where).toContainEqual({ field: 'keyword', in: [] }); + expect(query.where).toContainEqual({ + field: 'size', + range: { min: 1, max: 9 }, + }); + expect(query.where).toContainEqual({ field: 'iiif', is: true }); + expect(query.orderBy).toEqual([{ field: 'size', direction: 'asc' }]); + expect(query.facets).toEqual(['keyword', 'publisher']); + expect(query.limit).toBe(10); + expect(query.offset).toBe(20); + }); + + it('falls back to the und locale when no Accept-Language is given', async () => { + const { engine, received } = fakeEngine(canned); + await run(`{ datasets { total } }`, { engine, acceptLanguage: [] }); + expect(received().locale).toBe('und'); + }); + + it('applies queryDefaults before calling the engine', async () => { + let captured: SearchQuery | undefined; + const engine: SearchEngine = { + async search(query) { + captured = query; + return canned; + }, + }; + const gqlSchema = buildSearchSchema(schema, { + typeName: 'Dataset', + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + orderBy: [{ field: 'relevance', direction: 'desc' }], + }), + }); + await graphql({ + schema: gqlSchema, + source: `{ datasets { total } }`, + contextValue: { engine, acceptLanguage: ['nl'] }, + }); + expect(captured?.where).toEqual([{ field: 'status', in: ['valid'] }]); + expect(captured?.orderBy).toEqual([ + { field: 'relevance', direction: 'desc' }, + ]); + }); + + it('derives nullability: required scalar non-null, optional scalar nullable, arrays/booleans non-null', () => { + const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); + expect(sdl).toMatch(/status: String!/); // required + expect(sdl).toMatch(/size: Int\b(?!!)/); // optional → nullable + expect(sdl).toMatch(/title: \[LanguageString!\]!/); + expect(sdl).toMatch(/keyword: \[String!\]!/); + expect(sdl).toMatch(/iiif: Boolean!/); + expect(sdl).toMatch(/publisher: Organization\b(?!!)/); // optional reference + }); + + 
it('builds the where, orderBy and facet enums from the field model', () => { + const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); + expect(sdl).toMatch(/enum DatasetSortField/); + expect(sdl).toMatch(/RELEVANCE/); + expect(sdl).toMatch(/SIZE/); + expect(sdl).toMatch(/enum DatasetFacetField/); + expect(sdl).toMatch(/input DatasetWhere/); + expect(sdl).toMatch(/status: StringFilter/); + expect(sdl).toMatch(/size: IntRange/); + }); +}); diff --git a/packages/search-api-graphql/test/generator-stability.test.ts b/packages/search-api-graphql/test/generator-stability.test.ts new file mode 100644 index 00000000..78a86f40 --- /dev/null +++ b/packages/search-api-graphql/test/generator-stability.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchSchema } from '@lde/search'; +import { printSearchSchema } from '../src/build-schema.js'; + +/** + * A neutral fixture exercising every kind + capability — NOT a real domain. Its + * SDL is snapshotted purely to pin the **generator**: any change to how + * `buildSearchSchema` maps the field model (nullability, type names, enums, + * reference reuse) surfaces as a snapshot diff before this library is published, + * so a consumer’s contract can’t shift from under it by accident. + */ +const THING: SearchSchema = { + type: 'https://example.org/Thing', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + required: true, + }, + { + name: 'description', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 2 }, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + output: true, + }, + // Two references sharing a shape → the Agent type is emitted once and reused. 
+ { + name: 'creator', + kind: 'reference', + array: true, + facetable: true, + filterable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + output: true, + }, + { name: 'score', kind: 'number', filterable: true, output: true }, + { + name: 'created', + kind: 'date', + filterable: true, + sortable: true, + output: true, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + required: true, + output: true, + }, + { + name: 'open', + kind: 'boolean', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +describe('GraphQL generator stability', () => { + it('emits a stable SDL for a representative schema', () => { + expect(printSearchSchema(THING, { typeName: 'Thing' })).toMatchSnapshot(); + }); +}); diff --git a/packages/search-api-graphql/tsconfig.json b/packages/search-api-graphql/tsconfig.json new file mode 100644 index 00000000..62ebbd94 --- /dev/null +++ b/packages/search-api-graphql/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "../../tsconfig.base.json", + "files": [], + "include": [], + "references": [ + { + "path": "./tsconfig.lib.json" + }, + { + "path": "./tsconfig.spec.json" + } + ] +} diff --git a/packages/search-api-graphql/tsconfig.lib.json b/packages/search-api-graphql/tsconfig.lib.json new file mode 100644 index 00000000..64610bac --- /dev/null +++ b/packages/search-api-graphql/tsconfig.lib.json @@ -0,0 +1,26 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "src", + "outDir": "dist", + "tsBuildInfoFile": "dist/tsconfig.lib.tsbuildinfo", + "emitDeclarationOnly": false, + "types": ["node"] + }, + "include": ["src/**/*.ts"], + "references": [{ "path": "../search/tsconfig.lib.json" }], + "exclude": [ + 
"vite.config.ts", + "vite.config.mts", + "vitest.config.ts", + "vitest.config.mts", + "test/**/*.test.ts", + "test/**/*.spec.ts", + "test/**/*.test.tsx", + "test/**/*.spec.tsx", + "test/**/*.test.js", + "test/**/*.spec.js", + "test/**/*.test.jsx", + "test/**/*.spec.jsx" + ] +} diff --git a/packages/search-api-graphql/tsconfig.spec.json b/packages/search-api-graphql/tsconfig.spec.json new file mode 100644 index 00000000..04480f69 --- /dev/null +++ b/packages/search-api-graphql/tsconfig.spec.json @@ -0,0 +1,29 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "outDir": "./out-tsc/vitest", + "types": [ + "vitest/globals", + "vitest/importMeta", + "vite/client", + "node", + "vitest" + ] + }, + "include": [ + "test/**/*.test.ts", + "test/**/*.spec.ts", + "test/**/*.test.tsx", + "test/**/*.spec.tsx", + "test/**/*.test.js", + "test/**/*.spec.js", + "test/**/*.test.jsx", + "test/**/*.spec.jsx", + "test/**/*.d.ts" + ], + "references": [ + { + "path": "./tsconfig.lib.json" + } + ] +} diff --git a/packages/search-api-graphql/vite.config.ts b/packages/search-api-graphql/vite.config.ts new file mode 100644 index 00000000..725cf854 --- /dev/null +++ b/packages/search-api-graphql/vite.config.ts @@ -0,0 +1,21 @@ +/// +import { defineConfig, mergeConfig } from 'vite'; +import baseConfig from '../../vite.base.config.js'; + +export default mergeConfig( + baseConfig, + defineConfig({ + root: __dirname, + cacheDir: '../../node_modules/.vite/packages/search-api-graphql', + test: { + coverage: { + thresholds: { + functions: 90, + lines: 90, + branches: 78, + statements: 90, + }, + }, + }, + }), +); diff --git a/tsconfig.json b/tsconfig.json index 0b6d2b2c..0defc069 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -76,6 +76,9 @@ }, { "path": "./packages/search" + }, + { + "path": "./packages/search-api-graphql" } ] } From a1dabb2ab1304e289d32db251ec1cba0e9554a5b Mon Sep 17 00:00:00 2001 From: David de Boer Date: Mon, 29 Jun 2026 09:38:15 +0200 Subject: [PATCH 04/35] 
docs(search): reconcile ADRs 0003 and 0004 with the NDE stack docs - state the decisions directly as the reconciled architecture, not deviations from a draft - remove the deviation/reconcile framing and the deviations-to-reconcile lists - align wording with the stack platform layer --- .../0003-search-api-core-query-model.md | 44 +++++++------------ .../0004-search-api-graphql-surface.md | 28 +++++------- 2 files changed, 27 insertions(+), 45 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 57521fad..c093a6f8 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -6,10 +6,9 @@ Date: 2026-06-25 Proposed -Reconciles against the NDE stack platform docs -(`netwerk-digitaal-erfgoed/docs` → `docs/stack/layers/platform.md`), which are themselves -a **draft under discussion**, so several decisions below are deliberate deviations from -the current draft, to be reconciled back into it. +Aligned with the NDE stack platform docs +(`netwerk-digitaal-erfgoed/docs` → `docs/stack/layers/platform.md`); the decisions below are +reflected there. ## Context @@ -19,10 +18,9 @@ declarative source so the GraphQL surface, a later REST surface, and the index c from each other, and so a deployment can swap search engines without consumers noticing. That requires an engine- and protocol-neutral **core** that both API surfaces and any -engine adapter sit on. The platform draft frames this as Ports & Adapters with a framed -JSON-LD intermediate representation, generated from SHACL + a `search:` annotation -vocabulary. We adopt that direction but scope it to what a v1 keyword search needs, and -diverge on a few concrete points where the draft does not fit DR’s catalog-search case. +engine adapter sit on. 
The architecture is Ports & Adapters with a framed JSON-LD +intermediate representation, generated from SHACL + a `search:` annotation vocabulary, +scoped here to what a v1 keyword search needs. ## Decision @@ -37,9 +35,6 @@ Two tiers: `search-*` is backend you compose; `search-api-*` is the surface you | API surface | `@lde/search-api-graphql` | field model + `SearchQuery` → GraphQL schema (runtime configuration; see [ADR 4](./0004-search-api-graphql-surface.md)) | | API surface | `@lde/search-api-rest` | OpenAPI + route handlers (later, thin over the core) | -This deviates from the draft’s function-mapping table (`@lde/graphql-server`, -`@lde/rest-server`, no core row); the draft should adopt the `@lde/search*` family. - ### Contract frozen, storage swappable The **API contract** (the SDL shape consumers couple to) is breaking to change and must be @@ -254,7 +249,7 @@ per-shape types (e.g. `Organization`, `Term`) with `label` exposed as `name` - **IR / adapter-return:** JSON-LD language map (`@container: @language`), `@set` arrays, `und` for untagged. Matches schema-profile #171 (language maps are more usable as a data - model) and the platform draft’s envelope. + model) and the stack platform envelope. - **GraphQL surface:** a single **best-first** `Accept-Language`-ordered list (`[LanguageString!]!`, see [ADR 4](./0004-search-api-graphql-surface.md)). `[0]` is the value to display; **`[0].language` is the language actually served** – the per-field @@ -270,7 +265,7 @@ argument (deferred): a parallel arg would duplicate the header and need preceden Chosen over a `{nl,en}` map (silently yields `undefined` for a missing language, no defined fallback order) and over a separate resolved scalar (the value must be a `LanguageString` to carry its language anyway, so the scalar saved only the `[0]` index – not worth a second -field plus a deviation from the draft / Network-of-Terms list shape). 
Grounded in measured +field plus diverging from the Network-of-Terms list shape). Grounded in measured data and all three substrates: - **A (descriptions, measured):** bilingual `nl`/`en`, ~86% Dutch-only → an English user gets @@ -284,31 +279,26 @@ have an English title) is distinct from content `dct:language` (already filterab preference; expressible as a facetable dimension (languages-present-in-a-localized-field), not enabled for DR v1, more relevant for B/C. -### Other reconciled decisions +### Other decisions - **Numbered pagination** (`offset`/`limit`, presented as page/per-page), not Relay cursors. DR is a page-numbered faceted browser with totals; Typesense is natively page/per-page; the ~2,500-doc corpus never paginates deep enough for offset cost to bite; and the blue/green alias swap removes the mutation-drift that motivates cursors. - **Sidecar canonical labels**, not inline `labelOnly` as default. Facets need one - canonical label per entity; the draft’s own two-source model puts canonical labels in a - separate collection, which is what DR’s `labels` collection is. `nestedStrategy` is - carried as metadata but inline `labelOnly` is not the default. -- **Logical typed result document** at the query seam; framed JSON-LD kept index-side. The - draft treats framed JSON-LD as the universal IR; we scope it to the index/projection - artifact (its payoff – vector/LDES/UI sinks – is object-search’s, not catalog-search’s), - gated on the generic framing packages existing rather than on DR. + canonical label per entity, kept in a separate collection — DR’s `labels` collection. A + reference’s `strategy` is carried as metadata; `labelOnly` is the v1 default, not inline. +- **Logical typed result document** at the query seam; framed JSON-LD kept index-side as the + index/projection artifact (its payoff – vector/LDES/UI sinks – is object-search’s, not + catalog-search’s), gated on the generic framing packages existing rather than on DR. 
## Consequences - One declarative source drives GraphQL, later REST, and the index; they cannot drift. - The engine is a swappable adapter; the contract outlives engine choices. -- Adopted from the draft unchanged: the Stable API Contract discipline, `nestedStrategy` as - a concept, the surface `LanguageString` list, folding at the adapter boundary + query - side via `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. -- Deviations to reconcile into the platform draft: numbered pagination; sidecar labels; - logical result doc (framed JSON-LD scoped to index-side); `min`/`max` filter ranges; the - `@lde/search*` naming and a core package row. +- Carried through: the Stable API Contract discipline, the reference `strategy` concept, the + surface `LanguageString` list, folding at the adapter boundary + query side via + `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. - Adopted during implementation (2026-06-26): the **unified** field model – the projection `FieldSpec` (RDF→doc) and the deployment’s Typesense `SEARCH_FIELDS` are folded into this one `SearchField` (see the Field model note above). diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index 54c34000..f16c066b 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -11,8 +11,8 @@ Builds on [ADR 3 (Search API core query model)](./0003-search-api-core-query-mod ## Context Given the engine-neutral core of [ADR 3](./0003-search-api-core-query-model.md), the first -API surface is GraphQL. The platform draft requires the surface to be derived from the same -source as the index, never hand-written, so it cannot drift. It must also be framework-free: +API surface is GraphQL. The surface is derived from the same source as the index, never +hand-written, so it cannot drift. 
It must also be framework-free: resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any GraphQL server can host the schema (DR mounts it inline; a Fastify wrapper is deferred and, if ever built, is a separate package). @@ -21,12 +21,11 @@ is a separate package). ### Runtime configuration, not code generation -The platform draft frames this as _generating_ the surface – emitting GraphQL SDL **and** -resolvers as artifacts. We deviate: nothing is emitted or committed. The schema is -**constructed at runtime from the field-model configuration** (`buildSearchSchema(config)`), -once at startup, and the resolvers are **generic functions inside the package** attached to -that schema. A better name for the draft’s “generation” step, at least for this surface, is -**runtime configuration**. +The surface is **constructed at runtime from the field-model configuration** +(`buildSearchSchema(config)`), once at startup, with the resolvers as **generic functions +inside the package** attached to that schema. Nothing is emitted or committed — there is no +generated GraphQL SDL or resolver artifact. The accurate name for this step is **runtime +configuration**, not generation. This matters because the resolvers are inherently generic – there is essentially one root resolver that maps args to a `SearchQuery`, calls the engine, and maps the result back; @@ -38,10 +37,6 @@ need no committed `.graphql` file. The field-model diff is the reviewable change `printSchema()` helper exists only as an **optional** CI snapshot test for catching accidental breaking changes to the frozen contract – not a shipped artifact. -> Deviation from the stack draft: the draft’s “generate SDL + resolvers” becomes -> _construct the schema at runtime from configuration; resolvers are generic and in-package; -> SDL is served live via introspection, not emitted._ For the reconciliation list. 
- ### The schema-building function ```ts @@ -345,12 +340,9 @@ Each transport populates it per request; no framework type appears in the packag facet types. Breaking to change – right in v1. - **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes facets, the `SearchDocument` shape. -- **Deviations to reconcile into the platform draft:** - - “generate SDL + resolvers” → _runtime configuration_ (construct at startup from config; - generic in-package resolvers; SDL served via introspection, not emitted as an artifact). - - Named reference types per shape (`Organization`, `Term`) rather than the draft’s uniform - `labelOnly` `{ @id, @type, name }` reference shape – chosen for ergonomics and - additive `inline` growth. +- **Named reference types** per shape (`Organization`, `Term`) rather than a single uniform + reference type – chosen for ergonomics and additive `inline` growth (`labelOnly` → `inline` + only adds fields, non-breaking). - Deferred: a `dataset(id)` single-resource query (detail-page-on-index direction; DR detail stays on SPARQL); cross-collection `@reference` joins beyond inline labels; cursor pagination; a `Date` scalar (kept ISO `String`) and a `Long`/`BigInt` scalar for 64-bit From a4e295a4d417672921ed9d1e014458098ef177ce Mon Sep 17 00:00:00 2001 From: David de Boer Date: Mon, 29 Jun 2026 10:30:08 +0200 Subject: [PATCH 05/35] feat(search): project number-kind fields - number fields now project as floats (not truncated like integer) - closes the step-1 gap so an int64-magnitude field mapped to number (Float) indexes --- packages/search/src/project.ts | 14 ++++++++++++-- packages/search/test/project.test.ts | 11 +++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/packages/search/src/project.ts b/packages/search/src/project.ts index 284c3183..71e2416e 100644 --- a/packages/search/src/project.ts +++ b/packages/search/src/project.ts @@ -78,6 +78,12 @@ function applyField( field.name, 
toInteger(firstLiteralOf(node, path)), ); + case 'number': + return setNumber( + document, + field.name, + toNumber(firstLiteralOf(node, path)), + ); case 'date': return setNumber( document, @@ -85,8 +91,8 @@ function applyField( isoToUnix(firstLiteralOf(node, path)), ); } - // `number` and `boolean` are not projected from a path in current schemas - // (booleans are derivation-populated, e.g. the compatibility vinkjes). + // `boolean` is not projected from a path in current schemas — booleans are + // derivation-populated (e.g. the compatibility vinkjes). } /** @@ -236,6 +242,10 @@ function toInteger(literal: string | undefined): number | undefined { return literal === undefined ? undefined : Math.trunc(Number(literal)); } +function toNumber(literal: string | undefined): number | undefined { + return literal === undefined ? undefined : Number(literal); +} + function isoToUnix(iso: string | undefined): number | undefined { if (iso === undefined) { return undefined; diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index 8f513baa..592caac6 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -146,6 +146,17 @@ describe('projectDocument', () => { expect(document.class).toEqual(['http://example.org/BareClass']); }); + it('projects a number field as a float (not truncated like integer)', () => { + const document = projectDocument( + { '@id': 'https://ex/d/12', [`${DR}size`]: { '@value': '1234.5' } }, + { + type: DATASET, + fields: [{ name: 'size', path: `${DR}size`, kind: 'number' }], + }, + ); + expect(document.size).toBe(1234.5); + }); + it('folds the transformed values (not the raw ones) for a facet search field', () => { const document = projectDocument( { '@id': 'https://ex/d/4', [`${DR}format`]: [`${IANA}text/turtle`] }, From 79eaad3d12f0d51d38288e86eff4e58b146e0598 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Mon, 29 Jun 2026 12:11:43 +0200 Subject: [PATCH 06/35] docs(search): 
link ADR 3 to the published stack platform docs Replace the repo-path breadcrumb with a direct link to the docs site, so the status note points readers at the rendered page rather than a source file path. --- docs/decisions/0003-search-api-core-query-model.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index c093a6f8..e931d849 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -6,9 +6,8 @@ Date: 2026-06-25 Proposed -Aligned with the NDE stack platform docs -(`netwerk-digitaal-erfgoed/docs` → `docs/stack/layers/platform.md`); the decisions below are -reflected there. +Aligned with the NDE [stack platform docs](https://docs.nde.nl/stack/layers/platform); the +decisions below are reflected there. ## Context From bda62cf59f0da53307a5fdae5a7b1f5ace0450e6 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Wed, 1 Jul 2026 09:47:38 +0200 Subject: [PATCH 07/35] feat(search)!: keyed facet surface, range facets, label cache; remove the group companion - Keyed per-type facets object on the GraphQL surface (ValueBucket / RangeBucket), selection-is-the-request with skip-own-filter. - Numeric range facets and an opt-in label cache in the Typesense adapter. - Reconcile ADRs 0003 and 0004 with the implementation. BREAKING CHANGE: remove SearchField.group and its *_group companion field, collection column and query split. Deployments denormalize group tokens into the field values instead, so a group is an ordinary facet value with no engine mechanism. 
--- .../0003-search-api-core-query-model.md | 36 +- .../0004-search-api-graphql-surface.md | 256 +++++++------- .../search-api-graphql/src/build-schema.ts | 125 +++++-- .../generator-stability.test.ts.snap | 23 +- .../test/build-schema.test.ts | 182 +++++++++- packages/search-api-graphql/vite.config.ts | 8 +- .../search-typesense/src/collection-schema.ts | 8 - .../search-typesense/src/query-compiler.ts | 76 ++-- packages/search-typesense/src/search.ts | 212 ++++++++++-- .../generator-stability.test.ts.snap | 6 - .../test/collection-schema.test.ts | 10 - .../test/generator-stability.test.ts | 5 +- .../test/parse-response.test.ts | 326 +++++++++++++++++- .../test/query-compiler.test.ts | 53 ++- packages/search-typesense/vite.config.ts | 8 +- packages/search/src/engine.ts | 13 +- packages/search/src/index.ts | 1 + packages/search/src/schema.ts | 33 +- packages/search/test/schema.test.ts | 21 -- packages/search/vite.config.ts | 6 +- 20 files changed, 1071 insertions(+), 337 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index e931d849..38f9e697 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -72,6 +72,7 @@ interface SearchField { readonly kind: FieldKind; readonly path?: string; // sh:path to project from; omit for a derivation-populated field readonly array?: boolean; // sh:maxCount + readonly required?: boolean; // sh:minCount ≥ 1 — non-null in output, non-optional in the index readonly localized?: boolean; // rdf:langString / sh:languageIn (text only) readonly locales?: readonly string[]; // when localized: which languages to emit readonly output?: boolean; // appears in the schema output type @@ -81,7 +82,7 @@ interface SearchField { readonly sortable?: boolean; readonly ref?: { type: string; strategy: 'labelOnly' | 'idOnly' | 'inline' }; // kind: 'reference' readonly transform?: (value: string) => string; // 
projection-time value transform - readonly group?: { readonly name: string; readonly prefix: string }; // deployment delta + readonly facetRanges?: readonly FacetRange[]; // numeric facet: fixed [min, max) range bins (histogram) vs per-value buckets } type Derivation = (document: SearchDocument, node: FramedNode) => void; @@ -89,7 +90,7 @@ type Derivation = (document: SearchDocument, node: FramedNode) => void; interface SearchSchema { readonly type: string; // sh:targetClass readonly fields: readonly SearchField[]; - readonly derivations?: readonly Derivation[]; // computed fields: status, *_group, booleans + readonly derivations?: readonly Derivation[]; // computed fields: status, booleans } ``` @@ -100,10 +101,11 @@ eventual generator emits it unchanged. A field with **no `path`** is a derived f populated by a `Derivation` rather than projected from the IR – yet it still carries full query/schema/output behavior, which is how the former separate projection `FieldSpec` is subsumed. The physical field names a declaration fans out to (`${name}_search_${locale}`, -`${name}_sort_${locale}`, `${name}_search`, `${name}_group`) follow one convention owned by -`@lde/search`, so projection, collection schema and query compiler agree. The `group` -companion (coarse grouped facets, e.g. `format_group`) and the `status_rank` tie-break sort -are **deployment-specific deltas**, never in `@lde/search`. `relevance` is _not_ a delta: +`${name}_sort_${locale}`, `${name}_search`) follow one convention owned by +`@lde/search`, so projection, collection schema and query compiler agree. The `status_rank` +tie-break sort is a **deployment-specific delta**, never in `@lde/search`. Grouped facets need +no field-model mechanism at all: a deployment derivation materializes group tokens (e.g. +`group:rdf`) into the field’s own values – see Consequences. `relevance` is _not_ a delta: every full-text engine ranks by match score, so it is a generic reserved sort the adapter understands. 
@@ -164,8 +166,26 @@ variable-based clients (`$o: DatasetOrderBy`) break, so a future array is a deli **Inclusive bounds only** – `min`/`max`, no `gt`/`gte`/`lt`/`lte`: self-documenting, matches Typesense’s native inclusive range, covers every DR case, additively reversible. -Grouped facets need no special shape – `group:`-prefixed tokens travel as ordinary `in` -strings and the adapter splits/unions them. +A numeric facet returns **range buckets** (`[min, max)` bins declared per field); the adapter +maps them to the engine’s native range faceting. + +**Grouped facets need no special engine mechanism; they are denormalized at index time.** +A coarse category alongside granular values (e.g. `group:rdf` next to media types, `group:person` +next to class IRIs) is materialized into the field’s own values during projection, so at query +time a group token is an ordinary value: faceted natively, filtered by plain membership +(`field.in: ["group:rdf"]` unions with granular values for free), and — where the field is +`output` – read like any other value. There is no `_group` companion, no `group:`-prefix split, +no filter rewriting in the adapter; the engine stays dumb and denormalization (the document +store’s strength) does the work. A cross-source signal that is not a subset of the field (e.g. a +SPARQL capability derived from `conformsTo`, not a media type) is likewise materialized as a plain +value by a deployment derivation. + +The trade-off this design accepts: **group membership is fixed at index time.** Because the +group token is baked into each document’s values during projection, redefining a group (which +granular values map to `group:rdf`) is an index-data change that takes effect only on **reindex** – +there is no query-time mapping to edit. 
The constraint is acceptable here because group definitions +are deployment projection config that already drives indexing, and reindexing is already the +pipeline’s job; it would not suit a system where grouping is user-defined or changes frequently. ### Engine port and result diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index f16c066b..c5b297da 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -11,31 +11,26 @@ Builds on [ADR 3 (Search API core query model)](./0003-search-api-core-query-mod ## Context Given the engine-neutral core of [ADR 3](./0003-search-api-core-query-model.md), the first -API surface is GraphQL. The surface is derived from the same source as the index, never -hand-written, so it cannot drift. It must also be framework-free: -resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any GraphQL server -can host the schema (DR mounts it inline; a Fastify wrapper is deferred and, if ever built, -is a separate package). +API surface is GraphQL, derived from the same source as the index so it cannot drift. It must +be framework-free: resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any +GraphQL server can host the schema (DR mounts it inline; a Fastify wrapper is a deferred +separate package). ## Decision ### Runtime configuration, not code generation The surface is **constructed at runtime from the field-model configuration** -(`buildSearchSchema(config)`), once at startup, with the resolvers as **generic functions -inside the package** attached to that schema. Nothing is emitted or committed — there is no -generated GraphQL SDL or resolver artifact. The accurate name for this step is **runtime -configuration**, not generation. 
- -This matters because the resolvers are inherently generic – there is essentially one root -resolver that maps args to a `SearchQuery`, calls the engine, and maps the result back; -the field model only parameterises data. Codegen would emit N near-identical resolver stubs +(`buildSearchSchema(config)`), once at startup, with generic resolvers shipped in the package +attached to that schema – nothing is emitted or committed. The resolvers are inherently +generic (one root resolver maps args to a `SearchQuery`, calls the engine, and maps the result +back; the field model only parameterises data), so codegen would emit N near-identical stubs that all delegate to the same logic, plus a build step and staleness risk, for no benefit. -**No SDL artifact.** A live GraphQL API serves its own schema via introspection, so clients -need no committed `.graphql` file. The field-model diff is the reviewable change. A -`printSchema()` helper exists only as an **optional** CI snapshot test for catching -accidental breaking changes to the frozen contract – not a shipped artifact. +A live GraphQL API serves its own schema via introspection, so clients need no committed +`.graphql` file; the field-model diff is the reviewable change. `printSearchSchema()` exists +only as an **optional** CI snapshot test guarding the frozen contract against accidental +breaking changes – not a shipped artifact. ### The schema-building function @@ -80,21 +75,17 @@ composes the exported typeDefs/resolvers by hand. 
### A typed surface the contract does not depend on -Because `buildSearchSchema` is generic over the config _value_ (``), one -`as const satisfies SearchSchema` declaration drives two **independent** projections: +One `as const satisfies SearchSchema` declaration drives two **independent** projections: the +**runtime contract** (the `GraphQLSchema`, built at startup by reading the value – +`field.kind`, `output`, `facetable`, …) and a **static TS mirror** (`OutputOf` / +`WhereOf` / `OrderByOf` / `FacetOf`, computed from `typeof schema` via mapped types). -- **the runtime contract** – the `GraphQLSchema`, built at startup by reading the value - (`field.kind`, `output`, `facetable`, …); and -- **a static TS mirror** – `OutputOf` / `WhereOf` / `OrderByOf` / `FacetOf`, - computed from `typeof schema` via mapped types. - -The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time -only and TS types are erased, so the served schema is byte-identical whether or not the -mirror types exist – they are a developer-experience overlay, never the source. The two are -parallel derivations of one value: the runtime kind→GraphQL-type mapping lives in -`buildSearchSchema`; the type-level mapping in `OutputOf` duplicates it. They can drift, -so the **contract** is guarded by the optional `printSearchSchema()` SDL snapshot test (the -real artifact), while the TS mirror only catches our own coding mistakes against it. +The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time only +and erased, so the served schema is byte-identical whether or not the mirror exists – it is a +developer-experience overlay. 
The two derivations can drift (the runtime kind→GraphQL-type +mapping lives in `buildSearchSchema`; the type-level mapping in `OutputOf` duplicates it), +so the **contract** is guarded by the optional `printSearchSchema()` SDL snapshot (the real +artifact), while the TS mirror only catches our own coding mistakes against it. Values are typed at both ends, with the resolver as the typed transform between them: @@ -103,26 +94,27 @@ Values are typed at both ends, with the resolver as the typed transform between | IR (`ResultDocument`) | `LocalizedValue` (lang map) | `Reference` | `number` | `readonly string[]` | `boolean` | | GraphQL (`OutputOf`) | `LanguageString[]` (best-first list) | named type (`Organization`) | `Float`/`number` | `[String!]!`/`string[]` | `Boolean!`/`boolean` | -What stays unchecked is only the **generic resolver’s dynamic middle**: it loops over the +What stays unchecked is only the generic resolver’s **dynamic middle**: it loops over the field model with runtime-string names, so TS cannot prove the object it builds matches `OutputOf` – it casts at that boundary, and graphql-js’s executor (not TS) enforces the -output types at runtime (a wrong-typed return raises a field error). This is the same -“typed boundaries, dynamic middle” shape as the engine port and the projection: type the -edges where it is honest, accept a cast where iteration is inherently dynamic. +output types at runtime (a wrong-typed return raises a field error). Same “typed boundaries, +dynamic middle” shape as the engine port and the projection: type the edges where it is +honest, accept a cast where iteration is inherently dynamic. ### Construction rules (field model → schema) -Type names derive from `typeName`; shared types (`LanguageString`, `Facet`, `FacetBucket`, -`SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`) are emitted once. 
+Type names derive from `typeName`; shared types (`LanguageString`, `ValueBucket`, `RangeBucket`, +`SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`) are emitted once, and the +per-type keyed facets object is named `Facets`. GraphQL field names are the field model `name` verbatim (declare camelCase). - **Output type** – one field per `output` field: `text`+`localized` → `[LanguageString!]!` (best-first; `[0].language` = served language, the per-field `Content-Language`); `keyword` array → `[String!]!`, scalar → `String`; `integer` → `Int` (signed 32-bit); `number` → `Float` (exact integers to 2^53); `date` → `String` (ISO 8601); `boolean` → `Boolean!` (absent = false); `reference` → see below. Nullability from `array` / required / - optional; `id` is `String!`. A field whose magnitude can exceed 32 bits (a 64-bit count or - byte size – e.g. DR’s `size`) is modelled as `number` → `Float`, since GraphQL’s `Int` - would overflow; a `Long`/`BigInt` custom scalar is the deferred alternative. + optional; `id` is `String!`. A magnitude that can exceed 32 bits (a 64-bit count or byte size + – e.g. DR’s `size`) is `number` → `Float`, since `Int` would overflow; a `Long`/`BigInt` + custom scalar is the deferred alternative. - **Reference types** – a `reference` field is typed by the **referenced shape** (`sh:class`/`sh:node`), emitted once and reused by every field referencing the same shape. Its fields follow `nestedStrategy`: @@ -134,9 +126,9 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). | `inline` (later) | the named type plus the referenced shape’s projected fields | So DR emits `publisher: Organization` (the `foaf:Agent` shape) and - `terminologySource: [Term!]!`; a shape’s type is emitted once and reused by any field that - references it. Named, not a generic GraphQL `Reference`: going `labelOnly → inline` then - only _adds_ fields (non-breaking), whereas generic→named later would break the contract. 
+ `terminologySource: [Term!]!`. Named, not a generic GraphQL `Reference`: going + `labelOnly → inline` then only _adds_ fields (non-breaking), whereas generic→named later + would break the contract. - **`where` input** – one field per `filterable` field: `keyword`/`reference` → `StringFilter { in: [String!] }`; `integer` → `IntRange { min, max }`; `number` → @@ -144,22 +136,32 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). `is` value); `text` is excluded (it goes through the `query` arg). - **`orderBy`** – `RELEVANCE` (the sane default when a `query` is present) plus every `sortable` field, as an enum, in a single `{ field, direction }` input. Only - publicly-selectable sorts appear here; the resolver expands the client’s one choice into - the internal `Sort[]`, appending deployment tie-breaks like DR’s `status_rank` via - `queryDefaults` (never exposed). Single for now because a user picks one dimension. - Promoting it to a list later is backward-compatible only for inline-literal clients (list - input coercion wraps a single value); **variable-based clients break** (`$o: DatasetOrderBy` - is rejected where `[DatasetOrderBy!]` is expected), so a future array is a deliberate, - potentially breaking change – not a free one. -- **Facets** – an enum of every `facetable` field; requested per query, returned with counts. - A bucket’s `value` is its selection key; `label` is the **nullable** display label. - The engine resolves `label` only for **reference** facets — IRI-keyed buckets whose - canonical multilingual label is _data_, fetched from the sidecar `labels` collection in the - same lookup as hit references. It is `null` for token facets (e.g. `status`) and - free-string facets (e.g. `keyword`): those carry no data label, and the consumer owns their - display — its own i18n catalog for controlled tokens (`valid` → “Geldig”/“Valid”, which the - engine cannot and must not fabricate), or the `value` itself for free strings. 
The null is - load-bearing: it tells a client whether a server-resolved label exists or display is theirs. + publicly-selectable sorts appear; the resolver expands the client’s one choice into the + internal `Sort[]`, appending deployment tie-breaks like DR’s `status_rank` via + `queryDefaults` (never exposed). Single for now because a user picks one dimension; promoting + it to a list later is backward-compatible only for inline-literal clients (list input + coercion) – **variable-based clients break** (`$o: DatasetOrderBy` where `[DatasetOrderBy!]` + is expected) – so a future array is a deliberate, potentially breaking change. +- **Facets** – a **keyed object** (`<typeName>Facets`), one field per `facetable` field, typed by + the field’s kind: a numeric range-facet field is `[RangeBucket!]!`, every other facet is + `[ValueBucket!]!`. The facet set and each bucket shape are thus encoded **statically in the + schema**, not discovered at runtime through an enum + polymorphic bucket (no `__typename`, no + fragments). **Selection is the request**: only the facet keys a query selects are computed + (the resolver inspects the selection), each with its **own where-filter removed** + (skip-own-filter – a multi-select facet still lists its other options; dropping a `status` + filter also drops the valid-only default, so the status facet counts across every status). + Two bucket types: + - `ValueBucket { value, count, label }` – `value` is the selection key (filter via + `field.in`); `label` (nullable) is the engine-resolved canonical **data** label, present + only for **reference** (IRI-keyed) facets, `null` for token/free-string facets whose + display the consumer owns (its i18n for controlled tokens like `valid` → “Geldig”/“Valid”, + or the `value` itself). The null is load-bearing. + - `RangeBucket { min, max, count }` – a half-open `[min, max)` numeric bin (`max` null on an + open-ended top bin), filtered via `field.range`. 
+ - A grouped facet (a coarse category alongside granular values, e.g. `group:rdf` next to media + types) needs **no special bucket**: its tokens are denormalized into the field at index time, + so they are ordinary `ValueBucket` values – faceted, filtered (`field.in: ["group:rdf"]`) and, + where output, read like any other value (see ADR 0003). ### Resulting schema (DR example, abridged) @@ -184,29 +186,15 @@ type Dataset { publisher: Organization terminologySource: [Term!]! format: [String!]! - class: [String!]! - size: Float # int64 magnitude → Float, not Int (32-bit); see note below + size: Float # int64 magnitude → Float, not Int (32-bit) datePosted: String status: String iiif: Boolean! # … keyword, language, iiifManifestCount, ndeSchemaAp, linkedData, terms, persistentUris } -input StringFilter { - in: [String!] -} -input IntRange { - min: Int - max: Int -} -input FloatRange { - min: Float - max: Float -} -input DateRange { - min: String - max: String -} +# shared inputs are emitted once and reused: DR uses StringFilter + FloatRange + +# SortDirection (IntRange / DateRange are pruned – no filterable int/date field). input DatasetWhere { publisher: StringFilter @@ -214,9 +202,7 @@ input DatasetWhere { class: StringFilter status: StringFilter size: FloatRange - datePosted: DateRange - iiif: Boolean - # … keyword, language, terminologySource, catalog, ndeSchemaAp, linkedData, terms, persistentUris + # … keyword, language, terminologySource, catalog } enum DatasetSortField { @@ -225,37 +211,31 @@ enum DatasetSortField { DATE_POSTED SIZE } -enum SortDirection { - ASC - DESC -} input DatasetOrderBy { field: DatasetSortField! direction: SortDirection! = DESC } -enum DatasetFacetField { - PUBLISHER - KEYWORD - LANGUAGE - FORMAT - CLASS - TERMINOLOGY_SOURCE - STATUS - IIIF - NDE_SCHEMA_AP - LINKED_DATA - TERMS - PERSISTENT_URIS +type ValueBucket { + value: String! # selection key: a media type, a token (group:rdf), or an IRI for reference facets + count: Int! 
+ label: [LanguageString!] # nullable; resolved data label for reference facets, else null } -type FacetBucket { - value: String! # the selection key (an IRI for reference facets, else a token/string) +type RangeBucket { + min: Float # half-open [min, max); max null = open-ended top bin + max: Float count: Int! - label: [LanguageString!] # nullable — see below } -type Facet { - field: DatasetFacetField! - buckets: [FacetBucket!]! +type DatasetFacets { + # one field per facetable field, typed by kind; selection = request, skip-own-filter applied + publisher: [ValueBucket!]! + keyword: [ValueBucket!]! + language: [ValueBucket!]! + format: [ValueBucket!]! + class: [ValueBucket!]! + terminologySource: [ValueBucket!]! + status: [ValueBucket!]! + size: [RangeBucket!]! } type DatasetSearchResult { @@ -263,7 +243,7 @@ type DatasetSearchResult { total: Int! page: Int! perPage: Int! - facets: [Facet!]! + facets: DatasetFacets! } type Query { @@ -272,19 +252,20 @@ type Query { where: DatasetWhere orderBy: DatasetOrderBy page: Int = 1 - perPage: Int = 20 - facets: [DatasetFacetField!] + perPage: Int = 20 # no `facets` arg – selecting facet keys IS the request ): DatasetSearchResult! } ``` Numbered pagination (`page`/`perPage` + `total`), per [ADR 3](./0003-search-api-core-query-model.md) – no Relay connection. The reference types -(`Organization`, `Term`) carry `id + name` (labelOnly) from DR’s sidecar labels collection, -resolved by the adapter. `publisher` is single (`dct:publisher` `maxCount 1`); `creator` is -search-only – its name feeds full-text `query` but it has no output field of its own, -mirroring the current card. `catalog` is filter-only, so it appears in `where` but not as an -output field. +carry `id + name` (labelOnly) from DR’s sidecar labels collection, resolved by the adapter. 
+`publisher` is single (`dct:publisher` `maxCount 1`); `creator` is search-only (its name feeds +full-text `query` but it has no output field); `catalog` is filter-only (in `where`, not output); +`class` is facet + filter but not output (its `group:` tokens surface only as facet buckets, never +as card values); `datePosted` is sortable + output only; and the NDE compatibility booleans +(`iiif`, `ndeSchemaAp`, `linkedData`, `terms`) are output-only vinkjes – in neither `where` nor the +facets until “filter by vinkje” ships. ### Resolver behaviour @@ -293,31 +274,27 @@ The single, generic root resolver (shipped in the package, not emitted): 1. **Args → `SearchQuery`** (pure): `query`→`text`; `where`→`Filter[]`; `orderBy`→`Sort[]` (`RELEVANCE`→reserved `relevance`); `page`/`perPage`→`offset`/`limit`; `facets`→logical names; `locale`←`context.acceptLanguage[0]`. -2. **Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; - DR injects its policy here: default `status:=valid`; default sort `relevance` when a - `query` is present else `title`; and the `status_rank` tie-break appended to either. +2. **Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; DR + injects its policy here: default `status:=valid`; default sort `relevance` when a `query` is + present else `title`; and the `status_rank` tie-break appended to either. 3. **`context.engine.search(query, schema)` → `SearchResult`.** 4. **`SearchResult` → output** – scalars pass through; a `LocalizedValue` map → - `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; - reference values likewise; facets keyed logical→enum. GraphQL field selection prunes. + `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; reference + values likewise; facets keyed logical→enum. GraphQL field selection prunes. 
-Default `languageOrder`: Accept-Language entries first, then remaining tagged languages, -then untagged (`und`) last – so `[0]` is always the best available value. +Default `languageOrder`: Accept-Language entries first, then remaining tagged languages, then +untagged (`und`) last – so `[0]` is always the best available value. ### Lifecycle and performance -- **Built once at startup.** The consumer calls `buildSearchSchema` during boot and hands - the single `GraphQLSchema` to its server; the field model is static per deployment, so it - is never rebuilt per request. -- **Held and reused.** That one schema serves every request (Mercurius additionally - caches/compiles it). -- **Zero per-request penalty vs codegen.** A runtime-constructed schema is the same - `GraphQLSchema` object codegen would have produced; the only added cost is the one-time - build, sub-millisecond to low-single-digit-ms for a schema this size. +- **Built once at startup, reused for every request.** The field model is static per + deployment, so the single `GraphQLSchema` is constructed during boot (sub-millisecond to + low-single-digit-ms for a schema this size) and never rebuilt per request – the same object + codegen would have produced, with no per-request penalty (Mercurius additionally caches it). - **Hot path is the engine, not GraphQL.** Per-request cost is dominated by the Typesense round-trip; parse/validate/resolve of a small query is sub-millisecond. -- **Introspection serves the contract.** Cheap (a query against the built schema, cached by - clients). Leave it on, or disable in production and use `printSearchSchema` for tooling. +- **Introspection serves the contract** (cheap, client-cached). Leave it on, or disable in + production and use `printSearchSchema` for tooling. 
### Context contract @@ -333,19 +310,18 @@ Each transport populates it per request; no framework type appears in the packag ## Consequences - The GraphQL surface is configured at runtime from the - [ADR 3](./0003-search-api-core-query-model.md) field model, so it cannot drift from the - index or a later REST surface, and works under any GraphQL server. + [ADR 3](./0003-search-api-core-query-model.md) field model, so it cannot drift from the index + or a later REST surface, and works under any GraphQL server. - **Frozen (public contract):** `LanguageString`, the named reference types (`Organization`, `Term`, …), output types, `where` operators, `orderBy` enums, numbered-pagination args, facet types. Breaking to change – right in v1. -- **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes - facets, the `SearchDocument` shape. -- **Named reference types** per shape (`Organization`, `Term`) rather than a single uniform - reference type – chosen for ergonomics and additive `inline` growth (`labelOnly` → `inline` - only adds fields, non-breaking). -- Deferred: a `dataset(id)` single-resource query (detail-page-on-index direction; DR detail - stays on SPARQL); cross-collection `@reference` joins beyond inline labels; cursor - pagination; a `Date` scalar (kept ISO `String`) and a `Long`/`BigInt` scalar for 64-bit - integers (kept `Float`); transport-layer persisted queries / cost - limits; a root or per-field language argument (Accept-Language is the sole preference - mechanism); metadata-language-availability filtering (a facetable dimension, not v1). +- **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes facets, + the `SearchDocument` shape. +- **Named reference types** per shape rather than one uniform reference type – chosen for + ergonomics and additive `inline` growth (`labelOnly` → `inline` only adds fields). 
+- Deferred: a `dataset(id)` single-resource query (DR detail stays on SPARQL); cross-collection + `@reference` joins beyond inline labels; cursor pagination; a `Date` scalar (kept ISO + `String`) and a `Long`/`BigInt` scalar for 64-bit integers (kept `Float`); transport-layer + persisted queries / cost limits; a root or per-field language argument (Accept-Language is the + sole preference mechanism); metadata-language-availability filtering (a facetable dimension, + not v1). diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts index fdfccf09..f7449793 100644 --- a/packages/search-api-graphql/src/build-schema.ts +++ b/packages/search-api-graphql/src/build-schema.ts @@ -42,6 +42,12 @@ export interface SearchContext { readonly engine: SearchEngine; /** Parsed, ordered `Accept-Language`; drives locale selection and output order. */ readonly acceptLanguage: readonly string[]; + /** + * Called when a single facet's computation fails. The facet degrades to an + * empty list (a supplementary facet must not fail the whole query); supply + * this to log the cause. Optional — omit to swallow silently. + */ + readonly onFacetError?: (field: string, error: unknown) => void; } export interface BuildSearchSchemaOptions { @@ -98,14 +104,14 @@ export function buildSearchSchema( value: { type: new GraphQLNonNull(GraphQLString) }, }, }); - const facetBucket = new GraphQLObjectType({ - name: 'FacetBucket', + // A plain value facet bucket: a selection key, its count, and (for reference + // facets) the engine-resolved data label; null for token/free-string facets + // whose display the consumer owns. 
+ const valueBucket = new GraphQLObjectType({ + name: 'ValueBucket', fields: { value: { type: new GraphQLNonNull(GraphQLString) }, count: { type: new GraphQLNonNull(GraphQLInt) }, - // Nullable: the resolved data label for a reference facet, else null — - // the consumer owns display for token/free-string facets (its i18n or the - // value itself). label: { type: new GraphQLList(new GraphQLNonNull(languageString)), resolve: (bucket: Source, _args: unknown, context: SearchContext) => { @@ -117,6 +123,22 @@ export function buildSearchSchema( }, }, }); + // A numeric range-facet bin: half-open `[min, max)` bounds (max null on an + // open-ended top bin) and the count of documents in it. + const rangeBucket = new GraphQLObjectType({ + name: 'RangeBucket', + fields: { + min: { + type: GraphQLFloat, + resolve: (bucket: Source) => bucket.min ?? null, + }, + max: { + type: GraphQLFloat, + resolve: (bucket: Source) => bucket.max ?? null, + }, + count: { type: new GraphQLNonNull(GraphQLInt) }, + }, + }); const sortDirection = new GraphQLEnumType({ name: 'SortDirection', values: { ASC: { value: 'asc' }, DESC: { value: 'desc' } }, @@ -289,19 +311,55 @@ export function buildSearchSchema( }, }); - const facetValues: GraphQLEnumValueConfigMap = {}; - for (const field of facetableFields(schema)) { - facetValues[screamingSnake(field.name)] = { value: field.name }; - } - const facetField = new GraphQLEnumType({ - name: `${typeName}FacetField`, - values: facetValues, - }); - const facet = new GraphQLObjectType({ - name: 'Facet', - fields: { - field: { type: new GraphQLNonNull(facetField) }, - buckets: { type: nonNullListOf(facetBucket) }, + // Keyed facets object: one field per facetable field, typed by its kind + // (range fields → [RangeBucket!], else [ValueBucket!]). 
Each field's resolver + // computes that facet with its OWN where-filter removed (skip-own-filter), so a + // multi-select facet still lists its other options; only the selected fields + // are resolved (GraphQL prunes the rest), so the selection IS the request. + const facetsType = new GraphQLObjectType({ + name: `${typeName}Facets`, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = {}; + for (const field of facetableFields(schema)) { + const isRange = + field.facetRanges !== undefined && field.facetRanges.length > 0; + fields[field.name] = { + type: nonNullListOf(isRange ? rangeBucket : valueBucket), + resolve: async ( + source: Source, + _args: unknown, + context: SearchContext, + ) => { + const query = source.query as SearchQuery; + // Drop this facet's own filter so its other options still count + // (a removed `status` filter also drops the valid-only default, so + // the status facet counts across every status). + const facetQuery: SearchQuery = { + ...query, + where: query.where.filter( + (filter) => filter.field !== field.name, + ), + facets: [field.name], + limit: 0, + offset: 0, + }; + // A facet is supplementary: degrade a failed facet to an empty list + // rather than failing the whole query (which would null the non-null + // result and discard the items + every other facet). + try { + const result = await context.engine.search(facetQuery, schema); + return result.facets[field.name] ?? []; + } catch (error) { + context.onFacetError?.(field.name, error); + return []; + } + }, + }; + } + return fields; }, }); @@ -312,7 +370,12 @@ export function buildSearchSchema( total: { type: new GraphQLNonNull(GraphQLInt) }, page: { type: new GraphQLNonNull(GraphQLInt) }, perPage: { type: new GraphQLNonNull(GraphQLInt) }, - facets: { type: nonNullListOf(facet) }, + // Resolved lazily, per selected key (skip-own-filter); the result object + // (which carries the resolved `query`) is the facets source. 
+ facets: { + type: new GraphQLNonNull(facetsType), + resolve: (source: Source) => source, + }, }, }); @@ -327,23 +390,29 @@ export function buildSearchSchema( orderBy: { type: orderByInput }, page: { type: GraphQLInt, defaultValue: 1 }, perPage: { type: GraphQLInt, defaultValue: 20 }, - facets: { type: new GraphQLList(new GraphQLNonNull(facetField)) }, }, resolve: async (_source, args, context: SearchContext) => { const built = argsToQuery(args as QueryArgs, context, schema); const finalQuery = options.queryDefaults ? options.queryDefaults(built, context) : built; - const result = await context.engine.search(finalQuery, schema); + // Items + total only; facets are resolved lazily per selected key. + const result = await context.engine.search( + { ...finalQuery, facets: [] }, + schema, + ); return { items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), total: result.total, - page: Math.floor(finalQuery.offset / finalQuery.limit) + 1, + // Guard against a `perPage: 0` arg: `Math.floor(0/0)` is NaN, which a + // non-null `Int!` cannot serialize and would fail the whole query. + page: + finalQuery.limit > 0 + ? Math.floor(finalQuery.offset / finalQuery.limit) + 1 + : 1, perPage: finalQuery.limit, - facets: Object.entries(result.facets).map(([field, buckets]) => ({ - field, - buckets, - })), + // Carried for the facets resolver (skip-own-filter per key). + query: finalQuery, }; }, }, @@ -372,7 +441,6 @@ interface QueryArgs { readonly orderBy?: { field: string; direction: 'asc' | 'desc' }; readonly page?: number; readonly perPage?: number; - readonly facets?: readonly string[]; } /** Pure args → {@link SearchQuery} mapping. */ @@ -391,7 +459,8 @@ function argsToQuery( : [], limit: perPage, offset: (page - 1) * perPage, - facets: args.facets ?? [], + // Facets are requested per-key by the facets resolver, not via an arg. + facets: [], locale: context.acceptLanguage[0] ?? 
'und', }; } diff --git a/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap index d1741f4e..63bc19de 100644 --- a/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap +++ b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap @@ -2,7 +2,7 @@ exports[`GraphQL generator stability > emits a stable SDL for a representative schema 1`] = ` "type Query { - things(query: String, where: ThingWhere, orderBy: ThingOrderBy, page: Int = 1, perPage: Int = 20, facets: [ThingFacetField!]): ThingSearchResult! + things(query: String, where: ThingWhere, orderBy: ThingOrderBy, page: Int = 1, perPage: Int = 20): ThingSearchResult! } type ThingSearchResult { @@ -10,7 +10,7 @@ type ThingSearchResult { total: Int! page: Int! perPage: Int! - facets: [Facet!]! + facets: ThingFacets! } type Thing { @@ -37,20 +37,15 @@ type Agent { name: [LanguageString!]! } -type Facet { - field: ThingFacetField! - buckets: [FacetBucket!]! +type ThingFacets { + keyword: [ValueBucket!]! + creator: [ValueBucket!]! + publisher: [ValueBucket!]! + status: [ValueBucket!]! + open: [ValueBucket!]! } -enum ThingFacetField { - KEYWORD - CREATOR - PUBLISHER - STATUS - OPEN -} - -type FacetBucket { +type ValueBucket { value: String! count: Int! label: [LanguageString!] 
diff --git a/packages/search-api-graphql/test/build-schema.test.ts b/packages/search-api-graphql/test/build-schema.test.ts index b61ba240..6ba323de 100644 --- a/packages/search-api-graphql/test/build-schema.test.ts +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -41,7 +41,12 @@ const schema: SearchSchema = { kind: 'integer', filterable: true, sortable: true, + facetable: true, output: true, + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10 }, + ], }, { name: 'datePosted', kind: 'date', sortable: true, output: true }, { name: 'score', kind: 'number', output: true }, @@ -148,7 +153,7 @@ describe('buildSearchSchema', () => { status iiif } - facets { field buckets { value count } } + facets { keyword { value count } } } }`, { engine, acceptLanguage: ['nl'] }, @@ -176,9 +181,9 @@ describe('buildSearchSchema', () => { { id: 'https://term/1', name: [{ language: 'nl', value: 'Kaarten' }] }, ]); expect(item.iiif).toBe(true); - expect(data.facets).toEqual([ - { field: 'KEYWORD', buckets: [{ value: 'kaarten', count: 3 }] }, - ]); + expect(data.facets).toEqual({ + keyword: [{ value: 'kaarten', count: 3 }], + }); // The free-text arg became the query text. expect(received().text).toBe('kaart'); }); @@ -213,7 +218,7 @@ describe('buildSearchSchema', () => { ], }); const result = await run( - `{ datasets { items { title { language value } } } }`, + `{ datasets { items { title { language value } datePosted } } }`, { engine, acceptLanguage: ['en'] }, ); const item = ( @@ -226,6 +231,8 @@ describe('buildSearchSchema', () => { { language: 'nl', value: 'Titel' }, { language: null, value: 'Naamloos' }, ]); + // An absent date resolves to null (the non-numeric branch). 
+ expect(item.datePosted).toBeNull(); }); it('labels reference-facet buckets, leaving plain-facet buckets null', async () => { @@ -244,26 +251,165 @@ describe('buildSearchSchema', () => { }, }); const result = await run( - `{ datasets { facets { field buckets { value count label { language value } } } } }`, + `{ datasets { facets { + publisher { value count label { language value } } + keyword { value count label { language value } } + } } }`, { engine, acceptLanguage: ['nl'] }, ); const facets = (result.data?.datasets as Record) - .facets as { field: string; buckets: unknown[] }[]; - const publisher = facets.find((facet) => facet.field === 'PUBLISHER'); - const keyword = facets.find((facet) => facet.field === 'KEYWORD'); - expect(publisher?.buckets).toEqual([ + .facets as { + publisher: unknown[]; + keyword: unknown[]; + }; + expect(facets.publisher).toEqual([ { value: 'https://org/1', count: 2, label: [{ language: 'nl', value: 'Het Utrechts Archief' }], }, ]); - expect(keyword?.buckets).toEqual([ + expect(facets.keyword).toEqual([ { value: 'kaarten', count: 3, label: null }, ]); }); - it('maps where, orderBy, facets and pagination into the SearchQuery', async () => { + it('exposes range-facet bucket bounds, null for value facets and open ends', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { + size: [ + { value: '0', count: 2, min: 1, max: 10 }, + // Open-ended top bin: lower bound only. + { value: '1', count: 5, min: 10 }, + ], + keyword: [{ value: 'kaarten', count: 3 }], + }, + }); + const result = await run( + `{ datasets { facets { + size { min max count } + keyword { value count } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as { + size: unknown[]; + keyword: unknown[]; + }; + // RangeBuckets carry their half-open bounds (max null = open-ended top bin). 
+ expect(facets.size).toEqual([ + { min: 1, max: 10, count: 2 }, + { min: 10, max: null, count: 5 }, + ]); + // A value facet's ValueBuckets carry no bounds. + expect(facets.keyword).toEqual([{ value: 'kaarten', count: 3 }]); + }); + + it('resolves every selected facet key, returning [] where the engine has none', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { keyword: [{ value: 'kaarten', count: 1 }] }, + }); + const result = await run( + `{ datasets { facets { + keyword { value count } + publisher { value count } + terminologySource { value count } + status { value count } + iiif { value count } + size { min max count } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as Record; + expect(facets.keyword).toEqual([{ value: 'kaarten', count: 1 }]); + // Keys the engine returned nothing for resolve to an empty list. + for (const key of [ + 'publisher', + 'terminologySource', + 'status', + 'iiif', + 'size', + ]) { + expect(facets[key]).toEqual([]); + } + }); + + it('computes a facet with its own where-filter removed (skip-own-filter)', async () => { + const { engine, received } = fakeEngine({ + total: 0, + hits: [], + facets: { keyword: [{ value: 'kaarten', count: 1 }] }, + }); + await run( + `{ datasets(where: { keyword: { in: ["x"] }, status: { in: ["valid"] } }) { + facets { keyword { value count } } + } }`, + { engine, acceptLanguage: ['nl'] }, + ); + // The keyword facet query is run with the keyword filter dropped (so its + // other options still count), but other filters (status) retained. 
+ const facetQuery = received(); + expect(facetQuery.facets).toEqual(['keyword']); + expect( + facetQuery.where.find((filter) => filter.field === 'keyword'), + ).toBeUndefined(); + expect(facetQuery.where).toContainEqual({ field: 'status', in: ['valid'] }); + }); + + it('degrades a failed facet to an empty list without failing the whole query', async () => { + // A facet is supplementary: its computation runs a separate search (with + // `facets` set). Fail only that, leaving the listing search untouched. + const failedFacets: string[] = []; + const engine: SearchEngine = { + async search(query) { + if (query.facets.length > 0) { + throw new Error('facet backend unavailable'); + } + return canned; + }, + }; + const result = await run( + `{ datasets { + total + items { id } + facets { keyword { value count } } + } }`, + { + engine, + acceptLanguage: ['nl'], + onFacetError: (field) => failedFacets.push(field), + }, + ); + + // No top-level error: the failed facet degraded rather than nulling the + // non-null result and discarding the items. + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.total).toBe(1); + expect((data.items as Record[])[0].id).toBe('https://d/1'); + // The failed facet degraded to an empty list, and the cause was reported. 
+ expect((data.facets as Record).keyword).toEqual([]); + expect(failedFacets).toEqual(['keyword']); + }); + + it('guards perPage: 0, resolving page to 1 rather than failing on NaN', async () => { + const { engine } = fakeEngine(canned); + const result = await run(`{ datasets(perPage: 0) { page total } }`, { + engine, + acceptLanguage: ['nl'], + }); + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.page).toBe(1); + }); + + it('maps where, orderBy and pagination into the SearchQuery', async () => { const { engine, received } = fakeEngine(canned); await run( `{ @@ -272,7 +418,6 @@ describe('buildSearchSchema', () => { orderBy: { field: SIZE, direction: ASC } page: 3 perPage: 10 - facets: [KEYWORD, PUBLISHER] ) { total } }`, { engine, acceptLanguage: ['nl'] }, @@ -288,7 +433,9 @@ describe('buildSearchSchema', () => { }); expect(query.where).toContainEqual({ field: 'iiif', is: true }); expect(query.orderBy).toEqual([{ field: 'size', direction: 'asc' }]); - expect(query.facets).toEqual(['keyword', 'publisher']); + // Facets are requested per key via selection, not an arg; the listing query + // carries none. + expect(query.facets).toEqual([]); expect(query.limit).toBe(10); expect(query.offset).toBe(20); }); @@ -336,12 +483,15 @@ describe('buildSearchSchema', () => { expect(sdl).toMatch(/publisher: Organization\b(?!!)/); // optional reference }); - it('builds the where, orderBy and facet enums from the field model', () => { + it('builds the where, orderBy enum and keyed facets object from the field model', () => { const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); expect(sdl).toMatch(/enum DatasetSortField/); expect(sdl).toMatch(/RELEVANCE/); expect(sdl).toMatch(/SIZE/); - expect(sdl).toMatch(/enum DatasetFacetField/); + // Facets are a keyed object, one field per facetable field, typed by kind. 
+ expect(sdl).toMatch(/type DatasetFacets/); + expect(sdl).toMatch(/keyword: \[ValueBucket!\]!/); + expect(sdl).toMatch(/size: \[RangeBucket!\]!/); expect(sdl).toMatch(/input DatasetWhere/); expect(sdl).toMatch(/status: StringFilter/); expect(sdl).toMatch(/size: IntRange/); diff --git a/packages/search-api-graphql/vite.config.ts b/packages/search-api-graphql/vite.config.ts index 725cf854..7434ca80 100644 --- a/packages/search-api-graphql/vite.config.ts +++ b/packages/search-api-graphql/vite.config.ts @@ -10,10 +10,10 @@ export default mergeConfig( test: { coverage: { thresholds: { - functions: 90, - lines: 90, - branches: 78, - statements: 90, + functions: 100, + lines: 100, + branches: 88.63, + statements: 100, }, }, }, diff --git a/packages/search-typesense/src/collection-schema.ts b/packages/search-typesense/src/collection-schema.ts index 5141f634..37f0d378 100644 --- a/packages/search-typesense/src/collection-schema.ts +++ b/packages/search-typesense/src/collection-schema.ts @@ -114,14 +114,6 @@ function typesenseFields( }); } } - if (names.group !== undefined) { - fields.push({ - name: names.group, - type: valueType, - facet: true, - optional: true, - }); - } return fields; } diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts index fc9d4950..662eb393 100644 --- a/packages/search-typesense/src/query-compiler.ts +++ b/packages/search-typesense/src/query-compiler.ts @@ -3,6 +3,7 @@ import { fold } from '@lde/text-normalization'; import { physicalFields, searchableFields, + type FacetRange, type Filter, type SearchField, type SearchQuery, @@ -17,9 +18,21 @@ import { * {@link physicalFields}, the same convention the projection and the collection * schema use, so a query can never reference a field the index does not carry. */ +export interface CompileOptions { + /** + * Cap on the number of buckets returned per facet (`max_facet_values`). 
Left + * unset, Typesense defaults to 10 — too few for high-cardinality facets + * (publisher, keyword), so a deployment with such facets must raise it. Range + * facets return one bucket per declared range regardless, but a value > the + * range count is still safe. + */ + readonly maxFacetValues?: number; +} + export function buildSearchParams( query: SearchQuery, schema: SearchSchema, + options: CompileOptions = {}, ): SearchParams { const folded = query.text !== undefined && query.text.length > 0 @@ -35,7 +48,9 @@ export function buildSearchParams( query_by: names.join(','), query_by_weights: weights.join(','), per_page: query.limit, - page: Math.floor(query.offset / query.limit) + 1, + // A facet-only query (`limit: 0`) fetches no hits; page is then meaningless, + // so pin it to 1 rather than dividing by zero. + page: query.limit > 0 ? Math.floor(query.offset / query.limit) + 1 : 1, }; if (filterBy.length > 0) { params.filter_by = filterBy; @@ -44,11 +59,46 @@ export function buildSearchParams( params.sort_by = sortBy; } if (query.facets.length > 0) { - params.facet_by = query.facets.join(','); + params.facet_by = compileFacetBy(query.facets, schema); + if (options.maxFacetValues !== undefined) { + params.max_facet_values = options.maxFacetValues; + } } return params; } +/** + * The `facet_by` clause. A facet on a numeric field that declares + * {@link SearchField.facetRanges} faceted into those fixed half-open `[min, max)` + * bins (a histogram); every other facet is a plain per-value facet on its field + * name. Typesense range syntax is already start-inclusive/end-exclusive, so the + * declared bounds pass straight through with no boundary fix-up. + */ +function compileFacetBy( + facets: readonly string[], + schema: SearchSchema, +): string { + return facets + .map((name) => { + const field = schema.fields.find((candidate) => candidate.name === name); + return field?.facetRanges !== undefined && field.facetRanges.length > 0 + ? 
compileRangeFacet(field.name, field.facetRanges) + : name; + }) + .join(','); +} + +/** `name(key:[min, max], …)`; a blank bound is open-ended (Typesense `[75, ]`). */ +function compileRangeFacet( + name: string, + ranges: readonly FacetRange[], +): string { + const bins = ranges + .map((range) => `${range.key}:[${range.min ?? ''}, ${range.max ?? ''}]`) + .join(', '); + return `${name}(${bins})`; +} + /** * The `query_by` fields and aligned weights. Each searchable field expands to its * folded `*_search` companion(s); a localized field’s active-locale companion @@ -116,30 +166,14 @@ function compileFilter( } /** - * A membership clause. A grouped field splits its values into `prefix`-tagged - * group tokens (matched against the `_group` companion) and granular values, and - * ORs the two so selecting a value and a group within one facet unions instead of - * intersecting. A non-facet (tokenized) field uses the exact `:=` operator so an - * IRI cannot partial-match on a shared path segment. + * A membership clause. A non-facet (tokenized) field uses the exact `:=` + * operator so an IRI cannot partial-match on a shared path segment. */ function compileMembership( field: SearchField, values: readonly string[], ): string { const exact = field.facetable !== true; - if (field.group !== undefined) { - const prefix = field.group.prefix; - const groups = values.filter((value) => value.startsWith(prefix)); - const granular = values.filter((value) => !value.startsWith(prefix)); - const parts: string[] = []; - if (granular.length > 0) { - parts.push(membership(field.name, granular, exact)); - } - if (groups.length > 0) { - parts.push(membership(field.group.name, groups, false)); - } - return parts.length > 1 ? 
`(${parts.join(' || ')})` : parts[0]; - } return membership(field.name, values, exact); } @@ -197,6 +231,6 @@ function compileSort(sort: Sort, schema: SearchSchema, locale: string): string { * (`:`, `/`, `&`, `,`, …) are taken literally instead of parsed as filter syntax. * An embedded backtick is escaped. */ -function escapeFilterValue(value: string): string { +export function escapeFilterValue(value: string): string { return `\`${value.replace(/`/g, '\\`')}\``; } diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts index e9d792c7..3e2a9959 100644 --- a/packages/search-typesense/src/search.ts +++ b/packages/search-typesense/src/search.ts @@ -13,7 +13,7 @@ import { type SearchSchema, type SearchValue, } from '@lde/search'; -import { buildSearchParams } from './query-compiler.js'; +import { buildSearchParams, escapeFilterValue } from './query-compiler.js'; /** Where the engine reads documents and (optionally) reference labels. */ export interface TypesenseSearchEngineOptions { @@ -21,6 +21,26 @@ export interface TypesenseSearchEngineOptions { readonly collection: string; /** The sidecar `labels` collection (IRI → label); omit for id-only references. */ readonly labelsCollection?: string; + /** + * Buckets returned per facet (`max_facet_values`). Typesense defaults to 10; + * raise it for high-cardinality facets (publisher, keyword) so their long + * value lists are not truncated. + */ + readonly maxFacetValues?: number; + /** + * Called when reference-label resolution fails; the search then degrades to + * id-only references rather than failing. Optional — omit to swallow silently. + */ + readonly onLabelError?: (error: unknown) => void; + /** + * Opt-in in-memory label cache. 
When set (and {@link labelsCollection} is + * set), the FULL sidecar `labels` collection is loaded once via the documents + * export endpoint and held in a process-lifetime cache for this many + * milliseconds; each `search` then resolves its reference labels by in-memory + * lookup instead of a per-search `multi_search` round-trip. Omit to keep the + * per-search {@link fetchLabels} behaviour unchanged. + */ + readonly labelCacheTtlMs?: number; } /** @@ -34,41 +54,137 @@ export function createTypesenseSearchEngine( client: Client, options: TypesenseSearchEngineOptions, ): SearchEngine { + // Process-lifetime cache for the FULL `labels` collection, held in the engine + // closure. Populated lazily on the first cached search; `inFlightLoad` is the + // single-flight in-flight promise so concurrent first-loads share one export. + let cachedLabels: ReadonlyMap | undefined; + let cacheExpiresAt = 0; + let inFlightLoad: Promise> | undefined; + + function cachedAllLabels( + labelsCollection: string, + ttlMs: number, + ): Promise> { + if (cachedLabels !== undefined && Date.now() < cacheExpiresAt) { + return Promise.resolve(cachedLabels); + } + // Single-flight: a load already running serves every concurrent caller. + inFlightLoad ??= loadAllLabels(client, labelsCollection) + .then((loaded) => { + cachedLabels = loaded; + cacheExpiresAt = Date.now() + ttlMs; + return loaded; + }) + // A failed load degrades to id-only references and is NOT cached, so the + // next search retries rather than serving an empty map for the whole TTL.
+ .catch((error) => { + options.onLabelError?.(error); + return new Map(); + }) + .finally(() => { + inFlightLoad = undefined; + }); + return inFlightLoad; + } + return { async search( query: SearchQuery, schema: SearchSchema, ): Promise { - const params = buildSearchParams(query, schema); + const params = buildSearchParams(query, schema, { + maxFacetValues: options.maxFacetValues, + }); const response = (await client .collections(options.collection) .documents() .search(params)) as TypesenseSearchResponse; - const labels = - options.labelsCollection !== undefined - ? await fetchLabels( + // Labels are supplementary: a failed lookup (e.g. the sidecar collection + // mid-rebuild) degrades to id-only references rather than failing the whole + // search, so the listing still renders with bare IRIs. + let labels: ReadonlyMap = new Map(); + if (options.labelsCollection !== undefined) { + if (options.labelCacheTtlMs !== undefined) { + // Cached path: resolve the page's references by in-memory lookup + // against the once-loaded collection (no Typesense round-trip). + const allLabels = await cachedAllLabels( + options.labelsCollection, + options.labelCacheTtlMs, + ); + labels = selectLabels(allLabels, referenceIris(response, schema)); + } else { + try { + labels = await fetchLabels( client, options.labelsCollection, referenceIris(response, schema), - ) - : new Map(); + ); + } catch (error) { + options.onLabelError?.(error); + } + } + } return parseSearchResponse(response, schema, labels); }, }; } -/** Every distinct reference IRI across the page of hits. */ +/** + * Load the FULL `labels` collection into a label map via the documents export + * endpoint, which streams every document as JSONL (one JSON object per line). + * Each line is reconstructed by {@link labelToLocalizedValue}, exactly as the + * per-search {@link fetchLabels} path does for its `multi_search` hits. 
+ */ +async function loadAllLabels( + client: Pick, + collection: string, +): Promise> { + const jsonl = await client.collections(collection).documents().export(); + const labels = new Map(); + for (const line of jsonl.split('\n')) { + if (line.length === 0) { + continue; + } + const document = JSON.parse(line) as Record; + labels.set(String(document.id), labelToLocalizedValue(document)); + } + return labels; +} + +/** Narrow the cached collection to just the labels `iris` actually need. */ +function selectLabels( + allLabels: ReadonlyMap, + iris: readonly string[], +): Map { + const labels = new Map(); + for (const iri of iris) { + const label = allLabels.get(iri); + if (label !== undefined) { + labels.set(iri, label); + } + } + return labels; +} + +/** Every distinct reference IRI whose label the result will actually use. */ function referenceIris( response: TypesenseSearchResponse, schema: SearchSchema, ): string[] { - const referenceFields = schema.fields + const referenceFieldSet = new Set( + schema.fields + .filter((field) => field.kind === 'reference') + .map((field) => field.name), + ); + // Hits only carry labels for OUTPUT reference fields: reconstructDocument skips + // non-output fields, so resolving a non-output reference's hit labels (e.g. a + // facet-only `class` with dozens of IRIs per hit) is pure waste. + const outputReferenceFields = outputFields(schema) .filter((field) => field.kind === 'reference') .map((field) => field.name); - const referenceFieldSet = new Set(referenceFields); const iris = new Set(); for (const hit of response.hits ?? []) { - for (const name of referenceFields) { + for (const name of outputReferenceFields) { const raw = hit.document[name]; if (Array.isArray(raw)) { for (const value of raw) { @@ -79,7 +195,8 @@ function referenceIris( } } } - // Reference-facet bucket values are IRIs too; resolve them in the same lookup. + // Reference-facet bucket values are IRIs too (incl. 
facet-only references like + // `class`); resolve them in the same lookup. for (const facet of response.facet_counts ?? []) { if (referenceFieldSet.has(facet.field_name)) { for (const bucket of facet.counts) { @@ -91,33 +208,48 @@ function referenceIris( } /** - * Resolve labels for `iris` from the sidecar `labels` collection in a single - * `filter_by: id:[…]` lookup. Each `label_${locale}` becomes a language-map - * entry; the default `label` is the untagged (`und`) fallback when no locale - * variant exists. + * Resolve labels for `iris` from the sidecar `labels` collection. Each + * `label_${locale}` becomes a language-map entry; the default `label` is the + * untagged (`und`) fallback when no locale variant exists. + * + * Sent over `multi_search` (POST) in batches: the id-list of a page or facet + * carrying many references — e.g. a dataset with dozens of classes — would + * overflow Typesense’s GET query-string limit (4000 chars, and IRIs URL-encode + * to several times their length) if it travelled in the URL. POST puts it in the + * body; the batch size stays under Typesense’s `per_page` cap. Exported for + * unit testing against a fake client. */ -async function fetchLabels( - client: Client, +export async function fetchLabels( + client: Pick, collection: string, iris: readonly string[], ): Promise> { const labels = new Map(); - if (iris.length === 0) { - return labels; - } - const filter = `id:[${iris.map((iri) => `\`${iri.replace(/`/g, '\\`')}\``).join(',')}]`; - const response = (await client.collections(collection).documents().search({ - q: '*', - query_by: 'label', - filter_by: filter, - per_page: iris.length, - })) as TypesenseSearchResponse; - for (const hit of response.hits ?? 
[]) { - labels.set(String(hit.document.id), labelToLocalizedValue(hit.document)); + for (let start = 0; start < iris.length; start += LABEL_BATCH_SIZE) { + const batch = iris.slice(start, start + LABEL_BATCH_SIZE); + const filter = `id:[${batch.map(escapeFilterValue).join(',')}]`; + const { results } = (await client.multiSearch.perform({ + searches: [ + { + collection, + q: '*', + query_by: 'label', + filter_by: filter, + per_page: batch.length, + }, + ], + })) as { results: readonly TypesenseSearchResponse[] }; + for (const hit of results[0]?.hits ?? []) { + labels.set(String(hit.document.id), labelToLocalizedValue(hit.document)); + } } return labels; } +/** Typesense caps `per_page` at 250; the multi_search POST body holds the + * id-list comfortably, so resolve references in batches of this size. */ +const LABEL_BATCH_SIZE = 200; + /** Turn a `labels` document into a language map (`label_${locale}` → locale). */ function labelToLocalizedValue( document: Record, @@ -173,11 +305,25 @@ export function parseSearchResponse( const facets: Record = {}; for (const facet of response.facet_counts ?? []) { const labelled = referenceFacets.has(facet.field_name); + // A range facet echoes the declared range key as the bucket value; look the + // bin's half-open bounds back up by key so the bucket is self-describing. + const field = schema.fields.find( + (candidate) => candidate.name === facet.field_name, + ); + const rangesByKey = + field?.facetRanges !== undefined + ? new Map(field.facetRanges.map((range) => [range.key, range])) + : undefined; facets[facet.field_name] = facet.counts.map((bucket) => { const label = labelled ? labels.get(bucket.value) : undefined; - return label === undefined - ? { value: bucket.value, count: bucket.count } - : { value: bucket.value, count: bucket.count, label }; + const range = rangesByKey?.get(bucket.value); + return { + value: bucket.value, + count: bucket.count, + ...(label !== undefined ? { label } : {}), + ...(range?.min !== undefined ? 
{ min: range.min } : {}), + ...(range?.max !== undefined ? { max: range.max } : {}), + }; }); } return { hits, total: response.found, facets }; diff --git a/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap index 201512f7..e56c6447 100644 --- a/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap +++ b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap @@ -63,12 +63,6 @@ exports[`collection-schema generator stability > derives a stable Typesense coll "sort": false, "type": "string[]", }, - { - "facet": true, - "name": "format_group", - "optional": true, - "type": "string[]", - }, { "facet": true, "name": "creator", diff --git a/packages/search-typesense/test/collection-schema.test.ts b/packages/search-typesense/test/collection-schema.test.ts index 51511122..8d82507d 100644 --- a/packages/search-typesense/test/collection-schema.test.ts +++ b/packages/search-typesense/test/collection-schema.test.ts @@ -30,7 +30,6 @@ const schema: SearchSchema = { kind: 'keyword', array: true, facetable: true, - group: { name: 'format_group', prefix: 'group:' }, }, // Derived fields (no path) still get collection fields — populated at index // time by derivations, not projected. 
@@ -190,13 +189,4 @@ describe('buildCollectionSchema', () => { locale: 'nl', }); }); - - it('emits the grouped-facet companion for a field that declares a group', () => { - expect(collection.fields).toContainEqual({ - name: 'format_group', - type: 'string[]', - facet: true, - optional: true, - }); - }); }); diff --git a/packages/search-typesense/test/generator-stability.test.ts b/packages/search-typesense/test/generator-stability.test.ts index 2383ecde..bb7eca2a 100644 --- a/packages/search-typesense/test/generator-stability.test.ts +++ b/packages/search-typesense/test/generator-stability.test.ts @@ -6,8 +6,8 @@ import { buildCollectionSchema } from '../src/collection-schema.js'; * A neutral fixture exercising every kind + capability — NOT a real domain. The * derived Typesense collection is snapshotted purely to pin the **generator**: * any change to how `buildCollectionSchema` maps the field model (Typesense field - * types, the physical fanout, stem/locale, optional/default-sorting-field, group - * companions) surfaces as a snapshot diff before this library is published. + * types, the physical fanout, stem/locale, optional/default-sorting-field) + * surfaces as a snapshot diff before this library is published. 
*/ const THING: SearchSchema = { type: 'https://example.org/Thing', @@ -35,7 +35,6 @@ const THING: SearchSchema = { array: true, facetable: true, filterable: true, - group: { name: 'format_group', prefix: 'group:' }, }, { name: 'creator', diff --git a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts index 50e601a4..55a09bdd 100644 --- a/packages/search-typesense/test/parse-response.test.ts +++ b/packages/search-typesense/test/parse-response.test.ts @@ -1,6 +1,11 @@ -import { describe, expect, it } from 'vitest'; -import type { LocalizedValue, SearchSchema } from '@lde/search'; -import { parseSearchResponse } from '../src/search.js'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import type { LocalizedValue, SearchQuery, SearchSchema } from '@lde/search'; +import type { Client } from 'typesense'; +import { + createTypesenseSearchEngine, + fetchLabels, + parseSearchResponse, +} from '../src/search.js'; const schema: SearchSchema = { type: 'http://www.w3.org/ns/dcat#Dataset', @@ -143,3 +148,318 @@ describe('parseSearchResponse', () => { expect(result.hits[0].document.status).toBeUndefined(); }); }); + +describe('parseSearchResponse range facets', () => { + const rangeSchema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'size', + kind: 'integer', + facetable: true, + output: true, + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10, max: 100 }, + // Open-ended top bin: no upper bound. 
+ { key: '2', min: 100 }, + ], + }, + ], + }; + + const rangeResponse = { + found: 5, + hits: [], + facet_counts: [ + { + field_name: 'size', + counts: [ + { value: '0', count: 2 }, + { value: '1', count: 1 }, + { value: '2', count: 2 }, + ], + }, + ], + }; + + it('echoes each range bin’s half-open bounds onto its bucket, open ends omitted', () => { + const result = parseSearchResponse(rangeResponse, rangeSchema, new Map()); + expect(result.facets.size).toEqual([ + { value: '0', count: 2, min: 1, max: 10 }, + { value: '1', count: 1, min: 10, max: 100 }, + // The open-ended top bin carries only its lower bound. + { value: '2', count: 2, min: 100 }, + ]); + }); +}); + +describe('createTypesenseSearchEngine label degradation', () => { + const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // A fake client whose document search succeeds but whose label lookup + // (multi_search) rejects, so the engine must degrade to id-only references. + function fakeClient(): Client { + return { + collections: () => ({ + documents: () => ({ + search: () => + Promise.resolve({ + found: 1, + hits: [ + { + document: { id: 'https://d/1', publisher: ['https://org/1'] }, + }, + ], + }), + }), + }), + multiSearch: { + perform: () => + Promise.reject(new Error('labels collection unavailable')), + }, + } as unknown as Client; + } + + it('degrades to id-only references when the label lookup fails, reporting the cause', async () => { + let capturedError: unknown; + const engine = createTypesenseSearchEngine(fakeClient(), { + collection: 'datasets', + labelsCollection: 'labels', + onLabelError: (error) => { + capturedError = error; + }, + }); + const result = await engine.search(baseQuery, schema); + // The reference is present but unlabelled: the failed lookup degraded + // rather than failing the whole search. 
+ expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1' }, + ]); + expect(capturedError).toBeInstanceOf(Error); + }); +}); + +describe('createTypesenseSearchEngine label cache (labelCacheTtlMs)', () => { + const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // One labels document, as the export endpoint streams it (JSONL). + const labelsJsonl = JSON.stringify({ + id: 'https://org/1', + label: 'Het Utrechts Archief', + label_nl: 'Het Utrechts Archief', + }); + + // A fake client whose document search always returns one hit referencing + // `https://org/1`, and whose `labels` collection export is driven by + // `exportImpl`. Counters make the export-call count observable. + function fakeClient(exportImpl: () => Promise) { + let exportCalls = 0; + const client = { + collections: () => ({ + documents: () => ({ + search: () => + Promise.resolve({ + found: 1, + hits: [ + { + document: { id: 'https://d/1', publisher: ['https://org/1'] }, + }, + ], + }), + export: () => { + exportCalls += 1; + return exportImpl(); + }, + }), + }), + }; + return { + client: client as unknown as Client, + exportCalls: () => exportCalls, + }; + } + + afterEach(() => { + vi.useRealTimers(); + }); + + it('loads the collection once for concurrent searches (single-flight)', async () => { + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + }); + + const results = await Promise.all([ + engine.search(baseQuery, schema), + engine.search(baseQuery, schema), + engine.search(baseQuery, schema), + ]); + + // One export served all three concurrent searches. 
+ expect(exportCalls()).toBe(1); + for (const result of results) { + expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + } + }); + + it('serves a later search from cache without a second export', async () => { + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + }); + + await engine.search(baseQuery, schema); + await engine.search(baseQuery, schema); + + expect(exportCalls()).toBe(1); + }); + + it('reloads the collection after the TTL expires', async () => { + vi.useFakeTimers(); + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 1000, + }); + + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(1); + + // Within the TTL: still cached. + vi.advanceTimersByTime(500); + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(1); + + // Past the TTL: reload. + vi.advanceTimersByTime(600); + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(2); + }); + + it('degrades to id-only references on a load error and retries next time', async () => { + let capturedError: unknown; + let attempt = 0; + const { client, exportCalls } = fakeClient(() => { + attempt += 1; + return attempt === 1 + ? Promise.reject(new Error('labels collection unavailable')) + : Promise.resolve(labelsJsonl); + }); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + onLabelError: (error) => { + capturedError = error; + }, + }); + + // First load fails: id-only reference, error reported, nothing cached. 
+ const failed = await engine.search(baseQuery, schema); + expect(failed.hits[0].document.publisher).toEqual([ + { id: 'https://org/1' }, + ]); + expect(capturedError).toBeInstanceOf(Error); + expect(exportCalls()).toBe(1); + + // Next search retries the load (the failure was not cached) and resolves. + const recovered = await engine.search(baseQuery, schema); + expect(recovered.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + expect(exportCalls()).toBe(2); + }); +}); + +describe('fetchLabels', () => { + // A fake Typesense client whose multi_search returns the requested ids that + // exist in `docsById`, recording the id-list of each POST so batching is + // observable. (Resolving via multi_search/POST avoids the GET query-string + // limit that a large id-list would otherwise overflow.) + function fakeClient(docsById: Record>) { + const calls: string[][] = []; + const client = { + multiSearch: { + perform: (request: { searches: { readonly filter_by: string }[] }) => { + const ids = [ + ...request.searches[0].filter_by.matchAll(/`([^`]+)`/g), + ].map((match) => match[1]); + calls.push(ids); + const hits = ids + .filter((id) => docsById[id] !== undefined) + .map((id) => ({ document: { id, ...docsById[id] } })); + return Promise.resolve({ results: [{ found: hits.length, hits }] }); + }, + }, + }; + return { client: client as unknown as Pick, calls }; + } + + it('resolves labels via multi_search, merging per-locale variants', async () => { + const { client, calls } = fakeClient({ + 'https://org/1': { label: 'KB', label_nl: 'KB' }, + // Only a default label (no locale variant) → untagged (`und`) fallback. 
+ 'https://org/3': { label: 'Untagged' }, + }); + const labels = await fetchLabels(client, 'labels', [ + 'https://org/1', + 'https://org/2', + 'https://org/3', + ]); + expect(labels.get('https://org/1')).toEqual({ nl: ['KB'] }); + expect(labels.get('https://org/3')).toEqual({ und: ['Untagged'] }); + // An IRI absent from the collection yields no entry. + expect(labels.has('https://org/2')).toBe(false); + expect(calls).toHaveLength(1); + }); + + it('batches a large id-list under the per_page cap, one POST per batch', async () => { + const ids = Array.from( + { length: 450 }, + (_unused, index) => `https://example.org/class/${index}`, + ); + const docsById = Object.fromEntries( + ids.map((id) => [id, { label_nl: id }]), + ); + const { client, calls } = fakeClient(docsById); + const labels = await fetchLabels(client, 'labels', ids); + // 450 ids → batches of 200, 200, 50. + expect(calls.map((batch) => batch.length)).toEqual([200, 200, 50]); + expect(labels.size).toBe(450); + }); + + it('makes no request for an empty id-list', async () => { + const { client, calls } = fakeClient({}); + const labels = await fetchLabels(client, 'labels', []); + expect(labels.size).toBe(0); + expect(calls).toHaveLength(0); + }); +}); diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts index acdd9f7a..6556e7b3 100644 --- a/packages/search-typesense/test/query-compiler.test.ts +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -30,12 +30,23 @@ const schema: SearchSchema = { array: true, facetable: true, filterable: true, - group: { name: 'format_group', prefix: 'group:' }, }, // Filter-only, non-facet (tokenized) → exact `:=` membership. 
{ name: 'catalog', kind: 'keyword', array: true, filterable: true }, { name: 'status', kind: 'keyword', facetable: true, filterable: true }, - { name: 'size', kind: 'integer', filterable: true, sortable: true }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + facetable: true, + // Half-open `[min, max)` bins; the last is open-ended (no upper bound). + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10, max: 100 }, + { key: '2', min: 100 }, + ], + }, { name: 'iiif', kind: 'boolean', filterable: true, facetable: true }, ], }; @@ -78,7 +89,7 @@ describe('buildSearchParams', () => { ).toBe(3); }); - it('compiles where clauses, with exact membership for non-facet fields and grouped OR', () => { + it('compiles where clauses, with exact membership for non-facet fields', () => { const params = buildSearchParams( { ...base, @@ -97,7 +108,7 @@ describe('buildSearchParams', () => { 'status:[`valid`] && ' + 'keyword:[`kaarten`,`atlas`] && ' + 'catalog:=[`urn:cat`] && ' + - '(format:[`text/turtle`] || format_group:[`group:rdf`]) && ' + + 'format:[`text/turtle`,`group:rdf`] && ' + 'size:[1..10] && ' + 'iiif:=true', ); @@ -147,10 +158,44 @@ describe('buildSearchParams', () => { ).toBe('title_sort_nl:asc,status_rank:asc'); }); + it('pins page to 1 for a facet-only (limit:0) query instead of dividing by zero', () => { + const params = buildSearchParams({ ...base, limit: 0 }, schema); + expect(params.per_page).toBe(0); + expect(params.page).toBe(1); + }); + it('requests facets by their logical field name', () => { expect( buildSearchParams({ ...base, facets: ['keyword', 'format'] }, schema) .facet_by, ).toBe('keyword,format'); }); + + it('facets a range field into its declared half-open bins, open ends blank', () => { + // Typesense range syntax is start-inclusive/end-exclusive, so the declared + // `[min, max)` bounds pass straight through; the open-ended bin leaves the + // upper bound blank. 
+ expect( + buildSearchParams({ ...base, facets: ['size'] }, schema).facet_by, + ).toBe('size(0:[1, 10], 1:[10, 100], 2:[100, ])'); + }); + + it('mixes range and plain facets in one facet_by clause', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword', 'size'] }, schema) + .facet_by, + ).toBe('keyword,size(0:[1, 10], 1:[10, 100], 2:[100, ])'); + }); + + it('omits max_facet_values by default but sets it when configured', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword'] }, schema) + .max_facet_values, + ).toBeUndefined(); + expect( + buildSearchParams({ ...base, facets: ['keyword'] }, schema, { + maxFacetValues: 250, + }).max_facet_values, + ).toBe(250); + }); }); diff --git a/packages/search-typesense/vite.config.ts b/packages/search-typesense/vite.config.ts index a09c9579..9184cdbe 100644 --- a/packages/search-typesense/vite.config.ts +++ b/packages/search-typesense/vite.config.ts @@ -16,10 +16,10 @@ export default mergeConfig( // rethrow guards and best-effort cleanup paths are deliberately not // exercised, which is why branch coverage is lower. thresholds: { - functions: 87.5, - lines: 84.7, - branches: 66.66, - statements: 84.88, + functions: 97.14, + lines: 93.28, + branches: 83.75, + statements: 93.37, }, }, }, diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts index 59284d7f..bcf61657 100644 --- a/packages/search/src/engine.ts +++ b/packages/search/src/engine.ts @@ -4,8 +4,9 @@ import type { SearchSchema } from './schema.js'; /** * The engine port — the boundary a concrete engine adapter (e.g. * `@lde/search-typesense`’s `TypesenseSearchEngine`) implements. 
The adapter - * owns every engine specific (companion-field expansion, `query_by`/weights, the - * filter compiler, `sort_by`, folding, `facet_by`) and returns only logical + * owns every engine specific (companion-field expansion, full-text field + * selection and weights, filter compilation, sorting, result folding, faceting) + * and returns only logical * documents, so a deployment can swap engines without any consumer noticing. * Nothing engine-specific and nothing RDF-specific leaks past this port. * @@ -137,4 +138,12 @@ export interface FacetBucket { readonly value: string; readonly count: number; readonly label?: LocalizedValue; + /** + * For a range-facet bucket: its half-open bounds (`min` inclusive, `max` + * exclusive), echoing the declared {@link FacetRange} so the bucket is + * self-describing and a consumer never hardcodes the bin formula. Both absent + * for a value facet; either absent for an open-ended bin. + */ + readonly min?: number; + readonly max?: number; } diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index cb02290e..37bc4db3 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -21,6 +21,7 @@ export type { SearchSchema, Derivation, PhysicalFields, + FacetRange, } from './schema.js'; // Engine- and protocol-neutral query IR + filter semantics. diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts index 2873d99c..41ed5356 100644 --- a/packages/search/src/schema.ts +++ b/packages/search/src/schema.ts @@ -30,11 +30,10 @@ export type FieldKind = * are independent opt-ins: a field exposes exactly the roles it declares. A * field with no `path` is a **derived field** — populated by a * {@link Derivation} rather than projected from the IR — yet it still carries - * full query/schema/output behavior (e.g. `status`, the `*_group` companions, - * the compatibility booleans). + * full query/schema/output behavior (e.g. `status`, the compatibility booleans). 
* * The physical field names a declaration fans out to (per-locale search/sort - * keys, the grouped-facet companion, …) follow one convention, owned by + * keys) follow one convention, owned by * {@link physicalFields} so projection, collection-schema and query compiler * cannot disagree. */ @@ -74,13 +73,32 @@ export interface SearchField { }; /** Projection-time value transform (e.g. strip a media-type prefix). */ readonly transform?: (value: string) => string; - /** Grouped-facet companion (a coarse `${name}_group`; deployment delta). */ - readonly group?: { readonly name: string; readonly prefix: string }; + /** + * Range-facet bins for a numeric (`integer`/`number`/`date`) facetable field. + * When set, the field facets into these fixed half-open `[min, max)` ranges (a + * histogram) rather than one bucket per distinct value — the per-bucket counts + * a UI slider needs. Bins are query-time only (no index impact) and + * engine-neutral: the Typesense adapter emits a `facet_by` range, an + * Elasticsearch adapter a `range` aggregation. See {@link FacetRange}. + */ + readonly facetRanges?: readonly FacetRange[]; +} + +/** + * One half-open `[min, max)` range-facet bin: `min` inclusive, `max` exclusive, + * so contiguous bins partition cleanly with no boundary double-counting. Omit + * `min` (or `max`) for an open-ended bin (`< max`, resp. `≥ min`). `key` is the + * bucket’s stable label, echoed back as the {@link FacetBucket} `value`. + */ +export interface FacetRange { + readonly key: string; + readonly min?: number; + readonly max?: number; } /** * A computed field that is not a direct projection of a single path — a status - * rank, a `*_group` derived from a code table, a compatibility boolean. Reads + * rank, a compatibility boolean. Reads * the framed node and writes onto the flat document the field specs already * populated. 
*/ @@ -116,8 +134,6 @@ export interface PhysicalFields { /** Per-locale folded sort keys `${name}_sort_${locale}` (localized text, * `sortable`); a non-localized field sorts on its `value`. */ readonly sort: readonly string[]; - /** The grouped-facet companion `${name}_group`, when `group` is declared. */ - readonly group?: string; } /** @@ -179,6 +195,5 @@ export function physicalFields(field: SearchField): PhysicalFields { localized && field.sortable ? locales.map((locale) => `${field.name}_sort_${locale}`) : [], - group: field.group ? `${field.name}_group` : undefined, }; } diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts index bd52d449..08ab0fd5 100644 --- a/packages/search/test/schema.test.ts +++ b/packages/search/test/schema.test.ts @@ -101,27 +101,6 @@ describe('physicalFields', () => { }); }); - it('adds the `${name}_group` companion when a field declares a group', () => { - const format: SearchField = { - name: 'format', - kind: 'keyword', - array: true, - facetable: true, - group: { - name: 'format_group', - prefix: 'https://www.iana.org/assignments/media-types/', - }, - }; - - expect(physicalFields(format)).toEqual({ - value: 'format', - display: [], - search: [], - sort: [], - group: 'format_group', - }); - }); - it('emits only the search keys for a search-only localized field (no display, no sort)', () => { const creator: SearchField = { name: 'creator', diff --git a/packages/search/vite.config.ts b/packages/search/vite.config.ts index 6a8321a2..915a945a 100644 --- a/packages/search/vite.config.ts +++ b/packages/search/vite.config.ts @@ -11,9 +11,9 @@ export default mergeConfig( coverage: { thresholds: { functions: 100, - lines: 97.3, - branches: 88.76, - statements: 97.3, + lines: 97.84, + branches: 90.9, + statements: 97.91, }, }, }, From 8e61bbd1469f3a39d4b4a18919942b8573f9e6f9 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Wed, 1 Jul 2026 12:13:55 +0200 Subject: [PATCH 08/35] build(deps): add 
@lde/search-api-graphql to the lockfile and refresh @lde/* pins npm ci failed because the lockfile lacked the new @lde/search-api-graphql workspace. Regenerating against npmjs adds it and brings ~24 @lde/* internal deps up to their latest in-range patches; no third-party or duplicate-version changes. --- package-lock.json | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/package-lock.json b/package-lock.json index 50845ee3..dd6a4bc4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -24949,6 +24949,10 @@ "resolved": "packages/search", "link": true }, + "node_modules/@lde/search-api-graphql": { + "resolved": "packages/search-api-graphql", + "link": true + }, "node_modules/@lde/search-typesense": { "resolved": "packages/search-typesense", "link": true @@ -32504,7 +32508,6 @@ "version": "15.10.2", "resolved": "https://registry.npmjs.org/graphql/-/graphql-15.10.2.tgz", "integrity": "sha512-1PRqdDPAmViWr4h1GVBT8RoPZfWSGZa7kDzleTilOfVIslsgf+cia3Nl95v1KDmR4iERPaT7WzQ+tN4MJmbg3w==", - "dev": true, "license": "MIT", "engines": { "node": ">= 10.x" @@ -40442,7 +40445,7 @@ "commander": "^15.0.0", "cron": "^4.1.0", "drizzle-kit": "1.0.0-rc.4", - "drizzle-orm": "^1.0.0-rc.4", + "drizzle-orm": "1.0.0-rc.4", "postgres": "^3.4.9", "tslib": "^2.3.0" }, @@ -42921,11 +42924,23 @@ "n3": "^2.1.0" } }, + "packages/search-api-graphql": { + "name": "@lde/search-api-graphql", + "version": "0.1.0", + "license": "MIT", + "dependencies": { + "@lde/search": "^0.1.2", + "graphql": "^15.8.0", + "tslib": "^2.3.0" + } + }, "packages/search-typesense": { "name": "@lde/search-typesense", "version": "0.1.1", "license": "MIT", "dependencies": { + "@lde/search": "^0.1.2", + "@lde/text-normalization": "^0.1.1", "tslib": "^2.3.0", "typesense": "^3.0.6" }, From 32f3335d24a346f79d4e62296a748eca08768e54 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Wed, 1 Jul 2026 12:18:10 +0200 Subject: [PATCH 09/35] fix(search-typesense): narrow possibly-undefined facet 
buckets in the search-engine test `result.facets` is a `Partial` record, so a facet is `FacetBucket[] | undefined`; guard the two spreads with `?? []` so the `typecheck` target passes (it never ran in CI before the lockfile fix). --- packages/search-typesense/test/search-engine.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts index 3a392f8a..8847d2d7 100644 --- a/packages/search-typesense/test/search-engine.test.ts +++ b/packages/search-typesense/test/search-engine.test.ts @@ -198,7 +198,7 @@ describe('createTypesenseSearchEngine (integration)', () => { ); // Plain facet: value + count, no label. - const keyword = [...result.facets.keyword].sort( + const keyword = [...(result.facets.keyword ?? [])].sort( (a, b) => b.count - a.count, ); expect(keyword).toEqual([ @@ -207,7 +207,7 @@ describe('createTypesenseSearchEngine (integration)', () => { ]); // Reference facet: IRI-keyed buckets carry the resolved data label. - const publisher = [...result.facets.publisher].sort( + const publisher = [...(result.facets.publisher ?? [])].sort( (a, b) => b.count - a.count, ); expect(publisher).toEqual([ From ec7866fce8449090b8d1fe1c421abf8019b574a2 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Wed, 1 Jul 2026 15:55:10 +0200 Subject: [PATCH 10/35] docs(search): state ADR 3 design directly, without dated update annotations Fold the unified-field-model blockquote and the dated Consequences bullet into running text, so the ADR reads as the current design rather than a change log. 
--- .../decisions/0003-search-api-core-query-model.md | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 38f9e697..09e08a4a 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -49,13 +49,11 @@ NodeShape + its `search:` annotations. **One `SearchField` declaration drives fo consumers** – projection (RDF→flat document), the engine collection schema, the query semantics, and the GraphQL surface – so they cannot drift. -> Updated 2026-06-26 (during implementation): this is the **unified** field model. It -> folds the three previously separate declarations into one – the projection-side -> `FieldSpec`/`FieldKind` (RDF→doc), the deployment’s Typesense `SEARCH_FIELDS` (collection -> schema + weights), and the query model below. The original ADR deferred this unification; -> it is now adopted (option “c”). The `kind` + capability flags replace the old discriminated -> projection kinds, derived fields become first-class, and the Typesense-vocabulary types are -> _derived_ from `kind` rather than re-declared. +It is a **unified** model: one declaration in place of three otherwise-separate ones – the +projection-side `FieldSpec`/`FieldKind`, the Typesense `SEARCH_FIELDS` (collection schema + +weights), and the query model below. `kind` plus capability flags replace the discriminated +projection kinds, derived fields are first-class, and the Typesense-vocabulary types are +_derived_ from `kind` rather than re-declared. ```ts type FieldKind = @@ -318,9 +316,6 @@ not enabled for DR v1, more relevant for B/C. - Carried through: the Stable API Contract discipline, the reference `strategy` concept, the surface `LanguageString` list, folding at the adapter boundary + query side via `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. 
-- Adopted during implementation (2026-06-26): the **unified** field model – the projection - `FieldSpec` (RDF→doc) and the deployment’s Typesense `SEARCH_FIELDS` are folded into this - one `SearchField` (see the Field model note above). - Deferred: REST surface; framed-JSON-LD materialised view (nested storage, index-time label inlining, detail-page-on-index, terms-collection split); semantic/hybrid (vector) search. From f486988077dc8f037a7458d2aa2c89d29bd5f6f2 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Thu, 2 Jul 2026 19:46:42 +0200 Subject: [PATCH 11/35] feat(search)!: rename the per-type SearchSchema to SearchType - SearchType is one root type declaration (one SHACL NodeShape, one GraphQL object type); SearchSchema now names the whole search declaration: a ReadonlyMap of SearchTypes keyed by type IRI, built with the new searchSchema() factory - projectGraph now consumes a SearchSchema instead of a SearchType array - rename buildSearchSchema / printSearchSchema / BuildSearchSchemaOptions to buildGraphQLSchema / printGraphQLSchema / BuildGraphQLSchemaOptions: they construct a GraphQLSchema rather than the SearchSchema the old names implied - rename schema parameters to searchType where they take one type, and the FacetFieldsOf/OutputFieldsOf/EngineFor/ResultFor generic from Schema to Type - add a Terminology section to the @lde/search README mapping SearchField / SearchType / SearchSchema onto SHACL and GraphQL; update ADRs 3 and 4, the package READMEs and npm descriptions - drop section-divider comments in build-schema.ts and stale grouped-facet mentions in the READMEs BREAKING CHANGE: the per-type interface SearchSchema is renamed to SearchType, and SearchSchema now denotes the type-keyed map built with searchSchema(). projectGraph(quads, types[]) becomes projectGraph(quads, searchSchema(...types)). 
In @lde/search-api-graphql, buildSearchSchema, printSearchSchema and BuildSearchSchemaOptions are renamed to buildGraphQLSchema, printGraphQLSchema and BuildGraphQLSchemaOptions. --- .../0003-search-api-core-query-model.md | 10 ++-- .../0004-search-api-graphql-surface.md | 28 +++++----- packages/search-api-graphql/README.md | 23 ++++---- packages/search-api-graphql/package.json | 2 +- .../search-api-graphql/src/build-schema.ts | 52 +++++++++---------- packages/search-api-graphql/src/index.ts | 4 +- .../test/build-schema.test.ts | 20 ++++--- .../test/generator-stability.test.ts | 10 ++-- packages/search-typesense/README.md | 6 +-- .../search-typesense/src/collection-schema.ts | 12 ++--- .../search-typesense/src/query-compiler.ts | 38 ++++++++------ packages/search-typesense/src/search.ts | 30 +++++------ .../test/collection-schema.test.ts | 4 +- .../test/generator-stability.test.ts | 4 +- .../test/parse-response.test.ts | 6 +-- .../test/query-compiler.test.ts | 4 +- .../test/search-engine.test.ts | 4 +- packages/search/README.md | 34 +++++++++--- packages/search/package.json | 2 +- packages/search/src/engine.ts | 46 ++++++++-------- packages/search/src/index.ts | 6 ++- packages/search/src/project.ts | 30 +++++------ packages/search/src/schema.ts | 38 ++++++++++---- packages/search/test/engine.test.ts | 6 +-- packages/search/test/project.test.ts | 18 ++++--- packages/search/test/schema.test.ts | 4 +- 26 files changed, 247 insertions(+), 194 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 09e08a4a..df74737c 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -85,7 +85,9 @@ interface SearchField { type Derivation = (document: SearchDocument, node: FramedNode) => void; -interface SearchSchema { +// One root type (one SHACL NodeShape); a whole deployment’s declaration is the +// SearchSchema, a map of SearchTypes 
keyed by type IRI (built with searchSchema()). +interface SearchType { readonly type: string; // sh:targetClass readonly fields: readonly SearchField[]; readonly derivations?: readonly Derivation[]; // computed fields: status, booleans @@ -194,15 +196,15 @@ SearchEngine` readable. ```ts // FacetField / OutputField default to `string` (ergonomic) and a deployment narrows them -// to its schema’s facetable / output field names for typo-safe facet and document access -// (helpers FacetFieldsOf / OutputFieldsOf, or the EngineFor alias). +// to its type’s facetable / output field names for typo-safe facet and document access +// (helpers FacetFieldsOf / OutputFieldsOf, or the EngineFor alias). interface SearchEngine< FacetField extends string = string, OutputField extends string = string, > { search( query: SearchQuery, - schema: SearchSchema, + searchType: SearchType, ): Promise>; } diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index c5b297da..678d6d04 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -21,23 +21,23 @@ separate package). ### Runtime configuration, not code generation The surface is **constructed at runtime from the field-model configuration** -(`buildSearchSchema(config)`), once at startup, with generic resolvers shipped in the package +(`buildGraphQLSchema(config)`), once at startup, with generic resolvers shipped in the package attached to that schema – nothing is emitted or committed. The resolvers are inherently generic (one root resolver maps args to a `SearchQuery`, calls the engine, and maps the result back; the field model only parameterises data), so codegen would emit N near-identical stubs that all delegate to the same logic, plus a build step and staleness risk, for no benefit. 
A live GraphQL API serves its own schema via introspection, so clients need no committed -`.graphql` file; the field-model diff is the reviewable change. `printSearchSchema()` exists +`.graphql` file; the field-model diff is the reviewable change. `printGraphQLSchema()` exists only as an **optional** CI snapshot test guarding the frozen contract against accidental breaking changes – not a shipped artifact. ### The schema-building function ```ts -// Generic over the config *value’s* type (capture it `as const satisfies SearchSchema`), so +// Generic over the config *value’s* type (capture it `as const satisfies SearchType`), so // one declaration drives both the runtime schema and the static TS types below. -function buildSearchSchema( +function buildGraphQLSchema( schema: S, options: { typeName: string; // 'Dataset' – drives all derived type names @@ -54,10 +54,10 @@ function buildSearchSchema( // Static types derived from the SAME config value’s type (compile-time only, erased at // runtime); one source, no codegen, no drift. Exported for typed in-process callers/tests. 
-type OutputOf; // { id: string; title: LanguageString[]; size: number | null; … } -type WhereOf; // { format?: StringFilter; size?: FloatRange; … } -type OrderByOf; // { field: 'RELEVANCE' | 'TITLE' | …; direction: 'ASC' | 'DESC' } -type FacetOf; // the facetable-field-name union +type OutputOf; // { id: string; title: LanguageString[]; size: number | null; … } +type WhereOf; // { format?: StringFilter; size?: FloatRange; … } +type OrderByOf; // { field: 'RELEVANCE' | 'TITLE' | …; direction: 'ASC' | 'DESC' } +type FacetOf; // the facetable-field-name union // also exported for manual composition / non-default servers: function buildSearchTypeDefsAndResolvers( @@ -65,17 +65,17 @@ function buildSearchTypeDefsAndResolvers( options, ): { typeDefs: string; resolvers: object }; // optional CI helper only: -function printSearchSchema(schema, options): string; // SDL, for a snapshot/breaking-change test +function printGraphQLSchema(schema, options): string; // SDL, for a snapshot/breaking-change test ``` -`buildSearchSchema` is the standalone, framework-agnostic artifact (depends only on +`buildGraphQLSchema` is the standalone, framework-agnostic artifact (depends only on `graphql` + `@graphql-tools/schema`). Deep customisation passes `extendTypeDefs`/ `extendResolvers` (merged before `makeExecutableSchema`, since Mercurius registers once) or composes the exported typeDefs/resolvers by hand. ### A typed surface the contract does not depend on -One `as const satisfies SearchSchema` declaration drives two **independent** projections: the +One `as const satisfies SearchType` declaration drives two **independent** projections: the **runtime contract** (the `GraphQLSchema`, built at startup by reading the value – `field.kind`, `output`, `facetable`, …) and a **static TS mirror** (`OutputOf` / `WhereOf` / `OrderByOf` / `FacetOf`, computed from `typeof schema` via mapped types). 
@@ -83,8 +83,8 @@ One `as const satisfies SearchSchema` declaration drives two **independent** pro The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time only and erased, so the served schema is byte-identical whether or not the mirror exists – it is a developer-experience overlay. The two derivations can drift (the runtime kind→GraphQL-type -mapping lives in `buildSearchSchema`; the type-level mapping in `OutputOf` duplicates it), -so the **contract** is guarded by the optional `printSearchSchema()` SDL snapshot (the real +mapping lives in `buildGraphQLSchema`; the type-level mapping in `OutputOf` duplicates it), +so the **contract** is guarded by the optional `printGraphQLSchema()` SDL snapshot (the real artifact), while the TS mirror only catches our own coding mistakes against it. Values are typed at both ends, with the resolver as the typed transform between them: @@ -294,7 +294,7 @@ untagged (`und`) last – so `[0]` is always the best available value. - **Hot path is the engine, not GraphQL.** Per-request cost is dominated by the Typesense round-trip; parse/validate/resolve of a small query is sub-millisecond. - **Introspection serves the contract** (cheap, client-cached). Leave it on, or disable in - production and use `printSearchSchema` for tooling. + production and use `printGraphQLSchema` for tooling. ### Context contract diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md index 88f8cdb3..d6274a9d 100644 --- a/packages/search-api-graphql/README.md +++ b/packages/search-api-graphql/README.md @@ -1,7 +1,7 @@ # @lde/search-api-graphql The GraphQL surface for the [`@lde/search`](../search) core. **Both engine- and -domain-agnostic:** it builds an executable `GraphQLSchema` from any `SearchSchema` +domain-agnostic:** it builds an executable `GraphQLSchema` from any `SearchType` at runtime, and serves it with one generic resolver over any `SearchEngine`. 
It names neither your **domain** (you pass `typeName` — `Dataset`, `Person`, `CreativeWork`, …) nor your **engine** (the resolver calls `context.engine`, be it @@ -9,16 +9,16 @@ names neither your **domain** (you pass `typeName` — `Dataset`, `Person`, ## Runtime configuration, not codegen -`buildSearchSchema(schema, { typeName })` constructs the schema once at startup -from the field model — no SDL artifact, no generated resolver stubs. The field -model is the single source; the GraphQL contract is whatever it produces. Output -types, the `where`/`orderBy`/facet inputs, reference types and nullability are all -derived from each field’s `kind` and capability flags. +`buildGraphQLSchema(searchType, { typeName })` constructs the schema once at +startup from the field model — no SDL artifact, no generated resolver stubs. The +field model is the single source; the GraphQL contract is whatever it produces. +Output types, the `where`/`orderBy`/facet inputs, reference types and nullability +are all derived from each field’s `kind` and capability flags. ```ts -import { buildSearchSchema } from '@lde/search-api-graphql'; +import { buildGraphQLSchema } from '@lde/search-api-graphql'; -const gqlSchema = buildSearchSchema(DATASET, { +const gqlSchema = buildGraphQLSchema(DATASET, { typeName: 'Dataset', queryDefaults: (query) => ({ ...query, @@ -50,6 +50,7 @@ The surface reads the same field model the index is built from, and compiles int the same neutral `SearchQuery` the engine consumes — so the API, the index and a future REST surface stay in lockstep. The contract is **frozen** (breaking to change), and because it is generated rather than hand-written, a _consumer_ guards -it with a `printSearchSchema(schema, options)` SDL snapshot over its **own** -schema and `typeName` — that snapshot also catches a `buildSearchSchema` change in -a future version of this library silently altering the consumer’s contract. 
+it with a `printGraphQLSchema(searchType, options)` SDL snapshot over its **own** +search type and `typeName` — that snapshot also catches a `buildGraphQLSchema` +change in a future version of this library silently altering the consumer’s +contract. diff --git a/packages/search-api-graphql/package.json b/packages/search-api-graphql/package.json index ea761b48..70f76450 100644 --- a/packages/search-api-graphql/package.json +++ b/packages/search-api-graphql/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search-api-graphql", "version": "0.1.0", - "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from any SearchSchema at runtime (no codegen), served by one generic resolver over any SearchEngine. You supply the schema and typeName; it names neither your domain nor your engine.", + "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from any SearchType at runtime (no codegen), served by one generic resolver over any SearchEngine. You supply the search type and typeName; it names neither your domain nor your engine.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search-api-graphql" diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts index f7449793..836cc2a6 100644 --- a/packages/search-api-graphql/src/build-schema.ts +++ b/packages/search-api-graphql/src/build-schema.ts @@ -28,7 +28,7 @@ import { type SearchEngine, type SearchField, type SearchQuery, - type SearchSchema, + type SearchType, type Sort, } from '@lde/search'; import { @@ -50,7 +50,7 @@ export interface SearchContext { readonly onFacetError?: (field: string, error: unknown) => void; } -export interface BuildSearchSchemaOptions { +export interface BuildGraphQLSchemaOptions { /** Drives all derived type names, e.g. `Dataset`. 
*/ readonly typeName: string; /** Root query field; defaults to the lowercased plural of `typeName`. */ @@ -86,9 +86,9 @@ function screamingSnake(name: string): string { * arguments to a {@link SearchQuery}, calls `context.engine`, and maps the result * back; the field model only parameterises data. */ -export function buildSearchSchema( - schema: SearchSchema, - options: BuildSearchSchemaOptions, +export function buildGraphQLSchema( + searchType: SearchType, + options: BuildGraphQLSchemaOptions, ): GraphQLSchema { const { typeName } = options; const languageOrder = options.languageOrder ?? defaultLanguageOrder; @@ -96,7 +96,6 @@ export function buildSearchSchema( options.queryField ?? `${typeName.charAt(0).toLowerCase()}${typeName.slice(1)}s`; - // --- Shared types --- const languageString = new GraphQLObjectType({ name: 'LanguageString', fields: { @@ -165,9 +164,9 @@ export function buildSearchSchema( }, }); - // --- Reference types, one per referenced shape, reused by every field. --- + // One reference type per referenced shape, reused by every field. 
const referenceTypes = new Map(); - for (const field of outputFields(schema)) { + for (const field of outputFields(searchType)) { if ( field.kind === 'reference' && field.ref && @@ -189,7 +188,6 @@ export function buildSearchSchema( } } - // --- Output type --- const outputType = new GraphQLObjectType({ name: typeName, fields: () => { @@ -199,7 +197,7 @@ export function buildSearchSchema( > = { id: { type: new GraphQLNonNull(GraphQLString) }, }; - for (const field of outputFields(schema)) { + for (const field of outputFields(searchType)) { fields[field.name] = outputFieldConfig(field); } return fields; @@ -263,12 +261,11 @@ export function buildSearchSchema( } } - // --- where / orderBy / facets --- const whereInput = new GraphQLInputObjectType({ name: `${typeName}Where`, fields: () => { const fields: Record = {}; - for (const field of filterableFields(schema)) { + for (const field of filterableFields(searchType)) { fields[field.name] = { type: whereFieldType(field) }; } return fields; @@ -293,7 +290,7 @@ export function buildSearchSchema( const sortValues: GraphQLEnumValueConfigMap = { RELEVANCE: { value: 'relevance' }, }; - for (const field of sortableFields(schema)) { + for (const field of sortableFields(searchType)) { sortValues[screamingSnake(field.name)] = { value: field.name }; } const sortField = new GraphQLEnumType({ @@ -323,7 +320,7 @@ export function buildSearchSchema( string, GraphQLFieldConfig > = {}; - for (const field of facetableFields(schema)) { + for (const field of facetableFields(searchType)) { const isRange = field.facetRanges !== undefined && field.facetRanges.length > 0; fields[field.name] = { @@ -350,7 +347,10 @@ export function buildSearchSchema( // rather than failing the whole query (which would null the non-null // result and discard the items + every other facet). 
try { - const result = await context.engine.search(facetQuery, schema); + const result = await context.engine.search( + facetQuery, + searchType, + ); return result.facets[field.name] ?? []; } catch (error) { context.onFacetError?.(field.name, error); @@ -392,14 +392,14 @@ export function buildSearchSchema( perPage: { type: GraphQLInt, defaultValue: 20 }, }, resolve: async (_source, args, context: SearchContext) => { - const built = argsToQuery(args as QueryArgs, context, schema); + const built = argsToQuery(args as QueryArgs, context, searchType); const finalQuery = options.queryDefaults ? options.queryDefaults(built, context) : built; // Items + total only; facets are resolved lazily per selected key. const result = await context.engine.search( { ...finalQuery, facets: [] }, - schema, + searchType, ); return { items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), @@ -425,14 +425,14 @@ export function buildSearchSchema( /** * The SDL of the built schema. Not a shipped artifact — a consumer uses it for an * optional CI snapshot test over its own schema, catching accidental breaking - * changes to its frozen contract (including a `buildSearchSchema` change in a + * changes to its frozen contract (including a `buildGraphQLSchema` change in a * future version of this library silently altering it). */ -export function printSearchSchema( - schema: SearchSchema, - options: BuildSearchSchemaOptions, +export function printGraphQLSchema( + searchType: SearchType, + options: BuildGraphQLSchemaOptions, ): string { - return printSchema(buildSearchSchema(schema, options)); + return printSchema(buildGraphQLSchema(searchType, options)); } interface QueryArgs { @@ -447,13 +447,13 @@ interface QueryArgs { function argsToQuery( args: QueryArgs, context: SearchContext, - schema: SearchSchema, + searchType: SearchType, ): SearchQuery { const perPage = args.perPage ?? 20; const page = args.page ?? 
1; return { text: args.query, - where: whereToFilters(args.where, schema), + where: whereToFilters(args.where, searchType), orderBy: args.orderBy ? [{ field: args.orderBy.field, direction: args.orderBy.direction }] : [], @@ -467,13 +467,13 @@ function argsToQuery( function whereToFilters( where: Record | undefined, - schema: SearchSchema, + searchType: SearchType, ): Filter[] { if (where === undefined) { return []; } const filters: Filter[] = []; - for (const field of filterableFields(schema)) { + for (const field of filterableFields(searchType)) { const value = where[field.name]; if (value === undefined || value === null) { continue; diff --git a/packages/search-api-graphql/src/index.ts b/packages/search-api-graphql/src/index.ts index 2fe7db46..20c13223 100644 --- a/packages/search-api-graphql/src/index.ts +++ b/packages/search-api-graphql/src/index.ts @@ -1,7 +1,7 @@ -export { buildSearchSchema, printSearchSchema } from './build-schema.js'; +export { buildGraphQLSchema, printGraphQLSchema } from './build-schema.js'; export type { SearchContext, - BuildSearchSchemaOptions, + BuildGraphQLSchemaOptions, } from './build-schema.js'; export { defaultLanguageOrder, toLanguageStrings } from './language.js'; export type { LanguageString, LanguageOrder } from './language.js'; diff --git a/packages/search-api-graphql/test/build-schema.test.ts b/packages/search-api-graphql/test/build-schema.test.ts index 6ba323de..243b0ec9 100644 --- a/packages/search-api-graphql/test/build-schema.test.ts +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -4,11 +4,11 @@ import type { SearchEngine, SearchQuery, SearchResult, - SearchSchema, + SearchType, } from '@lde/search'; -import { buildSearchSchema, type SearchContext } from '../src/build-schema.js'; +import { buildGraphQLSchema, type SearchContext } from '../src/build-schema.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { @@ -125,14 +125,14 @@ async 
function run( variables?: Record, ) { return graphql({ - schema: buildSearchSchema(schema, { typeName: 'Dataset' }), + schema: buildGraphQLSchema(schema, { typeName: 'Dataset' }), source, contextValue: context, variableValues: variables, }); } -describe('buildSearchSchema', () => { +describe('buildGraphQLSchema', () => { it('resolves a query, mapping the result to the typed output', async () => { const { engine, received } = fakeEngine(canned); const result = await run( @@ -454,7 +454,7 @@ describe('buildSearchSchema', () => { return canned; }, }; - const gqlSchema = buildSearchSchema(schema, { + const gqlSchema = buildGraphQLSchema(schema, { typeName: 'Dataset', queryDefaults: (query) => ({ ...query, @@ -474,7 +474,9 @@ describe('buildSearchSchema', () => { }); it('derives nullability: required scalar non-null, optional scalar nullable, arrays/booleans non-null', () => { - const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); + const sdl = printSchema( + buildGraphQLSchema(schema, { typeName: 'Dataset' }), + ); expect(sdl).toMatch(/status: String!/); // required expect(sdl).toMatch(/size: Int\b(?!!)/); // optional → nullable expect(sdl).toMatch(/title: \[LanguageString!\]!/); @@ -484,7 +486,9 @@ describe('buildSearchSchema', () => { }); it('builds the where, orderBy enum and keyed facets object from the field model', () => { - const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); + const sdl = printSchema( + buildGraphQLSchema(schema, { typeName: 'Dataset' }), + ); expect(sdl).toMatch(/enum DatasetSortField/); expect(sdl).toMatch(/RELEVANCE/); expect(sdl).toMatch(/SIZE/); diff --git a/packages/search-api-graphql/test/generator-stability.test.ts b/packages/search-api-graphql/test/generator-stability.test.ts index 78a86f40..c78b1535 100644 --- a/packages/search-api-graphql/test/generator-stability.test.ts +++ b/packages/search-api-graphql/test/generator-stability.test.ts @@ -1,15 +1,15 @@ import { describe, expect, it } 
from 'vitest'; -import type { SearchSchema } from '@lde/search'; -import { printSearchSchema } from '../src/build-schema.js'; +import type { SearchType } from '@lde/search'; +import { printGraphQLSchema } from '../src/build-schema.js'; /** * A neutral fixture exercising every kind + capability — NOT a real domain. Its * SDL is snapshotted purely to pin the **generator**: any change to how - * `buildSearchSchema` maps the field model (nullability, type names, enums, + * `buildGraphQLSchema` maps the field model (nullability, type names, enums, * reference reuse) surfaces as a snapshot diff before this library is published, * so a consumer’s contract can’t shift from under it by accident. */ -const THING: SearchSchema = { +const THING: SearchType = { type: 'https://example.org/Thing', fields: [ { @@ -92,6 +92,6 @@ const THING: SearchSchema = { describe('GraphQL generator stability', () => { it('emits a stable SDL for a representative schema', () => { - expect(printSearchSchema(THING, { typeName: 'Thing' })).toMatchSnapshot(); + expect(printGraphQLSchema(THING, { typeName: 'Thing' })).toMatchSnapshot(); }); }); diff --git a/packages/search-typesense/README.md b/packages/search-typesense/README.md index ea681cae..efffc145 100644 --- a/packages/search-typesense/README.md +++ b/packages/search-typesense/README.md @@ -2,7 +2,7 @@ [Typesense](https://typesense.org/) engine adapter for the engine- and domain-agnostic [`@lde/search`](../search) core. **Engine-specific (Typesense) but -domain-agnostic** – you supply a `SearchSchema`; this package never names your +domain-agnostic** – you supply a `SearchType`; this package never names your domain. It is the Typesense implementation of the `SearchEngine` port: it derives a collection schema from the field model, compiles the neutral `SearchQuery` into Typesense search params, runs it, reconstructs the engine-neutral `SearchResult`, @@ -10,10 +10,10 @@ and manages the index lifecycle (blue/green rebuild). 
## Collection schema and engine -`buildCollectionSchema(schema, { name, defaultSortingField, … })` derives a +`buildCollectionSchema(searchType, { name, defaultSortingField, … })` derives a Typesense collection from the unified `SearchField` model — the Typesense field type comes from each field’s `kind`, and the physical fanout (per-locale -search/sort keys, the `_group` companion) matches what the projection writes, via +search/sort keys) matches what the projection writes, via `@lde/search`’s `physicalFields`, so the index and the documents cannot drift. `createTypesenseSearchEngine(client, { collection, labelsCollection })` is the diff --git a/packages/search-typesense/src/collection-schema.ts b/packages/search-typesense/src/collection-schema.ts index 37f0d378..af133b08 100644 --- a/packages/search-typesense/src/collection-schema.ts +++ b/packages/search-typesense/src/collection-schema.ts @@ -1,10 +1,6 @@ import type { CollectionCreateSchema } from 'typesense'; import type { CollectionFieldSchema } from 'typesense/lib/Typesense/Collection.js'; -import { - physicalFields, - type SearchField, - type SearchSchema, -} from '@lde/search'; +import { physicalFields, type SearchField, type SearchType } from '@lde/search'; /** Deployment-specific options the generic field model does not carry. */ export interface CollectionSchemaOptions { @@ -20,7 +16,7 @@ export interface CollectionSchemaOptions { } /** - * Build a Typesense collection schema from the unified {@link SearchSchema}, so + * Build a Typesense collection schema from the unified {@link SearchType}, so * the index and the projection are driven by one declarative source and cannot * drift. Each field fans out into the same physical fields the projection writes * ({@link physicalFields}); the Typesense field type is derived from the field @@ -31,13 +27,13 @@ export interface CollectionSchemaOptions { * field stems in `defaultLocale`. 
*/ export function buildCollectionSchema( - schema: SearchSchema, + searchType: SearchType, options: CollectionSchemaOptions, ): CollectionCreateSchema { const defaultLocale = options.defaultLocale ?? 'nl'; const collection: CollectionCreateSchema = { name: options.name, - fields: schema.fields.flatMap((field) => + fields: searchType.fields.flatMap((field) => typesenseFields(field, defaultLocale, options.defaultSortingField), ), }; diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts index 662eb393..dfeede8c 100644 --- a/packages/search-typesense/src/query-compiler.ts +++ b/packages/search-typesense/src/query-compiler.ts @@ -7,7 +7,7 @@ import { type Filter, type SearchField, type SearchQuery, - type SearchSchema, + type SearchType, type Sort, } from '@lde/search'; @@ -31,17 +31,17 @@ export interface CompileOptions { export function buildSearchParams( query: SearchQuery, - schema: SearchSchema, + searchType: SearchType, options: CompileOptions = {}, ): SearchParams { const folded = query.text !== undefined && query.text.length > 0 ? fold(query.text) : undefined; - const { names, weights } = queryFields(schema, query.locale); - const filterBy = compileFilterBy(query.where, schema); + const { names, weights } = queryFields(searchType, query.locale); + const filterBy = compileFilterBy(query.where, searchType); const sortBy = query.orderBy - .map((sort) => compileSort(sort, schema, query.locale)) + .map((sort) => compileSort(sort, searchType, query.locale)) .join(','); const params: SearchParams = { q: folded ?? 
'*', @@ -59,7 +59,7 @@ export function buildSearchParams( params.sort_by = sortBy; } if (query.facets.length > 0) { - params.facet_by = compileFacetBy(query.facets, schema); + params.facet_by = compileFacetBy(query.facets, searchType); if (options.maxFacetValues !== undefined) { params.max_facet_values = options.maxFacetValues; } @@ -76,11 +76,13 @@ export function buildSearchParams( */ function compileFacetBy( facets: readonly string[], - schema: SearchSchema, + searchType: SearchType, ): string { return facets .map((name) => { - const field = schema.fields.find((candidate) => candidate.name === name); + const field = searchType.fields.find( + (candidate) => candidate.name === name, + ); return field?.facetRanges !== undefined && field.facetRanges.length > 0 ? compileRangeFacet(field.name, field.facetRanges) : name; @@ -107,12 +109,12 @@ function compileRangeFacet( * still surface. */ function queryFields( - schema: SearchSchema, + searchType: SearchType, locale: string, ): { readonly names: string[]; readonly weights: number[] } { const names: string[] = []; const weights: number[] = []; - for (const field of searchableFields(schema)) { + for (const field of searchableFields(searchType)) { const search = physicalFields(field).search; const baseWeight = field.searchable.weight; if (field.kind === 'text' && field.localized === true) { @@ -136,19 +138,19 @@ function queryFields( /** AND-join the compiled `where` clauses; skips unknown fields and empty clauses. 
*/ function compileFilterBy( where: readonly Filter[], - schema: SearchSchema, + searchType: SearchType, ): string { return where - .map((filter) => compileFilter(filter, schema)) + .map((filter) => compileFilter(filter, searchType)) .filter((clause): clause is string => clause !== undefined) .join(' && '); } function compileFilter( filter: Filter, - schema: SearchSchema, + searchType: SearchType, ): string | undefined { - const field = schema.fields.find( + const field = searchType.fields.find( (candidate) => candidate.name === filter.field, ); if (field === undefined) { @@ -209,11 +211,15 @@ function compileRange( * text field sorts on its active-locale folded key; any other field (including a * deployment tie-break like `status_rank`) sorts on its own name. */ -function compileSort(sort: Sort, schema: SearchSchema, locale: string): string { +function compileSort( + sort: Sort, + searchType: SearchType, + locale: string, +): string { if (sort.field === 'relevance') { return `_text_match:${sort.direction}`; } - const field = schema.fields.find( + const field = searchType.fields.find( (candidate) => candidate.name === sort.field, ); if ( diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts index 3e2a9959..fc52aca4 100644 --- a/packages/search-typesense/src/search.ts +++ b/packages/search-typesense/src/search.ts @@ -10,7 +10,7 @@ import { type SearchHit, type SearchQuery, type SearchResult, - type SearchSchema, + type SearchType, type SearchValue, } from '@lde/search'; import { buildSearchParams, escapeFilterValue } from './query-compiler.js'; @@ -90,9 +90,9 @@ export function createTypesenseSearchEngine( return { async search( query: SearchQuery, - schema: SearchSchema, + searchType: SearchType, ): Promise { - const params = buildSearchParams(query, schema, { + const params = buildSearchParams(query, searchType, { maxFacetValues: options.maxFacetValues, }); const response = (await client @@ -111,20 +111,20 @@ export 
function createTypesenseSearchEngine( options.labelsCollection, options.labelCacheTtlMs, ); - labels = selectLabels(allLabels, referenceIris(response, schema)); + labels = selectLabels(allLabels, referenceIris(response, searchType)); } else { try { labels = await fetchLabels( client, options.labelsCollection, - referenceIris(response, schema), + referenceIris(response, searchType), ); } catch (error) { options.onLabelError?.(error); } } } - return parseSearchResponse(response, schema, labels); + return parseSearchResponse(response, searchType, labels); }, }; } @@ -169,17 +169,17 @@ function selectLabels( /** Every distinct reference IRI whose label the result will actually use. */ function referenceIris( response: TypesenseSearchResponse, - schema: SearchSchema, + searchType: SearchType, ): string[] { const referenceFieldSet = new Set( - schema.fields + searchType.fields .filter((field) => field.kind === 'reference') .map((field) => field.name), ); // Hits only carry labels for OUTPUT reference fields: reconstructDocument skips // non-output fields, so resolving a non-output reference's hit labels (e.g. a // facet-only `class` with dozens of IRIs per hit) is pure waste. - const outputReferenceFields = outputFields(schema) + const outputReferenceFields = outputFields(searchType) .filter((field) => field.kind === 'reference') .map((field) => field.name); const iris = new Set(); @@ -288,17 +288,17 @@ export interface TypesenseSearchResponse { */ export function parseSearchResponse( response: TypesenseSearchResponse, - schema: SearchSchema, + searchType: SearchType, labels: ReadonlyMap, ): SearchResult { const hits: SearchHit[] = (response.hits ?? []).map((hit) => ({ id: String(hit.document.id), - document: reconstructDocument(hit.document, schema, labels), + document: reconstructDocument(hit.document, searchType, labels), })); // Reference facets are IRI-keyed; their buckets carry a resolved data label. 
// Plain facets (tokens, free strings) carry no label — the consumer owns display. const referenceFacets = new Set( - schema.fields + searchType.fields .filter((field) => field.kind === 'reference') .map((field) => field.name), ); @@ -307,7 +307,7 @@ export function parseSearchResponse( const labelled = referenceFacets.has(facet.field_name); // A range facet echoes the declared range key as the bucket value; look the // bin's half-open bounds back up by key so the bucket is self-describing. - const field = schema.fields.find( + const field = searchType.fields.find( (candidate) => candidate.name === facet.field_name, ); const rangesByKey = @@ -332,11 +332,11 @@ export function parseSearchResponse( /** Rebuild one logical document from a flat Typesense document. */ function reconstructDocument( flat: Record, - schema: SearchSchema, + searchType: SearchType, labels: ReadonlyMap, ): ResultDocument { const document: Record = {}; - for (const field of outputFields(schema)) { + for (const field of outputFields(searchType)) { if (field.kind === 'boolean') { // A boolean is always present; an absent value means false. 
document[field.name] = flat[field.name] === true; diff --git a/packages/search-typesense/test/collection-schema.test.ts b/packages/search-typesense/test/collection-schema.test.ts index 8d82507d..49711c1e 100644 --- a/packages/search-typesense/test/collection-schema.test.ts +++ b/packages/search-typesense/test/collection-schema.test.ts @@ -1,8 +1,8 @@ import { describe, expect, it } from 'vitest'; -import type { SearchSchema } from '@lde/search'; +import type { SearchType } from '@lde/search'; import { buildCollectionSchema } from '../src/collection-schema.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search-typesense/test/generator-stability.test.ts b/packages/search-typesense/test/generator-stability.test.ts index bb7eca2a..9b93d134 100644 --- a/packages/search-typesense/test/generator-stability.test.ts +++ b/packages/search-typesense/test/generator-stability.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from 'vitest'; -import type { SearchSchema } from '@lde/search'; +import type { SearchType } from '@lde/search'; import { buildCollectionSchema } from '../src/collection-schema.js'; /** @@ -9,7 +9,7 @@ import { buildCollectionSchema } from '../src/collection-schema.js'; * types, the physical fanout, stem/locale, optional/default-sorting-field) * surfaces as a snapshot diff before this library is published. 
*/ -const THING: SearchSchema = { +const THING: SearchType = { type: 'https://example.org/Thing', fields: [ { diff --git a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts index 55a09bdd..b59b1a47 100644 --- a/packages/search-typesense/test/parse-response.test.ts +++ b/packages/search-typesense/test/parse-response.test.ts @@ -1,5 +1,5 @@ import { afterEach, describe, expect, it, vi } from 'vitest'; -import type { LocalizedValue, SearchQuery, SearchSchema } from '@lde/search'; +import type { LocalizedValue, SearchQuery, SearchType } from '@lde/search'; import type { Client } from 'typesense'; import { createTypesenseSearchEngine, @@ -7,7 +7,7 @@ import { parseSearchResponse, } from '../src/search.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { @@ -150,7 +150,7 @@ describe('parseSearchResponse', () => { }); describe('parseSearchResponse range facets', () => { - const rangeSchema: SearchSchema = { + const rangeSchema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts index 6556e7b3..9a06d0f8 100644 --- a/packages/search-typesense/test/query-compiler.test.ts +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -1,8 +1,8 @@ import { describe, expect, it } from 'vitest'; -import type { SearchQuery, SearchSchema } from '@lde/search'; +import type { SearchQuery, SearchType } from '@lde/search'; import { buildSearchParams } from '../src/query-compiler.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts index 8847d2d7..32f94a59 100644 --- a/packages/search-typesense/test/search-engine.test.ts +++ 
b/packages/search-typesense/test/search-engine.test.ts @@ -1,11 +1,11 @@ import { afterAll, beforeAll, describe, expect, it } from 'vitest'; import type { Client } from 'typesense'; -import type { SearchEngine, SearchQuery, SearchSchema } from '@lde/search'; +import type { SearchEngine, SearchQuery, SearchType } from '@lde/search'; import { buildCollectionSchema } from '../src/collection-schema.js'; import { createTypesenseSearchEngine } from '../src/search.js'; import { TypesenseContainer } from './typesense-container.js'; -const datasetSchema: SearchSchema = { +const datasetSchema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search/README.md b/packages/search/README.md index 476170d9..ca84cd21 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -8,8 +8,9 @@ defined here. The library never names your domain — the same core drives a It provides four things: -- **the unified field model** — `SearchField` / `SearchSchema`: one declaration - per field that drives all four consumers below, so they cannot drift; +- **the unified field model** — `SearchField` / `SearchType` / `SearchSchema`: + one declaration per field that drives all four consumers below, so they + cannot drift; - **the neutral query IR** — `SearchQuery` / `Filter` / `Sort` + filter semantics, the shared compiler target every API surface parses into; - **the engine port** — `SearchEngine` and the logical result types @@ -29,17 +30,36 @@ plus capability flags (`searchable` / `filterable` / `facetable` / `sortable` / `output`) describe projection, the engine collection schema, the query semantics, and the API output in a single place. 
+## Terminology + +The model has three levels, mirroring both SHACL (the source vocabulary) and +GraphQL (one of the surfaces): + +| Term | What it is | SHACL | GraphQL | +| -------------- | --------------------------------------------------------------------------------------------------------------- | -------------- | ----------- | +| `SearchField` | One queryable field: a `kind`, the IR `path` it projects from, and the capability flags it opts into | property shape | field | +| `SearchType` | One root type’s complete declaration: its `type` IRI plus its fields and derivations | NodeShape | object type | +| `SearchSchema` | The whole search declaration: every `SearchType`, keyed by `type` IRI — build one with `searchSchema(...types)` | shapes graph | schema | + +`projectGraph` consumes a `SearchSchema` (it projects every type in one pass); +the engine port and the GraphQL surface operate on one `SearchType` at a time. + ## Field model The mapping is data, not code. Each field declares its `kind`, the IR `path` to read (omit it for a **derived** field, populated by a `derivation`), and the capabilities it opts into. The physical field names a declaration fans out to -(per-locale search/sort keys, the grouped-facet companion) come from +(per-locale search/sort keys) come from `physicalFields`, the single convention projection, the collection schema and the query compiler all share. 
```ts -import { projectGraph, irisOf, type SearchSchema } from '@lde/search'; +import { + projectGraph, + irisOf, + searchSchema, + type SearchType, +} from '@lde/search'; const DATASET = { type: 'http://www.w3.org/ns/dcat#Dataset', @@ -74,14 +94,14 @@ const DATASET = { document.classCount = irisOf(node, 'urn:dr:class').length; }, ], -} as const satisfies SearchSchema; +} as const satisfies SearchType; -for await (const document of projectGraph(quads, [DATASET])) { +for await (const document of projectGraph(quads, searchSchema(DATASET))) { // one flat search document per matching subject, streamed } ``` -Capturing the schema with `as const satisfies SearchSchema` keeps the field +Capturing the type with `as const satisfies SearchType` keeps the field literals, so the API surface can derive typed facet/output keys from it (see `@lde/search-api-graphql`). diff --git a/packages/search/package.json b/packages/search/package.json index e81f647f..6e7414c9 100644 --- a/packages/search/package.json +++ b/packages/search/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search", "version": "0.1.2", - "description": "Engine- and domain-agnostic core for RDF-backed search: a unified declarative field model (SearchField/SearchSchema), a neutral query IR, the SearchEngine port with logical result types, and a streaming CONSTRUCT-to-document projection. Bakes in no engine, protocol, or domain.", + "description": "Engine- and domain-agnostic core for RDF-backed search: a unified declarative field model (SearchField/SearchType/SearchSchema), a neutral query IR, the SearchEngine port with logical result types, and a streaming CONSTRUCT-to-document projection. 
Bakes in no engine, protocol, or domain.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search" diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts index bcf61657..1a47bd6b 100644 --- a/packages/search/src/engine.ts +++ b/packages/search/src/engine.ts @@ -1,5 +1,5 @@ import type { SearchQuery } from './query.js'; -import type { SearchSchema } from './schema.js'; +import type { SearchType } from './schema.js'; /** * The engine port — the boundary a concrete engine adapter (e.g. @@ -20,7 +20,7 @@ export interface SearchEngine< > { search( query: SearchQuery, - schema: SearchSchema, + searchType: SearchType, ): Promise>; } @@ -43,38 +43,38 @@ export type FacetMap = Readonly< >; /** - * The facet-field-name union of a schema — the keys a {@link SearchResult}’s - * `facets` can hold. Requires the schema be captured as a literal type - * (`as const satisfies SearchSchema`), so the `facetable: true` flags survive as - * literals; a plain `: SearchSchema` annotation widens them and yields `never`. + * The facet-field-name union of a search type — the keys a {@link SearchResult}’s + * `facets` can hold. Requires the type be captured as a literal + * (`as const satisfies SearchType`), so the `facetable: true` flags survive as + * literals; a plain `: SearchType` annotation widens them and yields `never`. */ -export type FacetFieldsOf = Extract< - Schema['fields'][number], +export type FacetFieldsOf = Extract< + Type['fields'][number], { readonly facetable: true } >['name']; /** - * The output-field-name union of a schema — the keys a {@link ResultDocument} - * can hold. Like {@link FacetFieldsOf}, requires the schema captured as a literal - * (`as const satisfies SearchSchema`). + * The output-field-name union of a search type — the keys a {@link ResultDocument} + * can hold. Like {@link FacetFieldsOf}, requires the type captured as a literal + * (`as const satisfies SearchType`). 
*/ -export type OutputFieldsOf = Extract< - Schema['fields'][number], +export type OutputFieldsOf = Extract< + Type['fields'][number], { readonly output: true } >['name']; -/** A {@link SearchEngine} narrowed to one schema: facet keys and document keys - * fixed to that schema’s facetable / output field names. The schema must be - * captured as `as const satisfies SearchSchema`. */ -export type EngineFor = SearchEngine< - FacetFieldsOf, - OutputFieldsOf +/** A {@link SearchEngine} narrowed to one search type: facet keys and document + * keys fixed to that type’s facetable / output field names. The type must be + * captured as `as const satisfies SearchType`. */ +export type EngineFor = SearchEngine< + FacetFieldsOf, + OutputFieldsOf >; -/** A {@link SearchResult} narrowed to one schema (see {@link EngineFor}). */ -export type ResultFor = SearchResult< - FacetFieldsOf, - OutputFieldsOf +/** A {@link SearchResult} narrowed to one search type (see {@link EngineFor}). */ +export type ResultFor = SearchResult< + FacetFieldsOf, + OutputFieldsOf >; /** diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index 37bc4db3..5f86c025 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -1,13 +1,14 @@ // Projection: RDF CONSTRUCT quads → flat search documents, driven by the unified -// SearchField/SearchSchema model below (one declaration; the fanout names come +// SearchField/SearchType model below (one declaration; the fanout names come // from `physicalFields`). export { projectGraph, irisOf, literalsOf, firstLiteralOf } from './project.js'; export type { SearchDocument } from './project.js'; // Unified field model: one declaration drives projection, engine collection -// schema, query semantics and the GraphQL surface. Plus the schema selectors and +// schema, query semantics and the GraphQL surface. Plus the field selectors and // the physical field-name convention they all share. 
export { + searchSchema, physicalFields, searchableFields, facetableFields, @@ -18,6 +19,7 @@ export { export type { FieldKind, SearchField, + SearchType, SearchSchema, Derivation, PhysicalFields, diff --git a/packages/search/src/project.ts b/packages/search/src/project.ts index 71e2416e..5aede395 100644 --- a/packages/search/src/project.ts +++ b/packages/search/src/project.ts @@ -5,6 +5,7 @@ import { physicalFields, type SearchField, type SearchSchema, + type SearchType, } from './schema.js'; /** A flat search document. `id` is the engine document key. */ @@ -12,45 +13,44 @@ export type SearchDocument = { id: string } & Record; /** * Project one framed JSON-LD node into a flat search document: apply each field - * of the schema, then run the derivations (which may read fields the field specs + * of the type, then run the derivations (which may read fields the field specs * already set). The physical field names a field fans out to come from * {@link physicalFields}, the single source shared with the engine collection * schema and the query compiler. */ export function projectDocument( node: FramedNode, - schema: SearchSchema, + searchType: SearchType, ): SearchDocument { const id = node['@id']; if (typeof id !== 'string') { throw new Error( - `Cannot project a ${schema.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, + `Cannot project a ${searchType.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, ); } const document: SearchDocument = { id }; - for (const field of schema.fields) { + for (const field of searchType.fields) { applyField(document, node, field); } - for (const derive of schema.derivations ?? []) { + for (const derive of searchType.derivations ?? 
[]) { derive(document, node); } return document; } /** - * Frame `quads` for every schema’s root type and project each node with its - * type’s schema — the multi-shape pipeline. Streams one document at a time so - * memory stays flat. The IR maps to a schema by type, so adding a shape is - * adding a `SearchSchema` (no engine change). + * Frame `quads` for every root type in the schema and project each node with its + * type’s declaration — the multi-shape pipeline. Streams one document at a time + * so memory stays flat. The IR maps to a declaration by type, so adding a shape + * is adding a `SearchType` to the schema (no engine change). */ export async function* projectGraph( quads: readonly Quad[], - schemas: readonly SearchSchema[], + schema: SearchSchema, ): AsyncIterable { - const byType = new Map(schemas.map((schema) => [schema.type, schema])); - for (const schema of byType.values()) { - for await (const node of frameByType(quads, schema.type)) { - yield projectDocument(node, schema); + for (const searchType of schema.values()) { + for await (const node of frameByType(quads, searchType.type)) { + yield projectDocument(node, searchType); } } } @@ -91,7 +91,7 @@ function applyField( isoToUnix(firstLiteralOf(node, path)), ); } - // `boolean` is not projected from a path in current schemas — booleans are + // `boolean` is not projected from a path in current search types — booleans are // derivation-populated (e.g. the compatibility vinkjes). } diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts index 41ed5356..7a687925 100644 --- a/packages/search/src/schema.ts +++ b/packages/search/src/schema.ts @@ -110,12 +110,24 @@ export type Derivation = (document: SearchDocument, node: FramedNode) => void; * shapes (and derived fields), `derivations` are its `sh:rule`-shaped computed * fields. A generator emits one of these per NodeShape. 
*/ -export interface SearchSchema { +export interface SearchType { readonly type: string; readonly fields: readonly SearchField[]; readonly derivations?: readonly Derivation[]; } +/** + * The complete search declaration of a deployment: every root {@link SearchType}, + * keyed by its `type` IRI — the runtime form of a whole SHACL shapes graph. + * Build one with {@link searchSchema}. + */ +export type SearchSchema = ReadonlyMap; + +/** Build a {@link SearchSchema} from root-type declarations, keyed by `type`. */ +export function searchSchema(...types: readonly SearchType[]): SearchSchema { + return new Map(types.map((searchType) => [searchType.type, searchType])); +} + /** * The physical engine fields one {@link SearchField} fans out into, grouped by * the role each plays. The single source of truth for the naming convention, so @@ -142,11 +154,11 @@ export interface PhysicalFields { * `searchable` weight. */ export function searchableFields( - schema: SearchSchema, + searchType: SearchType, ): readonly (SearchField & { readonly searchable: { readonly weight: number }; })[] { - return schema.fields + return searchType.fields .filter( (field): field is SearchField & { searchable: { weight: number } } => field.searchable !== undefined, @@ -155,23 +167,27 @@ export function searchableFields( } /** Fields returned as facet buckets, in declaration order. */ -export function facetableFields(schema: SearchSchema): readonly SearchField[] { - return schema.fields.filter((field) => field.facetable === true); +export function facetableFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.facetable === true); } /** Fields usable in `where`, in declaration order. 
*/ -export function filterableFields(schema: SearchSchema): readonly SearchField[] { - return schema.fields.filter((field) => field.filterable === true); +export function filterableFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.filterable === true); } /** Fields publicly selectable in `orderBy`, in declaration order. */ -export function sortableFields(schema: SearchSchema): readonly SearchField[] { - return schema.fields.filter((field) => field.sortable === true); +export function sortableFields(searchType: SearchType): readonly SearchField[] { + return searchType.fields.filter((field) => field.sortable === true); } /** Fields that appear in the API output type, in declaration order. */ -export function outputFields(schema: SearchSchema): readonly SearchField[] { - return schema.fields.filter((field) => field.output === true); +export function outputFields(searchType: SearchType): readonly SearchField[] { + return searchType.fields.filter((field) => field.output === true); } /** Derive the physical engine field names a declaration produces. 
*/ diff --git a/packages/search/test/engine.test.ts b/packages/search/test/engine.test.ts index 54ad819d..14966451 100644 --- a/packages/search/test/engine.test.ts +++ b/packages/search/test/engine.test.ts @@ -1,9 +1,9 @@ import { describe, expect, it } from 'vitest'; import type { EngineFor, SearchEngine, SearchResult } from '../src/engine.js'; import type { SearchQuery } from '../src/query.js'; -import type { SearchSchema } from '../src/schema.js'; +import type { SearchType } from '../src/schema.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [{ name: 'title', kind: 'text', localized: true, locales: ['nl'] }], }; @@ -73,7 +73,7 @@ describe('typed facet and document keys', () => { { name: 'format', kind: 'keyword', array: true, facetable: true }, { name: 'status', kind: 'keyword', facetable: true }, ], - } as const satisfies SearchSchema; + } as const satisfies SearchType; // facets ⊂ { format, status }, document keys ⊂ { title }. These object // literals would not compile if the helpers widened to `string`/`never`. 
diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index 592caac6..2cd261f2 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -7,7 +7,12 @@ import { irisOf, type SearchDocument, } from '../src/project.js'; -import type { SearchField, SearchSchema, Derivation } from '../src/schema.js'; +import { + searchSchema, + type SearchField, + type SearchType, + type Derivation, +} from '../src/schema.js'; const DR = 'urn:dr:'; const IANA = 'https://www.iana.org/assignments/media-types/'; @@ -80,7 +85,7 @@ const derivations: Derivation[] = [ }, ]; -const schema: SearchSchema = { type: DATASET, fields, derivations }; +const schema: SearchType = { type: DATASET, fields, derivations }; describe('projectDocument', () => { it('projects every field kind and runs derivations', () => { @@ -338,7 +343,7 @@ describe('projectDocument', () => { }); describe('projectGraph', () => { - it('frames each schema’s type and projects matching nodes', async () => { + it('frames each root type in the schema and projects matching nodes', async () => { const quads = new Parser({ format: 'N-Triples' }).parse(` <${rdf.type.value}> <${DATASET}> . <${dcterms.title.value}> "Titel"@nl . 
@@ -349,9 +354,10 @@ describe('projectGraph', () => { `); const documents: SearchDocument[] = []; - for await (const document of projectGraph(quads, [ - { type: DATASET, fields }, - ])) { + for await (const document of projectGraph( + quads, + searchSchema({ type: DATASET, fields }), + )) { documents.push(document); } diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts index 08ab0fd5..368821a6 100644 --- a/packages/search/test/schema.test.ts +++ b/packages/search/test/schema.test.ts @@ -7,12 +7,12 @@ import { searchableFields, sortableFields, type SearchField, - type SearchSchema, + type SearchType, } from '../src/schema.js'; const DATASET = 'http://www.w3.org/ns/dcat#Dataset'; -const schema: SearchSchema = { +const schema: SearchType = { type: DATASET, fields: [ { From 0fd49ed732d58203e390dc277096a5052f1b1bda Mon Sep 17 00:00:00 2001 From: David de Boer Date: Thu, 2 Jul 2026 19:47:07 +0200 Subject: [PATCH 12/35] docs(readme): add search packages to the packages table and diagram - add the missing @lde/search-api-graphql row to the packages table - add the search, search-typesense, search-api-graphql and text-normalization dependency edges to the architecture diagram, which lacked the search family entirely --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 627c6bf2..dae6f504 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,11 @@ await pipeline.run(); npm Project RDF into engine-agnostic search documents (framing + a declarative field spec) + + @lde/search-api-graphql + npm + Engine- and domain-agnostic GraphQL surface for search: builds an executable GraphQL schema from any SearchType at runtime + @lde/search-typesense npm @@ -229,6 +234,10 @@ graph TD subgraph Publication fastify-rdf docgen + search --> text-normalization + search-api-graphql --> search + search-typesense --> search + search-typesense --> text-normalization end subgraph Monitoring From 
e14638438ac275c60114800c3f1e5e51b98a5565 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Thu, 2 Jul 2026 19:53:05 +0200 Subject: [PATCH 13/35] test(search-typesense): update autoUpdate line-coverage threshold --- packages/search-typesense/vite.config.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/search-typesense/vite.config.ts b/packages/search-typesense/vite.config.ts index 9184cdbe..a6245e7b 100644 --- a/packages/search-typesense/vite.config.ts +++ b/packages/search-typesense/vite.config.ts @@ -17,7 +17,7 @@ export default mergeConfig( // exercised, which is why branch coverage is lower. thresholds: { functions: 97.14, - lines: 93.28, + lines: 93.31, branches: 83.75, statements: 93.37, }, From 0c31d67e82c8462edc6e65051a229be617616d41 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 09:13:46 +0200 Subject: [PATCH 14/35] feat(search): centralize shared helpers in the core, fix date-range filters - Add referenceFields, fieldNamed, isRangeFacet, pageForOffset and the date storage codec (isoToUnixSeconds/unixSecondsToIso) to @lde/search, replacing local re-derivations in the Typesense adapter and the GraphQL surface - Route the adapter's localized display and sort field names through physicalFields instead of hand-built name interpolation - Compile a date field's ISO range bounds to the stored Unix seconds; they were previously interpolated verbatim into the int64 filter and could never match - Project boolean fields from a path (xsd:boolean lexical space) instead of silently skipping them - Resolve reference labels in a single multi_search POST and start the cached label load alongside the main search - Remove dead API introduced on this branch: acceptsFilter, filterOperator, ResultFor, PhysicalFields.value, the Sort re-export and the toLanguageStrings package export; drop resolvers that duplicate graphql-js defaults - Trim ADR 4 to the shipped surface, deferring the TS mirror and extension hooks --- 
.../0004-search-api-graphql-surface.md | 67 +++------ .../search-api-graphql/src/build-schema.ts | 51 ++----- packages/search-api-graphql/src/index.ts | 2 +- packages/search-api-graphql/vite.config.ts | 2 +- .../search-typesense/src/query-compiler.ts | 60 ++++---- packages/search-typesense/src/search.ts | 129 ++++++++---------- .../test/parse-response.test.ts | 46 ++++--- .../test/query-compiler.test.ts | 38 ++++++ packages/search-typesense/vite.config.ts | 8 +- packages/search/src/engine.ts | 6 - packages/search/src/index.ts | 8 +- packages/search/src/project.ts | 25 ++-- packages/search/src/query.ts | 27 +--- packages/search/src/schema.ts | 53 ++++++- packages/search/test/project.test.ts | 20 +++ packages/search/test/query.test.ts | 67 +-------- packages/search/test/schema.test.ts | 51 ++++++- packages/search/vite.config.ts | 6 +- 18 files changed, 345 insertions(+), 321 deletions(-) diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index 678d6d04..e29befa3 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -35,71 +35,39 @@ breaking changes – not a shipped artifact. ### The schema-building function ```ts -// Generic over the config *value’s* type (capture it `as const satisfies SearchType`), so -// one declaration drives both the runtime schema and the static TS types below. 
-function buildGraphQLSchema( - schema: S, +function buildGraphQLSchema( + searchType: SearchType, options: { typeName: string; // 'Dataset' – drives all derived type names queryField?: string; // root field; default lowercased plural of typeName queryDefaults?: (q: SearchQuery, ctx: SearchContext) => SearchQuery; // consumer policy - languageOrder?: ( - available: readonly string[], - accept: readonly string[], - ) => readonly string[]; - extendTypeDefs?: string; // merged before build (compose-before-build) - extendResolvers?: Record; + languageOrder?: LanguageOrder; // output-language ordering; default Accept-Language first }, ): GraphQLSchema; // executable schema: types + generic resolvers attached -// Static types derived from the SAME config value’s type (compile-time only, erased at -// runtime); one source, no codegen, no drift. Exported for typed in-process callers/tests. -type OutputOf; // { id: string; title: LanguageString[]; size: number | null; … } -type WhereOf; // { format?: StringFilter; size?: FloatRange; … } -type OrderByOf; // { field: 'RELEVANCE' | 'TITLE' | …; direction: 'ASC' | 'DESC' } -type FacetOf; // the facetable-field-name union - -// also exported for manual composition / non-default servers: -function buildSearchTypeDefsAndResolvers( - schema, - options, -): { typeDefs: string; resolvers: object }; // optional CI helper only: -function printGraphQLSchema(schema, options): string; // SDL, for a snapshot/breaking-change test +function printGraphQLSchema(searchType, options): string; // SDL, for a snapshot/breaking-change test ``` `buildGraphQLSchema` is the standalone, framework-agnostic artifact (depends only on -`graphql` + `@graphql-tools/schema`). Deep customisation passes `extendTypeDefs`/ -`extendResolvers` (merged before `makeExecutableSchema`, since Mercurius registers once) or -composes the exported typeDefs/resolvers by hand. +`graphql`). Deep customisation of the emitted schema is deferred (see Consequences). 
-### A typed surface the contract does not depend on - -One `as const satisfies SearchType` declaration drives two **independent** projections: the -**runtime contract** (the `GraphQLSchema`, built at startup by reading the value – -`field.kind`, `output`, `facetable`, …) and a **static TS mirror** (`OutputOf` / -`WhereOf` / `OrderByOf` / `FacetOf`, computed from `typeof schema` via mapped types). - -The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time only -and erased, so the served schema is byte-identical whether or not the mirror exists – it is a -developer-experience overlay. The two derivations can drift (the runtime kind→GraphQL-type -mapping lives in `buildGraphQLSchema`; the type-level mapping in `OutputOf` duplicates it), -so the **contract** is guarded by the optional `printGraphQLSchema()` SDL snapshot (the real -artifact), while the TS mirror only catches our own coding mistakes against it. +### Typed boundaries, dynamic middle Values are typed at both ends, with the resolver as the typed transform between them: -| layer | localized text | reference | int64 | keyword (array) | boolean | -| ----------------------- | ------------------------------------ | --------------------------- | ---------------- | ----------------------- | -------------------- | -| IR (`ResultDocument`) | `LocalizedValue` (lang map) | `Reference` | `number` | `readonly string[]` | `boolean` | -| GraphQL (`OutputOf`) | `LanguageString[]` (best-first list) | named type (`Organization`) | `Float`/`number` | `[String!]!`/`string[]` | `Boolean!`/`boolean` | +| layer | localized text | reference | int64 | keyword (array) | boolean | +| --------------------- | ------------------------------------ | --------------------------- | ---------------- | ----------------------- | -------------------- | +| IR (`ResultDocument`) | `LocalizedValue` (lang map) | `Reference` | `number` | `readonly string[]` | `boolean` | +| GraphQL | `LanguageString[]` (best-first 
list) | named type (`Organization`) | `Float`/`number` | `[String!]!`/`string[]` | `Boolean!`/`boolean` | -What stays unchecked is only the generic resolver’s **dynamic middle**: it loops over the -field model with runtime-string names, so TS cannot prove the object it builds matches -`OutputOf` – it casts at that boundary, and graphql-js’s executor (not TS) enforces the +What stays unchecked is the generic resolver’s **dynamic middle**: it loops over the field +model with runtime-string names, so TS cannot prove the object it builds matches the emitted +output types – it casts at that boundary, and graphql-js’s executor (not TS) enforces the output types at runtime (a wrong-typed return raises a field error). Same “typed boundaries, dynamic middle” shape as the engine port and the projection: type the edges where it is -honest, accept a cast where iteration is inherently dynamic. +honest, accept a cast where iteration is inherently dynamic. The **contract** is guarded by +the optional `printGraphQLSchema()` SDL snapshot (the real artifact). ### Construction rules (field model → schema) @@ -324,4 +292,7 @@ Each transport populates it per request; no framework type appears in the packag `String`) and a `Long`/`BigInt` scalar for 64-bit integers (kept `Float`); transport-layer persisted queries / cost limits; a root or per-field language argument (Accept-Language is the sole preference mechanism); metadata-language-availability filtering (a facetable dimension, - not v1). + not v1); schema extension hooks (`extendTypeDefs`/`extendResolvers` or exported + typeDefs/resolvers for manual composition); a static TS mirror of the contract + (`OutputOf` / `WhereOf` / `OrderByOf` / `FacetOf` mapped types over an + `as const satisfies SearchType` declaration) for typed in-process callers. 
diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts index 836cc2a6..cb6652bf 100644 --- a/packages/search-api-graphql/src/build-schema.ts +++ b/packages/search-api-graphql/src/build-schema.ts @@ -20,16 +20,17 @@ import { facetableFields, filterableFields, filterOperatorFor, + isRangeFacet, outputFields, + pageForOffset, sortableFields, + unixSecondsToIso, type Filter, type LocalizedValue, - type Reference, type SearchEngine, type SearchField, type SearchQuery, type SearchType, - type Sort, } from '@lde/search'; import { defaultLanguageOrder, @@ -127,14 +128,8 @@ export function buildGraphQLSchema( const rangeBucket = new GraphQLObjectType({ name: 'RangeBucket', fields: { - min: { - type: GraphQLFloat, - resolve: (bucket: Source) => bucket.min ?? null, - }, - max: { - type: GraphQLFloat, - resolve: (bucket: Source) => bucket.max ?? null, - }, + min: { type: GraphQLFloat }, + max: { type: GraphQLFloat }, count: { type: new GraphQLNonNull(GraphQLInt) }, }, }); @@ -177,11 +172,10 @@ export function buildGraphQLSchema( new GraphQLObjectType({ name: field.ref.type, fields: { - id: { - type: new GraphQLNonNull(GraphQLString), - resolve: (source: Source) => (source as unknown as Reference).id, - }, - name: labelList((source) => (source as unknown as Reference).label), + id: { type: new GraphQLNonNull(GraphQLString) }, + name: labelList( + (source) => source.label as LocalizedValue | undefined, + ), }, }), ); @@ -207,7 +201,6 @@ export function buildGraphQLSchema( function outputFieldConfig( field: SearchField, ): GraphQLFieldConfig { - const passthrough = (source: Source) => source[field.name] ?? null; switch (field.kind) { case 'text': return labelList( @@ -219,7 +212,7 @@ export function buildGraphQLSchema( type: nonNullListOf(GraphQLString), resolve: (s) => s[field.name] ?? 
[], } - : { type: scalarOutput(GraphQLString, field), resolve: passthrough }; + : { type: scalarOutput(GraphQLString, field) }; case 'reference': { const referenceType = referenceTypes.get(field.ref?.type ?? '')!; return field.array === true @@ -232,16 +225,12 @@ export function buildGraphQLSchema( field.required === true ? new GraphQLNonNull(referenceType) : referenceType, - resolve: passthrough, }; } case 'integer': - return { type: scalarOutput(GraphQLInt, field), resolve: passthrough }; + return { type: scalarOutput(GraphQLInt, field) }; case 'number': - return { - type: scalarOutput(GraphQLFloat, field), - resolve: passthrough, - }; + return { type: scalarOutput(GraphQLFloat, field) }; case 'date': // Stored as Unix seconds (int64); the surface serves ISO 8601 (ADR 4). return { @@ -249,7 +238,7 @@ export function buildGraphQLSchema( resolve: (source) => { const value = source[field.name]; return typeof value === 'number' - ? new Date(value * 1000).toISOString() + ? unixSecondsToIso(value) : (value ?? null); }, }; @@ -321,10 +310,8 @@ export function buildGraphQLSchema( GraphQLFieldConfig > = {}; for (const field of facetableFields(searchType)) { - const isRange = - field.facetRanges !== undefined && field.facetRanges.length > 0; fields[field.name] = { - type: nonNullListOf(isRange ? rangeBucket : valueBucket), + type: nonNullListOf(isRangeFacet(field) ? rangeBucket : valueBucket), resolve: async ( source: Source, _args: unknown, @@ -404,12 +391,7 @@ export function buildGraphQLSchema( return { items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), total: result.total, - // Guard against a `perPage: 0` arg: `Math.floor(0/0)` is NaN, which a - // non-null `Int!` cannot serialize and would fail the whole query. - page: - finalQuery.limit > 0 - ? Math.floor(finalQuery.offset / finalQuery.limit) + 1 - : 1, + page: pageForOffset(finalQuery.offset, finalQuery.limit), perPage: finalQuery.limit, // Carried for the facets resolver (skip-own-filter per key). 
query: finalQuery, @@ -509,6 +491,3 @@ function rangeInput( fields: { min: { type: bound }, max: { type: bound } }, }); } - -// Re-exported for callers that compose a sort manually. -export type { Sort }; diff --git a/packages/search-api-graphql/src/index.ts b/packages/search-api-graphql/src/index.ts index 20c13223..7754a7e5 100644 --- a/packages/search-api-graphql/src/index.ts +++ b/packages/search-api-graphql/src/index.ts @@ -3,5 +3,5 @@ export type { SearchContext, BuildGraphQLSchemaOptions, } from './build-schema.js'; -export { defaultLanguageOrder, toLanguageStrings } from './language.js'; +export { defaultLanguageOrder } from './language.js'; export type { LanguageString, LanguageOrder } from './language.js'; diff --git a/packages/search-api-graphql/vite.config.ts b/packages/search-api-graphql/vite.config.ts index 7434ca80..2b41cfdc 100644 --- a/packages/search-api-graphql/vite.config.ts +++ b/packages/search-api-graphql/vite.config.ts @@ -12,7 +12,7 @@ export default mergeConfig( thresholds: { functions: 100, lines: 100, - branches: 88.63, + branches: 89.74, statements: 100, }, }, diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts index dfeede8c..ab46b179 100644 --- a/packages/search-typesense/src/query-compiler.ts +++ b/packages/search-typesense/src/query-compiler.ts @@ -1,6 +1,10 @@ import type { SearchParams } from 'typesense/lib/Typesense/Documents.js'; import { fold } from '@lde/text-normalization'; import { + fieldNamed, + isoToUnixSeconds, + isRangeFacet, + pageForOffset, physicalFields, searchableFields, type FacetRange, @@ -48,9 +52,7 @@ export function buildSearchParams( query_by: names.join(','), query_by_weights: weights.join(','), per_page: query.limit, - // A facet-only query (`limit: 0`) fetches no hits; page is then meaningless, - // so pin it to 1 rather than dividing by zero. - page: query.limit > 0 ? 
Math.floor(query.offset / query.limit) + 1 : 1, + page: pageForOffset(query.offset, query.limit), }; if (filterBy.length > 0) { params.filter_by = filterBy; @@ -80,10 +82,8 @@ function compileFacetBy( ): string { return facets .map((name) => { - const field = searchType.fields.find( - (candidate) => candidate.name === name, - ); - return field?.facetRanges !== undefined && field.facetRanges.length > 0 + const field = fieldNamed(searchType, name); + return field !== undefined && isRangeFacet(field) ? compileRangeFacet(field.name, field.facetRanges) : name; }) @@ -150,9 +150,7 @@ function compileFilter( filter: Filter, searchType: SearchType, ): string | undefined { - const field = searchType.fields.find( - (candidate) => candidate.name === filter.field, - ); + const field = fieldNamed(searchType, filter.field); if (field === undefined) { return undefined; } @@ -162,7 +160,7 @@ function compileFilter( : undefined; } if ('range' in filter) { - return compileRange(field.name, filter.range); + return compileRange(field, filter.range); } return `${field.name}:=${filter.is}`; } @@ -174,26 +172,21 @@ function compileFilter( function compileMembership( field: SearchField, values: readonly string[], -): string { - const exact = field.facetable !== true; - return membership(field.name, values, exact); -} - -function membership( - name: string, - values: readonly string[], - exact: boolean, ): string { const list = `[${values.map(escapeFilterValue).join(',')}]`; - return exact ? `${name}:=${list}` : `${name}:${list}`; + return field.facetable !== true + ? `${field.name}:=${list}` + : `${field.name}:${list}`; } /** An inclusive Typesense range clause, or `undefined` when neither bound is set. 
*/ function compileRange( - name: string, + field: SearchField, range: { readonly min?: number | string; readonly max?: number | string }, ): string | undefined { - const { min, max } = range; + const name = field.name; + const min = storedBound(field, range.min); + const max = storedBound(field, range.max); if (min !== undefined && max !== undefined) { return `${name}:[${min}..${max}]`; } @@ -206,6 +199,17 @@ function compileRange( return undefined; } +/** A range bound as stored: a `date` field’s ISO 8601 bound becomes the indexed + * Unix seconds ({@link isoToUnixSeconds}); an unparseable bound is dropped. */ +function storedBound( + field: SearchField, + bound: number | string | undefined, +): number | string | undefined { + return field.kind === 'date' && typeof bound === 'string' + ? isoToUnixSeconds(bound) + : bound; +} + /** * One `sort_by` term. `relevance` maps to Typesense’s `_text_match`; a localized * text field sorts on its active-locale folded key; any other field (including a @@ -219,15 +223,17 @@ function compileSort( if (sort.field === 'relevance') { return `_text_match:${sort.direction}`; } - const field = searchType.fields.find( - (candidate) => candidate.name === sort.field, - ); + const field = fieldNamed(searchType, sort.field); if ( field !== undefined && field.kind === 'text' && field.localized === true ) { - return `${field.name}_sort_${locale}:${sort.direction}`; + const sortName = + physicalFields(field).sort[field.locales?.indexOf(locale) ?? 
-1]; + if (sortName !== undefined) { + return `${sortName}:${sort.direction}`; + } } return `${sort.field}:${sort.direction}`; } diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts index fc52aca4..4c9eaf72 100644 --- a/packages/search-typesense/src/search.ts +++ b/packages/search-typesense/src/search.ts @@ -1,6 +1,10 @@ import type { Client } from 'typesense'; import { + fieldNamed, + isRangeFacet, outputFields, + physicalFields, + referenceFields, type FacetBucket, type LocalizedValue, type Reference, @@ -95,6 +99,16 @@ export function createTypesenseSearchEngine( const params = buildSearchParams(query, searchType, { maxFacetValues: options.maxFacetValues, }); + // Cached path: the once-loaded full collection serves labels by in-memory + // lookup (no per-search round-trip). The load does not depend on the + // response, so it runs alongside the search; it never rejects (a failed + // load degrades to an empty map), so it cannot leave an unhandled + // rejection behind if the search itself fails. + const cachedLabelsPromise = + options.labelsCollection !== undefined && + options.labelCacheTtlMs !== undefined + ? cachedAllLabels(options.labelsCollection, options.labelCacheTtlMs) + : undefined; const response = (await client .collections(options.collection) .documents() @@ -103,25 +117,17 @@ export function createTypesenseSearchEngine( // mid-rebuild) degrades to id-only references rather than failing the whole // search, so the listing still renders with bare IRIs. let labels: ReadonlyMap = new Map(); - if (options.labelsCollection !== undefined) { - if (options.labelCacheTtlMs !== undefined) { - // Cached path: resolve the page's references by in-memory lookup - // against the once-loaded collection (no Typesense round-trip). 
- const allLabels = await cachedAllLabels( + if (cachedLabelsPromise !== undefined) { + labels = await cachedLabelsPromise; + } else if (options.labelsCollection !== undefined) { + try { + labels = await fetchLabels( + client, options.labelsCollection, - options.labelCacheTtlMs, + referenceIris(response, searchType), ); - labels = selectLabels(allLabels, referenceIris(response, searchType)); - } else { - try { - labels = await fetchLabels( - client, - options.labelsCollection, - referenceIris(response, searchType), - ); - } catch (error) { - options.onLabelError?.(error); - } + } catch (error) { + options.onLabelError?.(error); } } return parseSearchResponse(response, searchType, labels); @@ -151,36 +157,19 @@ async function loadAllLabels( return labels; } -/** Narrow the cached collection to just the labels `iris` actually need. */ -function selectLabels( - allLabels: ReadonlyMap, - iris: readonly string[], -): Map { - const labels = new Map(); - for (const iri of iris) { - const label = allLabels.get(iri); - if (label !== undefined) { - labels.set(iri, label); - } - } - return labels; -} - /** Every distinct reference IRI whose label the result will actually use. */ function referenceIris( response: TypesenseSearchResponse, searchType: SearchType, ): string[] { const referenceFieldSet = new Set( - searchType.fields - .filter((field) => field.kind === 'reference') - .map((field) => field.name), + referenceFields(searchType).map((field) => field.name), ); // Hits only carry labels for OUTPUT reference fields: reconstructDocument skips // non-output fields, so resolving a non-output reference's hit labels (e.g. a // facet-only `class` with dozens of IRIs per hit) is pure waste. 
- const outputReferenceFields = outputFields(searchType) - .filter((field) => field.kind === 'reference') + const outputReferenceFields = referenceFields(searchType) + .filter((field) => field.output === true) .map((field) => field.name); const iris = new Set(); for (const hit of response.hits ?? []) { @@ -212,12 +201,13 @@ function referenceIris( * `label_${locale}` becomes a language-map entry; the default `label` is the * untagged (`und`) fallback when no locale variant exists. * - * Sent over `multi_search` (POST) in batches: the id-list of a page or facet - * carrying many references — e.g. a dataset with dozens of classes — would - * overflow Typesense’s GET query-string limit (4000 chars, and IRIs URL-encode - * to several times their length) if it travelled in the URL. POST puts it in the - * body; the batch size stays under Typesense’s `per_page` cap. Exported for - * unit testing against a fake client. + * Sent as one `multi_search` (POST) call, the id-list split over per-search + * batches: the id-list of a page or facet carrying many references — e.g. a + * dataset with dozens of classes — would overflow Typesense’s GET query-string + * limit (4000 chars, and IRIs URL-encode to several times their length) if it + * travelled in the URL. POST puts it in the body; each batch stays under + * Typesense’s `per_page` cap, and bundling the batches keeps it one round-trip + * regardless of IRI count. Exported for unit testing against a fake client. 
*/ export async function fetchLabels( client: Pick, @@ -225,21 +215,25 @@ export async function fetchLabels( iris: readonly string[], ): Promise> { const labels = new Map(); + if (iris.length === 0) { + return labels; + } + const searches = []; for (let start = 0; start < iris.length; start += LABEL_BATCH_SIZE) { const batch = iris.slice(start, start + LABEL_BATCH_SIZE); - const filter = `id:[${batch.map(escapeFilterValue).join(',')}]`; - const { results } = (await client.multiSearch.perform({ - searches: [ - { - collection, - q: '*', - query_by: 'label', - filter_by: filter, - per_page: batch.length, - }, - ], - })) as { results: readonly TypesenseSearchResponse[] }; - for (const hit of results[0]?.hits ?? []) { + searches.push({ + collection, + q: '*', + query_by: 'label', + filter_by: `id:[${batch.map(escapeFilterValue).join(',')}]`, + per_page: batch.length, + }); + } + const { results } = (await client.multiSearch.perform({ searches })) as { + results: readonly TypesenseSearchResponse[]; + }; + for (const result of results) { + for (const hit of result.hits ?? []) { labels.set(String(hit.document.id), labelToLocalizedValue(hit.document)); } } @@ -298,20 +292,16 @@ export function parseSearchResponse( // Reference facets are IRI-keyed; their buckets carry a resolved data label. // Plain facets (tokens, free strings) carry no label — the consumer owns display. const referenceFacets = new Set( - searchType.fields - .filter((field) => field.kind === 'reference') - .map((field) => field.name), + referenceFields(searchType).map((field) => field.name), ); const facets: Record = {}; for (const facet of response.facet_counts ?? []) { const labelled = referenceFacets.has(facet.field_name); // A range facet echoes the declared range key as the bucket value; look the // bin's half-open bounds back up by key so the bucket is self-describing. 
- const field = searchType.fields.find( - (candidate) => candidate.name === facet.field_name, - ); + const field = fieldNamed(searchType, facet.field_name); const rangesByKey = - field?.facetRanges !== undefined + field !== undefined && isRangeFacet(field) ? new Map(field.facetRanges.map((range) => [range.key, range])) : undefined; facets[facet.field_name] = facet.counts.map((bucket) => { @@ -337,11 +327,6 @@ function reconstructDocument( ): ResultDocument { const document: Record = {}; for (const field of outputFields(searchType)) { - if (field.kind === 'boolean') { - // A boolean is always present; an absent value means false. - document[field.name] = flat[field.name] === true; - continue; - } const value = logicalValue(flat, field, labels); if (value !== undefined) { document[field.name] = value; @@ -373,6 +358,7 @@ function logicalValue( return typeof value === 'number' ? value : undefined; } case 'boolean': + // A boolean is always present; an absent value means false. return flat[field.name] === true; } } @@ -383,12 +369,13 @@ function localizedValue( field: SearchField, ): LocalizedValue | undefined { const map: Record = {}; - for (const locale of field.locales ?? []) { - const value = flat[`${field.name}_${locale}`]; + const display = physicalFields(field).display; + (field.locales ?? []).forEach((locale, index) => { + const value = flat[display[index]]; if (typeof value === 'string') { map[locale] = [value]; } - } + }); return Object.keys(map).length > 0 ? 
map : undefined; } diff --git a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts index b59b1a47..3b7dd96f 100644 --- a/packages/search-typesense/test/parse-response.test.ts +++ b/packages/search-typesense/test/parse-response.test.ts @@ -401,30 +401,35 @@ describe('createTypesenseSearchEngine label cache (labelCacheTtlMs)', () => { describe('fetchLabels', () => { // A fake Typesense client whose multi_search returns the requested ids that - // exist in `docsById`, recording the id-list of each POST so batching is - // observable. (Resolving via multi_search/POST avoids the GET query-string + // exist in `docsById`, recording each POST's per-search id-lists so batching + // is observable. (Resolving via multi_search/POST avoids the GET query-string // limit that a large id-list would otherwise overflow.) function fakeClient(docsById: Record>) { - const calls: string[][] = []; + const posts: string[][][] = []; const client = { multiSearch: { perform: (request: { searches: { readonly filter_by: string }[] }) => { - const ids = [ - ...request.searches[0].filter_by.matchAll(/`([^`]+)`/g), - ].map((match) => match[1]); - calls.push(ids); - const hits = ids - .filter((id) => docsById[id] !== undefined) - .map((id) => ({ document: { id, ...docsById[id] } })); - return Promise.resolve({ results: [{ found: hits.length, hits }] }); + const batches = request.searches.map((search) => + [...search.filter_by.matchAll(/`([^`]+)`/g)].map( + (match) => match[1], + ), + ); + posts.push(batches); + const results = batches.map((ids) => { + const hits = ids + .filter((id) => docsById[id] !== undefined) + .map((id) => ({ document: { id, ...docsById[id] } })); + return { found: hits.length, hits }; + }); + return Promise.resolve({ results }); }, }, }; - return { client: client as unknown as Pick, calls }; + return { client: client as unknown as Pick, posts }; } it('resolves labels via multi_search, merging per-locale variants', 
async () => { - const { client, calls } = fakeClient({ + const { client, posts } = fakeClient({ 'https://org/1': { label: 'KB', label_nl: 'KB' }, // Only a default label (no locale variant) → untagged (`und`) fallback. 'https://org/3': { label: 'Untagged' }, @@ -438,10 +443,10 @@ describe('fetchLabels', () => { expect(labels.get('https://org/3')).toEqual({ und: ['Untagged'] }); // An IRI absent from the collection yields no entry. expect(labels.has('https://org/2')).toBe(false); - expect(calls).toHaveLength(1); + expect(posts).toHaveLength(1); }); - it('batches a large id-list under the per_page cap, one POST per batch', async () => { + it('batches a large id-list under the per_page cap, in a single POST', async () => { const ids = Array.from( { length: 450 }, (_unused, index) => `https://example.org/class/${index}`, @@ -449,17 +454,18 @@ describe('fetchLabels', () => { const docsById = Object.fromEntries( ids.map((id) => [id, { label_nl: id }]), ); - const { client, calls } = fakeClient(docsById); + const { client, posts } = fakeClient(docsById); const labels = await fetchLabels(client, 'labels', ids); - // 450 ids → batches of 200, 200, 50. - expect(calls.map((batch) => batch.length)).toEqual([200, 200, 50]); + // 450 ids → batches of 200, 200, 50, bundled into one round-trip. 
+ expect(posts).toHaveLength(1); + expect(posts[0].map((batch) => batch.length)).toEqual([200, 200, 50]); expect(labels.size).toBe(450); }); it('makes no request for an empty id-list', async () => { - const { client, calls } = fakeClient({}); + const { client, posts } = fakeClient({}); const labels = await fetchLabels(client, 'labels', []); expect(labels.size).toBe(0); - expect(calls).toHaveLength(0); + expect(posts).toHaveLength(0); }); }); diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts index 9a06d0f8..6d71efca 100644 --- a/packages/search-typesense/test/query-compiler.test.ts +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -48,6 +48,7 @@ const schema: SearchType = { ], }, { name: 'iiif', kind: 'boolean', filterable: true, facetable: true }, + { name: 'datePosted', kind: 'date', filterable: true, sortable: true }, ], }; @@ -129,6 +130,43 @@ describe('buildSearchParams', () => { ).toBe('size:<=9'); }); + it('converts a date field’s ISO bounds to the stored Unix seconds', () => { + const min = Date.parse('2024-01-01T00:00:00Z') / 1000; + const max = Date.parse('2025-01-01T00:00:00Z') / 1000; + expect( + buildSearchParams( + { + ...base, + where: [ + { + field: 'datePosted', + range: { + min: '2024-01-01T00:00:00Z', + max: '2025-01-01T00:00:00Z', + }, + }, + ], + }, + schema, + ).filter_by, + ).toBe(`datePosted:[${min}..${max}]`); + // An unparseable bound is dropped rather than compiled into garbage. 
+ expect( + buildSearchParams( + { + ...base, + where: [ + { + field: 'datePosted', + range: { min: 'not-a-date', max: '2025-01-01T00:00:00Z' }, + }, + ], + }, + schema, + ).filter_by, + ).toBe(`datePosted:<=${max}`); + }); + it('compiles orderBy: RELEVANCE → _text_match and a localized field → its sort key', () => { expect( buildSearchParams( diff --git a/packages/search-typesense/vite.config.ts b/packages/search-typesense/vite.config.ts index a6245e7b..71f80bbd 100644 --- a/packages/search-typesense/vite.config.ts +++ b/packages/search-typesense/vite.config.ts @@ -16,10 +16,10 @@ export default mergeConfig( // rethrow guards and best-effort cleanup paths are deliberately not // exercised, which is why branch coverage is lower. thresholds: { - functions: 97.14, - lines: 93.31, - branches: 83.75, - statements: 93.37, + functions: 96.87, + lines: 93.49, + branches: 84.05, + statements: 93.55, }, }, }, diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts index 1a47bd6b..20a545c8 100644 --- a/packages/search/src/engine.ts +++ b/packages/search/src/engine.ts @@ -71,12 +71,6 @@ export type EngineFor = SearchEngine< OutputFieldsOf >; -/** A {@link SearchResult} narrowed to one search type (see {@link EngineFor}). */ -export type ResultFor = SearchResult< - FacetFieldsOf, - OutputFieldsOf ->; - /** * One result row. `id` (the stable document key, an IRI) is kept *out* of * {@link ResultDocument}: it is always present and is the hit’s identity, a diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index 5f86c025..8ecfbf03 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -15,6 +15,11 @@ export { filterableFields, sortableFields, outputFields, + referenceFields, + fieldNamed, + isRangeFacet, + isoToUnixSeconds, + unixSecondsToIso, } from './schema.js'; export type { FieldKind, @@ -27,7 +32,7 @@ export type { } from './schema.js'; // Engine- and protocol-neutral query IR + filter semantics. 
-export { filterOperatorFor, filterOperator, acceptsFilter } from './query.js'; +export { filterOperatorFor, pageForOffset } from './query.js'; export type { SearchQuery, Filter, Sort, FilterOperator } from './query.js'; // Engine port + the logical result document returned across it. @@ -44,7 +49,6 @@ export type { FacetFieldsOf, OutputFieldsOf, EngineFor, - ResultFor, } from './engine.js'; export type { FramedNode } from './frame-by-type.js'; diff --git a/packages/search/src/project.ts b/packages/search/src/project.ts index 5aede395..75f73b04 100644 --- a/packages/search/src/project.ts +++ b/packages/search/src/project.ts @@ -2,6 +2,7 @@ import type { Quad } from '@rdfjs/types'; import { fold } from '@lde/text-normalization'; import { frameByType, type FramedNode } from './frame-by-type.js'; import { + isoToUnixSeconds, physicalFields, type SearchField, type SearchSchema, @@ -84,15 +85,23 @@ function applyField( field.name, toNumber(firstLiteralOf(node, path)), ); - case 'date': + case 'date': { + const literal = firstLiteralOf(node, path); return setNumber( document, field.name, - isoToUnix(firstLiteralOf(node, path)), + literal === undefined ? undefined : isoToUnixSeconds(literal), ); + } + case 'boolean': { + // The xsd:boolean lexical space: true/false/1/0. + const literal = firstLiteralOf(node, path); + if (literal !== undefined) { + document[field.name] = literal === 'true' || literal === '1'; + } + return; + } } - // `boolean` is not projected from a path in current search types — booleans are - // derivation-populated (e.g. the compatibility vinkjes). } /** @@ -246,14 +255,6 @@ function toNumber(literal: string | undefined): number | undefined { return literal === undefined ? undefined : Number(literal); } -function isoToUnix(iso: string | undefined): number | undefined { - if (iso === undefined) { - return undefined; - } - const millis = new Date(iso).getTime(); - return Number.isNaN(millis) ? 
undefined : Math.trunc(millis / 1000); -} - function setNumber( document: SearchDocument, field: string, diff --git a/packages/search/src/query.ts b/packages/search/src/query.ts index d009ea75..3ebfc37a 100644 --- a/packages/search/src/query.ts +++ b/packages/search/src/query.ts @@ -1,4 +1,4 @@ -import type { FieldKind, SearchField } from './schema.js'; +import type { FieldKind } from './schema.js'; /** * The engine- and protocol-neutral query IR. Every API surface parses its input @@ -71,25 +71,12 @@ export function filterOperatorFor(kind: FieldKind): FilterOperator | undefined { return OPERATOR_BY_KIND[kind]; } -/** The operator a concrete {@link Filter} carries, from its shape. */ -export function filterOperator(filter: Filter): FilterOperator { - if ('in' in filter) { - return 'in'; - } - if ('range' in filter) { - return 'range'; - } - return 'is'; -} - /** - * Whether `field` can be filtered by `filter`: the field must be `filterable` - * and the filter’s shape must be the operator its kind accepts. Surfaces use it - * to reject malformed `where` input before it reaches the adapter. + * The 1-based page an `offset` falls on — the numbered-pagination presentation + * of the IR, shared by the surfaces and the adapters. `limit: 0` (a facet-only + * query) fetches no hits and has no meaningful page, so it pins to 1 rather + * than dividing by zero. */ -export function acceptsFilter(field: SearchField, filter: Filter): boolean { - return ( - field.filterable === true && - filterOperator(filter) === filterOperatorFor(field.kind) - ); +export function pageForOffset(offset: number, limit: number): number { + return limit > 0 ? 
Math.floor(offset / limit) + 1 : 1; } diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts index 7a687925..25e96303 100644 --- a/packages/search/src/schema.ts +++ b/packages/search/src/schema.ts @@ -135,16 +135,13 @@ export function searchSchema(...types: readonly SearchType[]): SearchSchema { * query compiler (reads them) cannot disagree. */ export interface PhysicalFields { - /** The lone stored field for a non-localized kind — faceted, filtered, sorted - * and output directly. Absent for localized text (its value lives per locale). */ - readonly value?: string; /** Per-locale output labels `${name}_${locale}` (localized text, `output`). */ readonly display: readonly string[]; /** Folded match fields: `${name}_search_${locale}` per locale (localized) or a * single `${name}_search` (non-localized), when `searchable`. */ readonly search: readonly string[]; /** Per-locale folded sort keys `${name}_sort_${locale}` (localized text, - * `sortable`); a non-localized field sorts on its `value`. */ + * `sortable`); a non-localized field sorts on its own `name` field. */ readonly sort: readonly string[]; } @@ -190,14 +187,56 @@ export function outputFields(searchType: SearchType): readonly SearchField[] { return searchType.fields.filter((field) => field.output === true); } +/** Fields of kind `reference` (IRI-valued, label-resolved), in declaration order. */ +export function referenceFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.kind === 'reference'); +} + +/** Look up a field by its logical name. */ +export function fieldNamed( + searchType: SearchType, + name: string, +): SearchField | undefined { + return searchType.fields.find((field) => field.name === name); +} + +/** + * Whether a facet on this field returns fixed range bins (a histogram) rather + * than one bucket per distinct value: it declares non-empty + * {@link SearchField.facetRanges}. 
One predicate for the surface’s facet type, + * the adapter’s facet clause and the bucket reconstruction, so they cannot + * disagree. + */ +export function isRangeFacet( + field: SearchField, +): field is SearchField & { readonly facetRanges: readonly FacetRange[] } { + return field.facetRanges !== undefined && field.facetRanges.length > 0; +} + +/** + * The engine storage codec for `date` fields: stored as Unix seconds (a + * sortable, range-filterable int64), ISO 8601 at the API edges. One pair for + * the projection (writes), the query compiler (filter bounds) and the surface + * (output), so the three cannot disagree. Returns `undefined` for an + * unparseable value. + */ +export function isoToUnixSeconds(iso: string): number | undefined { + const millis = new Date(iso).getTime(); + return Number.isNaN(millis) ? undefined : Math.trunc(millis / 1000); +} + +/** The inverse of {@link isoToUnixSeconds}: stored Unix seconds → ISO 8601. */ +export function unixSecondsToIso(seconds: number): string { + return new Date(seconds * 1000).toISOString(); +} + /** Derive the physical engine field names a declaration produces. */ export function physicalFields(field: SearchField): PhysicalFields { const localized = field.kind === 'text' && field.localized === true; const locales = localized ? (field.locales ?? []) : []; return { - // Localized text has no single value field — its values live in the - // per-locale fields; every other kind stores into one `${name}` field. - value: localized ? undefined : field.name, display: localized && field.output ? 
locales.map((locale) => `${field.name}_${locale}`) diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index 2cd261f2..bdd869bf 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -162,6 +162,26 @@ describe('projectDocument', () => { expect(document.size).toBe(1234.5); }); + it('projects a boolean field from a path (xsd:boolean lexical space)', () => { + const withBoolean: SearchType = { + type: DATASET, + fields: [{ name: 'iiif', path: `${DR}iiif`, kind: 'boolean' }], + }; + const project = (value: unknown): SearchDocument => + projectDocument( + { '@id': 'https://ex/d/5', [`${DR}iiif`]: { '@value': value } }, + withBoolean, + ); + + expect(project('true').iiif).toBe(true); + expect(project('1').iiif).toBe(true); + expect(project('false').iiif).toBe(false); + // Absent value → no field (the adapter reconstructs absence as false). + expect( + projectDocument({ '@id': 'https://ex/d/5' }, withBoolean).iiif, + ).toBeUndefined(); + }); + it('folds the transformed values (not the raw ones) for a facet search field', () => { const document = projectDocument( { '@id': 'https://ex/d/4', [`${DR}format`]: [`${IANA}text/turtle`] }, diff --git a/packages/search/test/query.test.ts b/packages/search/test/query.test.ts index b82042f5..6de08b5c 100644 --- a/packages/search/test/query.test.ts +++ b/packages/search/test/query.test.ts @@ -1,30 +1,5 @@ import { describe, expect, it } from 'vitest'; -import { acceptsFilter, filterOperatorFor } from '../src/query.js'; -import type { SearchField } from '../src/schema.js'; - -const keyword: SearchField = { - name: 'format', - kind: 'keyword', - array: true, - filterable: true, -}; -const datePosted: SearchField = { - name: 'datePosted', - kind: 'date', - filterable: true, -}; -const status: SearchField = { - name: 'status', - kind: 'keyword', - facetable: true, -}; -const title: SearchField = { - name: 'title', - kind: 'text', - localized: true, - locales: 
['nl'], - filterable: true, -}; +import { filterOperatorFor, pageForOffset } from '../src/query.js'; describe('filterOperatorFor', () => { it('maps each field kind to its `where` operator', () => { @@ -38,41 +13,13 @@ describe('filterOperatorFor', () => { }); }); -describe('acceptsFilter', () => { - it('accepts a filter whose shape matches the field’s operator', () => { - expect( - acceptsFilter(keyword, { field: 'format', in: ['text/turtle'] }), - ).toBe(true); - expect( - acceptsFilter(datePosted, { - field: 'datePosted', - range: { min: '2024' }, - }), - ).toBe(true); - }); - - it('rejects a filter whose shape does not match the field’s operator', () => { - expect(acceptsFilter(keyword, { field: 'format', range: { min: 1 } })).toBe( - false, - ); - }); - - it('rejects a filter on a non-filterable field', () => { - expect(acceptsFilter(status, { field: 'status', in: ['valid'] })).toBe( - false, - ); - }); - - it('rejects any filter on a text field (it feeds the free-text query)', () => { - expect(acceptsFilter(title, { field: 'title', in: ['x'] })).toBe(false); +describe('pageForOffset', () => { + it('maps an offset to its 1-based page', () => { + expect(pageForOffset(0, 20)).toBe(1); + expect(pageForOffset(40, 20)).toBe(3); }); - it('accepts an `is` filter on a filterable boolean field', () => { - const iiif: SearchField = { - name: 'iiif', - kind: 'boolean', - filterable: true, - }; - expect(acceptsFilter(iiif, { field: 'iiif', is: true })).toBe(true); + it('pins a facet-only query (limit 0) to page 1 instead of dividing by zero', () => { + expect(pageForOffset(0, 0)).toBe(1); }); }); diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts index 368821a6..4877f68b 100644 --- a/packages/search/test/schema.test.ts +++ b/packages/search/test/schema.test.ts @@ -1,11 +1,16 @@ import { describe, expect, it } from 'vitest'; import { facetableFields, + fieldNamed, filterableFields, + isoToUnixSeconds, + isRangeFacet, outputFields, 
physicalFields, + referenceFields, searchableFields, sortableFields, + unixSecondsToIso, type SearchField, type SearchType, } from '../src/schema.js'; @@ -94,7 +99,6 @@ describe('physicalFields', () => { }; expect(physicalFields(keyword)).toEqual({ - value: 'keyword', display: [], search: ['keyword_search'], sort: [], @@ -134,7 +138,7 @@ describe('physicalFields', () => { }); }); - it('stores a reference field in one value field', () => { + it('fans a non-localized reference field out into no companion fields', () => { const publisher: SearchField = { name: 'publisher', kind: 'reference', @@ -145,7 +149,6 @@ describe('physicalFields', () => { }; expect(physicalFields(publisher)).toEqual({ - value: 'publisher', display: [], search: [], sort: [], @@ -185,4 +188,46 @@ describe('schema selectors', () => { 'status', ]); }); + + it('selects reference fields and looks a field up by name', () => { + const publisher: SearchField = { + name: 'publisher', + kind: 'reference', + facetable: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }; + const withReference: SearchType = { + type: DATASET, + fields: [...schema.fields, publisher], + }; + expect(referenceFields(withReference)).toEqual([publisher]); + expect(fieldNamed(withReference, 'publisher')).toBe(publisher); + expect(fieldNamed(withReference, 'nonexistent')).toBeUndefined(); + }); +}); + +describe('isRangeFacet', () => { + it('requires a non-empty facetRanges declaration', () => { + const size: SearchField = { + name: 'size', + kind: 'integer', + facetable: true, + facetRanges: [{ key: '0', min: 1, max: 10 }], + }; + expect(isRangeFacet(size)).toBe(true); + expect(isRangeFacet({ ...size, facetRanges: [] })).toBe(false); + expect(isRangeFacet({ ...size, facetRanges: undefined })).toBe(false); + }); +}); + +describe('date storage codec', () => { + it('round-trips ISO 8601 through the stored Unix seconds', () => { + const seconds = isoToUnixSeconds('2024-01-01T00:00:00.000Z'); + 
expect(seconds).toBe(Date.parse('2024-01-01T00:00:00.000Z') / 1000); + expect(unixSecondsToIso(seconds ?? 0)).toBe('2024-01-01T00:00:00.000Z'); + }); + + it('returns undefined for an unparseable date', () => { + expect(isoToUnixSeconds('not-a-date')).toBeUndefined(); + }); }); diff --git a/packages/search/vite.config.ts b/packages/search/vite.config.ts index 915a945a..61e8f5ae 100644 --- a/packages/search/vite.config.ts +++ b/packages/search/vite.config.ts @@ -11,9 +11,9 @@ export default mergeConfig( coverage: { thresholds: { functions: 100, - lines: 97.84, - branches: 90.9, - statements: 97.91, + lines: 97.87, + branches: 91.8, + statements: 97.97, }, }, }, From b24b9dd829c58589d8ceea1b930ed36d3d4e206e Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 11:11:03 +0200 Subject: [PATCH 15/35] feat(search-api-graphql)!: build the GraphQL schema from the whole SearchSchema - buildGraphQLSchema(schema, { types }) emits one root query field per SearchType, so a single API serves multiple types (e.g. datasets and people), each searchable in its own way; per-type typeName, queryField and queryDefaults move into a types record keyed by type IRI, languageOrder stays global - create the shared types (LanguageString, buckets, filter inputs) once and dedupe reference types across root types: Person and CreativeWork both referencing Agent yield a single Agent type - omit the where arg for a type with no filterable fields and the facets field for a type with no facetable fields, which would be invalid empty GraphQL types - throw on a type without options, on options naming an unknown type, and on two types deriving the same root query field - test multiple root types: per-type derived types, the shared reference type, per-root-field engine routing, and the build-time errors - update ADR 4, the READMEs and the npm description accordingly BREAKING CHANGE: buildGraphQLSchema and printGraphQLSchema take the whole SearchSchema plus a per-type options record. 
Migrate buildGraphQLSchema(searchType, { typeName }) to buildGraphQLSchema(searchSchema(searchType), { types: { [searchType.type]: { typeName } } }). --- README.md | 2 +- .../0004-search-api-graphql-surface.md | 40 +- packages/search-api-graphql/README.md | 48 +- packages/search-api-graphql/package.json | 2 +- .../search-api-graphql/src/build-schema.ts | 428 ++++++++++-------- packages/search-api-graphql/src/index.ts | 1 + .../test/build-schema.test.ts | 152 ++++++- .../test/generator-stability.test.ts | 8 +- packages/search-api-graphql/vite.config.ts | 2 +- packages/search/README.md | 5 +- packages/search/src/engine.ts | 4 +- 11 files changed, 462 insertions(+), 230 deletions(-) diff --git a/README.md b/README.md index dae6f504..3a4ddf6a 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ await pipeline.run(); @lde/search-api-graphql npm - Engine- and domain-agnostic GraphQL surface for search: builds an executable GraphQL schema from any SearchType at runtime + Engine- and domain-agnostic GraphQL surface for search: builds an executable GraphQL schema from a SearchSchema at runtime, one root query field per type @lde/search-typesense diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index e29befa3..a6e412c4 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -21,11 +21,11 @@ separate package). ### Runtime configuration, not code generation The surface is **constructed at runtime from the field-model configuration** -(`buildGraphQLSchema(config)`), once at startup, with generic resolvers shipped in the package -attached to that schema – nothing is emitted or committed. 
The resolvers are inherently -generic (one root resolver maps args to a `SearchQuery`, calls the engine, and maps the result -back; the field model only parameterises data), so codegen would emit N near-identical stubs -that all delegate to the same logic, plus a build step and staleness risk, for no benefit. +(`buildGraphQLSchema(schema, options)`), once at startup, with generic resolvers shipped in the +package attached to that schema – nothing is emitted or committed. The resolvers are inherently +generic (one root resolver per type maps args to a `SearchQuery`, calls the engine, and maps the +result back; the field model only parameterises data), so codegen would emit N near-identical +stubs that all delegate to the same logic, plus a build step and staleness risk, for no benefit. A live GraphQL API serves its own schema via introspection, so clients need no committed `.graphql` file; the field-model diff is the reviewable change. `printGraphQLSchema()` exists @@ -34,19 +34,31 @@ breaking changes – not a shipped artifact. ### The schema-building function +The function takes the **whole `SearchSchema`** and emits one root query field per +`SearchType` – a schema may declare multiple root types (e.g. `Person` AND `CreativeWork`), +each searchable in its own way. Separately built `GraphQLSchema`s could never be merged later +(one `Query` type; the shared types would collide), so multi-type composition happens before +build, per the compose-before-build principle. Shared types (`LanguageString`, buckets, +filter inputs, reference types) are created once and reused across root types. 
+ ```ts function buildGraphQLSchema( - searchType: SearchType, + schema: SearchSchema, // every root type, keyed by type IRI options: { - typeName: string; // 'Dataset' – drives all derived type names - queryField?: string; // root field; default lowercased plural of typeName - queryDefaults?: (q: SearchQuery, ctx: SearchContext) => SearchQuery; // consumer policy + types: Record< + string, // type IRI; every type in the schema needs an entry + { + typeName: string; // 'Dataset' – drives the type's derived GraphQL type names + queryField?: string; // root field; default lowercased plural of typeName + queryDefaults?: (q: SearchQuery, ctx: SearchContext) => SearchQuery; // per-type consumer policy + } + >; languageOrder?: LanguageOrder; // output-language ordering; default Accept-Language first }, ): GraphQLSchema; // executable schema: types + generic resolvers attached // optional CI helper only: -function printGraphQLSchema(searchType, options): string; // SDL, for a snapshot/breaking-change test +function printGraphQLSchema(schema, options): string; // SDL, for a snapshot/breaking-change test ``` `buildGraphQLSchema` is the standalone, framework-agnostic artifact (depends only on @@ -71,9 +83,11 @@ the optional `printGraphQLSchema()` SDL snapshot (the real artifact). ### Construction rules (field model → schema) -Type names derive from `typeName`; shared types (`LanguageString`, `ValueBucket`, `RangeBucket`, -`SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`) are emitted once, and the -per-type keyed facets object is named `Facets`. +Type names derive from each type’s `typeName`; shared types (`LanguageString`, `ValueBucket`, +`RangeBucket`, `SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`, and the +reference types) are emitted once across all root types, and the per-type keyed facets object +is named `Facets`. 
A type with no `filterable` fields gets no `where` arg, and one +with no `facetable` fields no `facets` field (empty GraphQL types are invalid). GraphQL field names are the field model `name` verbatim (declare camelCase). - **Output type** – one field per `output` field: `text`+`localized` → `[LanguageString!]!` (best-first; `[0].language` = served language, the per-field `Content-Language`); diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md index d6274a9d..919bff24 100644 --- a/packages/search-api-graphql/README.md +++ b/packages/search-api-graphql/README.md @@ -1,43 +1,59 @@ # @lde/search-api-graphql The GraphQL surface for the [`@lde/search`](../search) core. **Both engine- and -domain-agnostic:** it builds an executable `GraphQLSchema` from any `SearchType` -at runtime, and serves it with one generic resolver over any `SearchEngine`. It -names neither your **domain** (you pass `typeName` — `Dataset`, `Person`, -`CreativeWork`, …) nor your **engine** (the resolver calls `context.engine`, be it -[`@lde/search-typesense`](../search-typesense) or another adapter). +domain-agnostic:** it builds an executable `GraphQLSchema` from your whole +`SearchSchema` at runtime — one root query field per `SearchType`, each +searchable in its own way — served by one generic resolver per root field over +any `SearchEngine`. It names neither your **domain** (you pass a `typeName` per +type — `Dataset`, `Person`, `CreativeWork`, …) nor your **engine** (the resolver +calls `context.engine`, be it [`@lde/search-typesense`](../search-typesense) or +another adapter). ## Runtime configuration, not codegen -`buildGraphQLSchema(searchType, { typeName })` constructs the schema once at +`buildGraphQLSchema(schema, { types })` constructs the GraphQL schema once at startup from the field model — no SDL artifact, no generated resolver stubs. The field model is the single source; the GraphQL contract is whatever it produces. 
Output types, the `where`/`orderBy`/facet inputs, reference types and nullability are all derived from each field’s `kind` and capability flags. ```ts +import { searchSchema } from '@lde/search'; import { buildGraphQLSchema } from '@lde/search-api-graphql'; -const gqlSchema = buildGraphQLSchema(DATASET, { - typeName: 'Dataset', - queryDefaults: (query) => ({ - ...query, - where: [...query.where, { field: 'status', in: ['valid'] }], - }), +const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON), { + types: { + [DATASET.type]: { + typeName: 'Dataset', + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + }), + }, + [PERSON.type]: { typeName: 'Person', queryField: 'people' }, + }, }); +// The API now serves `datasets(…)` and `people(…)` root fields. // Hand `gqlSchema` to any graphql-js server; populate the per-request context: // { engine: SearchEngine, acceptLanguage: string[] } ``` -## What it builds +Per type you configure the `typeName` (drives all derived type names), an +optional `queryField` (defaults to the lowercased plural of `typeName`) and an +optional `queryDefaults` policy applied to every query of that type. Shared +types (`LanguageString`, the facet buckets, filter inputs and reference types +such as a common `Agent`) are created once and reused across root types. + +## What it builds (per root type) - **Output type** (`typeName`) — localized text → best-first `[LanguageString!]!` (`[0].language` is the language actually served); references → named per-shape types (`Organization`, `Term`) with a `name`; scalars/booleans per kind; `date` → ISO 8601 string; nullability from `required` / `array` / `kind`. - **`where`** — one input per `filterable` field (`StringFilter`, `IntRange` / - `FloatRange` / `DateRange`, or `Boolean`). + `FloatRange` / `DateRange`, or `Boolean`); omitted entirely for a type with no + filterable fields. 
- **`orderBy`** — `RELEVANCE` plus every `sortable` field, as an enum. - **Facets** — an enum of every `facetable` field; a bucket carries `value` + `count` + a nullable `label` — the resolved data label for **reference** facets, @@ -50,7 +66,7 @@ The surface reads the same field model the index is built from, and compiles int the same neutral `SearchQuery` the engine consumes — so the API, the index and a future REST surface stay in lockstep. The contract is **frozen** (breaking to change), and because it is generated rather than hand-written, a _consumer_ guards -it with a `printGraphQLSchema(searchType, options)` SDL snapshot over its **own** -search type and `typeName` — that snapshot also catches a `buildGraphQLSchema` +it with a `printGraphQLSchema(schema, options)` SDL snapshot over its **own** +search schema and type names — that snapshot also catches a `buildGraphQLSchema` change in a future version of this library silently altering the consumer’s contract. diff --git a/packages/search-api-graphql/package.json b/packages/search-api-graphql/package.json index 70f76450..3698172b 100644 --- a/packages/search-api-graphql/package.json +++ b/packages/search-api-graphql/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search-api-graphql", "version": "0.1.0", - "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from any SearchType at runtime (no codegen), served by one generic resolver over any SearchEngine. You supply the search type and typeName; it names neither your domain nor your engine.", + "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from a whole SearchSchema at runtime (no codegen) — one root query field per SearchType — served by generic resolvers over any SearchEngine. 
You supply the schema and per-type typeNames; it names neither your domain nor your engine.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search-api-graphql" diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts index cb6652bf..8edb1309 100644 --- a/packages/search-api-graphql/src/build-schema.ts +++ b/packages/search-api-graphql/src/build-schema.ts @@ -30,6 +30,7 @@ import { type SearchEngine, type SearchField, type SearchQuery, + type SearchSchema, type SearchType, } from '@lde/search'; import { @@ -51,16 +52,24 @@ export interface SearchContext { readonly onFacetError?: (field: string, error: unknown) => void; } -export interface BuildGraphQLSchemaOptions { - /** Drives all derived type names, e.g. `Dataset`. */ +/** Per-root-type options; what the schema value cannot carry. */ +export interface SearchTypeOptions { + /** Drives the type’s derived GraphQL type names, e.g. `Dataset`. */ readonly typeName: string; /** Root query field; defaults to the lowercased plural of `typeName`. */ readonly queryField?: string; - /** Consumer policy applied to every query (default status, sort, tie-breaks). */ + /** Consumer policy applied to every query of this type (default status, sort, + * tie-breaks). */ readonly queryDefaults?: ( query: SearchQuery, context: SearchContext, ) => SearchQuery; +} + +export interface BuildGraphQLSchemaOptions { + /** Options per root type, keyed by type IRI (the {@link SearchType} `type`). + * Every type in the schema needs an entry. */ + readonly types: Readonly>; /** Output-language ordering; defaults to Accept-Language-first, `und` last. */ readonly languageOrder?: LanguageOrder; } @@ -82,20 +91,27 @@ function screamingSnake(name: string): string { } /** - * Construct an executable GraphQL schema from the unified {@link SearchField} - * model at runtime — no codegen, no SDL artifact. 
One generic resolver maps the - * arguments to a {@link SearchQuery}, calls `context.engine`, and maps the result - * back; the field model only parameterises data. + * Construct an executable GraphQL schema from the whole {@link SearchSchema} at + * runtime — no codegen, no SDL artifact. One root query field per + * {@link SearchType} (e.g. `datasets`, `people`), each searchable in its own + * way through its own output/`where`/`orderBy`/facet types, while the shared + * types (`LanguageString`, buckets, filter inputs, reference types) are created + * once. One generic resolver per root field maps the arguments to a + * {@link SearchQuery}, calls `context.engine`, and maps the result back; the + * field model only parameterises data. */ export function buildGraphQLSchema( - searchType: SearchType, + schema: SearchSchema, options: BuildGraphQLSchemaOptions, ): GraphQLSchema { - const { typeName } = options; const languageOrder = options.languageOrder ?? defaultLanguageOrder; - const queryField = - options.queryField ?? - `${typeName.charAt(0).toLowerCase()}${typeName.slice(1)}s`; + for (const typeIri of Object.keys(options.types)) { + if (!schema.has(typeIri)) { + throw new Error( + `Options given for type “${typeIri}”, which is not in the search schema.`, + ); + } + } const languageString = new GraphQLObjectType({ name: 'LanguageString', @@ -159,45 +175,33 @@ export function buildGraphQLSchema( }, }); - // One reference type per referenced shape, reused by every field. + // One reference type per referenced shape, shared across every root type and + // reused by every field (Person and CreativeWork both referencing Agent yield + // one Agent type). 
const referenceTypes = new Map(); - for (const field of outputFields(searchType)) { - if ( - field.kind === 'reference' && - field.ref && - !referenceTypes.has(field.ref.type) - ) { - referenceTypes.set( - field.ref.type, - new GraphQLObjectType({ - name: field.ref.type, - fields: { - id: { type: new GraphQLNonNull(GraphQLString) }, - name: labelList( - (source) => source.label as LocalizedValue | undefined, - ), - }, - }), - ); + for (const searchType of schema.values()) { + for (const field of outputFields(searchType)) { + if ( + field.kind === 'reference' && + field.ref && + !referenceTypes.has(field.ref.type) + ) { + referenceTypes.set( + field.ref.type, + new GraphQLObjectType({ + name: field.ref.type, + fields: { + id: { type: new GraphQLNonNull(GraphQLString) }, + name: labelList( + (source) => source.label as LocalizedValue | undefined, + ), + }, + }), + ); + } } } - const outputType = new GraphQLObjectType({ - name: typeName, - fields: () => { - const fields: Record< - string, - GraphQLFieldConfig - > = { - id: { type: new GraphQLNonNull(GraphQLString) }, - }; - for (const field of outputFields(searchType)) { - fields[field.name] = outputFieldConfig(field); - } - return fields; - }, - }); - function outputFieldConfig( field: SearchField, ): GraphQLFieldConfig { @@ -250,17 +254,6 @@ export function buildGraphQLSchema( } } - const whereInput = new GraphQLInputObjectType({ - name: `${typeName}Where`, - fields: () => { - const fields: Record = {}; - for (const field of filterableFields(searchType)) { - fields[field.name] = { type: whereFieldType(field) }; - } - return fields; - }, - }); - function whereFieldType(field: SearchField): GraphQLInputType { switch (filterOperatorFor(field.kind)) { case 'in': @@ -276,132 +269,213 @@ export function buildGraphQLSchema( } } - const sortValues: GraphQLEnumValueConfigMap = { - RELEVANCE: { value: 'relevance' }, - }; - for (const field of sortableFields(searchType)) { - sortValues[screamingSnake(field.name)] = { value: 
field.name }; - } - const sortField = new GraphQLEnumType({ - name: `${typeName}SortField`, - values: sortValues, - }); - const orderByInput = new GraphQLInputObjectType({ - name: `${typeName}OrderBy`, - fields: { - field: { type: new GraphQLNonNull(sortField) }, - direction: { - type: new GraphQLNonNull(sortDirection), - defaultValue: 'desc', + /** The root query field for one {@link SearchType}, with its derived types. */ + function rootField( + searchType: SearchType, + typeOptions: SearchTypeOptions, + ): GraphQLFieldConfig { + const { typeName } = typeOptions; + + const outputType = new GraphQLObjectType({ + name: typeName, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = { + id: { type: new GraphQLNonNull(GraphQLString) }, + }; + for (const field of outputFields(searchType)) { + fields[field.name] = outputFieldConfig(field); + } + return fields; }, - }, - }); + }); - // Keyed facets object: one field per facetable field, typed by its kind - // (range fields → [RangeBucket!], else [ValueBucket!]). Each field's resolver - // computes that facet with its OWN where-filter removed (skip-own-filter), so a - // multi-select facet still lists its other options; only the selected fields - // are resolved (GraphQL prunes the rest), so the selection IS the request. - const facetsType = new GraphQLObjectType({ - name: `${typeName}Facets`, - fields: () => { - const fields: Record< - string, - GraphQLFieldConfig - > = {}; - for (const field of facetableFields(searchType)) { - fields[field.name] = { - type: nonNullListOf(isRangeFacet(field) ? rangeBucket : valueBucket), - resolve: async ( - source: Source, - _args: unknown, - context: SearchContext, - ) => { - const query = source.query as SearchQuery; - // Drop this facet's own filter so its other options still count - // (a removed `status` filter also drops the valid-only default, so - // the status facet counts across every status). 
- const facetQuery: SearchQuery = { - ...query, - where: query.where.filter( - (filter) => filter.field !== field.name, - ), - facets: [field.name], - limit: 0, - offset: 0, - }; - // A facet is supplementary: degrade a failed facet to an empty list - // rather than failing the whole query (which would null the non-null - // result and discard the items + every other facet). - try { - const result = await context.engine.search( - facetQuery, - searchType, - ); - return result.facets[field.name] ?? []; - } catch (error) { - context.onFacetError?.(field.name, error); - return []; - } + // A GraphQL input object must have at least one field, so a type with no + // filterable fields gets no `where` arg at all rather than an invalid + // empty input. + const filterable = filterableFields(searchType); + const whereInput = + filterable.length === 0 + ? undefined + : new GraphQLInputObjectType({ + name: `${typeName}Where`, + fields: () => { + const fields: Record = {}; + for (const field of filterable) { + fields[field.name] = { type: whereFieldType(field) }; + } + return fields; + }, + }); + + const sortValues: GraphQLEnumValueConfigMap = { + RELEVANCE: { value: 'relevance' }, + }; + for (const field of sortableFields(searchType)) { + sortValues[screamingSnake(field.name)] = { value: field.name }; + } + const sortField = new GraphQLEnumType({ + name: `${typeName}SortField`, + values: sortValues, + }); + const orderByInput = new GraphQLInputObjectType({ + name: `${typeName}OrderBy`, + fields: { + field: { type: new GraphQLNonNull(sortField) }, + direction: { + type: new GraphQLNonNull(sortDirection), + defaultValue: 'desc', + }, + }, + }); + + // Keyed facets object: one field per facetable field, typed by its kind + // (range fields → [RangeBucket!], else [ValueBucket!]). 
Each field's resolver + // computes that facet with its OWN where-filter removed (skip-own-filter), so a + // multi-select facet still lists its other options; only the selected fields + // are resolved (GraphQL prunes the rest), so the selection IS the request. + // Like `where`, omitted entirely for a type with no facetable fields (a + // GraphQL object type must have at least one field). + const facetable = facetableFields(searchType); + const facetsType = + facetable.length === 0 + ? undefined + : facetsTypeFor(searchType, typeName, facetable); + + const resultType = new GraphQLObjectType({ + name: `${typeName}SearchResult`, + fields: { + items: { type: nonNullListOf(outputType) }, + total: { type: new GraphQLNonNull(GraphQLInt) }, + page: { type: new GraphQLNonNull(GraphQLInt) }, + perPage: { type: new GraphQLNonNull(GraphQLInt) }, + // Resolved lazily, per selected key (skip-own-filter); the result object + // (which carries the resolved `query`) is the facets source. + ...(facetsType && { + facets: { + type: new GraphQLNonNull(facetsType), + resolve: (source: Source) => source, }, - }; - } - return fields; - }, - }); + }), + }, + }); - const resultType = new GraphQLObjectType({ - name: `${typeName}SearchResult`, - fields: { - items: { type: nonNullListOf(outputType) }, - total: { type: new GraphQLNonNull(GraphQLInt) }, - page: { type: new GraphQLNonNull(GraphQLInt) }, - perPage: { type: new GraphQLNonNull(GraphQLInt) }, - // Resolved lazily, per selected key (skip-own-filter); the result object - // (which carries the resolved `query`) is the facets source. 
- facets: { - type: new GraphQLNonNull(facetsType), - resolve: (source: Source) => source, + return { + type: new GraphQLNonNull(resultType), + args: { + query: { type: GraphQLString }, + ...(whereInput && { where: { type: whereInput } }), + orderBy: { type: orderByInput }, + page: { type: GraphQLInt, defaultValue: 1 }, + perPage: { type: GraphQLInt, defaultValue: 20 }, }, - }, - }); + resolve: async (_source, args, context: SearchContext) => { + const built = argsToQuery(args as QueryArgs, context, searchType); + const finalQuery = typeOptions.queryDefaults + ? typeOptions.queryDefaults(built, context) + : built; + // Items + total only; facets are resolved lazily per selected key. + const result = await context.engine.search( + { ...finalQuery, facets: [] }, + searchType, + ); + return { + items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), + total: result.total, + page: pageForOffset(finalQuery.offset, finalQuery.limit), + perPage: finalQuery.limit, + // Carried for the facets resolver (skip-own-filter per key). + query: finalQuery, + }; + }, + }; + } - const query = new GraphQLObjectType({ - name: 'Query', - fields: { - [queryField]: { - type: new GraphQLNonNull(resultType), - args: { - query: { type: GraphQLString }, - where: { type: whereInput }, - orderBy: { type: orderByInput }, - page: { type: GraphQLInt, defaultValue: 1 }, - perPage: { type: GraphQLInt, defaultValue: 20 }, - }, - resolve: async (_source, args, context: SearchContext) => { - const built = argsToQuery(args as QueryArgs, context, searchType); - const finalQuery = options.queryDefaults - ? options.queryDefaults(built, context) - : built; - // Items + total only; facets are resolved lazily per selected key. 
- const result = await context.engine.search( - { ...finalQuery, facets: [] }, - searchType, - ); - return { - items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), - total: result.total, - page: pageForOffset(finalQuery.offset, finalQuery.limit), - perPage: finalQuery.limit, - // Carried for the facets resolver (skip-own-filter per key). - query: finalQuery, + /** The keyed facets object for one type (only called with ≥ 1 facetable field). */ + function facetsTypeFor( + searchType: SearchType, + typeName: string, + facetable: readonly SearchField[], + ): GraphQLObjectType { + return new GraphQLObjectType({ + name: `${typeName}Facets`, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = {}; + for (const field of facetable) { + fields[field.name] = { + type: nonNullListOf( + isRangeFacet(field) ? rangeBucket : valueBucket, + ), + resolve: async ( + source: Source, + _args: unknown, + context: SearchContext, + ) => { + const query = source.query as SearchQuery; + // Drop this facet's own filter so its other options still count + // (a removed `status` filter also drops the valid-only default, so + // the status facet counts across every status). + const facetQuery: SearchQuery = { + ...query, + where: query.where.filter( + (filter) => filter.field !== field.name, + ), + facets: [field.name], + limit: 0, + offset: 0, + }; + // A facet is supplementary: degrade a failed facet to an empty list + // rather than failing the whole query (which would null the non-null + // result and discard the items + every other facet). + try { + const result = await context.engine.search( + facetQuery, + searchType, + ); + return result.facets[field.name] ?? 
[]; + } catch (error) { + context.onFacetError?.(field.name, error); + return []; + } + }, }; - }, + } + return fields; }, - }, - }); + }); + } - return new GraphQLSchema({ query }); + const queryFields: Record< + string, + GraphQLFieldConfig + > = {}; + for (const searchType of schema.values()) { + const typeOptions = options.types[searchType.type]; + if (typeOptions === undefined) { + throw new Error( + `Missing options (typeName) for type “${searchType.type}”.`, + ); + } + const { typeName } = typeOptions; + const queryField = + typeOptions.queryField ?? + `${typeName.charAt(0).toLowerCase()}${typeName.slice(1)}s`; + if (queryField in queryFields) { + throw new Error( + `Duplicate root query field “${queryField}”; set queryField to disambiguate.`, + ); + } + queryFields[queryField] = rootField(searchType, typeOptions); + } + + return new GraphQLSchema({ + query: new GraphQLObjectType({ name: 'Query', fields: queryFields }), + }); } /** @@ -411,10 +485,10 @@ export function buildGraphQLSchema( * future version of this library silently altering it). 
*/ export function printGraphQLSchema( - searchType: SearchType, + schema: SearchSchema, options: BuildGraphQLSchemaOptions, ): string { - return printSchema(buildGraphQLSchema(searchType, options)); + return printSchema(buildGraphQLSchema(schema, options)); } interface QueryArgs { diff --git a/packages/search-api-graphql/src/index.ts b/packages/search-api-graphql/src/index.ts index 7754a7e5..0dfe1adf 100644 --- a/packages/search-api-graphql/src/index.ts +++ b/packages/search-api-graphql/src/index.ts @@ -2,6 +2,7 @@ export { buildGraphQLSchema, printGraphQLSchema } from './build-schema.js'; export type { SearchContext, BuildGraphQLSchemaOptions, + SearchTypeOptions, } from './build-schema.js'; export { defaultLanguageOrder } from './language.js'; export type { LanguageString, LanguageOrder } from './language.js'; diff --git a/packages/search-api-graphql/test/build-schema.test.ts b/packages/search-api-graphql/test/build-schema.test.ts index 243b0ec9..c2e29f0d 100644 --- a/packages/search-api-graphql/test/build-schema.test.ts +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -1,10 +1,11 @@ import { describe, expect, it } from 'vitest'; import { graphql, printSchema } from 'graphql'; -import type { - SearchEngine, - SearchQuery, - SearchResult, - SearchType, +import { + searchSchema, + type SearchEngine, + type SearchQuery, + type SearchResult, + type SearchType, } from '@lde/search'; import { buildGraphQLSchema, type SearchContext } from '../src/build-schema.js'; @@ -119,13 +120,17 @@ const canned: SearchResult = { facets: { keyword: [{ value: 'kaarten', count: 3 }] }, }; +const datasetOptions = { + types: { [schema.type]: { typeName: 'Dataset' } }, +}; + async function run( source: string, context: SearchContext, variables?: Record, ) { return graphql({ - schema: buildGraphQLSchema(schema, { typeName: 'Dataset' }), + schema: buildGraphQLSchema(searchSchema(schema), datasetOptions), source, contextValue: context, variableValues: variables, @@ -454,13 
+459,17 @@ describe('buildGraphQLSchema', () => { return canned; }, }; - const gqlSchema = buildGraphQLSchema(schema, { - typeName: 'Dataset', - queryDefaults: (query) => ({ - ...query, - where: [...query.where, { field: 'status', in: ['valid'] }], - orderBy: [{ field: 'relevance', direction: 'desc' }], - }), + const gqlSchema = buildGraphQLSchema(searchSchema(schema), { + types: { + [schema.type]: { + typeName: 'Dataset', + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + orderBy: [{ field: 'relevance', direction: 'desc' }], + }), + }, + }, }); await graphql({ schema: gqlSchema, @@ -475,7 +484,7 @@ describe('buildGraphQLSchema', () => { it('derives nullability: required scalar non-null, optional scalar nullable, arrays/booleans non-null', () => { const sdl = printSchema( - buildGraphQLSchema(schema, { typeName: 'Dataset' }), + buildGraphQLSchema(searchSchema(schema), datasetOptions), ); expect(sdl).toMatch(/status: String!/); // required expect(sdl).toMatch(/size: Int\b(?!!)/); // optional → nullable @@ -487,7 +496,7 @@ describe('buildGraphQLSchema', () => { it('builds the where, orderBy enum and keyed facets object from the field model', () => { const sdl = printSchema( - buildGraphQLSchema(schema, { typeName: 'Dataset' }), + buildGraphQLSchema(searchSchema(schema), datasetOptions), ); expect(sdl).toMatch(/enum DatasetSortField/); expect(sdl).toMatch(/RELEVANCE/); @@ -500,4 +509,117 @@ describe('buildGraphQLSchema', () => { expect(sdl).toMatch(/status: StringFilter/); expect(sdl).toMatch(/size: IntRange/); }); + + describe('multiple root types in one schema', () => { + const PERSON: SearchType = { + type: 'https://schema.org/Person', + fields: [ + { + name: 'name', + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'affiliation', + kind: 'reference', + facetable: true, + output: true, + ref: { type: 'Agent', strategy: 
'labelOnly' }, + }, + ], + }; + const CREATIVE_WORK: SearchType = { + type: 'https://schema.org/CreativeWork', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + searchable: { weight: 5 }, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { name: 'pageCount', kind: 'integer', filterable: true, output: true }, + ], + }; + const twoTypeSchema = buildGraphQLSchema( + searchSchema(PERSON, CREATIVE_WORK), + { + types: { + [PERSON.type]: { typeName: 'Person', queryField: 'people' }, + [CREATIVE_WORK.type]: { typeName: 'CreativeWork' }, + }, + }, + ); + + it('exposes one root field per type, each with its own derived types', () => { + const sdl = printSchema(twoTypeSchema); + expect(sdl).toMatch(/people\([\s\S]*?\): PersonSearchResult!/); + expect(sdl).toMatch( + /creativeWorks\([\s\S]*?\): CreativeWorkSearchResult!/, + ); + expect(sdl).toMatch(/enum PersonSortField/); + expect(sdl).toMatch(/input CreativeWorkWhere/); + // Person has no filterable fields, so it gets no `where` arg (an empty + // input object would be invalid GraphQL) — CreativeWork keeps its own. + expect(sdl).not.toMatch(/PersonWhere/); + // The shared reference shape is emitted once, reused by both types. 
+ expect(sdl.match(/^type Agent /gm)).toHaveLength(1); + }); + + it('routes each root field to its own search type', async () => { + const searchedTypes: string[] = []; + const engine: SearchEngine = { + async search(_query, searchType) { + searchedTypes.push(searchType.type); + return { total: 0, hits: [], facets: {} }; + }, + }; + const result = await graphql({ + schema: twoTypeSchema, + source: `{ people { total } creativeWorks { total } }`, + contextValue: { engine, acceptLanguage: ['nl'] }, + }); + expect(result.errors).toBeUndefined(); + expect(searchedTypes).toEqual([PERSON.type, CREATIVE_WORK.type]); + }); + + it('throws on a type without options, an unknown type, and a root-field clash', () => { + expect(() => + buildGraphQLSchema(searchSchema(PERSON, CREATIVE_WORK), { + types: { [PERSON.type]: { typeName: 'Person' } }, + }), + ).toThrow(/Missing options/); + expect(() => + buildGraphQLSchema(searchSchema(PERSON), { + types: { + [PERSON.type]: { typeName: 'Person' }, + 'https://schema.org/Unknown': { typeName: 'Unknown' }, + }, + }), + ).toThrow(/not in the search schema/); + expect(() => + buildGraphQLSchema(searchSchema(PERSON, CREATIVE_WORK), { + types: { + [PERSON.type]: { typeName: 'Person', queryField: 'items' }, + [CREATIVE_WORK.type]: { + typeName: 'CreativeWork', + queryField: 'items', + }, + }, + }), + ).toThrow(/Duplicate root query field/); + }); + }); }); diff --git a/packages/search-api-graphql/test/generator-stability.test.ts b/packages/search-api-graphql/test/generator-stability.test.ts index c78b1535..2a872afb 100644 --- a/packages/search-api-graphql/test/generator-stability.test.ts +++ b/packages/search-api-graphql/test/generator-stability.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from 'vitest'; -import type { SearchType } from '@lde/search'; +import { searchSchema, type SearchType } from '@lde/search'; import { printGraphQLSchema } from '../src/build-schema.js'; /** @@ -92,6 +92,10 @@ const THING: SearchType = { 
describe('GraphQL generator stability', () => { it('emits a stable SDL for a representative schema', () => { - expect(printGraphQLSchema(THING, { typeName: 'Thing' })).toMatchSnapshot(); + expect( + printGraphQLSchema(searchSchema(THING), { + types: { [THING.type]: { typeName: 'Thing' } }, + }), + ).toMatchSnapshot(); }); }); diff --git a/packages/search-api-graphql/vite.config.ts b/packages/search-api-graphql/vite.config.ts index 2b41cfdc..9d06048c 100644 --- a/packages/search-api-graphql/vite.config.ts +++ b/packages/search-api-graphql/vite.config.ts @@ -12,7 +12,7 @@ export default mergeConfig( thresholds: { functions: 100, lines: 100, - branches: 89.74, + branches: 90.21, statements: 100, }, }, diff --git a/packages/search/README.md b/packages/search/README.md index ca84cd21..63f687c7 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -41,8 +41,9 @@ GraphQL (one of the surfaces): | `SearchType` | One root type’s complete declaration: its `type` IRI plus its fields and derivations | NodeShape | object type | | `SearchSchema` | The whole search declaration: every `SearchType`, keyed by `type` IRI — build one with `searchSchema(...types)` | shapes graph | schema | -`projectGraph` consumes a `SearchSchema` (it projects every type in one pass); -the engine port and the GraphQL surface operate on one `SearchType` at a time. +`projectGraph` and the GraphQL surface consume a `SearchSchema` (projecting +every type in one pass and emitting one root query field per type, +respectively); the engine port executes one `SearchType` at a time. ## Field model diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts index 20a545c8..9f9bded4 100644 --- a/packages/search/src/engine.ts +++ b/packages/search/src/engine.ts @@ -6,8 +6,8 @@ import type { SearchType } from './schema.js'; * `@lde/search-typesense`’s `TypesenseSearchEngine`) implements.
The adapter * owns every engine specific (companion-field expansion, full-text field * selection and weights, filter compilation, sorting, result folding, faceting) - * and returns only logical - * documents, so a deployment can swap engines without any consumer noticing. + * and returns only logical documents, so a deployment can swap engines without + * any consumer noticing. * Nothing engine-specific and nothing RDF-specific leaks past this port. * * `FacetField` keys the returned facet map; it defaults to `string` so an engine From c34608ff08113e2f71a3cb709b4ecdb98c3040d5 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 11:11:22 +0200 Subject: [PATCH 16/35] docs(search): state ADR 3 without historical references - describe the unified field model directly rather than by contrast with pre-unification per-field configurations - drop the carried-through consequence bullet, keeping the folding contract (index and query normalize identically via @lde/text-normalization) as a direct claim --- .../0003-search-api-core-query-model.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index df74737c..9ef7d119 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -49,11 +49,11 @@ NodeShape + its `search:` annotations. **One `SearchField` declaration drives fo consumers** – projection (RDF→flat document), the engine collection schema, the query semantics, and the GraphQL surface – so they cannot drift. -It is a **unified** model: one declaration in place of three otherwise-separate ones – the -projection-side `FieldSpec`/`FieldKind`, the Typesense `SEARCH_FIELDS` (collection schema + -weights), and the query model below. 
`kind` plus capability flags replace the discriminated -projection kinds, derived fields are first-class, and the Typesense-vocabulary types are -_derived_ from `kind` rather than re-declared. +It is a **unified** model: a single declaration carries the projection, the collection +schema and search weights, and the query semantics – concerns that would otherwise each +need their own per-field configuration, free to drift apart. `kind` plus independent +capability flags express them all, derived fields are first-class, and the +Typesense-vocabulary types are _derived_ from `kind`, never declared. ```ts type FieldKind = @@ -99,8 +99,7 @@ Maps onto SHACL + `search:` (`kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh: `sortable`←`search:sortable`, `ref`←`sh:node`/`sh:class` + `search:nestedStrategy`) so an eventual generator emits it unchanged. A field with **no `path`** is a derived field – populated by a `Derivation` rather than projected from the IR – yet it still carries full -query/schema/output behavior, which is how the former separate projection `FieldSpec` is -subsumed. The physical field names a declaration fans out to (`${name}_search_${locale}`, +query/schema/output behavior. The physical field names a declaration fans out to (`${name}_search_${locale}`, `${name}_sort_${locale}`, `${name}_search`) follow one convention owned by `@lde/search`, so projection, collection schema and query compiler agree. The `status_rank` tie-break sort is a **deployment-specific delta**, never in `@lde/search`. Grouped facets need @@ -315,9 +314,8 @@ not enabled for DR v1, more relevant for B/C. - One declarative source drives GraphQL, later REST, and the index; they cannot drift. - The engine is a swappable adapter; the contract outlives engine choices. 
-- Carried through: the Stable API Contract discipline, the reference `strategy` concept, the - surface `LanguageString` list, folding at the adapter boundary + query side via - `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. +- Folding (case/diacritics) happens at the adapter boundary and on the query side via + `@lde/text-normalization`, so index and query normalize identically. - Deferred: REST surface; framed-JSON-LD materialised view (nested storage, index-time label inlining, detail-page-on-index, terms-collection split); semantic/hybrid (vector) search. From 1fd53ae22f278170f39072d10746bec3dcab8295 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 11:35:31 +0200 Subject: [PATCH 17/35] docs(search): treat SHACL as an optional source of the field model, not its definition - define FieldKind, SearchField, SearchType and SearchSchema on their own terms; state the SHACL mapping as one possible source (a generator can emit declarations from NodeShapes + search: annotations) rather than defining the model as the runtime form of shapes - drop per-property SHACL parentheticals (sh:path, sh:maxCount, sh:minCount) from the SearchField members - align the README terminology intro and the ADR 3 field-model lead with the same framing --- .../0003-search-api-core-query-model.md | 18 ++++--- packages/search/README.md | 44 ++++++++++++--- packages/search/src/schema.ts | 53 ++++++++++++------- 3 files changed, 81 insertions(+), 34 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 9ef7d119..cb1e3e82 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -44,10 +44,10 @@ behind the adapter and is swappable with no consumer impact. 
Nothing engine-spec ### Field model -The engine-neutral description of a queryable field – the runtime form of one SHACL -NodeShape + its `search:` annotations. **One `SearchField` declaration drives four -consumers** – projection (RDF→flat document), the engine collection schema, the query -semantics, and the GraphQL surface – so they cannot drift. +The engine-neutral description of a queryable field. **One `SearchField` declaration drives +four consumers** – projection (RDF→flat document), the engine collection schema, the query +semantics, and the GraphQL surface – so they cannot drift. SHACL is one possible source +(see the mapping below), not a dependency: a hand-written declaration is just as valid. It is a **unified** model: a single declaration carries the projection, the collection schema and search weights, and the query semantics – concerns that would otherwise each @@ -195,15 +195,19 @@ SearchEngine` readable. ```ts // FacetField / OutputField default to `string` (ergonomic) and a deployment narrows them -// to its type’s facetable / output field names for typo-safe facet and document access -// (helpers FacetFieldsOf / OutputFieldsOf, or the EngineFor alias). +// to its type’s facetable / output field names for typo-safe facet and document access; +// Type narrows the accepted searchType argument alongside, so a narrowed engine cannot be +// handed the wrong search type. The ergonomic route is engineFor(searchType, engine) over +// a defineSearchType declaration (helpers FacetFieldsOf / OutputFieldsOf and +// the EngineFor alias are exported for hand-written signatures). 
interface SearchEngine< FacetField extends string = string, OutputField extends string = string, + Type extends SearchType = SearchType, > { search( query: SearchQuery, - searchType: SearchType, + searchType: Type, ): Promise>; } diff --git a/packages/search/README.md b/packages/search/README.md index 63f687c7..a2f83ae2 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -32,8 +32,9 @@ and the API output in a single place. ## Terminology -The model has three levels, mirroring both SHACL (the source vocabulary) and -GraphQL (one of the surfaces): +The model has three levels, with analogues in SHACL (one possible source — see +[Why a declarative model](#why-a-declarative-model)) and GraphQL (one of the +surfaces): | Term | What it is | SHACL | GraphQL | | -------------- | --------------------------------------------------------------------------------------------------------------- | -------------- | ----------- | @@ -56,13 +57,13 @@ query compiler all share. ```ts import { + defineSearchType, projectGraph, irisOf, searchSchema, - type SearchType, } from '@lde/search'; -const DATASET = { +const DATASET = defineSearchType({ type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ // → title_nl, title_en, title_search_nl/_en, title_sort_nl/_en @@ -95,16 +96,17 @@ const DATASET = { document.classCount = irisOf(node, 'urn:dr:class').length; }, ], -} as const satisfies SearchType; +}); for await (const document of projectGraph(quads, searchSchema(DATASET))) { // one flat search document per matching subject, streamed } ``` -Capturing the type with `as const satisfies SearchType` keeps the field -literals, so the API surface can derive typed facet/output keys from it (see -`@lde/search-api-graphql`). 
+`defineSearchType` captures the declaration as a literal (what +`as const satisfies SearchType` would do manually, with nothing to remember), +so typed facet/output keys can be derived from it — see +[Typed results](#typed-results) and `@lde/search-api-graphql`. **Kinds** (`FieldKind`): `text`, `keyword`, `integer`, `number`, `boolean`, `date`, `reference`. The Typesense/engine vocabulary and the GraphQL types are @@ -167,6 +169,32 @@ holds for **any** consumer, including an API built on this package — which is engine adapters and surfaces compile through the shared `SearchQuery` IR and the `physicalFields` convention rather than re-deriving field names. +## Typed results + +The `SearchEngine` port is loosely typed by default: facet and document keys +are plain strings. That is the correct contract for an adapter (which cannot +know your fields) and for a surface that builds queries from client input at +runtime. An **in-process caller that knows its search type at compile time** +should narrow the engine with `engineFor` — same instance, zero runtime cost: + +```ts +import { engineFor } from '@lde/search'; + +const datasetEngine = engineFor(DATASET, engine); + +const result = await datasetEngine.search(query, DATASET); +result.facets.publisher; // typed: only DATASET’s facetable fields are keys +result.facets.publsher; // compile error (typo) +result.hits[0].document.title; // typed: only DATASET’s output fields are keys +await datasetEngine.search(query, OTHER_TYPE); // compile error (wrong type) +``` + +This only works when the search type was declared with `defineSearchType` (or +captured `as const satisfies SearchType`); a plain `: SearchType` annotation +widens the field literals away. The underlying pieces (`EngineFor`, +`FacetFieldsOf`, `OutputFieldsOf`) are exported for annotating your own +signatures. 
+ ## Why a declarative model The vocabulary mirrors SHACL on purpose: `path` is `sh:path`, `array` is diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts index 25e96303..3970453b 100644 --- a/packages/search/src/schema.ts +++ b/packages/search/src/schema.ts @@ -2,12 +2,11 @@ import type { FramedNode } from './frame-by-type.js'; import type { SearchDocument } from './project.js'; /** - * The engine-neutral kind of a queryable field — the runtime form of one SHACL - * property shape’s datatype/nodeKind. It drives every downstream behavior: - * which physical fields the projection emits, the engine collection-schema - * type, the `where`/facet/sort semantics, and the GraphQL output/input type. - * The Typesense-vocabulary types (`string`, `int32`, …) are *derived* from this - * by the engine adapter, never declared here. + * The engine-neutral kind of a queryable field. It drives every downstream + * behavior: which physical fields the projection emits, the engine + * collection-schema type, the `where`/facet/sort semantics, and the GraphQL + * output/input type. The Typesense-vocabulary types (`string`, `int32`, …) are + * *derived* from this by the engine adapter, never declared here. */ export type FieldKind = | 'text' @@ -21,10 +20,7 @@ export type FieldKind = /** * One queryable field — the single declarative source that drives all four * consumers (projection, engine collection schema, query semantics, and the - * GraphQL surface). The vocabulary mirrors SHACL + the `search:` annotations so - * a generator can later emit it unchanged from shapes: - * `kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, `array`←`sh:maxCount`, - * `localized`←`rdf:langString`/`sh:languageIn`, `ref`←`sh:node`/`sh:class`. + * GraphQL surface). * * Capability flags (`searchable`/`filterable`/`facetable`/`sortable`/`output`) * are independent opt-ins: a field exposes exactly the roles it declares. 
A @@ -36,18 +32,24 @@ export type FieldKind = * keys) follow one convention, owned by * {@link physicalFields} so projection, collection-schema and query compiler * cannot disagree. + * + * SHACL is one possible *source*, not a dependency: a generator can emit a + * declaration from a NodeShape + `search:` annotations + * (`kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, `array`←`sh:maxCount`, + * `localized`←`rdf:langString`/`sh:languageIn`, `ref`←`sh:node`/`sh:class`), + * and a hand-written declaration is just as valid. */ export interface SearchField { /** Logical API name; the physical fanout derives from it. Declare camelCase * where it surfaces in GraphQL. */ readonly name: string; readonly kind: FieldKind; - /** Framed-IR predicate IRI to project from (the SHACL `sh:path`). Omit for a + /** Framed-IR predicate IRI to project from. Omit for a * derivation-populated field. */ readonly path?: string; - /** Multi-valued (`sh:maxCount > 1`). */ + /** Multi-valued. */ readonly array?: boolean; - /** Always present (`sh:minCount ≥ 1`): a non-null scalar in the API output and + /** Always present: a non-null scalar in the API output and * a non-optional field in the engine index. Moot for arrays/booleans/`id`, * which are non-null regardless. */ readonly required?: boolean; @@ -105,10 +107,10 @@ export interface FacetRange { export type Derivation = (document: SearchDocument, node: FramedNode) => void; /** - * One root type’s complete search declaration — the runtime form of a single - * SHACL NodeShape: `type` is its `sh:targetClass`, `fields` are its property - * shapes (and derived fields), `derivations` are its `sh:rule`-shaped computed - * fields. A generator emits one of these per NodeShape. + * One root type’s complete search declaration: the `type` IRI its documents are + * instances of, the queryable `fields`, and the computed `derivations`. 
A SHACL + * generator can emit one per NodeShape (`type`←`sh:targetClass`, `fields`←its + * property shapes), but that is a source, not a requirement. */ export interface SearchType { readonly type: string; @@ -116,10 +118,23 @@ export interface SearchType { readonly derivations?: readonly Derivation[]; } +/** + * Declare a {@link SearchType}, capturing it as a literal: the `const` type + * parameter preserves the field names and capability flags that the type-level + * helpers (`FacetFieldsOf`, `OutputFieldsOf`, `EngineFor`) read off the type — + * with none of the widening a plain `: SearchType` annotation causes and + * without having to remember `as const satisfies SearchType`. Identity at + * runtime. + */ +export function defineSearchType( + searchType: Type, +): Type { + return searchType; +} + /** * The complete search declaration of a deployment: every root {@link SearchType}, - * keyed by its `type` IRI — the runtime form of a whole SHACL shapes graph. - * Build one with {@link searchSchema}. + * keyed by its `type` IRI. Build one with {@link searchSchema}. 
*/ export type SearchSchema = ReadonlyMap; From 6ee6c005a00fc4cb28a8c25c4da65206b88129a4 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 11:51:23 +0200 Subject: [PATCH 18/35] feat(search): add engineFor to narrow an engine to one search type - engineFor(searchType, engine) returns the same instance typed as EngineFor: typo-safe facet and document keys with no generics at the call site (the const type parameter captures the literal) - SearchEngine gains a third Type parameter (default SearchType) so an EngineFor-typed engine also rejects a mismatched search type passed to search() at compile time - point the literal-capture guidance at defineSearchType alongside as const satisfies SearchType --- .../0004-search-api-graphql-surface.md | 4 +- packages/search/src/engine.ts | 41 ++++++++++++++----- packages/search/src/index.ts | 2 + packages/search/test/engine.test.ts | 35 ++++++++++++++++ packages/search/vite.config.ts | 4 +- 5 files changed, 72 insertions(+), 14 deletions(-) diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index a6e412c4..7a8655e1 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -308,5 +308,5 @@ Each transport populates it per request; no framework type appears in the packag sole preference mechanism); metadata-language-availability filtering (a facetable dimension, not v1); schema extension hooks (`extendTypeDefs`/`extendResolvers` or exported typeDefs/resolvers for manual composition); a static TS mirror of the contract - (`OutputOf` / `WhereOf` / `OrderByOf` / `FacetOf` mapped types over an - `as const satisfies SearchType` declaration) for typed in-process callers. + (`OutputOf` / `WhereOf` / `OrderByOf` / `FacetOf` mapped types over a + `defineSearchType` declaration) for typed in-process callers. 
diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts index 9f9bded4..913ecfc5 100644 --- a/packages/search/src/engine.ts +++ b/packages/search/src/engine.ts @@ -2,7 +2,7 @@ import type { SearchQuery } from './query.js'; import type { SearchType } from './schema.js'; /** - * The engine port — the boundary a concrete engine adapter (e.g. + * The engine port: the boundary a concrete engine adapter (e.g. * `@lde/search-typesense`’s `TypesenseSearchEngine`) implements. The adapter * owns every engine specific (companion-field expansion, full-text field * selection and weights, filter compilation, sorting, result folding, faceting) @@ -12,15 +12,18 @@ import type { SearchType } from './schema.js'; * * `FacetField` keys the returned facet map; it defaults to `string` so an engine * stays ergonomic, and a deployment can narrow it to its own facet-field union - * (see {@link FacetFieldsOf}) for typo-safe facet access. + * (see {@link FacetFieldsOf}) for typo-safe facet access. `Type` narrows the + * accepted `searchType` argument alongside, so an {@link EngineFor}-typed engine + * rejects a mismatched search type at compile time. */ export interface SearchEngine< FacetField extends string = string, OutputField extends string = string, + Type extends SearchType = SearchType, > { search( query: SearchQuery, - searchType: SearchType, + searchType: Type, ): Promise>; } @@ -44,9 +47,10 @@ export type FacetMap = Readonly< /** * The facet-field-name union of a search type — the keys a {@link SearchResult}’s - * `facets` can hold. Requires the type be captured as a literal - * (`as const satisfies SearchType`), so the `facetable: true` flags survive as - * literals; a plain `: SearchType` annotation widens them and yields `never`. + * `facets` can hold. 
Requires the type be captured as a literal (via + * `defineSearchType` or `as const satisfies SearchType`), so the + * `facetable: true` flags survive as literals; a plain `: SearchType` + * annotation widens them and yields `never`. */ export type FacetFieldsOf = Extract< Type['fields'][number], @@ -56,7 +60,7 @@ export type FacetFieldsOf = Extract< /** * The output-field-name union of a search type — the keys a {@link ResultDocument} * can hold. Like {@link FacetFieldsOf}, requires the type captured as a literal - * (`as const satisfies SearchType`). + * (via `defineSearchType` or `as const satisfies SearchType`). */ export type OutputFieldsOf = Extract< Type['fields'][number], @@ -64,13 +68,30 @@ export type OutputFieldsOf = Extract< >['name']; /** A {@link SearchEngine} narrowed to one search type: facet keys and document - * keys fixed to that type’s facetable / output field names. The type must be - * captured as `as const satisfies SearchType`. */ + * keys fixed to that type’s facetable / output field names, and `search()` + * accepting only that search type. The type must be captured as a literal + * (`defineSearchType` or `as const satisfies SearchType`); {@link engineFor} + * is the ergonomic way to obtain one. */ export type EngineFor = SearchEngine< FacetFieldsOf, - OutputFieldsOf + OutputFieldsOf, + Type >; +/** + * Narrow an engine to one search type — the ergonomic route to an + * {@link EngineFor} view. The `const` type parameter captures the search type + * as a literal, so facet and document keys come out typo-safe without the + * caller writing any generics. Identity at runtime: the same engine instance + * is returned, only its type changes. + */ +export function engineFor( + searchType: Type, + engine: SearchEngine, +): EngineFor { + return engine; +} + /** * One result row. 
`id` (the stable document key, an IRI) is kept *out* of * {@link ResultDocument}: it is always present and is the hit’s identity, a diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index 8ecfbf03..633de8a0 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -8,6 +8,7 @@ export type { SearchDocument } from './project.js'; // schema, query semantics and the GraphQL surface. Plus the field selectors and // the physical field-name convention they all share. export { + defineSearchType, searchSchema, physicalFields, searchableFields, @@ -36,6 +37,7 @@ export { filterOperatorFor, pageForOffset } from './query.js'; export type { SearchQuery, Filter, Sort, FilterOperator } from './query.js'; // Engine port + the logical result document returned across it. +export { engineFor } from './engine.js'; export type { SearchEngine, SearchResult, diff --git a/packages/search/test/engine.test.ts b/packages/search/test/engine.test.ts index 14966451..6c71fe9f 100644 --- a/packages/search/test/engine.test.ts +++ b/packages/search/test/engine.test.ts @@ -1,6 +1,8 @@ import { describe, expect, it } from 'vitest'; +import { engineFor } from '../src/engine.js'; import type { EngineFor, SearchEngine, SearchResult } from '../src/engine.js'; import type { SearchQuery } from '../src/query.js'; +import { defineSearchType } from '../src/schema.js'; import type { SearchType } from '../src/schema.js'; const schema: SearchType = { @@ -107,4 +109,37 @@ describe('typed facet and document keys', () => { expect(result.facets.format).toEqual([{ value: 'text/turtle', count: 2 }]); expect(result.hits[0].document.title).toEqual({ nl: ['Titel'] }); }); + + it('accepts only the search type it was narrowed to', () => { + // `defineSearchType` captures the literal (no `as const` needed): the + // `facetable: true` flag must survive for `FacetFieldsOf` to see it. 
+ const datasetSchema = defineSearchType({ + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [{ name: 'format', kind: 'keyword', facetable: true }], + }); + const organizationSchema = defineSearchType({ + type: 'http://xmlns.com/foaf/0.1/Organization', + fields: [{ name: 'sector', kind: 'keyword', facetable: true }], + }); + const query: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // `engineFor` narrows a generic adapter (plain `SearchEngine`) to any + // `EngineFor` — the same instance, identity at runtime. + const engine: EngineFor = engineFor( + datasetSchema, + fake, + ); + expect(engine).toBe(fake); + + void engine.search(query, datasetSchema); + // @ts-expect-error — a mismatched search type is rejected at compile time + void engine.search(query, organizationSchema); + }); }); diff --git a/packages/search/vite.config.ts b/packages/search/vite.config.ts index 61e8f5ae..a7e51876 100644 --- a/packages/search/vite.config.ts +++ b/packages/search/vite.config.ts @@ -11,9 +11,9 @@ export default mergeConfig( coverage: { thresholds: { functions: 100, - lines: 97.87, + lines: 97.9, branches: 91.8, - statements: 97.97, + statements: 98, }, }, }, From 46958a25f260bb2af4ccb59e515af51bd2168efe Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 12:02:30 +0200 Subject: [PATCH 19/35] docs(search): describe the query IR as a shared representation, not a compiler target - 'compiler target' read as the final output (the engine query), while SearchQuery is the middle: surfaces compile into it, engine adapters compile out of it - reword README, the SearchQuery JSDoc and ADR 3 accordingly --- docs/decisions/0003-search-api-core-query-model.md | 4 ++-- packages/search/README.md | 5 +++-- packages/search/src/query.ts | 8 ++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 
cb1e3e82..d717d2c2 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -110,8 +110,8 @@ understands. ### `SearchQuery` – the neutral query IR -Both surfaces parse input into this; the adapter consumes this. It is the shared compiler -target that keeps GraphQL and REST from drifting. +Both surfaces compile input into this; the adapter compiles it into an engine query. One +shared representation in the middle keeps GraphQL and REST from drifting. ```ts interface SearchQuery { diff --git a/packages/search/README.md b/packages/search/README.md index a2f83ae2..30dd17af 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -3,7 +3,7 @@ The **engine- and domain-agnostic core** for RDF-backed search. It bakes in no search engine, no API protocol, and no domain vocabulary: you supply a declarative `SearchSchema`, and engine adapters and API surfaces sit on the ports -defined here. The library never names your domain — the same core drives a +defined here. The library never names your domain: the same core drives a `Dataset`, `Person`, or `CreativeWork` search. 
It provides four things: @@ -12,7 +12,8 @@ It provides four things: one declaration per field that drives all four consumers below, so they cannot drift; - **the neutral query IR** — `SearchQuery` / `Filter` / `Sort` + filter - semantics, the shared compiler target every API surface parses into; + semantics: every API surface compiles into it, every engine adapter compiles + out of it, so the two cannot drift; - **the engine port** — `SearchEngine` and the logical result types (`SearchResult` / `SearchHit` / `ResultDocument` / `Reference` / …); - **a streaming projection** — `projectGraph`, RDF `CONSTRUCT` quads → flat diff --git a/packages/search/src/query.ts b/packages/search/src/query.ts index 3ebfc37a..b98f5577 100644 --- a/packages/search/src/query.ts +++ b/packages/search/src/query.ts @@ -1,10 +1,10 @@ import type { FieldKind } from './schema.js'; /** - * The engine- and protocol-neutral query IR. Every API surface parses its input - * into this; the engine adapter consumes it. It is the shared compiler target - * that keeps the GraphQL surface, a later REST surface and the adapter from - * drifting. + * The engine- and protocol-neutral query IR. Every API surface compiles its + * input into this; every engine adapter compiles it into an engine query. One + * shared representation in the middle keeps the GraphQL surface, a later REST + * surface and the adapter from drifting. */ export interface SearchQuery { /** Free-text query; `undefined`/`''` means browse (no text ranking). 
*/ From 880fd3f82889f28e4b8d018394f7a77cdcf70614 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 12:03:26 +0200 Subject: [PATCH 20/35] docs(search): point to the adapter and surface packages from the intro - add a short list under the intro linking the packages that sit on the core's ports: the search-typesense engine adapter and the search-api-graphql surface, with a REST surface to follow --- packages/search/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/search/README.md b/packages/search/README.md index 30dd17af..553c4ec8 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -2,10 +2,16 @@ The **engine- and domain-agnostic core** for RDF-backed search. It bakes in no search engine, no API protocol, and no domain vocabulary: you supply a -declarative `SearchSchema`, and engine adapters and API surfaces sit on the ports -defined here. The library never names your domain: the same core drives a +declarative `SearchSchema`, and engine adapters and API surfaces sit on the +ports defined here. The library never names your domain: the same core drives a `Dataset`, `Person`, or `CreativeWork` search. +Sitting on those ports: + +- **engine adapters** — [`@lde/search-typesense`](../search-typesense); +- **API surfaces** — [`@lde/search-api-graphql`](../search-api-graphql), with a + REST surface to follow. 
+ It provides four things: - **the unified field model** — `SearchField` / `SearchType` / `SearchSchema`: From 0c71656add766bb9c9d501c6986735172668d888 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 12:07:44 +0200 Subject: [PATCH 21/35] docs(search): frame the search family as a generator for search engines - intro now leads with the family-level value: one declarative SearchSchema, and the projection, collection schema, query semantics and API surface are all derived from it - state the core as engine-, API- and domain-agnostic (API was missing) - adapters plug into the ports (hexagonal parlance) instead of sitting on them --- packages/search/README.md | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/packages/search/README.md b/packages/search/README.md index 553c4ec8..3eef3a1c 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -1,17 +1,22 @@ # @lde/search -The **engine- and domain-agnostic core** for RDF-backed search. It bakes in no -search engine, no API protocol, and no domain vocabulary: you supply a -declarative `SearchSchema`, and engine adapters and API surfaces sit on the -ports defined here. The library never names your domain: the same core drives a -`Dataset`, `Person`, or `CreativeWork` search. +The core of the LDE search family: packages that together act as a **generator +for search engines**. You write one declarative `SearchSchema`, and everything +a running search engine needs is derived from it — the document projection, +the engine collection schema, the query semantics, and the API surface — rather +than hand-written per deployment and kept in sync by discipline. -Sitting on those ports: +The core itself is **engine-, API- and domain-agnostic**: it bakes in no search +engine, no API protocol, and no domain vocabulary. 
The engine- and API-specific +halves are adapters that plug into the ports defined here: - **engine adapters** — [`@lde/search-typesense`](../search-typesense); - **API surfaces** — [`@lde/search-api-graphql`](../search-api-graphql), with a REST surface to follow. +The library never names your domain: the same core drives a `Dataset`, +`Person`, or `CreativeWork` search. + It provides four things: - **the unified field model** — `SearchField` / `SearchType` / `SearchSchema`: From 6cdeefa091913088fc170483990fec0f69b27ffe Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 12:11:19 +0200 Subject: [PATCH 22/35] docs(search): rephrase the derived-artifacts sentence without dashes --- packages/search/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/search/README.md b/packages/search/README.md index 3eef3a1c..daac7c6a 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -2,9 +2,9 @@ The core of the LDE search family: packages that together act as a **generator for search engines**. You write one declarative `SearchSchema`, and everything -a running search engine needs is derived from it — the document projection, -the engine collection schema, the query semantics, and the API surface — rather -than hand-written per deployment and kept in sync by discipline. +a running search engine needs is derived from it: the document projection, the +engine collection schema, the query semantics, and the API surface. None of +these are hand-written per deployment or kept in sync by discipline. The core itself is **engine-, API- and domain-agnostic**: it bakes in no search engine, no API protocol, and no domain vocabulary. 
The engine- and API-specific From 4cd78d4300cce8ee3e0320a6cbdc01da9cf44b14 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 12:14:06 +0200 Subject: [PATCH 23/35] docs(search): attribute each adapter tier to its port, drop list articles - name the exact port per tier: engine adapters implement SearchEngine; API surfaces drive it, parsing client input into the SearchQuery IR - drop the mixed articles in the four-things list --- packages/search/README.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/packages/search/README.md b/packages/search/README.md index daac7c6a..7e8892e0 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -3,31 +3,34 @@ The core of the LDE search family: packages that together act as a **generator for search engines**. You write one declarative `SearchSchema`, and everything a running search engine needs is derived from it: the document projection, the -engine collection schema, the query semantics, and the API surface. None of -these are hand-written per deployment or kept in sync by discipline. +engine collection schema, the query semantics, and the API surface. All these +are kept in sync automatically rather than handwritten per deployment. The core itself is **engine-, API- and domain-agnostic**: it bakes in no search engine, no API protocol, and no domain vocabulary. The engine- and API-specific halves are adapters that plug into the ports defined here: -- **engine adapters** — [`@lde/search-typesense`](../search-typesense); -- **API surfaces** — [`@lde/search-api-graphql`](../search-api-graphql), with a - REST surface to follow. +- **engine adapters** implement the `SearchEngine` port: + [`@lde/search-typesense`](../search-typesense); +- **API surfaces** drive that port from the other side, parsing client input + into the `SearchQuery` IR that `search()` accepts: + [`@lde/search-api-graphql`](../search-api-graphql), with a REST surface to + follow. 
The library never names your domain: the same core drives a `Dataset`, `Person`, or `CreativeWork` search. It provides four things: -- **the unified field model** — `SearchField` / `SearchType` / `SearchSchema`: +- **unified field model** — `SearchField` / `SearchType` / `SearchSchema`: one declaration per field that drives all four consumers below, so they cannot drift; -- **the neutral query IR** — `SearchQuery` / `Filter` / `Sort` + filter +- **neutral query IR** — `SearchQuery` / `Filter` / `Sort` + filter semantics: every API surface compiles into it, every engine adapter compiles out of it, so the two cannot drift; -- **the engine port** — `SearchEngine` and the logical result types +- **engine port** — `SearchEngine` and the logical result types (`SearchResult` / `SearchHit` / `ResultDocument` / `Reference` / …); -- **a streaming projection** — `projectGraph`, RDF `CONSTRUCT` quads → flat +- **streaming projection** — `projectGraph`, RDF `CONSTRUCT` quads → flat search documents. ``` From 92e45ee3ea3c833cf7ed919861fdf9fc9db8b5b3 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 12:14:50 +0200 Subject: [PATCH 24/35] docs(search): shorten the API-surfaces bullet to search(SearchQuery) --- packages/search/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/packages/search/README.md b/packages/search/README.md index 7e8892e0..35e6cc6b 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -12,10 +12,9 @@ halves are adapters that plug into the ports defined here: - **engine adapters** implement the `SearchEngine` port: [`@lde/search-typesense`](../search-typesense); -- **API surfaces** drive that port from the other side, parsing client input - into the `SearchQuery` IR that `search()` accepts: - [`@lde/search-api-graphql`](../search-api-graphql), with a REST surface to - follow. 
+- **API surfaces** drive it, parsing client input into `search(SearchQuery)` + calls: [`@lde/search-api-graphql`](../search-api-graphql), with a REST + surface to follow. The library never names your domain: the same core drives a `Dataset`, `Person`, or `CreativeWork` search. From 9289f7cde03c0c57826760b51c675e1de5b6e72b Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 13:02:16 +0200 Subject: [PATCH 25/35] docs(search): cut the sentence restating the diagram, name OpenSearch as next adapter - the one-field-four-consumers sentence repeated the intro, the unified-field-model bullet and the diagram above it; the capability flags are explained in the Field model section - name OpenSearch as the engine adapter to follow and tighten the derived-artifacts sentence --- packages/search/README.md | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/packages/search/README.md b/packages/search/README.md index 35e6cc6b..0b900a2e 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -11,9 +11,9 @@ engine, no API protocol, and no domain vocabulary. The engine- and API-specific halves are adapters that plug into the ports defined here: - **engine adapters** implement the `SearchEngine` port: - [`@lde/search-typesense`](../search-typesense); -- **API surfaces** drive it, parsing client input into `search(SearchQuery)` - calls: [`@lde/search-api-graphql`](../search-api-graphql), with a REST + [`@lde/search-typesense`](../search-typesense), with OpenSearch to follow; +- **API surfaces** drive it, parsing client input into `search(SearchQuery)`: + [`@lde/search-api-graphql`](../search-api-graphql), with a REST surface to follow. The library never names your domain: the same core drives a `Dataset`, @@ -39,16 +39,10 @@ SearchSchema ─┬─► projection (projectGraph → flat documents) └─► API surface (GraphQL / REST) e.g. 
@lde/search-api-graphql ``` -One field, four consumers — that is why the model is unified: a field’s `kind` -plus capability flags (`searchable` / `filterable` / `facetable` / `sortable` / -`output`) describe projection, the engine collection schema, the query semantics, -and the API output in a single place. - ## Terminology -The model has three levels, with analogues in SHACL (one possible source — see -[Why a declarative model](#why-a-declarative-model)) and GraphQL (one of the -surfaces): +The model has three levels, with analogues in SHACL ([one possible source](#why-a-declarative-model)) +and GraphQL (one of the surfaces): | Term | What it is | SHACL | GraphQL | | -------------- | --------------------------------------------------------------------------------------------------------------- | -------------- | ----------- | @@ -57,8 +51,7 @@ surfaces): | `SearchSchema` | The whole search declaration: every `SearchType`, keyed by `type` IRI — build one with `searchSchema(...types)` | shapes graph | schema | `projectGraph` and the GraphQL surface consume a `SearchSchema` (projecting -every type in one pass, resp. emitting one root query field per type); the -engine port executes one `SearchType` at a time. +every type in one pass; the engine port executes one `SearchType` at a time. 
## Field model From 55f204125b8566b5f2d15aa4c1e0c5273e69fa5a Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 13:06:42 +0200 Subject: [PATCH 26/35] docs(search): name OpenSearch as the hypothetical second engine adapter - align the facetRanges JSDoc with the README, which names OpenSearch as the engine adapter to follow --- packages/search/src/schema.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts index 3970453b..ef7dfd59 100644 --- a/packages/search/src/schema.ts +++ b/packages/search/src/schema.ts @@ -81,7 +81,7 @@ export interface SearchField { * histogram) rather than one bucket per distinct value — the per-bucket counts * a UI slider needs. Bins are query-time only (no index impact) and * engine-neutral: the Typesense adapter emits a `facet_by` range, an - * Elasticsearch adapter a `range` aggregation. See {@link FacetRange}. + * OpenSearch adapter a `range` aggregation. See {@link FacetRange}. */ readonly facetRanges?: readonly FacetRange[]; } From cfb9db848d02b95da47b70fd01f0376d6de1c3a7 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 16:21:11 +0200 Subject: [PATCH 27/35] docs(search-api-graphql): document serving a subset of the schema - types options are an exact join with the schema (build-time errors in both directions), so partial exposure goes through a narrower schema argument, not through omitting options --- packages/search-api-graphql/README.md | 34 +++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md index 919bff24..774d3a98 100644 --- a/packages/search-api-graphql/README.md +++ b/packages/search-api-graphql/README.md @@ -45,17 +45,41 @@ optional `queryDefaults` policy applied to every query of that type. 
Shared types (`LanguageString`, the facet buckets, filter inputs and reference types such as a common `Agent`) are created once and reused across root types. +## Serving a subset of the schema + +`types` never filters: every `SearchType` in the schema you pass gets a root +field, and the options must cover them exactly — a type without options, or +options naming an unknown type, is a build-time error, so the API cannot +silently drift from the index. To expose only part of what you index, narrow +the **schema argument** instead (`searchSchema(…)` is a cheap constructor): + +```ts +// Index all three types… +projectGraph(quads, searchSchema(DATASET, PERSON, INTERNAL)); + +// …but serve only two. +const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON), { + types: { + [DATASET.type]: { typeName: 'Dataset' }, + [PERSON.type]: { typeName: 'Person', queryField: 'people' }, + }, +}); +``` + +Hiding a type is then a decision readable at the call site, never an +accidental omission from the options. + ## What it builds (per root type) -- **Output type** (`typeName`) — localized text → best-first `[LanguageString!]!` +- **Output type** (`typeName`): localized text → best-first `[LanguageString!]!` (`[0].language` is the language actually served); references → named per-shape types (`Organization`, `Term`) with a `name`; scalars/booleans per kind; `date` → ISO 8601 string; nullability from `required` / `array` / `kind`. -- **`where`** — one input per `filterable` field (`StringFilter`, `IntRange` / +- **`where`**: one input per `filterable` field (`StringFilter`, `IntRange` / `FloatRange` / `DateRange`, or `Boolean`); omitted entirely for a type with no filterable fields. -- **`orderBy`** — `RELEVANCE` plus every `sortable` field, as an enum.
+- **Facets**: an enum of every `facetable` field; a bucket carries `value` + `count` + a nullable `label` — the resolved data label for **reference** facets, `null` for token/free-string facets whose display the consumer owns (its own i18n, or the value itself). @@ -65,7 +89,7 @@ such as a common `Agent`) are created once and reused across root types. The surface reads the same field model the index is built from, and compiles into the same neutral `SearchQuery` the engine consumes — so the API, the index and a future REST surface stay in lockstep. The contract is **frozen** (breaking to -change), and because it is generated rather than hand-written, a _consumer_ guards +change), and because it is generated rather than handwritten, a _consumer_ guards it with a `printGraphQLSchema(schema, options)` SDL snapshot over its **own** search schema and type names — that snapshot also catches a `buildGraphQLSchema` change in a future version of this library silently altering the consumer’s From 5427afd73a609cd247f7fc466977cdc1f5e67bbf Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 17:30:16 +0200 Subject: [PATCH 28/35] feat(search)!: declare the logical API name on the SearchType - SearchType gains a required name (PascalCase, e.g. 'Dataset'), mirroring SearchField.name: the declaration itself names the type in every API surface, so surface config no longer has to - buildGraphQLSchema derives all GraphQL type names and the default root query field from it; the per-type options lose typeName and become optional fine-tuning (queryField, queryDefaults) - document the pipeline as pure data transformations (three chains meeting at the engine) in the search README BREAKING CHANGE: every SearchType declaration must add a name; the GraphQL surface's per-type options no longer accept typeName and the types option is now optional. 
--- .../0003-search-api-core-query-model.md | 3 + .../0004-search-api-graphql-surface.md | 18 ++--- packages/search-api-graphql/README.md | 66 +++++++++---------- .../search-api-graphql/src/build-schema.ts | 37 +++++------ .../test/build-schema.test.ts | 38 +++++------ .../test/generator-stability.test.ts | 7 +- packages/search-api-graphql/vite.config.ts | 2 +- packages/search/README.md | 19 +++++- packages/search/src/schema.ts | 15 +++-- packages/search/test/engine.test.ts | 4 ++ packages/search/test/project.test.ts | 28 +++++--- packages/search/test/schema.test.ts | 2 + 12 files changed, 137 insertions(+), 102 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index d717d2c2..642d458e 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -88,6 +88,9 @@ type Derivation = (document: SearchDocument, node: FramedNode) => void; // One root type (one SHACL NodeShape); a whole deployment’s declaration is the // SearchSchema, a map of SearchTypes keyed by type IRI (built with searchSchema()). interface SearchType { + readonly name: string; // logical API name ('Dataset') – names the type in every surface, + // declared (like each field's name), never derived from the IRI, so vocabulary + // churn cannot silently rename the public contract readonly type: string; // sh:targetClass readonly fields: readonly SearchField[]; readonly derivations?: readonly Derivation[]; // computed fields: status, booleans diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index 7a8655e1..0a21849c 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -44,12 +44,11 @@ filter inputs, reference types) are created once and reused across root types. 
```ts function buildGraphQLSchema( schema: SearchSchema, // every root type, keyed by type IRI - options: { - types: Record< - string, // type IRI; every type in the schema needs an entry + options?: { + types?: Record< + string, // type IRI; entries are optional fine-tuning – names come from SearchType.name { - typeName: string; // 'Dataset' – drives the type's derived GraphQL type names - queryField?: string; // root field; default lowercased plural of typeName + queryField?: string; // root field; default lowercased plural of the type's name queryDefaults?: (q: SearchQuery, ctx: SearchContext) => SearchQuery; // per-type consumer policy } >; @@ -83,10 +82,11 @@ the optional `printGraphQLSchema()` SDL snapshot (the real artifact). ### Construction rules (field model → schema) -Type names derive from each type’s `typeName`; shared types (`LanguageString`, `ValueBucket`, -`RangeBucket`, `SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`, and the -reference types) are emitted once across all root types, and the per-type keyed facets object -is named `Facets`. A type with no `filterable` fields gets no `where` arg, and one +Type names derive from each `SearchType`’s logical `name`; shared types (`LanguageString`, +`ValueBucket`, `RangeBucket`, `SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, +`DateRange`, and the reference types) are emitted once across all root types, and the +per-type keyed facets object is named `Facets`. A type with no `filterable` fields gets +no `where` arg, and one with no `facetable` fields no `facets` field (empty GraphQL types are invalid). GraphQL field names are the field model `name` verbatim (declare camelCase). diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md index 774d3a98..336ace60 100644 --- a/packages/search-api-graphql/README.md +++ b/packages/search-api-graphql/README.md @@ -4,74 +4,74 @@ The GraphQL surface for the [`@lde/search`](../search) core. 
**Both engine- and domain-agnostic:** it builds an executable `GraphQLSchema` from your whole `SearchSchema` at runtime — one root query field per `SearchType`, each searchable in its own way — served by one generic resolver per root field over -any `SearchEngine`. It names neither your **domain** (you pass a `typeName` per -type — `Dataset`, `Person`, `CreativeWork`, …) nor your **engine** (the resolver -calls `context.engine`, be it [`@lde/search-typesense`](../search-typesense) or -another adapter). +any `SearchEngine`. It names neither your **domain** (each type’s GraphQL name +is the `SearchType`’s own logical `name` — `Dataset`, `Person`, `CreativeWork`, +…) nor your **engine** (the resolver calls `context.engine`, be it +[`@lde/search-typesense`](../search-typesense) or another adapter). ## Runtime configuration, not codegen -`buildGraphQLSchema(schema, { types })` constructs the GraphQL schema once at -startup from the field model — no SDL artifact, no generated resolver stubs. The -field model is the single source; the GraphQL contract is whatever it produces. -Output types, the `where`/`orderBy`/facet inputs, reference types and nullability -are all derived from each field’s `kind` and capability flags. +`buildGraphQLSchema(schema)` constructs the GraphQL schema once at startup from +the field model — no SDL artifact, no generated resolver stubs. The field model +is the single source; the GraphQL contract is whatever it produces. Type names +come from each `SearchType`’s `name`; output types, the `where`/`orderBy`/facet +inputs, reference types and nullability are all derived from each field’s +`kind` and capability flags. The common case needs no options at all: ```ts import { searchSchema } from '@lde/search'; import { buildGraphQLSchema } from '@lde/search-api-graphql'; +const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON)); + +// The API now serves `datasets(…)` and `persons(…)` root fields. 
+// Hand `gqlSchema` to any graphql-js server; populate the per-request context: +// { engine: SearchEngine, acceptLanguage: string[] } +``` + +Per-type options are pure fine-tuning, only for the types that need it: a +`queryField` when the default root field (the lowercased plural of the type’s +`name`) is wrong, and a `queryDefaults` policy applied to every query of that +type: + +```ts const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON), { types: { [DATASET.type]: { - typeName: 'Dataset', queryDefaults: (query) => ({ ...query, where: [...query.where, { field: 'status', in: ['valid'] }], }), }, - [PERSON.type]: { typeName: 'Person', queryField: 'people' }, + [PERSON.type]: { queryField: 'people' }, }, }); - -// The API now serves `datasets(…)` and `people(…)` root fields. -// Hand `gqlSchema` to any graphql-js server; populate the per-request context: -// { engine: SearchEngine, acceptLanguage: string[] } ``` -Per type you configure the `typeName` (drives all derived type names), an -optional `queryField` (defaults to the lowercased plural of `typeName`) and an -optional `queryDefaults` policy applied to every query of that type. Shared -types (`LanguageString`, the facet buckets, filter inputs and reference types -such as a common `Agent`) are created once and reused across root types. +Shared types (`LanguageString`, the facet buckets, filter inputs and reference +types such as a common `Agent`) are created once and reused across root types. ## Serving a subset of the schema `types` never filters: every `SearchType` in the schema you pass gets a root -field, and the options must cover them exactly — a type without options, or -options naming an unknown type, is a build-time error, so the API cannot -silently drift from the index. To expose only part of what you index, narrow -the **schema argument** instead (`searchSchema(…)` is a cheap constructor): +field (options for a type not in the schema are a build-time error). 
To expose +only part of what you index, narrow the **schema argument** +(`searchSchema(…)` is a cheap constructor): ```ts // Index all three types… projectGraph(quads, searchSchema(DATASET, PERSON, INTERNAL)); // …but serve only two. -const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON), { - types: { - [DATASET.type]: { typeName: 'Dataset' }, - [PERSON.type]: { typeName: 'Person', queryField: 'people' }, - }, -}); +const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON)); ``` -Hiding a type is then a decision readable at the call site, never an -accidental omission from the options. +Hiding a type is then a decision readable at the call site — the schema you +build the API from _is_ the list of what it serves. ## What it builds (per root type) -- **Output type** (`typeName`): localized text → best-first `[LanguageString!]!` +- **Output type** (the `SearchType`’s `name`): localized text → best-first `[LanguageString!]!` (`[0].language` is the language actually served); references → named per-shape types (`Organization`, `Term`) with a `name`; scalars/booleans per kind; `date` → ISO 8601 string; nullability from `required` / `array` / `kind`. diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts index 8edb1309..e692bbbd 100644 --- a/packages/search-api-graphql/src/build-schema.ts +++ b/packages/search-api-graphql/src/build-schema.ts @@ -52,11 +52,11 @@ export interface SearchContext { readonly onFacetError?: (field: string, error: unknown) => void; } -/** Per-root-type options; what the schema value cannot carry. */ +/** Per-root-type fine-tuning. The type’s name comes from the {@link SearchType} + * itself (`name`); options exist only for what has a sensible default. */ export interface SearchTypeOptions { - /** Drives the type’s derived GraphQL type names, e.g. `Dataset`. */ - readonly typeName: string; - /** Root query field; defaults to the lowercased plural of `typeName`. 
*/ + /** Root query field; defaults to the lowercased plural of the type’s `name` + * (e.g. `Dataset` → `datasets`). */ readonly queryField?: string; /** Consumer policy applied to every query of this type (default status, sort, * tie-breaks). */ @@ -67,9 +67,9 @@ export interface SearchTypeOptions { } export interface BuildGraphQLSchemaOptions { - /** Options per root type, keyed by type IRI (the {@link SearchType} `type`). - * Every type in the schema needs an entry. */ - readonly types: Readonly>; + /** Optional fine-tuning per root type, keyed by type IRI (the + * {@link SearchType} `type`). A type without an entry gets the defaults. */ + readonly types?: Readonly>; /** Output-language ordering; defaults to Accept-Language-first, `und` last. */ readonly languageOrder?: LanguageOrder; } @@ -102,10 +102,10 @@ function screamingSnake(name: string): string { */ export function buildGraphQLSchema( schema: SearchSchema, - options: BuildGraphQLSchemaOptions, + options: BuildGraphQLSchemaOptions = {}, ): GraphQLSchema { const languageOrder = options.languageOrder ?? defaultLanguageOrder; - for (const typeIri of Object.keys(options.types)) { + for (const typeIri of Object.keys(options.types ?? {})) { if (!schema.has(typeIri)) { throw new Error( `Options given for type “${typeIri}”, which is not in the search schema.`, @@ -272,9 +272,9 @@ export function buildGraphQLSchema( /** The root query field for one {@link SearchType}, with its derived types. 
*/ function rootField( searchType: SearchType, - typeOptions: SearchTypeOptions, + typeOptions: SearchTypeOptions | undefined, ): GraphQLFieldConfig { - const { typeName } = typeOptions; + const typeName = searchType.name; const outputType = new GraphQLObjectType({ name: typeName, @@ -373,7 +373,7 @@ export function buildGraphQLSchema( }, resolve: async (_source, args, context: SearchContext) => { const built = argsToQuery(args as QueryArgs, context, searchType); - const finalQuery = typeOptions.queryDefaults + const finalQuery = typeOptions?.queryDefaults ? typeOptions.queryDefaults(built, context) : built; // Items + total only; facets are resolved lazily per selected key. @@ -455,15 +455,10 @@ export function buildGraphQLSchema( GraphQLFieldConfig > = {}; for (const searchType of schema.values()) { - const typeOptions = options.types[searchType.type]; - if (typeOptions === undefined) { - throw new Error( - `Missing options (typeName) for type “${searchType.type}”.`, - ); - } - const { typeName } = typeOptions; + const typeOptions = options.types?.[searchType.type]; + const typeName = searchType.name; const queryField = - typeOptions.queryField ?? + typeOptions?.queryField ?? 
`${typeName.charAt(0).toLowerCase()}${typeName.slice(1)}s`; if (queryField in queryFields) { throw new Error( @@ -486,7 +481,7 @@ export function buildGraphQLSchema( */ export function printGraphQLSchema( schema: SearchSchema, - options: BuildGraphQLSchemaOptions, + options: BuildGraphQLSchemaOptions = {}, ): string { return printSchema(buildGraphQLSchema(schema, options)); } diff --git a/packages/search-api-graphql/test/build-schema.test.ts b/packages/search-api-graphql/test/build-schema.test.ts index c2e29f0d..9a9502b2 100644 --- a/packages/search-api-graphql/test/build-schema.test.ts +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -10,6 +10,7 @@ import { import { buildGraphQLSchema, type SearchContext } from '../src/build-schema.js'; const schema: SearchType = { + name: 'Dataset', type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { @@ -120,9 +121,7 @@ const canned: SearchResult = { facets: { keyword: [{ value: 'kaarten', count: 3 }] }, }; -const datasetOptions = { - types: { [schema.type]: { typeName: 'Dataset' } }, -}; +const datasetOptions = {}; async function run( source: string, @@ -462,7 +461,6 @@ describe('buildGraphQLSchema', () => { const gqlSchema = buildGraphQLSchema(searchSchema(schema), { types: { [schema.type]: { - typeName: 'Dataset', queryDefaults: (query) => ({ ...query, where: [...query.where, { field: 'status', in: ['valid'] }], @@ -512,6 +510,7 @@ describe('buildGraphQLSchema', () => { describe('multiple root types in one schema', () => { const PERSON: SearchType = { + name: 'Person', type: 'https://schema.org/Person', fields: [ { @@ -533,6 +532,7 @@ describe('buildGraphQLSchema', () => { ], }; const CREATIVE_WORK: SearchType = { + name: 'CreativeWork', type: 'https://schema.org/CreativeWork', fields: [ { @@ -557,8 +557,7 @@ describe('buildGraphQLSchema', () => { searchSchema(PERSON, CREATIVE_WORK), { types: { - [PERSON.type]: { typeName: 'Person', queryField: 'people' }, - [CREATIVE_WORK.type]: { typeName: 'CreativeWork' }, 
+ [PERSON.type]: { queryField: 'people' }, }, }, ); @@ -595,28 +594,29 @@ describe('buildGraphQLSchema', () => { expect(searchedTypes).toEqual([PERSON.type, CREATIVE_WORK.type]); }); - it('throws on a type without options, an unknown type, and a root-field clash', () => { - expect(() => - buildGraphQLSchema(searchSchema(PERSON, CREATIVE_WORK), { - types: { [PERSON.type]: { typeName: 'Person' } }, - }), - ).toThrow(/Missing options/); + it('builds without any options: names come from the search types', () => { + const sdl = printSchema( + buildGraphQLSchema(searchSchema(PERSON, CREATIVE_WORK)), + ); + expect(sdl).toMatch(/persons\([\s\S]*?\): PersonSearchResult!/); + expect(sdl).toMatch( + /creativeWorks\([\s\S]*?\): CreativeWorkSearchResult!/, + ); + }); + + it('throws on options for an unknown type and on a root-field clash', () => { expect(() => buildGraphQLSchema(searchSchema(PERSON), { types: { - [PERSON.type]: { typeName: 'Person' }, - 'https://schema.org/Unknown': { typeName: 'Unknown' }, + 'https://schema.org/Unknown': { queryField: 'unknowns' }, }, }), ).toThrow(/not in the search schema/); expect(() => buildGraphQLSchema(searchSchema(PERSON, CREATIVE_WORK), { types: { - [PERSON.type]: { typeName: 'Person', queryField: 'items' }, - [CREATIVE_WORK.type]: { - typeName: 'CreativeWork', - queryField: 'items', - }, + [PERSON.type]: { queryField: 'items' }, + [CREATIVE_WORK.type]: { queryField: 'items' }, }, }), ).toThrow(/Duplicate root query field/); diff --git a/packages/search-api-graphql/test/generator-stability.test.ts b/packages/search-api-graphql/test/generator-stability.test.ts index 2a872afb..8d5aaa26 100644 --- a/packages/search-api-graphql/test/generator-stability.test.ts +++ b/packages/search-api-graphql/test/generator-stability.test.ts @@ -10,6 +10,7 @@ import { printGraphQLSchema } from '../src/build-schema.js'; * so a consumer’s contract can’t shift from under it by accident. 
*/ const THING: SearchType = { + name: 'Thing', type: 'https://example.org/Thing', fields: [ { @@ -92,10 +93,6 @@ const THING: SearchType = { describe('GraphQL generator stability', () => { it('emits a stable SDL for a representative schema', () => { - expect( - printGraphQLSchema(searchSchema(THING), { - types: { [THING.type]: { typeName: 'Thing' } }, - }), - ).toMatchSnapshot(); + expect(printGraphQLSchema(searchSchema(THING))).toMatchSnapshot(); }); }); diff --git a/packages/search-api-graphql/vite.config.ts b/packages/search-api-graphql/vite.config.ts index 9d06048c..9baf3fd5 100644 --- a/packages/search-api-graphql/vite.config.ts +++ b/packages/search-api-graphql/vite.config.ts @@ -12,7 +12,7 @@ export default mergeConfig( thresholds: { functions: 100, lines: 100, - branches: 90.21, + branches: 90.42, statements: 100, }, }, diff --git a/packages/search/README.md b/packages/search/README.md index 0b900a2e..e6ba9ed5 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -39,6 +39,20 @@ SearchSchema ─┬─► projection (projectGraph → flat documents) └─► API surface (GraphQL / REST) e.g. @lde/search-api-graphql ``` +At runtime, everything those consumers do is a **pure transformation between +data shapes**, each one parameterised by the schema — three chains, meeting at +the engine: + +``` +indexing: RDF quads ──frame──► FramedNode ──project──► SearchDocument ──import──► engine +querying: client input ──parse──► SearchQuery ──compile──► engine query +results: engine response ──parse──► SearchResult ──shape──► API output +``` + +Validation happens before the first arrow (SHACL over the RDF) and inside the +last (the engine enforces its collection schema); between them every stage is +a typed, deterministic function — easy to test, and swappable per deployment. 
+ ## Terminology The model has three levels, with analogues in SHACL ([one possible source](#why-a-declarative-model)) @@ -47,11 +61,11 @@ and GraphQL (one of the surfaces): | Term | What it is | SHACL | GraphQL | | -------------- | --------------------------------------------------------------------------------------------------------------- | -------------- | ----------- | | `SearchField` | One queryable field: a `kind`, the IR `path` it projects from, and the capability flags it opts into | property shape | field | -| `SearchType` | One root type’s complete declaration: its `type` IRI plus its fields and derivations | NodeShape | object type | +| `SearchType` | One root type’s complete declaration: its logical API `name`, its `type` IRI, its fields and derivations | NodeShape | object type | | `SearchSchema` | The whole search declaration: every `SearchType`, keyed by `type` IRI — build one with `searchSchema(...types)` | shapes graph | schema | `projectGraph` and the GraphQL surface consume a `SearchSchema` (projecting -every type in one pass; the engine port executes one `SearchType` at a time. +every type in one pass); the engine port executes one `SearchType` at a time. ## Field model @@ -71,6 +85,7 @@ import { } from '@lde/search'; const DATASET = defineSearchType({ + name: 'Dataset', // logical API name: names the GraphQL type, a REST path, … type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ // → title_nl, title_en, title_search_nl/_en, title_sort_nl/_en diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts index ef7dfd59..3c3d7374 100644 --- a/packages/search/src/schema.ts +++ b/packages/search/src/schema.ts @@ -107,12 +107,19 @@ export interface FacetRange { export type Derivation = (document: SearchDocument, node: FramedNode) => void; /** - * One root type’s complete search declaration: the `type` IRI its documents are - * instances of, the queryable `fields`, and the computed `derivations`. 
A SHACL - * generator can emit one per NodeShape (`type`←`sh:targetClass`, `fields`←its - * property shapes), but that is a source, not a requirement. + * One root type’s complete search declaration: its logical API `name`, the + * `type` IRI its documents are instances of, the queryable `fields`, and the + * computed `derivations`. A SHACL generator can emit one per NodeShape + * (`name`←`sh:name`/local name, `type`←`sh:targetClass`, `fields`←its property + * shapes), but that is a source, not a requirement. */ export interface SearchType { + /** Logical API name (PascalCase, e.g. `Dataset`) — names the type in the API + * surfaces (GraphQL type names, a REST path), the way each field’s + * {@link SearchField.name} names that field. Deliberately declared rather + * than derived from the `type` IRI, so re-modelling the vocabulary cannot + * silently rename the public contract. */ + readonly name: string; readonly type: string; readonly fields: readonly SearchField[]; readonly derivations?: readonly Derivation[]; diff --git a/packages/search/test/engine.test.ts b/packages/search/test/engine.test.ts index 6c71fe9f..96fbbb21 100644 --- a/packages/search/test/engine.test.ts +++ b/packages/search/test/engine.test.ts @@ -6,6 +6,7 @@ import { defineSearchType } from '../src/schema.js'; import type { SearchType } from '../src/schema.js'; const schema: SearchType = { + name: 'Dataset', type: 'http://www.w3.org/ns/dcat#Dataset', fields: [{ name: 'title', kind: 'text', localized: true, locales: ['nl'] }], }; @@ -63,6 +64,7 @@ describe('typed facet and document keys', () => { // Captured as a literal (`as const satisfies`) so the `facetable`/`output` // flags survive and the `…Of` helpers can read the field names off the type. 
const datasetSchema = { + name: 'Dataset', type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { @@ -114,10 +116,12 @@ describe('typed facet and document keys', () => { // `defineSearchType` captures the literal (no `as const` needed): the // `facetable: true` flag must survive for `FacetFieldsOf` to see it. const datasetSchema = defineSearchType({ + name: 'Dataset', type: 'http://www.w3.org/ns/dcat#Dataset', fields: [{ name: 'format', kind: 'keyword', facetable: true }], }); const organizationSchema = defineSearchType({ + name: 'Organization', type: 'http://xmlns.com/foaf/0.1/Organization', fields: [{ name: 'sector', kind: 'keyword', facetable: true }], }); diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index bdd869bf..96b6a888 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -85,7 +85,12 @@ const derivations: Derivation[] = [ }, ]; -const schema: SearchType = { type: DATASET, fields, derivations }; +const schema: SearchType = { + name: 'Dataset', + type: DATASET, + fields, + derivations, +}; describe('projectDocument', () => { it('projects every field kind and runs derivations', () => { @@ -123,6 +128,7 @@ describe('projectDocument', () => { [`${DR}class`]: 'http://example.org/BareClass', }, { + name: 'Dataset', type: DATASET, fields: [ { name: 'size', path: `${DR}size`, kind: 'integer' }, @@ -155,6 +161,7 @@ describe('projectDocument', () => { const document = projectDocument( { '@id': 'https://ex/d/12', [`${DR}size`]: { '@value': '1234.5' } }, { + name: 'Dataset', type: DATASET, fields: [{ name: 'size', path: `${DR}size`, kind: 'number' }], }, @@ -164,6 +171,7 @@ describe('projectDocument', () => { it('projects a boolean field from a path (xsd:boolean lexical space)', () => { const withBoolean: SearchType = { + name: 'Dataset', type: DATASET, fields: [{ name: 'iiif', path: `${DR}iiif`, kind: 'boolean' }], }; @@ -186,6 +194,7 @@ describe('projectDocument', () => { const 
document = projectDocument( { '@id': 'https://ex/d/4', [`${DR}format`]: [`${IANA}text/turtle`] }, { + name: 'Dataset', type: DATASET, fields: [ { @@ -208,7 +217,7 @@ describe('projectDocument', () => { '@id': 'https://ex/d/2', [dcterms.title.value]: { '@language': 'nl', '@value': 'Solo' }, }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); expect(document.id).toBe('https://ex/d/2'); expect(document.title_search_nl).toBe('solo'); @@ -219,7 +228,7 @@ describe('projectDocument', () => { it('omits the sort field when there is no value to sort on', () => { const document = projectDocument( { '@id': 'https://ex/d/5' }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); expect(document.id).toBe('https://ex/d/5'); expect(document.title_sort_nl).toBeUndefined(); @@ -231,7 +240,7 @@ describe('projectDocument', () => { '@id': 'https://ex/d/6', [dcterms.title.value]: { '@language': 'fr', '@value': 'Bonjour' }, }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); // locales is ['nl', 'en'], so the French title is invisible — no display, // search or sort field is emitted for it. @@ -247,7 +256,7 @@ describe('projectDocument', () => { '@id': 'https://ex/d/7', [dcterms.title.value]: { '@value': 'Naamloos' }, }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); expect(document.title_nl).toBeUndefined(); expect(document.title_search_nl).toBeUndefined(); @@ -261,6 +270,7 @@ describe('projectDocument', () => { [dcterms.title.value]: { '@language': 'nl', '@value': 'Verhalen' }, }, { + name: 'Dataset', type: DATASET, fields: [ { @@ -290,7 +300,7 @@ describe('projectDocument', () => { { '@language': 'nl', '@value': 'Ondertitel' }, ], }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); // Display takes the first value; search folds them all so both are matchable. 
expect(document.title_nl).toBe('Titel'); @@ -304,6 +314,7 @@ describe('projectDocument', () => { [dcterms.title.value]: { '@language': 'nl', '@value': 'Titel' }, }, { + name: 'Dataset', type: DATASET, fields: [ { @@ -333,7 +344,7 @@ describe('projectDocument', () => { expect(() => projectDocument( { [dcterms.title.value]: { '@value': 'No id' } }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ), ).toThrow(/without an @id/); }); @@ -346,6 +357,7 @@ describe('projectDocument', () => { [dcterms.title.value]: { '@language': 'nl', '@value': 'Titel' }, }, { + name: 'Dataset', type: DATASET, fields: [ { @@ -376,7 +388,7 @@ describe('projectGraph', () => { const documents: SearchDocument[] = []; for await (const document of projectGraph( quads, - searchSchema({ type: DATASET, fields }), + searchSchema({ name: 'Dataset', type: DATASET, fields }), )) { documents.push(document); } diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts index 4877f68b..8df30b30 100644 --- a/packages/search/test/schema.test.ts +++ b/packages/search/test/schema.test.ts @@ -18,6 +18,7 @@ import { const DATASET = 'http://www.w3.org/ns/dcat#Dataset'; const schema: SearchType = { + name: 'Dataset', type: DATASET, fields: [ { @@ -197,6 +198,7 @@ describe('schema selectors', () => { ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, }; const withReference: SearchType = { + name: 'Dataset', type: DATASET, fields: [...schema.fields, publisher], }; From 9e9bdedef336806fbaf369255655dad79586e6d2 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 17:30:47 +0200 Subject: [PATCH 29/35] feat(search-typesense)!: deepen rebuild, explicit stemming locale, ignored-filter reporting - rebuild(client, searchType, documents, options) derives the collection schema internally (buildCollectionSchema); the logical index name is the explicit options.name; options exported as RebuildOptions - buildCollectionSchema no longer 
assumes Dutch: defaultLocale is a pure opt-in, and without it non-localized search fields stay folded but unstemmed, so no language is silently applied - buildSearchParams now skips a where clause whose operator does not match the field's kind (it previously reached the engine as garbage) and reports every skipped clause via the new onIgnoredFilter callback, also exposed on TypesenseSearchEngineOptions BREAKING CHANGE: rebuild takes a SearchType plus options.name instead of a prebuilt CollectionCreateSchema; buildCollectionSchema no longer defaults defaultLocale to 'nl'. --- packages/search-typesense/README.md | 19 +++++---- packages/search-typesense/src/adapter.ts | 41 +++++++++++++------ .../search-typesense/src/collection-schema.ts | 22 ++++++---- packages/search-typesense/src/index.ts | 1 + .../search-typesense/src/query-compiler.ts | 37 +++++++++++++++-- packages/search-typesense/src/search.ts | 8 ++++ .../search-typesense/test/adapter.test.ts | 41 ++++++++++++------- .../test/collection-schema.test.ts | 14 +++++++ .../test/generator-stability.test.ts | 1 + .../test/parse-response.test.ts | 2 + .../test/query-compiler.test.ts | 26 ++++++++++++ .../test/search-engine.test.ts | 1 + packages/search-typesense/vite.config.ts | 8 ++-- 13 files changed, 171 insertions(+), 50 deletions(-) diff --git a/packages/search-typesense/README.md b/packages/search-typesense/README.md index efffc145..d77ffcad 100644 --- a/packages/search-typesense/README.md +++ b/packages/search-typesense/README.md @@ -25,12 +25,13 @@ and `parseSearchResponse` are exported for direct use and testing. ## Indexing -`rebuild` blue/green-rebuilds a search index in one call: it creates a fresh -versioned collection (`${schema.name}_`), streams the documents into -it in batches, atomically repoints the `schema.name` alias to it, then drops the -collection it superseded. 
The caller passes only the logical index name (as -`schema.name`) and a stream of documents; the versioned collection and the alias -are managed for them. +`rebuild` blue/green-rebuilds a search index in one call, straight from the +declaration: it derives the collection schema from your `SearchType` (via +`buildCollectionSchema`), creates a fresh versioned collection +(`${name}_`), streams the documents into it in batches, atomically +repoints the `name` alias to it, then drops the collection it superseded. The +caller passes the `SearchType`, the logical index `name` and a stream of +documents; the versioned collection and the alias are managed for them. ```ts import { Client } from 'typesense'; @@ -44,9 +45,13 @@ const client = new Client({ // `documents` is an async iterable (e.g. a streaming projection); only one // batch is held in memory at a time. `rebuild` returns the live collection name // and the imported count (or `null` if another rebuild was already running). -const result = await rebuild(client, schema, documents); +const result = await rebuild(client, DATASET, documents, { name: 'datasets' }); ``` +The options accept everything `buildCollectionSchema` does (`defaultLocale`, +`defaultSortingField`, `synonymSets`) plus the rebuild knobs (`batchSize`, +`lockTtlMs`). + `rebuild` takes a `Client` the caller owns (and reuses for queries), so this package adds no connection or document type of its own – any object with an `id` is a valid document, including the `SearchDocument`s `@lde/search` produces. 
diff --git a/packages/search-typesense/src/adapter.ts b/packages/search-typesense/src/adapter.ts index ad3bfc9c..ee176aba 100644 --- a/packages/search-typesense/src/adapter.ts +++ b/packages/search-typesense/src/adapter.ts @@ -1,12 +1,29 @@ -import type { Client, CollectionCreateSchema, ImportResponse } from 'typesense'; +import type { Client, ImportResponse } from 'typesense'; +import type { SearchType } from '@lde/search'; +import { + buildCollectionSchema, + type CollectionSchemaOptions, +} from './collection-schema.js'; const LOCK_COLLECTION = 'rebuild_locks'; const DEFAULT_LOCK_TTL_MS = 10 * 60 * 1000; +/** {@link rebuild} options: the collection-schema options (`name` is the + * logical index name the alias is kept on) plus the rebuild tuning knobs. */ +export interface RebuildOptions extends CollectionSchemaOptions { + /** Documents imported per Typesense request (default 1000). */ + readonly batchSize?: number; + /** A held lock older than this (ms) is reclaimed (default 10 minutes). */ + readonly lockTtlMs?: number; +} + /** - * Blue/green-rebuild the search index `name`. + * Blue/green-rebuild the search index `options.name` from one declarative + * source: the collection schema is derived from `searchType` + * ({@link buildCollectionSchema}) and the documents are streamed in — one call + * from declaration to live index. * - * 1. create a fresh versioned collection (`${name}_`) from `schema` + * 1. create a fresh versioned collection (`${name}_`) * 2. stream `documents` into it in batches * 3. atomically repoint the `name` alias to the new collection, then * drop the collection it superseded. The caller passes only the logical @@ -40,17 +57,17 @@ const DEFAULT_LOCK_TTL_MS = 10 * 60 * 1000; */ export async function rebuild( client: Client, - schema: CollectionCreateSchema, + searchType: SearchType, documents: AsyncIterable, - options: { - /** Documents imported per Typesense request (default 1000). 
*/ - batchSize?: number; - /** A held lock older than this (ms) is reclaimed (default 10 minutes). */ - lockTtlMs?: number; - } = {}, + options: RebuildOptions, ): Promise<{ collection: string; imported: number } | null> { - const { batchSize = 1000, lockTtlMs = DEFAULT_LOCK_TTL_MS } = options; - const name = schema.name; + const { + batchSize = 1000, + lockTtlMs = DEFAULT_LOCK_TTL_MS, + ...schemaOptions + } = options; + const schema = buildCollectionSchema(searchType, schemaOptions); + const name = schemaOptions.name; if (!(await acquireLock(client, name, lockTtlMs))) { return null; } diff --git a/packages/search-typesense/src/collection-schema.ts b/packages/search-typesense/src/collection-schema.ts index af133b08..d0c1bf9e 100644 --- a/packages/search-typesense/src/collection-schema.ts +++ b/packages/search-typesense/src/collection-schema.ts @@ -6,8 +6,10 @@ import { physicalFields, type SearchField, type SearchType } from '@lde/search'; export interface CollectionSchemaOptions { /** The Typesense collection (or alias) name. */ readonly name: string; - /** Snowball stemming locale for non-localized searchable fields (default `nl`). - * Localized text search fields stem in their own locale. */ + /** Snowball stemming locale for non-localized searchable fields (e.g. `en`). + * Unset, those fields are not stemmed — folding still applies — so no + * language is ever assumed. Localized text search fields always stem in + * their own locale. */ readonly defaultLocale?: string; /** The field Typesense sorts by when a query imposes no order. */ readonly defaultSortingField?: string; @@ -22,15 +24,15 @@ export interface CollectionSchemaOptions { * ({@link physicalFields}); the Typesense field type is derived from the field * `kind`, never re-declared. * - * Stemming is enabled on every folded `*_search` field: localized text stems - * each `*_search_${locale}` in its own language, and a non-localized searchable - * field stems in `defaultLocale`. 
+ * Localized text stems each folded `*_search_${locale}` field in its own + * language; a non-localized searchable field stems in `defaultLocale` when one + * is set, and is left unstemmed (folded only) otherwise. */ export function buildCollectionSchema( searchType: SearchType, options: CollectionSchemaOptions, ): CollectionCreateSchema { - const defaultLocale = options.defaultLocale ?? 'nl'; + const { defaultLocale } = options; const collection: CollectionCreateSchema = { name: options.name, fields: searchType.fields.flatMap((field) => @@ -49,7 +51,7 @@ export function buildCollectionSchema( /** The physical Typesense fields one declaration produces. */ function typesenseFields( field: SearchField, - defaultLocale: string, + defaultLocale: string | undefined, defaultSortingField: string | undefined, ): CollectionFieldSchema[] { const names = physicalFields(field); @@ -105,8 +107,10 @@ function typesenseFields( name, type: valueType, optional: true, - stem: true, - locale: defaultLocale, + ...(defaultLocale !== undefined && { + stem: true, + locale: defaultLocale, + }), }); } } diff --git a/packages/search-typesense/src/index.ts b/packages/search-typesense/src/index.ts index 66247957..f56424e1 100644 --- a/packages/search-typesense/src/index.ts +++ b/packages/search-typesense/src/index.ts @@ -1,4 +1,5 @@ export { rebuild } from './adapter.js'; +export type { RebuildOptions } from './adapter.js'; export { buildCollectionSchema } from './collection-schema.js'; export type { CollectionSchemaOptions } from './collection-schema.js'; export { buildSearchParams } from './query-compiler.js'; diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts index ab46b179..c028c81c 100644 --- a/packages/search-typesense/src/query-compiler.ts +++ b/packages/search-typesense/src/query-compiler.ts @@ -2,6 +2,7 @@ import type { SearchParams } from 'typesense/lib/Typesense/Documents.js'; import { fold } from 
'@lde/text-normalization'; import { fieldNamed, + filterOperatorFor, isoToUnixSeconds, isRangeFacet, pageForOffset, @@ -31,6 +32,14 @@ export interface CompileOptions { * range count is still safe. */ readonly maxFacetValues?: number; + /** + * Called for each `where` clause that compiles to nothing and is therefore + * skipped: an unknown field, an operator that does not match the field’s + * kind ({@link filterOperatorFor}), an empty `in` list, or a `range` with no + * usable bound. Skipping keeps a malformed clause from reaching the engine + * as garbage; supply this to log it instead of losing it silently. + */ + readonly onIgnoredFilter?: (filter: Filter) => void; } export function buildSearchParams( @@ -43,7 +52,11 @@ export function buildSearchParams( ? fold(query.text) : undefined; const { names, weights } = queryFields(searchType, query.locale); - const filterBy = compileFilterBy(query.where, searchType); + const filterBy = compileFilterBy( + query.where, + searchType, + options.onIgnoredFilter, + ); const sortBy = query.orderBy .map((sort) => compileSort(sort, searchType, query.locale)) .join(','); @@ -135,13 +148,21 @@ function queryFields( return { names, weights }; } -/** AND-join the compiled `where` clauses; skips unknown fields and empty clauses. */ +/** AND-join the compiled `where` clauses; a clause that compiles to nothing is + * skipped and reported to `onIgnoredFilter`. 
*/ function compileFilterBy( where: readonly Filter[], searchType: SearchType, + onIgnoredFilter: ((filter: Filter) => void) | undefined, ): string { return where - .map((filter) => compileFilter(filter, searchType)) + .map((filter) => { + const clause = compileFilter(filter, searchType); + if (clause === undefined) { + onIgnoredFilter?.(filter); + } + return clause; + }) .filter((clause): clause is string => clause !== undefined) .join(' && '); } @@ -154,6 +175,11 @@ function compileFilter( if (field === undefined) { return undefined; } + // A clause whose operator does not match the field's kind (e.g. `range` on a + // keyword) would reach the engine as garbage syntax — skip it instead. + if (filterOperatorFor(field.kind) !== filterOperator(filter)) { + return undefined; + } if ('in' in filter) { return filter.in.length > 0 ? compileMembership(field, filter.in) @@ -165,6 +191,11 @@ function compileFilter( return `${field.name}:=${filter.is}`; } +/** The operator a {@link Filter} value carries, from its discriminating key. */ +function filterOperator(filter: Filter): 'in' | 'range' | 'is' { + return 'in' in filter ? 'in' : 'range' in filter ? 'range' : 'is'; +} + /** * A membership clause. A non-facet (tokenized) field uses the exact `:=` * operator so an IRI cannot partial-match on a shared path segment. diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts index 4c9eaf72..b221b94b 100644 --- a/packages/search-typesense/src/search.ts +++ b/packages/search-typesense/src/search.ts @@ -6,6 +6,7 @@ import { physicalFields, referenceFields, type FacetBucket, + type Filter, type LocalizedValue, type Reference, type ResultDocument, @@ -36,6 +37,12 @@ export interface TypesenseSearchEngineOptions { * id-only references rather than failing. Optional — omit to swallow silently. 
*/ readonly onLabelError?: (error: unknown) => void; + /** + * Called for each `where` clause the query compiler skips instead of sending + * to the engine (unknown field, operator not matching the field’s kind, empty + * `in` list or `range` bounds). Optional — omit to swallow silently. + */ + readonly onIgnoredFilter?: (filter: Filter) => void; /** * Opt-in in-memory label cache. When set (and {@link labelsCollection} is * set), the FULL sidecar `labels` collection is loaded once via the documents @@ -98,6 +105,7 @@ export function createTypesenseSearchEngine( ): Promise { const params = buildSearchParams(query, searchType, { maxFacetValues: options.maxFacetValues, + onIgnoredFilter: options.onIgnoredFilter, }); // Cached path: the once-loaded full collection serves labels by in-memory // lookup (no per-search round-trip). The load does not depend on the diff --git a/packages/search-typesense/test/adapter.test.ts b/packages/search-typesense/test/adapter.test.ts index 9dc20d41..ceb50d00 100644 --- a/packages/search-typesense/test/adapter.test.ts +++ b/packages/search-typesense/test/adapter.test.ts @@ -1,16 +1,18 @@ import { afterAll, beforeAll, beforeEach, describe, expect, it } from 'vitest'; -import type { Client, CollectionCreateSchema } from 'typesense'; +import type { Client } from 'typesense'; +import type { SearchType } from '@lde/search'; import { rebuild } from '../src/adapter.js'; import { TypesenseContainer } from './typesense-container.js'; const NAME = 'datasets'; const LOCK_COLLECTION = 'rebuild_locks'; -const schema: CollectionCreateSchema = { - name: NAME, +const datasetType: SearchType = { + name: 'Dataset', + type: 'https://example.org/Dataset', fields: [ - { name: 'title', type: 'string' }, - { name: 'year', type: 'int32' }, + { name: 'title', kind: 'keyword' }, + { name: 'year', kind: 'integer' }, ], }; @@ -70,8 +72,9 @@ describe('search-typesense', () => { it('publishes a versioned collection and points the index alias at it', async () => { 
const result = await rebuild( client, - schema, + datasetType, stream([{ id: 'a', title: 'Verhaal van Utrecht', year: 2024 }]), + { name: NAME }, ); expect(result?.imported).toBe(1); @@ -87,13 +90,15 @@ describe('search-typesense', () => { it('swaps the alias to a new collection and drops the previous one', async () => { const first = await rebuild( client, - schema, + datasetType, stream([{ id: 'a', title: 'Old', year: 2023 }]), + { name: NAME }, ); const second = await rebuild( client, - schema, + datasetType, stream([{ id: 'a', title: 'New', year: 2024 }]), + { name: NAME }, ); expect(second?.collection).not.toBe(first?.collection); @@ -110,7 +115,8 @@ describe('search-typesense', () => { year: 2024, })); - const result = await rebuild(client, schema, stream(documents), { + const result = await rebuild(client, datasetType, stream(documents), { + name: NAME, batchSize: 2, }); @@ -123,8 +129,9 @@ describe('search-typesense', () => { const result = await rebuild( client, - schema, + datasetType, stream([{ id: 'a', title: 'A', year: 2024 }]), + { name: NAME }, ); expect(result).toBeNull(); @@ -136,9 +143,9 @@ describe('search-typesense', () => { const result = await rebuild( client, - schema, + datasetType, stream([{ id: 'a', title: 'A', year: 2024 }]), - { lockTtlMs: 1_000 }, + { name: NAME, lockTtlMs: 1_000 }, ); expect(result?.imported).toBe(1); @@ -148,8 +155,9 @@ describe('search-typesense', () => { it('leaves the live alias intact and drops the orphan when a build fails', async () => { await rebuild( client, - schema, + datasetType, stream([{ id: 'a', title: 'Live', year: 2024 }]), + { name: NAME }, ); const live = await aliasTarget(client); const collectionCount = (await client.collections().retrieve()).length; @@ -158,8 +166,9 @@ describe('search-typesense', () => { await expect( rebuild( client, - schema, + datasetType, stream([{ id: 'bad', title: 't', year: 'nope' }]), + { name: NAME }, ), ).rejects.toThrow(/failed/i); @@ -171,7 +180,9 @@ 
describe('search-typesense', () => { }); it('publishes an empty collection for an empty source', async () => { - const result = await rebuild(client, schema, stream([])); + const result = await rebuild(client, datasetType, stream([]), { + name: NAME, + }); expect(result?.imported).toBe(0); expect((await client.collections(NAME).retrieve()).num_documents).toBe(0); diff --git a/packages/search-typesense/test/collection-schema.test.ts b/packages/search-typesense/test/collection-schema.test.ts index 49711c1e..11f15cac 100644 --- a/packages/search-typesense/test/collection-schema.test.ts +++ b/packages/search-typesense/test/collection-schema.test.ts @@ -3,6 +3,7 @@ import type { SearchType } from '@lde/search'; import { buildCollectionSchema } from '../src/collection-schema.js'; const schema: SearchType = { + name: 'Dataset', type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { @@ -189,4 +190,17 @@ describe('buildCollectionSchema', () => { locale: 'nl', }); }); + + it('assumes no language: without defaultLocale the companion is folded but unstemmed', () => { + const withoutLocale = buildCollectionSchema(schema, { name: 'datasets' }); + expect(withoutLocale.fields).toContainEqual({ + name: 'keyword_search', + type: 'string[]', + optional: true, + }); + // Localized text still stems per locale — that never depended on the default. + expect(withoutLocale.fields).toContainEqual( + expect.objectContaining({ name: 'title_search_nl', locale: 'nl' }), + ); + }); }); diff --git a/packages/search-typesense/test/generator-stability.test.ts b/packages/search-typesense/test/generator-stability.test.ts index 9b93d134..8404c545 100644 --- a/packages/search-typesense/test/generator-stability.test.ts +++ b/packages/search-typesense/test/generator-stability.test.ts @@ -10,6 +10,7 @@ import { buildCollectionSchema } from '../src/collection-schema.js'; * surfaces as a snapshot diff before this library is published. 
*/ const THING: SearchType = { + name: 'Thing', type: 'https://example.org/Thing', fields: [ { diff --git a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts index 3b7dd96f..397c8e75 100644 --- a/packages/search-typesense/test/parse-response.test.ts +++ b/packages/search-typesense/test/parse-response.test.ts @@ -8,6 +8,7 @@ import { } from '../src/search.js'; const schema: SearchType = { + name: 'Dataset', type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { @@ -151,6 +152,7 @@ describe('parseSearchResponse', () => { describe('parseSearchResponse range facets', () => { const rangeSchema: SearchType = { + name: 'Dataset', type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts index 6d71efca..18d8e70b 100644 --- a/packages/search-typesense/test/query-compiler.test.ts +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -3,6 +3,7 @@ import type { SearchQuery, SearchType } from '@lde/search'; import { buildSearchParams } from '../src/query-compiler.js'; const schema: SearchType = { + name: 'Dataset', type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { @@ -115,6 +116,31 @@ describe('buildSearchParams', () => { ); }); + it('skips a clause that compiles to nothing and reports it via onIgnoredFilter', () => { + const ignored: unknown[] = []; + const params = buildSearchParams( + { + ...base, + where: [ + { field: 'status', in: ['valid'] }, // fine — kept + { field: 'nonexistent', in: ['x'] }, // unknown field + { field: 'keyword', range: { min: 1 } }, // operator ≠ field kind + { field: 'status', in: [] }, // empty membership + { field: 'size', range: {} }, // no usable bound + ], + }, + schema, + { onIgnoredFilter: (filter) => ignored.push(filter) }, + ); + expect(params.filter_by).toBe('status:[`valid`]'); + expect(ignored).toEqual([ + { field: 'nonexistent', in: ['x'] }, + { 
field: 'keyword', range: { min: 1 } }, + { field: 'status', in: [] }, + { field: 'size', range: {} }, + ]); + }); + it('compiles a one-sided range bound', () => { expect( buildSearchParams( diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts index 32f94a59..3c8302c3 100644 --- a/packages/search-typesense/test/search-engine.test.ts +++ b/packages/search-typesense/test/search-engine.test.ts @@ -6,6 +6,7 @@ import { createTypesenseSearchEngine } from '../src/search.js'; import { TypesenseContainer } from './typesense-container.js'; const datasetSchema: SearchType = { + name: 'Dataset', type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search-typesense/vite.config.ts b/packages/search-typesense/vite.config.ts index 71f80bbd..90d28c53 100644 --- a/packages/search-typesense/vite.config.ts +++ b/packages/search-typesense/vite.config.ts @@ -16,10 +16,10 @@ export default mergeConfig( // rethrow guards and best-effort cleanup paths are deliberately not // exercised, which is why branch coverage is lower. 
thresholds: { - functions: 96.87, - lines: 93.49, - branches: 84.05, - statements: 93.55, + functions: 96.92, + lines: 94.31, + branches: 86.91, + statements: 94.37, }, }, }, From 03cc0a38849ab7d02cb22860415e41a656866f73 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 19:28:47 +0200 Subject: [PATCH 30/35] feat(search)!: always validate queries at the engine port - add validateQuery/assertValidQuery to the core: structural validation of where (declared, filterable, operator matches kind), facets (declared, facetable) and orderBy (declared or relevance) against the SearchType; vacuous clauses (empty in, boundless range) are no-ops, not issues - the port contract now requires every adapter to reject a structurally invalid query; the Typesense engine enforces it on every search, so validation holds for every caller (queryDefaults policies, in-process callers, weaker-typed surfaces), not only GraphQL-validated input - onIgnoredFilter consequently narrows to vacuous clauses at the engine level; share filterOperator from the core instead of a compiler copy BREAKING CHANGE: TypesenseSearchEngine.search now throws on a structurally invalid query instead of silently dropping the offending clauses. 
--- packages/search-typesense/README.md | 12 +- .../search-typesense/src/query-compiler.ts | 6 +- packages/search-typesense/src/search.ts | 11 +- .../test/query-compiler.test.ts | 8 ++ .../test/search-engine.test.ts | 28 +++++ packages/search-typesense/vite.config.ts | 6 +- packages/search/README.md | 8 ++ packages/search/src/engine.ts | 6 + packages/search/src/index.ts | 19 +++- packages/search/src/query.ts | 103 +++++++++++++++++- packages/search/test/query.test.ts | 103 +++++++++++++++++- packages/search/vite.config.ts | 6 +- 12 files changed, 293 insertions(+), 23 deletions(-) diff --git a/packages/search-typesense/README.md b/packages/search-typesense/README.md index d77ffcad..191d5dca 100644 --- a/packages/search-typesense/README.md +++ b/packages/search-typesense/README.md @@ -17,11 +17,13 @@ search/sort keys) matches what the projection writes, via `@lde/search`’s `physicalFields`, so the index and the documents cannot drift. `createTypesenseSearchEngine(client, { collection, labelsCollection })` is the -`SearchEngine` implementation: it compiles the query, runs the search, resolves -reference (and reference-facet) labels from the sidecar `labels` collection in a -single lookup, and reconstructs the logical `SearchResult` — language maps, -labelled references, labelled facet buckets. The pure halves `buildSearchParams` -and `parseSearchResponse` are exported for direct use and testing. +`SearchEngine` implementation: it validates the query against the search type +(the port contract — a structurally invalid query is rejected, never sent), +compiles it, runs the search, resolves reference (and reference-facet) labels +from the sidecar `labels` collection in a single lookup, and reconstructs the +logical `SearchResult` — language maps, labelled references, labelled facet +buckets. The pure halves `buildSearchParams` and `parseSearchResponse` are +exported for direct use and testing. 
## Indexing diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts index c028c81c..9c4ef7e6 100644 --- a/packages/search-typesense/src/query-compiler.ts +++ b/packages/search-typesense/src/query-compiler.ts @@ -2,6 +2,7 @@ import type { SearchParams } from 'typesense/lib/Typesense/Documents.js'; import { fold } from '@lde/text-normalization'; import { fieldNamed, + filterOperator, filterOperatorFor, isoToUnixSeconds, isRangeFacet, @@ -191,11 +192,6 @@ function compileFilter( return `${field.name}:=${filter.is}`; } -/** The operator a {@link Filter} value carries, from its discriminating key. */ -function filterOperator(filter: Filter): 'in' | 'range' | 'is' { - return 'in' in filter ? 'in' : 'range' in filter ? 'range' : 'is'; -} - /** * A membership clause. A non-facet (tokenized) field uses the exact `:=` * operator so an IRI cannot partial-match on a shared path segment. diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts index b221b94b..73430409 100644 --- a/packages/search-typesense/src/search.ts +++ b/packages/search-typesense/src/search.ts @@ -1,5 +1,6 @@ import type { Client } from 'typesense'; import { + assertValidQuery, fieldNamed, isRangeFacet, outputFields, @@ -38,9 +39,10 @@ export interface TypesenseSearchEngineOptions { */ readonly onLabelError?: (error: unknown) => void; /** - * Called for each `where` clause the query compiler skips instead of sending - * to the engine (unknown field, operator not matching the field’s kind, empty - * `in` list or `range` bounds). Optional — omit to swallow silently. + * Called for each vacuous `where` clause the query compiler skips as a no-op + * (an empty `in` list, a `range` with no usable bound). Structurally invalid + * queries never get this far — the engine rejects them up front + * (`assertValidQuery`). Optional — omit to swallow silently. 
*/ readonly onIgnoredFilter?: (filter: Filter) => void; /** @@ -103,6 +105,9 @@ export function createTypesenseSearchEngine( query: SearchQuery, searchType: SearchType, ): Promise { + // The port contract: a structurally invalid query (unknown field, wrong + // operator, unknown facet) is rejected up front, for EVERY caller. + assertValidQuery(query, searchType); const params = buildSearchParams(query, searchType, { maxFacetValues: options.maxFacetValues, onIgnoredFilter: options.onIgnoredFilter, diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts index 18d8e70b..b681551a 100644 --- a/packages/search-typesense/test/query-compiler.test.ts +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -141,6 +141,14 @@ describe('buildSearchParams', () => { ]); }); + it('skips a non-compiling clause silently when no onIgnoredFilter is given', () => { + const params = buildSearchParams( + { ...base, where: [{ field: 'nonexistent', in: ['x'] }] }, + schema, + ); + expect(params.filter_by).toBeUndefined(); + }); + it('compiles a one-sided range bound', () => { expect( buildSearchParams( diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts index 3c8302c3..519ee774 100644 --- a/packages/search-typesense/test/search-engine.test.ts +++ b/packages/search-typesense/test/search-engine.test.ts @@ -224,4 +224,32 @@ describe('createTypesenseSearchEngine (integration)', () => { }, ]); }); + + it('always rejects a structurally invalid query, before reaching the engine', async () => { + await expect( + engine.search( + { ...baseQuery, where: [{ field: 'nonexistent', in: ['x'] }] }, + datasetSchema, + ), + ).rejects.toThrow(/Invalid search query for “Dataset”/); + await expect( + engine.search({ ...baseQuery, facets: ['title'] }, datasetSchema), + ).rejects.toThrow(/not-facetable/); + }); + + it('reports a vacuous where clause via onIgnoredFilter 
and still searches', async () => { + const ignored: unknown[] = []; + const reporting = createTypesenseSearchEngine(client, { + collection: 'datasets', + onIgnoredFilter: (filter) => ignored.push(filter), + }); + + const result = await reporting.search( + { ...baseQuery, where: [{ field: 'status', in: [] }] }, + datasetSchema, + ); + + expect(result.total).toBeGreaterThan(0); // empty membership = no constraint + expect(ignored).toEqual([{ field: 'status', in: [] }]); + }); }); diff --git a/packages/search-typesense/vite.config.ts b/packages/search-typesense/vite.config.ts index 90d28c53..b65006b3 100644 --- a/packages/search-typesense/vite.config.ts +++ b/packages/search-typesense/vite.config.ts @@ -16,9 +16,11 @@ export default mergeConfig( // rethrow guards and best-effort cleanup paths are deliberately not // exercised, which is why branch coverage is lower. thresholds: { - functions: 96.92, + // functions dipped a hair when the shared `filterOperator` helper + // moved to @lde/search (one fewer covered function in this package). + functions: 96.87, lines: 94.31, - branches: 86.91, + branches: 87.14, statements: 94.37, }, }, diff --git a/packages/search/README.md b/packages/search/README.md index e6ba9ed5..543d0e5e 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -191,6 +191,14 @@ holds for **any** consumer, including an API built on this package — which is engine adapters and surfaces compile through the shared `SearchQuery` IR and the `physicalFields` convention rather than re-deriving field names. +Queries are **always validated**: the port contract requires every engine +adapter to reject a structurally invalid `SearchQuery` (`assertValidQuery`) — +unknown or non-`filterable` fields in `where`, an operator not matching the +field’s kind, non-`facetable` facet requests — no matter which surface or +policy produced it. 
A typed surface like GraphQL makes most of these +unrepresentable; the port enforces them for everyone else (deployment +`queryDefaults`, in-process callers, weaker-typed surfaces). + ## Typed results The `SearchEngine` port is loosely typed by default: facet and document keys diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts index 913ecfc5..65c35c70 100644 --- a/packages/search/src/engine.ts +++ b/packages/search/src/engine.ts @@ -10,6 +10,12 @@ import type { SearchType } from './schema.js'; * any consumer noticing. * Nothing engine-specific and nothing RDF-specific leaks past this port. * + * Port contract: an adapter ALWAYS validates the incoming query against the + * search type (`assertValidQuery`) and rejects a structurally invalid one — + * unknown or non-filterable fields, mismatched operators, unknown facets — + * rather than passing garbage to its engine. Validation is not the caller’s + * job: it must hold for every surface and for injected deployment policy. + * * `FacetField` keys the returned facet map; it defaults to `string` so an engine * stays ergonomic, and a deployment can narrow it to its own facet-field union * (see {@link FacetFieldsOf}) for typo-safe facet access. `Type` narrows the diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index 633de8a0..b64d4432 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -32,9 +32,22 @@ export type { FacetRange, } from './schema.js'; -// Engine- and protocol-neutral query IR + filter semantics. -export { filterOperatorFor, pageForOffset } from './query.js'; -export type { SearchQuery, Filter, Sort, FilterOperator } from './query.js'; +// Engine- and protocol-neutral query IR + filter semantics, and the always-on +// structural query validation every engine adapter enforces. 
+export { + filterOperatorFor, + filterOperator, + validateQuery, + assertValidQuery, + pageForOffset, +} from './query.js'; +export type { + SearchQuery, + Filter, + Sort, + FilterOperator, + QueryIssue, +} from './query.js'; // Engine port + the logical result document returned across it. export { engineFor } from './engine.js'; diff --git a/packages/search/src/query.ts b/packages/search/src/query.ts index b98f5577..f626157d 100644 --- a/packages/search/src/query.ts +++ b/packages/search/src/query.ts @@ -1,4 +1,4 @@ -import type { FieldKind } from './schema.js'; +import { fieldNamed, type FieldKind, type SearchType } from './schema.js'; /** * The engine- and protocol-neutral query IR. Every API surface compiles its @@ -71,6 +71,107 @@ export function filterOperatorFor(kind: FieldKind): FilterOperator | undefined { return OPERATOR_BY_KIND[kind]; } +/** The operator a {@link Filter} value carries, from its discriminating key. */ +export function filterOperator(filter: Filter): FilterOperator { + return 'in' in filter ? 'in' : 'range' in filter ? 'range' : 'is'; +} + +/** + * One structural problem {@link validateQuery} found: the query references a + * field the search type does not declare, or uses it in a role it does not + * opt into. Vacuous-but-valid clauses (an empty `in` list, a `range` with no + * bound) are NOT issues — a compiler skips those as no-ops. + */ +export interface QueryIssue { + readonly part: 'where' | 'facets' | 'orderBy'; + readonly field: string; + readonly reason: + | 'unknown-field' + | 'not-filterable' + | 'operator-mismatch' + | 'not-facetable'; +} + +/** + * Structurally validate a query against its search type: every `where` clause + * targets a declared, `filterable` field with the operator its kind accepts + * ({@link filterOperatorFor}); every requested facet is a declared, `facetable` + * field; every sort is `relevance` or a declared field. 
Sorting deliberately + * checks declaration only, not the `sortable` flag: that flag means *publicly + * selectable*, and a deployment policy may sort on a private tie-break field. + * + * This is the port’s always-on guard: every {@link SearchEngine} adapter MUST + * reject a query with issues ({@link assertValidQuery}) instead of passing + * garbage to its engine, so validation holds for every caller — including + * `queryDefaults` policies and surfaces weaker than GraphQL. + */ +export function validateQuery( + query: SearchQuery, + searchType: SearchType, +): readonly QueryIssue[] { + const issues: QueryIssue[] = []; + for (const filter of query.where) { + const field = fieldNamed(searchType, filter.field); + if (field === undefined) { + issues.push({ + part: 'where', + field: filter.field, + reason: 'unknown-field', + }); + } else if (field.filterable !== true) { + issues.push({ + part: 'where', + field: filter.field, + reason: 'not-filterable', + }); + } else if (filterOperatorFor(field.kind) !== filterOperator(filter)) { + issues.push({ + part: 'where', + field: filter.field, + reason: 'operator-mismatch', + }); + } + } + for (const name of query.facets) { + const field = fieldNamed(searchType, name); + if (field === undefined) { + issues.push({ part: 'facets', field: name, reason: 'unknown-field' }); + } else if (field.facetable !== true) { + issues.push({ part: 'facets', field: name, reason: 'not-facetable' }); + } + } + for (const sort of query.orderBy) { + if ( + sort.field !== 'relevance' && + fieldNamed(searchType, sort.field) === undefined + ) { + issues.push({ + part: 'orderBy', + field: sort.field, + reason: 'unknown-field', + }); + } + } + return issues; +} + +/** Throw when the query has any structural issue ({@link validateQuery}), + * naming every issue. The always-on entry point for engine adapters.
*/ +export function assertValidQuery( + query: SearchQuery, + searchType: SearchType, +): void { + const issues = validateQuery(query, searchType); + if (issues.length > 0) { + const detail = issues + .map((issue) => `${issue.part}: “${issue.field}” (${issue.reason})`) + .join(', '); + throw new Error( + `Invalid search query for “${searchType.name}”: ${detail}.`, + ); + } +} + /** * The 1-based page an `offset` falls on — the numbered-pagination presentation * of the IR, shared by the surfaces and the adapters. `limit: 0` (a facet-only diff --git a/packages/search/test/query.test.ts b/packages/search/test/query.test.ts index 6de08b5c..d407c046 100644 --- a/packages/search/test/query.test.ts +++ b/packages/search/test/query.test.ts @@ -1,5 +1,12 @@ import { describe, expect, it } from 'vitest'; -import { filterOperatorFor, pageForOffset } from '../src/query.js'; +import { + assertValidQuery, + filterOperatorFor, + pageForOffset, + validateQuery, + type SearchQuery, +} from '../src/query.js'; +import type { SearchType } from '../src/schema.js'; describe('filterOperatorFor', () => { it('maps each field kind to its `where` operator', () => { @@ -13,6 +20,100 @@ describe('filterOperatorFor', () => { }); }); +describe('validateQuery', () => { + const searchType: SearchType = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { name: 'size', kind: 'integer', filterable: true }, + { name: 'license', kind: 'keyword' }, // declared, but no roles opted into + { name: 'statusRank', kind: 'integer', sortable: true }, + ], + }; + const base: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + it('accepts a structurally valid query', () => { + expect( + validateQuery( + { + ...base, + where: [ + { field: 'status', in: ['valid'] }, + { field: 'size', range: { min: 1 } }, + ], + facets: ['status'], + orderBy: [ + { field: 
'relevance', direction: 'desc' }, + // Declared but not `sortable`: allowed — `sortable` means publicly + // selectable, and deployment policy may sort on a private tie-break. + { field: 'statusRank', direction: 'asc' }, + ], + }, + searchType, + ), + ).toEqual([]); + }); + + it('accepts vacuous clauses: they are no-ops, not structural issues', () => { + expect( + validateQuery( + { + ...base, + where: [ + { field: 'status', in: [] }, + { field: 'size', range: {} }, + ], + }, + searchType, + ), + ).toEqual([]); + }); + + it('flags every structurally invalid part', () => { + const issues = validateQuery( + { + ...base, + where: [ + { field: 'nonexistent', in: ['x'] }, + { field: 'license', in: ['MIT'] }, + { field: 'status', range: { min: 1 } }, + ], + facets: ['nonexistent', 'size'], + orderBy: [{ field: 'nonexistent', direction: 'asc' }], + }, + searchType, + ); + expect(issues).toEqual([ + { part: 'where', field: 'nonexistent', reason: 'unknown-field' }, + { part: 'where', field: 'license', reason: 'not-filterable' }, + { part: 'where', field: 'status', reason: 'operator-mismatch' }, + { part: 'facets', field: 'nonexistent', reason: 'unknown-field' }, + { part: 'facets', field: 'size', reason: 'not-facetable' }, + { part: 'orderBy', field: 'nonexistent', reason: 'unknown-field' }, + ]); + }); + + it('assertValidQuery names the type and every issue', () => { + expect(() => + assertValidQuery( + { ...base, where: [{ field: 'nonexistent', in: ['x'] }] }, + searchType, + ), + ).toThrow( + 'Invalid search query for “Dataset”: where: “nonexistent” (unknown-field).', + ); + expect(() => assertValidQuery(base, searchType)).not.toThrow(); + }); +}); + describe('pageForOffset', () => { it('maps an offset to its 1-based page', () => { expect(pageForOffset(0, 20)).toBe(1); diff --git a/packages/search/vite.config.ts b/packages/search/vite.config.ts index a7e51876..a7c148c9 100644 --- a/packages/search/vite.config.ts +++ b/packages/search/vite.config.ts @@ -11,9 +11,9 @@ 
export default mergeConfig( coverage: { thresholds: { functions: 100, - lines: 97.9, - branches: 91.8, - statements: 98, + lines: 98.21, + branches: 92.25, + statements: 98.28, }, }, }, From 7c0cda98a910ed229e73d42fc797073f8f48f8e3 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 19:29:08 +0200 Subject: [PATCH 31/35] docs(search-api-graphql): sharpen the README - link GraphQLSchema to graphql-js and SearchSchema to its definition in the @lde/search terminology table - describe the resolver precisely: one shared implementation, one instance per root field bound to its SearchType - consolidate the no-drift story into @lde/search (the family entry point); keep only the surface-specific frozen-contract guard here - explain the SDL snapshot guard with a code sample and the accept-a-diff workflow --- packages/search-api-graphql/README.md | 48 +++++++++++++++++---------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md index 336ace60..337207ca 100644 --- a/packages/search-api-graphql/README.md +++ b/packages/search-api-graphql/README.md @@ -1,10 +1,13 @@ # @lde/search-api-graphql The GraphQL surface for the [`@lde/search`](../search) core. **Both engine- and -domain-agnostic:** it builds an executable `GraphQLSchema` from your whole -`SearchSchema` at runtime — one root query field per `SearchType`, each -searchable in its own way — served by one generic resolver per root field over -any `SearchEngine`. It names neither your **domain** (each type’s GraphQL name +domain-agnostic:** it builds an executable +[graphql-js](https://graphql.org/graphql-js/) `GraphQLSchema` from your whole +[`SearchSchema`](../search/README.md#terminology) at runtime — one root query +field per `SearchType`, each searchable in its own way. 
All root fields are +served by the same resolver implementation (no per-type code, no codegen); +each root field gets its own instance of it, bound to that field’s +`SearchType`, over any `SearchEngine`. It names neither your **domain** (each type’s GraphQL name is the `SearchType`’s own logical `name` — `Dataset`, `Person`, `CreativeWork`, …) nor your **engine** (the resolver calls `context.engine`, be it [`@lde/search-typesense`](../search-typesense) or another adapter). @@ -13,7 +16,7 @@ is the `SearchType`’s own logical `name` — `Dataset`, `Person`, `CreativeWor `buildGraphQLSchema(schema)` constructs the GraphQL schema once at startup from the field model — no SDL artifact, no generated resolver stubs. The field model -is the single source; the GraphQL contract is whatever it produces. Type names +is the single source; the GraphQL contract is derived from it. Type names come from each `SearchType`’s `name`; output types, the `where`/`orderBy`/facet inputs, reference types and nullability are all derived from each field’s `kind` and capability flags. The common case needs no options at all: @@ -66,9 +69,6 @@ projectGraph(quads, searchSchema(DATASET, PERSON, INTERNAL)); const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON)); ``` -Hiding a type is then a decision readable at the call site — the schema you -build the API from _is_ the list of what it serves. - ## What it builds (per root type) - **Output type** (the `SearchType`’s `name`): localized text → best-first `[LanguageString!]!` @@ -84,13 +84,27 @@ build the API from _is_ the list of what it serves. `null` for token/free-string facets whose display the consumer owns (its own i18n, or the value itself). -## Why it can’t drift +## Guarding the contract + +Why the API, the index and a future REST surface cannot drift apart is the +search family’s overall approach — one field model, one query IR — described +in [`@lde/search`](../search/README.md). 
Specific to this surface: the GraphQL +contract is **frozen** (breaking to change), yet generated rather than +handwritten, so nothing in the repo shows a contract change as a reviewable +diff. A _consumer_ restores that with one snapshot test over its **own** +search schema: + +```ts +import { printGraphQLSchema } from '@lde/search-api-graphql'; + +it('keeps the public GraphQL contract stable', () => { + expect(printGraphQLSchema(searchSchema(DATASET, PERSON))).toMatchSnapshot(); +}); +``` -The surface reads the same field model the index is built from, and compiles into -the same neutral `SearchQuery` the engine consumes — so the API, the index and a -future REST surface stay in lockstep. The contract is **frozen** (breaking to -change), and because it is generated rather than handwritten, a _consumer_ guards -it with a `printGraphQLSchema(schema, options)` SDL snapshot over its **own** -search schema and type names — that snapshot also catches a `buildGraphQLSchema` -change in a future version of this library silently altering the consumer’s -contract. +The first run writes the emitted SDL to a committed snapshot file; every later +run re-emits and diffs against it. Any contract change — your own schema edit, +or a new version of this library emitting different GraphQL for the same +declaration — fails the test and shows the SDL diff, until you consciously +accept it (`vitest -u`) and the reviewer sees the contract change spelled out +in the PR. 
From 02e573dbd301e1c066d11ea4f41910946cd23776 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 19:38:45 +0200 Subject: [PATCH 32/35] feat(search)!: order engineFor parameters value-first - one convention across the family: a function takes the value it operates on first and the SearchType right after (search(query, type), projectDocument(node, type)); engineFor(engine, type) now complies, and the README states the rule BREAKING CHANGE: engineFor's parameters swapped from (searchType, engine) to (engine, searchType). --- docs/decisions/0003-search-api-core-query-model.md | 2 +- packages/search/README.md | 7 ++++++- packages/search/src/engine.ts | 6 +++++- packages/search/test/engine.test.ts | 2 +- packages/search/vite.config.ts | 4 ++-- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 642d458e..6b59130d 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -200,7 +200,7 @@ SearchEngine` readable. // FacetField / OutputField default to `string` (ergonomic) and a deployment narrows them // to its type’s facetable / output field names for typo-safe facet and document access; // Type narrows the accepted searchType argument alongside, so a narrowed engine cannot be -// handed the wrong search type. The ergonomic route is engineFor(searchType, engine) over +// handed the wrong search type. The ergonomic route is engineFor(engine, searchType) over // a defineSearchType declaration (helpers FacetFieldsOf / OutputFieldsOf and // the EngineFor alias are exported for hand-written signatures). 
interface SearchEngine< diff --git a/packages/search/README.md b/packages/search/README.md index 543d0e5e..c728307d 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -67,6 +67,11 @@ and GraphQL (one of the surfaces): `projectGraph` and the GraphQL surface consume a `SearchSchema` (projecting every type in one pass); the engine port executes one `SearchType` at a time. +One parameter-order convention holds across the whole family: a function takes +the value it operates on first and the `SearchType` declaration right after it +— `search(query, type)`, `projectDocument(node, type)`, +`engineFor(engine, type)`, `buildSearchParams(query, type)`. + ## Field model The mapping is data, not code. Each field declares its `kind`, the IR `path` to @@ -210,7 +215,7 @@ should narrow the engine with `engineFor` — same instance, zero runtime cost: ```ts import { engineFor } from '@lde/search'; -const datasetEngine = engineFor(DATASET, engine); +const datasetEngine = engineFor(engine, DATASET); const result = await datasetEngine.search(query, DATASET); result.facets.publisher; // typed: only DATASET’s facetable fields are keys diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts index 65c35c70..6e55810c 100644 --- a/packages/search/src/engine.ts +++ b/packages/search/src/engine.ts @@ -90,11 +90,15 @@ export type EngineFor = SearchEngine< * as a literal, so facet and document keys come out typo-safe without the * caller writing any generics. Identity at runtime: the same engine instance * is returned, only its type changes. + * + * Parameter order follows the family-wide convention: the value being + * operated on first, its `SearchType` right after. 
*/ export function engineFor( - searchType: Type, engine: SearchEngine, + searchType: Type, ): EngineFor { + void searchType; // exists only to infer `Type`; the engine is returned as-is return engine; } diff --git a/packages/search/test/engine.test.ts b/packages/search/test/engine.test.ts index 96fbbb21..7a8df1c0 100644 --- a/packages/search/test/engine.test.ts +++ b/packages/search/test/engine.test.ts @@ -137,8 +137,8 @@ describe('typed facet and document keys', () => { // `engineFor` narrows a generic adapter (plain `SearchEngine`) to any // `EngineFor` — the same instance, identity at runtime. const engine: EngineFor = engineFor( - datasetSchema, fake, + datasetSchema, ); expect(engine).toBe(fake); diff --git a/packages/search/vite.config.ts b/packages/search/vite.config.ts index a7c148c9..30a36186 100644 --- a/packages/search/vite.config.ts +++ b/packages/search/vite.config.ts @@ -11,9 +11,9 @@ export default mergeConfig( coverage: { thresholds: { functions: 100, - lines: 98.21, + lines: 98.22, branches: 92.25, - statements: 98.28, + statements: 98.29, }, }, }, From 0d5b488a34f2e28f6c1d95f5f59bb551092f5e5c Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 19:39:05 +0200 Subject: [PATCH 33/35] feat(search-typesense)!: unify the compiler options and rebuild's parameter order - export the query-compiler options as BuildSearchParamsOptions (they were public-by-signature but unnameable) and have TypesenseSearchEngineOptions extend them, so maxFacetValues and onIgnoredFilter are declared once and the engine forwards its options wholesale - rebuild(client, documents, searchType, options) now follows the family-wide value-first, declaration-second parameter convention - turn the engine's search-steps sentence into a bulleted list in the README BREAKING CHANGE: rebuild's documents and searchType parameters swapped places. 
--- packages/search-typesense/README.md | 24 ++++++++------ packages/search-typesense/src/adapter.ts | 2 +- packages/search-typesense/src/index.ts | 1 + .../search-typesense/src/query-compiler.ts | 23 +++++++++----- packages/search-typesense/src/search.ts | 31 ++++++------------- .../search-typesense/test/adapter.test.ts | 18 +++++------ 6 files changed, 51 insertions(+), 48 deletions(-) diff --git a/packages/search-typesense/README.md b/packages/search-typesense/README.md index 191d5dca..8e21142c 100644 --- a/packages/search-typesense/README.md +++ b/packages/search-typesense/README.md @@ -6,7 +6,7 @@ domain-agnostic** – you supply a `SearchType`; this package never names your domain. It is the Typesense implementation of the `SearchEngine` port: it derives a collection schema from the field model, compiles the neutral `SearchQuery` into Typesense search params, runs it, reconstructs the engine-neutral `SearchResult`, -and manages the index lifecycle (blue/green rebuild). +and manages the search index lifecycle (blue/green rebuild). ## Collection schema and engine @@ -17,13 +17,19 @@ search/sort keys) matches what the projection writes, via `@lde/search`’s `physicalFields`, so the index and the documents cannot drift. `createTypesenseSearchEngine(client, { collection, labelsCollection })` is the -`SearchEngine` implementation: it validates the query against the search type -(the port contract — a structurally invalid query is rejected, never sent), -compiles it, runs the search, resolves reference (and reference-facet) labels -from the sidecar `labels` collection in a single lookup, and reconstructs the -logical `SearchResult` — language maps, labelled references, labelled facet -buckets. The pure halves `buildSearchParams` and `parseSearchResponse` are -exported for direct use and testing. +`SearchEngine` implementation. 
Each search: + +- validates the query against the search type (the port contract — a + structurally invalid query is rejected, never sent); +- compiles it into Typesense search params (`buildSearchParams`); +- runs the search; +- resolves reference (and reference-facet) labels from the sidecar `labels` + collection in a single lookup; +- reconstructs the logical `SearchResult` (`parseSearchResponse`) — language + maps, labelled references, labelled facet buckets. + +The pure halves `buildSearchParams` and `parseSearchResponse` are exported for +direct use and testing. ## Indexing @@ -47,7 +53,7 @@ const client = new Client({ // `documents` is an async iterable (e.g. a streaming projection); only one // batch is held in memory at a time. `rebuild` returns the live collection name // and the imported count (or `null` if another rebuild was already running). -const result = await rebuild(client, DATASET, documents, { name: 'datasets' }); +const result = await rebuild(client, documents, DATASET, { name: 'datasets' }); ``` The options accept everything `buildCollectionSchema` does (`defaultLocale`, diff --git a/packages/search-typesense/src/adapter.ts b/packages/search-typesense/src/adapter.ts index ee176aba..8aa3b11d 100644 --- a/packages/search-typesense/src/adapter.ts +++ b/packages/search-typesense/src/adapter.ts @@ -57,8 +57,8 @@ export interface RebuildOptions extends CollectionSchemaOptions { */ export async function rebuild( client: Client, - searchType: SearchType, documents: AsyncIterable, + searchType: SearchType, options: RebuildOptions, ): Promise<{ collection: string; imported: number } | null> { const { diff --git a/packages/search-typesense/src/index.ts b/packages/search-typesense/src/index.ts index f56424e1..4facde27 100644 --- a/packages/search-typesense/src/index.ts +++ b/packages/search-typesense/src/index.ts @@ -3,6 +3,7 @@ export type { RebuildOptions } from './adapter.js'; export { buildCollectionSchema } from './collection-schema.js'; export type 
{ CollectionSchemaOptions } from './collection-schema.js'; export { buildSearchParams } from './query-compiler.js'; +export type { BuildSearchParamsOptions } from './query-compiler.js'; export { createTypesenseSearchEngine, parseSearchResponse } from './search.js'; export type { TypesenseSearchEngineOptions, diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts index 9c4ef7e6..5cd4d0a5 100644 --- a/packages/search-typesense/src/query-compiler.ts +++ b/packages/search-typesense/src/query-compiler.ts @@ -18,13 +18,11 @@ import { } from '@lde/search'; /** - * Compile the engine-neutral {@link SearchQuery} into Typesense search - * parameters — the query half of the engine adapter. Pure (no client, no env), - * so the mapping is asserted directly in unit tests. Field names come from - * {@link physicalFields}, the same convention the projection and the collection - * schema use, so a query can never reference a field the index does not carry. + * Options for {@link buildSearchParams} — the query half of the engine + * adapter. {@link TypesenseSearchEngineOptions} extends this, so each knob is + * declared once and the engine forwards its options wholesale. */ -export interface CompileOptions { +export interface BuildSearchParamsOptions { /** * Cap on the number of buckets returned per facet (`max_facet_values`). Left * unset, Typesense defaults to 10 — too few for high-cardinality facets @@ -38,15 +36,24 @@ export interface CompileOptions { * skipped: an unknown field, an operator that does not match the field’s * kind ({@link filterOperatorFor}), an empty `in` list, or a `range` with no * usable bound. Skipping keeps a malformed clause from reaching the engine - * as garbage; supply this to log it instead of losing it silently. + * as garbage; supply this to log it instead of losing it silently. 
Through + * the engine, a structurally invalid query throws up front + * (`assertValidQuery`), so there only the vacuous clauses reach this. */ readonly onIgnoredFilter?: (filter: Filter) => void; } +/** + * Compile the engine-neutral {@link SearchQuery} into Typesense search + * parameters — the query half of the engine adapter. Pure (no client, no env), + * so the mapping is asserted directly in unit tests. Field names come from + * {@link physicalFields}, the same convention the projection and the collection + * schema use, so a query can never reference a field the index does not carry. + */ export function buildSearchParams( query: SearchQuery, searchType: SearchType, - options: CompileOptions = {}, + options: BuildSearchParamsOptions = {}, ): SearchParams { const folded = query.text !== undefined && query.text.length > 0 diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts index 73430409..53816c1f 100644 --- a/packages/search-typesense/src/search.ts +++ b/packages/search-typesense/src/search.ts @@ -7,7 +7,6 @@ import { physicalFields, referenceFields, type FacetBucket, - type Filter, type LocalizedValue, type Reference, type ResultDocument, @@ -19,32 +18,25 @@ import { type SearchType, type SearchValue, } from '@lde/search'; -import { buildSearchParams, escapeFilterValue } from './query-compiler.js'; +import { + buildSearchParams, + escapeFilterValue, + type BuildSearchParamsOptions, +} from './query-compiler.js'; -/** Where the engine reads documents and (optionally) reference labels. */ -export interface TypesenseSearchEngineOptions { +/** Where the engine reads documents and (optionally) reference labels — plus + * every query-compiler knob ({@link BuildSearchParamsOptions}), declared once + * there and forwarded wholesale into each search. */ +export interface TypesenseSearchEngineOptions extends BuildSearchParamsOptions { /** The dataset collection or alias to query. 
*/ readonly collection: string; /** The sidecar `labels` collection (IRI → label); omit for id-only references. */ readonly labelsCollection?: string; - /** - * Buckets returned per facet (`max_facet_values`). Typesense defaults to 10; - * raise it for high-cardinality facets (publisher, keyword) so their long - * value lists are not truncated. - */ - readonly maxFacetValues?: number; /** * Called when reference-label resolution fails; the search then degrades to * id-only references rather than failing. Optional — omit to swallow silently. */ readonly onLabelError?: (error: unknown) => void; - /** - * Called for each vacuous `where` clause the query compiler skips as a no-op - * (an empty `in` list, a `range` with no usable bound). Structurally invalid - * queries never get this far — the engine rejects them up front - * (`assertValidQuery`). Optional — omit to swallow silently. - */ - readonly onIgnoredFilter?: (filter: Filter) => void; /** * Opt-in in-memory label cache. When set (and {@link labelsCollection} is * set), the FULL sidecar `labels` collection is loaded once via the documents @@ -108,10 +100,7 @@ export function createTypesenseSearchEngine( // The port contract: a structurally invalid query (unknown field, wrong // operator, unknown facet) is rejected up front, for EVERY caller. assertValidQuery(query, searchType); - const params = buildSearchParams(query, searchType, { - maxFacetValues: options.maxFacetValues, - onIgnoredFilter: options.onIgnoredFilter, - }); + const params = buildSearchParams(query, searchType, options); // Cached path: the once-loaded full collection serves labels by in-memory // lookup (no per-search round-trip). 
The load does not depend on the // response, so it runs alongside the search; it never rejects (a failed diff --git a/packages/search-typesense/test/adapter.test.ts b/packages/search-typesense/test/adapter.test.ts index ceb50d00..82550623 100644 --- a/packages/search-typesense/test/adapter.test.ts +++ b/packages/search-typesense/test/adapter.test.ts @@ -72,8 +72,8 @@ describe('search-typesense', () => { it('publishes a versioned collection and points the index alias at it', async () => { const result = await rebuild( client, - datasetType, stream([{ id: 'a', title: 'Verhaal van Utrecht', year: 2024 }]), + datasetType, { name: NAME }, ); @@ -90,14 +90,14 @@ describe('search-typesense', () => { it('swaps the alias to a new collection and drops the previous one', async () => { const first = await rebuild( client, - datasetType, stream([{ id: 'a', title: 'Old', year: 2023 }]), + datasetType, { name: NAME }, ); const second = await rebuild( client, - datasetType, stream([{ id: 'a', title: 'New', year: 2024 }]), + datasetType, { name: NAME }, ); @@ -115,7 +115,7 @@ describe('search-typesense', () => { year: 2024, })); - const result = await rebuild(client, datasetType, stream(documents), { + const result = await rebuild(client, stream(documents), datasetType, { name: NAME, batchSize: 2, }); @@ -129,8 +129,8 @@ describe('search-typesense', () => { const result = await rebuild( client, - datasetType, stream([{ id: 'a', title: 'A', year: 2024 }]), + datasetType, { name: NAME }, ); @@ -143,8 +143,8 @@ describe('search-typesense', () => { const result = await rebuild( client, - datasetType, stream([{ id: 'a', title: 'A', year: 2024 }]), + datasetType, { name: NAME, lockTtlMs: 1_000 }, ); @@ -155,8 +155,8 @@ describe('search-typesense', () => { it('leaves the live alias intact and drops the orphan when a build fails', async () => { await rebuild( client, - datasetType, stream([{ id: 'a', title: 'Live', year: 2024 }]), + datasetType, { name: NAME }, ); const live = await 
aliasTarget(client); @@ -166,8 +166,8 @@ describe('search-typesense', () => { await expect( rebuild( client, - datasetType, stream([{ id: 'bad', title: 't', year: 'nope' }]), + datasetType, { name: NAME }, ), ).rejects.toThrow(/failed/i); @@ -180,7 +180,7 @@ describe('search-typesense', () => { }); it('publishes an empty collection for an empty source', async () => { - const result = await rebuild(client, datasetType, stream([]), { + const result = await rebuild(client, stream([]), datasetType, { name: NAME, }); From 32d3e59e775ab59090ee6502a9678af142269166 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 19:39:20 +0200 Subject: [PATCH 34/35] docs(search-api-graphql): state why runtime configuration benefits the consumer - no codegen step, no generated files to commit and review, no stale artifact drifting from the declaration; name the trade-off and point at the snapshot guard that restores it --- packages/search-api-graphql/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md index 337207ca..7f846be5 100644 --- a/packages/search-api-graphql/README.md +++ b/packages/search-api-graphql/README.md @@ -15,7 +15,12 @@ is the `SearchType`’s own logical `name` — `Dataset`, `Person`, `CreativeWor ## Runtime configuration, not codegen `buildGraphQLSchema(schema)` constructs the GraphQL schema once at startup from -the field model — no SDL artifact, no generated resolver stubs. The field model +the field model — no SDL artifact, no generated resolver stubs. For you that +means: no codegen step in the build, no generated files to commit and review, +and no stale artifact that can drift from the declaration — change the +`SearchType`, restart, and the API is current. (The flip side, no artifact +showing contract changes as diffs, is restored by the +[snapshot guard](#guarding-the-contract).) 
The field model is the single source; the GraphQL contract is derived from it. Type names come from each `SearchType`’s `name`; output types, the `where`/`orderBy`/facet inputs, reference types and nullability are all derived from each field’s From 31e9c7085015be0b2ac019c97ee088d80d4284ef Mon Sep 17 00:00:00 2001 From: David de Boer Date: Fri, 3 Jul 2026 23:39:58 +0200 Subject: [PATCH 35/35] docs(search): document the family-wide API conventions - gather the parameter-order rule and the factory-verb vocabulary (define captures a declaration, build is pure data-to-data, create makes a stateful instance) into one API conventions section --- packages/search/README.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/packages/search/README.md b/packages/search/README.md index c728307d..81df0140 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -67,10 +67,20 @@ and GraphQL (one of the surfaces): `projectGraph` and the GraphQL surface consume a `SearchSchema` (projecting every type in one pass); the engine port executes one `SearchType` at a time. -One parameter-order convention holds across the whole family: a function takes -the value it operates on first and the `SearchType` declaration right after it -— `search(query, type)`, `projectDocument(node, type)`, -`engineFor(engine, type)`, `buildSearchParams(query, type)`. +### API conventions + +Two conventions hold across the whole family: + +- **Parameter order** — a function takes the value it operates on first and + the `SearchType` declaration right after it: `search(query, type)`, + `projectDocument(node, type)`, `engineFor(engine, type)`, + `buildSearchParams(query, type)`. +- **Factory verbs** — the verb tells you what kind of thing comes back. 
+ `define*` captures a declaration as a literal (`defineSearchType`); + `build*` is a pure data-to-data constructor (`buildCollectionSchema`, + `buildSearchParams`, `buildGraphQLSchema`); `create*` makes a stateful + instance (`createTypesenseSearchEngine`). A bare noun (`searchSchema`) + constructs the trivial container it names. ## Field model