diff --git a/README.md b/README.md index 627c6bf2..3a4ddf6a 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,11 @@ await pipeline.run(); npm Project RDF into engine-agnostic search documents (framing + a declarative field spec) + + @lde/search-api-graphql + npm + Engine- and domain-agnostic GraphQL surface for search: builds an executable GraphQL schema from a SearchSchema at runtime, one root query field per type + @lde/search-typesense npm @@ -229,6 +234,10 @@ graph TD subgraph Publication fastify-rdf docgen + search --> text-normalization + search-api-graphql --> search + search-typesense --> search + search-typesense --> text-normalization end subgraph Monitoring diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 8189cda5..6b59130d 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -6,10 +6,8 @@ Date: 2026-06-25 Proposed -Reconciles against the NDE stack platform docs -(`netwerk-digitaal-erfgoed/docs` → `docs/stack/layers/platform.md`), which are themselves -a **draft under discussion**, so several decisions below are deliberate deviations from -the current draft, to be reconciled back into it. +Aligned with the NDE [stack platform docs](https://docs.nde.nl/stack/layers/platform); the +decisions below are reflected there. ## Context @@ -19,10 +17,9 @@ declarative source so the GraphQL surface, a later REST surface, and the index c from each other, and so a deployment can swap search engines without consumers noticing. That requires an engine- and protocol-neutral **core** that both API surfaces and any -engine adapter sit on. The platform draft frames this as Ports & Adapters with a framed -JSON-LD intermediate representation, generated from SHACL + a `search:` annotation -vocabulary. 
We adopt that direction but scope it to what a v1 keyword search needs, and -diverge on a few concrete points where the draft does not fit DR’s catalog-search case. +engine adapter sit on. The architecture is Ports & Adapters with a framed JSON-LD +intermediate representation, generated from SHACL + a `search:` annotation vocabulary, +scoped here to what a v1 keyword search needs. ## Decision @@ -32,26 +29,31 @@ Two tiers: `search-*` is backend you compose; `search-api-*` is the surface you | Tier | Package | Responsibility | | ----------- | ------------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| backend | `@lde/search` | field model · `SearchQuery` · filter semantics · adapter port | +| backend | `@lde/search` | field model · `SearchQuery` · filter semantics · engine port | | backend | `@lde/search-typesense` | engine adapter: collection schema · query/filter compiler · `search()` | | API surface | `@lde/search-api-graphql` | field model + `SearchQuery` → GraphQL schema (runtime configuration; see [ADR 4](./0004-search-api-graphql-surface.md)) | | API surface | `@lde/search-api-rest` | OpenAPI + route handlers (later, thin over the core) | -This deviates from the draft’s function-mapping table (`@lde/graphql-server`, -`@lde/rest-server`, no core row); the draft should adopt the `@lde/search*` family. - ### Contract frozen, storage swappable The **API contract** (the SDL shape consumers couple to) is breaking to change and must be right in v1. The **IR / stored document** (framed JSON-LD vs a flat engine doc) lives behind the adapter and is swappable with no consumer impact. Nothing engine-specific (companion fields, `int32`, the engine query language) and nothing RDF-specific -(`@context`, `@id`, IRI-keyed predicates) leaks past the adapter port. +(`@context`, `@id`, IRI-keyed predicates) leaks past the engine port. 
### Field model -The engine-neutral description of a queryable field – the runtime form of one SHACL -NodeShape + its `search:` annotations: +The engine-neutral description of a queryable field. **One `SearchField` declaration drives +four consumers** – projection (RDF→flat document), the engine collection schema, the query +semantics, and the GraphQL surface – so they cannot drift. SHACL is one possible source +(see the mapping below), not a dependency: a hand-written declaration is just as valid. + +It is a **unified** model: a single declaration carries the projection, the collection +schema and search weights, and the query semantics – concerns that would otherwise each +need their own per-field configuration, free to drift apart. `kind` plus independent +capability flags express them all, derived fields are first-class, and the +Typesense-vocabulary types are _derived_ from `kind`, never declared. ```ts type FieldKind = @@ -64,36 +66,55 @@ type FieldKind = | 'reference'; interface SearchField { - readonly name: string; // logical API name + readonly name: string; // logical API name; the physical fanout derives from it readonly kind: FieldKind; - readonly array?: boolean; - readonly localized?: boolean; + readonly path?: string; // sh:path to project from; omit for a derivation-populated field + readonly array?: boolean; // sh:maxCount + readonly required?: boolean; // sh:minCount ≥ 1 — non-null in output, non-optional in the index + readonly localized?: boolean; // rdf:langString / sh:languageIn (text only) + readonly locales?: readonly string[]; // when localized: which languages to emit readonly output?: boolean; // appears in the schema output type - readonly searchable?: { weight: number }; // free-text inclusion + weight + readonly searchable?: { weight: number }; // free-text inclusion + weight (per-locale when localized) readonly filterable?: boolean; // usable in `where` readonly facetable?: boolean; readonly sortable?: boolean; - readonly 
nestedStrategy?: 'labelOnly' | 'idOnly' | 'inline'; // for `reference` - readonly group?: { readonly name: string; readonly prefix: string }; // deployment delta + readonly ref?: { type: string; strategy: 'labelOnly' | 'idOnly' | 'inline' }; // kind: 'reference' + readonly transform?: (value: string) => string; // projection-time value transform + readonly facetRanges?: readonly FacetRange[]; // numeric facet: fixed [min, max) range bins (histogram) vs per-value buckets } -interface SearchSchema { +type Derivation = (document: SearchDocument, node: FramedNode) => void; + +// One root type (one SHACL NodeShape); a whole deployment’s declaration is the +// SearchSchema, a map of SearchTypes keyed by type IRI (built with searchSchema()). +interface SearchType { + readonly name: string; // logical API name ('Dataset') – names the type in every surface, + // declared (like each field's name), never derived from the IRI, so vocabulary + // churn cannot silently rename the public contract + readonly type: string; // sh:targetClass readonly fields: readonly SearchField[]; + readonly derivations?: readonly Derivation[]; // computed fields: status, booleans } ``` -Maps onto SHACL + `search:` (`kind`←`sh:datatype`, `array`←`sh:maxCount`, -`localized`←`sh:languageIn`, `facetable`←`search:facetable`, `sortable`←`search:sortable`, -`nestedStrategy`←`sh:node`/`sh:class` + `search:nestedStrategy`) so an eventual generator -emits it unchanged. The `group` companion (coarse grouped facets, e.g. `format_group`) and -the `status_rank` tie-break sort are **deployment-specific deltas**, never in `@lde/search`. -`relevance` is _not_ a delta: every full-text engine ranks by match score, so it is a -generic reserved sort the adapter understands. 
+Maps onto SHACL + `search:` (`kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, +`array`←`sh:maxCount`, `localized`←`sh:languageIn`, `facetable`←`search:facetable`, +`sortable`←`search:sortable`, `ref`←`sh:node`/`sh:class` + `search:nestedStrategy`) so an +eventual generator emits it unchanged. A field with **no `path`** is a derived field – +populated by a `Derivation` rather than projected from the IR – yet it still carries full +query/schema/output behavior. The physical field names a declaration fans out to (`${name}_search_${locale}`, +`${name}_sort_${locale}`, `${name}_search`) follow one convention owned by +`@lde/search`, so projection, collection schema and query compiler agree. The `status_rank` +tie-break sort is a **deployment-specific delta**, never in `@lde/search`. Grouped facets need +no field-model mechanism at all: a deployment derivation materializes group tokens (e.g. +`group:rdf`) into the field’s own values – see Consequences. `relevance` is _not_ a delta: +every full-text engine ranks by match score, so it is a generic reserved sort the adapter +understands. ### `SearchQuery` – the neutral query IR -Both surfaces parse input into this; the adapter consumes this. It is the shared compiler -target that keeps GraphQL and REST from drifting. +Both surfaces compile input into this; the adapter compiles it into an engine query. One +shared representation in the middle keeps GraphQL and REST from drifting. ```ts interface SearchQuery { @@ -147,25 +168,86 @@ variable-based clients (`$o: DatasetOrderBy`) break, so a future array is a deli **Inclusive bounds only** – `min`/`max`, no `gt`/`gte`/`lt`/`lte`: self-documenting, matches Typesense’s native inclusive range, covers every DR case, additively reversible. -Grouped facets need no special shape – `group:`-prefixed tokens travel as ordinary `in` -strings and the adapter splits/unions them. 
- -### Adapter port and result +A numeric facet returns **range buckets** (`[min, max)` bins declared per field); the adapter +maps them to the engine’s native range faceting. + +**Grouped facets need no special engine mechanism; they are denormalized at index time.** +A coarse category alongside granular values (e.g. `group:rdf` next to media types, `group:person` +next to class IRIs) is materialized into the field’s own values during projection, so at query +time a group token is an ordinary value: faceted natively, filtered by plain membership +(`field.in: ["group:rdf"]` unions with granular values for free), and — where the field is +`output` – read like any other value. There is no `_group` companion, no `group:`-prefix split, +no filter rewriting in the adapter; the engine stays dumb and denormalization (the document +store’s strength) does the work. A cross-source signal that is not a subset of the field (e.g. a +SPARQL capability derived from `conformsTo`, not a media type) is likewise materialized as a plain +value by a deployment derivation. + +The trade-off this design accepts: **group membership is fixed at index time.** Because the +group token is baked into each document’s values during projection, redefining a group (which +granular values map to `group:rdf`) is an index-data change that takes effect only on **reindex** – +there is no query-time mapping to edit. The constraint is acceptable here because group definitions +are deployment projection config that already drives indexing, and reindexing is already the +pipeline’s job; it would not suit a system where grouping is user-defined or changes frequently. + +### Engine port and result + +The **port** is the interface the core defines; a concrete engine **adapter** +(`@lde/search-typesense`’s `TypesenseSearchEngine`) implements it. Naming the port for the +capability (`SearchEngine`), not the pattern piece, keeps `TypesenseSearchEngine implements +SearchEngine` readable. 
```ts -interface SearchAdapter { - search(query: SearchQuery, schema: SearchSchema): Promise; +// FacetField / OutputField default to `string` (ergonomic) and a deployment narrows them +// to its type’s facetable / output field names for typo-safe facet and document access; +// Type narrows the accepted searchType argument alongside, so a narrowed engine cannot be +// handed the wrong search type. The ergonomic route is engineFor(engine, searchType) over +// a defineSearchType declaration (helpers FacetFieldsOf / OutputFieldsOf and +// the EngineFor alias are exported for hand-written signatures). +interface SearchEngine< + FacetField extends string = string, + OutputField extends string = string, + Type extends SearchType = SearchType, +> { + search( + query: SearchQuery, + searchType: Type, + ): Promise<SearchResult<FacetField, OutputField>>; } -interface SearchResult { - readonly hits: readonly { id: string; document: SearchDocument }[]; +interface SearchResult< + FacetField extends string = string, + OutputField extends string = string, +> { + readonly hits: readonly SearchHit<OutputField>[]; readonly total: number; + // Keyed by facet field name; `Partial` because only the queried facets are present. + // A bucket’s `label` (a LocalizedValue) is the engine-resolved canonical data label, + // present only for reference (IRI-keyed) facets; absent for token/free-string facets, + // whose display the consumer owns (its own i18n, or the value itself). readonly facets: Readonly< - Record + Partial< + Record< + FacetField, + readonly { value: string; count: number; label?: LocalizedValue }[] + > + > >; } -type SearchDocument = Record; +// `id` (the stable document key, an IRI) stays out of the document: it is the hit’s +// identity, always present, a different contract from the optional logical field values, +// and maps straight onto the GraphQL output’s `id: String!`. +interface SearchHit<OutputField extends string = string> { + readonly id: string; + readonly document: ResultDocument<OutputField>; +} + +// The logical result document. 
Named distinctly from the flat, fanned-out projection +// `SearchDocument` that lives index-side: this carries logical fields (language maps, +// references) ready for a surface to shape. +type ResultDocument<OutputField extends string = string> = Readonly< + Partial<Record<OutputField, SearchValue>> +>; type SearchValue = | string | number @@ -192,7 +274,7 @@ per-shape types (e.g. `Organization`, `Term`) with `label` exposed as `name` - **IR / adapter-return:** JSON-LD language map (`@container: @language`), `@set` arrays, `und` for untagged. Matches schema-profile #171 (language maps are more usable as a data - model) and the platform draft’s envelope. + model) and the stack platform envelope. - **GraphQL surface:** a single **best-first** `Accept-Language`-ordered list (`[LanguageString!]!`, see [ADR 4](./0004-search-api-graphql-surface.md)). `[0]` is the value to display; **`[0].language` is the language actually served** – the per-field @@ -208,7 +290,7 @@ argument (deferred): a parallel arg would duplicate the header and need preceden Chosen over a `{nl,en}` map (silently yields `undefined` for a missing language, no defined fallback order) and over a separate resolved scalar (the value must be a `LanguageString` to carry its language anyway, so the scalar saved only the `[0]` index – not worth a second -field plus a deviation from the draft / Network-of-Terms list shape). Grounded in measured +field plus diverging from the Network-of-Terms list shape). Grounded in measured data and all three substrates: - **A (descriptions, measured):** bilingual `nl`/`en`, ~86% Dutch-only → an English user gets @@ -222,32 +304,25 @@ have an English title) is distinct from content `dct:language` (already filterab preference; expressible as a facetable dimension (languages-present-in-a-localized-field), not enabled for DR v1, more relevant for B/C. -### Other reconciled decisions +### Other decisions - **Numbered pagination** (`offset`/`limit`, presented as page/per-page), not Relay cursors. 
DR is a page-numbered faceted browser with totals; Typesense is natively page/per-page; the ~2,500-doc corpus never paginates deep enough for offset cost to bite; and the blue/green alias swap removes the mutation-drift that motivates cursors. - **Sidecar canonical labels**, not inline `labelOnly` as default. Facets need one - canonical label per entity; the draft’s own two-source model puts canonical labels in a - separate collection, which is what DR’s `labels` collection is. `nestedStrategy` is - carried as metadata but inline `labelOnly` is not the default. -- **Logical typed result document** at the query seam; framed JSON-LD kept index-side. The - draft treats framed JSON-LD as the universal IR; we scope it to the index/projection - artifact (its payoff – vector/LDES/UI sinks – is object-search’s, not catalog-search’s), - gated on the generic framing packages existing rather than on DR. + canonical label per entity, kept in a separate collection — DR’s `labels` collection. A + reference’s `strategy` is carried as metadata; `labelOnly` is the v1 default, not inline. +- **Logical typed result document** at the query seam; framed JSON-LD kept index-side as the + index/projection artifact (its payoff – vector/LDES/UI sinks – is object-search’s, not + catalog-search’s), gated on the generic framing packages existing rather than on DR. ## Consequences - One declarative source drives GraphQL, later REST, and the index; they cannot drift. - The engine is a swappable adapter; the contract outlives engine choices. -- Adopted from the draft unchanged: the Stable API Contract discipline, `nestedStrategy` as - a concept, the surface `LanguageString` list, folding at the adapter boundary + query - side via `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. 
-- Deviations to reconcile into the platform draft: numbered pagination; sidecar labels; - logical result doc (framed JSON-LD scoped to index-side); `min`/`max` filter ranges; the - `@lde/search*` naming and a core package row. +- Folding (case/diacritics) happens at the adapter boundary and on the query side via + `@lde/text-normalization`, so index and query normalize identically. - Deferred: REST surface; framed-JSON-LD materialised view (nested storage, index-time label inlining, detail-page-on-index, terms-collection split); semantic/hybrid (vector) - search; unifying the projection `FieldSpec` (RDF→doc) with this `SearchField` - (query/output) into one field declaration. + search. diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index d6aff824..0a21849c 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -11,79 +11,92 @@ Builds on [ADR 3 (Search API core query model)](./0003-search-api-core-query-mod ## Context Given the engine-neutral core of [ADR 3](./0003-search-api-core-query-model.md), the first -API surface is GraphQL. The platform draft requires the surface to be derived from the same -source as the index, never hand-written, so it cannot drift. It must also be framework-free: -resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any GraphQL server -can host the schema (DR mounts it inline; a Fastify wrapper is deferred and, if ever built, -is a separate package). +API surface is GraphQL, derived from the same source as the index so it cannot drift. It must +be framework-free: resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any +GraphQL server can host the schema (DR mounts it inline; a Fastify wrapper is a deferred +separate package). 
## Decision ### Runtime configuration, not code generation -The platform draft frames this as _generating_ the surface – emitting GraphQL SDL **and** -resolvers as artifacts. We deviate: nothing is emitted or committed. The schema is -**constructed at runtime from the field-model configuration** (`buildSearchSchema(config)`), -once at startup, and the resolvers are **generic functions inside the package** attached to -that schema. A better name for the draft’s “generation” step, at least for this surface, is -**runtime configuration**. +The surface is **constructed at runtime from the field-model configuration** +(`buildGraphQLSchema(schema, options)`), once at startup, with generic resolvers shipped in the +package attached to that schema – nothing is emitted or committed. The resolvers are inherently +generic (one root resolver per type maps args to a `SearchQuery`, calls the engine, and maps the +result back; the field model only parameterises data), so codegen would emit N near-identical +stubs that all delegate to the same logic, plus a build step and staleness risk, for no benefit. -This matters because the resolvers are inherently generic – there is essentially one root -resolver that maps args to a `SearchQuery`, calls the adapter, and maps the result back; -the field model only parameterises data. Codegen would emit N near-identical resolver stubs -that all delegate to the same logic, plus a build step and staleness risk, for no benefit. - -**No SDL artifact.** A live GraphQL API serves its own schema via introspection, so clients -need no committed `.graphql` file. The field-model diff is the reviewable change. A -`printSchema()` helper exists only as an **optional** CI snapshot test for catching -accidental breaking changes to the frozen contract – not a shipped artifact. 
- -> Deviation from the stack draft: the draft’s “generate SDL + resolvers” becomes -> _construct the schema at runtime from configuration; resolvers are generic and in-package; -> SDL is served live via introspection, not emitted._ For the reconciliation list. +A live GraphQL API serves its own schema via introspection, so clients need no committed +`.graphql` file; the field-model diff is the reviewable change. `printGraphQLSchema()` exists +only as an **optional** CI snapshot test guarding the frozen contract against accidental +breaking changes – not a shipped artifact. ### The schema-building function +The function takes the **whole `SearchSchema`** and emits one root query field per +`SearchType` – a schema may declare multiple root types (e.g. `Person` AND `CreativeWork`), +each searchable in its own way. Separately built `GraphQLSchema`s could never be merged later +(one `Query` type; the shared types would collide), so multi-type composition happens before +build, per the compose-before-build principle. Shared types (`LanguageString`, buckets, +filter inputs, reference types) are created once and reused across root types. 
+ ```ts -function buildSearchSchema( - schema: SearchSchema, - options: { - typeName: string; // 'Dataset' – drives all derived type names - queryField?: string; // root field; default lowercased plural of typeName - queryDefaults?: (q: SearchQuery, ctx: SearchContext) => SearchQuery; // consumer policy - languageOrder?: ( - available: readonly string[], - accept: readonly string[], - ) => readonly string[]; - extendTypeDefs?: string; // merged before build (compose-before-build) - extendResolvers?: Record; +function buildGraphQLSchema( + schema: SearchSchema, // every root type, keyed by type IRI + options?: { + types?: Record< + string, // type IRI; entries are optional fine-tuning – names come from SearchType.name + { + queryField?: string; // root field; default lowercased plural of the type's name + queryDefaults?: (q: SearchQuery, ctx: SearchContext) => SearchQuery; // per-type consumer policy + } + >; + languageOrder?: LanguageOrder; // output-language ordering; default Accept-Language first }, ): GraphQLSchema; // executable schema: types + generic resolvers attached -// also exported for manual composition / non-default servers: -function buildSearchTypeDefsAndResolvers( - schema, - options, -): { typeDefs: string; resolvers: object }; // optional CI helper only: -function printSearchSchema(schema, options): string; // SDL, for a snapshot/breaking-change test +function printGraphQLSchema(schema, options): string; // SDL, for a snapshot/breaking-change test ``` -`buildSearchSchema` is the standalone, framework-agnostic artifact (depends only on -`graphql` + `@graphql-tools/schema`). Deep customisation passes `extendTypeDefs`/ -`extendResolvers` (merged before `makeExecutableSchema`, since Mercurius registers once) or -composes the exported typeDefs/resolvers by hand. +`buildGraphQLSchema` is the standalone, framework-agnostic artifact (depends only on +`graphql`). Deep customisation of the emitted schema is deferred (see Consequences). 
+ +### Typed boundaries, dynamic middle + +Values are typed at both ends, with the resolver as the typed transform between them: + +| layer | localized text | reference | int64 | keyword (array) | boolean | +| --------------------- | ------------------------------------ | --------------------------- | ---------------- | ----------------------- | -------------------- | +| IR (`ResultDocument`) | `LocalizedValue` (lang map) | `Reference` | `number` | `readonly string[]` | `boolean` | +| GraphQL | `LanguageString[]` (best-first list) | named type (`Organization`) | `Float`/`number` | `[String!]!`/`string[]` | `Boolean!`/`boolean` | + +What stays unchecked is the generic resolver’s **dynamic middle**: it loops over the field +model with runtime-string names, so TS cannot prove the object it builds matches the emitted +output types – it casts at that boundary, and graphql-js’s executor (not TS) enforces the +output types at runtime (a wrong-typed return raises a field error). Same “typed boundaries, +dynamic middle” shape as the engine port and the projection: type the edges where it is +honest, accept a cast where iteration is inherently dynamic. The **contract** is guarded by +the optional `printGraphQLSchema()` SDL snapshot (the real artifact). ### Construction rules (field model → schema) -Type names derive from `typeName`; shared types (`LanguageString`, `Facet`, `FacetBucket`, -`SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`) are emitted once. +Type names derive from each `SearchType`’s logical `name`; shared types (`LanguageString`, +`ValueBucket`, `RangeBucket`, `SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, +`DateRange`, and the reference types) are emitted once across all root types, and the +per-type keyed facets object is named `<Name>Facets` (e.g. `DatasetFacets`). A type with no `filterable` fields gets +no `where` arg, and one +with no `facetable` fields no `facets` field (empty GraphQL types are invalid). 
GraphQL field names are the field model `name` verbatim (declare camelCase). - **Output type** – one field per `output` field: `text`+`localized` → `[LanguageString!]!` (best-first; `[0].language` = served language, the per-field `Content-Language`); - `keyword` array → `[String!]!`, scalar → `String`; `integer` → `Int`; `number` → `Float`; - `date` → `String` (ISO 8601); `boolean` → `Boolean!` (absent = false); `reference` → - see below. Nullability from `array` / required / optional; `id` is `String!`. + `keyword` array → `[String!]!`, scalar → `String`; `integer` → `Int` (signed 32-bit); + `number` → `Float` (exact integers to 2^53); `date` → `String` (ISO 8601); `boolean` → + `Boolean!` (absent = false); `reference` → see below. Nullability from `array` / required / + optional; `id` is `String!`. A magnitude that can exceed 32 bits (a 64-bit count or byte size + – e.g. DR’s `size`) is `number` → `Float`, since `Int` would overflow; a `Long`/`BigInt` + custom scalar is the deferred alternative. - **Reference types** – a `reference` field is typed by the **referenced shape** (`sh:class`/`sh:node`), emitted once and reused by every field referencing the same shape. Its fields follow `nestedStrategy`: @@ -95,9 +108,9 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). | `inline` (later) | the named type plus the referenced shape’s projected fields | So DR emits `publisher: Organization` (the `foaf:Agent` shape) and - `terminologySource: [Term!]!`; a shape’s type is emitted once and reused by any field that - references it. Named, not a generic GraphQL `Reference`: going `labelOnly → inline` then - only _adds_ fields (non-breaking), whereas generic→named later would break the contract. + `terminologySource: [Term!]!`. Named, not a generic GraphQL `Reference`: going + `labelOnly → inline` then only _adds_ fields (non-breaking), whereas generic→named later + would break the contract. 
- **`where` input** – one field per `filterable` field: `keyword`/`reference` → `StringFilter { in: [String!] }`; `integer` → `IntRange { min, max }`; `number` → @@ -105,14 +118,32 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). `is` value); `text` is excluded (it goes through the `query` arg). - **`orderBy`** – `RELEVANCE` (the sane default when a `query` is present) plus every `sortable` field, as an enum, in a single `{ field, direction }` input. Only - publicly-selectable sorts appear here; the resolver expands the client’s one choice into - the internal `Sort[]`, appending deployment tie-breaks like DR’s `status_rank` via - `queryDefaults` (never exposed). Single for now because a user picks one dimension. - Promoting it to a list later is backward-compatible only for inline-literal clients (list - input coercion wraps a single value); **variable-based clients break** (`$o: DatasetOrderBy` - is rejected where `[DatasetOrderBy!]` is expected), so a future array is a deliberate, - potentially breaking change – not a free one. -- **Facets** – an enum of every `facetable` field; requested per query, returned with counts. + publicly-selectable sorts appear; the resolver expands the client’s one choice into the + internal `Sort[]`, appending deployment tie-breaks like DR’s `status_rank` via + `queryDefaults` (never exposed). Single for now because a user picks one dimension; promoting + it to a list later is backward-compatible only for inline-literal clients (list input + coercion) – **variable-based clients break** (`$o: DatasetOrderBy` where `[DatasetOrderBy!]` + is expected) – so a future array is a deliberate, potentially breaking change. +- **Facets** – a **keyed object** (`<Name>Facets`, e.g. `DatasetFacets`), one field per `facetable` field, typed by + the field’s kind: a numeric range-facet field is `[RangeBucket!]!`, every other facet is + `[ValueBucket!]!`. 
The facet set and each bucket shape are thus encoded **statically in the + schema**, not discovered at runtime through an enum + polymorphic bucket (no `__typename`, no + fragments). **Selection is the request**: only the facet keys a query selects are computed + (the resolver inspects the selection), each with its **own where-filter removed** + (skip-own-filter – a multi-select facet still lists its other options; dropping a `status` + filter also drops the valid-only default, so the status facet counts across every status). + Two bucket types: + - `ValueBucket { value, count, label }` – `value` is the selection key (filter via + `field.in`); `label` (nullable) is the engine-resolved canonical **data** label, present + only for **reference** (IRI-keyed) facets, `null` for token/free-string facets whose + display the consumer owns (its i18n for controlled tokens like `valid` → “Geldig”/“Valid”, + or the `value` itself). The null is load-bearing. + - `RangeBucket { min, max, count }` – a half-open `[min, max)` numeric bin (`max` null on an + open-ended top bin), filtered via `field.range`. + - A grouped facet (a coarse category alongside granular values, e.g. `group:rdf` next to media + types) needs **no special bucket**: its tokens are denormalized into the field at index time, + so they are ordinary `ValueBucket` values – faceted, filtered (`field.in: ["group:rdf"]`) and, + where output, read like any other value (see ADR 0003). ### Resulting schema (DR example, abridged) @@ -137,35 +168,23 @@ type Dataset { publisher: Organization terminologySource: [Term!]! format: [String!]! - class: [String!]! - size: Int + size: Float # int64 magnitude → Float, not Int (32-bit) datePosted: String status: String iiif: Boolean! # … keyword, language, iiifManifestCount, ndeSchemaAp, linkedData, terms, persistentUris } -input StringFilter { - in: [String!] 
-} -input IntRange { - min: Int - max: Int -} -input DateRange { - min: String - max: String -} +# shared inputs are emitted once and reused: DR uses StringFilter + FloatRange + +# SortDirection (IntRange / DateRange are pruned – no filterable int/date field). input DatasetWhere { publisher: StringFilter format: StringFilter class: StringFilter status: StringFilter - size: IntRange - datePosted: DateRange - iiif: Boolean - # … keyword, language, terminologySource, catalog, ndeSchemaAp, linkedData, terms, persistentUris + size: FloatRange + # … keyword, language, terminologySource, catalog } enum DatasetSortField { @@ -174,36 +193,31 @@ enum DatasetSortField { DATE_POSTED SIZE } -enum SortDirection { - ASC - DESC -} input DatasetOrderBy { field: DatasetSortField! direction: SortDirection! = DESC } -enum DatasetFacetField { - PUBLISHER - KEYWORD - LANGUAGE - FORMAT - CLASS - TERMINOLOGY_SOURCE - STATUS - IIIF - NDE_SCHEMA_AP - LINKED_DATA - TERMS - PERSISTENT_URIS +type ValueBucket { + value: String! # selection key: a media type, a token (group:rdf), or an IRI for reference facets + count: Int! + label: [LanguageString!] # nullable; resolved data label for reference facets, else null } -type FacetBucket { - value: String! +type RangeBucket { + min: Float # half-open [min, max); max null = open-ended top bin + max: Float count: Int! } -type Facet { - field: DatasetFacetField! - buckets: [FacetBucket!]! +type DatasetFacets { + # one field per facetable field, typed by kind; selection = request, skip-own-filter applied + publisher: [ValueBucket!]! + keyword: [ValueBucket!]! + language: [ValueBucket!]! + format: [ValueBucket!]! + class: [ValueBucket!]! + terminologySource: [ValueBucket!]! + status: [ValueBucket!]! + size: [RangeBucket!]! } type DatasetSearchResult { @@ -211,7 +225,7 @@ type DatasetSearchResult { total: Int! page: Int! perPage: Int! - facets: [Facet!]! + facets: DatasetFacets! 
} type Query { @@ -220,19 +234,20 @@ type Query { where: DatasetWhere orderBy: DatasetOrderBy page: Int = 1 - perPage: Int = 20 - facets: [DatasetFacetField!] + perPage: Int = 20 # no `facets` arg – selecting facet keys IS the request ): DatasetSearchResult! } ``` Numbered pagination (`page`/`perPage` + `total`), per [ADR 3](./0003-search-api-core-query-model.md) – no Relay connection. The reference types -(`Organization`, `Term`) carry `id + name` (labelOnly) from DR’s sidecar labels collection, -resolved by the adapter. `publisher` is single (`dct:publisher` `maxCount 1`); `creator` is -search-only – its name feeds full-text `query` but it has no output field of its own, -mirroring the current card. `catalog` is filter-only, so it appears in `where` but not as an -output field. +carry `id + name` (labelOnly) from DR’s sidecar labels collection, resolved by the adapter. +`publisher` is single (`dct:publisher` `maxCount 1`); `creator` is search-only (its name feeds +full-text `query` but it has no output field); `catalog` is filter-only (in `where`, not output); +`class` is facet + filter but not output (its `group:` tokens surface only as facet buckets, never +as card values); `datePosted` is sortable + output only; and the NDE compatibility booleans +(`iiif`, `ndeSchemaAp`, `linkedData`, `terms`) are output-only vinkjes – in neither `where` nor the +facets until “filter by vinkje” ships. ### Resolver behaviour @@ -241,37 +256,33 @@ The single, generic root resolver (shipped in the package, not emitted): 1. **Args → `SearchQuery`** (pure): `query`→`text`; `where`→`Filter[]`; `orderBy`→`Sort[]` (`RELEVANCE`→reserved `relevance`); `page`/`perPage`→`offset`/`limit`; `facets`→logical names; `locale`←`context.acceptLanguage[0]`. -2. 
**Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; - DR injects its policy here: default `status:=valid`; default sort `relevance` when a - `query` is present else `title`; and the `status_rank` tie-break appended to either. -3. **`context.adapter.search(query, schema)` → `SearchResult`.** +2. **Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; DR + injects its policy here: default `status:=valid`; default sort `relevance` when a `query` is + present else `title`; and the `status_rank` tie-break appended to either. +3. **`context.engine.search(query, schema)` → `SearchResult`.** 4. **`SearchResult` → output** – scalars pass through; a `LocalizedValue` map → - `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; - reference values likewise; facets keyed logical→enum. GraphQL field selection prunes. + `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; reference + values likewise; facets are computed per selected key of the keyed facets object (no facet enum). GraphQL field selection prunes. -Default `languageOrder`: Accept-Language entries first, then remaining tagged languages, -then untagged (`und`) last – so `[0]` is always the best available value. +Default `languageOrder`: Accept-Language entries first, then remaining tagged languages, then +untagged (`und`) last – so `[0]` is always the best available value. ### Lifecycle and performance -- **Built once at startup.** The consumer calls `buildSearchSchema` during boot and hands - the single `GraphQLSchema` to its server; the field model is static per deployment, so it - is never rebuilt per request. -- **Held and reused.** That one schema serves every request (Mercurius additionally - caches/compiles it).
-- **Zero per-request penalty vs codegen.** A runtime-constructed schema is the same - `GraphQLSchema` object codegen would have produced; the only added cost is the one-time - build, sub-millisecond to low-single-digit-ms for a schema this size. +- **Built once at startup, reused for every request.** The field model is static per + deployment, so the single `GraphQLSchema` is constructed during boot (sub-millisecond to + low-single-digit-ms for a schema this size) and never rebuilt per request – the same object + codegen would have produced, with no per-request penalty (Mercurius additionally caches it). - **Hot path is the engine, not GraphQL.** Per-request cost is dominated by the Typesense round-trip; parse/validate/resolve of a small query is sub-millisecond. -- **Introspection serves the contract.** Cheap (a query against the built schema, cached by - clients). Leave it on, or disable in production and use `printSearchSchema` for tooling. +- **Introspection serves the contract** (cheap, client-cached). Leave it on, or disable in + production and use `printGraphQLSchema` for tooling. ### Context contract ```ts interface SearchContext { - adapter: SearchAdapter; // any engine + engine: SearchEngine; // the port; any engine adapter acceptLanguage: readonly string[]; // parsed, ordered; drives locale + output ordering } ``` @@ -281,21 +292,21 @@ Each transport populates it per request; no framework type appears in the packag ## Consequences - The GraphQL surface is configured at runtime from the - [ADR 3](./0003-search-api-core-query-model.md) field model, so it cannot drift from the - index or a later REST surface, and works under any GraphQL server. + [ADR 3](./0003-search-api-core-query-model.md) field model, so it cannot drift from the index + or a later REST surface, and works under any GraphQL server. 
- **Frozen (public contract):** `LanguageString`, the named reference types (`Organization`, `Term`, …), output types, `where` operators, `orderBy` enums, numbered-pagination args, facet types. Breaking to change – right in v1. -- **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes - facets, the `SearchDocument` shape. -- **Deviations to reconcile into the platform draft:** - - “generate SDL + resolvers” → _runtime configuration_ (construct at startup from config; - generic in-package resolvers; SDL served via introspection, not emitted as an artifact). - - Named reference types per shape (`Organization`, `Term`) rather than the draft’s uniform - `labelOnly` `{ @id, @type, name }` reference shape – chosen for ergonomics and - additive `inline` growth. -- Deferred: a `dataset(id)` single-resource query (detail-page-on-index direction; DR detail - stays on SPARQL); cross-collection `@reference` joins beyond inline labels; cursor - pagination; a `Date` scalar (kept ISO `String`); transport-layer persisted queries / cost - limits; a root or per-field language argument (Accept-Language is the sole preference - mechanism); metadata-language-availability filtering (a facetable dimension, not v1). +- **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes facets, + the `SearchDocument` shape. +- **Named reference types** per shape rather than one uniform reference type – chosen for + ergonomics and additive `inline` growth (`labelOnly` → `inline` only adds fields). 
+- Deferred: a `dataset(id)` single-resource query (DR detail stays on SPARQL); cross-collection + `@reference` joins beyond inline labels; cursor pagination; a `Date` scalar (kept ISO + `String`) and a `Long`/`BigInt` scalar for 64-bit integers (kept `Float`); transport-layer + persisted queries / cost limits; a root or per-field language argument (Accept-Language is the + sole preference mechanism); metadata-language-availability filtering (a facetable dimension, + not v1); schema extension hooks (`extendTypeDefs`/`extendResolvers` or exported + typeDefs/resolvers for manual composition); a static TS mirror of the contract + (`OutputOf` / `WhereOf` / `OrderByOf` / `FacetOf` mapped types over a + `defineSearchType` declaration) for typed in-process callers. diff --git a/package-lock.json b/package-lock.json index 50845ee3..dd6a4bc4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -24949,6 +24949,10 @@ "resolved": "packages/search", "link": true }, + "node_modules/@lde/search-api-graphql": { + "resolved": "packages/search-api-graphql", + "link": true + }, "node_modules/@lde/search-typesense": { "resolved": "packages/search-typesense", "link": true @@ -32504,7 +32508,6 @@ "version": "15.10.2", "resolved": "https://registry.npmjs.org/graphql/-/graphql-15.10.2.tgz", "integrity": "sha512-1PRqdDPAmViWr4h1GVBT8RoPZfWSGZa7kDzleTilOfVIslsgf+cia3Nl95v1KDmR4iERPaT7WzQ+tN4MJmbg3w==", - "dev": true, "license": "MIT", "engines": { "node": ">= 10.x" @@ -40442,7 +40445,7 @@ "commander": "^15.0.0", "cron": "^4.1.0", "drizzle-kit": "1.0.0-rc.4", - "drizzle-orm": "^1.0.0-rc.4", + "drizzle-orm": "1.0.0-rc.4", "postgres": "^3.4.9", "tslib": "^2.3.0" }, @@ -42921,11 +42924,23 @@ "n3": "^2.1.0" } }, + "packages/search-api-graphql": { + "name": "@lde/search-api-graphql", + "version": "0.1.0", + "license": "MIT", + "dependencies": { + "@lde/search": "^0.1.2", + "graphql": "^15.8.0", + "tslib": "^2.3.0" + } + }, "packages/search-typesense": { "name": "@lde/search-typesense", 
"version": "0.1.1", "license": "MIT", "dependencies": { + "@lde/search": "^0.1.2", + "@lde/text-normalization": "^0.1.1", "tslib": "^2.3.0", "typesense": "^3.0.6" }, diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md new file mode 100644 index 00000000..7f846be5 --- /dev/null +++ b/packages/search-api-graphql/README.md @@ -0,0 +1,115 @@ +# @lde/search-api-graphql + +The GraphQL surface for the [`@lde/search`](../search) core. **Both engine- and +domain-agnostic:** it builds an executable +[graphql-js](https://graphql.org/graphql-js/) `GraphQLSchema` from your whole +[`SearchSchema`](../search/README.md#terminology) at runtime — one root query +field per `SearchType`, each searchable in its own way. All root fields are +served by the same resolver implementation (no per-type code, no codegen); +each root field gets its own instance of it, bound to that field’s +`SearchType`, over any `SearchEngine`. It names neither your **domain** (each type’s GraphQL name +is the `SearchType`’s own logical `name` — `Dataset`, `Person`, `CreativeWork`, +…) nor your **engine** (the resolver calls `context.engine`, be it +[`@lde/search-typesense`](../search-typesense) or another adapter). + +## Runtime configuration, not codegen + +`buildGraphQLSchema(schema)` constructs the GraphQL schema once at startup from +the field model — no SDL artifact, no generated resolver stubs. For you that +means: no codegen step in the build, no generated files to commit and review, +and no stale artifact that can drift from the declaration — change the +`SearchType`, restart, and the API is current. (The flip side, no artifact +showing contract changes as diffs, is restored by the +[snapshot guard](#guarding-the-contract).) The field model +is the single source; the GraphQL contract is derived from it. 
Type names +come from each `SearchType`’s `name`; output types, the `where`/`orderBy`/facet +inputs, reference types and nullability are all derived from each field’s +`kind` and capability flags. The common case needs no options at all: + +```ts +import { searchSchema } from '@lde/search'; +import { buildGraphQLSchema } from '@lde/search-api-graphql'; + +const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON)); + +// The API now serves `datasets(…)` and `persons(…)` root fields. +// Hand `gqlSchema` to any graphql-js server; populate the per-request context: +// { engine: SearchEngine, acceptLanguage: string[] } +``` + +Per-type options are pure fine-tuning, only for the types that need it: a +`queryField` when the default root field (the lowercased plural of the type’s +`name`) is wrong, and a `queryDefaults` policy applied to every query of that +type: + +```ts +const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON), { + types: { + [DATASET.type]: { + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + }), + }, + [PERSON.type]: { queryField: 'people' }, + }, +}); +``` + +Shared types (`LanguageString`, the facet buckets, filter inputs and reference +types such as a common `Agent`) are created once and reused across root types. + +## Serving a subset of the schema + +`types` never filters: every `SearchType` in the schema you pass gets a root +field (options for a type not in the schema are a build-time error). To expose +only part of what you index, narrow the **schema argument** +(`searchSchema(…)` is a cheap constructor): + +```ts +// Index all three types… +projectGraph(quads, searchSchema(DATASET, PERSON, INTERNAL)); + +// …but serve only two. 
+const gqlSchema = buildGraphQLSchema(searchSchema(DATASET, PERSON)); +``` + +## What it builds (per root type) + +- **Output type** (the `SearchType`’s `name`): localized text → best-first `[LanguageString!]!` + (`[0].language` is the language actually served); references → named per-shape + types (`Organization`, `Term`) with a `name`; scalars/booleans per kind; `date` + → ISO 8601 string; nullability from `required` / `array` / `kind`. +- **`where`**: one input per `filterable` field (`StringFilter`, `IntRange` / + `FloatRange` / `DateRange`, or `Boolean`); omitted entirely for a type with no + filterable fields. +- **`orderBy`**: `RELEVANCE` plus every `sortable` field, as an enum. +- **Facets**: a keyed object (e.g. `DatasetFacets`) with one field per `facetable` field — selecting a key requests that facet; range facets return `RangeBucket`s, + all others `ValueBucket`s carrying `value` + `count` + a nullable `label` — the resolved data label for **reference** facets, + `null` for token/free-string facets whose display the consumer owns (its own + i18n, or the value itself). + +## Guarding the contract + +Why the API, the index and a future REST surface cannot drift apart is the +search family’s overall approach — one field model, one query IR — described +in [`@lde/search`](../search/README.md). Specific to this surface: the GraphQL +contract is **frozen** (breaking to change), yet generated rather than +handwritten, so nothing in the repo shows a contract change as a reviewable +diff. A _consumer_ restores that with one snapshot test over its **own** +search schema: + +```ts +import { printGraphQLSchema } from '@lde/search-api-graphql'; + +it('keeps the public GraphQL contract stable', () => { + expect(printGraphQLSchema(searchSchema(DATASET, PERSON))).toMatchSnapshot(); +}); +``` + +The first run writes the emitted SDL to a committed snapshot file; every later +run re-emits and diffs against it.
Any contract change — your own schema edit, +or a new version of this library emitting different GraphQL for the same +declaration — fails the test and shows the SDL diff, until you consciously +accept it (`vitest -u`) and the reviewer sees the contract change spelled out +in the PR. diff --git a/packages/search-api-graphql/eslint.config.mjs b/packages/search-api-graphql/eslint.config.mjs new file mode 100644 index 00000000..2dcaf60c --- /dev/null +++ b/packages/search-api-graphql/eslint.config.mjs @@ -0,0 +1,22 @@ +import baseConfig from '../../eslint.config.mjs'; + +export default [ + ...baseConfig, + { + files: ['**/*.json'], + rules: { + '@nx/dependency-checks': [ + 'error', + { + ignoredFiles: [ + '{projectRoot}/eslint.config.{js,cjs,mjs}', + '{projectRoot}/vite.config.{js,ts,mjs,mts}', + ], + }, + ], + }, + languageOptions: { + parser: await import('jsonc-eslint-parser'), + }, + }, +]; diff --git a/packages/search-api-graphql/package.json b/packages/search-api-graphql/package.json new file mode 100644 index 00000000..3698172b --- /dev/null +++ b/packages/search-api-graphql/package.json @@ -0,0 +1,32 @@ +{ + "name": "@lde/search-api-graphql", + "version": "0.1.0", + "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from a whole SearchSchema at runtime (no codegen) — one root query field per SearchType — served by generic resolvers over any SearchEngine. 
You supply the schema (plus optional per-type options); it names neither your domain nor your engine.", + "repository": { + "url": "git+https://github.com/ldelements/lde.git", + "directory": "packages/search-api-graphql" + }, + "license": "MIT", + "type": "module", + "exports": { + "./package.json": "./package.json", + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "development": "./src/index.ts", + "default": "./dist/index.js" + } + }, + "main": "./dist/index.js", + "module": "./dist/index.js", + "types": "./dist/index.d.ts", + "files": [ + "dist", + "!**/*.tsbuildinfo" + ], + "dependencies": { + "@lde/search": "^0.1.2", + "graphql": "^15.8.0", + "tslib": "^2.3.0" + } +} diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts new file mode 100644 index 00000000..e692bbbd --- /dev/null +++ b/packages/search-api-graphql/src/build-schema.ts @@ -0,0 +1,562 @@ +import { + GraphQLBoolean, + GraphQLEnumType, + GraphQLFloat, + GraphQLInputObjectType, + GraphQLInt, + GraphQLList, + GraphQLNonNull, + GraphQLObjectType, + GraphQLSchema, + GraphQLString, + printSchema, + type GraphQLEnumValueConfigMap, + type GraphQLFieldConfig, + type GraphQLInputFieldConfig, + type GraphQLInputType, + type GraphQLOutputType, +} from 'graphql'; +import { + facetableFields, + filterableFields, + filterOperatorFor, + isRangeFacet, + outputFields, + pageForOffset, + sortableFields, + unixSecondsToIso, + type Filter, + type LocalizedValue, + type SearchEngine, + type SearchField, + type SearchQuery, + type SearchSchema, + type SearchType, +} from '@lde/search'; +import { + defaultLanguageOrder, + toLanguageStrings, + type LanguageOrder, +} from './language.js'; + +/** Populated per request by the transport; no framework type appears here. */ +export interface SearchContext { + readonly engine: SearchEngine; + /** Parsed, ordered `Accept-Language`; drives locale selection and output order.
*/ + readonly acceptLanguage: readonly string[]; + /** + * Called when a single facet's computation fails. The facet degrades to an + * empty list (a supplementary facet must not fail the whole query); supply + * this to log the cause. Optional — omit to swallow silently. + */ + readonly onFacetError?: (field: string, error: unknown) => void; +} + +/** Per-root-type fine-tuning. The type’s name comes from the {@link SearchType} + * itself (`name`); options exist only for what has a sensible default. */ +export interface SearchTypeOptions { + /** Root query field; defaults to the lowercased plural of the type’s `name` + * (e.g. `Dataset` → `datasets`). */ + readonly queryField?: string; + /** Consumer policy applied to every query of this type (default status, sort, + * tie-breaks). */ + readonly queryDefaults?: ( + query: SearchQuery, + context: SearchContext, + ) => SearchQuery; +} + +export interface BuildGraphQLSchemaOptions { + /** Optional fine-tuning per root type, keyed by type IRI (the + * {@link SearchType} `type`). A type without an entry gets the defaults. */ + readonly types?: Readonly>; + /** Output-language ordering; defaults to Accept-Language-first, `und` last. */ + readonly languageOrder?: LanguageOrder; +} + +type Source = Record; + +const nonNullListOf = (type: GraphQLOutputType): GraphQLOutputType => + new GraphQLNonNull(new GraphQLList(new GraphQLNonNull(type))); + +const scalarOutput = ( + scalar: GraphQLOutputType, + field: SearchField, +): GraphQLOutputType => + field.required === true ? new GraphQLNonNull(scalar) : scalar; + +/** SCREAMING_SNAKE_CASE for an enum value name, e.g. `datePosted` → `DATE_POSTED`. */ +function screamingSnake(name: string): string { + return name.replace(/([a-z0-9])([A-Z])/g, '$1_$2').toUpperCase(); +} + +/** + * Construct an executable GraphQL schema from the whole {@link SearchSchema} at + * runtime — no codegen, no SDL artifact. One root query field per + * {@link SearchType} (e.g. 
`datasets`, `people`), each searchable in its own + * way through its own output/`where`/`orderBy`/facet types, while the shared + * types (`LanguageString`, buckets, filter inputs, reference types) are created + * once. One generic resolver per root field maps the arguments to a + * {@link SearchQuery}, calls `context.engine`, and maps the result back; the + * field model only parameterises data. + */ +export function buildGraphQLSchema( + schema: SearchSchema, + options: BuildGraphQLSchemaOptions = {}, +): GraphQLSchema { + const languageOrder = options.languageOrder ?? defaultLanguageOrder; + for (const typeIri of Object.keys(options.types ?? {})) { + if (!schema.has(typeIri)) { + throw new Error( + `Options given for type “${typeIri}”, which is not in the search schema.`, + ); + } + } + + const languageString = new GraphQLObjectType({ + name: 'LanguageString', + fields: { + language: { type: GraphQLString }, + value: { type: new GraphQLNonNull(GraphQLString) }, + }, + }); + // A plain value facet bucket: a selection key, its count, and (for reference + // facets) the engine-resolved data label; null for token/free-string facets + // whose display the consumer owns. + const valueBucket = new GraphQLObjectType({ + name: 'ValueBucket', + fields: { + value: { type: new GraphQLNonNull(GraphQLString) }, + count: { type: new GraphQLNonNull(GraphQLInt) }, + label: { + type: new GraphQLList(new GraphQLNonNull(languageString)), + resolve: (bucket: Source, _args: unknown, context: SearchContext) => { + const label = bucket.label as LocalizedValue | undefined; + return label + ? toLanguageStrings(label, context.acceptLanguage, languageOrder) + : null; + }, + }, + }, + }); + // A numeric range-facet bin: half-open `[min, max)` bounds (max null on an + // open-ended top bin) and the count of documents in it. 
+ const rangeBucket = new GraphQLObjectType({ + name: 'RangeBucket', + fields: { + min: { type: GraphQLFloat }, + max: { type: GraphQLFloat }, + count: { type: new GraphQLNonNull(GraphQLInt) }, + }, + }); + const sortDirection = new GraphQLEnumType({ + name: 'SortDirection', + values: { ASC: { value: 'asc' }, DESC: { value: 'desc' } }, + }); + const stringFilter = new GraphQLInputObjectType({ + name: 'StringFilter', + fields: { + in: { type: new GraphQLList(new GraphQLNonNull(GraphQLString)) }, + }, + }); + const intRange = rangeInput('IntRange', GraphQLInt); + const floatRange = rangeInput('FloatRange', GraphQLFloat); + const dateRange = rangeInput('DateRange', GraphQLString); + + const labelList = ( + resolveLabel: (source: Source) => LocalizedValue | undefined, + ) => ({ + type: nonNullListOf(languageString), + resolve: (source: Source, _args: unknown, context: SearchContext) => { + const value = resolveLabel(source); + return value + ? toLanguageStrings(value, context.acceptLanguage, languageOrder) + : []; + }, + }); + + // One reference type per referenced shape, shared across every root type and + // reused by every field (Person and CreativeWork both referencing Agent yield + // one Agent type). + const referenceTypes = new Map(); + for (const searchType of schema.values()) { + for (const field of outputFields(searchType)) { + if ( + field.kind === 'reference' && + field.ref && + !referenceTypes.has(field.ref.type) + ) { + referenceTypes.set( + field.ref.type, + new GraphQLObjectType({ + name: field.ref.type, + fields: { + id: { type: new GraphQLNonNull(GraphQLString) }, + name: labelList( + (source) => source.label as LocalizedValue | undefined, + ), + }, + }), + ); + } + } + } + + function outputFieldConfig( + field: SearchField, + ): GraphQLFieldConfig { + switch (field.kind) { + case 'text': + return labelList( + (source) => source[field.name] as LocalizedValue | undefined, + ); + case 'keyword': + return field.array === true + ? 
{ + type: nonNullListOf(GraphQLString), + resolve: (s) => s[field.name] ?? [], + } + : { type: scalarOutput(GraphQLString, field) }; + case 'reference': { + const referenceType = referenceTypes.get(field.ref?.type ?? '')!; + return field.array === true + ? { + type: nonNullListOf(referenceType), + resolve: (s) => s[field.name] ?? [], + } + : { + type: + field.required === true + ? new GraphQLNonNull(referenceType) + : referenceType, + }; + } + case 'integer': + return { type: scalarOutput(GraphQLInt, field) }; + case 'number': + return { type: scalarOutput(GraphQLFloat, field) }; + case 'date': + // Stored as Unix seconds (int64); the surface serves ISO 8601 (ADR 4). + return { + type: scalarOutput(GraphQLString, field), + resolve: (source) => { + const value = source[field.name]; + return typeof value === 'number' + ? unixSecondsToIso(value) + : (value ?? null); + }, + }; + case 'boolean': + return { + type: new GraphQLNonNull(GraphQLBoolean), + resolve: (source) => source[field.name] === true, + }; + } + } + + function whereFieldType(field: SearchField): GraphQLInputType { + switch (filterOperatorFor(field.kind)) { + case 'in': + return stringFilter; + case 'range': + return field.kind === 'integer' + ? intRange + : field.kind === 'number' + ? floatRange + : dateRange; + default: + return GraphQLBoolean; + } + } + + /** The root query field for one {@link SearchType}, with its derived types. 
*/ + function rootField( + searchType: SearchType, + typeOptions: SearchTypeOptions | undefined, + ): GraphQLFieldConfig { + const typeName = searchType.name; + + const outputType = new GraphQLObjectType({ + name: typeName, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = { + id: { type: new GraphQLNonNull(GraphQLString) }, + }; + for (const field of outputFields(searchType)) { + fields[field.name] = outputFieldConfig(field); + } + return fields; + }, + }); + + // A GraphQL input object must have at least one field, so a type with no + // filterable fields gets no `where` arg at all rather than an invalid + // empty input. + const filterable = filterableFields(searchType); + const whereInput = + filterable.length === 0 + ? undefined + : new GraphQLInputObjectType({ + name: `${typeName}Where`, + fields: () => { + const fields: Record = {}; + for (const field of filterable) { + fields[field.name] = { type: whereFieldType(field) }; + } + return fields; + }, + }); + + const sortValues: GraphQLEnumValueConfigMap = { + RELEVANCE: { value: 'relevance' }, + }; + for (const field of sortableFields(searchType)) { + sortValues[screamingSnake(field.name)] = { value: field.name }; + } + const sortField = new GraphQLEnumType({ + name: `${typeName}SortField`, + values: sortValues, + }); + const orderByInput = new GraphQLInputObjectType({ + name: `${typeName}OrderBy`, + fields: { + field: { type: new GraphQLNonNull(sortField) }, + direction: { + type: new GraphQLNonNull(sortDirection), + defaultValue: 'desc', + }, + }, + }); + + // Keyed facets object: one field per facetable field, typed by its kind + // (range fields → [RangeBucket!], else [ValueBucket!]). Each field's resolver + // computes that facet with its OWN where-filter removed (skip-own-filter), so a + // multi-select facet still lists its other options; only the selected fields + // are resolved (GraphQL prunes the rest), so the selection IS the request. 
+ // Like `where`, omitted entirely for a type with no facetable fields (a + // GraphQL object type must have at least one field). + const facetable = facetableFields(searchType); + const facetsType = + facetable.length === 0 + ? undefined + : facetsTypeFor(searchType, typeName, facetable); + + const resultType = new GraphQLObjectType({ + name: `${typeName}SearchResult`, + fields: { + items: { type: nonNullListOf(outputType) }, + total: { type: new GraphQLNonNull(GraphQLInt) }, + page: { type: new GraphQLNonNull(GraphQLInt) }, + perPage: { type: new GraphQLNonNull(GraphQLInt) }, + // Resolved lazily, per selected key (skip-own-filter); the result object + // (which carries the resolved `query`) is the facets source. + ...(facetsType && { + facets: { + type: new GraphQLNonNull(facetsType), + resolve: (source: Source) => source, + }, + }), + }, + }); + + return { + type: new GraphQLNonNull(resultType), + args: { + query: { type: GraphQLString }, + ...(whereInput && { where: { type: whereInput } }), + orderBy: { type: orderByInput }, + page: { type: GraphQLInt, defaultValue: 1 }, + perPage: { type: GraphQLInt, defaultValue: 20 }, + }, + resolve: async (_source, args, context: SearchContext) => { + const built = argsToQuery(args as QueryArgs, context, searchType); + const finalQuery = typeOptions?.queryDefaults + ? typeOptions.queryDefaults(built, context) + : built; + // Items + total only; facets are resolved lazily per selected key. + const result = await context.engine.search( + { ...finalQuery, facets: [] }, + searchType, + ); + return { + items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), + total: result.total, + page: pageForOffset(finalQuery.offset, finalQuery.limit), + perPage: finalQuery.limit, + // Carried for the facets resolver (skip-own-filter per key). + query: finalQuery, + }; + }, + }; + } + + /** The keyed facets object for one type (only called with ≥ 1 facetable field). 
*/ + function facetsTypeFor( + searchType: SearchType, + typeName: string, + facetable: readonly SearchField[], + ): GraphQLObjectType { + return new GraphQLObjectType({ + name: `${typeName}Facets`, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = {}; + for (const field of facetable) { + fields[field.name] = { + type: nonNullListOf( + isRangeFacet(field) ? rangeBucket : valueBucket, + ), + resolve: async ( + source: Source, + _args: unknown, + context: SearchContext, + ) => { + const query = source.query as SearchQuery; + // Drop this facet's own filter so its other options still count + // (a removed `status` filter also drops the valid-only default, so + // the status facet counts across every status). + const facetQuery: SearchQuery = { + ...query, + where: query.where.filter( + (filter) => filter.field !== field.name, + ), + facets: [field.name], + limit: 0, + offset: 0, + }; + // A facet is supplementary: degrade a failed facet to an empty list + // rather than failing the whole query (which would null the non-null + // result and discard the items + every other facet). + try { + const result = await context.engine.search( + facetQuery, + searchType, + ); + return result.facets[field.name] ?? []; + } catch (error) { + context.onFacetError?.(field.name, error); + return []; + } + }, + }; + } + return fields; + }, + }); + } + + const queryFields: Record< + string, + GraphQLFieldConfig + > = {}; + for (const searchType of schema.values()) { + const typeOptions = options.types?.[searchType.type]; + const typeName = searchType.name; + const queryField = + typeOptions?.queryField ?? 
+ `${typeName.charAt(0).toLowerCase()}${typeName.slice(1)}s`; + if (queryField in queryFields) { + throw new Error( + `Duplicate root query field “${queryField}”; set queryField to disambiguate.`, + ); + } + queryFields[queryField] = rootField(searchType, typeOptions); + } + + return new GraphQLSchema({ + query: new GraphQLObjectType({ name: 'Query', fields: queryFields }), + }); +} + +/** + * The SDL of the built schema. Not a shipped artifact — a consumer uses it for an + * optional CI snapshot test over its own schema, catching accidental breaking + * changes to its frozen contract (including a `buildGraphQLSchema` change in a + * future version of this library silently altering it). + */ +export function printGraphQLSchema( + schema: SearchSchema, + options: BuildGraphQLSchemaOptions = {}, +): string { + return printSchema(buildGraphQLSchema(schema, options)); +} + +interface QueryArgs { + readonly query?: string; + readonly where?: Record; + readonly orderBy?: { field: string; direction: 'asc' | 'desc' }; + readonly page?: number; + readonly perPage?: number; +} + +/** Pure args → {@link SearchQuery} mapping. */ +function argsToQuery( + args: QueryArgs, + context: SearchContext, + searchType: SearchType, +): SearchQuery { + const perPage = args.perPage ?? 20; + const page = args.page ?? 1; + return { + text: args.query, + where: whereToFilters(args.where, searchType), + orderBy: args.orderBy + ? [{ field: args.orderBy.field, direction: args.orderBy.direction }] + : [], + limit: perPage, + offset: (page - 1) * perPage, + // Facets are requested per-key by the facets resolver, not via an arg. + facets: [], + locale: context.acceptLanguage[0] ?? 
'und', + }; +} + +function whereToFilters( + where: Record | undefined, + searchType: SearchType, +): Filter[] { + if (where === undefined) { + return []; + } + const filters: Filter[] = []; + for (const field of filterableFields(searchType)) { + const value = where[field.name]; + if (value === undefined || value === null) { + continue; + } + switch (filterOperatorFor(field.kind)) { + case 'in': + filters.push({ + field: field.name, + in: (value as { in?: string[] }).in ?? [], + }); + break; + case 'range': { + const range = value as { min?: number | string; max?: number | string }; + filters.push({ + field: field.name, + range: { min: range.min, max: range.max }, + }); + break; + } + default: + filters.push({ field: field.name, is: value as boolean }); + } + } + return filters; +} + +function rangeInput( + name: string, + bound: typeof GraphQLInt | typeof GraphQLFloat | typeof GraphQLString, +): GraphQLInputObjectType { + return new GraphQLInputObjectType({ + name, + fields: { min: { type: bound }, max: { type: bound } }, + }); +} diff --git a/packages/search-api-graphql/src/index.ts b/packages/search-api-graphql/src/index.ts new file mode 100644 index 00000000..0dfe1adf --- /dev/null +++ b/packages/search-api-graphql/src/index.ts @@ -0,0 +1,8 @@ +export { buildGraphQLSchema, printGraphQLSchema } from './build-schema.js'; +export type { + SearchContext, + BuildGraphQLSchemaOptions, + SearchTypeOptions, +} from './build-schema.js'; +export { defaultLanguageOrder } from './language.js'; +export type { LanguageString, LanguageOrder } from './language.js'; diff --git a/packages/search-api-graphql/src/language.ts b/packages/search-api-graphql/src/language.ts new file mode 100644 index 00000000..96826f65 --- /dev/null +++ b/packages/search-api-graphql/src/language.ts @@ -0,0 +1,47 @@ +import type { LocalizedValue } from '@lde/search'; + +/** One entry of the surface’s best-first `[LanguageString!]!`. 
`language` is null + * for untagged (`und`) values; `[0]` is the value to display and `[0].language` + * is the language actually served (the per-field `Content-Language`). */ +export interface LanguageString { + readonly language: string | null; + readonly value: string; +} + +/** Orders a localized value’s available languages against the request. */ +export type LanguageOrder = ( + available: readonly string[], + accept: readonly string[], +) => readonly string[]; + +/** + * Default ordering: requested languages first (in request order), then the + * remaining tagged languages, then untagged (`und`) last — so `[0]` is always the + * best available value. + */ +export const defaultLanguageOrder: LanguageOrder = (available, accept) => { + const requested = accept.filter((language) => available.includes(language)); + const rest = available.filter( + (language) => language !== 'und' && !requested.includes(language), + ); + const untagged = available.includes('und') ? ['und'] : []; + return [...requested, ...rest, ...untagged]; +}; + +/** Flatten a language map into a best-first `LanguageString` list. */ +export function toLanguageStrings( + value: LocalizedValue, + accept: readonly string[], + order: LanguageOrder, +): LanguageString[] { + const result: LanguageString[] = []; + for (const language of order(Object.keys(value), accept)) { + for (const text of value[language] ?? []) { + result.push({ + language: language === 'und' ? 
null : language, + value: text, + }); + } + } + return result; +} diff --git a/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap new file mode 100644 index 00000000..63bc19de --- /dev/null +++ b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap @@ -0,0 +1,101 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`GraphQL generator stability > emits a stable SDL for a representative schema 1`] = ` +"type Query { + things(query: String, where: ThingWhere, orderBy: ThingOrderBy, page: Int = 1, perPage: Int = 20): ThingSearchResult! +} + +type ThingSearchResult { + items: [Thing!]! + total: Int! + page: Int! + perPage: Int! + facets: ThingFacets! +} + +type Thing { + id: String! + title: [LanguageString!]! + description: [LanguageString!]! + keyword: [String!]! + creator: [Agent!]! + publisher: Agent + size: Int + score: Float + created: String + status: String! + open: Boolean! +} + +type LanguageString { + language: String + value: String! +} + +type Agent { + id: String! + name: [LanguageString!]! +} + +type ThingFacets { + keyword: [ValueBucket!]! + creator: [ValueBucket!]! + publisher: [ValueBucket!]! + status: [ValueBucket!]! + open: [ValueBucket!]! +} + +type ValueBucket { + value: String! + count: Int! + label: [LanguageString!] +} + +input ThingWhere { + keyword: StringFilter + creator: StringFilter + publisher: StringFilter + size: IntRange + score: FloatRange + created: DateRange + status: StringFilter + open: Boolean +} + +input StringFilter { + in: [String!] +} + +input IntRange { + min: Int + max: Int +} + +input FloatRange { + min: Float + max: Float +} + +input DateRange { + min: String + max: String +} + +input ThingOrderBy { + field: ThingSortField! + direction: SortDirection! 
= DESC +} + +enum ThingSortField { + RELEVANCE + TITLE + SIZE + CREATED +} + +enum SortDirection { + ASC + DESC +} +" +`; diff --git a/packages/search-api-graphql/test/build-schema.test.ts b/packages/search-api-graphql/test/build-schema.test.ts new file mode 100644 index 00000000..9a9502b2 --- /dev/null +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -0,0 +1,625 @@ +import { describe, expect, it } from 'vitest'; +import { graphql, printSchema } from 'graphql'; +import { + searchSchema, + type SearchEngine, + type SearchQuery, + type SearchResult, + type SearchType, +} from '@lde/search'; +import { buildGraphQLSchema, type SearchContext } from '../src/build-schema.js'; + +const schema: SearchType = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'Organization', strategy: 'labelOnly' }, + }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + facetable: true, + output: true, + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10 }, + ], + }, + { name: 'datePosted', kind: 'date', sortable: true, output: true }, + { name: 'score', kind: 'number', output: true }, + { + name: 'terminologySource', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'Term', strategy: 'labelOnly' }, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + required: true, + output: true, + }, + { + name: 'iiif', + kind: 'boolean', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +/** A fake engine that records the query it received and returns a 
canned result. */ +function fakeEngine(result: SearchResult): { + engine: SearchEngine; + received: () => SearchQuery; +} { + let captured: SearchQuery; + return { + engine: { + async search(query) { + captured = query; + return result; + }, + }, + received: () => captured, + }; +} + +const canned: SearchResult = { + total: 1, + hits: [ + { + id: 'https://d/1', + document: { + title: { nl: ['Titel'], en: ['Title'] }, + keyword: ['kaarten'], + publisher: { + id: 'https://org/1', + label: { nl: ['Het Utrechts Archief'] }, + }, + size: 1234, + datePosted: 1_700_000_000, + score: 4.5, + terminologySource: [ + { id: 'https://term/1', label: { nl: ['Kaarten'] } }, + ], + status: 'valid', + iiif: true, + }, + }, + ], + facets: { keyword: [{ value: 'kaarten', count: 3 }] }, +}; + +const datasetOptions = {}; + +async function run( + source: string, + context: SearchContext, + variables?: Record, +) { + return graphql({ + schema: buildGraphQLSchema(searchSchema(schema), datasetOptions), + source, + contextValue: context, + variableValues: variables, + }); +} + +describe('buildGraphQLSchema', () => { + it('resolves a query, mapping the result to the typed output', async () => { + const { engine, received } = fakeEngine(canned); + const result = await run( + `{ + datasets(query: "kaart") { + total + page + perPage + items { + id + title { language value } + keyword + publisher { id name { language value } } + terminologySource { id name { language value } } + size + datePosted + score + status + iiif + } + facets { keyword { value count } } + } + }`, + { engine, acceptLanguage: ['nl'] }, + ); + + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.total).toBe(1); + expect(data.page).toBe(1); + const item = (data.items as Record[])[0]; + expect(item.id).toBe('https://d/1'); + expect(item.title).toEqual([ + { language: 'nl', value: 'Titel' }, + { language: 'en', value: 'Title' }, + ]); + expect(item.keyword).toEqual(['kaarten']); 
+ expect(item.publisher).toEqual({ + id: 'https://org/1', + name: [{ language: 'nl', value: 'Het Utrechts Archief' }], + }); + expect(item.size).toBe(1234); + expect(item.datePosted).toBe('2023-11-14T22:13:20.000Z'); + expect(item.score).toBe(4.5); + expect(item.terminologySource).toEqual([ + { id: 'https://term/1', name: [{ language: 'nl', value: 'Kaarten' }] }, + ]); + expect(item.iiif).toBe(true); + expect(data.facets).toEqual({ + keyword: [{ value: 'kaarten', count: 3 }], + }); + // The free-text arg became the query text. + expect(received().text).toBe('kaart'); + }); + + it('orders the output list best-first for the requested language', async () => { + const { engine } = fakeEngine(canned); + const result = await run( + `{ datasets { items { title { language value } } } }`, + { engine, acceptLanguage: ['en'] }, + ); + const item = ( + (result.data?.datasets as Record).items as Record< + string, + unknown + >[] + )[0]; + expect(item.title).toEqual([ + { language: 'en', value: 'Title' }, + { language: 'nl', value: 'Titel' }, + ]); + }); + + it('places untagged (und) values last with a null language', async () => { + const { engine } = fakeEngine({ + total: 1, + facets: {}, + hits: [ + { + id: 'x', + document: { title: { nl: ['Titel'], und: ['Naamloos'] } }, + }, + ], + }); + const result = await run( + `{ datasets { items { title { language value } datePosted } } }`, + { engine, acceptLanguage: ['en'] }, + ); + const item = ( + (result.data?.datasets as Record).items as Record< + string, + unknown + >[] + )[0]; + expect(item.title).toEqual([ + { language: 'nl', value: 'Titel' }, + { language: null, value: 'Naamloos' }, + ]); + // An absent date resolves to null (the non-numeric branch). 
+ expect(item.datePosted).toBeNull(); + }); + + it('labels reference-facet buckets, leaving plain-facet buckets null', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { + publisher: [ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + ], + keyword: [{ value: 'kaarten', count: 3 }], + }, + }); + const result = await run( + `{ datasets { facets { + publisher { value count label { language value } } + keyword { value count label { language value } } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as { + publisher: unknown[]; + keyword: unknown[]; + }; + expect(facets.publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: [{ language: 'nl', value: 'Het Utrechts Archief' }], + }, + ]); + expect(facets.keyword).toEqual([ + { value: 'kaarten', count: 3, label: null }, + ]); + }); + + it('exposes range-facet bucket bounds, null for value facets and open ends', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { + size: [ + { value: '0', count: 2, min: 1, max: 10 }, + // Open-ended top bin: lower bound only. + { value: '1', count: 5, min: 10 }, + ], + keyword: [{ value: 'kaarten', count: 3 }], + }, + }); + const result = await run( + `{ datasets { facets { + size { min max count } + keyword { value count } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as { + size: unknown[]; + keyword: unknown[]; + }; + // RangeBuckets carry their half-open bounds (max null = open-ended top bin). + expect(facets.size).toEqual([ + { min: 1, max: 10, count: 2 }, + { min: 10, max: null, count: 5 }, + ]); + // A value facet's ValueBuckets carry no bounds. 
+ expect(facets.keyword).toEqual([{ value: 'kaarten', count: 3 }]); + }); + + it('resolves every selected facet key, returning [] where the engine has none', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { keyword: [{ value: 'kaarten', count: 1 }] }, + }); + const result = await run( + `{ datasets { facets { + keyword { value count } + publisher { value count } + terminologySource { value count } + status { value count } + iiif { value count } + size { min max count } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as Record; + expect(facets.keyword).toEqual([{ value: 'kaarten', count: 1 }]); + // Keys the engine returned nothing for resolve to an empty list. + for (const key of [ + 'publisher', + 'terminologySource', + 'status', + 'iiif', + 'size', + ]) { + expect(facets[key]).toEqual([]); + } + }); + + it('computes a facet with its own where-filter removed (skip-own-filter)', async () => { + const { engine, received } = fakeEngine({ + total: 0, + hits: [], + facets: { keyword: [{ value: 'kaarten', count: 1 }] }, + }); + await run( + `{ datasets(where: { keyword: { in: ["x"] }, status: { in: ["valid"] } }) { + facets { keyword { value count } } + } }`, + { engine, acceptLanguage: ['nl'] }, + ); + // The keyword facet query is run with the keyword filter dropped (so its + // other options still count), but other filters (status) retained. + const facetQuery = received(); + expect(facetQuery.facets).toEqual(['keyword']); + expect( + facetQuery.where.find((filter) => filter.field === 'keyword'), + ).toBeUndefined(); + expect(facetQuery.where).toContainEqual({ field: 'status', in: ['valid'] }); + }); + + it('degrades a failed facet to an empty list without failing the whole query', async () => { + // A facet is supplementary: its computation runs a separate search (with + // `facets` set). Fail only that, leaving the listing search untouched. 
+ const failedFacets: string[] = []; + const engine: SearchEngine = { + async search(query) { + if (query.facets.length > 0) { + throw new Error('facet backend unavailable'); + } + return canned; + }, + }; + const result = await run( + `{ datasets { + total + items { id } + facets { keyword { value count } } + } }`, + { + engine, + acceptLanguage: ['nl'], + onFacetError: (field) => failedFacets.push(field), + }, + ); + + // No top-level error: the failed facet degraded rather than nulling the + // non-null result and discarding the items. + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.total).toBe(1); + expect((data.items as Record[])[0].id).toBe('https://d/1'); + // The failed facet degraded to an empty list, and the cause was reported. + expect((data.facets as Record).keyword).toEqual([]); + expect(failedFacets).toEqual(['keyword']); + }); + + it('guards perPage: 0, resolving page to 1 rather than failing on NaN', async () => { + const { engine } = fakeEngine(canned); + const result = await run(`{ datasets(perPage: 0) { page total } }`, { + engine, + acceptLanguage: ['nl'], + }); + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.page).toBe(1); + }); + + it('maps where, orderBy and pagination into the SearchQuery', async () => { + const { engine, received } = fakeEngine(canned); + await run( + `{ + datasets( + where: { status: { in: ["valid"] }, keyword: {}, size: { min: 1, max: 9 }, iiif: true } + orderBy: { field: SIZE, direction: ASC } + page: 3 + perPage: 10 + ) { total } + }`, + { engine, acceptLanguage: ['nl'] }, + ); + + const query = received(); + expect(query.where).toContainEqual({ field: 'status', in: ['valid'] }); + // An empty StringFilter compiles to an empty membership. 
+ expect(query.where).toContainEqual({ field: 'keyword', in: [] }); + expect(query.where).toContainEqual({ + field: 'size', + range: { min: 1, max: 9 }, + }); + expect(query.where).toContainEqual({ field: 'iiif', is: true }); + expect(query.orderBy).toEqual([{ field: 'size', direction: 'asc' }]); + // Facets are requested per key via selection, not an arg; the listing query + // carries none. + expect(query.facets).toEqual([]); + expect(query.limit).toBe(10); + expect(query.offset).toBe(20); + }); + + it('falls back to the und locale when no Accept-Language is given', async () => { + const { engine, received } = fakeEngine(canned); + await run(`{ datasets { total } }`, { engine, acceptLanguage: [] }); + expect(received().locale).toBe('und'); + }); + + it('applies queryDefaults before calling the engine', async () => { + let captured: SearchQuery | undefined; + const engine: SearchEngine = { + async search(query) { + captured = query; + return canned; + }, + }; + const gqlSchema = buildGraphQLSchema(searchSchema(schema), { + types: { + [schema.type]: { + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + orderBy: [{ field: 'relevance', direction: 'desc' }], + }), + }, + }, + }); + await graphql({ + schema: gqlSchema, + source: `{ datasets { total } }`, + contextValue: { engine, acceptLanguage: ['nl'] }, + }); + expect(captured?.where).toEqual([{ field: 'status', in: ['valid'] }]); + expect(captured?.orderBy).toEqual([ + { field: 'relevance', direction: 'desc' }, + ]); + }); + + it('derives nullability: required scalar non-null, optional scalar nullable, arrays/booleans non-null', () => { + const sdl = printSchema( + buildGraphQLSchema(searchSchema(schema), datasetOptions), + ); + expect(sdl).toMatch(/status: String!/); // required + expect(sdl).toMatch(/size: Int\b(?!!)/); // optional → nullable + expect(sdl).toMatch(/title: \[LanguageString!\]!/); + expect(sdl).toMatch(/keyword: \[String!\]!/); + 
expect(sdl).toMatch(/iiif: Boolean!/); + expect(sdl).toMatch(/publisher: Organization\b(?!!)/); // optional reference + }); + + it('builds the where, orderBy enum and keyed facets object from the field model', () => { + const sdl = printSchema( + buildGraphQLSchema(searchSchema(schema), datasetOptions), + ); + expect(sdl).toMatch(/enum DatasetSortField/); + expect(sdl).toMatch(/RELEVANCE/); + expect(sdl).toMatch(/SIZE/); + // Facets are a keyed object, one field per facetable field, typed by kind. + expect(sdl).toMatch(/type DatasetFacets/); + expect(sdl).toMatch(/keyword: \[ValueBucket!\]!/); + expect(sdl).toMatch(/size: \[RangeBucket!\]!/); + expect(sdl).toMatch(/input DatasetWhere/); + expect(sdl).toMatch(/status: StringFilter/); + expect(sdl).toMatch(/size: IntRange/); + }); + + describe('multiple root types in one schema', () => { + const PERSON: SearchType = { + name: 'Person', + type: 'https://schema.org/Person', + fields: [ + { + name: 'name', + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'affiliation', + kind: 'reference', + facetable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + ], + }; + const CREATIVE_WORK: SearchType = { + name: 'CreativeWork', + type: 'https://schema.org/CreativeWork', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + searchable: { weight: 5 }, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { name: 'pageCount', kind: 'integer', filterable: true, output: true }, + ], + }; + const twoTypeSchema = buildGraphQLSchema( + searchSchema(PERSON, CREATIVE_WORK), + { + types: { + [PERSON.type]: { queryField: 'people' }, + }, + }, + ); + + it('exposes one root field per type, each with its own derived types', () => { + const sdl = printSchema(twoTypeSchema); + 
expect(sdl).toMatch(/people\([\s\S]*?\): PersonSearchResult!/); + expect(sdl).toMatch( + /creativeWorks\([\s\S]*?\): CreativeWorkSearchResult!/, + ); + expect(sdl).toMatch(/enum PersonSortField/); + expect(sdl).toMatch(/input CreativeWorkWhere/); + // Person has no filterable fields, so it gets no `where` arg (an empty + // input object would be invalid GraphQL) — CreativeWork keeps its own. + expect(sdl).not.toMatch(/PersonWhere/); + // The shared reference shape is emitted once, reused by both types. + expect(sdl.match(/^type Agent /gm)).toHaveLength(1); + }); + + it('routes each root field to its own search type', async () => { + const searchedTypes: string[] = []; + const engine: SearchEngine = { + async search(_query, searchType) { + searchedTypes.push(searchType.type); + return { total: 0, hits: [], facets: {} }; + }, + }; + const result = await graphql({ + schema: twoTypeSchema, + source: `{ people { total } creativeWorks { total } }`, + contextValue: { engine, acceptLanguage: ['nl'] }, + }); + expect(result.errors).toBeUndefined(); + expect(searchedTypes).toEqual([PERSON.type, CREATIVE_WORK.type]); + }); + + it('builds without any options: names come from the search types', () => { + const sdl = printSchema( + buildGraphQLSchema(searchSchema(PERSON, CREATIVE_WORK)), + ); + expect(sdl).toMatch(/persons\([\s\S]*?\): PersonSearchResult!/); + expect(sdl).toMatch( + /creativeWorks\([\s\S]*?\): CreativeWorkSearchResult!/, + ); + }); + + it('throws on options for an unknown type and on a root-field clash', () => { + expect(() => + buildGraphQLSchema(searchSchema(PERSON), { + types: { + 'https://schema.org/Unknown': { queryField: 'unknowns' }, + }, + }), + ).toThrow(/not in the search schema/); + expect(() => + buildGraphQLSchema(searchSchema(PERSON, CREATIVE_WORK), { + types: { + [PERSON.type]: { queryField: 'items' }, + [CREATIVE_WORK.type]: { queryField: 'items' }, + }, + }), + ).toThrow(/Duplicate root query field/); + }); + }); +}); diff --git 
a/packages/search-api-graphql/test/generator-stability.test.ts b/packages/search-api-graphql/test/generator-stability.test.ts new file mode 100644 index 00000000..8d5aaa26 --- /dev/null +++ b/packages/search-api-graphql/test/generator-stability.test.ts @@ -0,0 +1,98 @@ +import { describe, expect, it } from 'vitest'; +import { searchSchema, type SearchType } from '@lde/search'; +import { printGraphQLSchema } from '../src/build-schema.js'; + +/** + * A neutral fixture exercising every kind + capability — NOT a real domain. Its + * SDL is snapshotted purely to pin the **generator**: any change to how + * `buildGraphQLSchema` maps the field model (nullability, type names, enums, + * reference reuse) surfaces as a snapshot diff before this library is published, + * so a consumer’s contract can’t shift from under it by accident. + */ +const THING: SearchType = { + name: 'Thing', + type: 'https://example.org/Thing', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + required: true, + }, + { + name: 'description', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 2 }, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + output: true, + }, + // Two references sharing a shape → the Agent type is emitted once and reused. 
+ { + name: 'creator', + kind: 'reference', + array: true, + facetable: true, + filterable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + output: true, + }, + { name: 'score', kind: 'number', filterable: true, output: true }, + { + name: 'created', + kind: 'date', + filterable: true, + sortable: true, + output: true, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + required: true, + output: true, + }, + { + name: 'open', + kind: 'boolean', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +describe('GraphQL generator stability', () => { + it('emits a stable SDL for a representative schema', () => { + expect(printGraphQLSchema(searchSchema(THING))).toMatchSnapshot(); + }); +}); diff --git a/packages/search-api-graphql/tsconfig.json b/packages/search-api-graphql/tsconfig.json new file mode 100644 index 00000000..62ebbd94 --- /dev/null +++ b/packages/search-api-graphql/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "../../tsconfig.base.json", + "files": [], + "include": [], + "references": [ + { + "path": "./tsconfig.lib.json" + }, + { + "path": "./tsconfig.spec.json" + } + ] +} diff --git a/packages/search-api-graphql/tsconfig.lib.json b/packages/search-api-graphql/tsconfig.lib.json new file mode 100644 index 00000000..64610bac --- /dev/null +++ b/packages/search-api-graphql/tsconfig.lib.json @@ -0,0 +1,26 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "src", + "outDir": "dist", + "tsBuildInfoFile": "dist/tsconfig.lib.tsbuildinfo", + "emitDeclarationOnly": false, + "types": ["node"] + }, + "include": ["src/**/*.ts"], + "references": [{ "path": "../search/tsconfig.lib.json" }], + "exclude": [ + "vite.config.ts", + 
"vite.config.mts", + "vitest.config.ts", + "vitest.config.mts", + "test/**/*.test.ts", + "test/**/*.spec.ts", + "test/**/*.test.tsx", + "test/**/*.spec.tsx", + "test/**/*.test.js", + "test/**/*.spec.js", + "test/**/*.test.jsx", + "test/**/*.spec.jsx" + ] +} diff --git a/packages/search-api-graphql/tsconfig.spec.json b/packages/search-api-graphql/tsconfig.spec.json new file mode 100644 index 00000000..04480f69 --- /dev/null +++ b/packages/search-api-graphql/tsconfig.spec.json @@ -0,0 +1,29 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "outDir": "./out-tsc/vitest", + "types": [ + "vitest/globals", + "vitest/importMeta", + "vite/client", + "node", + "vitest" + ] + }, + "include": [ + "test/**/*.test.ts", + "test/**/*.spec.ts", + "test/**/*.test.tsx", + "test/**/*.spec.tsx", + "test/**/*.test.js", + "test/**/*.spec.js", + "test/**/*.test.jsx", + "test/**/*.spec.jsx", + "test/**/*.d.ts" + ], + "references": [ + { + "path": "./tsconfig.lib.json" + } + ] +} diff --git a/packages/search-api-graphql/vite.config.ts b/packages/search-api-graphql/vite.config.ts new file mode 100644 index 00000000..9baf3fd5 --- /dev/null +++ b/packages/search-api-graphql/vite.config.ts @@ -0,0 +1,21 @@ +/// +import { defineConfig, mergeConfig } from 'vite'; +import baseConfig from '../../vite.base.config.js'; + +export default mergeConfig( + baseConfig, + defineConfig({ + root: __dirname, + cacheDir: '../../node_modules/.vite/packages/search-api-graphql', + test: { + coverage: { + thresholds: { + functions: 100, + lines: 100, + branches: 90.42, + statements: 100, + }, + }, + }, + }), +); diff --git a/packages/search-typesense/README.md b/packages/search-typesense/README.md index b5d62bb9..8e21142c 100644 --- a/packages/search-typesense/README.md +++ b/packages/search-typesense/README.md @@ -1,22 +1,45 @@ # @lde/search-typesense -[Typesense](https://typesense.org/) engine adapter for RDF-backed search -pipelines. 
Engine-specific (Typesense) but domain-agnostic – the caller supplies -the collection schema and documents. +[Typesense](https://typesense.org/) engine adapter for the engine- and +domain-agnostic [`@lde/search`](../search) core. **Engine-specific (Typesense) but +domain-agnostic** – you supply a `SearchType`; this package never names your +domain. It is the Typesense implementation of the `SearchEngine` port: it derives +a collection schema from the field model, compiles the neutral `SearchQuery` into +Typesense search params, runs it, reconstructs the engine-neutral `SearchResult`, +and manages the search index lifecycle (blue/green rebuild). -The engine-agnostic half of the pipeline – framing `CONSTRUCT` quads into a -JSON-LD IR and projecting that IR into flat documents from a declarative field -spec – lives in [`@lde/search`](../search). This package consumes those -documents and writes them to Typesense. +## Collection schema and engine + +`buildCollectionSchema(searchType, { name, defaultSortingField, … })` derives a +Typesense collection from the unified `SearchField` model — the Typesense field +type comes from each field’s `kind`, and the physical fanout (per-locale +search/sort keys) matches what the projection writes, via +`@lde/search`’s `physicalFields`, so the index and the documents cannot drift. + +`createTypesenseSearchEngine(client, { collection, labelsCollection })` is the +`SearchEngine` implementation. Each search: + +- validates the query against the search type (the port contract — a + structurally invalid query is rejected, never sent); +- compiles it into Typesense search params (`buildSearchParams`); +- runs the search; +- resolves reference (and reference-facet) labels from the sidecar `labels` + collection in a single lookup; +- reconstructs the logical `SearchResult` (`parseSearchResponse`) — language + maps, labelled references, labelled facet buckets. 
+ +The pure halves `buildSearchParams` and `parseSearchResponse` are exported for +direct use and testing. ## Indexing -`rebuild` blue/green-rebuilds a search index in one call: it creates a fresh -versioned collection (`${schema.name}_`), streams the documents into -it in batches, atomically repoints the `schema.name` alias to it, then drops the -collection it superseded. The caller passes only the logical index name (as -`schema.name`) and a stream of documents; the versioned collection and the alias -are managed for them. +`rebuild` blue/green-rebuilds a search index in one call, straight from the +declaration: it derives the collection schema from your `SearchType` (via +`buildCollectionSchema`), creates a fresh versioned collection +(`${name}_`), streams the documents into it in batches, atomically +repoints the `name` alias to it, then drops the collection it superseded. The +caller passes the `SearchType`, the logical index `name` and a stream of +documents; the versioned collection and the alias are managed for them. ```ts import { Client } from 'typesense'; @@ -30,9 +53,13 @@ const client = new Client({ // `documents` is an async iterable (e.g. a streaming projection); only one // batch is held in memory at a time. `rebuild` returns the live collection name // and the imported count (or `null` if another rebuild was already running). -const result = await rebuild(client, schema, documents); +const result = await rebuild(client, documents, DATASET, { name: 'datasets' }); ``` +The options accept everything `buildCollectionSchema` does (`defaultLocale`, +`defaultSortingField`, `synonymSets`) plus the rebuild knobs (`batchSize`, +`lockTtlMs`). + `rebuild` takes a `Client` the caller owns (and reuses for queries), so this package adds no connection or document type of its own – any object with an `id` is a valid document, including the `SearchDocument`s `@lde/search` produces. 
diff --git a/packages/search-typesense/package.json b/packages/search-typesense/package.json index b1dde852..445624fb 100644 --- a/packages/search-typesense/package.json +++ b/packages/search-typesense/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search-typesense", "version": "0.1.1", - "description": "Generic Typesense engine adapter for RDF-backed search pipelines: collection lifecycle, bulk upsert and blue/green alias swap", + "description": "Typesense implementation of the @lde/search SearchEngine port: collection-schema builder, query compiler, label-resolving result reconstruction, and blue/green index lifecycle. Engine-specific (Typesense) but domain-agnostic.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search-typesense" @@ -25,6 +25,8 @@ "!**/*.tsbuildinfo" ], "dependencies": { + "@lde/search": "^0.1.2", + "@lde/text-normalization": "^0.1.1", "tslib": "^2.3.0", "typesense": "^3.0.6" }, diff --git a/packages/search-typesense/src/adapter.ts b/packages/search-typesense/src/adapter.ts index ad3bfc9c..8aa3b11d 100644 --- a/packages/search-typesense/src/adapter.ts +++ b/packages/search-typesense/src/adapter.ts @@ -1,12 +1,29 @@ -import type { Client, CollectionCreateSchema, ImportResponse } from 'typesense'; +import type { Client, ImportResponse } from 'typesense'; +import type { SearchType } from '@lde/search'; +import { + buildCollectionSchema, + type CollectionSchemaOptions, +} from './collection-schema.js'; const LOCK_COLLECTION = 'rebuild_locks'; const DEFAULT_LOCK_TTL_MS = 10 * 60 * 1000; +/** {@link rebuild} options: the collection-schema options (`name` is the + * logical index name the alias is kept on) plus the rebuild tuning knobs. */ +export interface RebuildOptions extends CollectionSchemaOptions { + /** Documents imported per Typesense request (default 1000). */ + readonly batchSize?: number; + /** A held lock older than this (ms) is reclaimed (default 10 minutes). 
*/ + readonly lockTtlMs?: number; +} + /** - * Blue/green-rebuild the search index `name`. + * Blue/green-rebuild the search index `options.name` from one declarative + * source: the collection schema is derived from `searchType` + * ({@link buildCollectionSchema}) and the documents are streamed in — one call + * from declaration to live index. * - * 1. create a fresh versioned collection (`${name}_`) from `schema` + * 1. create a fresh versioned collection (`${name}_`) * 2. stream `documents` into it in batches * 3. atomically repoint the `name` alias to the new collection, then * drop the collection it superseded. The caller passes only the logical @@ -40,17 +57,17 @@ const DEFAULT_LOCK_TTL_MS = 10 * 60 * 1000; */ export async function rebuild( client: Client, - schema: CollectionCreateSchema, documents: AsyncIterable, - options: { - /** Documents imported per Typesense request (default 1000). */ - batchSize?: number; - /** A held lock older than this (ms) is reclaimed (default 10 minutes). 
*/ - lockTtlMs?: number; - } = {}, + searchType: SearchType, + options: RebuildOptions, ): Promise<{ collection: string; imported: number } | null> { - const { batchSize = 1000, lockTtlMs = DEFAULT_LOCK_TTL_MS } = options; - const name = schema.name; + const { + batchSize = 1000, + lockTtlMs = DEFAULT_LOCK_TTL_MS, + ...schemaOptions + } = options; + const schema = buildCollectionSchema(searchType, schemaOptions); + const name = schemaOptions.name; if (!(await acquireLock(client, name, lockTtlMs))) { return null; } diff --git a/packages/search-typesense/src/collection-schema.ts b/packages/search-typesense/src/collection-schema.ts new file mode 100644 index 00000000..d0c1bf9e --- /dev/null +++ b/packages/search-typesense/src/collection-schema.ts @@ -0,0 +1,136 @@ +import type { CollectionCreateSchema } from 'typesense'; +import type { CollectionFieldSchema } from 'typesense/lib/Typesense/Collection.js'; +import { physicalFields, type SearchField, type SearchType } from '@lde/search'; + +/** Deployment-specific options the generic field model does not carry. */ +export interface CollectionSchemaOptions { + /** The Typesense collection (or alias) name. */ + readonly name: string; + /** Snowball stemming locale for non-localized searchable fields (e.g. `en`). + * Unset, those fields are not stemmed — folding still applies — so no + * language is ever assumed. Localized text search fields always stem in + * their own locale. */ + readonly defaultLocale?: string; + /** The field Typesense sorts by when a query imposes no order. */ + readonly defaultSortingField?: string; + /** Synonym sets the collection references (synced separately). */ + readonly synonymSets?: readonly string[]; +} + +/** + * Build a Typesense collection schema from the unified {@link SearchType}, so + * the index and the projection are driven by one declarative source and cannot + * drift. 
Each field fans out into the same physical fields the projection writes + * ({@link physicalFields}); the Typesense field type is derived from the field + * `kind`, never re-declared. + * + * Localized text stems each folded `*_search_${locale}` field in its own + * language; a non-localized searchable field stems in `defaultLocale` when one + * is set, and is left unstemmed (folded only) otherwise. + */ +export function buildCollectionSchema( + searchType: SearchType, + options: CollectionSchemaOptions, +): CollectionCreateSchema { + const { defaultLocale } = options; + const collection: CollectionCreateSchema = { + name: options.name, + fields: searchType.fields.flatMap((field) => + typesenseFields(field, defaultLocale, options.defaultSortingField), + ), + }; + if (options.defaultSortingField !== undefined) { + collection.default_sorting_field = options.defaultSortingField; + } + if (options.synonymSets !== undefined) { + collection.synonym_sets = [...options.synonymSets]; + } + return collection; +} + +/** The physical Typesense fields one declaration produces. */ +function typesenseFields( + field: SearchField, + defaultLocale: string | undefined, + defaultSortingField: string | undefined, +): CollectionFieldSchema[] { + const names = physicalFields(field); + if (field.kind === 'text' && field.localized === true) { + const locales = field.locales ?? []; + return [ + // Display labels: stored, not indexed for search (search uses the folded + // companions), accents preserved. + ...names.display.map( + (name): CollectionFieldSchema => ({ + name, + type: 'string', + index: false, + optional: true, + }), + ), + // One folded search field per locale, each stemmed in its own language. 
+ ...names.search.map( + (name, index): CollectionFieldSchema => ({ + name, + type: 'string', + optional: true, + stem: true, + locale: locales[index], + }), + ), + ...names.sort.map( + (name): CollectionFieldSchema => ({ + name, + type: 'string', + sort: true, + optional: true, + }), + ), + ]; + } + + const valueType = typesenseValueType(field); + const fields: CollectionFieldSchema[] = [ + { + name: field.name, + type: valueType, + facet: field.facetable ?? false, + sort: field.sortable ?? false, + // A `required` field is non-optional; so is the `default_sorting_field`, + // which Typesense requires to be present. Everything else may be absent. + optional: field.required !== true && field.name !== defaultSortingField, + }, + ]; + if (field.searchable) { + for (const name of names.search) { + fields.push({ + name, + type: valueType, + optional: true, + ...(defaultLocale !== undefined && { + stem: true, + locale: defaultLocale, + }), + }); + } + } + return fields; +} + +/** The Typesense field type for a non-localized field, from its `kind`. 64-bit + * integers (and dates, stored as Unix seconds) so large counts never overflow. */ +function typesenseValueType(field: SearchField): CollectionFieldSchema['type'] { + switch (field.kind) { + case 'integer': + case 'date': + return 'int64'; + case 'number': + return 'float'; + case 'boolean': + return 'bool'; + case 'keyword': + case 'reference': + case 'text': + return field.array === true ? 
'string[]' : 'string'; + } +} diff --git a/packages/search-typesense/src/index.ts b/packages/search-typesense/src/index.ts index 6514638d..4facde27 100644 --- a/packages/search-typesense/src/index.ts +++ b/packages/search-typesense/src/index.ts @@ -1 +1,11 @@ export { rebuild } from './adapter.js'; +export type { RebuildOptions } from './adapter.js'; +export { buildCollectionSchema } from './collection-schema.js'; +export type { CollectionSchemaOptions } from './collection-schema.js'; +export { buildSearchParams } from './query-compiler.js'; +export type { BuildSearchParamsOptions } from './query-compiler.js'; +export { createTypesenseSearchEngine, parseSearchResponse } from './search.js'; +export type { + TypesenseSearchEngineOptions, + TypesenseSearchResponse, +} from './search.js'; diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts new file mode 100644 index 00000000..5cd4d0a5 --- /dev/null +++ b/packages/search-typesense/src/query-compiler.ts @@ -0,0 +1,282 @@ +import type { SearchParams } from 'typesense/lib/Typesense/Documents.js'; +import { fold } from '@lde/text-normalization'; +import { + fieldNamed, + filterOperator, + filterOperatorFor, + isoToUnixSeconds, + isRangeFacet, + pageForOffset, + physicalFields, + searchableFields, + type FacetRange, + type Filter, + type SearchField, + type SearchQuery, + type SearchType, + type Sort, +} from '@lde/search'; + +/** + * Options for {@link buildSearchParams} — the query half of the engine + * adapter. {@link TypesenseSearchEngineOptions} extends this, so each knob is + * declared once and the engine forwards its options wholesale. + */ +export interface BuildSearchParamsOptions { + /** + * Cap on the number of buckets returned per facet (`max_facet_values`). Left + * unset, Typesense defaults to 10 — too few for high-cardinality facets + * (publisher, keyword), so a deployment with such facets must raise it. 
Range + * facets return one bucket per declared range regardless, but a value > the + * range count is still safe. + */ + readonly maxFacetValues?: number; + /** + * Called for each `where` clause that compiles to nothing and is therefore + * skipped: an unknown field, an operator that does not match the field’s + * kind ({@link filterOperatorFor}), an empty `in` list, or a `range` with no + * usable bound. Skipping keeps a malformed clause from reaching the engine + * as garbage; supply this to log it instead of losing it silently. Through + * the engine, a structurally invalid query throws up front + * (`assertValidQuery`), so there only the vacuous clauses reach this. + */ + readonly onIgnoredFilter?: (filter: Filter) => void; +} + +/** + * Compile the engine-neutral {@link SearchQuery} into Typesense search + * parameters — the query half of the engine adapter. Pure (no client, no env), + * so the mapping is asserted directly in unit tests. Field names come from + * {@link physicalFields}, the same convention the projection and the collection + * schema use, so a query can never reference a field the index does not carry. + */ +export function buildSearchParams( + query: SearchQuery, + searchType: SearchType, + options: BuildSearchParamsOptions = {}, +): SearchParams { + const folded = + query.text !== undefined && query.text.length > 0 + ? fold(query.text) + : undefined; + const { names, weights } = queryFields(searchType, query.locale); + const filterBy = compileFilterBy( + query.where, + searchType, + options.onIgnoredFilter, + ); + const sortBy = query.orderBy + .map((sort) => compileSort(sort, searchType, query.locale)) + .join(','); + const params: SearchParams = { + q: folded ?? 
'*', + query_by: names.join(','), + query_by_weights: weights.join(','), + per_page: query.limit, + page: pageForOffset(query.offset, query.limit), + }; + if (filterBy.length > 0) { + params.filter_by = filterBy; + } + if (sortBy.length > 0) { + params.sort_by = sortBy; + } + if (query.facets.length > 0) { + params.facet_by = compileFacetBy(query.facets, searchType); + if (options.maxFacetValues !== undefined) { + params.max_facet_values = options.maxFacetValues; + } + } + return params; +} + +/** + * The `facet_by` clause. A facet on a numeric field that declares + * {@link SearchField.facetRanges} faceted into those fixed half-open `[min, max)` + * bins (a histogram); every other facet is a plain per-value facet on its field + * name. Typesense range syntax is already start-inclusive/end-exclusive, so the + * declared bounds pass straight through with no boundary fix-up. + */ +function compileFacetBy( + facets: readonly string[], + searchType: SearchType, +): string { + return facets + .map((name) => { + const field = fieldNamed(searchType, name); + return field !== undefined && isRangeFacet(field) + ? compileRangeFacet(field.name, field.facetRanges) + : name; + }) + .join(','); +} + +/** `name(key:[min, max], …)`; a blank bound is open-ended (Typesense `[75, ]`). */ +function compileRangeFacet( + name: string, + ranges: readonly FacetRange[], +): string { + const bins = ranges + .map((range) => `${range.key}:[${range.min ?? ''}, ${range.max ?? ''}]`) + .join(', '); + return `${name}(${bins})`; +} + +/** + * The `query_by` fields and aligned weights. Each searchable field expands to its + * folded `*_search` companion(s); a localized field’s active-locale companion + * keeps its full weight while the other locale is gently demoted (−1, floored at + * 1), so a match in the user’s language ranks higher while cross-language matches + * still surface. 
+ */ +function queryFields( + searchType: SearchType, + locale: string, +): { readonly names: string[]; readonly weights: number[] } { + const names: string[] = []; + const weights: number[] = []; + for (const field of searchableFields(searchType)) { + const search = physicalFields(field).search; + const baseWeight = field.searchable.weight; + if (field.kind === 'text' && field.localized === true) { + const locales = field.locales ?? []; + search.forEach((name, index) => { + names.push(name); + weights.push( + locales[index] === locale ? baseWeight : Math.max(1, baseWeight - 1), + ); + }); + } else { + for (const name of search) { + names.push(name); + weights.push(baseWeight); + } + } + } + return { names, weights }; +} + +/** AND-join the compiled `where` clauses; a clause that compiles to nothing is + * skipped and reported to `onIgnoredFilter`. */ +function compileFilterBy( + where: readonly Filter[], + searchType: SearchType, + onIgnoredFilter: ((filter: Filter) => void) | undefined, +): string { + return where + .map((filter) => { + const clause = compileFilter(filter, searchType); + if (clause === undefined) { + onIgnoredFilter?.(filter); + } + return clause; + }) + .filter((clause): clause is string => clause !== undefined) + .join(' && '); +} + +function compileFilter( + filter: Filter, + searchType: SearchType, +): string | undefined { + const field = fieldNamed(searchType, filter.field); + if (field === undefined) { + return undefined; + } + // A clause whose operator does not match the field's kind (e.g. `range` on a + // keyword) would reach the engine as garbage syntax — skip it instead. + if (filterOperatorFor(field.kind) !== filterOperator(filter)) { + return undefined; + } + if ('in' in filter) { + return filter.in.length > 0 + ? compileMembership(field, filter.in) + : undefined; + } + if ('range' in filter) { + return compileRange(field, filter.range); + } + return `${field.name}:=${filter.is}`; +} + +/** + * A membership clause. 
A non-facet (tokenized) field uses the exact `:=` + * operator so an IRI cannot partial-match on a shared path segment. + */ +function compileMembership( + field: SearchField, + values: readonly string[], +): string { + const list = `[${values.map(escapeFilterValue).join(',')}]`; + return field.facetable !== true + ? `${field.name}:=${list}` + : `${field.name}:${list}`; +} + +/** An inclusive Typesense range clause, or `undefined` when neither bound is set. */ +function compileRange( + field: SearchField, + range: { readonly min?: number | string; readonly max?: number | string }, +): string | undefined { + const name = field.name; + const min = storedBound(field, range.min); + const max = storedBound(field, range.max); + if (min !== undefined && max !== undefined) { + return `${name}:[${min}..${max}]`; + } + if (min !== undefined) { + return `${name}:>=${min}`; + } + if (max !== undefined) { + return `${name}:<=${max}`; + } + return undefined; +} + +/** A range bound as stored: a `date` field’s ISO 8601 bound becomes the indexed + * Unix seconds ({@link isoToUnixSeconds}); an unparseable bound is dropped. */ +function storedBound( + field: SearchField, + bound: number | string | undefined, +): number | string | undefined { + return field.kind === 'date' && typeof bound === 'string' + ? isoToUnixSeconds(bound) + : bound; +} + +/** + * One `sort_by` term. `relevance` maps to Typesense’s `_text_match`; a localized + * text field sorts on its active-locale folded key; any other field (including a + * deployment tie-break like `status_rank`) sorts on its own name. + */ +function compileSort( + sort: Sort, + searchType: SearchType, + locale: string, +): string { + if (sort.field === 'relevance') { + return `_text_match:${sort.direction}`; + } + const field = fieldNamed(searchType, sort.field); + if ( + field !== undefined && + field.kind === 'text' && + field.localized === true + ) { + const sortName = + physicalFields(field).sort[field.locales?.indexOf(locale) ?? 
-1]; + if (sortName !== undefined) { + return `${sortName}:${sort.direction}`; + } + } + return `${sort.field}:${sort.direction}`; +} + +/** + * Backtick-wrap a filter value so reserved characters in IRIs and media types + * (`:`, `/`, `&`, `,`, …) are taken literally instead of parsed as filter syntax. + * An embedded backtick is escaped. + */ +export function escapeFilterValue(value: string): string { + return `\`${value.replace(/`/g, '\\`')}\``; +} diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts new file mode 100644 index 00000000..53816c1f --- /dev/null +++ b/packages/search-typesense/src/search.ts @@ -0,0 +1,400 @@ +import type { Client } from 'typesense'; +import { + assertValidQuery, + fieldNamed, + isRangeFacet, + outputFields, + physicalFields, + referenceFields, + type FacetBucket, + type LocalizedValue, + type Reference, + type ResultDocument, + type SearchEngine, + type SearchField, + type SearchHit, + type SearchQuery, + type SearchResult, + type SearchType, + type SearchValue, +} from '@lde/search'; +import { + buildSearchParams, + escapeFilterValue, + type BuildSearchParamsOptions, +} from './query-compiler.js'; + +/** Where the engine reads documents and (optionally) reference labels — plus + * every query-compiler knob ({@link BuildSearchParamsOptions}), declared once + * there and forwarded wholesale into each search. */ +export interface TypesenseSearchEngineOptions extends BuildSearchParamsOptions { + /** The dataset collection or alias to query. */ + readonly collection: string; + /** The sidecar `labels` collection (IRI → label); omit for id-only references. */ + readonly labelsCollection?: string; + /** + * Called when reference-label resolution fails; the search then degrades to + * id-only references rather than failing. Optional — omit to swallow silently. + */ + readonly onLabelError?: (error: unknown) => void; + /** + * Opt-in in-memory label cache. 
When set (and {@link labelsCollection} is + * set), the FULL sidecar `labels` collection is loaded once via the documents + * export endpoint and held in a process-lifetime cache for this many + * milliseconds; each `search` then resolves its reference labels by in-memory + * lookup instead of a per-search `multi_search` round-trip. Omit to keep the + * per-search {@link fetchLabels} behaviour unchanged. + */ + readonly labelCacheTtlMs?: number; +} + +/** + * A Typesense-backed {@link SearchEngine}. `search` compiles the query + * ({@link buildSearchParams}), runs it, resolves the reference labels for the + * page of hits from the sidecar `labels` collection in one lookup, and + * reconstructs the engine-neutral {@link SearchResult} ({@link parseSearchResponse}). + * Every engine specific stays here; consumers see only logical documents. + */ +export function createTypesenseSearchEngine( + client: Client, + options: TypesenseSearchEngineOptions, +): SearchEngine { + // Process-lifetime cache for the FULL `labels` collection, held in the engine + // closure. Populated lazily on the first cached search; `loadAll` is the + // single-flight in-flight promise so concurrent first-loads share one export. + let cachedLabels: ReadonlyMap | undefined; + let cacheExpiresAt = 0; + let inFlightLoad: Promise> | undefined; + + function cachedAllLabels( + labelsCollection: string, + ttlMs: number, + ): Promise> { + if (cachedLabels !== undefined && Date.now() < cacheExpiresAt) { + return Promise.resolve(cachedLabels); + } + // Single-flight: a load already running serves every concurrent caller. + inFlightLoad ??= loadAllLabels(client, labelsCollection) + .then((loaded) => { + cachedLabels = loaded; + cacheExpiresAt = Date.now() + ttlMs; + return loaded; + }) + // A failed load degrades to id-only references and is NOT cached, so the + // next search retries rather than serving an empty map for the whole TTL. 
+ .catch((error) => { + options.onLabelError?.(error); + return new Map(); + }) + .finally(() => { + inFlightLoad = undefined; + }); + return inFlightLoad; + } + + return { + async search( + query: SearchQuery, + searchType: SearchType, + ): Promise { + // The port contract: a structurally invalid query (unknown field, wrong + // operator, unknown facet) is rejected up front, for EVERY caller. + assertValidQuery(query, searchType); + const params = buildSearchParams(query, searchType, options); + // Cached path: the once-loaded full collection serves labels by in-memory + // lookup (no per-search round-trip). The load does not depend on the + // response, so it runs alongside the search; it never rejects (a failed + // load degrades to an empty map), so it cannot leave an unhandled + // rejection behind if the search itself fails. + const cachedLabelsPromise = + options.labelsCollection !== undefined && + options.labelCacheTtlMs !== undefined + ? cachedAllLabels(options.labelsCollection, options.labelCacheTtlMs) + : undefined; + const response = (await client + .collections(options.collection) + .documents() + .search(params)) as TypesenseSearchResponse; + // Labels are supplementary: a failed lookup (e.g. the sidecar collection + // mid-rebuild) degrades to id-only references rather than failing the whole + // search, so the listing still renders with bare IRIs. + let labels: ReadonlyMap = new Map(); + if (cachedLabelsPromise !== undefined) { + labels = await cachedLabelsPromise; + } else if (options.labelsCollection !== undefined) { + try { + labels = await fetchLabels( + client, + options.labelsCollection, + referenceIris(response, searchType), + ); + } catch (error) { + options.onLabelError?.(error); + } + } + return parseSearchResponse(response, searchType, labels); + }, + }; +} + +/** + * Load the FULL `labels` collection into a label map via the documents export + * endpoint, which streams every document as JSONL (one JSON object per line). 
+ * Each line is reconstructed by {@link labelToLocalizedValue}, exactly as the + * per-search {@link fetchLabels} path does for its `multi_search` hits. + */ +async function loadAllLabels( + client: Pick, + collection: string, +): Promise> { + const jsonl = await client.collections(collection).documents().export(); + const labels = new Map(); + for (const line of jsonl.split('\n')) { + if (line.length === 0) { + continue; + } + const document = JSON.parse(line) as Record; + labels.set(String(document.id), labelToLocalizedValue(document)); + } + return labels; +} + +/** Every distinct reference IRI whose label the result will actually use. */ +function referenceIris( + response: TypesenseSearchResponse, + searchType: SearchType, +): string[] { + const referenceFieldSet = new Set( + referenceFields(searchType).map((field) => field.name), + ); + // Hits only carry labels for OUTPUT reference fields: reconstructDocument skips + // non-output fields, so resolving a non-output reference's hit labels (e.g. a + // facet-only `class` with dozens of IRIs per hit) is pure waste. + const outputReferenceFields = referenceFields(searchType) + .filter((field) => field.output === true) + .map((field) => field.name); + const iris = new Set(); + for (const hit of response.hits ?? []) { + for (const name of outputReferenceFields) { + const raw = hit.document[name]; + if (Array.isArray(raw)) { + for (const value of raw) { + iris.add(String(value)); + } + } else if (typeof raw === 'string') { + iris.add(raw); + } + } + } + // Reference-facet bucket values are IRIs too (incl. facet-only references like + // `class`); resolve them in the same lookup. + for (const facet of response.facet_counts ?? []) { + if (referenceFieldSet.has(facet.field_name)) { + for (const bucket of facet.counts) { + iris.add(bucket.value); + } + } + } + return [...iris]; +} + +/** + * Resolve labels for `iris` from the sidecar `labels` collection. 
Each + * `label_${locale}` becomes a language-map entry; the default `label` is the + * untagged (`und`) fallback when no locale variant exists. + * + * Sent as one `multi_search` (POST) call, the id-list split over per-search + * batches: the id-list of a page or facet carrying many references — e.g. a + * dataset with dozens of classes — would overflow Typesense’s GET query-string + * limit (4000 chars, and IRIs URL-encode to several times their length) if it + * travelled in the URL. POST puts it in the body; each batch stays under + * Typesense’s `per_page` cap, and bundling the batches keeps it one round-trip + * regardless of IRI count. Exported for unit testing against a fake client. + */ +export async function fetchLabels( + client: Pick, + collection: string, + iris: readonly string[], +): Promise> { + const labels = new Map(); + if (iris.length === 0) { + return labels; + } + const searches = []; + for (let start = 0; start < iris.length; start += LABEL_BATCH_SIZE) { + const batch = iris.slice(start, start + LABEL_BATCH_SIZE); + searches.push({ + collection, + q: '*', + query_by: 'label', + filter_by: `id:[${batch.map(escapeFilterValue).join(',')}]`, + per_page: batch.length, + }); + } + const { results } = (await client.multiSearch.perform({ searches })) as { + results: readonly TypesenseSearchResponse[]; + }; + for (const result of results) { + for (const hit of result.hits ?? []) { + labels.set(String(hit.document.id), labelToLocalizedValue(hit.document)); + } + } + return labels; +} + +/** Typesense caps `per_page` at 250; the multi_search POST body holds the + * id-list comfortably, so resolve references in batches of this size. */ +const LABEL_BATCH_SIZE = 200; + +/** Turn a `labels` document into a language map (`label_${locale}` → locale). 
*/ +function labelToLocalizedValue( + document: Record, +): LocalizedValue { + const map: Record = {}; + for (const [key, value] of Object.entries(document)) { + if (key.startsWith('label_') && typeof value === 'string') { + map[key.slice('label_'.length)] = [value]; + } + } + if (Object.keys(map).length === 0 && typeof document.label === 'string') { + map.und = [document.label]; + } + return map; +} + +/** The subset of a Typesense search response this adapter reads. */ +export interface TypesenseSearchResponse { + readonly found: number; + readonly hits?: readonly { readonly document: Record }[]; + readonly facet_counts?: readonly { + readonly field_name: string; + readonly counts: readonly { + readonly value: string; + readonly count: number; + }[]; + }[]; +} + +/** + * Reconstruct a Typesense response into the engine-neutral {@link SearchResult}: + * the flat, fanned-out document is turned back into a logical one (per-locale + * display fields → a language map, reference IRIs → labelled references via the + * sidecar `labels` lookup, scalars passed through). `labels` maps a reference IRI + * to its resolved label; an IRI absent from it yields an id-only reference. + */ +export function parseSearchResponse( + response: TypesenseSearchResponse, + searchType: SearchType, + labels: ReadonlyMap, +): SearchResult { + const hits: SearchHit[] = (response.hits ?? []).map((hit) => ({ + id: String(hit.document.id), + document: reconstructDocument(hit.document, searchType, labels), + })); + // Reference facets are IRI-keyed; their buckets carry a resolved data label. + // Plain facets (tokens, free strings) carry no label — the consumer owns display. + const referenceFacets = new Set( + referenceFields(searchType).map((field) => field.name), + ); + const facets: Record = {}; + for (const facet of response.facet_counts ?? 
[]) { + const labelled = referenceFacets.has(facet.field_name); + // A range facet echoes the declared range key as the bucket value; look the + // bin's half-open bounds back up by key so the bucket is self-describing. + const field = fieldNamed(searchType, facet.field_name); + const rangesByKey = + field !== undefined && isRangeFacet(field) + ? new Map(field.facetRanges.map((range) => [range.key, range])) + : undefined; + facets[facet.field_name] = facet.counts.map((bucket) => { + const label = labelled ? labels.get(bucket.value) : undefined; + const range = rangesByKey?.get(bucket.value); + return { + value: bucket.value, + count: bucket.count, + ...(label !== undefined ? { label } : {}), + ...(range?.min !== undefined ? { min: range.min } : {}), + ...(range?.max !== undefined ? { max: range.max } : {}), + }; + }); + } + return { hits, total: response.found, facets }; +} + +/** Rebuild one logical document from a flat Typesense document. */ +function reconstructDocument( + flat: Record, + searchType: SearchType, + labels: ReadonlyMap, +): ResultDocument { + const document: Record = {}; + for (const field of outputFields(searchType)) { + const value = logicalValue(flat, field, labels); + if (value !== undefined) { + document[field.name] = value; + } + } + return document; +} + +function logicalValue( + flat: Record, + field: SearchField, + labels: ReadonlyMap, +): SearchValue | undefined { + switch (field.kind) { + case 'text': + return localizedValue(flat, field); + case 'reference': + return referenceValue(flat, field, labels); + case 'keyword': { + const value = flat[field.name]; + return Array.isArray(value) || typeof value === 'string' + ? (value as SearchValue) + : undefined; + } + case 'integer': + case 'number': + case 'date': { + const value = flat[field.name]; + return typeof value === 'number' ? value : undefined; + } + case 'boolean': + // A boolean is always present; an absent value means false. 
+ return flat[field.name] === true; + } +} + +/** Gather the per-locale display fields back into a language map. */ +function localizedValue( + flat: Record, + field: SearchField, +): LocalizedValue | undefined { + const map: Record = {}; + const display = physicalFields(field).display; + (field.locales ?? []).forEach((locale, index) => { + const value = flat[display[index]]; + if (typeof value === 'string') { + map[locale] = [value]; + } + }); + return Object.keys(map).length > 0 ? map : undefined; +} + +/** Map stored reference IRIs to labelled references; id-only when no label. */ +function referenceValue( + flat: Record, + field: SearchField, + labels: ReadonlyMap, +): SearchValue | undefined { + const raw = flat[field.name]; + if (raw === undefined) { + return undefined; + } + const iris = Array.isArray(raw) ? (raw as string[]) : [String(raw)]; + const references: Reference[] = iris.map((iri) => { + const label = labels.get(iri); + return label === undefined ? { id: iri } : { id: iri, label }; + }); + return field.array === true ? 
references : references[0]; +} diff --git a/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap new file mode 100644 index 00000000..e56c6447 --- /dev/null +++ b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap @@ -0,0 +1,114 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`collection-schema generator stability > derives a stable Typesense collection for a representative schema 1`] = ` +{ + "default_sorting_field": "size", + "fields": [ + { + "index": false, + "name": "title_nl", + "optional": true, + "type": "string", + }, + { + "index": false, + "name": "title_en", + "optional": true, + "type": "string", + }, + { + "locale": "nl", + "name": "title_search_nl", + "optional": true, + "stem": true, + "type": "string", + }, + { + "locale": "en", + "name": "title_search_en", + "optional": true, + "stem": true, + "type": "string", + }, + { + "name": "title_sort_nl", + "optional": true, + "sort": true, + "type": "string", + }, + { + "name": "title_sort_en", + "optional": true, + "sort": true, + "type": "string", + }, + { + "facet": true, + "name": "keyword", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "locale": "nl", + "name": "keyword_search", + "optional": true, + "stem": true, + "type": "string[]", + }, + { + "facet": true, + "name": "format", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "facet": true, + "name": "creator", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "facet": true, + "name": "status", + "optional": false, + "sort": false, + "type": "string", + }, + { + "facet": true, + "name": "size", + "optional": false, + "sort": true, + "type": "int64", + }, + { + "facet": true, + "name": "score", + "optional": true, + "sort": false, + "type": "float", + }, + { + "facet": false, + "name": "created", + "optional": true, + "sort": 
true, + "type": "int64", + }, + { + "facet": true, + "name": "open", + "optional": true, + "sort": false, + "type": "bool", + }, + ], + "name": "things", + "synonym_sets": [ + "things-synonyms", + ], +} +`; diff --git a/packages/search-typesense/test/adapter.test.ts b/packages/search-typesense/test/adapter.test.ts index 9dc20d41..82550623 100644 --- a/packages/search-typesense/test/adapter.test.ts +++ b/packages/search-typesense/test/adapter.test.ts @@ -1,16 +1,18 @@ import { afterAll, beforeAll, beforeEach, describe, expect, it } from 'vitest'; -import type { Client, CollectionCreateSchema } from 'typesense'; +import type { Client } from 'typesense'; +import type { SearchType } from '@lde/search'; import { rebuild } from '../src/adapter.js'; import { TypesenseContainer } from './typesense-container.js'; const NAME = 'datasets'; const LOCK_COLLECTION = 'rebuild_locks'; -const schema: CollectionCreateSchema = { - name: NAME, +const datasetType: SearchType = { + name: 'Dataset', + type: 'https://example.org/Dataset', fields: [ - { name: 'title', type: 'string' }, - { name: 'year', type: 'int32' }, + { name: 'title', kind: 'keyword' }, + { name: 'year', kind: 'integer' }, ], }; @@ -70,8 +72,9 @@ describe('search-typesense', () => { it('publishes a versioned collection and points the index alias at it', async () => { const result = await rebuild( client, - schema, stream([{ id: 'a', title: 'Verhaal van Utrecht', year: 2024 }]), + datasetType, + { name: NAME }, ); expect(result?.imported).toBe(1); @@ -87,13 +90,15 @@ describe('search-typesense', () => { it('swaps the alias to a new collection and drops the previous one', async () => { const first = await rebuild( client, - schema, stream([{ id: 'a', title: 'Old', year: 2023 }]), + datasetType, + { name: NAME }, ); const second = await rebuild( client, - schema, stream([{ id: 'a', title: 'New', year: 2024 }]), + datasetType, + { name: NAME }, ); expect(second?.collection).not.toBe(first?.collection); @@ -110,7 +115,8 @@ 
describe('search-typesense', () => { year: 2024, })); - const result = await rebuild(client, schema, stream(documents), { + const result = await rebuild(client, stream(documents), datasetType, { + name: NAME, batchSize: 2, }); @@ -123,8 +129,9 @@ describe('search-typesense', () => { const result = await rebuild( client, - schema, stream([{ id: 'a', title: 'A', year: 2024 }]), + datasetType, + { name: NAME }, ); expect(result).toBeNull(); @@ -136,9 +143,9 @@ describe('search-typesense', () => { const result = await rebuild( client, - schema, stream([{ id: 'a', title: 'A', year: 2024 }]), - { lockTtlMs: 1_000 }, + datasetType, + { name: NAME, lockTtlMs: 1_000 }, ); expect(result?.imported).toBe(1); @@ -148,8 +155,9 @@ describe('search-typesense', () => { it('leaves the live alias intact and drops the orphan when a build fails', async () => { await rebuild( client, - schema, stream([{ id: 'a', title: 'Live', year: 2024 }]), + datasetType, + { name: NAME }, ); const live = await aliasTarget(client); const collectionCount = (await client.collections().retrieve()).length; @@ -158,8 +166,9 @@ describe('search-typesense', () => { await expect( rebuild( client, - schema, stream([{ id: 'bad', title: 't', year: 'nope' }]), + datasetType, + { name: NAME }, ), ).rejects.toThrow(/failed/i); @@ -171,7 +180,9 @@ describe('search-typesense', () => { }); it('publishes an empty collection for an empty source', async () => { - const result = await rebuild(client, schema, stream([])); + const result = await rebuild(client, stream([]), datasetType, { + name: NAME, + }); expect(result?.imported).toBe(0); expect((await client.collections(NAME).retrieve()).num_documents).toBe(0); diff --git a/packages/search-typesense/test/collection-schema.test.ts b/packages/search-typesense/test/collection-schema.test.ts new file mode 100644 index 00000000..11f15cac --- /dev/null +++ b/packages/search-typesense/test/collection-schema.test.ts @@ -0,0 +1,206 @@ +import { describe, expect, it } from 
'vitest'; +import type { SearchType } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; + +const schema: SearchType = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + path: 'http://purl.org/dc/terms/title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + path: 'http://www.w3.org/ns/dcat#keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + path: 'https://def.nde.nl/format', + kind: 'keyword', + array: true, + facetable: true, + }, + // Derived fields (no path) still get collection fields — populated at index + // time by derivations, not projected. + { name: 'status', kind: 'keyword', facetable: true, required: true }, + { name: 'statusRank', kind: 'integer', sortable: true }, + { + name: 'size', + kind: 'integer', + facetable: true, + sortable: true, + }, + { name: 'iiif', kind: 'boolean', facetable: true }, + { + name: 'publisher', + path: 'http://purl.org/dc/terms/publisher', + kind: 'reference', + array: true, + facetable: true, + }, + { + name: 'datePosted', + path: 'https://def.nde.nl/datePosted', + kind: 'date', + sortable: true, + }, + { + name: 'score', + kind: 'number', + facetable: true, + }, + ], +}; + +describe('buildCollectionSchema', () => { + const collection = buildCollectionSchema(schema, { + name: 'datasets', + defaultLocale: 'nl', + defaultSortingField: 'statusRank', + synonymSets: ['dataset-synonyms'], + }); + + it('carries the collection name, default sorting field and synonym sets', () => { + expect(collection.name).toBe('datasets'); + expect(collection.default_sorting_field).toBe('statusRank'); + expect(collection.synonym_sets).toEqual(['dataset-synonyms']); + }); + + it('fans a localized text field into display, per-locale stemmed search and sort keys', () => { 
+ expect(collection.fields).toContainEqual({ + name: 'title_nl', + type: 'string', + index: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_en', + type: 'string', + index: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_search_nl', + type: 'string', + optional: true, + stem: true, + locale: 'nl', + }); + expect(collection.fields).toContainEqual({ + name: 'title_search_en', + type: 'string', + optional: true, + stem: true, + locale: 'en', + }); + expect(collection.fields).toContainEqual({ + name: 'title_sort_nl', + type: 'string', + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_sort_en', + type: 'string', + sort: true, + optional: true, + }); + }); + + it('maps keyword/reference/integer/boolean kinds to Typesense value fields', () => { + expect(collection.fields).toContainEqual({ + name: 'keyword', + type: 'string[]', + facet: true, + sort: false, + optional: true, + }); + // `status` is required → non-optional, like the default sorting field. + expect(collection.fields).toContainEqual({ + name: 'status', + type: 'string', + facet: true, + sort: false, + optional: false, + }); + // statusRank is the default_sorting_field, which Typesense requires to be + // non-optional. 
+ expect(collection.fields).toContainEqual({ + name: 'statusRank', + type: 'int64', + facet: false, + sort: true, + optional: false, + }); + expect(collection.fields).toContainEqual({ + name: 'size', + type: 'int64', + facet: true, + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'iiif', + type: 'bool', + facet: true, + sort: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'publisher', + type: 'string[]', + facet: true, + sort: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'datePosted', + type: 'int64', + facet: false, + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'score', + type: 'float', + facet: true, + sort: false, + optional: true, + }); + }); + + it('emits a folded, stemmed search companion for a searchable keyword field', () => { + expect(collection.fields).toContainEqual({ + name: 'keyword_search', + type: 'string[]', + optional: true, + stem: true, + locale: 'nl', + }); + }); + + it('assumes no language: without defaultLocale the companion is folded but unstemmed', () => { + const withoutLocale = buildCollectionSchema(schema, { name: 'datasets' }); + expect(withoutLocale.fields).toContainEqual({ + name: 'keyword_search', + type: 'string[]', + optional: true, + }); + // Localized text still stems per locale — that never depended on the default. 
+ expect(withoutLocale.fields).toContainEqual( + expect.objectContaining({ name: 'title_search_nl', locale: 'nl' }), + ); + }); +}); diff --git a/packages/search-typesense/test/generator-stability.test.ts b/packages/search-typesense/test/generator-stability.test.ts new file mode 100644 index 00000000..8404c545 --- /dev/null +++ b/packages/search-typesense/test/generator-stability.test.ts @@ -0,0 +1,66 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchType } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; + +/** + * A neutral fixture exercising every kind + capability — NOT a real domain. The + * derived Typesense collection is snapshotted purely to pin the **generator**: + * any change to how `buildCollectionSchema` maps the field model (Typesense field + * types, the physical fanout, stem/locale, optional/default-sorting-field) + * surfaces as a snapshot diff before this library is published. + */ +const THING: SearchType = { + name: 'Thing', + type: 'https://example.org/Thing', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + }, + { + name: 'creator', + kind: 'reference', + array: true, + facetable: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { name: 'status', kind: 'keyword', facetable: true, required: true }, + { name: 'size', kind: 'integer', facetable: true, sortable: true }, + { name: 'score', kind: 'number', facetable: true }, + { name: 'created', kind: 'date', sortable: true }, + { name: 'open', kind: 'boolean', facetable: true }, + ], +}; + +describe('collection-schema generator stability', () => { + it('derives a stable Typesense collection 
for a representative schema', () => { + expect( + buildCollectionSchema(THING, { + name: 'things', + defaultSortingField: 'size', + defaultLocale: 'nl', + synonymSets: ['things-synonyms'], + }), + ).toMatchSnapshot(); + }); +}); diff --git a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts new file mode 100644 index 00000000..397c8e75 --- /dev/null +++ b/packages/search-typesense/test/parse-response.test.ts @@ -0,0 +1,473 @@ +import { afterEach, describe, expect, it, vi } from 'vitest'; +import type { LocalizedValue, SearchQuery, SearchType } from '@lde/search'; +import type { Client } from 'typesense'; +import { + createTypesenseSearchEngine, + fetchLabels, + parseSearchResponse, +} from '../src/search.js'; + +const schema: SearchType = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }, + { name: 'size', kind: 'integer', output: true }, + { name: 'datePosted', kind: 'date', output: true }, + { name: 'iiif', kind: 'boolean', facetable: true, output: true }, + // A non-output field is never reconstructed into the logical document. 
+ { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + ], +}; + +const labels = new Map<string, LocalizedValue>([ + ['https://org/1', { nl: ['Het Utrechts Archief'] }], + ['https://org/2', { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }], +]); + +const response = { + found: 2, + hits: [ + { + document: { + id: 'https://d/1', + title_nl: 'Titel', + title_en: 'Title', + keyword: ['kaarten'], + publisher: ['https://org/1'], + size: 1234, + datePosted: 1_700_000_000, + iiif: true, + status: 'valid', + }, + }, + { + document: { + id: 'https://d/2', + title_nl: 'Andere', + keyword: ['atlas', 'kaart'], + publisher: ['https://org/2', 'https://org/3'], + }, + }, + ], + facet_counts: [ + { + field_name: 'keyword', + counts: [ + { value: 'kaarten', count: 3 }, + { value: 'atlas', count: 1 }, + ], + }, + { + // A reference facet: buckets are keyed by IRI and carry resolved labels. + field_name: 'publisher', + counts: [ + { value: 'https://org/1', count: 2 }, + { value: 'https://org/3', count: 1 }, + ], + }, + ], +}; + +describe('parseSearchResponse', () => { + const result = parseSearchResponse(response, schema, labels); + + it('carries the total and the facet buckets keyed by field name', () => { + expect(result.total).toBe(2); + // A plain facet: buckets carry no label. + expect(result.facets.keyword).toEqual([ + { value: 'kaarten', count: 3 }, + { value: 'atlas', count: 1 }, + ]); + }); + + it('attaches resolved labels to reference-facet buckets, id-only when unlabelled', () => { + expect(result.facets.publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + { value: 'https://org/3', count: 1 }, + ]); + }); + + it('reconstructs localized text into a best-available language map', () => { + expect(result.hits[0].id).toBe('https://d/1'); + expect(result.hits[0].document.title).toEqual({ + nl: ['Titel'], + en: ['Title'], + }); + // Only the present locale is emitted. 
+ expect(result.hits[1].document.title).toEqual({ nl: ['Andere'] }); + }); + + it('resolves reference IRIs to labelled references, id-only when unlabelled', () => { + expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + expect(result.hits[1].document.publisher).toEqual([ + { + id: 'https://org/2', + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + { id: 'https://org/3' }, + ]); + }); + + it('passes keyword arrays and numeric scalars through, and omits absent fields', () => { + expect(result.hits[0].document.keyword).toEqual(['kaarten']); + expect(result.hits[0].document.size).toBe(1234); + expect(result.hits[0].document.datePosted).toBe(1_700_000_000); + expect(result.hits[1].document.size).toBeUndefined(); + }); + + it('defaults an absent boolean to false and never reconstructs non-output fields', () => { + expect(result.hits[0].document.iiif).toBe(true); + expect(result.hits[1].document.iiif).toBe(false); + expect(result.hits[0].document.status).toBeUndefined(); + }); +}); + +describe('parseSearchResponse range facets', () => { + const rangeSchema: SearchType = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'size', + kind: 'integer', + facetable: true, + output: true, + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10, max: 100 }, + // Open-ended top bin: no upper bound. 
+ { key: '2', min: 100 }, + ], + }, + ], + }; + + const rangeResponse = { + found: 5, + hits: [], + facet_counts: [ + { + field_name: 'size', + counts: [ + { value: '0', count: 2 }, + { value: '1', count: 1 }, + { value: '2', count: 2 }, + ], + }, + ], + }; + + it('echoes each range bin’s half-open bounds onto its bucket, open ends omitted', () => { + const result = parseSearchResponse(rangeResponse, rangeSchema, new Map()); + expect(result.facets.size).toEqual([ + { value: '0', count: 2, min: 1, max: 10 }, + { value: '1', count: 1, min: 10, max: 100 }, + // The open-ended top bin carries only its lower bound. + { value: '2', count: 2, min: 100 }, + ]); + }); +}); + +describe('createTypesenseSearchEngine label degradation', () => { + const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // A fake client whose document search succeeds but whose label lookup + // (multi_search) rejects, so the engine must degrade to id-only references. + function fakeClient(): Client { + return { + collections: () => ({ + documents: () => ({ + search: () => + Promise.resolve({ + found: 1, + hits: [ + { + document: { id: 'https://d/1', publisher: ['https://org/1'] }, + }, + ], + }), + }), + }), + multiSearch: { + perform: () => + Promise.reject(new Error('labels collection unavailable')), + }, + } as unknown as Client; + } + + it('degrades to id-only references when the label lookup fails, reporting the cause', async () => { + let capturedError: unknown; + const engine = createTypesenseSearchEngine(fakeClient(), { + collection: 'datasets', + labelsCollection: 'labels', + onLabelError: (error) => { + capturedError = error; + }, + }); + const result = await engine.search(baseQuery, schema); + // The reference is present but unlabelled: the failed lookup degraded + // rather than failing the whole search. 
+ expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1' }, + ]); + expect(capturedError).toBeInstanceOf(Error); + }); +}); + +describe('createTypesenseSearchEngine label cache (labelCacheTtlMs)', () => { + const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // One labels document, as the export endpoint streams it (JSONL). + const labelsJsonl = JSON.stringify({ + id: 'https://org/1', + label: 'Het Utrechts Archief', + label_nl: 'Het Utrechts Archief', + }); + + // A fake client whose document search always returns one hit referencing + // `https://org/1`, and whose `labels` collection export is driven by + // `exportImpl`. Counters make the export-call count observable. + function fakeClient(exportImpl: () => Promise<string>) { + let exportCalls = 0; + const client = { + collections: () => ({ + documents: () => ({ + search: () => + Promise.resolve({ + found: 1, + hits: [ + { + document: { id: 'https://d/1', publisher: ['https://org/1'] }, + }, + ], + }), + export: () => { + exportCalls += 1; + return exportImpl(); + }, + }), + }), + }; + return { + client: client as unknown as Client, + exportCalls: () => exportCalls, + }; + } + + afterEach(() => { + vi.useRealTimers(); + }); + + it('loads the collection once for concurrent searches (single-flight)', async () => { + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + }); + + const results = await Promise.all([ + engine.search(baseQuery, schema), + engine.search(baseQuery, schema), + engine.search(baseQuery, schema), + ]); + + // One export served all three concurrent searches. 
+ expect(exportCalls()).toBe(1); + for (const result of results) { + expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + } + }); + + it('serves a later search from cache without a second export', async () => { + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + }); + + await engine.search(baseQuery, schema); + await engine.search(baseQuery, schema); + + expect(exportCalls()).toBe(1); + }); + + it('reloads the collection after the TTL expires', async () => { + vi.useFakeTimers(); + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 1000, + }); + + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(1); + + // Within the TTL: still cached. + vi.advanceTimersByTime(500); + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(1); + + // Past the TTL: reload. + vi.advanceTimersByTime(600); + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(2); + }); + + it('degrades to id-only references on a load error and retries next time', async () => { + let capturedError: unknown; + let attempt = 0; + const { client, exportCalls } = fakeClient(() => { + attempt += 1; + return attempt === 1 + ? Promise.reject(new Error('labels collection unavailable')) + : Promise.resolve(labelsJsonl); + }); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + onLabelError: (error) => { + capturedError = error; + }, + }); + + // First load fails: id-only reference, error reported, nothing cached. 
+ const failed = await engine.search(baseQuery, schema); + expect(failed.hits[0].document.publisher).toEqual([ + { id: 'https://org/1' }, + ]); + expect(capturedError).toBeInstanceOf(Error); + expect(exportCalls()).toBe(1); + + // Next search retries the load (the failure was not cached) and resolves. + const recovered = await engine.search(baseQuery, schema); + expect(recovered.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + expect(exportCalls()).toBe(2); + }); +}); + +describe('fetchLabels', () => { + // A fake Typesense client whose multi_search returns the requested ids that + // exist in `docsById`, recording each POST's per-search id-lists so batching + // is observable. (Resolving via multi_search/POST avoids the GET query-string + // limit that a large id-list would otherwise overflow.) + function fakeClient(docsById: Record<string, Record<string, string>>) { + const posts: string[][][] = []; + const client = { + multiSearch: { + perform: (request: { searches: { readonly filter_by: string }[] }) => { + const batches = request.searches.map((search) => + [...search.filter_by.matchAll(/`([^`]+)`/g)].map( + (match) => match[1], + ), + ); + posts.push(batches); + const results = batches.map((ids) => { + const hits = ids + .filter((id) => docsById[id] !== undefined) + .map((id) => ({ document: { id, ...docsById[id] } })); + return { found: hits.length, hits }; + }); + return Promise.resolve({ results }); + }, + }, + }; + return { client: client as unknown as Pick<Client, 'multiSearch'>, posts }; + } + + it('resolves labels via multi_search, merging per-locale variants', async () => { + const { client, posts } = fakeClient({ + 'https://org/1': { label: 'KB', label_nl: 'KB' }, + // Only a default label (no locale variant) → untagged (`und`) fallback. 
+ 'https://org/3': { label: 'Untagged' }, + }); + const labels = await fetchLabels(client, 'labels', [ + 'https://org/1', + 'https://org/2', + 'https://org/3', + ]); + expect(labels.get('https://org/1')).toEqual({ nl: ['KB'] }); + expect(labels.get('https://org/3')).toEqual({ und: ['Untagged'] }); + // An IRI absent from the collection yields no entry. + expect(labels.has('https://org/2')).toBe(false); + expect(posts).toHaveLength(1); + }); + + it('batches a large id-list under the per_page cap, in a single POST', async () => { + const ids = Array.from( + { length: 450 }, + (_unused, index) => `https://example.org/class/${index}`, + ); + const docsById = Object.fromEntries( + ids.map((id) => [id, { label_nl: id }]), + ); + const { client, posts } = fakeClient(docsById); + const labels = await fetchLabels(client, 'labels', ids); + // 450 ids → batches of 200, 200, 50, bundled into one round-trip. + expect(posts).toHaveLength(1); + expect(posts[0].map((batch) => batch.length)).toEqual([200, 200, 50]); + expect(labels.size).toBe(450); + }); + + it('makes no request for an empty id-list', async () => { + const { client, posts } = fakeClient({}); + const labels = await fetchLabels(client, 'labels', []); + expect(labels.size).toBe(0); + expect(posts).toHaveLength(0); + }); +}); diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts new file mode 100644 index 00000000..b681551a --- /dev/null +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -0,0 +1,273 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchQuery, SearchType } from '@lde/search'; +import { buildSearchParams } from '../src/query-compiler.js'; + +const schema: SearchType = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + path: 'http://purl.org/dc/terms/title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 
}, + sortable: true, + }, + { + name: 'keyword', + path: 'http://www.w3.org/ns/dcat#keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + }, + // Filter-only, non-facet (tokenized) → exact `:=` membership. + { name: 'catalog', kind: 'keyword', array: true, filterable: true }, + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + facetable: true, + // Half-open `[min, max)` bins; the last is open-ended (no upper bound). + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10, max: 100 }, + { key: '2', min: 100 }, + ], + }, + { name: 'iiif', kind: 'boolean', filterable: true, facetable: true }, + { name: 'datePosted', kind: 'date', filterable: true, sortable: true }, + ], +}; + +const base: SearchQuery = { + where: [], + orderBy: [], + limit: 20, + offset: 0, + facets: [], + locale: 'nl', +}; + +describe('buildSearchParams', () => { + it('browses with a match-all q and the weighted query_by fields', () => { + const params = buildSearchParams(base, schema); + expect(params.q).toBe('*'); + expect(params.query_by).toBe( + 'title_search_nl,title_search_en,keyword_search', + ); + expect(params.per_page).toBe(20); + expect(params.page).toBe(1); + expect(params.filter_by).toBeUndefined(); + expect(params.sort_by).toBeUndefined(); + }); + + it('folds the query text and boosts the active locale in query_by_weights', () => { + expect( + buildSearchParams({ ...base, text: 'Kaart', locale: 'nl' }, schema), + ).toMatchObject({ q: 'kaart', query_by_weights: '5,4,1' }); + expect( + buildSearchParams({ ...base, text: 'Kaart', locale: 'en' }, schema) + .query_by_weights, + ).toBe('4,5,1'); + }); + + it('maps offset/limit to numbered pages', () => { + expect( + buildSearchParams({ ...base, offset: 40, limit: 20 }, 
schema).page, + ).toBe(3); + }); + + it('compiles where clauses, with exact membership for non-facet fields', () => { + const params = buildSearchParams( + { + ...base, + where: [ + { field: 'status', in: ['valid'] }, + { field: 'keyword', in: ['kaarten', 'atlas'] }, + { field: 'catalog', in: ['urn:cat'] }, + { field: 'format', in: ['text/turtle', 'group:rdf'] }, + { field: 'size', range: { min: 1, max: 10 } }, + { field: 'iiif', is: true }, + ], + }, + schema, + ); + expect(params.filter_by).toBe( + 'status:[`valid`] && ' + + 'keyword:[`kaarten`,`atlas`] && ' + + 'catalog:=[`urn:cat`] && ' + + 'format:[`text/turtle`,`group:rdf`] && ' + + 'size:[1..10] && ' + + 'iiif:=true', + ); + }); + + it('skips a clause that compiles to nothing and reports it via onIgnoredFilter', () => { + const ignored: unknown[] = []; + const params = buildSearchParams( + { + ...base, + where: [ + { field: 'status', in: ['valid'] }, // fine — kept + { field: 'nonexistent', in: ['x'] }, // unknown field + { field: 'keyword', range: { min: 1 } }, // operator ≠ field kind + { field: 'status', in: [] }, // empty membership + { field: 'size', range: {} }, // no usable bound + ], + }, + schema, + { onIgnoredFilter: (filter) => ignored.push(filter) }, + ); + expect(params.filter_by).toBe('status:[`valid`]'); + expect(ignored).toEqual([ + { field: 'nonexistent', in: ['x'] }, + { field: 'keyword', range: { min: 1 } }, + { field: 'status', in: [] }, + { field: 'size', range: {} }, + ]); + }); + + it('skips a non-compiling clause silently when no onIgnoredFilter is given', () => { + const params = buildSearchParams( + { ...base, where: [{ field: 'nonexistent', in: ['x'] }] }, + schema, + ); + expect(params.filter_by).toBeUndefined(); + }); + + it('compiles a one-sided range bound', () => { + expect( + buildSearchParams( + { ...base, where: [{ field: 'size', range: { min: 5 } }] }, + schema, + ).filter_by, + ).toBe('size:>=5'); + expect( + buildSearchParams( + { ...base, where: [{ field: 'size', range: 
{ max: 9 } }] }, + schema, + ).filter_by, + ).toBe('size:<=9'); + }); + + it('converts a date field’s ISO bounds to the stored Unix seconds', () => { + const min = Date.parse('2024-01-01T00:00:00Z') / 1000; + const max = Date.parse('2025-01-01T00:00:00Z') / 1000; + expect( + buildSearchParams( + { + ...base, + where: [ + { + field: 'datePosted', + range: { + min: '2024-01-01T00:00:00Z', + max: '2025-01-01T00:00:00Z', + }, + }, + ], + }, + schema, + ).filter_by, + ).toBe(`datePosted:[${min}..${max}]`); + // An unparseable bound is dropped rather than compiled into garbage. + expect( + buildSearchParams( + { + ...base, + where: [ + { + field: 'datePosted', + range: { min: 'not-a-date', max: '2025-01-01T00:00:00Z' }, + }, + ], + }, + schema, + ).filter_by, + ).toBe(`datePosted:<=${max}`); + }); + + it('compiles orderBy: RELEVANCE → _text_match and a localized field → its sort key', () => { + expect( + buildSearchParams( + { + ...base, + orderBy: [ + { field: 'relevance', direction: 'desc' }, + { field: 'status_rank', direction: 'asc' }, + ], + }, + schema, + ).sort_by, + ).toBe('_text_match:desc,status_rank:asc'); + + expect( + buildSearchParams( + { + ...base, + locale: 'nl', + orderBy: [ + { field: 'title', direction: 'asc' }, + { field: 'status_rank', direction: 'asc' }, + ], + }, + schema, + ).sort_by, + ).toBe('title_sort_nl:asc,status_rank:asc'); + }); + + it('pins page to 1 for a facet-only (limit:0) query instead of dividing by zero', () => { + const params = buildSearchParams({ ...base, limit: 0 }, schema); + expect(params.per_page).toBe(0); + expect(params.page).toBe(1); + }); + + it('requests facets by their logical field name', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword', 'format'] }, schema) + .facet_by, + ).toBe('keyword,format'); + }); + + it('facets a range field into its declared half-open bins, open ends blank', () => { + // Typesense range syntax is start-inclusive/end-exclusive, so the declared + // `[min, max)` bounds pass 
straight through; the open-ended bin leaves the + // upper bound blank. + expect( + buildSearchParams({ ...base, facets: ['size'] }, schema).facet_by, + ).toBe('size(0:[1, 10], 1:[10, 100], 2:[100, ])'); + }); + + it('mixes range and plain facets in one facet_by clause', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword', 'size'] }, schema) + .facet_by, + ).toBe('keyword,size(0:[1, 10], 1:[10, 100], 2:[100, ])'); + }); + + it('omits max_facet_values by default but sets it when configured', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword'] }, schema) + .max_facet_values, + ).toBeUndefined(); + expect( + buildSearchParams({ ...base, facets: ['keyword'] }, schema, { + maxFacetValues: 250, + }).max_facet_values, + ).toBe(250); + }); +}); diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts new file mode 100644 index 00000000..519ee774 --- /dev/null +++ b/packages/search-typesense/test/search-engine.test.ts @@ -0,0 +1,255 @@ +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import type { Client } from 'typesense'; +import type { SearchEngine, SearchQuery, SearchType } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; +import { createTypesenseSearchEngine } from '../src/search.js'; +import { TypesenseContainer } from './typesense-container.js'; + +const datasetSchema: SearchType = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 
'labelOnly' }, + }, + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { name: 'statusRank', kind: 'integer', sortable: true }, + ], +}; + +// Flat documents, as the projection would emit them (physical field names). +const documents = [ + { + id: 'd1', + title_nl: 'Kaart van Utrecht', + title_en: 'Map of Utrecht', + title_search_nl: 'kaart van utrecht', + title_search_en: 'map of utrecht', + title_sort_nl: 'kaart van utrecht', + title_sort_en: 'map of utrecht', + keyword: ['kaarten'], + keyword_search: ['kaarten'], + publisher: ['https://org/1'], + status: 'valid', + statusRank: 0, + }, + { + id: 'd2', + title_nl: 'Atlas der Nederlanden', + title_search_nl: 'atlas der nederlanden', + title_sort_nl: 'atlas der nederlanden', + keyword: ['atlas'], + keyword_search: ['atlas'], + publisher: ['https://org/2'], + status: 'valid', + statusRank: 0, + }, + { + id: 'd3', + title_nl: 'Verouderde kaart', + title_search_nl: 'verouderde kaart', + title_sort_nl: 'verouderde kaart', + keyword: ['kaarten'], + keyword_search: ['kaarten'], + publisher: ['https://org/1'], + status: 'invalid', + statusRank: 3, + }, +]; + +const labelDocuments = [ + { + id: 'https://org/1', + label: 'Het Utrechts Archief', + label_nl: 'Het Utrechts Archief', + type: 'organization', + }, + { + id: 'https://org/2', + label: 'Rijksmuseum', + label_nl: 'Rijksmuseum', + label_en: 'Rijksmuseum', + type: 'organization', + }, +]; + +const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', +}; + +describe('createTypesenseSearchEngine (integration)', () => { + const container = new TypesenseContainer(); + let client: Client; + let engine: SearchEngine; + + beforeAll(async () => { + client = await container.start(); + // Typesense accepts the generated schema (stemming, locales, int64, …). 
+ await client.collections().create( + buildCollectionSchema(datasetSchema, { + name: 'datasets', + defaultSortingField: 'statusRank', + defaultLocale: 'nl', + }), + ); + await client.collections().create({ + name: 'labels', + fields: [ + { name: 'label', type: 'string' }, + { name: 'label_nl', type: 'string', optional: true, index: false }, + { name: 'label_en', type: 'string', optional: true, index: false }, + { name: 'type', type: 'string', facet: true }, + ], + }); + await client + .collections('datasets') + .documents() + .import(documents, { action: 'create' }); + await client + .collections('labels') + .documents() + .import(labelDocuments, { action: 'create' }); + + engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + }); + }, 120_000); + + afterAll(async () => { + await container.stop(); + }); + + it('filters by status, sorts by the localized title key, and resolves reference labels', async () => { + const result = await engine.search( + { + ...baseQuery, + where: [{ field: 'status', in: ['valid'] }], + orderBy: [ + { field: 'title', direction: 'asc' }, + { field: 'statusRank', direction: 'asc' }, + ], + }, + datasetSchema, + ); + + // d3 is invalid → filtered out; remaining two sorted by folded title. 
+ expect(result.total).toBe(2); + expect(result.hits.map((hit) => hit.id)).toEqual(['d2', 'd1']); + expect(result.hits[0].document.title).toEqual({ + nl: ['Atlas der Nederlanden'], + }); + expect(result.hits[0].document.publisher).toEqual([ + { + id: 'https://org/2', + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + ]); + expect(result.hits[1].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + }); + + it('ranks a full-text query through the weighted query_by fields', async () => { + const result = await engine.search( + { + ...baseQuery, + text: 'Utrecht', + orderBy: [{ field: 'relevance', direction: 'desc' }], + }, + datasetSchema, + ); + + expect(result.hits[0].id).toBe('d1'); + expect(result.hits.map((hit) => hit.id)).not.toContain('d2'); + }); + + it('returns facet buckets with counts, labelling reference facets', async () => { + const result = await engine.search( + { ...baseQuery, facets: ['keyword', 'publisher'] }, + datasetSchema, + ); + + // Plain facet: value + count, no label. + const keyword = [...(result.facets.keyword ?? [])].sort( + (a, b) => b.count - a.count, + ); + expect(keyword).toEqual([ + { value: 'kaarten', count: 2 }, + { value: 'atlas', count: 1 }, + ]); + + // Reference facet: IRI-keyed buckets carry the resolved data label. + const publisher = [...(result.facets.publisher ?? 
[])].sort( + (a, b) => b.count - a.count, + ); + expect(publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + { + value: 'https://org/2', + count: 1, + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + ]); + }); + + it('always rejects a structurally invalid query, before reaching the engine', async () => { + await expect( + engine.search( + { ...baseQuery, where: [{ field: 'nonexistent', in: ['x'] }] }, + datasetSchema, + ), + ).rejects.toThrow(/Invalid search query for “Dataset”/); + await expect( + engine.search({ ...baseQuery, facets: ['title'] }, datasetSchema), + ).rejects.toThrow(/not-facetable/); + }); + + it('reports a vacuous where clause via onIgnoredFilter and still searches', async () => { + const ignored: unknown[] = []; + const reporting = createTypesenseSearchEngine(client, { + collection: 'datasets', + onIgnoredFilter: (filter) => ignored.push(filter), + }); + + const result = await reporting.search( + { ...baseQuery, where: [{ field: 'status', in: [] }] }, + datasetSchema, + ); + + expect(result.total).toBeGreaterThan(0); // empty membership = no constraint + expect(ignored).toEqual([{ field: 'status', in: [] }]); + }); +}); diff --git a/packages/search-typesense/tsconfig.lib.json b/packages/search-typesense/tsconfig.lib.json index e7c2ce37..52ca4bb7 100644 --- a/packages/search-typesense/tsconfig.lib.json +++ b/packages/search-typesense/tsconfig.lib.json @@ -8,7 +8,10 @@ "types": ["node"] }, "include": ["src/**/*.ts"], - "references": [], + "references": [ + { "path": "../search/tsconfig.lib.json" }, + { "path": "../text-normalization/tsconfig.lib.json" } + ], "exclude": [ "vite.config.ts", "vite.config.mts", diff --git a/packages/search-typesense/vite.config.ts b/packages/search-typesense/vite.config.ts index a09c9579..b65006b3 100644 --- a/packages/search-typesense/vite.config.ts +++ b/packages/search-typesense/vite.config.ts @@ -16,10 +16,12 @@ export default mergeConfig( // 
rethrow guards and best-effort cleanup paths are deliberately not // exercised, which is why branch coverage is lower. thresholds: { - functions: 87.5, - lines: 84.7, - branches: 66.66, - statements: 84.88, + // functions dipped a hair when the shared `filterOperator` helper + // moved to @lde/search (one fewer covered function in this package). + functions: 96.87, + lines: 94.31, + branches: 87.14, + statements: 94.37, }, }, }, diff --git a/packages/search/README.md b/packages/search/README.md index 5672881e..81df0140 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -1,170 +1,249 @@ # @lde/search -Engine-agnostic search projection for RDF-backed pipelines. **`projectGraph`** -streams the result of a SPARQL `CONSTRUCT` into flat search documents, with no -engine and no vocabulary baked in. Internally it does two things per subject of -a root type: frame its one-hop subgraph into a JSON-LD IR node, then project -that node into a flat document from a **declarative field spec**. +The core of the LDE search family: packages that together act as a **generator +for search engines**. You write one declarative `SearchSchema`, and everything +a running search engine needs is derived from it: the document projection, the +engine collection schema, the query semantics, and the API surface. All these +are kept in sync automatically rather than handwritten per deployment. + +The core itself is **engine-, API- and domain-agnostic**: it bakes in no search +engine, no API protocol, and no domain vocabulary. The engine- and API-specific +halves are adapters that plug into the ports defined here: + +- **engine adapters** implement the `SearchEngine` port: + [`@lde/search-typesense`](../search-typesense), with OpenSearch to follow; +- **API surfaces** drive it, parsing client input into `search(SearchQuery)`: + [`@lde/search-api-graphql`](../search-api-graphql), with a REST + surface to follow. 
+ +The library never names your domain: the same core drives a `Dataset`, +`Person`, or `CreativeWork` search. + +It provides four things: + +- **unified field model** — `SearchField` / `SearchType` / `SearchSchema`: + one declaration per field that drives all four consumers below, so they + cannot drift; +- **neutral query IR** — `SearchQuery` / `Filter` / `Sort` + filter + semantics: every API surface compiles into it, every engine adapter compiles + out of it, so the two cannot drift; +- **engine port** — `SearchEngine` and the logical result types + (`SearchResult` / `SearchHit` / `ResultDocument` / `Reference` / …); +- **streaming projection** — `projectGraph`, RDF `CONSTRUCT` quads → flat + search documents. -An engine adapter (e.g. [`@lde/search-typesense`](../search-typesense)) then -writes those documents to a search backend. - -```ts -import { projectGraph, type Projection } from '@lde/search'; +``` +SearchSchema ─┬─► projection (projectGraph → flat documents) [here] + ├─► engine adapter (collection schema + query compiler) e.g. @lde/search-typesense + ├─► query semantics (SearchQuery, filter/sort/facet) [here] + └─► API surface (GraphQL / REST) e.g. 
@lde/search-api-graphql +``` -const projection: Projection = { - /* type + field spec — see below */ -}; +At runtime, everything those consumers do is a **pure transformation between +data shapes**, each one parameterised by the schema — three chains, meeting at +the engine: -for await (const document of projectGraph(quads, [projection])) { - // one flat search document per matching subject, streamed -} +``` +indexing: RDF quads ──frame──► FramedNode ──project──► SearchDocument ──import──► engine +querying: client input ──parse──► SearchQuery ──compile──► engine query +results: engine response ──parse──► SearchResult ──shape──► API output ``` -`projectGraph` is fully streaming: subjects are grouped and framed one at a time -and documents are yielded as they are produced, so beyond a subject index memory -stays flat at scale (framing the whole graph at once is roughly O(N²)). Duplicate -triples are collapsed first, because some SPARQL engines (e.g. QLever) do not -deduplicate `CONSTRUCT` output. The IR carries no `@context`, so a `derivation` -reading it sees full predicate IRIs with language tags preserved. +Validation happens before the first arrow (SHACL over the RDF) and inside the +last (the engine enforces its collection schema); between them every stage is +a typed, deterministic function — easy to test, and swappable per deployment. -## Projection +## Terminology -The mapping is data, not code. Each field declares the IR `path` to read and a -`kind`; the conventions (per-locale split, diacritic folding via -[`@lde/text-normalization`](../text-normalization), facet arrays, numeric -coercion) are applied for you. Computed fields are `derivations` — hooks that -read the node and set fields the kinds can't. 
+The model has three levels, with analogues in SHACL ([one possible source](#why-a-declarative-model)) +and GraphQL (one of the surfaces): -```ts -import { projectGraph, irisOf, type Projection } from '@lde/search'; +| Term | What it is | SHACL | GraphQL | +| -------------- | --------------------------------------------------------------------------------------------------------------- | -------------- | ----------- | +| `SearchField` | One queryable field: a `kind`, the IR `path` it projects from, and the capability flags it opts into | property shape | field | +| `SearchType` | One root type’s complete declaration: its logical API `name`, its `type` IRI, its fields and derivations | NodeShape | object type | +| `SearchSchema` | The whole search declaration: every `SearchType`, keyed by `type` IRI — build one with `searchSchema(...types)` | shapes graph | schema | + +`projectGraph` and the GraphQL surface consume a `SearchSchema` (projecting +every type in one pass); the engine port executes one `SearchType` at a time. + +### API conventions + +Two conventions hold across the whole family: -const projection: Projection = { +- **Parameter order** — a function takes the value it operates on first and + the `SearchType` declaration right after it: `search(query, type)`, + `projectDocument(node, type)`, `engineFor(engine, type)`, + `buildSearchParams(query, type)`. +- **Factory verbs** — the verb tells you what kind of thing comes back. + `define*` captures a declaration as a literal (`defineSearchType`); + `build*` is a pure data-to-data constructor (`buildCollectionSchema`, + `buildSearchParams`, `buildGraphQLSchema`); `create*` makes a stateful + instance (`createTypesenseSearchEngine`). A bare noun (`searchSchema`) + constructs the trivial container it names. + +## Field model + +The mapping is data, not code. Each field declares its `kind`, the IR `path` to +read (omit it for a **derived** field, populated by a `derivation`), and the +capabilities it opts into. 
The physical field names a declaration fans out to +(per-locale search/sort keys) come from +`physicalFields`, the single convention projection, the collection schema and the +query compiler all share. + +```ts +import { + defineSearchType, + projectGraph, + irisOf, + searchSchema, +} from '@lde/search'; + +const DATASET = defineSearchType({ + name: 'Dataset', // logical API name: names the GraphQL type, a REST path, … type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ - // → title_nl, title_en, title_search_nl, title_search_en, title_sort_nl, title_sort_en + // → title_nl, title_en, title_search_nl/_en, title_sort_nl/_en { name: 'title', path: 'http://purl.org/dc/terms/title', - kind: { - type: 'langText', - locales: ['nl', 'en'], - display: true, - search: true, - sort: true, - }, + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, }, - // → publisher (IRI facet) + // → publisher (IRI facet, resolved to a labelled reference at the surface) { name: 'publisher', path: 'http://purl.org/dc/terms/publisher', - kind: { type: 'facet', iri: true }, + kind: 'reference', + facetable: true, + output: true, + ref: { type: 'Organization', strategy: 'labelOnly' }, }, // → size (int) - { name: 'size', path: 'urn:dr:size', kind: { type: 'number' } }, + { name: 'size', path: 'urn:dr:size', kind: 'integer', sortable: true }, + // derived field (no path): populated by the derivation below + { name: 'classCount', kind: 'integer', sortable: true }, ], derivations: [ (document, node) => { - document.class_count = irisOf(node, 'urn:dr:class').length; + document.classCount = irisOf(node, 'urn:dr:class').length; }, ], -}; +}); -for await (const document of projectGraph(quads, [projection])) { - // … +for await (const document of projectGraph(quads, searchSchema(DATASET))) { + // one flat search document per matching subject, streamed } ``` -**Kinds** +`defineSearchType` captures the declaration as a literal (what 
+`as const satisfies SearchType` would do manually, with nothing to remember), +so typed facet/output keys can be derived from it — see +[Typed results](#typed-results) and `@lde/search-api-graphql`. + +**Kinds** (`FieldKind`): `text`, `keyword`, `integer`, `number`, `boolean`, +`date`, `reference`. The Typesense/engine vocabulary and the GraphQL types are +_derived_ from the kind by the adapter and the surface — never declared here. -| kind | emits | -| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `langText` | per locale (see below), each opt-in: `_${locale}` display with `display`, `_search_${locale}` folded with `search`, `_sort_${locale}` folded with `sort` | -| `facet` | the field as a deduped array; `iri` reads `@id`; `search` adds a folded `_search`; `transform` rewrites values | -| `number` | a numeric scalar; `date` parses an ISO date-time to unix seconds | +| kind | `where` | facet | sort | output | +| -------------------- | -------------------- | ----- | ---------------- | ------------------------------- | +| `text` (`localized`) | – (feeds free text) | – | yes (per-locale) | best-first language list | +| `keyword` | `in` (membership) | yes | – | string / `string[]` | +| `reference` | `in` (membership) | yes | – | labelled reference (id + label) | +| `integer` / `number` | `range { min, max }` | yes | yes | number | +| `date` | `range` (inclusive) | yes | yes | ISO 8601 string (surface) | +| `boolean` | `is` | yes | – | boolean (absent = false) | + +## Projection + +`projectGraph` is fully streaming: subjects are grouped and framed one at a time +and documents are yielded as produced, so beyond a subject index memory stays +flat at scale (framing the whole graph at once is roughly O(N²)). Duplicate +triples are collapsed first, because some SPARQL engines (e.g. QLever) do not +deduplicate `CONSTRUCT` output. 
The IR carries no `@context`, so a `derivation` +reading it sees full predicate IRIs with language tags preserved. ## Locales -`locales` is the **single** list of languages a `langText` field projects; -`display`, `search` and `sort` are independent opt-in families that each fan out +`locales` is the **single** list of languages a localized `text` field projects; +`output`, `searchable` and `sortable` are independent opt-ins that each fan out over it (so a field emits exactly what it opts into): -- `display` → `title_nl`/`title_en` (accents preserved); -- `search` → `title_search_nl`/`title_search_en` (folded; one field per locale - lets a query `query_by` them and rank the user’s language higher via - `query_by_weights`, and lets a language that needs a dedicated tokenizer set - its own `locale` in the schema); -- `sort` → `title_sort_nl`/`title_sort_en` (folded, so a locale-switching UI +- `output` → `title_nl`/`title_en` (accents preserved); +- `searchable` → `title_search_nl`/`title_search_en` (folded; one field per locale + lets a query `query_by` them and rank the user’s language higher, and lets a + language that needs a dedicated tokenizer set its own stemming `locale` in the + engine schema); +- `sortable` → `title_sort_nl`/`title_sort_en` (folded, so a locale-switching UI sorts on the active language). -A field with `search` but no `display` is **search-only** — folded and stemmed -for retrieval but never rendered (e.g. a `publisher` searched here but shown via -a separate label). +A field with `searchable` but no `output` is **search-only** — folded and stemmed +for retrieval but never rendered (e.g. a creator searched here but shown via a +separate label). **Only listed locales are indexed**; a literal whose language tag +is not in `locales` (or has no tag) is not projected at all. Per-locale fields are +**omitted, never empty**, when a document lacks that language, so declare them +optional in the engine schema and sort with `missing_values: last`. 
Folding the search fields is what lets diacritic-insensitive matching and stemming coexist. A search engine on its **default** locale typically folds case -and diacritics for you (Typesense v30, verified, even folds ø/æ/ß) — so there the -folding here is belt-and-suspenders. But enabling a language’s **stemming** -requires setting that language’s `locale` (e.g. `locale: 'nl'` + `stem: true` so -`huizen` matches `huis`), and a non-default locale switches the engine to ICU -tokenization, which **preserves** diacritics. At that point the engine no longer -folds them, and `fold()` is what keeps matching diacritic-insensitive. Stemming -is a per-field engine-schema choice (the consumer’s), and being rules-based it -can mangle proper nouns and place names — e.g. the Dutch stemmer reduces the city -`Bergen` to `berg`, colliding it with “mountain”. - -Recommended split: enable stemming on the **free-text** search fields -(`*_search_${locale}`, descriptions, keywords) where morphological recall helps -(`verhaal` ↔ `verhalen`), and keep **place names and other proper-noun facets on -a separate, unstemmed field** (facets are exact-match anyway). That captures the -recall without the `Bergen`/`berg` collision in the facet. A `stem_dictionary` -can pin specific names if you need stemmed free-text without given collisions. - -**Only listed locales are indexed.** A literal whose language tag is not in -`locales` is not projected at all — no display, no search, no sort field — so it -is invisible to the index. To index a language, add it to `locales`. - -Per-locale fields are **omitted, never empty**, when a document lacks that -language, so declare them `optional: true` in the engine schema. At query time, -sort with `missing_values: last` to push documents lacking the active locale to -the end, and `query_by` all the per-locale search fields (weighting the user’s -locale higher) to keep cross-language recall. 
- -A literal with no `@language` tag matches no locale, so it is not projected. Tag -your source literals (or pre-process them) for the languages you index. +and diacritics for you; enabling a language’s **stemming** switches it to ICU +tokenization, which **preserves** diacritics — at which point `fold()` (from +[`@lde/text-normalization`](../text-normalization)) is what keeps matching +diacritic-insensitive. Stemming is rules-based and can mangle proper nouns (the +Dutch stemmer reduces the city `Bergen` to `berg`), so enable it on free-text +fields and keep proper-noun facets on a separate, unstemmed field. ## Querying The search fields are stored already case- and diacritic-folded, so **the query -must be folded the same way** with the same `fold()` from -[`@lde/text-normalization`](../text-normalization) before it reaches the engine. -Otherwise index and query are normalized differently and matches silently miss -(the user sees no results, with no error). An engine on its default locale would -fold a raw query for you, but one set to a stemming locale (which preserves -diacritics) or a non-folding backend will not — so always fold, and matching -stays correct on any engine. +must be folded the same way** with the same `fold()` before it reaches the engine, +or index and query normalize differently and matches silently miss. This contract +holds for **any** consumer, including an API built on this package — which is why +engine adapters and surfaces compile through the shared `SearchQuery` IR and the +`physicalFields` convention rather than re-deriving field names. + +Queries are **always validated**: the port contract requires every engine +adapter to reject a structurally invalid `SearchQuery` (`assertValidQuery`) — +unknown or non-`filterable` fields in `where`, an operator not matching the +field’s kind, non-`facetable` facet requests — no matter which surface or +policy produced it. 
A typed surface like GraphQL makes most of these +unrepresentable; the port enforces them for everyone else (deployment +`queryDefaults`, in-process callers, weaker-typed surfaces). + +## Typed results + +The `SearchEngine` port is loosely typed by default: facet and document keys +are plain strings. That is the correct contract for an adapter (which cannot +know your fields) and for a surface that builds queries from client input at +runtime. An **in-process caller that knows its search type at compile time** +should narrow the engine with `engineFor` — same instance, zero runtime cost: ```ts -import { fold } from '@lde/text-normalization'; - -await client - .collections(collection) - .documents() - .search({ - q: fold(userQuery), - query_by: 'title_search_nl,title_search_en', - query_by_weights: '2,1', // rank the user’s locale higher - }); +import { engineFor } from '@lde/search'; + +const datasetEngine = engineFor(engine, DATASET); + +const result = await datasetEngine.search(query, DATASET); +result.facets.publisher; // typed: only DATASET’s facetable fields are keys +result.facets.publsher; // compile error (typo) +result.hits[0].document.title; // typed: only DATASET’s output fields are keys +await datasetEngine.search(query, OTHER_TYPE); // compile error (wrong type) ``` -This contract holds for **any** consumer, including a search API built on top of -this package: index-time and query-time folding must use the same `fold()`, or -non-decomposing terms silently miss. +This only works when the search type was declared with `defineSearchType` (or +captured `as const satisfies SearchType`); a plain `: SearchType` annotation +widens the field literals away. The underlying pieces (`EngineFor`, +`FacetFieldsOf`, `OutputFieldsOf`) are exported for annotating your own +signatures. 
-## Why a spec +## Why a declarative model -The field spec's vocabulary mirrors SHACL on purpose: `path` is `sh:path`, and -the kind is derivable from `sh:datatype` / `sh:nodeKind` / `sh:maxCount` plus -search annotations. So the same projection engine that runs a hand-written spec -today will run a **SHACL-generated** spec tomorrow — the engine and the IR stay; -only spec-authoring gets automated. Nothing is thrown away. +The vocabulary mirrors SHACL on purpose: `path` is `sh:path`, `array` is +`sh:maxCount`, `required` is `sh:minCount`, `localized` is `sh:languageIn`, `ref` +is `sh:class`/`sh:node`. So the same core that runs a hand-written `SearchSchema` +today will run a **SHACL-generated** one tomorrow — the model, the ports and the +IR stay; only schema-authoring gets automated. diff --git a/packages/search/package.json b/packages/search/package.json index 61657f95..6e7414c9 100644 --- a/packages/search/package.json +++ b/packages/search/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search", "version": "0.1.2", - "description": "Engine-agnostic search projection for RDF-backed pipelines: frame CONSTRUCT quads into a JSON-LD IR, then project that IR into flat search documents from a declarative field spec (the artifact a SHACL generator would emit)", + "description": "Engine- and domain-agnostic core for RDF-backed search: a unified declarative field model (SearchField/SearchType/SearchSchema), a neutral query IR, the SearchEngine port with logical result types, and a streaming CONSTRUCT-to-document projection. 
Bakes in no engine, protocol, or domain.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search" diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts new file mode 100644 index 00000000..6e55810c --- /dev/null +++ b/packages/search/src/engine.ts @@ -0,0 +1,174 @@ +import type { SearchQuery } from './query.js'; +import type { SearchType } from './schema.js'; + +/** + * The engine port: the boundary a concrete engine adapter (e.g. + * `@lde/search-typesense`’s `TypesenseSearchEngine`) implements. The adapter + * owns every engine specific (companion-field expansion, full-text field + * selection and weights, filter compilation, sorting, result folding, faceting) + * and returns only logical documents, so a deployment can swap engines without + * any consumer noticing. + * Nothing engine-specific and nothing RDF-specific leaks past this port. + * + * Port contract: an adapter ALWAYS validates the incoming query against the + * search type (`assertValidQuery`) and rejects a structurally invalid one — + * unknown or non-filterable fields, mismatched operators, unknown facets — + * rather than passing garbage to its engine. Validation is not the caller’s + * job: it must hold for every surface and for injected deployment policy. + * + * `FacetField` keys the returned facet map; it defaults to `string` so an engine + * stays ergonomic, and a deployment can narrow it to its own facet-field union + * (see {@link FacetFieldsOf}) for typo-safe facet access. `Type` narrows the + * accepted `searchType` argument alongside, so an {@link EngineFor}-typed engine + * rejects a mismatched search type at compile time. 
+ */ +export interface SearchEngine< + FacetField extends string = string, + OutputField extends string = string, + Type extends SearchType = SearchType, +> { + search( + query: SearchQuery, + searchType: Type, + ): Promise>; +} + +/** What an engine returns: logical hits, a total, and the requested facets. */ +export interface SearchResult< + FacetField extends string = string, + OutputField extends string = string, +> { + readonly hits: readonly SearchHit[]; + readonly total: number; + readonly facets: FacetMap; +} + +/** + * Facet buckets keyed by facet field name. `Partial` because a result carries + * buckets only for the fields the query asked for, not every facetable field. + */ +export type FacetMap = Readonly< + Partial> +>; + +/** + * The facet-field-name union of a search type — the keys a {@link SearchResult}’s + * `facets` can hold. Requires the type be captured as a literal (via + * `defineSearchType` or `as const satisfies SearchType`), so the + * `facetable: true` flags survive as literals; a plain `: SearchType` + * annotation widens them and yields `never`. + */ +export type FacetFieldsOf = Extract< + Type['fields'][number], + { readonly facetable: true } +>['name']; + +/** + * The output-field-name union of a search type — the keys a {@link ResultDocument} + * can hold. Like {@link FacetFieldsOf}, requires the type captured as a literal + * (via `defineSearchType` or `as const satisfies SearchType`). + */ +export type OutputFieldsOf = Extract< + Type['fields'][number], + { readonly output: true } +>['name']; + +/** A {@link SearchEngine} narrowed to one search type: facet keys and document + * keys fixed to that type’s facetable / output field names, and `search()` + * accepting only that search type. The type must be captured as a literal + * (`defineSearchType` or `as const satisfies SearchType`); {@link engineFor} + * is the ergonomic way to obtain one. 
*/ +export type EngineFor = SearchEngine< + FacetFieldsOf, + OutputFieldsOf, + Type +>; + +/** + * Narrow an engine to one search type — the ergonomic route to an + * {@link EngineFor} view. The `const` type parameter captures the search type + * as a literal, so facet and document keys come out typo-safe without the + * caller writing any generics. Identity at runtime: the same engine instance + * is returned, only its type changes. + * + * Parameter order follows the family-wide convention: the value being + * operated on first, its `SearchType` right after. + */ +export function engineFor( + engine: SearchEngine, + searchType: Type, +): EngineFor { + void searchType; // exists only to infer `Type`; the engine is returned as-is + return engine; +} + +/** + * One result row. `id` (the stable document key, an IRI) is kept *out* of + * {@link ResultDocument}: it is always present and is the hit’s identity, a + * different contract from the optional, typed logical field values — and it maps + * straight onto the GraphQL output’s guaranteed `id: String!`. The document + * holds only the selectable fields. + */ +export interface SearchHit { + readonly id: string; + readonly document: ResultDocument; +} + +/** + * The logical result document at the query seam — engine- and RDF-neutral. + * Distinct from the flat, fanned-out projection `SearchDocument` that lives + * index-side: this carries logical fields with language maps and references, + * ready for a surface to shape. Keyed by output field name; `Partial` because a + * document omits absent optional fields. `OutputField` defaults to `string`; a + * deployment narrows it via {@link OutputFieldsOf} for typo-safe field access. + */ +export type ResultDocument = Readonly< + Partial> +>; + +/** A logical field value. 
*/ +export type SearchValue = + | string + | number + | boolean + | readonly string[] + | LocalizedValue + | Reference + | readonly Reference[]; + +/** + * A JSON-LD-style language map (`@container: @language`, `@set` arrays); the key + * `und` carries untagged (`@none`) values. The surface flattens it to a + * best-first `Accept-Language`-ordered list. + */ +export type LocalizedValue = Readonly>; + +/** + * The generic internal carrier for a referenced entity. The GraphQL surface maps + * it to a named per-shape type (e.g. `Organization`, `Term`) with `label` + * exposed as `name`. + */ +export interface Reference { + readonly id: string; + readonly label?: LocalizedValue; +} + +/** + * One facet bucket: a value and how many documents carry it. `label` is the + * engine-resolved canonical **data** label, present only for reference facets + * (IRI-keyed); it is absent for facets whose value is a token or free string + * whose display the consumer owns (its own i18n, or the value itself). + */ +export interface FacetBucket { + readonly value: string; + readonly count: number; + readonly label?: LocalizedValue; + /** + * For a range-facet bucket: its half-open bounds (`min` inclusive, `max` + * exclusive), echoing the declared {@link FacetRange} so the bucket is + * self-describing and a consumer never hardcodes the bin formula. Both absent + * for a value facet; either absent for an open-ended bin. + */ + readonly min?: number; + readonly max?: number; +} diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index 10c2b32f..b64d4432 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -1,13 +1,69 @@ +// Projection: RDF CONSTRUCT quads → flat search documents, driven by the unified +// SearchField/SearchType model below (one declaration; the fanout names come +// from `physicalFields`). 
export { projectGraph, irisOf, literalsOf, firstLiteralOf } from './project.js'; +export type { SearchDocument } from './project.js'; + +// Unified field model: one declaration drives projection, engine collection +// schema, query semantics and the GraphQL surface. Plus the field selectors and +// the physical field-name convention they all share. +export { + defineSearchType, + searchSchema, + physicalFields, + searchableFields, + facetableFields, + filterableFields, + sortableFields, + outputFields, + referenceFields, + fieldNamed, + isRangeFacet, + isoToUnixSeconds, + unixSecondsToIso, +} from './schema.js'; export type { - SearchDocument, - Projection, - FieldSpec, FieldKind, - LangTextKind, - FacetKind, - NumberKind, - DateKind, + SearchField, + SearchType, + SearchSchema, Derivation, -} from './project.js'; + PhysicalFields, + FacetRange, +} from './schema.js'; + +// Engine- and protocol-neutral query IR + filter semantics, and the always-on +// structural query validation every engine adapter enforces. +export { + filterOperatorFor, + filterOperator, + validateQuery, + assertValidQuery, + pageForOffset, +} from './query.js'; +export type { + SearchQuery, + Filter, + Sort, + FilterOperator, + QueryIssue, +} from './query.js'; + +// Engine port + the logical result document returned across it. 
+export { engineFor } from './engine.js'; +export type { + SearchEngine, + SearchResult, + SearchHit, + ResultDocument, + SearchValue, + LocalizedValue, + Reference, + FacetBucket, + FacetMap, + FacetFieldsOf, + OutputFieldsOf, + EngineFor, +} from './engine.js'; + export type { FramedNode } from './frame-by-type.js'; diff --git a/packages/search/src/project.ts b/packages/search/src/project.ts index c181978f..75f73b04 100644 --- a/packages/search/src/project.ts +++ b/packages/search/src/project.ts @@ -1,135 +1,57 @@ import type { Quad } from '@rdfjs/types'; import { fold } from '@lde/text-normalization'; import { frameByType, type FramedNode } from './frame-by-type.js'; +import { + isoToUnixSeconds, + physicalFields, + type SearchField, + type SearchSchema, + type SearchType, +} from './schema.js'; /** A flat search document. `id` is the engine document key. */ export type SearchDocument = { id: string } & Record; -/** - * How one framed-IR property projects into search fields. The vocabulary mirrors - * SHACL so a generator can later emit it from shapes + search annotations: - * `path` is `sh:path`, and the kind is derivable from `sh:datatype`/`sh:nodeKind` - * /`sh:maxCount` plus the search annotations. - */ -export type FieldKind = LangTextKind | FacetKind | NumberKind | DateKind; - -/** - * Language-tagged text, projected per locale. `locales` is the single source of - * truth for which languages this field emits; `display`, `search` and `sort` are - * three independent opt-in families that each fan out over it: - * - `display` → `${name}_${locale}` display label, accents preserved; - * - `search` → `${name}_search_${locale}` folded match field (one per locale so - * the engine can tokenize/stem each language and the query can rank the user’s - * locale higher); - * - `sort` → `${name}_sort_${locale}` folded sort key (one per locale so a - * locale-switching UI sorts on the active language). 
- * - * All three default off — a field emits exactly the families it opts into (e.g. - * `search` alone is a search-only field, shown via a separate label). Only listed - * locales are projected: a value whose language tag is not in `locales` (and is - * not mapped in by `untaggedLanguage`) is not indexed at all. - */ -export interface LangTextKind { - readonly type: 'langText'; - /** The languages to project; drives whichever of the families are enabled. */ - readonly locales: readonly string[]; - /** Emit the per-locale display labels `${name}_${locale}` (accents preserved). */ - readonly display?: boolean; - /** Emit a folded `${name}_search_${locale}` per locale (matchable). */ - readonly search?: boolean; - /** Emit a folded `${name}_sort_${locale}` per locale (sortable). */ - readonly sort?: boolean; -} - -/** A faceted multi-value field, optionally also folded for search. */ -export interface FacetKind { - readonly type: 'facet'; - /** Read IRI references (`@id`) rather than literal values. */ - readonly iri?: boolean; - /** Also emit a folded `${name}_search` array. */ - readonly search?: boolean; - /** Transform each value before faceting (e.g. strip a media-type prefix). */ - readonly transform?: (value: string) => string; -} - -/** A numeric scalar. */ -export interface NumberKind { - readonly type: 'number'; -} - -/** An ISO date-time, parsed into Unix seconds. */ -export interface DateKind { - readonly type: 'date'; -} - -/** - * One field of a projection: an output `name`, the framed-IR predicate `path` to - * read (the SHACL `sh:path`), and the kind-specific config discriminated by - * `type`. - */ -export type FieldSpec = { - /** Output field base name; per-kind suffixes are appended. */ - readonly name: string; - /** Framed-IR predicate IRI to read (the SHACL `sh:path`). */ - readonly path: string; -} & FieldKind; - -/** A computed field that is not a direct projection of a single path - * (e.g. 
a status rank, or a group derived from a code table). */ -export type Derivation = (document: SearchDocument, node: FramedNode) => void; - -/** - * One root type’s complete projection — the runtime form of a single SHACL - * NodeShape: `type` is its `sh:targetClass` (and the framed node’s `@type`), - * `fields` are its property shapes, and `derivations` are its `sh:rule`-shaped - * computed fields. A generator emits one of these per NodeShape. - */ -export interface Projection { - readonly type: string; - readonly fields: readonly FieldSpec[]; - readonly derivations?: readonly Derivation[]; -} - /** * Project one framed JSON-LD node into a flat search document: apply each field - * spec, then run the derivations (which may read fields the specs already set). + * of the type, then run the derivations (which may read fields the field specs + * already set). The physical field names a field fans out to come from + * {@link physicalFields}, the single source shared with the engine collection + * schema and the query compiler. */ export function projectDocument( node: FramedNode, - projection: Projection, + searchType: SearchType, ): SearchDocument { const id = node['@id']; if (typeof id !== 'string') { throw new Error( - `Cannot project a ${projection.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, + `Cannot project a ${searchType.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, ); } const document: SearchDocument = { id }; - for (const field of projection.fields) { + for (const field of searchType.fields) { applyField(document, node, field); } - for (const derive of projection.derivations ?? []) { + for (const derive of searchType.derivations ?? 
[]) { derive(document, node); } return document; } /** - * Frame `quads` for every projection’s root type and project each node with its - * type’s projection — the multi-shape pipeline. Streams one document at a time - * so memory stays flat. The IR maps to a projection by type, so adding a shape - * is adding a `Projection` (no engine change). + * Frame `quads` for every root type in the schema and project each node with its + * type’s declaration — the multi-shape pipeline. Streams one document at a time + * so memory stays flat. The IR maps to a declaration by type, so adding a shape + * is adding a `SearchType` to the schema (no engine change). */ export async function* projectGraph( quads: readonly Quad[], - projections: readonly Projection[], + schema: SearchSchema, ): AsyncIterable { - const byType = new Map( - projections.map((projection) => [projection.type, projection]), - ); - for (const projection of byType.values()) { - for await (const node of frameByType(quads, projection.type)) { - yield projectDocument(node, projection); + for (const searchType of schema.values()) { + for await (const node of frameByType(quads, searchType.type)) { + yield projectDocument(node, searchType); } } } @@ -137,77 +59,110 @@ export async function* projectGraph( function applyField( document: SearchDocument, node: FramedNode, - field: FieldSpec, + field: SearchField, ): void { - switch (field.type) { - case 'langText': - return applyLangText(document, langValuesOf(node, field.path), field); - case 'facet': - return applyFacet(document, node, field); + const path = field.path; + if (path === undefined) { + // A derived field — populated by a derivation, not projected from a path. 
+ return; + } + switch (field.kind) { + case 'text': + return applyLocalizedText(document, langValuesOf(node, path), field); + case 'keyword': + return applyFacet(document, literalsOf(node, path), field); + case 'reference': + return applyFacet(document, irisOf(node, path), field); + case 'integer': + return setNumber( + document, + field.name, + toInteger(firstLiteralOf(node, path)), + ); case 'number': return setNumber( document, field.name, - toInteger(firstLiteralOf(node, field.path)), + toNumber(firstLiteralOf(node, path)), ); - case 'date': + case 'date': { + const literal = firstLiteralOf(node, path); return setNumber( document, field.name, - isoToUnix(firstLiteralOf(node, field.path)), + literal === undefined ? undefined : isoToUnixSeconds(literal), ); + } + case 'boolean': { + // The xsd:boolean lexical space: true/false/1/0. + const literal = firstLiteralOf(node, path); + if (literal !== undefined) { + document[field.name] = literal === 'true' || literal === '1'; + } + return; + } } } -function applyLangText( +/** + * Project a language-tagged text field per locale. Display shows one label + * (accents preserved) when the field is `output`; sort keys off that same + * primary value (folded) when `sortable`; search folds every value of the locale + * when `searchable`, so all are matchable. Absent locales emit nothing. + */ +function applyLocalizedText( document: SearchDocument, values: readonly LangValue[], - { name, locales, display, search, sort }: Extract, + field: SearchField, ): void { + const locales = field.locales ?? 
[]; if (locales.length === 0) { throw new Error( - `langText field “${name}” must declare at least one locale; nothing would be projected otherwise.`, + `Localized text field “${field.name}” must declare at least one locale; nothing would be projected otherwise.`, ); } - for (const locale of locales) { + const names = physicalFields(field); + locales.forEach((locale, index) => { const localeValues = values .filter((value) => value.lang === locale) .map((value) => value.value); if (localeValues.length === 0) { - continue; + return; } - // Display shows one label (accents preserved); sort keys off that same - // primary value (folded); search folds every value of the locale so all - // are matchable. Absent locales emit nothing (the field stays optional). const [primary] = localeValues; - if (display) { - setString(document, `${name}_${locale}`, primary); + if (field.output) { + setString(document, names.display[index], primary); } - if (search) { + if (field.searchable) { setString( document, - `${name}_search_${locale}`, + names.search[index], fold(localeValues.join(' ')).trim(), ); } - if (sort) { - setString(document, `${name}_sort_${locale}`, fold(primary)); + if (field.sortable) { + setString(document, names.sort[index], fold(primary)); } - } + }); } +/** + * Project a faceted multi-value field: dedupe (after the optional transform), + * write the value field, and — when `searchable` — a folded `${name}_search` + * array. `keyword` reads literals; `reference` reads IRIs (the caller passes the + * already-read raw values). + */ function applyFacet( document: SearchDocument, - node: FramedNode, - { name, path, iri, search, transform }: Extract, + raw: readonly string[], + field: SearchField, ): void { - const raw = iri ? irisOf(node, path) : literalsOf(node, path); - const values = dedupe(transform ? raw.map(transform) : raw); - setArray(document, name, values); - if (search) { + const values = dedupe(field.transform ? 
raw.map(field.transform) : raw); + setArray(document, field.name, values); + if (field.searchable) { setArray( document, - `${name}_search`, + physicalFields(field).search[0], dedupe(values.map((value) => fold(value))), ); } @@ -296,12 +251,8 @@ function toInteger(literal: string | undefined): number | undefined { return literal === undefined ? undefined : Math.trunc(Number(literal)); } -function isoToUnix(iso: string | undefined): number | undefined { - if (iso === undefined) { - return undefined; - } - const millis = new Date(iso).getTime(); - return Number.isNaN(millis) ? undefined : Math.trunc(millis / 1000); +function toNumber(literal: string | undefined): number | undefined { + return literal === undefined ? undefined : Number(literal); } function setNumber( diff --git a/packages/search/src/query.ts b/packages/search/src/query.ts new file mode 100644 index 00000000..f626157d --- /dev/null +++ b/packages/search/src/query.ts @@ -0,0 +1,183 @@ +import { fieldNamed, type FieldKind, type SearchType } from './schema.js'; + +/** + * The engine- and protocol-neutral query IR. Every API surface compiles its + * input into this; every engine adapter compiles it into an engine query. One + * shared representation in the middle keeps the GraphQL surface, a later REST + * surface and the adapter from drifting. + */ +export interface SearchQuery { + /** Free-text query; `undefined`/`''` means browse (no text ranking). */ + readonly text?: string; + /** AND across fields. */ + readonly where: readonly Filter[]; + /** Primary public sort plus any server tie-breaks, in precedence order. */ + readonly orderBy: readonly Sort[]; + /** Numbered pagination. */ + readonly limit: number; + readonly offset: number; + /** Logical field names to return facet buckets for. */ + readonly facets: readonly string[]; + /** Selects the per-locale fields to query/sort on (from `Accept-Language`). */ + readonly locale: string; +} + +/** + * One `where` clause. 
The operator is fixed by the target field’s {@link FieldKind} + * ({@link filterOperatorFor}): keyword/reference use `in` (OR within the field), + * the numeric/date kinds use an inclusive `range`, boolean uses `is`. Bounds are + * inclusive only — no `gt`/`gte`/`lt`/`lte`. + */ +export type Filter = + | { readonly field: string; readonly in: readonly string[] } + | { + readonly field: string; + readonly range: { + readonly min?: number | string; + readonly max?: number | string; + }; + } + | { readonly field: string; readonly is: boolean }; + +/** A single sort dimension. */ +export interface Sort { + readonly field: string; + readonly direction: 'asc' | 'desc'; +} + +/** The `where` operator a kind accepts, or `undefined` when it is not filterable + * through `where` (`text` feeds the free-text `query` instead). */ +export type FilterOperator = 'in' | 'range' | 'is'; + +const OPERATOR_BY_KIND: Readonly< + Record +> = { + text: undefined, + keyword: 'in', + reference: 'in', + integer: 'range', + number: 'range', + date: 'range', + boolean: 'is', +}; + +/** + * The `where` operator a field of this kind accepts (per the ADR filter-semantics + * table), or `undefined` for `text` — which feeds the free-text `query` rather + * than `where`. Drives both the surface’s `where` input type and the adapter’s + * filter compiler from one rule. + */ +export function filterOperatorFor(kind: FieldKind): FilterOperator | undefined { + return OPERATOR_BY_KIND[kind]; +} + +/** The operator a {@link Filter} value carries, from its discriminating key. */ +export function filterOperator(filter: Filter): FilterOperator { + return 'in' in filter ? 'in' : 'range' in filter ? 'range' : 'is'; +} + +/** + * One structural problem {@link validateQuery} found: the query references a + * field the search type does not declare, or uses it in a role it does not + * opt into. 
Vacuous-but-valid clauses (an empty `in` list, a `range` with no + * bound) are NOT issues — a compiler skips those as no-ops. + */ +export interface QueryIssue { + readonly part: 'where' | 'facets' | 'orderBy'; + readonly field: string; + readonly reason: + | 'unknown-field' + | 'not-filterable' + | 'operator-mismatch' + | 'not-facetable'; +} + +/** + * Structurally validate a query against its search type: every `where` clause + * targets a declared, `filterable` field with the operator its kind accepts + * ({@link filterOperatorFor}); every requested facet is a declared, `facetable` + * field; every sort is `relevance` or a declared field. Sorting deliberately + * checks declaration only, not the `sortable` flag: that flag means *publicly + * selectable*, and a deployment policy may sort on a private tie-break field. + * + * This is the port’s always-on guard: every {@link SearchEngine} adapter MUST + * reject a query with issues ({@link assertValidQuery}) instead of passing + * garbage to its engine, so validation holds for every caller — including + * `queryDefaults` policies and surfaces weaker than GraphQL. 
+ */ +export function validateQuery( + query: SearchQuery, + searchType: SearchType, +): readonly QueryIssue[] { + const issues: QueryIssue[] = []; + for (const filter of query.where) { + const field = fieldNamed(searchType, filter.field); + if (field === undefined) { + issues.push({ + part: 'where', + field: filter.field, + reason: 'unknown-field', + }); + } else if (field.filterable !== true) { + issues.push({ + part: 'where', + field: filter.field, + reason: 'not-filterable', + }); + } else if (filterOperatorFor(field.kind) !== filterOperator(filter)) { + issues.push({ + part: 'where', + field: filter.field, + reason: 'operator-mismatch', + }); + } + } + for (const name of query.facets) { + const field = fieldNamed(searchType, name); + if (field === undefined) { + issues.push({ part: 'facets', field: name, reason: 'unknown-field' }); + } else if (field.facetable !== true) { + issues.push({ part: 'facets', field: name, reason: 'not-facetable' }); + } + } + for (const sort of query.orderBy) { + if ( + sort.field !== 'relevance' && + fieldNamed(searchType, sort.field) === undefined + ) { + issues.push({ + part: 'orderBy', + field: sort.field, + reason: 'unknown-field', + }); + } + } + return issues; +} + +/** Throw on the first structurally invalid query part ({@link validateQuery}), + * naming every issue. The always-on entry point for engine adapters. */ +export function assertValidQuery( + query: SearchQuery, + searchType: SearchType, +): void { + const issues = validateQuery(query, searchType); + if (issues.length > 0) { + const detail = issues + .map((issue) => `${issue.part}: “${issue.field}” (${issue.reason})`) + .join(', '); + throw new Error( + `Invalid search query for “${searchType.name}”: ${detail}.`, + ); + } +} + +/** + * The 1-based page an `offset` falls on — the numbered-pagination presentation + * of the IR, shared by the surfaces and the adapters. 
`limit: 0` (a facet-only + * query) fetches no hits and has no meaningful page, so it pins to 1 rather + * than dividing by zero. + */ +export function pageForOffset(offset: number, limit: number): number { + return limit > 0 ? Math.floor(offset / limit) + 1 : 1; +} diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts new file mode 100644 index 00000000..3c3d7374 --- /dev/null +++ b/packages/search/src/schema.ts @@ -0,0 +1,276 @@ +import type { FramedNode } from './frame-by-type.js'; +import type { SearchDocument } from './project.js'; + +/** + * The engine-neutral kind of a queryable field. It drives every downstream + * behavior: which physical fields the projection emits, the engine + * collection-schema type, the `where`/facet/sort semantics, and the GraphQL + * output/input type. The Typesense-vocabulary types (`string`, `int32`, …) are + * *derived* from this by the engine adapter, never declared here. + */ +export type FieldKind = + | 'text' + | 'keyword' + | 'integer' + | 'number' + | 'boolean' + | 'date' + | 'reference'; + +/** + * One queryable field — the single declarative source that drives all four + * consumers (projection, engine collection schema, query semantics, and the + * GraphQL surface). + * + * Capability flags (`searchable`/`filterable`/`facetable`/`sortable`/`output`) + * are independent opt-ins: a field exposes exactly the roles it declares. A + * field with no `path` is a **derived field** — populated by a + * {@link Derivation} rather than projected from the IR — yet it still carries + * full query/schema/output behavior (e.g. `status`, the compatibility booleans). + * + * The physical field names a declaration fans out to (per-locale search/sort + * keys) follow one convention, owned by + * {@link physicalFields} so projection, collection-schema and query compiler + * cannot disagree. 
+ * + * SHACL is one possible *source*, not a dependency: a generator can emit a + * declaration from a NodeShape + `search:` annotations + * (`kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, `array`←`sh:maxCount`, + * `localized`←`rdf:langString`/`sh:languageIn`, `ref`←`sh:node`/`sh:class`), + * and a hand-written declaration is just as valid. + */ +export interface SearchField { + /** Logical API name; the physical fanout derives from it. Declare camelCase + * where it surfaces in GraphQL. */ + readonly name: string; + readonly kind: FieldKind; + /** Framed-IR predicate IRI to project from. Omit for a + * derivation-populated field. */ + readonly path?: string; + /** Multi-valued. */ + readonly array?: boolean; + /** Always present: a non-null scalar in the API output and + * a non-optional field in the engine index. Moot for arrays/booleans/`id`, + * which are non-null regardless. */ + readonly required?: boolean; + /** Language-tagged text (`rdf:langString`); projected per locale. `text` only. */ + readonly localized?: boolean; + /** When `localized`, the languages to emit (the per-locale fanout). */ + readonly locales?: readonly string[]; + /** Appears in the API output type / carries a display label. */ + readonly output?: boolean; + /** Full-text inclusion with a `query_by` weight (folded; per-locale when + * `localized`). Presence is what makes a field searchable. */ + readonly searchable?: { readonly weight: number }; + /** Usable in `where`. */ + readonly filterable?: boolean; + /** Returned as facet buckets. */ + readonly facetable?: boolean; + /** Publicly selectable in `orderBy`; localized text also emits a folded sort key. */ + readonly sortable?: boolean; + /** For `kind: 'reference'`: the referenced shape and how much of it to carry. */ + readonly ref?: { + readonly type: string; + readonly strategy: 'labelOnly' | 'idOnly' | 'inline'; + }; + /** Projection-time value transform (e.g. strip a media-type prefix). 
*/ + readonly transform?: (value: string) => string; + /** + * Range-facet bins for a numeric (`integer`/`number`/`date`) facetable field. + * When set, the field facets into these fixed half-open `[min, max)` ranges (a + * histogram) rather than one bucket per distinct value — the per-bucket counts + * a UI slider needs. Bins are query-time only (no index impact) and + * engine-neutral: the Typesense adapter emits a `facet_by` range, an + * OpenSearch adapter a `range` aggregation. See {@link FacetRange}. + */ + readonly facetRanges?: readonly FacetRange[]; +} + +/** + * One half-open `[min, max)` range-facet bin: `min` inclusive, `max` exclusive, + * so contiguous bins partition cleanly with no boundary double-counting. Omit + * `min` (or `max`) for an open-ended bin (`< max`, resp. `≥ min`). `key` is the + * bucket’s stable label, echoed back as the {@link FacetBucket} `value`. + */ +export interface FacetRange { + readonly key: string; + readonly min?: number; + readonly max?: number; +} + +/** + * A computed field that is not a direct projection of a single path — a status + * rank, a compatibility boolean. Reads + * the framed node and writes onto the flat document the field specs already + * populated. + */ +export type Derivation = (document: SearchDocument, node: FramedNode) => void; + +/** + * One root type’s complete search declaration: its logical API `name`, the + * `type` IRI its documents are instances of, the queryable `fields`, and the + * computed `derivations`. A SHACL generator can emit one per NodeShape + * (`name`←`sh:name`/local name, `type`←`sh:targetClass`, `fields`←its property + * shapes), but that is a source, not a requirement. + */ +export interface SearchType { + /** Logical API name (PascalCase, e.g. `Dataset`) — names the type in the API + * surfaces (GraphQL type names, a REST path), the way each field’s + * {@link SearchField.name} names that field. 
Deliberately declared rather + * than derived from the `type` IRI, so re-modelling the vocabulary cannot + * silently rename the public contract. */ + readonly name: string; + readonly type: string; + readonly fields: readonly SearchField[]; + readonly derivations?: readonly Derivation[]; +} + +/** + * Declare a {@link SearchType}, capturing it as a literal: the `const` type + * parameter preserves the field names and capability flags that the type-level + * helpers (`FacetFieldsOf`, `OutputFieldsOf`, `EngineFor`) read off the type — + * with none of the widening a plain `: SearchType` annotation causes and + * without having to remember `as const satisfies SearchType`. Identity at + * runtime. + */ +export function defineSearchType( + searchType: Type, +): Type { + return searchType; +} + +/** + * The complete search declaration of a deployment: every root {@link SearchType}, + * keyed by its `type` IRI. Build one with {@link searchSchema}. + */ +export type SearchSchema = ReadonlyMap; + +/** Build a {@link SearchSchema} from root-type declarations, keyed by `type`. */ +export function searchSchema(...types: readonly SearchType[]): SearchSchema { + return new Map(types.map((searchType) => [searchType.type, searchType])); +} + +/** + * The physical engine fields one {@link SearchField} fans out into, grouped by + * the role each plays. The single source of truth for the naming convention, so + * the projection (writes them), the collection schema (declares them) and the + * query compiler (reads them) cannot disagree. + */ +export interface PhysicalFields { + /** Per-locale output labels `${name}_${locale}` (localized text, `output`). */ + readonly display: readonly string[]; + /** Folded match fields: `${name}_search_${locale}` per locale (localized) or a + * single `${name}_search` (non-localized), when `searchable`. 
*/ + readonly search: readonly string[]; + /** Per-locale folded sort keys `${name}_sort_${locale}` (localized text, + * `sortable`); a non-localized field sorts on its own `name` field. */ + readonly sort: readonly string[]; +} + +/** + * Full-text searchable fields, highest `query_by` weight first — the order the + * engine adapter weights `query_by` in. A field is searchable iff it carries a + * `searchable` weight. + */ +export function searchableFields( + searchType: SearchType, +): readonly (SearchField & { + readonly searchable: { readonly weight: number }; +})[] { + return searchType.fields + .filter( + (field): field is SearchField & { searchable: { weight: number } } => + field.searchable !== undefined, + ) + .sort((left, right) => right.searchable.weight - left.searchable.weight); +} + +/** Fields returned as facet buckets, in declaration order. */ +export function facetableFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.facetable === true); +} + +/** Fields usable in `where`, in declaration order. */ +export function filterableFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.filterable === true); +} + +/** Fields publicly selectable in `orderBy`, in declaration order. */ +export function sortableFields(searchType: SearchType): readonly SearchField[] { + return searchType.fields.filter((field) => field.sortable === true); +} + +/** Fields that appear in the API output type, in declaration order. */ +export function outputFields(searchType: SearchType): readonly SearchField[] { + return searchType.fields.filter((field) => field.output === true); +} + +/** Fields of kind `reference` (IRI-valued, label-resolved), in declaration order. 
*/ +export function referenceFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.kind === 'reference'); +} + +/** Look up a field by its logical name. */ +export function fieldNamed( + searchType: SearchType, + name: string, +): SearchField | undefined { + return searchType.fields.find((field) => field.name === name); +} + +/** + * Whether a facet on this field returns fixed range bins (a histogram) rather + * than one bucket per distinct value: it declares non-empty + * {@link SearchField.facetRanges}. One predicate for the surface’s facet type, + * the adapter’s facet clause and the bucket reconstruction, so they cannot + * disagree. + */ +export function isRangeFacet( + field: SearchField, +): field is SearchField & { readonly facetRanges: readonly FacetRange[] } { + return field.facetRanges !== undefined && field.facetRanges.length > 0; +} + +/** + * The engine storage codec for `date` fields: stored as Unix seconds (a + * sortable, range-filterable int64), ISO 8601 at the API edges. One pair for + * the projection (writes), the query compiler (filter bounds) and the surface + * (output), so the three cannot disagree. Returns `undefined` for an + * unparseable value. + */ +export function isoToUnixSeconds(iso: string): number | undefined { + const millis = new Date(iso).getTime(); + return Number.isNaN(millis) ? undefined : Math.trunc(millis / 1000); +} + +/** The inverse of {@link isoToUnixSeconds}: stored Unix seconds → ISO 8601. */ +export function unixSecondsToIso(seconds: number): string { + return new Date(seconds * 1000).toISOString(); +} + +/** Derive the physical engine field names a declaration produces. */ +export function physicalFields(field: SearchField): PhysicalFields { + const localized = field.kind === 'text' && field.localized === true; + const locales = localized ? (field.locales ?? []) : []; + return { + display: + localized && field.output + ? 
locales.map((locale) => `${field.name}_${locale}`) + : [], + search: field.searchable + ? localized + ? locales.map((locale) => `${field.name}_search_${locale}`) + : [`${field.name}_search`] + : [], + sort: + localized && field.sortable + ? locales.map((locale) => `${field.name}_sort_${locale}`) + : [], + }; +} diff --git a/packages/search/test/engine.test.ts b/packages/search/test/engine.test.ts new file mode 100644 index 00000000..7a8df1c0 --- /dev/null +++ b/packages/search/test/engine.test.ts @@ -0,0 +1,149 @@ +import { describe, expect, it } from 'vitest'; +import { engineFor } from '../src/engine.js'; +import type { EngineFor, SearchEngine, SearchResult } from '../src/engine.js'; +import type { SearchQuery } from '../src/query.js'; +import { defineSearchType } from '../src/schema.js'; +import type { SearchType } from '../src/schema.js'; + +const schema: SearchType = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [{ name: 'title', kind: 'text', localized: true, locales: ['nl'] }], +}; + +// A fake engine: the port is implementable and the result types compose into a +// logical document (language map + reference) the way a real engine returns. +const fake: SearchEngine = { + async search(query: SearchQuery): Promise { + return { + total: 1, + hits: [ + { + id: 'https://example/dataset/1', + document: { + title: { nl: ['Erfgoed'], und: [query.text ?? 
''] }, + publisher: { + id: 'https://example/org/1', + label: { nl: ['Archief'] }, + }, + keyword: ['kaarten', 'atlas'], + }, + }, + ], + facets: { keyword: [{ value: 'kaarten', count: 3 }] }, + }; + }, +}; + +describe('SearchEngine port', () => { + it('returns logical hits, total and facets through the port', async () => { + const query: SearchQuery = { + text: 'kaart', + where: [], + orderBy: [{ field: 'relevance', direction: 'desc' }], + limit: 20, + offset: 0, + facets: ['keyword'], + locale: 'nl', + }; + + const result = await fake.search(query, schema); + + expect(result.total).toBe(1); + expect(result.hits[0].id).toBe('https://example/dataset/1'); + expect(result.hits[0].document.title).toEqual({ + nl: ['Erfgoed'], + und: ['kaart'], + }); + expect(result.facets.keyword).toEqual([{ value: 'kaarten', count: 3 }]); + }); +}); + +describe('typed facet and document keys', () => { + it('keys facets and the result document by the schema’s field names', async () => { + // Captured as a literal (`as const satisfies`) so the `facetable`/`output` + // flags survive and the `…Of` helpers can read the field names off the type. + const datasetSchema = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + }, + { name: 'format', kind: 'keyword', array: true, facetable: true }, + { name: 'status', kind: 'keyword', facetable: true }, + ], + } as const satisfies SearchType; + + // facets ⊂ { format, status }, document keys ⊂ { title }. These object + // literals would not compile if the helpers widened to `string`/`never`. 
+ const engine: EngineFor = { + async search() { + return { + total: 1, + hits: [ + { + id: 'https://example/d/1', + document: { title: { nl: ['Titel'] } }, + }, + ], + facets: { format: [{ value: 'text/turtle', count: 2 }] }, + }; + }, + }; + + const result = await engine.search( + { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: ['format'], + locale: 'nl', + }, + datasetSchema, + ); + + expect(result.facets.format).toEqual([{ value: 'text/turtle', count: 2 }]); + expect(result.hits[0].document.title).toEqual({ nl: ['Titel'] }); + }); + + it('accepts only the search type it was narrowed to', () => { + // `defineSearchType` captures the literal (no `as const` needed): the + // `facetable: true` flag must survive for `FacetFieldsOf` to see it. + const datasetSchema = defineSearchType({ + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [{ name: 'format', kind: 'keyword', facetable: true }], + }); + const organizationSchema = defineSearchType({ + name: 'Organization', + type: 'http://xmlns.com/foaf/0.1/Organization', + fields: [{ name: 'sector', kind: 'keyword', facetable: true }], + }); + const query: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // `engineFor` narrows a generic adapter (plain `SearchEngine`) to any + // `EngineFor` — the same instance, identity at runtime. 
+ const engine: EngineFor = engineFor( + fake, + datasetSchema, + ); + expect(engine).toBe(fake); + + void engine.search(query, datasetSchema); + // @ts-expect-error — a mismatched search type is rejected at compile time + void engine.search(query, organizationSchema); + }); +}); diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index 60c42f71..96b6a888 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -5,11 +5,14 @@ import { projectDocument, projectGraph, irisOf, - type FieldSpec, - type Derivation, - type Projection, type SearchDocument, } from '../src/project.js'; +import { + searchSchema, + type SearchField, + type SearchType, + type Derivation, +} from '../src/schema.js'; const DR = 'urn:dr:'; const IANA = 'https://www.iana.org/assignments/media-types/'; @@ -30,49 +33,50 @@ const node = { [`${DR}size`]: { '@type': xsd.integer.value, '@value': '1234' }, }; -const fields: FieldSpec[] = [ +const fields: SearchField[] = [ { name: 'title', path: dcterms.title.value, - type: 'langText', + kind: 'text', + localized: true, locales: ['nl', 'en'], - display: true, - search: true, - sort: true, + output: true, + searchable: { weight: 1 }, + sortable: true, }, { name: 'publisher', path: `${DR}publisherName`, - type: 'langText', + kind: 'text', + localized: true, locales: ['nl', 'en'], - display: true, - search: true, + output: true, + searchable: { weight: 1 }, }, { name: 'publisher', path: dcterms.publisher.value, - type: 'facet', - iri: true, + kind: 'reference', }, { name: 'keyword', path: dcat.keyword.value, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, }, { name: 'format', path: `${DR}format`, - type: 'facet', + kind: 'keyword', transform: (value) => value.replace(IANA, ''), }, - { name: 'class', path: `${DR}class`, type: 'facet', iri: true }, + { name: 'class', path: `${DR}class`, kind: 'reference' }, { name: 'date_posted', path: `${DR}datePosted`, 
- type: 'date', + kind: 'date', }, - { name: 'size', path: `${DR}size`, type: 'number' }, + { name: 'size', path: `${DR}size`, kind: 'integer' }, ]; const derivations: Derivation[] = [ @@ -81,11 +85,16 @@ const derivations: Derivation[] = [ }, ]; -const projection: Projection = { type: DATASET, fields, derivations }; +const schema: SearchType = { + name: 'Dataset', + type: DATASET, + fields, + derivations, +}; describe('projectDocument', () => { it('projects every field kind and runs derivations', () => { - const document = projectDocument(node, projection); + const document = projectDocument(node, schema); expect(document.id).toBe('https://ex/d/1'); expect(document.title_nl).toBe('Titel'); @@ -119,25 +128,25 @@ describe('projectDocument', () => { [`${DR}class`]: 'http://example.org/BareClass', }, { + name: 'Dataset', type: DATASET, fields: [ - { name: 'size', path: `${DR}size`, type: 'number' }, + { name: 'size', path: `${DR}size`, kind: 'integer' }, { name: 'language', path: dcterms.language.value, - type: 'facet', + kind: 'keyword', }, { name: 'keyword', path: dcat.keyword.value, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, }, { name: 'class', path: `${DR}class`, - type: 'facet', - iri: true, + kind: 'reference', }, ], }, @@ -148,17 +157,51 @@ describe('projectDocument', () => { expect(document.class).toEqual(['http://example.org/BareClass']); }); + it('projects a number field as a float (not truncated like integer)', () => { + const document = projectDocument( + { '@id': 'https://ex/d/12', [`${DR}size`]: { '@value': '1234.5' } }, + { + name: 'Dataset', + type: DATASET, + fields: [{ name: 'size', path: `${DR}size`, kind: 'number' }], + }, + ); + expect(document.size).toBe(1234.5); + }); + + it('projects a boolean field from a path (xsd:boolean lexical space)', () => { + const withBoolean: SearchType = { + name: 'Dataset', + type: DATASET, + fields: [{ name: 'iiif', path: `${DR}iiif`, kind: 'boolean' }], + }; + const project = 
(value: unknown): SearchDocument => + projectDocument( + { '@id': 'https://ex/d/5', [`${DR}iiif`]: { '@value': value } }, + withBoolean, + ); + + expect(project('true').iiif).toBe(true); + expect(project('1').iiif).toBe(true); + expect(project('false').iiif).toBe(false); + // Absent value → no field (the adapter reconstructs absence as false). + expect( + projectDocument({ '@id': 'https://ex/d/5' }, withBoolean).iiif, + ).toBeUndefined(); + }); + it('folds the transformed values (not the raw ones) for a facet search field', () => { const document = projectDocument( { '@id': 'https://ex/d/4', [`${DR}format`]: [`${IANA}text/turtle`] }, { + name: 'Dataset', type: DATASET, fields: [ { name: 'format', path: `${DR}format`, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, transform: (value) => value.replace(IANA, ''), }, ], @@ -174,7 +217,7 @@ describe('projectDocument', () => { '@id': 'https://ex/d/2', [dcterms.title.value]: { '@language': 'nl', '@value': 'Solo' }, }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); expect(document.id).toBe('https://ex/d/2'); expect(document.title_search_nl).toBe('solo'); @@ -185,7 +228,7 @@ describe('projectDocument', () => { it('omits the sort field when there is no value to sort on', () => { const document = projectDocument( { '@id': 'https://ex/d/5' }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); expect(document.id).toBe('https://ex/d/5'); expect(document.title_sort_nl).toBeUndefined(); @@ -197,7 +240,7 @@ describe('projectDocument', () => { '@id': 'https://ex/d/6', [dcterms.title.value]: { '@language': 'fr', '@value': 'Bonjour' }, }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); // locales is ['nl', 'en'], so the French title is invisible — no display, // search or sort field is emitted for it. 
@@ -213,7 +256,7 @@ describe('projectDocument', () => { '@id': 'https://ex/d/7', [dcterms.title.value]: { '@value': 'Naamloos' }, }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); expect(document.title_nl).toBeUndefined(); expect(document.title_search_nl).toBeUndefined(); @@ -227,15 +270,17 @@ describe('projectDocument', () => { [dcterms.title.value]: { '@language': 'nl', '@value': 'Verhalen' }, }, { + name: 'Dataset', type: DATASET, fields: [ { name: 'title', path: dcterms.title.value, - // search only — display and sort not opted into. - type: 'langText', + // search only — display (output) and sort not opted into. + kind: 'text', + localized: true, locales: ['nl', 'en'], - search: true, + searchable: { weight: 1 }, }, ], }, @@ -255,18 +300,51 @@ describe('projectDocument', () => { { '@language': 'nl', '@value': 'Ondertitel' }, ], }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ); // Display takes the first value; search folds them all so both are matchable. expect(document.title_nl).toBe('Titel'); expect(document.title_search_nl).toBe('titel ondertitel'); }); + it('skips a field with no path, leaving it to a derivation (derived field)', () => { + const document = projectDocument( + { + '@id': 'https://ex/d/11', + [dcterms.title.value]: { '@language': 'nl', '@value': 'Titel' }, + }, + { + name: 'Dataset', + type: DATASET, + fields: [ + { + name: 'title', + path: dcterms.title.value, + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + }, + // No `path`: a derived field — its value comes from a derivation, + // never from projection. 
+ { name: 'status', kind: 'keyword', facetable: true }, + ], + derivations: [ + (derived) => { + derived.status = 'valid'; + }, + ], + }, + ); + expect(document.title_nl).toBe('Titel'); + expect(document.status).toBe('valid'); + }); + it('throws when the framed node has no @id', () => { expect(() => projectDocument( { [dcterms.title.value]: { '@value': 'No id' } }, - { type: DATASET, fields }, + { name: 'Dataset', type: DATASET, fields }, ), ).toThrow(/without an @id/); }); @@ -279,12 +357,14 @@ describe('projectDocument', () => { [dcterms.title.value]: { '@language': 'nl', '@value': 'Titel' }, }, { + name: 'Dataset', type: DATASET, fields: [ { name: 'title', path: dcterms.title.value, - type: 'langText', + kind: 'text', + localized: true, locales: [], }, ], @@ -295,7 +375,7 @@ describe('projectDocument', () => { }); describe('projectGraph', () => { - it('frames each projection’s type and projects matching nodes', async () => { + it('frames each root type in the schema and projects matching nodes', async () => { const quads = new Parser({ format: 'N-Triples' }).parse(` <${rdf.type.value}> <${DATASET}> . <${dcterms.title.value}> "Titel"@nl . 
@@ -306,9 +386,10 @@ describe('projectGraph', () => { `); const documents: SearchDocument[] = []; - for await (const document of projectGraph(quads, [ - { type: DATASET, fields }, - ])) { + for await (const document of projectGraph( + quads, + searchSchema({ name: 'Dataset', type: DATASET, fields }), + )) { documents.push(document); } diff --git a/packages/search/test/query.test.ts b/packages/search/test/query.test.ts new file mode 100644 index 00000000..d407c046 --- /dev/null +++ b/packages/search/test/query.test.ts @@ -0,0 +1,126 @@ +import { describe, expect, it } from 'vitest'; +import { + assertValidQuery, + filterOperatorFor, + pageForOffset, + validateQuery, + type SearchQuery, +} from '../src/query.js'; +import type { SearchType } from '../src/schema.js'; + +describe('filterOperatorFor', () => { + it('maps each field kind to its `where` operator', () => { + expect(filterOperatorFor('text')).toBeUndefined(); + expect(filterOperatorFor('keyword')).toBe('in'); + expect(filterOperatorFor('reference')).toBe('in'); + expect(filterOperatorFor('integer')).toBe('range'); + expect(filterOperatorFor('number')).toBe('range'); + expect(filterOperatorFor('date')).toBe('range'); + expect(filterOperatorFor('boolean')).toBe('is'); + }); +}); + +describe('validateQuery', () => { + const searchType: SearchType = { + name: 'Dataset', + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { name: 'size', kind: 'integer', filterable: true }, + { name: 'license', kind: 'keyword' }, // declared, but no roles opted into + { name: 'statusRank', kind: 'integer', sortable: true }, + ], + }; + const base: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + it('accepts a structurally valid query', () => { + expect( + validateQuery( + { + ...base, + where: [ + { field: 'status', in: ['valid'] }, + { field: 'size', range: { min: 1 } }, + ], + facets: 
['status'], + orderBy: [ + { field: 'relevance', direction: 'desc' }, + // Declared but not `sortable`: allowed — `sortable` means publicly + // selectable, and deployment policy may sort on a private tie-break. + { field: 'statusRank', direction: 'asc' }, + ], + }, + searchType, + ), + ).toEqual([]); + }); + + it('accepts vacuous clauses: they are no-ops, not structural issues', () => { + expect( + validateQuery( + { + ...base, + where: [ + { field: 'status', in: [] }, + { field: 'size', range: {} }, + ], + }, + searchType, + ), + ).toEqual([]); + }); + + it('flags every structurally invalid part', () => { + const issues = validateQuery( + { + ...base, + where: [ + { field: 'nonexistent', in: ['x'] }, + { field: 'license', in: ['MIT'] }, + { field: 'status', range: { min: 1 } }, + ], + facets: ['nonexistent', 'size'], + orderBy: [{ field: 'nonexistent', direction: 'asc' }], + }, + searchType, + ); + expect(issues).toEqual([ + { part: 'where', field: 'nonexistent', reason: 'unknown-field' }, + { part: 'where', field: 'license', reason: 'not-filterable' }, + { part: 'where', field: 'status', reason: 'operator-mismatch' }, + { part: 'facets', field: 'nonexistent', reason: 'unknown-field' }, + { part: 'facets', field: 'size', reason: 'not-facetable' }, + { part: 'orderBy', field: 'nonexistent', reason: 'unknown-field' }, + ]); + }); + + it('assertValidQuery names the type and every issue', () => { + expect(() => + assertValidQuery( + { ...base, where: [{ field: 'nonexistent', in: ['x'] }] }, + searchType, + ), + ).toThrow( + 'Invalid search query for “Dataset”: where: “nonexistent” (unknown-field).', + ); + expect(() => assertValidQuery(base, searchType)).not.toThrow(); + }); +}); + +describe('pageForOffset', () => { + it('maps an offset to its 1-based page', () => { + expect(pageForOffset(0, 20)).toBe(1); + expect(pageForOffset(40, 20)).toBe(3); + }); + + it('pins a facet-only query (limit 0) to page 1 instead of dividing by zero', () => { + expect(pageForOffset(0, 
0)).toBe(1); + }); +}); diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts new file mode 100644 index 00000000..8df30b30 --- /dev/null +++ b/packages/search/test/schema.test.ts @@ -0,0 +1,235 @@ +import { describe, expect, it } from 'vitest'; +import { + facetableFields, + fieldNamed, + filterableFields, + isoToUnixSeconds, + isRangeFacet, + outputFields, + physicalFields, + referenceFields, + searchableFields, + sortableFields, + unixSecondsToIso, + type SearchField, + type SearchType, +} from '../src/schema.js'; + +const DATASET = 'http://www.w3.org/ns/dcat#Dataset'; + +const schema: SearchType = { + name: 'Dataset', + type: DATASET, + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'description', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 2 }, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + }, + { + name: 'datePosted', + kind: 'date', + output: true, + filterable: true, + sortable: true, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +describe('physicalFields', () => { + it('fans a localized text field out into per-locale display, search and sort keys', () => { + const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }; + + expect(physicalFields(title)).toEqual({ + display: ['title_nl', 'title_en'], + search: ['title_search_nl', 'title_search_en'], + sort: ['title_sort_nl', 'title_sort_en'], + }); + }); + + it('gives a searchable keyword facet one value field and one folded search 
field', () => { + const keyword: SearchField = { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }; + + expect(physicalFields(keyword)).toEqual({ + display: [], + search: ['keyword_search'], + sort: [], + }); + }); + + it('emits only the search keys for a search-only localized field (no display, no sort)', () => { + const creator: SearchField = { + name: 'creator', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + searchable: { weight: 2 }, + }; + + expect(physicalFields(creator)).toEqual({ + display: [], + search: ['creator_search_nl', 'creator_search_en'], + sort: [], + }); + }); + + it('emits no per-locale fields when a localized field declares no locales', () => { + const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + output: true, + searchable: { weight: 5 }, + sortable: true, + }; + + expect(physicalFields(title)).toEqual({ + display: [], + search: [], + sort: [], + }); + }); + + it('fans a non-localized reference field out into no companion fields', () => { + const publisher: SearchField = { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }; + + expect(physicalFields(publisher)).toEqual({ + display: [], + search: [], + sort: [], + }); + }); +}); + +describe('schema selectors', () => { + it('orders searchable fields by descending weight', () => { + expect(searchableFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'description', + 'keyword', + ]); + }); + + it('selects facetable, filterable, sortable and output fields by capability', () => { + expect(facetableFields(schema).map((field) => field.name)).toEqual([ + 'keyword', + 'format', + 'status', + ]); + expect(filterableFields(schema).map((field) => field.name)).toEqual([ + 'keyword', + 'format', + 'datePosted', + 'status', + ]); + 
expect(sortableFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'datePosted', + ]); + expect(outputFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'description', + 'datePosted', + 'status', + ]); + }); + + it('selects reference fields and looks a field up by name', () => { + const publisher: SearchField = { + name: 'publisher', + kind: 'reference', + facetable: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }; + const withReference: SearchType = { + name: 'Dataset', + type: DATASET, + fields: [...schema.fields, publisher], + }; + expect(referenceFields(withReference)).toEqual([publisher]); + expect(fieldNamed(withReference, 'publisher')).toBe(publisher); + expect(fieldNamed(withReference, 'nonexistent')).toBeUndefined(); + }); +}); + +describe('isRangeFacet', () => { + it('requires a non-empty facetRanges declaration', () => { + const size: SearchField = { + name: 'size', + kind: 'integer', + facetable: true, + facetRanges: [{ key: '0', min: 1, max: 10 }], + }; + expect(isRangeFacet(size)).toBe(true); + expect(isRangeFacet({ ...size, facetRanges: [] })).toBe(false); + expect(isRangeFacet({ ...size, facetRanges: undefined })).toBe(false); + }); +}); + +describe('date storage codec', () => { + it('round-trips ISO 8601 through the stored Unix seconds', () => { + const seconds = isoToUnixSeconds('2024-01-01T00:00:00.000Z'); + expect(seconds).toBe(Date.parse('2024-01-01T00:00:00.000Z') / 1000); + expect(unixSecondsToIso(seconds ?? 
0)).toBe('2024-01-01T00:00:00.000Z'); + }); + + it('returns undefined for an unparseable date', () => { + expect(isoToUnixSeconds('not-a-date')).toBeUndefined(); + }); +}); diff --git a/packages/search/vite.config.ts b/packages/search/vite.config.ts index 6a8321a2..30a36186 100644 --- a/packages/search/vite.config.ts +++ b/packages/search/vite.config.ts @@ -11,9 +11,9 @@ export default mergeConfig( coverage: { thresholds: { functions: 100, - lines: 97.3, - branches: 88.76, - statements: 97.3, + lines: 98.22, + branches: 92.25, + statements: 98.29, }, }, }, diff --git a/tsconfig.json b/tsconfig.json index 0b6d2b2c..0defc069 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -76,6 +76,9 @@ }, { "path": "./packages/search" + }, + { + "path": "./packages/search-api-graphql" } ] }