From 8df18f7d0a779d8c73d84917e4dd5931152c2750 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 18:26:48 -0700 Subject: [PATCH 01/15] Add trigram extraction utility --- src/trigrams.js | 38 ++++++++++++++++++++++++++++++++++++ test/trigrams.test.js | 45 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 src/trigrams.js create mode 100644 test/trigrams.test.js diff --git a/src/trigrams.js b/src/trigrams.js new file mode 100644 index 0000000..6f3fb89 --- /dev/null +++ b/src/trigrams.js @@ -0,0 +1,38 @@ +/** + * Trigram extraction for grep-style substring matching. + * + * Text is lowercased and split on non-alphanumeric boundaries, then every + * 3-character window of each alphanumeric run is emitted as a trigram. + */ + +const TRIGRAM_LEN = 3 + +/** + * Extract the set of distinct trigrams in a string. + * + * @param {string} text + * @returns {Set} + */ +export function extractTrigrams(text) { + /** @type {Set} */ + const out = new Set() + if (typeof text !== 'string' || text.length < TRIGRAM_LEN) return out + const lower = text.toLowerCase() + for (const run of lower.split(/[^a-z0-9]+/g)) { + for (let i = 0; i + TRIGRAM_LEN <= run.length; i += 1) { + out.add(run.slice(i, i + TRIGRAM_LEN)) + } + } + return out +} + +/** + * Extract trigrams needed to satisfy a query as a substring search. + * Whitespace-separated words must each appear; their trigrams are unioned. 
+ * + * @param {string} query + * @returns {string[]} + */ +export function queryTrigrams(query) { + return Array.from(extractTrigrams(query)) +} diff --git a/test/trigrams.test.js b/test/trigrams.test.js new file mode 100644 index 0000000..4a0d3b3 --- /dev/null +++ b/test/trigrams.test.js @@ -0,0 +1,45 @@ +import { describe, expect, it } from 'vitest' +import { extractTrigrams, queryTrigrams } from '../src/trigrams.js' + +describe('extractTrigrams', () => { + it('returns empty set for short strings', () => { + expect(extractTrigrams('')).toEqual(new Set()) + expect(extractTrigrams('ab')).toEqual(new Set()) + }) + + it('extracts overlapping trigrams', () => { + expect(extractTrigrams('rhythm')).toEqual(new Set(['rhy', 'hyt', 'yth', 'thm'])) + }) + + it('lowercases', () => { + expect(extractTrigrams('Foo')).toEqual(new Set(['foo'])) + }) + + it('splits on non-alphanumeric', () => { + expect(extractTrigrams('foo bar')).toEqual(new Set(['foo', 'bar'])) + expect(extractTrigrams('a.b.cat')).toEqual(new Set(['cat'])) + }) + + it('dedupes trigrams', () => { + expect(extractTrigrams('ababab')).toEqual(new Set(['aba', 'bab'])) + }) + + it('handles alphanumeric runs', () => { + expect(extractTrigrams('abc123')).toEqual(new Set(['abc', 'bc1', 'c12', '123'])) + }) +}) + +describe('queryTrigrams', () => { + it('returns empty array for short queries', () => { + expect(queryTrigrams('ab')).toEqual([]) + }) + + it('returns trigrams from a query', () => { + expect(new Set(queryTrigrams('rhyt'))).toEqual(new Set(['rhy', 'hyt'])) + }) + + it('unions trigrams across words', () => { + expect(new Set(queryTrigrams('foo bar'))) + .toEqual(new Set(['foo', 'bar'])) + }) +}) From 4f2f75be2c842718394b17fc8b828fddd03e934b Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 18:29:07 -0700 Subject: [PATCH 02/15] Switch index to trigrams --- package.json | 2 +- src/constants.js | 2 +- src/createIndex.js | 88 +++++++++++-------------------- src/types.d.ts | 15 ++---- 
test/createIndex.test.js | 6 +-- test/files/alpha.index.parquet | Bin 1037 -> 708 bytes test/files/alpha.parquet | Bin 2854 -> 2858 bytes test/files/dataset.index.parquet | Bin 6525 -> 3266 bytes 8 files changed, 38 insertions(+), 75 deletions(-) diff --git a/package.json b/package.json index f28d9b2..031293f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "hypgrep", - "version": "0.1.1", + "version": "1.0.0", "author": "Hyperparam", "homepage": "https://hyperparam.app", "license": "MIT", diff --git a/src/constants.js b/src/constants.js index c5c81b2..a3d481e 100644 --- a/src/constants.js +++ b/src/constants.js @@ -1,5 +1,5 @@ // Version of the parquet index format -export const hypGrepVersion = 0 +export const hypGrepVersion = 1 // Number of rows per virtual block export const defaultBlockSize = 500 diff --git a/src/createIndex.js b/src/createIndex.js index 0ed58d0..74bfb0b 100644 --- a/src/createIndex.js +++ b/src/createIndex.js @@ -1,16 +1,16 @@ import { parquetMetadataAsync, parquetReadObjects } from 'hyparquet' import { parquetWrite } from 'hyparquet-writer' import { defaultBlockSize, defaultIndexRowGroupSize, hypGrepVersion } from './constants.js' -import { tokenize } from './tokenize.js' +import { extractTrigrams } from './trigrams.js' import { getTextColumnsFromSchema } from './utils.js' /** - * @import { BlockStats, CreateIndexOptions, IndexRow } from './types.js' + * @import { CreateIndexOptions, IndexRow } from './types.js' * @import { ColumnSource } from 'hyparquet-writer' */ /** - * Create a full-text search index parquet next to the given parquet file. + * Create a trigram search index parquet next to the given parquet file. 
* * @param {CreateIndexOptions} options * @returns {Promise} @@ -29,9 +29,9 @@ export async function createIndex({ throw new Error('No string columns found to index') } - // Map from term -> array of entries (entries are in blockId order) - /** @type {Map} */ - const termIndex = new Map() + // Map from trigram -> blockIds (in ascending order) + /** @type {Map} */ + const postings = new Map() let blockId = 0 for (let rowStart = 0; rowStart < numRows; rowStart += blockSize) { @@ -45,33 +45,26 @@ export async function createIndex({ columns: textColumns, }) - const { termDocCount, termFreqMap } = collectBlockStats(rows, textColumns) - - // Build index entries for this block - for (const [term, docCount] of termDocCount.entries()) { - const termFreq = termFreqMap.get(term) || docCount - const entry = { term, blockId, docCount, termFreq } - const existing = termIndex.get(term) - if (existing) { - existing.push(entry) - } else { - termIndex.set(term, [entry]) - } + const blockTrigrams = collectBlockTrigrams(rows, textColumns) + for (const trigram of blockTrigrams) { + const existing = postings.get(trigram) + if (existing) existing.push(blockId) + else postings.set(trigram, [blockId]) } blockId += 1 } - // Sort by term - const sortedTerms = Array.from(termIndex.keys()).sort() - - // Flatten into sorted indexRows + // Flatten into rows sorted by trigram, then blockId (already sorted within each posting list) + const sortedTrigrams = Array.from(postings.keys()).sort() /** @type {IndexRow[]} */ const indexRows = [] - for (const term of sortedTerms) { - const entries = termIndex.get(term) - if (!entries) continue - indexRows.push(...entries) + for (const trigram of sortedTrigrams) { + const blocks = postings.get(trigram) + if (!blocks) continue + for (const id of blocks) { + indexRows.push({ trigram, blockId: id }) + } } const kvMetadata = [ @@ -93,41 +86,26 @@ export async function createIndex({ } /** - * Collect term statistics for a single logical block of rows. 
+ * Collect the set of distinct trigrams present in a block. * * @param {Record[]} rows * @param {string[]} textColumns - * @returns {BlockStats} + * @returns {Set} */ -function collectBlockStats(rows, textColumns) { - const termDocCount = new Map() - const termFreqMap = new Map() - +function collectBlockTrigrams(rows, textColumns) { + /** @type {Set} */ + const trigrams = new Set() for (const row of rows) { if (!row) continue - - const seenInRow = new Set() - for (const columnName of textColumns) { const value = row[columnName] if (typeof value !== 'string' || value.length === 0) continue - - const tokens = tokenize(value) - - for (const token of tokens) { - seenInRow.add(token) - const prevFreq = termFreqMap.get(token) || 0 - termFreqMap.set(token, prevFreq + 1) + for (const t of extractTrigrams(value)) { + trigrams.add(t) } } - - for (const token of seenInRow) { - const prevDocCount = termDocCount.get(token) || 0 - termDocCount.set(token, prevDocCount + 1) - } } - - return { termDocCount, termFreqMap } + return trigrams } /** @@ -138,25 +116,19 @@ function collectBlockStats(rows, textColumns) { */ function buildColumnData(indexRows) { const { length } = indexRows - const terms = new Array(length) + const trigrams = new Array(length) const blockIds = new Array(length) - const docCounts = new Array(length) - const termFreqs = new Array(length) for (let i = 0; i < length; i += 1) { const row = indexRows[i] - terms[i] = row.term + trigrams[i] = row.trigram blockIds[i] = row.blockId - docCounts[i] = row.docCount - termFreqs[i] = row.termFreq } return [ // Delta byte array encoding works well for sorted string columns - { name: 'term', data: terms, type: 'STRING', encoding: 'DELTA_BYTE_ARRAY' }, + { name: 'trigram', data: trigrams, type: 'STRING', encoding: 'DELTA_BYTE_ARRAY' }, // Delta binary packed works well for incrementing integers { name: 'blockId', data: blockIds, type: 'INT32', encoding: 'DELTA_BINARY_PACKED' }, - { name: 'docCount', data: docCounts, type: 
'INT32' }, - { name: 'termFreq', data: termFreqs, type: 'INT32' }, ] } diff --git a/src/types.d.ts b/src/types.d.ts index 7146f1d..4c2af45 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -39,13 +39,11 @@ export interface ParquetSearchOptions { } /** - * Represents a single entry in the search index. + * Represents a single entry in the trigram index. */ export interface IndexRow { - term: string // normalized search term - blockId: number // logical block ID this term appears in - docCount: number // number of documents in the block containing this term - termFreq: number // total frequency of the term in the block + trigram: string // 3-character substring + blockId: number // logical block ID this trigram appears in } export interface QueryResult { @@ -83,10 +81,3 @@ export interface HypGrepMetadata { sourceByteLength: number // byte length of the source parquet file } -/** - * Statistics collected for a single logical block during index creation. - */ -export interface BlockStats { - termDocCount: Map // number of documents containing each term - termFreqMap: Map // total frequency of each term in the block -} diff --git a/test/createIndex.test.js b/test/createIndex.test.js index b046324..5ef936d 100644 --- a/test/createIndex.test.js +++ b/test/createIndex.test.js @@ -27,13 +27,13 @@ describe('createIndex', () => { expect(existsSync(TEST_INDEX)).toBe(true) const indexBuffer = await asyncBufferFromFile(TEST_INDEX) - expect(indexBuffer.byteLength).toBe(3998) + expect(indexBuffer.byteLength).toBe(2308) const indexMetadata = await parquetMetadataAsync(indexBuffer) expect(indexMetadata.row_groups.length).toBe(7) - expect(indexMetadata.num_rows).toBe(653n) + expect(indexMetadata.num_rows).toBe(676n) expect(indexMetadata.key_value_metadata?.length).toBe(5) const kv = indexMetadata.key_value_metadata - expect(kv?.[0]).toEqual({ key: 'hypgrep.version', value: '0' }) + expect(kv?.[0]).toEqual({ key: 'hypgrep.version', value: '1' }) expect(kv?.[1]).toEqual({ key: 
'hypgrep.block_size', value: '200' }) expect(kv?.[2]).toEqual({ key: 'hypgrep.text_columns', value: 'id' }) expect(kv?.[3]).toEqual({ key: 'hypgrep.source_rows', value: '676' }) diff --git a/test/files/alpha.index.parquet b/test/files/alpha.index.parquet index 29560aa7a3c1cf8dd37a6e9297f6024db0a135d6..424910b0b8ba9c5498ffa83b9c6f36c01c099a21 100644 GIT binary patch literal 708 zcmZ9KJ#W)c6o$`ruAA6OLtA>IYY1X;)S-&fNDUdH3N~6KR3TazDn*XtB+i!;f8@gx z6A}wsS0)DN?_kK7ft?jG@e?@5AwlBGdamw0_kGXND_=f)bs~1ew|()Q--x9yfRMv0 z09fkmN2dIa@y{wS1=pJ^#=v4~=1N?v>B=ssX5|1bXn%PIrQ_{cGvgPq#}o#zf0j+7 zm%8FRDYoKY?$F{BTHO8my0)<3T>Z&!U#scFsQnnU2Z>O0#1Fz~I-4)zX8R$4O=uAuLYvSbbO|n@N9Yp@CakVgy0hb zLP&@RQ^JfeCoBjt0`s~S^?w2C+W?Oa*ZEyAs9_ozjAF9{H|o#1?|Gr%AhM<+)1mKn zd~0X|2!+ngfoJ#6TaaJsM!`5PW(p!}h@7!~DHJT_->di)GxaJQrK~uXDa;9s0uGTA z3breLR0~2egpR_$mTO@z-@_W|I1UGrC$l9=JyGU=tJnaH8+ot07y(2nTHMA8$`oVAQiU-^8Th2c#&^A4d0suGF5$21!??O hYA!-&;EXz9SMo~jv<6bc7nVPXoDiPk$d~y4{sFnSm{tG) literal 1037 zcmbVM&ubGw6n?XvO?TtQCaIIoY#O~JK`E59)L5m5iXd$$GzL>V6l#8KezZw8*=*Wo z4^q&A2R(YJJqe!mU=Mg)tNf zJ+5EZeflObLYetE1Og%-Na*PphkR&`WB>?JzXMfM@R;0y^?}|Ssh3lyArZbSH5CaIA# zB<-p*xkZ3I$!$p2B+*P3sMjJw?8eT(R!#Bt)+400Ri4?9-8z!zNY|C>&%rI406pvr z_T4YAf6s*2CLEAx?lZt4F}HpI^d;RxhqHLbT9?D|kB74oAI=6xg*Qrrplb%K0@1H% ztC)(ADf}Es6rT%YlxfV^V2T_tHDn(33>Lg>5F9s{(BM7!p9-AD4e>M z(@bD11%t_A!!GTumxqy7wo4D}Udz1*CG(+Ubs%!$7qem`sm(G&u&ZJy!W7I$zWVS@ zYAb2Dsdye&3I#kz^4vGe6v6yr++xT+C6QBtsH9ULGC4&jE%|eDP!be;ypLf`IZ{r( z)Fdv8_)HI>T;!C20lH4x^~YgSv!t>(B%{9J89tJFR0je ft%lXAx;0K0mv1it*O5_hIz7vUk9fQ+{s{g6ZeGUZ diff --git a/test/files/alpha.parquet b/test/files/alpha.parquet index fa0a5f12bef735668e9f06728416373865005c9b..d9dceb3a8d5cba8574963b86fdf795f4749b84f6 100644 GIT binary patch literal 2858 zcmciEJx{_w7{KwfEeH+k5t?*x5JHHHi<66+Zp7K8LV1<8l=9Nso&7e z07Hy0#spJ5VTK4X63mfefecHmu*L>COljZPis;di(5E9Iq$6QKN5YVfgb^JHV>%Kh zbRDBO#|F!O%IPGn4-uNsZBtcX8{aahz#O=~~ljSmSFWm1^68F%6r8t@3E literal 2854 zcmciEKWoB37zXf5xQGpoJux5@2Sr+!LUAe-w~o@OlrFI`#`q^O#%OLR{UG@q{S28p 
zm(KkFojUi4Z9%LOi{>D|B=_E%dkBOi?bgjX%dzJx_Hug9VwI8aU5*h#VpV!QIhJvz z>q?Gv-~kNif(bpapbs_-UD$lZPgyD|NnyBoQp+Y(Xun(%3fn?|L6WBG|}TLv0f`vR2%C zJV1~LL*$4sLXHSyC8Fa_>lUxbC{qsB&lq($o-7wZB217Yf`=Rd=v1$~;iDKG5}gLh z2uS>YK?}j(%gLg zYTNRhqc0;De3ATL@Nl{3jB1T*>N$?jWWownxg__EDj~&K)yi7AmM^n{#)y`bnT%@j m=?q&ff7v8;ne&pY>sT(S#ww{FS)r?YlUdg%cPO{$D9W?d!YO_Stv$ zI(PTp%B(WZqBO#-2b z$1NKYiO;g1_xboezvtu5{i%V?ku{O`zZ|)i+#k7iXM}6M-3lU(@c4rU|!AU43j*(X6dHU@$$LT~j5ZTkS#lWX>5rN|(9wJXv#2w0!2 z?#&)TKM{8B_W=z9$3_9((V0H$il_bz*n$6bYYx5+!M%CN!0!XWyr<4h-q8gpGcB9@ z-fm(c1DnX>UHwh1`K2X~e`~7;In7(Z`_dx7M!($&l_Wgza4|Mh+yIq`CC+)miviC|{GxBkTJR?4EMU!d3gVH0+2owi9bN?C zJbE=)n)4$R9B6$U8~BntmhC)_j7Cb z#ns17*FD#ps;@MzG(4|fgXy77E8g9+gyt;6Dw}0qtalMckipl$8-77EFYg3PUE3@dHhiZ1cIzyhD?d^p4 zF&PbX_!};u1@P;J`fFZoWfr`xj6)&Xo)|ww*1y(^5Or8V>=t!ow z%Uz!J&NY^SoKEC~yDy(44QPTI$m-~)!Ig$Twc2{_Wk}K$wsJdoZFQtL4z#uHxNxTU z+|IsB6XfdRSL(OG+Q7M-LJdb0Nl`9S-DHkLWQ8(|(0pDu3z*2NMs-UX1_5&_q)02eIm~pN11W#cs;Gg$FfGRbQ;1RlH4{TXm70u+ zl5Seyh1#+)B0I#eb6}`|fdkM8mLOWO8QpXM*syMvh=K`qsG@?9S+)+g%s@OR^AB|B zuGkE#XgpIyPy_qCtY{Fj2r-mqDou|{%L_eaVl*8w+6>q7N(Zy_CJe_!_k}5VW@xB~ zIiiFikO{O9B!<9J<@2p}UMUb+xPwpiKwb)w9TjU>rf4>7GJC}3cZCg+VG-?0TG+<2 zqY+@JEIhT*WL6F{5ev@=hm#Tp(jXF%@@16oGR!dH^b%EsNBfyW3kubtj)a<|s4zt{3TRiK ztm!6^fJxv~F=Y(pwfHSP08G{}F({mv!gqV9Au~X#Iu&5P`%=)BaN=U>L=cLM7*~jQ z_B7d7WkZ8>B$C*@6)FiGWD>(+nj@hzT{2cQGnXR@6NQ#-5Kn^I$r@!ul=21s!GM() zNvbloEFbT?NO3tyD9pBqt8FLL0%H2_J4Xh*@Nc zYB+|5a|LvF7=|fFQ;&yqXj$PdPDM}izYOFH-JyUrlKOcMD-akh2Cap~Jipj%U$?}{ zh5{{2geBp7QRpPQLiN$FcjRG0EAY(}=34e0bQlaT%)ABNPHl?CvH^oiN=o{QRMt&k z)NqJo=1Ja!*-&)m$f9N(z8Z#;vJ$@>YId&hiJIZ^-@kx!K`NC~Iz)p!WJUN>h&jXT z@JH*ra6e}s{y0o4!-hHf#Ks~NiUEIP_Y0o}Z7hwBcimVME}<1H#;*%@=KR=eAqvri z_NaMtHK3EHc7^QvYC)XC;vfO;$O zXLkZ%BAY~KSW>BO18dVCL}C8qlA;Ue111wbb@jQ`3SHLE5cM$!y#3x zXn%V+slcYC44AK&ws0{u)H0SH+zkcRVC6%hn>PriTCxQo`*ITa}pQ5S)YYfWQYi`JK(srV4>2N;OzV^7Q+wUoEI&Q6?hXm{fzB6%1YNhzgZ- zSopf4sJV)UO{%d<(bQ!_HqD}4)Lrait+~4@25b=x9-M0^((%7X>9wCfO82$ie31SW 
zaV;nkXy4Ww&?z+|0WK1X`=Va}rv)9WagK}ln&070C6+T1*LLmB_KoeyO~H+kPfRXfwMW*1;eTD;H0*Ed zhqI<#0MxkXjrW_DYj0S#z1Ix^xc2?g+lo%5pp(kI7SRnEf}qV@bi-%D8(t3>vP+`= z`+Iu0XzOPuCK)jgh*V+FL(xF<8yCI(vqOV2T_gu|Y8z4imZn|%{~hHVlPE+hn8j$| Y;RhcAjTX?r(5b}ab|8-HhhH!M0oOy>`Tzg` literal 6525 zcmds6eRNdioxU&3z)dn4Wgz$Bw1^nusHf=#s>sm$XwrneO)m5=ePi?#1t?fP+sy%1- z?EcgLHiw(snfreHKAz|K{q9RkV{1d%gt7~(%LbRPExRzOObUKQ@s%V=7bcNkUH9R< zarcqXuO~`np>Q~K_{cMdUigpao;m!|frk&i`0RH^kN2MGY#wbt+p#5iVrOb=?TKki z56s-RGMhPke6XW=#;I%XeXrs~>6(KtJ@dk|2Y>n^<{WzF&_myu6vB@_ys3YhztP{- zwRSMScF^zKwef&9II`W}xQUN$+OcnNwLdtxk9X|aoax$lx^Pq1$gV+uWaqA3{cAUF z?0<4!QLu=bezG;4Q?$0?^zFSfa%^#0E83-8b-R{M=h{54EuGJ-XkWehs$CDnbCIjjx6fm zS=Y9)uRWu6J?Zr4ruKEe_Q=w{sd=rxt8aBX?`qFvMrL&7D@XF~iC%4T7!j_Ud8}n& z=jiKqwq7@L`_9&~C$^QY8C`$S8a;4u^V1K#@br_L-{04H^qXz1nd1-KJ8$!2!w)ZP z9zNEZx^DDXdB>8&d$-TuG<@vUrJaYjpBab;R=v4*{qFs{v)x}E*!}!Q+&w+AdvO22 zo~QQj+r4S@jlMnWM+f(e>=_*B{zCWDxEuKU?4CW{1IDKA(al|7-9NBqu?!CAD=_41G{;v85TMy1_TQq&eoTfRgD;}&`q<&UeID6)E2ZoP7 z1o+_RzQ%c%{#aNB;Dno#``0cp55N9$r!n+lNp^q34_mU?nkUZ=y)*Z%hnN2N)?Yr_ zd2-INk_Vpp{X53{@2x-b>_^2HpLym3f6M5Jb4$K8JOAdCqD5m{JEGtC_FyL#M5O*< z)y{0;(Xx2=p_jG}b{w3!c*;V3c-NHH7f1Wf&X&Ki_-cOS(5W+P4%Y2Utj{#$&)wBL z+<$gi=hhwPQm1B?ens%%RbvZ(G<>{d&GkpG{_?U<1&bPU_{+=eqsKc|6i++5wmUoO ze|_u5ny$M0-aowK-p=}&Hx($x^!`_!#U*FUo%fHQS=%{l=IiHL*UcG;&3Sju$x`Lc zvA4Dg{pgAI6)PHsADum!9Jtidoa}?)hc85z)gC>WxM9ogz3Ou9`1X|@&o_J+Qcmsu zm+kY8w0!#vA4$IT(pM%t)?dEtz3TSCtKNSsUD{fA|H(b$BS&|%EaQ*mN>>y%oOz^m zGFf%0<&g!)C#;z`Rtn+gh#}iErC_OTpA?()wV0ZsCom3s#<+viYghEA;AQ zdf>+1*>7(D>CxW~jeL7n??7hHptj-K&c8b_Z}-FBIyQX5J3Y9qxA036C@eQs!E-K8V>;Cx*HK`AlC_!QIv51!+>Fj#W>}6L?93HcPz4sk zEQJXRYh>U}w`Z6+%5|F3S&F7ox(?r&Lv<|XGuY{23s3i1+7e8oW`^dtC3Mbm2x{JB zSOx3)dB#jQsd8P1k(SDHa2#&wp21B9+pwrBR5zni-%WPh#vDhdDl=7t+%ha9MQa8w zr5T)d9LF;i!6*z(YZ;fBmWMx{lR*}&tjBd%*YMaw6cpcabr&H_``vIDJ1Ou(vFr?N z@El$7^vglh4cp~83PTO1^X_oz*HcIc+!EZ+VF`pu*f5=n3{wTwVZ6!kSvOr$frv6s z@C?;;W@@I26`0Um8*^Ym@+A506D%!+&vjL|1orlI-O_`!I?NOnQj83V%cd~b&+&Nw 
zr4F4s4p#sL%kfl?SrkUVdW4@Lzb3;{$Rl@xSPIyQBpFQQ)C5p%9f*X*U6un2IKsyr z24oAEW;(R7p(U*&*`5;s3Q<5%01h55ta30R2cU|YXr<%2ScaR8yi|t)o9dh}huO$D1FYPFpVxCez?w@93NX=E>Zxm6K%N6ur)=`GdOWka z%iy}}@`5u7YzXeU`*)Wr%NjX z3M$YMU`8PPAP*vmWcUHs02Y%%5IkfsZ+0W`FbIjK_{h_)Wdmb*6EHHhfK&*XpYp&J zvT}CDvK-);87hZmWJ@T=l1m!tZJ>;=A!Nup2LZrP`ku^GmKzqD09-!cHDWFh1YaQzNMVlJWPc0@W#;g#W?&%O1$_bC zp<(b6F(+mfQ#@BF04P~4Ljp)+!lr-^6XcKK>W#XkVru~mrKtkKacCS^sQAKw45`p& zm5!5vr=e{jU6A&slhuecBAQ}C2@kxXt7(XwZkPzOV1Rumk^;(u3^_~WEnM(S)3dsj z$5zla*MM$@mx1I85C|~BCG47$D) z88_Yh*k>*+&Id)n1KJ=lX$%tv7!Rd~K0p<^M9Kg#0BP!Wp&Y!`mJ9K2s@;lkd0;H!tZKe0toZjTfgNy3*T<$MsA~ke(o^2^g$Q&` z8GA?)$!|tL$rLFLp+?NEg-`+0!pd={Mmj1(kO9R{*xmE_x$)6&qqJka`rPHLW z9`4ap8q(sTSSAxO6h~(E)X#+hK|$>Fgb&-Do>yg*vyw_VB)sH-QkQ`YT4QGf;1Inn zA=4r8keeHNJOYVR`3A{pTvO{pcTnoHo?)Mi4Q7#aj`{()XUvCHu99JIM02Xv?~bC68fn1j5h8Te*ujaxz*Prh5K zS=n_S9scK1R3?=xL%)_aFIBTt2Goo9$1(=$SonsXkZf{e=<%NUk~u~~os2wPMfI#n z(E)fvRUz9Y`)XO1xljz?(4qeZh4qHMxByK6sykItOra-mqbFJ$Fo-LNSB{PqEe1## zHG{gGv7qd*PfxC|M2%I@X^8WsHHYp6mT?<^0YE{$_fcIk7@_vQ5Q8ti7|x!5pw#3v zplkg~3zbV_Xe3Z4(M3=TJDa?5{@AA8hiXx4vch!MdCV=DAO{5ZO!BKbKqd$6ZC4=me(WH#M8{sx} z8?P@lXd0#b{L7NlTUqM*HiWyqAr1le=OVV2ZaX~r zvz}koB2@ubp?R6cUsMSS*<}j1$BJY;Ayu+X#aK_Z$U*cOsNob0_MDGkg36$&~0GnOu}J+#dcYWV+|SkGOMFWQWQVR?dXGPV(Gr zG{y>=Y}m)qfkx9Z!W+H5n9e8_ifx%r@;4Ie`E7{m#TB>m?hKM`pg&8YAyhKa*NSAs zdgzf#00x=2jFK&dRSuN@rCA{OOzr6p z@$?w47dD!17y0CxsSo|6$^iLH4XwnCr#57LoiW?WqyAiO8ZB&%BINpM7yYYYl>DKn z=VG&l6@pg;Q~+DpP##9JBpbu7RyNvH6f#-#wjxwmm3GxrC=*msL%DHT^ldqW25dr= zZ3|}j;5eNWy)IJ8fJwX*C_`^cUi9ulnR}_^i^%+jkT6TS3PKJ$c_T`RP?k6`?Cpx^ zMOn55Pu>UTZI>3wp5ydja_{L{P!Y@dLBHO(C!!eOZ+8q5=NNQR&mZax7gZIqV(M9^ z_k)O%G5OHV$-NOcXGdp7B(Fx2dACDOhlJ?4JA&e%H~oHbhUS@pdo%!OBP5lpjt9&Fb=H26rNn}NM~gx5=PATavDq&gk75${aMnn9x- zg_1kAX#t?Gp*KfS3SKT@RWw%|>au%2Doq3F>%5`W(H})?ZOg_B4EiI5CBG^Iyy4;K zG&$ruC;!xvql)JlNIj|?!PsST!gv4Vp+?A4em?UEHIEhOQ;YcDLpByL4<7K4y8Yxa{aMhCeWKtET z#mC3vaY~}$cnD*vNJA)g6DgUEsSTLF9DhkHR!}0v!*is`#13I@LjkE?9*arAX5?nF zFcfbnC?|w`4g-dy5)zG*@SPz-!k-TlDONCF==`D 
zws2kexdJN3<0auJ(a2QNPt5or3`GE=32UY|lf@uMx6QKZk_?dpH{gY`Q8+~QkkqPBTCQLteS1{s#Oe?>5 zH$0Fk)+Q!pd>f}k_AC_g;1nf6>ZL@<$6sC!(mi-VW{L50>hYxX<0m*)aqm%ZQfQb? z5R@)gN=*LvAx;hP4tbBT@=jv>4Z*yM|Mn>5<3$D~+RY?Fx7<7zBMLBrilCX6o`rKQ I_>Y(W0&8vj&;S4c From 445aa6a245839cceb2d23f331d8e1012116bb0c6 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 18:30:14 -0700 Subject: [PATCH 03/15] Switch queryIndex to trigram intersection --- src/queryIndex.js | 118 ++++++++-------------------------------- src/types.d.ts | 13 +---- test/queryIndex.test.js | 34 +++++++----- 3 files changed, 46 insertions(+), 119 deletions(-) diff --git a/src/queryIndex.js b/src/queryIndex.js index c229a9b..ed9fdb9 100644 --- a/src/queryIndex.js +++ b/src/queryIndex.js @@ -1,126 +1,54 @@ import { parquetMetadataAsync, parquetQuery } from 'hyparquet' import { defaultBlockSize, hypGrepVersion } from './constants.js' -import { tokenize } from './tokenize.js' +import { queryTrigrams } from './trigrams.js' /** - * @import { FileMetaData, KeyValue, ParquetQueryFilter } from 'hyparquet' - * @import { BlockResult, HypGrepMetadata, QueryIndexOptions, QueryResult, TermResults } from './types.js' + * @import { KeyValue } from 'hyparquet' + * @import { BlockResult, HypGrepMetadata, QueryIndexOptions, QueryResult } from './types.js' */ /** - * Build a pushdown filter to efficiently query for terms in the index. - * Optionally uses prefix matching. - * - * @param {string[]} terms - * @param {boolean} prefix - whether to use prefix matching - * @returns {ParquetQueryFilter} - */ -function termsFilter(terms, prefix) { - if (prefix) { - const $or = terms.map(t => { - const lastChar = t.charCodeAt(t.length - 1) - const upperBound = t.slice(0, -1) + String.fromCharCode(lastChar + 1) - return { term: { $gte: t, $lt: upperBound } } - }) - return { $or } - } else { - return { term: { $in: terms } } - } -} - -/** - * Query a search index to find matching row groups from the source parquet. 
- * Returns undefined if query is empty so the search index is not used. + * Query a trigram index to find blocks that could contain the query as a substring. + * Returns undefined if the query has no extractable trigrams (e.g. shorter than 3 chars). * * @param {QueryIndexOptions} options * @returns {Promise} */ -export async function queryIndex({ query, indexFile, indexMetadata, prefix = true }) { - // Tokenize the query using the same logic as indexing - const queryTerms = tokenize(query) - if (queryTerms.length === 0) return undefined +export async function queryIndex({ query, indexFile, indexMetadata }) { + const trigrams = queryTrigrams(query) + if (trigrams.length === 0) return undefined // Read index kv metadata indexMetadata ??= await parquetMetadataAsync(indexFile) const kvMetadata = indexMetadata.key_value_metadata || [] const { blockSize, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata) - // Read index rows matching any of the query terms + // Read postings for every query trigram via pushdown filter const indexRows = await parquetQuery({ file: indexFile, metadata: indexMetadata, - // use hyparquet pushdown filtering - filter: termsFilter(queryTerms, prefix), + filter: { trigram: { $in: trigrams } }, }) - // Pre-compute corpusDocFreq by summing docCount per term - /** @type {Map} */ - const corpusDocFreq = new Map() - for (const row of indexRows) { - const prev = corpusDocFreq.get(row.term) || 0 - corpusDocFreq.set(row.term, prev + row.docCount) - } - - // Map to accumulate scores per blockId - /** @type {Map} */ - const blockScores = new Map() - // Map to accumulate term statistics per blockId - /** @type {Map} */ - const blockTerms = new Map() - - // For each query term, find matching blocks and accumulate scores - for (const queryTerm of queryTerms) { - for (const indexRow of indexRows) { - // Check if this index term matches (exact or prefix) - const matches = prefix - ? 
indexRow.term.startsWith(queryTerm) - : indexRow.term === queryTerm - if (matches) { - const currentScore = blockScores.get(indexRow.blockId) || 0 - - // Use actual index term's corpus doc freq for scoring - const termCorpusDocFreq = corpusDocFreq.get(indexRow.term) || 0 - - // BM25 scoring - // IDF component: log((N - df + 0.5) / (df + 0.5) + 1) - const idf = Math.log((sourceRows - termCorpusDocFreq + 0.5) / (termCorpusDocFreq + 0.5) + 1) - - // BM25 parameters - const k1 = 1.2 // controls term frequency saturation - const b = 0.75 // controls length normalization - - // TF component with saturation and length normalization - const tf = indexRow.termFreq - const tfComponent = tf * (k1 + 1) / (tf + k1 * (1 - b + b * indexRow.docCount / blockSize)) - - const termScore = idf * tfComponent - - blockScores.set(indexRow.blockId, currentScore + termScore) - - // Collect term statistics - if (!blockTerms.has(indexRow.blockId)) { - blockTerms.set(indexRow.blockId, {}) - } - const terms = blockTerms.get(indexRow.blockId) - if (!terms) continue - terms[indexRow.term] = { - docs: indexRow.docCount, - frequency: indexRow.termFreq, - idf, - } - } + // Count matched trigrams per block; keep blocks that hit every trigram + /** @type {Map>} */ + const hits = new Map() + for (const { trigram, blockId } of indexRows) { + let set = hits.get(blockId) + if (!set) { + set = new Set() + hits.set(blockId, set) } + set.add(trigram) } - // Convert block scores to BlockResults /** @type {BlockResult[]} */ const blocks = [] - const numRows = Number(indexMetadata.num_rows) - for (const [blockId, score] of blockScores.entries()) { + for (const [blockId, set] of hits) { + if (set.size < trigrams.length) continue const rowStart = blockId * blockSize - const rowEnd = Math.min((blockId + 1) * blockSize, numRows) - const terms = blockTerms.get(blockId) || {} - blocks.push({ blockId, rowStart, rowEnd, score, terms }) + const rowEnd = Math.min(rowStart + blockSize, sourceRows) + blocks.push({ blockId, 
rowStart, rowEnd, score: set.size }) } return { blocks, textColumns, sourceByteLength } diff --git a/src/types.d.ts b/src/types.d.ts index 4c2af45..c782c5c 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -13,7 +13,6 @@ export interface QueryIndexOptions { query: string // the search query string indexFile: AsyncBuffer // file reader for the index parquet file indexMetadata?: FileMetaData // optional index parquet metadata - prefix?: boolean // enable prefix matching (default: true) } export interface ParquetSearchOptions { @@ -53,21 +52,13 @@ export interface QueryResult { } /** - * Represents a matching block of rows from the source parquet. + * Represents a candidate block of rows from the source parquet. */ export interface BlockResult { blockId: number rowStart: number // starting row index (inclusive) in the source parquet rowEnd: number // ending row index (exclusive) in the source parquet - score: number // relevance score based on term frequency - terms: TermResults // per-term statistics -} - -export type TermResults = Record -interface TermResult { - docs: number // number of documents in the block containing this term - frequency: number // total occurrences of the term in the block - idf: number // inverse document frequency for this term + score: number // number of distinct query trigrams matched in this block } /** diff --git a/test/queryIndex.test.js b/test/queryIndex.test.js index 2d8706f..76394c4 100644 --- a/test/queryIndex.test.js +++ b/test/queryIndex.test.js @@ -3,23 +3,31 @@ import { asyncBufferFromFile } from 'hyparquet' import { queryIndex } from '../src/index.js' describe('queryIndex', () => { - it('should query the index and return results with BM25 scoring', async () => { + it('returns blocks that contain every query trigram', async () => { const indexFile = await asyncBufferFromFile('test/files/alpha.index.parquet') - const { blocks } = await queryIndex({ query: 'kk', indexFile }) + const result = await queryIndex({ query: 'akk', 
indexFile }) - expect(blocks.length).toBe(1) - expect(blocks[0].blockId).toBe(2) - expect(blocks[0].rowStart).toBe(200) - expect(blocks[0].rowEnd).toBe(300) - expect(blocks[0].score).toBeCloseTo(10.27, 2) - expect(blocks[0].terms.kk.docs).toBe(1) - expect(blocks[0].terms.kk.frequency).toBe(1) - expect(blocks[0].terms.kk.idf).toBeCloseTo(6.11, 2) + expect(result).toBeDefined() + expect(result?.blocks.length).toBe(1) + const block = result?.blocks[0] + expect(block?.blockId).toBe(0) + expect(block?.rowStart).toBe(0) + expect(block?.rowEnd).toBe(500) + expect(block?.score).toBe(1) + expect(result?.textColumns).toEqual(['id']) }) - it('should return undefined for empty query', async () => { + it('returns undefined for queries with no trigrams', async () => { const indexFile = await asyncBufferFromFile('test/files/alpha.index.parquet') - const result = await queryIndex({ query: ' ', indexFile }) - expect(result).toBeUndefined() + expect(await queryIndex({ query: '', indexFile })).toBeUndefined() + expect(await queryIndex({ query: 'ab', indexFile })).toBeUndefined() + expect(await queryIndex({ query: ' ', indexFile })).toBeUndefined() + }) + + it('intersects trigrams across multi-word queries', async () => { + const indexFile = await asyncBufferFromFile('test/files/alpha.index.parquet') + // 'akk' is in block 0, 'azz' is in block 1; together no block contains both + const result = await queryIndex({ query: 'akk azz', indexFile }) + expect(result?.blocks.length).toBe(0) }) }) From 3d4d818e64e306a1bf452b2aef0eb8c0f09b8e2f Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 18:32:16 -0700 Subject: [PATCH 04/15] Switch row filter to substring match --- src/parquetFind.js | 65 +++++++------------------- src/parquetSearch.js | 96 ++++++++++++++++---------------------- src/types.d.ts | 1 - test/parquetFind.test.js | 51 ++++++-------------- test/parquetSearch.test.js | 96 ++++++++++++++------------------------ 5 files changed, 106 insertions(+), 203 deletions(-) 
diff --git a/src/parquetFind.js b/src/parquetFind.js index 1721ca9..e225ca1 100644 --- a/src/parquetFind.js +++ b/src/parquetFind.js @@ -1,11 +1,10 @@ import { asyncBufferFromUrl, parquetMetadataAsync, parquetReadObjects } from 'hyparquet' import { queryIndex } from './queryIndex.js' -import { tokenize } from './tokenize.js' /** - * Find rows matching a query, maintaining natural row order. + * Find rows containing the query as a substring, maintaining natural row order. * - * @import {ParquetSearchOptions, TermResults} from '../src/types.js' + * @import {ParquetSearchOptions} from '../src/types.js' * @param {ParquetSearchOptions} options * @returns {AsyncGenerator, void, unknown>} */ @@ -13,7 +12,6 @@ export async function* parquetFind({ query, url, limit = Infinity, - prefix = true, signal, asyncBufferFactory = asyncBufferFromUrl, sourceFile, @@ -24,28 +22,20 @@ export async function* parquetFind({ }) { if (!query || limit <= 0) return signal?.throwIfAborted() - // Query the index to get matching blocks indexFile ??= await asyncBufferFactory({ url: `${url.replace(/\.parquet$/i, '')}.index.parquet` }) - const queryResult = await queryIndex({ query, indexFile, indexMetadata, prefix }) + const queryResult = await queryIndex({ query, indexFile, indexMetadata }) if (!queryResult) return const { blocks, textColumns, sourceByteLength } = queryResult - // If no matching blocks, return empty result if (blocks.length === 0) return - - // Sort blocks by blockId for natural row order blocks.sort((a, b) => a.blockId - b.blockId) signal?.throwIfAborted() - // Construct source file if not provided, use byteLength from index metadata if available const file = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength }) - // Get source metadata once before loop only if needed const metadata = sourceMetadata ?? 
await parquetMetadataAsync(file) - // Tokenize query terms for matching - const queryTerms = new Set(tokenize(query)) + const needles = query.toLowerCase().split(/\s+/).filter(Boolean) - // For each matching block (in natural order), read rows from the source parquet let count = 0 for (const block of blocks) { signal?.throwIfAborted() @@ -58,10 +48,9 @@ export async function* parquetFind({ useOffsetIndex: true, }) - // Yield matching rows in natural order (no sorting) for (let i = 0; i < blockRows.length; i++) { const row = blockRows[i] - if (matchesRow(row, textColumns, queryTerms, block.terms, prefix)) { + if (matchesRow(row, textColumns, needles)) { yield { __index__: block.rowStart + i, ...row } if (++count >= limit) return } @@ -70,44 +59,24 @@ export async function* parquetFind({ } /** - * Check if a row matches any of the query terms. + * Return true when every needle appears as a substring of some indexed column. * * @param {Record} row * @param {string[]} textColumns - * @param {Set} queryTerms - * @param {TermResults} termStats - * @param {boolean} prefix + * @param {string[]} needles * @returns {boolean} */ -function matchesRow(row, textColumns, queryTerms, termStats, prefix) { - const rowTokens = new Set() - - // Collect all tokens from text columns - for (const col of textColumns) { - const value = row[col] - if (typeof value === 'string') { - for (const token of tokenize(value)) { - rowTokens.add(token) - } - } - } - - // Check if any query term matches - for (const queryTerm of queryTerms) { - if (prefix) { - // Prefix matching: find row tokens that start with query term - for (const token of rowTokens) { - if (token.startsWith(queryTerm) && termStats[token]) { - return true - } - } - } else { - // Exact matching - if (rowTokens.has(queryTerm) && termStats[queryTerm]) { - return true +function matchesRow(row, textColumns, needles) { + for (const needle of needles) { + let found = false + for (const col of textColumns) { + const v = row[col] + if 
(typeof v === 'string' && v.toLowerCase().includes(needle)) { + found = true + break } } + if (!found) return false } - - return false + return true } diff --git a/src/parquetSearch.js b/src/parquetSearch.js index f6c0b36..0fc3fe2 100644 --- a/src/parquetSearch.js +++ b/src/parquetSearch.js @@ -1,11 +1,11 @@ import { asyncBufferFromUrl, parquetMetadataAsync, parquetReadObjects } from 'hyparquet' import { queryIndex } from './queryIndex.js' -import { tokenize } from './tokenize.js' /** - * Uses the hypgrep to query a source parquet file and return matching rows. + * Uses hypgrep to find rows containing the query as a substring, ranked by total + * occurrence count across indexed text columns. * - * @import {ParquetSearchOptions, TermResults} from '../src/types.js' + * @import {ParquetSearchOptions} from '../src/types.js' * @param {ParquetSearchOptions} options * @returns {AsyncGenerator, void, unknown>} */ @@ -13,7 +13,6 @@ export async function* parquetSearch({ query, url, limit = Infinity, - prefix = true, signal, asyncBufferFactory = asyncBufferFromUrl, sourceFile, @@ -24,28 +23,22 @@ export async function* parquetSearch({ }) { if (!query || limit <= 0) return signal?.throwIfAborted() - // Query the index to get matching blocks + // Query the index to get candidate blocks indexFile ??= await asyncBufferFactory({ url: `${url.replace(/\.parquet$/i, '')}.index.parquet` }) - const queryResult = await queryIndex({ query, indexFile, indexMetadata, prefix }) + const queryResult = await queryIndex({ query, indexFile, indexMetadata }) if (!queryResult) return const { blocks, textColumns, sourceByteLength } = queryResult - // Sort blocks by score descending (most relevant first) - blocks.sort((a, b) => b.score - a.score) - - // If no matching blocks, return empty result if (blocks.length === 0) return + // Sort blocks by score descending (more matched trigrams first) + blocks.sort((a, b) => b.score - a.score) signal?.throwIfAborted() - // Construct source file if not 
provided, use byteLength from index metadata if available const file = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength }) - // Get source metadata once before loop only if needed const metadata = sourceMetadata ?? await parquetMetadataAsync(file) - // Tokenize query terms for matching - const queryTerms = new Set(tokenize(query)) + const needles = needlesOf(query) - // For each matching block, read rows from the source parquet let count = 0 for (const block of blocks) { signal?.throwIfAborted() @@ -58,21 +51,18 @@ export async function* parquetSearch({ useOffsetIndex: true, }) - // Score and collect matching rows within the block /** @type {{index: number, row: Record, score: number}[]} */ const scoredRows = [] for (let i = 0; i < blockRows.length; i++) { const row = blockRows[i] - const score = scoreRow(row, textColumns, queryTerms, block.terms, prefix) + const score = scoreRow(row, textColumns, needles) if (score > 0) { scoredRows.push({ index: block.rowStart + i, row, score }) } } - // Sort by score descending within block scoredRows.sort((a, b) => b.score - a.score) - // Yield rows in score order for (const { index, row } of scoredRows) { yield { __index__: index, ...row } if (++count >= limit) return @@ -81,48 +71,42 @@ export async function* parquetSearch({ } /** - * Score a row based on which query terms it matches, weighted by IDF. + * Split the query into whitespace-separated needles for substring matching. + * + * @param {string} query + * @returns {string[]} + */ +function needlesOf(query) { + return query.toLowerCase().split(/\s+/).filter(Boolean) +} + +/** + * Score a row by counting non-overlapping occurrences of every needle across the + * indexed text columns. Returns 0 if any needle is missing. 
* * @param {Record} row * @param {string[]} textColumns - * @param {Set} queryTerms - * @param {TermResults} termStats - * @param {boolean} prefix - * @returns {number} score (0 if no match) + * @param {string[]} needles + * @returns {number} */ -function scoreRow(row, textColumns, queryTerms, termStats, prefix) { - let score = 0 - const rowTokens = new Set() - - // Collect all tokens from text columns - for (const col of textColumns) { - const value = row[col] - if (typeof value === 'string') { - for (const token of tokenize(value)) { - rowTokens.add(token) +function scoreRow(row, textColumns, needles) { + let total = 0 + for (const needle of needles) { + let count = 0 + for (const col of textColumns) { + const v = row[col] + if (typeof v !== 'string') continue + const text = v.toLowerCase() + let from = 0 + while (true) { + const i = text.indexOf(needle, from) + if (i < 0) break + count += 1 + from = i + needle.length } } + if (count === 0) return 0 + total += count } - - // Score based on matching query terms weighted by IDF - for (const queryTerm of queryTerms) { - if (prefix) { - // Prefix matching: find row tokens that start with query term - for (const token of rowTokens) { - if (token.startsWith(queryTerm)) { - // Use the matched token's stats from the index - const stats = termStats[token] - score += stats?.idf ?? 1 - } - } - } else { - // Exact matching - if (rowTokens.has(queryTerm)) { - const stats = termStats[queryTerm] - score += stats?.idf ?? 
1 - } - } - } - - return score + return total } diff --git a/src/types.d.ts b/src/types.d.ts index c782c5c..0ddcfcd 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -19,7 +19,6 @@ export interface ParquetSearchOptions { query: string // the search query string url: string // URL or file path to the source parquet file limit?: number // maximum number of matching rows to return - prefix?: boolean // enable prefix matching (default: true) // fetch options signal?: AbortSignal // optional AbortSignal to cancel the search operation diff --git a/test/parquetFind.test.js b/test/parquetFind.test.js index 3d03e22..af1d857 100644 --- a/test/parquetFind.test.js +++ b/test/parquetFind.test.js @@ -12,13 +12,13 @@ describe('parquetFind', () => { for await (const row of parquetFind({ url: 'test/files/alpha.parquet', asyncBufferFactory, - query: 'kk', + query: 'akk', })) { rows.push(row) } expect(rows.length).toBe(1) - expect(rows[0]).toEqual({ __index__: 270, id: 'kk' }) + expect(rows[0]).toEqual({ __index__: 270, id: 'akk' }) }) it('should return no results for query with no matches', async () => { @@ -36,41 +36,37 @@ describe('parquetFind', () => { it('should respect limit parameter', async () => { const rows = [] for await (const row of parquetFind({ - url: 'test/files/alpha.parquet', + url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'aa ab ac', + query: 'the', limit: 2, })) { rows.push(row) } - expect(rows.length).toBe(2) }) it('should return results in ascending __index__ order', async () => { - // Query that matches multiple rows - should be in natural order const rows = [] for await (const row of parquetFind({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'through petrichor', + query: 'through', })) { rows.push(row) } - - expect(rows.length).toBe(14) - // Verify results are in ascending __index__ order (natural row order) + expect(rows.length).toBeGreaterThan(1) for (let i = 1; i < rows.length; i++) { 
expect(rows[i].__index__).toBeGreaterThan(rows[i - 1].__index__) } }) - it('should match word prefixes by default', async () => { + it('should match substrings inside words', async () => { const rows = [] for await (const row of parquetFind({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'rhyt', // prefix of "rhythm" and "rhythmic" + query: 'rhyt', })) { rows.push(row) } @@ -79,31 +75,28 @@ describe('parquetFind', () => { expect(rows.some(r => r.text.includes('rhythmic'))).toBe(true) }) - it('should match whole words when prefix is false', async () => { + it('should require every whitespace-separated word to match', async () => { const rows = [] for await (const row of parquetFind({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'rhyt', // prefix of "rhythm" and "rhythmic" - prefix: false, + query: 'petrichor xyznopematch', })) { rows.push(row) } expect(rows.length).toBe(0) }) - it('should find exact matches when prefix is false', async () => { + it('should return no results for queries shorter than 3 chars', async () => { const rows = [] for await (const row of parquetFind({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'rhythm', - prefix: false, + query: 'th', })) { rows.push(row) } - expect(rows.length).toBe(1) - expect(rows[0].text).toContain('Rhythm') + expect(rows.length).toBe(0) }) it('should respect abort signal', async () => { @@ -117,27 +110,9 @@ describe('parquetFind', () => { query: 'through', signal: controller.signal, })) { - // should not reach here rows.push(row) } }).rejects.toThrow() expect(rows.length).toBe(0) }) - - it('should return results in row order across multiple blocks', async () => { - const rows = [] - for await (const row of parquetFind({ - url: 'test/files/dataset.parquet', - asyncBufferFactory, - query: 'th', - })) { - rows.push(row.__index__) - } - - // Results must be in ascending row order - expect(rows.length).toBeGreaterThan(1) - for (let i = 1; i < rows.length; i++) { - 
expect(rows[i]).toBeGreaterThan(rows[i - 1]) - } - }) }) diff --git a/test/parquetSearch.test.js b/test/parquetSearch.test.js index 4840e07..e33a589 100644 --- a/test/parquetSearch.test.js +++ b/test/parquetSearch.test.js @@ -13,29 +13,26 @@ describe('parquetSearch', () => { const sourceFile = await asyncBufferFromFile('test/files/alpha.parquet').then(countingBuffer) const indexFile = await asyncBufferFromFile('test/files/alpha.index.parquet').then(countingBuffer) - const rowGenerator = parquetSearch({ + const rows = [] + for await (const row of parquetSearch({ url, sourceFile, indexFile, - query: 'kk', - }) - // Collect all rows from async generator - const rows = [] - for await (const row of rowGenerator) { + query: 'akk', + })) { rows.push(row) } - // The query 'kk' should match blockId 2 (rows 200-300) expect(rows.length).toBe(1) - expect(rows[0]).toEqual({ __index__: 270, id: 'kk' }) + expect(rows[0]).toEqual({ __index__: 270, id: 'akk' }) expect(sourceFile.fetches).toBe(2) // metadata + row group fetch - expect(sourceFile.bytes).toBe(5606) + expect(sourceFile.bytes).toBe(5603) expect(indexFile.fetches).toBe(2) // metadata + index row group fetch - expect(indexFile.bytes).toBe(1630) + expect(indexFile.bytes).toBe(1084) }) it('should return no results for query with no matches', async () => { - const rowGenerator = await parquetSearch({ + const rowGenerator = parquetSearch({ url, asyncBufferFactory, query: 'xyznonexistentterm123', @@ -49,106 +46,86 @@ describe('parquetSearch', () => { }) it('should query url with asyncBufferFactory', async () => { - const rowGenerator = parquetSearch({ + const rows = [] + for await (const row of parquetSearch({ url, asyncBufferFactory, - query: 'kk', - }) - const rows = [] - for await (const row of rowGenerator) { + query: 'akk', + })) { rows.push(row) } - expect(rows.length).toBe(1) }) it('should respect limit parameter', async () => { - const rowGenerator = parquetSearch({ - url, + const rows = [] + for await (const row of 
parquetSearch({ + url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'aa ab ac', // matches multiple rows + query: 'the', limit: 2, - }) - const rows = [] - for await (const row of rowGenerator) { + })) { rows.push(row) } - expect(rows.length).toBe(2) }) - it('should match word prefixes', async () => { + it('should match substrings inside words', async () => { const rows = [] for await (const row of parquetSearch({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'rhyt', // prefix of "rhythm" and "rhythmic" + query: 'rhyt', })) { rows.push(row) } - expect(rows.length).toBe(2) // matches "rhythm" and "rhythmic" rows + expect(rows.length).toBe(2) // matches "Rhythm" and "rhythmic" expect(rows.some(r => r.text.includes('Rhythm'))).toBe(true) expect(rows.some(r => r.text.includes('rhythmic'))).toBe(true) }) - it('should search dataset.parquet for rare words', async () => { - // Query includes "through" (appears in 8 docs, low IDF) and "petrichor" - // (appears in 1 doc, high IDF). With BM25, the rare "petrichor" term - // gets high IDF weight and its document ranks first despite fewer matches. 
- const rowGenerator = parquetSearch({ - url: 'test/files/dataset.parquet', - asyncBufferFactory, - query: 'through petrichor', - }) - const rows = [] - for await (const row of rowGenerator) { - rows.push(row) - } - - expect(rows.length).toBe(14) - // BM25 should rank the document with "petrichor" highest - expect(rows[0].text).toContain('petrichor') - }) - - it('should match word prefixes by default', async () => { + it('should rank rows with more matches first', async () => { const rows = [] for await (const row of parquetSearch({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'rhyt', // prefix of "rhythm" and "rhythmic" + query: 'the', })) { rows.push(row) } - expect(rows.length).toBe(2) - expect(rows.some(r => r.text.includes('Rhythm'))).toBe(true) - expect(rows.some(r => r.text.includes('rhythmic'))).toBe(true) + expect(rows.length).toBeGreaterThan(1) + /** + * @param {string} s + * @returns {number} + */ + function count(s) { + return (s.toLowerCase().match(/the/g) ?? 
[]).length + } + expect(count(rows[0].text)).toBeGreaterThanOrEqual(count(rows[rows.length - 1].text)) }) - it('should match whole words when prefix is false', async () => { + it('should require every whitespace-separated word to match', async () => { const rows = [] for await (const row of parquetSearch({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'rhyt', // prefix of "rhythm" and "rhythmic" - prefix: false, + query: 'petrichor xyznopematch', })) { rows.push(row) } expect(rows.length).toBe(0) }) - it('should find exact matches when prefix is false', async () => { + it('should return undefined for queries shorter than 3 chars', async () => { const rows = [] for await (const row of parquetSearch({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'rhythm', - prefix: false, + query: 'th', })) { rows.push(row) } - expect(rows.length).toBe(1) - expect(rows[0].text).toContain('Rhythm') + expect(rows.length).toBe(0) }) it('should respect abort signal', async () => { @@ -162,7 +139,6 @@ describe('parquetSearch', () => { query: 'through', signal: controller.signal, })) { - // should not reach here rows.push(row) } }).rejects.toThrow() From 185f14e4a532348990fc2d5be0b0c77c86d52ace Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 18:32:45 -0700 Subject: [PATCH 05/15] Delete tokenize, stemmer, stop words --- src/index.d.ts | 6 -- src/index.js | 1 - src/stemmer.js | 78 ---------------- src/tokenize.js | 72 -------------- test/stemmer.test.js | 82 ---------------- test/tokenize.test.js | 212 ------------------------------------------ 6 files changed, 451 deletions(-) delete mode 100644 src/stemmer.js delete mode 100644 src/tokenize.js delete mode 100644 test/stemmer.test.js delete mode 100644 test/tokenize.test.js diff --git a/src/index.d.ts b/src/index.d.ts index 0246d4b..8ff1edc 100644 --- a/src/index.d.ts +++ b/src/index.d.ts @@ -21,9 +21,3 @@ export function parquetSearch(options: ParquetSearchOptions): AsyncGenerator - 
-/** - * Tokenize text into normalized terms. - * Lowercases and splits on non-alphanumeric boundaries. - */ -export function tokenize(text: string): string[] diff --git a/src/index.js b/src/index.js index 7040f4a..aef0145 100644 --- a/src/index.js +++ b/src/index.js @@ -3,4 +3,3 @@ export { createIndex } from './createIndex.js' export { parquetFind } from './parquetFind.js' export { parquetSearch } from './parquetSearch.js' export { queryIndex } from './queryIndex.js' -export { tokenize } from './tokenize.js' diff --git a/src/stemmer.js b/src/stemmer.js deleted file mode 100644 index 307379c..0000000 --- a/src/stemmer.js +++ /dev/null @@ -1,78 +0,0 @@ - -const vowels = 'aeiouy' - -// ordered longest-first -const suffixRules = [ - { suffix: 'ing', minStem: 4, needsVowel: true }, - { suffix: 'ed', minStem: 3, needsVowel: true }, - { suffix: 'ly', minStem: 3 }, - // handle plural-ish endings carefully - { suffix: 'es', minStem: 3, needsVowel: true, plural: true }, - { suffix: 's', minStem: 3, needsVowel: true, plural: true }, - // bad with i/y: er, ers, est, ies - // other options: ment, ingly, edly, ness -] - -/** - * Simple prefix stemmer, removes common English suffixes. - * Based on a simplified version of the Porter stemming algorithm. - * Importantly, only removes suffixes. 
- * - * @param {string} term - lowercase word to stem - * @returns {string} stemmed word - */ -export function stemmer(term) { - // too short to bother - if (term.length < 4) return term - - // skip anything that isn't a simple lowercase word - if (!isLowerAlpha(term)) return term - - for (let i = 0; i < suffixRules.length; i += 1) { - const rule = suffixRules[i] - const { suffix } = rule - - if (!term.endsWith(suffix)) continue - - const stem = term.slice(0, term.length - suffix.length) - if (stem.length < rule.minStem) continue - - if (rule.needsVowel && !hasVowel(stem)) continue - - if (rule.plural) { - // fix: class, boss - if (term.endsWith('ss')) continue - // fix: virus, status - if (term.endsWith('us')) continue - // fix: this, analysis - if (term.endsWith('is')) continue - } - - return stem - } - - return term -} - -/** - * @param {string} s - * @returns {boolean} - */ -function hasVowel(s) { - for (let i = 0; i < s.length; i += 1) { - if (vowels.includes(s[i])) return true - } - return false -} - -/** - * @param {string} s - * @returns {boolean} - */ -function isLowerAlpha(s) { - for (let i = 0; i < s.length; i += 1) { - const code = s.charCodeAt(i) - if (code < 97 || code > 122) return false - } - return true -} diff --git a/src/tokenize.js b/src/tokenize.js deleted file mode 100644 index 4c12c9b..0000000 --- a/src/tokenize.js +++ /dev/null @@ -1,72 +0,0 @@ -import { stemmer } from './stemmer.js' - -/** - * Common English stop words to filter from index. - * These high-frequency, low-value words are excluded to reduce index size. 
- */ -const STOP_WORDS = new Set([ - 'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'it', - 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', - 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', - 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', - 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', - 'when', 'make', 'can', 'like', 'no', 'just', 'him', 'know', 'take', - 'into', 'your', 'some', 'could', 'them', 'than', 'then', 'now', 'only', - 'its', 'also', 'other', 'how', 'our', 'may', 'these', 'was', 'been', - 'has', 'had', 'are', 'is', 'am', 'were', 'does', 'did', 'being', -]) - -/** - * Split camelCase and PascalCase words by inserting spaces before uppercase letters. - * Converts "parseUserInput" to "parse User Input", "XMLParser" to "XMLParser", etc. - * - * @param {string} text - * @returns {string} - */ -function splitCamelCase(text) { - // Insert space before uppercase letters that follow lowercase letters or digits - return text.replace(/([a-z0-9])([A-Z])/g, '$1 $2') -} - -/** - * Normalize Unicode text by removing diacritics/accents. - * Converts "café" to "cafe", "résumé" to "resume", etc. - * - * @param {string} text - * @returns {string} - */ -function normalizeUnicode(text) { - // NFD decomposes combined characters into base + combining marks - // Then remove combining marks (Unicode category: Mark, Nonspacing) - return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '') -} - -/** - * Tokenize text into normalized terms. - * Splits camelCase, lowercases, normalizes Unicode, splits on non-alphanumeric boundaries, - * filters stop words, and applies Porter stemming. 
- * - * @param {string} text - * @returns {string[]} - */ -export function tokenize(text) { - // Split camelCase/PascalCase before normalization - const split = splitCamelCase(text) - // Normalize Unicode (remove accents) before lowercasing - const normalized = normalizeUnicode(split) - const lower = normalized.toLowerCase() - - // Split on non-alphanumeric boundaries - const rawTokens = lower.split(/[^a-z0-9]+/g) - const tokens = [] - - for (const token of rawTokens) { - if (!token) continue - if (token.length < 2) continue - if (STOP_WORDS.has(token)) continue - // Apply Porter stemming to reduce words to their root form - tokens.push(stemmer(token)) - } - - return tokens -} diff --git a/test/stemmer.test.js b/test/stemmer.test.js deleted file mode 100644 index b8c8aa0..0000000 --- a/test/stemmer.test.js +++ /dev/null @@ -1,82 +0,0 @@ -import { describe, expect, it } from 'vitest' -import { stemmer } from '../src/stemmer.js' - -describe('stemmer', () => { - it('should stem common English words', () => { - // Not stemmed - expect(stemmer('best')).toBe('best') - expect(stemmer('faster')).toBe('faster') - expect(stemmer('fastest')).toBe('fastest') - expect(stemmer('digitizer')).toBe('digitizer') - expect(stemmer('digitizers')).toBe('digitizer') - expect(stemmer('predication')).toBe('predication') - expect(stemmer('feudalism')).toBe('feudalism') - expect(stemmer('adjustment')).toBe('adjustment') - expect(stemmer('development')).toBe('development') - expect(stemmer('gpt-5-thinking')).toBe('gpt-5-thinking') - }) - - it('should handle plurals', () => { - expect(stemmer('cats')).toBe('cat') - expect(stemmer('ponies')).toBe('poni') - expect(stemmer('ties')).toBe('tie') - expect(stemmer('caresses')).toBe('caress') - }) - - it('should handle non-plurals', () => { - // no vowels - expect(stemmer('https')).toBe('https') - expect(stemmer('css')).toBe('css') - // exceptions - expect(stemmer('class')).toBe('class') - expect(stemmer('bless')).toBe('bless') - 
expect(stemmer('blessing')).toBe('bless') - expect(stemmer('bus')).toBe('bus') - expect(stemmer('status')).toBe('status') - expect(stemmer('this')).toBe('this') - expect(stemmer('analysis')).toBe('analysis') - }) - - it('should handle past tense', () => { - expect(stemmer('agreed')).toBe('agre') - expect(stemmer('plastered')).toBe('plaster') - expect(stemmer('motoring')).toBe('motor') - expect(stemmer('heading')).toBe('head') - // no stem - expect(stemmer('bled')).toBe('bled') - expect(stemmer('sing')).toBe('sing') - expect(stemmer('string')).toBe('string') - }) - - it('should handle -ly suffixes', () => { - expect(stemmer('conformably')).toBe('conformab') - expect(stemmer('radically')).toBe('radical') - expect(stemmer('differently')).toBe('different') - expect(stemmer('vilely')).toBe('vile') - expect(stemmer('analogously')).toBe('analogous') - expect(stemmer('interestingly')).toBe('interesting') - }) - - it('should handle short words', () => { - expect(stemmer('a')).toBe('a') - expect(stemmer('is')).toBe('is') - expect(stemmer('at')).toBe('at') - }) - - it('should handle words with y', () => { - expect(stemmer('happy')).toBe('happy') - expect(stemmer('sky')).toBe('sky') - expect(stemmer('yellow')).toBe('yellow') - expect(stemmer('youth')).toBe('youth') - expect(stemmer('young')).toBe('young') - }) - - it('should not stem words with non-lowercase-ascii characters', () => { - expect(stemmer('Running')).toBe('Running') - expect(stemmer('Bunnies')).toBe('Bunnies') - expect(stemmer('DEFINING')).toBe('DEFINING') - expect(stemmer('camelCasing')).toBe('camelCasing') - expect(stemmer('123squirrelingly')).toBe('123squirrelingly') - expect(stemmer('hello!ness')).toBe('hello!ness') - }) -}) diff --git a/test/tokenize.test.js b/test/tokenize.test.js deleted file mode 100644 index 7085906..0000000 --- a/test/tokenize.test.js +++ /dev/null @@ -1,212 +0,0 @@ -import { describe, expect, it } from 'vitest' -import { tokenize } from '../src/tokenize.js' - -describe('tokenize', () => 
{ - describe('basic functionality', () => { - it('should tokenize simple text', () => { - expect(tokenize('hello world')).toEqual(['hello', 'world']) - }) - - it('should lowercase text', () => { - expect(tokenize('Hello WORLD')).toEqual(['hello', 'world']) - }) - - it('should split on punctuation', () => { - expect(tokenize('hello, world!')).toEqual(['hello', 'world']) - }) - - it('should filter empty tokens', () => { - expect(tokenize(' ')).toEqual([]) - }) - - it('should filter single-character tokens', () => { - // 'am' is a stop word, so only 'developer' remains - expect(tokenize('I am a developer')).toEqual(['developer']) - }) - - it('should keep two-character tokens', () => { - // Using non-stop-word two-character tokens - expect(tokenize('ok hi 42')).toEqual(['ok', 'hi', '42']) - }) - - it('should handle json-like text', () => { - expect(tokenize('{"key": "value", "number": 123}')) - .toEqual(['key', 'value', 'number', '123']) - }) - - it('should handle markdown-like text', () => { - expect(tokenize('# Title\nThis is **bold** text.')) - .toEqual(['title', 'bold', 'text']) - }) - - it('should handle xml-like text', () => { - expect(tokenize('Content')) - .toEqual(['tag', 'content', 'tag']) - }) - }) - - describe('stop word filtering', () => { - it('should filter common stop words', () => { - expect(tokenize('the quick brown fox')).toEqual(['quick', 'brown', 'fox']) - }) - - it('should filter "the"', () => { - expect(tokenize('the cat')).toEqual(['cat']) - }) - - it('should filter "and"', () => { - expect(tokenize('cats and dogs')).toEqual(['cat', 'dog']) - }) - - it('should filter "is"', () => { - expect(tokenize('this is test')).toEqual(['test']) - }) - - it('should filter "to" and "be"', () => { - // 'to', 'be', 'or', 'not' are all stop words - expect(tokenize('to be or not to be')).toEqual([]) - }) - - it('should filter multiple stop words in a row', () => { - // 'it', 'is', 'what' are all stop words - expect(tokenize('it is what it is')).toEqual([]) - }) 
- - it('should handle text with only stop words', () => { - expect(tokenize('the and or')).toEqual([]) - }) - - it('should keep content words', () => { - expect(tokenize('quick brown fox jumped over lazy dog')) - .toEqual(['quick', 'brown', 'fox', 'jump', 'over', 'lazy', 'dog']) - }) - }) - - describe('Unicode normalization', () => { - it('should normalize accented characters in French', () => { - expect(tokenize('café')).toEqual(['cafe']) - }) - - it('should normalize multiple accented characters', () => { - expect(tokenize('café résumé naïve')).toEqual(['cafe', 'resume', 'naive']) - }) - - it('should handle Spanish characters', () => { - expect(tokenize('niño mañana')).toEqual(['nino', 'manana']) - }) - - it('should handle German characters', () => { - expect(tokenize('schön über')).toEqual(['schon', 'uber']) - }) - - it('should handle mixed text', () => { - expect(tokenize('Zürich café')).toEqual(['zurich', 'cafe']) - }) - - it('should normalize uppercase accented characters', () => { - expect(tokenize('CAFÉ RÉSUMÉ')).toEqual(['cafe', 'resume']) - }) - - it('should handle accents with stop words', () => { - expect(tokenize('the café is open')).toEqual(['cafe', 'open']) - }) - }) - - describe('numeric handling', () => { - it('should keep alphanumeric tokens', () => { - // Decimals split into single digits which are filtered - expect(tokenize('version 2.1.5')).toEqual(['version']) - }) - - it('should handle prices', () => { - expect(tokenize('price $19.99')).toEqual(['price', '19', '99']) - }) - - it('should handle years', () => { - expect(tokenize('year 2024')).toEqual(['year', '2024']) - }) - }) - - describe('special cases', () => { - it('should handle hyphenated words', () => { - expect(tokenize('full-text search')).toEqual(['full', 'text', 'search']) - }) - - it('should split camelCase into separate tokens', () => { - expect(tokenize('parseUserInput')).toEqual(['parse', 'user', 'input']) - }) - - it('should split PascalCase into separate tokens', () => { - 
expect(tokenize('UserService')).toEqual(['user', 'service']) - }) - - it('should handle consecutive uppercase letters', () => { - // Consecutive uppercase letters stay together until followed by lowercase - expect(tokenize('XMLParser')).toEqual(['xmlparser']) - // But this splits correctly - expect(tokenize('HTTPServer')).toEqual(['httpserver']) - }) - - it('should handle camelCase with numbers', () => { - // Numbers followed by uppercase split correctly - expect(tokenize('version2Beta')).toEqual(['version2', 'beta']) - }) - - it('should handle mixed camelCase and snake_case', () => { - expect(tokenize('parseUser_name')).toEqual(['parse', 'user', 'name']) - }) - - it('should handle underscores', () => { - expect(tokenize('user_name')).toEqual(['user', 'name']) - }) - - it('should handle empty string', () => { - expect(tokenize('')).toEqual([]) - }) - - it('should handle only punctuation', () => { - expect(tokenize('!@#$%^&*()')).toEqual([]) - }) - - it('should handle mixed punctuation and text', () => { - expect(tokenize('hello!!!world???')).toEqual(['hello', 'world']) - }) - }) - - describe('real-world examples', () => { - it('should tokenize a sentence', () => { - expect(tokenize('The quick brown fox jumps over the lazy dog')) - .toEqual(['quick', 'brown', 'fox', 'jump', 'over', 'lazy', 'dog']) - }) - - it('should tokenize an email-like string', () => { - expect(tokenize('contact@example.com')) - .toEqual(['contact', 'example', 'com']) - }) - - it('should tokenize a URL-like string', () => { - expect(tokenize('https://example.com/path')) - .toEqual(['https', 'example', 'com', 'path']) - }) - - it('should tokenize multi-language text', () => { - expect(tokenize('The café in São Paulo')) - .toEqual(['cafe', 'sao', 'paulo']) - }) - - it('should tokenize code-like text', () => { - expect(tokenize('function parseJSON(data) { return JSON.parse(data); }')) - .toEqual(['function', 'parse', 'json', 'data', 'return', 'json', 'parse', 'data']) - }) - - it('should tokenize a 
complex technical sentence', () => { - expect(tokenize('Implementing a full-text search engine with HypGrep v2.0.1!')) - .toEqual(['implement', 'full', 'text', 'search', 'engine', 'hyp', 'grep', 'v2']) - }) - - // TODO: Should be kept as one token 'gpt-4o-mini' - it('should tokenize a model name', () => { - expect(tokenize('gpt-4o-mini')).toEqual(['gpt', '4o', 'mini']) - }) - }) -}) From a15338e0fe2f442dc86a5847951448c91f37afaf Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 18:34:15 -0700 Subject: [PATCH 06/15] Update docs for trigram semantics --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 593f495..a819fb2 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,11 @@ [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) ![coverage](https://img.shields.io/badge/Coverage-95-darkred) -Build a compact full-text search index for a Parquet file using [`hyparquet`](https://github.com/hyparam/hyparquet) and [`hyparquet-writer`](https://github.com/hyparam/hyparquet-writer). +Build a compact trigram search index for a Parquet file using [`hyparquet`](https://github.com/hyparam/hyparquet) and [`hyparquet-writer`](https://github.com/hyparam/hyparquet-writer). Queries are case-insensitive substring matches — grep semantics over a precomputed index. ## Why? -Enable efficient full-text search on large Parquet datasets from any client without a server. Store your Parquet dataset on S3, generate a compact index file, and query it directly from a browser or other clients using HTTP range requests. The index tells you exactly which row blocks to fetch, so you only download the data you need. +Enable efficient grep-style search on large Parquet datasets from any client without a server. Store your Parquet dataset on S3, generate a compact index file, and query it directly from a browser or other clients using HTTP range requests. 
The index tells you exactly which row blocks to fetch, so you only download the data you need. Perfect for serverless architectures where you want to offer search capabilities without managing infrastructure. @@ -26,7 +26,7 @@ hypgrep dataset.parquet [dataset.index.parquet] ## Find rows in a parquet file in JavaScript -Use `parquetFind` to find rows matching a query while preserving natural row order (like Ctrl+F): +Use `parquetFind` to find rows containing the query as a substring while preserving natural row order (like Ctrl+F): ```javascript import { parquetFind } from 'hypgrep' @@ -39,9 +39,11 @@ for await (const row of parquetFind({ } ``` +Whitespace-separated words are ANDed: `'foo bar'` matches rows containing both `foo` and `bar` as substrings. Queries shorter than 3 characters return no results. + ## Ranked search -Use `parquetSearch` to rank results by BM25 relevance score (like a search engine): +Use `parquetSearch` to rank results by total occurrence count of the query words: ```javascript import { parquetSearch } from 'hypgrep' @@ -50,7 +52,7 @@ for await (const row of parquetSearch({ query: 'serverless', url: 'https://s3.hyperparam.app/hypgrep/wiki_en.parquet', })) { - console.log(row) // highest relevance first + console.log(row) // most matches first } ``` From 57bee8cc976c1612cd38532e552ee4175064c701 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 18:36:26 -0700 Subject: [PATCH 07/15] Bump to 0.2.0 for trigram format --- package.json | 12 ++++++------ src/constants.js | 2 +- test/createIndex.test.js | 2 +- test/files/alpha.index.parquet | Bin 708 -> 708 bytes test/files/dataset.index.parquet | Bin 3266 -> 3266 bytes 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/package.json b/package.json index 031293f..6e3443b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "hypgrep", - "version": "1.0.0", + "version": "0.2.0", "author": "Hyperparam", "homepage": "https://hyperparam.app", "license": "MIT", 
@@ -41,16 +41,16 @@ "test": "vitest run" }, "dependencies": { - "hyparquet": "1.25.8", + "hyparquet": "1.26.0", "hyparquet-compressors": "1.1.1", - "hyparquet-writer": "0.15.1" + "hyparquet-writer": "0.15.2" }, "devDependencies": { "@types/node": "25.9.1", - "@vitest/coverage-v8": "4.1.6", + "@vitest/coverage-v8": "4.1.7", "eslint": "9.39.4", - "eslint-plugin-jsdoc": "62.9.0", + "eslint-plugin-jsdoc": "63.0.0", "typescript": "6.0.3", - "vitest": "4.1.6" + "vitest": "4.1.7" } } diff --git a/src/constants.js b/src/constants.js index a3d481e..c5c81b2 100644 --- a/src/constants.js +++ b/src/constants.js @@ -1,5 +1,5 @@ // Version of the parquet index format -export const hypGrepVersion = 1 +export const hypGrepVersion = 0 // Number of rows per virtual block export const defaultBlockSize = 500 diff --git a/test/createIndex.test.js b/test/createIndex.test.js index 5ef936d..c5ba468 100644 --- a/test/createIndex.test.js +++ b/test/createIndex.test.js @@ -33,7 +33,7 @@ describe('createIndex', () => { expect(indexMetadata.num_rows).toBe(676n) expect(indexMetadata.key_value_metadata?.length).toBe(5) const kv = indexMetadata.key_value_metadata - expect(kv?.[0]).toEqual({ key: 'hypgrep.version', value: '1' }) + expect(kv?.[0]).toEqual({ key: 'hypgrep.version', value: '0' }) expect(kv?.[1]).toEqual({ key: 'hypgrep.block_size', value: '200' }) expect(kv?.[2]).toEqual({ key: 'hypgrep.text_columns', value: 'id' }) expect(kv?.[3]).toEqual({ key: 'hypgrep.source_rows', value: '676' }) diff --git a/test/files/alpha.index.parquet b/test/files/alpha.index.parquet index 424910b0b8ba9c5498ffa83b9c6f36c01c099a21..7f0bbc45cdf98dced7a266de0b35aba1e53f3aff 100644 GIT binary patch delta 13 UcmX@YdW3a@4HKimWLu_o03Uq>3jhEB delta 13 UcmX@YdW3a@4HKi`WLu_o03U(`3;+NC diff --git a/test/files/dataset.index.parquet b/test/files/dataset.index.parquet index ed9c0bef62af7a12f7b8c77b3123d266dd98ffd3..ff4eb76ba0aa4c7659c81e4a8ee70659b4dfa096 100644 GIT binary patch delta 13 UcmX>kc}Q}D1rMXaWJ{h-03kI5Jpcdz 
delta 13 UcmX>kc}Q}D1rMX)WJ{h-03kXAJ^%m! From 3eeec1e7c3c4d534e0b5a0381aa6f5464b0aab09 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 19:07:27 -0700 Subject: [PATCH 08/15] Switch to n=5 n-grams for prose selectivity --- src/constants.js | 6 ++++ src/createIndex.js | 48 ++++++++++++++++--------------- src/ngrams.js | 38 ++++++++++++++++++++++++ src/queryIndex.js | 35 ++++++++++++---------- src/trigrams.js | 38 ------------------------ src/types.d.ts | 10 ++++--- test/createIndex.test.js | 11 +++---- test/files/alpha.index.parquet | Bin 708 -> 766 bytes test/files/alpha.parquet | Bin 2858 -> 2867 bytes test/files/dataset.index.parquet | Bin 3266 -> 10009 bytes test/ngrams.test.js | 48 +++++++++++++++++++++++++++++++ test/parquetFind.test.js | 12 ++++---- test/parquetSearch.test.js | 22 +++++++------- test/queryIndex.test.js | 13 +++++---- test/trigrams.test.js | 45 ----------------------------- 15 files changed, 173 insertions(+), 153 deletions(-) create mode 100644 src/ngrams.js delete mode 100644 src/trigrams.js create mode 100644 test/ngrams.test.js delete mode 100644 test/trigrams.test.js diff --git a/src/constants.js b/src/constants.js index c5c81b2..bf090da 100644 --- a/src/constants.js +++ b/src/constants.js @@ -6,3 +6,9 @@ export const defaultBlockSize = 500 // Row group size in the index file export const defaultIndexRowGroupSize = 40000 + +// Length of n-grams emitted into the index. Tuned for prose selectivity: +// shorter n-grams (3-4) fail to prune Wikipedia-scale blocks because every +// block contains every common short window; n=5 dramatically reduces the +// candidate-block set for selective substrings. 
+export const defaultNgramLength = 5 diff --git a/src/createIndex.js b/src/createIndex.js index 74bfb0b..630d37c 100644 --- a/src/createIndex.js +++ b/src/createIndex.js @@ -1,7 +1,7 @@ import { parquetMetadataAsync, parquetReadObjects } from 'hyparquet' import { parquetWrite } from 'hyparquet-writer' -import { defaultBlockSize, defaultIndexRowGroupSize, hypGrepVersion } from './constants.js' -import { extractTrigrams } from './trigrams.js' +import { defaultBlockSize, defaultIndexRowGroupSize, defaultNgramLength, hypGrepVersion } from './constants.js' +import { extractNgrams } from './ngrams.js' import { getTextColumnsFromSchema } from './utils.js' /** @@ -10,7 +10,7 @@ import { getTextColumnsFromSchema } from './utils.js' */ /** - * Create a trigram search index parquet next to the given parquet file. + * Create an n-gram search index parquet next to the given parquet file. * * @param {CreateIndexOptions} options * @returns {Promise} @@ -21,6 +21,7 @@ export async function createIndex({ indexFile, blockSize = defaultBlockSize, indexRowGroupSize = defaultIndexRowGroupSize, + ngramLength = defaultNgramLength, }) { const metadata = sourceMetadata ?? 
await parquetMetadataAsync(sourceFile) const numRows = Number(metadata.num_rows) @@ -29,7 +30,7 @@ export async function createIndex({ throw new Error('No string columns found to index') } - // Map from trigram -> blockIds (in ascending order) + // Map from n-gram -> blockIds (in ascending order) /** @type {Map} */ const postings = new Map() @@ -45,31 +46,31 @@ export async function createIndex({ columns: textColumns, }) - const blockTrigrams = collectBlockTrigrams(rows, textColumns) - for (const trigram of blockTrigrams) { - const existing = postings.get(trigram) + const blockNgrams = collectBlockNgrams(rows, textColumns, ngramLength) + for (const ngram of blockNgrams) { + const existing = postings.get(ngram) if (existing) existing.push(blockId) - else postings.set(trigram, [blockId]) + else postings.set(ngram, [blockId]) } blockId += 1 } - // Flatten into rows sorted by trigram, then blockId (already sorted within each posting list) - const sortedTrigrams = Array.from(postings.keys()).sort() + const sortedNgrams = Array.from(postings.keys()).sort() /** @type {IndexRow[]} */ const indexRows = [] - for (const trigram of sortedTrigrams) { - const blocks = postings.get(trigram) + for (const ngram of sortedNgrams) { + const blocks = postings.get(ngram) if (!blocks) continue for (const id of blocks) { - indexRows.push({ trigram, blockId: id }) + indexRows.push({ ngram, blockId: id }) } } const kvMetadata = [ { key: 'hypgrep.version', value: String(hypGrepVersion) }, { key: 'hypgrep.block_size', value: String(blockSize) }, + { key: 'hypgrep.ngram_length', value: String(ngramLength) }, { key: 'hypgrep.text_columns', value: textColumns.join(',') }, { key: 'hypgrep.source_rows', value: String(numRows) }, // Can save network requests on the source file @@ -86,26 +87,27 @@ export async function createIndex({ } /** - * Collect the set of distinct trigrams present in a block. + * Collect the set of distinct n-grams present in a block. 
* * @param {Record[]} rows * @param {string[]} textColumns + * @param {number} n * @returns {Set} */ -function collectBlockTrigrams(rows, textColumns) { +function collectBlockNgrams(rows, textColumns, n) { /** @type {Set} */ - const trigrams = new Set() + const ngrams = new Set() for (const row of rows) { if (!row) continue for (const columnName of textColumns) { const value = row[columnName] - if (typeof value !== 'string' || value.length === 0) continue - for (const t of extractTrigrams(value)) { - trigrams.add(t) + if (typeof value !== 'string' || value.length < n) continue + for (const g of extractNgrams(value, n)) { + ngrams.add(g) } } } - return trigrams + return ngrams } /** @@ -116,18 +118,18 @@ function collectBlockTrigrams(rows, textColumns) { */ function buildColumnData(indexRows) { const { length } = indexRows - const trigrams = new Array(length) + const ngrams = new Array(length) const blockIds = new Array(length) for (let i = 0; i < length; i += 1) { const row = indexRows[i] - trigrams[i] = row.trigram + ngrams[i] = row.ngram blockIds[i] = row.blockId } return [ // Delta byte array encoding works well for sorted string columns - { name: 'trigram', data: trigrams, type: 'STRING', encoding: 'DELTA_BYTE_ARRAY' }, + { name: 'ngram', data: ngrams, type: 'STRING', encoding: 'DELTA_BYTE_ARRAY' }, // Delta binary packed works well for incrementing integers { name: 'blockId', data: blockIds, type: 'INT32', encoding: 'DELTA_BINARY_PACKED' }, ] diff --git a/src/ngrams.js b/src/ngrams.js new file mode 100644 index 0000000..d59aa2d --- /dev/null +++ b/src/ngrams.js @@ -0,0 +1,38 @@ +/** + * N-gram extraction for grep-style substring matching. + * + * Text is lowercased and split on non-alphanumeric boundaries, then every + * n-character window of each alphanumeric run is emitted as an n-gram. + */ + +/** + * Extract the set of distinct n-grams in a string. 
+ * + * @param {string} text + * @param {number} n + * @returns {Set} + */ +export function extractNgrams(text, n) { + /** @type {Set} */ + const out = new Set() + if (typeof text !== 'string' || text.length < n) return out + const lower = text.toLowerCase() + for (const run of lower.split(/[^a-z0-9]+/g)) { + for (let i = 0; i + n <= run.length; i += 1) { + out.add(run.slice(i, i + n)) + } + } + return out +} + +/** + * Extract n-grams needed to satisfy a query as a substring search. + * Whitespace-separated words must each appear; their n-grams are unioned. + * + * @param {string} query + * @param {number} n + * @returns {string[]} + */ +export function queryNgrams(query, n) { + return Array.from(extractNgrams(query, n)) +} diff --git a/src/queryIndex.js b/src/queryIndex.js index ed9fdb9..394f650 100644 --- a/src/queryIndex.js +++ b/src/queryIndex.js @@ -1,6 +1,6 @@ import { parquetMetadataAsync, parquetQuery } from 'hyparquet' -import { defaultBlockSize, hypGrepVersion } from './constants.js' -import { queryTrigrams } from './trigrams.js' +import { defaultBlockSize, defaultNgramLength, hypGrepVersion } from './constants.js' +import { queryNgrams } from './ngrams.js' /** * @import { KeyValue } from 'hyparquet' @@ -8,44 +8,45 @@ import { queryTrigrams } from './trigrams.js' */ /** - * Query a trigram index to find blocks that could contain the query as a substring. - * Returns undefined if the query has no extractable trigrams (e.g. shorter than 3 chars). + * Query an n-gram index to find blocks that could contain the query as a substring. + * Returns undefined if the query has no extractable n-grams (e.g. shorter than the + * indexed n-gram length). 
* * @param {QueryIndexOptions} options * @returns {Promise} */ export async function queryIndex({ query, indexFile, indexMetadata }) { - const trigrams = queryTrigrams(query) - if (trigrams.length === 0) return undefined - // Read index kv metadata indexMetadata ??= await parquetMetadataAsync(indexFile) const kvMetadata = indexMetadata.key_value_metadata || [] - const { blockSize, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata) + const { blockSize, ngramLength, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata) + + const ngrams = queryNgrams(query, ngramLength) + if (ngrams.length === 0) return undefined - // Read postings for every query trigram via pushdown filter + // Read postings for every query n-gram via pushdown filter const indexRows = await parquetQuery({ file: indexFile, metadata: indexMetadata, - filter: { trigram: { $in: trigrams } }, + filter: { ngram: { $in: ngrams } }, }) - // Count matched trigrams per block; keep blocks that hit every trigram + // Count matched n-grams per block; keep blocks that hit every n-gram /** @type {Map>} */ const hits = new Map() - for (const { trigram, blockId } of indexRows) { + for (const { ngram, blockId } of indexRows) { let set = hits.get(blockId) if (!set) { set = new Set() hits.set(blockId, set) } - set.add(trigram) + set.add(ngram) } /** @type {BlockResult[]} */ const blocks = [] for (const [blockId, set] of hits) { - if (set.size < trigrams.length) continue + if (set.size < ngrams.length) continue const rowStart = blockId * blockSize const rowEnd = Math.min(rowStart + blockSize, sourceRows) blocks.push({ blockId, rowStart, rowEnd, score: set.size }) @@ -62,6 +63,7 @@ export async function queryIndex({ query, indexFile, indexMetadata }) { */ export function parseKvMetadata(kvMetadata) { let blockSize = defaultBlockSize + let ngramLength = defaultNgramLength /** @type {string[]} */ let textColumns = [] /** @type {number | undefined} */ @@ -73,6 +75,9 @@ export 
function parseKvMetadata(kvMetadata) { if (key === 'hypgrep.block_size') { blockSize = Number(value) } + if (key === 'hypgrep.ngram_length') { + ngramLength = Number(value) + } if (key === 'hypgrep.version') { if (Number(value) !== hypGrepVersion) { throw new Error(`Unsupported hypgrep version ${value}`) @@ -95,5 +100,5 @@ export function parseKvMetadata(kvMetadata) { throw new Error('Missing hypgrep.source_bytelength in index metadata') } - return { blockSize, textColumns, sourceByteLength, sourceRows } + return { blockSize, ngramLength, textColumns, sourceByteLength, sourceRows } } diff --git a/src/trigrams.js b/src/trigrams.js deleted file mode 100644 index 6f3fb89..0000000 --- a/src/trigrams.js +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Trigram extraction for grep-style substring matching. - * - * Text is lowercased and split on non-alphanumeric boundaries, then every - * 3-character window of each alphanumeric run is emitted as a trigram. - */ - -const TRIGRAM_LEN = 3 - -/** - * Extract the set of distinct trigrams in a string. - * - * @param {string} text - * @returns {Set} - */ -export function extractTrigrams(text) { - /** @type {Set} */ - const out = new Set() - if (typeof text !== 'string' || text.length < TRIGRAM_LEN) return out - const lower = text.toLowerCase() - for (const run of lower.split(/[^a-z0-9]+/g)) { - for (let i = 0; i + TRIGRAM_LEN <= run.length; i += 1) { - out.add(run.slice(i, i + TRIGRAM_LEN)) - } - } - return out -} - -/** - * Extract trigrams needed to satisfy a query as a substring search. - * Whitespace-separated words must each appear; their trigrams are unioned. 
- * - * @param {string} query - * @returns {string[]} - */ -export function queryTrigrams(query) { - return Array.from(extractTrigrams(query)) -} diff --git a/src/types.d.ts b/src/types.d.ts index 0ddcfcd..724427e 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -7,6 +7,7 @@ export interface CreateIndexOptions { indexFile: Writer // file writer for the output index parquet file blockSize?: number // number of rows per logical block indexRowGroupSize?: number // row group size in the index file + ngramLength?: number // n-gram size used to build the index (default 5) } export interface QueryIndexOptions { @@ -37,11 +38,11 @@ export interface ParquetSearchOptions { } /** - * Represents a single entry in the trigram index. + * Represents a single entry in the n-gram index. */ export interface IndexRow { - trigram: string // 3-character substring - blockId: number // logical block ID this trigram appears in + ngram: string // n-character substring + blockId: number // logical block ID this n-gram appears in } export interface QueryResult { @@ -57,7 +58,7 @@ export interface BlockResult { blockId: number rowStart: number // starting row index (inclusive) in the source parquet rowEnd: number // ending row index (exclusive) in the source parquet - score: number // number of distinct query trigrams matched in this block + score: number // number of distinct query n-grams matched in this block } /** @@ -66,6 +67,7 @@ export interface BlockResult { */ export interface HypGrepMetadata { blockSize: number // number of rows per logical block + ngramLength: number // n-gram size used to build the index textColumns: string[] // list of indexed text columns sourceRows: number // number of rows in the source parquet file sourceByteLength: number // byte length of the source parquet file diff --git a/test/createIndex.test.js b/test/createIndex.test.js index c5ba468..9560ab7 100644 --- a/test/createIndex.test.js +++ b/test/createIndex.test.js @@ -27,16 +27,17 @@ 
describe('createIndex', () => { expect(existsSync(TEST_INDEX)).toBe(true) const indexBuffer = await asyncBufferFromFile(TEST_INDEX) - expect(indexBuffer.byteLength).toBe(2308) + expect(indexBuffer.byteLength).toBe(2494) const indexMetadata = await parquetMetadataAsync(indexBuffer) expect(indexMetadata.row_groups.length).toBe(7) expect(indexMetadata.num_rows).toBe(676n) - expect(indexMetadata.key_value_metadata?.length).toBe(5) + expect(indexMetadata.key_value_metadata?.length).toBe(6) const kv = indexMetadata.key_value_metadata expect(kv?.[0]).toEqual({ key: 'hypgrep.version', value: '0' }) expect(kv?.[1]).toEqual({ key: 'hypgrep.block_size', value: '200' }) - expect(kv?.[2]).toEqual({ key: 'hypgrep.text_columns', value: 'id' }) - expect(kv?.[3]).toEqual({ key: 'hypgrep.source_rows', value: '676' }) - expect(kv?.[4]).toEqual({ key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) }) + expect(kv?.[2]).toEqual({ key: 'hypgrep.ngram_length', value: '5' }) + expect(kv?.[3]).toEqual({ key: 'hypgrep.text_columns', value: 'id' }) + expect(kv?.[4]).toEqual({ key: 'hypgrep.source_rows', value: '676' }) + expect(kv?.[5]).toEqual({ key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) }) }) }) diff --git a/test/files/alpha.index.parquet b/test/files/alpha.index.parquet index 7f0bbc45cdf98dced7a266de0b35aba1e53f3aff..d3cb0c8834e4b9a2e1c6ca1f7064c22e0db6b158 100644 GIT binary patch delta 373 zcmX@Y`j53fz%j^Bluh)5py(9V7||14q6{E{4=BXIz;J?#@hgW#10%~4Rt839CMG7| zNh+SK%nFT@R6>D3G>nmrB{Y$hjh9z|5h%-KB*fsw%fPD2Yv?78+N*YsRYsIaQj|+lf;}lGKRMeo1)@1iEDIJp0zOH)f2qJUh}H4ApKu40g20TK)lom^m@Vjoz67BEQ0NbqM=7Ni%Y7U-3w78Oqp fXVRWLhsj&q)Pg~R6DX5dR9Ko?!r%Zj1r&M!it9w( diff --git a/test/files/alpha.parquet b/test/files/alpha.parquet index d9dceb3a8d5cba8574963b86fdf795f4749b84f6..1aa73213e584f38536f2e5c811a3f5338847734d 100644 GIT binary patch literal 2867 zcmciEJxjwt9Dwm_NJ|?k=y0WxtSzM|+QrGm&9OKO68oCIrA?Ej$%pVmi9-JdHk 
zF~ATbjNxH|DQ1{s0Ut}O5MYfE5jKdiMS}DwZ@i1>(UH)nBVj;C!jO)H5giF*IublO z5+-yG$ebjno${_Qr6gfS=K#!EeE$4n<2fM-3p)SETts}xZyWlABrNGjSkaLX(2=mF zBO#bhyFP5bQD z417P-hSJrBQ8&DPL#bM3*^ccr%Z{$vTG{!qHW;maw7Ht)>nqhNy7?CM{NcLFb~p+v ZwqZ4sGE228<7DXut7s4^kKr!I z07Hy0#spJ5VTK4X63mfefecHmu*L>COljZPis;di(5E9Iq$6QKN5YVfgb^JHV>%Kh zbRDBO#|F!O%IPGn4-uNsZBtcX8{aahz#O=~~ljSmSFWm1^68F%6r8t@3E diff --git a/test/files/dataset.index.parquet b/test/files/dataset.index.parquet index ff4eb76ba0aa4c7659c81e4a8ee70659b4dfa096..6aa80b490793e3e7fe54ce526b9cf9c08f5b7f3b 100644 GIT binary patch literal 10009 zcmZ8{3sh8Bnr@Yr36J84P>_O<9t(khfdmO6NoQsx4Fn8$<)Iyoz2dHOcAeUA&e@0a zC@d#4H*~8LP`skZpfb||2?~jsw2cIvJFCT4j3&K;VjgOD8tq4t?zz`Bo$0%Jt?8Nj zZ8AOgu5m3*UDWjv7acSpS$A0F~<&6s>5!y~>(_@T$+nS8?g ze&gq_czw4X@+?`5KaU)L@A_Hu#&q}Hg1#Gty^l@wwgs)*)1@yS?-)7Jce}c2B)f0q za_h~x?pxJv+M13ftIxHpX_O~AvVXY#jq0X13d-(Gx+B%KIj2wF&1~*hdXKE^xINg^ zTGh6-viV@eeJ;D8mJHNy8>nd+Q%D(0J+_mKHeBuRn$4r3f(@JdHyobY zIe1>#TwcFtwDds#39%;Y;?DhJ#V?K~w(PFCm0NVMAbKEBGG116>+OcCqZ?)qq--u| z#D=p4HR}o@eGkwjSaa-RNA~L7$7&gFa^pl>?%(WgF6zyGr~BZ<#`JM^yJG$59HP_a z&tIQ&`n}Hw`@}v?7nSDGk^`gp$v-SUTt=TCY-kQrQdatKW=V2J{qU*0)=|27(jOkn z&puJSXDUIg38mK`zF4=e)-UI>&ABVL3}{DEBUzJ0uZ_}ZW=cyx-FZH`ZXjjv=!QL` z4e28#;%v?D>e)s5-NLqem)S&~F;~z$@^nY~rN)$Z14<;gwfOPY)e}bwtdZJPTVFqE z7Pg6f$g7R}&(|FsDBaWF&@x!OH`Cld*3g_8>CV$X$*XA|RIUYz(uZH|8!g>aUUYqg zVogKKK+VryD{2Xr%;vIZs~SGs7RnBW{%l*?OmXu-V^ih>H3J2K$!K%=u2-{buAM4= z5ZnCX&7Oj9#~!Oc(0*y+(Av*yQ-JlFD1 zl>@I|JuI%XrLMz&rv6?v`+32}W1AZ9Rj+>R>aN`UGNq^K%PD=ReBa$ydmesAPhO)R ztLV5rwO*_lIni`Cd*h2gXuDm}cB{e&tZusVk$LpU#`ou%(x2IhB%IUUMO{@#=y4qa*c$yX&)R>(7@yo3tu;Bv6o^P5VYmS_*)IQ)9c@_ z?7KCsVa@c4n(n=Y)=0(fns++x<~8@P>>D5Kow%s`*YwVvXbfCvyi@Jw=J!lgHz~Ed zhbq{HbGgCdmJ`JXc9$H^EJ-_9b1=|&xId+NsyOK(dNn`~oGRKI*fpC=XA`;}9;;iN zv2qCrmtE8Mv+RlR<=#)r_Z?l?9jIxV`)L1>i{nQwH{A=i`JZaKSzhKp(|vSo-$<}| zsI^zz=E!CJ&a*u?23Ov>vf)lJ^5&%T>r-v-SL*NY?b%wf{!YcZo70U$g^fAybR4`w z-7JpXwN?UU%oB{%}i4|2wZ9y4;lfH*YO_ z)X2?hLZqjU9;lx^zjIl6(ZRgNp3#&Gd8=mg@{i``_vf*}^2WjP 
znrr>^?0yd^?!8sn{qP~9<#hMm%X^0|IwL36t+=`K#r%nhvcInW*7zciXV*V_l6L*$ zxx<&2{rpthy~(Dz+I9D;*YCN|*szh^o}75`G7DU)zg^f9XjvCHzI%SK=}uu}w4!Z^ z_wYHnVqePzHgtAxa#2t6bK|$BX`txPk?E4TP2IVP>vO)o`b@z%j8F#8r ziaZux(9<7n&VF>_U|r*A`M&IYV^lo zD*H)!PuiN3*&90!PWRrLTz~s=_r(*v2UoTotB4HMjR#IMY&SI7vEuZGJ9AAq+5K&h zt)a?|>5p2t$i0hZ|3|G#^?K~G;=(S7-R+9rw8Mx_$3$K4k?CbWo~vKH2$kR;UOxNL z#P2@%$)3!TPv5S&URrV?Go1J&%_^_|-hlq$Xz{iFUFS!%q+xpaKzBYr*KkFj%QRLT znn*g+7&x;xy=L8ws~cJ$?fsylY00JW?B|-k2(DjIGd^Dtxm(e;<*Du^-lOM~F@31o zn5*u%G1lR)*YDPL2cB#@cA|HD>~iyi7q(QL3|Fqod5#4ho!D}vq_(C*faSuu>i*sJ zeW%Fb$(q@>>0nuI|47aCvf`t8`3-rE*WWJLT+WuANIW=74hLBCSofK8^2dihdjALV z#>I^37ADZ@#5`+)KZGv*Y8*ku#R^)TYN<27Z0^tp&QYxSyWr zo}1G9pWfJfNl$*R@h9i|lGn7|y*xiV_vY>Mz0D6zthlo3?(~~!@3JkcBSJ9J57v-e zy=S1Ydy-t;$$AF5o?W);`Kg_WP4s(t^|NDzdzy=z^XkK+^;bu04&+6}ZLXEkJ?BW; z%7z1@4F^Z-HjmLhU+wj^WcUHPrN89)^7?~sn|?tOkJd?qG{T=P!v z;mXxFH#N>pvx$r2{ezp+j<1{Bq#v#92>-Cxe>^R4TDPz7JvP1f%kp)1CJo%?{dWr;MgeHT0F&?3vto{QzD67k~E`U;g6FZ?+Vaw5(m0c&t8D9`(}F#VrH* zz32SN8AbhLg`t0ad4L}F72L>7xe=f}=Nrff9UR+9a!an47hXHB%Ah9aH$T~%U9<1V6kGASn|yjVJWbxpzLCnN ziLuDf3;J?1_9U-q8Y;Awy~6@UeYZ~NN3-9`ooWsp?|$Q>;@4~S{??u)-q)sYO!c0a zZhZ0d>+=y^8UPsa-=;eb=Y^KAmd3ImT{|b&S-~Q(xsS@@%YH zf2X|p=qH*y-8470K6^Y8xU~P^;HDSrR{6i#YOO5uc|0T>)~H5pZV}gs;a@FoF6XTX-{`r4pGjgULi^j)7MlZV+bSKa^=CY=k*gBAmOn;}FVhX4s^S+Dz>KdxUWT%tFb>c9JL*ZN7p!kkk zrfV2Pr4Ge1JIWbVsG?cS;5y;DEmLA}R~46c85Zf>P;K3gaHi_4lj1psvjex*U0aTG zqcdtonH(2EG%4ZC!ntGzk%@JgtTDI1}zj!c=i| z{FHFRiE`I4sfEbcIC{+0oM@ay>=+J9G)Kk>40Q(;`7o$SWg9=onHFNqWJROQin|tW zr)hD6SwS6??cb8oF4=t+;LCtjNDbs4H7U36=;FP!x1GW^K8#X0!)O8d_BODMg zq9mp<$~z589E)^Pgpg@~n8GyIVThTf*iIC9acCQotr3M%13%(6@X2gFOf2G1;u^9= zy?=jyQ$$6)kT2Jc+7!zb#x>Vvnxm;myG4oZXbxgbfo)?4F=T>+$&7PTp|&Qw_(BmU z*>r$K9UzQYEFPmvDwbPO1TjV}*;~HWbg;dPGaEKCh(uNW=Kh}nKHQO%MdVLYOsa7US!NvoxCUtBHh58I-iADvQ$u!~c!ZMnVh7ka@jl0rUGK&h zsP0-ZgofA%At*#$h2t_I=c^Z%AeG#5bsZ7bDL1i!Yv4K&&1G@~d&!hJ;8u+CcBa{8 z6u3lUG=*}*0=MF}3v5}A4b~(1ay){FB1G*rkjuhU=eRFJ#9g~niR1sEqy_*2)q>vw 
zKnM&HXT@<5@Eu3DDA4OrK+9spLckFkB`yO5M5hMm=P*u*qJ@bg69pW$DA21##2P_R z8gqya<{998T=hKqwKh8@LAG1mek9RWrgK|^>o1dal~(5omh48)Tut|_#V>$;(UIC0<| z4-j$?L#mM#7vB=33kg{GC#EAc4#bmau1qP^6Ci@x28?BEF0ipA&-g5>VWfc zhky*Yp-vb?S+Ux|ZyeSlOz<7@0hMPlAjOIj+C5mksBG#had z^#W22%rU4Iv=>;%ejLCAibZ?@P=g~20I#cCF5>RSpxYY9DhC%ZaRfXyN_A+4P#zq? zfwXXN1zHDIyJ3RFL%3`WF-CwPQD8}g${HjR#mC}MB>_ysjxd}><*rHaoE;P@$x&z` z8nguA;);+4m2<9YHWLCM*nl#EctAjKaFj{lS7Zj^KteT;Q`7OQy1y1h6`<5auyKBP5oU-WZ~!^p34%n017Tnw z^1X0N*d8F5Ljf5;OJeigYZYj@AuDFo#=;n*Wd3PI0XP6%T-*&qcg6hyBoL0hXL&U5 zP>w=Xh@QjdXH$u+NBGQhuk461=%FaB4A<4zb_Z*4GguqcdMj*q72aP5s=;QV^b1L~ zwiA^pxT?j~C=z6I8)XH;jDrBG*h3d;0jmLx>_AMx4QKtkVJe^!XVO?wD#Xyyyx9p{iG3eRXD{y9K9PHS7C-!=eNEX+IMP81qpG1sP%NaD=>ff$PjMyp_}P#zR$ z+UCL+;A*xl2o7};ifh1U!6qYZ#06;}O+d85Vj6W^T_9%%d>E5m+t|(&Q%JG~UrFLn zI)n%+g#f|Z;vE`|Xn+mG+KIwCnm~@sV>lPrVF#JMP{{Lt|Ekb(oI?TWZD5o;ps%9C z5@}qvEGA?#w01uzj;-Q27krVW<6remSmE>4~J}P$U#UbSCqEof?53HN4*}#36UUt~j04 z4iyA+2F8Y^4-#J%!mA={qD4W&fv9x2f@^lfV}b80|^RsC^1IR1KCg^cmxswGnkA50T(Jw`2*;$5d5x}8;FX& z0zxB9Ogq#PA*G@0LhdRWtUi>i-TP@Nhay0l5GvRg@=;C{HV(?H4X4)a5C$F+1HIs| zS2jw9^tZ`Dh@_^Q(a^@_AQAilBuV;4@>*or5Hfcr4U&uE*S}p-QTW&I;ZMy&J^2YG|zrp|EY>6y?ldx@CFa^f4}7OAe|a zv@J(N$$l6%GJHuoo9rW8T9KTGsvH3)VN*gk6ExkH{wXDggU%?$w2Iw-0bUY>V~{L@ z##R`^g|Y`tAS%cJl|vy#*o7#mai|u`ILa>MnFwp@Y&Rhy@EKsPg@ynojKTweQdMco z(d(d_La`ZsI7ks)ox4>nIj-G-o<_GrAMNLO#15)JX5w%}QG^b-l^qWIyx?^Vw0WI$ z>&r5lsZI@^+r*<~;W^MB!Ihz5i!zvxnYXtA>Zn!1gq!frq#b@376U5Z^@U=f>%qC< z2GMdN5||&rz{1Y58E>BIik1Vk2cF7>jfMr$un%+Ps&DwI9;U-5Mj8BsQ3FXa|3yj(PlvFAv&m{rf8&P9ohvkkgKM* zu`mNu5nkbbVaL$0p&3P6?R}~oP7}Rn6zZZWox1d3qE9hq{w0B!(XNVAB$g?cR^~;K zGgL=zvf)9W&ArQglvllwigbl}C_f=?IMN@M?N>Q)fgmRn3a6BNDX(-?`NeK`l>Bot_s6F%K{aTQp zDkxx3J2Y3IDNMKMpI}%qgiz2aJ8`HGiWxLL3WAD83emzfWFZ}*M8Ru?&Uh^E@%-H{ zFwwILp9gcyTqFv_KnvTYLp1HBrI2)uYZPM4czou6KVv#P4zGjp4gscQD&-_65XZi> zLi#u@58{b70G_Xtw0OAptwQ)%%7JjeV*W9YDgAcoTHKkVv350J4?Q8FY2RCx2fpIS zooG;{r&Bi|&9FTrwq5ixkWKXDCJFteAO2AF7CobL;5&>iB0_a$QtsVU3i-ysWjow4 
zG4%Yig$TL|zmIxlOH-+SB*O@OyQ-w>y2qM%%8z!RN~e+nU>D!cyY~8#GSc<^r z7dTXQHh{-cS+bs3Q5({P3Af%1iykbumL!Fn8j* z=(a#gbea(`8}kW@w%t}47hFaj(MN*sG0Xww=|eJuy_*1T=^8nBkhB)=6ouN+Fk*~R^f!hSNO~0t zYNE$SSB+cC*V*s%TmQ?hj#se{P$9nI59bcUSj|G?3C8=L${(h(woVC0y8AFCi@(#>wijWVyhTiz>1&&kS**^O%uAPLu|HTqQWq4 z;q5MD9{nynT(}J$ntT8LHS`%Y3YZ|=AUiSPl@JolYSETEacH<6X6WG!XLzS1`xD|a zLn_w-klg-yDz0Ms$Kq+vB%uOvM>C_+tBK^(AX-zo9fbQ$4jh~XV`Ee;o&SzcV^Vg& zrb zoKTNG?=8f95;Y194UI_uEx{+dQgf=08cRk3uxuD*N*n!kNGI}vb_q~wN0r93W&Yzz z6d@RUbj(mX3}B?!66?f4#QX-e8%+!vM8obH3gQP~mMZ-!H3!s#nL!2qX@V48mdai9 zOT+y>Mo~BYDrO6zKM^c6rN#c`uBBrpis^`VgU=vXodB;uzJy(XX(#B}k&{kLHOO`o zjVCuc+?mE=6&4Rr0=%NE>X%TriKQ>Qwg!mFz!;VyWVjbD7KX{dkR38B{UJr__7|fi zfq7IAbJLO%61sLR7ayFU+ADI}iNxh7(V&ay>z+(0Ca9~(vMzc@NzU?H_$Yx!d@w*M zO9Q%rocB36_~rIy52lo*VeU;u3l_u(+{0hyw5Z$&_Xm}^Q`?1~r> z<0BAc6TP7Uvxwmr8b{2~U_9^<7aBMhHWQ2kB=La(1bTj*l$YkQ$?um(G5$akVA)-Z zN@0ZYB=K<=W{xwhLEM*M4hZKqgti0&d59}YC_Femc!DHZOvMKnj`$0Yz$uW9-?sYh zqKw@C_xo2#|M~k@e--%p*RTG~>-nkIli|yHrp{+^?qv8p8UFl5!6led51V;VE3QL!2PEWZ`2( zPgV|gmNM}B5uSp51KymBMOhh1S^xRmuh%zcHj5XiCGf{r7Vh;DW=N_N4SHAq zf3G-nyYrF^I{~!{`owphVEP}Ip<%|f@FfeaUT|@lcw*Lnc_|G4`yY?>J+-D1??}Wu Sh=qpB@er@avjM*p_kRHp!K#S> literal 3266 zcmZ8keQX=$8Nb)|bxv`?Epg^R1-ZT@JLDH`a*)}x=i$s4Ru2sbkf>xnznR}#vdy*0+GRlv}r^J%s|F ztmFKCpU*!rk*UT=we|7ivBaD+=Xvvsa81GoZ?PIn9D#Nkc-ejjLiS3Ga2v@lGnxE$%KDIc7eDJqh%4Q=Zo5iug>>er7vO6olhX`ERJ?X*N zARk}-ads?=v)Sw(oCRf2&BAfIZLks<9K+f1>>fX$^S?8ZxV0x#4u0@f$0 zd$I@7PlO%&d_cp%(NTbRWTwx$?5RHkcHn>Angee@aBm(m@cTe8@2PW>w{-!^Ov}A} z?=-QHflcJGuKuRh{L+%gzrDqSoaQayeQ6P3qu=g;N|fdnt22GoiS&_enSo6i;oLSK zczy$tw94fLAbw#_!3Gv9lY;sQM4ljam z9=#SU&G``u4zxao@_`sh`?!7-IxqoQBqulb?>(BB3_QB?O!EAR^f!)W)^iA@`?)pz z;_9QP>YnRO)mIvq8=hCM!F3isc(rOw9*0+(e7tk}jXtqG&s~|VSI_hwZp+*xWU6P1 z4G#=Gw_f+q!-3kr&_JK>)RgC~Oo9XOSD#w>^pv;U(#6%$zQE-M9}h#Yn#T`wQv%&q z^T7D*wZ;o84Y%fLUR^k~%Rh9cxL(K%#pJ!Q?vtF9hi5}nbE1qYAp7&nEy}Zi@$zD1g1~lU{>1D@z?9XB*2vPA780-Iq>~1~fqpWOek@;7Y@vT5Uc13MA`7+9P8D<_6KuZl4Kf0AVMJ!7NrI~P9HWYLb 
z-H`+mRh$Eq6+?@^7%0kU0v(6~6;Qc~;06?W-w7_iu2Z->#DH%!%BkHmTn?>^LIrGx z>G2tEL^v0!I5aOLcL1pXi~x@|gqG!;C~OF8dWkB+BmK;w1%>KRM?y_fRG6X}1+=qI z)^w9dz$9?0m@h#DnnbEj+*{^u)?$q7cR)9vX*#~llX^&Y^0Lf!Wr@XQRqvVng$9A z&j?x0jK9nOJHQ-aXK;kNLidgm5e-__nW`IQoKuLTr&hz7sZj-$0tFePq-8rIX3$86 zO;8qO=_ixchy&w7<>q+sKL}eRZdr~G^cuipPAb3@4MPUZb__Z7ht%)G4wx74TQSQb zHg+m1miXUs3cm|r=Fp3wX2Z0s59gUIN{Kszu7cyiR;n3Pk`o78p^ae9gpalu#4IvJ zH5^03xdOT~48xS8smDS(w5)Ikr=q9$Uk37p?ohxQN&UQs6$p$LgVw@go?mRXuUlee zLxC11!jkZ86gtVSP<`~99eJ403Vbt#xt6^L90tP+GjBn+Q=4M3Y`~zBl9IkGm30#s zH5?+Dd6G9_HWZyXvZxt{u7u&Fti&&cnw=|rs%E(S^XG9cNTqU0hiH(8tO$P!F=v<^ z`gna8?&s`7pM+^;*f2*Q-&lk~G2n0PKH;;Vjiu4?uIp>UCA5OY_%*@KoEv*RL?N2c z9yM>Q26W=&&ahn}R3F>D)r49b(7dLNhZ%G^f0)b1zlRP+Ehh&vYY0yV6m#@jL28gx zDwr>n&~Lcm=^FkOOjGI~QAnf+-cutSi9a)XG3b`@bQdS#lN=FV4H%~I0%uI$4-^Zx z@E?a^AC0&|+q&aH<`}8oK*fwtr+ye_A~Yr%2ou>R4%!zqaACR@)J;)%D@b)IZHI`W z4Wna$JQV@~YK-p3??+)6+%j?EH$fHu7}f|C^XS){KK_@WKFp96gf*S|a9Rs8mg75< z(20(yYK95xEO8-zXE=w&A`#YSKH!S+qv_dzBcm^mVg9cn%T-9~Fh`Mz5z+}QloPn1^g(8zr-jjqW9|5*uMVM-8IHYP7 z?Qahy71*?t0rM5p7A~ZQTE^0YyP&`ttb8bR^9I3GOSS-HJah6GCy{yn&FN=h_fZkr zM28~1U5Qx^!8sTY2z-E(-@O;wR3WfLsRrvvp8jC<>jhOT%0z@4lWI`2f}yJ&QK6C! 
z3tv|hHCNHFNi|j}n!0SrrdhO$x{F<`HFsCVfGwiIgL4fAuz*57M6@ zt_4K`?OS^TI;CbLz(qoFU-V1hw4h@(&T$c6^E=$B#BxUB+OFQ-zOg;IDY!8bYLDF9 z{)J}52S;tky5Y4cxb=Rn%kNSx>WAAe*S-74-FVZ>-5Vpm_Q+Z={IAO!hW%~* zaMrX7fEpLQ{(i%9?RCqx_q!ng*S;@$OVOzmbW*w7BDx_%5Y)p(H+(L+3WFYs2Abcv=&he08kFfG*{@UEi264-?b`qEDCd|&AzHyKMgtE$ W@E~ZkfCh$6B__8Waa=$Adif95x!L&u diff --git a/test/ngrams.test.js b/test/ngrams.test.js new file mode 100644 index 0000000..31f531f --- /dev/null +++ b/test/ngrams.test.js @@ -0,0 +1,48 @@ +import { describe, expect, it } from 'vitest' +import { extractNgrams, queryNgrams } from '../src/ngrams.js' + +describe('extractNgrams', () => { + it('returns empty set for short strings', () => { + expect(extractNgrams('', 3)).toEqual(new Set()) + expect(extractNgrams('ab', 3)).toEqual(new Set()) + expect(extractNgrams('abcd', 5)).toEqual(new Set()) + }) + + it('extracts overlapping n-grams', () => { + expect(extractNgrams('rhythm', 3)).toEqual(new Set(['rhy', 'hyt', 'yth', 'thm'])) + expect(extractNgrams('rhythm', 5)).toEqual(new Set(['rhyth', 'hythm'])) + }) + + it('lowercases', () => { + expect(extractNgrams('Foo', 3)).toEqual(new Set(['foo'])) + }) + + it('splits on non-alphanumeric', () => { + expect(extractNgrams('foo bar', 3)).toEqual(new Set(['foo', 'bar'])) + expect(extractNgrams('a.b.cat', 3)).toEqual(new Set(['cat'])) + }) + + it('dedupes n-grams', () => { + expect(extractNgrams('ababab', 3)).toEqual(new Set(['aba', 'bab'])) + }) + + it('handles alphanumeric runs', () => { + expect(extractNgrams('abc123', 3)).toEqual(new Set(['abc', 'bc1', 'c12', '123'])) + }) +}) + +describe('queryNgrams', () => { + it('returns empty array for short queries', () => { + expect(queryNgrams('ab', 3)).toEqual([]) + expect(queryNgrams('abcd', 5)).toEqual([]) + }) + + it('returns n-grams from a query', () => { + expect(new Set(queryNgrams('rhyt', 3))).toEqual(new Set(['rhy', 'hyt'])) + }) + + it('unions n-grams across words', () => { + expect(new Set(queryNgrams('foo bar', 3))) + .toEqual(new 
Set(['foo', 'bar'])) + }) +}) diff --git a/test/parquetFind.test.js b/test/parquetFind.test.js index af1d857..bca8cbe 100644 --- a/test/parquetFind.test.js +++ b/test/parquetFind.test.js @@ -12,13 +12,13 @@ describe('parquetFind', () => { for await (const row of parquetFind({ url: 'test/files/alpha.parquet', asyncBufferFactory, - query: 'akk', + query: 'aaakk', })) { rows.push(row) } expect(rows.length).toBe(1) - expect(rows[0]).toEqual({ __index__: 270, id: 'akk' }) + expect(rows[0]).toEqual({ __index__: 270, id: 'aaakk' }) }) it('should return no results for query with no matches', async () => { @@ -38,7 +38,7 @@ describe('parquetFind', () => { for await (const row of parquetFind({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'the', + query: 'their', limit: 2, })) { rows.push(row) @@ -66,7 +66,7 @@ describe('parquetFind', () => { for await (const row of parquetFind({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'rhyt', + query: 'rhyth', })) { rows.push(row) } @@ -87,12 +87,12 @@ describe('parquetFind', () => { expect(rows.length).toBe(0) }) - it('should return no results for queries shorter than 3 chars', async () => { + it('should return no results for queries shorter than n-gram length', async () => { const rows = [] for await (const row of parquetFind({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'th', + query: 'rhyt', })) { rows.push(row) } diff --git a/test/parquetSearch.test.js b/test/parquetSearch.test.js index e33a589..527deb1 100644 --- a/test/parquetSearch.test.js +++ b/test/parquetSearch.test.js @@ -18,17 +18,17 @@ describe('parquetSearch', () => { url, sourceFile, indexFile, - query: 'akk', + query: 'aaakk', })) { rows.push(row) } expect(rows.length).toBe(1) - expect(rows[0]).toEqual({ __index__: 270, id: 'akk' }) + expect(rows[0]).toEqual({ __index__: 270, id: 'aaakk' }) expect(sourceFile.fetches).toBe(2) // metadata + row group fetch - expect(sourceFile.bytes).toBe(5603) + 
expect(sourceFile.bytes).toBe(5617) expect(indexFile.fetches).toBe(2) // metadata + index row group fetch - expect(indexFile.bytes).toBe(1084) + expect(indexFile.bytes).toBe(1174) }) it('should return no results for query with no matches', async () => { @@ -50,7 +50,7 @@ describe('parquetSearch', () => { for await (const row of parquetSearch({ url, asyncBufferFactory, - query: 'akk', + query: 'aaakk', })) { rows.push(row) } @@ -62,7 +62,7 @@ describe('parquetSearch', () => { for await (const row of parquetSearch({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'the', + query: 'their', limit: 2, })) { rows.push(row) @@ -75,7 +75,7 @@ describe('parquetSearch', () => { for await (const row of parquetSearch({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'rhyt', + query: 'rhyth', })) { rows.push(row) } @@ -89,7 +89,7 @@ describe('parquetSearch', () => { for await (const row of parquetSearch({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'the', + query: 'their', })) { rows.push(row) } @@ -99,7 +99,7 @@ describe('parquetSearch', () => { * @returns {number} */ function count(s) { - return (s.toLowerCase().match(/the/g) ?? []).length + return (s.toLowerCase().match(/their/g) ?? 
[]).length } expect(count(rows[0].text)).toBeGreaterThanOrEqual(count(rows[rows.length - 1].text)) }) @@ -116,12 +116,12 @@ describe('parquetSearch', () => { expect(rows.length).toBe(0) }) - it('should return undefined for queries shorter than 3 chars', async () => { + it('should return no results for queries shorter than n-gram length', async () => { const rows = [] for await (const row of parquetSearch({ url: 'test/files/dataset.parquet', asyncBufferFactory, - query: 'th', + query: 'rhyt', })) { rows.push(row) } diff --git a/test/queryIndex.test.js b/test/queryIndex.test.js index 76394c4..3f8226d 100644 --- a/test/queryIndex.test.js +++ b/test/queryIndex.test.js @@ -3,9 +3,9 @@ import { asyncBufferFromFile } from 'hyparquet' import { queryIndex } from '../src/index.js' describe('queryIndex', () => { - it('returns blocks that contain every query trigram', async () => { + it('returns blocks that contain every query n-gram', async () => { const indexFile = await asyncBufferFromFile('test/files/alpha.index.parquet') - const result = await queryIndex({ query: 'akk', indexFile }) + const result = await queryIndex({ query: 'aaakk', indexFile }) expect(result).toBeDefined() expect(result?.blocks.length).toBe(1) @@ -17,17 +17,18 @@ describe('queryIndex', () => { expect(result?.textColumns).toEqual(['id']) }) - it('returns undefined for queries with no trigrams', async () => { + it('returns undefined for queries with no n-grams', async () => { const indexFile = await asyncBufferFromFile('test/files/alpha.index.parquet') expect(await queryIndex({ query: '', indexFile })).toBeUndefined() expect(await queryIndex({ query: 'ab', indexFile })).toBeUndefined() + expect(await queryIndex({ query: 'abcd', indexFile })).toBeUndefined() expect(await queryIndex({ query: ' ', indexFile })).toBeUndefined() }) - it('intersects trigrams across multi-word queries', async () => { + it('intersects n-grams across multi-word queries', async () => { const indexFile = await 
asyncBufferFromFile('test/files/alpha.index.parquet') - // 'akk' is in block 0, 'azz' is in block 1; together no block contains both - const result = await queryIndex({ query: 'akk azz', indexFile }) + // 'aaakk' is in block 0, 'azz' is in block 1; together no block contains both + const result = await queryIndex({ query: 'aaakk aaazz', indexFile }) expect(result?.blocks.length).toBe(0) }) }) diff --git a/test/trigrams.test.js b/test/trigrams.test.js deleted file mode 100644 index 4a0d3b3..0000000 --- a/test/trigrams.test.js +++ /dev/null @@ -1,45 +0,0 @@ -import { describe, expect, it } from 'vitest' -import { extractTrigrams, queryTrigrams } from '../src/trigrams.js' - -describe('extractTrigrams', () => { - it('returns empty set for short strings', () => { - expect(extractTrigrams('')).toEqual(new Set()) - expect(extractTrigrams('ab')).toEqual(new Set()) - }) - - it('extracts overlapping trigrams', () => { - expect(extractTrigrams('rhythm')).toEqual(new Set(['rhy', 'hyt', 'yth', 'thm'])) - }) - - it('lowercases', () => { - expect(extractTrigrams('Foo')).toEqual(new Set(['foo'])) - }) - - it('splits on non-alphanumeric', () => { - expect(extractTrigrams('foo bar')).toEqual(new Set(['foo', 'bar'])) - expect(extractTrigrams('a.b.cat')).toEqual(new Set(['cat'])) - }) - - it('dedupes trigrams', () => { - expect(extractTrigrams('ababab')).toEqual(new Set(['aba', 'bab'])) - }) - - it('handles alphanumeric runs', () => { - expect(extractTrigrams('abc123')).toEqual(new Set(['abc', 'bc1', 'c12', '123'])) - }) -}) - -describe('queryTrigrams', () => { - it('returns empty array for short queries', () => { - expect(queryTrigrams('ab')).toEqual([]) - }) - - it('returns trigrams from a query', () => { - expect(new Set(queryTrigrams('rhyt'))).toEqual(new Set(['rhy', 'hyt'])) - }) - - it('unions trigrams across words', () => { - expect(new Set(queryTrigrams('foo bar'))) - .toEqual(new Set(['foo', 'bar'])) - }) -}) From 17639ee16b79dea00f80db6994684e6bbb7ddae4 Mon Sep 17 00:00:00 
2001 From: Kenny Daniel Date: Sun, 24 May 2026 19:09:19 -0700 Subject: [PATCH 09/15] Doc references trigram -> n-gram --- README.md | 4 ++-- src/parquetSearch.js | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a819fb2..2534ec3 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) ![coverage](https://img.shields.io/badge/Coverage-95-darkred) -Build a compact trigram search index for a Parquet file using [`hyparquet`](https://github.com/hyparam/hyparquet) and [`hyparquet-writer`](https://github.com/hyparam/hyparquet-writer). Queries are case-insensitive substring matches — grep semantics over a precomputed index. +Build a compact n-gram search index for a Parquet file using [`hyparquet`](https://github.com/hyparam/hyparquet) and [`hyparquet-writer`](https://github.com/hyparam/hyparquet-writer). Queries are case-insensitive substring matches — grep semantics over a precomputed index. ## Why? @@ -39,7 +39,7 @@ for await (const row of parquetFind({ } ``` -Whitespace-separated words are ANDed: `'foo bar'` matches rows containing both `foo` and `bar` as substrings. Queries shorter than 3 characters return no results. +Whitespace-separated words are ANDed: `'foo bar'` matches rows containing both `foo` and `bar` as substrings. Queries shorter than the indexed n-gram length (default 5) return no results. 
## Ranked search diff --git a/src/parquetSearch.js b/src/parquetSearch.js index 0fc3fe2..a21571e 100644 --- a/src/parquetSearch.js +++ b/src/parquetSearch.js @@ -30,7 +30,7 @@ export async function* parquetSearch({ const { blocks, textColumns, sourceByteLength } = queryResult if (blocks.length === 0) return - // Sort blocks by score descending (more matched trigrams first) + // Sort blocks by score descending (more matched n-grams first) blocks.sort((a, b) => b.score - a.score) signal?.throwIfAborted() From 0e1b181c101b13a23c36a6b0eed758f4a8138bea Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 19:18:24 -0700 Subject: [PATCH 10/15] Drop default blockSize 500 -> 100 for better grep selectivity --- src/constants.js | 7 +++++-- test/files/alpha.index.parquet | Bin 766 -> 790 bytes test/files/dataset.index.parquet | Bin 10009 -> 11448 bytes test/parquetSearch.test.js | 2 +- test/queryIndex.test.js | 8 ++++---- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/constants.js b/src/constants.js index bf090da..acbd67d 100644 --- a/src/constants.js +++ b/src/constants.js @@ -1,8 +1,11 @@ // Version of the parquet index format export const hypGrepVersion = 0 -// Number of rows per virtual block -export const defaultBlockSize = 500 +// Number of rows per virtual block. Smaller blocks improve query selectivity +// (less wasted source-byte transfer per candidate block) at the cost of a larger +// index file. Measured on Wikipedia: dropping from 500 to 100 cuts source bytes +// roughly in half on absent/rare-string queries while making the index ~67% larger.
+export const defaultBlockSize = 100 // Row group size in the index file export const defaultIndexRowGroupSize = 40000 diff --git a/test/files/alpha.index.parquet b/test/files/alpha.index.parquet index d3cb0c8834e4b9a2e1c6ca1f7064c22e0db6b158..2cf5ca4272a1234aa02cc5a15cfb215ce67418f6 100644 GIT binary patch delta 135 zcmeyzI*n~Z7Nb+0Xhw|a2`*6v5WywNCd$CTaDt1`Mx=p}WeF<-BO?nV2P2yR0|O&N z11l#FBT$-An1NALn2}EhD8|6hFnJndr|L9Dv1yEI=U8pb7&KVefWi_iK!SlmQU+)w dmnajEDRyVF6O$36;p9@LMvfSu)&R#KLje7e6&?Tp delta 111 zcmbQn_K$T#7Nbs>s7;LM2`*6v5WywNCd$CTaDt0bL!^O`WeF>T7bAlOBLf2mP?UjT z@*2iY}lbYE3c|Ld9JtGhdQ{$xN6_AfE0< zYN3g>c9RfWt+ofNRZ(fhdO_X$l-9e|wzk$@Y)$oUNCqwBO~LwS@BQjZ-04U ztwQ}r<0}27T)=DTjtnRH!iPOQ(jbgI&^(U z%at4Whwp}pXSlk|&MT+!DV85!onQCbhJo{rWqVI9JMCpwRyW+g2)8+X?r`Qz;(85{ zkLs7d{{Eh&FKusNT0fiH``Z)6HYYVgr>pz5pC-23tCoG%06kTXGc^8Ed~Sd2js7*I zU&5>^_b2w#b<^&CzPVzkb*Fi+r+An7KNtVuxAg5h?}(?`W?wyUzfRx$$G@8J zf^RM{aft1&;d;-nxEM1`**eR4W%s-5=MH?g#a=n9>Z=-$ZeGLJxLTjR9JW%9yV3CC zn{~-w7Y=SR5B|&T=FwxG-bW5JFL=488n;N!GQsDfmlz#Y_%C1FZ9doY`H_vyjW6Hd zZcF*&ZPrz@j%}DS?s)zpClbo&E!#A3I>2_wv+7 zHtrNJt|@s1prTZ|+%?sN%cyz2ZI0b)m3+eKM^}B>Vf~dH zR^ipQGRi{N))ZQ;G2CK#erz7f)#nDjKbCx%NFKaJ6s}upHrZ;b%`*u1(lS#a?XI#( zbM5A8qvt`J)mW~T z_37q0OAqF4p;k<+cwc+urKZh3+|qWxxv`Zie{S`*?)#**a)>jo+&FE|VE-w?{H4=~ zmYT|p<+xR>cGF>tyYyUSbg{6i@#66>mXFsN-P24;9*ypc>{?!*%v6})3nrH5n_GQ1pV)D@<;&9zy=UY4 z*o+p+D;4i*$*XYoTi^24)NMK$%9(d@kEtbp^u~CF`QANU7_JJ_7F?q01}xgL zLLcEWSC)2WwV)TH0#=YPPf}FwvjIL82QJU+|t@A6E0QZ zxXWTB$}Wv|T>G@YW%ggUn=i3f`dju2zkcyawBbc`wfWn{8`Haz`;T47f;lSPH8rRX zS2fCJ8LwOCvk~(Py9Ch|<#e^X)W^+x?V0 zUa7F%dz8eh3URX=3d@(5)L6}C++F>=%R;+MW;ana!|JwLai7H{TGi{wY8*Ejv&c7Gg&%zU=CXp9=Dt~-_|fW2+cnFM z65IP{9lwqT%L#A^_*pw#lrcfk@$tCp^tNS>fAVJY#yh{A!_5|(+h(aMx0#GJHKet- z?xP*vjjI|t!|{P_4;Mb&csk}GZu=Ss$sfK`?)_pT`-F$yBYGBWswOwJTusk9ez#P1 z;che4X>nWIoIsjtGetN}HX~)W`b=V*n>4P=jNBV|@xmwHdk){&n*8b3QGH#T)!kOw zSnad9OI7pTQ?GKBiy{>ZK6+{aH=}M~;h9Klov?Lu{g|^TV=J3(w=@nEU(oEj>C()D z=7rlzRdp_((QLH6du*U2AaDB{arf+k85G{>suwU9W};I<%>`;!tX!6edg_#QO 
zy*E}QH*eZ#v2mYSdtcksQettvc620frvEd0?^EaTJvY4P$A9o#c8fay$qfrPUM^oA z16{bs;O0MG^Wpg7?mh2vIrBpLH`o8@7@lHTKXS6Ha`EiXFZZsu?73Fcm9FXeT6?cw|)@s2m`B22b|c|J{9jln@J@t8bAHFwS?g|E_g z7N*BTiRkLP8;@_lViQNt3j1QsZ&mJkyY1?^bYT6RqY(odd~E?QHYf4Xjn!g$P1Y-S zw6Bbnxbypa!IsZMBlh~=YtNUSoqPT0zP!RGvm2!iXNm1^HRKb^XWEv3bvAtc{kEqz zRCRqHyssfOW*1RiLkY@4?^3I^pzQMtqdy(pTY6-mqWyfKF+UmSu$as?v)S6_vsjIG7ZoK3?(9C?dP08d!V=Jw)}d!kf0>@YYcKk@_e#&MnC6gkX7pqi$-WPE7o1|guLHDNYrZD!-S{S-0y)56A=hOT7lroMN*>Ma*Fdh5HFHhA+O+*}%a3KtZ=Wn%R@d!Lcn=&7je+u&$bmcOd$$$E4z8?Emn}HdI1@tfl|ZJ32(z)Y}7>*y9)#Mp%h}gouwzBFP+*(?W zFQshmn7hL-JQ-Q~1s; zGE4E6N!U*L)7_!`Ea5AAqTliAmdndx#WVM>+-=^F*siy=ZO3Qdmlg92lc&w(HVc`B za=yc5{pq|sdnDa^X5gEfr|G)nR$|S>p~A)sZ4@1*msSCf z*|>`Mh#5*!i>d}rE&g5e;7`J(<_{y9;{z>2KMvOQuOhBkr|x~_YJbMgzriRDp3Iu6 z7^nEh?$qdn3Z>3=k4+V1hgRzTIEO=FM2UzN8H(d|^lFBpS;PoYSwIqE*(k>dOjwj8 zM)%>r!Uq%?Ah`g~Npb`+AztK#s2G-zAds>^BAF6MnnA4Mwf!1k-~D? zm1A$G5;=`CM2UsvLAZGY2^8sLQCJd1Ugi+Vi8SK;GRv?Gmr_oA?2)l|WFB#pBt`v5 zU>RQGWigCIMiMAlgnv;;79tGGlA?%6fftcX^Q0*Gd@P%KcH*jsiiKJcvG56>xhX>!9#-*C z<8O;uNg_jhI4ChblA?mBkG~ZSG89iUaGOAwB?U6dQ0*vnYjT;tFJm<$Axah~MwG%N zhbV#N1r+5u+P^cRI8eaCn1qmS$qI=HAW{la62l=;Kr|OZysW#W0vKe@St6haw30>) z^ps*qB#k_#MJ(-?f^6RrZz`5o7#mr;Smb%39m%}Rv#<+=YMTp?D08%+*tb{!#E3y& zl-2;s1Wq2QFi;XrA^~6?$lP1W@C+w%j404ZARn6_M6e!3kswK|%!qA76k@4L+*TG z2`z_09FRXKiV`1T5IM5YL~{USPJ~m+K7lblSPoNofhY|2?s5+jES zB!;JHq`-0n66kT+By&svuqIMK6khIpTgB3%?pTQ=3VZ-CR(r|-eC0)WVd%t&O6H}R z|Cu4_Zt6bPN=S?{9I4=!kP-|0mw4LG!UO=7LU@h>{Qk(`Wcrz-S0YDKV|&n^urhB!xlj0<1&@Ili6M?S@-i)j}^?5zlqb(Wai& zl*A5YX(PMWT%?om3Njr)fe^_<+j&3{G=z~@ z0n_KGNRiF^Q8k^7AQmRaFxYb{To5CqpOs-%Py#9jU{6sN6=DYJJFBH+Is$tsZl<(_ zIk6Hr1PE7v!~U6F!1-CYMzLrGbb1ZqC8-^W0A@MF078A(#jL)4Y=EI;6e1-qAi}ZR zWKsbB`DH!~BvfSPy{}TzBH5{BI5rwZbmtQQF_2$bH%CcEuwz-}pfLg27h*V0#9o-# z@D2mqA=}BQ2#jL@z|f1KK?)yKxLf!K)5N(>M%Dgq~Kw=NCoB6fEo&5L1}3!EfG zihPI#X=BDnEQ=CGPZ)h{=^W(ulL5f16yzildtm~(p&$?i7E2>P=d{C?!%tfosUwX1 z36qbw5$g0um+D2az-<}Jzs z9)iiDW)mfIR6DpoABOu-*eR8XVt6D({Jaokz`usyQSuyiFe@X3{Z3_KfW)mFa2FQ0 
zqs|{!a4gFqNr=Lc*@-Ni7==YKi0NU#hACG2zrqM01t2&q@Q+0y5w$TiH5M|!8aN;x zXcGzk7U|rT!yqw&eUhz6tNchDURnSnV55PNzgMQ-nX*K&I`#7ubpU!V^73Gx@2`nI@a6^7^LLTNz> zyQi8?itQo<48)QTmjL8~a1#zYJh4l)LS!N&BoF~HEGI?5W<;P4b}3646o;$lgX1E) z1KtghYY@=uiOCrxc4A^jsVps+l!Y6{hqr7#K zz2J@_V%m`q9*b?zBrqc~(t&v}dVzR8*HLw?v6 zQZ2S9OO-k@)%aM61nT1h8^X98Ofz9RiGcB8fO%&ZSj68|Ij|*z{fg!A9AdD}JWz(K zdG{WLC^42knN;z#C6(nQ3or!V0M201iC&gxs}q)-ZVvXe*#A2$76MeBK3B>dA+yRXH3Ah_y4csTUs# z%`}JsIUME%nwM%_xe#nnm~4f8X>5*q^bIYfsW8OyVSi4lbb7gF2~aT1!;nQQ_JRC1fjDBppO~+chPS(*S-fIXpR+gN%YY|3Svef$R`j z!DL`ivoyJoq(UNj4q&@$f|e!O@f-~dajQ}f>B0|dB^YOvVzBAixnK>vsQE{sHqZ() zjfSwr6G|YUOKdn8?OTeqWfi3Uq?@hF0WOo+`~n4C5N3GDI2c<{Nd?Jvh9&_Y0+Y(p zm&~q25YJ>NngL_CJzYsdP^Up|c|SvqESc_uzgRJZL~MKkrbs#U54j+dickXfd0vhL zGEDy49G(io{R0AswZLI(C*_10DvTY;&jn|Qa#(cIbfz6T4%8B`_1YZ7`2E;lvP%GJ zpdw84RBBjnSeT1gKSN^=PgWv7_M}P$DJ<_LRp$0w5gf~xSERN=u4{)_3pt$^T4l(r zJhpQ3NKR#{wA45rD1%MO%Sb(0I;Xxw7D8kQvNpD$Kn1Z3`yp2qMl^X_3vopTPr+_! zm8iY;zovJD89&2wZSwH4$^fV~L`=X%kg2_iZBeh~Q|C(OD>2uU+|+}Hs^Z)b2X>w3 z&(DccvJA2$1UeYgm%+n{DFYYfKP= z9E=h$G|Xv1J6Z6ok&iRl^HM{G+WCvYS|E>fkdy?0h6^#+1I3FW?I8FH0^bf92C|2Y z;W=6`HVVFuV8`-v1cv2PbBqsX<$%+Yc`uiAq=H6$?7KLfJ+uKFbaYHxVL%(jmQxT8q# z>FIGm8AWjNUUjmk3u=iwjhZ5#qn_}@wF!g9<8>fscaJ0OXfoJi#sr~;HBF9sk3r+| z*)&GI1~ze}>7FjHS7$e-<66{(!;&IjueZnH^E&iKeS(Z@yAAbrdws%-8+`=XqicdC z>Y{pMLYF3JFRs@Z>wQj5T8Gp|eY&d$_SM>9NjGUjhP2v8_r`mm12M0zsjDll(=^eJ z?zq9;ARhTy_+^bGib8Q z8;^Gxx}nG2j(V3JT4zX*J%krF(RFKKfV4xO07w|@$vBOt<6UGX-$D0yy)i?R&807L zAsy1W>V4{ToQ&h$L>wlNm?=Pq5@fF-X=u_VbRMrZNjQ_bH>SHZNo|v%NMncocRTG( zYJIo1DNd+ec8^n24`6aNY4wQgiqmR9REJJaxDaV;qIC&%5#ci=bjcnyfZ3+eCu0Pj z&g#8q?IIi=Ljra}>ZTr&b|&HmS2wA)+w}==SFg7FJ_&l`&NQCrG3f2So_L(J>Fbkt zQ?D0l?Z)ftJpig_3`Rl&?zk2Xousvm zp197~6z|m~0k`&UheqR@`N}h-!AJJQy|x5kK#K^SCecgazTP-q4|7vDb-}sgUA;Qk zB<_VB(po)X_u+^J%scchJ3Qn_y=|<}UpE((6jhd~O4K<;YOF|`$&G2R`T@;#v*z|T z1^lM|k5Om+Wt*Zz{Xmg=LXmd7kL9V>disCY?4G3AJ*f;nUR;w=u2jHnwaT$4C>cd1 iAQKbR55S+A9h|DeyP*x!cXcysQTg! 
zeZ;~#-I~?#Z^PO5?_TrWn`~Rj?7U~|*gD?PXc8AEb6+{@9~$jkEOic?X(FFjE^gEOe3gM{KndkFpLnPsTQvA@S&JHntm5_kF#8m_$E-7;szmdu(x-8Fp^hkCCwd-5GehjY8S zM`2B9D&@wZ6C>HL3`fl}JN3)MkL~4sHiH^P- zS(gtOin#@^)E+HQR4y0@Yy_SkUE(c#L(p&adSuJ<@|7Pn$+ zd^k(St^9nZYv>t&;?1hq_l%6&v@d&GUBP&NrZ`l#eczc2A)l>L+Xx-0I&r;R?a4jb zU0K_keJt5`Vx-cQ>~6DipIGg#UgnN5D{=5h=Wy=P{H(h}7}iwQ_Sk>@R#vSkXD*%W zEUElxzr}2_e0zVwbhfLf%9;F_y(iNc_PX*9A2HkST+V(Rw)yS(_RN<8TOD0ZH^(nL z)AjXa=yLkE_^)rB-0<|qZCh)t-x4o;QNOg~mm7A~{;8tFd<<6>ds zLLqM~a4vo9JJnzR!MrnZS6lj7ZedH=(#Y|yP;cJSY~AThm$u0Ko^NQ@dB1M^x!y5z z)&*C=miml#^A7G|*_ioWZ0yxGb6-LKQ{$Gm0kHMv`!!nQYA?s8=i6tJt8RBWP7OJF zTOBE7j_bKQqqm#-jhTsN+&P?6n~C?d=H5GGuM0EUKK_ZS4Fx01$;Vg1A-`d4d;0yB z;n@=_$^80)#=2`&p&r?nMyZh*N$d)wdV9C=On0h)mYWn9qXFN zj$Vgv8*$g=tYgN*p}BNC7um9Yq@f$FJVP=O*j4HC-=M7z^4Bq4(swWfk2o*PJVCH`~dX3I9-O z#nKIKeQi$nJ4a5=I%9r#F@7_jp5laNPxf20Zl;rj%=qZdxvn0^-1S58`B|#9s(mpJ z{h&1T!q_UE?(o0qq7VOKs&6*_Hmm~EGDZxX4lxlBsON9?4Qh;-`$oTRh{}m{v{Bk-xWvsa^X752JXAHDf83nGkDLAd*GS^i9o^;Jz4`9G!sC{k zZFh@YF<0f4OvkyY+ABVTor|@D?zjwNRj(H18FnSgM(WsYcoF8czU#BY-HB;_{`pt3k^lvQ+E;L=JCC^a?0#}4KOoP zrQNNL&daDTY@fS?d-Kw}hwOLrvQJqvDy>y_FXe>xeK%!3a6(tZDf@X%F}d3yJ@+MYjLyZ8-hYL!n% z+vX>@?q}*q+GP^uCe}3_Lhil#Vrd!)rCeJ6lNA?!DHQ`89?I@iK zwf9uDh0*Opq`jwQXZ-dTCk{n9@tao1+=%U{E8AstGz>d#58Jz}Uag)xdHCoQO4w4_ zHC(9MFJm&uP zfzoYE(a%u1bD)sFmzg)v|KV6D>>QXsllsi|lh?@N{NaJ#_QwG^^v^%+e!Az+yN~~I zp>zG6&W*qC{L$vFub=BX|J+RKo-3zo0~zmMpmzmc{r&jEFHSaG4QPb6zthoIRB(TH z)%+wGzcJR`yC>mn#r$sWRFS{o?GD4)1mhJ>x_j*O5esqHQ^isx+omn%<4m6zAmq+%PNM}#Uixb&1Jvi!Q?%ePZz}l+StG2u6-ojni znWK3)VWP4#-+nZF=x!IT{_c;z`_*sHKUe`-+jwwS-Qdakf z&GO%`_u#&W%zMeP_l&qbbiEP{;ogx$C_U$HzU|I+X0A2MenFQy{rsr&;rykA;r4|( zCHkuS&Nbgx`TSSEuGzbzW1+BfIox5kAMc+ao6aaPS6U(EShsWz6*2)%_82f5bmjhA)4ezDieLQoWS#I`s)lPdevE zs?B3=)+jm=PtlBz9|rB#21$#d1kam=SpF!?c7M ziO{?t$(T_e#irzX;bs;S8hH@!5qMb%V9SkiK@3Vh8H*xPM1&PuM2scp(WoehY7wX%Q?-1hq3hup0JIqY|F@4jYsS??y7gJa(f7 zEE1Z)8VsL;h!vKVum^U>G9ubhI3z;{WfJtsUKAjN5hRaSW(hBQaU-VXCc=AI)F41H 
zv`iQ^l<;aHl11c^WSMv{<3+r72k~&2=Lr&6fy0Vc9}#3llo2b!_(W|h)sVQ6!WhAW zxu8fSS&-BniCjo)jX*?1ctO>7gM}b8)GN`L7oj*=l%e}@ zqt;PYQAAiI@DgU>s=x*X5q8($Bk&qDSQ%~Alj+Nj%P$nE2s^_sJVut6ipvWYE0XNWK7K=ClJ!UY0X43>=XcP3F zW!$ot#S)9vpBmSN2rXkt5X5Gz2#Ub6f=FX5aR^Ha&xnYYJt9C&@&eos0ysq;CKZI1 zGYE^o1P{`mvk`&t5>F(N!T5P-BP_r{EXp#g5aFB7%U(Dj^bR)O{iqnI!t#jK+EN6^ zWf4tyT5Eg*5`)b!axYAu5I_zL@tdLqxSdD>1L%VBqQG*S>Q6H6Wf&g7&r494(jWpz zpgx#4V1~Re>RA0iDuX@1Oqndt#UYw=3)8#aI2a&8GxQ9$fC5=^K!!C?3omehII%%$ zu|29#KTmowG`%`oA_w6Lk+ca}Vj0Eb6(tGbX26aJPY^hy0OCYJ*58^AKYC6KGQc?4 z(90176B9f#@C(}8&;Xc`F%6}%EP%wz1M1-x9)1pr+S~-jLLC6~APHv0LscZ?0pLkA zuuD)hZm^7Gxt1zRqTm6rmE6pBW7rHn2t(+WWe^2`vdlb?l=tia`u4g_(ZK19zZ}sNK>d8(`Y4UAmf{m|ag`~<-HTa(p@L)v zz!-o>5dbKg3R;b2A4rLZm1!?8 zG_#gb9WHrL3R2XBfy;m+wat{eWq*_@Ab?H;2o)1!gpoIcGA|hv7BLC+ktPP@mE{9e zz#yytV$!Gtyph<1FC){~0A^u&giv!)m7)|tZdQSYd1aug2ey!sZP7YBc*{g93}}d` z|DBHi(q^E89!xzModzlZJ@D|-@(<$NEG*PNYlAi)#DXk0W66?fPImqAUezWyq5Yjz!r!$kc`?GQw(kAVO$L0LICOWNS1XGG8z0!1bvWVVj_5EkO$kP z&csAOKQ3DZm~Rl6C$lZDrG6vgw9qu@`2wT_AWVGTI3T58BThi!zY$_}z?>c`87RVR z0CbfA4TIp(e91tu2RI^001yt8mvEw}KOaE^>Q0Qw(;zfUwAT`2;DOJBH>qne5lEmm z#aKb--JlMr}@ES|^*d>DuXY@@;dI0ftvgRKCVOA^u)!tmf=G#OlP!q7ra zP)cf370IuFHcOU|PpEILUk^T{xtj z+7Gw`_2i(ZJ_4L-;(?iPb2D!FlHpm3_pRN_$0{-yyDd-QcKJ0U?-gQ^B=qNNNV?BM_@?sn9hqJcp+yVh=~=OC*3G zv4HxqAmG!{k-#hJmr)U3Y5J*0WdL%_8~_>N&_N8;gWL#*m1Kwk5Ca(jjuHe?f$GT^ zk^{&gqA?U8;zF9%ID%k3_@d>iPSl^xXnmPr2nm`gL*^g~G!MWbOQeMZzDsL!iF$sc z?Pcmgf)$vKWf|PJ8P)0p{Y9GyKMOEe2u{mebw29PYm1>Gft7p;3loDNhj7B1@m2sN z9H<$KKI)mc8t5yS8VVeM_#>%XpRnfUf|vvlIRLQ(@m$6=TL1|j2tgo$5;YNL07Uba zmr8O{+6Dp4 z2T5G?1_^LPNceNY0dO@)b)cK@00x`{6opa0W7!xL$zuAeS-7nx$)osuf=Ioa zm-e`M8c`2 zL0MsRkG{6Cte-bHh5_(^UsI$>V7BUbcQ|Dt0NKM}4+eXob$RJIq zw3uQTfgnN7hkWR10-X#dJQof87i8AwrH(|QPfdav-jw4>1*Jd`-y}U_@DXU@KPDx7 zo)p&%s0mV^#8L|eBZM`Mmj-`flA!HPET;Z7YPl@oSCkEd_9##ZL9*jDEBF;+pmTz| z3Vj1-PCz*~Y3y_+l0UWiwKZHPHM^~TybAEAAi zI%zQYyg?o`q?k0rn4knLOG6fSyQ$A((jc{fY>^&Z=jIVuNeK(_MO`2CASpBbz zybVTgK-f-nwl}ev|f#kpt1gQ9dW4TnHkTQ!+VVs6lBfW1UNk-DTs=Uk0S?J 
zBn;-{YcE?~&^Q_1(r7Zx=Yl}9V3r_HB8a6ro>@|O3-SR3odeJt$V)r~6?oy{K`k&1 zDn}CnAyDB!Ai+=pYoMRByj-Df+Pp1#0Q?>ME~D+FyAx& z{VSNi>2<%-tCd@~riY(QNlnSlj7&{3q$E)(rZv1azNA&j;U|+@d-U*M?b~YEdAvuT znzSknQ-yBLY)o%T%gmLzWLOpq{5!jN$@vOUGgXJnDCS8 V-?nbk7pOVeZ$=`$PN#ie^uN%Sdu;#! diff --git a/test/parquetSearch.test.js b/test/parquetSearch.test.js index 527deb1..7dc0fdb 100644 --- a/test/parquetSearch.test.js +++ b/test/parquetSearch.test.js @@ -28,7 +28,7 @@ describe('parquetSearch', () => { expect(sourceFile.fetches).toBe(2) // metadata + row group fetch expect(sourceFile.bytes).toBe(5617) expect(indexFile.fetches).toBe(2) // metadata + index row group fetch - expect(indexFile.bytes).toBe(1174) + expect(indexFile.bytes).toBe(1220) }) it('should return no results for query with no matches', async () => { diff --git a/test/queryIndex.test.js b/test/queryIndex.test.js index 3f8226d..b05ae91 100644 --- a/test/queryIndex.test.js +++ b/test/queryIndex.test.js @@ -10,9 +10,9 @@ describe('queryIndex', () => { expect(result).toBeDefined() expect(result?.blocks.length).toBe(1) const block = result?.blocks[0] - expect(block?.blockId).toBe(0) - expect(block?.rowStart).toBe(0) - expect(block?.rowEnd).toBe(500) + expect(block?.blockId).toBe(2) + expect(block?.rowStart).toBe(200) + expect(block?.rowEnd).toBe(300) expect(block?.score).toBe(1) expect(result?.textColumns).toEqual(['id']) }) @@ -27,7 +27,7 @@ describe('queryIndex', () => { it('intersects n-grams across multi-word queries', async () => { const indexFile = await asyncBufferFromFile('test/files/alpha.index.parquet') - // 'aaakk' is in block 0, 'azz' is in block 1; together no block contains both + // 'aaakk' is in block 2, 'aaazz' is in a different block; no block contains both const result = await queryIndex({ query: 'aaakk aaazz', indexFile }) expect(result?.blocks.length).toBe(0) }) From 5bb7cac6df62517cf866b65eb3fe3b7a7f010d21 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 19:24:56 -0700 Subject: [PATCH 11/15] 
Coalesce candidate blocks and cache slices for low transfer bytes --- benchmark.js | 129 +++++++++++++++++-------------------- src/parquetFind.js | 44 +++++++++++-- src/parquetSearch.js | 9 ++- test/helpers.js | 14 ++-- test/parquetSearch.test.js | 8 +-- 5 files changed, 118 insertions(+), 86 deletions(-) diff --git a/benchmark.js b/benchmark.js index 16fcfed..154a679 100644 --- a/benchmark.js +++ b/benchmark.js @@ -3,13 +3,13 @@ import { asyncBufferFromFile } from 'hyparquet' import { fileWriter } from 'hyparquet-writer' import { pipeline } from 'stream/promises' import { createIndex } from './src/createIndex.js' -import { queryIndex } from './src/queryIndex.js' +import { parquetFind } from './src/parquetFind.js' const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet' const filename = 'example.parquet' const indexFilename = 'example.index.parquet' -// Download test parquet file if needed +// Download source if needed let stat = await fs.stat(filename).catch(() => undefined) if (!stat) { console.log('downloading ' + url) @@ -17,87 +17,76 @@ if (!stat) { if (!res.ok) throw new Error(res.statusText) await pipeline(res.body, createWriteStream(filename)) stat = await fs.stat(filename) - console.log('downloaded example.parquet', stat.size.toLocaleString(), 'bytes') } -// Create search index +// Build index if needed let indexStat = await fs.stat(indexFilename).catch(() => undefined) if (!indexStat) { - console.log('\n=== Creating Search Index ===') - const indexStartTime = performance.now() - + console.log('building ' + indexFilename) + const t0 = performance.now() const sourceFile = await asyncBufferFromFile(filename) const indexFile = fileWriter(indexFilename) await createIndex({ sourceFile, indexFile }) indexStat = await fs.stat(indexFilename) - - const indexMs = performance.now() - indexStartTime - console.log(`created index in ${indexMs.toFixed(0)} ms`) - console.log(`index size: 
${indexStat.size.toLocaleString()} bytes (${(indexStat.size / stat.size * 100).toFixed(1)}% of source)`) -} else { - console.log('\n=== Using Existing Search Index ===') - console.log(`index size: ${indexStat.size.toLocaleString()} bytes (${(indexStat.size / stat.size * 100).toFixed(1)}% of source)`) + console.log(`built in ${((performance.now() - t0) / 1000).toFixed(1)}s`) } -// Load the index for querying -const indexFile = await asyncBufferFromFile(indexFilename) - -// Augment the AsyncBuffer with instrumentation -indexFile.sliceCount = 0 -indexFile.bytesRead = 0 -const originalSlice = indexFile.slice.bind(indexFile) -indexFile.slice = function (start, end) { - indexFile.sliceCount++ - indexFile.bytesRead += end - start - return originalSlice(start, end) +console.log(`source: ${(stat.size / 1024 / 1024).toFixed(1)} MB`) +console.log(`index: ${(indexStat.size / 1024 / 1024).toFixed(1)} MB (${(indexStat.size / stat.size * 100).toFixed(2)}% of source)`) + +/** + * @param {import('hyparquet').AsyncBuffer} buf + * @returns {import('hyparquet').AsyncBuffer & {fetches: number, bytes: number}} + */ +function instrument(buf) { + const wrapper = { + byteLength: buf.byteLength, + fetches: 0, + bytes: 0, + /** + * @param {number} start + * @param {number} [end] + * @returns {Promise | ArrayBuffer} + */ + slice(start, end) { + wrapper.fetches += 1 + wrapper.bytes += (end ?? 
buf.byteLength) - start + return buf.slice(start, end) + }, + } + return wrapper } -// Test queries const queries = [ - { name: 'Rare term', query: 'eigenvalue' }, - { name: 'Common term', query: 'wikipedia' }, - { name: 'Multi-term', query: 'united states history' }, + 'eigenvalue', + 'petrichor', + 'serverless', + 'quantum entanglement', + 'wikipedia', ] -console.log('\n=== Search Performance ===') - -const queryTimes = [] -const queryRequestCounts = [] -const queryBytesRead = [] - -for (const { name, query } of queries) { - // Reset stats for this query - indexFile.sliceCount = 0 - indexFile.bytesRead = 0 - - const queryStartTime = performance.now() - - const results = await queryIndex({ query, indexFile }) - - const queryMs = performance.now() - queryStartTime - queryTimes.push(queryMs) - queryRequestCounts.push(indexFile.sliceCount) - queryBytesRead.push(indexFile.bytesRead) - - console.log(`\n${name}: "${query}"`) - console.log(` Query time: ${queryMs.toFixed(2)} ms`) - console.log(` Requests: ${indexFile.sliceCount}`) - console.log(` Bytes read: ${indexFile.bytesRead.toLocaleString()}`) - console.log(` Matching blocks: ${results.blocks.length}`) - - if (results.blocks.length > 0) { - console.log(' Top 3 blocks by relevance:') - for (let i = 0; i < Math.min(3, results.blocks.length); i++) { - const result = results.blocks[i] - console.log(` Block ${result.blockId}: rows ${result.rowStart}-${result.rowEnd}, score: ${result.score}`) - } - } +console.log() +console.log('query matches ms idx_KB src_MB total') +console.log('--------------------- ------- ----- ------ ------ -----') +for (const query of queries) { + const idx = instrument(await asyncBufferFromFile(indexFilename)) + const src = instrument(await asyncBufferFromFile(filename)) + const t0 = performance.now() + let matches = 0 + for await (const _ of parquetFind({ + url: filename, + sourceFile: src, + indexFile: idx, + query, + })) matches += 1 + const ms = performance.now() - t0 + const total = 
(idx.bytes + src.bytes) / 1024 / 1024 + console.log( + query.padEnd(21) + + ' ' + String(matches).padStart(7) + + ' ' + ms.toFixed(0).padStart(5) + + ' ' + (idx.bytes / 1024).toFixed(0).padStart(6) + + ' ' + (src.bytes / 1024 / 1024).toFixed(0).padStart(6) + + ' ' + total.toFixed(1).padStart(5) + ' MB' + ) } - -// Summary -console.log('\n=== Summary ===') -console.log(`Source file: ${stat.size.toLocaleString()} bytes`) -console.log(`Index file: ${indexStat.size.toLocaleString()} bytes`) -console.log(`Average query time: ${(queryTimes.reduce((sum, time) => sum + time, 0) / queryTimes.length).toFixed(2)} ms`) -console.log(`Average requests per query: ${(queryRequestCounts.reduce((sum, count) => sum + count, 0) / queryRequestCounts.length).toFixed(1)}`) -console.log(`Average bytes read per query: ${(queryBytesRead.reduce((sum, bytes) => sum + bytes, 0) / queryBytesRead.length).toLocaleString()}`) diff --git a/src/parquetFind.js b/src/parquetFind.js index e225ca1..018a616 100644 --- a/src/parquetFind.js +++ b/src/parquetFind.js @@ -1,4 +1,4 @@ -import { asyncBufferFromUrl, parquetMetadataAsync, parquetReadObjects } from 'hyparquet' +import { asyncBufferFromUrl, cachedAsyncBuffer, parquetMetadataAsync, parquetReadObjects } from 'hyparquet' import { queryIndex } from './queryIndex.js' /** @@ -23,6 +23,9 @@ export async function* parquetFind({ if (!query || limit <= 0) return signal?.throwIfAborted() indexFile ??= await asyncBufferFactory({ url: `${url.replace(/\.parquet$/i, '')}.index.parquet` }) + // Cache slices: parquet readers refetch overlapping page byte ranges across + // calls; caching dedupes them within a query. 
+ indexFile = cachedAsyncBuffer(indexFile) const queryResult = await queryIndex({ query, indexFile, indexMetadata }) if (!queryResult) return const { blocks, textColumns, sourceByteLength } = queryResult @@ -31,33 +34,62 @@ export async function* parquetFind({ blocks.sort((a, b) => a.blockId - b.blockId) signal?.throwIfAborted() - const file = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength }) + const rawFile = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength }) + const file = cachedAsyncBuffer(rawFile) const metadata = sourceMetadata ?? await parquetMetadataAsync(file) const needles = query.toLowerCase().split(/\s+/).filter(Boolean) + // Coalesce contiguous candidate blocks into single row-range reads. Without + // this, parquetReadObjects re-fetches overlapping row-group pages and blows + // out the source transfer (e.g. "wikipedia" matched every block and pulled + // 10x the source size). + const runs = coalesceRuns(blocks) + let count = 0 - for (const block of blocks) { + for (const run of runs) { signal?.throwIfAborted() const blockRows = await parquetReadObjects({ ...hyparquetOptions, file, metadata, - rowStart: block.rowStart, - rowEnd: block.rowEnd, + rowStart: run.rowStart, + rowEnd: run.rowEnd, useOffsetIndex: true, }) for (let i = 0; i < blockRows.length; i++) { const row = blockRows[i] if (matchesRow(row, textColumns, needles)) { - yield { __index__: block.rowStart + i, ...row } + yield { __index__: run.rowStart + i, ...row } if (++count >= limit) return } } } } +/** + * Merge a sorted list of blocks into contiguous row-range runs. 
+ * + * @param {{rowStart: number, rowEnd: number}[]} blocks + * @returns {{rowStart: number, rowEnd: number}[]} + */ +function coalesceRuns(blocks) { + /** @type {{rowStart: number, rowEnd: number}[]} */ + const runs = [] + let current = null + for (const block of blocks) { + if (current && block.rowStart === current.rowEnd) { + current.rowEnd = block.rowEnd + } else { + if (current) runs.push(current) + current = { rowStart: block.rowStart, rowEnd: block.rowEnd } + } + } + if (current) runs.push(current) + return runs +} + /** * Return true when every needle appears as a substring of some indexed column. * diff --git a/src/parquetSearch.js b/src/parquetSearch.js index a21571e..4cfea23 100644 --- a/src/parquetSearch.js +++ b/src/parquetSearch.js @@ -1,4 +1,4 @@ -import { asyncBufferFromUrl, parquetMetadataAsync, parquetReadObjects } from 'hyparquet' +import { asyncBufferFromUrl, cachedAsyncBuffer, parquetMetadataAsync, parquetReadObjects } from 'hyparquet' import { queryIndex } from './queryIndex.js' /** @@ -25,6 +25,9 @@ export async function* parquetSearch({ signal?.throwIfAborted() // Query the index to get candidate blocks indexFile ??= await asyncBufferFactory({ url: `${url.replace(/\.parquet$/i, '')}.index.parquet` }) + // Cache slices: parquet readers refetch overlapping page byte ranges across + // calls; caching dedupes them within a query. + indexFile = cachedAsyncBuffer(indexFile) const queryResult = await queryIndex({ query, indexFile, indexMetadata }) if (!queryResult) return const { blocks, textColumns, sourceByteLength } = queryResult @@ -34,7 +37,9 @@ export async function* parquetSearch({ blocks.sort((a, b) => b.score - a.score) signal?.throwIfAborted() - const file = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength }) + + const rawFile = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength }) + const file = cachedAsyncBuffer(rawFile) const metadata = sourceMetadata ?? 
await parquetMetadataAsync(file) const needles = needlesOf(query) diff --git a/test/helpers.js b/test/helpers.js index ba58389..8dbae91 100644 --- a/test/helpers.js +++ b/test/helpers.js @@ -7,14 +7,20 @@ * @returns {AsyncBuffer & {fetches: number, bytes: number}} */ export function countingBuffer(asyncBuffer) { - return { - ...asyncBuffer, + const wrapper = { + byteLength: asyncBuffer.byteLength, fetches: 0, bytes: 0, + /** + * @param {number} start + * @param {number} [end] + * @returns {Promise | ArrayBuffer} + */ slice(start, end) { - this.fetches++ - this.bytes += (end ?? asyncBuffer.byteLength) - start + wrapper.fetches += 1 + wrapper.bytes += (end ?? asyncBuffer.byteLength) - start return asyncBuffer.slice(start, end) }, } + return wrapper } diff --git a/test/parquetSearch.test.js b/test/parquetSearch.test.js index 7dc0fdb..840536b 100644 --- a/test/parquetSearch.test.js +++ b/test/parquetSearch.test.js @@ -25,10 +25,10 @@ describe('parquetSearch', () => { expect(rows.length).toBe(1) expect(rows[0]).toEqual({ __index__: 270, id: 'aaakk' }) - expect(sourceFile.fetches).toBe(2) // metadata + row group fetch - expect(sourceFile.bytes).toBe(5617) - expect(indexFile.fetches).toBe(2) // metadata + index row group fetch - expect(indexFile.bytes).toBe(1220) + expect(sourceFile.fetches).toBe(1) // single coalesced read covers both metadata and rows + expect(sourceFile.bytes).toBe(2867) + expect(indexFile.fetches).toBe(1) + expect(indexFile.bytes).toBe(790) }) it('should return no results for query with no matches', async () => { From 26a53c2865701e62c357cdb80b4abee5f2860f6b Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 19:27:28 -0700 Subject: [PATCH 12/15] Add ARCH.md documenting goal, design, and tradeoffs --- ARCH.md | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 ARCH.md diff --git a/ARCH.md b/ARCH.md new file mode 100644 index 0000000..f293a32 --- /dev/null +++ b/ARCH.md @@ 
-0,0 +1,113 @@ +# Architecture + +HypGrep is a serverless grep over Parquet files in S3. A client (browser, Node, etc.) reads a small precomputed index over HTTP range requests, then reads only the source byte ranges that could plausibly match. The dominant cost being optimized is **per-query transfer bytes** — every byte the client pulls is wall-clock latency and dollar cost. File size on disk is a secondary metric. + +## Index format + +A HypGrep index is itself a Parquet file with two columns: + +| column | type | encoding | meaning | +|---|---|---|---| +| `ngram` | STRING | DELTA_BYTE_ARRAY | a lowercase n-character substring | +| `blockId` | INT32 | DELTA_BINARY_PACKED | logical block this n-gram appears in | + +Rows are sorted by `(ngram, blockId)`. The set of `blockId`s for a given n-gram is its posting list. The source file is divided into logical blocks of `block_size` consecutive rows (independent of Parquet row groups). The index records, for each block, the set of distinct n-grams present across every indexed string column. + +Key-value metadata on the index file: + +``` +hypgrep.version # format version +hypgrep.block_size # rows per block +hypgrep.ngram_length # n-gram size (default 5) +hypgrep.text_columns # which columns were indexed +hypgrep.source_rows # total rows in source +hypgrep.source_bytelength # source file size (lets the client pre-size buffers) +``` + +Storing source size in the metadata lets the client construct an `AsyncBuffer` for the source without a separate HEAD request. + +## Query path + +For a query string `Q`: + +1. **Tokenize the query into n-grams.** Lowercase; split on `/[^a-z0-9]+/`; emit every n-character window of each alphanumeric run. Whitespace-separated words are unioned (substring AND across words). +2. **Push-down filter on the index** with `{ ngram: { $in: queryNgrams } }`. Hyparquet only reads the row-group(s) covering those n-grams. +3. 
**Intersect.** Group returned rows by `blockId`; keep blocks that hit *every* query n-gram. +4. **Coalesce contiguous candidate blocks** into row-range runs. (Critical — see below.) +5. **Read each run from the source** with `parquetReadObjects({ rowStart, rowEnd })` and `useOffsetIndex: true`. +6. **Per-row substring filter.** For each row in a run, verify `value.toLowerCase().includes(needle)` for every whitespace-separated query word. + +## Key tuning decisions + +### Why n = 5 + +Shorter n-grams fail to prune prose at any reasonable block size — every block of natural-language text contains every common 3- or 4-character window, so trigram intersection returns *every* block as a candidate and we end up scanning the whole source. n = 5 has a large enough universe (~285K alnum 5-grams vs ~47K trigrams) that distinguishing windows like `rverl` (`serverless`) or `ichor` (`petrichor`) prune effectively. Per-query source transfer on a 420 MB Wikipedia dump: + +| query | n=3 | n=4 | n=5 | +|---|---:|---:|---:| +| eigenvalue (67 hits) | 839 MB | 205 MB | 189 MB | +| petrichor (0) | 839 MB | 839 MB | 316 MB | +| serverless (0) | 839 MB | 839 MB | 29 MB | + +Tradeoff: index size grows with n. n=3 → 1.2 MB, n=5 → 15 MB on Wikipedia. The goal weighs query bytes far more than index size, so the larger index pays for itself many times over after the first few queries. + +### Why blockSize = 100 + +Smaller blocks improve pruning — when a candidate block is selected, fewer "wasted" non-matching rows ride along. Going from 500 to 100 cuts source transfer roughly in half on absent/rare-string queries: + +| query | b=500 | b=100 | +|---|---:|---:| +| petrichor | 301 MB | 157 MB | +| serverless | 28 MB | 0 MB | + +Tradeoff: smaller blocks mean more `(ngram, blockId)` postings. Index grows 15 → 25 MB. Again favorable under the goal. + +### Block coalescing + +After step 3 above, candidate blocks are sorted by `blockId` and merged into contiguous runs before reading the source. 
Without coalescing, each block becomes its own `parquetReadObjects` call, and overlapping Parquet pages get re-fetched. Measured impact on the `wikipedia` query (matches every block): naive per-block reads pulled **4 GB** of source bytes; coalescing collapsed that to one big read of **401 MB** — i.e. the source size itself. + +### `cachedAsyncBuffer` + +Both source and index files are wrapped with `cachedAsyncBuffer` for the duration of a query. Parquet readers commonly re-fetch the same byte range across calls (page footer + page data, or overlapping row-group pages). The cache memoizes slices by `(start, end)`, so duplicate fetches become free. Measured impact: + +| query | without cache | with cache | +|---|---:|---:| +| eigenvalue | 195 MB | 150 MB | +| petrichor | 106 MB | 59 MB | +| quantum entanglement | 159 MB | 107 MB | + +## End-to-end bytes on Wikipedia (420 MB source, 24 MB index) + +| query | matches | total transferred | of which: index | of which: source | +|---|---:|---:|---:|---:| +| eigenvalue | 67 | 150.4 MB | 724 KB | 150 MB | +| petrichor | 0 | 59.6 MB | 646 KB | 59 MB | +| serverless | 0 | 0.7 MB | 689 KB | 0 MB | +| quantum entanglement | 24 | 107.7 MB | 870 KB | 107 MB | +| wikipedia | 156289 | 401.3 MB | 708 KB | 401 MB | + +**Per-query index transfer is bounded at ~700 KB** regardless of selectivity — that's the property that makes the design serverless-friendly. The source transfer scales with how many blocks legitimately match. + +## API surface + +- `createIndex({ sourceFile, indexFile, blockSize?, ngramLength?, ... })` — build the index Parquet. +- `queryIndex({ query, indexFile, indexMetadata? })` — return candidate blocks. Streaming-friendly: just metadata. +- `parquetFind({ query, url, limit?, ... })` — async generator of matching rows in natural order (Ctrl+F semantics). +- `parquetSearch({ query, url, limit?, ... })` — async generator of matching rows ranked by occurrence count. 
+ +Both reader functions accept either a `url` (with optional `asyncBufferFactory`) or pre-loaded `sourceFile` / `indexFile` AsyncBuffers. + +## Known limitations and where to push next + +1. **Queries shorter than `ngramLength` (default 5) chars** return no results. Acceptable: short n-grams aren't selective in prose anyway. Workaround if needed: store multi-length n-grams (e.g. 3 *and* 5) and pick at query time. +2. **Common-everywhere words** (e.g. `wikipedia` in a Wikipedia dump) match every block; the source transfer is unavoidable because every row really does match. No n-gram strategy fixes this. +3. **`limit` does not short-circuit when blocks coalesce.** A user requesting `limit: 10` on a query that matches every block still reads the full coalesced run. Fixing this means trading some coalescing for early termination — bound the run length, or process blocks in score order without merging. +4. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries. +5. **Regex is not supported.** Adding it means parsing the regex into mandatory n-gram sets (Zoekt's planner), then running the real regex per matched row. The index format wouldn't change. +6. **Index files are not back-compatible.** `hypgrep.version` exists but we don't ship a multi-version reader. Reasonable for a 0.x package. + +## Dependencies + +- `hyparquet` — Parquet reader with push-down filtering, offset index, and `cachedAsyncBuffer`. +- `hyparquet-writer` — Parquet writer with `DELTA_BYTE_ARRAY` / `DELTA_BINARY_PACKED` encodings that keep the index small. +- `hyparquet-compressors` — compression codecs. 
From 36c8dd8c1d30df57a848db83712ef194f08cf939 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 19:31:39 -0700 Subject: [PATCH 13/15] Drop block coalescing so limit short-circuits --- ARCH.md | 34 +++++++++++++++++++----------- benchmark.js | 52 +++++++++++++++++++++++++--------------------- src/parquetFind.js | 40 +++++++---------------------------- 3 files changed, 58 insertions(+), 68 deletions(-) diff --git a/ARCH.md b/ARCH.md index f293a32..ac7f3b4 100644 --- a/ARCH.md +++ b/ARCH.md @@ -33,9 +33,9 @@ For a query string `Q`: 1. **Tokenize the query into n-grams.** Lowercase; split on `/[^a-z0-9]+/`; emit every n-character window of each alphanumeric run. Whitespace-separated words are unioned (substring AND across words). 2. **Push-down filter on the index** with `{ ngram: { $in: queryNgrams } }`. Hyparquet only reads the row-group(s) covering those n-grams. 3. **Intersect.** Group returned rows by `blockId`; keep blocks that hit *every* query n-gram. -4. **Coalesce contiguous candidate blocks** into row-range runs. (Critical — see below.) -5. **Read each run from the source** with `parquetReadObjects({ rowStart, rowEnd })` and `useOffsetIndex: true`. -6. **Per-row substring filter.** For each row in a run, verify `value.toLowerCase().includes(needle)` for every whitespace-separated query word. +4. **For each candidate block, in order**, read the row range from the source with `parquetReadObjects({ rowStart, rowEnd })` and `useOffsetIndex: true`. Both files are wrapped in `cachedAsyncBuffer` for the duration of the query so adjacent blocks within a Parquet row group don't re-fetch its pages. +5. **Per-row substring filter.** For each row, verify `value.toLowerCase().includes(needle)` for every whitespace-separated query word. +6. **`limit` short-circuits** as soon as enough matches have been yielded — subsequent blocks are never read. 
## Key tuning decisions @@ -62,13 +62,11 @@ Smaller blocks improve pruning — when a candidate block is selected, fewer "wa Tradeoff: smaller blocks mean more `(ngram, blockId)` postings. Index grows 15 → 25 MB. Again favorable under the goal. -### Block coalescing +### Per-block reads + `cachedAsyncBuffer` -After step 3 above, candidate blocks are sorted by `blockId` and merged into contiguous runs before reading the source. Without coalescing, each block becomes its own `parquetReadObjects` call, and overlapping Parquet pages get re-fetched. Measured impact on the `wikipedia` query (matches every block): naive per-block reads pulled **4 GB** of source bytes; coalescing collapsed that to one big read of **401 MB** — i.e. the source size itself. +Both source and index files are wrapped with `cachedAsyncBuffer` for the duration of a query. Parquet readers commonly re-fetch the same byte range across calls (page footer + page data, or overlapping row-group pages when adjacent blocks fall in the same row group). The cache memoizes slices by `(start, end)`, so duplicate fetches are free. -### `cachedAsyncBuffer` - -Both source and index files are wrapped with `cachedAsyncBuffer` for the duration of a query. Parquet readers commonly re-fetch the same byte range across calls (page footer + page data, or overlapping row-group pages). The cache memoizes slices by `(start, end)`, so duplicate fetches become free. Measured impact: +An earlier version coalesced contiguous candidate blocks into single `parquetReadObjects` calls to avoid the re-fetch. That worked for transfer bytes but defeated `limit`: a `limit: 10` query against a string that matched every block still pulled the entire coalesced run. Switching back to per-block reads (relying on the cache to dedupe pages) costs identical bytes on full scans while letting `limit` short-circuit on the next block boundary. 
Measured impact: | query | without cache | with cache | |---|---:|---:| @@ -78,7 +76,9 @@ Both source and index files are wrapped with `cachedAsyncBuffer` for the duratio ## End-to-end bytes on Wikipedia (420 MB source, 24 MB index) -| query | matches | total transferred | of which: index | of which: source | +No limit (every match): + +| query | matches | total | index | source | |---|---:|---:|---:|---:| | eigenvalue | 67 | 150.4 MB | 724 KB | 150 MB | | petrichor | 0 | 59.6 MB | 646 KB | 59 MB | @@ -86,7 +86,17 @@ Both source and index files are wrapped with `cachedAsyncBuffer` for the duratio | quantum entanglement | 24 | 107.7 MB | 870 KB | 107 MB | | wikipedia | 156289 | 401.3 MB | 708 KB | 401 MB | -**Per-query index transfer is bounded at ~700 KB** regardless of selectivity — that's the property that makes the design serverless-friendly. The source transfer scales with how many blocks legitimately match. +`limit: 10` (typical client usage): + +| query | matches | total | index | source | +|---|---:|---:|---:|---:| +| eigenvalue | 10 | 42.5 MB | 724 KB | 42 MB | +| petrichor | 0 | 59.6 MB | 646 KB | 59 MB | +| serverless | 0 | 0.7 MB | 689 KB | 0 MB | +| quantum entanglement | 10 | 40.2 MB | 870 KB | 39 MB | +| wikipedia | 10 | 14.1 MB | 708 KB | 13 MB | + +**Per-query index transfer is bounded at ~700 KB** regardless of selectivity — that's the property that makes the design serverless-friendly. The source transfer scales with how many blocks legitimately match, and `limit` lets selective UIs (first 10 hits) clip it further. ## API surface @@ -101,8 +111,8 @@ Both reader functions accept either a `url` (with optional `asyncBufferFactory`) 1. **Queries shorter than `ngramLength` (default 5) chars** return no results. Acceptable: short n-grams aren't selective in prose anyway. Workaround if needed: store multi-length n-grams (e.g. 3 *and* 5) and pick at query time. 2. **Common-everywhere words** (e.g. 
`wikipedia` in a Wikipedia dump) match every block; the source transfer is unavoidable because every row really does match. No n-gram strategy fixes this. -3. **`limit` does not short-circuit when blocks coalesce.** A user requesting `limit: 10` on a query that matches every block still reads the full coalesced run. Fixing this means trading some coalescing for early termination — bound the run length, or process blocks in score order without merging. -4. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries. +3. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries that match a small number of rows spread across many blocks. +4. **No-limit queries on dense matches do more work than they need to.** When every block matches (e.g. `wikipedia` against a Wikipedia dump), exhausting all matches takes ~24 s of CPU because we process blocks one at a time. The bytes are minimal; the time is CPU/parsing overhead. Real clients should pass a `limit`. 5. **Regex is not supported.** Adding it means parsing the regex into mandatory n-gram sets (Zoekt's planner), then running the real regex per matched row. The index format wouldn't change. 6. **Index files are not back-compatible.** `hypgrep.version` exists but we don't ship a multi-version reader. Reasonable for a 0.x package. 
diff --git a/benchmark.js b/benchmark.js index 154a679..8c72ec9 100644 --- a/benchmark.js +++ b/benchmark.js @@ -65,28 +65,32 @@ const queries = [ 'wikipedia', ] -console.log() -console.log('query matches ms idx_KB src_MB total') -console.log('--------------------- ------- ----- ------ ------ -----') -for (const query of queries) { - const idx = instrument(await asyncBufferFromFile(indexFilename)) - const src = instrument(await asyncBufferFromFile(filename)) - const t0 = performance.now() - let matches = 0 - for await (const _ of parquetFind({ - url: filename, - sourceFile: src, - indexFile: idx, - query, - })) matches += 1 - const ms = performance.now() - t0 - const total = (idx.bytes + src.bytes) / 1024 / 1024 - console.log( - query.padEnd(21) + - ' ' + String(matches).padStart(7) + - ' ' + ms.toFixed(0).padStart(5) + - ' ' + (idx.bytes / 1024).toFixed(0).padStart(6) + - ' ' + (src.bytes / 1024 / 1024).toFixed(0).padStart(6) + - ' ' + total.toFixed(1).padStart(5) + ' MB' - ) +for (const limit of [Infinity, 10]) { + console.log() + console.log('limit = ' + (limit === Infinity ? 
'all' : limit)) + console.log('query matches ms idx_KB src_MB total') + console.log('--------------------- ------- ----- ------ ------ -----') + for (const query of queries) { + const idx = instrument(await asyncBufferFromFile(indexFilename)) + const src = instrument(await asyncBufferFromFile(filename)) + const t0 = performance.now() + let matches = 0 + for await (const _ of parquetFind({ + url: filename, + sourceFile: src, + indexFile: idx, + query, + limit, + })) matches += 1 + const ms = performance.now() - t0 + const total = (idx.bytes + src.bytes) / 1024 / 1024 + console.log( + query.padEnd(21) + + ' ' + String(matches).padStart(7) + + ' ' + ms.toFixed(0).padStart(5) + + ' ' + (idx.bytes / 1024).toFixed(0).padStart(6) + + ' ' + (src.bytes / 1024 / 1024).toFixed(0).padStart(6) + + ' ' + total.toFixed(1).padStart(5) + ' MB' + ) + } } diff --git a/src/parquetFind.js b/src/parquetFind.js index 018a616..b6fd3b5 100644 --- a/src/parquetFind.js +++ b/src/parquetFind.js @@ -40,56 +40,32 @@ export async function* parquetFind({ const needles = query.toLowerCase().split(/\s+/).filter(Boolean) - // Coalesce contiguous candidate blocks into single row-range reads. Without - // this, parquetReadObjects re-fetches overlapping row-group pages and blows - // out the source transfer (e.g. "wikipedia" matched every block and pulled - // 10x the source size). - const runs = coalesceRuns(blocks) - + // Process blocks one at a time so `limit` can short-circuit on the next + // block boundary. cachedAsyncBuffer dedupes overlapping page fetches across + // blocks within the same Parquet row group, so reading per-block costs the + // same bytes as a single coalesced read. 
let count = 0 - for (const run of runs) { + for (const block of blocks) { signal?.throwIfAborted() const blockRows = await parquetReadObjects({ ...hyparquetOptions, file, metadata, - rowStart: run.rowStart, - rowEnd: run.rowEnd, + rowStart: block.rowStart, + rowEnd: block.rowEnd, useOffsetIndex: true, }) for (let i = 0; i < blockRows.length; i++) { const row = blockRows[i] if (matchesRow(row, textColumns, needles)) { - yield { __index__: run.rowStart + i, ...row } + yield { __index__: block.rowStart + i, ...row } if (++count >= limit) return } } } } -/** - * Merge a sorted list of blocks into contiguous row-range runs. - * - * @param {{rowStart: number, rowEnd: number}[]} blocks - * @returns {{rowStart: number, rowEnd: number}[]} - */ -function coalesceRuns(blocks) { - /** @type {{rowStart: number, rowEnd: number}[]} */ - const runs = [] - let current = null - for (const block of blocks) { - if (current && block.rowStart === current.rowEnd) { - current.rowEnd = block.rowEnd - } else { - if (current) runs.push(current) - current = { rowStart: block.rowStart, rowEnd: block.rowEnd } - } - } - if (current) runs.push(current) - return runs -} - /** * Return true when every needle appears as a substring of some indexed column. 
* From 0715c6b28f17dfea02d795d5799ec257cee43806 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 19:35:27 -0700 Subject: [PATCH 14/15] Silence unused-var lint in benchmark --- benchmark.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark.js b/benchmark.js index 8c72ec9..cf5379c 100644 --- a/benchmark.js +++ b/benchmark.js @@ -75,7 +75,8 @@ for (const limit of [Infinity, 10]) { const src = instrument(await asyncBufferFromFile(filename)) const t0 = performance.now() let matches = 0 - for await (const _ of parquetFind({ + // eslint-disable-next-line no-unused-vars + for await (const _row of parquetFind({ url: filename, sourceFile: src, indexFile: idx, From 6d0d5ff1bd46fd72107c72966cd4d34f16af8516 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 19:36:51 -0700 Subject: [PATCH 15/15] Add rowFilter callback for regex and custom predicates --- ARCH.md | 2 +- README.md | 16 ++++++++++++++++ src/parquetFind.js | 4 +++- src/types.d.ts | 3 ++- test/parquetFind.test.js | 17 +++++++++++++++++ 5 files changed, 39 insertions(+), 3 deletions(-) diff --git a/ARCH.md b/ARCH.md index ac7f3b4..65a3501 100644 --- a/ARCH.md +++ b/ARCH.md @@ -113,7 +113,7 @@ Both reader functions accept either a `url` (with optional `asyncBufferFactory`) 2. **Common-everywhere words** (e.g. `wikipedia` in a Wikipedia dump) match every block; the source transfer is unavoidable because every row really does match. No n-gram strategy fixes this. 3. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries that match a small number of rows spread across many blocks. 4. **No-limit queries on dense matches do more work than they need to.** When every block matches (e.g. 
`wikipedia` against a Wikipedia dump), exhausting all matches takes ~24 s of CPU because we process blocks one at a time. The bytes are minimal; the time is CPU/parsing overhead. Real clients should pass a `limit`. -5. **Regex is not supported.** Adding it means parsing the regex into mandatory n-gram sets (Zoekt's planner), then running the real regex per matched row. The index format wouldn't change. +5. **Automatic regex literal extraction is not implemented.** Callers can still run regex by passing a `rowFilter` predicate and choosing a `query` string that names a literal substring the regex requires (so the index can prune). What's not done: parsing a `RegExp` automatically and deriving the mandatory n-gram set (Zoekt's planner). The index format wouldn't need to change. 6. **Index files are not back-compatible.** `hypgrep.version` exists but we don't ship a multi-version reader. Reasonable for a 0.x package. ## Dependencies diff --git a/README.md b/README.md index 2534ec3..b5092c5 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,22 @@ for await (const row of parquetFind({ Whitespace-separated words are ANDed: `'foo bar'` matches rows containing both `foo` and `bar` as substrings. Queries shorter than the indexed n-gram length (default 5) return no results. +### Regex (via `rowFilter`) + +Pass a `rowFilter` callback to override the default substring match. The `query` is still used to prune candidate blocks; the callback decides which rows to keep: + +```javascript +const re = /eigen\w*value/i + +for await (const row of parquetFind({ + query: 'eigen', + rowFilter: row => re.test(row.text), + url: '...', +})) ... +``` + +Picking a `query` that names a literal substring the regex requires is important: only blocks whose index entries contain every n-gram of `query` are read, so rows in other blocks never reach `rowFilter`. A `query` the regex does not actually require silently drops matches, and a `query` shorter than the n-gram length returns no results at all.
+ ## Ranked search Use `parquetSearch` to rank results by total occurrence count of the query words: diff --git a/src/parquetFind.js b/src/parquetFind.js index b6fd3b5..34e1862 100644 --- a/src/parquetFind.js +++ b/src/parquetFind.js @@ -12,6 +12,7 @@ export async function* parquetFind({ query, url, limit = Infinity, + rowFilter, signal, asyncBufferFactory = asyncBufferFromUrl, sourceFile, @@ -39,6 +40,7 @@ export async function* parquetFind({ const metadata = sourceMetadata ?? await parquetMetadataAsync(file) const needles = query.toLowerCase().split(/\s+/).filter(Boolean) + const accept = rowFilter ?? ((/** @type {Record} */ row) => matchesRow(row, textColumns, needles)) // Process blocks one at a time so `limit` can short-circuit on the next // block boundary. cachedAsyncBuffer dedupes overlapping page fetches across @@ -58,7 +60,7 @@ export async function* parquetFind({ for (let i = 0; i < blockRows.length; i++) { const row = blockRows[i] - if (matchesRow(row, textColumns, needles)) { + if (accept(row)) { yield { __index__: block.rowStart + i, ...row } if (++count >= limit) return } diff --git a/src/types.d.ts b/src/types.d.ts index 724427e..85fb637 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -17,9 +17,10 @@ export interface QueryIndexOptions { } export interface ParquetSearchOptions { - query: string // the search query string + query: string // the search query string used to prune candidate blocks url: string // URL or file path to the source parquet file limit?: number // maximum number of matching rows to return + rowFilter?: (row: Record) => boolean // optional predicate to override the default substring match (parquetFind only); lets callers run regex or other matchers while still using `query` to prune the index // fetch options signal?: AbortSignal // optional AbortSignal to cancel the search operation diff --git a/test/parquetFind.test.js b/test/parquetFind.test.js index bca8cbe..c2e9077 100644 --- a/test/parquetFind.test.js +++ 
b/test/parquetFind.test.js @@ -99,6 +99,23 @@ describe('parquetFind', () => { expect(rows.length).toBe(0) }) + it('should use rowFilter to support regex queries', async () => { + const re = /rhythm\w+/i + const rows = [] + for await (const row of parquetFind({ + url: 'test/files/dataset.parquet', + asyncBufferFactory, + query: 'rhyth', // prune candidate blocks + rowFilter: row => re.test(row.text), + })) { + rows.push(row) + } + // 'Rhythm provides ...' has no trailing word chars after 'Rhythm', so doesn't match; + // 'rhythmic and beautiful' matches + expect(rows.length).toBe(1) + expect(rows[0].text).toContain('rhythmic') + }) + it('should respect abort signal', async () => { const controller = new AbortController() controller.abort()