From 365a4b88acf99d646613da6f631eddae40e69816 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 21:37:24 -0700 Subject: [PATCH 1/3] Recurse into groups for regex DNF expansion Cross-product nested alternation up to MAX_DNF=32 branches; beyond that, drop the offending group conservatively (treat as wildcard). --- ARCH.md | 2 +- src/regex.js | 153 +++++++++++++++++++++++++++++---------------- test/regex.test.js | 44 +++++++++++-- 3 files changed, 140 insertions(+), 59 deletions(-) diff --git a/ARCH.md b/ARCH.md index f20b22f..fc9f971 100644 --- a/ARCH.md +++ b/ARCH.md @@ -125,7 +125,7 @@ Search options: `--limit N`, `--index `, `-c`/`--count`, `-i`/`--ignore-ca 2. **Common-everywhere words** (e.g. `wikipedia` in a Wikipedia dump) match every block; the source transfer is unavoidable because every row really does match. No n-gram strategy fixes this. 3. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries that match a small number of rows spread across many blocks. 4. **No-limit queries on dense matches do more work than they need to.** When every block matches (e.g. `wikipedia` against a Wikipedia dump), exhausting all matches takes ~24 s of CPU because we process blocks one at a time. The bytes are minimal; the time is CPU/parsing overhead. Real clients should pass a `limit`. -5. **Regex literal extractor is conservative inside groups.** Alternation inside `(...)` groups is under-extracted: `/abc(foo|bar)def/` yields `['abc', 'def']` instead of the tighter DNF `{abc, foo, def} ∨ {abc, bar, def}`. Top-level alternation IS expanded into branches (Zoekt-style OR at the index level). Recursing into groups for full DNF expansion is the next step; bounded DNF size will need a cap. +5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are now recursed into and cross-producted, so `/abc(foo|bar)def/` correctly yields `{abc, foo, def} ∨ {abc, bar, def}`. Deeply nested alternation that would exceed 32 branches drops the offending group conservatively (treats it as a wildcard for that position) — correct, just looser pruning. Real-world bytes gains vary with data co-occurrence: `/(eigen|petri)(value|chor)/` against Wikipedia goes from a full scan (~615 MB) to 247 MB; queries where the alternation words tightly co-occur with the surrounding context (e.g. `(machine|deep) learning`) see no improvement because the constraint was already implicit in the data. 6. **Index files are not back-compatible.** `hypgrep.version` exists but we don't ship a multi-version reader. Reasonable for a 0.x package. ## Dependencies diff --git a/src/regex.js b/src/regex.js index 3ce22bb..e5a2b7e 100644 --- a/src/regex.js +++ b/src/regex.js @@ -14,86 +14,106 @@ * Conservative: we'd rather miss a safe literal (and scan more blocks) than * emit a false-positive that would let us skip a block that actually matches. * + * Groups with alternation are expanded into DNF (cross-product). To bound the + * worst case, expansion stops growing past MAX_DNF branches — beyond that, + * the offending group is treated as a wildcard for that position. + * * Examples: - * /foo/ → [['foo']] - * /foo.bar/ → [['foo', 'bar']] - * /foo|bar/ → [['foo'], ['bar']] - * /abc(foo|bar)/ → [['abc']] (inner alternation under-extracted) - * /foo|.+/ → [['foo'], []] (second branch matches anything) - * /./ → [[]] + * /foo/ → [['foo']] + * /foo.bar/ → [['foo', 'bar']] + * /foo|bar/ → [['foo'], ['bar']] + * /abc(foo|bar)def/ → [['abc','foo','def'], ['abc','bar','def']] + * /foo|.+/ → [['foo'], []] (second branch matches anything) + * /./ → [[]] * * @param {RegExp} regex * @returns {string[][]} */ export function extractRegexLiterals(regex) { const lower = regex.flags.includes('i') - const branchSources = splitTopLevelAlternation(regex.source) - return branchSources.map(src => literalsInBranch(src, lower)) + return expandRegion(regex.source, 0, regex.source.length, lower) } +const MAX_DNF = 32 + /** - * Split a regex source on top-level `|` alternation. Operators inside `(...)` - * groups or `[...]` character classes are NOT splits. + * Expand a (sub-)region of regex source into a DNF of literal branches. + * Splits on top-level alternation first, then expands each branch. * - * @param {string} source - * @returns {string[]} + * @param {string} src + * @param {number} start + * @param {number} end + * @param {boolean} lower + * @returns {string[][]} */ -function splitTopLevelAlternation(source) { - /** @type {string[]} */ - const branches = [] - let start = 0 - let depth = 0 - let i = 0 - while (i < source.length) { - const c = source[i] - if (c === '\\') { - i += 2 - continue - } - if (c === '[') { - i = skipClass(source, i) - continue - } - if (c === '(') { - depth += 1 - i += 1 - continue - } - if (c === ')') { - depth -= 1 - i += 1 - continue +function expandRegion(src, start, end, lower) { + const branchSpans = splitTopLevelAlternation(src, start, end) + /** @type {string[][]} */ + const dnf = [] + for (const [bStart, bEnd] of branchSpans) { + for (const branch of expandBranch(src, bStart, bEnd, lower)) { + dnf.push(branch) } + } + return dnf +} + +/** + * Return [start, end) spans of top-level alternation arms within src[start..end). + * Operators inside `(...)` groups or `[...]` character classes are NOT splits. + * + * @param {string} src + * @param {number} start + * @param {number} end + * @returns {Array<[number, number]>} + */ +function splitTopLevelAlternation(src, start, end) { + /** @type {Array<[number, number]>} */ + const spans = [] + let bStart = start + let depth = 0 + let i = start + while (i < end) { + const c = src[i] + if (c === '\\') { i += 2; continue } + if (c === '[') { i = skipClass(src, i); continue } + if (c === '(') { depth += 1; i += 1; continue } + if (c === ')') { depth -= 1; i += 1; continue } if (c === '|' && depth === 0) { - branches.push(source.slice(start, i)) - start = i + 1 + spans.push([bStart, i]) + bStart = i + 1 } i += 1 } - branches.push(source.slice(start)) - return branches + spans.push([bStart, end]) + return spans } /** - * Extract mandatory literals from a single regex branch (no top-level `|`). + * Expand a single (no top-level `|`) regex branch into a DNF of literal sets. + * + * Walks the branch building a sequence of "atoms": literal runs and sub-DNFs + * from groups. Then cross-products the atoms into the branch's final DNF. * * @param {string} src + * @param {number} start + * @param {number} end * @param {boolean} lower - * @returns {string[]} + * @returns {string[][]} */ -function literalsInBranch(src, lower) { - /** @type {string[]} */ - const literals = [] +function expandBranch(src, start, end, lower) { + /** @type {Array<{ lit: string } | { dnf: string[][] }>} */ + const atoms = [] let run = '' /** * @returns {void} */ function flushRun() { - if (run.length > 0) literals.push(run) + if (run.length > 0) atoms.push({ lit: run }) run = '' } - let i = 0 - while (i < src.length) { + let i = start + while (i < end) { const c = src[i] // Zero-width anchors don't affect the literal run. @@ -137,11 +157,17 @@ function literalsInBranch(src, lower) { continue } - // Group (...) — too varied to recurse into safely; treat as a wildcard. + // Group (...) — recurse for full DNF expansion if (c === '(') { flushRun() - i = skipGroup(src, i) - i = skipQuantifier(src, i) + const groupEnd = skipGroup(src, i) + const innerStart = i + 1 + const innerEnd = groupEnd - 1 + const q = peekQuantifier(src, groupEnd) + i = q.end + // Optional group — skip entirely (treat as wildcard) + if (q.copies === 0) continue + atoms.push({ dnf: expandRegion(src, innerStart, innerEnd, lower) }) continue } @@ -165,7 +191,28 @@ function literalsInBranch(src, lower) { } } flushRun() - return literals + + // Cross-product the atoms into a DNF. + /** @type {string[][]} */ + let cross = [[]] + for (const atom of atoms) { + if ('lit' in atom) { + cross = cross.map(b => b.concat(atom.lit)) + continue + } + const innerDNF = atom.dnf + // If expanding would blow past MAX_DNF, treat this group as a wildcard. + if (cross.length * innerDNF.length > MAX_DNF) continue + /** @type {string[][]} */ + const expanded = [] + for (const existing of cross) { + for (const innerBranch of innerDNF) { + expanded.push(existing.concat(innerBranch)) + } + } + cross = expanded + } + return cross } /** diff --git a/test/regex.test.js b/test/regex.test.js index 16b8d94..46ba091 100644 --- a/test/regex.test.js +++ b/test/regex.test.js @@ -41,9 +41,39 @@ describe('extractRegexLiterals', () => { expect(extractRegexLiterals(/foo[xy]bar/)).toEqual([['foo', 'bar']]) }) - it('skips groups conservatively', () => { - expect(extractRegexLiterals(/(foo|bar)xyz/)).toEqual([['xyz']]) - expect(extractRegexLiterals(/abc(foo)def/)).toEqual([['abc', 'def']]) + it('recurses into groups for DNF expansion', () => { + expect(extractRegexLiterals(/(foo|bar)xyz/)).toEqual([['foo', 'xyz'], ['bar', 'xyz']]) + expect(extractRegexLiterals(/abc(foo)def/)).toEqual([['abc', 'foo', 'def']]) + expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([ + ['abc', 'foo', 'def'], + ['abc', 'bar', 'def'], + ]) + }) + + it('skips optional groups (treats as wildcard)', () => { + expect(extractRegexLiterals(/abc(foo|bar)?def/)).toEqual([['abc', 'def']]) + expect(extractRegexLiterals(/abc(foo)*def/)).toEqual([['abc', 'def']]) + }) + + it('cross-products multiple alternation groups', () => { + expect(extractRegexLiterals(/(a|b)(c|d)/)).toEqual([ + ['a', 'c'], ['a', 'd'], ['b', 'c'], ['b', 'd'], + ]) + }) + + it('treats groups as wildcards when DNF would exceed cap', () => { + // 2 * 2 * 2 * 2 * 2 * 2 = 64 > 32 cap → drops the offending group + const result = extractRegexLiterals(/(a|b)(c|d)(e|f)(g|h)(i|j)(k|l)needle/) + // Should still include the 'needle' literal in every branch + expect(result.length).toBeLessThanOrEqual(32) + expect(result.every(b => b.includes('needle'))).toBe(true) + }) + + it('handles empty branch in group (matches anything)', () => { + expect(extractRegexLiterals(/abc(foo|)def/)).toEqual([ + ['abc', 'foo', 'def'], + ['abc', 'def'], + ]) }) it('splits top-level alternation into branches', () => { @@ -56,8 +86,12 @@ describe('extractRegexLiterals', () => { expect(extractRegexLiterals(/foo|\w+/)).toEqual([['foo'], []]) }) - it('does not split | inside groups or char classes', () => { - expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([['abc', 'def']]) + it('does not split | inside groups or char classes at the top level', () => { + // Top level | is split; the | inside the group is handled by group recursion + expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([ + ['abc', 'foo', 'def'], + ['abc', 'bar', 'def'], + ]) expect(extractRegexLiterals(/abc[|]def/)).toEqual([['abc', 'def']]) }) From 53e8dd45fb02b2368c1290d9be86fd9facdc6378 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 21:43:34 -0700 Subject: [PATCH 2/3] Fold small literal char classes into the regex literal run Negated, ranged, special-escape, and multi-char-quantifier classes still fall back to wildcard treatment. /serv[ei]rless/ now prunes to 'serverless' or 'servirless' instead of just 'rless'. --- ARCH.md | 2 +- src/regex.js | 154 ++++++++++++++++++++++++++++++++------------- test/regex.test.js | 33 ++++++++-- 3 files changed, 140 insertions(+), 49 deletions(-) diff --git a/ARCH.md b/ARCH.md index fc9f971..a5ec829 100644 --- a/ARCH.md +++ b/ARCH.md @@ -125,7 +125,7 @@ Search options: `--limit N`, `--index `, `-c`/`--count`, `-i`/`--ignore-ca 2. **Common-everywhere words** (e.g. `wikipedia` in a Wikipedia dump) match every block; the source transfer is unavoidable because every row really does match. No n-gram strategy fixes this. 3. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries that match a small number of rows spread across many blocks. 4. **No-limit queries on dense matches do more work than they need to.** When every block matches (e.g. `wikipedia` against a Wikipedia dump), exhausting all matches takes ~24 s of CPU because we process blocks one at a time. The bytes are minimal; the time is CPU/parsing overhead. Real clients should pass a `limit`. -5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are now recursed into and cross-producted, so `/abc(foo|bar)def/` correctly yields `{abc, foo, def} ∨ {abc, bar, def}`. Deeply nested alternation that would exceed 32 branches drops the offending group conservatively (treats it as a wildcard for that position) — correct, just looser pruning. Real-world bytes gains vary with data co-occurrence: `/(eigen|petri)(value|chor)/` against Wikipedia goes from a full scan (~615 MB) to 247 MB; queries where the alternation words tightly co-occur with the surrounding context (e.g. `(machine|deep) learning`) see no improvement because the constraint was already implicit in the data. +5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are now recursed into and cross-producted, so `/abc(foo|bar)def/` correctly yields `{abc, foo, def} ∨ {abc, bar, def}`. Small literal-only character classes (no negation, no ranges, no `\w`/`\d`, single-occurrence) are folded into the surrounding run — `/serv[ei]rless/` becomes `{serverless} ∨ {servirless}`, cutting Wikipedia transfer from 333 MB to 0 MB. Negated classes, ranges, `\d`/`\w`, and multi-char quantifiers like `[ab]{3}` (which could match `aab`) fall back to wildcard treatment to stay correct. Deeply nested expansions that would exceed 32 branches drop the offending construct conservatively. 6. **Index files are not back-compatible.** `hypgrep.version` exists but we don't ship a multi-version reader. Reasonable for a 0.x package. ## Dependencies diff --git a/src/regex.js b/src/regex.js index e5a2b7e..3472525 100644 --- a/src/regex.js +++ b/src/regex.js @@ -14,15 +14,20 @@ * Conservative: we'd rather miss a safe literal (and scan more blocks) than * emit a false-positive that would let us skip a block that actually matches. * - * Groups with alternation are expanded into DNF (cross-product). To bound the - * worst case, expansion stops growing past MAX_DNF branches — beyond that, - * the offending group is treated as a wildcard for that position. + * - Top-level `|` → branch per arm. + * - Groups with alternation → cross-producted into DNF (independent literals). + * - Small literal-only character classes → folded into the surrounding run + * (`/serv[ei]rless/` → `[['serverless'], ['servirless']]`). + * + * Expansion stops growing past MAX_DNF branches — beyond that, the offending + * construct is treated as a wildcard for that position. * * Examples: * /foo/ → [['foo']] * /foo.bar/ → [['foo', 'bar']] * /foo|bar/ → [['foo'], ['bar']] * /abc(foo|bar)def/ → [['abc','foo','def'], ['abc','bar','def']] + * /serv[ei]rless/ → [['serverless'], ['servirless']] * /foo|.+/ → [['foo'], []] (second branch matches anything) * /./ → [[]] * @@ -89,11 +94,15 @@ function splitTopLevelAlternation(src, start, end) { return spans } +/** @typedef {{ literals: string[], run: string }} LiveBranch */ + /** * Expand a single (no top-level `|`) regex branch into a DNF of literal sets. * - * Walks the branch building a sequence of "atoms": literal runs and sub-DNFs - * from groups. Then cross-products the atoms into the branch's final DNF. + * Maintains a set of "live branches", each with a current literal run and the + * literals that have already been flushed. Character classes fork branches + * with the char appended to the run (so adjacent literals stay joined). + * Groups, dots, and special escapes flush the run and break the sequence. * * @param {string} src * @param {number} start @@ -102,16 +111,21 @@ function splitTopLevelAlternation(src, start, end) { * @returns {string[][]} */ function expandBranch(src, start, end, lower) { - /** @type {Array<{ lit: string } | { dnf: string[][] }>} */ - const atoms = [] - let run = '' + /** @type {LiveBranch[]} */ + let cross = [{ literals: [], run: '' }] + /** * @returns {void} */ - function flushRun() { - if (run.length > 0) atoms.push({ lit: run }) - run = '' + function flushRuns() { + for (const b of cross) { + if (b.run.length > 0) { + b.literals.push(b.run) + b.run = '' + } + } } + let i = start while (i < end) { const c = src[i] @@ -131,7 +145,7 @@ function expandBranch(src, start, end, lower) { } // \d \w \s \b etc. — a special class, not a literal char if (/[a-zA-Z]/.test(next)) { - flushRun() + flushRuns() i = skipQuantifier(src, i + 2) continue } @@ -140,40 +154,74 @@ function expandBranch(src, start, end, lower) { const q = peekQuantifier(src, i) i = q.end if (q.copies === 0) { - flushRun() + flushRuns() } else { const ch = lower ? next.toLowerCase() : next - run += ch.repeat(q.copies) - if (q.openEnded) flushRun() + for (const b of cross) b.run += ch.repeat(q.copies) + if (q.openEnded) flushRuns() } continue } // Character class [...] if (c === '[') { - flushRun() - i = skipClass(src, i) - i = skipQuantifier(src, i) + const classEnd = skipClass(src, i) + const chars = parseLiteralClass(src, i, classEnd, lower) + const q = peekQuantifier(src, classEnd) + i = q.end + if (q.copies === 0) { + flushRuns() + continue + } + // Treat as wildcard if: not a small literal-only class, or quantifier + // could vary the chars across positions, or expansion would exceed cap. + if (chars === null || q.copies > 1 || cross.length * chars.length > MAX_DNF) { + flushRuns() + continue + } + // Fork each live branch by char alternatives, appending to the run. + /** @type {LiveBranch[]} */ + const expanded = [] + for (const b of cross) { + for (const ch of chars) { + expanded.push({ literals: b.literals.slice(), run: b.run + ch }) + } + } + cross = expanded + if (q.openEnded) flushRuns() continue } // Group (...) — recurse for full DNF expansion if (c === '(') { - flushRun() const groupEnd = skipGroup(src, i) const innerStart = i + 1 const innerEnd = groupEnd - 1 const q = peekQuantifier(src, groupEnd) i = q.end - // Optional group — skip entirely (treat as wildcard) - if (q.copies === 0) continue - atoms.push({ dnf: expandRegion(src, innerStart, innerEnd, lower) }) + // Optional group — flush and skip (treat as wildcard) + if (q.copies === 0) { + flushRuns() + continue + } + // Group contents are independent literals — flush the current run first + flushRuns() + const innerDNF = expandRegion(src, innerStart, innerEnd, lower) + if (cross.length * innerDNF.length > MAX_DNF) continue + /** @type {LiveBranch[]} */ + const expanded = [] + for (const b of cross) { + for (const innerBranch of innerDNF) { + expanded.push({ literals: b.literals.concat(innerBranch), run: '' }) + } + } + cross = expanded continue } // Dot if (c === '.') { - flushRun() + flushRuns() i = skipQuantifier(src, i + 1) continue } @@ -183,36 +231,54 @@ function expandBranch(src, start, end, lower) { const q = peekQuantifier(src, i) i = q.end if (q.copies === 0) { - flushRun() + flushRuns() } else { const ch = lower ? c.toLowerCase() : c - run += ch.repeat(q.copies) - if (q.openEnded) flushRun() + for (const b of cross) b.run += ch.repeat(q.copies) + if (q.openEnded) flushRuns() } } - flushRun() + flushRuns() - // Cross-product the atoms into a DNF. - /** @type {string[][]} */ - let cross = [[]] - for (const atom of atoms) { - if ('lit' in atom) { - cross = cross.map(b => b.concat(atom.lit)) + return cross.map(b => b.literals) +} + +/** + * Parse a `[...]` character class as a list of literal alternatives. Returns + * null if the class is negated, contains a range, contains a special escape + * (`\d`, `\w`, …), or is empty — i.e., not safely foldable into a literal run. + * + * @param {string} src + * @param {number} start + * @param {number} end + * @param {boolean} lower + * @returns {string[] | null} + */ +function parseLiteralClass(src, start, end, lower) { + // src[start] === '[' and end is the index after ']' + let i = start + 1 + const lastInside = end - 1 // position of ']' + if (src[i] === '^') return null + /** @type {Set} */ + const chars = new Set() + while (i < lastInside) { + if (src[i] === '\\') { + const next = src[i + 1] + if (next === undefined) return null + if (/[a-zA-Z]/.test(next)) return null // \d, \w, \s, etc. + chars.add(lower ? next.toLowerCase() : next) + i += 2 continue } - const innerDNF = atom.dnf - // If expanding would blow past MAX_DNF, treat this group as a wildcard. - if (cross.length * innerDNF.length > MAX_DNF) continue - /** @type {string[][]} */ - const expanded = [] - for (const existing of cross) { - for (const innerBranch of innerDNF) { - expanded.push(existing.concat(innerBranch)) - } + // Detect range: a-b where '-' is not the last char in the class + if (i + 2 < lastInside && src[i + 1] === '-') { + return null } - cross = expanded + chars.add(lower ? src[i].toLowerCase() : src[i]) + i += 1 } - return cross + if (chars.size === 0) return null + return [...chars] } /** diff --git a/test/regex.test.js b/test/regex.test.js index 46ba091..f55c0c7 100644 --- a/test/regex.test.js +++ b/test/regex.test.js @@ -36,9 +36,33 @@ describe('extractRegexLiterals', () => { expect(extractRegexLiterals(/abc{1,3}def/)).toEqual([['abc', 'def']]) }) - it('skips character classes', () => { - expect(extractRegexLiterals(/[xy]def/)).toEqual([['def']]) - expect(extractRegexLiterals(/foo[xy]bar/)).toEqual([['foo', 'bar']]) + it('folds small literal char classes into the run', () => { + expect(extractRegexLiterals(/[xy]def/).sort()).toEqual([['xdef'], ['ydef']].sort()) + expect(extractRegexLiterals(/foo[xy]bar/).sort()).toEqual([['fooxbar'], ['fooybar']].sort()) + expect(extractRegexLiterals(/serv[ei]rless/).sort()).toEqual([['serverless'], ['servirless']].sort()) + }) + + it('treats negated/ranged/special char classes as wildcards', () => { + // negated + expect(extractRegexLiterals(/foo[^xy]bar/)).toEqual([['foo', 'bar']]) + // range + expect(extractRegexLiterals(/foo[a-z]bar/)).toEqual([['foo', 'bar']]) + // special escape + expect(extractRegexLiterals(/foo[\w]bar/)).toEqual([['foo', 'bar']]) + }) + + it('does not fold char classes under multi-char quantifiers', () => { + // [ab]{3} could match 'aab' — folding to 'aaa'/'bbb' would be a false miss + expect(extractRegexLiterals(/foo[ab]{3}bar/)).toEqual([['foo', 'bar']]) + // optional class: flush, no fold + expect(extractRegexLiterals(/foo[ab]?bar/)).toEqual([['foo', 'bar']]) + }) + + it('folds char class once under + (one mandatory occurrence)', () => { + // [ab]+ guarantees at least one 'a' or 'b'; further chars unknown + expect(extractRegexLiterals(/foo[ab]+bar/).sort()).toEqual([ + ['fooa', 'bar'], ['foob', 'bar'], + ].sort()) }) it('recurses into groups for DNF expansion', () => { @@ -92,7 +116,8 @@ describe('extractRegexLiterals', () => { ['abc', 'foo', 'def'], ['abc', 'bar', 'def'], ]) - expect(extractRegexLiterals(/abc[|]def/)).toEqual([['abc', 'def']]) + // `|` inside a char class is a literal char and folds into the run + expect(extractRegexLiterals(/abc[|]def/)).toEqual([['abc|def']]) }) it('ignores anchors', () => { From 97f48db869c6b0690489f19e68a9f80f60c267d4 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 24 May 2026 21:47:18 -0700 Subject: [PATCH 3/3] Fold single-literal group branches into the regex run When every inner branch of a group is a single contiguous literal, the group concats into the surrounding run instead of standing alone. /(eigen|petri)(value|chor)/ now extracts the four full words; Wikipedia transfer drops from 247 MB to 51 MB. --- ARCH.md | 6 +++++- src/regex.js | 22 ++++++++++++++++++++-- test/regex.test.js | 31 ++++++++++++++++++++++--------- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/ARCH.md b/ARCH.md index a5ec829..72d906a 100644 --- a/ARCH.md +++ b/ARCH.md @@ -125,7 +125,11 @@ Search options: `--limit N`, `--index `, `-c`/`--count`, `-i`/`--ignore-ca 2. **Common-everywhere words** (e.g. `wikipedia` in a Wikipedia dump) match every block; the source transfer is unavoidable because every row really does match. No n-gram strategy fixes this. 3. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries that match a small number of rows spread across many blocks. 4. **No-limit queries on dense matches do more work than they need to.** When every block matches (e.g. `wikipedia` against a Wikipedia dump), exhausting all matches takes ~24 s of CPU because we process blocks one at a time. The bytes are minimal; the time is CPU/parsing overhead. Real clients should pass a `limit`. -5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are now recursed into and cross-producted, so `/abc(foo|bar)def/` correctly yields `{abc, foo, def} ∨ {abc, bar, def}`. Small literal-only character classes (no negation, no ranges, no `\w`/`\d`, single-occurrence) are folded into the surrounding run — `/serv[ei]rless/` becomes `{serverless} ∨ {servirless}`, cutting Wikipedia transfer from 333 MB to 0 MB. Negated classes, ranges, `\d`/`\w`, and multi-char quantifiers like `[ab]{3}` (which could match `aab`) fall back to wildcard treatment to stay correct. Deeply nested expansions that would exceed 32 branches drop the offending construct conservatively. +5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are recursed into and cross-producted, so `/abc(foo|bar)def/` yields the tight `{abcfoodef} ∨ {abcbardef}`. Two flavors of fold keep adjacent literals contiguous so the n-gram extractor can prune them: + - **Char class fold**: small literal-only classes (no negation, no ranges, no `\w`/`\d`, single occurrence) concat into the run — `/serv[ei]rless/` becomes `{serverless} ∨ {servirless}` (333 MB → 0 MB on Wikipedia). + - **Group fold**: when every inner branch is a single contiguous literal, the group also concats — `/(eigen|petri)(value|chor)/` becomes `{eigenvalue} ∨ {eigenchor} ∨ {petrivalue} ∨ {petrichor}` (615 MB full scan → 51 MB). + + Negated classes, ranges, `\d`/`\w`, multi-char quantifiers like `[ab]{3}` (which could match `aab`), and groups whose inner branches contain wildcards or multiple literals fall back to wildcard or independent-literal treatment to stay correct. Deeply nested expansions that would exceed 32 branches drop the offending construct conservatively. 6. **Index files are not back-compatible.** `hypgrep.version` exists but we don't ship a multi-version reader. Reasonable for a 0.x package. ## Dependencies diff --git a/src/regex.js b/src/regex.js index 3472525..663361a 100644 --- a/src/regex.js +++ b/src/regex.js @@ -204,9 +204,27 @@ function expandBranch(src, start, end, lower) { flushRuns() continue } - // Group contents are independent literals — flush the current run first - flushRuns() const innerDNF = expandRegion(src, innerStart, innerEnd, lower) + // Foldable: every inner branch is a single contiguous literal and the + // group occurs exactly once. Same safety reasoning as char-class fold — + // multi-occurrence (`(a|b){3}`) could vary across positions. + const foldable = q.copies === 1 && + innerDNF.length > 0 && + innerDNF.every(b => b.length === 1) + if (foldable && cross.length * innerDNF.length <= MAX_DNF) { + /** @type {LiveBranch[]} */ + const expanded = [] + for (const b of cross) { + for (const innerBranch of innerDNF) { + expanded.push({ literals: b.literals.slice(), run: b.run + innerBranch[0] }) + } + } + cross = expanded + if (q.openEnded) flushRuns() + continue + } + // Not foldable — treat group contents as independent literals. + flushRuns() if (cross.length * innerDNF.length > MAX_DNF) continue /** @type {LiveBranch[]} */ const expanded = [] diff --git a/test/regex.test.js b/test/regex.test.js index f55c0c7..477091b 100644 --- a/test/regex.test.js +++ b/test/regex.test.js @@ -65,12 +65,24 @@ describe('extractRegexLiterals', () => { ].sort()) }) - it('recurses into groups for DNF expansion', () => { - expect(extractRegexLiterals(/(foo|bar)xyz/)).toEqual([['foo', 'xyz'], ['bar', 'xyz']]) - expect(extractRegexLiterals(/abc(foo)def/)).toEqual([['abc', 'foo', 'def']]) + it('folds single-literal groups into the surrounding run', () => { + // Single-literal-branch groups concat into the run like char classes + expect(extractRegexLiterals(/(foo|bar)xyz/)).toEqual([['fooxyz'], ['barxyz']]) + expect(extractRegexLiterals(/abc(foo)def/)).toEqual([['abcfoodef']]) expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([ - ['abc', 'foo', 'def'], - ['abc', 'bar', 'def'], + ['abcfoodef'], + ['abcbardef'], + ]) + expect(extractRegexLiterals(/wikip(e|a)dia/)).toEqual([['wikipedia'], ['wikipadia']]) + }) + + it('keeps multi-literal group branches as independent literals', () => { + // `(ab.cd|xy)` cannot fold — the dot splits the first branch into two + // literals, so contiguous concat isn't possible for that branch. Conservative + // all-or-nothing: every inner branch is treated as independent. + expect(extractRegexLiterals(/x(ab.cd|xy)y/)).toEqual([ + ['x', 'ab', 'cd', 'y'], + ['x', 'xy', 'y'], ]) }) @@ -80,8 +92,9 @@ describe('extractRegexLiterals', () => { }) it('cross-products multiple alternation groups', () => { + // Both groups are single-literal foldable, so they concat into one run expect(extractRegexLiterals(/(a|b)(c|d)/)).toEqual([ - ['a', 'c'], ['a', 'd'], ['b', 'c'], ['b', 'd'], + ['ac'], ['ad'], ['bc'], ['bd'], ]) }) @@ -111,10 +124,10 @@ describe('extractRegexLiterals', () => { }) it('does not split | inside groups or char classes at the top level', () => { - // Top level | is split; the | inside the group is handled by group recursion + // Top level | is split; the | inside the group is handled by group recursion + fold expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([ - ['abc', 'foo', 'def'], - ['abc', 'bar', 'def'], + ['abcfoodef'], + ['abcbardef'], ]) // `|` inside a char class is a literal char and folds into the run expect(extractRegexLiterals(/abc[|]def/)).toEqual([['abc|def']])