From 365a4b88acf99d646613da6f631eddae40e69816 Mon Sep 17 00:00:00 2001
From: Kenny Daniel <platypii@gmail.com>
Date: Sun, 24 May 2026 21:37:24 -0700
Subject: [PATCH 1/3] Recurse into groups for regex DNF expansion

Cross-product nested alternation up to MAX_DNF=32 branches; beyond
that, drop the offending group conservatively (treat as wildcard).
---
 ARCH.md            |   2 +-
 src/regex.js       | 153 +++++++++++++++++++++++++++++----------------
 test/regex.test.js |  44 +++++++++++--
 3 files changed, 140 insertions(+), 59 deletions(-)
diff --git a/ARCH.md b/ARCH.md
index f20b22f..fc9f971 100644
--- a/ARCH.md
+++ b/ARCH.md
@@ -125,7 +125,7 @@ Search options: `--limit N`, `--index <path>`, `-c`/`--count`, `-i`/`--ignore-ca
 2. **Common-everywhere words** (e.g. `wikipedia` in a Wikipedia dump) match every block; the source transfer is unavoidable because every row really does match. No n-gram strategy fixes this.
 3. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries that match a small number of rows spread across many blocks.
 4. **No-limit queries on dense matches do more work than they need to.** When every block matches (e.g. `wikipedia` against a Wikipedia dump), exhausting all matches takes ~24 s of CPU because we process blocks one at a time. The bytes are minimal; the time is CPU/parsing overhead. Real clients should pass a `limit`.
-5. **Regex literal extractor is conservative inside groups.** Alternation inside `(...)` groups is under-extracted: `/abc(foo|bar)def/` yields `['abc', 'def']` instead of the tighter DNF `{abc, foo, def} ∨ {abc, bar, def}`. Top-level alternation IS expanded into branches (Zoekt-style OR at the index level). Recursing into groups for full DNF expansion is the next step; bounded DNF size will need a cap.
+5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are now recursed into and cross-producted, so `/abc(foo|bar)def/` correctly yields `{abc, foo, def} ∨ {abc, bar, def}`. Deeply nested alternation that would exceed 32 branches drops the offending group conservatively (treats it as a wildcard for that position) — correct, just looser pruning. Real-world bytes gains vary with data co-occurrence: `/(eigen|petri)(value|chor)/` against Wikipedia goes from a full scan (~615 MB) to 247 MB; queries where the alternation words tightly co-occur with the surrounding context (e.g. `(machine|deep) learning`) see no improvement because the constraint was already implicit in the data.
 6. **Index files are not back-compatible.** `hypgrep.version` exists but we don't ship a multi-version reader. Reasonable for a 0.x package.
 
 ## Dependencies
diff --git a/src/regex.js b/src/regex.js
index 3ce22bb..e5a2b7e 100644
--- a/src/regex.js
+++ b/src/regex.js
@@ -14,86 +14,106 @@
  * Conservative: we'd rather miss a safe literal (and scan more blocks) than
  * emit a false-positive that would let us skip a block that actually matches.
  *
+ * Groups with alternation are expanded into DNF (cross-product). To bound the
+ * worst case, expansion stops growing past MAX_DNF branches — beyond that,
+ * the offending group is treated as a wildcard for that position.
+ *
  * Examples:
- *   /foo/           → [['foo']]
- *   /foo.bar/       → [['foo', 'bar']]
- *   /foo|bar/       → [['foo'], ['bar']]
- *   /abc(foo|bar)/  → [['abc']]      (inner alternation under-extracted)
- *   /foo|.+/        → [['foo'], []]  (second branch matches anything)
- *   /./             → [[]]
+ *   /foo/             → [['foo']]
+ *   /foo.bar/         → [['foo', 'bar']]
+ *   /foo|bar/         → [['foo'], ['bar']]
+ *   /abc(foo|bar)def/ → [['abc','foo','def'], ['abc','bar','def']]
+ *   /foo|.+/          → [['foo'], []]  (second branch matches anything)
+ *   /./               → [[]]
  *
  * @param {RegExp} regex
  * @returns {string[][]}
  */
 export function extractRegexLiterals(regex) {
   const lower = regex.flags.includes('i')
-  const branchSources = splitTopLevelAlternation(regex.source)
-  return branchSources.map(src => literalsInBranch(src, lower))
+  return expandRegion(regex.source, 0, regex.source.length, lower)
 }
 
+const MAX_DNF = 32
+
 /**
- * Split a regex source on top-level `|` alternation. Operators inside `(...)`
- * groups or `[...]` character classes are NOT splits.
+ * Expand a (sub-)region of regex source into a DNF of literal branches.
+ * Splits on top-level alternation first, then expands each branch.
  *
- * @param {string} source
- * @returns {string[]}
+ * @param {string} src
+ * @param {number} start
+ * @param {number} end
+ * @param {boolean} lower
+ * @returns {string[][]}
  */
-function splitTopLevelAlternation(source) {
-  /** @type {string[]} */
-  const branches = []
-  let start = 0
-  let depth = 0
-  let i = 0
-  while (i < source.length) {
-    const c = source[i]
-    if (c === '\\') {
-      i += 2
-      continue
-    }
-    if (c === '[') {
-      i = skipClass(source, i)
-      continue
-    }
-    if (c === '(') {
-      depth += 1
-      i += 1
-      continue
-    }
-    if (c === ')') {
-      depth -= 1
-      i += 1
-      continue
+function expandRegion(src, start, end, lower) {
+  const branchSpans = splitTopLevelAlternation(src, start, end)
+  /** @type {string[][]} */
+  const dnf = []
+  for (const [bStart, bEnd] of branchSpans) {
+    for (const branch of expandBranch(src, bStart, bEnd, lower)) {
+      dnf.push(branch)
     }
+  }
+  return dnf
+}
+
+/**
+ * Return [start, end) spans of top-level alternation arms within src[start..end).
+ * Operators inside `(...)` groups or `[...]` character classes are NOT splits.
+ *
+ * @param {string} src
+ * @param {number} start
+ * @param {number} end
+ * @returns {Array<[number, number]>}
+ */
+function splitTopLevelAlternation(src, start, end) {
+  /** @type {Array<[number, number]>} */
+  const spans = []
+  let bStart = start
+  let depth = 0
+  let i = start
+  while (i < end) {
+    const c = src[i]
+    if (c === '\\') { i += 2; continue }
+    if (c === '[') { i = skipClass(src, i); continue }
+    if (c === '(') { depth += 1; i += 1; continue }
+    if (c === ')') { depth -= 1; i += 1; continue }
     if (c === '|' && depth === 0) {
-      branches.push(source.slice(start, i))
-      start = i + 1
+      spans.push([bStart, i])
+      bStart = i + 1
     }
     i += 1
   }
-  branches.push(source.slice(start))
-  return branches
+  spans.push([bStart, end])
+  return spans
 }
 
 /**
- * Extract mandatory literals from a single regex branch (no top-level `|`).
+ * Expand a single (no top-level `|`) regex branch into a DNF of literal sets.
+ *
+ * Walks the branch building a sequence of "atoms": literal runs and sub-DNFs
+ * from groups. Then cross-products the atoms into the branch's final DNF.
  *
  * @param {string} src
+ * @param {number} start
+ * @param {number} end
  * @param {boolean} lower
- * @returns {string[]}
+ * @returns {string[][]}
  */
-function literalsInBranch(src, lower) {
-  /** @type {string[]} */
-  const literals = []
+function expandBranch(src, start, end, lower) {
+  /** @type {Array<{ lit: string } | { dnf: string[][] }>} */
+  const atoms = []
   let run = ''
   /**
    * @returns {void}
    */
   function flushRun() {
-    if (run.length > 0) literals.push(run)
+    if (run.length > 0) atoms.push({ lit: run })
     run = ''
   }
-  let i = 0
-  while (i < src.length) {
+  let i = start
+  while (i < end) {
     const c = src[i]
 
     // Zero-width anchors don't affect the literal run.
@@ -137,11 +157,17 @@ function literalsInBranch(src, lower) {
       continue
     }
 
-    // Group (...) — too varied to recurse into safely; treat as a wildcard.
+    // Group (...) — recurse for full DNF expansion
     if (c === '(') {
       flushRun()
-      i = skipGroup(src, i)
-      i = skipQuantifier(src, i)
+      const groupEnd = skipGroup(src, i)
+      const innerStart = i + 1
+      const innerEnd = groupEnd - 1
+      const q = peekQuantifier(src, groupEnd)
+      i = q.end
+      // Optional group — skip entirely (treat as wildcard)
+      if (q.copies === 0) continue
+      atoms.push({ dnf: expandRegion(src, innerStart, innerEnd, lower) })
       continue
     }
 
@@ -165,7 +191,28 @@ function literalsInBranch(src, lower) {
     }
   }
   flushRun()
-  return literals
+
+  // Cross-product the atoms into a DNF.
+  /** @type {string[][]} */
+  let cross = [[]]
+  for (const atom of atoms) {
+    if ('lit' in atom) {
+      cross = cross.map(b => b.concat(atom.lit))
+      continue
+    }
+    const innerDNF = atom.dnf
+    // If expanding would blow past MAX_DNF, treat this group as a wildcard.
+    if (cross.length * innerDNF.length > MAX_DNF) continue
+    /** @type {string[][]} */
+    const expanded = []
+    for (const existing of cross) {
+      for (const innerBranch of innerDNF) {
+        expanded.push(existing.concat(innerBranch))
+      }
+    }
+    cross = expanded
+  }
+  return cross
 }
 
 /**
diff --git a/test/regex.test.js b/test/regex.test.js
index 16b8d94..46ba091 100644
--- a/test/regex.test.js
+++ b/test/regex.test.js
@@ -41,9 +41,39 @@ describe('extractRegexLiterals', () => {
     expect(extractRegexLiterals(/foo[xy]bar/)).toEqual([['foo', 'bar']])
   })
 
-  it('skips groups conservatively', () => {
-    expect(extractRegexLiterals(/(foo|bar)xyz/)).toEqual([['xyz']])
-    expect(extractRegexLiterals(/abc(foo)def/)).toEqual([['abc', 'def']])
+  it('recurses into groups for DNF expansion', () => {
+    expect(extractRegexLiterals(/(foo|bar)xyz/)).toEqual([['foo', 'xyz'], ['bar', 'xyz']])
+    expect(extractRegexLiterals(/abc(foo)def/)).toEqual([['abc', 'foo', 'def']])
+    expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([
+      ['abc', 'foo', 'def'],
+      ['abc', 'bar', 'def'],
+    ])
+  })
+
+  it('skips optional groups (treats as wildcard)', () => {
+    expect(extractRegexLiterals(/abc(foo|bar)?def/)).toEqual([['abc', 'def']])
+    expect(extractRegexLiterals(/abc(foo)*def/)).toEqual([['abc', 'def']])
+  })
+
+  it('cross-products multiple alternation groups', () => {
+    expect(extractRegexLiterals(/(a|b)(c|d)/)).toEqual([
+      ['a', 'c'], ['a', 'd'], ['b', 'c'], ['b', 'd'],
+    ])
+  })
+
+  it('treats groups as wildcards when DNF would exceed cap', () => {
+    // 2 * 2 * 2 * 2 * 2 * 2 = 64 > 32 cap → drops the offending group
+    const result = extractRegexLiterals(/(a|b)(c|d)(e|f)(g|h)(i|j)(k|l)needle/)
+    // Should still include the 'needle' literal in every branch
+    expect(result.length).toBeLessThanOrEqual(32)
+    expect(result.every(b => b.includes('needle'))).toBe(true)
+  })
+
+  it('handles empty branch in group (matches anything)', () => {
+    expect(extractRegexLiterals(/abc(foo|)def/)).toEqual([
+      ['abc', 'foo', 'def'],
+      ['abc', 'def'],
+    ])
   })
 
   it('splits top-level alternation into branches', () => {
@@ -56,8 +86,12 @@ describe('extractRegexLiterals', () => {
     expect(extractRegexLiterals(/foo|\w+/)).toEqual([['foo'], []])
   })
 
-  it('does not split | inside groups or char classes', () => {
-    expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([['abc', 'def']])
+  it('does not split | inside groups or char classes at the top level', () => {
+    // Top level | is split; the | inside the group is handled by group recursion
+    expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([
+      ['abc', 'foo', 'def'],
+      ['abc', 'bar', 'def'],
+    ])
     expect(extractRegexLiterals(/abc[|]def/)).toEqual([['abc', 'def']])
   })
 

From 53e8dd45fb02b2368c1290d9be86fd9facdc6378 Mon Sep 17 00:00:00 2001
From: Kenny Daniel <platypii@gmail.com>
Date: Sun, 24 May 2026 21:43:34 -0700
Subject: [PATCH 2/3] Fold small literal char classes into the regex literal
 run

Negated, ranged, special-escape, and multi-char-quantifier classes
still fall back to wildcard treatment. /serv[ei]rless/ now prunes to
'serverless' or 'servirless' instead of the separate 'serv' and 'rless'.
---
 ARCH.md            |   2 +-
 src/regex.js       | 154 ++++++++++++++++++++++++++++++++-------------
 test/regex.test.js |  33 ++++++++--
 3 files changed, 140 insertions(+), 49 deletions(-)

diff --git a/ARCH.md b/ARCH.md
index fc9f971..a5ec829 100644
--- a/ARCH.md
+++ b/ARCH.md
@@ -125,7 +125,7 @@ Search options: `--limit N`, `--index <path>`, `-c`/`--count`, `-i`/`--ignore-ca
 2. **Common-everywhere words** (e.g. `wikipedia` in a Wikipedia dump) match every block; the source transfer is unavoidable because every row really does match. No n-gram strategy fixes this.
 3. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries that match a small number of rows spread across many blocks.
 4. **No-limit queries on dense matches do more work than they need to.** When every block matches (e.g. `wikipedia` against a Wikipedia dump), exhausting all matches takes ~24 s of CPU because we process blocks one at a time. The bytes are minimal; the time is CPU/parsing overhead. Real clients should pass a `limit`.
-5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are now recursed into and cross-producted, so `/abc(foo|bar)def/` correctly yields `{abc, foo, def} ∨ {abc, bar, def}`. Deeply nested alternation that would exceed 32 branches drops the offending group conservatively (treats it as a wildcard for that position) — correct, just looser pruning. Real-world bytes gains vary with data co-occurrence: `/(eigen|petri)(value|chor)/` against Wikipedia goes from a full scan (~615 MB) to 247 MB; queries where the alternation words tightly co-occur with the surrounding context (e.g. `(machine|deep) learning`) see no improvement because the constraint was already implicit in the data.
+5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are now recursed into and cross-producted, so `/abc(foo|bar)def/` correctly yields `{abc, foo, def} ∨ {abc, bar, def}`. Small literal-only character classes (no negation, no ranges, no `\w`/`\d`, single-occurrence) are folded into the surrounding run — `/serv[ei]rless/` becomes `{serverless} ∨ {servirless}`, cutting Wikipedia transfer from 333 MB to 0 MB. Negated classes, ranges, `\d`/`\w`, and multi-char quantifiers like `[ab]{3}` (which could match `aab`) fall back to wildcard treatment to stay correct. Deeply nested expansions that would exceed 32 branches drop the offending construct conservatively.
 6. **Index files are not back-compatible.** `hypgrep.version` exists but we don't ship a multi-version reader. Reasonable for a 0.x package.
 
 ## Dependencies
diff --git a/src/regex.js b/src/regex.js
index e5a2b7e..3472525 100644
--- a/src/regex.js
+++ b/src/regex.js
@@ -14,15 +14,20 @@
  * Conservative: we'd rather miss a safe literal (and scan more blocks) than
  * emit a false-positive that would let us skip a block that actually matches.
  *
- * Groups with alternation are expanded into DNF (cross-product). To bound the
- * worst case, expansion stops growing past MAX_DNF branches — beyond that,
- * the offending group is treated as a wildcard for that position.
+ * - Top-level `|` → branch per arm.
+ * - Groups with alternation → cross-producted into DNF (independent literals).
+ * - Small literal-only character classes → folded into the surrounding run
+ *   (`/serv[ei]rless/` → `[['serverless'], ['servirless']]`).
+ *
+ * Expansion stops growing past MAX_DNF branches — beyond that, the offending
+ * construct is treated as a wildcard for that position.
  *
  * Examples:
  *   /foo/             → [['foo']]
  *   /foo.bar/         → [['foo', 'bar']]
  *   /foo|bar/         → [['foo'], ['bar']]
  *   /abc(foo|bar)def/ → [['abc','foo','def'], ['abc','bar','def']]
+ *   /serv[ei]rless/   → [['serverless'], ['servirless']]
  *   /foo|.+/          → [['foo'], []]  (second branch matches anything)
  *   /./               → [[]]
  *
@@ -89,11 +94,15 @@ function splitTopLevelAlternation(src, start, end) {
   return spans
 }
 
+/** @typedef {{ literals: string[], run: string }} LiveBranch */
+
 /**
  * Expand a single (no top-level `|`) regex branch into a DNF of literal sets.
  *
- * Walks the branch building a sequence of "atoms": literal runs and sub-DNFs
- * from groups. Then cross-products the atoms into the branch's final DNF.
+ * Maintains a set of "live branches", each with a current literal run and the
+ * literals that have already been flushed. Character classes fork branches
+ * with the char appended to the run (so adjacent literals stay joined).
+ * Groups, dots, and special escapes flush the run and break the sequence.
  *
  * @param {string} src
  * @param {number} start
@@ -102,16 +111,21 @@ function splitTopLevelAlternation(src, start, end) {
  * @returns {string[][]}
  */
 function expandBranch(src, start, end, lower) {
-  /** @type {Array<{ lit: string } | { dnf: string[][] }>} */
-  const atoms = []
-  let run = ''
+  /** @type {LiveBranch[]} */
+  let cross = [{ literals: [], run: '' }]
+
   /**
    * @returns {void}
    */
-  function flushRun() {
-    if (run.length > 0) atoms.push({ lit: run })
-    run = ''
+  function flushRuns() {
+    for (const b of cross) {
+      if (b.run.length > 0) {
+        b.literals.push(b.run)
+        b.run = ''
+      }
+    }
   }
+
   let i = start
   while (i < end) {
     const c = src[i]
@@ -131,7 +145,7 @@ function expandBranch(src, start, end, lower) {
       }
       // \d \w \s \b etc. — a special class, not a literal char
       if (/[a-zA-Z]/.test(next)) {
-        flushRun()
+        flushRuns()
         i = skipQuantifier(src, i + 2)
         continue
       }
@@ -140,40 +154,74 @@ function expandBranch(src, start, end, lower) {
       const q = peekQuantifier(src, i)
       i = q.end
       if (q.copies === 0) {
-        flushRun()
+        flushRuns()
       } else {
         const ch = lower ? next.toLowerCase() : next
-        run += ch.repeat(q.copies)
-        if (q.openEnded) flushRun()
+        for (const b of cross) b.run += ch.repeat(q.copies)
+        if (q.openEnded) flushRuns()
       }
       continue
     }
 
     // Character class [...]
     if (c === '[') {
-      flushRun()
-      i = skipClass(src, i)
-      i = skipQuantifier(src, i)
+      const classEnd = skipClass(src, i)
+      const chars = parseLiteralClass(src, i, classEnd, lower)
+      const q = peekQuantifier(src, classEnd)
+      i = q.end
+      if (q.copies === 0) {
+        flushRuns()
+        continue
+      }
+      // Treat as wildcard if: not a small literal-only class, or quantifier
+      // could vary the chars across positions, or expansion would exceed cap.
+      if (chars === null || q.copies > 1 || cross.length * chars.length > MAX_DNF) {
+        flushRuns()
+        continue
+      }
+      // Fork each live branch by char alternatives, appending to the run.
+      /** @type {LiveBranch[]} */
+      const expanded = []
+      for (const b of cross) {
+        for (const ch of chars) {
+          expanded.push({ literals: b.literals.slice(), run: b.run + ch })
+        }
+      }
+      cross = expanded
+      if (q.openEnded) flushRuns()
       continue
     }
 
     // Group (...) — recurse for full DNF expansion
     if (c === '(') {
-      flushRun()
       const groupEnd = skipGroup(src, i)
       const innerStart = i + 1
       const innerEnd = groupEnd - 1
       const q = peekQuantifier(src, groupEnd)
       i = q.end
-      // Optional group — skip entirely (treat as wildcard)
-      if (q.copies === 0) continue
-      atoms.push({ dnf: expandRegion(src, innerStart, innerEnd, lower) })
+      // Optional group — flush and skip (treat as wildcard)
+      if (q.copies === 0) {
+        flushRuns()
+        continue
+      }
+      // Group contents are independent literals — flush the current run first
+      flushRuns()
+      const innerDNF = expandRegion(src, innerStart, innerEnd, lower)
+      if (cross.length * innerDNF.length > MAX_DNF) continue
+      /** @type {LiveBranch[]} */
+      const expanded = []
+      for (const b of cross) {
+        for (const innerBranch of innerDNF) {
+          expanded.push({ literals: b.literals.concat(innerBranch), run: '' })
+        }
+      }
+      cross = expanded
       continue
     }
 
     // Dot
     if (c === '.') {
-      flushRun()
+      flushRuns()
       i = skipQuantifier(src, i + 1)
       continue
     }
@@ -183,36 +231,54 @@ function expandBranch(src, start, end, lower) {
     const q = peekQuantifier(src, i)
     i = q.end
     if (q.copies === 0) {
-      flushRun()
+      flushRuns()
     } else {
       const ch = lower ? c.toLowerCase() : c
-      run += ch.repeat(q.copies)
-      if (q.openEnded) flushRun()
+      for (const b of cross) b.run += ch.repeat(q.copies)
+      if (q.openEnded) flushRuns()
     }
   }
-  flushRun()
+  flushRuns()
 
-  // Cross-product the atoms into a DNF.
-  /** @type {string[][]} */
-  let cross = [[]]
-  for (const atom of atoms) {
-    if ('lit' in atom) {
-      cross = cross.map(b => b.concat(atom.lit))
+  return cross.map(b => b.literals)
+}
+
+/**
+ * Parse a `[...]` character class as a list of literal alternatives. Returns
+ * null when folding is unsafe: negated class, a range (even one starting at an
+ * escaped char, `[\.-z]`), a special escape (`\d`, `\w`, …), or an empty class.
+ *
+ * @param {string} src
+ * @param {number} start
+ * @param {number} end
+ * @param {boolean} lower
+ * @returns {string[] | null}
+ */
+function parseLiteralClass(src, start, end, lower) {
+  // src[start] === '[' and end is the index after ']'
+  let i = start + 1
+  const lastInside = end - 1 // position of ']'
+  if (src[i] === '^') return null
+  /** @type {Set<string>} */
+  const chars = new Set()
+  while (i < lastInside) {
+    if (src[i] === '\\') {
+      const next = src[i + 1]
+      if (next === undefined || /[a-zA-Z]/.test(next)) return null // \d, \w, \s, etc.
+      if (src[i + 2] === '-' && i + 3 < lastInside) return null // range like [\.-z]
+      chars.add(lower ? next.toLowerCase() : next)
+      i += 2
       continue
     }
-    const innerDNF = atom.dnf
-    // If expanding would blow past MAX_DNF, treat this group as a wildcard.
-    if (cross.length * innerDNF.length > MAX_DNF) continue
-    /** @type {string[][]} */
-    const expanded = []
-    for (const existing of cross) {
-      for (const innerBranch of innerDNF) {
-        expanded.push(existing.concat(innerBranch))
-      }
+    // Detect range: a-b where '-' is not the last char in the class
+    if (i + 2 < lastInside && src[i + 1] === '-') {
+      return null
     }
-    cross = expanded
+    chars.add(lower ? src[i].toLowerCase() : src[i])
+    i += 1
   }
-  return cross
+  if (chars.size === 0) return null
+  return [...chars]
 }
 
 /**
diff --git a/test/regex.test.js b/test/regex.test.js
index 46ba091..f55c0c7 100644
--- a/test/regex.test.js
+++ b/test/regex.test.js
@@ -36,9 +36,33 @@ describe('extractRegexLiterals', () => {
     expect(extractRegexLiterals(/abc{1,3}def/)).toEqual([['abc', 'def']])
   })
 
-  it('skips character classes', () => {
-    expect(extractRegexLiterals(/[xy]def/)).toEqual([['def']])
-    expect(extractRegexLiterals(/foo[xy]bar/)).toEqual([['foo', 'bar']])
+  it('folds small literal char classes into the run', () => {
+    expect(extractRegexLiterals(/[xy]def/).sort()).toEqual([['xdef'], ['ydef']].sort())
+    expect(extractRegexLiterals(/foo[xy]bar/).sort()).toEqual([['fooxbar'], ['fooybar']].sort())
+    expect(extractRegexLiterals(/serv[ei]rless/).sort()).toEqual([['serverless'], ['servirless']].sort())
+  })
+
+  it('treats negated/ranged/special char classes as wildcards', () => {
+    // negated
+    expect(extractRegexLiterals(/foo[^xy]bar/)).toEqual([['foo', 'bar']])
+    // range
+    expect(extractRegexLiterals(/foo[a-z]bar/)).toEqual([['foo', 'bar']])
+    // special escape
+    expect(extractRegexLiterals(/foo[\w]bar/)).toEqual([['foo', 'bar']])
+  })
+
+  it('does not fold char classes under multi-char quantifiers', () => {
+    // [ab]{3} could match 'aab' — folding to 'aaa'/'bbb' would be a false miss
+    expect(extractRegexLiterals(/foo[ab]{3}bar/)).toEqual([['foo', 'bar']])
+    // optional class: flush, no fold
+    expect(extractRegexLiterals(/foo[ab]?bar/)).toEqual([['foo', 'bar']])
+  })
+
+  it('folds char class once under + (one mandatory occurrence)', () => {
+    // [ab]+ guarantees at least one 'a' or 'b'; further chars unknown
+    expect(extractRegexLiterals(/foo[ab]+bar/).sort()).toEqual([
+      ['fooa', 'bar'], ['foob', 'bar'],
+    ].sort())
   })
 
   it('recurses into groups for DNF expansion', () => {
@@ -92,7 +116,8 @@ describe('extractRegexLiterals', () => {
       ['abc', 'foo', 'def'],
       ['abc', 'bar', 'def'],
     ])
-    expect(extractRegexLiterals(/abc[|]def/)).toEqual([['abc', 'def']])
+    // `|` inside a char class is a literal char and folds into the run
+    expect(extractRegexLiterals(/abc[|]def/)).toEqual([['abc|def']])
   })
 
   it('ignores anchors', () => {

From 97f48db869c6b0690489f19e68a9f80f60c267d4 Mon Sep 17 00:00:00 2001
From: Kenny Daniel <platypii@gmail.com>
Date: Sun, 24 May 2026 21:47:18 -0700
Subject: [PATCH 3/3] Fold single-literal group branches into the regex run

When every inner branch of a group is a single contiguous literal, the
group concats into the surrounding run instead of standing alone.
/(eigen|petri)(value|chor)/ now extracts the four full words; Wikipedia
transfer drops from 247 MB to 51 MB.
---
 ARCH.md            |  6 +++++-
 src/regex.js       | 22 ++++++++++++++++++++--
 test/regex.test.js | 31 ++++++++++++++++++++++---------
 3 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/ARCH.md b/ARCH.md
index a5ec829..72d906a 100644
--- a/ARCH.md
+++ b/ARCH.md
@@ -125,7 +125,11 @@ Search options: `--limit N`, `--index <path>`, `-c`/`--count`, `-i`/`--ignore-ca
 2. **Common-everywhere words** (e.g. `wikipedia` in a Wikipedia dump) match every block; the source transfer is unavoidable because every row really does match. No n-gram strategy fixes this.
 3. **Per-row precision is missing.** A candidate block is scanned in full even if only one row matches. Storing per-`(ngram, block)` row bitmaps would let queries narrow the source read to specific rows, at the cost of a ~3× larger index. Probably the highest-value next step for sparse queries that match a small number of rows spread across many blocks.
 4. **No-limit queries on dense matches do more work than they need to.** When every block matches (e.g. `wikipedia` against a Wikipedia dump), exhausting all matches takes ~24 s of CPU because we process blocks one at a time. The bytes are minimal; the time is CPU/parsing overhead. Real clients should pass a `limit`.
-5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are now recursed into and cross-producted, so `/abc(foo|bar)def/` correctly yields `{abc, foo, def} ∨ {abc, bar, def}`. Small literal-only character classes (no negation, no ranges, no `\w`/`\d`, single-occurrence) are folded into the surrounding run — `/serv[ei]rless/` becomes `{serverless} ∨ {servirless}`, cutting Wikipedia transfer from 333 MB to 0 MB. Negated classes, ranges, `\d`/`\w`, and multi-char quantifiers like `[ab]{3}` (which could match `aab`) fall back to wildcard treatment to stay correct. Deeply nested expansions that would exceed 32 branches drop the offending construct conservatively.
+5. **Regex literal extractor caps DNF expansion at 32 branches.** Groups with alternation are recursed into and cross-producted, so `/abc(foo|bar)def/` yields the tight `{abcfoodef} ∨ {abcbardef}`. Two flavors of fold keep adjacent literals contiguous so the n-gram extractor can prune them:
+   - **Char class fold**: small literal-only classes (no negation, no ranges, no `\w`/`\d`, single occurrence) concat into the run — `/serv[ei]rless/` becomes `{serverless} ∨ {servirless}` (333 MB → 0 MB on Wikipedia).
+   - **Group fold**: when every inner branch is a single contiguous literal, the group also concats — `/(eigen|petri)(value|chor)/` becomes `{eigenvalue} ∨ {eigenchor} ∨ {petrivalue} ∨ {petrichor}` (615 MB full scan → 51 MB).
+
+   Negated classes, ranges, `\d`/`\w`, multi-char quantifiers like `[ab]{3}` (which could match `aab`), and groups whose inner branches contain wildcards or multiple literals fall back to wildcard or independent-literal treatment to stay correct. Expansions whose branch count would exceed 32 drop the offending construct conservatively.
 6. **Index files are not back-compatible.** `hypgrep.version` exists but we don't ship a multi-version reader. Reasonable for a 0.x package.
 
 ## Dependencies
diff --git a/src/regex.js b/src/regex.js
index 3472525..663361a 100644
--- a/src/regex.js
+++ b/src/regex.js
@@ -204,9 +204,27 @@ function expandBranch(src, start, end, lower) {
         flushRuns()
         continue
       }
-      // Group contents are independent literals — flush the current run first
-      flushRuns()
       const innerDNF = expandRegion(src, innerStart, innerEnd, lower)
+      // Foldable: the group body is plain literal chars and `|` only (so each
+      // arm IS its literal — `(foo.)` also yields one literal but must not fold),
+      // and the group occurs exactly once (`(a|b){3}` could vary per position).
+      const foldable = q.copies === 1 &&
+        !/[\\.[\](){}*+?^$]/.test(src.slice(innerStart, innerEnd)) &&
+        innerDNF.every(b => b.length === 1)
+      if (foldable && cross.length * innerDNF.length <= MAX_DNF) {
+        /** @type {LiveBranch[]} */
+        const expanded = []
+        for (const b of cross) {
+          for (const innerBranch of innerDNF) {
+            expanded.push({ literals: b.literals.slice(), run: b.run + innerBranch[0] })
+          }
+        }
+        cross = expanded
+        if (q.openEnded) flushRuns()
+        continue
+      }
+      // Not foldable — treat group contents as independent literals.
+      flushRuns()
       if (cross.length * innerDNF.length > MAX_DNF) continue
       /** @type {LiveBranch[]} */
       const expanded = []
diff --git a/test/regex.test.js b/test/regex.test.js
index f55c0c7..477091b 100644
--- a/test/regex.test.js
+++ b/test/regex.test.js
@@ -65,12 +65,24 @@ describe('extractRegexLiterals', () => {
     ].sort())
   })
 
-  it('recurses into groups for DNF expansion', () => {
-    expect(extractRegexLiterals(/(foo|bar)xyz/)).toEqual([['foo', 'xyz'], ['bar', 'xyz']])
-    expect(extractRegexLiterals(/abc(foo)def/)).toEqual([['abc', 'foo', 'def']])
+  it('folds single-literal groups into the surrounding run', () => {
+    // Single-literal-branch groups concat into the run like char classes
+    expect(extractRegexLiterals(/(foo|bar)xyz/)).toEqual([['fooxyz'], ['barxyz']])
+    expect(extractRegexLiterals(/abc(foo)def/)).toEqual([['abcfoodef']])
     expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([
-      ['abc', 'foo', 'def'],
-      ['abc', 'bar', 'def'],
+      ['abcfoodef'],
+      ['abcbardef'],
+    ])
+    expect(extractRegexLiterals(/wikip(e|a)dia/)).toEqual([['wikipedia'], ['wikipadia']])
+  })
+
+  it('keeps multi-literal group branches as independent literals', () => {
+    // `(ab.cd|xy)` cannot fold — the dot splits the first branch into two
+    // literals, so contiguous concat isn't possible for that branch. Conservative
+    // all-or-nothing: every inner branch is treated as independent.
+    expect(extractRegexLiterals(/x(ab.cd|xy)y/)).toEqual([
+      ['x', 'ab', 'cd', 'y'],
+      ['x', 'xy', 'y'],
     ])
   })
 
@@ -80,8 +92,9 @@ describe('extractRegexLiterals', () => {
   })
 
   it('cross-products multiple alternation groups', () => {
+    // Both groups are single-literal foldable, so they concat into one run
     expect(extractRegexLiterals(/(a|b)(c|d)/)).toEqual([
-      ['a', 'c'], ['a', 'd'], ['b', 'c'], ['b', 'd'],
+      ['ac'], ['ad'], ['bc'], ['bd'],
     ])
   })
 
@@ -111,10 +124,10 @@ describe('extractRegexLiterals', () => {
   })
 
   it('does not split | inside groups or char classes at the top level', () => {
-    // Top level | is split; the | inside the group is handled by group recursion
+    // Top level | is split; the | inside the group is handled by group recursion + fold
     expect(extractRegexLiterals(/abc(foo|bar)def/)).toEqual([
-      ['abc', 'foo', 'def'],
-      ['abc', 'bar', 'def'],
+      ['abcfoodef'],
+      ['abcbardef'],
     ])
     // `|` inside a char class is a literal char and folds into the run
     expect(extractRegexLiterals(/abc[|]def/)).toEqual([['abc|def']])