Skip to content

Commit 236b948

Browse files
committed
fix(chunkers): also strip named capture groups in regex patterns
Named groups (?<name>...) are still capturing groups, so split() interleaves their matched text into the result array. Convert them to non-capturing groups alongside plain (...) groups.
1 parent daaadb0 commit 236b948

2 files changed

Lines changed: 39 additions & 7 deletions

File tree

apps/sim/lib/chunkers/regex-chunker.test.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,27 @@ describe('RegexChunker', () => {
206206
}
207207
)
208208

209+
it.concurrent(
210+
'should not include delimiter text when pattern uses named capture groups',
211+
async () => {
212+
const chunker = new RegexChunker({
213+
pattern: '(?<sep>---)',
214+
chunkSize: 1024,
215+
strictBoundaries: true,
216+
})
217+
const text = 'Section one content.---Section two content.---Section three content.'
218+
const chunks = await chunker.chunk(text)
219+
220+
expect(chunks).toHaveLength(3)
221+
expect(chunks[0].text).toBe('Section one content.')
222+
expect(chunks[1].text).toBe('Section two content.')
223+
expect(chunks[2].text).toBe('Section three content.')
224+
for (const chunk of chunks) {
225+
expect(chunk.text).not.toBe('---')
226+
}
227+
}
228+
)
229+
209230
it.concurrent('should leave non-capturing groups and lookarounds intact', async () => {
210231
const chunker = new RegexChunker({
211232
pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)',

apps/sim/lib/chunkers/regex-chunker.ts

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,14 @@ const logger = createLogger('RegexChunker')
1515

1616
const MAX_PATTERN_LENGTH = 500
1717

18+
/**
 * Matches the opening of a named capturing group, `(?<name>`, anchored at the
 * start of the input. The `(?![=!])` guard rejects lookbehind assertions
 * (`(?<=` and `(?<!`): their bodies may contain a literal `>` (for example
 * `(?<=<br>)`), which would otherwise be mistaken for the end of a group name.
 */
const NAMED_GROUP_PREFIX = /^\(\?<(?![=!])[^>]+>/
19+
1820
/**
19-
* Converts unescaped capturing groups `(...)` into non-capturing groups `(?:...)`.
20-
* `String.prototype.split()` interleaves captured groups into the result array,
21-
* which would surface delimiter text as spurious chunks. Lookarounds, named
22-
* groups, and other `(?...)` constructs are left untouched.
21+
* Converts unescaped capturing groups `(...)` and named capturing groups
22+
* `(?<name>...)` into non-capturing groups `(?:...)`. `String.prototype.split()`
23+
* interleaves captured text (named or otherwise) into the result array, which
24+
* would surface delimiter text as spurious chunks. Lookarounds (`(?=`, `(?!`,
25+
* `(?<=`, `(?<!`) and other `(?...)` constructs are left untouched.
2326
*/
2427
function toNonCapturing(pattern: string): string {
2528
let result = ''
@@ -33,9 +36,17 @@ function toNonCapturing(pattern: string): string {
3336
}
3437
if (c === '[') inClass = true
3538
else if (c === ']') inClass = false
36-
if (!inClass && c === '(' && pattern[i + 1] !== '?') {
37-
result += '(?:'
38-
continue
39+
if (!inClass && c === '(') {
40+
if (pattern[i + 1] !== '?') {
41+
result += '(?:'
42+
continue
43+
}
44+
const namedMatch = pattern.slice(i).match(NAMED_GROUP_PREFIX)
45+
if (namedMatch) {
46+
result += '(?:'
47+
i += namedMatch[0].length - 1
48+
continue
49+
}
3950
}
4051
result += c
4152
}

0 commit comments

Comments
 (0)