Skip to content

Commit a814b24

Browse files
authored
fix: make exclude patterns recursive to prevent index pollution (#76)
* fix: make exclude patterns recursive and share via single constant

  The indexer's exclude patterns were non-recursive (e.g. `coverage/**`), only matching at the project root. Nested occurrences in monorepo packages and worktrees passed through, polluting the index with generated artifacts and worktree copies.

  - Extract EXCLUDED_DIRECTORY_NAMES and EXCLUDED_GLOB_PATTERNS into src/constants/codebase-context.ts as the single source of truth
  - Indexer, file-watcher, and project-discovery all import from there
  - Add missing directories: .cache, .claude, .planning, worktrees, target, vendor, .nx, .turbo, .next, build
  - Add integration test reproducing the consumer audit failure case (nested coverage/, .claude/worktrees/, worktrees/, dist/)

* style: format with prettier

* fix: address PR review feedback

  - Make EXCLUDED_GLOB_PATTERNS readonly to prevent accidental mutation by consumers (spreads at call sites are now intentional, creating mutable copies for APIs that require string[])
  - Throw on unrecognized index format in test instead of silently defaulting to empty array (prevents polluter assertions from passing vacuously)
  - Move analyzerRegistry.register into test body — only one test, no need for beforeEach ceremony

* fix: widen IGNORED_DIRECTORY_NAMES to Set<string> for TS compatibility

  Set.has() requires the argument to match the set's type parameter. Spreading as-const tuples into a Set infers a narrow literal union, which rejects entry.name (plain string) at the call site on line 178.
1 parent edb1350 commit a814b24

5 files changed

Lines changed: 138 additions & 37 deletions

File tree

src/constants/codebase-context.ts

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,44 @@ export const INDEXING_STATS_FILENAME = 'indexing-stats.json' as const;
2525
export const VECTOR_DB_DIRNAME = 'index' as const;
2626
export const MANIFEST_FILENAME = 'manifest.json' as const;
2727
export const RELATIONSHIPS_FILENAME = 'relationships.json' as const;
28+
29+
/**
 * Directory names that are left out of indexing, file-watching, and project
 * discovery alike. This list is the single source of truth: every consumer
 * imports it from here instead of keeping its own copy.
 */
export const EXCLUDED_DIRECTORY_NAMES = [
  // Dot-prefixed directories
  '.cache', '.claude', '.codebase-context', '.git', '.next', '.nx', '.planning', '.turbo',
  // Plain-named directories
  'build', 'coverage', 'dist', 'node_modules', 'target', 'vendor', 'worktrees'
] as const;
50+
51+
/**
 * One glob per excluded directory name. Each name is wrapped in a leading and
 * a trailing globstar segment so the directory is matched at any nesting
 * depth. Typed as readonly; call sites that need a mutable `string[]` spread
 * it into a copy.
 */
export const EXCLUDED_GLOB_PATTERNS: readonly string[] = Array.from(
  EXCLUDED_DIRECTORY_NAMES,
  (directoryName) => '**/' + directoryName + '/**'
);
55+
56+
/**
 * Extra directory names that only project discovery skips. They are not
 * generated code; they are simply not useful roots to recurse into.
 */
export const DISCOVERY_ONLY_IGNORED = [
  // Dot-prefixed directories
  '.hg', '.nuxt', '.svn', '.venv', '.yarn',
  // Plain-named directories
  'out', 'tmp'
] as const;

src/core/file-watcher.ts

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import chokidar from 'chokidar';
22
import path from 'path';
3+
import { EXCLUDED_GLOB_PATTERNS } from '../constants/codebase-context.js';
34
import { getSupportedExtensions } from '../utils/language-detection.js';
45

56
export interface FileWatcherOptions {
@@ -43,18 +44,7 @@ export function startFileWatcher(opts: FileWatcherOptions): () => void {
4344
};
4445

4546
const watcher = chokidar.watch(rootPath, {
46-
ignored: [
47-
'**/node_modules/**',
48-
'**/.codebase-context/**',
49-
'**/.git/**',
50-
'**/dist/**',
51-
'**/.nx/**',
52-
'**/.planning/**',
53-
'**/coverage/**',
54-
'**/.turbo/**',
55-
'**/.next/**',
56-
'**/.cache/**'
57-
],
47+
ignored: [...EXCLUDED_GLOB_PATTERNS],
5848
persistent: true,
5949
ignoreInitial: true,
6050
awaitWriteFinish: { stabilityThreshold: 200, pollInterval: 100 }

src/core/indexer.ts

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import { mergeSmallChunks } from '../utils/chunking.js';
3939
import { getFileCommitDates } from '../utils/git-dates.js';
4040
import {
4141
CODEBASE_CONTEXT_DIRNAME,
42+
EXCLUDED_GLOB_PATTERNS,
4243
INDEX_FORMAT_VERSION,
4344
INDEXING_STATS_FILENAME,
4445
INDEX_META_FILENAME,
@@ -274,14 +275,7 @@ export class CodebaseIndexer {
274275
'**/*.{sql,graphql,gql}',
275276
'**/*.{json,jsonc,yaml,yml,toml,xml}'
276277
],
277-
exclude: [
278-
'node_modules/**',
279-
'dist/**',
280-
'build/**',
281-
'.git/**',
282-
'coverage/**',
283-
'.codebase-context/**'
284-
],
278+
exclude: [...EXCLUDED_GLOB_PATTERNS],
285279
respectGitignore: true,
286280
parsing: {
287281
maxFileSize: 1048576,

src/utils/project-discovery.ts

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { promises as fs } from 'fs';
22
import type { Dirent } from 'fs';
33
import path from 'path';
4+
import { EXCLUDED_DIRECTORY_NAMES, DISCOVERY_ONLY_IGNORED } from '../constants/codebase-context.js';
45

56
export type ProjectEvidence =
67
| 'existing_index'
@@ -19,23 +20,9 @@ export interface DiscoverProjectsOptions {
1920

2021
const DEFAULT_MAX_DEPTH = 4;
2122

22-
const IGNORED_DIRECTORY_NAMES = new Set([
23-
'.git',
24-
'.hg',
25-
'.svn',
26-
'.next',
27-
'.nuxt',
28-
'.turbo',
29-
'.venv',
30-
'.yarn',
31-
'build',
32-
'coverage',
33-
'dist',
34-
'node_modules',
35-
'out',
36-
'target',
37-
'tmp',
38-
'vendor'
23+
const IGNORED_DIRECTORY_NAMES: Set<string> = new Set([
24+
...EXCLUDED_DIRECTORY_NAMES,
25+
...DISCOVERY_ONLY_IGNORED
3926
]);
4027

4128
const STRONG_DIRECTORY_MARKERS = new Set(['.codebase-context', '.git']);
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import { afterEach, describe, expect, it } from 'vitest';
2+
import { promises as fs } from 'fs';
3+
import os from 'os';
4+
import path from 'path';
5+
import { CodebaseIndexer } from '../src/core/indexer.js';
6+
import { analyzerRegistry } from '../src/core/analyzer-registry.js';
7+
import { GenericAnalyzer } from '../src/analyzers/generic/index.js';
8+
import {
9+
CODEBASE_CONTEXT_DIRNAME,
10+
KEYWORD_INDEX_FILENAME
11+
} from '../src/constants/codebase-context.js';
12+
13+
/**
 * Integration test: files inside excluded directories must stay out of the
 * keyword index even when those directories are nested below the project
 * root (monorepo packages, worktree copies).
 *
 * Paths are compared by their fixture-relative suffix rather than by bare
 * substring. A substring check such as `includes('dist')` would also match
 * the temp root itself when index paths are absolute and the temp directory
 * happens to live under a directory with one of those names.
 */
describe('Indexer exclude patterns — nested directories', () => {
  let tempDir: string | undefined;

  afterEach(async () => {
    if (tempDir) {
      await fs.rm(tempDir, { recursive: true, force: true });
      tempDir = undefined;
    }
  });

  it('excludes nested coverage, worktrees, .claude, and dist directories', async () => {
    analyzerRegistry.register(new GenericAnalyzer());
    const rootDir = await fs.mkdtemp(path.join(os.tmpdir(), 'indexer-exclude-patterns-'));
    tempDir = rootDir;

    // Legitimate source file
    const legitimateSegments = ['src', 'app.ts'];
    await fs.mkdir(path.join(rootDir, 'src'), { recursive: true });
    await fs.writeFile(
      path.join(rootDir, ...legitimateSegments),
      'export function main() { return "hello"; }\n'
    );

    // Polluters — nested paths that should be excluded
    const polluters = [
      ['packages', 'ui', 'coverage', 'prettify.js'],
      ['.claude', 'worktrees', 'branch', 'src', 'app.ts'],
      ['worktrees', 'portal30-pr', 'src', 'real.ts'],
      ['apps', 'web', 'dist', 'bundle.js']
    ];

    for (const segments of polluters) {
      const dir = path.join(rootDir, ...segments.slice(0, -1));
      await fs.mkdir(dir, { recursive: true });
      await fs.writeFile(path.join(rootDir, ...segments), 'export const polluter = true;\n');
    }

    const indexer = new CodebaseIndexer({
      rootPath: rootDir,
      config: {
        skipEmbedding: true,
        parsing: {
          maxFileSize: 1048576,
          chunkSize: 50,
          chunkOverlap: 0,
          parseTests: true,
          parseNodeModules: false
        }
      }
    });

    await indexer.index();

    const indexPath = path.join(rootDir, CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME);
    const indexRaw = JSON.parse(await fs.readFile(indexPath, 'utf-8')) as Record<string, unknown>;

    // Fail loudly on an unknown index shape so the leak assertions below can
    // never pass vacuously against an empty list.
    let chunks: Array<{ filePath: string }>;
    if (Array.isArray(indexRaw)) {
      chunks = indexRaw;
    } else if (Array.isArray(indexRaw?.chunks)) {
      chunks = indexRaw.chunks as Array<{ filePath: string }>;
    } else {
      throw new Error(
        `Unexpected index format: keys=${JSON.stringify(Object.keys(indexRaw ?? {}))}`
      );
    }

    // Normalise separators once so the comparisons work on Windows and POSIX.
    const indexedPaths = chunks.map((chunk) => chunk.filePath.replace(/\\/g, '/'));

    // True when `indexedPath` (absolute or relative) refers to `relativePath`.
    const refersTo = (indexedPath: string, relativePath: string): boolean =>
      indexedPath === relativePath || indexedPath.endsWith(`/${relativePath}`);

    const polluterPaths = polluters.map((segments) => segments.join('/'));
    const isPolluter = (indexedPath: string): boolean =>
      polluterPaths.some((polluterPath) => refersTo(indexedPath, polluterPath));

    // The legitimate file must be indexed. The `.claude` polluter also ends in
    // `src/app.ts`, so polluter paths are ruled out explicitly.
    const legitimatePath = legitimateSegments.join('/');
    expect(indexedPaths.some((p) => refersTo(p, legitimatePath) && !isPolluter(p))).toBe(true);

    // None of the polluter paths should appear
    for (const polluterPath of polluterPaths) {
      const leaked = indexedPaths.filter((p) => refersTo(p, polluterPath));
      expect(leaked, `"${polluterPath}" should not be indexed`).toEqual([]);
    }
  });
});

0 commit comments

Comments
 (0)