fix: make exclude patterns recursive to prevent index pollution (#76)

PatrickSys · web-flow · commit a814b2445d3d · 2026-03-18T23:06:13.000+01:00
* fix: make exclude patterns recursive and share via single constant

The indexer's exclude patterns were non-recursive (e.g. `coverage/**`),
only matching at the project root. Nested occurrences in monorepo
packages and worktrees passed through, polluting the index with
generated artifacts and worktree copies.

- Extract EXCLUDED_DIRECTORY_NAMES and EXCLUDED_GLOB_PATTERNS into
  src/constants/codebase-context.ts as the single source of truth
- Indexer, file-watcher, and project-discovery all import from there
- Add missing directories: .cache, .claude, .planning, worktrees,
  target, vendor, .nx, .turbo, .next, build
- Add integration test reproducing the consumer audit failure case
  (nested coverage/, .claude/worktrees/, worktrees/, dist/)

* style: format with prettier

* fix: address PR review feedback

- Make EXCLUDED_GLOB_PATTERNS readonly to prevent accidental mutation
  by consumers (spreads at call sites are now intentional, creating
  mutable copies for APIs that require string[])
- Throw on unrecognized index format in test instead of silently
  defaulting to empty array (prevents polluter assertions from passing
  vacuously)
- Move analyzerRegistry.register into test body — only one test, no
  need for beforeEach ceremony

* fix: widen IGNORED_DIRECTORY_NAMES to Set<string> for TS compatibility

Set.has() requires the argument to match the set's type parameter.
Spreading as-const tuples into a Set infers a narrow literal union,
which rejects entry.name (plain string) at the call site on line 178.
diff --git a/src/constants/codebase-context.ts b/src/constants/codebase-context.ts
@@ -25,3 +25,44 @@ export const INDEXING_STATS_FILENAME = 'indexing-stats.json' as const;
 export const VECTOR_DB_DIRNAME = 'index' as const;
 export const MANIFEST_FILENAME = 'manifest.json' as const;
 export const RELATIONSHIPS_FILENAME = 'relationships.json' as const;
+
+/**
+ * Directories excluded from indexing, file-watching, and project discovery.
+ * Single source of truth — all three consumers import from here.
+ */
+export const EXCLUDED_DIRECTORY_NAMES = [
+  '.cache',
+  '.claude',
+  '.codebase-context',
+  '.git',
+  '.next',
+  '.nx',
+  '.planning',
+  '.turbo',
+  'build',
+  'coverage',
+  'dist',
+  'node_modules',
+  'target',
+  'vendor',
+  'worktrees'
+] as const;
+
+/** Glob patterns that match excluded directories at any nesting depth. */
+export const EXCLUDED_GLOB_PATTERNS: readonly string[] = EXCLUDED_DIRECTORY_NAMES.map(
+  (dir) => `**/${dir}/**`
+);
+
+/**
+ * Additional directories skipped only during project discovery (not generated
+ * code, just not useful roots to recurse into).
+ */
+export const DISCOVERY_ONLY_IGNORED = [
+  '.hg',
+  '.nuxt',
+  '.svn',
+  '.venv',
+  '.yarn',
+  'out',
+  'tmp'
+] as const;
diff --git a/src/core/file-watcher.ts b/src/core/file-watcher.ts
@@ -1,5 +1,6 @@
 import chokidar from 'chokidar';
 import path from 'path';
+import { EXCLUDED_GLOB_PATTERNS } from '../constants/codebase-context.js';
 import { getSupportedExtensions } from '../utils/language-detection.js';
 
 export interface FileWatcherOptions {
@@ -43,18 +44,7 @@ export function startFileWatcher(opts: FileWatcherOptions): () => void {
   };
 
   const watcher = chokidar.watch(rootPath, {
-    ignored: [
-      '**/node_modules/**',
-      '**/.codebase-context/**',
-      '**/.git/**',
-      '**/dist/**',
-      '**/.nx/**',
-      '**/.planning/**',
-      '**/coverage/**',
-      '**/.turbo/**',
-      '**/.next/**',
-      '**/.cache/**'
-    ],
+    ignored: [...EXCLUDED_GLOB_PATTERNS],
     persistent: true,
     ignoreInitial: true,
     awaitWriteFinish: { stabilityThreshold: 200, pollInterval: 100 }
diff --git a/src/core/indexer.ts b/src/core/indexer.ts
@@ -39,6 +39,7 @@ import { mergeSmallChunks } from '../utils/chunking.js';
 import { getFileCommitDates } from '../utils/git-dates.js';
 import {
   CODEBASE_CONTEXT_DIRNAME,
+  EXCLUDED_GLOB_PATTERNS,
   INDEX_FORMAT_VERSION,
   INDEXING_STATS_FILENAME,
   INDEX_META_FILENAME,
@@ -274,14 +275,7 @@ export class CodebaseIndexer {
         '**/*.{sql,graphql,gql}',
         '**/*.{json,jsonc,yaml,yml,toml,xml}'
       ],
-      exclude: [
-        'node_modules/**',
-        'dist/**',
-        'build/**',
-        '.git/**',
-        'coverage/**',
-        '.codebase-context/**'
-      ],
+      exclude: [...EXCLUDED_GLOB_PATTERNS],
       respectGitignore: true,
       parsing: {
         maxFileSize: 1048576,
diff --git a/src/utils/project-discovery.ts b/src/utils/project-discovery.ts
@@ -1,6 +1,7 @@
 import { promises as fs } from 'fs';
 import type { Dirent } from 'fs';
 import path from 'path';
+import { EXCLUDED_DIRECTORY_NAMES, DISCOVERY_ONLY_IGNORED } from '../constants/codebase-context.js';
 
 export type ProjectEvidence =
   | 'existing_index'
@@ -19,23 +20,9 @@ export interface DiscoverProjectsOptions {
 
 const DEFAULT_MAX_DEPTH = 4;
 
-const IGNORED_DIRECTORY_NAMES = new Set([
-  '.git',
-  '.hg',
-  '.svn',
-  '.next',
-  '.nuxt',
-  '.turbo',
-  '.venv',
-  '.yarn',
-  'build',
-  'coverage',
-  'dist',
-  'node_modules',
-  'out',
-  'target',
-  'tmp',
-  'vendor'
+const IGNORED_DIRECTORY_NAMES: Set<string> = new Set([
+  ...EXCLUDED_DIRECTORY_NAMES,
+  ...DISCOVERY_ONLY_IGNORED
 ]);
 
 const STRONG_DIRECTORY_MARKERS = new Set(['.codebase-context', '.git']);
diff --git a/tests/indexer-exclude-patterns.test.ts b/tests/indexer-exclude-patterns.test.ts
@@ -0,0 +1,89 @@
+import { afterEach, describe, expect, it } from 'vitest';
+import { promises as fs } from 'fs';
+import os from 'os';
+import path from 'path';
+import { CodebaseIndexer } from '../src/core/indexer.js';
+import { analyzerRegistry } from '../src/core/analyzer-registry.js';
+import { GenericAnalyzer } from '../src/analyzers/generic/index.js';
+import {
+  CODEBASE_CONTEXT_DIRNAME,
+  KEYWORD_INDEX_FILENAME
+} from '../src/constants/codebase-context.js';
+
+describe('Indexer exclude patterns — nested directories', () => {
+  let tempDir: string;
+
+  afterEach(async () => {
+    if (tempDir) await fs.rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('excludes nested coverage, worktrees, .claude, and dist directories', async () => {
+    analyzerRegistry.register(new GenericAnalyzer());
+    tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'indexer-exclude-patterns-'));
+
+    // Legitimate source file
+    await fs.mkdir(path.join(tempDir, 'src'), { recursive: true });
+    await fs.writeFile(
+      path.join(tempDir, 'src', 'app.ts'),
+      'export function main() { return "hello"; }\n'
+    );
+
+    // Polluters — nested paths that should be excluded
+    const polluters = [
+      ['packages', 'ui', 'coverage', 'prettify.js'],
+      ['.claude', 'worktrees', 'branch', 'src', 'app.ts'],
+      ['worktrees', 'portal30-pr', 'src', 'real.ts'],
+      ['apps', 'web', 'dist', 'bundle.js']
+    ];
+
+    for (const segments of polluters) {
+      const dir = path.join(tempDir, ...segments.slice(0, -1));
+      await fs.mkdir(dir, { recursive: true });
+      await fs.writeFile(path.join(tempDir, ...segments), 'export const polluter = true;\n');
+    }
+
+    const indexer = new CodebaseIndexer({
+      rootPath: tempDir,
+      config: {
+        skipEmbedding: true,
+        parsing: {
+          maxFileSize: 1048576,
+          chunkSize: 50,
+          chunkOverlap: 0,
+          parseTests: true,
+          parseNodeModules: false
+        }
+      }
+    });
+
+    await indexer.index();
+
+    const indexPath = path.join(tempDir, CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME);
+    const indexRaw = JSON.parse(await fs.readFile(indexPath, 'utf-8')) as Record<string, unknown>;
+
+    let chunks: Array<{ filePath: string }>;
+    if (Array.isArray(indexRaw)) {
+      chunks = indexRaw;
+    } else if (Array.isArray(indexRaw?.chunks)) {
+      chunks = indexRaw.chunks as Array<{ filePath: string }>;
+    } else {
+      throw new Error(
+        `Unexpected index format: keys=${JSON.stringify(Object.keys(indexRaw ?? {}))}`
+      );
+    }
+
+    const indexedPaths = chunks.map((chunk) => chunk.filePath);
+
+    // The legitimate file must be indexed
+    expect(indexedPaths.some((p) => p.includes('src/app.ts') || p.includes('src\\app.ts'))).toBe(
+      true
+    );
+
+    // None of the polluter paths should appear
+    const polluterMarkers = ['coverage', '.claude', 'worktrees', 'dist'];
+    for (const marker of polluterMarkers) {
+      const leaked = indexedPaths.filter((p) => p.includes(marker));
+      expect(leaked, `paths containing "${marker}" should not be indexed`).toEqual([]);
+    }
+  });
+});