diff --git a/src/utils/glob.mts b/src/utils/glob.mts index 787a55442..7fa86042f 100644 --- a/src/utils/glob.mts +++ b/src/utils/glob.mts @@ -164,6 +164,14 @@ export function filterBySupportedScanFiles( return filepaths.filter(p => micromatch.some(p, patterns, { dot: true })) } +export function createSupportedFilesFilter( + supportedFiles: SocketSdkSuccessResult<'getReportSupportedFiles'>['data'], +): (filepath: string) => boolean { + const patterns = getSupportedFilePatterns(supportedFiles) + return (filepath: string) => + micromatch.some(filepath, patterns, { dot: true }) +} + export function getSupportedFilePatterns( supportedFiles: SocketSdkSuccessResult<'getReportSupportedFiles'>['data'], ): string[] { @@ -178,6 +186,10 @@ export function getSupportedFilePatterns( } type GlobWithGitIgnoreOptions = GlobOptions & { + // Optional filter function to apply during streaming. + // When provided, only files passing this filter are accumulated. + // This is critical for memory efficiency when scanning large monorepos. + filter?: ((filepath: string) => boolean) | undefined socketConfig?: SocketYml | undefined } @@ -187,6 +199,7 @@ export async function globWithGitIgnore( ): Promise<string[]> { const { cwd = process.cwd(), + filter, socketConfig, ...additionalOptions } = { __proto__: null, ...options } as GlobWithGitIgnoreOptions @@ -243,27 +256,39 @@ export async function globWithGitIgnore( ...additionalOptions, } as GlobOptions - if (!hasNegatedPattern) { + // When no filter is provided and no negated patterns exist, use the fast path. + if (!hasNegatedPattern && !filter) { return await fastGlob.glob(patterns as string[], globOptions) } - // Add support for negated "ignore" patterns which many globbing libraries, // including 'fast-glob', 'globby', and 'tinyglobby', lack support for. - const filtered: string[] = [] - const ig = ignore().add([...ignores]) + // Use streaming to avoid unbounded memory accumulation. + // This is critical for large monorepos with 100k+ files.
+ const results: string[] = [] + const ig = hasNegatedPattern ? ignore().add([...ignores]) : null const stream = fastGlob.globStream( patterns as string[], globOptions, ) as AsyncIterable<string> for await (const p of stream) { - // Note: the input files must be INSIDE the cwd. If you get strange looking - // relative path errors here, most likely your path is outside the given cwd. - const relPath = globOptions.absolute ? path.relative(cwd, p) : p - if (!ig.ignores(relPath)) { - filtered.push(p) + // Check gitignore patterns with negation support. + if (ig) { + // Note: the input files must be INSIDE the cwd. If you get strange looking + // relative path errors here, most likely your path is outside the given cwd. + const relPath = globOptions.absolute ? path.relative(cwd, p) : p + if (ig.ignores(relPath)) { + continue + } + } + // Apply the optional filter to reduce memory usage. + // When scanning large monorepos, this filters early (e.g., to manifest files only) + // instead of accumulating all 100k+ files and filtering later. + if (filter && !filter(p)) { + continue } + results.push(p) } - return filtered + return results } export async function globWorkspace( diff --git a/src/utils/glob.test.mts b/src/utils/glob.test.mts new file mode 100644 index 000000000..111287576 --- /dev/null +++ b/src/utils/glob.test.mts @@ -0,0 +1,252 @@ +import { existsSync, readdirSync, rmSync } from 'node:fs' +import path from 'node:path' +import { fileURLToPath } from 'node:url' + +import mockFs from 'mock-fs' +import { afterEach, describe, expect, it } from 'vitest' + +import { normalizePath } from '@socketsecurity/registry/lib/path' + +import { NODE_MODULES } from '../constants.mjs' +import { + createSupportedFilesFilter, + globWithGitIgnore, + pathsToGlobPatterns, +} from './glob.mts' + +import type FileSystem from 'mock-fs/lib/filesystem' + +// Filter functions defined at module scope to satisfy linting rules.
+function filterJsonFiles(filepath: string): boolean { + return filepath.endsWith('.json') +} + +function filterTsFiles(filepath: string): boolean { + return filepath.endsWith('.ts') +} + +const __filename = fileURLToPath(import.meta.url) +const __dirname = path.dirname(__filename) + +const rootNmPath = path.join(__dirname, '../..', NODE_MODULES) +const mockFixturePath = normalizePath(path.join(__dirname, 'glob-mock')) +const mockNmPath = normalizePath(rootNmPath) + +// Remove broken symlinks in node_modules before loading to prevent mock-fs errors. +function cleanupBrokenSymlinks(dirPath: string): void { + try { + if (!existsSync(dirPath)) { + return + } + const entries = readdirSync(dirPath, { withFileTypes: true }) + for (const entry of entries) { + const fullPath = path.join(dirPath, entry.name) + try { + if (entry.isSymbolicLink() && !existsSync(fullPath)) { + // Symlink exists but target does not, remove it. + rmSync(fullPath, { force: true }) + } else if (entry.isDirectory()) { + // Recursively check subdirectories. + cleanupBrokenSymlinks(fullPath) + } + } catch { + // Ignore errors for individual entries. + } + } + } catch { + // If we cannot read the directory, skip cleanup. + } +} + +// Clean up broken symlinks before loading node_modules. +cleanupBrokenSymlinks(rootNmPath) + +// Load node_modules with error handling for any remaining issues. +const mockedNmCallback = (() => { + try { + return mockFs.load(rootNmPath) + } catch (e) { + // If loading fails due to broken symlinks or missing files, return empty mock. + console.warn( + `Warning: Failed to load node_modules for mock-fs: ${e instanceof Error ? 
e.message : String(e)}`, + ) + return {} + } +})() + +function mockTestFs(config: FileSystem.DirectoryItems) { + return mockFs({ + ...config, + [mockNmPath]: mockedNmCallback, + }) +} + +describe('glob utilities', () => { + afterEach(() => { + mockFs.restore() + }) + + describe('globWithGitIgnore()', () => { + it('should find files matching glob patterns', async () => { + mockTestFs({ + [`${mockFixturePath}/package.json`]: '{}', + [`${mockFixturePath}/src/index.ts`]: '', + }) + + const results = await globWithGitIgnore(['**/*.json'], { + cwd: mockFixturePath, + }) + + expect(results.map(normalizePath)).toEqual([ + `${mockFixturePath}/package.json`, + ]) + }) + + it('should respect .gitignore files', async () => { + mockTestFs({ + [`${mockFixturePath}/.gitignore`]: 'ignored/**', + [`${mockFixturePath}/package.json`]: '{}', + [`${mockFixturePath}/ignored/package.json`]: '{}', + [`${mockFixturePath}/included/package.json`]: '{}', + }) + + const results = await globWithGitIgnore(['**/*.json'], { + cwd: mockFixturePath, + }) + + expect(results.map(normalizePath).sort()).toEqual([ + `${mockFixturePath}/included/package.json`, + `${mockFixturePath}/package.json`, + ]) + }) + + it('should handle negated patterns in .gitignore', async () => { + mockTestFs({ + [`${mockFixturePath}/.gitignore`]: 'ignored/**\n!ignored/keep.json', + [`${mockFixturePath}/package.json`]: '{}', + [`${mockFixturePath}/ignored/excluded.json`]: '{}', + [`${mockFixturePath}/ignored/keep.json`]: '{}', + }) + + const results = await globWithGitIgnore(['**/*.json'], { + cwd: mockFixturePath, + }) + + // The negated pattern should allow keep.json to be included. + expect(results.map(normalizePath).sort()).toEqual([ + `${mockFixturePath}/ignored/keep.json`, + `${mockFixturePath}/package.json`, + ]) + }) + + it('should apply filter function during streaming to reduce memory', async () => { + // Create a mock filesystem with many files. 
+ const files: FileSystem.DirectoryItems = {} + const fileCount = 100 + for (let i = 0; i < fileCount; i += 1) { + files[`${mockFixturePath}/file${i}.txt`] = 'content' + files[`${mockFixturePath}/file${i}.json`] = '{}' + } + // Add a gitignore with negated pattern to trigger the streaming path. + files[`${mockFixturePath}/.gitignore`] = 'temp/\n!temp/keep.json' + mockTestFs(files) + + const results = await globWithGitIgnore(['**/*'], { + cwd: mockFixturePath, + filter: filterJsonFiles, + }) + + // Should only include .json files (100 files). + expect(results).toHaveLength(fileCount) + for (const result of results) { + expect(result.endsWith('.json')).toBe(true) + } + }) + + it('should apply filter without negated patterns', async () => { + mockTestFs({ + [`${mockFixturePath}/package.json`]: '{}', + [`${mockFixturePath}/src/index.ts`]: '', + [`${mockFixturePath}/src/utils.ts`]: '', + [`${mockFixturePath}/readme.md`]: '', + }) + + const results = await globWithGitIgnore(['**/*'], { + cwd: mockFixturePath, + filter: filterTsFiles, + }) + + expect(results.map(normalizePath).sort()).toEqual([ + `${mockFixturePath}/src/index.ts`, + `${mockFixturePath}/src/utils.ts`, + ]) + }) + + it('should combine filter with negated gitignore patterns', async () => { + mockTestFs({ + [`${mockFixturePath}/.gitignore`]: 'build/**\n!build/manifest.json', + [`${mockFixturePath}/package.json`]: '{}', + [`${mockFixturePath}/src/index.ts`]: '', + [`${mockFixturePath}/build/output.js`]: '', + [`${mockFixturePath}/build/manifest.json`]: '{}', + }) + + const results = await globWithGitIgnore(['**/*'], { + cwd: mockFixturePath, + filter: filterJsonFiles, + }) + + // Should include package.json and the negated build/manifest.json, but not build/output.js. 
+ expect(results.map(normalizePath).sort()).toEqual([ + `${mockFixturePath}/build/manifest.json`, + `${mockFixturePath}/package.json`, + ]) + }) + }) + + describe('createSupportedFilesFilter()', () => { + it('should create a filter function matching supported file patterns', () => { + const supportedFiles = { + npm: { + packagejson: { pattern: 'package.json' }, + packagelockjson: { pattern: 'package-lock.json' }, + }, + } + + const filter = createSupportedFilesFilter(supportedFiles) + + expect(filter('/path/to/package.json')).toBe(true) + expect(filter('/path/to/package-lock.json')).toBe(true) + expect(filter('/path/to/random.txt')).toBe(false) + expect(filter('/path/to/nested/package.json')).toBe(true) + }) + }) + + describe('pathsToGlobPatterns()', () => { + it('should convert "." to "**/*"', () => { + expect(pathsToGlobPatterns(['.'])).toEqual(['**/*']) + expect(pathsToGlobPatterns(['./'])).toEqual(['**/*']) + }) + + it('should append "/**/*" to directory paths', () => { + mockTestFs({ + [`${mockFixturePath}/subdir`]: { + 'file.txt': '', + }, + }) + + // The function checks if path is a directory using isDirSync. 
+ const result = pathsToGlobPatterns(['subdir'], mockFixturePath) + expect(result).toEqual(['subdir/**/*']) + }) + + it('should keep file paths unchanged', () => { + mockTestFs({ + [`${mockFixturePath}/file.txt`]: '', + }) + + const result = pathsToGlobPatterns(['file.txt'], mockFixturePath) + expect(result).toEqual(['file.txt']) + }) + }) +}) diff --git a/src/utils/path-resolve.mts b/src/utils/path-resolve.mts index 37da7d3f7..4da0347c3 100644 --- a/src/utils/path-resolve.mts +++ b/src/utils/path-resolve.mts @@ -9,7 +9,7 @@ import { isDirSync } from '@socketsecurity/registry/lib/fs' import constants, { NODE_MODULES, NPM } from '../constants.mts' import { - filterBySupportedScanFiles, + createSupportedFilesFilter, globWithGitIgnore, pathsToGlobPatterns, } from './glob.mts' @@ -114,13 +114,17 @@ export async function getPackageFilesForScan( ...options, } as PackageFilesForScanOptions - const filepaths = await globWithGitIgnore( + // Apply the supported files filter during streaming to avoid accumulating + // all files in memory. This is critical for large monorepos with 100k+ files + // where accumulating all paths before filtering causes OOM errors. 
+ const filter = createSupportedFilesFilter(supportedFiles) + + return await globWithGitIgnore( pathsToGlobPatterns(inputPaths, options?.cwd), { cwd, + filter, socketConfig, }, ) - - return filterBySupportedScanFiles(filepaths!, supportedFiles) } diff --git a/test/glob.test.mts b/test/glob.test.mts index 610caea67..eb8b4481b 100644 --- a/test/glob.test.mts +++ b/test/glob.test.mts @@ -1,12 +1,22 @@ -import { describe, expect, it } from 'vitest' +import { mkdir, rm, writeFile } from 'node:fs/promises' +import path from 'node:path' + +import { afterAll, beforeAll, describe, expect, it } from 'vitest' import { + createSupportedFilesFilter, filterBySupportedScanFiles, + globWithGitIgnore, isReportSupportedFile, } from '../src/utils/glob.mts' import type { SocketSdkSuccessResult } from '@socketsecurity/sdk' +// Filter function for tests - defined at module scope to satisfy linting. +function packageJsonFilter(p: string): boolean { + return p.endsWith('package.json') +} + describe('glob', () => { const mockSupportedFiles: SocketSdkSuccessResult<'getReportSupportedFiles'>['data'] = { @@ -80,4 +90,101 @@ describe('glob', () => { ) }) }) + + describe('createSupportedFilesFilter', () => { + it('should create a filter function that matches supported files', () => { + const filter = createSupportedFilesFilter(mockSupportedFiles) + + expect(filter('package.json')).toBe(true) + expect(filter('poetry.lock')).toBe(true) + expect(filter('nested/package.json')).toBe(true) + expect(filter('.hidden/poetry.lock')).toBe(true) + }) + + it('should create a filter function that rejects unsupported files', () => { + const filter = createSupportedFilesFilter(mockSupportedFiles) + + expect(filter('index.ts')).toBe(false) + expect(filter('random.txt')).toBe(false) + expect(filter('src/main.js')).toBe(false) + }) + }) + + describe('globWithGitIgnore', () => { + const testDir = path.join(process.cwd(), '.test-glob-fixture') + + beforeAll(async () => { + // Create test directory structure. 
+ await mkdir(testDir, { recursive: true }) + await mkdir(path.join(testDir, 'pkg1'), { recursive: true }) + await mkdir(path.join(testDir, 'pkg2'), { recursive: true }) + await mkdir(path.join(testDir, 'ignored'), { recursive: true }) + + // Create test files. + await writeFile(path.join(testDir, 'package.json'), '{}') + await writeFile(path.join(testDir, 'pkg1', 'package.json'), '{}') + await writeFile(path.join(testDir, 'pkg1', 'index.ts'), '') + await writeFile(path.join(testDir, 'pkg2', 'package.json'), '{}') + await writeFile(path.join(testDir, 'pkg2', 'index.ts'), '') + await writeFile(path.join(testDir, 'ignored', 'package.json'), '{}') + await writeFile(path.join(testDir, 'random.txt'), '') + + // Create .gitignore with negated pattern. + await writeFile( + path.join(testDir, '.gitignore'), + 'ignored/\n!ignored/package.json\n', + ) + }) + + afterAll(async () => { + // Cleanup test directory. + await rm(testDir, { recursive: true, force: true }) + }) + + it('should apply filter during streaming to reduce memory', async () => { + const result = await globWithGitIgnore(['**/*'], { + cwd: testDir, + filter: packageJsonFilter, + }) + + // Should only return package.json files. + expect(result.every(p => p.endsWith('package.json'))).toBe(true) + // Should have found multiple package.json files. + expect(result.length).toBeGreaterThanOrEqual(3) + }) + + it('should handle negated gitignore patterns', async () => { + const result = await globWithGitIgnore(['**/*'], { + cwd: testDir, + }) + + const relativePaths = result.map(p => path.relative(testDir, p)) + + // The ignored directory should be excluded. + expect(relativePaths.some(p => p.startsWith('ignored/'))).toBe(false) + }) + + it('should combine filter with negated patterns', async () => { + const result = await globWithGitIgnore(['**/*'], { + cwd: testDir, + filter: packageJsonFilter, + }) + + const relativePaths = result.map(p => path.relative(testDir, p)) + + // Should only return package.json files. 
+ expect(relativePaths.every(p => p.endsWith('package.json'))).toBe(true) + // Should NOT include ignored/package.json because the directory is ignored. + expect(relativePaths).not.toContain('ignored/package.json') + }) + + it('should work without filter (backwards compatibility)', async () => { + const result = await globWithGitIgnore(['**/*.txt'], { + cwd: testDir, + }) + + expect(result.length).toBeGreaterThanOrEqual(1) + expect(result.every(p => p.endsWith('.txt'))).toBe(true) + }) + }) })