From 6beefaec2f2d6fd345054296794a6f41551d094c Mon Sep 17 00:00:00 2001 From: g122622 <2138553606@qq.com> Date: Tue, 26 May 2026 21:20:17 +0800 Subject: [PATCH 1/2] feat(resolution): add C/C++ include path resolution Add full import resolution pipeline for C and C++ #include directives, connecting extracted import nodes to actual header files in the project. - Add C/C++ extension resolution (.h, .hpp, .hxx, .cpp, .cc, .cxx) - Add system header filtering with ~80 C and ~80 C++ stdlib headers - Add extractCppImports() for #include import mapping extraction - Add compile_commands.json parsing for -I/-isystem include directories - Add heuristic include dir discovery (include/, src/, lib/, api/) - Add resolveCppIncludePath() for include directory search - Add C/C++ built-in symbol filtering (printf, malloc, std::*, etc.) - Wire getCppIncludeDirs into ResolutionContext - Add 13 new tests for C/C++ import resolution and extraction Co-Authored-By: Claude Opus 4.7 --- __tests__/extraction.test.ts | 21 ++ __tests__/resolution.test.ts | 274 ++++++++++++++++++++++++++- src/resolution/import-resolver.ts | 305 +++++++++++++++++++++++++++++- src/resolution/index.ts | 59 +++++- src/resolution/types.ts | 7 + 5 files changed, 663 insertions(+), 3 deletions(-) diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index 96b2686a..d17329c1 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -2056,6 +2056,27 @@ end expect(names).toContain('vector'); expect(names).toContain('config.h'); }); + + it('should create unresolved references for local includes', () => { + const code = `#include "myheader.h"`; + const result = extractFromSource('main.cpp', code); + + const importRef = result.unresolvedReferences.find( + (r) => r.referenceKind === 'imports' && r.referenceName === 'myheader.h' + ); + expect(importRef).toBeDefined(); + expect(importRef?.line).toBe(1); + }); + + it('should create unresolved references for system includes', () => { + const 
code = `#include `; + const result = extractFromSource('main.cpp', code); + + const importRef = result.unresolvedReferences.find( + (r) => r.referenceKind === 'imports' && r.referenceName === 'iostream' + ); + expect(importRef).toBeDefined(); + }); }); describe('Dart imports', () => { diff --git a/__tests__/resolution.test.ts b/__tests__/resolution.test.ts index ceaea0e6..4a8c168b 100644 --- a/__tests__/resolution.test.ts +++ b/__tests__/resolution.test.ts @@ -12,7 +12,7 @@ import { CodeGraph } from '../src'; import { Node, UnresolvedReference } from '../src/types'; import { ReferenceResolver, createResolver, ResolutionContext } from '../src/resolution'; import { matchReference } from '../src/resolution/name-matcher'; -import { resolveImportPath, extractImportMappings } from '../src/resolution/import-resolver'; +import { resolveImportPath, extractImportMappings, loadCppIncludeDirs, clearCppIncludeDirCache } from '../src/resolution/import-resolver'; import { detectFrameworks, getAllFrameworkResolvers } from '../src/resolution/frameworks'; import { QueryBuilder } from '../src/db/queries'; import { DatabaseConnection } from '../src/db'; @@ -1138,4 +1138,276 @@ func main() { expect(callers.some((c) => c.node.filePath === 'src/main.ts')).toBe(true); }); }); + + describe('C/C++ Import Resolution', () => { + afterEach(() => { + clearCppIncludeDirCache(); + }); + + it('should resolve C include to header in same directory', () => { + const context: ResolutionContext = { + getNodesInFile: () => [], + getNodesByName: () => [], + getNodesByQualifiedName: () => [], + getNodesByKind: () => [], + fileExists: (p) => p === 'utils.h', + readFile: () => null, + getProjectRoot: () => '', + getAllFiles: () => ['utils.h', 'main.c'], + }; + + const result = resolveImportPath( + 'utils.h', + 'main.c', + 'c', + context + ); + + expect(result).toBe('utils.h'); + }); + + it('should resolve C++ include with .hpp extension', () => { + const context: ResolutionContext = { + getNodesInFile: () 
=> [], + getNodesByName: () => [], + getNodesByQualifiedName: () => [], + getNodesByKind: () => [], + fileExists: (p) => p === 'include/myclass.hpp', + readFile: () => null, + getProjectRoot: () => '', + getAllFiles: () => ['include/myclass.hpp', 'src/main.cpp'], + getCppIncludeDirs: () => ['include'], + }; + + const result = resolveImportPath( + 'myclass.hpp', + 'src/main.cpp', + 'cpp', + context + ); + + expect(result).toBe('include/myclass.hpp'); + }); + + it('should resolve include with subdirectory path', () => { + const context: ResolutionContext = { + getNodesInFile: () => [], + getNodesByName: () => [], + getNodesByQualifiedName: () => [], + getNodesByKind: () => [], + fileExists: (p) => p === 'utils/helpers.h', + readFile: () => null, + getProjectRoot: () => '', + getAllFiles: () => ['utils/helpers.h', 'main.c'], + }; + + const result = resolveImportPath( + 'utils/helpers.h', + 'main.c', + 'c', + context + ); + + expect(result).toBe('utils/helpers.h'); + }); + + it('should resolve include via include directories', () => { + const context: ResolutionContext = { + getNodesInFile: () => [], + getNodesByName: () => [], + getNodesByQualifiedName: () => [], + getNodesByKind: () => [], + fileExists: (p) => p === 'include/myheader.h', + readFile: () => null, + getProjectRoot: () => '', + getAllFiles: () => ['include/myheader.h', 'src/main.cpp'], + getCppIncludeDirs: () => ['include'], + }; + + const result = resolveImportPath( + 'myheader.h', + 'src/main.cpp', + 'cpp', + context + ); + + expect(result).toBe('include/myheader.h'); + }); + + it('should resolve include trying multiple extensions', () => { + const context: ResolutionContext = { + getNodesInFile: () => [], + getNodesByName: () => [], + getNodesByQualifiedName: () => [], + getNodesByKind: () => [], + // myclass.h does not exist, but myclass.hpp does + fileExists: (p) => p === 'include/myclass.hpp', + readFile: () => null, + getProjectRoot: () => '', + getAllFiles: () => ['include/myclass.hpp', 
'src/main.cpp'], + getCppIncludeDirs: () => ['include'], + }; + + const result = resolveImportPath( + 'myclass', + 'src/main.cpp', + 'cpp', + context + ); + + expect(result).toBe('include/myclass.hpp'); + }); + + it('should return null for system headers', () => { + const context: ResolutionContext = { + getNodesInFile: () => [], + getNodesByName: () => [], + getNodesByQualifiedName: () => [], + getNodesByKind: () => [], + fileExists: () => true, + readFile: () => null, + getProjectRoot: () => '', + getAllFiles: () => [], + }; + + // C standard library header + expect(resolveImportPath('stdio.h', 'main.c', 'c', context)).toBeNull(); + // C++ standard library header + expect(resolveImportPath('vector', 'main.cpp', 'cpp', context)).toBeNull(); + // C++ C-wrapper header + expect(resolveImportPath('cstdio', 'main.cpp', 'cpp', context)).toBeNull(); + }); + + it('should return null for single-component third-party paths that cannot be resolved', () => { + const context: ResolutionContext = { + getNodesInFile: () => [], + getNodesByName: () => [], + getNodesByQualifiedName: () => [], + getNodesByKind: () => [], + fileExists: () => false, + readFile: () => null, + getProjectRoot: () => '', + getAllFiles: () => [], + getCppIncludeDirs: () => [], + }; + + // Third-party bare header without path — not resolvable, returns null + const result = resolveImportPath( + 'openssl/ssl.h', + 'main.cpp', + 'cpp', + context + ); + + expect(result).toBeNull(); + }); + + it('should not filter project headers with path separators', () => { + const context: ResolutionContext = { + getNodesInFile: () => [], + getNodesByName: () => [], + getNodesByQualifiedName: () => [], + getNodesByKind: () => [], + fileExists: (p) => p === 'mylib/utils.h', + readFile: () => null, + getProjectRoot: () => '', + getAllFiles: () => ['mylib/utils.h'], + }; + + // Path with separator should NOT be filtered as external + const result = resolveImportPath( + 'mylib/utils.h', + 'main.c', + 'c', + context + ); + + 
expect(result).toBe('mylib/utils.h'); + }); + + it('should extract C/C++ import mappings from #include directives', () => { + const code = `#include +#include "myheader.h" +#include "utils/helpers.hpp"`; + + const mappings = extractImportMappings('main.cpp', code, 'cpp'); + + expect(mappings.length).toBe(3); + expect(mappings[0]).toEqual({ + localName: 'iostream', + exportedName: '*', + source: 'iostream', + isDefault: false, + isNamespace: true, + }); + expect(mappings[1]).toEqual({ + localName: 'myheader', + exportedName: '*', + source: 'myheader.h', + isDefault: false, + isNamespace: true, + }); + expect(mappings[2]).toEqual({ + localName: 'helpers', + exportedName: '*', + source: 'utils/helpers.hpp', + isDefault: false, + isNamespace: true, + }); + }); + + it('should discover include directories from compile_commands.json', () => { + // Create a temp project with compile_commands.json + const tempProject = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cpp-test-')); + try { + const compileDb = [ + { + directory: tempProject, + command: 'g++ -Iinclude -Isrc/lib -isystem /usr/include -c src/main.cpp', + file: 'src/main.cpp', + }, + ]; + fs.writeFileSync( + path.join(tempProject, 'compile_commands.json'), + JSON.stringify(compileDb) + ); + // Create the include dirs so they exist + fs.mkdirSync(path.join(tempProject, 'include'), { recursive: true }); + fs.mkdirSync(path.join(tempProject, 'src', 'lib'), { recursive: true }); + + clearCppIncludeDirCache(); + const dirs = loadCppIncludeDirs(tempProject); + + // Should find include and src/lib (relative to project root) + // /usr/include is absolute and outside project, should be excluded + expect(dirs).toContain('include'); + expect(dirs).toContain('src/lib'); + expect(dirs.some(d => d.includes('usr'))).toBe(false); + } finally { + fs.rmSync(tempProject, { recursive: true }); + } + }); + + it('should fall back to heuristic include dirs when no compile_commands.json', () => { + const tempProject = 
fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cpp-test-')); + try { + // Create include/ and src/ directories with headers + fs.mkdirSync(path.join(tempProject, 'include'), { recursive: true }); + fs.writeFileSync(path.join(tempProject, 'include', 'types.h'), ''); + fs.mkdirSync(path.join(tempProject, 'src'), { recursive: true }); + fs.writeFileSync(path.join(tempProject, 'src', 'main.cpp'), ''); + // Create a directory without headers — should not be included + fs.mkdirSync(path.join(tempProject, 'docs'), { recursive: true }); + + clearCppIncludeDirCache(); + const dirs = loadCppIncludeDirs(tempProject); + + expect(dirs).toContain('include'); + expect(dirs).toContain('src'); + expect(dirs).not.toContain('docs'); + } finally { + fs.rmSync(tempProject, { recursive: true }); + } + }); + }); }); diff --git a/src/resolution/import-resolver.ts b/src/resolution/import-resolver.ts index d175a0cd..26756a21 100644 --- a/src/resolution/import-resolver.ts +++ b/src/resolution/import-resolver.ts @@ -4,6 +4,7 @@ * Resolves import paths to actual files and symbols. */ +import * as fs from 'fs'; import * as path from 'path'; import { Language, Node } from '../types'; import { UnresolvedRef, ResolvedRef, ResolutionContext, ImportMapping, ReExport } from './types'; @@ -21,6 +22,8 @@ const EXTENSION_RESOLUTION: Record = { go: ['.go'], rust: ['.rs', '/mod.rs'], java: ['.java'], + c: ['.h', '.c'], + cpp: ['.h', '.hpp', '.hxx', '.cpp', '.cc', '.cxx'], csharp: ['.cs'], php: ['.php'], ruby: ['.rb'], @@ -53,9 +56,56 @@ export function resolveImportPath( } // Handle absolute/aliased imports (like @/ or src/) - return resolveAliasedImport(importPath, projectRoot, language, context); + const aliased = resolveAliasedImport(importPath, projectRoot, language, context); + if (aliased) return aliased; + + // C/C++ include directory search: when neither relative nor aliased + // resolution found a match, search -I directories from + // compile_commands.json or heuristic probing. 
+ if (language === 'c' || language === 'cpp') { + return resolveCppIncludePath(importPath, language, context); + } + + return null; } +/** + * C and C++ standard library header names (without delimiters). + * Used by isExternalImport to filter system includes from resolution. + */ +const C_CPP_STDLIB_HEADERS = new Set([ + // C standard library headers + 'assert.h', 'complex.h', 'ctype.h', 'errno.h', 'fenv.h', 'float.h', + 'inttypes.h', 'iso646.h', 'limits.h', 'locale.h', 'math.h', 'setjmp.h', + 'signal.h', 'stdalign.h', 'stdarg.h', 'stdatomic.h', 'stdbool.h', + 'stddef.h', 'stdint.h', 'stdio.h', 'stdlib.h', 'stdnoreturn.h', + 'string.h', 'tgmath.h', 'threads.h', 'time.h', 'uchar.h', 'wchar.h', + 'wctype.h', + // C++ C-library wrappers (cname form) + 'cassert', 'ccomplex', 'cctype', 'cerrno', 'cfenv', 'cfloat', + 'cinttypes', 'ciso646', 'climits', 'clocale', 'cmath', 'csetjmp', + 'csignal', 'cstdalign', 'cstdarg', 'cstdbool', 'cstddef', 'cstdint', + 'cstdio', 'cstdlib', 'cstring', 'ctgmath', 'ctime', 'cuchar', + 'cwchar', 'cwctype', + // C++ STL headers + 'algorithm', 'any', 'array', 'atomic', 'barrier', 'bit', 'bitset', + 'charconv', 'chrono', 'codecvt', 'compare', 'complex', 'concepts', + 'condition_variable', 'coroutine', 'deque', 'exception', 'execution', + 'expected', 'filesystem', 'format', 'forward_list', 'fstream', + 'functional', 'future', 'generator', 'initializer_list', 'iomanip', + 'ios', 'iosfwd', 'iostream', 'istream', 'iterator', 'latch', + 'limits', 'list', 'locale', 'map', 'mdspan', 'memory', 'memory_resource', + 'mutex', 'new', 'numbers', 'numeric', 'optional', 'ostream', 'print', + 'queue', 'random', 'ranges', 'ratio', 'regex', 'scoped_allocator', + 'semaphore', 'set', 'shared_mutex', 'source_location', 'span', + 'spanstream', 'sstream', 'stack', 'stacktrace', 'stdexcept', + 'stdfloat', 'stop_token', 'streambuf', 'string', 'string_view', + 'strstream', 'syncstream', 'system_error', 'thread', 'tuple', + 'type_traits', 'typeindex', 'typeinfo', 
'unordered_map', + 'unordered_set', 'utility', 'valarray', 'variant', 'vector', + 'version', +]); + /** * Check if an import is external (npm package, etc.) * @@ -123,6 +173,16 @@ function isExternalImport( return true; } + if (language === 'c' || language === 'cpp') { + // C/C++ standard library headers — both C-style () and + // C++-style (, ) forms. Checked against the import + // path (which the extractor strips of <> or "" delimiters). + if (C_CPP_STDLIB_HEADERS.has(importPath)) return true; + // C++ headers without .h extension (e.g. "vector", "string") + const withoutExt = importPath.replace(/\.h$/, ''); + if (C_CPP_STDLIB_HEADERS.has(withoutExt)) return true; + } + return false; } @@ -216,6 +276,214 @@ function resolveAliasedImport( return tryWithExt(importPath); } +/** + * C/C++ include directory cache (keyed by project root). + * Loaded once per resolver instance, shared across calls. + */ +const cppIncludeDirCache = new Map(); + +/** + * Clear the C/C++ include directory cache (call between indexing runs) + */ +export function clearCppIncludeDirCache(): void { + cppIncludeDirCache.clear(); +} + +/** + * Discover C/C++ include search directories for a project. + * + * Strategy: + * 1. Look for compile_commands.json (Clang compilation database) in the + * project root and common build subdirectories. Parse -I and -isystem + * flags from compiler commands. + * 2. If no compilation database is found, probe for common convention + * directories (include/, src/, lib/, api/) and top-level directories + * containing .h/.hpp files. + * + * Returns paths relative to projectRoot. 
+ */ +export function loadCppIncludeDirs(projectRoot: string): string[] { + const cached = cppIncludeDirCache.get(projectRoot); + if (cached !== undefined) return cached; + + const dirs = loadCppIncludeDirsFromCompileDB(projectRoot) + || loadCppIncludeDirsHeuristic(projectRoot); + + cppIncludeDirCache.set(projectRoot, dirs); + return dirs; +} + +/** + * Try to load include directories from compile_commands.json. + * Returns null if no compilation database is found (so the heuristic + * fallback can run). Returns an array (possibly empty) otherwise. + */ +function loadCppIncludeDirsFromCompileDB(projectRoot: string): string[] | null { + const candidates = [ + path.join(projectRoot, 'compile_commands.json'), + path.join(projectRoot, 'build', 'compile_commands.json'), + path.join(projectRoot, 'cmake-build-debug', 'compile_commands.json'), + path.join(projectRoot, 'cmake-build-release', 'compile_commands.json'), + path.join(projectRoot, 'out', 'compile_commands.json'), + ]; + + let dbPath: string | undefined; + for (const c of candidates) { + try { + if (fs.existsSync(c)) { + dbPath = c; + break; + } + } catch { + // ignore + } + } + if (!dbPath) return null; + + try { + const content = fs.readFileSync(dbPath, 'utf-8'); + const entries = JSON.parse(content) as Array<{ + directory: string; + command?: string; + arguments?: string[]; + }>; + if (!Array.isArray(entries)) return null; + + const dirSet = new Set(); + for (const entry of entries) { + const dir = entry.directory || projectRoot; + const args = entry.arguments || (entry.command ? 
shlexSplit(entry.command) : []); + for (let i = 0; i < args.length; i++) { + const arg = args[i]!; + let includeDir: string | undefined; + // -I (no space) + if (arg.startsWith('-I') && arg.length > 2) { + includeDir = arg.substring(2); + } + // -isystem (space-separated) + else if ((arg === '-isystem' || arg === '-I') && i + 1 < args.length) { + includeDir = args[i + 1]; + i++; // skip next arg + } + if (includeDir) { + // Normalize: resolve relative to the compilation directory + const absPath = path.isAbsolute(includeDir) + ? includeDir + : path.resolve(dir, includeDir); + const relPath = path.relative(projectRoot, absPath).replace(/\\/g, '/'); + // Skip system directories and paths outside the project + // (relative paths starting with .. or absolute paths like + // /usr/include or C:\usr on Windows) + if (!relPath.startsWith('..') && relPath.length > 0 && !path.isAbsolute(relPath)) { + dirSet.add(relPath); + } + } + } + } + return Array.from(dirSet); + } catch { + return null; + } +} + +/** + * Minimal shlex-style split for compiler command strings. + * Handles double-quoted and single-quoted arguments. 
+ */ +function shlexSplit(cmd: string): string[] { + const result: string[] = []; + let i = 0; + while (i < cmd.length) { + // Skip whitespace + while (i < cmd.length && /\s/.test(cmd[i]!)) i++; + if (i >= cmd.length) break; + const ch = cmd[i]!; + if (ch === '"') { + i++; + let arg = ''; + while (i < cmd.length && cmd[i] !== '"') { + if (cmd[i] === '\\' && i + 1 < cmd.length) { i++; arg += cmd[i]; } + else { arg += cmd[i]; } + i++; + } + i++; // closing quote + result.push(arg); + } else if (ch === "'") { + i++; + let arg = ''; + while (i < cmd.length && cmd[i] !== "'") { arg += cmd[i]; i++; } + i++; // closing quote + result.push(arg); + } else { + let arg = ''; + while (i < cmd.length && !/\s/.test(cmd[i]!)) { arg += cmd[i]; i++; } + result.push(arg); + } + } + return result; +} + +/** + * Heuristic include directory discovery when no compile_commands.json exists. + * Checks common convention directories and scans top-level dirs for headers. + */ +function loadCppIncludeDirsHeuristic(projectRoot: string): string[] { + const dirs: string[] = []; + const conventionDirs = ['include', 'src', 'lib', 'api', 'inc']; + + try { + const entries = fs.readdirSync(projectRoot, { withFileTypes: true }); + for (const entry of entries) { + if (!entry.isDirectory()) continue; + const name = entry.name; + // Convention directories + if (conventionDirs.includes(name.toLowerCase())) { + dirs.push(name); + continue; + } + // Any top-level directory containing .h or .hpp files + try { + const subFiles = fs.readdirSync(path.join(projectRoot, name)); + if (subFiles.some(f => /\.(h|hpp|hxx|hh)$/i.test(f))) { + dirs.push(name); + } + } catch { + // ignore permission errors + } + } + } catch { + // ignore + } + + return dirs; +} + +/** + * Resolve a C/C++ include path by searching include directories. + * Called as a fallback after relative and aliased resolution fail. 
+ */ +function resolveCppIncludePath( + importPath: string, + language: Language, + context: ResolutionContext +): string | null { + const includeDirs = context.getCppIncludeDirs?.() ?? []; + const extensions = EXTENSION_RESOLUTION[language] ?? []; + + for (const dir of includeDirs) { + const normalizedDir = dir.replace(/\\/g, '/'); + for (const ext of extensions) { + const candidate = normalizedDir + '/' + importPath + ext; + if (context.fileExists(candidate)) return candidate; + } + // Try as-is (already has extension) + const candidate = normalizedDir + '/' + importPath; + if (context.fileExists(candidate)) return candidate; + } + + return null; +} + /** * Extract import mappings from a file */ @@ -236,6 +504,8 @@ export function extractImportMappings( mappings.push(...extractJavaImports(content)); } else if (language === 'php') { mappings.push(...extractPHPImports(content)); + } else if (language === 'c' || language === 'cpp') { + mappings.push(...extractCppImports(content)); } return mappings; @@ -511,6 +781,38 @@ function extractPHPImports(content: string): ImportMapping[] { return mappings; } +/** + * Extract C/C++ import mappings from #include directives. + * + * #include brings all symbols from the included header into scope + * (namespace import), so each mapping uses isNamespace: true and + * exportedName: '*'. The localName is set to the header's basename + * without extension so that symbol references like `MyClass` can + * match against any include that might provide it. + */ +function extractCppImports(content: string): ImportMapping[] { + const mappings: ImportMapping[] = []; + + // Match both #include <...> and #include "..." 
+ const includeRegex = /^\s*#\s*include\s+[<"]([^>"]+)[>"]/gm; + let match; + + while ((match = includeRegex.exec(content)) !== null) { + const modulePath = match[1]!; + // Basename without extension for localName matching + const basename = modulePath.split('/').pop()!.replace(/\.(h|hpp|hxx|hh|inl|ipp|cxx|cc|cpp)$/,''); + mappings.push({ + localName: basename || modulePath, + exportedName: '*', + source: modulePath, + isDefault: false, + isNamespace: true, + }); + } + + return mappings; +} + // Cache import mappings per file to avoid re-reading and re-parsing const importMappingCache = new Map(); @@ -519,6 +821,7 @@ const importMappingCache = new Map(); */ export function clearImportMappingCache(): void { importMappingCache.clear(); + cppIncludeDirCache.clear(); } /** diff --git a/src/resolution/index.ts b/src/resolution/index.ts index 2d625148..dafacf40 100644 --- a/src/resolution/index.ts +++ b/src/resolution/index.ts @@ -17,7 +17,7 @@ import { ImportMapping, } from './types'; import { matchReference } from './name-matcher'; -import { resolveViaImport, extractImportMappings, extractReExports } from './import-resolver'; +import { resolveViaImport, extractImportMappings, extractReExports, loadCppIncludeDirs } from './import-resolver'; import { detectFrameworks } from './frameworks'; import { synthesizeCallbackEdges } from './callback-synthesizer'; import { loadProjectAliases, type AliasMap } from './path-aliases'; @@ -131,6 +131,49 @@ const PASCAL_BUILT_INS = new Set([ 'IInterface', 'IUnknown', ]); +const C_BUILT_INS = new Set([ + // Standard C library functions + 'printf', 'fprintf', 'sprintf', 'snprintf', 'scanf', 'fscanf', 'sscanf', + 'malloc', 'calloc', 'realloc', 'free', + 'memcpy', 'memmove', 'memset', 'memcmp', 'memchr', + 'strlen', 'strcpy', 'strncpy', 'strcat', 'strncat', 'strcmp', 'strncmp', + 'strstr', 'strchr', 'strrchr', 'strtok', 'strdup', + 'fopen', 'fclose', 'fread', 'fwrite', 'fgets', 'fputs', 'fputc', 'fgetc', + 'feof', 'ferror', 'fflush', 
'fseek', 'ftell', 'rewind', + 'exit', 'abort', 'atexit', 'atoi', 'atol', 'atof', 'strtol', 'strtoul', 'strtod', + 'qsort', 'bsearch', + 'abs', 'labs', 'rand', 'srand', + 'sin', 'cos', 'tan', 'sqrt', 'pow', 'log', 'log10', 'exp', 'ceil', 'floor', 'fabs', + 'time', 'clock', 'difftime', 'mktime', 'localtime', 'gmtime', 'strftime', 'asctime', + 'assert', 'errno', + 'perror', 'remove', 'rename', 'tmpfile', 'tmpnam', + 'getenv', 'system', + 'signal', 'raise', + 'setjmp', 'longjmp', + 'va_start', 'va_end', 'va_arg', 'va_copy', + 'NULL', 'EOF', 'BUFSIZ', 'FILENAME_MAX', 'RAND_MAX', 'EXIT_SUCCESS', 'EXIT_FAILURE', + 'size_t', 'ptrdiff_t', 'wchar_t', 'intptr_t', 'uintptr_t', + 'int8_t', 'int16_t', 'int32_t', 'int64_t', + 'uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', + 'FILE', 'FILE', + // POSIX additions commonly seen + 'stat', 'lstat', 'fstat', 'open', 'close', 'read', 'write', 'pipe', + 'fork', 'exec', 'waitpid', 'getpid', 'getppid', 'kill', 'sleep', 'usleep', + 'pthread_create', 'pthread_join', 'pthread_mutex_lock', 'pthread_mutex_unlock', + 'dlopen', 'dlsym', 'dlclose', +]); + +const CPP_BUILT_INS = new Set([ + // iostream objects (often used without std:: prefix via using) + 'cout', 'cin', 'cerr', 'clog', 'endl', 'flush', 'ws', + 'std', // the namespace itself when used as std::something + // Common C++ keywords that leak as references + 'nullptr', 'true', 'false', 'this', 'sizeof', 'alignof', 'typeid', + 'static_cast', 'dynamic_cast', 'reinterpret_cast', 'const_cast', + 'make_unique', 'make_shared', 'make_pair', + 'move', 'forward', 'swap', +]); + /** * Reference Resolver * @@ -392,6 +435,10 @@ export class ReferenceResolver { this.reExportCache.set(filePath, reExports); return reExports; }, + + getCppIncludeDirs: () => { + return loadCppIncludeDirs(this.projectRoot); + }, }; } @@ -832,6 +879,16 @@ export class ReferenceResolver { } } + // C/C++ standard library symbols (printf, malloc, std::vector, etc.) 
+ if (ref.language === 'c' || ref.language === 'cpp') { + // Bare C standard library functions and macros + if (C_BUILT_INS.has(name)) return true; + // C++ std:: namespace prefix + if (name.startsWith('std::')) return true; + // C++ iostream manipulators and objects used without std:: + if (CPP_BUILT_INS.has(name)) return true; + } + return false; } diff --git a/src/resolution/types.ts b/src/resolution/types.ts index 25f6552e..e0ef5efa 100644 --- a/src/resolution/types.ts +++ b/src/resolution/types.ts @@ -115,6 +115,13 @@ export interface ResolutionContext { * without modification. */ listDirectories?(relativePath: string): string[]; + /** + * C/C++ include search directories (relative to project root), + * extracted from compile_commands.json or discovered by heuristic. + * Used by resolveCppIncludePath to search -I directories when + * relative resolution fails. Optional so existing callers compile. + */ + getCppIncludeDirs?(): string[]; } /** From 5f6830ffd42c3ed2956894b61f946ac31a3ea940 Mon Sep 17 00:00:00 2001 From: Colby McHenry Date: Tue, 26 May 2026 19:53:06 -0500 Subject: [PATCH 2/2] review: wire #include resolution into pipeline + fix builtin filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PR landed the include-dir scan logic (loadCppIncludeDirs + resolveCppIncludePath) but the indexer never reached it: imports references with referenceName='X.h' fell into resolveViaImport's symbol-lookup branch (matched extractCppImports' basename-without-ext localName via .startsWith, then tried to find a symbol named like the extension and failed). End result on bitcoin-core: 0 new file→file imports vs main, despite the include-dir scan resolving paths correctly when probed directly. resolveViaImport now has a C/C++ imports branch that resolves the include path to the actual file node and returns that — skipping the irrelevant symbol scan. 
Measured on bitcoin-core: +2,059 newly resolved file→file imports (6,027 → 8,086, +34%). The unconditional CPP_BUILT_INS / C_BUILT_INS filter also misfired: C/C++ codebases routinely shadow stdlib names (bitcoin's mp::move, custom allocators with free/malloc, stream classes with read/write/ close/open, logging libs wrapping printf). Filtering those names killed legitimate edges — 1,179 → 0 for move(), 33 → 0 for free(), 149 → 7 for write() on bitcoin. The filter now defers to hasAnyPossibleMatch: only filter when no user-defined symbol with the name exists. std:: prefix stays unconditional (never user-shadowed in practice). After: printf/free/open/close/read/write/swap all preserved at main's counts; the std::move-binds-to-mp::move false-positives still drop (correctly: −2,154 C/C++ calls). Also: drop the duplicate 'FILE' in C_BUILT_INS; add an end-to-end test that asserts `#include "X.h"` produces a file→file imports edge in the real indexing pipeline (not just direct resolver probes); add a test documenting the cross-language `.h` heuristic claim (Obj-C dirs are intentionally allowed as C/C++ include dirs); add CHANGELOG entry under [Unreleased] with measured numbers. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 23 ++++++++- __tests__/resolution.test.ts | 82 +++++++++++++++++++++++++++++++ src/resolution/import-resolver.ts | 24 +++++++++ src/resolution/index.ts | 22 ++++++--- 4 files changed, 143 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4a6e716..97f1492c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,28 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] -### Added +- **C/C++ `#include` resolution — bare-basename includes now connect to the + actual header file, not a phantom import node (#453).** Path-prefixed + includes (`#include "common/args.h"`) already resolved via file-path + suffix matching, but bare-basename includes (`#include "uint256.h"` from a + caller in another directory) used to leave only a phantom edge to a + floating `import` node owned by the including file. The resolver now walks + C/C++ include search directories — pulled from `compile_commands.json` + (`-I`/`-isystem` flags) when present, otherwise discovered by probing + conventional dirs (`include/`, `src/`, `lib/`, `api/`, `inc/`) plus any + top-level directory containing `.h`/`.hpp` files — and resolves the + include to a real file node, producing a true file→file `imports` edge. + System headers (`<stdio.h>`, `<vector>`, `<cstdio>`, ~80 C and ~80 C++ + stdlib names) are filtered before the scan so they don't false-resolve + via heuristic dir matching. C/C++ built-in symbols (`std::*` unconditionally, + plus `printf`/`malloc`/`cout`/`make_shared`/etc. when **no user-defined + symbol with that name exists**) are filtered from name-matching too — + C/C++ projects routinely shadow stdlib names (custom allocators, stream + wrappers, logging libs), so the filter only fires when there's no real + definition to bind to. Measured on bitcoin-core (1,989 indexed files): + C/C++ file→file `imports` edges 6,027 → 8,086 (**+34%**), false-positive + call edges from `std::move`/`std::swap` etc. into similarly-named user + methods −2,154 (**−3.6%** of C/C++ `calls`).
- **Enterprise Spring / MyBatis flow now traces end-to-end (#389).** Three gaps that previously forced agents back to grep on large Spring/MyBatis codebases are closed: - **MyBatis XML mapper indexing + Java↔XML bridge.** `*.xml` files containing `` are now first-class: each `` and `` becomes a method-shaped node qualified as `::`, and a new synthesizer (`mybatis-java-xml`) links the matching Java mapper interface method → its XML statement with a `calls` edge. `` to a `` fragment in the same mapper also resolves. Non-mapper XML (`pom.xml`, `web.xml`, `log4j.xml`, etc.) emits just a file node — no symbol noise. Validated on macrozheng/mall-tiny: all 6 custom-SQL Java mapper methods reach their XML counterparts; `trace(UmsRoleController.listResource, UmsResourceMapper::getResourceListByRoleId-xml)` connects in 4 hops across controller → service-iface → impl → mapper-iface → XML. - **Spring `@Value`/`@ConfigurationProperties` config-key linkage.** `application.{yml,yaml,properties}` (+ profile variants `application-dev.yml`, `bootstrap.yml`, etc.) is parsed during indexing, with one `constant` node per leaf key qualified by its dotted path (`app.cache.name.user-token`). `@Value("${app.cache.name.user-token}")` and `@ConfigurationProperties(prefix = "app.cache")` references in Java/Kotlin emit binding nodes that resolve to the matching key (or, for `@ConfigurationProperties`, a key under the prefix). Spring's **relaxed binding** applies (kebab `cache-list` ↔ camel `cacheList` ↔ snake `cache_list` ↔ `CACHE_LIST`), so a Java `@Value("${app.retryCount}")` finds `app.retry-count` in `application.properties`. `${key:default}` form is supported; the default is stripped before lookup. 
diff --git a/__tests__/resolution.test.ts b/__tests__/resolution.test.ts index 4a8c168b..5b914bfb 100644 --- a/__tests__/resolution.test.ts +++ b/__tests__/resolution.test.ts @@ -1409,5 +1409,87 @@ func main() { fs.rmSync(tempProject, { recursive: true }); } }); + + // Documents the cross-language `.h` behavior. Objective-C and C++ share + // the `.h` extension, so in a mixed iOS-style project an Obj-C header + // dir gets claimed as a C/C++ include dir too. That's intentional — a + // C++ file legitimately can `#include "Foo.h"` against an Obj-C header + // (Obj-C++ / .mm callers), and false-positive inclusion is far cheaper + // than missing real resolutions. The test pins this so a later + // "exclude objc dirs" refactor breaks loudly and reviewers see the + // trade-off explicitly. + it('heuristic claims any top-level dir containing .h files, including Obj-C', () => { + const tempProject = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cpp-test-')); + try { + // C++ side: a `cppmod` dir with a .hpp (C++-only extension) + fs.mkdirSync(path.join(tempProject, 'cppmod'), { recursive: true }); + fs.writeFileSync(path.join(tempProject, 'cppmod', 'shared.hpp'), ''); + // Obj-C side: an `iosmod` dir with .h + .m (no .cpp/.hpp). + fs.mkdirSync(path.join(tempProject, 'iosmod'), { recursive: true }); + fs.writeFileSync(path.join(tempProject, 'iosmod', 'View.h'), ''); + fs.writeFileSync(path.join(tempProject, 'iosmod', 'View.m'), ''); + + clearCppIncludeDirCache(); + const dirs = loadCppIncludeDirs(tempProject); + + // Both included — Obj-C dirs are intentionally allowed. + expect(dirs).toContain('cppmod'); + expect(dirs).toContain('iosmod'); + } finally { + fs.rmSync(tempProject, { recursive: true }); + } + }); + + // End-to-end: ensure `#include "X.h"` produces a file→file `imports` edge + // in the actual indexing pipeline (not just a phantom file→import-node + // edge). 
This pins the include-dir resolution path so the headline PR + // feature can't silently regress to a no-op in the indexing flow. + it('connects #include to the real header file via include-dir scan (end-to-end)', async () => { + const tempProject = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cpp-e2e-')); + try { + fs.mkdirSync(path.join(tempProject, 'include'), { recursive: true }); + fs.mkdirSync(path.join(tempProject, 'src'), { recursive: true }); + fs.writeFileSync( + path.join(tempProject, 'include', 'utils.h'), + `#ifndef UTILS_H\n#define UTILS_H\nint add(int, int);\n#endif\n` + ); + fs.writeFileSync( + path.join(tempProject, 'src', 'main.cpp'), + `#include "utils.h"\n#include <vector>\nint main(){ return add(1,2); }\n` + ); + + clearCppIncludeDirCache(); + cg = await CodeGraph.init(tempProject, { index: true }); + + // Sanity: file nodes exist for the header and the cpp. + const allFiles = cg.getStats(); + expect(allFiles.fileCount).toBe(2); + + // The `#include "utils.h"` edge should target the real + // `include/utils.h` file node — not a floating `import` node + // living inside main.cpp. + const db = DatabaseConnection.open(path.join(tempProject, '.codegraph', 'codegraph.db')); + const rows = db.getDb().prepare(` + select dst.kind as dstKind, dst.file_path as dstPath + from edges e + join nodes src on e.source = src.id + join nodes dst on e.target = dst.id + where e.kind = 'imports' + and src.kind = 'file' + and src.file_path = 'src/main.cpp' + `).all() as Array<{ dstKind: string; dstPath: string }>; + const resolvedToHeader = rows.find( + (r) => r.dstKind === 'file' && r.dstPath === 'include/utils.h' + ); + expect(resolvedToHeader, 'main.cpp → include/utils.h imports edge missing').toBeDefined(); + // `<vector>` should NOT produce a file edge — it's a stdlib header. 
+ const stdlibFile = rows.find( + (r) => r.dstKind === 'file' && r.dstPath && r.dstPath.endsWith('vector') + ); + expect(stdlibFile).toBeUndefined(); + } finally { + fs.rmSync(tempProject, { recursive: true, force: true }); + } + }); }); }); diff --git a/src/resolution/import-resolver.ts b/src/resolution/import-resolver.ts index 26756a21..7a779037 100644 --- a/src/resolution/import-resolver.ts +++ b/src/resolution/import-resolver.ts @@ -952,6 +952,30 @@ export function resolveViaImport( ref: UnresolvedRef, context: ResolutionContext ): ResolvedRef | null { + // C/C++ #include references — resolve directly to the included file + // (file→file edge), bypassing symbol lookup. The extractor emits these + // with `referenceKind: 'imports'` and `referenceName: <include path>` + // (e.g. "uint256.h" or "common/args.h"). Without this branch the + // include-dir scan path inside resolveImportPath never produces an + // edge — resolveViaImport's symbol lookup below would search the + // resolved file for a symbol named like the file extension and fail. 
+ if ((ref.language === 'c' || ref.language === 'cpp') && ref.referenceKind === 'imports') { + const resolvedPath = resolveImportPath(ref.referenceName, ref.filePath, ref.language, context); + if (!resolvedPath) return null; + const basename = resolvedPath.split('/').pop()!; + const fileNodes = context.getNodesByName(basename).filter((n) => n.kind === 'file'); + const fileNode = fileNodes.find((n) => n.filePath === resolvedPath); + if (fileNode) { + return { + original: ref, + targetNodeId: fileNode.id, + confidence: 0.9, + resolvedBy: 'import', + }; + } + return null; + } + // Use cached import mappings (avoids re-reading and re-parsing per ref) const imports = context.getImportMappings(ref.filePath, ref.language); if (imports.length === 0 && !context.readFile(ref.filePath)) { diff --git a/src/resolution/index.ts b/src/resolution/index.ts index dafacf40..c26157d1 100644 --- a/src/resolution/index.ts +++ b/src/resolution/index.ts @@ -155,7 +155,7 @@ const C_BUILT_INS = new Set([ 'size_t', 'ptrdiff_t', 'wchar_t', 'intptr_t', 'uintptr_t', 'int8_t', 'int16_t', 'int32_t', 'int64_t', 'uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', - 'FILE', 'FILE', + 'FILE', // POSIX additions commonly seen 'stat', 'lstat', 'fstat', 'open', 'close', 'read', 'write', 'pipe', 'fork', 'exec', 'waitpid', 'getpid', 'getppid', 'kill', 'sleep', 'usleep', @@ -879,14 +879,22 @@ export class ReferenceResolver { } } - // C/C++ standard library symbols (printf, malloc, std::vector, etc.) + // C/C++ standard library symbols (printf, malloc, std::vector, etc.). + // Names that collide with user-defined symbols are NOT filtered — + // C and C++ projects routinely shadow stdlib names (custom allocators + // define `malloc`/`free`, stream wrappers define `read`/`write`/`open`, + // containers define `move`/`swap`, logging libs wrap `printf`). Killing + // those resolutions makes the graph wrong, not cleaner. 
We only filter + // when there's no user node with this name — then name-matching would + // produce zero edges anyway and the filter just short-circuits work. if (ref.language === 'c' || ref.language === 'cpp') { - // Bare C standard library functions and macros - if (C_BUILT_INS.has(name)) return true; - // C++ std:: namespace prefix + // C++ std:: namespace prefix — safe to filter unconditionally, + // since `std::foo` is never a user-defined qualified name in + // tree-sitter output. if (name.startsWith('std::')) return true; - // C++ iostream manipulators and objects used without std:: - if (CPP_BUILT_INS.has(name)) return true; + if (C_BUILT_INS.has(name) || CPP_BUILT_INS.has(name)) { + return !this.hasAnyPossibleMatch(name); + } } return false;