From e52650b202581c7f7f89e8752aed69b8926cf3c1 Mon Sep 17 00:00:00 2001 From: dvirdukhan Date: Wed, 27 May 2026 16:51:42 +0300 Subject: [PATCH 1/2] fix(analyzer): resolve LSP CALLS edges on repos without a venv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Python analyzer hardcoded `environment_path={path}/venv` when starting jedi-language-server via multilspy. When the repo had no venv (the common case for cloned codebases like sphinx, sympy, anything from SWE-bench), jedi raised `InvalidPythonEnvironment` on every `request_definition()` call. analyzer.resolve() then swallowed the exception silently and the indexer produced a graph with DEFINES edges only — zero CALLS, zero EXTENDS. Benchmark validation showed sphinx (5K functions) and sympy (41K functions) had no resolved cross-references at all. Fix: - source_analyzer.py: prefer {repo}/venv, then {repo}/.venv, then fall back to the host interpreter's environment (sys.executable's prefix) so jedi always has a valid Python to introspect. - analyzer.py: log resolve() failures at WARN with file/line context instead of swallowing them silently, so the next regression is loud. Verified: re-indexed sphinx-doc/sphinx-9230 with the fix: DEFINES: 5640, CALLS: 4931, EXTENDS: 484 (was DEFINES-only). Fixes #685. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- api/analyzers/analyzer.py | 7 ++++++- api/analyzers/source_analyzer.py | 22 +++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/api/analyzers/analyzer.py b/api/analyzers/analyzer.py index 33ca5a2b..96470bea 100644 --- a/api/analyzers/analyzer.py +++ b/api/analyzers/analyzer.py @@ -56,7 +56,12 @@ def resolve(self, files: dict[Path, File], lsp: SyncLanguageServer, file_path: P try: locations = lsp.request_definition(str(file_path), node.start_point.row, node.start_point.column) return [(files[Path(self.resolve_path(location['absolutePath'], path))], files[Path(self.resolve_path(location['absolutePath'], path))].tree.root_node.descendant_for_point_range(Point(location['range']['start']['line'], location['range']['start']['character']), Point(location['range']['end']['line'], location['range']['end']['character']))) for location in locations if location and Path(self.resolve_path(location['absolutePath'], path)) in files] - except Exception: + except Exception as e: + import logging + logging.getLogger(__name__).warning( + "resolve() failed for %s @%d:%d: %s", + file_path, node.start_point.row, node.start_point.column, e, + ) return [] @abstractmethod diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py index 9046abcf..b843b75c 100644 --- a/api/analyzers/source_analyzer.py +++ b/api/analyzers/source_analyzer.py @@ -139,7 +139,27 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None: else: lsps[".java"] = NullLanguageServer() if any(path.rglob('*.py')): - config = MultilspyConfig.from_dict({"code_language": "python", "environment_path": f"{path}/venv"}) + import sys + py_venv = path / "venv" + py_dotvenv = path / ".venv" + if py_venv.is_dir() and (py_venv / "bin" / "python").exists(): + env_path = str(py_venv) + elif py_dotvenv.is_dir() and (py_dotvenv / "bin" / "python").exists(): + env_path = str(py_dotvenv) + else: + # Fall back to the host's Python environment so jedi has a + # valid interpreter to introspect; otherwise every + # request_definition() raises InvalidPythonEnvironment and + # we'd silently produce a graph with zero CALLS edges. + env_path = str(Path(sys.executable).resolve().parent.parent) + logging.info( + "No venv at %s; falling back to host env %s for jedi LSP", + path, env_path, + ) + config = MultilspyConfig.from_dict({ + "code_language": "python", + "environment_path": env_path, + }) lsps[".py"] = SyncLanguageServer.create(config, logger, str(path)) else: lsps[".py"] = NullLanguageServer() From a112c6d634c523b41982528bdad6be470d6cca32 Mon Sep 17 00:00:00 2001 From: Dvir Dukhan <12258836+DvirDukhan@users.noreply.github.com> Date: Thu, 28 May 2026 07:29:38 +0300 Subject: [PATCH 2/2] fix(analyzer): defensive skip when second_pass references untracked file In source_analyzer.second_pass, the list of files we iterate can include paths that first_pass did not add to self.files (e.g. parse errors, LSP-induced timeouts, or rare edge cases where a candidate file is present in the input list but never makes it into the files map). Previously this raised KeyError and aborted the entire index. Hit on sympy/polys/distributedmodules.py during bench calibration of sympy-12481. Skip with a WARN log instead so a single bad file no longer takes down the whole index. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- api/analyzers/source_analyzer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py index b843b75c..c3f8d8db 100644 --- a/api/analyzers/source_analyzer.py +++ b/api/analyzers/source_analyzer.py @@ -180,7 +180,16 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None: # Skip symbol resolution when no real LSP is available if isinstance(lsps.get(file_path.suffix), NullLanguageServer): continue - file = self.files[file_path] + file = self.files.get(file_path) + if file is None: + # first_pass skipped this file (e.g. parse error, empty, + # or ignored after entering the candidate list). Skip + # in second_pass too instead of crashing the whole index. + logging.warning( + "second_pass: %s not in files map (first_pass skipped it); skipping", + file_path, + ) + continue logging.info(f'Processing file ({i + 1}/{files_len}): {file_path}') for _, entity in file.entities.items(): entity.resolved_symbol(lambda key, symbol, fp=file_path: analyzers[fp.suffix].resolve_symbol(self.files, lsps[fp.suffix], fp, path, key, symbol))