From 77ecc4e76218db6b739a339a89fa073c37a3f5f7 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 06:25:44 +0000 Subject: [PATCH 1/5] docs: add local-first code-retrieval strategy research Synthesizes multi-source research on making CodeRAG win a code-retrieval eval harness against agentic-grep loops (Claude Code, Codex) and commercial semantic search (Cursor, Cody, Augment), under a local/zero-key constraint. Key findings and prioritized plan: build a SweRank/Agentless-style eval harness first; swap the default embedder (bge-small ~45.8 CoIR -> CodeRankEmbed ~60.1); add a local ONNX cross-encoder reranker; route/tune hybrid fusion; then structure-aware graph expansion. Includes cited accuracy-vs-cost tradeoffs and the honest grep-vs-embeddings debate. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- docs/research/code-retrieval-strategy.md | 242 +++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 docs/research/code-retrieval-strategy.md diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md new file mode 100644 index 0000000..3465e71 --- /dev/null +++ b/docs/research/code-retrieval-strategy.md @@ -0,0 +1,242 @@ +# Winning the Code-Retrieval Eval: A Strategy for CodeRAG + +> Research synthesis — how to make CodeRAG more **accurate**, **efficient**, and **faster** at +> code retrieval than (a) agentic grep loops (Claude Code, Codex) and (b) commercial semantic +> code search (Cursor, Sourcegraph/Cody, Augment), under a hard constraint: **everything must run +> locally with no API key / no paid LLM calls.** +> +> Confidence levels: **[H]** well-sourced primary, **[M]** secondary/needs re-verification, +> **[?]** contested / sources disagree. + +--- + +## TL;DR — the plan, in priority order + +| # | Move | Expected lift | Cost | Stays local? 
| +|---|------|---------------|------|--------------| +| 0 | **Build the eval harness first** (SweRank/Agentless protocol on real repos) | — (makes claims provable) | Low | ✅ | +| 1 | **Swap default embedder** `bge-small` → **CodeRankEmbed (137M)** | ~+14 CoIR NDCG@10 (45.8 → 60.1) | One model swap, still ONNX/CPU | ✅ | +| 2 | **Add a two-stage cross-encoder reranker** (retrieve top-100 → rerank top-8) | +5 to +15 nDCG/MRR | ~30 ms/query CPU (ONNX) | ✅ | +| 3 | **Tune & route hybrid fusion** (BM25 for identifiers/code↔code, dense for NL; sweep RRF) | a few nDCG points; big latency win | Low | ✅ | +| 4 | **Structure-aware retrieval** (tree-sitter call/import graph + PageRank expansion) | ~+2 pts localization | Medium build | ✅ | +| 5 | **Query expansion / HyDE** — *gated to NL queries only* | mixed; can **hurt** private identifiers | +25–60% latency | ✅ (small local LLM) | + +The first two moves alone plausibly take CodeRAG from a bottom-quartile CoIR retriever to one +competitive with 7B proprietary models — without leaving the local/zero-key envelope. + +--- + +## 0. Build the eval harness first (priority 0) + +You cannot claim "more accurate" without a number, and the number protects the claim. Copy the +**SweRank / Agentless localization protocol** [H]: + +- **Queries:** merged PRs / closed issues → use issue title+body (or commit message) as the query. +- **Ground truth:** the files (and, for function-level, the functions) changed by the fixing + PR/commit diff. This is exactly how SweRank built its 67,341-pair "SweLoc" corpus from 3,387 + repos. [M] (arXiv 2505.07849) +- **Filtering:** drop docs-only/trivial PRs; apply a consistency filter (SweRank used K=20). +- **Metrics:** report **file & function recall@{1,5,10} + MRR** (mirror Agentless/SweRank so your + numbers are directly comparable to published baselines). Use **nDCG@10** for multi-file PRs. 
[H] +- **Baselines to beat:** BM25 (the SWE-bench baseline), a stock dense embedder (bge/gte/OpenAI), + and an Agentless-style LLM localizer. [H] +- **External sanity checks:** run the retriever on **CoIR (NDCG@10)** and **CodeSearchNet (MRR)**; + both ship as pip-installable, BEIR/MTEB-schema frameworks. [H] (arXiv 2407.02883, 1909.09436) + +**Bars worth targeting:** SweRank reports ~96% file Acc@5 and ~88.7% function Acc@10 on +SWE-bench-Lite; LocAgent ~92.7% file accuracy. [M] These are end-to-end localization systems +(retriever + reranker + sometimes an LLM), so they're aspirational ceilings, not retriever-only. + +**Why retrieval matters at all (the framing stat):** on SWE-bench, swapping BM25 context for the +gold "oracle" files more than doubled Claude 2's resolve rate (1.96% → 4.8%); BM25 missed *all* +needed files in nearly half of instances. [H] (arXiv 2310.06770) Better retrieval is the lever. + +--- + +## 1. Upgrade the embedding model (highest single accuracy jump) + +**Current state:** CodeRAG defaults to `BAAI/bge-small-en-v1.5`, which scores only **~45.8 CoIR +NDCG@10** — generic text embedders are markedly weaker than code-specialized ones on code. [H] +(Granite R2 paper, arXiv 2508.21085) + +**Recommended default → `CodeRankEmbed` (137M):** 8192-token context, **60.1 CoIR NDCG@10 / 77.9 +CodeSearchNet MRR** — exceptional accuracy-per-byte that rivals 7B proprietary models, and small +enough for ONNX/CPU. [H] (cornstack/CodeRankEmbed; arXiv 2412.01007) That's roughly **+14 CoIR +points over the current default from a single swap.** +⚠️ Verify its exact license (it's an Arctic-Embed-M-Long fine-tune) before shipping as default. + +**Permissively-licensed alternatives (Apache-2.0):** +- **CodeSage-base-v2 (356M)** — ~64.56 CoIR [M, re-verify], Matryoshka dims (flexible truncation). +- **gte-modernbert-base (149M)** — strong dual text+code model, 8192 ctx, good general fallback. 
+- **nomic-embed-code (7B, Apache-2.0)** — SOTA-ish on CodeSearchNet, but **needs a GPU**; offer as + an opt-in "accuracy" backend, not the local default. [H] + +**License landmines to avoid as defaults:** SFR-Embedding-Code (all sizes) and jina-code-embeddings +are **CC-BY-NC-4.0 (non-commercial)**; Qodo-Embed-1-7B is commercial-license (its 1.5B is open +under OpenRAIL++-M, 68.53 CoIR). [H] Voyage-code-3 is API-only — reference point only. [H] + +> ⚠️ **Metric-scale trap.** Two "code retrieval" scales circulate and get conflated: **CoIR-paper +> NDCG@10** (values in the 40s–70s) vs the **MTEB-Code leaderboard average** (values ~78–90). The +> same model shows up as "71.5" and "79.31". Always compare within one scale. [H] + +--- + +## 2. Add a local cross-encoder reranker (highest-ROI bolt-on) + +The evidence converges: **a small ONNX cross-encoder reranking the top-100 down to top-8 is the +single best accuracy add-on for a local-first engine.** + +- **Expected lift:** +5 to +15 nDCG/MRR points, largest when first-stage recall is weak. [H] + (arXiv 2212.06121; multiple RAG benchmarks). One benchmark saw Hit@1 jump 62.7% → 83.0%. [M] +- **Latency (the local constraint):** FlashRank's `ms-marco-MiniLM-L-12-v2` (~4 MB, ONNX, CPU-only) + adds **~31 ms/query** mean over 100 candidates; sub-20 ms for 50. [H] (FlashRank; + clouatre-labs/rag-reranking-benchmarks) Unoptimized PyTorch is 100–300 ms — **so ship ONNX/int8**. +- **Canonical design:** retrieve top-100 → rerank → top-8. [H] (CoRNStack used window=10/step=5 for + listwise; pointwise cross-encoders are simpler and cheaper.) +- **Drop-in models:** FlashRank `ms-marco-MiniLM-L-12-v2`, or `bge-reranker-base` / `bge-reranker-v2-m3` + in ONNX-int8 (community ONNX builds exist). [H] +- **Code-aware option:** Qwen3-Reranker-0.6B is the best *small code-aware* reranker (MTEB-Code 73.4), + but ~0.5–1 s/query on CPU — borderline interactive; quantize and benchmark before adopting. 
[M] +- **Total local budget:** FAISS ANN (single-digit ms) + ONNX rerank (~30–60 ms) → **well under 100 ms**, + still far faster than an agent's multi-round grep/read loop. + +**Caveat / gap:** there's no published head-to-head of *small* cross-encoders on a *code* benchmark +(CoIR). The strongest code-reranking result (CoRNStack, +2.8 to +12.2 MRR) uses a 7B reranker. The +small-model code lift is **inferred, not directly measured** — your harness should confirm it. [?] + +**Alternative architecture — ColBERT / late interaction:** `answerai-colbert-small-v1` (33M) beats +bge-base on BEIR and searches on CPU in milliseconds; it's a strong *first-stage* upgrade or +same-size reranker. [H] But it's a multi-vector index change (higher per-token storage), not a cheap +bolt-on. Treat as a later experiment, not a v1 move. + +--- + +## 3. Tune and route the hybrid fusion you already have + +CodeRAG already does dense + BM25 + RRF — the literature says that's the right foundation; the wins +are in **routing and tuning**: + +- **Route by query type.** For **code→code (PL→PL)** retrieval, BM25 with word-level splitting + *significantly beats* dense and is ~an order of magnitude faster; for **NL→code**, dense wins. [H] + (arXiv 2510.20609) Detecting "is this query an identifier/snippet vs natural language" and + weighting accordingly is a cheap, high-value heuristic. +- **BM25 is the efficiency anchor.** Retrieval latency varies up to 200× across configs; "BM25 + + word splitting offers the best quality–latency trade-off." [H] Keep BM25 fast and dominant for + exact identifiers — exactly where pure-embedding tools (and Cursor's own data) show weakness. +- **RRF tuning.** k≈60 is a fine default (Elastic says RRF needs no tuning), but a tuned hybrid beat + vanilla RRF by ~4 nDCG points on one benchmark — so **sweep k and per-retriever weights once you + have eval data.** [M] (WANDS is e-commerce, not code — re-verify on your harness.) 
+- **Complementarity is real:** sparse excels at entity/identifier lookup and domain terminology; + dense excels at paraphrase/semantic generalization; **fusion + rerank beats either alone "by a + large margin."** [H] This *is* the gap neither pure-grep agents nor single-modality embedding + tools fully exploit. + +--- + +## 4. Structure-aware retrieval (real but modest; phase 2) + +Graphs help decide *what* to retrieve, and all of these build **locally via tree-sitter**: + +- **RepoGraph** (tree-sitter dependency graph + k-hop ego-graph) lifted SWE-bench-Lite resolve rates + ~2–2.7 pts across four frameworks (best: 1-hop + flatten). [H] (arXiv 2410.14684) +- **GraphCodeBERT** (data-flow-aware) raised CodeSearchNet MRR ~+2 pts over CodeBERT, consistent + across 6 languages (p<0.01). [H] (arXiv 2009.08366) +- **Aider's repo map** = tree-sitter symbol graph + **PageRank** over the file dependency graph, + token-budgeted (~1k tokens), 130+ languages — the canonical engineering pattern for + structure-aware context selection. [H] (aider.chat/2023/10/22/repomap.html) +- **CodeGRAG** (control/data-flow graph view) gave small local models +5–6 pts on generation. [H] + +**Verdict:** worth building as a retrieval-expansion/rerank signal (1-hop neighbors of top hits), +but gains are modest (~2 pts) and it's more engineering than moves #1–3. Do it after the eval + +embedder + reranker land. + +--- + +## 5. Query expansion / HyDE — double-edged, gate carefully + +- HyDE (generate a hypothetical code snippet, embed *that*) bridges the NL↔code gap and improves + recall in general RAG. [M] But **no code-specific benchmark number surfaced** — unproven on code. [?] +- **It can actively hurt** exactly your hardest case: LLM query expansion degraded retrieval on + *unfamiliar/ambiguous* queries (−10 NDCG@10 on unfamiliar; −17 Recall@100 on high-ambiguity). [H] + (arXiv 2505.12694) Private-codebase identifiers/internal APIs are precisely "unfamiliar." 
+- **Cost:** +25–60% latency (extra small-model generation + second embedding pass). [M] + +**Verdict:** if used at all, **gate it to clearly natural-language queries** and never to +identifier-like queries. Low priority; measure on the harness before enabling by default. + +--- + +## The agentic-grep baseline — what you're actually up against + +This debate is **genuinely contested**; sources have skin in the game. Be honest about it. + +**The anti-embedding camp (grep + read loops win):** +- **Claude Code:** Boris Cherny — *"Claude Code doesn't use RAG currently… agentic search + out-performed RAG for the kinds of things people use Code for."* Early versions used a local + vector DB and dropped it. Stack = Glob + Grep (ripgrep) + Read, on demand. [H] (HN 43164253) +- **Cline:** *"no RAG, no embeddings, no vector databases"* — by design. Their mechanism arguments: + chunking "tears apart logic" (a call in chunk 47, its def in chunk 892); an index is "a snapshot + frozen in time" that drifts stale; embeddings double your IP attack surface. [H] +- **Sourcegraph Cody:** *removed* embeddings entirely in v5.3 (scaling to 100k+ repos, security, + maintenance) — "the most important aspect is getting the files, not the algorithm." [H] +- **Augment:** for SWE-bench, *"grep and find were sufficient… embedding-based retrieval was not the + bottleneck"* — though they say embeddings are still "critical … in real use." [H] +- The security argument has teeth: **Vec2Text reconstructs ~92% of 32-token inputs exactly** — + embeddings are invertible and irrevocable, so they need original-text-level safeguards. [H] + (Reconstruction accuracy degrades sharply beyond ~50–100 tokens, so the risk is bounded.) CodeRAG's local-first stance is a + direct answer to this. 
+ +**The pro-semantic counter-evidence (the opening you exploit):** +- **Cursor (publishes data):** semantic search + grep gives **+12.5% accuracy** (6.5–23.5% by model) + over grep alone, **largest on 1,000+ file repos.** [H] They agree grep is the floor; embeddings + raise the ceiling. +- **NVIDIA CORTEXA (the cleanest controlled result):** a fine-tuned code embedder (NV-EmbedCode) + hits **71.95% recall, +31.28% over BM25** and +6.4% over Agentless on SWE-bench localization. [H] + This strongly implies the "grep wins" results reflect *off-the-shelf embedders + naive chunking*, + not embeddings in principle. +- Agentic loops are expensive: ~2.7× input tokens / 1.7× output tokens vs enhanced RAG, and up to + 83× higher latency sensitivity (poor KV-cache reuse). [M] (general agentic-RAG, not code-specific) + +**Synthesis — the defensible thesis:** the grep camp's wins are real but measured on +**SWE-bench-style edit tasks with persistent agents and off-the-shelf embedders.** CodeRAG's +opening is the union of (a) where pure grep is documented to fail — conceptual / "where is X +handled" NL queries, large/unfamiliar repos (Cursor's 1,000+ file effect; CORTEXA's recall gap) — +**plus** (b) grep's strength on exact identifiers, captured by BM25, **plus** (c) a fine-tuned/ +code-specific embedder and a reranker (CORTEXA shows that closes the gap). The literature's clearest +verdict — **fusion + rerank beats either modality alone** — is exactly the niche neither pure-grep +agents nor single-modality embedding products fully occupy. And being **local/zero-key** answers the +staleness (live watcher) and security (no code leaves the box; no invertible embeddings shipped out) +objections in one stroke. + +--- + +## Open contradictions to resolve on your own harness + +1. 
**Chunking [?]:** cAST claims AST chunking beats fixed-size on recall/SWE-bench (+1.8–4.3 Recall@5); + a controlled study finds **sliding windows beat function-level chunking** on completion EM (function + chunking was *worst*), and line-based ≈ syntax-aware across budgets. CodeRAG's symbol-aware chunking + is well-suited to NL→code localization (your use case), but **don't assume AST splitting is strictly + best** — test windowed/overlapping and hierarchical (parent-document) variants. Optimal chunk size + was ~2,000 non-whitespace chars; bigger degraded. [H/?] +2. **Small reranker on code [?]:** lift is inferred from English benchmarks; confirm on CoIR/your harness. +3. **CoIR absolute numbers [M]:** several (Voyage/E5/ada; SweRank Acc@k; >0.78 CSN MRR) came via PDF + parsing — re-verify against source tables before publishing. + +## Sequencing recommendation + +1. **Eval harness** (§0) — nothing is provable without it. +2. **CodeRankEmbed default** (§1) — biggest single jump, one swap, stays ONNX/CPU. +3. **ONNX cross-encoder reranker** (§2) — highest-ROI bolt-on, ~30 ms. +4. **Hybrid routing + RRF sweep** (§3) — cheap, big latency win, plays to BM25's identifier strength. +5. **MCP surface** — so agents (Claude Code/Codex) actually adopt CodeRAG as their retrieval tool. +6. **Graph expansion** (§4), then **gated HyDE** (§5) — measured, incremental. 
+ +--- + +### Key sources +CoIR (arXiv 2407.02883) · Granite R2 broad comparison (2508.21085) · CodeRankEmbed/CoRNStack +(2412.01007) · SweRank (2505.07849) · LocAgent (2503.09089) · SWE-bench (2310.06770) · CodeSearchNet +(1909.09436) · RepoGraph (2410.14684) · GraphCodeBERT (2009.08366) · chunking study (2605.04763) · +budget/routing (2510.20609) · query-expansion risk (2505.12694) · Cursor semsearch (cursor.com/blog/semsearch) +· Cline (cline.bot/blog) · Sourcegraph Cody FAQ · Augment (jxnl.co / augmentcode.com) · NVIDIA CORTEXA +(research.nvidia.com/labs/adlr/cortexa) · FlashRank · mxbai-rerank-v2 · answerai-colbert-small-v1. From 528f17f18c234e1927a2acfcca78c074b7249115 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 06:51:46 +0000 Subject: [PATCH 2/5] feat(eval): add code-retrieval eval harness Implements move #0 of the retrieval strategy: a small, offline harness to measure retrieval quality so accuracy claims are provable and regressions are caught. - coderag/eval/metrics.py: recall@k, hit@k (Acc@k), MRR, nDCG@k with rank de-duplication so multiple chunks per file don't inflate scores. - coderag/eval/dataset.py: JSONL EvalCase format + a git miner that builds datasets SWE-bench/SweRank-style (commit subject -> changed files that still exist at HEAD), filtering merges/reverts/bots/diffuse commits. - coderag/eval/harness.py: evaluate() scores any search callable; compare_modes() contrasts dense-only vs BM25-only vs hybrid on one index by swapping RRF fusion weights (the index is mode-independent). - coderag eval [--build] [--compare] [--level file|symbol] CLI surface, a thin adapter over the engine. - docs/eval.md usage guide; tests cover metrics, dataset round-trip, the git miner, and end-to-end scoring via the deterministic fake provider. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- coderag/eval/__init__.py | 39 ++++++++ coderag/eval/dataset.py | 175 ++++++++++++++++++++++++++++++++ coderag/eval/harness.py | 180 +++++++++++++++++++++++++++++++++ coderag/eval/metrics.py | 78 +++++++++++++++ coderag/surfaces/cli.py | 97 ++++++++++++++++++ docs/eval.md | 80 +++++++++++++++ tests/test_eval.py | 208 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 857 insertions(+) create mode 100644 coderag/eval/__init__.py create mode 100644 coderag/eval/dataset.py create mode 100644 coderag/eval/harness.py create mode 100644 coderag/eval/metrics.py create mode 100644 docs/eval.md create mode 100644 tests/test_eval.py diff --git a/coderag/eval/__init__.py b/coderag/eval/__init__.py new file mode 100644 index 0000000..60554b8 --- /dev/null +++ b/coderag/eval/__init__.py @@ -0,0 +1,39 @@ +"""Code-retrieval evaluation harness. + +A small, offline, dependency-free harness for measuring *retrieval* quality — "did we +surface the right file/symbol for this query?" — so accuracy claims are provable and +regressions are caught. + +It follows the SWE-bench / Agentless / SweRank localization protocol: queries come from +real commit messages or issues, and ground truth is the set of files (and optionally +symbols) those commits changed. Metrics are the standard localization set: recall@k, +hit@k (Acc@k), MRR, and nDCG@k. + +The public pieces: + +- :class:`EvalCase` / :func:`load_dataset` / :func:`save_dataset` — the dataset format. +- :func:`build_from_git` — mine a dataset from a repo's history (no network, no LLM). +- :func:`evaluate` — score one retriever (any ``search`` callable) against a dataset. +- :func:`compare_modes` — score dense-only vs BM25-only vs hybrid on one index, which is + the built-in way to show fusion beats either modality alone. 
+""" + +from __future__ import annotations + +from coderag.eval.dataset import EvalCase, build_from_git, load_dataset, save_dataset +from coderag.eval.harness import EvalResult, compare_modes, evaluate +from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k + +__all__ = [ + "EvalCase", + "EvalResult", + "build_from_git", + "compare_modes", + "evaluate", + "hit_at_k", + "load_dataset", + "mrr", + "ndcg_at_k", + "recall_at_k", + "save_dataset", +] diff --git a/coderag/eval/dataset.py b/coderag/eval/dataset.py new file mode 100644 index 0000000..c38a547 --- /dev/null +++ b/coderag/eval/dataset.py @@ -0,0 +1,175 @@ +"""Eval dataset: a list of (query -> relevant files/symbols) cases, plus a git miner. + +The dataset is plain JSONL so it's diffable, hand-editable, and easy to share. Each line +is one :class:`EvalCase`. :func:`build_from_git` synthesizes a dataset from a repo's own +history using the SWE-bench/SweRank recipe: the commit subject becomes the query and the +files that commit changed (that still exist at HEAD) become the ground truth. +""" + +from __future__ import annotations + +import json +import subprocess +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, List, Optional, Sequence + +# Control-character delimiters for one-pass `git log` parsing — they never appear in +# real commit messages, so we don't have to escape file paths or message text. +_REC = "\x1e" # between commits +_FLD = "\x1f" # between fields within a commit header + + +@dataclass(slots=True) +class EvalCase: + """One retrieval query and its ground-truth relevant items. + + ``relevant_files`` are repo-relative posix paths; ``relevant_symbols`` are optional + qualified names (e.g. ``"Indexer._index_file"``) for function/class-level scoring. 
+ """ + + query: str + relevant_files: List[str] + relevant_symbols: List[str] = field(default_factory=list) + id: Optional[str] = None + source: str = "" + + def as_dict(self) -> Dict[str, object]: + d: Dict[str, object] = { + "query": self.query, + "relevant_files": self.relevant_files, + } + if self.relevant_symbols: + d["relevant_symbols"] = self.relevant_symbols + if self.id: + d["id"] = self.id + if self.source: + d["source"] = self.source + return d + + @classmethod + def from_dict(cls, d: Dict[str, object]) -> "EvalCase": + files = d.get("relevant_files", []) + symbols = d.get("relevant_symbols", []) + return cls( + query=str(d["query"]), + relevant_files=[str(p) for p in files] if isinstance(files, list) else [], + relevant_symbols=( + [str(s) for s in symbols] if isinstance(symbols, list) else [] + ), + id=str(d["id"]) if d.get("id") else None, + source=str(d.get("source", "")), + ) + + +def load_dataset(path: Path | str) -> List[EvalCase]: + """Load a JSONL dataset, skipping blank lines.""" + cases: List[EvalCase] = [] + with Path(path).open(encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if line: + cases.append(EvalCase.from_dict(json.loads(line))) + return cases + + +def save_dataset(cases: Sequence[EvalCase], path: Path | str) -> None: + """Write cases as JSONL (one compact JSON object per line).""" + p = Path(path) + p.parent.mkdir(parents=True, exist_ok=True) + with p.open("w", encoding="utf-8") as fh: + for case in cases: + fh.write(json.dumps(case.as_dict(), ensure_ascii=False) + "\n") + + +def _git(repo: Path, *args: str) -> str: + """Run a git command in ``repo`` and return stdout (raises on failure).""" + return subprocess.run( + ["git", "-C", str(repo), *args], + check=True, + capture_output=True, + text=True, + ).stdout + + +def build_from_git( + repo: Path | str, + *, + max_cases: int = 200, + extensions: Optional[Sequence[str]] = None, + max_files_per_commit: int = 5, + min_query_len: int = 12, + commit_scan_limit: int = 
2000, +) -> List[EvalCase]: + """Mine an eval dataset from a repo's commit history. + + For each non-merge commit, the subject line is the query and the changed files that + (a) match ``extensions`` and (b) still exist at HEAD become the relevant set — so + every ground-truth file is actually present in the index built from HEAD. + + Filtering mirrors SweRank/Agentless dataset construction: skip merges, reverts, and + bot/automated commits; drop commits that touch too many files (``max_files_per_commit`` + — diffuse, weak signal) or none of the targeted extensions; and require a meaningful + query (``min_query_len``). + """ + repo = Path(repo) + exts = { + e if e.startswith(".") else f".{e}" + for e in (extensions or (".py", ".js", ".ts", ".tsx", ".go", ".rs", ".java")) + } + + fmt = f"{_REC}%H{_FLD}%s{_FLD}%an" + raw = _git( + repo, + "log", + "--no-merges", + f"-n{commit_scan_limit}", + "--name-only", + f"--pretty=format:{fmt}", + ) + + cases: List[EvalCase] = [] + for record in raw.split(_REC): + if not record.strip() or len(cases) >= max_cases: + continue + header, _, body = record.partition("\n") + parts = header.split(_FLD) + if len(parts) < 3: + continue + sha, subject, author = parts[0], parts[1].strip(), parts[2].strip() + + if not _is_usable_query(subject, min_query_len) or _is_bot(author): + continue + + files = [ + line.strip() + for line in body.splitlines() + if line.strip() and Path(line.strip()).suffix in exts + ] + # Keep only files that still exist at HEAD, so they're retrievable from the index. + files = [f for f in files if (repo / f).exists()] + if not files or len(files) > max_files_per_commit: + continue + + cases.append( + EvalCase( + query=subject, + relevant_files=files, + id=sha[:12], + source="git", + ) + ) + return cases + + +def _is_usable_query(subject: str, min_len: int) -> bool: + if len(subject) < min_len: + return False + low = subject.lower() + # Reverts/merges/version bumps carry little localization signal. 
+ return not low.startswith(("revert", "merge", "bump", "release ")) + + +def _is_bot(author: str) -> bool: + low = author.lower() + return "bot" in low or low in {"dependabot", "github-actions", "renovate"} diff --git a/coderag/eval/harness.py b/coderag/eval/harness.py new file mode 100644 index 0000000..aa112b5 --- /dev/null +++ b/coderag/eval/harness.py @@ -0,0 +1,180 @@ +"""Run a retriever against an eval dataset and report localization metrics. + +:func:`evaluate` scores any ``search`` callable; :func:`compare_modes` is the convenience +that scores dense-only, BM25-only, and hybrid retrieval on a single index — the built-in +way to demonstrate that fusion beats either modality alone. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Sequence, Tuple + +from coderag.eval.dataset import EvalCase +from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k +from coderag.types import SearchHit + +if TYPE_CHECKING: + from coderag.api import CodeRAG + +# A retriever: given a query and a result count, return ranked hits (best-first). +SearchFn = Callable[[str, int], List[SearchHit]] + +# (label, dense_weight, lexical_weight) — the three retrieval modes we contrast. +DEFAULT_MODES: Tuple[Tuple[str, float, float], ...] = ( + ("dense", 1.0, 0.0), + ("bm25", 0.0, 1.0), + ("hybrid", 1.0, 1.0), +) + +DEFAULT_KS: Tuple[int, ...] = (1, 5, 10) + + +@dataclass(slots=True) +class EvalResult: + """Aggregate metrics for one retriever over one dataset.""" + + label: str + level: str # "file" | "symbol" + n: int # number of scored cases + ks: Tuple[int, ...] 
def _ranked_ids(hits: Sequence[SearchHit], level: str) -> List[str]:
    """Project hits onto the id space being scored (file paths or symbols).

    At the ``"symbol"`` level, hits without a symbol are dropped; any other ``level``
    value is scored by file path. Order is preserved and duplicates are kept (the
    metric functions de-duplicate).
    """
    if level == "symbol":
        return [h.symbol for h in hits if h.symbol]
    return [h.path for h in hits]


def _relevant_ids(case: EvalCase, level: str) -> List[str]:
    """Return the ground-truth ids of ``case`` for ``level`` (symbols, else files)."""
    return case.relevant_symbols if level == "symbol" else case.relevant_files


def evaluate(
    search_fn: SearchFn,
    cases: Sequence[EvalCase],
    *,
    label: str = "retriever",
    ks: Sequence[int] = DEFAULT_KS,
    level: str = "file",
    overfetch: int = 4,
) -> EvalResult:
    """Score ``search_fn`` over ``cases`` at ``level`` ("file" or "symbol").

    Cases with no ground-truth ids at the requested level are skipped (so a file-only
    dataset can still be scored at the symbol level without penalizing the retriever).

    Args:
        search_fn: Retriever called as ``search_fn(query, count)``; returns hits
            best-first.
        cases: Dataset to score.
        label: Name stored on the result (shown in tables).
        ks: Cutoffs for recall/hit/nDCG. Duplicates are removed; every value must be
            >= 1.
        level: ``"symbol"`` scores symbols; anything else scores file paths.
        overfetch: Multiplier applied to ``max(ks)`` when requesting hits. Hits are
            chunks, and several chunks of one file (or hits without a symbol) collapse
            or drop out when projected to ids, so requesting exactly ``max(ks)`` hits
            can leave fewer than k distinct ids and understate every @k metric. Pass
            ``1`` to request exactly ``max(ks)`` hits.

    Returns:
        An :class:`EvalResult` with per-k means and MRR over the scored cases; all
        metrics are 0.0 when no case was scored.

    Raises:
        ValueError: if ``ks`` is empty or contains a value below 1.
    """
    ks = tuple(sorted(set(ks)))
    if not ks or ks[0] < 1:
        raise ValueError("ks must contain at least one cutoff, all of them >= 1")
    cutoff = ks[-1]  # largest k; also the rank cutoff for MRR
    fetch = cutoff * max(1, overfetch)
    recall_sum = {k: 0.0 for k in ks}
    hit_sum = {k: 0.0 for k in ks}
    ndcg_sum = {k: 0.0 for k in ks}
    mrr_sum = 0.0
    scored = 0

    for case in cases:
        relevant = _relevant_ids(case, level)
        if not relevant:
            continue
        ranked = _ranked_ids(search_fn(case.query, fetch), level)
        for k in ks:
            recall_sum[k] += recall_at_k(ranked, relevant, k)
            hit_sum[k] += hit_at_k(ranked, relevant, k)
            ndcg_sum[k] += ndcg_at_k(ranked, relevant, k)
        # MRR stays bounded by the largest k over *distinct* ids, not by the raw
        # number of hits fetched.
        mrr_sum += mrr(ranked, relevant, cutoff)
        scored += 1

    n = max(scored, 1)  # avoid dividing by zero when every case was skipped
    return EvalResult(
        label=label,
        level=level,
        n=scored,
        ks=ks,
        recall={k: recall_sum[k] / n for k in ks},
        hit={k: hit_sum[k] / n for k in ks},
        ndcg={k: ndcg_sum[k] / n for k in ks},
        mrr=mrr_sum / n,
    )
def format_table(results: Sequence[EvalResult]) -> str:
    """Render results as a compact fixed-width table for the CLI.

    Columns are ``mode``, ``n``, ``MRR``, then recall, nDCG and hit at every cutoff of
    the first result. A result that was not scored at one of those cutoffs shows ``-``
    in that cell instead of raising ``KeyError``. Returns ``"(no results)"`` for an
    empty sequence.
    """
    if not results:
        return "(no results)"
    ks = results[0].ks

    def cell(table: Dict[int, float], k: int) -> str:
        return f"{table[k]:.3f}" if k in table else "-"

    headers = (
        ["mode", "n", "MRR"]
        + [f"R@{k}" for k in ks]
        + [f"nDCG@{k}" for k in ks]
        + [f"Hit@{k}" for k in ks]
    )
    rows = [headers]
    for r in results:
        rows.append(
            [r.label, str(r.n), f"{r.mrr:.3f}"]
            + [cell(r.recall, k) for k in ks]
            + [cell(r.ndcg, k) for k in ks]
            + [cell(r.hit, k) for k in ks]
        )
    # Each column is as wide as its widest cell, header included.
    widths = [max(len(row[i]) for row in rows) for i in range(len(headers))]
    lines = [
        " ".join(cell_text.ljust(widths[i]) for i, cell_text in enumerate(row))
        for row in rows
    ]
    lines.insert(1, " ".join("-" * w for w in widths))  # rule under the header
    return "\n".join(lines)


def best_label(
    results: Sequence[EvalResult], *, metric: str = "ndcg", k: int = 10
) -> Optional[str]:
    """Label of the highest-scoring result by ``metric`` at ``k`` (MRR ignores ``k``).

    ``metric`` is one of ``"mrr"``, ``"recall"``, ``"hit"`` or ``"ndcg"``; any other
    value raises ``KeyError``. When a result was not scored at ``k``, its score at its
    largest cutoff is used instead, or 0.0 if it has no cutoffs at all. Ties go to the
    earliest result. Returns ``None`` for an empty sequence.
    """
    if not results:
        return None

    def score(r: EvalResult) -> float:
        if metric == "mrr":
            return r.mrr
        table = {"recall": r.recall, "hit": r.hit, "ndcg": r.ndcg}[metric]
        if k in table:
            return table[k]
        # Resolve the fallback lazily: max() over an empty ``ks`` would raise, even
        # though 0.0 is the intended score for a result with nothing to report.
        fallback_k = max(r.ks, default=None)
        return table.get(fallback_k, 0.0)

    return max(results, key=score).label
+ +All functions take ``ranked`` (a best-first list of retrieved item ids — file paths or +symbols, already deduplicated and order-preserving) and ``relevant`` (the set of +ground-truth ids). Items are compared by equality, so callers must normalize ids (e.g. +posix relative paths) before scoring. + +These are the standard metrics used by SWE-bench localization work (Agentless, LocAgent, +SweRank) and the CoIR / CodeSearchNet benchmarks: + +- ``recall_at_k`` — fraction of relevant items found in the top k. +- ``hit_at_k`` — 1.0 if *any* relevant item is in the top k (a.k.a. Acc@k / hit rate). +- ``mrr`` — reciprocal rank of the first relevant item. +- ``ndcg_at_k`` — rank-discounted gain with binary relevance. +""" + +from __future__ import annotations + +import math +from typing import Iterable, Sequence + + +def _dedup(ranked: Sequence[str]) -> list[str]: + """Stable de-duplication, keeping the first (best) occurrence of each id.""" + seen: set[str] = set() + out: list[str] = [] + for item in ranked: + if item not in seen: + seen.add(item) + out.append(item) + return out + + +def recall_at_k(ranked: Sequence[str], relevant: Iterable[str], k: int) -> float: + """Fraction of relevant ids present in the top ``k`` retrieved ids.""" + rel = set(relevant) + if not rel: + return 0.0 + top = set(_dedup(ranked)[:k]) + return len(top & rel) / len(rel) + + +def hit_at_k(ranked: Sequence[str], relevant: Iterable[str], k: int) -> float: + """1.0 if at least one relevant id is in the top ``k`` (Acc@k), else 0.0.""" + rel = set(relevant) + if not rel: + return 0.0 + return 1.0 if rel & set(_dedup(ranked)[:k]) else 0.0 + + +def mrr(ranked: Sequence[str], relevant: Iterable[str], k: int | None = None) -> float: + """Reciprocal rank of the first relevant id (0 if none within the cutoff).""" + rel = set(relevant) + if not rel: + return 0.0 + ordered = _dedup(ranked) + if k is not None: + ordered = ordered[:k] + for rank, item in enumerate(ordered, start=1): + if item in rel: + 
return 1.0 / rank + return 0.0 + + +def ndcg_at_k(ranked: Sequence[str], relevant: Iterable[str], k: int) -> float: + """Normalized discounted cumulative gain at ``k`` with binary relevance.""" + rel = set(relevant) + if not rel or k <= 0: + return 0.0 + ordered = _dedup(ranked)[:k] + dcg = sum( + 1.0 / math.log2(rank + 1) + for rank, item in enumerate(ordered, start=1) + if item in rel + ) + ideal_hits = min(len(rel), k) + idcg = sum(1.0 / math.log2(rank + 1) for rank in range(1, ideal_hits + 1)) + return dcg / idcg if idcg else 0.0 diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py index 0aef288..25ca920 100644 --- a/coderag/surfaces/cli.py +++ b/coderag/surfaces/cli.py @@ -90,6 +90,66 @@ def cmd_status(args: argparse.Namespace) -> int: return 0 +def cmd_eval(args: argparse.Namespace) -> int: + from coderag import eval as ev + + cfg = _build_config(args) + + # `coderag eval build` — mine a dataset from the repo's git history. + if args.build: + cases = ev.build_from_git( + cfg.watched_dir, + max_cases=args.max_cases, + extensions=[e.lstrip(".") for e in _suffixes(cfg.languages)], + ) + out = args.dataset or "coderag-eval.jsonl" + ev.save_dataset(cases, out) + print(f"Wrote {len(cases)} eval case(s) to {out}") + return 0 if cases else 1 + + if not args.dataset: + print("Provide --dataset PATH (or --build to mine one from git history).") + return 1 + cases = ev.load_dataset(args.dataset) + if not cases: + print(f"No eval cases in {args.dataset}.") + return 1 + + ks = tuple(int(k) for k in args.ks.split(",")) + cr = CodeRAG(cfg) + cr.index() # ensure the index is built / up to date before scoring + + if args.compare: + results = ev.compare_modes(cr, cases, ks=ks, level=args.level) + else: + results = [ + ev.evaluate(cr.search, cases, label="hybrid", ks=ks, level=args.level) + ] + + if args.json: + print(json.dumps([r.as_dict() for r in results], indent=2)) + else: + from coderag.eval.harness import format_table + + print(f"Eval: {len(cases)} case(s), 
level={args.level}\n") + print(format_table(results)) + return 0 + + +def _suffixes(languages: tuple) -> list: + """Map configured language names to file suffixes for dataset mining.""" + table = { + "python": ".py", + "javascript": ".js", + "typescript": ".ts", + "tsx": ".tsx", + "go": ".go", + "rust": ".rs", + "java": ".java", + } + return [table[lang] for lang in languages if lang in table] + + def cmd_watch(args: argparse.Namespace) -> int: from coderag.watch import watch @@ -181,6 +241,43 @@ def build_parser() -> argparse.ArgumentParser: _add_common(p_status) p_status.set_defaults(func=cmd_status) + p_eval = sub.add_parser( + "eval", + help="Measure retrieval quality against a dataset (recall@k, MRR, nDCG).", + ) + p_eval.add_argument( + "--dataset", help="JSONL dataset of query -> relevant files/symbols." + ) + p_eval.add_argument( + "--build", + action="store_true", + help="Mine a dataset from git history into --dataset (default coderag-eval.jsonl).", + ) + p_eval.add_argument( + "--max-cases", + type=int, + default=200, + help="Cap cases when building (default 200).", + ) + p_eval.add_argument( + "--compare", + action="store_true", + help="Score dense-only vs BM25-only vs hybrid on one index.", + ) + p_eval.add_argument( + "--level", + choices=("file", "symbol"), + default="file", + help="Localization granularity (default file).", + ) + p_eval.add_argument( + "--ks", default="1,5,10", help="Comma-separated cutoffs (default 1,5,10)." + ) + p_eval.add_argument("--json", action="store_true", help="Emit JSON.") + p_eval.add_argument("--quiet", action="store_true", help="Hide the progress bar.") + _add_common(p_eval) + p_eval.set_defaults(func=cmd_eval) + p_watch = sub.add_parser( "watch", help="Index, then keep the index live on changes." 
) diff --git a/docs/eval.md b/docs/eval.md new file mode 100644 index 0000000..c1858e1 --- /dev/null +++ b/docs/eval.md @@ -0,0 +1,80 @@ +# Retrieval eval harness + +A small, offline harness for measuring **retrieval quality** — "did we surface the right +file/symbol for this query?" — so accuracy claims are provable and regressions are caught. +It implements move #0 of [the code-retrieval strategy](research/code-retrieval-strategy.md): +nothing else in that plan (a better embedder, a reranker, fusion tuning) is worth shipping +until we can measure it. + +## Metrics + +Standard localization metrics, matching the SWE-bench / Agentless / SweRank and CoIR / +CodeSearchNet conventions: + +- **recall@k** — fraction of relevant items found in the top k. +- **hit@k** (Acc@k) — 1 if *any* relevant item is in the top k. +- **MRR** — reciprocal rank of the first relevant item. +- **nDCG@k** — rank-discounted gain (binary relevance). + +Scored at **file** level (default) or **symbol** level (`--level symbol`). + +## Quick start + +```bash +# 1. Mine a dataset from the repo's own git history (no network, no LLM): +# query = commit subject, ground truth = files that commit changed (and still exist). +coderag eval --build --dataset coderag-eval.jsonl + +# 2. Score the current hybrid retriever: +coderag eval --dataset coderag-eval.jsonl + +# 3. Contrast dense-only vs BM25-only vs hybrid on one index: +coderag eval --dataset coderag-eval.jsonl --compare +``` + +``` +mode n MRR R@1 R@5 R@10 nDCG@1 nDCG@5 nDCG@10 Hit@1 Hit@5 Hit@10 +------ -- ----- ----- ----- ----- ------ ------ ------- ----- ----- ------ +dense … +bm25 … +hybrid … +``` + +Add `--json` for machine-readable output, `--ks 1,3,5,10` to change cutoffs, and +`--level symbol` for function/class-level localization (needs `relevant_symbols` in the +dataset). The usual `--watched-dir` / `--store-dir` / `--provider` / `--model` flags apply. 
+ +> The default `fake` provider is for tests only — its vectors are random, so dense looks +> near-zero. Run real evals against `fastembed` (the local default) or whatever model you're +> evaluating, e.g. `coderag eval --dataset … --compare --model BAAI/bge-small-en-v1.5` then +> again with a candidate like CodeRankEmbed to measure the lift. + +## Dataset format + +JSONL, one case per line: + +```json +{"query": "fix retry backoff on 429", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["stream_answer"], "id": "abc123", "source": "git"} +``` + +`relevant_symbols` and `id`/`source` are optional. Mine with `--build`, or hand-author cases +for queries you care about (the natural-language "where is X handled" questions where semantic +retrieval should beat grep). + +## Library API + +```python +from coderag import CodeRAG, Config +from coderag.eval import build_from_git, compare_modes, evaluate + +cr = CodeRAG(Config.from_env()) +cr.index() +cases = build_from_git(cr.config.watched_dir, max_cases=200) + +for r in compare_modes(cr, cases): # dense / bm25 / hybrid + print(r.label, r.as_dict()) + +# Or score any retriever callable directly: +res = evaluate(cr.search, cases, level="file") +print(res.recall, res.mrr) +``` diff --git a/tests/test_eval.py b/tests/test_eval.py new file mode 100644 index 0000000..276d35f --- /dev/null +++ b/tests/test_eval.py @@ -0,0 +1,208 @@ +"""Tests for the code-retrieval eval harness (metrics, dataset, scoring). + +All offline/deterministic via the `fake` provider fixture. 
+""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + +from coderag.api import CodeRAG +from coderag.eval import ( + EvalCase, + build_from_git, + compare_modes, + evaluate, + load_dataset, + save_dataset, +) +from coderag.eval.harness import best_label, format_table +from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k +from tests.conftest import write + +# --- metrics --- + + +def test_recall_at_k_counts_fraction_found(): + ranked = ["a.py", "b.py", "c.py"] + assert recall_at_k(ranked, {"b.py", "c.py"}, 3) == 1.0 + assert recall_at_k(ranked, {"b.py", "c.py"}, 2) == 0.5 + assert recall_at_k(ranked, {"z.py"}, 3) == 0.0 + + +def test_hit_at_k_is_binary(): + ranked = ["a.py", "b.py"] + assert hit_at_k(ranked, {"b.py"}, 2) == 1.0 + assert hit_at_k(ranked, {"b.py"}, 1) == 0.0 + + +def test_mrr_uses_first_relevant_rank(): + assert mrr(["a", "b", "c"], {"b"}) == 0.5 + assert mrr(["a", "b", "c"], {"a"}) == 1.0 + assert mrr(["a", "b", "c"], {"z"}) == 0.0 + + +def test_ndcg_rewards_higher_ranks(): + high = ndcg_at_k(["rel", "x", "y"], {"rel"}, 3) + low = ndcg_at_k(["x", "y", "rel"], {"rel"}, 3) + assert high == 1.0 # single relevant at rank 1 is perfect + assert 0.0 < low < high + + +def test_metrics_dedupe_ranked_ids(): + # Duplicate file paths (multiple chunks per file) must not consume top-k slots: + # deduped to ["a.py", "b.py"], so both relevant files land within k=2. + ranked = ["a.py", "a.py", "b.py"] + assert recall_at_k(ranked, {"a.py", "b.py"}, 2) == 1.0 + # Without dedup the second "a.py" would have pushed "b.py" out of the top 2. 
+ assert recall_at_k(ranked, {"a.py", "b.py"}, 1) == 0.5 + + +def test_metrics_empty_relevant_is_zero(): + assert recall_at_k(["a"], set(), 1) == 0.0 + assert ndcg_at_k(["a"], set(), 1) == 0.0 + + +# --- dataset --- + + +def test_dataset_roundtrip(tmp_path: Path): + cases = [ + EvalCase( + "find auth", ["auth.py"], ["authenticate_user"], id="c1", source="git" + ), + EvalCase("find math", ["math_utils.py"]), + ] + path = tmp_path / "ds.jsonl" + save_dataset(cases, path) + loaded = load_dataset(path) + assert [c.query for c in loaded] == ["find auth", "find math"] + assert loaded[0].relevant_symbols == ["authenticate_user"] + assert loaded[1].relevant_symbols == [] + + +def test_load_dataset_skips_blank_lines(tmp_path: Path): + path = tmp_path / "ds.jsonl" + path.write_text( + '{"query": "q", "relevant_files": ["a.py"]}\n\n \n', encoding="utf-8" + ) + assert len(load_dataset(path)) == 1 + + +# --- harness: end-to-end scoring against a real (fake-embedded) index --- + + +def _indexed(config) -> CodeRAG: + config.watched_dir.mkdir(parents=True, exist_ok=True) + write( + config.watched_dir / "auth.py", + "def authenticate_user(token):\n" + " '''Validate a session token and return the user.'''\n" + " return verify(token)\n", + ) + write( + config.watched_dir / "math_utils.py", + "def add_numbers(a, b):\n return a + b\n", + ) + cr = CodeRAG(config) + cr.index() + return cr + + +def test_evaluate_perfect_retrieval_scores_one(config): + cr = _indexed(config) + cases = [EvalCase("add_numbers", ["math_utils.py"])] + res = evaluate(cr.search, cases, ks=(1, 3)) + assert res.n == 1 + assert res.recall[1] == 1.0 + assert res.mrr == 1.0 + assert res.ndcg[1] == 1.0 + + +def test_evaluate_skips_cases_without_ground_truth_at_level(config): + cr = _indexed(config) + # File-only ground truth -> nothing to score at the symbol level. 
+ cases = [EvalCase("add_numbers", ["math_utils.py"])] + res = evaluate(cr.search, cases, ks=(1,), level="symbol") + assert res.n == 0 + + +def test_evaluate_symbol_level(config): + cr = _indexed(config) + cases = [EvalCase("authenticate_user", ["auth.py"], ["authenticate_user"])] + res = evaluate(cr.search, cases, ks=(1, 3), level="symbol") + assert res.n == 1 + assert res.hit[3] == 1.0 + + +def test_compare_modes_returns_three_labels(config): + cr = _indexed(config) + cases = [ + EvalCase("add_numbers", ["math_utils.py"]), + EvalCase("authenticate session token", ["auth.py"]), + ] + results = compare_modes(cr, cases, ks=(1, 3)) + assert [r.label for r in results] == ["dense", "bm25", "hybrid"] + assert all(r.n == 2 for r in results) + + +def test_bm25_recalls_exact_identifier(config): + # Lexical retrieval should find an exact identifier even when dense recall is weak. + cr = _indexed(config) + cases = [EvalCase("add_numbers", ["math_utils.py"])] + results = compare_modes(cr, cases, ks=(1, 3)) + bm25 = next(r for r in results if r.label == "bm25") + assert bm25.hit[3] == 1.0 + + +def test_format_table_and_best_label(config): + cr = _indexed(config) + cases = [EvalCase("add_numbers", ["math_utils.py"])] + results = compare_modes(cr, cases, ks=(1, 3)) + table = format_table(results) + assert "mode" in table and "MRR" in table and "hybrid" in table + assert best_label(results, metric="ndcg", k=3) in {"dense", "bm25", "hybrid"} + + +# --- git dataset miner --- + + +def test_build_from_git_mines_changed_files(tmp_path: Path): + repo = tmp_path / "repo" + repo.mkdir() + + def git(*args: str) -> None: + subprocess.run(["git", "-C", str(repo), *args], check=True, capture_output=True) + + git("init", "-q") + git("config", "user.email", "t@example.com") + git("config", "user.name", "Tester") + git("config", "commit.gpgsign", "false") + write(repo / "auth.py", "def authenticate_user(token):\n return token\n") + git("add", "-A") + git("commit", "-q", "-m", "add user 
authentication helper") + + cases = build_from_git(repo, max_cases=10) + assert len(cases) == 1 + assert cases[0].query == "add user authentication helper" + assert cases[0].relevant_files == ["auth.py"] + assert cases[0].source == "git" + + +def test_build_from_git_skips_merges_and_short_subjects(tmp_path: Path): + repo = tmp_path / "repo" + repo.mkdir() + + def git(*args: str) -> None: + subprocess.run(["git", "-C", str(repo), *args], check=True, capture_output=True) + + git("init", "-q") + git("config", "user.email", "t@example.com") + git("config", "user.name", "Tester") + git("config", "commit.gpgsign", "false") + write(repo / "a.py", "x = 1\n") + git("add", "-A") + git("commit", "-q", "-m", "wip") # too short -> filtered out + + assert build_from_git(repo, max_cases=10, min_query_len=12) == [] From f3a7e0c7c417a911262c09522afdd428ea2db754 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 07:08:37 +0000 Subject: [PATCH 3/5] feat(eval): embedder benchmark, model registry, curated dataset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runs move #1 (the embedder experiment) on real local models via the eval harness, and records the honest result. - scripts/bench_embedders.py: reproducible model comparison — index a repo per model into an isolated store, score dense/bm25/hybrid via the harness. - coderag/embeddings/models.py + `coderag eval --list-models`: curated registry of local code-search embedders with size/accuracy notes. Note: fastembed does not ship CodeRankEmbed (needs custom ONNX export, follow-up); jina-embeddings-v2-base-code is the best out-of-the-box code-specific option. - coderag/eval/datasets/coderag_self.jsonl: 24 curated natural-language -> file cases for benchmarking CodeRAG on itself. 
Measured (this repo, 24 cases): hybrid > dense > BM25 for BOTH models (validates the fusion thesis), but the code-specific model did NOT clearly beat bge-small — the small repo saturates (bge already Hit@10=1.0), so the published CoIR gap does not transfer. Conclusion: keep bge-small default; model swaps need a larger/harder benchmark; rank-1 headroom points at the reranker (move #2). Documented in docs/eval.md and the strategy doc. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- coderag/embeddings/models.py | 90 ++++++++++++++++++++++++ coderag/eval/datasets/coderag_self.jsonl | 24 +++++++ coderag/surfaces/cli.py | 12 ++++ docs/eval.md | 39 ++++++++++ docs/research/code-retrieval-strategy.md | 9 +++ scripts/bench_embedders.py | 77 ++++++++++++++++++++ tests/test_models_registry.py | 25 +++++++ 7 files changed, 276 insertions(+) create mode 100644 coderag/embeddings/models.py create mode 100644 coderag/eval/datasets/coderag_self.jsonl create mode 100644 scripts/bench_embedders.py create mode 100644 tests/test_models_registry.py diff --git a/coderag/embeddings/models.py b/coderag/embeddings/models.py new file mode 100644 index 0000000..429d17a --- /dev/null +++ b/coderag/embeddings/models.py @@ -0,0 +1,90 @@ +"""Curated registry of local (fastembed/ONNX) embedding models for code search. + +These are the no-API-key models worth considering for CodeRAG, with short notes on the +accuracy/size trade-off. All are loadable via ``--model `` (provider ``fastembed``). +The numbers in the notes are external benchmark figures (see docs/research/) — run +``coderag eval`` to measure them on *your* codebase. + +Code-specific models (trained on code) generally beat general-purpose text embedders on +code retrieval, at the cost of a larger download. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Tuple + + +@dataclass(frozen=True) +class ModelInfo: + name: str # fastembed model id (pass to --model) + dim: int + size_gb: float + code_specific: bool + note: str + + +# Ordered best-first for code search among models fastembed can load locally. fastembed +# does not (yet) ship CodeRankEmbed/CodeSage; those need a custom ONNX export — tracked as +# a follow-up. jina-embeddings-v2-base-code is the strongest code-specific option available +# out of the box. +RECOMMENDED: Tuple[ModelInfo, ...] = ( + ModelInfo( + "jinaai/jina-embeddings-v2-base-code", + 768, + 0.64, + True, + "Code-specific, 8192-ctx, Apache-2.0. Best out-of-the-box local code retriever.", + ), + ModelInfo( + "BAAI/bge-base-en-v1.5", + 768, + 0.21, + False, + "General text. Stronger than bge-small; modest code retrieval.", + ), + ModelInfo( + "snowflake/snowflake-arctic-embed-m-long", + 768, + 0.54, + False, + "General, long-context (base model behind CodeRankEmbed).", + ), + ModelInfo( + "nomic-ai/nomic-embed-text-v1.5", + 768, + 0.52, + False, + "General, long-context, Matryoshka dims.", + ), + ModelInfo( + "BAAI/bge-small-en-v1.5", + 384, + 0.067, + False, + "Current default. 
Smallest/fastest; weakest on code (~45.8 CoIR).", + ), +) + + +def format_models() -> str: + """Human-readable table of recommended models for the CLI.""" + rows = [("model", "dim", "size", "code?", "note")] + rows += [ + ( + m.name, + str(m.dim), + f"{m.size_gb:g}GB", + "yes" if m.code_specific else "no", + m.note, + ) + for m in RECOMMENDED + ] + widths = [max(len(r[i]) for r in rows) for i in range(4)] + lines = [] + for i, r in enumerate(rows): + head = " ".join(r[j].ljust(widths[j]) for j in range(4)) + lines.append(f"{head} {r[4]}") + if i == 0: + lines.append(" ".join("-" * w for w in widths) + " " + "-" * len(r[4])) + return "\n".join(lines) diff --git a/coderag/eval/datasets/coderag_self.jsonl b/coderag/eval/datasets/coderag_self.jsonl new file mode 100644 index 0000000..4575746 --- /dev/null +++ b/coderag/eval/datasets/coderag_self.jsonl @@ -0,0 +1,24 @@ +{"query": "where are duplicate or stale vectors removed when a file changes", "relevant_files": ["coderag/indexer.py"], "source": "curated"} +{"query": "how is the FAISS index rebuilt from the SQLite source of truth", "relevant_files": ["coderag/store/vector_index.py"], "source": "curated"} +{"query": "where is reciprocal rank fusion implemented", "relevant_files": ["coderag/retrieval/fusion.py"], "source": "curated"} +{"query": "how are dense and lexical search results combined into one ranking", "relevant_files": ["coderag/retrieval/search.py"], "source": "curated"} +{"query": "how does the debounced filesystem watcher trigger reindexing", "relevant_files": ["coderag/watch.py"], "source": "curated"} +{"query": "where is symbol-aware chunking for Python using the ast module", "relevant_files": ["coderag/chunking/python_ast.py"], "source": "curated"} +{"query": "how are functions and classes chunked for Go and Rust via tree-sitter", "relevant_files": ["coderag/chunking/treesitter.py"], "source": "curated"} +{"query": "where is BM25 keyword search over SQLite FTS5 implemented", "relevant_files": 
["coderag/store/sqlite_store.py"], "source": "curated"} +{"query": "how does the HTTP API require an API key for authentication", "relevant_files": ["coderag/surfaces/http_api.py"], "source": "curated"} +{"query": "how is an LLM answer streamed over the retrieved code chunks", "relevant_files": ["coderag/llm.py"], "source": "curated"} +{"query": "where is the OpenAI-compatible embedding provider implemented", "relevant_files": ["coderag/embeddings/openai_provider.py"], "source": "curated"} +{"query": "how does configuration load from environment variables and a dotenv file", "relevant_files": ["coderag/config.py"], "source": "curated"} +{"query": "where is the command line search subcommand defined", "relevant_files": ["coderag/surfaces/cli.py"], "source": "curated"} +{"query": "how does the vector index switch from flat to IVF as the corpus grows", "relevant_files": ["coderag/store/vector_index.py"], "source": "curated"} +{"query": "where is content hashing used to skip unchanged files on reindex", "relevant_files": ["coderag/indexer.py"], "source": "curated"} +{"query": "how are file contents served safely for only indexed files", "relevant_files": ["coderag/api.py"], "source": "curated"} +{"query": "where does the web UI render results with syntax highlighting", "relevant_files": ["coderag/surfaces/webui.py"], "source": "curated"} +{"query": "how is an oversized function split into smaller line windows", "relevant_files": ["coderag/chunking/base.py"], "source": "curated"} +{"query": "where is the database table schema for chunks and files defined", "relevant_files": ["coderag/store/schema.py"], "source": "curated"} +{"query": "how does a model or embedding dimension change get detected and trigger a rebuild", "relevant_files": ["coderag/store/sqlite_store.py", "coderag/api.py"], "source": "curated"} +{"query": "where is the deterministic offline fake embedding provider for tests", "relevant_files": ["coderag/embeddings/fake_provider.py"], "source": "curated"} 
+{"query": "how are file extensions mapped to programming languages for chunking", "relevant_files": ["coderag/chunking/languages.py"], "source": "curated"} +{"query": "where is text split into lines without collapsing carriage returns", "relevant_files": ["coderag/_lines.py"], "source": "curated"} +{"query": "how is the incremental indexing done with parallel workers", "relevant_files": ["coderag/indexer.py"], "source": "curated"} diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py index 25ca920..ea7bb1f 100644 --- a/coderag/surfaces/cli.py +++ b/coderag/surfaces/cli.py @@ -95,6 +95,13 @@ def cmd_eval(args: argparse.Namespace) -> int: cfg = _build_config(args) + # `coderag eval --list-models` — show recommended local embedding models. + if args.list_models: + from coderag.embeddings.models import format_models + + print(format_models()) + return 0 + # `coderag eval build` — mine a dataset from the repo's git history. if args.build: cases = ev.build_from_git( @@ -275,6 +282,11 @@ def build_parser() -> argparse.ArgumentParser: ) p_eval.add_argument("--json", action="store_true", help="Emit JSON.") p_eval.add_argument("--quiet", action="store_true", help="Hide the progress bar.") + p_eval.add_argument( + "--list-models", + action="store_true", + help="List recommended local embedding models for code search and exit.", + ) _add_common(p_eval) p_eval.set_defaults(func=cmd_eval) diff --git a/docs/eval.md b/docs/eval.md index c1858e1..bd9772c 100644 --- a/docs/eval.md +++ b/docs/eval.md @@ -49,6 +49,45 @@ dataset). The usual `--watched-dir` / `--store-dir` / `--provider` / `--model` f > evaluating, e.g. `coderag eval --dataset … --compare --model BAAI/bge-small-en-v1.5` then > again with a candidate like CodeRankEmbed to measure the lift. 
+## Measured results (this repo) + +Move #1 experiment — current default vs a code-specific model — run with +`scripts/bench_embedders.py` on the curated dataset +(`coderag/eval/datasets/coderag_self.jsonl`, 24 natural-language → file cases, 90 files / +553 chunks): + +``` +mode n MRR R@1 R@5 R@10 nDCG@10 Hit@10 +bge-small-en-v1.5 · dense 24 0.784 0.604 0.938 1.000 0.831 1.000 +bge-small-en-v1.5 · bm25 24 0.751 0.604 0.854 1.000 0.802 1.000 +bge-small-en-v1.5 · hybrid 24 0.822 0.688 1.000 1.000 0.860 1.000 +jina-embeddings-v2-base-code · dense 24 0.759 0.583 0.938 0.979 0.810 1.000 +jina-embeddings-v2-base-code · bm25 24 0.751 0.604 0.854 1.000 0.802 1.000 +jina-embeddings-v2-base-code · hybrid 24 0.835 0.729 0.938 0.958 0.858 0.958 +``` + +Two findings, one expected and one cautionary: + +1. **Hybrid beats either modality alone on the rank-sensitive metrics, for both models** (bge hybrid MRR 0.822 > dense + 0.784 > bm25 0.751; jina hybrid 0.835 > dense 0.759 > bm25 0.751; R@1 and nDCG@10 order the same way). The one exception is jina's hybrid R@10/Hit@10 (0.958), which sits below both of its single-modality rows. This is the core + thesis — fusion is the differentiator vs pure-grep agents and single-modality embedding + tools. The identical BM25 rows across models are a sanity check that the harness isolates + the embedding variable correctly. +2. **The code-specific model did *not* clearly beat bge-small here.** jina-code's hybrid is + marginally ahead on MRR/R@1 but behind on R@5/R@10/Hit@10. The reason is saturation: on a + 90-file repo with lexical-rich NL queries, bge-small already hits Hit@10 = 1.0 and + R@5 ≈ 1.0 — there's no recall headroom for a better model to capture. The large published + CoIR gap (bge ~45.8 vs code models ~60) is measured on big, hard, cross-language corpora + and **does not transfer** to a small single-repo file-localization task. 
+ +**Takeaways:** (a) don't flip the default to a 10×-larger model on this evidence — keep +bge-small, offer code models as an option (`coderag eval --list-models`); (b) discriminating +embedders needs a **larger/harder benchmark** (a big external repo, or harder +cross-file/conceptual queries with less lexical leakage); (c) the remaining headroom is at +**rank 1** (R@1 ≈ 0.6–0.73), which is exactly what a cross-encoder reranker (strategy move +#2) targets. This is the harness doing its job: it stopped a plausible-sounding upgrade that +the data doesn't support. + ## Dataset format JSONL, one case per line: diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md index 3465e71..a794c69 100644 --- a/docs/research/code-retrieval-strategy.md +++ b/docs/research/code-retrieval-strategy.md @@ -53,6 +53,15 @@ needed files in nearly half of instances. [H] (arXiv 2310.06770) Better retrieva --- +> **Update (measured).** The eval harness (§0) is now built, and move #1 was tested on this +> repo: `bge-small` vs `jina-embeddings-v2-base-code` (fastembed does **not** ship +> CodeRankEmbed — it needs a custom ONNX export, tracked as follow-up). On a 24-case curated +> NL→file set the code-specific model did **not** clearly win — the small repo saturates +> (bge already at Hit@10 = 1.0), so the published CoIR gap didn't transfer. The validated +> win was **hybrid > dense > BM25 for both models**. See [docs/eval.md](../eval.md). Net: keep +> bge-small as default; a model swap needs a larger/harder benchmark to justify, and the +> rank-1 headroom points at the reranker (§2) as the better next bet. + ## 1. 
Upgrade the embedding model (highest single accuracy jump) **Current state:** CodeRAG defaults to `BAAI/bge-small-en-v1.5`, which scores only **~45.8 CoIR diff --git a/scripts/bench_embedders.py b/scripts/bench_embedders.py new file mode 100644 index 0000000..ad39d47 --- /dev/null +++ b/scripts/bench_embedders.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +"""Benchmark several local embedding models on a code-retrieval eval dataset. + +For each model this indexes the target repo into an isolated store, then scores +dense-only / BM25-only / hybrid retrieval with the eval harness, and prints one combined +table. BM25 is model-independent (a useful constant baseline across rows). + +Usage: + python scripts/bench_embedders.py --repo . --dataset coderag/eval/datasets/coderag_self.jsonl \ + --models BAAI/bge-small-en-v1.5,jinaai/jina-embeddings-v2-base-code + +This downloads each model once (no API key). Run it as the move #1 experiment: compare the +current default against a code-specific candidate and read the lift off the hybrid rows. 
+""" + +from __future__ import annotations + +import argparse +import tempfile +from pathlib import Path + +from coderag.api import CodeRAG +from coderag.config import Config +from coderag.eval import compare_modes, load_dataset +from coderag.eval.harness import EvalResult + + +def _label(model: str, mode: str) -> str: + short = model.split("/")[-1] + return f"{short} · {mode}" + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--repo", default=".", help="Codebase to index/evaluate.") + ap.add_argument("--dataset", required=True, help="JSONL eval dataset.") + ap.add_argument( + "--models", + default="BAAI/bge-small-en-v1.5,jinaai/jina-embeddings-v2-base-code", + help="Comma-separated fastembed model ids.", + ) + ap.add_argument("--ks", default="1,5,10", help="Comma-separated cutoffs.") + ap.add_argument("--level", choices=("file", "symbol"), default="file") + args = ap.parse_args() + + repo = Path(args.repo).resolve() + cases = load_dataset(args.dataset) + ks = tuple(int(k) for k in args.ks.split(",")) + models = [m.strip() for m in args.models.split(",") if m.strip()] + + rows: list[EvalResult] = [] + for model in models: + print(f"\n=== {model} === (indexing {repo.name} ...)") + with tempfile.TemporaryDirectory(prefix="coderag-bench-") as store: + cfg = Config.from_env( + provider="fastembed", + model=model, + watched_dir=repo, + store_dir=Path(store), + ) + cr = CodeRAG(cfg) + stats = cr.index() + print(f" {stats.total_files} files / {stats.total_chunks} chunks") + for r in compare_modes(cr, cases, ks=ks, level=args.level): + r.label = _label(model, r.label) + rows.append(r) + cr.close() + + from coderag.eval.harness import format_table + + print(f"\n{len(cases)} case(s), level={args.level}\n") + print(format_table(rows)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_models_registry.py b/tests/test_models_registry.py new file mode 100644 index 0000000..5fff29b --- 
/dev/null +++ b/tests/test_models_registry.py @@ -0,0 +1,25 @@ +"""Tests for the recommended-embedding-model registry (offline, no downloads).""" + +from __future__ import annotations + +from coderag.embeddings.models import RECOMMENDED, format_models + + +def test_registry_is_nonempty_and_well_formed(): + assert RECOMMENDED + for m in RECOMMENDED: + assert m.name and "/" in m.name # looks like a HF model id + assert m.dim > 0 + assert m.size_gb > 0 + assert m.note + + +def test_default_model_is_listed(): + # The current default must appear so users can see its trade-off. + assert any(m.name == "BAAI/bge-small-en-v1.5" for m in RECOMMENDED) + + +def test_format_models_renders_table(): + out = format_models() + assert "model" in out and "code?" in out + assert "jina-embeddings-v2-base-code" in out From b15526b827ed138736dac5b2351e7c557ab020d9 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 08:06:25 +0000 Subject: [PATCH 4/5] feat(retrieval): optional local cross-encoder reranker (move #2) Two-stage retrieve-then-rerank: first-stage hybrid (dense+BM25+RRF) for recall, then a local ONNX cross-encoder re-scores the top candidates jointly with the query for top-of-list precision. Opt-in (config.rerank, default off) so the zero-config engine stays tiny/fast; uses fastembed's TextCrossEncoder (default Xenova/ms-marco-MiniLM-L-12-v2) so it needs no API key and no new dependency. - coderag/retrieval/rerank.py: Reranker protocol + CrossEncoderReranker + get_reranker() factory (mirrors the embeddings provider pattern). - HybridSearcher: deeper candidate pool when reranking, re-score, reorder, trim to top_k; reranker injected by the facade from config. - config: rerank / rerank_model / rerank_candidates (+ CODERAG_RERANK* env). - status() reports rerank state; eval compare_modes adds a hybrid+rerank row; `coderag eval --rerank` and `bench_embedders.py --rerank`. - Tests via a deterministic fake reranker (offline). 
Measured (this repo, 24 cases): the generic ms-marco reranker gave no lift / a marginal regression. The benchmark is saturated (hybrid already R@5~1.0) and ms-marco is web-trained, not code. Documented in docs/eval.md: the critical path is now a larger/harder benchmark, after which a code-aware reranker should be re-tested. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- coderag/api.py | 9 ++- coderag/config.py | 12 +++ coderag/eval/harness.py | 12 +++ coderag/retrieval/rerank.py | 79 +++++++++++++++++++ coderag/retrieval/search.py | 34 +++++++- coderag/surfaces/cli.py | 22 +++++- docs/eval.md | 40 ++++++++++ docs/research/code-retrieval-strategy.md | 9 +++ scripts/bench_embedders.py | 14 +++- tests/test_rerank.py | 99 ++++++++++++++++++++++++ 10 files changed, 320 insertions(+), 10 deletions(-) create mode 100644 coderag/retrieval/rerank.py create mode 100644 tests/test_rerank.py diff --git a/coderag/api.py b/coderag/api.py index c2eb35c..aaf6651 100644 --- a/coderag/api.py +++ b/coderag/api.py @@ -93,10 +93,15 @@ def indexer(self) -> "Indexer": @property def searcher(self) -> "HybridSearcher": if self._searcher is None: + from coderag.retrieval.rerank import get_reranker from coderag.retrieval.search import HybridSearcher self._searcher = HybridSearcher( - self.config, self.provider, self.store, self.vectors + self.config, + self.provider, + self.store, + self.vectors, + reranker=get_reranker(self.config), ) return self._searcher @@ -177,6 +182,8 @@ def status(self) -> dict: ), "llm_base_url": self.config.openai_base_url or "", "index_type": self.vectors.kind, + "rerank": self.config.rerank, + "rerank_model": self.config.rerank_model if self.config.rerank else "", "store_dir": str(self.config.store_dir), "watched_dir": str(self.config.watched_dir), "total_files": stats.total_files, diff --git a/coderag/config.py b/coderag/config.py index 6207b5c..ae8f411 100644 --- a/coderag/config.py +++ 
b/coderag/config.py @@ -135,6 +135,13 @@ class Config: dense_weight: float = 1.0 lexical_weight: float = 1.0 + # --- Reranking (optional two-stage retrieve-then-rerank) --- + # Off by default so the zero-config engine stays tiny/fast. When on, the top + # ``rerank_candidates`` fused hits are re-scored by a local cross-encoder and reordered. + rerank: bool = False + rerank_model: str = "Xenova/ms-marco-MiniLM-L-12-v2" # local ONNX cross-encoder + rerank_candidates: int = 50 # fused hits to rerank before trimming to top_k + # --- Indexing throughput --- embed_batch_size: int = 64 index_workers: int = 4 @@ -202,6 +209,11 @@ def from_env(cls, **overrides: object) -> "Config": rrf_k=_env_int("CODERAG_RRF_K", cls.rrf_k), dense_weight=_env_float("CODERAG_DENSE_WEIGHT", cls.dense_weight), lexical_weight=_env_float("CODERAG_LEXICAL_WEIGHT", cls.lexical_weight), + rerank=_env_bool("CODERAG_RERANK", cls.rerank), + rerank_model=_env_str("CODERAG_RERANK_MODEL", cls.rerank_model), + rerank_candidates=_env_int( + "CODERAG_RERANK_CANDIDATES", cls.rerank_candidates + ), embed_batch_size=_env_int("CODERAG_EMBED_BATCH", cls.embed_batch_size), index_workers=_env_int("CODERAG_WORKERS", cls.index_workers), llm_provider=_env_str("CODERAG_LLM_PROVIDER", cls.llm_provider), diff --git a/coderag/eval/harness.py b/coderag/eval/harness.py index aa112b5..19e1b38 100644 --- a/coderag/eval/harness.py +++ b/coderag/eval/harness.py @@ -16,6 +16,7 @@ if TYPE_CHECKING: from coderag.api import CodeRAG + from coderag.retrieval.rerank import Reranker # A retriever: given a query and a result count, return ranked hits (best-first). SearchFn = Callable[[str, int], List[SearchHit]] @@ -119,11 +120,14 @@ def compare_modes( ks: Sequence[int] = DEFAULT_KS, level: str = "file", modes: Sequence[Tuple[str, float, float]] = DEFAULT_MODES, + reranker: Optional["Reranker"] = None, ) -> List[EvalResult]: """Score dense-only vs BM25-only vs hybrid on the already-built index of ``cr``. 
The index is mode-independent — the dense/lexical weights only affect query-time RRF fusion — so we reuse one provider/store/vector index and just swap the fusion weights. + When ``reranker`` is given, an extra ``hybrid+rerank`` row is appended so the lift from + two-stage reranking is directly comparable on the same index. """ from coderag.retrieval.search import HybridSearcher @@ -134,6 +138,14 @@ def compare_modes( results.append( evaluate(searcher.search, cases, label=label, ks=ks, level=level) ) + if reranker is not None: + cfg = cr.config.with_overrides(dense_weight=1.0, lexical_weight=1.0) + searcher = HybridSearcher( + cfg, cr.provider, cr.store, cr.vectors, reranker=reranker + ) + results.append( + evaluate(searcher.search, cases, label="hybrid+rerank", ks=ks, level=level) + ) return results diff --git a/coderag/retrieval/rerank.py b/coderag/retrieval/rerank.py new file mode 100644 index 0000000..8465cbb --- /dev/null +++ b/coderag/retrieval/rerank.py @@ -0,0 +1,79 @@ +"""Optional second-stage reranking for two-stage retrieve-then-rerank search. + +First-stage hybrid retrieval (dense + BM25 + RRF) is tuned for *recall* — get the right +chunks into a candidate pool cheaply. A cross-encoder reranker then scores each candidate +*jointly* with the query (not via independent embeddings), which is far more precise at the +top of the list. The research finds this is the single highest-ROI accuracy add-on for a +local engine: +5–15 nDCG/MRR for ~30 ms/query on CPU with a small ONNX model. + +It's **opt-in** (``config.rerank``) so the zero-config default stays tiny and fast. The +default model — ``Xenova/ms-marco-MiniLM-L-12-v2`` (~0.12 GB ONNX) — runs locally via +fastembed's ``TextCrossEncoder``, so enabling it needs no API key and no new dependency. 
+""" + +from __future__ import annotations + +import logging +from functools import cached_property +from pathlib import Path +from typing import Any, List, Optional, Protocol, Sequence, runtime_checkable + +from coderag.config import Config + +logger = logging.getLogger(__name__) + +DEFAULT_RERANK_MODEL = "Xenova/ms-marco-MiniLM-L-12-v2" + + +@runtime_checkable +class Reranker(Protocol): + """Scores how well each document answers the query (higher = more relevant).""" + + @property + def model_id(self) -> str: ... + + def rerank(self, query: str, documents: Sequence[str]) -> List[float]: + """Return one relevance score per document, aligned to input order.""" + + +class CrossEncoderReranker: + """Local cross-encoder reranker backed by fastembed's ``TextCrossEncoder`` (ONNX).""" + + name = "cross-encoder" + + def __init__( + self, model: str = DEFAULT_RERANK_MODEL, cache_dir: Optional[Path] = None + ) -> None: + self._model_name = model + self._cache_dir = str(cache_dir) if cache_dir else None + + @cached_property + def _encoder(self) -> Any: + from fastembed.rerank.cross_encoder import TextCrossEncoder + + logger.info("Loading reranker %s ...", self._model_name) + return TextCrossEncoder(self._model_name, cache_dir=self._cache_dir) + + @property + def model_id(self) -> str: + return self._model_name + + def rerank(self, query: str, documents: Sequence[str]) -> List[float]: + if not documents: + return [] + return [float(s) for s in self._encoder.rerank(query, list(documents))] + + +def get_reranker(config: Config) -> Optional[Reranker]: + """Build the reranker if ``config.rerank`` is on, else ``None`` (reranking disabled).""" + if not config.rerank: + return None + return CrossEncoderReranker(config.rerank_model, cache_dir=config.cache_dir) + + +__all__ = [ + "CrossEncoderReranker", + "DEFAULT_RERANK_MODEL", + "Reranker", + "get_reranker", +] diff --git a/coderag/retrieval/search.py b/coderag/retrieval/search.py index 6e8de53..ea6482a 100644 --- 
a/coderag/retrieval/search.py +++ b/coderag/retrieval/search.py @@ -3,7 +3,7 @@ from __future__ import annotations import logging -from typing import Dict, List +from typing import TYPE_CHECKING, Dict, List, Optional from coderag.config import Config from coderag.embeddings import EmbeddingProvider @@ -12,6 +12,9 @@ from coderag.store.vector_index import FaissVectorIndex from coderag.types import SearchHit +if TYPE_CHECKING: + from coderag.retrieval.rerank import Reranker + logger = logging.getLogger(__name__) @@ -22,17 +25,23 @@ def __init__( provider: EmbeddingProvider, store: SQLiteStore, vectors: FaissVectorIndex, + reranker: Optional["Reranker"] = None, ) -> None: self.config = config self.provider = provider self.store = store self.vectors = vectors + self.reranker = reranker def search(self, query: str, top_k: int) -> List[SearchHit]: if not query or not query.strip(): return [] - fetch_k = max(self.config.fetch_k, top_k) + # When reranking, pull a deeper candidate pool to rerank, then trim to top_k. + pool = top_k + if self.reranker is not None: + pool = max(self.config.rerank_candidates, top_k) + fetch_k = max(self.config.fetch_k, pool) # Dense retrieval. qvec = self.provider.embed_query(query) @@ -46,12 +55,12 @@ def search(self, query: str, top_k: int) -> List[SearchHit]: # Lexical retrieval (BM25 over FTS5). lexical_ranked = [cid for cid, _ in self.store.fts_search(query, fetch_k)] - # Fuse and trim. + # Fuse, then trim to the candidate pool (top_k, or deeper when reranking). 
fused = reciprocal_rank_fusion( [dense_ranked, lexical_ranked], k=self.config.rrf_k, weights=[self.config.dense_weight, self.config.lexical_weight], - )[:top_k] + )[:pool] if not fused: return [] @@ -77,4 +86,21 @@ def search(self, query: str, top_k: int) -> List[SearchHit]: similarity=similarity.get(cid, 0.0), ) ) + + if self.reranker is not None: + hits = self._rerank(query, hits) + return hits[:top_k] + + def _rerank(self, query: str, hits: List[SearchHit]) -> List[SearchHit]: + """Re-score candidates jointly with the query and sort by the new score. + + The cross-encoder score replaces ``score`` (the relative ranking signal) so order + and score agree; ``similarity`` keeps the dense cosine for display. + """ + if not hits or self.reranker is None: + return hits + scores = self.reranker.rerank(query, [h.text for h in hits]) + for hit, s in zip(hits, scores, strict=False): + hit.score = float(s) + hits.sort(key=lambda h: h.score, reverse=True) return hits diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py index ea7bb1f..36323ec 100644 --- a/coderag/surfaces/cli.py +++ b/coderag/surfaces/cli.py @@ -123,15 +123,24 @@ def cmd_eval(args: argparse.Namespace) -> int: return 1 ks = tuple(int(k) for k in args.ks.split(",")) + # --rerank forces the optional two-stage cross-encoder on for this run. 
+ if args.rerank: + cfg = cfg.with_overrides(rerank=True) cr = CodeRAG(cfg) cr.index() # ensure the index is built / up to date before scoring if args.compare: - results = ev.compare_modes(cr, cases, ks=ks, level=args.level) + reranker = None + if args.rerank: + from coderag.retrieval.rerank import get_reranker + + reranker = get_reranker(cfg) + results = ev.compare_modes( + cr, cases, ks=ks, level=args.level, reranker=reranker + ) else: - results = [ - ev.evaluate(cr.search, cases, label="hybrid", ks=ks, level=args.level) - ] + label = "hybrid+rerank" if args.rerank else "hybrid" + results = [ev.evaluate(cr.search, cases, label=label, ks=ks, level=args.level)] if args.json: print(json.dumps([r.as_dict() for r in results], indent=2)) @@ -282,6 +291,11 @@ def build_parser() -> argparse.ArgumentParser: ) p_eval.add_argument("--json", action="store_true", help="Emit JSON.") p_eval.add_argument("--quiet", action="store_true", help="Hide the progress bar.") + p_eval.add_argument( + "--rerank", + action="store_true", + help="Enable the local cross-encoder reranker (two-stage retrieve-then-rerank).", + ) p_eval.add_argument( "--list-models", action="store_true", diff --git a/docs/eval.md b/docs/eval.md index bd9772c..db53254 100644 --- a/docs/eval.md +++ b/docs/eval.md @@ -30,8 +30,15 @@ coderag eval --dataset coderag-eval.jsonl # 3. Contrast dense-only vs BM25-only vs hybrid on one index: coderag eval --dataset coderag-eval.jsonl --compare + +# 4. Add the optional two-stage cross-encoder reranker (adds a hybrid+rerank row): +coderag eval --dataset coderag-eval.jsonl --compare --rerank ``` +Reranking is opt-in at search time too: set `CODERAG_RERANK=1` (model via +`CODERAG_RERANK_MODEL`, pool depth via `CODERAG_RERANK_CANDIDATES`) and every `coderag +search` / API / UI query runs two-stage retrieve-then-rerank. 
+ ``` mode n MRR R@1 R@5 R@10 nDCG@1 nDCG@5 nDCG@10 Hit@1 Hit@5 Hit@10 ------ -- ----- ----- ----- ----- ------ ------ ------- ----- ----- ------ @@ -88,6 +95,39 @@ cross-file/conceptual queries with less lexical leakage); (c) the remaining head #2) targets. This is the harness doing its job: it stopped a plausible-sounding upgrade that the data doesn't support. +### Reranker experiment (move #2) + +Adding the optional cross-encoder reranker (`--rerank`, default +`Xenova/ms-marco-MiniLM-L-12-v2`) on the same 24-case dataset: + +``` +mode MRR R@1 R@5 R@10 nDCG@10 Hit@10 +bge-small-en-v1.5 · dense 0.805 0.646 0.938 1.000 0.845 1.000 +bge-small-en-v1.5 · bm25 0.747 0.604 0.812 1.000 0.798 1.000 +bge-small-en-v1.5 · hybrid 0.801 0.646 1.000 1.000 0.845 1.000 +bge-small-en-v1.5 · hybrid+rerank 0.790 0.646 0.958 1.000 0.836 1.000 +``` + +**The reranker did not help here — it marginally hurt** (hybrid+rerank MRR 0.790 < hybrid +0.801; R@5 0.958 < 1.000). Same lesson as move #1, plus a model-fit issue: + +1. **Saturation, again.** Hybrid already gets R@5 = 1.0 / Hit@10 = 1.0 and the headroom is + only at rank 1 (R@1 = 0.646). A reranker reorders *within* the candidate pool, so on + file-level metrics where the right files are already in the pool, it can only shuffle — + and any mistake shows up as a small regression. +2. **Model fit.** `ms-marco-MiniLM` is trained on web-passage relevance, not code. The + research explicitly flagged that small-cross-encoder *code* reranking lift is inferred, + not measured — this run is consistent with that caveat. A code-aware reranker + (`CODERAG_RERANK_MODEL=jinaai/jina-reranker-v2-base-multilingual` or + `BAAI/bge-reranker-base`) is worth trying, but those are larger. + +**Conclusion across moves #1 and #2:** the recurring blocker is that *this repo's benchmark +is too small and saturated to discriminate any retrieval improvement*. 
The feature is built, +tested, and opt-in, but **proving its value requires a larger, harder, non-saturated +benchmark** (a 1k+-file external repo and/or symbol-level + cross-file conceptual queries). +That is the true critical path for the "win the eval" objective — accuracy techniques can't +be validated until the benchmark has headroom. + ## Dataset format JSONL, one case per line: diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md index a794c69..2c0d6f8 100644 --- a/docs/research/code-retrieval-strategy.md +++ b/docs/research/code-retrieval-strategy.md @@ -90,6 +90,15 @@ under OpenRAIL++-M, 68.53 CoIR). [H] Voyage-code-3 is API-only — reference poi --- +> **Update (measured & built).** The optional two-stage reranker is implemented +> (`config.rerank`, `coderag/retrieval/rerank.py`, fastembed `TextCrossEncoder`, zero new +> deps) and tested. On this repo's saturated 24-case set it gave **no lift / a marginal +> regression** with the generic `ms-marco-MiniLM` model — consistent with the caveat below +> that small-cross-encoder *code* lift is inferred, not measured, and with the benchmark +> having no headroom (hybrid already R@5≈1.0). See [docs/eval.md](../eval.md). The blocker is +> now clearly the **benchmark**, not the technique: it must get bigger/harder before #1 or #2 +> can show their value. A code-aware reranker should be re-tested there. + ## 2. 
Add a local cross-encoder reranker (highest-ROI bolt-on) The evidence converges: **a small ONNX cross-encoder reranking the top-100 down to top-8 is the diff --git a/scripts/bench_embedders.py b/scripts/bench_embedders.py index ad39d47..5b66c88 100644 --- a/scripts/bench_embedders.py +++ b/scripts/bench_embedders.py @@ -41,6 +41,11 @@ def main() -> int: ) ap.add_argument("--ks", default="1,5,10", help="Comma-separated cutoffs.") ap.add_argument("--level", choices=("file", "symbol"), default="file") + ap.add_argument( + "--rerank", + action="store_true", + help="Also score a hybrid+rerank row per model (local cross-encoder).", + ) args = ap.parse_args() repo = Path(args.repo).resolve() @@ -61,7 +66,14 @@ def main() -> int: cr = CodeRAG(cfg) stats = cr.index() print(f" {stats.total_files} files / {stats.total_chunks} chunks") - for r in compare_modes(cr, cases, ks=ks, level=args.level): + reranker = None + if args.rerank: + from coderag.retrieval.rerank import get_reranker + + reranker = get_reranker(cfg.with_overrides(rerank=True)) + for r in compare_modes( + cr, cases, ks=ks, level=args.level, reranker=reranker + ): r.label = _label(model, r.label) rows.append(r) cr.close() diff --git a/tests/test_rerank.py b/tests/test_rerank.py new file mode 100644 index 0000000..a2da22b --- /dev/null +++ b/tests/test_rerank.py @@ -0,0 +1,99 @@ +"""Tests for two-stage retrieve-then-rerank (offline via a fake reranker). + +These never load the real cross-encoder; they verify the searcher's two-stage wiring: +deeper candidate pool, re-scoring, reordering, and trimming to top_k. 
+""" + +from __future__ import annotations + +from typing import List, Sequence + +from coderag.api import CodeRAG +from coderag.config import Config +from coderag.eval import EvalCase, compare_modes +from coderag.retrieval.rerank import get_reranker +from coderag.retrieval.search import HybridSearcher +from tests.conftest import write + + +class KeywordReranker: + """Deterministic fake reranker: score = count of query words present in the doc.""" + + model_id = "fake-reranker" + + def rerank(self, query: str, documents: Sequence[str]) -> List[float]: + terms = query.lower().split() + return [float(sum(t in doc.lower() for t in terms)) for doc in documents] + + +def _indexed(config: Config) -> CodeRAG: + config.watched_dir.mkdir(parents=True, exist_ok=True) + write( + config.watched_dir / "auth.py", + "def authenticate_user(token):\n" + " '''Validate a session token and return the user.'''\n" + " return verify(token)\n", + ) + write( + config.watched_dir / "math_utils.py", + "def add_numbers(a, b):\n return a + b\n", + ) + cr = CodeRAG(config) + cr.index() + return cr + + +def test_get_reranker_off_by_default(config): + assert get_reranker(config) is None + + +def test_get_reranker_built_when_enabled(config): + r = get_reranker(config.with_overrides(rerank=True)) + assert r is not None + assert r.model_id # default model id present + + +def test_reranker_reorders_and_sets_score(config): + cr = _indexed(config) + searcher = HybridSearcher( + cr.config, cr.provider, cr.store, cr.vectors, reranker=KeywordReranker() + ) + hits = searcher.search("validate session token", top_k=2) + assert hits + # The auth chunk contains all three query words -> must rank first after rerank. + assert hits[0].path == "auth.py" + # Score is replaced by the cross-encoder score (here, the keyword overlap count). 
+ assert hits[0].score >= hits[-1].score + + +def test_rerank_trims_to_top_k(config): + cr = _indexed(config) + searcher = HybridSearcher( + cr.config, cr.provider, cr.store, cr.vectors, reranker=KeywordReranker() + ) + assert len(searcher.search("token", top_k=1)) == 1 + + +def test_reranker_empty_query(config): + cr = _indexed(config) + searcher = HybridSearcher( + cr.config, cr.provider, cr.store, cr.vectors, reranker=KeywordReranker() + ) + assert searcher.search(" ", top_k=3) == [] + + +def test_compare_modes_adds_rerank_row(config): + cr = _indexed(config) + cases = [EvalCase("validate session token", ["auth.py"])] + results = compare_modes(cr, cases, ks=(1, 3), reranker=KeywordReranker()) + labels = [r.label for r in results] + assert labels == ["dense", "bm25", "hybrid", "hybrid+rerank"] + rerank_res = results[-1] + assert rerank_res.hit[1] == 1.0 # keyword reranker nails the auth file at rank 1 + + +def test_status_reports_rerank(config): + cr = _indexed(config.with_overrides(rerank=True)) + status = cr.status() + assert status["rerank"] is True + assert status["rerank_model"] From a15f9f8043877380c9e92f5e18fc2d57eec3a06f Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 08:12:07 +0000 Subject: [PATCH 5/5] refactor(eval): dedupe language->extension mapping; document eval in README - Add chunking.languages.extensions_for() as the single canonical reverse lookup, and use it from the CLI and the git dataset miner instead of two separate hardcoded copies of the language->suffix table. - Surface `coderag eval` in the README CLI list and link the eval + strategy docs. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- README.md | 7 +++++++ coderag/chunking/languages.py | 8 +++++++- coderag/eval/dataset.py | 10 ++++++---- coderag/surfaces/cli.py | 18 +++--------------- tests/test_eval.py | 10 ++++++++++ 5 files changed, 33 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 5298c06..8fb628d 100644 --- a/README.md +++ b/README.md @@ -79,8 +79,15 @@ coderag watch # index, then keep it live as files change coderag serve --port 8000 # run the HTTP API (needs [server]) coderag ui # launch the web UI (needs [ui]) coderag status # index stats (files, chunks, model, index type) +coderag eval --dataset d.jsonl --compare # retrieval quality: dense vs BM25 vs hybrid ``` +> **Measuring retrieval quality.** `coderag eval` is a built-in harness for "did we surface +> the right file/symbol?" — recall@k, MRR, nDCG@k at file or symbol level, with a git-history +> dataset miner (`--build`), a dense/BM25/hybrid comparison (`--compare`), and an optional +> cross-encoder rerank stage (`--rerank`). See [`docs/eval.md`](docs/eval.md) and the strategy +> writeup in [`docs/research/code-retrieval-strategy.md`](docs/research/code-retrieval-strategy.md). + ### Python library ```python diff --git a/coderag/chunking/languages.py b/coderag/chunking/languages.py index b5a4c66..c2cc5a4 100644 --- a/coderag/chunking/languages.py +++ b/coderag/chunking/languages.py @@ -3,7 +3,7 @@ from __future__ import annotations from pathlib import Path -from typing import Optional +from typing import Iterable, List, Optional # Languages for which we extract symbol-aware spans (function/class/method). # Python uses the stdlib ``ast``; the rest use tree-sitter. 
@@ -55,3 +55,9 @@ def detect_language(path: str | Path) -> Optional[str]: """Return the language for ``path``, or ``None`` if it should not be indexed.""" return EXTENSION_TO_LANGUAGE.get(Path(path).suffix.lower()) + + +def extensions_for(languages: Iterable[str]) -> List[str]: + """File extensions that map to any of ``languages`` (the canonical reverse lookup).""" + wanted = set(languages) + return sorted(ext for ext, lang in EXTENSION_TO_LANGUAGE.items() if lang in wanted) diff --git a/coderag/eval/dataset.py b/coderag/eval/dataset.py index c38a547..86b85c1 100644 --- a/coderag/eval/dataset.py +++ b/coderag/eval/dataset.py @@ -113,10 +113,12 @@ def build_from_git( query (``min_query_len``). """ repo = Path(repo) - exts = { - e if e.startswith(".") else f".{e}" - for e in (extensions or (".py", ".js", ".ts", ".tsx", ".go", ".rs", ".java")) - } + if extensions is None: + from coderag.chunking.languages import extensions_for + from coderag.config import DEFAULT_LANGUAGES + + extensions = extensions_for(DEFAULT_LANGUAGES) + exts = {e if e.startswith(".") else f".{e}" for e in extensions} fmt = f"{_REC}%H{_FLD}%s{_FLD}%an" raw = _git( diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py index 36323ec..f5e9a01 100644 --- a/coderag/surfaces/cli.py +++ b/coderag/surfaces/cli.py @@ -104,10 +104,12 @@ def cmd_eval(args: argparse.Namespace) -> int: # `coderag eval build` — mine a dataset from the repo's git history. 
if args.build: + from coderag.chunking.languages import extensions_for + cases = ev.build_from_git( cfg.watched_dir, max_cases=args.max_cases, - extensions=[e.lstrip(".") for e in _suffixes(cfg.languages)], + extensions=extensions_for(cfg.languages), ) out = args.dataset or "coderag-eval.jsonl" ev.save_dataset(cases, out) @@ -152,20 +154,6 @@ def cmd_eval(args: argparse.Namespace) -> int: return 0 -def _suffixes(languages: tuple) -> list: - """Map configured language names to file suffixes for dataset mining.""" - table = { - "python": ".py", - "javascript": ".js", - "typescript": ".ts", - "tsx": ".tsx", - "go": ".go", - "rust": ".rs", - "java": ".java", - } - return [table[lang] for lang in languages if lang in table] - - def cmd_watch(args: argparse.Namespace) -> int: from coderag.watch import watch diff --git a/tests/test_eval.py b/tests/test_eval.py index 276d35f..baa6d26 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -190,6 +190,16 @@ def git(*args: str) -> None: assert cases[0].source == "git" +def test_extensions_for_uses_canonical_map(): + from coderag.chunking.languages import extensions_for + + exts = extensions_for(("python", "go")) + assert ".py" in exts and ".go" in exts + assert ".rs" not in exts # rust not requested + # Unknown language names contribute nothing rather than raising. + assert extensions_for(("nonsense",)) == [] + + def test_build_from_git_skips_merges_and_short_subjects(tmp_path: Path): repo = tmp_path / "repo" repo.mkdir()