From 77ecc4e76218db6b739a339a89fa073c37a3f5f7 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 06:25:44 +0000
Subject: [PATCH 1/5] docs: add local-first code-retrieval strategy research

Synthesizes multi-source research on making CodeRAG win a code-retrieval
eval harness against agentic-grep loops (Claude Code, Codex) and commercial
semantic search (Cursor, Cody, Augment), under a local/zero-key constraint.

Key findings and prioritized plan: build a SweRank/Agentless-style eval
harness first; swap the default embedder (bge-small ~45.8 CoIR ->
CodeRankEmbed ~60.1); add a local ONNX cross-encoder reranker; route/tune
hybrid fusion; then structure-aware graph expansion. Includes cited
accuracy-vs-cost tradeoffs and the honest grep-vs-embeddings debate.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 docs/research/code-retrieval-strategy.md | 242 +++++++++++++++++++++++
 1 file changed, 242 insertions(+)
 create mode 100644 docs/research/code-retrieval-strategy.md

diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md
new file mode 100644
index 0000000..3465e71
--- /dev/null
+++ b/docs/research/code-retrieval-strategy.md
@@ -0,0 +1,242 @@
+# Winning the Code-Retrieval Eval: A Strategy for CodeRAG
+
+> Research synthesis — how to make CodeRAG more **accurate**, **efficient**, and **faster** at
+> code retrieval than (a) agentic grep loops (Claude Code, Codex) and (b) commercial semantic
+> code search (Cursor, Sourcegraph/Cody, Augment), under a hard constraint: **everything must run
+> locally with no API key / no paid LLM calls.**
+>
+> Confidence levels: **[H]** well-sourced primary, **[M]** secondary/needs re-verification,
+> **[?]** contested / sources disagree.
+
+---
+
+## TL;DR — the plan, in priority order
+
+| # | Move | Expected lift | Cost | Stays local? |
+|---|------|---------------|------|--------------|
+| 0 | **Build the eval harness first** (SweRank/Agentless protocol on real repos) | — (makes claims provable) | Low | ✅ |
+| 1 | **Swap default embedder** `bge-small` → **CodeRankEmbed (137M)** | ~+14 CoIR NDCG@10 (45.8 → 60.1) | One model swap, still ONNX/CPU | ✅ |
+| 2 | **Add a two-stage cross-encoder reranker** (retrieve top-100 → rerank → keep top-8) | +5 to +15 nDCG/MRR | ~30 ms/query CPU (ONNX) | ✅ |
+| 3 | **Tune & route hybrid fusion** (BM25 for identifiers/code↔code, dense for NL; sweep RRF) | a few nDCG points; big latency win | Low | ✅ |
+| 4 | **Structure-aware retrieval** (tree-sitter call/import graph + PageRank expansion) | ~+2 pts localization | Medium build | ✅ |
+| 5 | **Query expansion / HyDE** — *gated to NL queries only* | mixed; can **hurt** private identifiers | +25–60% latency | ✅ (small local LLM) |
+
+Moves #1 and #2 alone plausibly take CodeRAG from a bottom-quartile CoIR retriever to one
+competitive with 7B proprietary models — without leaving the local/zero-key envelope.
+
+---
+
+## 0. Build the eval harness first (priority 0)
+
+You cannot claim "more accurate" without a number, and a reproducible number is what makes the
+claim defensible. Copy the **SweRank / Agentless localization protocol** [H]:
+
+- **Queries:** merged PRs / closed issues → use issue title+body (or commit message) as the query.
+- **Ground truth:** the files (and, for function-level, the functions) changed by the fixing
+  PR/commit diff. This is exactly how SweRank built its 67,341-pair "SweLoc" corpus from 3,387
+  repos. [M] (arXiv 2505.07849)
+- **Filtering:** drop docs-only/trivial PRs; apply a consistency filter (SweRank used K=20).
+- **Metrics:** report **file & function recall@{1,5,10} + MRR** (mirror Agentless/SweRank so your
+  numbers are directly comparable to published baselines). Use **nDCG@10** for multi-file PRs. [H]
+- **Baselines to beat:** BM25 (the SWE-bench baseline), a stock dense embedder (bge/gte/OpenAI),
+  and an Agentless-style LLM localizer. [H]
+- **External sanity checks:** run the retriever on **CoIR (NDCG@10)** and **CodeSearchNet (MRR)**;
+  both ship as pip-installable, BEIR/MTEB-schema frameworks. [H] (arXiv 2407.02883, 1909.09436)
+
+**Bars worth targeting:** SweRank reports ~96% file Acc@5 and ~88.7% function Acc@10 on
+SWE-bench-Lite; LocAgent ~92.7% file accuracy. [M] These are end-to-end localization systems
+(retriever + reranker + sometimes an LLM), so they're aspirational ceilings, not retriever-only.
+
+**Why retrieval matters at all (the framing stat):** on SWE-bench, swapping BM25 context for the
+gold "oracle" files more than doubled Claude 2's resolve rate (1.96% → 4.8%); BM25 missed *all*
+needed files in nearly half of instances. [H] (arXiv 2310.06770) Better retrieval is the lever.
+
+---
+
+## 1. Upgrade the embedding model (highest single accuracy jump)
+
+**Current state:** CodeRAG defaults to `BAAI/bge-small-en-v1.5`, which scores only **~45.8 CoIR
+NDCG@10** — generic text embedders are markedly weaker than code-specialized ones on code. [H]
+(Granite R2 paper, arXiv 2508.21085)
+
+**Recommended default → `CodeRankEmbed` (137M):** 8192-token context, **60.1 CoIR NDCG@10 / 77.9
+CodeSearchNet MRR** — exceptional accuracy-per-byte that rivals 7B proprietary models, and small
+enough for ONNX/CPU. [H] (cornstack/CodeRankEmbed; arXiv 2412.01007) That's roughly **+14 CoIR
+points over the current default from a single swap.**
+⚠️ Verify its exact license (it's an Arctic-Embed-M-Long fine-tune) before shipping as default.
+
+**Permissively-licensed alternatives (Apache-2.0):**
+- **CodeSage-base-v2 (356M)** — ~64.56 CoIR [M, re-verify], Matryoshka dims (flexible truncation).
+- **gte-modernbert-base (149M)** — strong dual text+code model, 8192 ctx, good general fallback.
+- **nomic-embed-code (7B, Apache-2.0)** — SOTA-ish on CodeSearchNet, but **needs a GPU**; offer as
+  an opt-in "accuracy" backend, not the local default. [H]
+
+**License landmines to avoid as defaults:** SFR-Embedding-Code (all sizes) and jina-code-embeddings
+are **CC-BY-NC-4.0 (non-commercial)**; Qodo-Embed-1-7B is commercial-license (its 1.5B is open
+under OpenRAIL++-M, 68.53 CoIR). [H] Voyage-code-3 is API-only — reference point only. [H]
+
+> ⚠️ **Metric-scale trap.** Two "code retrieval" scales circulate and get conflated: **CoIR-paper
+> NDCG@10** (values in the 40s–70s) vs the **MTEB-Code leaderboard average** (values ~78–90). The
+> same model shows up as "71.5" and "79.31". Always compare within one scale. [H]
+
+---
+
+## 2. Add a local cross-encoder reranker (highest-ROI bolt-on)
+
+The evidence converges: **a small ONNX cross-encoder reranking the top-100 down to top-8 is the
+single best accuracy add-on for a local-first engine.**
+
+- **Expected lift:** +5 to +15 nDCG/MRR points, largest when first-stage recall is weak. [H]
+  (arXiv 2212.06121; multiple RAG benchmarks). One benchmark saw Hit@1 jump 62.7% → 83.0%. [M]
+- **Latency (the local constraint):** FlashRank's `ms-marco-MiniLM-L-12-v2` (~4 MB, ONNX, CPU-only)
+  adds **~31 ms/query** mean over 100 candidates; sub-20 ms for 50. [H] (FlashRank;
+  clouatre-labs/rag-reranking-benchmarks) Unoptimized PyTorch is 100–300 ms — **so ship ONNX/int8**.
+- **Canonical design:** retrieve top-100 → rerank → top-8. [H] (CoRNStack used window=10/step=5 for
+  listwise; pointwise cross-encoders are simpler and cheaper.)
+- **Drop-in models:** FlashRank `ms-marco-MiniLM-L-12-v2`, or `bge-reranker-base` / `bge-reranker-v2-m3`
+  in ONNX-int8 (community ONNX builds exist). [H]
+- **Code-aware option:** Qwen3-Reranker-0.6B is the best *small code-aware* reranker (MTEB-Code 73.4),
+  but ~0.5–1 s/query on CPU — borderline interactive; quantize and benchmark before adopting. [M]
+- **Total local budget:** FAISS ANN (single-digit ms) + ONNX rerank (~30–60 ms) → **well under 100 ms**,
+  still far faster than an agent's multi-round grep/read loop.
+
+**Caveat / gap:** there's no published head-to-head of *small* cross-encoders on a *code* benchmark
+(CoIR). The strongest code-reranking result (CoRNStack, +2.8 to +12.2 MRR) uses a 7B reranker. The
+small-model code lift is **inferred, not directly measured** — your harness should confirm it. [?]
+
+**Alternative architecture — ColBERT / late interaction:** `answerai-colbert-small-v1` (33M) beats
+bge-base on BEIR and searches on CPU in milliseconds; it's a strong *first-stage* upgrade or
+same-size reranker. [H] But it's a multi-vector index change (higher per-token storage), not a cheap
+bolt-on. Treat as a later experiment, not a v1 move.
+
+---
+
+## 3. Tune and route the hybrid fusion you already have
+
+CodeRAG already does dense + BM25 + RRF — the literature says that's the right foundation; the wins
+are in **routing and tuning**:
+
+- **Route by query type.** For **code→code (PL→PL)** retrieval, BM25 with word-level splitting
+  *significantly beats* dense and is ~an order of magnitude faster; for **NL→code**, dense wins. [H]
+  (arXiv 2510.20609) Detecting "is this query an identifier/snippet vs natural language" and
+  weighting accordingly is a cheap, high-value heuristic.
+- **BM25 is the efficiency anchor.** Retrieval latency varies up to 200× across configs; "BM25 +
+  word splitting offers the best quality–latency trade-off." [H] Keep BM25 fast and dominant for
+  exact identifiers — exactly where pure-embedding tools (and Cursor's own data) show weakness.
+- **RRF tuning.** k≈60 is a fine default (Elastic says RRF needs no tuning), but a tuned hybrid beat
+  vanilla RRF by ~4 nDCG points on one benchmark (WANDS) — so **sweep k and per-retriever weights
+  once you have eval data.** [M] (WANDS is e-commerce search, not code — re-verify on your harness.)
+- **Complementarity is real:** sparse excels at entity/identifier lookup and domain terminology;
+  dense excels at paraphrase/semantic generalization; **fusion + rerank beats either alone "by a
+  large margin."** [H] This *is* the gap neither pure-grep agents nor single-modality embedding
+  tools fully exploit.
+
+---
+
+## 4. Structure-aware retrieval (real but modest; phase 2)
+
+Graphs help decide *what* to retrieve, and all of these build **locally via tree-sitter**:
+
+- **RepoGraph** (tree-sitter dependency graph + k-hop ego-graph) lifted SWE-bench-Lite resolve rates
+  ~2–2.7 pts across four frameworks (best: 1-hop + flatten). [H] (arXiv 2410.14684)
+- **GraphCodeBERT** (data-flow-aware) raised CodeSearchNet MRR ~+2 pts over CodeBERT, consistent
+  across 6 languages (p<0.01). [H] (arXiv 2009.08366)
+- **Aider's repo map** = tree-sitter symbol graph + **PageRank** over the file dependency graph,
+  token-budgeted (~1k tokens), 130+ languages — the canonical engineering pattern for
+  structure-aware context selection. [H] (aider.chat/2023/10/22/repomap.html)
+- **CodeGRAG** (control/data-flow graph view) gave small local models +5–6 pts on generation. [H]
+
+**Verdict:** worth building as a retrieval-expansion/rerank signal (1-hop neighbors of top hits),
+but gains are modest (~2 pts) and it's more engineering than moves #1–3. Do it after the eval +
+embedder + reranker land.
+
+---
+
+## 5. Query expansion / HyDE — double-edged, gate carefully
+
+- HyDE (generate a hypothetical code snippet, embed *that*) bridges the NL↔code gap and improves
+  recall in general RAG. [M] But **no code-specific benchmark number surfaced** — unproven on code. [?]
+- **It can actively hurt** exactly your hardest case: LLM query expansion degraded retrieval on
+  *unfamiliar/ambiguous* queries (−10 NDCG@10 on unfamiliar; −17 Recall@100 on high-ambiguity). [H]
+  (arXiv 2505.12694) Private-codebase identifiers/internal APIs are precisely "unfamiliar."
+- **Cost:** +25–60% latency (extra small-model generation + second embedding pass). [M]
+
+**Verdict:** if used at all, **gate it to clearly natural-language queries** and never to
+identifier-like queries. Low priority; measure on the harness before enabling by default.
+
+---
+
+## The agentic-grep baseline — what you're actually up against
+
+This debate is **genuinely contested**; sources have skin in the game. Be honest about it.
+
+**The anti-embedding camp (grep + read loops win):**
+- **Claude Code:** Boris Cherny — *"Claude Code doesn't use RAG currently… agentic search
+  out-performed RAG for the kinds of things people use Code for."* Early versions used a local
+  vector DB and dropped it. Stack = Glob + Grep (ripgrep) + Read, on demand. [H] (HN 43164253)
+- **Cline:** *"no RAG, no embeddings, no vector databases"* — by design. Their mechanism arguments:
+  chunking "tears apart logic" (a call in chunk 47, its def in chunk 892); an index is "a snapshot
+  frozen in time" that drifts stale; embeddings double your IP attack surface. [H]
+- **Sourcegraph Cody:** *removed* embeddings entirely in v5.3 (scaling to 100k+ repos, security,
+  maintenance) — "the most important aspect is getting the files, not the algorithm." [H]
+- **Augment:** for SWE-bench, *"grep and find were sufficient… embedding-based retrieval was not the
+  bottleneck"* — though they say embeddings are still "critical … in real use." [H]
+- The security argument has teeth: **Vec2Text reconstructs ~92% of 32-token inputs exactly** —
+  embeddings are invertible and irrevocable, so they need original-text-level safeguards. [H]
+  (Reconstruction degrades sharply beyond 50–100 tokens, so the risk is bounded.) CodeRAG's
+  local-first stance is a direct answer to this.
+
+**The pro-semantic counter-evidence (the opening you exploit):**
+- **Cursor (publishes data):** semantic search + grep gives **+12.5% accuracy** (6.5–23.5% by model)
+  over grep alone, **largest on 1,000+ file repos.** [H] They agree grep is the floor; embeddings
+  raise the ceiling.
+- **NVIDIA CORTEXA (the cleanest controlled result):** a fine-tuned code embedder (NV-EmbedCode)
+  hits **71.95% recall, +31.28% over BM25** and +6.4% over Agentless on SWE-bench localization. [H]
+  This strongly implies the "grep wins" results reflect *off-the-shelf embedders + naive chunking*,
+  not embeddings in principle.
+- Agentic loops are expensive: ~2.7× input tokens / 1.7× output tokens vs enhanced RAG, and up to
+  83× higher latency sensitivity (poor KV-cache reuse). [M] (general agentic-RAG, not code-specific)
+
+**Synthesis — the defensible thesis:** the grep camp's wins are real but measured on
+**SWE-bench-style edit tasks with persistent agents and off-the-shelf embedders.** CodeRAG's
+opening is the union of (a) where pure grep is documented to fail — conceptual / "where is X
+handled" NL queries, large/unfamiliar repos (Cursor's 1,000+ file effect; CORTEXA's recall gap) —
+**plus** (b) grep's strength on exact identifiers, captured by BM25, **plus** (c) a fine-tuned/
+code-specific embedder and a reranker (CORTEXA shows that closes the gap). The literature's clearest
+verdict — **fusion + rerank beats either modality alone** — is exactly the niche neither pure-grep
+agents nor single-modality embedding products fully occupy. And being **local/zero-key** answers the
+staleness (live watcher) and security (no code leaves the box; no invertible embeddings shipped out)
+objections in one stroke.
+
+---
+
+## Open contradictions to resolve on your own harness
+
+1. **Chunking [?]:** cAST claims AST chunking beats fixed-size on recall/SWE-bench (+1.8–4.3 Recall@5);
+   a controlled study finds **sliding windows beat function-level chunking** on completion EM (function
+   chunking was *worst*), and line-based ≈ syntax-aware across budgets. CodeRAG's symbol-aware chunking
+   is well-suited to NL→code localization (your use case), but **don't assume AST splitting is strictly
+   best** — test windowed/overlapping and hierarchical (parent-document) variants. Optimal chunk size
+   was ~2,000 non-whitespace chars; bigger degraded. [H/?]
+2. **Small reranker on code [?]:** lift is inferred from English benchmarks; confirm on CoIR/your harness.
+3. **CoIR absolute numbers [M]:** several (Voyage/E5/ada; SweRank Acc@k; >0.78 CSN MRR) came via PDF
+   parsing — re-verify against source tables before publishing.
+
+## Sequencing recommendation
+
+1. **Eval harness** (§0) — nothing is provable without it.
+2. **CodeRankEmbed default** (§1) — biggest single jump, one swap, stays ONNX/CPU.
+3. **ONNX cross-encoder reranker** (§2) — highest-ROI bolt-on, ~30 ms.
+4. **Hybrid routing + RRF sweep** (§3) — cheap, big latency win, plays to BM25's identifier strength.
+5. **MCP surface** — so agents (Claude Code/Codex) actually adopt CodeRAG as their retrieval tool.
+6. **Graph expansion** (§4), then **gated HyDE** (§5) — measured, incremental.
+
+---
+
+### Key sources
+CoIR (arXiv 2407.02883) · Granite R2 broad comparison (2508.21085) · CodeRankEmbed/CoRNStack
+(2412.01007) · SweRank (2505.07849) · LocAgent (2503.09089) · SWE-bench (2310.06770) · CodeSearchNet
+(1909.09436) · RepoGraph (2410.14684) · GraphCodeBERT (2009.08366) · chunking study (2605.04763) ·
+budget/routing (2510.20609) · query-expansion risk (2505.12694) · Cursor semsearch (cursor.com/blog/semsearch)
+· Cline (cline.bot/blog) · Sourcegraph Cody FAQ · Augment (jxnl.co / augmentcode.com) · NVIDIA CORTEXA
+(research.nvidia.com/labs/adlr/cortexa) · FlashRank · mxbai-rerank-v2 · answerai-colbert-small-v1.

From 528f17f18c234e1927a2acfcca78c074b7249115 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 06:51:46 +0000
Subject: [PATCH 2/5] feat(eval): add code-retrieval eval harness

Implements move #0 of the retrieval strategy: a small, offline harness to
measure retrieval quality so accuracy claims are provable and regressions
are caught.

- coderag/eval/metrics.py: recall@k, hit@k (Acc@k), MRR, nDCG@k with
  rank de-duplication so multiple chunks per file don't inflate scores.
- coderag/eval/dataset.py: JSONL EvalCase format + a git miner that builds
  datasets SWE-bench/SweRank-style (commit subject -> changed files that
  still exist at HEAD), filtering merges/reverts/bots/diffuse commits.
- coderag/eval/harness.py: evaluate() scores any search callable;
  compare_modes() contrasts dense-only vs BM25-only vs hybrid on one index
  by swapping RRF fusion weights (the index is mode-independent).
- coderag eval [--build] [--compare] [--level file|symbol] CLI surface,
  a thin adapter over the engine.
- docs/eval.md usage guide; tests cover metrics, dataset round-trip, the
  git miner, and end-to-end scoring via the deterministic fake provider.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 coderag/eval/__init__.py |  39 ++++++++
 coderag/eval/dataset.py  | 175 ++++++++++++++++++++++++++++++++
 coderag/eval/harness.py  | 180 +++++++++++++++++++++++++++++++++
 coderag/eval/metrics.py  |  78 +++++++++++++++
 coderag/surfaces/cli.py  |  97 ++++++++++++++++++
 docs/eval.md             |  80 +++++++++++++++
 tests/test_eval.py       | 208 +++++++++++++++++++++++++++++++++++++++
 7 files changed, 857 insertions(+)
 create mode 100644 coderag/eval/__init__.py
 create mode 100644 coderag/eval/dataset.py
 create mode 100644 coderag/eval/harness.py
 create mode 100644 coderag/eval/metrics.py
 create mode 100644 docs/eval.md
 create mode 100644 tests/test_eval.py

diff --git a/coderag/eval/__init__.py b/coderag/eval/__init__.py
new file mode 100644
index 0000000..60554b8
--- /dev/null
+++ b/coderag/eval/__init__.py
@@ -0,0 +1,39 @@
+"""Code-retrieval evaluation harness.
+
+A small, offline, dependency-free harness for measuring *retrieval* quality — "did we
+surface the right file/symbol for this query?" — so accuracy claims are provable and
+regressions are caught.
+
+It follows the SWE-bench / Agentless / SweRank localization protocol: queries come from
+real commit messages or issues, and ground truth is the set of files (and optionally
+symbols) those commits changed. Metrics are the standard localization set: recall@k,
+hit@k (Acc@k), MRR, and nDCG@k.
+
+The public pieces:
+
+- :class:`EvalCase` / :func:`load_dataset` / :func:`save_dataset` — the dataset format.
+- :func:`build_from_git` — mine a dataset from a repo's history (no network, no LLM).
+- :func:`evaluate` — score one retriever (any ``search`` callable) against a dataset.
+- :func:`compare_modes` — score dense-only vs BM25-only vs hybrid on one index, which is
+  the built-in way to show fusion beats either modality alone.
+"""
+
+from __future__ import annotations
+
+from coderag.eval.dataset import EvalCase, build_from_git, load_dataset, save_dataset
+from coderag.eval.harness import EvalResult, compare_modes, evaluate
+from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k
+
+__all__ = [
+    "EvalCase",
+    "EvalResult",
+    "build_from_git",
+    "compare_modes",
+    "evaluate",
+    "hit_at_k",
+    "load_dataset",
+    "mrr",
+    "ndcg_at_k",
+    "recall_at_k",
+    "save_dataset",
+]
diff --git a/coderag/eval/dataset.py b/coderag/eval/dataset.py
new file mode 100644
index 0000000..c38a547
--- /dev/null
+++ b/coderag/eval/dataset.py
@@ -0,0 +1,175 @@
+"""Eval dataset: a list of (query -> relevant files/symbols) cases, plus a git miner.
+
+The dataset is plain JSONL so it's diffable, hand-editable, and easy to share. Each line
+is one :class:`EvalCase`. :func:`build_from_git` synthesizes a dataset from a repo's own
+history using the SWE-bench/SweRank recipe: the commit subject becomes the query and the
+files that commit changed (that still exist at HEAD) become the ground truth.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence
+
+# Control-character delimiters for one-pass `git log` parsing — they never appear in
+# real commit messages, so we don't have to escape file paths or message text.
+_REC = "\x1e"  # between commits
+_FLD = "\x1f"  # between fields within a commit header
+
+
+@dataclass(slots=True)
+class EvalCase:
+    """A single retrieval query paired with the items a retriever should surface.
+
+    ``relevant_files`` holds repo-relative posix paths. ``relevant_symbols`` optionally
+    holds qualified names (e.g. ``"Indexer._index_file"``) for symbol-level scoring.
+    """
+
+    query: str
+    relevant_files: List[str]
+    relevant_symbols: List[str] = field(default_factory=list)
+    id: Optional[str] = None
+    source: str = ""
+
+    def as_dict(self) -> Dict[str, object]:
+        """Return the JSON-ready mapping; empty optional fields are left out."""
+        payload: Dict[str, object] = {
+            "query": self.query,
+            "relevant_files": self.relevant_files,
+        }
+        optional = (
+            ("relevant_symbols", self.relevant_symbols),
+            ("id", self.id),
+            ("source", self.source),
+        )
+        payload.update((key, value) for key, value in optional if value)
+        return payload
+
+    @classmethod
+    def from_dict(cls, d: Dict[str, object]) -> "EvalCase":
+        """Build a case from a parsed JSON object (only ``query`` is required)."""
+        files, syms = d.get("relevant_files", []), d.get("relevant_symbols", [])
+        raw_id = d.get("id")
+        return cls(
+            query=str(d["query"]),
+            relevant_files=list(map(str, files)) if isinstance(files, list) else [],
+            relevant_symbols=list(map(str, syms)) if isinstance(syms, list) else [],
+            id=str(raw_id) if raw_id else None,
+            source=str(d.get("source", "")),
+        )
+
+
+def load_dataset(path: Path | str) -> List[EvalCase]:
+    """Read a JSONL dataset into a list of cases.
+
+    Every non-blank line is parsed as one JSON object and converted with
+    :meth:`EvalCase.from_dict`; whitespace-only lines are ignored.
+    """
+    with Path(path).open(encoding="utf-8") as fh:
+        rows = (raw.strip() for raw in fh)
+        return [EvalCase.from_dict(json.loads(row)) for row in rows if row]
+
+
+def save_dataset(cases: Sequence[EvalCase], path: Path | str) -> None:
+    """Dump ``cases`` to ``path`` as JSONL, creating parent directories first."""
+    target = Path(path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    rows = (json.dumps(c.as_dict(), ensure_ascii=False) + "\n" for c in cases)
+    with target.open("w", encoding="utf-8") as fh:
+        fh.writelines(rows)
+
+
+def _git(repo: Path, *args: str) -> str:
+    """Run a git command in ``repo`` and return stdout (raises on failure)."""
+    cmd = ["git", "-C", str(repo), *args]
+    # Decode as UTF-8 explicitly instead of with the locale's encoding; bytes that
+    # are not valid UTF-8 are replaced rather than raising UnicodeDecodeError.
+    proc = subprocess.run(
+        cmd, check=True, capture_output=True, encoding="utf-8", errors="replace"
+    )
+    return proc.stdout
+
+
+def build_from_git(
+    repo: Path | str,
+    *,
+    max_cases: int = 200,
+    extensions: Optional[Sequence[str]] = None,
+    max_files_per_commit: int = 5,
+    min_query_len: int = 12,
+    commit_scan_limit: int = 2000,
+) -> List[EvalCase]:
+    """Mine an eval dataset from a repo's commit history.
+
+    For each non-merge commit, the subject line is the query and the changed files that
+    (a) match ``extensions`` and (b) still exist at HEAD become the relevant set — so
+    every ground-truth file is actually present in the index built from HEAD.
+
+    A commit is skipped when its subject is unusable (shorter than ``min_query_len`` or a
+    revert/merge/bump/release), its author looks automated, no changed file survives the
+    two filters above, or more than ``max_files_per_commit`` files survive (diffuse, weak
+    signal). Scans up to ``commit_scan_limit`` commits; returns up to ``max_cases``.
+    """
+    repo = Path(repo)
+    exts = {
+        e if e.startswith(".") else f".{e}"
+        for e in (extensions or (".py", ".js", ".ts", ".tsx", ".go", ".rs", ".java"))
+    }
+
+    # One `git log` pass: a delimited header line per commit, then its changed paths.
+    fmt = f"{_REC}%H{_FLD}%s{_FLD}%an"
+    raw = _git(
+        repo,
+        "-c",
+        "core.quotePath=false",  # print non-ASCII paths verbatim, not octal-escaped
+        "log",
+        "--no-merges",
+        f"-n{commit_scan_limit}",
+        "--name-only",
+        f"--pretty=format:{fmt}",
+    )
+
+    cases: List[EvalCase] = []
+    for record in raw.split(_REC):
+        if len(cases) >= max_cases:
+            break  # quota reached — no need to walk the remaining history
+        if not record.strip():
+            continue
+        header, _, body = record.partition("\n")
+        parts = header.split(_FLD)
+        if len(parts) < 3:
+            continue
+        sha, subject, author = parts[0], parts[1].strip(), parts[2].strip()
+
+        if not _is_usable_query(subject, min_query_len) or _is_bot(author):
+            continue
+
+        files = [
+            line.strip()
+            for line in body.splitlines()
+            if line.strip() and Path(line.strip()).suffix in exts
+        ]
+        # Keep only files that still exist at HEAD, so they're retrievable from the index.
+        files = [f for f in files if (repo / f).exists()]
+        if not files or len(files) > max_files_per_commit:
+            continue
+
+        cases.append(
+            EvalCase(query=subject, relevant_files=files, id=sha[:12], source="git")
+        )
+    return cases
+
+
+def _is_usable_query(subject: str, min_len: int) -> bool:
+    """True if ``subject`` is long enough and not a revert/merge/bump/release."""
+    # Reverts/merges/version bumps carry little localization signal.
+    noise_prefixes = ("revert", "merge", "bump", "release ")
+    long_enough = len(subject) >= min_len
+    return long_enough and not subject.lower().startswith(noise_prefixes)
+
+
+def _is_bot(author: str) -> bool:
+    name = author.lower()  # NOTE: the substring test also flags names like "Abbott"
+    return name in {"dependabot", "github-actions", "renovate"} or "bot" in name
diff --git a/coderag/eval/harness.py b/coderag/eval/harness.py
new file mode 100644
index 0000000..aa112b5
--- /dev/null
+++ b/coderag/eval/harness.py
@@ -0,0 +1,180 @@
+"""Run a retriever against an eval dataset and report localization metrics.
+
+:func:`evaluate` scores any ``search`` callable; :func:`compare_modes` is the convenience
+that scores dense-only, BM25-only, and hybrid retrieval on a single index — the built-in
+way to demonstrate that fusion beats either modality alone.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Sequence, Tuple
+
+from coderag.eval.dataset import EvalCase
+from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k
+from coderag.types import SearchHit
+
+if TYPE_CHECKING:
+    from coderag.api import CodeRAG
+
+# A retriever: given a query and a result count, return ranked hits (best-first).
+SearchFn = Callable[[str, int], List[SearchHit]]
+
+# (label, dense_weight, lexical_weight) — the three retrieval modes we contrast.
+DEFAULT_MODES: Tuple[Tuple[str, float, float], ...] = (
+    ("dense", 1.0, 0.0),
+    ("bm25", 0.0, 1.0),
+    ("hybrid", 1.0, 1.0),
+)
+
+DEFAULT_KS: Tuple[int, ...] = (1, 5, 10)
+
+
+@dataclass(slots=True)
+class EvalResult:
+    """Averaged localization scores for a single retriever on a single dataset."""
+
+    label: str
+    level: str  # "file" | "symbol"
+    n: int  # number of scored cases
+    ks: Tuple[int, ...]
+    recall: Dict[int, float] = field(default_factory=dict)
+    hit: Dict[int, float] = field(default_factory=dict)
+    ndcg: Dict[int, float] = field(default_factory=dict)
+    mrr: float = 0.0
+
+    def as_dict(self) -> Dict[str, object]:
+        """JSON-friendly summary (``ks`` omitted); scores rounded to 4 decimals."""
+
+        def rounded(table: Dict[int, float]) -> Dict[int, float]:
+            return {k: round(v, 4) for k, v in table.items()}
+
+        summary: Dict[str, object] = {"label": self.label, "level": self.level}
+        summary.update(n=self.n, mrr=round(self.mrr, 4), recall=rounded(self.recall))
+        summary.update(hit=rounded(self.hit), ndcg=rounded(self.ndcg))
+        return summary
+
+
+def _ranked_ids(hits: Sequence[SearchHit], level: str) -> List[str]:
+    """Map ranked hits to the ids being scored: symbols at "symbol" level, else paths."""
+    if level != "symbol":
+        return [hit.path for hit in hits]
+    return [hit.symbol for hit in hits if hit.symbol]  # symbol-less hits drop out
+
+
+def _relevant_ids(case: EvalCase, level: str) -> List[str]:  # ground truth at ``level``
+    return case.relevant_files if level != "symbol" else case.relevant_symbols
+
+
+def evaluate(
+    search_fn: SearchFn,
+    cases: Sequence[EvalCase],
+    *,
+    label: str = "retriever",
+    ks: Sequence[int] = DEFAULT_KS,
+    level: str = "file",
+) -> EvalResult:
+    """Score ``search_fn`` over ``cases`` at ``level`` ("file" or "symbol").
+
+    Cases with no ground-truth ids at the requested level are skipped (so a file-only
+    dataset can still be scored at the symbol level without penalizing the retriever).
+    Raises ``ValueError`` if ``ks`` is empty or has a cutoff below 1.
+    """
+    ks = tuple(sorted(set(ks)))
+    if not ks or ks[0] < 1:  # a negative k would slice rankings from the end
+        raise ValueError(f"ks must be non-empty positive cutoffs, got {ks!r}")
+    fetch = ks[-1]  # request only as many hits as the largest cutoff needs
+    recall_sum, hit_sum, ndcg_sum = (dict.fromkeys(ks, 0.0) for _ in range(3))
+    mrr_sum, scored = 0.0, 0
+
+    for case in cases:
+        relevant = _relevant_ids(case, level)
+        if not relevant:
+            continue
+        ranked = _ranked_ids(search_fn(case.query, fetch), level)
+        for k in ks:
+            recall_sum[k] += recall_at_k(ranked, relevant, k)
+            hit_sum[k] += hit_at_k(ranked, relevant, k)
+            ndcg_sum[k] += ndcg_at_k(ranked, relevant, k)
+        mrr_sum += mrr(ranked, relevant, fetch)
+        scored += 1
+
+    n = max(scored, 1)  # avoid 0/0 when every case was skipped; ``n`` reported is scored
+    return EvalResult(
+        label=label,
+        level=level,
+        n=scored,
+        ks=ks,
+        recall={k: recall_sum[k] / n for k in ks},
+        hit={k: hit_sum[k] / n for k in ks},
+        ndcg={k: ndcg_sum[k] / n for k in ks},
+        mrr=mrr_sum / n,
+    )
+
+
+def compare_modes(
+    cr: "CodeRAG",
+    cases: Sequence[EvalCase],
+    *,
+    ks: Sequence[int] = DEFAULT_KS,
+    level: str = "file",
+    modes: Sequence[Tuple[str, float, float]] = DEFAULT_MODES,
+) -> List[EvalResult]:
+    """Evaluate each ``(label, dense_weight, lexical_weight)`` mode on ``cr``'s index.
+
+    Only the query-time fusion weights differ between modes, so the provider, store and
+    vector index of ``cr`` are shared across them. Returns one result per mode, in the
+    order given by ``modes``.
+    """
+    from coderag.retrieval.search import HybridSearcher
+
+    def score_mode(mode: Tuple[str, float, float]) -> EvalResult:
+        name, dense_w, lexical_w = mode
+        cfg = cr.config.with_overrides(dense_weight=dense_w, lexical_weight=lexical_w)
+        searcher = HybridSearcher(cfg, cr.provider, cr.store, cr.vectors)
+        return evaluate(searcher.search, cases, label=name, ks=ks, level=level)
+
+    return [score_mode(mode) for mode in modes]
+
+
+def format_table(results: Sequence[EvalResult]) -> str:
+    """Render results as an aligned text table (header, rule, one row per result).
+
+    Columns follow the cutoffs of the first result; returns "(no results)" when empty.
+    """
+    if not results:
+        return "(no results)"
+    ks = results[0].ks
+    header = ["mode", "n", "MRR"]
+    for prefix in ("R", "nDCG", "Hit"):
+        header.extend(f"{prefix}@{k}" for k in ks)
+    table = [header]
+    for res in results:
+        cells = [res.label, str(res.n), f"{res.mrr:.3f}"]
+        for scores in (res.recall, res.ndcg, res.hit):
+            cells.extend(f"{scores[k]:.3f}" for k in ks)
+        table.append(cells)
+    widths = [max(len(cell) for cell in column) for column in zip(*table)]
+
+    def render(cells: Sequence[str]) -> str:
+        return "  ".join(cell.ljust(width) for cell, width in zip(cells, widths))
+
+    rule = "  ".join("-" * width for width in widths)
+    return "\n".join([render(header), rule, *map(render, table[1:])])
+
+
+def best_label(
+    results: Sequence[EvalResult], *, metric: str = "ndcg", k: int = 10
+) -> Optional[str]:
+    """Pick the label with the best ``metric`` at ``k``; ``None`` for no results."""
+
+    def score(res: EvalResult) -> float:
+        if metric == "mrr":
+            return res.mrr  # MRR has no per-k table, so ``k`` is ignored
+        table = {"recall": res.recall, "hit": res.hit, "ndcg": res.ndcg}[metric]
+        fallback = table.get(max(res.ks), 0.0)  # used when ``k`` was not scored
+        return table.get(k, fallback)
+
+    return max(results, key=score).label if results else None
diff --git a/coderag/eval/metrics.py b/coderag/eval/metrics.py
new file mode 100644
index 0000000..b0474bc
--- /dev/null
+++ b/coderag/eval/metrics.py
@@ -0,0 +1,78 @@
+"""Ranking metrics for code-retrieval localization.
+
+All functions take ``ranked`` (a best-first list of retrieved item ids — file paths or
+symbols; repeats are collapsed to their first, best-ranked occurrence) and ``relevant``
+(the ground-truth ids). Items are compared by equality, so callers must normalize ids
+(e.g. posix relative paths) before scoring.
+
+These are the standard metrics used by SWE-bench localization work (Agentless, LocAgent,
+SweRank) and the CoIR / CodeSearchNet benchmarks:
+
+- ``recall_at_k`` — fraction of relevant items found in the top k.
+- ``hit_at_k``    — 1.0 if *any* relevant item is in the top k (a.k.a. Acc@k / hit rate).
+- ``mrr``         — reciprocal rank of the first relevant item.
+- ``ndcg_at_k``   — rank-discounted gain with binary relevance.
+"""
+
+from __future__ import annotations
+
+import math
+from typing import Iterable, Sequence
+
+
+def _dedup(ranked: Sequence[str]) -> list[str]:
+    """Stable de-duplication, keeping the first (best) occurrence of each id."""
+    seen: set[str] = set()
+    out: list[str] = []
+    for item in ranked:
+        if item not in seen:
+            seen.add(item)
+            out.append(item)
+    return out
+
+
+def recall_at_k(ranked: Sequence[str], relevant: Iterable[str], k: int) -> float:
+    """Fraction of relevant ids in the top ``k`` retrieved ids (0.0 if ``k <= 0``)."""
+    rel = set(relevant)
+    if not rel or k <= 0:  # negative k would slice off the tail instead
+        return 0.0
+    top = set(_dedup(ranked)[:k])
+    return len(top & rel) / len(rel)
+
+
+def hit_at_k(ranked: Sequence[str], relevant: Iterable[str], k: int) -> float:
+    """1.0 if at least one relevant id is in the top ``k`` (Acc@k), else 0.0."""
+    rel = set(relevant)
+    if not rel or k <= 0:  # negative k would slice off the tail instead
+        return 0.0
+    return 1.0 if rel & set(_dedup(ranked)[:k]) else 0.0
+
+
+def mrr(ranked: Sequence[str], relevant: Iterable[str], k: int | None = None) -> float:
+    """Reciprocal rank of the first relevant id (0 if none within the cutoff)."""
+    rel = set(relevant)
+    if not rel or (k is not None and k <= 0):  # k <= 0 is an empty cutoff
+        return 0.0
+    ordered = _dedup(ranked)
+    if k is not None:
+        ordered = ordered[:k]
+    for rank, item in enumerate(ordered, start=1):
+        if item in rel:
+            return 1.0 / rank
+    return 0.0
+
+
+def ndcg_at_k(ranked: Sequence[str], relevant: Iterable[str], k: int) -> float:
+    """Normalized discounted cumulative gain at ``k`` with binary relevance."""
+    rel = set(relevant)
+    if not rel or k <= 0:
+        return 0.0
+    ordered = _dedup(ranked)[:k]
+    dcg = sum(
+        1.0 / math.log2(rank + 1)
+        for rank, item in enumerate(ordered, start=1)
+        if item in rel
+    )
+    ideal_hits = min(len(rel), k)
+    idcg = sum(1.0 / math.log2(rank + 1) for rank in range(1, ideal_hits + 1))
+    return dcg / idcg if idcg else 0.0
diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py
index 0aef288..25ca920 100644
--- a/coderag/surfaces/cli.py
+++ b/coderag/surfaces/cli.py
@@ -90,6 +90,66 @@ def cmd_status(args: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_eval(args: argparse.Namespace) -> int:
+    from coderag import eval as ev
+
+    cfg = _build_config(args)
+
+    # `coderag eval build` — mine a dataset from the repo's git history.
+    if args.build:
+        cases = ev.build_from_git(
+            cfg.watched_dir,
+            max_cases=args.max_cases,
+            extensions=[e.lstrip(".") for e in _suffixes(cfg.languages)],
+        )
+        out = args.dataset or "coderag-eval.jsonl"
+        ev.save_dataset(cases, out)
+        print(f"Wrote {len(cases)} eval case(s) to {out}")
+        return 0 if cases else 1
+
+    if not args.dataset:
+        print("Provide --dataset PATH (or --build to mine one from git history).")
+        return 1
+    cases = ev.load_dataset(args.dataset)
+    if not cases:
+        print(f"No eval cases in {args.dataset}.")
+        return 1
+
+    ks = tuple(int(k) for k in args.ks.split(","))
+    cr = CodeRAG(cfg)
+    cr.index()  # ensure the index is built / up to date before scoring
+
+    if args.compare:
+        results = ev.compare_modes(cr, cases, ks=ks, level=args.level)
+    else:
+        results = [
+            ev.evaluate(cr.search, cases, label="hybrid", ks=ks, level=args.level)
+        ]
+
+    if args.json:
+        print(json.dumps([r.as_dict() for r in results], indent=2))
+    else:
+        from coderag.eval.harness import format_table
+
+        print(f"Eval: {len(cases)} case(s), level={args.level}\n")
+        print(format_table(results))
+    return 0
+
+
+def _suffixes(languages: tuple) -> list:
+    """Map configured language names to file suffixes for dataset mining."""
+    table = {
+        "python": ".py",
+        "javascript": ".js",
+        "typescript": ".ts",
+        "tsx": ".tsx",
+        "go": ".go",
+        "rust": ".rs",
+        "java": ".java",
+    }
+    return [table[lang] for lang in languages if lang in table]
+
+
 def cmd_watch(args: argparse.Namespace) -> int:
     from coderag.watch import watch
 
@@ -181,6 +241,43 @@ def build_parser() -> argparse.ArgumentParser:
     _add_common(p_status)
     p_status.set_defaults(func=cmd_status)
 
+    p_eval = sub.add_parser(
+        "eval",
+        help="Measure retrieval quality against a dataset (recall@k, MRR, nDCG).",
+    )
+    p_eval.add_argument(
+        "--dataset", help="JSONL dataset of query -> relevant files/symbols."
+    )
+    p_eval.add_argument(
+        "--build",
+        action="store_true",
+        help="Mine a dataset from git history into --dataset (default coderag-eval.jsonl).",
+    )
+    p_eval.add_argument(
+        "--max-cases",
+        type=int,
+        default=200,
+        help="Cap cases when building (default 200).",
+    )
+    p_eval.add_argument(
+        "--compare",
+        action="store_true",
+        help="Score dense-only vs BM25-only vs hybrid on one index.",
+    )
+    p_eval.add_argument(
+        "--level",
+        choices=("file", "symbol"),
+        default="file",
+        help="Localization granularity (default file).",
+    )
+    p_eval.add_argument(
+        "--ks", default="1,5,10", help="Comma-separated cutoffs (default 1,5,10)."
+    )
+    p_eval.add_argument("--json", action="store_true", help="Emit JSON.")
+    p_eval.add_argument("--quiet", action="store_true", help="Hide the progress bar.")
+    _add_common(p_eval)
+    p_eval.set_defaults(func=cmd_eval)
+
     p_watch = sub.add_parser(
         "watch", help="Index, then keep the index live on changes."
     )
diff --git a/docs/eval.md b/docs/eval.md
new file mode 100644
index 0000000..c1858e1
--- /dev/null
+++ b/docs/eval.md
@@ -0,0 +1,80 @@
+# Retrieval eval harness
+
+A small, offline harness for measuring **retrieval quality** — "did we surface the right
+file/symbol for this query?" — so accuracy claims are provable and regressions are caught.
+It implements move #0 of [the code-retrieval strategy](research/code-retrieval-strategy.md):
+nothing else in that plan (a better embedder, a reranker, fusion tuning) is worth shipping
+until we can measure it.
+
+## Metrics
+
+Standard localization metrics, matching the SWE-bench / Agentless / SweRank and CoIR /
+CodeSearchNet conventions:
+
+- **recall@k** — fraction of relevant items found in the top k.
+- **hit@k** (Acc@k) — 1 if *any* relevant item is in the top k.
+- **MRR** — reciprocal rank of the first relevant item.
+- **nDCG@k** — rank-discounted gain (binary relevance).
+
+Scored at **file** level (default) or **symbol** level (`--level symbol`).
+
+## Quick start
+
+```bash
+# 1. Mine a dataset from the repo's own git history (no network, no LLM):
+#    query = commit subject, ground truth = files that commit changed (and still exist).
+coderag eval --build --dataset coderag-eval.jsonl
+
+# 2. Score the current hybrid retriever:
+coderag eval --dataset coderag-eval.jsonl
+
+# 3. Contrast dense-only vs BM25-only vs hybrid on one index:
+coderag eval --dataset coderag-eval.jsonl --compare
+```
+
+```
+mode    n   MRR    R@1    R@5    R@10   nDCG@1  nDCG@5  nDCG@10  Hit@1  Hit@5  Hit@10
+------  --  -----  -----  -----  -----  ------  ------  -------  -----  -----  ------
+dense   …
+bm25    …
+hybrid  …
+```
+
+Add `--json` for machine-readable output, `--ks 1,3,5,10` to change cutoffs, and
+`--level symbol` for function/class-level localization (needs `relevant_symbols` in the
+dataset). The usual `--watched-dir` / `--store-dir` / `--provider` / `--model` flags apply.
+
+> The default `fake` provider is for tests only — its vectors are random, so dense looks
+> near-zero. Run real evals against `fastembed` (the local default) or whatever model you're
+> evaluating, e.g. `coderag eval --dataset … --compare --model BAAI/bge-small-en-v1.5` then
+> again with a candidate like CodeRankEmbed to measure the lift.
+
+## Dataset format
+
+JSONL, one case per line:
+
+```json
+{"query": "fix retry backoff on 429", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["stream_answer"], "id": "abc123", "source": "git"}
+```
+
+`relevant_symbols` and `id`/`source` are optional. Mine with `--build`, or hand-author cases
+for queries you care about (the natural-language "where is X handled" questions where semantic
+retrieval should beat grep).
+
+## Library API
+
+```python
+from coderag import CodeRAG, Config
+from coderag.eval import build_from_git, compare_modes, evaluate
+
+cr = CodeRAG(Config.from_env())
+cr.index()
+cases = build_from_git(cr.config.watched_dir, max_cases=200)
+
+for r in compare_modes(cr, cases):          # dense / bm25 / hybrid
+    print(r.label, r.as_dict())
+
+# Or score any retriever callable directly:
+res = evaluate(cr.search, cases, level="file")
+print(res.recall, res.mrr)
+```
diff --git a/tests/test_eval.py b/tests/test_eval.py
new file mode 100644
index 0000000..276d35f
--- /dev/null
+++ b/tests/test_eval.py
@@ -0,0 +1,208 @@
+"""Tests for the code-retrieval eval harness (metrics, dataset, scoring).
+
+All offline/deterministic via the `fake` provider fixture.
+"""
+
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+from coderag.api import CodeRAG
+from coderag.eval import (
+    EvalCase,
+    build_from_git,
+    compare_modes,
+    evaluate,
+    load_dataset,
+    save_dataset,
+)
+from coderag.eval.harness import best_label, format_table
+from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k
+from tests.conftest import write
+
+# --- metrics ---
+
+
+def test_recall_at_k_counts_fraction_found():
+    ranked = ["a.py", "b.py", "c.py"]
+    assert recall_at_k(ranked, {"b.py", "c.py"}, 3) == 1.0
+    assert recall_at_k(ranked, {"b.py", "c.py"}, 2) == 0.5
+    assert recall_at_k(ranked, {"z.py"}, 3) == 0.0
+
+
+def test_hit_at_k_is_binary():
+    ranked = ["a.py", "b.py"]
+    assert hit_at_k(ranked, {"b.py"}, 2) == 1.0
+    assert hit_at_k(ranked, {"b.py"}, 1) == 0.0
+
+
+def test_mrr_uses_first_relevant_rank():
+    assert mrr(["a", "b", "c"], {"b"}) == 0.5
+    assert mrr(["a", "b", "c"], {"a"}) == 1.0
+    assert mrr(["a", "b", "c"], {"z"}) == 0.0
+
+
+def test_ndcg_rewards_higher_ranks():
+    high = ndcg_at_k(["rel", "x", "y"], {"rel"}, 3)
+    low = ndcg_at_k(["x", "y", "rel"], {"rel"}, 3)
+    assert high == 1.0  # single relevant at rank 1 is perfect
+    assert 0.0 < low < high
+
+
+def test_metrics_dedupe_ranked_ids():
+    # Duplicate file paths (multiple chunks per file) must not consume top-k slots:
+    # without dedup the second "a.py" would push "b.py" out of the top 2.
+    ranked = ["a.py", "a.py", "b.py"]
+    assert recall_at_k(ranked, {"a.py", "b.py"}, 2) == 1.0
+    # At k=1 only "a.py" makes the cutoff, so half of the relevant set is found.
+    assert recall_at_k(ranked, {"a.py", "b.py"}, 1) == 0.5
+
+
+def test_metrics_empty_relevant_is_zero():
+    assert recall_at_k(["a"], set(), 1) == 0.0
+    assert ndcg_at_k(["a"], set(), 1) == 0.0
+
+
+# --- dataset ---
+
+
+def test_dataset_roundtrip(tmp_path: Path):
+    cases = [
+        EvalCase(
+            "find auth", ["auth.py"], ["authenticate_user"], id="c1", source="git"
+        ),
+        EvalCase("find math", ["math_utils.py"]),
+    ]
+    path = tmp_path / "ds.jsonl"
+    save_dataset(cases, path)
+    loaded = load_dataset(path)
+    assert [c.query for c in loaded] == ["find auth", "find math"]
+    assert loaded[0].relevant_symbols == ["authenticate_user"]
+    assert loaded[1].relevant_symbols == []
+
+
+def test_load_dataset_skips_blank_lines(tmp_path: Path):
+    path = tmp_path / "ds.jsonl"
+    path.write_text(
+        '{"query": "q", "relevant_files": ["a.py"]}\n\n   \n', encoding="utf-8"
+    )
+    assert len(load_dataset(path)) == 1
+
+
+# --- harness: end-to-end scoring against a real (fake-embedded) index ---
+
+
+def _indexed(config) -> CodeRAG:
+    config.watched_dir.mkdir(parents=True, exist_ok=True)
+    write(
+        config.watched_dir / "auth.py",
+        "def authenticate_user(token):\n"
+        "    '''Validate a session token and return the user.'''\n"
+        "    return verify(token)\n",
+    )
+    write(
+        config.watched_dir / "math_utils.py",
+        "def add_numbers(a, b):\n    return a + b\n",
+    )
+    cr = CodeRAG(config)
+    cr.index()
+    return cr
+
+
+def test_evaluate_perfect_retrieval_scores_one(config):
+    cr = _indexed(config)
+    cases = [EvalCase("add_numbers", ["math_utils.py"])]
+    res = evaluate(cr.search, cases, ks=(1, 3))
+    assert res.n == 1
+    assert res.recall[1] == 1.0
+    assert res.mrr == 1.0
+    assert res.ndcg[1] == 1.0
+
+
+def test_evaluate_skips_cases_without_ground_truth_at_level(config):
+    cr = _indexed(config)
+    # File-only ground truth -> nothing to score at the symbol level.
+    cases = [EvalCase("add_numbers", ["math_utils.py"])]
+    res = evaluate(cr.search, cases, ks=(1,), level="symbol")
+    assert res.n == 0
+
+
+def test_evaluate_symbol_level(config):
+    cr = _indexed(config)
+    cases = [EvalCase("authenticate_user", ["auth.py"], ["authenticate_user"])]
+    res = evaluate(cr.search, cases, ks=(1, 3), level="symbol")
+    assert res.n == 1
+    assert res.hit[3] == 1.0
+
+
+def test_compare_modes_returns_three_labels(config):
+    cr = _indexed(config)
+    cases = [
+        EvalCase("add_numbers", ["math_utils.py"]),
+        EvalCase("authenticate session token", ["auth.py"]),
+    ]
+    results = compare_modes(cr, cases, ks=(1, 3))
+    assert [r.label for r in results] == ["dense", "bm25", "hybrid"]
+    assert all(r.n == 2 for r in results)
+
+
+def test_bm25_recalls_exact_identifier(config):
+    # Lexical retrieval should find an exact identifier even when dense recall is weak.
+    cr = _indexed(config)
+    cases = [EvalCase("add_numbers", ["math_utils.py"])]
+    results = compare_modes(cr, cases, ks=(1, 3))
+    bm25 = next(r for r in results if r.label == "bm25")
+    assert bm25.hit[3] == 1.0
+
+
+def test_format_table_and_best_label(config):
+    cr = _indexed(config)
+    cases = [EvalCase("add_numbers", ["math_utils.py"])]
+    results = compare_modes(cr, cases, ks=(1, 3))
+    table = format_table(results)
+    assert "mode" in table and "MRR" in table and "hybrid" in table
+    assert best_label(results, metric="ndcg", k=3) in {"dense", "bm25", "hybrid"}
+
+
+# --- git dataset miner ---
+
+
+def test_build_from_git_mines_changed_files(tmp_path: Path):
+    repo = tmp_path / "repo"
+    repo.mkdir()
+
+    def git(*args: str) -> None:
+        subprocess.run(["git", "-C", str(repo), *args], check=True, capture_output=True)
+
+    git("init", "-q")
+    git("config", "user.email", "t@example.com")
+    git("config", "user.name", "Tester")
+    git("config", "commit.gpgsign", "false")
+    write(repo / "auth.py", "def authenticate_user(token):\n    return token\n")
+    git("add", "-A")
+    git("commit", "-q", "-m", "add user authentication helper")
+
+    cases = build_from_git(repo, max_cases=10)
+    assert len(cases) == 1
+    assert cases[0].query == "add user authentication helper"
+    assert cases[0].relevant_files == ["auth.py"]
+    assert cases[0].source == "git"
+
+
+def test_build_from_git_skips_merges_and_short_subjects(tmp_path: Path):
+    repo = tmp_path / "repo"
+    repo.mkdir()
+
+    def git(*args: str) -> None:
+        subprocess.run(["git", "-C", str(repo), *args], check=True, capture_output=True)
+
+    git("init", "-q")
+    git("config", "user.email", "t@example.com")
+    git("config", "user.name", "Tester")
+    git("config", "commit.gpgsign", "false")
+    write(repo / "a.py", "x = 1\n")
+    git("add", "-A")
+    git("commit", "-q", "-m", "wip")  # too short -> filtered out
+
+    assert build_from_git(repo, max_cases=10, min_query_len=12) == []

From f3a7e0c7c417a911262c09522afdd428ea2db754 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 07:08:37 +0000
Subject: [PATCH 3/5] feat(eval): embedder benchmark, model registry, curated
 dataset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Runs move #1 (the embedder experiment) on real local models via the eval
harness, and records the honest result.

- scripts/bench_embedders.py: reproducible model comparison — index a repo
  per model into an isolated store, score dense/bm25/hybrid via the harness.
- coderag/embeddings/models.py + `coderag eval --list-models`: curated
  registry of local code-search embedders with size/accuracy notes. Note:
  fastembed does not ship CodeRankEmbed (needs custom ONNX export, follow-up);
  jina-embeddings-v2-base-code is the best out-of-the-box code-specific option.
- coderag/eval/datasets/coderag_self.jsonl: 24 curated natural-language ->
  file cases for benchmarking CodeRAG on itself.

Measured (this repo, 24 cases): hybrid > dense > BM25 for BOTH models
(validates the fusion thesis), but the code-specific model did NOT clearly
beat bge-small — the small repo saturates (bge already Hit@10=1.0), so the
published CoIR gap does not transfer. Conclusion: keep bge-small default;
model swaps need a larger/harder benchmark; rank-1 headroom points at the
reranker (move #2). Documented in docs/eval.md and the strategy doc.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 coderag/embeddings/models.py             | 90 ++++++++++++++++++++++++
 coderag/eval/datasets/coderag_self.jsonl | 24 +++++++
 coderag/surfaces/cli.py                  | 12 ++++
 docs/eval.md                             | 39 ++++++++++
 docs/research/code-retrieval-strategy.md |  9 +++
 scripts/bench_embedders.py               | 77 ++++++++++++++++++++
 tests/test_models_registry.py            | 25 +++++++
 7 files changed, 276 insertions(+)
 create mode 100644 coderag/embeddings/models.py
 create mode 100644 coderag/eval/datasets/coderag_self.jsonl
 create mode 100644 scripts/bench_embedders.py
 create mode 100644 tests/test_models_registry.py

diff --git a/coderag/embeddings/models.py b/coderag/embeddings/models.py
new file mode 100644
index 0000000..429d17a
--- /dev/null
+++ b/coderag/embeddings/models.py
@@ -0,0 +1,90 @@
+"""Curated registry of local (fastembed/ONNX) embedding models for code search.
+
+These are the no-API-key models worth considering for CodeRAG, with short notes on the
+accuracy/size trade-off. All are loadable via ``--model <name>`` (provider ``fastembed``).
+The numbers in the notes are external benchmark figures (see docs/research/) — run
+``coderag eval`` to measure them on *your* codebase.
+
+Code-specific models (trained on code) generally beat general-purpose text embedders on
+code retrieval, at the cost of a larger download.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Tuple
+
+
+@dataclass(frozen=True)
+class ModelInfo:
+    name: str  # fastembed model id (pass to --model)
+    dim: int
+    size_gb: float
+    code_specific: bool
+    note: str
+
+
+# Ordered best-first for code search among models fastembed can load locally. fastembed
+# does not (yet) ship CodeRankEmbed/CodeSage; those need a custom ONNX export — tracked as
+# a follow-up. jina-embeddings-v2-base-code is the strongest code-specific option available
+# out of the box.
+RECOMMENDED: Tuple[ModelInfo, ...] = (
+    ModelInfo(
+        "jinaai/jina-embeddings-v2-base-code",
+        768,
+        0.64,
+        True,
+        "Code-specific, 8192-ctx, Apache-2.0. Best out-of-the-box local code retriever.",
+    ),
+    ModelInfo(
+        "BAAI/bge-base-en-v1.5",
+        768,
+        0.21,
+        False,
+        "General text. Stronger than bge-small; modest code retrieval.",
+    ),
+    ModelInfo(
+        "snowflake/snowflake-arctic-embed-m-long",
+        768,
+        0.54,
+        False,
+        "General, long-context (base model behind CodeRankEmbed).",
+    ),
+    ModelInfo(
+        "nomic-ai/nomic-embed-text-v1.5",
+        768,
+        0.52,
+        False,
+        "General, long-context, Matryoshka dims.",
+    ),
+    ModelInfo(
+        "BAAI/bge-small-en-v1.5",
+        384,
+        0.067,
+        False,
+        "Current default. Smallest/fastest; weakest on code (~45.8 CoIR).",
+    ),
+)
+
+
+def format_models() -> str:
+    """Human-readable table of recommended models for the CLI."""
+    rows = [("model", "dim", "size", "code?", "note")]
+    rows += [
+        (
+            m.name,
+            str(m.dim),
+            f"{m.size_gb:g}GB",
+            "yes" if m.code_specific else "no",
+            m.note,
+        )
+        for m in RECOMMENDED
+    ]
+    widths = [max(len(r[i]) for r in rows) for i in range(4)]
+    lines = []
+    for i, r in enumerate(rows):
+        head = "  ".join(r[j].ljust(widths[j]) for j in range(4))
+        lines.append(f"{head}  {r[4]}")
+        if i == 0:
+            lines.append("  ".join("-" * w for w in widths) + "  " + "-" * len(r[4]))
+    return "\n".join(lines)
diff --git a/coderag/eval/datasets/coderag_self.jsonl b/coderag/eval/datasets/coderag_self.jsonl
new file mode 100644
index 0000000..4575746
--- /dev/null
+++ b/coderag/eval/datasets/coderag_self.jsonl
@@ -0,0 +1,24 @@
+{"query": "where are duplicate or stale vectors removed when a file changes", "relevant_files": ["coderag/indexer.py"], "source": "curated"}
+{"query": "how is the FAISS index rebuilt from the SQLite source of truth", "relevant_files": ["coderag/store/vector_index.py"], "source": "curated"}
+{"query": "where is reciprocal rank fusion implemented", "relevant_files": ["coderag/retrieval/fusion.py"], "source": "curated"}
+{"query": "how are dense and lexical search results combined into one ranking", "relevant_files": ["coderag/retrieval/search.py"], "source": "curated"}
+{"query": "how does the debounced filesystem watcher trigger reindexing", "relevant_files": ["coderag/watch.py"], "source": "curated"}
+{"query": "where is symbol-aware chunking for Python using the ast module", "relevant_files": ["coderag/chunking/python_ast.py"], "source": "curated"}
+{"query": "how are functions and classes chunked for Go and Rust via tree-sitter", "relevant_files": ["coderag/chunking/treesitter.py"], "source": "curated"}
+{"query": "where is BM25 keyword search over SQLite FTS5 implemented", "relevant_files": ["coderag/store/sqlite_store.py"], "source": "curated"}
+{"query": "how does the HTTP API require an API key for authentication", "relevant_files": ["coderag/surfaces/http_api.py"], "source": "curated"}
+{"query": "how is an LLM answer streamed over the retrieved code chunks", "relevant_files": ["coderag/llm.py"], "source": "curated"}
+{"query": "where is the OpenAI-compatible embedding provider implemented", "relevant_files": ["coderag/embeddings/openai_provider.py"], "source": "curated"}
+{"query": "how does configuration load from environment variables and a dotenv file", "relevant_files": ["coderag/config.py"], "source": "curated"}
+{"query": "where is the command line search subcommand defined", "relevant_files": ["coderag/surfaces/cli.py"], "source": "curated"}
+{"query": "how does the vector index switch from flat to IVF as the corpus grows", "relevant_files": ["coderag/store/vector_index.py"], "source": "curated"}
+{"query": "where is content hashing used to skip unchanged files on reindex", "relevant_files": ["coderag/indexer.py"], "source": "curated"}
+{"query": "how are file contents served safely for only indexed files", "relevant_files": ["coderag/api.py"], "source": "curated"}
+{"query": "where does the web UI render results with syntax highlighting", "relevant_files": ["coderag/surfaces/webui.py"], "source": "curated"}
+{"query": "how is an oversized function split into smaller line windows", "relevant_files": ["coderag/chunking/base.py"], "source": "curated"}
+{"query": "where is the database table schema for chunks and files defined", "relevant_files": ["coderag/store/schema.py"], "source": "curated"}
+{"query": "how does a model or embedding dimension change get detected and trigger a rebuild", "relevant_files": ["coderag/store/sqlite_store.py", "coderag/api.py"], "source": "curated"}
+{"query": "where is the deterministic offline fake embedding provider for tests", "relevant_files": ["coderag/embeddings/fake_provider.py"], "source": "curated"}
+{"query": "how are file extensions mapped to programming languages for chunking", "relevant_files": ["coderag/chunking/languages.py"], "source": "curated"}
+{"query": "where is text split into lines without collapsing carriage returns", "relevant_files": ["coderag/_lines.py"], "source": "curated"}
+{"query": "how is the incremental indexing done with parallel workers", "relevant_files": ["coderag/indexer.py"], "source": "curated"}
diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py
index 25ca920..ea7bb1f 100644
--- a/coderag/surfaces/cli.py
+++ b/coderag/surfaces/cli.py
@@ -95,6 +95,13 @@ def cmd_eval(args: argparse.Namespace) -> int:
 
     cfg = _build_config(args)
 
+    # `coderag eval --list-models` — show recommended local embedding models.
+    if args.list_models:
+        from coderag.embeddings.models import format_models
+
+        print(format_models())
+        return 0
+
     # `coderag eval build` — mine a dataset from the repo's git history.
     if args.build:
         cases = ev.build_from_git(
@@ -275,6 +282,11 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_eval.add_argument("--json", action="store_true", help="Emit JSON.")
     p_eval.add_argument("--quiet", action="store_true", help="Hide the progress bar.")
+    p_eval.add_argument(
+        "--list-models",
+        action="store_true",
+        help="List recommended local embedding models for code search and exit.",
+    )
     _add_common(p_eval)
     p_eval.set_defaults(func=cmd_eval)
 
diff --git a/docs/eval.md b/docs/eval.md
index c1858e1..bd9772c 100644
--- a/docs/eval.md
+++ b/docs/eval.md
@@ -49,6 +49,45 @@ dataset). The usual `--watched-dir` / `--store-dir` / `--provider` / `--model` f
 > evaluating, e.g. `coderag eval --dataset … --compare --model BAAI/bge-small-en-v1.5` then
 > again with a candidate like CodeRankEmbed to measure the lift.
 
+## Measured results (this repo)
+
+Move #1 experiment — current default vs a code-specific model — run with
+`scripts/bench_embedders.py` on the curated dataset
+(`coderag/eval/datasets/coderag_self.jsonl`, 24 natural-language → file cases, 90 files /
+553 chunks):
+
+```
+mode                                   n   MRR    R@1    R@5    R@10   nDCG@10  Hit@10
+bge-small-en-v1.5 · dense              24  0.784  0.604  0.938  1.000  0.831    1.000
+bge-small-en-v1.5 · bm25               24  0.751  0.604  0.854  1.000  0.802    1.000
+bge-small-en-v1.5 · hybrid             24  0.822  0.688  1.000  1.000  0.860    1.000
+jina-embeddings-v2-base-code · dense   24  0.759  0.583  0.938  0.979  0.810    1.000
+jina-embeddings-v2-base-code · bm25    24  0.751  0.604  0.854  1.000  0.802    1.000
+jina-embeddings-v2-base-code · hybrid  24  0.835  0.729  0.938  0.958  0.858    0.958
+```
+
+Two findings, one expected and one cautionary:
+
+1. **Hybrid beats either modality alone on MRR, for both models** (bge hybrid 0.822 > dense
+   0.784 > bm25 0.751; jina hybrid 0.835 > dense 0.759 > bm25 0.751). This is the core
+   thesis — fusion is the differentiator vs pure-grep agents and single-modality embedding
+   tools. The identical BM25 rows across models are a sanity check that the harness isolates
+   the embedding variable correctly.
+2. **The code-specific model did *not* clearly beat bge-small here.** jina-code's hybrid is
+   marginally ahead on MRR/R@1 but behind on R@5/R@10/Hit@10. The reason is saturation: on a
+   90-file repo with lexical-rich NL queries, bge-small already hits Hit@10 = 1.0 and
+   R@5 ≈ 1.0 — there's no recall headroom for a better model to capture. The large published
+   CoIR gap (bge ~45.8 vs code models ~60) is measured on big, hard, cross-language corpora
+   and **does not transfer** to a small single-repo file-localization task.
+
+**Takeaways:** (a) don't flip the default to a 10×-larger model on this evidence — keep
+bge-small, offer code models as an option (`coderag eval --list-models`); (b) telling
+embedders apart needs a **larger/harder benchmark** (a big external repo, or harder
+cross-file/conceptual queries with less lexical leakage); (c) the remaining headroom is at
+**rank 1** (R@1 ≈ 0.6–0.73), which is exactly what a cross-encoder reranker (strategy move
+#2) targets. This is the harness doing its job: it stopped a plausible-sounding upgrade that
+the data doesn't support.
+
 ## Dataset format
 
 JSONL, one case per line:
diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md
index 3465e71..a794c69 100644
--- a/docs/research/code-retrieval-strategy.md
+++ b/docs/research/code-retrieval-strategy.md
@@ -53,6 +53,15 @@ needed files in nearly half of instances. [H] (arXiv 2310.06770) Better retrieva
 
 ---
 
+> **Update (measured).** The eval harness (§0) is now built, and move #1 was tested on this
+> repo: `bge-small` vs `jina-embeddings-v2-base-code` (fastembed does **not** ship
+> CodeRankEmbed — it needs a custom ONNX export, tracked as follow-up). On a 24-case curated
+> NL→file set the code-specific model did **not** clearly win — the small repo saturates
+> (bge already at Hit@10 = 1.0), so the published CoIR gap didn't transfer. The validated
+> win was **hybrid > dense > BM25 for both models**. See [docs/eval.md](../eval.md). Net: keep
+> bge-small as default; a model swap needs a larger/harder benchmark to justify, and the
+> rank-1 headroom points at the reranker (§2) as the better next bet.
+
 ## 1. Upgrade the embedding model (highest single accuracy jump)
 
 **Current state:** CodeRAG defaults to `BAAI/bge-small-en-v1.5`, which scores only **~45.8 CoIR
diff --git a/scripts/bench_embedders.py b/scripts/bench_embedders.py
new file mode 100644
index 0000000..ad39d47
--- /dev/null
+++ b/scripts/bench_embedders.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+"""Benchmark several local embedding models on a code-retrieval eval dataset.
+
+For each model this indexes the target repo into an isolated store, then scores
+dense-only / BM25-only / hybrid retrieval with the eval harness, and prints one combined
+table. BM25 is model-independent (a useful constant baseline across rows).
+
+Usage:
+    python scripts/bench_embedders.py --repo . --dataset coderag/eval/datasets/coderag_self.jsonl \
+        --models BAAI/bge-small-en-v1.5,jinaai/jina-embeddings-v2-base-code
+
+This downloads each model once (no API key). Run it as the move #1 experiment: compare the
+current default against a code-specific candidate and read the lift off the hybrid rows.
+"""
+
+from __future__ import annotations
+
+import argparse
+import tempfile
+from pathlib import Path
+
+from coderag.api import CodeRAG
+from coderag.config import Config
+from coderag.eval import compare_modes, load_dataset
+from coderag.eval.harness import EvalResult
+
+
+def _label(model: str, mode: str) -> str:
+    short = model.split("/")[-1]
+    return f"{short} · {mode}"
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--repo", default=".", help="Codebase to index/evaluate.")
+    ap.add_argument("--dataset", required=True, help="JSONL eval dataset.")
+    ap.add_argument(
+        "--models",
+        default="BAAI/bge-small-en-v1.5,jinaai/jina-embeddings-v2-base-code",
+        help="Comma-separated fastembed model ids.",
+    )
+    ap.add_argument("--ks", default="1,5,10", help="Comma-separated cutoffs.")
+    ap.add_argument("--level", choices=("file", "symbol"), default="file")
+    args = ap.parse_args()
+
+    repo = Path(args.repo).resolve()
+    cases = load_dataset(args.dataset)
+    ks = tuple(int(k) for k in args.ks.split(","))
+    models = [m.strip() for m in args.models.split(",") if m.strip()]
+
+    rows: list[EvalResult] = []
+    for model in models:
+        print(f"\n=== {model} ===  (indexing {repo.name} ...)")
+        with tempfile.TemporaryDirectory(prefix="coderag-bench-") as store:
+            cfg = Config.from_env(
+                provider="fastembed",
+                model=model,
+                watched_dir=repo,
+                store_dir=Path(store),
+            )
+            cr = CodeRAG(cfg)
+            stats = cr.index()
+            print(f"    {stats.total_files} files / {stats.total_chunks} chunks")
+            for r in compare_modes(cr, cases, ks=ks, level=args.level):
+                r.label = _label(model, r.label)
+                rows.append(r)
+            cr.close()
+
+    from coderag.eval.harness import format_table
+
+    print(f"\n{len(cases)} case(s), level={args.level}\n")
+    print(format_table(rows))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_models_registry.py b/tests/test_models_registry.py
new file mode 100644
index 0000000..5fff29b
--- /dev/null
+++ b/tests/test_models_registry.py
@@ -0,0 +1,25 @@
+"""Tests for the recommended-embedding-model registry (offline, no downloads)."""
+
+from __future__ import annotations
+
+from coderag.embeddings.models import RECOMMENDED, format_models
+
+
+def test_registry_is_nonempty_and_well_formed():
+    assert RECOMMENDED
+    for m in RECOMMENDED:
+        assert m.name and "/" in m.name  # looks like a HF model id
+        assert m.dim > 0
+        assert m.size_gb > 0
+        assert m.note
+
+
+def test_default_model_is_listed():
+    # The current default must appear so users can see its trade-off.
+    assert any(m.name == "BAAI/bge-small-en-v1.5" for m in RECOMMENDED)
+
+
+def test_format_models_renders_table():
+    out = format_models()
+    assert "model" in out and "code?" in out
+    assert "jina-embeddings-v2-base-code" in out

From b15526b827ed138736dac5b2351e7c557ab020d9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 08:06:25 +0000
Subject: [PATCH 4/5] feat(retrieval): optional local cross-encoder reranker
 (move #2)

Two-stage retrieve-then-rerank: first-stage hybrid (dense+BM25+RRF) for
recall, then a local ONNX cross-encoder re-scores the top candidates jointly
with the query for top-of-list precision. Opt-in (config.rerank, default off)
so the zero-config engine stays tiny/fast; uses fastembed's TextCrossEncoder
(default Xenova/ms-marco-MiniLM-L-12-v2) so it needs no API key and no new
dependency.

- coderag/retrieval/rerank.py: Reranker protocol + CrossEncoderReranker +
  get_reranker() factory (mirrors the embeddings provider pattern).
- HybridSearcher: deeper candidate pool when reranking, re-score, reorder,
  trim to top_k; reranker injected by the facade from config.
- config: rerank / rerank_model / rerank_candidates (+ CODERAG_RERANK* env).
- status() reports rerank state; eval compare_modes adds a hybrid+rerank row;
  `coderag eval --rerank` and `bench_embedders.py --rerank`.
- Tests via a deterministic fake reranker (offline).

Measured (this repo, 24 cases): the generic ms-marco reranker gave no lift /
a marginal regression. The benchmark is saturated (hybrid already R@5~1.0)
and ms-marco is web-trained, not code. Documented in docs/eval.md: the
critical path is now a larger/harder benchmark, after which a code-aware
reranker should be re-tested.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 coderag/api.py                           |  9 ++-
 coderag/config.py                        | 12 +++
 coderag/eval/harness.py                  | 12 +++
 coderag/retrieval/rerank.py              | 79 +++++++++++++++++++
 coderag/retrieval/search.py              | 34 +++++++-
 coderag/surfaces/cli.py                  | 22 +++++-
 docs/eval.md                             | 40 ++++++++++
 docs/research/code-retrieval-strategy.md |  9 +++
 scripts/bench_embedders.py               | 14 +++-
 tests/test_rerank.py                     | 99 ++++++++++++++++++++++++
 10 files changed, 320 insertions(+), 10 deletions(-)
 create mode 100644 coderag/retrieval/rerank.py
 create mode 100644 tests/test_rerank.py

diff --git a/coderag/api.py b/coderag/api.py
index c2eb35c..aaf6651 100644
--- a/coderag/api.py
+++ b/coderag/api.py
@@ -93,10 +93,15 @@ def indexer(self) -> "Indexer":
     @property
     def searcher(self) -> "HybridSearcher":
         if self._searcher is None:
+            from coderag.retrieval.rerank import get_reranker
             from coderag.retrieval.search import HybridSearcher
 
             self._searcher = HybridSearcher(
-                self.config, self.provider, self.store, self.vectors
+                self.config,
+                self.provider,
+                self.store,
+                self.vectors,
+                reranker=get_reranker(self.config),
             )
         return self._searcher
 
@@ -177,6 +182,8 @@ def status(self) -> dict:
             ),
             "llm_base_url": self.config.openai_base_url or "",
             "index_type": self.vectors.kind,
+            "rerank": self.config.rerank,
+            "rerank_model": self.config.rerank_model if self.config.rerank else "",
             "store_dir": str(self.config.store_dir),
             "watched_dir": str(self.config.watched_dir),
             "total_files": stats.total_files,
diff --git a/coderag/config.py b/coderag/config.py
index 6207b5c..ae8f411 100644
--- a/coderag/config.py
+++ b/coderag/config.py
@@ -135,6 +135,13 @@ class Config:
     dense_weight: float = 1.0
     lexical_weight: float = 1.0
 
+    # --- Reranking (optional two-stage retrieve-then-rerank) ---
+    # Off by default so the zero-config engine stays tiny/fast. When on, the top
+    # ``rerank_candidates`` fused hits are re-scored by a local cross-encoder and reordered.
+    rerank: bool = False
+    rerank_model: str = "Xenova/ms-marco-MiniLM-L-12-v2"  # local ONNX cross-encoder
+    rerank_candidates: int = 50  # fused hits to rerank before trimming to top_k
+
     # --- Indexing throughput ---
     embed_batch_size: int = 64
     index_workers: int = 4
@@ -202,6 +209,11 @@ def from_env(cls, **overrides: object) -> "Config":
             rrf_k=_env_int("CODERAG_RRF_K", cls.rrf_k),
             dense_weight=_env_float("CODERAG_DENSE_WEIGHT", cls.dense_weight),
             lexical_weight=_env_float("CODERAG_LEXICAL_WEIGHT", cls.lexical_weight),
+            rerank=_env_bool("CODERAG_RERANK", cls.rerank),
+            rerank_model=_env_str("CODERAG_RERANK_MODEL", cls.rerank_model),
+            rerank_candidates=_env_int(
+                "CODERAG_RERANK_CANDIDATES", cls.rerank_candidates
+            ),
             embed_batch_size=_env_int("CODERAG_EMBED_BATCH", cls.embed_batch_size),
             index_workers=_env_int("CODERAG_WORKERS", cls.index_workers),
             llm_provider=_env_str("CODERAG_LLM_PROVIDER", cls.llm_provider),
diff --git a/coderag/eval/harness.py b/coderag/eval/harness.py
index aa112b5..19e1b38 100644
--- a/coderag/eval/harness.py
+++ b/coderag/eval/harness.py
@@ -16,6 +16,7 @@
 
 if TYPE_CHECKING:
     from coderag.api import CodeRAG
+    from coderag.retrieval.rerank import Reranker
 
 # A retriever: given a query and a result count, return ranked hits (best-first).
 SearchFn = Callable[[str, int], List[SearchHit]]
@@ -119,11 +120,14 @@ def compare_modes(
     ks: Sequence[int] = DEFAULT_KS,
     level: str = "file",
     modes: Sequence[Tuple[str, float, float]] = DEFAULT_MODES,
+    reranker: Optional["Reranker"] = None,
 ) -> List[EvalResult]:
     """Score dense-only vs BM25-only vs hybrid on the already-built index of ``cr``.
 
     The index is mode-independent — the dense/lexical weights only affect query-time RRF
     fusion — so we reuse one provider/store/vector index and just swap the fusion weights.
+    When ``reranker`` is given, an extra ``hybrid+rerank`` row is appended so the lift from
+    two-stage reranking is directly comparable on the same index.
     """
     from coderag.retrieval.search import HybridSearcher
 
@@ -134,6 +138,14 @@ def compare_modes(
         results.append(
             evaluate(searcher.search, cases, label=label, ks=ks, level=level)
         )
+    if reranker is not None:
+        cfg = cr.config.with_overrides(dense_weight=1.0, lexical_weight=1.0)
+        searcher = HybridSearcher(
+            cfg, cr.provider, cr.store, cr.vectors, reranker=reranker
+        )
+        results.append(
+            evaluate(searcher.search, cases, label="hybrid+rerank", ks=ks, level=level)
+        )
     return results
 
 
diff --git a/coderag/retrieval/rerank.py b/coderag/retrieval/rerank.py
new file mode 100644
index 0000000..8465cbb
--- /dev/null
+++ b/coderag/retrieval/rerank.py
@@ -0,0 +1,79 @@
+"""Optional second-stage reranking for two-stage retrieve-then-rerank search.
+
+First-stage hybrid retrieval (dense + BM25 + RRF) is tuned for *recall* — get the right
+chunks into a candidate pool cheaply. A cross-encoder reranker then scores each candidate
+*jointly* with the query (not via independent embeddings), which is far more precise at the
+top of the list. The research projects this as the single highest-ROI accuracy add-on for a
+local engine (+5–15 nDCG/MRR, ~30 ms/query on CPU, small ONNX model); unconfirmed on code.
+
+It's **opt-in** (``config.rerank``) so the zero-config default stays tiny and fast. The
+default model — ``Xenova/ms-marco-MiniLM-L-12-v2`` (~0.12 GB ONNX) — runs locally via
+fastembed's ``TextCrossEncoder``, so enabling it needs no API key and no new dependency.
+"""
+
+from __future__ import annotations
+
+import logging
+from functools import cached_property
+from pathlib import Path
+from typing import Any, List, Optional, Protocol, Sequence, runtime_checkable
+
+from coderag.config import Config
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_RERANK_MODEL = "Xenova/ms-marco-MiniLM-L-12-v2"
+
+
+@runtime_checkable
+class Reranker(Protocol):
+    """Scores how well each document answers the query (higher = more relevant)."""
+
+    @property
+    def model_id(self) -> str: ...
+
+    def rerank(self, query: str, documents: Sequence[str]) -> List[float]:
+        """Return one relevance score per document, aligned to input order."""
+
+
+class CrossEncoderReranker:
+    """Local cross-encoder reranker backed by fastembed's ``TextCrossEncoder`` (ONNX)."""
+
+    name = "cross-encoder"
+
+    def __init__(
+        self, model: str = DEFAULT_RERANK_MODEL, cache_dir: Optional[Path] = None
+    ) -> None:
+        self._model_name = model
+        self._cache_dir = str(cache_dir) if cache_dir else None
+
+    @cached_property
+    def _encoder(self) -> Any:
+        from fastembed.rerank.cross_encoder import TextCrossEncoder
+
+        logger.info("Loading reranker %s ...", self._model_name)
+        return TextCrossEncoder(self._model_name, cache_dir=self._cache_dir)
+
+    @property
+    def model_id(self) -> str:
+        return self._model_name
+
+    def rerank(self, query: str, documents: Sequence[str]) -> List[float]:
+        if not documents:
+            return []
+        return [float(s) for s in self._encoder.rerank(query, list(documents))]
+
+
+def get_reranker(config: Config) -> Optional[Reranker]:
+    """Build the reranker if ``config.rerank`` is on, else ``None`` (reranking disabled)."""
+    if not config.rerank:
+        return None
+    return CrossEncoderReranker(config.rerank_model, cache_dir=config.cache_dir)
+
+
+__all__ = [
+    "CrossEncoderReranker",
+    "DEFAULT_RERANK_MODEL",
+    "Reranker",
+    "get_reranker",
+]
diff --git a/coderag/retrieval/search.py b/coderag/retrieval/search.py
index 6e8de53..ea6482a 100644
--- a/coderag/retrieval/search.py
+++ b/coderag/retrieval/search.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Dict, List
+from typing import TYPE_CHECKING, Dict, List, Optional
 
 from coderag.config import Config
 from coderag.embeddings import EmbeddingProvider
@@ -12,6 +12,9 @@
 from coderag.store.vector_index import FaissVectorIndex
 from coderag.types import SearchHit
 
+if TYPE_CHECKING:
+    from coderag.retrieval.rerank import Reranker
+
 logger = logging.getLogger(__name__)
 
 
@@ -22,17 +25,23 @@ def __init__(
         provider: EmbeddingProvider,
         store: SQLiteStore,
         vectors: FaissVectorIndex,
+        reranker: Optional["Reranker"] = None,
     ) -> None:
         self.config = config
         self.provider = provider
         self.store = store
         self.vectors = vectors
+        self.reranker = reranker
 
     def search(self, query: str, top_k: int) -> List[SearchHit]:
         if not query or not query.strip():
             return []
 
-        fetch_k = max(self.config.fetch_k, top_k)
+        # When reranking, pull a deeper candidate pool to rerank, then trim to top_k.
+        pool = top_k
+        if self.reranker is not None:
+            pool = max(self.config.rerank_candidates, top_k)
+        fetch_k = max(self.config.fetch_k, pool)
 
         # Dense retrieval.
         qvec = self.provider.embed_query(query)
@@ -46,12 +55,12 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
         # Lexical retrieval (BM25 over FTS5).
         lexical_ranked = [cid for cid, _ in self.store.fts_search(query, fetch_k)]
 
-        # Fuse and trim.
+        # Fuse, then trim to the candidate pool (top_k, or deeper when reranking).
         fused = reciprocal_rank_fusion(
             [dense_ranked, lexical_ranked],
             k=self.config.rrf_k,
             weights=[self.config.dense_weight, self.config.lexical_weight],
-        )[:top_k]
+        )[:pool]
         if not fused:
             return []
 
@@ -77,4 +86,21 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
                     similarity=similarity.get(cid, 0.0),
                 )
             )
+
+        if self.reranker is not None:
+            hits = self._rerank(query, hits)
+        return hits[:top_k]
+
+    def _rerank(self, query: str, hits: List[SearchHit]) -> List[SearchHit]:
+        """Re-score candidates jointly with the query and sort by the new score.
+
+        The cross-encoder score replaces ``score`` so order and score agree; the dense
+        cosine stays in ``similarity``. Raises ``ValueError`` on a score-count mismatch.
+        """
+        if not hits or self.reranker is None:
+            return hits
+        scores = self.reranker.rerank(query, [h.text for h in hits])
+        for hit, s in zip(hits, scores, strict=True):  # one score per hit, else raise
+            hit.score = float(s)
+        hits.sort(key=lambda h: h.score, reverse=True)
         return hits
diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py
index ea7bb1f..36323ec 100644
--- a/coderag/surfaces/cli.py
+++ b/coderag/surfaces/cli.py
@@ -123,15 +123,24 @@ def cmd_eval(args: argparse.Namespace) -> int:
         return 1
 
     ks = tuple(int(k) for k in args.ks.split(","))
+    # --rerank forces the optional two-stage cross-encoder on for this run.
+    if args.rerank:
+        cfg = cfg.with_overrides(rerank=True)
     cr = CodeRAG(cfg)
     cr.index()  # ensure the index is built / up to date before scoring
 
     if args.compare:
-        results = ev.compare_modes(cr, cases, ks=ks, level=args.level)
+        reranker = None
+        if args.rerank:
+            from coderag.retrieval.rerank import get_reranker
+
+            reranker = get_reranker(cfg)
+        results = ev.compare_modes(
+            cr, cases, ks=ks, level=args.level, reranker=reranker
+        )
     else:
-        results = [
-            ev.evaluate(cr.search, cases, label="hybrid", ks=ks, level=args.level)
-        ]
+        label = "hybrid+rerank" if args.rerank else "hybrid"
+        results = [ev.evaluate(cr.search, cases, label=label, ks=ks, level=args.level)]
 
     if args.json:
         print(json.dumps([r.as_dict() for r in results], indent=2))
@@ -282,6 +291,11 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_eval.add_argument("--json", action="store_true", help="Emit JSON.")
     p_eval.add_argument("--quiet", action="store_true", help="Hide the progress bar.")
+    p_eval.add_argument(
+        "--rerank",
+        action="store_true",
+        help="Enable the local cross-encoder reranker (two-stage retrieve-then-rerank).",
+    )
     p_eval.add_argument(
         "--list-models",
         action="store_true",
diff --git a/docs/eval.md b/docs/eval.md
index bd9772c..db53254 100644
--- a/docs/eval.md
+++ b/docs/eval.md
@@ -30,8 +30,15 @@ coderag eval --dataset coderag-eval.jsonl
 
 # 3. Contrast dense-only vs BM25-only vs hybrid on one index:
 coderag eval --dataset coderag-eval.jsonl --compare
+
+# 4. Add the optional two-stage cross-encoder reranker (adds a hybrid+rerank row):
+coderag eval --dataset coderag-eval.jsonl --compare --rerank
 ```
 
+Reranking is opt-in at search time too: set `CODERAG_RERANK=1` (model via
+`CODERAG_RERANK_MODEL`, pool depth via `CODERAG_RERANK_CANDIDATES`) and every `coderag
+search` / API / UI query runs two-stage retrieve-then-rerank.
+
 ```
 mode    n   MRR    R@1    R@5    R@10   nDCG@1  nDCG@5  nDCG@10  Hit@1  Hit@5  Hit@10
 ------  --  -----  -----  -----  -----  ------  ------  -------  -----  -----  ------
@@ -88,6 +95,39 @@ cross-file/conceptual queries with less lexical leakage); (c) the remaining head
 #2) targets. This is the harness doing its job: it stopped a plausible-sounding upgrade that
 the data doesn't support.
 
+### Reranker experiment (move #2)
+
+Adding the optional cross-encoder reranker (`--rerank`, default
+`Xenova/ms-marco-MiniLM-L-12-v2`) on the same 24-case dataset:
+
+```
+mode                               MRR    R@1    R@5    R@10   nDCG@10  Hit@10
+bge-small-en-v1.5 · dense          0.805  0.646  0.938  1.000  0.845    1.000
+bge-small-en-v1.5 · bm25           0.747  0.604  0.812  1.000  0.798    1.000
+bge-small-en-v1.5 · hybrid         0.801  0.646  1.000  1.000  0.845    1.000
+bge-small-en-v1.5 · hybrid+rerank  0.790  0.646  0.958  1.000  0.836    1.000
+```
+
+**The reranker did not help here — it marginally hurt** (hybrid+rerank MRR 0.790 < hybrid
+0.801; R@5 0.958 < 1.000). Same lesson as move #1, plus a model-fit issue:
+
+1. **Saturation, again.** Hybrid already gets R@5 = 1.0 / Hit@10 = 1.0 and the headroom is
+   only at rank 1 (R@1 = 0.646). A reranker reorders *within* the candidate pool, so on
+   file-level metrics where the right files are already in the pool, it can only shuffle —
+   and any mistake shows up as a small regression.
+2. **Model fit.** `ms-marco-MiniLM` is trained on web-passage relevance, not code. The
+   research explicitly flagged that small-cross-encoder *code* reranking lift is inferred,
+   not measured — this run is consistent with that caveat. A stronger, ideally code-aware
+   reranker (`CODERAG_RERANK_MODEL=jinaai/jina-reranker-v2-base-multilingual` or
+   `BAAI/bge-reranker-base`) is worth trying, but those are larger.
+
+**Conclusion across moves #1 and #2:** the recurring blocker is that *this repo's benchmark
+is too small and saturated to discriminate any retrieval improvement*. The feature is built,
+tested, and opt-in, but **proving its value requires a larger, harder, non-saturated
+benchmark** (a 1k+-file external repo and/or symbol-level + cross-file conceptual queries).
+That is the true critical path for the "win the eval" objective — accuracy techniques can't
+be validated until the benchmark has headroom.
+
 ## Dataset format
 
 JSONL, one case per line:
diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md
index a794c69..2c0d6f8 100644
--- a/docs/research/code-retrieval-strategy.md
+++ b/docs/research/code-retrieval-strategy.md
@@ -90,6 +90,15 @@ under OpenRAIL++-M, 68.53 CoIR). [H] Voyage-code-3 is API-only — reference poi
 
 ---
 
+> **Update (measured & built).** The optional two-stage reranker is implemented
+> (`config.rerank`, `coderag/retrieval/rerank.py`, fastembed `TextCrossEncoder`, zero new
+> deps) and tested. On this repo's saturated 24-case set it gave **no lift / a marginal
+> regression** with the generic `ms-marco-MiniLM` model — consistent with the caveat below
+> that small-cross-encoder *code* lift is inferred, not measured, and with the benchmark
+> having no headroom (hybrid already R@5≈1.0). See [docs/eval.md](../eval.md). The blocker is
+> now clearly the **benchmark**, not the technique: it must get bigger/harder before #1 or #2
+> can show their value. A code-aware reranker should be re-tested there.
+
 ## 2. Add a local cross-encoder reranker (highest-ROI bolt-on)
 
 The evidence converges: **a small ONNX cross-encoder reranking the top-100 down to top-8 is the
diff --git a/scripts/bench_embedders.py b/scripts/bench_embedders.py
index ad39d47..5b66c88 100644
--- a/scripts/bench_embedders.py
+++ b/scripts/bench_embedders.py
@@ -41,6 +41,11 @@ def main() -> int:
     )
     ap.add_argument("--ks", default="1,5,10", help="Comma-separated cutoffs.")
     ap.add_argument("--level", choices=("file", "symbol"), default="file")
+    ap.add_argument(
+        "--rerank",
+        action="store_true",
+        help="Also score a hybrid+rerank row per model (local cross-encoder).",
+    )
     args = ap.parse_args()
 
     repo = Path(args.repo).resolve()
@@ -61,7 +66,14 @@ def main() -> int:
             cr = CodeRAG(cfg)
             stats = cr.index()
             print(f"    {stats.total_files} files / {stats.total_chunks} chunks")
-            for r in compare_modes(cr, cases, ks=ks, level=args.level):
+            reranker = None
+            if args.rerank:
+                from coderag.retrieval.rerank import get_reranker
+
+                reranker = get_reranker(cfg.with_overrides(rerank=True))
+            for r in compare_modes(
+                cr, cases, ks=ks, level=args.level, reranker=reranker
+            ):
                 r.label = _label(model, r.label)
                 rows.append(r)
             cr.close()
diff --git a/tests/test_rerank.py b/tests/test_rerank.py
new file mode 100644
index 0000000..a2da22b
--- /dev/null
+++ b/tests/test_rerank.py
@@ -0,0 +1,99 @@
+"""Tests for two-stage retrieve-then-rerank (offline via a fake reranker).
+
+These never load the real cross-encoder; they verify the searcher's two-stage wiring:
+deeper candidate pool, re-scoring, reordering, and trimming to top_k.
+"""
+
+from __future__ import annotations
+
+from typing import List, Sequence
+
+from coderag.api import CodeRAG
+from coderag.config import Config
+from coderag.eval import EvalCase, compare_modes
+from coderag.retrieval.rerank import get_reranker
+from coderag.retrieval.search import HybridSearcher
+from tests.conftest import write
+
+
+class KeywordReranker:
+    """Deterministic fake reranker: score = count of query words present in the doc."""
+
+    model_id = "fake-reranker"
+
+    def rerank(self, query: str, documents: Sequence[str]) -> List[float]:
+        terms = query.lower().split()
+        return [float(sum(t in doc.lower() for t in terms)) for doc in documents]
+
+
+def _indexed(config: Config) -> CodeRAG:
+    config.watched_dir.mkdir(parents=True, exist_ok=True)
+    write(
+        config.watched_dir / "auth.py",
+        "def authenticate_user(token):\n"
+        "    '''Validate a session token and return the user.'''\n"
+        "    return verify(token)\n",
+    )
+    write(
+        config.watched_dir / "math_utils.py",
+        "def add_numbers(a, b):\n    return a + b\n",
+    )
+    cr = CodeRAG(config)
+    cr.index()
+    return cr
+
+
+def test_get_reranker_off_by_default(config):
+    assert get_reranker(config) is None
+
+
+def test_get_reranker_built_when_enabled(config):
+    r = get_reranker(config.with_overrides(rerank=True))
+    assert r is not None
+    assert r.model_id  # default model id present
+
+
+def test_reranker_reorders_and_sets_score(config):
+    cr = _indexed(config)
+    searcher = HybridSearcher(
+        cr.config, cr.provider, cr.store, cr.vectors, reranker=KeywordReranker()
+    )
+    hits = searcher.search("validate session token", top_k=2)
+    assert hits
+    # The auth chunk contains all three query words -> must rank first after rerank.
+    assert hits[0].path == "auth.py"
+    # Rerank scores (keyword-overlap counts here) must come back sorted best-first.
+    assert hits[0].score >= hits[-1].score
+
+
+def test_rerank_trims_to_top_k(config):
+    cr = _indexed(config)
+    searcher = HybridSearcher(
+        cr.config, cr.provider, cr.store, cr.vectors, reranker=KeywordReranker()
+    )
+    assert len(searcher.search("token", top_k=1)) == 1
+
+
+def test_reranker_empty_query(config):
+    cr = _indexed(config)
+    searcher = HybridSearcher(
+        cr.config, cr.provider, cr.store, cr.vectors, reranker=KeywordReranker()
+    )
+    assert searcher.search("   ", top_k=3) == []
+
+
+def test_compare_modes_adds_rerank_row(config):
+    cr = _indexed(config)
+    cases = [EvalCase("validate session token", ["auth.py"])]
+    results = compare_modes(cr, cases, ks=(1, 3), reranker=KeywordReranker())
+    labels = [r.label for r in results]
+    assert labels == ["dense", "bm25", "hybrid", "hybrid+rerank"]
+    rerank_res = results[-1]
+    assert rerank_res.hit[1] == 1.0  # keyword reranker nails the auth file at rank 1
+
+
+def test_status_reports_rerank(config):
+    cr = _indexed(config.with_overrides(rerank=True))
+    status = cr.status()
+    assert status["rerank"] is True
+    assert status["rerank_model"]

From a15f9f8043877380c9e92f5e18fc2d57eec3a06f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 08:12:07 +0000
Subject: [PATCH 5/5] refactor(eval): dedupe language->extension mapping;
 document eval in README

- Add chunking.languages.extensions_for() as the single canonical reverse
  lookup, and use it from the CLI and the git dataset miner instead of two
  separate hardcoded copies of the language->suffix table.
- Surface `coderag eval` in the README CLI list and link the eval + strategy
  docs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 README.md                     |  7 +++++++
 coderag/chunking/languages.py |  8 +++++++-
 coderag/eval/dataset.py       | 10 ++++++----
 coderag/surfaces/cli.py       | 18 +++---------------
 tests/test_eval.py            | 10 ++++++++++
 5 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 5298c06..8fb628d 100644
--- a/README.md
+++ b/README.md
@@ -79,8 +79,15 @@ coderag watch                     # index, then keep it live as files change
 coderag serve --port 8000         # run the HTTP API  (needs [server])
 coderag ui                        # launch the web UI (needs [ui])
 coderag status                    # index stats (files, chunks, model, index type)
+coderag eval --dataset d.jsonl --compare  # retrieval quality: dense vs BM25 vs hybrid
 ```
 
+> **Measuring retrieval quality.** `coderag eval` is a built-in harness for "did we surface
+> the right file/symbol?" — recall@k, MRR, nDCG@k at file or symbol level, with a git-history
+> dataset miner (`--build`), a dense/BM25/hybrid comparison (`--compare`), and an optional
+> cross-encoder rerank stage (`--rerank`). See [`docs/eval.md`](docs/eval.md) and the strategy
+> writeup in [`docs/research/code-retrieval-strategy.md`](docs/research/code-retrieval-strategy.md).
+
 ### Python library
 
 ```python
diff --git a/coderag/chunking/languages.py b/coderag/chunking/languages.py
index b5a4c66..c2cc5a4 100644
--- a/coderag/chunking/languages.py
+++ b/coderag/chunking/languages.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Optional
+from typing import Iterable, List, Optional
 
 # Languages for which we extract symbol-aware spans (function/class/method).
 # Python uses the stdlib ``ast``; the rest use tree-sitter.
@@ -55,3 +55,9 @@
 def detect_language(path: str | Path) -> Optional[str]:
     """Return the language for ``path``, or ``None`` if it should not be indexed."""
     return EXTENSION_TO_LANGUAGE.get(Path(path).suffix.lower())
+
+
+def extensions_for(languages: Iterable[str]) -> List[str]:
+    """File extensions that map to any of ``languages`` (the canonical reverse lookup)."""
+    wanted = set(languages)
+    return sorted(ext for ext, lang in EXTENSION_TO_LANGUAGE.items() if lang in wanted)
diff --git a/coderag/eval/dataset.py b/coderag/eval/dataset.py
index c38a547..86b85c1 100644
--- a/coderag/eval/dataset.py
+++ b/coderag/eval/dataset.py
@@ -113,10 +113,12 @@ def build_from_git(
     query (``min_query_len``).
     """
     repo = Path(repo)
-    exts = {
-        e if e.startswith(".") else f".{e}"
-        for e in (extensions or (".py", ".js", ".ts", ".tsx", ".go", ".rs", ".java"))
-    }
+    if extensions is None:
+        from coderag.chunking.languages import extensions_for
+        from coderag.config import DEFAULT_LANGUAGES
+
+        extensions = extensions_for(DEFAULT_LANGUAGES)
+    exts = {e if e.startswith(".") else f".{e}" for e in extensions}
 
     fmt = f"{_REC}%H{_FLD}%s{_FLD}%an"
     raw = _git(
diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py
index 36323ec..f5e9a01 100644
--- a/coderag/surfaces/cli.py
+++ b/coderag/surfaces/cli.py
@@ -104,10 +104,12 @@ def cmd_eval(args: argparse.Namespace) -> int:
 
     # `coderag eval build` — mine a dataset from the repo's git history.
     if args.build:
+        from coderag.chunking.languages import extensions_for
+
         cases = ev.build_from_git(
             cfg.watched_dir,
             max_cases=args.max_cases,
-            extensions=[e.lstrip(".") for e in _suffixes(cfg.languages)],
+            extensions=extensions_for(cfg.languages),
         )
         out = args.dataset or "coderag-eval.jsonl"
         ev.save_dataset(cases, out)
@@ -152,20 +154,6 @@ def cmd_eval(args: argparse.Namespace) -> int:
     return 0
 
 
-def _suffixes(languages: tuple) -> list:
-    """Map configured language names to file suffixes for dataset mining."""
-    table = {
-        "python": ".py",
-        "javascript": ".js",
-        "typescript": ".ts",
-        "tsx": ".tsx",
-        "go": ".go",
-        "rust": ".rs",
-        "java": ".java",
-    }
-    return [table[lang] for lang in languages if lang in table]
-
-
 def cmd_watch(args: argparse.Namespace) -> int:
     from coderag.watch import watch
 
diff --git a/tests/test_eval.py b/tests/test_eval.py
index 276d35f..baa6d26 100644
--- a/tests/test_eval.py
+++ b/tests/test_eval.py
@@ -190,6 +190,16 @@ def git(*args: str) -> None:
     assert cases[0].source == "git"
 
 
+def test_extensions_for_uses_canonical_map():
+    from coderag.chunking.languages import extensions_for
+
+    exts = extensions_for(("python", "go"))
+    assert ".py" in exts and ".go" in exts
+    assert ".rs" not in exts  # rust not requested
+    # Unknown language names contribute nothing rather than raising.
+    assert extensions_for(("nonsense",)) == []
+
+
 def test_build_from_git_skips_merges_and_short_subjects(tmp_path: Path):
     repo = tmp_path / "repo"
     repo.mkdir()