From 1dc12b4260d8bf045fc79cca39614a58d4ef688a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 08:37:16 +0000
Subject: [PATCH 1/3] feat(eval): symbol-level dataset mining (non-saturated
 benchmark)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

File-level eval on a small repo saturates (hybrid hits Hit@10=1.0), so it
cannot measure retrieval improvements. Symbol-level localization — finding the
right function/class, not just the file — has real headroom and discriminates.

- build_from_git(symbols=True) / `coderag eval --build --level symbol`:
  maps each commit's changed lines (zero-context diff hunks) to the symbols
  they touch, parsed from the file content *at that commit* via CodeRAG's own
  chunker, then intersected with the symbols present at HEAD so every
  ground-truth symbol is retrievable from the index. Off by default.
- Tests cover symbol extraction (only the changed function is reported) and
  the default-off behavior.

Result (10 symbol-level cases, this repo): the benchmark stops saturating
(Hit@10 ~0.5), and the previously-flat cross-encoder reranker now shows the
predicted lift — R@1 0.183->0.283 (+55%), MRR 0.420->0.514, nDCG@10
0.369->0.448. This validates move #2 and confirms the file-level null result
was a benchmark artifact. Documented in docs/eval.md and the strategy doc.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 coderag/eval/dataset.py                  | 90 ++++++++++++++++++++++++
 coderag/surfaces/cli.py                  |  1 +
 docs/eval.md                             | 41 +++++++++--
 docs/research/code-retrieval-strategy.md | 14 ++--
 tests/test_eval.py                       | 50 +++++++++++++
 5 files changed, 183 insertions(+), 13 deletions(-)

diff --git a/coderag/eval/dataset.py b/coderag/eval/dataset.py
index 86b85c1..0626505 100644
--- a/coderag/eval/dataset.py
+++ b/coderag/eval/dataset.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import json
+import re
 import subprocess
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -100,6 +101,7 @@ def build_from_git(
     max_files_per_commit: int = 5,
     min_query_len: int = 12,
     commit_scan_limit: int = 2000,
+    symbols: bool = False,
 ) -> List[EvalCase]:
     """Mine an eval dataset from a repo's commit history.
 
@@ -107,6 +109,10 @@ def build_from_git(
     (a) match ``extensions`` and (b) still exist at HEAD become the relevant set — so
     every ground-truth file is actually present in the index built from HEAD.
 
+    With ``symbols=True``, the functions/classes/methods touched by the commit (that still
+    exist at HEAD) are also recorded as ``relevant_symbols``, enabling the much harder —
+    and less saturation-prone — symbol-level eval (``coderag eval --level symbol``).
+
     Filtering mirrors SweRank/Agentless dataset construction: skip merges, reverts, and
     bot/automated commits; drop commits that touch too many files (``max_files_per_commit``
     — diffuse, weak signal) or none of the targeted extensions; and require a meaningful
@@ -153,10 +159,16 @@ def build_from_git(
         if not files or len(files) > max_files_per_commit:
             continue
 
+        relevant_symbols: List[str] = []
+        if symbols:
+            for f in files:
+                relevant_symbols.extend(_changed_symbols(repo, sha, f))
+
         cases.append(
             EvalCase(
                 query=subject,
                 relevant_files=files,
+                relevant_symbols=sorted(set(relevant_symbols)),
                 id=sha[:12],
                 source="git",
             )
@@ -175,3 +187,81 @@ def _is_usable_query(subject: str, min_len: int) -> bool:
 def _is_bot(author: str) -> bool:
     low = author.lower()
     return "bot" in low or low in {"dependabot", "github-actions", "renovate"}
+
+
+# Unified-diff hunk header: @@ -old[,n] +new[,n] @@ — we only need the new-side range.
+_HUNK = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
+
+
+def _diff_new_lines(repo: Path, sha: str, file: str) -> set[int]:
+    """Line numbers (new side) touched by ``sha`` in ``file``, from a zero-context diff."""
+    try:
+        diff = _git(repo, "show", "--unified=0", "--no-color", sha, "--", file)
+    except subprocess.CalledProcessError:
+        return set()
+    lines: set[int] = set()
+    for row in diff.splitlines():
+        m = _HUNK.match(row)
+        if not m:
+            continue
+        start = int(m.group(1))
+        count = int(m.group(2)) if m.group(2) is not None else 1
+        # Pure deletions (count 0) still implicate the symbol around the anchor line.
+        lines.update(range(start, start + max(count, 1)))
+    return lines
+
+
+def _symbols_covering(text: str, language: str, lines: set[int]) -> set[str]:
+    """Named symbols in ``text`` whose line span overlaps any of ``lines``."""
+    from coderag.chunking import chunk_file
+    from coderag.config import Config
+
+    if not text.strip() or not lines:
+        return set()
+    found: set[str] = set()
+    for chunk in chunk_file(text, language, Config()):
+        if chunk.symbol and chunk.kind != "window":
+            if any(chunk.start_line <= ln <= chunk.end_line for ln in lines):
+                found.add(chunk.symbol)
+    return found
+
+
+def _changed_symbols(repo: Path, sha: str, file: str) -> List[str]:
+    """Symbols a commit changed in ``file`` that still exist in the file at HEAD.
+
+    The change is mapped against the file content *at that commit* (so diff line numbers
+    line up), then intersected with the symbols present at HEAD so every ground-truth
+    symbol is actually retrievable from the current index.
+    """
+    from coderag.chunking.languages import detect_language
+
+    language = detect_language(file)
+    if language is None:
+        return []
+    changed_lines = _diff_new_lines(repo, sha, file)
+    if not changed_lines:
+        return []
+    try:
+        at_commit = _git(repo, "show", f"{sha}:{file}")
+    except subprocess.CalledProcessError:
+        return []
+    changed = _symbols_covering(at_commit, language, changed_lines)
+    if not changed:
+        return []
+    head_text = (repo / file).read_text(encoding="utf-8", errors="replace")
+    head_symbols = _all_symbols(head_text, language)
+    return sorted(changed & head_symbols)
+
+
+def _all_symbols(text: str, language: str) -> set[str]:
+    """All named symbols present in ``text`` (used as a HEAD-side existence check)."""
+    from coderag.chunking import chunk_file
+    from coderag.config import Config
+
+    if not text.strip():
+        return set()
+    return {
+        c.symbol
+        for c in chunk_file(text, language, Config())
+        if c.symbol and c.kind != "window"
+    }
diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py
index f5e9a01..6493c3b 100644
--- a/coderag/surfaces/cli.py
+++ b/coderag/surfaces/cli.py
@@ -110,6 +110,7 @@ def cmd_eval(args: argparse.Namespace) -> int:
             cfg.watched_dir,
             max_cases=args.max_cases,
             extensions=extensions_for(cfg.languages),
+            symbols=args.level == "symbol",
         )
         out = args.dataset or "coderag-eval.jsonl"
         ev.save_dataset(cases, out)
diff --git a/docs/eval.md b/docs/eval.md
index db53254..c26723a 100644
--- a/docs/eval.md
+++ b/docs/eval.md
@@ -33,6 +33,10 @@ coderag eval --dataset coderag-eval.jsonl --compare
 
 # 4. Add the optional two-stage cross-encoder reranker (adds a hybrid+rerank row):
 coderag eval --dataset coderag-eval.jsonl --compare --rerank
+
+# 5. Harder, non-saturated: symbol-level (find the right function, not just file).
+coderag eval --build --level symbol --dataset sym.jsonl   # mines relevant_symbols too
+coderag eval --dataset sym.jsonl --level symbol --compare --rerank
 ```
 
 Reranking is opt-in at search time too: set `CODERAG_RERANK=1` (model via
@@ -121,12 +125,37 @@ bge-small-en-v1.5 · hybrid+rerank  0.790  0.646  0.958  1.000  0.836    1.000
    (`CODERAG_RERANK_MODEL=jinaai/jina-reranker-v2-base-multilingual` or
    `BAAI/bge-reranker-base`) is worth trying, but those are larger.
 
-**Conclusion across moves #1 and #2:** the recurring blocker is that *this repo's benchmark
-is too small and saturated to discriminate any retrieval improvement*. The feature is built,
-tested, and opt-in, but **proving its value requires a larger, harder, non-saturated
-benchmark** (a 1k+-file external repo and/or symbol-level + cross-file conceptual queries).
-That is the true critical path for the "win the eval" objective — accuracy techniques can't
-be validated until the benchmark has headroom.
+**Conclusion across moves #1 and #2 (file level):** the recurring blocker was that *file-level
+on this small repo is too saturated to discriminate any retrieval improvement*. The fix is a
+harder benchmark — see the symbol-level results next, which resolve it.
+
+### Symbol-level: the non-saturated benchmark (and the reranker, validated)
+
+Build a symbol-level dataset (`coderag eval --build --level symbol`, or `build_from_git(...,
+symbols=True)`) — the functions/classes a commit touched that still exist at HEAD — and score
+with `--level symbol`. Finding the right *function* (not just file) is far harder, so the
+benchmark stops saturating (Hit@10 ≈ 0.5 instead of 1.0). On 10 symbol-level cases from this
+repo's history:
+
+```
+mode                               MRR    R@1    R@5    R@10   nDCG@10  Hit@10
+bge-small-en-v1.5 · dense          0.400  0.183  0.292  0.317  0.327    0.400
+bge-small-en-v1.5 · bm25           0.417  0.183  0.317  0.342  0.345    0.500
+bge-small-en-v1.5 · hybrid         0.420  0.183  0.417  0.417  0.369    0.500
+bge-small-en-v1.5 · hybrid+rerank  0.514  0.283  0.392  0.442  0.448    0.600
+```
+
+**With headroom, the reranker delivers exactly the predicted lift:** R@1 0.183 → 0.283
+(+55%), MRR 0.420 → 0.514, nDCG@10 0.369 → 0.448, Hit@10 0.500 → 0.600 — improvement on every
+metric except R@5 (0.417 → 0.392), from the same off-the-shelf `ms-marco-MiniLM` that looked useless
+at the saturated file level. This both validates move #2 and confirms the saturation
+diagnosis: the file-level null result was a benchmark artifact, not a property of the
+technique. (Caveat: 10 cases is small/noisy; the direction is strong and consistent, but
+widen the dataset before quoting exact numbers.)
+
+**Net guidance:** evaluate retrieval changes at **symbol level** — it's where the signal is.
+Re-run the embedder comparison there too (a code-aware reranker like `bge-reranker-base` is
+the next thing to test now that there's a benchmark that can measure it).
 
 ## Dataset format
 
diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md
index 2c0d6f8..1884012 100644
--- a/docs/research/code-retrieval-strategy.md
+++ b/docs/research/code-retrieval-strategy.md
@@ -90,14 +90,14 @@ under OpenRAIL++-M, 68.53 CoIR). [H] Voyage-code-3 is API-only — reference poi
 
 ---
 
-> **Update (measured & built).** The optional two-stage reranker is implemented
+> **Update (built & VALIDATED).** The optional two-stage reranker is implemented
 > (`config.rerank`, `coderag/retrieval/rerank.py`, fastembed `TextCrossEncoder`, zero new
-> deps) and tested. On this repo's saturated 24-case set it gave **no lift / a marginal
-> regression** with the generic `ms-marco-MiniLM` model — consistent with the caveat below
-> that small-cross-encoder *code* lift is inferred, not measured, and with the benchmark
-> having no headroom (hybrid already R@5≈1.0). See [docs/eval.md](../eval.md). The blocker is
-> now clearly the **benchmark**, not the technique: it must get bigger/harder before #1 or #2
-> can show their value. A code-aware reranker should be re-tested there.
+> deps) and tested. It showed **no lift at the saturated file level**, but once the benchmark
+> was made non-saturated via **symbol-level** ground truth (`build_from_git(symbols=True)`),
+> the same off-the-shelf `ms-marco-MiniLM` reranker delivered the predicted lift: **R@1
+> 0.183 → 0.283 (+55%), MRR 0.420 → 0.514, nDCG@10 0.369 → 0.448**. The earlier null result
+> was a benchmark artifact, not a property of the technique. Evaluate at symbol level. See
+> [docs/eval.md](../eval.md).
 
 ## 2. Add a local cross-encoder reranker (highest-ROI bolt-on)
 
diff --git a/tests/test_eval.py b/tests/test_eval.py
index baa6d26..51ed135 100644
--- a/tests/test_eval.py
+++ b/tests/test_eval.py
@@ -216,3 +216,53 @@ def git(*args: str) -> None:
     git("commit", "-q", "-m", "wip")  # too short -> filtered out
 
     assert build_from_git(repo, max_cases=10, min_query_len=12) == []
+
+
+def test_build_from_git_extracts_changed_symbols(tmp_path: Path):
+    repo = tmp_path / "repo"
+    repo.mkdir()
+
+    def git(*args: str) -> None:
+        subprocess.run(["git", "-C", str(repo), *args], check=True, capture_output=True)
+
+    git("init", "-q")
+    git("config", "user.email", "t@example.com")
+    git("config", "user.name", "Tester")
+    git("config", "commit.gpgsign", "false")
+    write(
+        repo / "m.py",
+        "def alpha():\n    return 1\n\n\ndef beta():\n    return 2\n",
+    )
+    git("add", "-A")
+    git("commit", "-q", "-m", "initial two functions")
+    # Change only beta's body -> only beta should be reported as changed.
+    write(
+        repo / "m.py",
+        "def alpha():\n    return 1\n\n\ndef beta():\n    return 22\n",
+    )
+    git("add", "-A")
+    git("commit", "-q", "-m", "tweak beta return value")
+
+    cases = build_from_git(repo, max_cases=10, symbols=True, min_query_len=5)
+    latest = next(c for c in cases if c.id and c.query.startswith("tweak beta"))
+    assert latest.relevant_files == ["m.py"]
+    assert latest.relevant_symbols == ["beta"]  # alpha untouched
+
+
+def test_build_from_git_symbols_off_by_default(tmp_path: Path):
+    repo = tmp_path / "repo"
+    repo.mkdir()
+
+    def git(*args: str) -> None:
+        subprocess.run(["git", "-C", str(repo), *args], check=True, capture_output=True)
+
+    git("init", "-q")
+    git("config", "user.email", "t@example.com")
+    git("config", "user.name", "Tester")
+    git("config", "commit.gpgsign", "false")
+    write(repo / "m.py", "def alpha():\n    return 1\n")
+    git("add", "-A")
+    git("commit", "-q", "-m", "add alpha function")
+
+    cases = build_from_git(repo, max_cases=10)  # symbols=False
+    assert cases[0].relevant_symbols == []

From e7d17d4caae35d4863e732bd028c8ad6af81af97 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 09:42:09 +0000
Subject: [PATCH 2/3] feat(eval): reranker registry, multi-reranker bench,
 curated symbol set

Tooling for the symbol-level model comparisons:
- RECOMMENDED_RERANKERS registry + `coderag eval --list-models` now lists
  local cross-encoder rerankers (MiniLM, bge-reranker-base, jina-reranker-v2)
  with size/notes, so code-aware rerankers are discoverable.
- scripts/bench_embedders.py --rerank-models: score one hybrid+rerank row per
  named reranker, to compare reranker models on a fixed index.
- coderag/eval/datasets/coderag_self_symbols.jsonl: 22 curated
  natural-language -> function/method cases (verified symbol names) for a
  trustworthy symbol-level eval, less noisy than the git-mined set.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 coderag/embeddings/models.py                  | 38 +++++++++++++++++++
 .../eval/datasets/coderag_self_symbols.jsonl  | 22 +++++++++++
 scripts/bench_embedders.py                    | 29 ++++++++++++--
 tests/test_models_registry.py                 | 18 ++++++++-
 4 files changed, 102 insertions(+), 5 deletions(-)
 create mode 100644 coderag/eval/datasets/coderag_self_symbols.jsonl

diff --git a/coderag/embeddings/models.py b/coderag/embeddings/models.py
index 429d17a..ca5ec9d 100644
--- a/coderag/embeddings/models.py
+++ b/coderag/embeddings/models.py
@@ -87,4 +87,42 @@ def format_models() -> str:
         lines.append(f"{head}  {r[4]}")
         if i == 0:
             lines.append("  ".join("-" * w for w in widths) + "  " + "-" * len(r[4]))
+    lines.append("")
+    lines.append("Rerankers (set CODERAG_RERANK=1, CODERAG_RERANK_MODEL=<name>):")
+    rwidth = max(len(rr.name) for rr in RECOMMENDED_RERANKERS)
+    for rr in RECOMMENDED_RERANKERS:
+        lines.append(f"  {rr.name.ljust(rwidth)}  {f'{rr.size_gb:g}GB':>8}  {rr.note}")
     return "\n".join(lines)
+
+
+@dataclass(frozen=True)
+class RerankerInfo:
+    name: str  # fastembed TextCrossEncoder model id (pass via CODERAG_RERANK_MODEL)
+    size_gb: float
+    note: str
+
+
+# Local cross-encoder rerankers loadable via fastembed's TextCrossEncoder. The MiniLM
+# pair is web-trained (small/fast); bge/jina are larger and worth testing for code.
+RECOMMENDED_RERANKERS: Tuple[RerankerInfo, ...] = (
+    RerankerInfo(
+        "Xenova/ms-marco-MiniLM-L-12-v2",
+        0.12,
+        "Default. Tiny/fast (~30ms CPU); web-trained, not code-specific.",
+    ),
+    RerankerInfo(
+        "Xenova/ms-marco-MiniLM-L-6-v2",
+        0.08,
+        "Smallest/fastest MiniLM; slightly weaker than L-12.",
+    ),
+    RerankerInfo(
+        "BAAI/bge-reranker-base",
+        1.04,
+        "Larger, stronger general reranker; multilingual incl. code-ish text.",
+    ),
+    RerankerInfo(
+        "jinaai/jina-reranker-v2-base-multilingual",
+        1.11,
+        "Strong multilingual reranker with code in its training mix.",
+    ),
+)
diff --git a/coderag/eval/datasets/coderag_self_symbols.jsonl b/coderag/eval/datasets/coderag_self_symbols.jsonl
new file mode 100644
index 0000000..4c484c3
--- /dev/null
+++ b/coderag/eval/datasets/coderag_self_symbols.jsonl
@@ -0,0 +1,22 @@
+{"query": "where is reciprocal rank fusion implemented", "relevant_files": ["coderag/retrieval/fusion.py"], "relevant_symbols": ["reciprocal_rank_fusion"], "source": "curated"}
+{"query": "how are dense and lexical search results combined into one ranking", "relevant_files": ["coderag/retrieval/search.py"], "relevant_symbols": ["HybridSearcher.search"], "source": "curated"}
+{"query": "where are a changed file's old chunks removed before new ones are added", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer._index_file"], "source": "curated"}
+{"query": "how is the FAISS index rebuilt from the SQLite store", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.rebuild_from_store"], "source": "curated"}
+{"query": "where does the vector index choose between flat and IVF", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex._choose_kind"], "source": "curated"}
+{"query": "how are query vectors searched in the FAISS index", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.search"], "source": "curated"}
+{"query": "how is the number of IVF clusters derived from corpus size", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["_derive_nlist"], "source": "curated"}
+{"query": "where is BM25 keyword search over the full text index", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.fts_search"], "source": "curated"}
+{"query": "how does the store detect a model or embedding dimension change on startup", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.bootstrap"], "source": "curated"}
+{"query": "where are search results hydrated from the database by chunk id", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.hydrate"], "source": "curated"}
+{"query": "how are full text search query strings sanitized", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["_sanitize_fts"], "source": "curated"}
+{"query": "how does the filesystem watcher start watching and applying changes", "relevant_files": ["coderag/watch.py"], "relevant_symbols": ["watch"], "source": "curated"}
+{"query": "where are python functions and classes extracted as symbol spans", "relevant_files": ["coderag/chunking/python_ast.py"], "relevant_symbols": ["extract_spans"], "source": "curated"}
+{"query": "how is an LLM answer streamed over retrieved code chunks", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["stream_answer"], "source": "curated"}
+{"query": "where is the prompt context assembled from retrieved chunks", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["build_context"], "source": "curated"}
+{"query": "how does the facade run a hybrid search query", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.search"], "source": "curated"}
+{"query": "where are file contents served safely for only indexed files", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.get_file"], "source": "curated"}
+{"query": "how is recall at k computed for retrieval evaluation", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["recall_at_k"], "source": "curated"}
+{"query": "where is normalized discounted cumulative gain computed", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["ndcg_at_k"], "source": "curated"}
+{"query": "how does the cross-encoder reranker score documents against the query", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["CrossEncoderReranker.rerank"], "source": "curated"}
+{"query": "where is the reranker constructed from configuration", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["get_reranker"], "source": "curated"}
+{"query": "how does incremental indexing orchestrate hashing and embedding", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer.index"], "source": "curated"}
diff --git a/scripts/bench_embedders.py b/scripts/bench_embedders.py
index 5b66c88..9872eff 100644
--- a/scripts/bench_embedders.py
+++ b/scripts/bench_embedders.py
@@ -44,7 +44,12 @@ def main() -> int:
     ap.add_argument(
         "--rerank",
         action="store_true",
-        help="Also score a hybrid+rerank row per model (local cross-encoder).",
+        help="Also score a hybrid+rerank row per model (default reranker).",
+    )
+    ap.add_argument(
+        "--rerank-models",
+        default="",
+        help="Comma-separated reranker ids; one hybrid+rerank row per reranker.",
     )
     args = ap.parse_args()
 
@@ -52,6 +57,7 @@ def main() -> int:
     cases = load_dataset(args.dataset)
     ks = tuple(int(k) for k in args.ks.split(","))
     models = [m.strip() for m in args.models.split(",") if m.strip()]
+    rerank_models = [m.strip() for m in args.rerank_models.split(",") if m.strip()]
 
     rows: list[EvalResult] = []
     for model in models:
@@ -66,16 +72,31 @@ def main() -> int:
             cr = CodeRAG(cfg)
             stats = cr.index()
             print(f"    {stats.total_files} files / {stats.total_chunks} chunks")
-            reranker = None
+
+            # Baseline modes (dense / bm25 / hybrid), plus default reranker if --rerank.
+            default_reranker = None
             if args.rerank:
                 from coderag.retrieval.rerank import get_reranker
 
-                reranker = get_reranker(cfg.with_overrides(rerank=True))
+                default_reranker = get_reranker(cfg.with_overrides(rerank=True))
             for r in compare_modes(
-                cr, cases, ks=ks, level=args.level, reranker=reranker
+                cr, cases, ks=ks, level=args.level, reranker=default_reranker
             ):
                 r.label = _label(model, r.label)
                 rows.append(r)
+
+            # One extra hybrid+rerank row per explicitly named reranker.
+            for rm in rerank_models:
+                from coderag.retrieval.rerank import CrossEncoderReranker
+
+                print(f"    reranking with {rm} ...")
+                reranker = CrossEncoderReranker(rm, cache_dir=cfg.cache_dir)
+                res = compare_modes(
+                    cr, cases, ks=ks, level=args.level, reranker=reranker, modes=()
+                )
+                for r in res:
+                    r.label = _label(model, f"rerank:{rm.split('/')[-1]}")
+                    rows.append(r)
             cr.close()
 
     from coderag.eval.harness import format_table
diff --git a/tests/test_models_registry.py b/tests/test_models_registry.py
index 5fff29b..df864b5 100644
--- a/tests/test_models_registry.py
+++ b/tests/test_models_registry.py
@@ -2,7 +2,11 @@
 
 from __future__ import annotations
 
-from coderag.embeddings.models import RECOMMENDED, format_models
+from coderag.embeddings.models import (
+    RECOMMENDED,
+    RECOMMENDED_RERANKERS,
+    format_models,
+)
 
 
 def test_registry_is_nonempty_and_well_formed():
@@ -19,7 +23,19 @@ def test_default_model_is_listed():
     assert any(m.name == "BAAI/bge-small-en-v1.5" for m in RECOMMENDED)
 
 
+def test_reranker_registry_well_formed():
+    assert RECOMMENDED_RERANKERS
+    for r in RECOMMENDED_RERANKERS:
+        assert r.name and "/" in r.name
+        assert r.size_gb > 0 and r.note
+    # The default reranker model must be listed.
+    assert any(
+        r.name == "Xenova/ms-marco-MiniLM-L-12-v2" for r in RECOMMENDED_RERANKERS
+    )
+
+
 def test_format_models_renders_table():
     out = format_models()
     assert "model" in out and "code?" in out
     assert "jina-embeddings-v2-base-code" in out
+    assert "Rerankers" in out and "bge-reranker-base" in out

From 45265333f49672502a7401315f4813d3cf282b8e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 13:11:41 +0000
Subject: [PATCH 3/3] docs(eval): symbol-level model comparison results and
 findings

Measured on the curated 22-case symbol dataset (bge-small vs jina-code, with
the ms-marco reranker):

1. The code-specific jina-code-v2 does NOT beat bge-small on NL->symbol
   queries (dense MRR 0.483 vs 0.675); a good general text embedder wins for
   natural-language "where is X" retrieval.
2. Equal-weight hybrid is not universally better: for the strong bge-small
   retriever, dense alone (0.675) beats 1:1 hybrid (0.573) because weak BM25
   drags it down via RRF. Fusion weighting should be query-type-aware
   (dense-up for NL, BM25-up for identifiers) -- the biggest lever found.
3. Reranking lifts top-1 precision (R@1 0.364->0.409, +12%), consistent with
   the git-mined result.

Documents these in docs/eval.md and elevates query-type fusion weighting in
the strategy doc.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 docs/eval.md                             | 44 ++++++++++++++++++++++--
 docs/research/code-retrieval-strategy.md |  7 ++++
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/docs/eval.md b/docs/eval.md
index c26723a..396e9a8 100644
--- a/docs/eval.md
+++ b/docs/eval.md
@@ -154,8 +154,48 @@ technique. (Caveat: 10 cases is small/noisy; the direction is strong and consist
 widen the dataset before quoting exact numbers.)
 
 **Net guidance:** evaluate retrieval changes at **symbol level** — it's where the signal is.
-Re-run the embedder comparison there too (a code-aware reranker like `bge-reranker-base` is
-the next thing to test now that there's a benchmark that can measure it).
+
+### Symbol-level model comparison (curated 22-case set)
+
+Re-run on `coderag/eval/datasets/coderag_self_symbols.jsonl` (22 hand-verified
+natural-language → function/method cases, much less noisy than the git-mined set):
+
+```
+mode                                   MRR    R@1    R@5    R@10   nDCG@10  Hit@10
+bge-small-en-v1.5 · dense              0.675  0.591  0.818  0.864  0.720    0.864
+bge-small-en-v1.5 · bm25               0.427  0.318  0.636  0.727  0.498    0.727
+bge-small-en-v1.5 · hybrid             0.573  0.364  0.864  0.864  0.647    0.864
+bge-small-en-v1.5 · hybrid+rerank      0.580  0.409  0.864  0.864  0.651    0.864
+jina-embeddings-v2-base-code · dense   0.483  0.318  0.682  0.773  0.554    0.773
+jina-embeddings-v2-base-code · hybrid  0.604  0.455  0.818  0.864  0.668    0.864
+```
+
+Three findings, all actionable:
+
+1. **The code-specific model does not win on NL→symbol queries.** `bge-small · dense`
+   (MRR 0.675, R@1 0.591) clearly beats `jina-code · dense` (0.483 / 0.318). jina-v2-base-code
+   is older and tuned more for code↔code; for natural-language "where is X" queries a good
+   general text embedder is stronger. (jina-code's *hybrid* is competitive only because BM25
+   props up its weaker dense signal.)
+2. **Equal-weight hybrid is not universally better.** For the strong `bge-small` retriever,
+   `dense` alone (0.675) *beats* `hybrid` (0.573): on NL queries BM25 is weak (0.427) and
+   equal-weight RRF drags the strong dense ranking down. For the weaker jina-code, BM25 helps
+   (hybrid 0.604 > dense 0.483). **Takeaway: fusion weights should depend on query type** —
+   weight dense up for natural-language queries, BM25 up for exact-identifier/code queries
+   (strategy §3). A fixed 1:1 is a compromise, not an optimum.
+3. **Reranking improves top-1 precision.** With the tiny ms-marco model, `hybrid+rerank` lifts R@1
+   0.364 → 0.409 (+12%) over plain hybrid — consistent with the git-mined result (+55% on 10
+   noisier cases). The reranker reliably sharpens the top of the list; it operates on the
+   hybrid pool, so it can't fully recover the dense-vs-hybrid gap above (reranking a
+   dense-weighted pool is the natural follow-up).
+
+Larger code-aware rerankers (`bge-reranker-base`, `jina-reranker-v2`) are registered
+(`coderag eval --list-models`) but are ~1 GB and slow to rerank on CPU — test them on a GPU
+or a smaller candidate pool. The MiniLM default is the pragmatic local choice.
+
+**Bottom line for "win the eval":** the biggest lever found here is **query-type-aware fusion
+weighting** (finding 2), then **reranking for top-1** (finding 3) — not a bigger embedding
+model (finding 1). Validate these on a larger external repo next.
 
 ## Dataset format
 
diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md
index 1884012..9721dd8 100644
--- a/docs/research/code-retrieval-strategy.md
+++ b/docs/research/code-retrieval-strategy.md
@@ -131,6 +131,13 @@ bolt-on. Treat as a later experiment, not a v1 move.
 
 ## 3. Tune and route the hybrid fusion you already have
 
+> **Update (measured — now the #1 lever).** Symbol-level eval on this repo showed equal-weight
+> hybrid can *lose* to dense alone on NL queries: `bge-small` dense MRR 0.675 vs hybrid 0.573,
+> because weak BM25 (0.427) drags the strong dense ranking down via 1:1 RRF. For the weaker
+> jina-code, BM25 *helps* (hybrid 0.604 > dense 0.483). So fusion weighting should be
+> **query-type-aware** (dense-up for NL, BM25-up for identifiers) rather than fixed 1:1 — this
+> was the single biggest lever found, ahead of a bigger embedder. See [docs/eval.md](../eval.md).
+
 CodeRAG already does dense + BM25 + RRF — the literature says that's the right foundation; the wins
 are in **routing and tuning**: