Neverdecel · Neverdecel · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/coderag/embeddings/models.py b/coderag/embeddings/models.py
@@ -87,4 +87,42 @@ def format_models() -> str:
         lines.append(f"{head}  {r[4]}")
         if i == 0:
             lines.append("  ".join("-" * w for w in widths) + "  " + "-" * len(r[4]))
+    lines.append("")
+    lines.append("Rerankers (set CODERAG_RERANK=1, CODERAG_RERANK_MODEL=<name>):")
+    rwidth = max(len(rr.name) for rr in RECOMMENDED_RERANKERS)
+    for rr in RECOMMENDED_RERANKERS:
+        lines.append(f"  {rr.name.ljust(rwidth)}  {f'{rr.size_gb:g}GB':>8}  {rr.note}")
     return "\n".join(lines)
+
+
+@dataclass(frozen=True)
+class RerankerInfo:
+    name: str  # fastembed TextCrossEncoder model id (pass via CODERAG_RERANK_MODEL)
+    size_gb: float  # model size in GB, printed as "<size>GB" in the format_models() listing
+    note: str  # one-line description printed after the size in the same listing
+
+
+# Local cross-encoder rerankers (fastembed TextCrossEncoder ids) listed by format_models().
+# The MiniLM pair is web-trained (small/fast); bge/jina are ~1 GB and worth testing for code.
+RECOMMENDED_RERANKERS: Tuple[RerankerInfo, ...] = (
+    RerankerInfo(
+        "Xenova/ms-marco-MiniLM-L-12-v2",
+        0.12,
+        "Default. Tiny/fast (~30ms CPU); web-trained, not code-specific.",
+    ),
+    RerankerInfo(
+        "Xenova/ms-marco-MiniLM-L-6-v2",
+        0.08,
+        "Smallest/fastest MiniLM; slightly weaker than L-12.",
+    ),
+    RerankerInfo(
+        "BAAI/bge-reranker-base",
+        1.04,
+        "Larger, stronger general reranker; multilingual incl. code-ish text.",
+    ),
+    RerankerInfo(
+        "jinaai/jina-reranker-v2-base-multilingual",
+        1.11,
+        "Strong multilingual reranker with code in its training mix.",
+    ),
+)
diff --git a/coderag/eval/dataset.py b/coderag/eval/dataset.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import json
+import re
 import subprocess
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -100,13 +101,18 @@ def build_from_git(
     max_files_per_commit: int = 5,
     min_query_len: int = 12,
     commit_scan_limit: int = 2000,
+    symbols: bool = False,
 ) -> List[EvalCase]:
     """Mine an eval dataset from a repo's commit history.
 
     For each non-merge commit, the subject line is the query and the changed files that
     (a) match ``extensions`` and (b) still exist at HEAD become the relevant set — so
     every ground-truth file is actually present in the index built from HEAD.
 
+    With ``symbols=True``, the functions/classes/methods touched by the commit (that still
+    exist at HEAD) are also recorded as ``relevant_symbols``, enabling the much harder —
+    and less saturation-prone — symbol-level eval (``coderag eval --level symbol``).
+
     Filtering mirrors SweRank/Agentless dataset construction: skip merges, reverts, and
     bot/automated commits; drop commits that touch too many files (``max_files_per_commit``
     — diffuse, weak signal) or none of the targeted extensions; and require a meaningful
@@ -153,10 +159,16 @@ def build_from_git(
         if not files or len(files) > max_files_per_commit:
             continue
 
+        relevant_symbols: List[str] = []
+        if symbols:
+            for f in files:
+                relevant_symbols.extend(_changed_symbols(repo, sha, f))
+
         cases.append(
             EvalCase(
                 query=subject,
                 relevant_files=files,
+                relevant_symbols=sorted(set(relevant_symbols)),
                 id=sha[:12],
                 source="git",
             )
@@ -175,3 +187,81 @@ def _is_usable_query(subject: str, min_len: int) -> bool:
 def _is_bot(author: str) -> bool:
     low = author.lower()
     return "bot" in low or low in {"dependabot", "github-actions", "renovate"}
+
+
+# Unified-diff hunk header ``@@ -old[,n] +new[,n] @@``: group 1 = new start, group 2 = new count.
+_HUNK = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
+
+
+def _diff_new_lines(repo: Path, sha: str, file: str) -> set[int]:
+    """Return the new-side line numbers that ``sha`` touched in ``file``.
+
+    Only the hunk headers of a zero-context ``git show`` are parsed; a
+    ``CalledProcessError`` from git yields an empty set.
+    """
+    try:
+        patch = _git(repo, "show", "--unified=0", "--no-color", sha, "--", file)
+    except subprocess.CalledProcessError:
+        return set()
+    touched: set[int] = set()
+    for header in filter(None, map(_HUNK.match, patch.splitlines())):
+        first, span = int(header.group(1)), header.group(2)
+        # No count means one line; a pure deletion (count 0) still marks its anchor line.
+        touched.update(range(first, first + max(1 if span is None else int(span), 1)))
+    return touched
+
+
+def _symbols_covering(text: str, language: str, lines: set[int]) -> set[str]:
+    """Collect the named (non-"window") symbols of ``text`` whose span hits ``lines``."""
+    from coderag.chunking import chunk_file
+    from coderag.config import Config
+
+    if not (lines and text.strip()):
+        return set()
+    return {
+        piece.symbol
+        for piece in chunk_file(text, language, Config())
+        if piece.symbol and piece.kind != "window"
+        if any(piece.start_line <= n <= piece.end_line for n in lines)
+    }
+
+
+def _changed_symbols(repo: Path, sha: str, file: str) -> List[str]:
+    """Symbols a commit changed in ``file`` that still exist in the file at HEAD.
+
+    The change is mapped against the file content *at that commit* (so diff line numbers
+    line up), then intersected with the symbols present at HEAD so every ground-truth
+    symbol is actually retrievable from the current index. Best-effort: an unknown
+    language, a failing git call, or an unreadable working-tree file all yield ``[]``.
+    """
+    from coderag.chunking.languages import detect_language
+
+    language = detect_language(file)
+    if language is None:
+        return []
+    changed_lines = _diff_new_lines(repo, sha, file)
+    if not changed_lines:
+        return []
+    try:
+        at_commit = _git(repo, "show", f"{sha}:{file}")
+        head_text = (repo / file).read_text(encoding="utf-8", errors="replace")
+    except (subprocess.CalledProcessError, OSError):
+        return []
+    changed = _symbols_covering(at_commit, language, changed_lines)
+    if not changed:
+        return []
+    return sorted(changed & _all_symbols(head_text, language))
+
+
+def _all_symbols(text: str, language: str) -> set[str]:
+    """Every named, non-"window" symbol chunked out of ``text`` (HEAD existence check)."""
+    from coderag.chunking import chunk_file
+    from coderag.config import Config
+
+    names: set[str] = set()
+    # Whitespace-only text has no symbols, so the chunker is not invoked for it.
+    if text.strip():
+        for piece in chunk_file(text, language, Config()):
+            if piece.symbol and piece.kind != "window":
+                names.add(piece.symbol)
+    return names
diff --git a/coderag/eval/datasets/coderag_self_symbols.jsonl b/coderag/eval/datasets/coderag_self_symbols.jsonl
@@ -0,0 +1,22 @@
+{"query": "where is reciprocal rank fusion implemented", "relevant_files": ["coderag/retrieval/fusion.py"], "relevant_symbols": ["reciprocal_rank_fusion"], "source": "curated"}
+{"query": "how are dense and lexical search results combined into one ranking", "relevant_files": ["coderag/retrieval/search.py"], "relevant_symbols": ["HybridSearcher.search"], "source": "curated"}
+{"query": "where are a changed file's old chunks removed before new ones are added", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer._index_file"], "source": "curated"}
+{"query": "how is the FAISS index rebuilt from the SQLite store", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.rebuild_from_store"], "source": "curated"}
+{"query": "where does the vector index choose between flat and IVF", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex._choose_kind"], "source": "curated"}
+{"query": "how are query vectors searched in the FAISS index", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.search"], "source": "curated"}
+{"query": "how is the number of IVF clusters derived from corpus size", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["_derive_nlist"], "source": "curated"}
+{"query": "where is BM25 keyword search over the full text index", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.fts_search"], "source": "curated"}
+{"query": "how does the store detect a model or embedding dimension change on startup", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.bootstrap"], "source": "curated"}
+{"query": "where are search results hydrated from the database by chunk id", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.hydrate"], "source": "curated"}
+{"query": "how are full text search query strings sanitized", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["_sanitize_fts"], "source": "curated"}
+{"query": "how does the filesystem watcher start watching and applying changes", "relevant_files": ["coderag/watch.py"], "relevant_symbols": ["watch"], "source": "curated"}
+{"query": "where are python functions and classes extracted as symbol spans", "relevant_files": ["coderag/chunking/python_ast.py"], "relevant_symbols": ["extract_spans"], "source": "curated"}
+{"query": "how is an LLM answer streamed over retrieved code chunks", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["stream_answer"], "source": "curated"}
+{"query": "where is the prompt context assembled from retrieved chunks", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["build_context"], "source": "curated"}
+{"query": "how does the facade run a hybrid search query", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.search"], "source": "curated"}
+{"query": "where are file contents served safely for only indexed files", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.get_file"], "source": "curated"}
+{"query": "how is recall at k computed for retrieval evaluation", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["recall_at_k"], "source": "curated"}
+{"query": "where is normalized discounted cumulative gain computed", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["ndcg_at_k"], "source": "curated"}
+{"query": "how does the cross-encoder reranker score documents against the query", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["CrossEncoderReranker.rerank"], "source": "curated"}
+{"query": "where is the reranker constructed from configuration", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["get_reranker"], "source": "curated"}
+{"query": "how does incremental indexing orchestrate hashing and embedding", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer.index"], "source": "curated"}
diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py
@@ -110,6 +110,7 @@ def cmd_eval(args: argparse.Namespace) -> int:
             cfg.watched_dir,
             max_cases=args.max_cases,
             extensions=extensions_for(cfg.languages),
+            symbols=args.level == "symbol",
         )
         out = args.dataset or "coderag-eval.jsonl"
         ev.save_dataset(cases, out)

diff --git a/docs/eval.md b/docs/eval.md
@@ -33,6 +33,10 @@ coderag eval --dataset coderag-eval.jsonl --compare
 
 # 4. Add the optional two-stage cross-encoder reranker (adds a hybrid+rerank row):
 coderag eval --dataset coderag-eval.jsonl --compare --rerank
+
+# 5. Harder, non-saturated: symbol-level (find the right function, not just file).
+coderag eval --build --level symbol --dataset sym.jsonl   # mines relevant_symbols too
+coderag eval --dataset sym.jsonl --level symbol --compare --rerank
 ```
 
 Reranking is opt-in at search time too: set `CODERAG_RERANK=1` (model via
@@ -121,12 +125,77 @@ bge-small-en-v1.5 · hybrid+rerank  0.790  0.646  0.958  1.000  0.836    1.000
    (`CODERAG_RERANK_MODEL=jinaai/jina-reranker-v2-base-multilingual` or
    `BAAI/bge-reranker-base`) is worth trying, but those are larger.
 
-**Conclusion across moves #1 and #2:** the recurring blocker is that *this repo's benchmark
-is too small and saturated to discriminate any retrieval improvement*. The feature is built,
-tested, and opt-in, but **proving its value requires a larger, harder, non-saturated
-benchmark** (a 1k+-file external repo and/or symbol-level + cross-file conceptual queries).
-That is the true critical path for the "win the eval" objective — accuracy techniques can't
-be validated until the benchmark has headroom.
+**Conclusion across moves #1 and #2 (file level):** the recurring blocker was that the
+*file-level benchmark on this small repo is too saturated to discriminate any retrieval
+improvement*. The fix is a harder benchmark — see the symbol-level results next, which resolve it.
+
+### Symbol-level: the non-saturated benchmark (and the reranker, validated)
+
+Build a symbol-level dataset (`coderag eval --build --level symbol`, or `build_from_git(...,
+symbols=True)`) — the functions/classes a commit touched that still exist at HEAD — and score
+with `--level symbol`. Finding the right *function* (not just file) is far harder, so the
+benchmark stops saturating (Hit@10 ≈ 0.5 instead of 1.0). On 10 symbol-level cases from this
+repo's history:
+
+```
+mode                               MRR    R@1    R@5    R@10   nDCG@10  Hit@10
+bge-small-en-v1.5 · dense          0.400  0.183  0.292  0.317  0.327    0.400
+bge-small-en-v1.5 · bm25           0.417  0.183  0.317  0.342  0.345    0.500
+bge-small-en-v1.5 · hybrid         0.420  0.183  0.417  0.417  0.369    0.500
+bge-small-en-v1.5 · hybrid+rerank  0.514  0.283  0.392  0.442  0.448    0.600
+```
+
+**With headroom, the reranker delivers the predicted top-of-list lift:** R@1 0.183 → 0.283
+(+55%), MRR 0.420 → 0.514, nDCG@10 0.369 → 0.448, Hit@10 0.500 → 0.600 (only R@5 dips,
+0.417 → 0.392), from the same off-the-shelf `ms-marco-MiniLM` that looked useless
+at the saturated file level. This both validates move #2 and confirms the saturation
+diagnosis: the file-level null result was a benchmark artifact, not a property of the
+technique. (Caveat: 10 cases is small/noisy; the direction is strong and consistent, but
+widen the dataset before quoting exact numbers.)
+
+**Net guidance:** evaluate retrieval changes at **symbol level** — it's where the signal is.
+
+### Symbol-level model comparison (curated 22-case set)
+
+Re-run on `coderag/eval/datasets/coderag_self_symbols.jsonl` (22 hand-verified
+natural-language → function/method cases, much less noisy than the git-mined set):
+
+```
+mode                                   MRR    R@1    R@5    R@10   nDCG@10  Hit@10
+bge-small-en-v1.5 · dense              0.675  0.591  0.818  0.864  0.720    0.864
+bge-small-en-v1.5 · bm25               0.427  0.318  0.636  0.727  0.498    0.727
+bge-small-en-v1.5 · hybrid             0.573  0.364  0.864  0.864  0.647    0.864
+bge-small-en-v1.5 · hybrid+rerank      0.580  0.409  0.864  0.864  0.651    0.864
+jina-embeddings-v2-base-code · dense   0.483  0.318  0.682  0.773  0.554    0.773
+jina-embeddings-v2-base-code · hybrid  0.604  0.455  0.818  0.864  0.668    0.864
+```
+
+Three findings, all actionable:
+
+1. **The code-specific model does not win on NL→symbol queries.** `bge-small · dense`
+   (MRR 0.675, R@1 0.591) clearly beats `jina-code · dense` (0.483 / 0.318). jina-v2-base-code
+   is older and tuned more for code↔code; for natural-language "where is X" queries a good
+   general text embedder is stronger. (jina-code's *hybrid* is competitive only because BM25
+   props up its weaker dense signal.)
+2. **Equal-weight hybrid is not universally better.** For the strong `bge-small` retriever,
+   `dense` alone (0.675) *beats* `hybrid` (0.573): on NL queries BM25 is weak (0.427) and
+   equal-weight RRF drags the strong dense ranking down. For the weaker jina-code, BM25 helps
+   (hybrid 0.604 > dense 0.483). **Takeaway: fusion weights should depend on query type** —
+   weight dense up for natural-language queries, BM25 up for exact-identifier/code queries
+   (strategy §3). A fixed 1:1 is a compromise, not an optimum.
+3. **Reranking improves top-1 precision.** `hybrid+rerank` lifts R@1 0.364 → 0.409 (+12%, one
+   extra case of 22) over hybrid with the tiny ms-marco model — directionally consistent with
+   the git-mined result (+55% on 10 noisier cases). The reranker sharpens the top of the list;
+   it operates on the hybrid pool, so it can't fully recover the dense-vs-hybrid gap above
+   (reranking a dense-weighted pool is the natural follow-up).
+
+Larger code-aware rerankers (`bge-reranker-base`, `jina-reranker-v2`) are registered
+(`coderag eval --list-models`) but are ~1 GB and slow to rerank on CPU — test them on a GPU
+or a smaller candidate pool. The MiniLM default is the pragmatic local choice.
+
+**Bottom line for "win the eval":** the biggest lever found here is **query-type-aware fusion
+weighting** (finding 2), then **reranking for top-1** (finding 3) — not a bigger embedding
+model (finding 1). Validate these on a larger external repo next.
 
 ## Dataset format
 

diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md
@@ -90,14 +90,14 @@ under OpenRAIL++-M, 68.53 CoIR). [H] Voyage-code-3 is API-only — reference poi
 
 ---
 
-> **Update (measured & built).** The optional two-stage reranker is implemented
+> **Update (built & VALIDATED).** The optional two-stage reranker is implemented
 > (`config.rerank`, `coderag/retrieval/rerank.py`, fastembed `TextCrossEncoder`, zero new
-> deps) and tested. On this repo's saturated 24-case set it gave **no lift / a marginal
-> regression** with the generic `ms-marco-MiniLM` model — consistent with the caveat below
-> that small-cross-encoder *code* lift is inferred, not measured, and with the benchmark
-> having no headroom (hybrid already R@5≈1.0). See [docs/eval.md](../eval.md). The blocker is
-> now clearly the **benchmark**, not the technique: it must get bigger/harder before #1 or #2
-> can show their value. A code-aware reranker should be re-tested there.
+> deps) and tested. It showed **no lift at the saturated file level**, but once the benchmark
+> was made non-saturated via **symbol-level** ground truth (`build_from_git(symbols=True)`),
+> the same off-the-shelf `ms-marco-MiniLM` reranker delivered the predicted lift: **R@1
+> 0.183 → 0.283 (+55%), MRR 0.420 → 0.514, nDCG@10 0.369 → 0.448**. The earlier null result
+> was a benchmark artifact, not a property of the technique. Evaluate at symbol level. See
+> [docs/eval.md](../eval.md).
 
 ## 2. Add a local cross-encoder reranker (highest-ROI bolt-on)
 
@@ -131,6 +131,13 @@ bolt-on. Treat as a later experiment, not a v1 move.
 
 ## 3. Tune and route the hybrid fusion you already have
 
+> **Update (measured — now the #1 lever).** Symbol-level eval on this repo showed equal-weight
+> hybrid can *lose* to dense alone on NL queries: `bge-small` dense MRR 0.675 vs hybrid 0.573,
+> because weak BM25 (0.427) drags the strong dense ranking down via 1:1 RRF. For the weaker
+> jina-code, BM25 *helps* (hybrid 0.604 > dense 0.483). So fusion weighting should be
+> **query-type-aware** (dense-up for NL, BM25-up for identifiers) rather than fixed 1:1 — this
+> was the single biggest lever found, ahead of a bigger embedder. See [docs/eval.md](../eval.md).
+
 CodeRAG already does dense + BM25 + RRF — the literature says that's the right foundation; the wins
 are in **routing and tuning**: