Neverdecel · Neverdecel · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/README.md b/README.md
@@ -79,8 +79,15 @@ coderag watch                     # index, then keep it live as files change
 coderag serve --port 8000         # run the HTTP API  (needs [server])
 coderag ui                        # launch the web UI (needs [ui])
 coderag status                    # index stats (files, chunks, model, index type)
+coderag eval --dataset d.jsonl --compare  # retrieval quality: dense vs BM25 vs hybrid
 ```
 
+> **Measuring retrieval quality.** `coderag eval` is a built-in harness for "did we surface
+> the right file/symbol?" — recall@k, MRR, nDCG@k at file or symbol level, with a git-history
+> dataset miner (`--build`), a dense/BM25/hybrid comparison (`--compare`), and an optional
+> cross-encoder rerank stage (`--rerank`). See [`docs/eval.md`](docs/eval.md) and the strategy
+> writeup in [`docs/research/code-retrieval-strategy.md`](docs/research/code-retrieval-strategy.md).
+
 ### Python library
 
 ```python

diff --git a/coderag/api.py b/coderag/api.py
@@ -93,10 +93,15 @@ def indexer(self) -> "Indexer":
     @property
     def searcher(self) -> "HybridSearcher":
         if self._searcher is None:
+            from coderag.retrieval.rerank import get_reranker
             from coderag.retrieval.search import HybridSearcher
 
             self._searcher = HybridSearcher(
-                self.config, self.provider, self.store, self.vectors
+                self.config,
+                self.provider,
+                self.store,
+                self.vectors,
+                reranker=get_reranker(self.config),
             )
         return self._searcher
 
@@ -177,6 +182,8 @@ def status(self) -> dict:
             ),
             "llm_base_url": self.config.openai_base_url or "",
             "index_type": self.vectors.kind,
+            "rerank": self.config.rerank,
+            "rerank_model": self.config.rerank_model if self.config.rerank else "",
             "store_dir": str(self.config.store_dir),
             "watched_dir": str(self.config.watched_dir),
             "total_files": stats.total_files,

diff --git a/coderag/chunking/languages.py b/coderag/chunking/languages.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Optional
+from typing import Iterable, List, Optional
 
 # Languages for which we extract symbol-aware spans (function/class/method).
 # Python uses the stdlib ``ast``; the rest use tree-sitter.
@@ -55,3 +55,9 @@
 def detect_language(path: str | Path) -> Optional[str]:
     """Return the language for ``path``, or ``None`` if it should not be indexed."""
     return EXTENSION_TO_LANGUAGE.get(Path(path).suffix.lower())
+
+
+def extensions_for(languages: Iterable[str]) -> List[str]:
+    """Return, sorted, every file extension that maps to one of ``languages``."""
+    selected = frozenset(languages)
+    return sorted(e for e, name in EXTENSION_TO_LANGUAGE.items() if name in selected)
diff --git a/coderag/config.py b/coderag/config.py
@@ -135,6 +135,13 @@ class Config:
     dense_weight: float = 1.0
     lexical_weight: float = 1.0
 
+    # --- Reranking (optional two-stage retrieve-then-rerank) ---
+    # Off by default so the zero-config engine stays tiny/fast. When on, the top
+    # ``rerank_candidates`` fused hits are re-scored by a local cross-encoder and reordered.
+    rerank: bool = False
+    rerank_model: str = "Xenova/ms-marco-MiniLM-L-12-v2"  # local ONNX cross-encoder
+    rerank_candidates: int = 50  # fused hits to rerank before trimming to top_k
+
     # --- Indexing throughput ---
     embed_batch_size: int = 64
     index_workers: int = 4
@@ -202,6 +209,11 @@ def from_env(cls, **overrides: object) -> "Config":
             rrf_k=_env_int("CODERAG_RRF_K", cls.rrf_k),
             dense_weight=_env_float("CODERAG_DENSE_WEIGHT", cls.dense_weight),
             lexical_weight=_env_float("CODERAG_LEXICAL_WEIGHT", cls.lexical_weight),
+            rerank=_env_bool("CODERAG_RERANK", cls.rerank),
+            rerank_model=_env_str("CODERAG_RERANK_MODEL", cls.rerank_model),
+            rerank_candidates=_env_int(
+                "CODERAG_RERANK_CANDIDATES", cls.rerank_candidates
+            ),
             embed_batch_size=_env_int("CODERAG_EMBED_BATCH", cls.embed_batch_size),
             index_workers=_env_int("CODERAG_WORKERS", cls.index_workers),
             llm_provider=_env_str("CODERAG_LLM_PROVIDER", cls.llm_provider),

diff --git a/coderag/embeddings/models.py b/coderag/embeddings/models.py
@@ -0,0 +1,90 @@
+"""Curated registry of local (fastembed/ONNX) embedding models for code search.
+
+These are the no-API-key models worth considering for CodeRAG, with short notes on the
+accuracy/size trade-off. All are loadable via ``--model <name>`` (provider ``fastembed``).
+The numbers in the notes are external benchmark figures (see docs/research/) — run
+``coderag eval`` to measure them on *your* codebase.
+
+Code-specific models (trained on code) generally beat general-purpose text embedders on
+code retrieval, at the cost of a larger download.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Tuple
+
+
+@dataclass(frozen=True)
+class ModelInfo:
+    name: str  # fastembed model id (pass to --model)
+    dim: int  # shown in the "dim" column; presumably the embedding size — confirm
+    size_gb: float  # rendered with a "GB" suffix by format_models()
+    code_specific: bool  # rendered as "yes"/"no" in the "code?" column
+    note: str  # free-text trade-off summary; last column of the table
+
+
+# Ordered best-first for code search among models fastembed can load locally. fastembed
+# does not (yet) ship CodeRankEmbed/CodeSage; those need a custom ONNX export — tracked as
+# a follow-up. jina-embeddings-v2-base-code is the strongest code-specific option available
+# out of the box.
+RECOMMENDED: Tuple[ModelInfo, ...] = (
+    ModelInfo(
+        "jinaai/jina-embeddings-v2-base-code",
+        768,
+        0.64,
+        True,
+        "Code-specific, 8192-ctx, Apache-2.0. Best out-of-the-box local code retriever.",
+    ),
+    ModelInfo(
+        "BAAI/bge-base-en-v1.5",
+        768,
+        0.21,
+        False,
+        "General text. Stronger than bge-small; modest code retrieval.",
+    ),
+    ModelInfo(
+        "snowflake/snowflake-arctic-embed-m-long",
+        768,
+        0.54,
+        False,
+        "General, long-context (base model behind CodeRankEmbed).",
+    ),
+    ModelInfo(
+        "nomic-ai/nomic-embed-text-v1.5",
+        768,
+        0.52,
+        False,
+        "General, long-context, Matryoshka dims.",
+    ),
+    ModelInfo(
+        "BAAI/bge-small-en-v1.5",
+        384,
+        0.067,
+        False,
+        "Current default. Smallest/fastest; weakest on code (~45.8 CoIR).",
+    ),
+)
+
+
+def format_models() -> str:
+    """Render ``RECOMMENDED`` as an aligned plain-text table for the CLI.
+
+    The first four columns are padded to their widest cell; the note column is
+    appended unpadded. A dashed rule follows the header row.
+    """
+    header = ("model", "dim", "size", "code?", "note")
+    body = [
+        (m.name, str(m.dim), f"{m.size_gb:g}GB",
+         "yes" if m.code_specific else "no", m.note)
+        for m in RECOMMENDED
+    ]
+    # Only the four fixed columns need a width; the note always ends its line.
+    columns = zip(header, *body)
+    widths = [max(map(len, col)) for col in columns][:4]
+
+    def render(row: Tuple[str, ...]) -> str:
+        return "  ".join(c.ljust(w) for c, w in zip(row, widths)) + f"  {row[4]}"
+
+    rule = "  ".join("-" * w for w in widths) + "  " + "-" * len(header[4])
+    return "\n".join([render(header), rule, *map(render, body)])
diff --git a/coderag/eval/__init__.py b/coderag/eval/__init__.py
@@ -0,0 +1,39 @@
+"""Code-retrieval evaluation harness.
+
+A small, offline, dependency-free harness for measuring *retrieval* quality — "did we
+surface the right file/symbol for this query?" — so accuracy claims are provable and
+regressions are caught.
+
+It follows the SWE-bench / Agentless / SweRank localization protocol: queries come from
+real commit messages or issues, and ground truth is the set of files (and optionally
+symbols) those commits changed. Metrics are the standard localization set: recall@k,
+hit@k (Acc@k), MRR, and nDCG@k.
+
+The public pieces:
+
+- :class:`EvalCase` / :func:`load_dataset` / :func:`save_dataset` — the dataset format.
+- :func:`build_from_git` — mine a dataset from a repo's history (no network, no LLM).
+- :func:`evaluate` — score one retriever (any ``search`` callable) against a dataset.
+- :func:`compare_modes` — score dense-only vs BM25-only vs hybrid on one index, which is
+  the built-in way to show fusion beats either modality alone.
+"""
+
+from __future__ import annotations
+
+from coderag.eval.dataset import EvalCase, build_from_git, load_dataset, save_dataset
+from coderag.eval.harness import EvalResult, compare_modes, evaluate
+from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k
+
+__all__ = [
+    "EvalCase",
+    "EvalResult",
+    "build_from_git",
+    "compare_modes",
+    "evaluate",
+    "hit_at_k",
+    "load_dataset",
+    "mrr",
+    "ndcg_at_k",
+    "recall_at_k",
+    "save_dataset",
+]
diff --git a/coderag/eval/dataset.py b/coderag/eval/dataset.py
@@ -0,0 +1,177 @@
+"""Eval dataset: a list of (query -> relevant files/symbols) cases, plus a git miner.
+
+The dataset is plain JSONL so it's diffable, hand-editable, and easy to share. Each line
+is one :class:`EvalCase`. :func:`build_from_git` synthesizes a dataset from a repo's own
+history using the SWE-bench/SweRank recipe: the commit subject becomes the query and the
+files that commit changed (that still exist at HEAD) become the ground truth.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence
+
+# Control-character delimiters for one-pass `git log` parsing — they never appear in
+# real commit messages, so we don't have to escape file paths or message text.
+_REC = "\x1e"  # between commits
+_FLD = "\x1f"  # between fields within a commit header
+
+
+@dataclass(slots=True)
+class EvalCase:
+    """A single retrieval query paired with its ground-truth answers.
+
+    ``relevant_files`` holds repo-relative posix paths. ``relevant_symbols`` optionally
+    holds qualified names (e.g. ``"Indexer._index_file"``) for symbol-level scoring.
+    """
+
+    query: str
+    relevant_files: List[str]
+    relevant_symbols: List[str] = field(default_factory=list)
+    id: Optional[str] = None
+    source: str = ""
+
+    def as_dict(self) -> Dict[str, object]:
+        """Serialize to a JSON-ready dict, leaving out empty optional fields."""
+        optional = {
+            "relevant_symbols": self.relevant_symbols,
+            "id": self.id,
+            "source": self.source,
+        }
+        required = {"query": self.query, "relevant_files": self.relevant_files}
+        return {**required, **{k: v for k, v in optional.items() if v}}
+
+    @classmethod
+    def from_dict(cls, d: Dict[str, object]) -> "EvalCase":
+        """Build a case from a parsed JSON object; non-list files/symbols become []."""
+
+        def strings(key: str) -> List[str]:
+            value = d.get(key, [])
+            return [str(item) for item in value] if isinstance(value, list) else []
+
+        raw_id = d.get("id")
+        return cls(
+            query=str(d["query"]),
+            relevant_files=strings("relevant_files"),
+            relevant_symbols=strings("relevant_symbols"),
+            id=str(raw_id) if raw_id else None,
+            source=str(d.get("source", "")),
+        )
+
+
+def load_dataset(path: Path | str) -> List[EvalCase]:
+    """Read a JSONL dataset from ``path`` into a list of cases.
+
+    Each non-blank line is parsed as one JSON object and handed to
+    :meth:`EvalCase.from_dict`; whitespace-only lines are ignored.
+    """
+    with Path(path).open(encoding="utf-8") as fh:
+        stripped = (raw.strip() for raw in fh)
+        return [EvalCase.from_dict(json.loads(text)) for text in stripped if text]
+
+
+def save_dataset(cases: Sequence[EvalCase], path: Path | str) -> None:
+    """Write ``cases`` to ``path`` as JSONL, creating parent directories as needed."""
+    target = Path(path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    rows = (json.dumps(c.as_dict(), ensure_ascii=False) + "\n" for c in cases)
+    with target.open("w", encoding="utf-8") as fh:
+        fh.writelines(rows)
+
+
+def _git(repo: Path, *args: str) -> str:
+    """Run git in ``repo``; return stdout decoded as UTF-8 (raises on failure)."""
+    return subprocess.run(
+        ["git", "-C", str(repo), *args],
+        check=True,
+        capture_output=True,
+        encoding="utf-8",  # not text=True: the locale codec may not be UTF-8
+    ).stdout
+
+
+def build_from_git(
+    repo: Path | str,
+    *,
+    max_cases: int = 200,
+    extensions: Optional[Sequence[str]] = None,
+    max_files_per_commit: int = 5,
+    min_query_len: int = 12,
+    commit_scan_limit: int = 2000,
+) -> List[EvalCase]:
+    """Mine an eval dataset from a repo's commit history.
+
+    For each non-merge commit (at most ``commit_scan_limit`` are scanned), the subject
+    line is the query and the changed files that (a) match ``extensions``,
+    case-insensitively, and (b) are still files in the checkout become the relevant set —
+    so every ground-truth file can actually be present in an index built from it.
+
+    Commits are skipped when the subject is unusable (shorter than ``min_query_len``, or
+    a revert/merge/bump/release), the author looks automated, no targeted file survives,
+    or more than ``max_files_per_commit`` do. Stops once ``max_cases`` are collected.
+    """
+    repo = Path(repo)
+    if extensions is None:
+        from coderag.chunking.languages import extensions_for
+        from coderag.config import DEFAULT_LANGUAGES
+
+        extensions = extensions_for(DEFAULT_LANGUAGES)
+    exts = {(e if e.startswith(".") else f".{e}").lower() for e in extensions}
+
+    fmt = f"{_REC}%H{_FLD}%s{_FLD}%an"
+    raw = _git(
+        repo,
+        "log",
+        "--no-merges",
+        f"-n{commit_scan_limit}",
+        "--name-only",
+        f"--pretty=format:{fmt}",
+    )
+
+    cases: List[EvalCase] = []
+    for record in filter(str.strip, raw.split(_REC)):
+        if len(cases) >= max_cases:
+            break  # enough cases — no need to parse the rest of the log
+        header, _, body = record.partition("\n")
+        parts = header.split(_FLD)
+        if len(parts) < 3:
+            continue
+        sha, subject, author = parts[0], parts[1].strip(), parts[2].strip()
+
+        if not _is_usable_query(subject, min_query_len) or _is_bot(author):
+            continue
+
+        files = [
+            line.strip()
+            for line in body.splitlines()
+            if line.strip() and Path(line.strip()).suffix.lower() in exts
+        ]
+        # Keep only paths still present as files, so they're retrievable from the index.
+        files = [f for f in files if (repo / f).is_file()]
+        if not files or len(files) > max_files_per_commit:
+            continue
+
+        cases.append(
+            EvalCase(
+                query=subject,
+                relevant_files=files,
+                id=sha[:12],
+                source="git",
+            )
+        )
+    return cases
+
+
+def _is_usable_query(subject: str, min_len: int) -> bool:
+    """True when ``subject`` is long enough and not a revert/merge/bump/release."""
+    # These commit kinds carry little localization signal.
+    low_signal = ("revert", "merge", "bump", "release ")
+    long_enough = len(subject) >= min_len
+    return long_enough and not subject.lower().startswith(low_signal)
+
+
+def _is_bot(author: str) -> bool:  # bot-marked suffix or a known automation account
+    low, known = author.lower(), {"dependabot", "github-actions", "renovate"}
+    return low.endswith(("[bot]", "-bot")) or low in known  # not any "bot" substring