From 1dc12b4260d8bf045fc79cca39614a58d4ef688a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 08:37:16 +0000 Subject: [PATCH 1/3] feat(eval): symbol-level dataset mining (non-saturated benchmark) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit File-level eval on a small repo saturates (hybrid hits Hit@10=1.0), so it could not measure retrieval improvements. Symbol-level localization — find the right function/class, not just file — has real headroom and discriminates. - build_from_git(symbols=True) / `coderag eval --build --level symbol`: maps each commit's changed lines (zero-context diff hunks) to the symbols they touch, parsed from the file content *at that commit* via CodeRAG's own chunker, then intersected with the symbols present at HEAD so every ground-truth symbol is retrievable from the index. Off by default. - Tests cover symbol extraction (only the changed function is reported) and the default-off behavior. Result (10 symbol-level cases, this repo): the benchmark stops saturating (Hit@10 ~0.5), and the previously-flat cross-encoder reranker now shows the predicted lift — R@1 0.183->0.283 (+55%), MRR 0.420->0.514, nDCG@10 0.369->0.448. This validates move #2 and confirms the file-level null result was a benchmark artifact. Documented in docs/eval.md and the strategy doc. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- coderag/eval/dataset.py | 90 ++++++++++++++++++++++++ coderag/surfaces/cli.py | 1 + docs/eval.md | 41 +++++++++-- docs/research/code-retrieval-strategy.md | 14 ++-- tests/test_eval.py | 50 +++++++++++++ 5 files changed, 183 insertions(+), 13 deletions(-) diff --git a/coderag/eval/dataset.py b/coderag/eval/dataset.py index 86b85c1..0626505 100644 --- a/coderag/eval/dataset.py +++ b/coderag/eval/dataset.py @@ -9,6 +9,7 @@ from __future__ import annotations import json +import re import subprocess from dataclasses import dataclass, field from pathlib import Path @@ -100,6 +101,7 @@ def build_from_git( max_files_per_commit: int = 5, min_query_len: int = 12, commit_scan_limit: int = 2000, + symbols: bool = False, ) -> List[EvalCase]: """Mine an eval dataset from a repo's commit history. @@ -107,6 +109,10 @@ def build_from_git( (a) match ``extensions`` and (b) still exist at HEAD become the relevant set — so every ground-truth file is actually present in the index built from HEAD. + With ``symbols=True``, the functions/classes/methods touched by the commit (that still + exist at HEAD) are also recorded as ``relevant_symbols``, enabling the much harder — + and less saturation-prone — symbol-level eval (``coderag eval --level symbol``). 
+ Filtering mirrors SweRank/Agentless dataset construction: skip merges, reverts, and bot/automated commits; drop commits that touch too many files (``max_files_per_commit`` — diffuse, weak signal) or none of the targeted extensions; and require a meaningful @@ -153,10 +159,16 @@ def build_from_git( if not files or len(files) > max_files_per_commit: continue + relevant_symbols: List[str] = [] + if symbols: + for f in files: + relevant_symbols.extend(_changed_symbols(repo, sha, f)) + cases.append( EvalCase( query=subject, relevant_files=files, + relevant_symbols=sorted(set(relevant_symbols)), id=sha[:12], source="git", ) @@ -175,3 +187,81 @@ def _is_usable_query(subject: str, min_len: int) -> bool: def _is_bot(author: str) -> bool: low = author.lower() return "bot" in low or low in {"dependabot", "github-actions", "renovate"} + + +# Unified-diff hunk header: @@ -old[,n] +new[,n] @@ — we only need the new-side range. +_HUNK = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@") + + +def _diff_new_lines(repo: Path, sha: str, file: str) -> set[int]: + """Line numbers (new side) touched by ``sha`` in ``file``, from a zero-context diff.""" + try: + diff = _git(repo, "show", "--unified=0", "--no-color", sha, "--", file) + except subprocess.CalledProcessError: + return set() + lines: set[int] = set() + for row in diff.splitlines(): + m = _HUNK.match(row) + if not m: + continue + start = int(m.group(1)) + count = int(m.group(2)) if m.group(2) is not None else 1 + # Pure deletions (count 0) still implicate the symbol around the anchor line. 
+ lines.update(range(start, start + max(count, 1))) + return lines + + +def _symbols_covering(text: str, language: str, lines: set[int]) -> set[str]: + """Named symbols in ``text`` whose line span overlaps any of ``lines``.""" + from coderag.chunking import chunk_file + from coderag.config import Config + + if not text.strip() or not lines: + return set() + found: set[str] = set() + for chunk in chunk_file(text, language, Config()): + if chunk.symbol and chunk.kind != "window": + if any(chunk.start_line <= ln <= chunk.end_line for ln in lines): + found.add(chunk.symbol) + return found + + +def _changed_symbols(repo: Path, sha: str, file: str) -> List[str]: + """Symbols a commit changed in ``file`` that still exist in the file at HEAD. + + The change is mapped against the file content *at that commit* (so diff line numbers + line up), then intersected with the symbols present at HEAD so every ground-truth + symbol is actually retrievable from the current index. + """ + from coderag.chunking.languages import detect_language + + language = detect_language(file) + if language is None: + return [] + changed_lines = _diff_new_lines(repo, sha, file) + if not changed_lines: + return [] + try: + at_commit = _git(repo, "show", f"{sha}:{file}") + except subprocess.CalledProcessError: + return [] + changed = _symbols_covering(at_commit, language, changed_lines) + if not changed: + return [] + head_text = (repo / file).read_text(encoding="utf-8", errors="replace") + head_symbols = _all_symbols(head_text, language) + return sorted(changed & head_symbols) + + +def _all_symbols(text: str, language: str) -> set[str]: + """All named symbols present in ``text`` (used as a HEAD-side existence check).""" + from coderag.chunking import chunk_file + from coderag.config import Config + + if not text.strip(): + return set() + return { + c.symbol + for c in chunk_file(text, language, Config()) + if c.symbol and c.kind != "window" + } diff --git a/coderag/surfaces/cli.py 
b/coderag/surfaces/cli.py index f5e9a01..6493c3b 100644 --- a/coderag/surfaces/cli.py +++ b/coderag/surfaces/cli.py @@ -110,6 +110,7 @@ def cmd_eval(args: argparse.Namespace) -> int: cfg.watched_dir, max_cases=args.max_cases, extensions=extensions_for(cfg.languages), + symbols=args.level == "symbol", ) out = args.dataset or "coderag-eval.jsonl" ev.save_dataset(cases, out) diff --git a/docs/eval.md b/docs/eval.md index db53254..c26723a 100644 --- a/docs/eval.md +++ b/docs/eval.md @@ -33,6 +33,10 @@ coderag eval --dataset coderag-eval.jsonl --compare # 4. Add the optional two-stage cross-encoder reranker (adds a hybrid+rerank row): coderag eval --dataset coderag-eval.jsonl --compare --rerank + +# 5. Harder, non-saturated: symbol-level (find the right function, not just file). +coderag eval --build --level symbol --dataset sym.jsonl # mines relevant_symbols too +coderag eval --dataset sym.jsonl --level symbol --compare --rerank ``` Reranking is opt-in at search time too: set `CODERAG_RERANK=1` (model via @@ -121,12 +125,37 @@ bge-small-en-v1.5 · hybrid+rerank 0.790 0.646 0.958 1.000 0.836 1.000 (`CODERAG_RERANK_MODEL=jinaai/jina-reranker-v2-base-multilingual` or `BAAI/bge-reranker-base`) is worth trying, but those are larger. -**Conclusion across moves #1 and #2:** the recurring blocker is that *this repo's benchmark -is too small and saturated to discriminate any retrieval improvement*. The feature is built, -tested, and opt-in, but **proving its value requires a larger, harder, non-saturated -benchmark** (a 1k+-file external repo and/or symbol-level + cross-file conceptual queries). -That is the true critical path for the "win the eval" objective — accuracy techniques can't -be validated until the benchmark has headroom. +**Conclusion across moves #1 and #2 (file level):** the recurring blocker was that *file-level +on this small repo is too saturated to discriminate any retrieval improvement*. 
The fix is a +harder benchmark — see the symbol-level results next, which resolve it. + +### Symbol-level: the non-saturated benchmark (and the reranker, validated) + +Build a symbol-level dataset (`coderag eval --build --level symbol`, or `build_from_git(..., +symbols=True)`) — the functions/classes a commit touched that still exist at HEAD — and score +with `--level symbol`. Finding the right *function* (not just file) is far harder, so the +benchmark stops saturating (Hit@10 ≈ 0.5 instead of 1.0). On 10 symbol-level cases from this +repo's history: + +``` +mode MRR R@1 R@5 R@10 nDCG@10 Hit@10 +bge-small-en-v1.5 · dense 0.400 0.183 0.292 0.317 0.327 0.400 +bge-small-en-v1.5 · bm25 0.417 0.183 0.317 0.342 0.345 0.500 +bge-small-en-v1.5 · hybrid 0.420 0.183 0.417 0.417 0.369 0.500 +bge-small-en-v1.5 · hybrid+rerank 0.514 0.283 0.392 0.442 0.448 0.600 +``` + +**With headroom, the reranker delivers exactly the predicted lift:** R@1 0.183 → 0.283 +(+55%), MRR 0.420 → 0.514, nDCG@10 0.369 → 0.448, Hit@10 0.500 → 0.600 — improvement on +every metric except R@5 (0.417 → 0.392), from the same off-the-shelf `ms-marco-MiniLM` that looked useless +at the saturated file level. This both validates move #2 and confirms the saturation +diagnosis: the file-level null result was a benchmark artifact, not a property of the +technique. (Caveat: 10 cases is small/noisy; the direction is strong and consistent, but +widen the dataset before quoting exact numbers.) + +**Net guidance:** evaluate retrieval changes at **symbol level** — it's where the signal is. +Re-run the embedder comparison there too (a code-aware reranker like `bge-reranker-base` is +the next thing to test now that there's a benchmark that can measure it). 
## Dataset format diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md index 2c0d6f8..1884012 100644 --- a/docs/research/code-retrieval-strategy.md +++ b/docs/research/code-retrieval-strategy.md @@ -90,14 +90,14 @@ under OpenRAIL++-M, 68.53 CoIR). [H] Voyage-code-3 is API-only — reference poi --- -> **Update (measured & built).** The optional two-stage reranker is implemented +> **Update (built & VALIDATED).** The optional two-stage reranker is implemented > (`config.rerank`, `coderag/retrieval/rerank.py`, fastembed `TextCrossEncoder`, zero new -> deps) and tested. On this repo's saturated 24-case set it gave **no lift / a marginal -> regression** with the generic `ms-marco-MiniLM` model — consistent with the caveat below -> that small-cross-encoder *code* lift is inferred, not measured, and with the benchmark -> having no headroom (hybrid already R@5≈1.0). See [docs/eval.md](../eval.md). The blocker is -> now clearly the **benchmark**, not the technique: it must get bigger/harder before #1 or #2 -> can show their value. A code-aware reranker should be re-tested there. +> deps) and tested. It showed **no lift at the saturated file level**, but once the benchmark +> was made non-saturated via **symbol-level** ground truth (`build_from_git(symbols=True)`), +> the same off-the-shelf `ms-marco-MiniLM` reranker delivered the predicted lift: **R@1 +> 0.183 → 0.283 (+55%), MRR 0.420 → 0.514, nDCG@10 0.369 → 0.448**. The earlier null result +> was a benchmark artifact, not a property of the technique. Evaluate at symbol level. See +> [docs/eval.md](../eval.md). ## 2. 
Add a local cross-encoder reranker (highest-ROI bolt-on) diff --git a/tests/test_eval.py b/tests/test_eval.py index baa6d26..51ed135 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -216,3 +216,53 @@ def git(*args: str) -> None: git("commit", "-q", "-m", "wip") # too short -> filtered out assert build_from_git(repo, max_cases=10, min_query_len=12) == [] + + +def test_build_from_git_extracts_changed_symbols(tmp_path: Path): + repo = tmp_path / "repo" + repo.mkdir() + + def git(*args: str) -> None: + subprocess.run(["git", "-C", str(repo), *args], check=True, capture_output=True) + + git("init", "-q") + git("config", "user.email", "t@example.com") + git("config", "user.name", "Tester") + git("config", "commit.gpgsign", "false") + write( + repo / "m.py", + "def alpha():\n return 1\n\n\ndef beta():\n return 2\n", + ) + git("add", "-A") + git("commit", "-q", "-m", "initial two functions") + # Change only beta's body -> only beta should be reported as changed. + write( + repo / "m.py", + "def alpha():\n return 1\n\n\ndef beta():\n return 22\n", + ) + git("add", "-A") + git("commit", "-q", "-m", "tweak beta return value") + + cases = build_from_git(repo, max_cases=10, symbols=True, min_query_len=5) + latest = next(c for c in cases if c.id and c.query.startswith("tweak beta")) + assert latest.relevant_files == ["m.py"] + assert latest.relevant_symbols == ["beta"] # alpha untouched + + +def test_build_from_git_symbols_off_by_default(tmp_path: Path): + repo = tmp_path / "repo" + repo.mkdir() + + def git(*args: str) -> None: + subprocess.run(["git", "-C", str(repo), *args], check=True, capture_output=True) + + git("init", "-q") + git("config", "user.email", "t@example.com") + git("config", "user.name", "Tester") + git("config", "commit.gpgsign", "false") + write(repo / "m.py", "def alpha():\n return 1\n") + git("add", "-A") + git("commit", "-q", "-m", "add alpha function") + + cases = build_from_git(repo, max_cases=10) # symbols=False + assert 
cases[0].relevant_symbols == [] From e7d17d4caae35d4863e732bd028c8ad6af81af97 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 09:42:09 +0000 Subject: [PATCH 2/3] feat(eval): reranker registry, multi-reranker bench, curated symbol set Tooling for the symbol-level model comparisons: - RECOMMENDED_RERANKERS registry + `coderag eval --list-models` now lists local cross-encoder rerankers (MiniLM, bge-reranker-base, jina-reranker-v2) with size/notes, so code-aware rerankers are discoverable. - scripts/bench_embedders.py --rerank-models: score one hybrid+rerank row per named reranker, to compare reranker models on a fixed index. - coderag/eval/datasets/coderag_self_symbols.jsonl: 22 curated natural-language -> function/method cases (verified symbol names) for a trustworthy symbol-level eval, less noisy than the git-mined set. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- coderag/embeddings/models.py | 38 +++++++++++++++++++ .../eval/datasets/coderag_self_symbols.jsonl | 22 +++++++++++ scripts/bench_embedders.py | 29 ++++++++++++-- tests/test_models_registry.py | 18 ++++++++- 4 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 coderag/eval/datasets/coderag_self_symbols.jsonl diff --git a/coderag/embeddings/models.py b/coderag/embeddings/models.py index 429d17a..ca5ec9d 100644 --- a/coderag/embeddings/models.py +++ b/coderag/embeddings/models.py @@ -87,4 +87,42 @@ def format_models() -> str: lines.append(f"{head} {r[4]}") if i == 0: lines.append(" ".join("-" * w for w in widths) + " " + "-" * len(r[4])) + lines.append("") + lines.append("Rerankers (set CODERAG_RERANK=1, CODERAG_RERANK_MODEL=):") + rwidth = max(len(rr.name) for rr in RECOMMENDED_RERANKERS) + for rr in RECOMMENDED_RERANKERS: + lines.append(f" {rr.name.ljust(rwidth)} {f'{rr.size_gb:g}GB':>8} {rr.note}") return "\n".join(lines) + + +@dataclass(frozen=True) +class RerankerInfo: + name: str # fastembed 
TextCrossEncoder model id (pass via CODERAG_RERANK_MODEL) + size_gb: float + note: str + + +# Local cross-encoder rerankers loadable via fastembed's TextCrossEncoder. The MiniLM +# pair is web-trained (small/fast); bge/jina are larger and worth testing for code. +RECOMMENDED_RERANKERS: Tuple[RerankerInfo, ...] = ( + RerankerInfo( + "Xenova/ms-marco-MiniLM-L-12-v2", + 0.12, + "Default. Tiny/fast (~30ms CPU); web-trained, not code-specific.", + ), + RerankerInfo( + "Xenova/ms-marco-MiniLM-L-6-v2", + 0.08, + "Smallest/fastest MiniLM; slightly weaker than L-12.", + ), + RerankerInfo( + "BAAI/bge-reranker-base", + 1.04, + "Larger, stronger general reranker; multilingual incl. code-ish text.", + ), + RerankerInfo( + "jinaai/jina-reranker-v2-base-multilingual", + 1.11, + "Strong multilingual reranker with code in its training mix.", + ), +) diff --git a/coderag/eval/datasets/coderag_self_symbols.jsonl b/coderag/eval/datasets/coderag_self_symbols.jsonl new file mode 100644 index 0000000..4c484c3 --- /dev/null +++ b/coderag/eval/datasets/coderag_self_symbols.jsonl @@ -0,0 +1,22 @@ +{"query": "where is reciprocal rank fusion implemented", "relevant_files": ["coderag/retrieval/fusion.py"], "relevant_symbols": ["reciprocal_rank_fusion"], "source": "curated"} +{"query": "how are dense and lexical search results combined into one ranking", "relevant_files": ["coderag/retrieval/search.py"], "relevant_symbols": ["HybridSearcher.search"], "source": "curated"} +{"query": "where are a changed file's old chunks removed before new ones are added", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer._index_file"], "source": "curated"} +{"query": "how is the FAISS index rebuilt from the SQLite store", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.rebuild_from_store"], "source": "curated"} +{"query": "where does the vector index choose between flat and IVF", "relevant_files": ["coderag/store/vector_index.py"], 
"relevant_symbols": ["FaissVectorIndex._choose_kind"], "source": "curated"} +{"query": "how are query vectors searched in the FAISS index", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.search"], "source": "curated"} +{"query": "how is the number of IVF clusters derived from corpus size", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["_derive_nlist"], "source": "curated"} +{"query": "where is BM25 keyword search over the full text index", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.fts_search"], "source": "curated"} +{"query": "how does the store detect a model or embedding dimension change on startup", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.bootstrap"], "source": "curated"} +{"query": "where are search results hydrated from the database by chunk id", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.hydrate"], "source": "curated"} +{"query": "how are full text search query strings sanitized", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["_sanitize_fts"], "source": "curated"} +{"query": "how does the filesystem watcher start watching and applying changes", "relevant_files": ["coderag/watch.py"], "relevant_symbols": ["watch"], "source": "curated"} +{"query": "where are python functions and classes extracted as symbol spans", "relevant_files": ["coderag/chunking/python_ast.py"], "relevant_symbols": ["extract_spans"], "source": "curated"} +{"query": "how is an LLM answer streamed over retrieved code chunks", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["stream_answer"], "source": "curated"} +{"query": "where is the prompt context assembled from retrieved chunks", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["build_context"], "source": "curated"} +{"query": "how does the facade run a hybrid search query", 
"relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.search"], "source": "curated"} +{"query": "where are file contents served safely for only indexed files", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.get_file"], "source": "curated"} +{"query": "how is recall at k computed for retrieval evaluation", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["recall_at_k"], "source": "curated"} +{"query": "where is normalized discounted cumulative gain computed", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["ndcg_at_k"], "source": "curated"} +{"query": "how does the cross-encoder reranker score documents against the query", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["CrossEncoderReranker.rerank"], "source": "curated"} +{"query": "where is the reranker constructed from configuration", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["get_reranker"], "source": "curated"} +{"query": "how does incremental indexing orchestrate hashing and embedding", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer.index"], "source": "curated"} diff --git a/scripts/bench_embedders.py b/scripts/bench_embedders.py index 5b66c88..9872eff 100644 --- a/scripts/bench_embedders.py +++ b/scripts/bench_embedders.py @@ -44,7 +44,12 @@ def main() -> int: ap.add_argument( "--rerank", action="store_true", - help="Also score a hybrid+rerank row per model (local cross-encoder).", + help="Also score a hybrid+rerank row per model (default reranker).", + ) + ap.add_argument( + "--rerank-models", + default="", + help="Comma-separated reranker ids; one hybrid+rerank row per reranker.", ) args = ap.parse_args() @@ -52,6 +57,7 @@ def main() -> int: cases = load_dataset(args.dataset) ks = tuple(int(k) for k in args.ks.split(",")) models = [m.strip() for m in args.models.split(",") if m.strip()] + rerank_models = [m.strip() for m in args.rerank_models.split(",") if 
m.strip()] rows: list[EvalResult] = [] for model in models: @@ -66,16 +72,31 @@ def main() -> int: cr = CodeRAG(cfg) stats = cr.index() print(f" {stats.total_files} files / {stats.total_chunks} chunks") - reranker = None + + # Baseline modes (dense / bm25 / hybrid), plus default reranker if --rerank. + default_reranker = None if args.rerank: from coderag.retrieval.rerank import get_reranker - reranker = get_reranker(cfg.with_overrides(rerank=True)) + default_reranker = get_reranker(cfg.with_overrides(rerank=True)) for r in compare_modes( - cr, cases, ks=ks, level=args.level, reranker=reranker + cr, cases, ks=ks, level=args.level, reranker=default_reranker ): r.label = _label(model, r.label) rows.append(r) + + # One extra hybrid+rerank row per explicitly named reranker. + for rm in rerank_models: + from coderag.retrieval.rerank import CrossEncoderReranker + + print(f" reranking with {rm} ...") + reranker = CrossEncoderReranker(rm, cache_dir=cfg.cache_dir) + res = compare_modes( + cr, cases, ks=ks, level=args.level, reranker=reranker, modes=() + ) + for r in res: + r.label = _label(model, f"rerank:{rm.split('/')[-1]}") + rows.append(r) cr.close() from coderag.eval.harness import format_table diff --git a/tests/test_models_registry.py b/tests/test_models_registry.py index 5fff29b..df864b5 100644 --- a/tests/test_models_registry.py +++ b/tests/test_models_registry.py @@ -2,7 +2,11 @@ from __future__ import annotations -from coderag.embeddings.models import RECOMMENDED, format_models +from coderag.embeddings.models import ( + RECOMMENDED, + RECOMMENDED_RERANKERS, + format_models, +) def test_registry_is_nonempty_and_well_formed(): @@ -19,7 +23,19 @@ def test_default_model_is_listed(): assert any(m.name == "BAAI/bge-small-en-v1.5" for m in RECOMMENDED) +def test_reranker_registry_well_formed(): + assert RECOMMENDED_RERANKERS + for r in RECOMMENDED_RERANKERS: + assert r.name and "/" in r.name + assert r.size_gb > 0 and r.note + # The default reranker model must be listed. 
+ assert any( + r.name == "Xenova/ms-marco-MiniLM-L-12-v2" for r in RECOMMENDED_RERANKERS + ) + + def test_format_models_renders_table(): out = format_models() assert "model" in out and "code?" in out assert "jina-embeddings-v2-base-code" in out + assert "Rerankers" in out and "bge-reranker-base" in out From 45265333f49672502a7401315f4813d3cf282b8e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 13:11:41 +0000 Subject: [PATCH 3/3] docs(eval): symbol-level model comparison results and findings Measured on the curated 22-case symbol dataset (bge-small vs jina-code, with the ms-marco reranker): 1. The code-specific jina-code-v2 does NOT beat bge-small on NL->symbol queries (dense MRR 0.483 vs 0.675); a good general text embedder wins for natural-language "where is X" retrieval. 2. Equal-weight hybrid is not universally better: for the strong bge-small retriever, dense alone (0.675) beats 1:1 hybrid (0.573) because weak BM25 drags it down via RRF. Fusion weighting should be query-type-aware (dense-up for NL, BM25-up for identifiers) -- the biggest lever found. 3. Reranking lifts top-1 precision (R@1 0.364->0.409, +12%), consistent with the git-mined result. Documents these in docs/eval.md and elevates query-type fusion weighting in the strategy doc. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- docs/eval.md | 44 ++++++++++++++++++++++-- docs/research/code-retrieval-strategy.md | 7 ++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/docs/eval.md b/docs/eval.md index c26723a..396e9a8 100644 --- a/docs/eval.md +++ b/docs/eval.md @@ -154,8 +154,48 @@ technique. (Caveat: 10 cases is small/noisy; the direction is strong and consist widen the dataset before quoting exact numbers.) **Net guidance:** evaluate retrieval changes at **symbol level** — it's where the signal is. 
-Re-run the embedder comparison there too (a code-aware reranker like `bge-reranker-base` is -the next thing to test now that there's a benchmark that can measure it). + +### Symbol-level model comparison (curated 22-case set) + +Re-run on `coderag/eval/datasets/coderag_self_symbols.jsonl` (22 hand-verified +natural-language → function/method cases, much less noisy than the git-mined set): + +``` +mode MRR R@1 R@5 R@10 nDCG@10 Hit@10 +bge-small-en-v1.5 · dense 0.675 0.591 0.818 0.864 0.720 0.864 +bge-small-en-v1.5 · bm25 0.427 0.318 0.636 0.727 0.498 0.727 +bge-small-en-v1.5 · hybrid 0.573 0.364 0.864 0.864 0.647 0.864 +bge-small-en-v1.5 · hybrid+rerank 0.580 0.409 0.864 0.864 0.651 0.864 +jina-embeddings-v2-base-code · dense 0.483 0.318 0.682 0.773 0.554 0.773 +jina-embeddings-v2-base-code · hybrid 0.604 0.455 0.818 0.864 0.668 0.864 +``` + +Three findings, all actionable: + +1. **The code-specific model does not win on NL→symbol queries.** `bge-small · dense` + (MRR 0.675, R@1 0.591) clearly beats `jina-code · dense` (0.483 / 0.318). jina-v2-base-code + is older and tuned more for code↔code; for natural-language "where is X" queries a good + general text embedder is stronger. (jina-code's *hybrid* is competitive only because BM25 + props up its weaker dense signal.) +2. **Equal-weight hybrid is not universally better.** For the strong `bge-small` retriever, + `dense` alone (0.675) *beats* `hybrid` (0.573): on NL queries BM25 is weak (0.427) and + equal-weight RRF drags the strong dense ranking down. For the weaker jina-code, BM25 helps + (hybrid 0.604 > dense 0.483). **Takeaway: fusion weights should depend on query type** — + weight dense up for natural-language queries, BM25 up for exact-identifier/code queries + (strategy §3). A fixed 1:1 is a compromise, not an optimum. +3. 
**Reranking improves top-1 precision.** `hybrid+rerank` lifts R@1 0.364 → 0.409 (+12%) over + hybrid with the tiny ms-marco model — consistent with the git-mined result (+55% on 10 + noisier cases). The reranker reliably sharpens the top of the list; it operates on the + hybrid pool, so it can't fully recover the dense-vs-hybrid gap above (reranking a + dense-weighted pool is the natural follow-up). + +Larger code-aware rerankers (`bge-reranker-base`, `jina-reranker-v2`) are registered +(`coderag eval --list-models`) but are ~1 GB and slow to rerank on CPU — test them on a GPU +or a smaller candidate pool. The MiniLM default is the pragmatic local choice. + +**Bottom line for "win the eval":** the biggest lever found here is **query-type-aware fusion +weighting** (finding 2), then **reranking for top-1** (finding 3) — not a bigger embedding +model (finding 1). Validate these on a larger external repo next. ## Dataset format diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md index 1884012..9721dd8 100644 --- a/docs/research/code-retrieval-strategy.md +++ b/docs/research/code-retrieval-strategy.md @@ -131,6 +131,13 @@ bolt-on. Treat as a later experiment, not a v1 move. ## 3. Tune and route the hybrid fusion you already have +> **Update (measured — now the #1 lever).** Symbol-level eval on this repo showed equal-weight +> hybrid can *lose* to dense alone on NL queries: `bge-small` dense MRR 0.675 vs hybrid 0.573, +> because weak BM25 (0.427) drags the strong dense ranking down via 1:1 RRF. For the weaker +> jina-code, BM25 *helps* (hybrid 0.604 > dense 0.483). So fusion weighting should be +> **query-type-aware** (dense-up for NL, BM25-up for identifiers) rather than fixed 1:1 — this +> was the single biggest lever found, ahead of a bigger embedder. See [docs/eval.md](../eval.md). + CodeRAG already does dense + BM25 + RRF — the literature says that's the right foundation; the wins are in **routing and tuning**: