Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions coderag/embeddings/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,42 @@ def format_models() -> str:
lines.append(f"{head} {r[4]}")
if i == 0:
lines.append(" ".join("-" * w for w in widths) + " " + "-" * len(r[4]))
lines.append("")
lines.append("Rerankers (set CODERAG_RERANK=1, CODERAG_RERANK_MODEL=<name>):")
rwidth = max(len(rr.name) for rr in RECOMMENDED_RERANKERS)
for rr in RECOMMENDED_RERANKERS:
lines.append(f" {rr.name.ljust(rwidth)} {f'{rr.size_gb:g}GB':>8} {rr.note}")
return "\n".join(lines)


@dataclass(frozen=True)
class RerankerInfo:
    """Immutable registry entry describing one recommended reranker model."""

    # fastembed TextCrossEncoder model id (pass via CODERAG_RERANK_MODEL)
    name: str
    # Size in GB; the model listing prints it with a "GB" suffix.
    size_gb: float
    # One-line description shown next to the model in the listing.
    note: str


# Cross-encoder rerankers that run locally through fastembed's TextCrossEncoder.
# The two MiniLM entries are web-trained and small/fast; the bge and jina entries
# are larger and are the ones worth trying on code.
RECOMMENDED_RERANKERS: Tuple[RerankerInfo, ...] = (
    RerankerInfo(
        name="Xenova/ms-marco-MiniLM-L-12-v2",
        size_gb=0.12,
        note="Default. Tiny/fast (~30ms CPU); web-trained, not code-specific.",
    ),
    RerankerInfo(
        name="Xenova/ms-marco-MiniLM-L-6-v2",
        size_gb=0.08,
        note="Smallest/fastest MiniLM; slightly weaker than L-12.",
    ),
    RerankerInfo(
        name="BAAI/bge-reranker-base",
        size_gb=1.04,
        note="Larger, stronger general reranker; multilingual incl. code-ish text.",
    ),
    RerankerInfo(
        name="jinaai/jina-reranker-v2-base-multilingual",
        size_gb=1.11,
        note="Strong multilingual reranker with code in its training mix.",
    ),
)
90 changes: 90 additions & 0 deletions coderag/eval/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from __future__ import annotations

import json
import re
import subprocess
from dataclasses import dataclass, field
from pathlib import Path
Expand Down Expand Up @@ -100,13 +101,18 @@ def build_from_git(
max_files_per_commit: int = 5,
min_query_len: int = 12,
commit_scan_limit: int = 2000,
symbols: bool = False,
) -> List[EvalCase]:
"""Mine an eval dataset from a repo's commit history.

For each non-merge commit, the subject line is the query and the changed files that
(a) match ``extensions`` and (b) still exist at HEAD become the relevant set — so
every ground-truth file is actually present in the index built from HEAD.

With ``symbols=True``, the functions/classes/methods touched by the commit (that still
exist at HEAD) are also recorded as ``relevant_symbols``, enabling the much harder —
and less saturation-prone — symbol-level eval (``coderag eval --level symbol``).

Filtering mirrors SweRank/Agentless dataset construction: skip merges, reverts, and
bot/automated commits; drop commits that touch too many files (``max_files_per_commit``
— diffuse, weak signal) or none of the targeted extensions; and require a meaningful
Expand Down Expand Up @@ -153,10 +159,16 @@ def build_from_git(
if not files or len(files) > max_files_per_commit:
continue

relevant_symbols: List[str] = []
if symbols:
for f in files:
relevant_symbols.extend(_changed_symbols(repo, sha, f))

cases.append(
EvalCase(
query=subject,
relevant_files=files,
relevant_symbols=sorted(set(relevant_symbols)),
id=sha[:12],
source="git",
)
Expand All @@ -175,3 +187,81 @@ def _is_usable_query(subject: str, min_len: int) -> bool:
def _is_bot(author: str) -> bool:
low = author.lower()
return "bot" in low or low in {"dependabot", "github-actions", "renovate"}


# Unified-diff hunk header: @@ -old[,n] +new[,n] @@ — only the new-side start and
# (optional) length are captured.
_HUNK = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")


def _diff_new_lines(repo: Path, sha: str, file: str) -> set[int]:
    """Collect the new-side line numbers that ``sha`` touches in ``file``.

    The diff is requested with zero context lines, so each hunk header describes
    exactly the changed range. If the git call fails, an empty set is returned.
    """
    try:
        patch = _git(repo, "show", "--unified=0", "--no-color", sha, "--", file)
    except subprocess.CalledProcessError:
        return set()

    touched: set[int] = set()
    # Keep only the rows that are hunk headers; every other diff row is ignored.
    for header in filter(None, map(_HUNK.match, patch.splitlines())):
        first = int(header.group(1))
        # An omitted length in a hunk header means a single line.
        span = 1 if header.group(2) is None else int(header.group(2))
        # A pure deletion has length 0; still record the anchor line so the
        # symbol surrounding the deletion is implicated.
        touched.update(range(first, first + max(span, 1)))
    return touched


def _symbols_covering(text: str, language: str, lines: set[int]) -> set[str]:
    """Return the named symbols in ``text`` whose span contains any of ``lines``.

    ``text`` is chunked with a default ``Config``. Chunks without a symbol name
    and chunks of kind ``"window"`` are ignored. Blank text or an empty line set
    yields an empty result.
    """
    from coderag.chunking import chunk_file
    from coderag.config import Config

    if not (text.strip() and lines):
        return set()

    def _is_touched(chunk) -> bool:
        # Inclusive overlap test between the chunk's span and the changed lines.
        return any(chunk.start_line <= ln <= chunk.end_line for ln in lines)

    return {
        chunk.symbol
        for chunk in chunk_file(text, language, Config())
        if chunk.symbol and chunk.kind != "window" and _is_touched(chunk)
    }


def _changed_symbols(repo: Path, sha: str, file: str) -> List[str]:
    """Symbols a commit changed in ``file`` that still exist in the current file.

    The change is mapped against the file content *at that commit* (so diff line
    numbers line up), then intersected with the symbols found in the checked-out
    copy of the file (``repo / file``), so every ground-truth symbol is actually
    retrievable from the current index.

    This is best-effort: an unrecognised language, an empty diff, a failing git
    call, or an unreadable checked-out file all yield ``[]`` rather than aborting
    the dataset build.

    Returns:
        The surviving symbol names, sorted.
    """
    from coderag.chunking.languages import detect_language

    language = detect_language(file)
    if language is None:
        return []
    changed_lines = _diff_new_lines(repo, sha, file)
    if not changed_lines:
        return []
    try:
        at_commit = _git(repo, "show", f"{sha}:{file}")
    except subprocess.CalledProcessError:
        # e.g. the path does not exist in that commit's tree.
        return []
    changed = _symbols_covering(at_commit, language, changed_lines)
    if not changed:
        return []
    try:
        head_text = (repo / file).read_text(encoding="utf-8", errors="replace")
    except OSError:
        # The file vanished or is unreadable in the working tree: nothing can be
        # confirmed as still present, so contribute no symbols (consistent with
        # how git failures are handled above).
        return []
    head_symbols = _all_symbols(head_text, language)
    return sorted(changed & head_symbols)


def _all_symbols(text: str, language: str) -> set[str]:
    """Return every named symbol found in ``text``.

    Used as the HEAD-side existence check. Chunks without a symbol name and
    chunks of kind ``"window"`` are skipped; blank text yields an empty set.
    """
    from coderag.chunking import chunk_file
    from coderag.config import Config

    names: set[str] = set()
    if not text.strip():
        return names
    for chunk in chunk_file(text, language, Config()):
        if not chunk.symbol:
            continue
        if chunk.kind == "window":
            continue
        names.add(chunk.symbol)
    return names
22 changes: 22 additions & 0 deletions coderag/eval/datasets/coderag_self_symbols.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{"query": "where is reciprocal rank fusion implemented", "relevant_files": ["coderag/retrieval/fusion.py"], "relevant_symbols": ["reciprocal_rank_fusion"], "source": "curated"}
{"query": "how are dense and lexical search results combined into one ranking", "relevant_files": ["coderag/retrieval/search.py"], "relevant_symbols": ["HybridSearcher.search"], "source": "curated"}
{"query": "where are a changed file's old chunks removed before new ones are added", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer._index_file"], "source": "curated"}
{"query": "how is the FAISS index rebuilt from the SQLite store", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.rebuild_from_store"], "source": "curated"}
{"query": "where does the vector index choose between flat and IVF", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex._choose_kind"], "source": "curated"}
{"query": "how are query vectors searched in the FAISS index", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.search"], "source": "curated"}
{"query": "how is the number of IVF clusters derived from corpus size", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["_derive_nlist"], "source": "curated"}
{"query": "where is BM25 keyword search over the full text index", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.fts_search"], "source": "curated"}
{"query": "how does the store detect a model or embedding dimension change on startup", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.bootstrap"], "source": "curated"}
{"query": "where are search results hydrated from the database by chunk id", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.hydrate"], "source": "curated"}
{"query": "how are full text search query strings sanitized", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["_sanitize_fts"], "source": "curated"}
{"query": "how does the filesystem watcher start watching and applying changes", "relevant_files": ["coderag/watch.py"], "relevant_symbols": ["watch"], "source": "curated"}
{"query": "where are python functions and classes extracted as symbol spans", "relevant_files": ["coderag/chunking/python_ast.py"], "relevant_symbols": ["extract_spans"], "source": "curated"}
{"query": "how is an LLM answer streamed over retrieved code chunks", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["stream_answer"], "source": "curated"}
{"query": "where is the prompt context assembled from retrieved chunks", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["build_context"], "source": "curated"}
{"query": "how does the facade run a hybrid search query", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.search"], "source": "curated"}
{"query": "where are file contents served safely for only indexed files", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.get_file"], "source": "curated"}
{"query": "how is recall at k computed for retrieval evaluation", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["recall_at_k"], "source": "curated"}
{"query": "where is normalized discounted cumulative gain computed", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["ndcg_at_k"], "source": "curated"}
{"query": "how does the cross-encoder reranker score documents against the query", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["CrossEncoderReranker.rerank"], "source": "curated"}
{"query": "where is the reranker constructed from configuration", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["get_reranker"], "source": "curated"}
{"query": "how does incremental indexing orchestrate hashing and embedding", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer.index"], "source": "curated"}
1 change: 1 addition & 0 deletions coderag/surfaces/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def cmd_eval(args: argparse.Namespace) -> int:
cfg.watched_dir,
max_cases=args.max_cases,
extensions=extensions_for(cfg.languages),
symbols=args.level == "symbol",
)
out = args.dataset or "coderag-eval.jsonl"
ev.save_dataset(cases, out)
Expand Down
81 changes: 75 additions & 6 deletions docs/eval.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ coderag eval --dataset coderag-eval.jsonl --compare

# 4. Add the optional two-stage cross-encoder reranker (adds a hybrid+rerank row):
coderag eval --dataset coderag-eval.jsonl --compare --rerank

# 5. Harder, non-saturated: symbol-level (find the right function, not just file).
coderag eval --build --level symbol --dataset sym.jsonl # mines relevant_symbols too
coderag eval --dataset sym.jsonl --level symbol --compare --rerank
```

Reranking is opt-in at search time too: set `CODERAG_RERANK=1` (model via
Expand Down Expand Up @@ -121,12 +125,77 @@ bge-small-en-v1.5 · hybrid+rerank 0.790 0.646 0.958 1.000 0.836 1.000
(`CODERAG_RERANK_MODEL=jinaai/jina-reranker-v2-base-multilingual` or
`BAAI/bge-reranker-base`) is worth trying, but those are larger.

**Conclusion across moves #1 and #2:** the recurring blocker is that *this repo's benchmark
is too small and saturated to discriminate any retrieval improvement*. The feature is built,
tested, and opt-in, but **proving its value requires a larger, harder, non-saturated
benchmark** (a 1k+-file external repo and/or symbol-level + cross-file conceptual queries).
That is the true critical path for the "win the eval" objective — accuracy techniques can't
be validated until the benchmark has headroom.
**Conclusion across moves #1 and #2 (file level):** the recurring blocker was that *file-level
on this small repo is too saturated to discriminate any retrieval improvement*. The fix is a
harder benchmark — see the symbol-level results next, which resolve it.

### Symbol-level: the non-saturated benchmark (and the reranker, validated)

Build a symbol-level dataset (`coderag eval --build --level symbol`, or `build_from_git(...,
symbols=True)`) — the functions/classes a commit touched that still exist at HEAD — and score
with `--level symbol`. Finding the right *function* (not just file) is far harder, so the
benchmark stops saturating (Hit@10 ≈ 0.5 instead of 1.0). On 10 symbol-level cases from this
repo's history:

```
mode MRR R@1 R@5 R@10 nDCG@10 Hit@10
bge-small-en-v1.5 · dense 0.400 0.183 0.292 0.317 0.327 0.400
bge-small-en-v1.5 · bm25 0.417 0.183 0.317 0.342 0.345 0.500
bge-small-en-v1.5 · hybrid 0.420 0.183 0.417 0.417 0.369 0.500
bge-small-en-v1.5 · hybrid+rerank 0.514 0.283 0.392 0.442 0.448 0.600
```

**With headroom, the reranker delivers the predicted lift:** R@1 0.183 → 0.283
(+55%), MRR 0.420 → 0.514, nDCG@10 0.369 → 0.448, Hit@10 0.500 → 0.600 — improvement on
every top-of-list metric except R@5, which dips slightly (0.417 → 0.392), from the same
off-the-shelf `ms-marco-MiniLM` that looked useless at the saturated file level. This both
validates move #2 and confirms the saturation diagnosis: the file-level null result was a
benchmark artifact, not a property of the technique. (Caveat: 10 cases is small/noisy; the
direction is strong and mostly consistent, but widen the dataset before quoting exact
numbers.)

**Net guidance:** evaluate retrieval changes at **symbol level** — it's where the signal is.

### Symbol-level model comparison (curated 22-case set)

Re-run on `coderag/eval/datasets/coderag_self_symbols.jsonl` (22 hand-verified
natural-language → function/method cases, much less noisy than the git-mined set):

```
mode MRR R@1 R@5 R@10 nDCG@10 Hit@10
bge-small-en-v1.5 · dense 0.675 0.591 0.818 0.864 0.720 0.864
bge-small-en-v1.5 · bm25 0.427 0.318 0.636 0.727 0.498 0.727
bge-small-en-v1.5 · hybrid 0.573 0.364 0.864 0.864 0.647 0.864
bge-small-en-v1.5 · hybrid+rerank 0.580 0.409 0.864 0.864 0.651 0.864
jina-embeddings-v2-base-code · dense 0.483 0.318 0.682 0.773 0.554 0.773
jina-embeddings-v2-base-code · hybrid 0.604 0.455 0.818 0.864 0.668 0.864
```

Three findings, all actionable:

1. **The code-specific model does not win on NL→symbol queries.** `bge-small · dense`
(MRR 0.675, R@1 0.591) clearly beats `jina-code · dense` (0.483 / 0.318). jina-v2-base-code
is older and tuned more for code↔code; for natural-language "where is X" queries a good
general text embedder is stronger. (jina-code's *hybrid* is competitive only because BM25
props up its weaker dense signal.)
2. **Equal-weight hybrid is not universally better.** For the strong `bge-small` retriever,
`dense` alone (0.675) *beats* `hybrid` (0.573): on NL queries BM25 is weak (0.427) and
equal-weight RRF drags the strong dense ranking down. For the weaker jina-code, BM25 helps
(hybrid 0.604 > dense 0.483). **Takeaway: fusion weights should depend on query type** —
weight dense up for natural-language queries, BM25 up for exact-identifier/code queries
(strategy §3). A fixed 1:1 is a compromise, not an optimum.
3. **Reranking improves top-1 precision.** `hybrid+rerank` lifts R@1 0.364 → 0.409 (+12%,
i.e. one more of the 22 cases answered at rank 1) over hybrid with the tiny ms-marco
model — consistent in direction with the git-mined result (+55% on 10 noisier cases). The
reranker sharpens the top of the list in both runs; it operates on the hybrid pool, so it
can't fully recover the dense-vs-hybrid gap above (reranking a dense-weighted pool is the
natural follow-up).

Larger code-aware rerankers (`bge-reranker-base`, `jina-reranker-v2`) are registered
(`coderag eval --list-models`) but are ~1 GB and slow to rerank on CPU — test them on a GPU
or a smaller candidate pool. The MiniLM default is the pragmatic local choice.

**Bottom line for "win the eval":** the biggest lever found here is **query-type-aware fusion
weighting** (finding 2), then **reranking for top-1** (finding 3) — not a bigger embedding
model (finding 1). Validate these on a larger external repo next.

## Dataset format

Expand Down
21 changes: 14 additions & 7 deletions docs/research/code-retrieval-strategy.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ under OpenRAIL++-M, 68.53 CoIR). [H] Voyage-code-3 is API-only — reference poi

---

> **Update (measured & built).** The optional two-stage reranker is implemented
> **Update (built & VALIDATED).** The optional two-stage reranker is implemented
> (`config.rerank`, `coderag/retrieval/rerank.py`, fastembed `TextCrossEncoder`, zero new
> deps) and tested. On this repo's saturated 24-case set it gave **no lift / a marginal
> regression** with the generic `ms-marco-MiniLM` model — consistent with the caveat below
> that small-cross-encoder *code* lift is inferred, not measured, and with the benchmark
> having no headroom (hybrid already R@5≈1.0). See [docs/eval.md](../eval.md). The blocker is
> now clearly the **benchmark**, not the technique: it must get bigger/harder before #1 or #2
> can show their value. A code-aware reranker should be re-tested there.
> deps) and tested. It showed **no lift at the saturated file level**, but once the benchmark
> was made non-saturated via **symbol-level** ground truth (`build_from_git(symbols=True)`),
> the same off-the-shelf `ms-marco-MiniLM` reranker delivered the predicted lift: **R@1
> 0.183 → 0.283 (+55%), MRR 0.420 → 0.514, nDCG@10 0.369 → 0.448**. The earlier null result
> was a benchmark artifact, not a property of the technique. Evaluate at symbol level. See
> [docs/eval.md](../eval.md).

## 2. Add a local cross-encoder reranker (highest-ROI bolt-on)

Expand Down Expand Up @@ -131,6 +131,13 @@ bolt-on. Treat as a later experiment, not a v1 move.

## 3. Tune and route the hybrid fusion you already have

> **Update (measured — now the #1 lever).** Symbol-level eval on this repo showed equal-weight
> hybrid can *lose* to dense alone on NL queries: `bge-small` dense MRR 0.675 vs hybrid 0.573,
> because weak BM25 (0.427) drags the strong dense ranking down via 1:1 RRF. For the weaker
> jina-code, BM25 *helps* (hybrid 0.604 > dense 0.483). So fusion weighting should be
> **query-type-aware** (dense-up for NL, BM25-up for identifiers) rather than fixed 1:1 — this
> was the single biggest lever found, ahead of a bigger embedder. See [docs/eval.md](../eval.md).

CodeRAG already does dense + BM25 + RRF — the literature says that's the right foundation; the wins
are in **routing and tuning**:

Expand Down
Loading
Loading