Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,15 @@ coderag watch # index, then keep it live as files change
coderag serve --port 8000 # run the HTTP API (needs [server])
coderag ui # launch the web UI (needs [ui])
coderag status # index stats (files, chunks, model, index type)
coderag eval --dataset d.jsonl --compare # retrieval quality: dense vs BM25 vs hybrid
```

> **Measuring retrieval quality.** `coderag eval` is a built-in harness for "did we surface
> the right file/symbol?" — recall@k, MRR, nDCG@k at file or symbol level, with a git-history
> dataset miner (`--build`), a dense/BM25/hybrid comparison (`--compare`), and an optional
> cross-encoder rerank stage (`--rerank`). See [`docs/eval.md`](docs/eval.md) and the strategy
> writeup in [`docs/research/code-retrieval-strategy.md`](docs/research/code-retrieval-strategy.md).

### Python library

```python
Expand Down
9 changes: 8 additions & 1 deletion coderag/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,15 @@ def indexer(self) -> "Indexer":
@property
def searcher(self) -> "HybridSearcher":
if self._searcher is None:
from coderag.retrieval.rerank import get_reranker
from coderag.retrieval.search import HybridSearcher

self._searcher = HybridSearcher(
self.config, self.provider, self.store, self.vectors
self.config,
self.provider,
self.store,
self.vectors,
reranker=get_reranker(self.config),
)
return self._searcher

Expand Down Expand Up @@ -177,6 +182,8 @@ def status(self) -> dict:
),
"llm_base_url": self.config.openai_base_url or "",
"index_type": self.vectors.kind,
"rerank": self.config.rerank,
"rerank_model": self.config.rerank_model if self.config.rerank else "",
"store_dir": str(self.config.store_dir),
"watched_dir": str(self.config.watched_dir),
"total_files": stats.total_files,
Expand Down
8 changes: 7 additions & 1 deletion coderag/chunking/languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

from pathlib import Path
from typing import Optional
from typing import Iterable, List, Optional

# Languages for which we extract symbol-aware spans (function/class/method).
# Python uses the stdlib ``ast``; the rest use tree-sitter.
Expand Down Expand Up @@ -55,3 +55,9 @@
def detect_language(path: str | Path) -> Optional[str]:
    """Look up the language registered for ``path``'s file extension.

    The suffix is lower-cased before the lookup, so matching is case-insensitive.
    Returns ``None`` when the extension is not in ``EXTENSION_TO_LANGUAGE``, meaning
    the file should not be indexed.
    """
    suffix = Path(path).suffix.lower()
    return EXTENSION_TO_LANGUAGE.get(suffix)


def extensions_for(languages: Iterable[str]) -> List[str]:
    """Reverse lookup: the sorted file extensions registered for any of ``languages``.

    Language names that have no entry in ``EXTENSION_TO_LANGUAGE`` simply contribute
    nothing to the result.
    """
    targets = frozenset(languages)
    matches = [
        ext for ext in EXTENSION_TO_LANGUAGE if EXTENSION_TO_LANGUAGE[ext] in targets
    ]
    matches.sort()
    return matches
12 changes: 12 additions & 0 deletions coderag/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,13 @@ class Config:
dense_weight: float = 1.0
lexical_weight: float = 1.0

# --- Reranking (optional two-stage retrieve-then-rerank) ---
# Off by default so the zero-config engine stays tiny/fast. When on, the top
# ``rerank_candidates`` fused hits are re-scored by a local cross-encoder and reordered.
rerank: bool = False
rerank_model: str = "Xenova/ms-marco-MiniLM-L-12-v2" # local ONNX cross-encoder
rerank_candidates: int = 50 # fused hits to rerank before trimming to top_k

# --- Indexing throughput ---
embed_batch_size: int = 64
index_workers: int = 4
Expand Down Expand Up @@ -202,6 +209,11 @@ def from_env(cls, **overrides: object) -> "Config":
rrf_k=_env_int("CODERAG_RRF_K", cls.rrf_k),
dense_weight=_env_float("CODERAG_DENSE_WEIGHT", cls.dense_weight),
lexical_weight=_env_float("CODERAG_LEXICAL_WEIGHT", cls.lexical_weight),
rerank=_env_bool("CODERAG_RERANK", cls.rerank),
rerank_model=_env_str("CODERAG_RERANK_MODEL", cls.rerank_model),
rerank_candidates=_env_int(
"CODERAG_RERANK_CANDIDATES", cls.rerank_candidates
),
embed_batch_size=_env_int("CODERAG_EMBED_BATCH", cls.embed_batch_size),
index_workers=_env_int("CODERAG_WORKERS", cls.index_workers),
llm_provider=_env_str("CODERAG_LLM_PROVIDER", cls.llm_provider),
Expand Down
90 changes: 90 additions & 0 deletions coderag/embeddings/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Curated registry of local (fastembed/ONNX) embedding models for code search.

These are the no-API-key models worth considering for CodeRAG, with short notes on the
accuracy/size trade-off. All are loadable via ``--model <name>`` (provider ``fastembed``).
The numbers in the notes are external benchmark figures (see docs/research/) — run
``coderag eval`` to measure them on *your* codebase.

Code-specific models (trained on code) generally beat general-purpose text embedders on
code retrieval, at the cost of a larger download.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Tuple


@dataclass(frozen=True)
class ModelInfo:
    """One row of the recommended-model registry (immutable)."""

    name: str  # fastembed model id (pass to --model)
    dim: int  # embedding dimensionality
    size_gb: float  # size in GB; rendered as "<size>GB" by format_models
    code_specific: bool  # True for models trained on code
    note: str  # short free-text trade-off note shown in the CLI table


# Ordered best-first for code search among models fastembed can load locally. fastembed
# does not (yet) ship CodeRankEmbed/CodeSage; those need a custom ONNX export — tracked as
# a follow-up. jina-embeddings-v2-base-code is the strongest code-specific option available
# out of the box.
RECOMMENDED: Tuple[ModelInfo, ...] = (
    # Strongest code-specific option that loads out of the box.
    ModelInfo(
        name="jinaai/jina-embeddings-v2-base-code",
        dim=768,
        size_gb=0.64,
        code_specific=True,
        note="Code-specific, 8192-ctx, Apache-2.0. Best out-of-the-box local code retriever.",
    ),
    ModelInfo(
        name="BAAI/bge-base-en-v1.5",
        dim=768,
        size_gb=0.21,
        code_specific=False,
        note="General text. Stronger than bge-small; modest code retrieval.",
    ),
    ModelInfo(
        name="snowflake/snowflake-arctic-embed-m-long",
        dim=768,
        size_gb=0.54,
        code_specific=False,
        note="General, long-context (base model behind CodeRankEmbed).",
    ),
    ModelInfo(
        name="nomic-ai/nomic-embed-text-v1.5",
        dim=768,
        size_gb=0.52,
        code_specific=False,
        note="General, long-context, Matryoshka dims.",
    ),
    # The current default model; kept last because the tuple is ordered best-first.
    ModelInfo(
        name="BAAI/bge-small-en-v1.5",
        dim=384,
        size_gb=0.067,
        code_specific=False,
        note="Current default. Smallest/fastest; weakest on code (~45.8 CoIR).",
    ),
)


def format_models() -> str:
    """Render ``RECOMMENDED`` as a plain-text table for the CLI.

    The first four columns (model, dim, size, code?) are left-justified to the widest
    cell in each column; the free-text note is appended unpadded. A dashed rule is
    emitted directly under the header row. Returns the lines joined with newlines.
    """
    header = ("model", "dim", "size", "code?", "note")
    body = [
        (
            info.name,
            str(info.dim),
            f"{info.size_gb:g}GB",
            "yes" if info.code_specific else "no",
            info.note,
        )
        for info in RECOMMENDED
    ]
    table = [header, *body]

    # Only the four leading columns are aligned; the note column is free-form.
    aligned_columns = list(zip(*table))[:4]
    widths = [max(len(cell) for cell in column) for column in aligned_columns]

    def render(row):
        # zip() stops after the four widths, so the note is not padded here.
        padded = [cell.ljust(width) for cell, width in zip(row, widths)]
        return " ".join(padded) + f" {row[4]}"

    rule = " ".join("-" * width for width in widths) + " " + "-" * len(header[4])
    return "\n".join([render(header), rule, *(render(row) for row in body)])
39 changes: 39 additions & 0 deletions coderag/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Code-retrieval evaluation harness.

A small, offline, dependency-free harness for measuring *retrieval* quality — "did we
surface the right file/symbol for this query?" — so accuracy claims are provable and
regressions are caught.

It follows the SWE-bench / Agentless / SweRank localization protocol: queries come from
real commit messages or issues, and ground truth is the set of files (and optionally
symbols) those commits changed. Metrics are the standard localization set: recall@k,
hit@k (Acc@k), MRR, and nDCG@k.

The public pieces:

- :class:`EvalCase` / :func:`load_dataset` / :func:`save_dataset` — the dataset format.
- :func:`build_from_git` — mine a dataset from a repo's history (no network, no LLM).
- :func:`evaluate` — score one retriever (any ``search`` callable) against a dataset.
- :func:`compare_modes` — score dense-only vs BM25-only vs hybrid on one index, which is
the built-in way to show fusion beats either modality alone.
"""

from __future__ import annotations

from coderag.eval.dataset import EvalCase, build_from_git, load_dataset, save_dataset
from coderag.eval.harness import EvalResult, compare_modes, evaluate
from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k

__all__ = [
"EvalCase",
"EvalResult",
"build_from_git",
"compare_modes",
"evaluate",
"hit_at_k",
"load_dataset",
"mrr",
"ndcg_at_k",
"recall_at_k",
"save_dataset",
]
177 changes: 177 additions & 0 deletions coderag/eval/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
"""Eval dataset: a list of (query -> relevant files/symbols) cases, plus a git miner.

The dataset is plain JSONL so it's diffable, hand-editable, and easy to share. Each line
is one :class:`EvalCase`. :func:`build_from_git` synthesizes a dataset from a repo's own
history using the SWE-bench/SweRank recipe: the commit subject becomes the query and the
files that commit changed (that still exist at HEAD) become the ground truth.
"""

from __future__ import annotations

import json
import subprocess
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Sequence

# Control-character delimiters for one-pass `git log` parsing — they never appear in
# real commit messages, so we don't have to escape file paths or message text.
_REC = "\x1e" # between commits
_FLD = "\x1f" # between fields within a commit header


@dataclass(slots=True)
class EvalCase:
"""One retrieval query and its ground-truth relevant items.

``relevant_files`` are repo-relative posix paths; ``relevant_symbols`` are optional
qualified names (e.g. ``"Indexer._index_file"``) for function/class-level scoring.
"""

query: str
relevant_files: List[str]
relevant_symbols: List[str] = field(default_factory=list)
id: Optional[str] = None
source: str = ""

def as_dict(self) -> Dict[str, object]:
d: Dict[str, object] = {
"query": self.query,
"relevant_files": self.relevant_files,
}
if self.relevant_symbols:
d["relevant_symbols"] = self.relevant_symbols
if self.id:
d["id"] = self.id
if self.source:
d["source"] = self.source
return d

@classmethod
def from_dict(cls, d: Dict[str, object]) -> "EvalCase":
files = d.get("relevant_files", [])
symbols = d.get("relevant_symbols", [])
return cls(
query=str(d["query"]),
relevant_files=[str(p) for p in files] if isinstance(files, list) else [],
relevant_symbols=(
[str(s) for s in symbols] if isinstance(symbols, list) else []
),
id=str(d["id"]) if d.get("id") else None,
source=str(d.get("source", "")),
)


def load_dataset(path: Path | str) -> List[EvalCase]:
    """Read a JSONL dataset from ``path`` into a list of cases.

    The file is read as UTF-8. Each line is stripped of surrounding whitespace; lines
    that are empty afterwards are ignored, and every other line is parsed as one JSON
    object and turned into an :class:`EvalCase`.
    """
    with Path(path).open(encoding="utf-8") as fh:
        stripped = (raw.strip() for raw in fh)
        return [EvalCase.from_dict(json.loads(text)) for text in stripped if text]


def save_dataset(cases: Sequence[EvalCase], path: Path | str) -> None:
    """Serialize ``cases`` to ``path`` as JSONL, one JSON object per line.

    Missing parent directories are created. The file is written as UTF-8 and non-ASCII
    characters are kept as-is rather than escaped.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as fh:
        fh.writelines(
            f"{json.dumps(case.as_dict(), ensure_ascii=False)}\n" for case in cases
        )


def _git(repo: Path, *args: str) -> str:
    """Run ``git -C <repo> <args...>`` and return its stdout as text.

    Output is decoded explicitly as UTF-8 rather than with the locale's preferred
    encoding, so commit subjects and paths decode the same way on every platform.
    Undecodable bytes are replaced instead of raising ``UnicodeDecodeError``.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
        FileNotFoundError: if the ``git`` executable cannot be found.
    """
    completed = subprocess.run(
        ["git", "-C", str(repo), *args],
        check=True,
        capture_output=True,
        # Passing an encoding puts the pipes in text mode; "replace" keeps a single odd
        # byte from aborting the whole run.
        encoding="utf-8",
        errors="replace",
    )
    return completed.stdout


def build_from_git(
    repo: Path | str,
    *,
    max_cases: int = 200,
    extensions: Optional[Sequence[str]] = None,
    max_files_per_commit: int = 5,
    min_query_len: int = 12,
    commit_scan_limit: int = 2000,
) -> List[EvalCase]:
    """Mine an eval dataset from a repo's commit history.

    For each non-merge commit, the subject line is the query and the changed files that
    (a) match ``extensions`` and (b) still exist on disk under ``repo`` become the
    relevant set — so every ground-truth file can be present in an index built from the
    current checkout.

    Args:
        repo: Path to the git working tree.
        max_cases: Stop once this many cases have been collected.
        extensions: File extensions to keep, with or without the leading dot. Matching
            is case-insensitive. Defaults to the extensions of ``DEFAULT_LANGUAGES``.
        max_files_per_commit: Skip a commit when more than this many of its files
            survive the extension/existence filter (diffuse, weak signal).
        min_query_len: Minimum subject length for a commit to be used as a query.
        commit_scan_limit: Maximum number of commits requested from ``git log``.

    Returns:
        The mined cases, in ``git log`` order, each tagged ``source="git"`` and
        identified by the first 12 characters of the commit hash.

    Raises:
        subprocess.CalledProcessError: if the underlying ``git log`` call fails.
    """
    repo = Path(repo)
    if extensions is None:
        from coderag.chunking.languages import extensions_for
        from coderag.config import DEFAULT_LANGUAGES

        extensions = extensions_for(DEFAULT_LANGUAGES)
    # Normalize to ".ext" and lower-case; suffixes are lower-cased below as well, so
    # "Foo.PY" is treated the same as "foo.py".
    exts = {(e if e.startswith(".") else f".{e}").lower() for e in extensions}

    fmt = f"{_REC}%H{_FLD}%s{_FLD}%an"
    raw = _git(
        repo,
        "log",
        "--no-merges",
        f"-n{commit_scan_limit}",
        "--name-only",
        f"--pretty=format:{fmt}",
    )

    cases: List[EvalCase] = []
    for record in raw.split(_REC):
        if len(cases) >= max_cases:
            # Enough cases collected; nothing later in the log can be added.
            break
        if not record.strip():
            continue

        # First line is the delimited header; the remaining lines are changed paths.
        header, _, body = record.partition("\n")
        parts = header.split(_FLD)
        if len(parts) < 3:
            continue
        sha, subject, author = parts[0], parts[1].strip(), parts[2].strip()

        if not _is_usable_query(subject, min_query_len) or _is_bot(author):
            continue

        changed = (line.strip() for line in body.splitlines())
        # Keep only targeted files that still exist, so they're retrievable from the index.
        files = [
            name
            for name in changed
            if name and Path(name).suffix.lower() in exts and (repo / name).exists()
        ]
        if not files or len(files) > max_files_per_commit:
            continue

        cases.append(
            EvalCase(
                query=subject,
                relevant_files=files,
                id=sha[:12],
                source="git",
            )
        )
    return cases


# Subject prefixes (compared lower-cased) that carry little localization signal:
# reverts, merges and version bumps/releases.
_LOW_SIGNAL_PREFIXES = ("revert", "merge", "bump", "release ")


def _is_usable_query(subject: str, min_len: int) -> bool:
    """Return True when ``subject`` is long enough and not a low-signal commit subject."""
    long_enough = len(subject) >= min_len
    if not long_enough:
        return False
    if subject.lower().startswith(_LOW_SIGNAL_PREFIXES):
        return False
    return True


# Author names that are automation accounts even without a "bot" token.
_KNOWN_BOT_AUTHORS = frozenset({"dependabot", "github-actions", "renovate"})
# Whole-word markers that identify an automation account, e.g. "dependabot[bot]".
_BOT_TOKENS = frozenset({"bot", "robot"})


def _is_bot(author: str) -> bool:
    """Return True when ``author`` looks like an automated account.

    An author counts as a bot when the name is one of the known automation accounts, or
    when it contains ``bot``/``robot`` as a separate token ("dependabot[bot]",
    "renovate-bot", "k8s-ci-robot"). Matching on whole tokens rather than on the bare
    substring keeps human names such as "Abbott" or "Talbot" from being discarded.
    """
    low = author.strip().lower()
    if low in _KNOWN_BOT_AUTHORS:
        return True
    # Split on anything that is not a letter or digit: spaces, "-", "_", "[", "]", ...
    tokens = "".join(ch if ch.isalnum() else " " for ch in low).split()
    return any(token in _BOT_TOKENS for token in tokens)
Loading
Loading