Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions coderag/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ def status(self) -> dict:
"index_type": self.vectors.kind,
"rerank": self.config.rerank,
"rerank_model": self.config.rerank_model if self.config.rerank else "",
"adaptive_fusion": self.config.adaptive_fusion,
"store_dir": str(self.config.store_dir),
"watched_dir": str(self.config.watched_dir),
"total_files": stats.total_files,
Expand Down
27 changes: 27 additions & 0 deletions coderag/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,22 @@ class Config:
dense_weight: float = 1.0
lexical_weight: float = 1.0

# --- Adaptive fusion weighting (query-type-aware) ---
# Off by default. When on, fusion weights are chosen by query type: the nl_* pair for
# natural-language queries and the code_* pair for exact-identifier/code queries (a fixed
# 1:1 is a compromise — see docs/eval.md). These pairs override
# dense_weight/lexical_weight only when adaptive_fusion is enabled.
adaptive_fusion: bool = False
# NL queries: lean dense (weak BM25 otherwise drags a strong dense ranking down).
nl_dense_weight: float = 1.0
nl_lexical_weight: float = 0.4
# Identifier/code queries: stay balanced. Up-weighting BM25 here *hurt* on this repo
# (short, common identifiers are lexically ambiguous, and the embedder already matches
# them well) — so the default is neutral, and BM25-leaning is left configurable for
# larger repos where exact-string recall matters more. See docs/eval.md.
code_dense_weight: float = 1.0
code_lexical_weight: float = 1.0

# --- Reranking (optional two-stage retrieve-then-rerank) ---
# Off by default so the zero-config engine stays tiny/fast. When on, the top
# ``rerank_candidates`` fused hits are re-scored by a local cross-encoder and reordered.
Expand Down Expand Up @@ -209,6 +225,17 @@ def from_env(cls, **overrides: object) -> "Config":
rrf_k=_env_int("CODERAG_RRF_K", cls.rrf_k),
dense_weight=_env_float("CODERAG_DENSE_WEIGHT", cls.dense_weight),
lexical_weight=_env_float("CODERAG_LEXICAL_WEIGHT", cls.lexical_weight),
adaptive_fusion=_env_bool("CODERAG_ADAPTIVE_FUSION", cls.adaptive_fusion),
nl_dense_weight=_env_float("CODERAG_NL_DENSE_WEIGHT", cls.nl_dense_weight),
nl_lexical_weight=_env_float(
"CODERAG_NL_LEXICAL_WEIGHT", cls.nl_lexical_weight
),
code_dense_weight=_env_float(
"CODERAG_CODE_DENSE_WEIGHT", cls.code_dense_weight
),
code_lexical_weight=_env_float(
"CODERAG_CODE_LEXICAL_WEIGHT", cls.code_lexical_weight
),
rerank=_env_bool("CODERAG_RERANK", cls.rerank),
rerank_model=_env_str("CODERAG_RERANK_MODEL", cls.rerank_model),
rerank_candidates=_env_int(
Expand Down
22 changes: 22 additions & 0 deletions coderag/eval/datasets/coderag_self_identifiers.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{"query": "reciprocal_rank_fusion", "relevant_files": ["coderag/retrieval/fusion.py"], "relevant_symbols": ["reciprocal_rank_fusion"], "source": "curated-id"}
{"query": "search", "relevant_files": ["coderag/retrieval/search.py"], "relevant_symbols": ["HybridSearcher.search"], "source": "curated-id"}
{"query": "_index_file", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer._index_file"], "source": "curated-id"}
{"query": "rebuild_from_store", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.rebuild_from_store"], "source": "curated-id"}
{"query": "_choose_kind", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex._choose_kind"], "source": "curated-id"}
{"query": "search", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.search"], "source": "curated-id"}
{"query": "_derive_nlist", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["_derive_nlist"], "source": "curated-id"}
{"query": "fts_search", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.fts_search"], "source": "curated-id"}
{"query": "bootstrap", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.bootstrap"], "source": "curated-id"}
{"query": "hydrate", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.hydrate"], "source": "curated-id"}
{"query": "_sanitize_fts", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["_sanitize_fts"], "source": "curated-id"}
{"query": "watch", "relevant_files": ["coderag/watch.py"], "relevant_symbols": ["watch"], "source": "curated-id"}
{"query": "extract_spans", "relevant_files": ["coderag/chunking/python_ast.py"], "relevant_symbols": ["extract_spans"], "source": "curated-id"}
{"query": "stream_answer", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["stream_answer"], "source": "curated-id"}
{"query": "build_context", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["build_context"], "source": "curated-id"}
{"query": "search", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.search"], "source": "curated-id"}
{"query": "get_file", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.get_file"], "source": "curated-id"}
{"query": "recall_at_k", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["recall_at_k"], "source": "curated-id"}
{"query": "ndcg_at_k", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["ndcg_at_k"], "source": "curated-id"}
{"query": "rerank", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["CrossEncoderReranker.rerank"], "source": "curated-id"}
{"query": "get_reranker", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["get_reranker"], "source": "curated-id"}
{"query": "index", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer.index"], "source": "curated-id"}
21 changes: 17 additions & 4 deletions coderag/eval/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,25 +121,38 @@ def compare_modes(
level: str = "file",
modes: Sequence[Tuple[str, float, float]] = DEFAULT_MODES,
reranker: Optional["Reranker"] = None,
adaptive: bool = False,
) -> List[EvalResult]:
"""Score dense-only vs BM25-only vs hybrid on the already-built index of ``cr``.

The index is mode-independent — the dense/lexical weights only affect query-time RRF
fusion — so we reuse one provider/store/vector index and just swap the fusion weights.
When ``reranker`` is given, an extra ``hybrid+rerank`` row is appended so the lift from
two-stage reranking is directly comparable on the same index.
When ``reranker`` is given, an extra ``hybrid+rerank`` row is appended; when ``adaptive``
is set, an ``adaptive`` row uses query-type-aware fusion weighting. All comparable on the
same index.
"""
from coderag.retrieval.search import HybridSearcher

results: List[EvalResult] = []
for label, dense_w, lexical_w in modes:
cfg = cr.config.with_overrides(dense_weight=dense_w, lexical_weight=lexical_w)
# Fixed modes must isolate the weights, so force adaptive fusion off here.
cfg = cr.config.with_overrides(
dense_weight=dense_w, lexical_weight=lexical_w, adaptive_fusion=False
)
searcher = HybridSearcher(cfg, cr.provider, cr.store, cr.vectors)
results.append(
evaluate(searcher.search, cases, label=label, ks=ks, level=level)
)
if adaptive:
cfg = cr.config.with_overrides(adaptive_fusion=True)
searcher = HybridSearcher(cfg, cr.provider, cr.store, cr.vectors)
results.append(
evaluate(searcher.search, cases, label="adaptive", ks=ks, level=level)
)
if reranker is not None:
cfg = cr.config.with_overrides(dense_weight=1.0, lexical_weight=1.0)
cfg = cr.config.with_overrides(
dense_weight=1.0, lexical_weight=1.0, adaptive_fusion=False
)
searcher = HybridSearcher(
cfg, cr.provider, cr.store, cr.vectors, reranker=reranker
)
Expand Down
89 changes: 89 additions & 0 deletions coderag/retrieval/query_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Query-type detection for adaptive fusion weighting.

Symbol-level evaluation showed that a fixed 1:1 dense/BM25 fusion is a compromise, not an
optimum: on natural-language "where is X handled" queries the dense retriever is much
stronger and equal-weight RRF *drags it down* with weak BM25. Exact-identifier queries
(``fts_search``, ``HybridSearcher.search``) were expected to favour BM25 instead, but
up-weighting it hurt on this repo, so their default pair stays neutral (see docs/eval.md).
Routing the fusion weights by query type recovers most of the natural-language gap with a
cheap, local heuristic.

``looks_like_identifier`` is deliberately conservative: it only calls a query "code" when it
is short *and* either a lone token or visibly code-shaped (snake_case, dotted path,
camelCase, a call paren), and never when it contains natural-language cue words.
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING, Tuple

if TYPE_CHECKING:
from coderag.config import Config

# Tokens whose presence marks a query as prose, no matter how short the query is.
_NL_CUES = frozenset(
    (
        # interrogatives
        "where how what why when which who "
        # copulas / auxiliaries
        "is are was were does do did can "
        # articles, prepositions, conjunctions
        "the a an to of in for on and or with"
    ).split()
)

# A lowercase letter immediately followed by an uppercase one (camelCase hump).
_CAMEL = re.compile(r"[a-z][A-Z]")
# A dot with a word character on both sides (``pkg.mod``, ``Class.method``).
_DOTTED = re.compile(r"[A-Za-z0-9_]\.[A-Za-z0-9_]")


def looks_like_identifier(query: str) -> bool:
    """Report whether ``query`` is an exact symbol/code lookup as opposed to prose.

    The checks run in this order:

    * blank input, or four or more whitespace-separated words -> ``False``;
    * any word that, lowercased and with surrounding ``?.,:`` trimmed, is in
      ``_NL_CUES`` -> ``False``;
    * a single remaining word -> ``True`` (treated as a literal-term lookup);
    * two or three words -> ``True`` only if the query contains ``_`` or ``(``,
      a camelCase hump, or a dotted path.
    """
    text = query.strip()
    words = text.split()

    # Nothing to classify, or long enough that it reads as a sentence.
    if not words or len(words) > 3:
        return False

    # One cue word is enough to call the whole query natural language.
    if not _NL_CUES.isdisjoint(w.lower().strip("?.,:") for w in words):
        return False

    if len(words) == 1:
        return True

    # Short multi-word queries must visibly look like code to qualify.
    has_marker_char = "_" in text or "(" in text
    return has_marker_char or any(
        pattern.search(text) is not None for pattern in (_CAMEL, _DOTTED)
    )


def fusion_weights(query: str, config: "Config") -> Tuple[float, float]:
    """Pick the ``(dense_weight, lexical_weight)`` pair used to fuse results for ``query``.

    With ``config.adaptive_fusion`` off, the static ``dense_weight`` /
    ``lexical_weight`` pair is returned and ``query`` is not inspected. With it on,
    ``looks_like_identifier`` classifies the query: identifier-like queries get the
    ``code_dense_weight`` / ``code_lexical_weight`` pair, everything else gets the
    ``nl_dense_weight`` / ``nl_lexical_weight`` pair.
    """
    if config.adaptive_fusion:
        if looks_like_identifier(query):
            return config.code_dense_weight, config.code_lexical_weight
        return config.nl_dense_weight, config.nl_lexical_weight
    # Adaptive routing disabled: fall back to the fixed, query-independent weights.
    return config.dense_weight, config.lexical_weight
5 changes: 4 additions & 1 deletion coderag/retrieval/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from coderag.config import Config
from coderag.embeddings import EmbeddingProvider
from coderag.retrieval.fusion import reciprocal_rank_fusion
from coderag.retrieval.query_type import fusion_weights
from coderag.store.sqlite_store import SQLiteStore
from coderag.store.vector_index import FaissVectorIndex
from coderag.types import SearchHit
Expand Down Expand Up @@ -56,10 +57,12 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
lexical_ranked = [cid for cid, _ in self.store.fts_search(query, fetch_k)]

# Fuse, then trim to the candidate pool (top_k, or deeper when reranking).
# Weights may adapt to the query type (dense-up for NL; identifier queries use the
# configured code_* pair, which is neutral by default).
dense_w, lexical_w = fusion_weights(query, self.config)
fused = reciprocal_rank_fusion(
[dense_ranked, lexical_ranked],
k=self.config.rrf_k,
weights=[self.config.dense_weight, self.config.lexical_weight],
weights=[dense_w, lexical_w],
)[:pool]
if not fused:
return []
Expand Down
28 changes: 24 additions & 4 deletions coderag/surfaces/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,11 @@ def cmd_eval(args: argparse.Namespace) -> int:
return 1

ks = tuple(int(k) for k in args.ks.split(","))
# --rerank forces the optional two-stage cross-encoder on for this run.
# --rerank / --adaptive force the optional stages on for this run.
if args.rerank:
cfg = cfg.with_overrides(rerank=True)
if args.adaptive:
cfg = cfg.with_overrides(adaptive_fusion=True)
cr = CodeRAG(cfg)
cr.index() # ensure the index is built / up to date before scoring

Expand All @@ -139,11 +141,24 @@ def cmd_eval(args: argparse.Namespace) -> int:

reranker = get_reranker(cfg)
results = ev.compare_modes(
cr, cases, ks=ks, level=args.level, reranker=reranker
cr,
cases,
ks=ks,
level=args.level,
reranker=reranker,
adaptive=args.adaptive,
)
else:
label = "hybrid+rerank" if args.rerank else "hybrid"
results = [ev.evaluate(cr.search, cases, label=label, ks=ks, level=args.level)]
parts = ["hybrid"]
if args.adaptive:
parts = ["adaptive"]
if args.rerank:
parts.append("rerank")
results = [
ev.evaluate(
cr.search, cases, label="+".join(parts), ks=ks, level=args.level
)
]

if args.json:
print(json.dumps([r.as_dict() for r in results], indent=2))
Expand Down Expand Up @@ -285,6 +300,11 @@ def build_parser() -> argparse.ArgumentParser:
action="store_true",
help="Enable the local cross-encoder reranker (two-stage retrieve-then-rerank).",
)
p_eval.add_argument(
"--adaptive",
action="store_true",
help="Enable query-type-aware fusion weighting (dense-up for NL, BM25-up for code).",
)
p_eval.add_argument(
"--list-models",
action="store_true",
Expand Down
31 changes: 29 additions & 2 deletions docs/eval.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,9 @@ Three findings, all actionable:
`dense` alone (0.675) *beats* `hybrid` (0.573): on NL queries BM25 is weak (0.427) and
equal-weight RRF drags the strong dense ranking down. For the weaker jina-code, BM25 helps
(hybrid 0.604 > dense 0.483). **Takeaway: fusion weights should depend on query type** —
weight dense up for natural-language queries, BM25 up for exact-identifier/code queries
(strategy §3). A fixed 1:1 is a compromise, not an optimum.
weight dense up for natural-language queries; a fixed 1:1 is a compromise, not an optimum.
(Implemented and validated below — note the "BM25-up for code" half of this intuition was
*refuted* by the data.)
3. **Reranking improves top-1 precision.** `hybrid+rerank` lifts R@1 0.364 → 0.409 (+12%) over
hybrid with the tiny ms-marco model — consistent with the git-mined result (+55% on 10
noisier cases). The reranker reliably sharpens the top of the list; it operates on the
Expand All @@ -197,6 +198,32 @@ or a smaller candidate pool. The MiniLM default is the pragmatic local choice.
weighting** (finding 2), then **reranking for top-1** (finding 3) — not a bigger embedding
model (finding 1). Validate these on a larger external repo next.

### Adaptive fusion weighting (finding #2, implemented)

`CODERAG_ADAPTIVE_FUSION=1` (or `coderag eval --adaptive`) routes the fusion weights by query
type: a cheap local heuristic (`looks_like_identifier`) leans **dense** for natural-language
queries and stays **neutral** for identifier-like ones. Validated on bge-small at symbol
level against fixed 1:1 hybrid, on two 22-case sets:

```
NL queries (coderag_self_symbols.jsonl) MRR R@1 nDCG@10
hybrid (fixed 1:1) 0.604 0.455 0.669
adaptive 0.674 0.545 0.722 ← +0.070 MRR, +20% R@1

identifier queries (coderag_self_identifiers.jsonl)
hybrid (fixed 1:1) 0.685 0.545 0.741
adaptive 0.685 0.545 0.741 ← unchanged (no regression)
```

So adaptive is a **Pareto improvement** over fixed hybrid here: big gain on NL, no loss on
identifiers. **Honest caveat that shaped the defaults:** the literature's "BM25-up for code"
intuition was *refuted* by the data — up-weighting BM25 for identifier queries actively hurt
(MRR 0.685 → 0.613), because short/common identifiers (`search`, `index`) are lexically
ambiguous and the embedder already matches them well. So the code-side default is **neutral
(1:1)**, not BM25-leaning; BM25-leaning is left configurable (`CODERAG_CODE_LEXICAL_WEIGHT`)
for larger repos where exact-string recall matters more. Off by default pending larger-repo
validation; enable with `CODERAG_ADAPTIVE_FUSION=1`.

## Dataset format

JSONL, one case per line:
Expand Down
13 changes: 7 additions & 6 deletions docs/research/code-retrieval-strategy.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,13 @@ bolt-on. Treat as a later experiment, not a v1 move.

## 3. Tune and route the hybrid fusion you already have

> **Update (measured — now the #1 lever).** Symbol-level eval on this repo showed equal-weight
> hybrid can *lose* to dense alone on NL queries: `bge-small` dense MRR 0.675 vs hybrid 0.573,
> because weak BM25 (0.427) drags the strong dense ranking down via 1:1 RRF. For the weaker
> jina-code, BM25 *helps* (hybrid 0.604 > dense 0.483). So fusion weighting should be
> **query-type-aware** (dense-up for NL, BM25-up for identifiers) rather than fixed 1:1 — this
> was the single biggest lever found, ahead of a bigger embedder. See [docs/eval.md](../eval.md).
> **Update (implemented & validated).** Query-type-aware fusion weighting now exists
> (`config.adaptive_fusion`, `coderag/retrieval/query_type.py`, `coderag eval --adaptive`). On
> symbol-level eval it is a **Pareto improvement** over fixed 1:1 hybrid: NL queries +0.070 MRR
> / +20% R@1 (leaning dense), identifier queries unchanged. **The "BM25-up for identifiers"
> half of the hypothesis was refuted** — up-weighting BM25 there *hurt* (short identifiers are
> lexically ambiguous; the embedder already matches them), so the code-side default is neutral.
> Off by default pending larger-repo validation. See [docs/eval.md](../eval.md).

CodeRAG already does dense + BM25 + RRF — the literature says that's the right foundation; the wins
are in **routing and tuning**:
Expand Down
12 changes: 11 additions & 1 deletion scripts/bench_embedders.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ def main() -> int:
default="",
help="Comma-separated reranker ids; one hybrid+rerank row per reranker.",
)
ap.add_argument(
"--adaptive",
action="store_true",
help="Also score an 'adaptive' row (query-type-aware fusion weighting).",
)
args = ap.parse_args()

repo = Path(args.repo).resolve()
Expand Down Expand Up @@ -80,7 +85,12 @@ def main() -> int:

default_reranker = get_reranker(cfg.with_overrides(rerank=True))
for r in compare_modes(
cr, cases, ks=ks, level=args.level, reranker=default_reranker
cr,
cases,
ks=ks,
level=args.level,
reranker=default_reranker,
adaptive=args.adaptive,
):
r.label = _label(model, r.label)
rows.append(r)
Expand Down
Loading
Loading