From ceaa4a483f0d7035c3b5744ef263fb5f81815ed7 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 14:03:52 +0000 Subject: [PATCH] feat(retrieval): query-type-aware adaptive fusion weighting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the symbol-level finding that fixed 1:1 dense/BM25 fusion is a compromise: on natural-language queries weak BM25 drags a strong dense ranking down via RRF. A cheap local heuristic (looks_like_identifier) routes the fusion weights by query type. - coderag/retrieval/query_type.py: looks_like_identifier() + fusion_weights(). - config: adaptive_fusion (off by default) + nl_/code_ weight pairs (+ env). - HybridSearcher uses fusion_weights(); compare_modes gains an `adaptive` row (and isolates fixed modes from adaptive); `coderag eval --adaptive`, bench `--adaptive`; status() reports adaptive_fusion. - coderag/eval/datasets/coderag_self_identifiers.jsonl: 22 identifier-query cases (derived from the symbol set) to validate the code-side routing. Validated (bge-small, symbol level) vs fixed 1:1 hybrid — a Pareto improvement: NL queries: hybrid MRR 0.604 -> adaptive 0.674 (+0.070, R@1 +20%) identifier queries: hybrid MRR 0.685 -> adaptive 0.685 (unchanged) Honest correction baked into the defaults: the "BM25-up for identifiers" half of the hypothesis was refuted by the data (up-weighting BM25 hurt, MRR 0.685->0.613, because short/common identifiers are lexically ambiguous and the embedder already matches them), so the code-side default is neutral, not BM25-leaning. Documented in docs/eval.md and the strategy doc. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- coderag/api.py | 1 + coderag/config.py | 27 ++++++ .../datasets/coderag_self_identifiers.jsonl | 22 +++++ coderag/eval/harness.py | 21 ++++- coderag/retrieval/query_type.py | 89 +++++++++++++++++++ coderag/retrieval/search.py | 5 +- coderag/surfaces/cli.py | 28 +++++- docs/eval.md | 31 ++++++- docs/research/code-retrieval-strategy.md | 13 +-- scripts/bench_embedders.py | 12 ++- tests/test_query_type.py | 61 +++++++++++++ 11 files changed, 292 insertions(+), 18 deletions(-) create mode 100644 coderag/eval/datasets/coderag_self_identifiers.jsonl create mode 100644 coderag/retrieval/query_type.py create mode 100644 tests/test_query_type.py diff --git a/coderag/api.py b/coderag/api.py index aaf6651..a144ff8 100644 --- a/coderag/api.py +++ b/coderag/api.py @@ -184,6 +184,7 @@ def status(self) -> dict: "index_type": self.vectors.kind, "rerank": self.config.rerank, "rerank_model": self.config.rerank_model if self.config.rerank else "", + "adaptive_fusion": self.config.adaptive_fusion, "store_dir": str(self.config.store_dir), "watched_dir": str(self.config.watched_dir), "total_files": stats.total_files, diff --git a/coderag/config.py b/coderag/config.py index ae8f411..4c76423 100644 --- a/coderag/config.py +++ b/coderag/config.py @@ -135,6 +135,22 @@ class Config: dense_weight: float = 1.0 lexical_weight: float = 1.0 + # --- Adaptive fusion weighting (query-type-aware) --- + # Off by default. When on, fusion weights tilt by query type: dense up for + # natural-language queries, neutral by default for identifier queries (a fixed 1:1 is a + # compromise — see docs/eval.md). These pairs override dense_weight/lexical_weight only + # when adaptive_fusion is enabled. + adaptive_fusion: bool = False + # NL queries: lean dense (weak BM25 otherwise drags a strong dense ranking down). 
+ nl_dense_weight: float = 1.0 + nl_lexical_weight: float = 0.4 + # Identifier/code queries: stay balanced. Up-weighting BM25 here *hurt* on this repo + # (short, common identifiers are lexically ambiguous, and the embedder already matches + # them well) — so the default is neutral, and BM25-leaning is left configurable for + # larger repos where exact-string recall matters more. See docs/eval.md. + code_dense_weight: float = 1.0 + code_lexical_weight: float = 1.0 + # --- Reranking (optional two-stage retrieve-then-rerank) --- # Off by default so the zero-config engine stays tiny/fast. When on, the top # ``rerank_candidates`` fused hits are re-scored by a local cross-encoder and reordered. @@ -209,6 +225,17 @@ def from_env(cls, **overrides: object) -> "Config": rrf_k=_env_int("CODERAG_RRF_K", cls.rrf_k), dense_weight=_env_float("CODERAG_DENSE_WEIGHT", cls.dense_weight), lexical_weight=_env_float("CODERAG_LEXICAL_WEIGHT", cls.lexical_weight), + adaptive_fusion=_env_bool("CODERAG_ADAPTIVE_FUSION", cls.adaptive_fusion), + nl_dense_weight=_env_float("CODERAG_NL_DENSE_WEIGHT", cls.nl_dense_weight), + nl_lexical_weight=_env_float( + "CODERAG_NL_LEXICAL_WEIGHT", cls.nl_lexical_weight + ), + code_dense_weight=_env_float( + "CODERAG_CODE_DENSE_WEIGHT", cls.code_dense_weight + ), + code_lexical_weight=_env_float( + "CODERAG_CODE_LEXICAL_WEIGHT", cls.code_lexical_weight + ), rerank=_env_bool("CODERAG_RERANK", cls.rerank), rerank_model=_env_str("CODERAG_RERANK_MODEL", cls.rerank_model), rerank_candidates=_env_int( diff --git a/coderag/eval/datasets/coderag_self_identifiers.jsonl b/coderag/eval/datasets/coderag_self_identifiers.jsonl new file mode 100644 index 0000000..e68a227 --- /dev/null +++ b/coderag/eval/datasets/coderag_self_identifiers.jsonl @@ -0,0 +1,22 @@ +{"query": "reciprocal_rank_fusion", "relevant_files": ["coderag/retrieval/fusion.py"], "relevant_symbols": ["reciprocal_rank_fusion"], "source": "curated-id"} +{"query": "search", "relevant_files": 
["coderag/retrieval/search.py"], "relevant_symbols": ["HybridSearcher.search"], "source": "curated-id"} +{"query": "_index_file", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer._index_file"], "source": "curated-id"} +{"query": "rebuild_from_store", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.rebuild_from_store"], "source": "curated-id"} +{"query": "_choose_kind", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex._choose_kind"], "source": "curated-id"} +{"query": "search", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.search"], "source": "curated-id"} +{"query": "_derive_nlist", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["_derive_nlist"], "source": "curated-id"} +{"query": "fts_search", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.fts_search"], "source": "curated-id"} +{"query": "bootstrap", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.bootstrap"], "source": "curated-id"} +{"query": "hydrate", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.hydrate"], "source": "curated-id"} +{"query": "_sanitize_fts", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["_sanitize_fts"], "source": "curated-id"} +{"query": "watch", "relevant_files": ["coderag/watch.py"], "relevant_symbols": ["watch"], "source": "curated-id"} +{"query": "extract_spans", "relevant_files": ["coderag/chunking/python_ast.py"], "relevant_symbols": ["extract_spans"], "source": "curated-id"} +{"query": "stream_answer", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["stream_answer"], "source": "curated-id"} +{"query": "build_context", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["build_context"], "source": "curated-id"} +{"query": "search", 
"relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.search"], "source": "curated-id"} +{"query": "get_file", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.get_file"], "source": "curated-id"} +{"query": "recall_at_k", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["recall_at_k"], "source": "curated-id"} +{"query": "ndcg_at_k", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["ndcg_at_k"], "source": "curated-id"} +{"query": "rerank", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["CrossEncoderReranker.rerank"], "source": "curated-id"} +{"query": "get_reranker", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["get_reranker"], "source": "curated-id"} +{"query": "index", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer.index"], "source": "curated-id"} diff --git a/coderag/eval/harness.py b/coderag/eval/harness.py index 19e1b38..89946c8 100644 --- a/coderag/eval/harness.py +++ b/coderag/eval/harness.py @@ -121,25 +121,38 @@ def compare_modes( level: str = "file", modes: Sequence[Tuple[str, float, float]] = DEFAULT_MODES, reranker: Optional["Reranker"] = None, + adaptive: bool = False, ) -> List[EvalResult]: """Score dense-only vs BM25-only vs hybrid on the already-built index of ``cr``. The index is mode-independent — the dense/lexical weights only affect query-time RRF fusion — so we reuse one provider/store/vector index and just swap the fusion weights. - When ``reranker`` is given, an extra ``hybrid+rerank`` row is appended so the lift from - two-stage reranking is directly comparable on the same index. + When ``reranker`` is given, an extra ``hybrid+rerank`` row is appended; when ``adaptive`` + is set, an ``adaptive`` row uses query-type-aware fusion weighting. All comparable on the + same index. 
""" from coderag.retrieval.search import HybridSearcher results: List[EvalResult] = [] for label, dense_w, lexical_w in modes: - cfg = cr.config.with_overrides(dense_weight=dense_w, lexical_weight=lexical_w) + # Fixed modes must isolate the weights, so force adaptive fusion off here. + cfg = cr.config.with_overrides( + dense_weight=dense_w, lexical_weight=lexical_w, adaptive_fusion=False + ) searcher = HybridSearcher(cfg, cr.provider, cr.store, cr.vectors) results.append( evaluate(searcher.search, cases, label=label, ks=ks, level=level) ) + if adaptive: + cfg = cr.config.with_overrides(adaptive_fusion=True) + searcher = HybridSearcher(cfg, cr.provider, cr.store, cr.vectors) + results.append( + evaluate(searcher.search, cases, label="adaptive", ks=ks, level=level) + ) if reranker is not None: - cfg = cr.config.with_overrides(dense_weight=1.0, lexical_weight=1.0) + cfg = cr.config.with_overrides( + dense_weight=1.0, lexical_weight=1.0, adaptive_fusion=False + ) searcher = HybridSearcher( cfg, cr.provider, cr.store, cr.vectors, reranker=reranker ) diff --git a/coderag/retrieval/query_type.py b/coderag/retrieval/query_type.py new file mode 100644 index 0000000..7389afc --- /dev/null +++ b/coderag/retrieval/query_type.py @@ -0,0 +1,89 @@ +"""Query-type detection for adaptive fusion weighting. + +Symbol-level evaluation showed that a fixed 1:1 dense/BM25 fusion is a compromise, not an +optimum: on natural-language "where is X handled" queries the dense retriever is much +stronger and equal-weight RRF *drags it down* with weak BM25, while on exact-identifier +queries (``fts_search``, ``HybridSearcher.search``) the 1:1 blend held up. Routing the fusion +weights by query type recovers most of that gap with a cheap, local heuristic. 
+ +``looks_like_identifier`` is deliberately conservative: it only calls a query "code" when it +is short *and* either a lone token or visibly code-shaped (snake_case, dotted path, +camelCase, a call paren), and never when it contains natural-language cue words. +""" + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING, Tuple + +if TYPE_CHECKING: + from coderag.config import Config + +# Words that mark a query as natural language even if it's short. +_NL_CUES = frozenset( + { + "where", + "how", + "what", + "why", + "when", + "which", + "who", + "is", + "are", + "was", + "were", + "does", + "do", + "did", + "can", + "the", + "a", + "an", + "to", + "of", + "in", + "for", + "on", + "and", + "or", + "with", + } +) + +_CAMEL = re.compile(r"[a-z][A-Z]") +_DOTTED = re.compile(r"[A-Za-z0-9_]\.[A-Za-z0-9_]") + + +def looks_like_identifier(query: str) -> bool: + """True if ``query`` reads like an exact code/symbol lookup rather than prose.""" + q = query.strip() + if not q: + return False + tokens = q.split() + if len(tokens) >= 4: + return False # multi-word -> natural language + if {t.lower().strip("?.,:") for t in tokens} & _NL_CUES: + return False # contains a natural-language cue word + code_shaped = ( + "_" in q + or "(" in q + or _CAMEL.search(q) is not None + or _DOTTED.search(q) is not None + ) + if len(tokens) == 1: + return True # a lone token is treated as a literal-term lookup + return code_shaped # 2-3 tokens only count as code when visibly code-shaped + + +def fusion_weights(query: str, config: "Config") -> Tuple[float, float]: + """Return ``(dense_weight, lexical_weight)`` for ``query``. + + Without adaptive fusion this is just the configured static pair. With it on, weights tilt + toward dense for natural-language queries and toward BM25 for identifier-like queries. 
+ """ + if not config.adaptive_fusion: + return config.dense_weight, config.lexical_weight + if looks_like_identifier(query): + return config.code_dense_weight, config.code_lexical_weight + return config.nl_dense_weight, config.nl_lexical_weight diff --git a/coderag/retrieval/search.py b/coderag/retrieval/search.py index ea6482a..1dfb5be 100644 --- a/coderag/retrieval/search.py +++ b/coderag/retrieval/search.py @@ -8,6 +8,7 @@ from coderag.config import Config from coderag.embeddings import EmbeddingProvider from coderag.retrieval.fusion import reciprocal_rank_fusion +from coderag.retrieval.query_type import fusion_weights from coderag.store.sqlite_store import SQLiteStore from coderag.store.vector_index import FaissVectorIndex from coderag.types import SearchHit @@ -56,10 +57,12 @@ def search(self, query: str, top_k: int) -> List[SearchHit]: lexical_ranked = [cid for cid, _ in self.store.fts_search(query, fetch_k)] # Fuse, then trim to the candidate pool (top_k, or deeper when reranking). + # Weights may adapt to the query type (dense-up for NL, BM25-up for identifiers). + dense_w, lexical_w = fusion_weights(query, self.config) fused = reciprocal_rank_fusion( [dense_ranked, lexical_ranked], k=self.config.rrf_k, - weights=[self.config.dense_weight, self.config.lexical_weight], + weights=[dense_w, lexical_w], )[:pool] if not fused: return [] diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py index 6493c3b..aa36730 100644 --- a/coderag/surfaces/cli.py +++ b/coderag/surfaces/cli.py @@ -126,9 +126,11 @@ def cmd_eval(args: argparse.Namespace) -> int: return 1 ks = tuple(int(k) for k in args.ks.split(",")) - # --rerank forces the optional two-stage cross-encoder on for this run. + # --rerank / --adaptive force the optional stages on for this run. 
if args.rerank: cfg = cfg.with_overrides(rerank=True) + if args.adaptive: + cfg = cfg.with_overrides(adaptive_fusion=True) cr = CodeRAG(cfg) cr.index() # ensure the index is built / up to date before scoring @@ -139,11 +141,24 @@ def cmd_eval(args: argparse.Namespace) -> int: reranker = get_reranker(cfg) results = ev.compare_modes( - cr, cases, ks=ks, level=args.level, reranker=reranker + cr, + cases, + ks=ks, + level=args.level, + reranker=reranker, + adaptive=args.adaptive, ) else: - label = "hybrid+rerank" if args.rerank else "hybrid" - results = [ev.evaluate(cr.search, cases, label=label, ks=ks, level=args.level)] + parts = ["hybrid"] + if args.adaptive: + parts = ["adaptive"] + if args.rerank: + parts.append("rerank") + results = [ + ev.evaluate( + cr.search, cases, label="+".join(parts), ks=ks, level=args.level + ) + ] if args.json: print(json.dumps([r.as_dict() for r in results], indent=2)) @@ -285,6 +300,11 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Enable the local cross-encoder reranker (two-stage retrieve-then-rerank).", ) + p_eval.add_argument( + "--adaptive", + action="store_true", + help="Enable query-type-aware fusion weighting (dense-up for NL, BM25-up for code).", + ) p_eval.add_argument( "--list-models", action="store_true", diff --git a/docs/eval.md b/docs/eval.md index 396e9a8..51145ce 100644 --- a/docs/eval.md +++ b/docs/eval.md @@ -181,8 +181,9 @@ Three findings, all actionable: `dense` alone (0.675) *beats* `hybrid` (0.573): on NL queries BM25 is weak (0.427) and equal-weight RRF drags the strong dense ranking down. For the weaker jina-code, BM25 helps (hybrid 0.604 > dense 0.483). **Takeaway: fusion weights should depend on query type** — - weight dense up for natural-language queries, BM25 up for exact-identifier/code queries - (strategy §3). A fixed 1:1 is a compromise, not an optimum. + weight dense up for natural-language queries; a fixed 1:1 is a compromise, not an optimum. 
+ (Implemented and validated below — note the "BM25-up for code" half of this intuition was + *refuted* by the data.) 3. **Reranking improves top-1 precision.** `hybrid+rerank` lifts R@1 0.364 → 0.409 (+12%) over hybrid with the tiny ms-marco model — consistent with the git-mined result (+55% on 10 noisier cases). The reranker reliably sharpens the top of the list; it operates on the @@ -197,6 +198,32 @@ or a smaller candidate pool. The MiniLM default is the pragmatic local choice. weighting** (finding 2), then **reranking for top-1** (finding 3) — not a bigger embedding model (finding 1). Validate these on a larger external repo next. +### Adaptive fusion weighting (finding #2, implemented) + +`CODERAG_ADAPTIVE_FUSION=1` (or `coderag eval --adaptive`) routes the fusion weights by query +type: a cheap local heuristic (`looks_like_identifier`) leans **dense** for natural-language +queries and stays **neutral** for identifier-like ones. Validated on bge-small at symbol +level against fixed 1:1 hybrid, on two 22-case sets: + +``` +NL queries (coderag_self_symbols.jsonl) MRR R@1 nDCG@10 + hybrid (fixed 1:1) 0.604 0.455 0.669 + adaptive 0.674 0.545 0.722 ← +0.070 MRR, +20% R@1 + +identifier queries (coderag_self_identifiers.jsonl) + hybrid (fixed 1:1) 0.685 0.545 0.741 + adaptive 0.685 0.545 0.741 ← unchanged (no regression) +``` + +So adaptive is a **Pareto improvement** over fixed hybrid here: big gain on NL, no loss on +identifiers. **Honest caveat that shaped the defaults:** the literature's "BM25-up for code" +intuition was *refuted* by the data — up-weighting BM25 for identifier queries actively hurt +(MRR 0.685 → 0.613), because short/common identifiers (`search`, `index`) are lexically +ambiguous and the embedder already matches them well. So the code-side default is **neutral +(1:1)**, not BM25-leaning; BM25-leaning is left configurable (`CODERAG_CODE_LEXICAL_WEIGHT`) +for larger repos where exact-string recall matters more. 
Off by default pending larger-repo +validation; enable with `CODERAG_ADAPTIVE_FUSION=1`. + ## Dataset format JSONL, one case per line: diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md index 9721dd8..4a9aea1 100644 --- a/docs/research/code-retrieval-strategy.md +++ b/docs/research/code-retrieval-strategy.md @@ -131,12 +131,13 @@ bolt-on. Treat as a later experiment, not a v1 move. ## 3. Tune and route the hybrid fusion you already have -> **Update (measured — now the #1 lever).** Symbol-level eval on this repo showed equal-weight -> hybrid can *lose* to dense alone on NL queries: `bge-small` dense MRR 0.675 vs hybrid 0.573, -> because weak BM25 (0.427) drags the strong dense ranking down via 1:1 RRF. For the weaker -> jina-code, BM25 *helps* (hybrid 0.604 > dense 0.483). So fusion weighting should be -> **query-type-aware** (dense-up for NL, BM25-up for identifiers) rather than fixed 1:1 — this -> was the single biggest lever found, ahead of a bigger embedder. See [docs/eval.md](../eval.md). +> **Update (implemented & validated).** Query-type-aware fusion weighting now exists +> (`config.adaptive_fusion`, `coderag/retrieval/query_type.py`, `coderag eval --adaptive`). On +> symbol-level eval it is a **Pareto improvement** over fixed 1:1 hybrid: NL queries +0.070 MRR +> / +20% R@1 (leaning dense), identifier queries unchanged. **The "BM25-up for identifiers" +> half of the hypothesis was refuted** — up-weighting BM25 there *hurt* (short identifiers are +> lexically ambiguous; the embedder already matches them), so the code-side default is neutral. +> Off by default pending larger-repo validation. See [docs/eval.md](../eval.md). 
CodeRAG already does dense + BM25 + RRF — the literature says that's the right foundation; the wins are in **routing and tuning**: diff --git a/scripts/bench_embedders.py b/scripts/bench_embedders.py index 9872eff..021963e 100644 --- a/scripts/bench_embedders.py +++ b/scripts/bench_embedders.py @@ -51,6 +51,11 @@ def main() -> int: default="", help="Comma-separated reranker ids; one hybrid+rerank row per reranker.", ) + ap.add_argument( + "--adaptive", + action="store_true", + help="Also score an 'adaptive' row (query-type-aware fusion weighting).", + ) args = ap.parse_args() repo = Path(args.repo).resolve() @@ -80,7 +85,12 @@ def main() -> int: default_reranker = get_reranker(cfg.with_overrides(rerank=True)) for r in compare_modes( - cr, cases, ks=ks, level=args.level, reranker=default_reranker + cr, + cases, + ks=ks, + level=args.level, + reranker=default_reranker, + adaptive=args.adaptive, ): r.label = _label(model, r.label) rows.append(r) diff --git a/tests/test_query_type.py b/tests/test_query_type.py new file mode 100644 index 0000000..cb45a73 --- /dev/null +++ b/tests/test_query_type.py @@ -0,0 +1,61 @@ +"""Tests for query-type detection and adaptive fusion weighting.""" + +from __future__ import annotations + +from coderag.api import CodeRAG +from coderag.retrieval.query_type import fusion_weights, looks_like_identifier +from tests.conftest import write + + +def test_identifier_queries_detected(): + assert looks_like_identifier("fts_search") + assert looks_like_identifier("reciprocal_rank_fusion") + assert looks_like_identifier("HybridSearcher.search") + assert looks_like_identifier("getUserToken") # camelCase + assert looks_like_identifier("authenticate(token)") # call paren + + +def test_natural_language_queries_detected(): + assert not looks_like_identifier("where is retry backoff handled") + assert not looks_like_identifier("how does indexing work") + assert not looks_like_identifier("the auth flow") # NL cue word + assert not 
looks_like_identifier("user authentication flow") # 3 plain words + assert not looks_like_identifier("") + + +def test_fusion_weights_static_when_adaptive_off(config): + cfg = config.with_overrides(dense_weight=1.0, lexical_weight=1.0) + assert fusion_weights("anything at all here", cfg) == (1.0, 1.0) + + +def test_fusion_weights_tilt_by_query_type(config): + cfg = config.with_overrides( + adaptive_fusion=True, + nl_dense_weight=1.0, + nl_lexical_weight=0.4, + code_dense_weight=0.4, + code_lexical_weight=1.0, + ) + # Natural language -> dense up. + assert fusion_weights("where is the token validated", cfg) == (1.0, 0.4) + # Identifier -> BM25 up. + assert fusion_weights("validate_token", cfg) == (0.4, 1.0) + + +def test_default_weights_lean_dense_for_nl_neutral_for_code(config): + # Validated defaults: NL leans dense; identifiers stay neutral (BM25-up hurt on-repo). + cfg = config.with_overrides(adaptive_fusion=True) + assert fusion_weights("how is the index rebuilt", cfg) == (1.0, 0.4) + assert fusion_weights("rebuild_from_store", cfg) == (1.0, 1.0) + + +def test_adaptive_search_runs_end_to_end(config): + repo = config.watched_dir + repo.mkdir(parents=True, exist_ok=True) + write(repo / "auth.py", "def validate_token(token):\n return token\n") + write(repo / "math_utils.py", "def add_numbers(a, b):\n return a + b\n") + cr = CodeRAG(config.with_overrides(adaptive_fusion=True)) + cr.index() + # Identifier query still retrieves its exact symbol with adaptive weighting on. + hits = cr.search("validate_token", top_k=3) + assert any(h.symbol == "validate_token" for h in hits)