Neverdecel · Neverdecel · Jun 17, 2026 · Jun 17, 2026
diff --git a/coderag/api.py b/coderag/api.py
@@ -184,6 +184,7 @@ def status(self) -> dict:
             "index_type": self.vectors.kind,
             "rerank": self.config.rerank,
             "rerank_model": self.config.rerank_model if self.config.rerank else "",
+            "adaptive_fusion": self.config.adaptive_fusion,
             "store_dir": str(self.config.store_dir),
             "watched_dir": str(self.config.watched_dir),
             "total_files": stats.total_files,

diff --git a/coderag/config.py b/coderag/config.py
@@ -135,6 +135,22 @@ class Config:
     dense_weight: float = 1.0
     lexical_weight: float = 1.0
 
+    # --- Adaptive fusion weighting (query-type-aware) ---
+    # Off by default. When on, fusion weights are chosen per query type: the nl_* pair for
+    # natural-language queries, the code_* pair for exact-identifier/code queries (a fixed 1:1
+    # is a compromise — see docs/eval.md). These pairs override dense_weight/lexical_weight
+    # only when adaptive_fusion is enabled.
+    adaptive_fusion: bool = False
+    # NL queries: lean dense (weak BM25 otherwise drags a strong dense ranking down).
+    nl_dense_weight: float = 1.0
+    nl_lexical_weight: float = 0.4
+    # Identifier/code queries: stay balanced. Up-weighting BM25 here *hurt* on this repo
+    # (short, common identifiers are lexically ambiguous, and the embedder already matches
+    # them well) — so the default is neutral, and BM25-leaning is left configurable for
+    # larger repos where exact-string recall matters more. See docs/eval.md.
+    code_dense_weight: float = 1.0
+    code_lexical_weight: float = 1.0
+
     # --- Reranking (optional two-stage retrieve-then-rerank) ---
     # Off by default so the zero-config engine stays tiny/fast. When on, the top
     # ``rerank_candidates`` fused hits are re-scored by a local cross-encoder and reordered.
@@ -209,6 +225,17 @@ def from_env(cls, **overrides: object) -> "Config":
             rrf_k=_env_int("CODERAG_RRF_K", cls.rrf_k),
             dense_weight=_env_float("CODERAG_DENSE_WEIGHT", cls.dense_weight),
             lexical_weight=_env_float("CODERAG_LEXICAL_WEIGHT", cls.lexical_weight),
+            adaptive_fusion=_env_bool("CODERAG_ADAPTIVE_FUSION", cls.adaptive_fusion),
+            nl_dense_weight=_env_float("CODERAG_NL_DENSE_WEIGHT", cls.nl_dense_weight),
+            nl_lexical_weight=_env_float(
+                "CODERAG_NL_LEXICAL_WEIGHT", cls.nl_lexical_weight
+            ),
+            code_dense_weight=_env_float(
+                "CODERAG_CODE_DENSE_WEIGHT", cls.code_dense_weight
+            ),
+            code_lexical_weight=_env_float(
+                "CODERAG_CODE_LEXICAL_WEIGHT", cls.code_lexical_weight
+            ),
             rerank=_env_bool("CODERAG_RERANK", cls.rerank),
             rerank_model=_env_str("CODERAG_RERANK_MODEL", cls.rerank_model),
             rerank_candidates=_env_int(

diff --git a/coderag/eval/datasets/coderag_self_identifiers.jsonl b/coderag/eval/datasets/coderag_self_identifiers.jsonl
@@ -0,0 +1,22 @@
+{"query": "reciprocal_rank_fusion", "relevant_files": ["coderag/retrieval/fusion.py"], "relevant_symbols": ["reciprocal_rank_fusion"], "source": "curated-id"}
+{"query": "search", "relevant_files": ["coderag/retrieval/search.py"], "relevant_symbols": ["HybridSearcher.search"], "source": "curated-id"}
+{"query": "_index_file", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer._index_file"], "source": "curated-id"}
+{"query": "rebuild_from_store", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.rebuild_from_store"], "source": "curated-id"}
+{"query": "_choose_kind", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex._choose_kind"], "source": "curated-id"}
+{"query": "search", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["FaissVectorIndex.search"], "source": "curated-id"}
+{"query": "_derive_nlist", "relevant_files": ["coderag/store/vector_index.py"], "relevant_symbols": ["_derive_nlist"], "source": "curated-id"}
+{"query": "fts_search", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.fts_search"], "source": "curated-id"}
+{"query": "bootstrap", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.bootstrap"], "source": "curated-id"}
+{"query": "hydrate", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["SQLiteStore.hydrate"], "source": "curated-id"}
+{"query": "_sanitize_fts", "relevant_files": ["coderag/store/sqlite_store.py"], "relevant_symbols": ["_sanitize_fts"], "source": "curated-id"}
+{"query": "watch", "relevant_files": ["coderag/watch.py"], "relevant_symbols": ["watch"], "source": "curated-id"}
+{"query": "extract_spans", "relevant_files": ["coderag/chunking/python_ast.py"], "relevant_symbols": ["extract_spans"], "source": "curated-id"}
+{"query": "stream_answer", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["stream_answer"], "source": "curated-id"}
+{"query": "build_context", "relevant_files": ["coderag/llm.py"], "relevant_symbols": ["build_context"], "source": "curated-id"}
+{"query": "search", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.search"], "source": "curated-id"}
+{"query": "get_file", "relevant_files": ["coderag/api.py"], "relevant_symbols": ["CodeRAG.get_file"], "source": "curated-id"}
+{"query": "recall_at_k", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["recall_at_k"], "source": "curated-id"}
+{"query": "ndcg_at_k", "relevant_files": ["coderag/eval/metrics.py"], "relevant_symbols": ["ndcg_at_k"], "source": "curated-id"}
+{"query": "rerank", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["CrossEncoderReranker.rerank"], "source": "curated-id"}
+{"query": "get_reranker", "relevant_files": ["coderag/retrieval/rerank.py"], "relevant_symbols": ["get_reranker"], "source": "curated-id"}
+{"query": "index", "relevant_files": ["coderag/indexer.py"], "relevant_symbols": ["Indexer.index"], "source": "curated-id"}
diff --git a/coderag/eval/harness.py b/coderag/eval/harness.py
@@ -121,25 +121,38 @@ def compare_modes(
     level: str = "file",
     modes: Sequence[Tuple[str, float, float]] = DEFAULT_MODES,
     reranker: Optional["Reranker"] = None,
+    adaptive: bool = False,
 ) -> List[EvalResult]:
     """Score dense-only vs BM25-only vs hybrid on the already-built index of ``cr``.
 
     The index is mode-independent — the dense/lexical weights only affect query-time RRF
     fusion — so we reuse one provider/store/vector index and just swap the fusion weights.
-    When ``reranker`` is given, an extra ``hybrid+rerank`` row is appended so the lift from
-    two-stage reranking is directly comparable on the same index.
+    When ``reranker`` is given, an extra ``hybrid+rerank`` row is appended; when ``adaptive``
+    is set, an ``adaptive`` row uses query-type-aware fusion weighting. All comparable on the
+    same index.
     """
     from coderag.retrieval.search import HybridSearcher
 
     results: List[EvalResult] = []
     for label, dense_w, lexical_w in modes:
-        cfg = cr.config.with_overrides(dense_weight=dense_w, lexical_weight=lexical_w)
+        # Fixed modes must isolate the weights, so force adaptive fusion off here.
+        cfg = cr.config.with_overrides(
+            dense_weight=dense_w, lexical_weight=lexical_w, adaptive_fusion=False
+        )
         searcher = HybridSearcher(cfg, cr.provider, cr.store, cr.vectors)
         results.append(
             evaluate(searcher.search, cases, label=label, ks=ks, level=level)
         )
+    if adaptive:
+        cfg = cr.config.with_overrides(adaptive_fusion=True)
+        searcher = HybridSearcher(cfg, cr.provider, cr.store, cr.vectors)
+        results.append(
+            evaluate(searcher.search, cases, label="adaptive", ks=ks, level=level)
+        )
     if reranker is not None:
-        cfg = cr.config.with_overrides(dense_weight=1.0, lexical_weight=1.0)
+        cfg = cr.config.with_overrides(
+            dense_weight=1.0, lexical_weight=1.0, adaptive_fusion=False
+        )
         searcher = HybridSearcher(
             cfg, cr.provider, cr.store, cr.vectors, reranker=reranker
         )

diff --git a/coderag/retrieval/query_type.py b/coderag/retrieval/query_type.py
@@ -0,0 +1,89 @@
+"""Query-type detection for adaptive fusion weighting.
+
+Symbol-level evaluation showed that a fixed 1:1 dense/BM25 fusion is a compromise, not an
+optimum: on natural-language "where is X handled" queries the dense retriever is much
+stronger and equal-weight RRF *drags it down* with weak BM25 (up-weighting BM25 for exact
+identifiers such as ``fts_search`` hurt instead — see docs/eval.md). Routing the fusion
+weights by query type recovers most of that gap with a cheap, local heuristic.
+
+``looks_like_identifier`` is deliberately conservative: it only calls a query "code" when it
+is short *and* either a lone token or visibly code-shaped (snake_case, dotted path,
+camelCase, a call paren), and never when it contains natural-language cue words.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING, Tuple
+
+if TYPE_CHECKING:
+    from coderag.config import Config
+
+# Lower-case cue words; any one of them marks even a short query as natural language.
+_NL_CUES = frozenset(
+    {
+        "where",
+        "how",
+        "what",
+        "why",
+        "when",
+        "which",
+        "who",
+        "is",
+        "are",
+        "was",
+        "were",
+        "does",
+        "do",
+        "did",
+        "can",
+        "the",
+        "a",
+        "an",
+        "to",
+        "of",
+        "in",
+        "for",
+        "on",
+        "and",
+        "or",
+        "with",
+    }
+)
+
+_CAMEL = re.compile(r"[a-z][A-Z]")  # lower-to-upper case transition, as in camelCase
+_DOTTED = re.compile(r"[A-Za-z0-9_]\.[A-Za-z0-9_]")  # "." between word chars, e.g. a.b
+
+
+def looks_like_identifier(query: str) -> bool:
+    """Return True when ``query`` looks like a symbol lookup, not a prose question."""
+    text = query.strip()
+    words = text.split()
+    # Empty input and anything of four or more words is never an identifier.
+    if not words or len(words) >= 4:
+        return False
+    # Any natural-language cue word (edge punctuation ignored) means prose.
+    normalized = {word.lower().strip("?.,:") for word in words}
+    if not normalized.isdisjoint(_NL_CUES):
+        return False
+    # A single remaining token is always treated as a literal-term lookup.
+    if len(words) == 1:
+        return True
+    # Two or three tokens count as code only when visibly code-shaped:
+    # snake_case, a call paren, camelCase, or a dotted path.
+    if "_" in text or "(" in text:
+        return True
+    return bool(_CAMEL.search(text) or _DOTTED.search(text))
+
+
+def fusion_weights(query: str, config: "Config") -> Tuple[float, float]:
+    """Pick the ``(dense_weight, lexical_weight)`` pair used to fuse results for ``query``.
+
+    Adaptive fusion off: the static ``dense_weight``/``lexical_weight`` pair. On: the
+    ``code_*`` pair for identifier-like queries, the ``nl_*`` pair for everything else.
+    """
+    if config.adaptive_fusion:
+        if looks_like_identifier(query):
+            return config.code_dense_weight, config.code_lexical_weight
+        return config.nl_dense_weight, config.nl_lexical_weight
+    return config.dense_weight, config.lexical_weight
diff --git a/coderag/retrieval/search.py b/coderag/retrieval/search.py
@@ -8,6 +8,7 @@
 from coderag.config import Config
 from coderag.embeddings import EmbeddingProvider
 from coderag.retrieval.fusion import reciprocal_rank_fusion
+from coderag.retrieval.query_type import fusion_weights
 from coderag.store.sqlite_store import SQLiteStore
 from coderag.store.vector_index import FaissVectorIndex
 from coderag.types import SearchHit
@@ -56,10 +57,12 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
         lexical_ranked = [cid for cid, _ in self.store.fts_search(query, fetch_k)]
 
         # Fuse, then trim to the candidate pool (top_k, or deeper when reranking).
+        # Weights may adapt to the query type (dense-up for NL; identifier pair is configurable).
+        dense_w, lexical_w = fusion_weights(query, self.config)
         fused = reciprocal_rank_fusion(
             [dense_ranked, lexical_ranked],
             k=self.config.rrf_k,
-            weights=[self.config.dense_weight, self.config.lexical_weight],
+            weights=[dense_w, lexical_w],
         )[:pool]
         if not fused:
             return []

diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py
@@ -126,9 +126,11 @@ def cmd_eval(args: argparse.Namespace) -> int:
         return 1
 
     ks = tuple(int(k) for k in args.ks.split(","))
-    # --rerank forces the optional two-stage cross-encoder on for this run.
+    # --rerank / --adaptive force the optional stages on for this run.
     if args.rerank:
         cfg = cfg.with_overrides(rerank=True)
+    if args.adaptive:
+        cfg = cfg.with_overrides(adaptive_fusion=True)
     cr = CodeRAG(cfg)
     cr.index()  # ensure the index is built / up to date before scoring
 
@@ -139,11 +141,24 @@ def cmd_eval(args: argparse.Namespace) -> int:
 
             reranker = get_reranker(cfg)
         results = ev.compare_modes(
-            cr, cases, ks=ks, level=args.level, reranker=reranker
+            cr,
+            cases,
+            ks=ks,
+            level=args.level,
+            reranker=reranker,
+            adaptive=args.adaptive,
         )
     else:
-        label = "hybrid+rerank" if args.rerank else "hybrid"
-        results = [ev.evaluate(cr.search, cases, label=label, ks=ks, level=args.level)]
+        parts = ["hybrid"]
+        if args.adaptive:
+            parts = ["adaptive"]
+        if args.rerank:
+            parts.append("rerank")
+        results = [
+            ev.evaluate(
+                cr.search, cases, label="+".join(parts), ks=ks, level=args.level
+            )
+        ]
 
     if args.json:
         print(json.dumps([r.as_dict() for r in results], indent=2))
@@ -285,6 +300,11 @@ def build_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Enable the local cross-encoder reranker (two-stage retrieve-then-rerank).",
     )
+    p_eval.add_argument(
+        "--adaptive",
+        action="store_true",
+        help="Enable query-type-aware fusion weighting (dense-up for NL, BM25-up for code).",
+    )
     p_eval.add_argument(
         "--list-models",
         action="store_true",

diff --git a/docs/eval.md b/docs/eval.md
@@ -181,8 +181,9 @@ Three findings, all actionable:
    `dense` alone (0.675) *beats* `hybrid` (0.573): on NL queries BM25 is weak (0.427) and
    equal-weight RRF drags the strong dense ranking down. For the weaker jina-code, BM25 helps
    (hybrid 0.604 > dense 0.483). **Takeaway: fusion weights should depend on query type** —
-   weight dense up for natural-language queries, BM25 up for exact-identifier/code queries
-   (strategy §3). A fixed 1:1 is a compromise, not an optimum.
+   weight dense up for natural-language queries; a fixed 1:1 is a compromise, not an optimum.
+   (Implemented and validated below — note the "BM25-up for code" half of this intuition was
+   *refuted* by the data.)
 3. **Reranking improves top-1 precision.** `hybrid+rerank` lifts R@1 0.364 → 0.409 (+12%) over
    hybrid with the tiny ms-marco model — consistent with the git-mined result (+55% on 10
    noisier cases). The reranker reliably sharpens the top of the list; it operates on the
@@ -197,6 +198,32 @@ or a smaller candidate pool. The MiniLM default is the pragmatic local choice.
 weighting** (finding 2), then **reranking for top-1** (finding 3) — not a bigger embedding
 model (finding 1). Validate these on a larger external repo next.
 
+### Adaptive fusion weighting (finding #2, implemented)
+
+`CODERAG_ADAPTIVE_FUSION=1` (or `coderag eval --adaptive`) routes the fusion weights by query
+type: a cheap local heuristic (`looks_like_identifier`) leans **dense** for natural-language
+queries and stays **neutral** for identifier-like ones. Validated on bge-small at symbol
+level against fixed 1:1 hybrid, on two 22-case sets:
+
+```
+NL queries (coderag_self_symbols.jsonl)     MRR    R@1    nDCG@10
+  hybrid (fixed 1:1)                         0.604  0.455  0.669
+  adaptive                                   0.674  0.545  0.722     ← +0.070 MRR, +20% R@1
+
+identifier queries (coderag_self_identifiers.jsonl)
+  hybrid (fixed 1:1)                         0.685  0.545  0.741
+  adaptive                                   0.685  0.545  0.741     ← unchanged (no regression)
+```
+
+So adaptive is a **Pareto improvement** over fixed hybrid here: big gain on NL, no loss on
+identifiers. **Honest caveat that shaped the defaults:** the literature's "BM25-up for code"
+intuition was *refuted* by the data — up-weighting BM25 for identifier queries actively hurt
+(MRR 0.685 → 0.613), because short/common identifiers (`search`, `index`) are lexically
+ambiguous and the embedder already matches them well. So the code-side default is **neutral
+(1:1)**, not BM25-leaning; BM25-leaning is left configurable (`CODERAG_CODE_LEXICAL_WEIGHT`)
+for larger repos where exact-string recall matters more. Off by default pending larger-repo
+validation; enable with `CODERAG_ADAPTIVE_FUSION=1`.
+
 ## Dataset format
 
 JSONL, one case per line:

diff --git a/docs/research/code-retrieval-strategy.md b/docs/research/code-retrieval-strategy.md
@@ -131,12 +131,13 @@ bolt-on. Treat as a later experiment, not a v1 move.
 
 ## 3. Tune and route the hybrid fusion you already have
 
-> **Update (measured — now the #1 lever).** Symbol-level eval on this repo showed equal-weight
-> hybrid can *lose* to dense alone on NL queries: `bge-small` dense MRR 0.675 vs hybrid 0.573,
-> because weak BM25 (0.427) drags the strong dense ranking down via 1:1 RRF. For the weaker
-> jina-code, BM25 *helps* (hybrid 0.604 > dense 0.483). So fusion weighting should be
-> **query-type-aware** (dense-up for NL, BM25-up for identifiers) rather than fixed 1:1 — this
-> was the single biggest lever found, ahead of a bigger embedder. See [docs/eval.md](../eval.md).
+> **Update (implemented & validated).** Query-type-aware fusion weighting now exists
+> (`config.adaptive_fusion`, `coderag/retrieval/query_type.py`, `coderag eval --adaptive`). On
+> symbol-level eval it is a **Pareto improvement** over fixed 1:1 hybrid: NL queries +0.070 MRR
+> / +20% R@1 (leaning dense), identifier queries unchanged. **The "BM25-up for identifiers"
+> half of the hypothesis was refuted** — up-weighting BM25 there *hurt* (short identifiers are
+> lexically ambiguous; the embedder already matches them), so the code-side default is neutral.
+> Off by default pending larger-repo validation. See [docs/eval.md](../eval.md).
 
 CodeRAG already does dense + BM25 + RRF — the literature says that's the right foundation; the wins
 are in **routing and tuning**:

diff --git a/scripts/bench_embedders.py b/scripts/bench_embedders.py
@@ -51,6 +51,11 @@ def main() -> int:
         default="",
         help="Comma-separated reranker ids; one hybrid+rerank row per reranker.",
     )
+    ap.add_argument(
+        "--adaptive",
+        action="store_true",
+        help="Also score an 'adaptive' row (query-type-aware fusion weighting).",
+    )
     args = ap.parse_args()
 
     repo = Path(args.repo).resolve()
@@ -80,7 +85,12 @@ def main() -> int:
 
                 default_reranker = get_reranker(cfg.with_overrides(rerank=True))
             for r in compare_modes(
-                cr, cases, ks=ks, level=args.level, reranker=default_reranker
+                cr,
+                cases,
+                ks=ks,
+                level=args.level,
+                reranker=default_reranker,
+                adaptive=args.adaptive,
             ):
                 r.label = _label(model, r.label)
                 rows.append(r)