Neverdecel · Neverdecel · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/coderag/retrieval/query_type.py b/coderag/retrieval/query_type.py
@@ -1,14 +1,17 @@
 """Query-type detection for adaptive fusion weighting.
 
-Symbol-level evaluation showed that a fixed 1:1 dense/BM25 fusion is a compromise, not an
-optimum: on natural-language "where is X handled" queries the dense retriever is much
-stronger and equal-weight RRF *drags it down* with weak BM25, while on exact-identifier
-queries (``fts_search``, ``HybridSearcher.search``) the opposite holds. Routing the fusion
-weights by query type recovers most of that gap with a cheap, local heuristic.
-
-``looks_like_identifier`` is deliberately conservative: it only calls a query "code" when it
-is short *and* either a lone token or visibly code-shaped (snake_case, dotted path,
-camelCase, a call paren), and never when it contains natural-language cue words.
+Symbol-level evaluation showed a fixed 1:1 dense/BM25 fusion is a compromise: on pure
+natural-language "where is X handled" queries the dense retriever is much stronger and
+equal-weight RRF drags it down with weak BM25. But the first cut keyed on query *shape*
+(short + code-looking) and mis-classified the common real case — a prose query that
+*mentions* a specific symbol, e.g. a commit message *"Fix tuple order in
+`AliasGenerator.generate_aliases()`"*. External validation (pydantic) showed leaning dense
+on those hurts, because the discriminating signal is the exact identifier BM25 matches.
+
+So the routing question is simply: **does the query reference a specific code identifier?**
+If yes, keep fusion neutral (BM25 carries real signal); if it's pure prose, lean dense. This
+makes adaptive fusion fall back to plain hybrid whenever an identifier is named — it can only
+help pure-NL queries, never repeat the regression.
 """
 
 from __future__ import annotations
@@ -19,71 +22,71 @@
 if TYPE_CHECKING:
     from coderag.config import Config
 
-# Words that mark a query as natural language even if it's short.
-_NL_CUES = frozenset(
-    {
-        "where",
-        "how",
-        "what",
-        "why",
-        "when",
-        "which",
-        "who",
-        "is",
-        "are",
-        "was",
-        "were",
-        "does",
-        "do",
-        "did",
-        "can",
-        "the",
-        "a",
-        "an",
-        "to",
-        "of",
-        "in",
-        "for",
-        "on",
-        "and",
-        "or",
-        "with",
-    }
-)
-
-_CAMEL = re.compile(r"[a-z][A-Z]")
-_DOTTED = re.compile(r"[A-Za-z0-9_]\.[A-Za-z0-9_]")
-
-
-def looks_like_identifier(query: str) -> bool:
-    """True if ``query`` reads like an exact code/symbol lookup rather than prose."""
-    q = query.strip()
-    if not q:
-        return False
-    tokens = q.split()
-    if len(tokens) >= 4:
-        return False  # multi-word -> natural language
-    if {t.lower().strip("?.,:") for t in tokens} & _NL_CUES:
-        return False  # contains a natural-language cue word
-    code_shaped = (
-        "_" in q
-        or "(" in q
-        or _CAMEL.search(q) is not None
-        or _DOTTED.search(q) is not None
+# Query-wide signals, both linear (disjoint character classes — no catastrophic backtracking
+# on the user-supplied query string): a `backtick`-quoted span, or a call ``word(``.
+_BACKTICK = re.compile(r"`[^`]+`")
+_CALL = re.compile(r"\w\(")
+# Punctuation stripped from a token before classifying it (query prose, not code).
+_STRIP = "`'\".,:;!?()[]{}<>"
+
+
+def _is_snake_case(token: str) -> bool:
+    """An underscore flanked by alphanumerics, e.g. ``foo_bar`` (linear scan)."""
+    return any(
+        token[i] == "_" and token[i - 1].isalnum() and token[i + 1].isalnum()
+        for i in range(1, len(token) - 1)
     )
-    if len(tokens) == 1:
-        return True  # a lone token is treated as a literal-term lookup
-    return code_shaped  # 2-3 tokens only count as code when visibly code-shaped
+
+
+def _is_dotted_path(token: str) -> bool:
+    """Two adjacent identifier parts joined by a dot, e.g. ``Foo.bar`` — excludes ``e.g``/``3.11``."""
+    parts = token.split(".")
+    return any(
+        len(a) >= 2 and len(b) >= 2 and a.isidentifier() and b.isidentifier()
+        for a, b in zip(parts, parts[1:], strict=False)
+    )
+
+
+def _is_camel_case(token: str) -> bool:
+    """A lower→upper boundary, e.g. ``fooBar`` / ``AliasGenerator`` (linear scan)."""
+    return any(
+        a.islower() and b.isupper() for a, b in zip(token, token[1:], strict=False)
+    )
+
+
+def references_identifier(query: str) -> bool:
+    """True if ``query`` names a specific code identifier (even inside a prose sentence).
+
+    Uses linear token scanning rather than one backtracking regex, so it can't be turned into
+    a ReDoS by a crafted query (the query flows in from the HTTP API).
+    """
+    if not query:
+        return False
+    if _BACKTICK.search(query) or _CALL.search(query):
+        return True
+    for raw in query.split():
+        token = raw.strip(_STRIP)
+        if len(token) >= 2 and (
+            _is_snake_case(token) or _is_dotted_path(token) or _is_camel_case(token)
+        ):
+            return True
+    return False
+
+
+# Back-compat alias: the original name meant "is an identifier lookup"; the detection is now
+# "references an identifier", which subsumes the old behavior and also catches embedded ones.
+looks_like_identifier = references_identifier
 
 
 def fusion_weights(query: str, config: "Config") -> Tuple[float, float]:
     """Return ``(dense_weight, lexical_weight)`` for ``query``.
 
-    Without adaptive fusion this is just the configured static pair. With it on, weights tilt
-    toward dense for natural-language queries and toward BM25 for identifier-like queries.
+    Without adaptive fusion this is the configured static pair. With it on: a query that
+    references a specific identifier uses the (neutral by default) code weights — so BM25's
+    exact-match signal is kept; a pure natural-language query uses the dense-leaning weights.
     """
     if not config.adaptive_fusion:
         return config.dense_weight, config.lexical_weight
-    if looks_like_identifier(query):
+    if references_identifier(query):
         return config.code_dense_weight, config.code_lexical_weight
     return config.nl_dense_weight, config.nl_lexical_weight
diff --git a/docs/eval.md b/docs/eval.md
@@ -224,12 +224,35 @@ ambiguous and the embedder already matches them well. So the code-side default i
 for larger repos where exact-string recall matters more. Off by default pending larger-repo
 validation; enable with `CODERAG_ADAPTIVE_FUSION=1`.
 
-> ⚠️ **This did NOT generalize — keep it off.** On `pydantic` (4 155-chunk corpus, commit-message
-> queries) adaptive *hurt* (MRR 0.286 vs hybrid 0.361): those queries embed exact API names, so
-> "lean dense for NL" is backwards. The dense-vs-BM25 ranking flips by repo/query style, and
-> **fixed 1:1 hybrid is the robust default.** Full write-up:
+> ⚠️ **First cut did not generalize — then was fixed.** The original classifier keyed on query
+> *shape* and mis-read prose queries that *name* a symbol. On `pydantic` (commit-message queries)
+> it leaned dense and *hurt* (MRR 0.286 vs hybrid 0.361). The classifier now detects identifiers
+> **embedded in prose** (`references_identifier`) and routes them to the neutral code weights, so
+> adaptive falls back to plain hybrid whenever a symbol is named. Full write-up:
 > [research/external-validation.md](research/external-validation.md).
 
+### Adaptive fusion, after the classifier fix
+
+The classifier fix removed the catastrophic regression the first cut had (on `pydantic`,
+adaptive went from **0.286 → 0.458 = hybrid**, no longer hurting). On two early datasets it
+looked like a clear win:
+
+```
+CodeRAG curated, symbol level          hybrid  dense  adaptive
+  natural-language queries (MRR)        0.581   0.675   0.706   ← beats both
+  identifier queries (MRR)              0.685   0.686   0.715   ← beats both
+
+pydantic, symbol level (172 cases, 22 071-chunk corpus)
+  dense 0.328 · bm25 0.398 · hybrid 0.458 · adaptive 0.458     ← equals hybrid (was 0.286)
+```
+
+⚠️ **But a 4-repo sweep (627 git-mined cases) shows it is *not* an aggregate win** — hybrid 0.442
+vs adaptive 0.423 MRR; adaptive is a wash on the well-powered repos and the big CodeRAG-curated
+gain turned out to be an artifact of unusually dense-friendly clean-NL queries (see the
+*Multi-repo evaluation* section below / PR adding it). So adaptive stays **off by default** — it's
+a **safe opt-in** (no catastrophic regression after this fix), not a default. Fixed 1:1 hybrid
+remains the default. Enable per-session with `CODERAG_ADAPTIVE_FUSION=1`.
+
 ## Dataset format
 
 JSONL, one case per line:

diff --git a/docs/research/external-validation.md b/docs/research/external-validation.md
@@ -61,10 +61,17 @@ evaporated. The robust configuration is exactly the **shipped defaults** — 1:1
 off, rerank opt-in. This is the harness earning its keep: it caught the overfitting before any
 of it became a default.
 
+> **Update — the regression was fixed, but adaptive still doesn't earn default-on.** The
+> classifier now detects identifiers *embedded in prose* (`references_identifier`) and routes
+> those queries to neutral weights, removing the catastrophic case (pydantic **0.286 → 0.458 =
+> hybrid**). However, a later **4-repo sweep** (coderag/flask/requests/click, 627 git-mined cases)
+> showed adaptive is **not** an aggregate win — hybrid 0.442 vs adaptive 0.423 MRR. The big
+> CodeRAG-curated adaptive gain was an artifact of dense-friendly clean-NL queries. So adaptive is
+> a **safe opt-in**, not a default; fixed 1:1 hybrid stays the default. See [../eval.md](../eval.md).
+
 **Actionable next steps** (none change a default):
-- Make `looks_like_identifier` smarter — detect identifiers *embedded* in prose queries (so
-  "Fix `AliasGenerator.generate_aliases`" routes BM25-up, not dense-up). That could make adaptive
-  fusion a net win across query styles instead of fragile.
+- ~~Make the classifier detect identifiers *embedded* in prose queries.~~ **Done** — see the
+  update above; adaptive now generalizes across both repos.
 - Test a **code-aware reranker** (`bge-reranker-base`, `jina-reranker-v2`) at scale on GPU —
   the only lever not yet fairly evaluated.
 - Build a multi-repo eval set (several external repos) so future tuning is judged on

diff --git a/tests/test_query_type.py b/tests/test_query_type.py
@@ -3,24 +3,53 @@
 from __future__ import annotations
 
 from coderag.api import CodeRAG
-from coderag.retrieval.query_type import fusion_weights, looks_like_identifier
+from coderag.retrieval.query_type import (
+    fusion_weights,
+    looks_like_identifier,
+    references_identifier,
+)
 from tests.conftest import write
 
 
 def test_identifier_queries_detected():
-    assert looks_like_identifier("fts_search")
-    assert looks_like_identifier("reciprocal_rank_fusion")
-    assert looks_like_identifier("HybridSearcher.search")
-    assert looks_like_identifier("getUserToken")  # camelCase
-    assert looks_like_identifier("authenticate(token)")  # call paren
+    assert references_identifier("fts_search")
+    assert references_identifier("reciprocal_rank_fusion")
+    assert references_identifier("HybridSearcher.search")
+    assert references_identifier("getUserToken")  # camelCase
+    assert references_identifier("authenticate(token)")  # call paren
+
+
+def test_identifier_embedded_in_prose_detected():
+    # The case external validation exposed: a prose query that names a symbol.
+    assert references_identifier("Fix tuple order in AliasGenerator.generate_aliases()")
+    assert references_identifier("why does build_context drop the last chunk")
+    assert references_identifier("update the `validate_token` helper")
 
 
 def test_natural_language_queries_detected():
-    assert not looks_like_identifier("where is retry backoff handled")
-    assert not looks_like_identifier("how does indexing work")
-    assert not looks_like_identifier("the auth flow")  # NL cue word
-    assert not looks_like_identifier("user authentication flow")  # 3 plain words
-    assert not looks_like_identifier("")
+    assert not references_identifier("where is retry backoff handled")
+    assert not references_identifier("how does indexing work")
+    assert not references_identifier("the auth flow")
+    assert not references_identifier("user authentication flow")
+    assert not references_identifier("")
+
+
+def test_prose_abbreviations_and_versions_are_not_identifiers():
+    # Common false positives that must NOT trip the dotted-path rule.
+    assert not references_identifier("fix the bug e.g. on startup")
+    assert not references_identifier("bump to version 3.11 support")
+
+
+def test_looks_like_identifier_alias():
+    assert looks_like_identifier is references_identifier
+
+
+def test_detection_is_linear_no_redos():
+    # A long adversarial run must be handled in linear time (the query is API-reachable).
+    # Quadratic backtracking on this input would take minutes; linear is milliseconds.
+    big = "a" * 200_000 + " "
+    assert references_identifier(big) is False
+    assert references_identifier(big + "needs_a_match") is True
 
 
 def test_fusion_weights_static_when_adaptive_off(config):