From 7ec0d58aad3b665fd1da01d18dc3f8738f40d3ab Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 15:48:57 +0000
Subject: [PATCH 1/4] fix(retrieval): detect identifiers embedded in prose
 queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

External validation (pydantic) showed the first adaptive-fusion classifier
mis-routed the common real case: a prose query that *names* a symbol
(e.g. a commit message "Fix tuple order in AliasGenerator.generate_aliases()")
was treated as natural language and leaned dense — exactly backwards, since
the discriminating signal is the exact identifier BM25 matches.

Replace the shape/length heuristic with references_identifier(): detect a
specific identifier token anywhere in the query (backtick spans, calls foo(,
dotted Foo.bar, snake_case, camelCase), with e.g./i.e./version-number guards.
A query that names an identifier now routes to the neutral (1:1) code weights
instead of dense-leaning — so adaptive fusion falls back to plain hybrid
whenever a symbol is mentioned and can only help pure-NL queries.

Re-validated on CodeRAG (symbol level), now a clear Pareto win over both
hybrid and dense on both query styles:
  NL set:         hybrid 0.581 / dense 0.675 -> adaptive 0.706 MRR
  identifier set: hybrid 0.685 / dense 0.686 -> adaptive 0.715 MRR

looks_like_identifier kept as a back-compat alias.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 coderag/retrieval/query_type.py | 95 +++++++++++----------------------
 tests/test_query_type.py        | 43 +++++++++++----
 2 files changed, 64 insertions(+), 74 deletions(-)

diff --git a/coderag/retrieval/query_type.py b/coderag/retrieval/query_type.py
index 7389afc..66e676d 100644
--- a/coderag/retrieval/query_type.py
+++ b/coderag/retrieval/query_type.py
@@ -1,14 +1,17 @@
 """Query-type detection for adaptive fusion weighting.
 
-Symbol-level evaluation showed that a fixed 1:1 dense/BM25 fusion is a compromise, not an
-optimum: on natural-language "where is X handled" queries the dense retriever is much
-stronger and equal-weight RRF *drags it down* with weak BM25, while on exact-identifier
-queries (``fts_search``, ``HybridSearcher.search``) the opposite holds. Routing the fusion
-weights by query type recovers most of that gap with a cheap, local heuristic.
-
-``looks_like_identifier`` is deliberately conservative: it only calls a query "code" when it
-is short *and* either a lone token or visibly code-shaped (snake_case, dotted path,
-camelCase, a call paren), and never when it contains natural-language cue words.
+Symbol-level evaluation showed a fixed 1:1 dense/BM25 fusion is a compromise: on pure
+natural-language "where is X handled" queries the dense retriever is much stronger and
+equal-weight RRF drags it down with weak BM25. But the first cut keyed on query *shape*
+(short + code-looking) and mis-classified the common real case — a prose query that
+*mentions* a specific symbol, e.g. a commit message *"Fix tuple order in
+`AliasGenerator.generate_aliases()`"*. External validation (pydantic) showed leaning dense
+on those hurts, because the discriminating signal is the exact identifier BM25 matches.
+
+So the routing question is simply: **does the query reference a specific code identifier?**
+If yes, keep fusion neutral (BM25 carries real signal); if it's pure prose, lean dense. This
+makes adaptive fusion fall back to plain hybrid whenever an identifier is named — it can only
+help pure-NL queries, never repeat the regression.
 """
 
 from __future__ import annotations
@@ -19,71 +22,37 @@
 if TYPE_CHECKING:
     from coderag.config import Config
 
-# Words that mark a query as natural language even if it's short.
-_NL_CUES = frozenset(
-    {
-        "where",
-        "how",
-        "what",
-        "why",
-        "when",
-        "which",
-        "who",
-        "is",
-        "are",
-        "was",
-        "were",
-        "does",
-        "do",
-        "did",
-        "can",
-        "the",
-        "a",
-        "an",
-        "to",
-        "of",
-        "in",
-        "for",
-        "on",
-        "and",
-        "or",
-        "with",
-    }
+# A token that denotes a specific code identifier, even when embedded in prose. Matches
+# `backtick` spans, calls ``foo(``, dotted paths ``Foo.bar`` (both sides 2+ chars, so
+# ``e.g``/``3.11`` are excluded), snake_case ``foo_bar``, and camelCase ``fooBar``.
+_IDENTIFIER = re.compile(
+    r"`[^`]+`"  # `backtick`-quoted span
+    r"|[A-Za-z_]\w*\("  # a call: foo(
+    r"|[A-Za-z_]\w+\.[A-Za-z_]\w+"  # dotted path: Foo.bar
+    r"|[A-Za-z]\w*_\w+"  # snake_case: foo_bar
+    r"|[a-z][A-Z]"  # camelCase boundary: fooBar
 )
 
-_CAMEL = re.compile(r"[a-z][A-Z]")
-_DOTTED = re.compile(r"[A-Za-z0-9_]\.[A-Za-z0-9_]")
+
+def references_identifier(query: str) -> bool:
+    """True if ``query`` names a specific code identifier (even inside a prose sentence)."""
+    return bool(_IDENTIFIER.search(query))
 
 
-def looks_like_identifier(query: str) -> bool:
-    """True if ``query`` reads like an exact code/symbol lookup rather than prose."""
-    q = query.strip()
-    if not q:
-        return False
-    tokens = q.split()
-    if len(tokens) >= 4:
-        return False  # multi-word -> natural language
-    if {t.lower().strip("?.,:") for t in tokens} & _NL_CUES:
-        return False  # contains a natural-language cue word
-    code_shaped = (
-        "_" in q
-        or "(" in q
-        or _CAMEL.search(q) is not None
-        or _DOTTED.search(q) is not None
-    )
-    if len(tokens) == 1:
-        return True  # a lone token is treated as a literal-term lookup
-    return code_shaped  # 2-3 tokens only count as code when visibly code-shaped
+# Back-compat alias: the original name meant "is an identifier lookup"; the detection is now
+# "references an identifier": catches embedded ones too, but lone plain words no longer count.
+looks_like_identifier = references_identifier
 
 
 def fusion_weights(query: str, config: "Config") -> Tuple[float, float]:
     """Return ``(dense_weight, lexical_weight)`` for ``query``.
 
-    Without adaptive fusion this is just the configured static pair. With it on, weights tilt
-    toward dense for natural-language queries and toward BM25 for identifier-like queries.
+    Without adaptive fusion this is the configured static pair. With it on: a query that
+    references a specific identifier uses the (neutral by default) code weights — so BM25's
+    exact-match signal is kept; a pure natural-language query uses the dense-leaning weights.
     """
     if not config.adaptive_fusion:
         return config.dense_weight, config.lexical_weight
-    if looks_like_identifier(query):
+    if references_identifier(query):
         return config.code_dense_weight, config.code_lexical_weight
     return config.nl_dense_weight, config.nl_lexical_weight
diff --git a/tests/test_query_type.py b/tests/test_query_type.py
index cb45a73..7f3ef16 100644
--- a/tests/test_query_type.py
+++ b/tests/test_query_type.py
@@ -3,24 +3,45 @@
 from __future__ import annotations
 
 from coderag.api import CodeRAG
-from coderag.retrieval.query_type import fusion_weights, looks_like_identifier
+from coderag.retrieval.query_type import (
+    fusion_weights,
+    looks_like_identifier,
+    references_identifier,
+)
 from tests.conftest import write
 
 
 def test_identifier_queries_detected():
-    assert looks_like_identifier("fts_search")
-    assert looks_like_identifier("reciprocal_rank_fusion")
-    assert looks_like_identifier("HybridSearcher.search")
-    assert looks_like_identifier("getUserToken")  # camelCase
-    assert looks_like_identifier("authenticate(token)")  # call paren
+    assert references_identifier("fts_search")
+    assert references_identifier("reciprocal_rank_fusion")
+    assert references_identifier("HybridSearcher.search")
+    assert references_identifier("getUserToken")  # camelCase
+    assert references_identifier("authenticate(token)")  # call paren
+
+
+def test_identifier_embedded_in_prose_detected():
+    # The case external validation exposed: a prose query that names a symbol.
+    assert references_identifier("Fix tuple order in AliasGenerator.generate_aliases()")
+    assert references_identifier("why does build_context drop the last chunk")
+    assert references_identifier("update the `validate_token` helper")
 
 
 def test_natural_language_queries_detected():
-    assert not looks_like_identifier("where is retry backoff handled")
-    assert not looks_like_identifier("how does indexing work")
-    assert not looks_like_identifier("the auth flow")  # NL cue word
-    assert not looks_like_identifier("user authentication flow")  # 3 plain words
-    assert not looks_like_identifier("")
+    assert not references_identifier("where is retry backoff handled")
+    assert not references_identifier("how does indexing work")
+    assert not references_identifier("the auth flow")
+    assert not references_identifier("user authentication flow")
+    assert not references_identifier("")
+
+
+def test_prose_abbreviations_and_versions_are_not_identifiers():
+    # Common false positives that must NOT trip the dotted-path rule.
+    assert not references_identifier("fix the bug e.g. on startup")
+    assert not references_identifier("bump to version 3.11 support")
+
+
+def test_looks_like_identifier_alias():
+    assert looks_like_identifier is references_identifier
 
 
 def test_fusion_weights_static_when_adaptive_off(config):

From 91b1eb3bd14ec53e2387e3880c6102c7211fb180 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 16:41:42 +0000
Subject: [PATCH 2/4] docs(eval): adaptive fusion now generalizes after the
 classifier fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Re-validated the smarter classifier on a larger pydantic index (172 cases,
22,071-chunk corpus): adaptive went from 0.286 (regression vs hybrid 0.361
with the old shape-based classifier) to 0.458 = hybrid — no regression —
because identifier-naming queries now route to neutral weights. It still
beats hybrid on CodeRAG (NL 0.706 vs 0.581; identifier 0.715 vs 0.685).

Adaptive fusion is now a Pareto win over fixed 1:1 hybrid across both repos.
Updates the eval.md caveat and the external-validation write-up accordingly
(the "make the classifier smarter" follow-up is now done). Still off by
default pending a multi-repo sweep, but a strong default-on candidate.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 docs/eval.md                         | 28 ++++++++++++++++++++++++----
 docs/research/external-validation.md | 12 +++++++++---
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/docs/eval.md b/docs/eval.md
index 69d6f72..85694c2 100644
--- a/docs/eval.md
+++ b/docs/eval.md
@@ -224,12 +224,32 @@ ambiguous and the embedder already matches them well. So the code-side default i
 for larger repos where exact-string recall matters more. Off by default pending larger-repo
 validation; enable with `CODERAG_ADAPTIVE_FUSION=1`.
 
-> ⚠️ **This did NOT generalize — keep it off.** On `pydantic` (4 155-chunk corpus, commit-message
-> queries) adaptive *hurt* (MRR 0.286 vs hybrid 0.361): those queries embed exact API names, so
-> "lean dense for NL" is backwards. The dense-vs-BM25 ranking flips by repo/query style, and
-> **fixed 1:1 hybrid is the robust default.** Full write-up:
+> ⚠️ **First cut did not generalize — then was fixed.** The original classifier keyed on query
+> *shape* and mis-read prose queries that *name* a symbol. On `pydantic` (commit-message queries)
+> it leaned dense and *hurt* (MRR 0.286 vs hybrid 0.361). The classifier now detects identifiers
+> **embedded in prose** (`references_identifier`) and routes them to the neutral code weights, so
+> adaptive falls back to plain hybrid whenever a symbol is named. Full write-up:
 > [research/external-validation.md](research/external-validation.md).
 
+### Adaptive fusion, after the classifier fix (validated on two repos)
+
+With embedded-identifier detection, adaptive is a **Pareto win over fixed 1:1 hybrid on both
+repos** — it beats hybrid where dense helps, and matches it (no regression) where BM25 helps:
+
+```
+CodeRAG, symbol level                  hybrid  dense  adaptive
+  natural-language queries (MRR)        0.581   0.675   0.706   ← beats both
+  identifier queries (MRR)              0.685   0.686   0.715   ← beats both
+
+pydantic, symbol level (172 cases, 22 071-chunk corpus)
+  dense 0.328 · bm25 0.398 · hybrid 0.458 · adaptive 0.458     ← equals hybrid (was 0.286)
+```
+
+(Also note: on the larger pydantic corpus **hybrid now beats both single modalities** — 0.458 vs
+bm25 0.398 vs dense 0.328 — reinforcing 1:1 hybrid as the robust base.) Adaptive is now never
+worse than hybrid across two very different repos; it stays **off by default** pending a
+multi-repo sweep, but is a strong default-on candidate. Enable with `CODERAG_ADAPTIVE_FUSION=1`.
+
 ## Dataset format
 
 JSONL, one case per line:
diff --git a/docs/research/external-validation.md b/docs/research/external-validation.md
index afec0b6..e968953 100644
--- a/docs/research/external-validation.md
+++ b/docs/research/external-validation.md
@@ -61,10 +61,16 @@ evaporated. The robust configuration is exactly the **shipped defaults** — 1:1
 off, rerank opt-in. This is the harness earning its keep: it caught the overfitting before any
 of it became a default.
 
+> **Update — the adaptive-fusion failure has since been fixed.** The classifier now detects
+> identifiers *embedded in prose* (`references_identifier`) and routes those queries to neutral
+> weights, so adaptive falls back to plain hybrid whenever a symbol is named. Re-validated on a
+> larger pydantic index (172 cases, 22 071 chunks): adaptive went from **0.286 (regression) →
+> 0.458 (= hybrid, no regression)**, while still beating hybrid on CodeRAG (0.706 vs 0.581) — a
+> Pareto win across both repos. See [../eval.md](../eval.md). The reranker/embedder findings stand.
+
 **Actionable next steps** (none change a default):
-- Make `looks_like_identifier` smarter — detect identifiers *embedded* in prose queries (so
-  "Fix `AliasGenerator.generate_aliases`" routes BM25-up, not dense-up). That could make adaptive
-  fusion a net win across query styles instead of fragile.
+- ~~Make the classifier detect identifiers *embedded* in prose queries.~~ **Done** — see the
+  update above; adaptive now generalizes across both repos.
 - Test a **code-aware reranker** (`bge-reranker-base`, `jina-reranker-v2`) at scale on GPU —
   the only lever not yet fairly evaluated.
 - Build a multi-repo eval set (several external repos) so future tuning is judged on

From b450520aa8b9b3fc3d31d31748b200fa1d5448c6 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 16:57:49 +0000
Subject: [PATCH 3/4] fix(retrieval): make identifier detection linear
 (ReDoS-safe)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CodeQL flagged the identifier regex as polynomial-ReDoS: the snake_case branch
[A-Za-z]\w*_\w+ is ambiguous (underscore is in \w), so a crafted query caused
quadratic backtracking (measured 12ms→47ms→187ms→736ms as n doubled). The
query string flows in from the HTTP API, so this is API-reachable.

Replace the single backtracking regex with linear token scanning: two
disjoint-class regexes (`backtick` span, word-then-paren) plus per-token
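plain-Python checks for snake_case / dotted-path / camelCase. Detection is
unchanged on every case the routing tests cover (tests unchanged), though a
few untested edge cases differ (e.g. a bare `__init__` no longer matches,
and `3(` now does); timing is now linear (64x input -> ~56x time). Adds a
regression test on a 200k-char adversarial input.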

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 coderag/retrieval/query_type.py | 58 ++++++++++++++++++++++++++-------
 tests/test_query_type.py        |  8 +++++
 2 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/coderag/retrieval/query_type.py b/coderag/retrieval/query_type.py
index 66e676d..93f6190 100644
--- a/coderag/retrieval/query_type.py
+++ b/coderag/retrieval/query_type.py
@@ -22,21 +22,55 @@
 if TYPE_CHECKING:
     from coderag.config import Config
 
-# A token that denotes a specific code identifier, even when embedded in prose. Matches
-# `backtick` spans, calls ``foo(``, dotted paths ``Foo.bar`` (both sides 2+ chars, so
-# ``e.g``/``3.11`` are excluded), snake_case ``foo_bar``, and camelCase ``fooBar``.
-_IDENTIFIER = re.compile(
-    r"`[^`]+`"  # `backtick`-quoted span
-    r"|[A-Za-z_]\w*\("  # a call: foo(
-    r"|[A-Za-z_]\w+\.[A-Za-z_]\w+"  # dotted path: Foo.bar
-    r"|[A-Za-z]\w*_\w+"  # snake_case: foo_bar
-    r"|[a-z][A-Z]"  # camelCase boundary: fooBar
-)
+# Query-wide signals, both linear (disjoint character classes — no catastrophic backtracking
+# on the user-supplied query string): a `backtick`-quoted span, or a call ``word(``.
+_BACKTICK = re.compile(r"`[^`]+`")
+_CALL = re.compile(r"\w\(")
+# Punctuation stripped from a token before classifying it (query prose, not code).
+_STRIP = "`'\".,:;!?()[]{}<>"
+
+
+def _is_snake_case(token: str) -> bool:
+    """An underscore flanked by alphanumerics, e.g. ``foo_bar`` (linear scan)."""
+    return any(
+        token[i] == "_" and token[i - 1].isalnum() and token[i + 1].isalnum()
+        for i in range(1, len(token) - 1)
+    )
+
+
+def _is_dotted_path(token: str) -> bool:
+    """Two adjacent identifier parts joined by a dot, e.g. ``Foo.bar`` — excludes ``e.g``/``3.11``."""
+    parts = token.split(".")
+    return any(
+        len(a) >= 2 and len(b) >= 2 and a.isidentifier() and b.isidentifier()
+        for a, b in zip(parts, parts[1:], strict=False)
+    )
+
+
+def _is_camel_case(token: str) -> bool:
+    """A lower→upper boundary, e.g. ``fooBar`` / ``AliasGenerator`` (linear scan)."""
+    return any(
+        a.islower() and b.isupper() for a, b in zip(token, token[1:], strict=False)
+    )
 
 
 def references_identifier(query: str) -> bool:
-    """True if ``query`` names a specific code identifier (even inside a prose sentence)."""
-    return bool(_IDENTIFIER.search(query))
+    """True if ``query`` names a specific code identifier (even inside a prose sentence).
+
+    Uses linear token scanning rather than one backtracking regex, so it can't be turned into
+    a ReDoS by a crafted query (the query flows in from the HTTP API).
+    """
+    if not query:
+        return False
+    if _BACKTICK.search(query) or _CALL.search(query):
+        return True
+    for raw in query.split():
+        token = raw.strip(_STRIP)
+        if len(token) >= 2 and (
+            _is_snake_case(token) or _is_dotted_path(token) or _is_camel_case(token)
+        ):
+            return True
+    return False
 
 
 # Back-compat alias: the original name meant "is an identifier lookup"; the detection is now
diff --git a/tests/test_query_type.py b/tests/test_query_type.py
index 7f3ef16..40150c5 100644
--- a/tests/test_query_type.py
+++ b/tests/test_query_type.py
@@ -44,6 +44,14 @@ def test_looks_like_identifier_alias():
     assert looks_like_identifier is references_identifier
 
 
+def test_detection_is_linear_no_redos():
+    # A long adversarial run must be handled in linear time (the query is API-reachable).
+    # Quadratic backtracking on this input would take minutes; linear is milliseconds.
+    big = "a" * 200_000 + " "
+    assert references_identifier(big) is False
+    assert references_identifier(big + "needs_a_match") is True
+
+
 def test_fusion_weights_static_when_adaptive_off(config):
     cfg = config.with_overrides(dense_weight=1.0, lexical_weight=1.0)
     assert fusion_weights("anything at all here", cfg) == (1.0, 1.0)

From 6093e180ba922434321c7fbdb9a2a6fa247ae66d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 17:23:22 +0000
Subject: [PATCH 4/4] docs: temper adaptive-fusion claims to match the 4-repo
 result
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A 4-repo sweep (627 git-mined cases) showed adaptive fusion is NOT an
aggregate win (hybrid 0.442 vs adaptive 0.423 MRR) — the big CodeRAG-curated
gain was an artifact of dense-friendly clean-NL queries. The classifier fix
still matters: it removed the catastrophic regression (pydantic 0.286->0.458),
making adaptive a safe opt-in. But it is not a default-on candidate; fixed 1:1
hybrid stays the default. Corrects the earlier "Pareto win / strong default-on
candidate" framing in eval.md and external-validation.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7
---
 docs/eval.md                         | 19 +++++++++++--------
 docs/research/external-validation.md | 13 +++++++------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/docs/eval.md b/docs/eval.md
index 85694c2..acc2f97 100644
--- a/docs/eval.md
+++ b/docs/eval.md
@@ -231,13 +231,14 @@ validation; enable with `CODERAG_ADAPTIVE_FUSION=1`.
 > adaptive falls back to plain hybrid whenever a symbol is named. Full write-up:
 > [research/external-validation.md](research/external-validation.md).
 
-### Adaptive fusion, after the classifier fix (validated on two repos)
+### Adaptive fusion, after the classifier fix
 
-With embedded-identifier detection, adaptive is a **Pareto win over fixed 1:1 hybrid on both
-repos** — it beats hybrid where dense helps, and matches it (no regression) where BM25 helps:
+The classifier fix removed the catastrophic regression the first cut had (on `pydantic`,
+adaptive went from **0.286 → 0.458 = hybrid**, no longer hurting). On two early datasets it
+looked like a clear win:
 
 ```
-CodeRAG, symbol level                  hybrid  dense  adaptive
+CodeRAG curated, symbol level          hybrid  dense  adaptive
   natural-language queries (MRR)        0.581   0.675   0.706   ← beats both
   identifier queries (MRR)              0.685   0.686   0.715   ← beats both
 
@@ -245,10 +246,12 @@ pydantic, symbol level (172 cases, 22 071-chunk corpus)
   dense 0.328 · bm25 0.398 · hybrid 0.458 · adaptive 0.458     ← equals hybrid (was 0.286)
 ```
 
-(Also note: on the larger pydantic corpus **hybrid now beats both single modalities** — 0.458 vs
-bm25 0.398 vs dense 0.328 — reinforcing 1:1 hybrid as the robust base.) Adaptive is now never
-worse than hybrid across two very different repos; it stays **off by default** pending a
-multi-repo sweep, but is a strong default-on candidate. Enable with `CODERAG_ADAPTIVE_FUSION=1`.
+⚠️ **But a 4-repo sweep (627 git-mined cases) shows it is *not* an aggregate win** — hybrid 0.442
+vs adaptive 0.423 MRR; adaptive is a wash on the well-powered repos and the big CodeRAG-curated
+gain turned out to be an artifact of unusually dense-friendly clean-NL queries (see the
+*Multi-repo evaluation* section below / PR adding it). So adaptive stays **off by default** — it's
+a **safe opt-in** (no catastrophic regression after this fix), not a default. Fixed 1:1 hybrid
+remains the default. Enable per-session with `CODERAG_ADAPTIVE_FUSION=1`.
 
 ## Dataset format
 
diff --git a/docs/research/external-validation.md b/docs/research/external-validation.md
index e968953..e330cc5 100644
--- a/docs/research/external-validation.md
+++ b/docs/research/external-validation.md
@@ -61,12 +61,13 @@ evaporated. The robust configuration is exactly the **shipped defaults** — 1:1
 off, rerank opt-in. This is the harness earning its keep: it caught the overfitting before any
 of it became a default.
 
-> **Update — the adaptive-fusion failure has since been fixed.** The classifier now detects
-> identifiers *embedded in prose* (`references_identifier`) and routes those queries to neutral
-> weights, so adaptive falls back to plain hybrid whenever a symbol is named. Re-validated on a
-> larger pydantic index (172 cases, 22 071 chunks): adaptive went from **0.286 (regression) →
-> 0.458 (= hybrid, no regression)**, while still beating hybrid on CodeRAG (0.706 vs 0.581) — a
-> Pareto win across both repos. See [../eval.md](../eval.md). The reranker/embedder findings stand.
+> **Update — the regression was fixed, but adaptive still doesn't earn default-on.** The
+> classifier now detects identifiers *embedded in prose* (`references_identifier`) and routes
+> those queries to neutral weights, removing the catastrophic case (pydantic **0.286 → 0.458 =
+> hybrid**). However, a later **4-repo sweep** (coderag/flask/requests/click, 627 git-mined cases)
+> showed adaptive is **not** an aggregate win — hybrid 0.442 vs adaptive 0.423 MRR. The big
+> CodeRAG-curated adaptive gain was an artifact of dense-friendly clean-NL queries. So adaptive is
+> a **safe opt-in**, not a default; fixed 1:1 hybrid stays the default. See [../eval.md](../eval.md).
 
 **Actionable next steps** (none change a default):
 - ~~Make the classifier detect identifiers *embedded* in prose queries.~~ **Done** — see the