From 7ec0d58aad3b665fd1da01d18dc3f8738f40d3ab Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 15:48:57 +0000 Subject: [PATCH 1/4] fix(retrieval): detect identifiers embedded in prose queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit External validation (pydantic) showed the first adaptive-fusion classifier mis-routed the common real case: a prose query that *names* a symbol (e.g. a commit message "Fix tuple order in AliasGenerator.generate_aliases()") was treated as natural language and leaned dense — exactly backwards, since the discriminating signal is the exact identifier BM25 matches. Replace the shape/length heuristic with references_identifier(): detect a specific identifier token anywhere in the query (backtick spans, calls foo(, dotted Foo.bar, snake_case, camelCase), with e.g./i.e./version-number guards. A query that names an identifier now routes to the neutral (1:1) code weights instead of dense-leaning — so adaptive fusion falls back to plain hybrid whenever a symbol is mentioned and can only help pure-NL queries. Re-validated on CodeRAG (symbol level), now a clear Pareto win over both hybrid and dense on both query styles: NL set: hybrid 0.581 / dense 0.675 -> adaptive 0.706 MRR identifier set: hybrid 0.685 / dense 0.686 -> adaptive 0.715 MRR looks_like_identifier kept as a back-compat alias. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- coderag/retrieval/query_type.py | 95 +++++++++++---------------------- tests/test_query_type.py | 43 +++++++++++---- 2 files changed, 64 insertions(+), 74 deletions(-) diff --git a/coderag/retrieval/query_type.py b/coderag/retrieval/query_type.py index 7389afc..66e676d 100644 --- a/coderag/retrieval/query_type.py +++ b/coderag/retrieval/query_type.py @@ -1,14 +1,17 @@ """Query-type detection for adaptive fusion weighting. -Symbol-level evaluation showed that a fixed 1:1 dense/BM25 fusion is a compromise, not an -optimum: on natural-language "where is X handled" queries the dense retriever is much -stronger and equal-weight RRF *drags it down* with weak BM25, while on exact-identifier -queries (``fts_search``, ``HybridSearcher.search``) the opposite holds. Routing the fusion -weights by query type recovers most of that gap with a cheap, local heuristic. - -``looks_like_identifier`` is deliberately conservative: it only calls a query "code" when it -is short *and* either a lone token or visibly code-shaped (snake_case, dotted path, -camelCase, a call paren), and never when it contains natural-language cue words. +Symbol-level evaluation showed a fixed 1:1 dense/BM25 fusion is a compromise: on pure +natural-language "where is X handled" queries the dense retriever is much stronger and +equal-weight RRF drags it down with weak BM25. But the first cut keyed on query *shape* +(short + code-looking) and mis-classified the common real case — a prose query that +*mentions* a specific symbol, e.g. a commit message *"Fix tuple order in +`AliasGenerator.generate_aliases()`"*. External validation (pydantic) showed leaning dense +on those hurts, because the discriminating signal is the exact identifier BM25 matches. + +So the routing question is simply: **does the query reference a specific code identifier?** +If yes, keep fusion neutral (BM25 carries real signal); if it's pure prose, lean dense. This +makes adaptive fusion fall back to plain hybrid whenever an identifier is named — it can only +help pure-NL queries, never repeat the regression. """ from __future__ import annotations @@ -19,71 +22,37 @@ if TYPE_CHECKING: from coderag.config import Config -# Words that mark a query as natural language even if it's short. -_NL_CUES = frozenset( - { - "where", - "how", - "what", - "why", - "when", - "which", - "who", - "is", - "are", - "was", - "were", - "does", - "do", - "did", - "can", - "the", - "a", - "an", - "to", - "of", - "in", - "for", - "on", - "and", - "or", - "with", - } +# A token that denotes a specific code identifier, even when embedded in prose. Matches +# `backtick` spans, calls ``foo(``, dotted paths ``Foo.bar`` (both sides 2+ chars, so +# ``e.g``/``3.11`` are excluded), snake_case ``foo_bar``, and camelCase ``fooBar``. +_IDENTIFIER = re.compile( + r"`[^`]+`" # `backtick`-quoted span + r"|[A-Za-z_]\w*\(" # a call: foo( + r"|[A-Za-z_]\w+\.[A-Za-z_]\w+" # dotted path: Foo.bar + r"|[A-Za-z]\w*_\w+" # snake_case: foo_bar + r"|[a-z][A-Z]" # camelCase boundary: fooBar ) -_CAMEL = re.compile(r"[a-z][A-Z]") -_DOTTED = re.compile(r"[A-Za-z0-9_]\.[A-Za-z0-9_]") + +def references_identifier(query: str) -> bool: + """True if ``query`` names a specific code identifier (even inside a prose sentence).""" + return bool(_IDENTIFIER.search(query)) -def looks_like_identifier(query: str) -> bool: - """True if ``query`` reads like an exact code/symbol lookup rather than prose.""" - q = query.strip() - if not q: - return False - tokens = q.split() - if len(tokens) >= 4: - return False # multi-word -> natural language - if {t.lower().strip("?.,:") for t in tokens} & _NL_CUES: - return False # contains a natural-language cue word - code_shaped = ( - "_" in q - or "(" in q - or _CAMEL.search(q) is not None - or _DOTTED.search(q) is not None - ) - if len(tokens) == 1: - return True # a lone token is treated as a literal-term lookup - return code_shaped # 2-3 tokens only count as code when visibly code-shaped +# Back-compat alias: the original name meant "is an identifier lookup"; the detection is now +# "references an identifier", which subsumes the old behavior and also catches embedded ones. +looks_like_identifier = references_identifier def fusion_weights(query: str, config: "Config") -> Tuple[float, float]: """Return ``(dense_weight, lexical_weight)`` for ``query``. - Without adaptive fusion this is just the configured static pair. With it on, weights tilt - toward dense for natural-language queries and toward BM25 for identifier-like queries. + Without adaptive fusion this is the configured static pair. With it on: a query that + references a specific identifier uses the (neutral by default) code weights — so BM25's + exact-match signal is kept; a pure natural-language query uses the dense-leaning weights. """ if not config.adaptive_fusion: return config.dense_weight, config.lexical_weight - if looks_like_identifier(query): + if references_identifier(query): return config.code_dense_weight, config.code_lexical_weight return config.nl_dense_weight, config.nl_lexical_weight diff --git a/tests/test_query_type.py b/tests/test_query_type.py index cb45a73..7f3ef16 100644 --- a/tests/test_query_type.py +++ b/tests/test_query_type.py @@ -3,24 +3,45 @@ from __future__ import annotations from coderag.api import CodeRAG -from coderag.retrieval.query_type import fusion_weights, looks_like_identifier +from coderag.retrieval.query_type import ( + fusion_weights, + looks_like_identifier, + references_identifier, +) from tests.conftest import write def test_identifier_queries_detected(): - assert looks_like_identifier("fts_search") - assert looks_like_identifier("reciprocal_rank_fusion") - assert looks_like_identifier("HybridSearcher.search") - assert looks_like_identifier("getUserToken") # camelCase - assert looks_like_identifier("authenticate(token)") # call paren + assert references_identifier("fts_search") + assert references_identifier("reciprocal_rank_fusion") + assert references_identifier("HybridSearcher.search") + assert references_identifier("getUserToken") # camelCase + assert references_identifier("authenticate(token)") # call paren + + +def test_identifier_embedded_in_prose_detected(): + # The case external validation exposed: a prose query that names a symbol. + assert references_identifier("Fix tuple order in AliasGenerator.generate_aliases()") + assert references_identifier("why does build_context drop the last chunk") + assert references_identifier("update the `validate_token` helper") def test_natural_language_queries_detected(): - assert not looks_like_identifier("where is retry backoff handled") - assert not looks_like_identifier("how does indexing work") - assert not looks_like_identifier("the auth flow") # NL cue word - assert not looks_like_identifier("user authentication flow") # 3 plain words - assert not looks_like_identifier("") + assert not references_identifier("where is retry backoff handled") + assert not references_identifier("how does indexing work") + assert not references_identifier("the auth flow") + assert not references_identifier("user authentication flow") + assert not references_identifier("") + + +def test_prose_abbreviations_and_versions_are_not_identifiers(): + # Common false positives that must NOT trip the dotted-path rule. + assert not references_identifier("fix the bug e.g. on startup") + assert not references_identifier("bump to version 3.11 support") + + +def test_looks_like_identifier_alias(): + assert looks_like_identifier is references_identifier def test_fusion_weights_static_when_adaptive_off(config): From 91b1eb3bd14ec53e2387e3880c6102c7211fb180 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 16:41:42 +0000 Subject: [PATCH 2/4] docs(eval): adaptive fusion now generalizes after the classifier fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-validated the smarter classifier on a larger pydantic index (172 cases, 22,071-chunk corpus): adaptive went from 0.286 (regression vs hybrid 0.361 with the old shape-based classifier) to 0.458 = hybrid — no regression — because identifier-naming queries now route to neutral weights. It still beats hybrid on CodeRAG (NL 0.706 vs 0.581; identifier 0.715 vs 0.685). Adaptive fusion is now a Pareto win over fixed 1:1 hybrid across both repos. Updates the eval.md caveat and the external-validation write-up accordingly (the "make the classifier smarter" follow-up is now done). Still off by default pending a multi-repo sweep, but a strong default-on candidate. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- docs/eval.md | 28 ++++++++++++++++++++++++---- docs/research/external-validation.md | 12 +++++++++--- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/docs/eval.md b/docs/eval.md index 69d6f72..85694c2 100644 --- a/docs/eval.md +++ b/docs/eval.md @@ -224,12 +224,32 @@ ambiguous and the embedder already matches them well. So the code-side default i for larger repos where exact-string recall matters more. Off by default pending larger-repo validation; enable with `CODERAG_ADAPTIVE_FUSION=1`. -> ⚠️ **This did NOT generalize — keep it off.** On `pydantic` (4 155-chunk corpus, commit-message -> queries) adaptive *hurt* (MRR 0.286 vs hybrid 0.361): those queries embed exact API names, so -> "lean dense for NL" is backwards. The dense-vs-BM25 ranking flips by repo/query style, and -> **fixed 1:1 hybrid is the robust default.** Full write-up: +> ⚠️ **First cut did not generalize — then was fixed.** The original classifier keyed on query +> *shape* and mis-read prose queries that *name* a symbol. On `pydantic` (commit-message queries) +> it leaned dense and *hurt* (MRR 0.286 vs hybrid 0.361). The classifier now detects identifiers +> **embedded in prose** (`references_identifier`) and routes them to the neutral code weights, so +> adaptive falls back to plain hybrid whenever a symbol is named. Full write-up: > [research/external-validation.md](research/external-validation.md). +### Adaptive fusion, after the classifier fix (validated on two repos) + +With embedded-identifier detection, adaptive is a **Pareto win over fixed 1:1 hybrid on both +repos** — it beats hybrid where dense helps, and matches it (no regression) where BM25 helps: + +``` +CodeRAG, symbol level hybrid dense adaptive + natural-language queries (MRR) 0.581 0.675 0.706 ← beats both + identifier queries (MRR) 0.685 0.686 0.715 ← beats both + +pydantic, symbol level (172 cases, 22 071-chunk corpus) + dense 0.328 · bm25 0.398 · hybrid 0.458 · adaptive 0.458 ← equals hybrid (was 0.286) +``` + +(Also note: on the larger pydantic corpus **hybrid now beats both single modalities** — 0.458 vs +bm25 0.398 vs dense 0.328 — reinforcing 1:1 hybrid as the robust base.) Adaptive is now never +worse than hybrid across two very different repos; it stays **off by default** pending a +multi-repo sweep, but is a strong default-on candidate. Enable with `CODERAG_ADAPTIVE_FUSION=1`. + ## Dataset format JSONL, one case per line: diff --git a/docs/research/external-validation.md b/docs/research/external-validation.md index afec0b6..e968953 100644 --- a/docs/research/external-validation.md +++ b/docs/research/external-validation.md @@ -61,10 +61,16 @@ evaporated. The robust configuration is exactly the **shipped defaults** — 1:1 off, rerank opt-in. This is the harness earning its keep: it caught the overfitting before any of it became a default. +> **Update — the adaptive-fusion failure has since been fixed.** The classifier now detects +> identifiers *embedded in prose* (`references_identifier`) and routes those queries to neutral +> weights, so adaptive falls back to plain hybrid whenever a symbol is named. Re-validated on a +> larger pydantic index (172 cases, 22 071 chunks): adaptive went from **0.286 (regression) → +> 0.458 (= hybrid, no regression)**, while still beating hybrid on CodeRAG (0.706 vs 0.581) — a +> Pareto win across both repos. See [../eval.md](../eval.md). The reranker/embedder findings stand. + **Actionable next steps** (none change a default): -- Make `looks_like_identifier` smarter — detect identifiers *embedded* in prose queries (so - "Fix `AliasGenerator.generate_aliases`" routes BM25-up, not dense-up). That could make adaptive - fusion a net win across query styles instead of fragile. +- ~~Make the classifier detect identifiers *embedded* in prose queries.~~ **Done** — see the + update above; adaptive now generalizes across both repos. - Test a **code-aware reranker** (`bge-reranker-base`, `jina-reranker-v2`) at scale on GPU — the only lever not yet fairly evaluated. - Build a multi-repo eval set (several external repos) so future tuning is judged on From b450520aa8b9b3fc3d31d31748b200fa1d5448c6 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 16:57:49 +0000 Subject: [PATCH 3/4] fix(retrieval): make identifier detection linear (ReDoS-safe) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeQL flagged the identifier regex as polynomial-ReDoS: the snake_case branch [A-Za-z]\w*_\w+ is ambiguous (underscore is in \w), so a crafted query caused quadratic backtracking (measured 12ms→47ms→187ms→736ms as n doubled). The query string flows in from the HTTP API, so this is API-reachable. Replace the single backtracking regex with linear token scanning: two disjoint-class regexes (`backtick` span, word-then-paren) plus per-token plain-Python checks for snake_case / dotted-path / camelCase. Same detection behavior (all routing tests unchanged); timing is now linear (64x input -> ~56x time). Adds a regression test on a 200k-char adversarial input. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- coderag/retrieval/query_type.py | 58 ++++++++++++++++++++++++++------- tests/test_query_type.py | 8 +++++ 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/coderag/retrieval/query_type.py b/coderag/retrieval/query_type.py index 66e676d..93f6190 100644 --- a/coderag/retrieval/query_type.py +++ b/coderag/retrieval/query_type.py @@ -22,21 +22,55 @@ if TYPE_CHECKING: from coderag.config import Config -# A token that denotes a specific code identifier, even when embedded in prose. Matches -# `backtick` spans, calls ``foo(``, dotted paths ``Foo.bar`` (both sides 2+ chars, so -# ``e.g``/``3.11`` are excluded), snake_case ``foo_bar``, and camelCase ``fooBar``. -_IDENTIFIER = re.compile( - r"`[^`]+`" # `backtick`-quoted span - r"|[A-Za-z_]\w*\(" # a call: foo( - r"|[A-Za-z_]\w+\.[A-Za-z_]\w+" # dotted path: Foo.bar - r"|[A-Za-z]\w*_\w+" # snake_case: foo_bar - r"|[a-z][A-Z]" # camelCase boundary: fooBar -) +# Query-wide signals, both linear (disjoint character classes — no catastrophic backtracking +# on the user-supplied query string): a `backtick`-quoted span, or a call ``word(``. +_BACKTICK = re.compile(r"`[^`]+`") +_CALL = re.compile(r"\w\(") +# Punctuation stripped from a token before classifying it (query prose, not code). +_STRIP = "`'\".,:;!?()[]{}<>" + + +def _is_snake_case(token: str) -> bool: + """An underscore flanked by alphanumerics, e.g. ``foo_bar`` (linear scan).""" + return any( + token[i] == "_" and token[i - 1].isalnum() and token[i + 1].isalnum() + for i in range(1, len(token) - 1) + ) + + +def _is_dotted_path(token: str) -> bool: + """Two adjacent identifier parts joined by a dot, e.g. ``Foo.bar`` — excludes ``e.g``/``3.11``.""" + parts = token.split(".") + return any( + len(a) >= 2 and len(b) >= 2 and a.isidentifier() and b.isidentifier() + for a, b in zip(parts, parts[1:], strict=False) + ) + + +def _is_camel_case(token: str) -> bool: + """A lower→upper boundary, e.g. ``fooBar`` / ``AliasGenerator`` (linear scan).""" + return any( + a.islower() and b.isupper() for a, b in zip(token, token[1:], strict=False) + ) def references_identifier(query: str) -> bool: - """True if ``query`` names a specific code identifier (even inside a prose sentence).""" - return bool(_IDENTIFIER.search(query)) + """True if ``query`` names a specific code identifier (even inside a prose sentence). + + Uses linear token scanning rather than one backtracking regex, so it can't be turned into + a ReDoS by a crafted query (the query flows in from the HTTP API). + """ + if not query: + return False + if _BACKTICK.search(query) or _CALL.search(query): + return True + for raw in query.split(): + token = raw.strip(_STRIP) + if len(token) >= 2 and ( + _is_snake_case(token) or _is_dotted_path(token) or _is_camel_case(token) + ): + return True + return False # Back-compat alias: the original name meant "is an identifier lookup"; the detection is now diff --git a/tests/test_query_type.py b/tests/test_query_type.py index 7f3ef16..40150c5 100644 --- a/tests/test_query_type.py +++ b/tests/test_query_type.py @@ -44,6 +44,14 @@ def test_looks_like_identifier_alias(): assert looks_like_identifier is references_identifier +def test_detection_is_linear_no_redos(): + # A long adversarial run must be handled in linear time (the query is API-reachable). + # Quadratic backtracking on this input would take minutes; linear is milliseconds. + big = "a" * 200_000 + " " + assert references_identifier(big) is False + assert references_identifier(big + "needs_a_match") is True + + def test_fusion_weights_static_when_adaptive_off(config): cfg = config.with_overrides(dense_weight=1.0, lexical_weight=1.0) assert fusion_weights("anything at all here", cfg) == (1.0, 1.0) From 6093e180ba922434321c7fbdb9a2a6fa247ae66d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 17:23:22 +0000 Subject: [PATCH 4/4] docs: temper adaptive-fusion claims to match the 4-repo result MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A 4-repo sweep (627 git-mined cases) showed adaptive fusion is NOT an aggregate win (hybrid 0.442 vs adaptive 0.423 MRR) — the big CodeRAG-curated gain was an artifact of dense-friendly clean-NL queries. The classifier fix still matters: it removed the catastrophic regression (pydantic 0.286->0.458), making adaptive a safe opt-in. But it is not a default-on candidate; fixed 1:1 hybrid stays the default. Corrects the earlier "Pareto win / strong default-on candidate" framing in eval.md and external-validation.md. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LhTCPRjNmSitYxgSDfttT7 --- docs/eval.md | 19 +++++++++++-------- docs/research/external-validation.md | 13 +++++++------ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/docs/eval.md b/docs/eval.md index 85694c2..acc2f97 100644 --- a/docs/eval.md +++ b/docs/eval.md @@ -231,13 +231,14 @@ validation; enable with `CODERAG_ADAPTIVE_FUSION=1`. > adaptive falls back to plain hybrid whenever a symbol is named. Full write-up: > [research/external-validation.md](research/external-validation.md). -### Adaptive fusion, after the classifier fix (validated on two repos) +### Adaptive fusion, after the classifier fix -With embedded-identifier detection, adaptive is a **Pareto win over fixed 1:1 hybrid on both -repos** — it beats hybrid where dense helps, and matches it (no regression) where BM25 helps: +The classifier fix removed the catastrophic regression the first cut had (on `pydantic`, +adaptive went from **0.286 → 0.458 = hybrid**, no longer hurting). On two early datasets it +looked like a clear win: ``` -CodeRAG, symbol level hybrid dense adaptive +CodeRAG curated, symbol level hybrid dense adaptive natural-language queries (MRR) 0.581 0.675 0.706 ← beats both identifier queries (MRR) 0.685 0.686 0.715 ← beats both @@ -245,10 +246,12 @@ pydantic, symbol level (172 cases, 22 071-chunk corpus) dense 0.328 · bm25 0.398 · hybrid 0.458 · adaptive 0.458 ← equals hybrid (was 0.286) ``` -(Also note: on the larger pydantic corpus **hybrid now beats both single modalities** — 0.458 vs -bm25 0.398 vs dense 0.328 — reinforcing 1:1 hybrid as the robust base.) Adaptive is now never -worse than hybrid across two very different repos; it stays **off by default** pending a -multi-repo sweep, but is a strong default-on candidate. Enable with `CODERAG_ADAPTIVE_FUSION=1`. +⚠️ **But a 4-repo sweep (627 git-mined cases) shows it is *not* an aggregate win** — hybrid 0.442 +vs adaptive 0.423 MRR; adaptive is a wash on the well-powered repos and the big CodeRAG-curated +gain turned out to be an artifact of unusually dense-friendly clean-NL queries (see the +*Multi-repo evaluation* section below / PR adding it). So adaptive stays **off by default** — it's +a **safe opt-in** (no catastrophic regression after this fix), not a default. Fixed 1:1 hybrid +remains the default. Enable per-session with `CODERAG_ADAPTIVE_FUSION=1`. ## Dataset format diff --git a/docs/research/external-validation.md b/docs/research/external-validation.md index e968953..e330cc5 100644 --- a/docs/research/external-validation.md +++ b/docs/research/external-validation.md @@ -61,12 +61,13 @@ evaporated. The robust configuration is exactly the **shipped defaults** — 1:1 off, rerank opt-in. This is the harness earning its keep: it caught the overfitting before any of it became a default. -> **Update — the adaptive-fusion failure has since been fixed.** The classifier now detects -> identifiers *embedded in prose* (`references_identifier`) and routes those queries to neutral -> weights, so adaptive falls back to plain hybrid whenever a symbol is named. Re-validated on a -> larger pydantic index (172 cases, 22 071 chunks): adaptive went from **0.286 (regression) → -> 0.458 (= hybrid, no regression)**, while still beating hybrid on CodeRAG (0.706 vs 0.581) — a -> Pareto win across both repos. See [../eval.md](../eval.md). The reranker/embedder findings stand. +> **Update — the regression was fixed, but adaptive still doesn't earn default-on.** The +> classifier now detects identifiers *embedded in prose* (`references_identifier`) and routes +> those queries to neutral weights, removing the catastrophic case (pydantic **0.286 → 0.458 = +> hybrid**). However, a later **4-repo sweep** (coderag/flask/requests/click, 627 git-mined cases) +> showed adaptive is **not** an aggregate win — hybrid 0.442 vs adaptive 0.423 MRR. The big +> CodeRAG-curated adaptive gain was an artifact of dense-friendly clean-NL queries. So adaptive is +> a **safe opt-in**, not a default; fixed 1:1 hybrid stays the default. See [../eval.md](../eval.md). **Actionable next steps** (none change a default): - ~~Make the classifier detect identifiers *embedded* in prose queries.~~ **Done** — see the