diff --git a/coderag/retrieval/query_type.py b/coderag/retrieval/query_type.py index 7389afc..93f6190 100644 --- a/coderag/retrieval/query_type.py +++ b/coderag/retrieval/query_type.py @@ -1,14 +1,17 @@ """Query-type detection for adaptive fusion weighting. -Symbol-level evaluation showed that a fixed 1:1 dense/BM25 fusion is a compromise, not an -optimum: on natural-language "where is X handled" queries the dense retriever is much -stronger and equal-weight RRF *drags it down* with weak BM25, while on exact-identifier -queries (``fts_search``, ``HybridSearcher.search``) the opposite holds. Routing the fusion -weights by query type recovers most of that gap with a cheap, local heuristic. - -``looks_like_identifier`` is deliberately conservative: it only calls a query "code" when it -is short *and* either a lone token or visibly code-shaped (snake_case, dotted path, -camelCase, a call paren), and never when it contains natural-language cue words. +Symbol-level evaluation showed a fixed 1:1 dense/BM25 fusion is a compromise: on pure +natural-language "where is X handled" queries the dense retriever is much stronger and +equal-weight RRF drags it down with weak BM25. But the first cut keyed on query *shape* +(short + code-looking) and mis-classified the common real case — a prose query that +*mentions* a specific symbol, e.g. a commit message *"Fix tuple order in +`AliasGenerator.generate_aliases()`"*. External validation (pydantic) showed leaning dense +on those hurts, because the discriminating signal is the exact identifier BM25 matches. + +So the routing question is simply: **does the query reference a specific code identifier?** +If yes, keep fusion neutral (BM25 carries real signal); if it's pure prose, lean dense. This +makes adaptive fusion fall back to plain hybrid whenever an identifier is named — it can only +help pure-NL queries, never repeat the regression. """ from __future__ import annotations @@ -19,71 +22,71 @@ if TYPE_CHECKING: from coderag.config import Config -# Words that mark a query as natural language even if it's short. -_NL_CUES = frozenset( - { - "where", - "how", - "what", - "why", - "when", - "which", - "who", - "is", - "are", - "was", - "were", - "does", - "do", - "did", - "can", - "the", - "a", - "an", - "to", - "of", - "in", - "for", - "on", - "and", - "or", - "with", - } -) - -_CAMEL = re.compile(r"[a-z][A-Z]") -_DOTTED = re.compile(r"[A-Za-z0-9_]\.[A-Za-z0-9_]") - - -def looks_like_identifier(query: str) -> bool: - """True if ``query`` reads like an exact code/symbol lookup rather than prose.""" - q = query.strip() - if not q: - return False - tokens = q.split() - if len(tokens) >= 4: - return False # multi-word -> natural language - if {t.lower().strip("?.,:") for t in tokens} & _NL_CUES: - return False # contains a natural-language cue word - code_shaped = ( - "_" in q - or "(" in q - or _CAMEL.search(q) is not None - or _DOTTED.search(q) is not None +# Query-wide signals, both linear (disjoint character classes — no catastrophic backtracking +# on the user-supplied query string): a `backtick`-quoted span, or a call ``word(``. +_BACKTICK = re.compile(r"`[^`]+`") +_CALL = re.compile(r"\w\(") +# Punctuation stripped from a token before classifying it (query prose, not code). +_STRIP = "`'\".,:;!?()[]{}<>" + + +def _is_snake_case(token: str) -> bool: + """An underscore flanked by alphanumerics, e.g. ``foo_bar`` (linear scan).""" + return any( + token[i] == "_" and token[i - 1].isalnum() and token[i + 1].isalnum() + for i in range(1, len(token) - 1) ) - if len(tokens) == 1: - return True # a lone token is treated as a literal-term lookup - return code_shaped # 2-3 tokens only count as code when visibly code-shaped + + +def _is_dotted_path(token: str) -> bool: + """Two adjacent identifier parts joined by a dot, e.g. ``Foo.bar`` — excludes ``e.g``/``3.11``.""" + parts = token.split(".") + return any( + len(a) >= 2 and len(b) >= 2 and a.isidentifier() and b.isidentifier() + for a, b in zip(parts, parts[1:], strict=False) + ) + + +def _is_camel_case(token: str) -> bool: + """A lower→upper boundary, e.g. ``fooBar`` / ``AliasGenerator`` (linear scan).""" + return any( + a.islower() and b.isupper() for a, b in zip(token, token[1:], strict=False) + ) + + +def references_identifier(query: str) -> bool: + """True if ``query`` names a specific code identifier (even inside a prose sentence). + + Uses linear token scanning rather than one backtracking regex, so it can't be turned into + a ReDoS by a crafted query (the query flows in from the HTTP API). + """ + if not query: + return False + if _BACKTICK.search(query) or _CALL.search(query): + return True + for raw in query.split(): + token = raw.strip(_STRIP) + if len(token) >= 2 and ( + _is_snake_case(token) or _is_dotted_path(token) or _is_camel_case(token) + ): + return True + return False + + +# Back-compat alias: the original name meant "is an identifier lookup"; the detection is now +# "references an identifier", which subsumes the old behavior and also catches embedded ones. +looks_like_identifier = references_identifier def fusion_weights(query: str, config: "Config") -> Tuple[float, float]: """Return ``(dense_weight, lexical_weight)`` for ``query``. - Without adaptive fusion this is just the configured static pair. With it on, weights tilt - toward dense for natural-language queries and toward BM25 for identifier-like queries. + Without adaptive fusion this is the configured static pair. With it on: a query that + references a specific identifier uses the (neutral by default) code weights — so BM25's + exact-match signal is kept; a pure natural-language query uses the dense-leaning weights. """ if not config.adaptive_fusion: return config.dense_weight, config.lexical_weight - if looks_like_identifier(query): + if references_identifier(query): return config.code_dense_weight, config.code_lexical_weight return config.nl_dense_weight, config.nl_lexical_weight diff --git a/docs/eval.md b/docs/eval.md index 69d6f72..acc2f97 100644 --- a/docs/eval.md +++ b/docs/eval.md @@ -224,12 +224,35 @@ ambiguous and the embedder already matches them well. So the code-side default i for larger repos where exact-string recall matters more. Off by default pending larger-repo validation; enable with `CODERAG_ADAPTIVE_FUSION=1`. -> ⚠️ **This did NOT generalize — keep it off.** On `pydantic` (4 155-chunk corpus, commit-message -> queries) adaptive *hurt* (MRR 0.286 vs hybrid 0.361): those queries embed exact API names, so -> "lean dense for NL" is backwards. The dense-vs-BM25 ranking flips by repo/query style, and -> **fixed 1:1 hybrid is the robust default.** Full write-up: +> ⚠️ **First cut did not generalize — then was fixed.** The original classifier keyed on query +> *shape* and mis-read prose queries that *name* a symbol. On `pydantic` (commit-message queries) +> it leaned dense and *hurt* (MRR 0.286 vs hybrid 0.361). The classifier now detects identifiers +> **embedded in prose** (`references_identifier`) and routes them to the neutral code weights, so +> adaptive falls back to plain hybrid whenever a symbol is named. Full write-up: > [research/external-validation.md](research/external-validation.md). +### Adaptive fusion, after the classifier fix + +The classifier fix removed the catastrophic regression the first cut had (on `pydantic`, +adaptive went from **0.286 → 0.458 = hybrid**, no longer hurting). On two early datasets it +looked like a clear win: + +``` +CodeRAG curated, symbol level hybrid dense adaptive + natural-language queries (MRR) 0.581 0.675 0.706 ← beats both + identifier queries (MRR) 0.685 0.686 0.715 ← beats both + +pydantic, symbol level (172 cases, 22 071-chunk corpus) + dense 0.328 · bm25 0.398 · hybrid 0.458 · adaptive 0.458 ← equals hybrid (was 0.286) +``` + +⚠️ **But a 4-repo sweep (627 git-mined cases) shows it is *not* an aggregate win** — hybrid 0.442 +vs adaptive 0.423 MRR; adaptive is a wash on the well-powered repos and the big CodeRAG-curated +gain turned out to be an artifact of unusually dense-friendly clean-NL queries (see the +*Multi-repo evaluation* section below / PR adding it). So adaptive stays **off by default** — it's +a **safe opt-in** (no catastrophic regression after this fix), not a default. Fixed 1:1 hybrid +remains the default. Enable per-session with `CODERAG_ADAPTIVE_FUSION=1`. + ## Dataset format JSONL, one case per line: diff --git a/docs/research/external-validation.md b/docs/research/external-validation.md index afec0b6..e330cc5 100644 --- a/docs/research/external-validation.md +++ b/docs/research/external-validation.md @@ -61,10 +61,17 @@ evaporated. The robust configuration is exactly the **shipped defaults** — 1:1 off, rerank opt-in. This is the harness earning its keep: it caught the overfitting before any of it became a default. +> **Update — the regression was fixed, but adaptive still doesn't earn default-on.** The +> classifier now detects identifiers *embedded in prose* (`references_identifier`) and routes +> those queries to neutral weights, removing the catastrophic case (pydantic **0.286 → 0.458 = +> hybrid**). However, a later **4-repo sweep** (coderag/flask/requests/click, 627 git-mined cases) +> showed adaptive is **not** an aggregate win — hybrid 0.442 vs adaptive 0.423 MRR. The big +> CodeRAG-curated adaptive gain was an artifact of dense-friendly clean-NL queries. So adaptive is +> a **safe opt-in**, not a default; fixed 1:1 hybrid stays the default. See [../eval.md](../eval.md). + **Actionable next steps** (none change a default): -- Make `looks_like_identifier` smarter — detect identifiers *embedded* in prose queries (so - "Fix `AliasGenerator.generate_aliases`" routes BM25-up, not dense-up). That could make adaptive - fusion a net win across query styles instead of fragile. +- ~~Make the classifier detect identifiers *embedded* in prose queries.~~ **Done** — see the + update above; adaptive now generalizes across both repos. - Test a **code-aware reranker** (`bge-reranker-base`, `jina-reranker-v2`) at scale on GPU — the only lever not yet fairly evaluated. - Build a multi-repo eval set (several external repos) so future tuning is judged on diff --git a/tests/test_query_type.py b/tests/test_query_type.py index cb45a73..40150c5 100644 --- a/tests/test_query_type.py +++ b/tests/test_query_type.py @@ -3,24 +3,53 @@ from __future__ import annotations from coderag.api import CodeRAG -from coderag.retrieval.query_type import fusion_weights, looks_like_identifier +from coderag.retrieval.query_type import ( + fusion_weights, + looks_like_identifier, + references_identifier, +) from tests.conftest import write def test_identifier_queries_detected(): - assert looks_like_identifier("fts_search") - assert looks_like_identifier("reciprocal_rank_fusion") - assert looks_like_identifier("HybridSearcher.search") - assert looks_like_identifier("getUserToken") # camelCase - assert looks_like_identifier("authenticate(token)") # call paren + assert references_identifier("fts_search") + assert references_identifier("reciprocal_rank_fusion") + assert references_identifier("HybridSearcher.search") + assert references_identifier("getUserToken") # camelCase + assert references_identifier("authenticate(token)") # call paren + + +def test_identifier_embedded_in_prose_detected(): + # The case external validation exposed: a prose query that names a symbol. + assert references_identifier("Fix tuple order in AliasGenerator.generate_aliases()") + assert references_identifier("why does build_context drop the last chunk") + assert references_identifier("update the `validate_token` helper") def test_natural_language_queries_detected(): - assert not looks_like_identifier("where is retry backoff handled") - assert not looks_like_identifier("how does indexing work") - assert not looks_like_identifier("the auth flow") # NL cue word - assert not looks_like_identifier("user authentication flow") # 3 plain words - assert not looks_like_identifier("") + assert not references_identifier("where is retry backoff handled") + assert not references_identifier("how does indexing work") + assert not references_identifier("the auth flow") + assert not references_identifier("user authentication flow") + assert not references_identifier("") + + +def test_prose_abbreviations_and_versions_are_not_identifiers(): + # Common false positives that must NOT trip the dotted-path rule. + assert not references_identifier("fix the bug e.g. on startup") + assert not references_identifier("bump to version 3.11 support") + + +def test_looks_like_identifier_alias(): + assert looks_like_identifier is references_identifier + + +def test_detection_is_linear_no_redos(): + # A long adversarial run must be handled in linear time (the query is API-reachable). + # Quadratic backtracking on this input would take minutes; linear is milliseconds. + big = "a" * 200_000 + " " + assert references_identifier(big) is False + assert references_identifier(big + "needs_a_match") is True def test_fusion_weights_static_when_adaptive_off(config):