Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 69 additions & 66 deletions coderag/retrieval/query_type.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
"""Query-type detection for adaptive fusion weighting.

Symbol-level evaluation showed that a fixed 1:1 dense/BM25 fusion is a compromise, not an
optimum: on natural-language "where is X handled" queries the dense retriever is much
stronger and equal-weight RRF *drags it down* with weak BM25, while on exact-identifier
queries (``fts_search``, ``HybridSearcher.search``) the opposite holds. Routing the fusion
weights by query type recovers most of that gap with a cheap, local heuristic.

``looks_like_identifier`` is deliberately conservative: it only calls a query "code" when it
is short *and* either a lone token or visibly code-shaped (snake_case, dotted path,
camelCase, a call paren), and never when it contains natural-language cue words.
Symbol-level evaluation showed a fixed 1:1 dense/BM25 fusion is a compromise: on pure
natural-language "where is X handled" queries the dense retriever is much stronger and
equal-weight RRF drags it down with weak BM25. But the first cut keyed on query *shape*
(short + code-looking) and mis-classified the common real case — a prose query that
*mentions* a specific symbol, e.g. a commit message *"Fix tuple order in
`AliasGenerator.generate_aliases()`"*. External validation (pydantic) showed leaning dense
on those hurts, because the discriminating signal is the exact identifier BM25 matches.

So the routing question is simply: **does the query reference a specific code identifier?**
If yes, keep fusion neutral (BM25 carries real signal); if it's pure prose, lean dense. This
makes adaptive fusion fall back to plain hybrid whenever an identifier is named — it can only
help pure-NL queries, never repeat the regression.
"""

from __future__ import annotations
Expand All @@ -19,71 +22,71 @@
if TYPE_CHECKING:
from coderag.config import Config

# Words that mark a query as natural language even if it's short.
_NL_CUES = frozenset(
{
"where",
"how",
"what",
"why",
"when",
"which",
"who",
"is",
"are",
"was",
"were",
"does",
"do",
"did",
"can",
"the",
"a",
"an",
"to",
"of",
"in",
"for",
"on",
"and",
"or",
"with",
}
)

_CAMEL = re.compile(r"[a-z][A-Z]")
_DOTTED = re.compile(r"[A-Za-z0-9_]\.[A-Za-z0-9_]")


def looks_like_identifier(query: str) -> bool:
"""True if ``query`` reads like an exact code/symbol lookup rather than prose."""
q = query.strip()
if not q:
return False
tokens = q.split()
if len(tokens) >= 4:
return False # multi-word -> natural language
if {t.lower().strip("?.,:") for t in tokens} & _NL_CUES:
return False # contains a natural-language cue word
code_shaped = (
"_" in q
or "(" in q
or _CAMEL.search(q) is not None
or _DOTTED.search(q) is not None
# Query-wide signals, both linear (disjoint character classes — no catastrophic backtracking
# on the user-supplied query string): a `backtick`-quoted span, or a call ``word(``.
_BACKTICK = re.compile(r"`[^`]+`")
_CALL = re.compile(r"\w\(")
# Punctuation stripped from a token before classifying it (query prose, not code).
_STRIP = "`'\".,:;!?()[]{}<>"


def _is_snake_case(token: str) -> bool:
"""An underscore flanked by alphanumerics, e.g. ``foo_bar`` (linear scan)."""
return any(
token[i] == "_" and token[i - 1].isalnum() and token[i + 1].isalnum()
for i in range(1, len(token) - 1)
)
if len(tokens) == 1:
return True # a lone token is treated as a literal-term lookup
return code_shaped # 2-3 tokens only count as code when visibly code-shaped


def _is_dotted_path(token: str) -> bool:
"""Two adjacent identifier parts joined by a dot, e.g. ``Foo.bar`` — excludes ``e.g``/``3.11``."""
parts = token.split(".")
return any(
len(a) >= 2 and len(b) >= 2 and a.isidentifier() and b.isidentifier()
for a, b in zip(parts, parts[1:], strict=False)
)


def _is_camel_case(token: str) -> bool:
"""A lower→upper boundary, e.g. ``fooBar`` / ``AliasGenerator`` (linear scan)."""
return any(
a.islower() and b.isupper() for a, b in zip(token, token[1:], strict=False)
)


def references_identifier(query: str) -> bool:
"""True if ``query`` names a specific code identifier (even inside a prose sentence).

Uses linear token scanning rather than one backtracking regex, so it can't be turned into
a ReDoS by a crafted query (the query flows in from the HTTP API).
"""
if not query:
return False
if _BACKTICK.search(query) or _CALL.search(query):
return True
for raw in query.split():
token = raw.strip(_STRIP)
if len(token) >= 2 and (
_is_snake_case(token) or _is_dotted_path(token) or _is_camel_case(token)
):
return True
return False


# Back-compat alias: the original name meant "is an identifier lookup"; the detection is now
# "references an identifier", which subsumes the old behavior and also catches embedded ones.
looks_like_identifier = references_identifier


def fusion_weights(query: str, config: "Config") -> Tuple[float, float]:
"""Return ``(dense_weight, lexical_weight)`` for ``query``.

Without adaptive fusion this is just the configured static pair. With it on, weights tilt
toward dense for natural-language queries and toward BM25 for identifier-like queries.
Without adaptive fusion this is the configured static pair. With it on: a query that
references a specific identifier uses the (neutral by default) code weights — so BM25's
exact-match signal is kept; a pure natural-language query uses the dense-leaning weights.
"""
if not config.adaptive_fusion:
return config.dense_weight, config.lexical_weight
if looks_like_identifier(query):
if references_identifier(query):
return config.code_dense_weight, config.code_lexical_weight
return config.nl_dense_weight, config.nl_lexical_weight
31 changes: 27 additions & 4 deletions docs/eval.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,35 @@ ambiguous and the embedder already matches them well. So the code-side default i
for larger repos where exact-string recall matters more. Off by default pending larger-repo
validation; enable with `CODERAG_ADAPTIVE_FUSION=1`.

> ⚠️ **This did NOT generalize — keep it off.** On `pydantic` (4 155-chunk corpus, commit-message
> queries) adaptive *hurt* (MRR 0.286 vs hybrid 0.361): those queries embed exact API names, so
> "lean dense for NL" is backwards. The dense-vs-BM25 ranking flips by repo/query style, and
> **fixed 1:1 hybrid is the robust default.** Full write-up:
> ⚠️ **First cut did not generalize — then was fixed.** The original classifier keyed on query
> *shape* and mis-read prose queries that *name* a symbol. On `pydantic` (commit-message queries)
> it leaned dense and *hurt* (MRR 0.286 vs hybrid 0.361). The classifier now detects identifiers
> **embedded in prose** (`references_identifier`) and routes them to the neutral code weights, so
> adaptive falls back to plain hybrid whenever a symbol is named. Full write-up:
> [research/external-validation.md](research/external-validation.md).

### Adaptive fusion, after the classifier fix

The classifier fix removed the catastrophic regression the first cut had (on `pydantic`,
adaptive went from **0.286 → 0.458 = hybrid**, no longer hurting). On two early datasets it
looked like a clear win:

```
CodeRAG curated, symbol level hybrid dense adaptive
natural-language queries (MRR) 0.581 0.675 0.706 ← beats both
identifier queries (MRR) 0.685 0.686 0.715 ← beats both

pydantic, symbol level (172 cases, 22 071-chunk corpus)
dense 0.328 · bm25 0.398 · hybrid 0.458 · adaptive 0.458 ← equals hybrid (was 0.286)
```

⚠️ **But a 4-repo sweep (627 git-mined cases) shows it is *not* an aggregate win** — hybrid 0.442
vs adaptive 0.423 MRR; adaptive is a wash on the well-powered repos and the big CodeRAG-curated
gain turned out to be an artifact of unusually dense-friendly clean-NL queries (see the
*Multi-repo evaluation* section below / PR adding it). So adaptive stays **off by default** — it's
a **safe opt-in** (no catastrophic regression after this fix), not a default. Fixed 1:1 hybrid
remains the default. Enable per-session with `CODERAG_ADAPTIVE_FUSION=1`.

## Dataset format

JSONL, one case per line:
Expand Down
13 changes: 10 additions & 3 deletions docs/research/external-validation.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,17 @@ evaporated. The robust configuration is exactly the **shipped defaults** — 1:1
off, rerank opt-in. This is the harness earning its keep: it caught the overfitting before any
of it became a default.

> **Update — the regression was fixed, but adaptive still doesn't earn default-on.** The
> classifier now detects identifiers *embedded in prose* (`references_identifier`) and routes
> those queries to neutral weights, removing the catastrophic case (pydantic **0.286 → 0.458 =
> hybrid**). However, a later **4-repo sweep** (coderag/flask/requests/click, 627 git-mined cases)
> showed adaptive is **not** an aggregate win — hybrid 0.442 vs adaptive 0.423 MRR. The big
> CodeRAG-curated adaptive gain was an artifact of dense-friendly clean-NL queries. So adaptive is
> a **safe opt-in**, not a default; fixed 1:1 hybrid stays the default. See [../eval.md](../eval.md).

**Actionable next steps** (none change a default):
- Make `looks_like_identifier` smarter — detect identifiers *embedded* in prose queries (so
"Fix `AliasGenerator.generate_aliases`" routes BM25-up, not dense-up). That could make adaptive
fusion a net win across query styles instead of fragile.
- ~~Make the classifier detect identifiers *embedded* in prose queries.~~ **Done** — see the
update above; adaptive now generalizes across both repos.
- Test a **code-aware reranker** (`bge-reranker-base`, `jina-reranker-v2`) at scale on GPU —
the only lever not yet fairly evaluated.
- Build a multi-repo eval set (several external repos) so future tuning is judged on
Expand Down
51 changes: 40 additions & 11 deletions tests/test_query_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,53 @@
from __future__ import annotations

from coderag.api import CodeRAG
from coderag.retrieval.query_type import fusion_weights, looks_like_identifier
from coderag.retrieval.query_type import (
fusion_weights,
looks_like_identifier,
references_identifier,
)
from tests.conftest import write


def test_identifier_queries_detected():
assert looks_like_identifier("fts_search")
assert looks_like_identifier("reciprocal_rank_fusion")
assert looks_like_identifier("HybridSearcher.search")
assert looks_like_identifier("getUserToken") # camelCase
assert looks_like_identifier("authenticate(token)") # call paren
assert references_identifier("fts_search")
assert references_identifier("reciprocal_rank_fusion")
assert references_identifier("HybridSearcher.search")
assert references_identifier("getUserToken") # camelCase
assert references_identifier("authenticate(token)") # call paren


def test_identifier_embedded_in_prose_detected():
# The case external validation exposed: a prose query that names a symbol.
assert references_identifier("Fix tuple order in AliasGenerator.generate_aliases()")
assert references_identifier("why does build_context drop the last chunk")
assert references_identifier("update the `validate_token` helper")


def test_natural_language_queries_detected():
assert not looks_like_identifier("where is retry backoff handled")
assert not looks_like_identifier("how does indexing work")
assert not looks_like_identifier("the auth flow") # NL cue word
assert not looks_like_identifier("user authentication flow") # 3 plain words
assert not looks_like_identifier("")
assert not references_identifier("where is retry backoff handled")
assert not references_identifier("how does indexing work")
assert not references_identifier("the auth flow")
assert not references_identifier("user authentication flow")
assert not references_identifier("")


def test_prose_abbreviations_and_versions_are_not_identifiers():
# Common false positives that must NOT trip the dotted-path rule.
assert not references_identifier("fix the bug e.g. on startup")
assert not references_identifier("bump to version 3.11 support")


def test_looks_like_identifier_alias():
assert looks_like_identifier is references_identifier


def test_detection_is_linear_no_redos():
# A long adversarial run must be handled in linear time (the query is API-reachable).
# Quadratic backtracking on this input would take minutes; linear is milliseconds.
big = "a" * 200_000 + " "
assert references_identifier(big) is False
assert references_identifier(big + "needs_a_match") is True


def test_fusion_weights_static_when_adaptive_off(config):
Expand Down
Loading