From 649674e76a9137371fd4022c44743cddd689f159 Mon Sep 17 00:00:00 2001 From: Serhii Savchuk Date: Fri, 5 Jun 2026 20:48:08 +0300 Subject: [PATCH] perf(query): narrow + materialize before group for WHERE + multi-key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pre-filter narrowing path in ray_select previously fired only when the by-dict contained a *computed* val (e.g. q42's (xbar EventTime ...)). For bare-ref multi-key shapes with a selective WHERE — e.g. ClickBench q30 / q31: (select {c: (count ClientIP) s: (sum IsRefresh) a: (avg ResolutionWidth) from: hits where: (!= SearchPhrase "") by: {SearchEngineID: SearchEngineID ClientIP: ClientIP} desc: c take: 10}) the planner left the work to the fused mk_par_v2 path. That path reads each by-key + agg input column from the *original* wide table at the sparse positions left by the WHERE bitmap. On the hits table (100+ columns) at ~14% selectivity, the gather wastes a cache line per touched column per passing row. Narrowing the input down to just the referenced columns and filtering once gives the downstream group a dense column-store and skips the gather. Extend the gate to fire when: - the desc/asc COUNT take N shape matches, - by-vals are all bare column refs (no computed val), - the by-dict has ≥ 2 keys, and - at least one aggregate is something other than count (sum/min/max/avg); the gate uses this as a proxy for an aggregate input column distinct from the by-keys and does not compare the columns themselves. Count-only shapes (q14: count of SearchPhrase by {SearchEngineID, SearchPhrase}; q40: count URLHash by {URLHash, EventDate}) reuse the by-key column for the count input — narrowing costs the projection without saving the gather, so the gate keeps them on the original fused path. 
ClickBench 10M: q30 ~152 → ~50 ms q31 ~353 → ~55 ms --- src/ops/query.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/ops/query.c b/src/ops/query.c index 485dc8e9..a6774b8b 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -4420,11 +4420,40 @@ ray_t* ray_select(ray_t** args, int64_t n) { } ray_group_emit_filter_t prefilter_top_count; memset(&prefilter_top_count, 0, sizeof(prefilter_top_count)); - bool prefilter_computed_by = - has_computed_by_val && + bool prefilter_top_n_match = match_group_desc_count_take(dict_elems, dict_n, from_id, where_id, by_id, take_id, asc_id, desc_id, &prefilter_top_count); + bool prefilter_computed_by = + has_computed_by_val && prefilter_top_n_match; + /* Multi-key WHERE shape — same kind of win even with bare-ref + * by-vals when at least one aggregate has a *distinct* input + * column (SUM / MIN / MAX / AVG on something other than a + * by-key). mk_par_v2's wide composite path then reads those + * extra inputs from the *original* wide table at the sparse + * positions left by the WHERE filter; for ClickBench-class + * 100-col tables with selective WHEREs (q30/q31 ~14%) the + * gather wastes a cache line per touched column per passing + * row. Count-only shapes (q14: SearchEngineID/SearchPhrase + * keys, count of SearchPhrase) don't carry the extra agg col + * — narrowing costs the projection without saving the gather. 
*/ + bool prefilter_multi_key_where = false; + if (prefilter_top_n_match && !has_computed_by_val && nk >= 2) { + int64_t count_sym = ray_sym_intern("count", 5); + for (int64_t i = 0; i + 1 < dict_n && + !prefilter_multi_key_where; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id || + kid == nearest_id) continue; + ray_t* val = dict_elems[i + 1]; + if (!is_group_dag_agg_expr(val)) continue; + ray_t** ae = (ray_t**)ray_data(val); + if (!ae[0] || ae[0]->type != -RAY_SYM) continue; + if (ae[0]->i64 == count_sym) continue; + prefilter_multi_key_where = true; /* sum/min/max/avg */ + } + } /* Computed by-val + WHERE: eagerly evaluating a non-trivial * group key (e.g. q42's `(xbar EventTime 60000000000)`) over * every input row wastes work proportional to the WHERE's @@ -4439,16 +4468,8 @@ ray_t* ray_select(ray_t** args, int64_t n) { * Narrowing matters: for wide tables (ClickBench's `hits` has * ~100 cols) materialising the full filtered table dominates * what was meant to be a cheap prefilter (single-col filter - * is O(passing × esz), full filter is ~50× that). - * - * The matcher gate (top-N-by-agg) constrains where this fires - * to shapes where the prefilter's cost can be amortised — the - * downstream group materialisation and top-N extraction - * benefit from operating on a small filtered slice. Broader - * shapes that already have an efficient fused-filter+group - * path (OP_FILTERED_GROUP) would lose more in the duplicated - * filter work than they'd save in the smaller by-val eval. */ - if (where_expr && prefilter_computed_by) { + * is O(passing × esz), full filter is ~50× that). */ + if (where_expr && (prefilter_computed_by || prefilter_multi_key_where)) { int64_t keep_syms[256]; int n_keep = 0; n_keep = collect_col_refs_set(where_expr, tbl,