From 649674e76a9137371fd4022c44743cddd689f159 Mon Sep 17 00:00:00 2001
From: Serhii Savchuk <ser.vasilich@hotmail.com>
Date: Fri, 5 Jun 2026 20:48:08 +0300
Subject: [PATCH] perf(query): narrow + materialize before group for WHERE +
 multi-key
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pre-filter narrowing path in ray_select previously fired only
when the by-dict contained a *computed* val (e.g. q42's
(xbar EventTime ...)).  For bare-ref multi-key shapes with a
selective WHERE — e.g. ClickBench q30 / q31:

  (select {c: (count ClientIP)
           s: (sum IsRefresh)
           a: (avg ResolutionWidth)
           from: hits where: (!= SearchPhrase "")
           by: {SearchEngineID: SearchEngineID ClientIP: ClientIP}
           desc: c take: 10})

the planner left the work to the fused mk_par_v2 path.  That path
reads each by-key + agg input column from the *original* wide table
at the sparse positions left by the WHERE bitmap.  On hits (100+
columns) at ~14% selectivity, that gather wastes a cache line per
touched column per passing row.  Narrowing the input down to just
the referenced columns and filtering once gives the downstream group
a dense column store and skips the gather.

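Extend the gate to fire (still only when a WHERE clause is present)
when:
  - the desc/asc COUNT take N shape matches,
  - by-vals are all bare column refs (no computed val),
  - the by-dict has ≥ 2 keys, and
  - at least one aggregate is something other than count
    (e.g. sum/min/max/avg).  The gate checks only the aggregate op,
    as a proxy for "has an input column distinct from the by-keys";
    it does not compare the aggregate's input against the by-keys.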

Count-only shapes (q14: count of SearchPhrase by
{SearchEngineID, SearchPhrase}; q40: count URLHash by
{URLHash, EventDate}) reuse the by-key column for the count input
— narrowing costs the projection without saving the gather, so the
gate keeps them on the original fused path.

ClickBench 10M:

  q30  ~152 →  ~50 ms
  q31  ~353 →  ~55 ms
---
 src/ops/query.c | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/ops/query.c b/src/ops/query.c
index 485dc8e9..a6774b8b 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -4420,11 +4420,40 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         }
         ray_group_emit_filter_t prefilter_top_count;
         memset(&prefilter_top_count, 0, sizeof(prefilter_top_count));
-        bool prefilter_computed_by =
-            has_computed_by_val &&
+        bool prefilter_top_n_match =
             match_group_desc_count_take(dict_elems, dict_n, from_id, where_id,
                                         by_id, take_id, asc_id, desc_id,
                                         &prefilter_top_count);
+        bool prefilter_computed_by =
+            has_computed_by_val && prefilter_top_n_match;
+        /* Multi-key WHERE shape — same kind of win even with bare-ref
+         * by-vals when at least one aggregate has a *distinct* input
+         * column (SUM / MIN / MAX / AVG on something other than a
+         * by-key).  mk_par_v2's wide composite path then reads those
+         * extra inputs from the *original* wide table at the sparse
+         * positions left by the WHERE filter; for ClickBench-class
+         * 100-col tables with selective WHEREs (q30/q31 ~14%) the
+         * gather wastes a cache line per touched column per passing
+         * row.  Count-only shapes (q14: SearchEngineID/SearchPhrase
+         * keys, count of SearchPhrase) don't carry the extra agg col
+         * — narrowing costs the projection without saving the gather. */
+        bool prefilter_multi_key_where = false;
+        if (prefilter_top_n_match && !has_computed_by_val && nk >= 2) {
+            int64_t count_sym = ray_sym_intern("count", 5);
+            for (int64_t i = 0; i + 1 < dict_n &&
+                                !prefilter_multi_key_where; i += 2) {
+                int64_t kid = dict_elems[i]->i64;
+                if (kid == from_id || kid == where_id || kid == by_id ||
+                    kid == take_id || kid == asc_id || kid == desc_id ||
+                    kid == nearest_id) continue;
+                ray_t* val = dict_elems[i + 1];
+                if (!is_group_dag_agg_expr(val)) continue;
+                ray_t** ae = (ray_t**)ray_data(val);
+                if (!ae[0] || ae[0]->type != -RAY_SYM) continue;
+                if (ae[0]->i64 == count_sym) continue;
+                prefilter_multi_key_where = true;  /* sum/min/max/avg */
+            }
+        }
         /* Computed by-val + WHERE: eagerly evaluating a non-trivial
          * group key (e.g. q42's `(xbar EventTime 60000000000)`) over
          * every input row wastes work proportional to the WHERE's
@@ -4439,16 +4468,8 @@ ray_t* ray_select(ray_t** args, int64_t n) {
          * Narrowing matters: for wide tables (ClickBench's `hits` has
          * ~100 cols) materialising the full filtered table dominates
          * what was meant to be a cheap prefilter (single-col filter
-         * is O(passing × esz), full filter is ~50× that).
-         *
-         * The matcher gate (top-N-by-agg) constrains where this fires
-         * to shapes where the prefilter's cost can be amortised — the
-         * downstream group materialisation and top-N extraction
-         * benefit from operating on a small filtered slice.  Broader
-         * shapes that already have an efficient fused-filter+group
-         * path (OP_FILTERED_GROUP) would lose more in the duplicated
-         * filter work than they'd save in the smaller by-val eval. */
-        if (where_expr && prefilter_computed_by) {
+         * is O(passing × esz), full filter is ~50× that). */
+        if (where_expr && (prefilter_computed_by || prefilter_multi_key_where)) {
             int64_t keep_syms[256];
             int n_keep = 0;
             n_keep = collect_col_refs_set(where_expr, tbl,