From cfdd1b6d501aefd6f6366a93d307ce3aeaf11e66 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 07:02:39 +0200
Subject: [PATCH 01/17] Initial basic functionality (phase 1 of plan completed)

---
 bench/ctable/groupby.py      | 119 ++++++++
 doc/reference/ctable.rst     |  20 ++
 plans/ctable-groupby.md      | 574 +++++++++++++++++++++++++++++++++++
 src/blosc2/__init__.py       |   2 +
 src/blosc2/ctable.py         |  48 ++-
 src/blosc2/groupby.py        | 548 +++++++++++++++++++++++++++++++++
 tests/ctable/test_groupby.py | 151 +++++++++
 7 files changed, 1460 insertions(+), 2 deletions(-)
 create mode 100644 bench/ctable/groupby.py
 create mode 100644 plans/ctable-groupby.md
 create mode 100644 src/blosc2/groupby.py
 create mode 100644 tests/ctable/test_groupby.py

diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py
new file mode 100644
index 00000000..fb601eb3
--- /dev/null
+++ b/bench/ctable/groupby.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+"""Phase-1 CTable group_by benchmark.
+
+Examples
+--------
+python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --op sum
+python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas
+"""
+
+from __future__ import annotations
+
+import argparse
+import dataclasses
+import time
+from pathlib import Path
+
+import numpy as np
+
+import blosc2
+
+
+def parse_int(text: str) -> int:  # argparse type= helper for --rows/--groups/--chunk-size
+    return int(text.replace("_", ""))  # drop "_" digit separators, e.g. "10_000_000"
+
+
+def build_row_type(dictionary: bool):  # returns the dataclass passed to blosc2.CTable as row schema
+    if dictionary:
+
+        @dataclasses.dataclass
+        class Row:
+            key: str = blosc2.field(blosc2.dictionary())  # dictionary-encoded string key
+            value: float = blosc2.field(blosc2.float64())
+
+    else:
+
+        @dataclasses.dataclass
+        class Row:
+            key: int = blosc2.field(blosc2.int32())  # plain int32 key
+            value: float = blosc2.field(blosc2.float64())
+
+    return Row
+
+
+def make_data(nrows: int, ngroups: int, dictionary: bool, seed: int):  # returns (keys, values)
+    rng = np.random.default_rng(seed)  # seeded so repeated runs see the same data
+    key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32)  # group ids in [0, ngroups)
+    values = rng.random(nrows, dtype=np.float64)
+    if dictionary:
+        keys = np.asarray([f"k{code}" for code in key_codes], dtype=object)  # labels "k<id>"
+    else:
+        keys = key_codes
+    return keys, values
+
+
+def main() -> None:  # CLI entry point: build a table, then time one group-by
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--rows", type=parse_int, default=10_000_000)
+    parser.add_argument("--groups", type=parse_int, default=1_000)
+    parser.add_argument("--chunk-size", type=parse_int, default=None)
+    parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key")
+    parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum")
+    parser.add_argument("--sort", action="store_true")
+    parser.add_argument("--pandas", action="store_true", help="Also run a pandas comparison if available")
+    parser.add_argument("--urlpath", type=Path, default=None, help="Optional persistent CTable path")
+    parser.add_argument("--seed", type=int, default=0)
+    args = parser.parse_args()
+
+    print(
+        f"rows={args.rows:,} groups={args.groups:,} dictionary={args.dictionary} "
+        f"op={args.op} sort={args.sort} chunk_size={args.chunk_size} urlpath={args.urlpath}"
+    )
+
+    keys, values = make_data(args.rows, args.groups, args.dictionary, args.seed)
+    Row = build_row_type(args.dictionary)
+
+    kwargs = {}
+    if args.urlpath is not None:
+        kwargs.update(urlpath=str(args.urlpath), mode="w")  # persistent table at --urlpath
+
+    t0 = time.perf_counter()  # table construction is timed separately from the group-by
+    table = blosc2.CTable(Row, new_data={"key": keys, "value": values}, expected_size=args.rows, **kwargs)
+    build_time = time.perf_counter() - t0
+    print(f"ctable_build_seconds={build_time:.6f}")
+
+    t0 = time.perf_counter()  # timed region: group_by() plus the terminal reduction
+    gb = table.group_by("key", sort=args.sort, chunk_size=args.chunk_size)
+    if args.op == "size":
+        out = gb.size()
+    elif args.op == "count":
+        out = gb.count("value")
+    else:
+        out = gb.agg({"value": args.op})
+    elapsed = time.perf_counter() - t0
+    print(f"ctable_groupby_seconds={elapsed:.6f}")
+    print(f"result_rows={out.nrows:,}")
+
+    if args.pandas:
+        try:
+            import pandas as pd  # optional dependency, only needed with --pandas
+        except ImportError:
+            print("pandas_unavailable=true")
+        else:
+            df = pd.DataFrame({"key": keys, "value": values})  # built outside the timed region
+            t0 = time.perf_counter()
+            if args.op == "size":
+                pdf = df.groupby("key", sort=args.sort).size()
+            elif args.op == "count":
+                pdf = df.groupby("key", sort=args.sort)["value"].count()
+            else:
+                pdf = df.groupby("key", sort=args.sort)["value"].agg(args.op)
+            pandas_elapsed = time.perf_counter() - t0
+            print(f"pandas_groupby_seconds={pandas_elapsed:.6f}")
+            print(f"pandas_result_rows={len(pdf):,}")
+
+    table.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst
index 12e99ea0..2ed012d0 100644
--- a/doc/reference/ctable.rst
+++ b/doc/reference/ctable.rst
@@ -233,6 +233,7 @@ When a NumPy structured array is needed, materialize explicitly::
     CTable.sample
     CTable.sort_by
     CTable.iter_sorted
+    CTable.group_by
 
 .. automethod:: CTable.where
 .. automethod:: CTable.view
@@ -242,6 +243,25 @@ When a NumPy structured array is needed, materialize explicitly::
 .. automethod:: CTable.sample
 .. automethod:: CTable.sort_by
 .. automethod:: CTable.iter_sorted
+.. automethod:: CTable.group_by
+
+
+Group-by reductions
+-------------------
+
+:meth:`CTable.group_by` returns a lightweight deferred group-by object.  It is
+not a table view; methods such as :meth:`~blosc2.CTableGroupBy.size`,
+:meth:`~blosc2.CTableGroupBy.count`, and
+:meth:`~blosc2.CTableGroupBy.agg` materialize a new :class:`CTable` with
+one row per group::
+
+    by_city = t.group_by("city", sort=True)
+    counts = by_city.size()                  # row count per city / COUNT(*)
+    non_null = by_city.count("sales")        # non-null sales count / COUNT(sales)
+    totals = by_city.agg({"sales": "sum"})
+
+.. autoclass:: CTableGroupBy
+    :members: size, count, agg
 
 
 Mutations
diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
new file mode 100644
index 00000000..b86ac078
--- /dev/null
+++ b/plans/ctable-groupby.md
@@ -0,0 +1,574 @@
+# CTable `group_by` implementation plan
+
+## Goals
+
+Add a `CTable.group_by()` facility that is efficient for columnar, compressed
+CTable storage while keeping the first implementation simple and correct.  The
+long-term goal is to expose a compressed-aware group-reduce primitive that can
+power `CTable.group_by()` and possibly other analytics APIs.
+
+Key design principles:
+
+- Stay columnar: read only grouping columns, aggregation columns, and the live-row mask.
+- Keep memory bounded: process the table chunk-by-chunk; never require materializing all rows.
+- Use indexes opportunistically, but do not require them.
+- Start with a NumPy implementation, then add Cython kernels for hot paths.
+- Keep compressed input columns compressed between chunks; only chunk slices become NumPy buffers.
+
+## Proposed user API
+
+Initial high-level API could be:
+
+```python
+t.group_by("city").agg({"sales": "sum", "id": "count"})
+t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"], "price": "max"})
+```
+
+Potential variants:
+
+```python
+t.group_by("city", sort=False).agg(...)
+t.group_by("city", engine="auto").agg(...)
+t.group_by("city").count()
+t.group_by("city").sum("sales")
+```
+
+The result should be a new in-memory `CTable` initially.  Persistent output can
+be added later via an `out=`/`urlpath=` option if useful.
+
+Output column naming should be predictable, for example:
+
+```text
+city, sales_sum, id_count
+country, city, sales_sum, sales_mean, price_max
+```
+
+For a single aggregation on a column, decide whether to preserve the original
+column name or always suffix it.  Always suffixing is less ambiguous.
+
+## Supported MVP semantics
+
+Start with:
+
+- Group keys:
+  - fixed-width scalar columns: bool, signed/unsigned ints, floats, datetimes/timedeltas;
+  - dictionary-encoded string columns via integer codes.
+- Aggregations:
+  - `count` / `size`;
+  - `sum`;
+  - `min`;
+  - `max`;
+  - `mean` implemented as `sum + count` during accumulation.
+- Respect live rows (`_valid_rows`) and views.
+- Read only required columns.
+
+Defer initially:
+
+- list columns;
+- vlstring/vlbytes/object/struct scalar columns, except dictionary columns;
+- arbitrary Python aggregators;
+- group-by over computed columns, unless they can be chunk-evaluated cleanly;
+- disk spilling for very high cardinality;
+- parallel hash aggregation.
+
+## Baseline algorithm: chunked hash aggregation
+
+The default implementation should be a chunked hash group-reduce:
+
+```text
+global_accumulator = hash table: group_key -> aggregate state
+
+for each row chunk:
+    read/decompress key column chunk(s)
+    read/decompress aggregation value column chunk(s)
+    read/decompress valid-row mask chunk
+    apply live-row mask
+
+    build local grouping keys
+    compute local partial aggregates
+    merge local partial aggregates into global_accumulator
+
+finalize aggregate state
+materialize group keys and aggregate columns into a result CTable
+```
+
+The important point is that the global hash table is proportional to the number
+of groups, not to the number of rows:
+
+```text
+memory ~= O(number_of_groups * (key_size + aggregate_state_size + hash overhead))
+```
+
+The global accumulator should normally live uncompressed in memory.  It is
+accessed for every chunk merge, so compressing it would likely dominate runtime.
+The compressed-aware aspect is in the input traversal: compressed CTable columns
+are decompressed only one bounded chunk at a time.
+
+## Columnar chunk traversal
+
+Use synchronized physical row ranges.  For each range:
+
+```python
+valid = np.asarray(self._valid_rows[start:stop])
+key1 = np.asarray(self._cols[key1_name][start:stop])
+value = np.asarray(self._cols[value_name][start:stop])
+
+key1 = key1[valid]
+value = value[valid]
+```
+
+Where possible, align chunk ranges with the physical chunks of `_valid_rows` or
+input columns to improve decompression locality.  The exact chunk size should be
+configurable internally; a reasonable default can be based on CTable/NDArray
+chunk sizes, with a cap to avoid excessive temporaries.
+
+For dictionary columns, read codes instead of decoded strings:
+
+```python
+codes = np.asarray(dict_col.codes[start:stop], dtype=np.int32)
+```
+
+Decode codes only when materializing the final result.
+
+## NumPy MVP local grouping
+
+For a single key:
+
+```python
+unique_keys, inverse = np.unique(keys, return_inverse=True)
+partial_sum = np.bincount(inverse, weights=values)
+partial_count = np.bincount(inverse)
+```
+
+For min/max use `np.minimum.at` / `np.maximum.at` into arrays initialized with
+appropriate identity values.
+
+For multiple fixed-width keys, build a structured array per chunk:
+
+```python
+keys = np.empty(n, dtype=[("k0", key0.dtype), ("k1", key1.dtype)])
+keys["k0"] = key0
+keys["k1"] = key1
+
+unique_keys, inverse = np.unique(keys, return_inverse=True)
+```
+
+This is simple and should be the initial correctness path.  Costs to be aware of:
+
+- structured key array allocation and copy per chunk;
+- `np.unique` is generally sort-based;
+- `return_inverse=True` allocates one integer per live row in the chunk;
+- aggregations are separate passes over the inverse.
+
+These costs are acceptable for the MVP because they are bounded by chunk size.
+
+## Global accumulator design
+
+For the Python MVP, a dictionary is adequate:
+
+```python
+acc: dict[group_key, AggregateState]
+```
+
+Where `group_key` is:
+
+- a Python scalar for single numeric/dictionary keys;
+- a tuple for multi-column keys;
+- a normalized representation for null-aware keys when nullable support is added.
+
+`AggregateState` can store arrays or small Python objects with fields like:
+
+```text
+count
+sum
+min
+max
+mean_sum
+mean_count
+```
+
+For `mean`, keep `sum` and `count` and divide only during finalization.  For
+multiple aggregations over the same input column, share state when possible
+(e.g. `mean` and `sum` can reuse the same sum).
+
+For better performance after the API stabilizes, replace parts of this with a
+NumPy-backed accumulator or Cython state object.
+
+## Index-aware paths
+
+Indexes are optional accelerators.
+
+### FULL index on a single group key
+
+A FULL index stores sorted values and positions.  For a single grouping key,
+this can make group-by a sorted scan:
+
+```text
+obtain sorted positions from FULL index
+scan rows in key order
+detect group boundaries
+reduce contiguous runs
+```
+
+Benefits:
+
+- no hash table needed for the grouping key;
+- no sort needed at query time;
+- output is naturally sorted by key.
+
+This is most useful for:
+
+```python
+t.create_index("city", kind=blosc2.IndexKind.FULL)
+t.group_by("city").agg(...)
+```
+
+Caveats:
+
+- only directly helps single-key group-by;
+- for multi-key group-by, a single-column FULL index only partially helps;
+- stale indexes must be ignored or rebuilt;
+- views/deleted rows still require intersecting with `_valid_rows`.
+
+### Bucket/segment indexes
+
+The default predicate indexes are useful before group-by, not usually during it:
+
+```python
+t.where("year == 2024").group_by("city")
+```
+
+The index accelerates `where()`, reducing rows scanned by group-by.  It does not
+by itself provide grouped order.
+
+## Existing `indexing_ext` sort helpers
+
+`indexing_ext.pyx` contains:
+
+- `keysort_values_positions(values, positions)`;
+- `keysort_keys_indices(keys, indices)`.
+
+These sort a 1-D scalar key array in-place while carrying an `int64` side array.
+They are useful for sort/index oriented paths, especially:
+
+- building/reusing FULL indexes;
+- single-key sort-based group-by;
+- dictionary-code group-by where codes are scalar integers.
+
+They are not the main primitive for hash-based group-reduce because hash
+aggregation does not require sorted keys.  They also do not directly support
+multi-column keys, variable-length strings, or fused aggregation.
+
+## Compressed-aware `group_reduce` primitive
+
+Longer term, introduce a lower-level primitive used by `CTable.group_by()`:
+
+```python
+blosc2.group_reduce(
+    keys=[key_ndarray1, key_ndarray2],
+    values=[value_ndarray1],
+    aggs={"value": ["sum", "count"]},
+    mask=valid_rows,
+    chunk_size=None,
+    engine="auto",
+)
+```
+
+However, the first implementation can live under an internal module, e.g.
+`blosc2.groupby`, before becoming public.
+
+The primitive should be compressed-aware in traversal, not necessarily operate
+on compressed bytes directly.  General key comparison/grouping still needs
+values.  The intended execution is:
+
+```text
+read compressed NDArray slices -> NumPy buffers -> local group/reduce -> merge
+```
+
+This avoids full-column materialization while keeping the hot loop simple.
+
+## Cython optimization plan
+
+### Phase 1: Python/NumPy only
+
+Files:
+
+```text
+src/blosc2/ctable.py     # public API / GroupBy facade
+src/blosc2/groupby.py    # internal implementation and NumPy engine
+```
+
+Focus on correctness, tests, API shape, and an early benchmark harness.  The
+benchmark should be added in Phase 1, before any Cython work, so that later
+optimization decisions are driven by numbers rather than intuition.  At minimum,
+add one reusable script under `bench/` that can generate or open a CTable and
+compare:
+
+- chunked NumPy hash group-by;
+- single-key sort/scan group-by where practical;
+- dictionary-code grouping;
+- pandas or DuckDB on an equivalent in-memory/external dataset for rough context.
+
+The initial benchmark does not need to be exhaustive, but it should record row
+count, cardinality, chunk size, compression parameters, elapsed time, peak memory
+if easy to capture, and whether the input is in-memory, `.b2d`, or `.b2z`.
+
+### Phase 2: optimized kernels in `indexing_ext.pyx`
+
+To avoid adding a third extension too early, place initial Cython kernels in
+`src/blosc2/indexing_ext.pyx` under a clearly separated section:
+
+```cython
+# ----------------------------------------------------------------------
+# Group-reduce kernels
+# ----------------------------------------------------------------------
+```
+
+Initial kernels should target high-value simple cases:
+
+- single `int32`/`int64` key;
+- dictionary-code keys (`int32`);
+- numeric value columns;
+- `count`, `sum`, `min`, `max`, maybe `mean` via sum/count.
+
+The Python layer remains responsible for:
+
+- CTable schema validation;
+- chunk iteration;
+- decompression into NumPy buffers;
+- final result CTable construction;
+- fallback to NumPy for unsupported dtypes.
+
+The Cython layer consumes NumPy buffers and updates a hash accumulator or returns
+chunk partial aggregates.
+
+### Phase 3: split to `groupby_ext.pyx` if it grows
+
+If the optimized path grows to include multi-column hash tables, nullable key
+semantics, multiple aggregate state layouts, spilling, or parallel execution,
+move it to a dedicated extension:
+
+```text
+src/blosc2/groupby_ext.pyx
+```
+
+This is cleaner long-term than overloading `indexing_ext.pyx` indefinitely.
+Avoid putting this functionality in `blosc2_ext.pyx`; group-reduce is a
+higher-level analytics/query primitive, not core compression/NDArray machinery.
+
+## What custom Cython buys over structured NumPy keys
+
+NumPy structured dtype is a good MVP, but a custom Cython hash reducer can avoid
+several costs:
+
+- no temporary packed structured key array;
+- no sort-based `np.unique` for every chunk;
+- no `inverse` array of length equal to the chunk;
+- factorization and aggregation can be fused in one pass;
+- multiple aggregations can be updated together;
+- direct processing of CTable's columnar SoA layout;
+- easier future per-thread hash tables and merges.
+
+A typical optimized loop is:
+
+```text
+for i in range(n):
+    key = key_columns[i]
+    slot = hash_lookup_or_insert(key)
+    acc_sum[slot] += value[i]
+    acc_count[slot] += 1
+    acc_min[slot] = min(acc_min[slot], value[i])
+```
+
+For multi-column keys, the Cython path can hash directly across multiple arrays
+without packing them into a structured array first.
+
+## High-cardinality strategy
+
+Hash aggregation can become memory-heavy when the number of groups approaches
+the number of rows.  Add safeguards and future alternatives:
+
+- estimate cardinality from early chunks;
+- expose/keep an internal memory limit;
+- fall back to sort-based group-by when cardinality is too high;
+- use FULL index if available;
+- later: partitioned hash group-by with spill-to-disk.
+
+For the MVP, document that very high-cardinality group-by may require memory
+proportional to output cardinality.
+
+## Null and NaN semantics
+
+Define before finalizing the API:
+
+- Should null sentinel values form their own group, be skipped, or be controlled
+  by `dropna=`?
+- Should float NaNs group together?  NumPy `unique` behavior and hash behavior
+  must be made consistent.
+- Nullable booleans/dictionary null codes need explicit handling.
+
+Suggested default, matching common dataframe behavior:
+
+```python
+t.group_by("key", dropna=True)  # default? skip null keys
+t.group_by("key", dropna=False)  # include null group
+```
+
+But this should be aligned with existing CTable nullable semantics.
+
+## Documentation
+
+Add user-facing docstrings and Sphinx documentation for the new group-by API:
+
+- `CTable.group_by()` docstring with parameters such as `keys`, `sort`,
+  `dropna`, `engine`, and `chunk_size` if exposed;
+- the returned `GroupBy`/`CTableGroupBy` facade docstring, documenting that it
+  is a deferred operation builder, not a `CTable` view;
+- `GroupBy.size()`, `GroupBy.count()`, and `GroupBy.agg()` docstrings;
+- examples in the CTable documentation showing row counts, non-null counts,
+  sums/means, dictionary string grouping, and optional sorted output.
+
+The class may be described as "the object returned by `CTable.group_by()`" and
+need not encourage direct construction.
+
+## Tests
+
+Add tests under `tests/ctable/`, covering:
+
+- single-key count/sum/min/max/mean;
+- multi-key group-by;
+- dictionary string key grouping;
+- views and deleted rows;
+- empty table and all-filtered view;
+- different numeric dtypes and bool keys;
+- nullable key behavior once specified;
+- result schema and output column names;
+- consistency with a reference Python/pandas-like implementation;
+- chunk-size variation to ensure chunk-boundary independence;
+- optional FULL-index path returns same results as hash path.
+
+For deterministic tests, sort result rows before comparison unless the API
+guarantees output order.
+
+## Benchmark plan
+
+Add a small but useful benchmark during Phase 1.  This is important because it
+sets the baseline for the NumPy implementation and identifies which Cython
+kernels are worth writing first.
+
+Benchmarks should include:
+
+- low-cardinality single key, e.g. 10 groups over 100M rows;
+- medium cardinality, e.g. 100k groups;
+- high cardinality, near unique keys;
+- dictionary string columns grouped by codes;
+- multi-column keys;
+- multiple aggregations over one value column;
+- multiple value columns;
+- with and without FULL index;
+- persistent `.b2d`/`.b2z` inputs.
+
+Compare:
+
+- Python/NumPy chunked implementation;
+- Cython hash path when available;
+- sort-based path using existing keysort helpers;
+- pandas/duckdb for sanity, where feasible.
+
+## Open decisions and recommended defaults
+
+### Public API and result column names
+
+Recommendation: use a small `GroupBy` facade and an explicit `.agg()` method:
+
+```python
+t.group_by("city").agg({"sales": "sum"})
+t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"], "price": "max"})
+```
+
+Always suffix aggregate output columns as `<input>_<agg>`:
+
+```text
+city, sales_sum
+country, city, sales_sum, sales_mean, price_max
+```
+
+This avoids ambiguity and remains stable when users later request multiple
+aggregations on the same input column.  The first release should also include
+at least the convenience methods `GroupBy.size()` and `GroupBy.count(column)`:
+
+```python
+t.group_by("city").size()  # row count per group / COUNT(*)
+t.group_by("city").count("sales")  # non-null sales count / COUNT(sales)
+```
+
+Additional conveniences like `.sum()`, `.mean()`, `.min()`, and `.max()` can be
+added after `.agg()` is stable.
+
+### Output order
+
+Recommendation: make output order configurable, with hash insertion order as the
+fast default and sorted output as an option:
+
+```python
+t.group_by("city", sort=False).agg(...)  # default: fastest
+t.group_by("city", sort=True).agg(...)  # sort by group keys
+```
+
+When a single-key FULL index is used, sorted output can be produced naturally.
+Tests should not depend on default order unless explicitly testing order.
+
+### Null and NaN grouping semantics
+
+Recommendation: provide `dropna=` and default to `True`, matching common
+dataframe behavior:
+
+```python
+t.group_by("key", dropna=True)  # skip rows with null/NaN keys
+t.group_by("key", dropna=False)  # include a null/NaN group
+```
+
+For `dropna=False`, all NaNs in a floating key should belong to one group, and
+nullable sentinels/dictionary null codes should belong to one null group.  The
+NumPy and Cython engines must normalize these cases consistently.
+
+### `size` vs `count`
+
+Recommendation: support both, with distinct meanings, scoped to group-by rather
+than as new top-level `CTable.size()` / `CTable.count()` methods:
+
+- `GroupBy.size()`: number of rows in the group, independent of value-column
+  nulls; equivalent to SQL `COUNT(*)` and pandas `groupby(...).size()`;
+- `GroupBy.count(column)`: number of non-null values for a specific value
+  column; equivalent to SQL `COUNT(column)` and pandas `groupby(...)[column].count()`;
+- `count` aggregation, e.g. `GroupBy.agg({"sales": "count"})`, should be an
+  equivalent spelling for `GroupBy.count("sales")`.
+
+Prefer `size()` over `len()` for the MVP.  Although a `len()` method would echo
+Python's built-in `len()`, `size()` follows pandas group-by terminology and avoids
+suggesting that it returns a single scalar length.  A `len()` alias can be
+considered later if there is demand.
+
+For non-nullable columns, `count(col)` equals `size`.  For nullable columns,
+`count(col)` excludes null sentinels/NaNs according to the column null policy.
+The MVP can implement `GroupBy.size()` first and add nullable-aware `count` as
+nullable aggregate semantics mature.
+
+### Public `blosc2.group_reduce()` exposure
+
+Recommendation: keep `group_reduce` internal at first, e.g. in
+`blosc2.groupby`, until the API and semantics settle through `CTable.group_by()`.
+Expose a public `blosc2.group_reduce()` only after:
+
+- aggregation semantics are stable;
+- null/NaN behavior is documented;
+- output representation is clear;
+- benchmarks show it is useful outside CTable.
+
+### Cython extension placement
+
+Recommendation: start optimized kernels in `indexing_ext.pyx` only for Phase 2,
+under a clearly marked group-reduce section, to avoid build-system churn while
+validating the approach.  If the code grows beyond a few focused kernels or needs
+its own persistent state classes, move it to `groupby_ext.pyx`.  Do not place it
+in `blosc2_ext.pyx`.
diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py
index 8a587c06..29ed2024 100644
--- a/src/blosc2/__init__.py
+++ b/src/blosc2/__init__.py
@@ -628,6 +628,7 @@ def _raise(exc):
 # Note: bool, bytes, string shadow builtins in the blosc2 namespace by design —
 # they are schema spec constructors (b2.bool(), b2.bytes(), etc.).
 from .ctable import DEFAULT_NULL_POLICY, Column, CTable, NullPolicy, get_null_policy, null_policy
+from .groupby import CTableGroupBy
 from .ndarray import (
     abs,
     acos,
@@ -804,6 +805,7 @@ def _raise(exc):
     # Classes
     "C2Array",
     "CParams",
+    "CTableGroupBy",
     "Batch",
     "BatchArray",
     # Enums
diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py
index 1f80dc3d..56a89583 100644
--- a/src/blosc2/ctable.py
+++ b/src/blosc2/ctable.py
@@ -20,7 +20,7 @@
 import re
 import shutil
 from collections import namedtuple
-from collections.abc import Iterable, Mapping
+from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import MISSING, dataclass
 from dataclasses import field as dataclass_field
 from textwrap import TextWrapper
@@ -2338,12 +2338,15 @@ def _init_columns(
                 )
                 continue
             if self._is_dictionary_column(col):
-                self._cols[col.name] = storage.create_dictionary_column(
+                dict_col = storage.create_dictionary_column(
                     col.name,
                     spec=col.spec,
                     cparams=col_storage.get("cparams"),
                     dparams=col_storage.get("dparams"),
                 )
+                if len(dict_col.codes) < expected_size:
+                    dict_col.resize((expected_size,))
+                self._cols[col.name] = dict_col
                 continue
             # Recompute chunks/blocks using the actual dtype so that wide
             # string columns (e.g. U183642) don't produce multi-GB chunks.
@@ -3482,6 +3485,47 @@ def select(self, cols: list[str]) -> CTable:
         obj._col_widths = {name: self._col_widths[name] for name in cols if name in self._col_widths}
         return obj
 
+    def group_by(
+        self,
+        keys: str | Sequence[str],
+        *,
+        sort: bool = False,
+        dropna: bool = True,
+        engine: str = "auto",
+        chunk_size: int | None = None,
+    ):
+        """Return a deferred group-by object for this table.
+
+        Parameters
+        ----------
+        keys:
+            Column name or sequence of column names to group by.
+        sort:
+            If ``True``, sort the result by the group keys.  The default
+            ``False`` preserves the hash aggregation order and is usually
+            faster.
+        dropna:
+            If ``True`` (default), rows with null/NaN group keys are skipped.
+            If ``False``, null/NaN keys form their own group.
+        engine:
+            Execution engine.  Phase 1 accepts ``"auto"`` and uses the NumPy
+            chunked implementation.
+        chunk_size:
+            Optional number of physical rows processed per chunk.
+
+        Returns
+        -------
+        CTableGroupBy
+            A lightweight deferred operation builder.  Call methods such as
+            ``.size()``, ``.count(column)`` or ``.agg({...})`` to materialize a
+            grouped result as a new :class:`CTable`.
+        """
+        if engine != "auto":  # fail fast, before building the deferred object
+            raise ValueError("Only engine='auto' is supported for group_by() in Phase 1")
+        from blosc2.groupby import CTableGroupBy  # NOTE(review): local import, presumably avoids a cycle
+
+        return CTableGroupBy(self, keys, sort=sort, dropna=dropna, engine=engine, chunk_size=chunk_size)
+
     def describe(self) -> None:
         """Print a per-column statistical summary.
 
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
new file mode 100644
index 00000000..6dd6874e
--- /dev/null
+++ b/src/blosc2/groupby.py
@@ -0,0 +1,548 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+"""Group-by support for :class:`blosc2.CTable`.
+
+This module contains the Phase-1, NumPy-based implementation.  It is deliberately
+chunked and columnar: only grouping columns, aggregation columns, and the
+live-row mask are read from the source table.
+"""
+
+from __future__ import annotations
+
+import copy
+import dataclasses
+import math
+import re
+from collections.abc import Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Literal
+
+import numpy as np
+
+from blosc2.schema import DictionarySpec, SchemaSpec, float64, int64
+from blosc2.schema import bool as b2_bool
+from blosc2.schema import field as b2_field
+
+if TYPE_CHECKING:  # pragma: no cover
+    from blosc2.ctable import CTable
+
+
+AggName = Literal["size", "count", "sum", "mean", "min", "max"]
+
+_IDENTIFIER_RE = re.compile(r"^[A-Za-z_]\w*$")
+_NAN_KEY = ("__blosc2_groupby_nan__",)
+
+
+@dataclasses.dataclass
+class _AggSpec:
+    input_col: str | None
+    op: AggName
+    output_col: str
+
+
+@dataclasses.dataclass
+class _AggState:
+    op: AggName
+    value: Any = None
+    count: int = 0
+
+
+class CTableGroupBy:
+    """Deferred group-by operation returned by :meth:`CTable.group_by`.
+
+    The object stores the source table, grouping keys, and execution options.
+    It is not a :class:`CTable` view and does not materialize grouped data until
+    a terminal method such as :meth:`size`, :meth:`count`, or :meth:`agg` is
+    called.
+    """
+
+    def __init__(
+        self,
+        table: CTable,
+        keys: str | Sequence[str],
+        *,
+        sort: bool = False,
+        dropna: bool = True,
+        engine: str = "auto",
+        chunk_size: int | None = None,
+    ) -> None:
+        if isinstance(keys, str):
+            keys = [keys]
+        else:
+            keys = list(keys)
+        if not keys:
+            raise ValueError("group_by() requires at least one key column")
+
+        self.table = table
+        self.keys = [table._logical_to_physical_name(k) for k in keys]
+        self.sort = bool(sort)
+        self.dropna = bool(dropna)
+        self.engine = engine
+        self.chunk_size = chunk_size
+
+        for name in self.keys:
+            if name in table._computed_cols:
+                raise NotImplementedError("group_by() over computed columns is not supported yet")
+            if name not in table._cols:
+                raise KeyError(f"No column named {name!r}. Available: {table.col_names}")
+            col_info = table._schema.columns_by_name[name]
+            if table._is_list_column(col_info) or table._is_varlen_scalar_column(col_info):
+                raise TypeError(f"Cannot group by variable-length/list column {name!r} in Phase 1")
+
+    def size(self):
+        """Return row counts per group as a new :class:`CTable`.
+
+        This is equivalent to SQL ``COUNT(*)``: it counts rows in each group and
+        is independent of null values in non-key columns.
+        """
+        return self._execute([_AggSpec(None, "size", "size")])
+
+    def count(self, column: str):
+        """Return non-null value counts for *column* per group.
+
+        Same as ``agg({column: "count"})`` (SQL ``COUNT(column)``), including its column checks.
+        """
+        col = self.table._logical_to_physical_name(column)
+        self._validate_value_column(col)  # reject unknown/computed/unsupported columns up front
+        return self._execute([_AggSpec(col, "count", f"{col}_count")])
+
+    def agg(self, aggregations: Mapping[str, str | Sequence[str]]):
+        """Aggregate value columns per group.
+
+        Parameters
+        ----------
+        aggregations:
+            Mapping from input column name to an aggregation name or list of
+            names.  Supported operations in Phase 1 are ``"count"``, ``"sum"``,
+            ``"mean"``, ``"min"``, ``"max"`` and the special row-count spelling
+            ``{"*": "size"}``.
+        """
+        specs = self._normalize_aggs(aggregations)
+        return self._execute(specs)
+
+    def _normalize_aggs(self, aggregations: Mapping[str, str | Sequence[str]]) -> list[_AggSpec]:
+        if not isinstance(aggregations, Mapping) or not aggregations:
+            raise ValueError("agg() requires a non-empty mapping")
+        specs: list[_AggSpec] = []
+        for col_name, ops in aggregations.items():
+            if isinstance(ops, str):
+                op_list = [ops]
+            else:
+                op_list = list(ops)
+            if not op_list:
+                raise ValueError(f"No aggregations specified for column {col_name!r}")
+
+            if col_name == "*":
+                for op in op_list:
+                    if op != "size":
+                        raise ValueError("Only the 'size' aggregation is supported for '*' input")
+                    specs.append(_AggSpec(None, "size", "size"))
+                continue
+
+            physical = self.table._logical_to_physical_name(col_name)
+            self._validate_value_column(physical)
+            for op in op_list:
+                if op not in {"count", "sum", "mean", "min", "max"}:
+                    raise ValueError(f"Unsupported aggregation {op!r}")
+                self._validate_agg_for_column(physical, op)
+                specs.append(_AggSpec(physical, op, f"{physical}_{op}"))
+        output_names = [s.output_col for s in specs]
+        if len(output_names) != len(set(output_names)):
+            raise ValueError("Aggregation output column names must be unique")
+        return specs
+
+    def _validate_agg_for_column(self, name: str, op: str) -> None:
+        dtype = getattr(self.table._schema.columns_by_name[name].spec, "dtype", None)
+        if op in {"sum", "mean"} and dtype is not None and dtype.kind not in "biuf":
+            raise TypeError(f"Aggregation {op!r} is not supported for column {name!r} with dtype {dtype}")
+        if op in {"min", "max"} and dtype is not None and dtype.kind == "c":
+            raise TypeError(f"Aggregation {op!r} is not supported for complex column {name!r}")
+
+    def _validate_value_column(self, name: str) -> None:
+        if name in self.table._computed_cols:
+            raise NotImplementedError("group_by() aggregations over computed columns are not supported yet")
+        if name not in self.table._cols:
+            raise KeyError(f"No column named {name!r}. Available: {self.table.col_names}")
+        col_info = self.table._schema.columns_by_name[name]
+        if self.table._is_list_column(col_info) or self.table._is_varlen_scalar_column(col_info):
+            raise TypeError(f"Cannot aggregate variable-length/list column {name!r} in Phase 1")
+        if self.table._is_dictionary_column(col_info):
+            raise TypeError(f"Cannot aggregate dictionary column {name!r} in Phase 1")
+
+    def _execute(self, specs: list[_AggSpec]):
+        self._validate_output_names(specs)
+        acc: dict[Any, dict[str, _AggState]] = {}
+        key_values: dict[Any, tuple[Any, ...]] = {}
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        value_cols = sorted({s.input_col for s in specs if s.input_col is not None})
+
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+
+            raw_keys = [self._read_key_chunk(name, start, stop) for name in self.keys]
+            live_mask = valid.copy()
+            if self.dropna:
+                for name, values in zip(self.keys, raw_keys, strict=True):
+                    live_mask &= ~self._null_mask(name, values, is_key=True)
+            if not np.any(live_mask):
+                continue
+
+            keys_live = [np.asarray(values)[live_mask] for values in raw_keys]
+            n_live = len(keys_live[0])
+            if n_live == 0:
+                continue
+
+            unique_keys, inverse = self._factorize_keys(keys_live)
+            value_chunks = {
+                name: np.asarray(self.table._cols[name][start:stop])[live_mask] for name in value_cols
+            }
+
+            partials = self._compute_partials(specs, unique_keys, inverse, value_chunks)
+            display_keys = self._display_keys(unique_keys)
+            normalized_keys = self._normalized_keys(display_keys)
+            self._merge_partials(acc, key_values, normalized_keys, display_keys, partials, specs)
+
+        rows = self._final_rows(acc, key_values, specs)
+        return self._build_result(rows, specs)
+
+    def _chunk_size(self) -> int:
+        if self.chunk_size is not None:
+            if self.chunk_size <= 0:
+                raise ValueError("chunk_size must be positive")
+            return int(self.chunk_size)
+        chunks = getattr(self.table._valid_rows, "chunks", None)
+        if chunks:
+            return max(int(chunks[0]), 1)
+        return 65536
+
+    def _read_key_chunk(self, name: str, start: int, stop: int) -> np.ndarray:
+        col_info = self.table._schema.columns_by_name[name]
+        if self.table._is_dictionary_column(col_info):
+            return np.asarray(self.table._cols[name].codes[start:stop], dtype=np.int32)
+        return np.asarray(self.table._cols[name][start:stop])
+
+    def _factorize_keys(
+        self, keys_live: list[np.ndarray]
+    ) -> tuple[np.ndarray | list[np.ndarray], np.ndarray]:
+        """Return ``(unique_keys, inverse)`` for the live key values of one chunk."""
+        if len(keys_live) == 1:
+            return np.unique(keys_live[0], return_inverse=True)
+
+        # Pack several keys into one structured array so np.unique compares whole tuples.
+        fields = [(f"k{i}", column.dtype) for i, column in enumerate(keys_live)]
+        packed = np.empty(len(keys_live[0]), dtype=fields)
+        for field, column in zip(packed.dtype.names, keys_live, strict=True):
+            packed[field] = column
+        return np.unique(packed, return_inverse=True)
+
+    def _display_keys(self, unique_keys: np.ndarray | list[np.ndarray]) -> list[tuple[Any, ...]]:
+        """Convert factorized keys into tuples of user-facing Python values.
+
+        Dictionary-encoded key columns are decoded from their integer codes; all
+        other keys are unwrapped from NumPy scalars.  One tuple is returned per
+        group, in the order of *unique_keys*; multi-key input must be the
+        structured array (fields ``k0``, ``k1``, ...) built by ``_factorize_keys``.
+        """
+        columns = self.table._cols
+        schema = self.table._schema.columns_by_name
+        # Decide once per key column, not once per group, whether decoding is needed.
+        encoded = [self.table._is_dictionary_column(schema[name]) for name in self.keys]
+
+        def shown(position: int, value: Any) -> Any:
+            if encoded[position]:
+                return columns[self.keys[position]].decode(int(value))
+            return _python_scalar(value)
+
+        if len(self.keys) == 1:
+            return [(shown(0, value),) for value in np.asarray(unique_keys)]
+
+        assert isinstance(unique_keys, np.ndarray)
+        return [
+            tuple(shown(i, row[f"k{i}"]) for i, _ in enumerate(self.keys)) for row in unique_keys
+        ]
+
+    def _normalized_keys(self, display_keys: list[tuple[Any, ...]]) -> list[Any]:
+        """Build NaN-safe lookup keys from *display_keys*, one per group."""
+        # Normalize every part, then unwrap single-part keys to the bare value.
+        normalized = (tuple(map(_normalize_key_part, key)) for key in display_keys)
+        unwrapped = [parts[0] if len(parts) == 1 else parts for parts in normalized]
+        return unwrapped
+
+    def _compute_partials(
+        self,
+        specs: list[_AggSpec],
+        unique_keys: np.ndarray | list[np.ndarray],
+        inverse: np.ndarray,
+        value_chunks: dict[str, np.ndarray],
+    ) -> dict[str, Any]:
+        n_groups = len(unique_keys)
+        partials: dict[str, Any] = {}
+        for spec in specs:
+            if spec.op == "size":
+                partials[spec.output_col] = np.bincount(inverse, minlength=n_groups).astype(np.int64)
+                continue
+
+            assert spec.input_col is not None
+            values = value_chunks[spec.input_col]
+            non_null = ~self._null_mask(spec.input_col, values, is_key=False)
+
+            if spec.op == "count":
+                partials[spec.output_col] = np.bincount(
+                    inverse, weights=non_null.astype(np.int64), minlength=n_groups
+                ).astype(np.int64)
+            elif spec.op in {"sum", "mean"}:
+                counts = np.bincount(inverse, weights=non_null.astype(np.int64), minlength=n_groups).astype(
+                    np.int64
+                )
+                if spec.op == "sum" and values.dtype.kind in "biu":
+                    sums = np.zeros(n_groups, dtype=np.int64)
+                    np.add.at(sums, inverse[non_null], values[non_null].astype(np.int64, copy=False))
+                else:
+                    weights = np.where(non_null, values, 0)
+                    sums = np.bincount(inverse, weights=weights, minlength=n_groups)
+                partials[spec.output_col] = (sums, counts)
+            elif spec.op in {"min", "max"}:
+                partials[spec.output_col] = self._minmax_partials(
+                    spec.op, inverse, values, non_null, n_groups
+                )
+        return partials
+
+    def _minmax_partials(
+        self, op: AggName, inverse: np.ndarray, values: np.ndarray, non_null: np.ndarray, n_groups: int
+    ) -> tuple[np.ndarray, np.ndarray]:
+        if values.dtype.kind in "biufcmM":
+            if op == "min":
+                identity = _max_identity(values.dtype)
+                out = np.full(n_groups, identity, dtype=values.dtype)
+                np.minimum.at(out, inverse[non_null], values[non_null])
+            else:
+                identity = _min_identity(values.dtype)
+                out = np.full(n_groups, identity, dtype=values.dtype)
+                np.maximum.at(out, inverse[non_null], values[non_null])
+        else:
+            out = np.empty(n_groups, dtype=values.dtype)
+            has = np.zeros(n_groups, dtype=bool)
+            for group, value, ok in zip(inverse, values, non_null, strict=True):
+                if not ok:
+                    continue
+                if not has[group] or (value < out[group] if op == "min" else value > out[group]):
+                    out[group] = value
+                    has[group] = True
+            return out, has
+        has_value = np.bincount(inverse, weights=non_null.astype(np.int64), minlength=n_groups) > 0
+        return out, has_value
+
+    def _merge_partials(
+        self,
+        acc: dict[Any, dict[str, _AggState]],
+        key_values: dict[Any, tuple[Any, ...]],
+        normalized_keys: list[Any],
+        display_keys: list[tuple[Any, ...]],
+        partials: dict[str, Any],
+        specs: list[_AggSpec],
+    ) -> None:
+        for i, norm_key in enumerate(normalized_keys):
+            states = acc.setdefault(norm_key, {})
+            key_values.setdefault(norm_key, display_keys[i])
+            for spec in specs:
+                state = states.setdefault(spec.output_col, _AggState(spec.op))
+                partial = partials[spec.output_col]
+                if spec.op in {"size", "count"}:
+                    state.value = (0 if state.value is None else state.value) + int(partial[i])
+                elif spec.op == "sum":
+                    sums, counts = partial
+                    if counts[i] > 0:
+                        state.value = (0 if state.value is None else state.value) + _python_scalar(sums[i])
+                        state.count += int(counts[i])
+                elif spec.op == "mean":
+                    sums, counts = partial
+                    if counts[i] > 0:
+                        state.value = (0.0 if state.value is None else state.value) + float(sums[i])
+                        state.count += int(counts[i])
+                elif spec.op in {"min", "max"}:
+                    values, has_value = partial
+                    if has_value[i]:
+                        value = _python_scalar(values[i])
+                        if (
+                            state.count == 0
+                            or (spec.op == "min" and value < state.value)
+                            or (spec.op == "max" and value > state.value)
+                        ):
+                            state.value = value
+                        state.count += 1
+
+    def _final_rows(
+        self,
+        acc: dict[Any, dict[str, _AggState]],
+        key_values: dict[Any, tuple[Any, ...]],
+        specs: list[_AggSpec],
+    ) -> list[dict[str, Any]]:
+        keys = list(acc)
+        if self.sort:
+            keys.sort(key=lambda k: tuple(_sortable_key_part(v) for v in key_values[k]))
+
+        rows = []
+        for norm_key in keys:
+            row = dict(zip(self.keys, key_values[norm_key], strict=True))
+            states = acc[norm_key]
+            for spec in specs:
+                state = states[spec.output_col]
+                if spec.op == "mean":
+                    row[spec.output_col] = math.nan if state.count == 0 else state.value / state.count
+                elif spec.op in {"sum", "min", "max"} and state.count == 0:
+                    row[spec.output_col] = _null_output_value(self._result_spec_for_agg(spec))
+                else:
+                    row[spec.output_col] = 0 if state.value is None else state.value
+            rows.append(row)
+        return rows
+
+    def _build_result(self, rows: list[dict[str, Any]], specs: list[_AggSpec]):
+        from blosc2.ctable import CTable
+
+        columns = self.keys + [spec.output_col for spec in specs]
+        schema_specs = {name: self._result_spec_for_key(name) for name in self.keys}
+        for spec in specs:
+            schema_specs[spec.output_col] = self._result_spec_for_agg(spec)
+
+        fields = []
+        for name in columns:
+            fields.append((name, _python_type_for_spec(schema_specs[name]), b2_field(schema_specs[name])))
+        row_type = dataclasses.make_dataclass("CTableGroupByRow", fields)
+        data = {name: [row[name] for row in rows] for name in columns}
+        return CTable(row_type, new_data=data, expected_size=max(len(rows), 1), validate=False)
+
+    def _validate_output_names(self, specs: list[_AggSpec]) -> None:
+        """Reject result column names that are not identifiers or that collide."""
+        columns = [*self.keys, *(agg.output_col for agg in specs)]
+        invalid = [col for col in columns if _IDENTIFIER_RE.match(col) is None]
+        if invalid:
+            message = "Phase-1 group_by() result columns must be valid Python identifiers; "
+            raise NotImplementedError(f"{message}unsupported names: {invalid!r}")
+        # A set drops repeats, so a shorter set means at least two columns share a name.
+        if len(set(columns)) < len(columns):
+            raise ValueError("Group-by result column names would not be unique")
+
+    def _result_spec_for_key(self, name: str) -> SchemaSpec:
+        return copy.deepcopy(self.table._schema.columns_by_name[name].spec)
+
+    def _result_spec_for_agg(self, spec: _AggSpec) -> SchemaSpec:
+        """Return the schema spec of the output column produced by *spec*.
+
+        Counts are int64 and means float64.  Sums widen integer/bool inputs to
+        int64 and float inputs to float64.  Everything else (min, max, and sums
+        over other dtypes) keeps a deep copy of the input column's spec.
+        """
+        if spec.op == "mean":
+            return float64()
+        if spec.op in ("size", "count"):
+            return int64()
+        assert spec.input_col is not None
+        source = self.table._schema.columns_by_name[spec.input_col].spec
+        kind = getattr(getattr(source, "dtype", None), "kind", None)
+        if spec.op == "sum" and kind in ("i", "u", "b", "f"):
+            return float64() if kind == "f" else int64()
+        return copy.deepcopy(source)
+
+    def _null_mask(self, name: str, values: np.ndarray, *, is_key: bool) -> np.ndarray:
+        col_info = self.table._schema.columns_by_name[name]
+        spec = col_info.spec
+        if isinstance(spec, DictionarySpec):
+            mask = values == np.int32(spec.null_code)
+            return mask if is_key or getattr(spec, "nullable", False) else np.zeros(len(values), dtype=bool)
+        null_value = getattr(spec, "null_value", None)
+        mask = np.zeros(len(values), dtype=bool)
+        # For keys, treat all NaNs as missing so dropna behaves predictably.
+        # For values, only nullable NaN sentinels are skipped.
+        if values.dtype.kind == "f" and (
+            is_key or (isinstance(null_value, float) and math.isnan(null_value))
+        ):
+            mask |= np.isnan(values)
+        if null_value is not None and not (isinstance(null_value, float) and math.isnan(null_value)):
+            mask |= values == null_value
+        return mask
+
+
+def _normalize_key_part(value: Any) -> Any:
+    """Swap a float NaN for the hashable ``_NAN_KEY`` sentinel; return anything else as is."""
+    is_nan = isinstance(value, float) and math.isnan(value)
+    return _NAN_KEY if is_nan else value
+
+
+def _sortable_key_part(value: Any) -> tuple[int, Any]:
+    """Return a sort key that places missing parts (``None``/NaN) before all real values."""
+    missing = value is None or (isinstance(value, float) and math.isnan(value))
+    # Rank 0 groups every missing part together; rank 1 orders real values naturally.
+    rank_and_value = (0, "") if missing else (1, value)
+    return rank_and_value
+
+
+def _python_scalar(value: Any) -> Any:
+    """Unwrap a NumPy scalar into its native Python equivalent; pass other objects through."""
+    is_numpy_scalar = isinstance(value, np.generic)
+    return value.item() if is_numpy_scalar else value
+
+
+def _python_type_for_spec(spec: SchemaSpec):
+    """Choose the Python type used to annotate a result field described by *spec*.
+
+    Dictionary specs map to ``str`` and Blosc2 bool specs to ``bool``.  Otherwise
+    the NumPy kind of ``spec.dtype`` decides; specs without a recognised kind fall
+    back to their ``python_type`` attribute, or ``object`` when that is absent.
+    """
+    if isinstance(spec, DictionarySpec):
+        return str
+    if isinstance(spec, b2_bool):
+        return bool
+    # NumPy kind code -> builtin annotation type.
+    builtin_for_kind = {"i": int, "u": int, "f": float, "b": bool, "U": str, "S": bytes}
+    kind = getattr(getattr(spec, "dtype", None), "kind", None)
+    if kind in builtin_for_kind:
+        return builtin_for_kind[kind]
+    return getattr(spec, "python_type", object)
+
+
+def _max_identity(dtype: np.dtype):
+    """Return the largest value of *dtype*, the neutral start for a running minimum."""
+    dtype = np.dtype(dtype)
+    if dtype.kind in "iu":
+        return np.iinfo(dtype).max
+    if dtype.kind in "mM":
+        return np.iinfo(np.int64).max
+    # bool tops out at True (a None fill becomes False); other kinds have no identity.
+    return {"f": np.inf, "b": True}.get(dtype.kind)
+
+
+def _min_identity(dtype: np.dtype):
+    """Return the smallest usable value of *dtype*, the neutral start for a running maximum."""
+    dtype = np.dtype(dtype)
+    if dtype.kind in "iu":
+        return np.iinfo(dtype).min
+    if dtype.kind in "mM":
+        return np.iinfo(np.int64).min + 1  # int64 min itself is NaT, which np.maximum propagates
+    # bool bottoms out at False; kinds without an identity still yield None.
+    return {"f": -np.inf, "b": False}.get(dtype.kind)
+
+
+def _null_output_value(spec: SchemaSpec):
+    """Return the placeholder stored when an aggregation saw no non-null input.
+
+    The spec's own ``null_value`` wins when it is set.  Otherwise the NumPy kind
+    of ``spec.dtype`` selects NaN, ``0``, ``False``, ``""`` or ``b""``; specs with
+    no dtype, or any other kind, yield ``None``.
+    """
+    dtype = getattr(spec, "dtype", None)
+    sentinel = getattr(spec, "null_value", None)
+    if sentinel is not None:
+        return sentinel
+
+    # One placeholder per NumPy kind code; unlisted kinds fall through to None.
+    placeholder_for_kind = {"f": math.nan, "i": 0, "u": 0, "b": False, "U": "", "S": b""}
+    kind = None if dtype is None else dtype.kind
+    return placeholder_for_kind.get(kind)
diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py
new file mode 100644
index 00000000..12394fb4
--- /dev/null
+++ b/tests/ctable/test_groupby.py
@@ -0,0 +1,151 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import blosc2
+from blosc2 import CTable
+
+
+@dataclass
+class SalesRow:
+    city: str = blosc2.field(blosc2.string(max_length=16))
+    category: int = blosc2.field(blosc2.int32())
+    sales: float = blosc2.field(blosc2.float64(nullable=True), default=0.0)
+    qty: int = blosc2.field(blosc2.int32(), default=0)
+
+
+DATA = [
+    ("Paris", 1, 10.0, 1),
+    ("Paris", 1, np.nan, 2),
+    ("Rome", 1, 20.0, 3),
+    ("Paris", 2, 30.0, 4),
+    ("Rome", 1, 40.0, 5),
+    ("Berlin", 2, np.nan, 6),
+]
+
+
+def col(table, name):
+    return list(table._cols[name][: table.nrows])
+
+
+def rows(table):
+    return [tuple(table._cols[name][i] for name in table.col_names) for i in range(table.nrows)]
+
+
+def test_groupby_size_counts_rows_per_group():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by("city", sort=True).size()
+
+    assert out.col_names == ["city", "size"]
+    assert rows(out) == [("Berlin", 1), ("Paris", 3), ("Rome", 2)]
+
+
+def test_groupby_count_counts_non_null_values():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by("city", sort=True).count("sales")
+
+    assert out.col_names == ["city", "sales_count"]
+    assert rows(out) == [("Berlin", 0), ("Paris", 2), ("Rome", 2)]
+
+
+def test_groupby_agg_numeric_reductions():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by("city", sort=True).agg({"sales": ["sum", "mean", "min", "max", "count"]})
+
+    assert out.col_names == ["city", "sales_sum", "sales_mean", "sales_min", "sales_max", "sales_count"]
+    got = rows(out)
+    assert got[0][0] == "Berlin"
+    assert np.isnan(got[0][1])
+    assert np.isnan(got[0][2])
+    assert np.isnan(got[0][3])
+    assert np.isnan(got[0][4])
+    assert got[0][5] == 0
+    assert got[1] == ("Paris", 40.0, 20.0, 10.0, 30.0, 2)
+    assert got[2] == ("Rome", 60.0, 30.0, 20.0, 40.0, 2)
+
+
+def test_groupby_multi_key_size():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by(["city", "category"], sort=True).size()
+
+    assert rows(out) == [("Berlin", 2, 1), ("Paris", 1, 2), ("Paris", 2, 1), ("Rome", 1, 2)]
+
+
+def test_groupby_respects_views_and_deleted_rows():
+    t = CTable(SalesRow, new_data=DATA)
+    t.delete(0)
+    view = t.where("qty >= 3")
+
+    out = view.group_by("city", sort=True).size()
+
+    assert rows(out) == [("Berlin", 1), ("Paris", 1), ("Rome", 2)]
+
+
+@dataclass
+class DictRow:
+    city: str = blosc2.field(blosc2.dictionary())
+    sales: int = blosc2.field(blosc2.int32())
+
+
+def test_groupby_dictionary_key_groups_by_decoded_value():
+    t = CTable(DictRow, new_data=[("Paris", 10), ("Rome", 20), ("Paris", 30)])
+
+    out = t.group_by("city", sort=True).agg({"sales": "sum"})
+
+    assert out.col_names == ["city", "sales_sum"]
+    assert rows(out) == [("Paris", 40), ("Rome", 20)]
+
+
+def test_groupby_dictionary_key_beyond_default_code_capacity():
+    data = [("Paris" if i % 2 == 0 else "Rome", 1) for i in range(5000)]
+    t = CTable(DictRow, new_data=data)
+
+    out = t.group_by("city", sort=True).size()
+
+    assert rows(out) == [("Paris", 2500), ("Rome", 2500)]
+
+
+def test_groupby_dropna_key_default_and_false():
+    t = CTable(DictRow, new_data=[("Paris", 10), (None, 20), ("Paris", 30)])
+
+    dropped = t.group_by("city", sort=True).size()
+    kept = t.group_by("city", sort=True, dropna=False).size()
+
+    assert rows(dropped) == [("Paris", 2)]
+    assert rows(kept) == [(None, 1), ("Paris", 2)]
+
+
+def test_groupby_agg_star_size():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by("city", sort=True).agg({"*": "size"})
+
+    assert rows(out) == [("Berlin", 1), ("Paris", 3), ("Rome", 2)]
+
+
+def test_groupby_empty_table_returns_empty_result():
+    t = CTable(SalesRow)
+
+    out = t.group_by("city").size()
+
+    assert out.nrows == 0
+    assert out.col_names == ["city", "size"]
+
+
+def test_groupby_rejects_bad_engine():
+    t = CTable(SalesRow, new_data=DATA)
+
+    with pytest.raises(ValueError):
+        t.group_by("city", engine="cython")

From 110f3e91687e246060e836331c4f2f3a8d08ce4d Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 07:10:03 +0200
Subject: [PATCH 02/17] Fast path for one dense integer/dictionary-code key (8x
 speedup)

---
 src/blosc2/groupby.py | 161 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)

diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index 6dd6874e..af4eb9f2 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -175,6 +175,10 @@ def _validate_value_column(self, name: str) -> None:
 
     def _execute(self, specs: list[_AggSpec]):
         self._validate_output_names(specs)
+        fast = self._try_execute_dense_single_int_key(specs)
+        if fast is not None:
+            return fast
+
         acc: dict[Any, dict[str, _AggState]] = {}
         key_values: dict[Any, tuple[Any, ...]] = {}
 
@@ -214,6 +218,163 @@ def _execute(self, specs: list[_AggSpec]):
         rows = self._final_rows(acc, key_values, specs)
         return self._build_result(rows, specs)
 
+    def _try_execute_dense_single_int_key(self, specs: list[_AggSpec]):  # noqa: C901
+        """Fast path for one dense integer/dictionary-code key.
+
+        This avoids per-chunk ``np.unique`` and Python dictionary merging.  It is
+        intentionally conservative: keys must be non-negative and the observed
+        key range must stay reasonably compact.
+        """
+        if len(self.keys) != 1:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        key_is_dict = self.table._is_dictionary_column(key_info)
+        key_dtype = np.dtype(np.int32) if key_is_dict else getattr(key_info.spec, "dtype", None)
+        if key_dtype is None or key_dtype.kind not in "biu":
+            return None
+        if any(spec.op in {"min", "max"} and spec.input_col is not None for spec in specs):
+            for spec in specs:
+                if spec.op in {"min", "max"} and spec.input_col is not None:
+                    dtype = getattr(self.table._schema.columns_by_name[spec.input_col].spec, "dtype", None)
+                    if dtype is None or np.dtype(dtype).kind not in "biufmM":
+                        return None
+
+        compact_limit = 10_000_000
+        present = np.zeros(0, dtype=bool)
+        states: dict[str, Any] = {}
+        for spec in specs:
+            if spec.op in {"size", "count"}:
+                states[spec.output_col] = np.zeros(0, dtype=np.int64)
+            elif spec.op == "sum":
+                out_dtype = np.int64
+                if spec.input_col is not None:
+                    dtype = np.dtype(self.table._schema.columns_by_name[spec.input_col].spec.dtype)
+                    out_dtype = np.float64 if dtype.kind == "f" else np.int64
+                states[spec.output_col] = np.zeros(0, dtype=out_dtype)
+            elif spec.op == "mean":
+                states[spec.output_col] = (np.zeros(0, dtype=np.float64), np.zeros(0, dtype=np.int64))
+            elif spec.op in {"min", "max"}:
+                assert spec.input_col is not None
+                dtype = np.dtype(self.table._schema.columns_by_name[spec.input_col].spec.dtype)
+                identity = _max_identity(dtype) if spec.op == "min" else _min_identity(dtype)
+                states[spec.output_col] = (np.full(0, identity, dtype=dtype), np.zeros(0, dtype=bool))
+
+        def ensure_size(size: int) -> bool:
+            nonlocal present, states
+            if size > compact_limit:
+                return False
+            if size <= len(present):
+                return True
+            old = len(present)
+            present = np.pad(present, (0, size - old), constant_values=False)
+            for spec in specs:
+                state = states[spec.output_col]
+                if spec.op in {"size", "count", "sum"}:
+                    states[spec.output_col] = np.pad(state, (0, size - old), constant_values=0)
+                elif spec.op == "mean":
+                    sums, counts = state
+                    states[spec.output_col] = (
+                        np.pad(sums, (0, size - old), constant_values=0),
+                        np.pad(counts, (0, size - old), constant_values=0),
+                    )
+                elif spec.op in {"min", "max"}:
+                    values, has = state
+                    dtype = values.dtype
+                    identity = _max_identity(dtype) if spec.op == "min" else _min_identity(dtype)
+                    states[spec.output_col] = (
+                        np.pad(values, (0, size - old), constant_values=identity),
+                        np.pad(has, (0, size - old), constant_values=False),
+                    )
+            return True
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        value_cols = sorted({s.input_col for s in specs if s.input_col is not None})
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+            raw_keys = self._read_key_chunk(key_name, start, stop)
+            live_mask = valid.copy()
+            if self.dropna:
+                live_mask &= ~self._null_mask(key_name, raw_keys, is_key=True)
+            if not np.any(live_mask):
+                continue
+            keys = np.asarray(raw_keys[live_mask])
+            if keys.dtype.kind == "b":
+                keys = keys.astype(np.int8, copy=False)
+            if len(keys) == 0:
+                continue
+            min_key = int(np.min(keys))
+            if min_key < 0:
+                return None
+            max_key = int(np.max(keys))
+            if not ensure_size(max_key + 1):
+                return None
+            present[keys] = True
+            value_chunks = {
+                name: np.asarray(self.table._cols[name][start:stop])[live_mask] for name in value_cols
+            }
+
+            for spec in specs:
+                if spec.op == "size":
+                    states[spec.output_col] += np.bincount(keys, minlength=len(present)).astype(np.int64)
+                    continue
+                assert spec.input_col is not None
+                values = value_chunks[spec.input_col]
+                non_null = ~self._null_mask(spec.input_col, values, is_key=False)
+                if spec.op == "count":
+                    states[spec.output_col] += np.bincount(
+                        keys, weights=non_null.astype(np.int64), minlength=len(present)
+                    ).astype(np.int64)
+                elif spec.op == "sum":
+                    state = states[spec.output_col]
+                    if values.dtype.kind in "biu":
+                        np.add.at(state, keys[non_null], values[non_null].astype(np.int64, copy=False))
+                    else:
+                        state += np.bincount(
+                            keys, weights=np.where(non_null, values, 0), minlength=len(present)
+                        ).astype(state.dtype, copy=False)
+                elif spec.op == "mean":
+                    sums, counts = states[spec.output_col]
+                    sums += np.bincount(keys, weights=np.where(non_null, values, 0), minlength=len(present))
+                    counts += np.bincount(
+                        keys, weights=non_null.astype(np.int64), minlength=len(present)
+                    ).astype(np.int64)
+                elif spec.op in {"min", "max"}:
+                    values_state, has_state = states[spec.output_col]
+                    if spec.op == "min":
+                        np.minimum.at(values_state, keys[non_null], values[non_null])
+                    else:
+                        np.maximum.at(values_state, keys[non_null], values[non_null])
+                    has_state[keys[non_null]] = True
+
+        group_codes = np.nonzero(present)[0]
+        rows = []
+        for code in group_codes:
+            key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else _python_scalar(code)
+            row = {key_name: key_value}
+            for spec in specs:
+                state = states[spec.output_col]
+                if spec.op == "mean":
+                    sums, counts = state
+                    row[spec.output_col] = (
+                        math.nan if counts[code] == 0 else float(sums[code]) / int(counts[code])
+                    )
+                elif spec.op in {"min", "max"}:
+                    values_state, has_state = state
+                    row[spec.output_col] = (
+                        _python_scalar(values_state[code])
+                        if has_state[code]
+                        else _null_output_value(self._result_spec_for_agg(spec))
+                    )
+                else:
+                    row[spec.output_col] = _python_scalar(state[code])
+            rows.append(row)
+        return self._build_result(rows, specs)
+
     def _chunk_size(self) -> int:
         if self.chunk_size is not None:
             if self.chunk_size <= 0:

From 40e58bd3b95b4db56c7743241c8771d90582aa47 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 07:22:48 +0200
Subject: [PATCH 03/17] Optimizing things in Cython (phase 2 in progress). More
 than 2x speedup.

---
 src/blosc2/groupby.py       |  78 +++++++++++++++++++
 src/blosc2/indexing_ext.pyx | 150 ++++++++++++++++++++++++++++++++++++
 2 files changed, 228 insertions(+)

diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index af4eb9f2..b3b1572e 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -175,6 +175,9 @@ def _validate_value_column(self, name: str) -> None:
 
     def _execute(self, specs: list[_AggSpec]):
         self._validate_output_names(specs)
+        fast = self._try_execute_cython_i32_f64_sum(specs)
+        if fast is not None:
+            return fast
         fast = self._try_execute_dense_single_int_key(specs)
         if fast is not None:
             return fast
@@ -218,6 +221,81 @@ def _execute(self, specs: list[_AggSpec]):
         rows = self._final_rows(acc, key_values, specs)
         return self._build_result(rows, specs)
 
+    def _try_execute_cython_i32_f64_sum(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython fast path for one int32 key and one non-null float64 sum."""
+        if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort:
+            return None
+        spec = specs[0]
+        if spec.input_col is None:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        value_info = self.table._schema.columns_by_name[spec.input_col]
+        if self.table._is_dictionary_column(key_info):
+            key_arr = self.table._cols[key_name].codes
+            key_is_dict = True
+            key_null = int(key_info.spec.null_code)
+            skip_key_null = self.dropna
+        else:
+            key_arr = self.table._cols[key_name]
+            key_is_dict = False
+            key_dtype = getattr(key_info.spec, "dtype", None)
+            if key_dtype != np.dtype(np.int32):
+                return None
+            key_null_value = getattr(key_info.spec, "null_value", None)
+            skip_key_null = self.dropna and key_null_value is not None
+            key_null = 0 if key_null_value is None else int(key_null_value)
+        value_dtype = getattr(value_info.spec, "dtype", None)
+        if value_dtype != np.dtype(np.float64) or getattr(value_info.spec, "null_value", None) is not None:
+            return None
+        try:
+            from blosc2 import indexing_ext
+        except ImportError:
+            return None
+        kernel = getattr(indexing_ext, "groupby_dense_i32_f64_sum_checked", None)
+        if kernel is None:
+            return None
+
+        compact_limit = 10_000_000
+        sums = np.zeros(0, dtype=np.float64)
+        present = np.zeros(0, dtype=bool)
+
+        def ensure_size(size: int) -> bool:
+            nonlocal sums, present
+            if size > compact_limit:
+                return False
+            if size <= len(sums):
+                return True
+            old = len(sums)
+            sums = np.pad(sums, (0, size - old), constant_values=0)
+            present = np.pad(present, (0, size - old), constant_values=False)
+            return True
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+            keys = np.asarray(key_arr[start:stop], dtype=np.int32)
+            values = np.asarray(self.table._cols[spec.input_col][start:stop], dtype=np.float64)
+            status = int(kernel(keys, values, valid, sums, present, skip_key_null, key_null, False))
+            if status == -1:
+                return None
+            if status > 0:
+                if not ensure_size(status):
+                    return None
+                status = int(kernel(keys, values, valid, sums, present, skip_key_null, key_null, False))
+                if status != 0:
+                    return None
+
+        rows = []
+        for code in np.nonzero(present)[0]:
+            key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else int(code)
+            rows.append({key_name: key_value, spec.output_col: float(sums[code])})
+        return self._build_result(rows, specs)
+
     def _try_execute_dense_single_int_key(self, specs: list[_AggSpec]):  # noqa: C901
         """Fast path for one dense integer/dictionary-code key.
 
diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx
index 759f5980..e1202edd 100644
--- a/src/blosc2/indexing_ext.pyx
+++ b/src/blosc2/indexing_ext.pyx
@@ -2495,3 +2495,153 @@ def keysort_keys_indices(np.ndarray keys, np.ndarray indices):
         return None
     _keysort_ndarray(keys, indices)
     return None
+
+
+# ----------------------------------------------------------------------
+# Group-reduce kernels
+# ----------------------------------------------------------------------
+
+def groupby_dense_i32_f64_sum(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_null=False,
+    int32_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Accumulate ``sum(values)`` by dense int32 keys.
+
+    This is a low-level CTable group-by helper.  *keys*, *values*, and *valid*
+    are same-length 1-D chunk arrays.  *sums* and *present* are dense group
+    state arrays indexed directly by key value.  Keys must be non-negative and
+    already fit in the state arrays.
+    """
+    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.ndim != 1 or present.ndim != 1:
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype(np.int32):
+        raise TypeError("keys must have dtype int32")
+    if values.dtype != np.dtype(np.float64):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype(np.bool_):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype(np.float64):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype(np.bool_):
+        raise TypeError("present must have dtype bool")
+
+    cdef int32_t[:] keys_view = keys
+    cdef double[:] values_view = values
+    cdef np.npy_bool[:] valid_view = valid
+    cdef double[:] sums_view = sums
+    cdef np.npy_bool[:] present_view = present
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef int32_t key
+    cdef double value
+
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    with nogil:
+        for i in range(n):
+            if not valid_view[i]:
+                continue
+            key = keys_view[i]
+            if skip_key_null and key == key_null:
+                continue
+            if key < 0 or key >= nstates:
+                continue
+            value = values_view[i]
+            if skip_value_nan and value != value:
+                continue
+            sums_view[key] += value
+            present_view[key] = 1
+    return None
+
+
+
+def groupby_dense_i32_f64_sum_checked(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_null=False,
+    int32_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Checked dense int32/float64 sum kernel.
+
+    Returns ``0`` on success, ``-1`` if a negative non-null key is found, or
+    ``max_key + 1`` when the dense state arrays need to be grown.  The state is
+    not mutated unless the function returns ``0``.
+    """
+    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.ndim != 1 or present.ndim != 1:
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype(np.int32):
+        raise TypeError("keys must have dtype int32")
+    if values.dtype != np.dtype(np.float64):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype(np.bool_):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype(np.float64):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype(np.bool_):
+        raise TypeError("present must have dtype bool")
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    cdef int32_t[:] keys_view = keys
+    cdef double[:] values_view = values
+    cdef np.npy_bool[:] valid_view = valid
+    cdef double[:] sums_view = sums
+    cdef np.npy_bool[:] present_view = present
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef int32_t key
+    cdef int32_t max_key = -1
+    cdef int ret = 0
+    cdef double value
+
+    with nogil:
+        for i in range(n):
+            if not valid_view[i]:
+                continue
+            key = keys_view[i]
+            if skip_key_null and key == key_null:
+                continue
+            if key < 0:
+                ret = -1
+                break
+            if key > max_key:
+                max_key = key
+        if ret == 0:
+            if max_key < 0:
+                ret = 0
+            elif max_key >= nstates:
+                ret = <int>max_key + 1
+            else:
+                for i in range(n):
+                    if not valid_view[i]:
+                        continue
+                    key = keys_view[i]
+                    if skip_key_null and key == key_null:
+                        continue
+                    value = values_view[i]
+                    if skip_value_nan and value != value:
+                        continue
+                    sums_view[key] += value
+                    present_view[key] = 1
+    return ret

From 4bbd843730049daff57ce391cd2eef878f24c6f0 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 07:36:48 +0200
Subject: [PATCH 04/17] Acceleration path for float32/64 as groupby keys. 25x
 speedup.

---
 bench/ctable/groupby.py     |  48 ++++++++--
 src/blosc2/groupby.py       |  77 +++++++++++++++
 src/blosc2/indexing_ext.pyx | 184 ++++++++++++++++++++++++++++++++++++
 3 files changed, 302 insertions(+), 7 deletions(-)

diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py
index fb601eb3..68c03551 100644
--- a/bench/ctable/groupby.py
+++ b/bench/ctable/groupby.py
@@ -4,6 +4,7 @@
 Examples
 --------
 python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --op sum
+python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --key-dtype float64 --op sum
 python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas
 """
 
@@ -23,7 +24,7 @@ def parse_int(text: str) -> int:
     return int(text.replace("_", ""))
 
 
-def build_row_type(dictionary: bool):
+def build_row_type(dictionary: bool, key_dtype: str):
     if dictionary:
 
         @dataclasses.dataclass
@@ -31,24 +32,50 @@ class Row:
             key: str = blosc2.field(blosc2.dictionary())
             value: float = blosc2.field(blosc2.float64())
 
-    else:
+    elif key_dtype == "int32":
 
         @dataclasses.dataclass
         class Row:
             key: int = blosc2.field(blosc2.int32())
             value: float = blosc2.field(blosc2.float64())
 
+    elif key_dtype == "int64":
+
+        @dataclasses.dataclass
+        class Row:
+            key: int = blosc2.field(blosc2.int64())
+            value: float = blosc2.field(blosc2.float64())
+
+    elif key_dtype == "float32":
+
+        @dataclasses.dataclass
+        class Row:
+            key: float = blosc2.field(blosc2.float32())
+            value: float = blosc2.field(blosc2.float64())
+
+    elif key_dtype == "float64":
+
+        @dataclasses.dataclass
+        class Row:
+            key: float = blosc2.field(blosc2.float64())
+            value: float = blosc2.field(blosc2.float64())
+
+    else:  # pragma: no cover - argparse choices prevent this
+        raise ValueError(f"unsupported key dtype {key_dtype!r}")
+
     return Row
 
 
-def make_data(nrows: int, ngroups: int, dictionary: bool, seed: int):
+def make_data(nrows: int, ngroups: int, dictionary: bool, key_dtype: str, seed: int):
     rng = np.random.default_rng(seed)
     key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32)
     values = rng.random(nrows, dtype=np.float64)
     if dictionary:
         keys = np.asarray([f"k{code}" for code in key_codes], dtype=object)
+    elif key_dtype in {"float32", "float64"}:
+        keys = key_codes.astype(np.dtype(key_dtype))
     else:
-        keys = key_codes
+        keys = key_codes.astype(np.dtype(key_dtype), copy=False)
     return keys, values
 
 
@@ -58,6 +85,12 @@ def main() -> None:
     parser.add_argument("--groups", type=parse_int, default=1_000)
     parser.add_argument("--chunk-size", type=parse_int, default=None)
     parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key")
+    parser.add_argument(
+        "--key-dtype",
+        choices=["int32", "int64", "float32", "float64"],
+        default="int32",
+        help="Physical dtype for non-dictionary keys. Float keys are generated from group codes cast to float.",
+    )
     parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum")
     parser.add_argument("--sort", action="store_true")
     parser.add_argument("--pandas", action="store_true", help="Also run a pandas comparison if available")
@@ -67,11 +100,12 @@ def main() -> None:
 
     print(
         f"rows={args.rows:,} groups={args.groups:,} dictionary={args.dictionary} "
-        f"op={args.op} sort={args.sort} chunk_size={args.chunk_size} urlpath={args.urlpath}"
+        f"key_dtype={args.key_dtype} op={args.op} sort={args.sort} "
+        f"chunk_size={args.chunk_size} urlpath={args.urlpath}"
     )
 
-    keys, values = make_data(args.rows, args.groups, args.dictionary, args.seed)
-    Row = build_row_type(args.dictionary)
+    keys, values = make_data(args.rows, args.groups, args.dictionary, args.key_dtype, args.seed)
+    Row = build_row_type(args.dictionary, args.key_dtype)
 
     kwargs = {}
     if args.urlpath is not None:
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index b3b1572e..170edab8 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -176,6 +176,9 @@ def _validate_value_column(self, name: str) -> None:
     def _execute(self, specs: list[_AggSpec]):
         self._validate_output_names(specs)
         fast = self._try_execute_cython_i32_f64_sum(specs)
+        if fast is not None:
+            return fast
+        fast = self._try_execute_cython_float_integral_key_f64_sum(specs)
         if fast is not None:
             return fast
         fast = self._try_execute_dense_single_int_key(specs)
@@ -296,6 +299,80 @@ def ensure_size(size: int) -> bool:
             rows.append({key_name: key_value, spec.output_col: float(sums[code])})
         return self._build_result(rows, specs)
 
+    def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython fast path for integral float32/float64 keys and one non-null float64 sum."""
+        if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort:
+            return None
+        spec = specs[0]
+        if spec.input_col is None:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        value_info = self.table._schema.columns_by_name[spec.input_col]
+        key_dtype = getattr(key_info.spec, "dtype", None)
+        value_dtype = getattr(value_info.spec, "dtype", None)
+        if key_dtype not in {np.dtype(np.float32), np.dtype(np.float64)} or value_dtype != np.dtype(
+            np.float64
+        ):
+            return None
+        if getattr(value_info.spec, "null_value", None) is not None:
+            return None
+        # The fast path can skip NaNs.  If dropna=False and NaNs are present,
+        # the Cython kernel reports unsupported and we fall back to generic
+        # grouping, which can materialize a NaN group.
+        skip_key_nan = self.dropna
+        try:
+            from blosc2 import indexing_ext
+        except ImportError:
+            return None
+        kernel_name = (
+            "groupby_dense_f32_integral_key_f64_sum_checked"
+            if key_dtype == np.dtype(np.float32)
+            else "groupby_dense_f64_integral_key_f64_sum_checked"
+        )
+        kernel = getattr(indexing_ext, kernel_name, None)
+        if kernel is None:
+            return None
+
+        compact_limit = 10_000_000
+        sums = np.zeros(0, dtype=np.float64)
+        present = np.zeros(0, dtype=bool)
+
+        def ensure_size(size: int) -> bool:
+            nonlocal sums, present
+            if size > compact_limit:
+                return False
+            if size <= len(sums):
+                return True
+            old = len(sums)
+            sums = np.pad(sums, (0, size - old), constant_values=0)
+            present = np.pad(present, (0, size - old), constant_values=False)
+            return True
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+            keys = np.asarray(self.table._cols[key_name][start:stop], dtype=key_dtype)
+            values = np.asarray(self.table._cols[spec.input_col][start:stop], dtype=np.float64)
+            status = int(kernel(keys, values, valid, sums, present, skip_key_nan, False))
+            if status == -1:
+                return None
+            if status > 0:
+                if not ensure_size(status):
+                    return None
+                status = int(kernel(keys, values, valid, sums, present, skip_key_nan, False))
+                if status != 0:
+                    return None
+
+        rows = [
+            {key_name: float(code), spec.output_col: float(sums[code])} for code in np.nonzero(present)[0]
+        ]
+        return self._build_result(rows, specs)
+
     def _try_execute_dense_single_int_key(self, specs: list[_AggSpec]):  # noqa: C901
         """Fast path for one dense integer/dictionary-code key.
 
diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx
index e1202edd..8c479dcd 100644
--- a/src/blosc2/indexing_ext.pyx
+++ b/src/blosc2/indexing_ext.pyx
@@ -2645,3 +2645,187 @@ def groupby_dense_i32_f64_sum_checked(
                     sums_view[key] += value
                     present_view[key] = 1
     return ret
+
+
+def groupby_dense_f64_integral_key_f64_sum_checked(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_nan=True,
+    bint skip_value_nan=False,
+):
+    """Checked dense float64-integral-key/float64 sum kernel.
+
+    Fast path for float keys that are exactly integral, finite and
+    non-negative.  Returns ``0`` on success, ``-1`` if a key cannot be handled,
+    or ``max_key + 1`` when dense state arrays need to be grown.  The state is
+    not mutated unless the function returns ``0``.
+    """
+    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.ndim != 1 or present.ndim != 1:
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype(np.float64):
+        raise TypeError("keys must have dtype float64")
+    if values.dtype != np.dtype(np.float64):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype(np.bool_):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype(np.float64):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype(np.bool_):
+        raise TypeError("present must have dtype bool")
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    cdef double[:] keys_view = keys
+    cdef double[:] values_view = values
+    cdef np.npy_bool[:] valid_view = valid
+    cdef double[:] sums_view = sums
+    cdef np.npy_bool[:] present_view = present
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef double key_f
+    cdef int64_t key_i
+    cdef int64_t max_key = -1
+    cdef int ret = 0
+    cdef double value
+
+    with nogil:
+        for i in range(n):
+            if not valid_view[i]:
+                continue
+            key_f = keys_view[i]
+            if key_f != key_f:
+                if skip_key_nan:
+                    continue
+                ret = -1
+                break
+            if key_f < 0.0 or key_f > 9223372036854774784.0:
+                ret = -1
+                break
+            key_i = <int64_t>key_f
+            if key_f != <double>key_i:
+                ret = -1
+                break
+            if key_i > max_key:
+                max_key = key_i
+        if ret == 0:
+            if max_key < 0:
+                ret = 0
+            elif max_key >= nstates:
+                if max_key > 2147483646:
+                    ret = -1
+                else:
+                    ret = <int>max_key + 1
+            else:
+                for i in range(n):
+                    if not valid_view[i]:
+                        continue
+                    key_f = keys_view[i]
+                    if key_f != key_f:
+                        if skip_key_nan:
+                            continue
+                        ret = -1
+                        break
+                    key_i = <int64_t>key_f
+                    value = values_view[i]
+                    if skip_value_nan and value != value:
+                        continue
+                    sums_view[key_i] += value
+                    present_view[key_i] = 1
+    return ret
+
+
+def groupby_dense_f32_integral_key_f64_sum_checked(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_nan=True,
+    bint skip_value_nan=False,
+):
+    """Checked dense float32-integral-key/float64 sum kernel."""
+    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.ndim != 1 or present.ndim != 1:
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype(np.float32):
+        raise TypeError("keys must have dtype float32")
+    if values.dtype != np.dtype(np.float64):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype(np.bool_):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype(np.float64):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype(np.bool_):
+        raise TypeError("present must have dtype bool")
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    cdef float[:] keys_view = keys
+    cdef double[:] values_view = values
+    cdef np.npy_bool[:] valid_view = valid
+    cdef double[:] sums_view = sums
+    cdef np.npy_bool[:] present_view = present
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef float key_f
+    cdef int64_t key_i
+    cdef int64_t max_key = -1
+    cdef int ret = 0
+    cdef double value
+
+    with nogil:
+        for i in range(n):
+            if not valid_view[i]:
+                continue
+            key_f = keys_view[i]
+            if key_f != key_f:
+                if skip_key_nan:
+                    continue
+                ret = -1
+                break
+            if key_f < 0.0 or key_f > 16777216.0:
+                ret = -1
+                break
+            key_i = <int64_t>key_f
+            if key_f != <float>key_i:
+                ret = -1
+                break
+            if key_i > max_key:
+                max_key = key_i
+        if ret == 0:
+            if max_key < 0:
+                ret = 0
+            elif max_key >= nstates:
+                if max_key > 2147483646:
+                    ret = -1
+                else:
+                    ret = <int>max_key + 1
+            else:
+                for i in range(n):
+                    if not valid_view[i]:
+                        continue
+                    key_f = keys_view[i]
+                    if key_f != key_f:
+                        if skip_key_nan:
+                            continue
+                        ret = -1
+                        break
+                    key_i = <int64_t>key_f
+                    value = values_view[i]
+                    if skip_value_nan and value != value:
+                        continue
+                    sums_view[key_i] += value
+                    present_view[key_i] = 1
+    return ret

From 02e284b2690e3628e73d23211d2fc5bf9389b233 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 07:54:33 +0200
Subject: [PATCH 05/17] Moved all groupby accelerations into groupby_ext.pyx

---
 CMakeLists.txt               |  12 +-
 src/blosc2/groupby.py        |   8 +-
 src/blosc2/groupby_ext.pyx   | 347 +++++++++++++++++++++++++++++++++++
 src/blosc2/indexing_ext.pyx  | 331 ---------------------------------
 tests/ctable/test_groupby.py |  76 ++++++++
 5 files changed, 438 insertions(+), 336 deletions(-)
 create mode 100644 src/blosc2/groupby_ext.pyx

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 734a4fea..ff4425a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,6 +51,13 @@ add_custom_command(
   DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/indexing_ext.pyx"
   VERBATIM)
 
+add_custom_command(
+  OUTPUT groupby_ext.c
+  COMMAND Python::Interpreter -m cython
+          "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/groupby_ext.pyx" --output-file groupby_ext.c
+  DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/groupby_ext.pyx"
+  VERBATIM)
+
 # ...and add it to the target
 Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI)
 target_sources(blosc2_ext PRIVATE src/blosc2/matmul_kernels.c)
@@ -59,10 +66,12 @@ if(UNIX)
   target_link_libraries(blosc2_ext PRIVATE ${CMAKE_DL_LIBS})
 endif()
 Python_add_library(indexing_ext MODULE indexing_ext.c WITH_SOABI)
+Python_add_library(groupby_ext MODULE groupby_ext.c WITH_SOABI)
 
 # We need to link against NumPy
 target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 target_link_libraries(indexing_ext PRIVATE Python::NumPy)
+target_link_libraries(groupby_ext PRIVATE Python::NumPy)
 
 # Fetch and build miniexpr library
 include(FetchContent)
@@ -99,6 +108,7 @@ endif()
 
 target_compile_features(blosc2_ext PRIVATE c_std_11)
 target_compile_features(indexing_ext PRIVATE c_std_11)
+target_compile_features(groupby_ext PRIVATE c_std_11)
 if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang")
     execute_process(
         COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir
@@ -173,7 +183,7 @@ endif()
 
 # Python extension -> site-packages/blosc2
 install(
-  TARGETS blosc2_ext indexing_ext
+  TARGETS blosc2_ext indexing_ext groupby_ext
   LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/blosc2
 )
 
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index 170edab8..dfeda79d 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -252,10 +252,10 @@ def _try_execute_cython_i32_f64_sum(self, specs: list[_AggSpec]):  # noqa: C901
         if value_dtype != np.dtype(np.float64) or getattr(value_info.spec, "null_value", None) is not None:
             return None
         try:
-            from blosc2 import indexing_ext
+            from blosc2 import groupby_ext
         except ImportError:
             return None
-        kernel = getattr(indexing_ext, "groupby_dense_i32_f64_sum_checked", None)
+        kernel = getattr(groupby_ext, "groupby_dense_i32_f64_sum_checked", None)
         if kernel is None:
             return None
 
@@ -322,7 +322,7 @@ def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]):
         # grouping, which can materialize a NaN group.
         skip_key_nan = self.dropna
         try:
-            from blosc2 import indexing_ext
+            from blosc2 import groupby_ext
         except ImportError:
             return None
         kernel_name = (
@@ -330,7 +330,7 @@ def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]):
             if key_dtype == np.dtype(np.float32)
             else "groupby_dense_f64_integral_key_f64_sum_checked"
         )
-        kernel = getattr(indexing_ext, kernel_name, None)
+        kernel = getattr(groupby_ext, kernel_name, None)
         if kernel is None:
             return None
 
diff --git a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx
new file mode 100644
index 00000000..c621d6fc
--- /dev/null
+++ b/src/blosc2/groupby_ext.pyx
@@ -0,0 +1,347 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+# cython: boundscheck=False, wraparound=False, initializedcheck=False
+
+"""Cython group-reduce kernels for CTable group_by()."""
+
+import numpy as np
+cimport numpy as np
+
+from libc.stdint cimport int32_t, int64_t
+
+
+# ----------------------------------------------------------------------
+# Group-reduce kernels
+# ----------------------------------------------------------------------
+
+def groupby_dense_i32_f64_sum(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_null=False,
+    int32_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Accumulate ``sum(values)`` by dense int32 keys, in place.
+
+    Low-level CTable group-by helper.  *keys*, *values* and *valid* are
+    same-length 1-D arrays; rows with a false *valid* flag are ignored.  *sums*
+    and *present* are dense state arrays indexed directly by key value.  Keys
+    outside ``[0, len(sums))`` are silently skipped, never reported.
+    """
+    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.ndim != 1 or present.ndim != 1:
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype(np.int32):
+        raise TypeError("keys must have dtype int32")
+    if values.dtype != np.dtype(np.float64):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype(np.bool_):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype(np.float64):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype(np.bool_):
+        raise TypeError("present must have dtype bool")
+
+    cdef int32_t[:] keys_view = keys
+    cdef double[:] values_view = values
+    cdef np.npy_bool[:] valid_view = valid
+    cdef double[:] sums_view = sums
+    cdef np.npy_bool[:] present_view = present
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef int32_t key
+    cdef double value
+
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    with nogil:
+        for i in range(n):
+            if not valid_view[i]:
+                continue
+            key = keys_view[i]
+            if skip_key_null and key == key_null:  # row carries the null sentinel
+                continue
+            if key < 0 or key >= nstates:  # out of range: dropped without any error
+                continue
+            value = values_view[i]
+            if skip_value_nan and value != value:  # NaN is the only value != itself
+                continue
+            sums_view[key] += value
+            present_view[key] = 1  # mark the group as seen even if its sum is 0.0
+    return None
+
+
+def groupby_dense_i32_f64_sum_checked(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_null=False,
+    int32_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Checked dense int32/float64 sum kernel.
+
+    Returns ``0`` on success, ``-1`` if a negative non-null key is found or
+    ``max_key + 1`` does not fit a C ``int``, or ``max_key + 1`` when the dense
+    state arrays need to be grown.  State is mutated only when returning ``0``.
+    """
+    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.ndim != 1 or present.ndim != 1:
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype(np.int32):
+        raise TypeError("keys must have dtype int32")
+    if values.dtype != np.dtype(np.float64):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype(np.bool_):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype(np.float64):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype(np.bool_):
+        raise TypeError("present must have dtype bool")
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    cdef int32_t[:] keys_view = keys
+    cdef double[:] values_view = values
+    cdef np.npy_bool[:] valid_view = valid
+    cdef double[:] sums_view = sums
+    cdef np.npy_bool[:] present_view = present
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef int32_t key
+    cdef int32_t max_key = -1
+    cdef int ret = 0
+    cdef double value
+
+    with nogil:
+        for i in range(n):  # pass 1: validate keys and find the largest one
+            if not valid_view[i]:
+                continue
+            key = keys_view[i]
+            if skip_key_null and key == key_null:
+                continue
+            if key < 0:
+                ret = -1
+                break
+            if key > max_key:
+                max_key = key
+        if ret == 0:
+            if max_key < 0:  # no usable key in this chunk: nothing to accumulate
+                ret = 0
+            elif max_key >= nstates:
+                ret = -1 if max_key > 2147483646 else <int>max_key + 1  # INT32_MAX + 1 overflows
+            else:
+                for i in range(n):  # pass 2: every remaining key indexes the state safely
+                    if not valid_view[i]:
+                        continue
+                    key = keys_view[i]
+                    if skip_key_null and key == key_null:
+                        continue
+                    value = values_view[i]
+                    if skip_value_nan and value != value:  # NaN is the only value != itself
+                        continue
+                    sums_view[key] += value
+                    present_view[key] = 1
+    return ret
+
+
+def groupby_dense_f64_integral_key_f64_sum_checked(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_nan=True,
+    bint skip_value_nan=False,
+):
+    """Checked dense float64-integral-key/float64 sum kernel.
+
+    Fast path for float keys that are exactly integral, finite and
+    non-negative.  Returns ``0`` on success, ``-1`` if a key cannot be handled,
+    or ``max_key + 1`` when the dense state arrays need to be grown.  The state is
+    not mutated unless the function returns ``0``.
+    """
+    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.ndim != 1 or present.ndim != 1:
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype(np.float64):
+        raise TypeError("keys must have dtype float64")
+    if values.dtype != np.dtype(np.float64):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype(np.bool_):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype(np.float64):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype(np.bool_):
+        raise TypeError("present must have dtype bool")
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    cdef double[:] keys_view = keys
+    cdef double[:] values_view = values
+    cdef np.npy_bool[:] valid_view = valid
+    cdef double[:] sums_view = sums
+    cdef np.npy_bool[:] present_view = present
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef double key_f
+    cdef int64_t key_i
+    cdef int64_t max_key = -1
+    cdef int ret = 0
+    cdef double value
+
+    with nogil:
+        for i in range(n):  # pass 1: validate every key and find the largest one
+            if not valid_view[i]:
+                continue
+            key_f = keys_view[i]
+            if key_f != key_f:  # NaN key (NaN is the only value != itself)
+                if skip_key_nan:
+                    continue
+                ret = -1
+                break
+            if key_f < 0.0 or key_f > 9223372036854774784.0:  # largest double below 2**63
+                ret = -1
+                break
+            key_i = <int64_t>key_f
+            if key_f != <double>key_i:  # key has a fractional part
+                ret = -1
+                break
+            if key_i > max_key:
+                max_key = key_i
+        if ret == 0:
+            if max_key < 0:  # no usable key in this chunk: nothing to accumulate
+                ret = 0
+            elif max_key >= nstates:
+                if max_key > 2147483646:  # max_key + 1 would not fit the C int result
+                    ret = -1
+                else:
+                    ret = <int>max_key + 1
+            else:
+                for i in range(n):  # pass 2: all keys are now known to be in range
+                    if not valid_view[i]:
+                        continue
+                    key_f = keys_view[i]
+                    if key_f != key_f:
+                        if skip_key_nan:
+                            continue
+                        ret = -1  # not reachable: pass 1 already bailed out on a kept NaN
+                        break
+                    key_i = <int64_t>key_f
+                    value = values_view[i]
+                    if skip_value_nan and value != value:  # skip NaN values on request
+                        continue
+                    sums_view[key_i] += value
+                    present_view[key_i] = 1
+    return ret
+
+
+def groupby_dense_f32_integral_key_f64_sum_checked(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_nan=True,
+    bint skip_value_nan=False,
+):
+    """Checked dense float32-integral-key/float64 sum: 0 ok, -1 unsupported key, else size needed."""
+    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.ndim != 1 or present.ndim != 1:
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype(np.float32):
+        raise TypeError("keys must have dtype float32")
+    if values.dtype != np.dtype(np.float64):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype(np.bool_):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype(np.float64):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype(np.bool_):
+        raise TypeError("present must have dtype bool")
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    cdef float[:] keys_view = keys
+    cdef double[:] values_view = values
+    cdef np.npy_bool[:] valid_view = valid
+    cdef double[:] sums_view = sums
+    cdef np.npy_bool[:] present_view = present
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef float key_f
+    cdef int64_t key_i
+    cdef int64_t max_key = -1
+    cdef int ret = 0
+    cdef double value
+
+    with nogil:
+        for i in range(n):  # pass 1: validate every key and find the largest one
+            if not valid_view[i]:
+                continue
+            key_f = keys_view[i]
+            if key_f != key_f:  # NaN key (NaN is the only value != itself)
+                if skip_key_nan:
+                    continue
+                ret = -1
+                break
+            if key_f < 0.0 or key_f > 16777216.0:  # 2**24: float32 integers are exact up to here
+                ret = -1
+                break
+            key_i = <int64_t>key_f
+            if key_f != <float>key_i:  # key has a fractional part
+                ret = -1
+                break
+            if key_i > max_key:
+                max_key = key_i
+        if ret == 0:
+            if max_key < 0:  # no usable key in this chunk: nothing to accumulate
+                ret = 0
+            elif max_key >= nstates:
+                if max_key > 2147483646:  # cannot trigger here: keys are capped at 2**24 above
+                    ret = -1
+                else:
+                    ret = <int>max_key + 1
+            else:
+                for i in range(n):  # pass 2: all keys are now known to be in range
+                    if not valid_view[i]:
+                        continue
+                    key_f = keys_view[i]
+                    if key_f != key_f:
+                        if skip_key_nan:
+                            continue
+                        ret = -1  # not reachable: pass 1 already bailed out on a kept NaN
+                        break
+                    key_i = <int64_t>key_f
+                    value = values_view[i]
+                    if skip_value_nan and value != value:  # skip NaN values on request
+                        continue
+                    sums_view[key_i] += value
+                    present_view[key_i] = 1
+    return ret
diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx
index 8c479dcd..91072bea 100644
--- a/src/blosc2/indexing_ext.pyx
+++ b/src/blosc2/indexing_ext.pyx
@@ -2498,334 +2498,3 @@ def keysort_keys_indices(np.ndarray keys, np.ndarray indices):
 
 
 # ----------------------------------------------------------------------
-# Group-reduce kernels
-# ----------------------------------------------------------------------
-
-def groupby_dense_i32_f64_sum(
-    np.ndarray keys,
-    np.ndarray values,
-    np.ndarray valid,
-    np.ndarray sums,
-    np.ndarray present,
-    bint skip_key_null=False,
-    int32_t key_null=0,
-    bint skip_value_nan=False,
-):
-    """Accumulate ``sum(values)`` by dense int32 keys.
-
-    This is a low-level CTable group-by helper.  *keys*, *values*, and *valid*
-    are same-length 1-D chunk arrays.  *sums* and *present* are dense group
-    state arrays indexed directly by key value.  Keys must be non-negative and
-    already fit in the state arrays.
-    """
-    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
-        raise ValueError("keys, values and valid must be 1-D arrays")
-    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
-        raise ValueError("keys, values and valid must have the same length")
-    if sums.ndim != 1 or present.ndim != 1:
-        raise ValueError("sums and present must be 1-D arrays")
-    if keys.dtype != np.dtype(np.int32):
-        raise TypeError("keys must have dtype int32")
-    if values.dtype != np.dtype(np.float64):
-        raise TypeError("values must have dtype float64")
-    if valid.dtype != np.dtype(np.bool_):
-        raise TypeError("valid must have dtype bool")
-    if sums.dtype != np.dtype(np.float64):
-        raise TypeError("sums must have dtype float64")
-    if present.dtype != np.dtype(np.bool_):
-        raise TypeError("present must have dtype bool")
-
-    cdef int32_t[:] keys_view = keys
-    cdef double[:] values_view = values
-    cdef np.npy_bool[:] valid_view = valid
-    cdef double[:] sums_view = sums
-    cdef np.npy_bool[:] present_view = present
-    cdef Py_ssize_t n = keys.shape[0]
-    cdef Py_ssize_t nstates = sums.shape[0]
-    cdef Py_ssize_t i
-    cdef int32_t key
-    cdef double value
-
-    if present.shape[0] != sums.shape[0]:
-        raise ValueError("present and sums must have the same length")
-
-    with nogil:
-        for i in range(n):
-            if not valid_view[i]:
-                continue
-            key = keys_view[i]
-            if skip_key_null and key == key_null:
-                continue
-            if key < 0 or key >= nstates:
-                continue
-            value = values_view[i]
-            if skip_value_nan and value != value:
-                continue
-            sums_view[key] += value
-            present_view[key] = 1
-    return None
-
-
-
-def groupby_dense_i32_f64_sum_checked(
-    np.ndarray keys,
-    np.ndarray values,
-    np.ndarray valid,
-    np.ndarray sums,
-    np.ndarray present,
-    bint skip_key_null=False,
-    int32_t key_null=0,
-    bint skip_value_nan=False,
-):
-    """Checked dense int32/float64 sum kernel.
-
-    Returns ``0`` on success, ``-1`` if a negative non-null key is found, or
-    ``max_key + 1`` when the dense state arrays need to be grown.  The state is
-    not mutated unless the function returns ``0``.
-    """
-    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
-        raise ValueError("keys, values and valid must be 1-D arrays")
-    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
-        raise ValueError("keys, values and valid must have the same length")
-    if sums.ndim != 1 or present.ndim != 1:
-        raise ValueError("sums and present must be 1-D arrays")
-    if keys.dtype != np.dtype(np.int32):
-        raise TypeError("keys must have dtype int32")
-    if values.dtype != np.dtype(np.float64):
-        raise TypeError("values must have dtype float64")
-    if valid.dtype != np.dtype(np.bool_):
-        raise TypeError("valid must have dtype bool")
-    if sums.dtype != np.dtype(np.float64):
-        raise TypeError("sums must have dtype float64")
-    if present.dtype != np.dtype(np.bool_):
-        raise TypeError("present must have dtype bool")
-    if present.shape[0] != sums.shape[0]:
-        raise ValueError("present and sums must have the same length")
-
-    cdef int32_t[:] keys_view = keys
-    cdef double[:] values_view = values
-    cdef np.npy_bool[:] valid_view = valid
-    cdef double[:] sums_view = sums
-    cdef np.npy_bool[:] present_view = present
-    cdef Py_ssize_t n = keys.shape[0]
-    cdef Py_ssize_t nstates = sums.shape[0]
-    cdef Py_ssize_t i
-    cdef int32_t key
-    cdef int32_t max_key = -1
-    cdef int ret = 0
-    cdef double value
-
-    with nogil:
-        for i in range(n):
-            if not valid_view[i]:
-                continue
-            key = keys_view[i]
-            if skip_key_null and key == key_null:
-                continue
-            if key < 0:
-                ret = -1
-                break
-            if key > max_key:
-                max_key = key
-        if ret == 0:
-            if max_key < 0:
-                ret = 0
-            elif max_key >= nstates:
-                ret = <int>max_key + 1
-            else:
-                for i in range(n):
-                    if not valid_view[i]:
-                        continue
-                    key = keys_view[i]
-                    if skip_key_null and key == key_null:
-                        continue
-                    value = values_view[i]
-                    if skip_value_nan and value != value:
-                        continue
-                    sums_view[key] += value
-                    present_view[key] = 1
-    return ret
-
-
-def groupby_dense_f64_integral_key_f64_sum_checked(
-    np.ndarray keys,
-    np.ndarray values,
-    np.ndarray valid,
-    np.ndarray sums,
-    np.ndarray present,
-    bint skip_key_nan=True,
-    bint skip_value_nan=False,
-):
-    """Checked dense float64-integral-key/float64 sum kernel.
-
-    Fast path for float keys that are exactly integral, finite and
-    non-negative.  Returns ``0`` on success, ``-1`` if a key cannot be handled,
-    or ``max_key + 1`` when dense state arrays need to be grown.  The state is
-    not mutated unless the function returns ``0``.
-    """
-    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
-        raise ValueError("keys, values and valid must be 1-D arrays")
-    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
-        raise ValueError("keys, values and valid must have the same length")
-    if sums.ndim != 1 or present.ndim != 1:
-        raise ValueError("sums and present must be 1-D arrays")
-    if keys.dtype != np.dtype(np.float64):
-        raise TypeError("keys must have dtype float64")
-    if values.dtype != np.dtype(np.float64):
-        raise TypeError("values must have dtype float64")
-    if valid.dtype != np.dtype(np.bool_):
-        raise TypeError("valid must have dtype bool")
-    if sums.dtype != np.dtype(np.float64):
-        raise TypeError("sums must have dtype float64")
-    if present.dtype != np.dtype(np.bool_):
-        raise TypeError("present must have dtype bool")
-    if present.shape[0] != sums.shape[0]:
-        raise ValueError("present and sums must have the same length")
-
-    cdef double[:] keys_view = keys
-    cdef double[:] values_view = values
-    cdef np.npy_bool[:] valid_view = valid
-    cdef double[:] sums_view = sums
-    cdef np.npy_bool[:] present_view = present
-    cdef Py_ssize_t n = keys.shape[0]
-    cdef Py_ssize_t nstates = sums.shape[0]
-    cdef Py_ssize_t i
-    cdef double key_f
-    cdef int64_t key_i
-    cdef int64_t max_key = -1
-    cdef int ret = 0
-    cdef double value
-
-    with nogil:
-        for i in range(n):
-            if not valid_view[i]:
-                continue
-            key_f = keys_view[i]
-            if key_f != key_f:
-                if skip_key_nan:
-                    continue
-                ret = -1
-                break
-            if key_f < 0.0 or key_f > 9223372036854774784.0:
-                ret = -1
-                break
-            key_i = <int64_t>key_f
-            if key_f != <double>key_i:
-                ret = -1
-                break
-            if key_i > max_key:
-                max_key = key_i
-        if ret == 0:
-            if max_key < 0:
-                ret = 0
-            elif max_key >= nstates:
-                if max_key > 2147483646:
-                    ret = -1
-                else:
-                    ret = <int>max_key + 1
-            else:
-                for i in range(n):
-                    if not valid_view[i]:
-                        continue
-                    key_f = keys_view[i]
-                    if key_f != key_f:
-                        if skip_key_nan:
-                            continue
-                        ret = -1
-                        break
-                    key_i = <int64_t>key_f
-                    value = values_view[i]
-                    if skip_value_nan and value != value:
-                        continue
-                    sums_view[key_i] += value
-                    present_view[key_i] = 1
-    return ret
-
-
-def groupby_dense_f32_integral_key_f64_sum_checked(
-    np.ndarray keys,
-    np.ndarray values,
-    np.ndarray valid,
-    np.ndarray sums,
-    np.ndarray present,
-    bint skip_key_nan=True,
-    bint skip_value_nan=False,
-):
-    """Checked dense float32-integral-key/float64 sum kernel."""
-    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
-        raise ValueError("keys, values and valid must be 1-D arrays")
-    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
-        raise ValueError("keys, values and valid must have the same length")
-    if sums.ndim != 1 or present.ndim != 1:
-        raise ValueError("sums and present must be 1-D arrays")
-    if keys.dtype != np.dtype(np.float32):
-        raise TypeError("keys must have dtype float32")
-    if values.dtype != np.dtype(np.float64):
-        raise TypeError("values must have dtype float64")
-    if valid.dtype != np.dtype(np.bool_):
-        raise TypeError("valid must have dtype bool")
-    if sums.dtype != np.dtype(np.float64):
-        raise TypeError("sums must have dtype float64")
-    if present.dtype != np.dtype(np.bool_):
-        raise TypeError("present must have dtype bool")
-    if present.shape[0] != sums.shape[0]:
-        raise ValueError("present and sums must have the same length")
-
-    cdef float[:] keys_view = keys
-    cdef double[:] values_view = values
-    cdef np.npy_bool[:] valid_view = valid
-    cdef double[:] sums_view = sums
-    cdef np.npy_bool[:] present_view = present
-    cdef Py_ssize_t n = keys.shape[0]
-    cdef Py_ssize_t nstates = sums.shape[0]
-    cdef Py_ssize_t i
-    cdef float key_f
-    cdef int64_t key_i
-    cdef int64_t max_key = -1
-    cdef int ret = 0
-    cdef double value
-
-    with nogil:
-        for i in range(n):
-            if not valid_view[i]:
-                continue
-            key_f = keys_view[i]
-            if key_f != key_f:
-                if skip_key_nan:
-                    continue
-                ret = -1
-                break
-            if key_f < 0.0 or key_f > 16777216.0:
-                ret = -1
-                break
-            key_i = <int64_t>key_f
-            if key_f != <float>key_i:
-                ret = -1
-                break
-            if key_i > max_key:
-                max_key = key_i
-        if ret == 0:
-            if max_key < 0:
-                ret = 0
-            elif max_key >= nstates:
-                if max_key > 2147483646:
-                    ret = -1
-                else:
-                    ret = <int>max_key + 1
-            else:
-                for i in range(n):
-                    if not valid_view[i]:
-                        continue
-                    key_f = keys_view[i]
-                    if key_f != key_f:
-                        if skip_key_nan:
-                            continue
-                        ret = -1
-                        break
-                    key_i = <int64_t>key_f
-                    value = values_view[i]
-                    if skip_value_nan and value != value:
-                        continue
-                    sums_view[key_i] += value
-                    present_view[key_i] = 1
-    return ret
diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py
index 12394fb4..306d6522 100644
--- a/tests/ctable/test_groupby.py
+++ b/tests/ctable/test_groupby.py
@@ -144,6 +144,82 @@ def test_groupby_empty_table_returns_empty_result():
     assert out.col_names == ["city", "size"]
 
 
+@dataclass
+class Int32FloatRow:
+    key: int = blosc2.field(blosc2.int32())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@dataclass
+class Float64KeyRow:
+    key: float = blosc2.field(blosc2.float64())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@dataclass
+class Float32KeyRow:
+    key: float = blosc2.field(blosc2.float32())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@dataclass
+class DictFloatRow:
+    key: str = blosc2.field(blosc2.dictionary())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@pytest.mark.parametrize(
+    ("row_type", "data", "expected"),
+    [
+        (
+            Int32FloatRow,
+            [(0, 1.5), (2, 10.0), (1, 2.5), (2, 3.0), (0, 4.0)],
+            [(0, 5.5), (1, 2.5), (2, 13.0)],  # groups expected in ascending key order
+        ),
+        (
+            Float64KeyRow,
+            [(0.0, 1.5), (2.0, 10.0), (1.0, 2.5), (2.0, 3.0), (0.0, 4.0)],
+            [(0.0, 5.5), (1.0, 2.5), (2.0, 13.0)],  # groups expected in ascending key order
+        ),
+        (
+            Float32KeyRow,
+            [(0.0, 1.5), (2.0, 10.0), (1.0, 2.5), (2.0, 3.0), (0.0, 4.0)],
+            [(0.0, 5.5), (1.0, 2.5), (2.0, 13.0)],  # groups expected in ascending key order
+        ),
+        (
+            DictFloatRow,
+            [("a", 1.5), ("c", 10.0), ("b", 2.5), ("c", 3.0), ("a", 4.0)],
+            [("a", 5.5), ("c", 13.0), ("b", 2.5)],  # first-seen key order here, not sorted
+        ),
+    ],
+)
+def test_groupby_fast_path_sum_variants(row_type, data, expected):
+    t = CTable(row_type, new_data=data)
+
+    out = t.group_by("key").agg({"value": "sum"})
+
+    assert rows(out) == expected  # compares both the group order and the sums
+
+
+def test_groupby_float_integral_fast_path_falls_back_for_non_integral_keys():
+    t = CTable(Float64KeyRow, new_data=[(0.5, 1.0), (1.5, 2.0), (0.5, 3.0)])  # fractional keys
+
+    out = t.group_by("key").agg({"value": "sum"})
+
+    assert rows(out) == [(0.5, 4.0), (1.5, 2.0)]  # 1.0 + 3.0 accumulate under key 0.5
+
+
+def test_groupby_float_integral_fast_path_falls_back_for_nan_group_when_kept():
+    t = CTable(Float64KeyRow, new_data=[(0.0, 1.0), (np.nan, 2.0), (0.0, 3.0)])
+
+    out = t.group_by("key", dropna=False).agg({"value": "sum"})  # keep NaN keys as a group
+
+    got = rows(out)
+    assert got[0] == (0.0, 4.0)
+    assert np.isnan(got[1][0])  # NaN != NaN, so check with isnan instead of ==
+    assert got[1][1] == 2.0
+
+
 def test_groupby_rejects_bad_engine():
     t = CTable(SalesRow, new_data=DATA)
 

From e414af33156ed4e8d03854cace6491f4e850819e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 08:02:27 +0200
Subject: [PATCH 06/17] Add new tests and bench files

---
 bench/ctable/bench_nested_filter_index.py | 205 ++++++++++++++++++++++
 tests/ctable/test_nested_append.py        |  96 ++++++++++
 2 files changed, 301 insertions(+)
 create mode 100644 bench/ctable/bench_nested_filter_index.py
 create mode 100644 tests/ctable/test_nested_append.py

diff --git a/bench/ctable/bench_nested_filter_index.py b/bench/ctable/bench_nested_filter_index.py
new file mode 100644
index 00000000..71d44112
--- /dev/null
+++ b/bench/ctable/bench_nested_filter_index.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+"""Benchmark nested leaf filter/index performance vs flat columns.
+
+Compares a CTable with flat column names against an equivalent one that uses
+dotted nested column names (physically stored under hierarchical _cols/ paths).
+Both tables hold the same data; each filter/index/aggregate operation is timed
+on both to show the overhead (or absence thereof) introduced by the nested layout.
+"""
+
+from __future__ import annotations
+
+import argparse
+import gc
+import time
+from dataclasses import dataclass
+
+import numpy as np
+
+import blosc2
+
+
+# ---------------------------------------------------------------------------
+# Schema helpers
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class FlatRow:
+    trip_begin_lon: float = blosc2.field(blosc2.float64())
+    trip_begin_lat: float = blosc2.field(blosc2.float64())
+    trip_end_lon: float = blosc2.field(blosc2.float64())
+    trip_end_lat: float = blosc2.field(blosc2.float64())
+    payment_fare: float = blosc2.field(blosc2.float64(ge=0))
+
+
+@dataclass
+class NestedRow:
+    """Same fields as FlatRow; the columns are renamed to dotted names once the table is populated."""
+
+    trip_begin_lon: float = blosc2.field(blosc2.float64())
+    trip_begin_lat: float = blosc2.field(blosc2.float64())
+    trip_end_lon: float = blosc2.field(blosc2.float64())
+    trip_end_lat: float = blosc2.field(blosc2.float64())
+    payment_fare: float = blosc2.field(blosc2.float64(ge=0))
+
+
+def _build_data(n: int) -> dict:
+    rng = np.random.default_rng(42)
+    return {
+        "trip_begin_lon": rng.uniform(-88.0, -87.5, n).astype(np.float64),
+        "trip_begin_lat": rng.uniform(41.6, 42.0, n).astype(np.float64),
+        "trip_end_lon": rng.uniform(-88.0, -87.5, n).astype(np.float64),
+        "trip_end_lat": rng.uniform(41.6, 42.0, n).astype(np.float64),
+        "payment_fare": rng.uniform(3.0, 50.0, n).astype(np.float64),
+    }
+
+
+def _build_flat(data: dict, n: int) -> "blosc2.CTable":
+    t = blosc2.CTable(FlatRow, expected_size=n)
+    t.extend(data)
+    return t
+
+
+def _build_nested(data: dict, n: int) -> "blosc2.CTable":
+    t = blosc2.CTable(NestedRow, expected_size=n)
+    t.extend(data)
+    # Rename to dotted nested names
+    t.rename_column("trip_begin_lon", "trip.begin.lon")
+    t.rename_column("trip_begin_lat", "trip.begin.lat")
+    t.rename_column("trip_end_lon", "trip.end.lon")
+    t.rename_column("trip_end_lat", "trip.end.lat")
+    t.rename_column("payment_fare", "payment.fare")
+    return t
+
+
+# ---------------------------------------------------------------------------
+# Timing helper
+# ---------------------------------------------------------------------------
+
+
+def _timeit(fn, repeats: int = 5) -> float:
+    gc.collect()
+    times = []
+    for _ in range(repeats):
+        t0 = time.perf_counter()
+        fn()
+        times.append(time.perf_counter() - t0)
+    return min(times)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    p = argparse.ArgumentParser(description="Benchmark nested vs flat column filter/index/aggregate")
+    p.add_argument("--rows", type=int, default=1_000_000, help="Number of rows (default: 1M)")
+    p.add_argument("--repeats", type=int, default=5, help="Timing repeats (default: 5)")
+    args = p.parse_args()
+
+    N = args.rows
+    R = args.repeats
+
+    print(f"Building tables with {N:,} rows …")
+    data = _build_data(N)
+    flat_data = data.copy()  # shallow copy: same underscore keys and arrays as nested_data below
+    nested_data = {
+        "trip_begin_lon": data["trip_begin_lon"],
+        "trip_begin_lat": data["trip_begin_lat"],
+        "trip_end_lon": data["trip_end_lon"],
+        "trip_end_lat": data["trip_end_lat"],
+        "payment_fare": data["payment_fare"],
+    }
+
+    tf = _build_flat(flat_data, N)
+    tn = _build_nested(nested_data, N)
+    print(f"  flat   col_names: {tf.col_names}")
+    print(f"  nested col_names: {tn.col_names}")
+    print()
+
+    # Build fare indexes. NOTE(review): built before the "full scan" cases too; confirm those ignore them.
+    print("Building indexes …")
+    tf.create_index("payment_fare")
+    tn.create_index("payment.fare")
+    print()
+
+    header = f"{'Operation':<45} {'flat (ms)':>12} {'nested (ms)':>13} {'ratio':>8}"
+    print(header)
+    print("-" * len(header))
+
+    def bench(label, flat_fn, nested_fn):
+        t_flat = _timeit(flat_fn, R) * 1000
+        t_nested = _timeit(nested_fn, R) * 1000
+        ratio = t_nested / t_flat if t_flat > 0 else float("nan")
+        print(f"{label:<45} {t_flat:>12.3f} {t_nested:>13.3f} {ratio:>8.3f}x")
+
+    bench(
+        "where (string expr, full scan)",
+        lambda: tf.where("payment_fare > 20"),
+        lambda: tn.where("payment.fare > 20"),
+    )
+
+    bench(
+        "where (string expr, full scan, nrows)",
+        lambda: tf.where("payment_fare > 20").nrows,
+        lambda: tn.where("payment.fare > 20").nrows,
+    )
+
+    bench(
+        "where (LazyExpr, full scan)",
+        lambda: tf.where(tf["payment_fare"] > 20),
+        lambda: tn.where(tn["payment.fare"] > 20),
+    )
+
+    bench(
+        "where (auto index-accelerated, nrows)",
+        lambda: tf.where("payment_fare > 20").nrows,
+        lambda: tn.where("payment.fare > 20").nrows,
+    )
+
+    bench(
+        "column mean (full scan)",
+        lambda: tf["payment_fare"].mean(),
+        lambda: tn["payment.fare"].mean(),
+    )
+
+    bench(
+        "column sum (full scan)",
+        lambda: tf["payment_fare"].sum(),
+        lambda: tn["payment.fare"].sum(),
+    )
+
+    bench(
+        "column min (full scan)",
+        lambda: tf["trip_begin_lon"].min(),
+        lambda: tn["trip.begin.lon"].min(),
+    )
+
+    bench(
+        "multi-column where (string expr, nrows)",
+        lambda: tf.where("trip_begin_lon > -87.7 and payment_fare > 10").nrows,
+        lambda: tn.where("trip.begin.lon > -87.7 and payment.fare > 10").nrows,
+    )
+
+    bench(
+        "sort_by (single leaf)",
+        lambda: tf.sort_by("payment_fare"),
+        lambda: tn.sort_by("payment.fare"),
+    )
+
+    print()
+    print("ratio < 1 means nested is faster; ratio > 1 means flat is faster.")
+    print("Ratios close to 1.0 indicate the nested path adds negligible overhead.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/ctable/test_nested_append.py b/tests/ctable/test_nested_append.py
new file mode 100644
index 00000000..7be94a6e
--- /dev/null
+++ b/tests/ctable/test_nested_append.py
@@ -0,0 +1,96 @@
+"""Tests for Phase 3.1: append/extend with nested dict rows on tables with dotted column names."""
+
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import blosc2
+
+
+@dataclass
+class FlatTrip:
+    trip_begin_lon: float
+    trip_begin_lat: float
+    payment_fare: float
+
+
+def _make_nested_table():
+    """Create a CTable with dotted (nested) column names via rename."""
+    t = blosc2.CTable(FlatTrip)
+    t.rename_column("trip_begin_lon", "trip.begin.lon")
+    t.rename_column("trip_begin_lat", "trip.begin.lat")
+    t.rename_column("payment_fare", "payment.fare")
+    return t
+
+
+def test_append_nested_dict():
+    """append() accepts a fully-nested dict and flattens it to dotted keys."""
+    t = _make_nested_table()
+    t.append({"trip": {"begin": {"lon": 1.0, "lat": 2.0}}, "payment": {"fare": 10.0}})
+    t.append({"trip": {"begin": {"lon": 3.0, "lat": 4.0}}, "payment": {"fare": 20.0}})
+
+    assert t.nrows == 2
+    np.testing.assert_array_almost_equal(t["trip.begin.lon"][:], [1.0, 3.0])
+    np.testing.assert_array_almost_equal(t["trip.begin.lat"][:], [2.0, 4.0])
+    np.testing.assert_array_almost_equal(t["payment.fare"][:], [10.0, 20.0])
+
+
+def test_append_flat_dotted_dict_unchanged():
+    """append() with already-flat dotted keys continues to work."""
+    t = _make_nested_table()
+    t.append({"trip.begin.lon": 5.0, "trip.begin.lat": 6.0, "payment.fare": 30.0})
+
+    assert t.nrows == 1
+    assert t["trip.begin.lon"][0] == pytest.approx(5.0)
+
+
+def test_extend_list_of_nested_dicts():
+    """extend() with a list of nested dicts flattens each row."""
+    t = _make_nested_table()
+    rows = [
+        {"trip": {"begin": {"lon": 1.0, "lat": 2.0}}, "payment": {"fare": 10.0}},
+        {"trip": {"begin": {"lon": 3.0, "lat": 4.0}}, "payment": {"fare": 20.0}},
+        {"trip": {"begin": {"lon": 5.0, "lat": 6.0}}, "payment": {"fare": 30.0}},
+    ]
+    t.extend(rows)
+
+    assert t.nrows == 3
+    np.testing.assert_array_almost_equal(t["trip.begin.lon"][:], [1.0, 3.0, 5.0])
+    np.testing.assert_array_almost_equal(t["payment.fare"][:], [10.0, 20.0, 30.0])
+
+
+def test_extend_nested_dict_of_arrays():
+    """extend() with a nested dict-of-arrays flattens the outer dict to dotted keys."""
+    t = _make_nested_table()
+    t.extend(
+        {
+            "trip": {"begin": {"lon": [1.0, 2.0, 3.0], "lat": [4.0, 5.0, 6.0]}},
+            "payment": {"fare": [10.0, 20.0, 30.0]},
+        }
+    )
+
+    assert t.nrows == 3
+    np.testing.assert_array_almost_equal(t["trip.begin.lon"][:], [1.0, 2.0, 3.0])
+    np.testing.assert_array_almost_equal(t["trip.begin.lat"][:], [4.0, 5.0, 6.0])
+    np.testing.assert_array_almost_equal(t["payment.fare"][:], [10.0, 20.0, 30.0])
+
+
+def test_append_nested_dict_where_and_attribute_access():
+    """append() with nested dicts integrates correctly with where() and attribute proxy."""
+    t = _make_nested_table()
+    for lon, lat, fare in [(1.0, 2.0, 5.0), (3.0, 4.0, 15.0), (5.0, 6.0, 25.0)]:
+        t.append({"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}})
+
+    view = t.where("payment.fare > 10")
+    assert view.nrows == 2
+    assert t.trip.begin.lon.max() == pytest.approx(5.0)
+
+
+def test_nested_dotted_string_where_in_aggregate():
+    """Aggregate where= strings accept dotted nested column names."""
+    t = _make_nested_table()
+    for lon, lat, fare in [(1.0, 2.0, 5.0), (3.0, 4.0, 15.0), (5.0, 6.0, 25.0)]:
+        t.append({"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}})
+
+    assert t.trip.begin.lon.sum(where="payment.fare > 10") == pytest.approx(8.0)

From eadd91a995e9eb980fe62010995f3a6bb41cb885 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 08:08:30 +0200
Subject: [PATCH 07/17] Updated plan

---
 plans/ctable-groupby.md | 667 ++++++++++++----------------------------
 1 file changed, 201 insertions(+), 466 deletions(-)

diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
index b86ac078..95e82c0e 100644
--- a/plans/ctable-groupby.md
+++ b/plans/ctable-groupby.md
@@ -1,574 +1,309 @@
-# CTable `group_by` implementation plan
+# CTable `group_by` implementation plan — status
 
-## Goals
+This document started as the implementation plan for `CTable.group_by()`.  The
+initial plan has now been executed through Phase 3.  The remaining sections
+record what was completed and what is future work.
 
-Add a `CTable.group_by()` facility that is efficient for columnar, compressed
-CTable storage while keeping the first implementation simple and correct.  The
-long-term goal is to expose a compressed-aware group-reduce primitive that can
-power `CTable.group_by()` and possibly other analytics APIs.
+## Completed
 
-Key design principles:
+### Public API
 
-- Stay columnar: read only grouping columns, aggregation columns, and the live-row mask.
-- Keep memory bounded: process the table chunk-by-chunk; never require materializing all rows.
-- Use indexes opportunistically, but do not require them.
-- Start with a NumPy implementation, then add Cython kernels for hot paths.
-- Keep compressed input columns compressed between chunks; only chunk slices become NumPy buffers.
-
-## Proposed user API
-
-Initial high-level API could be:
+Implemented:
 
 ```python
-t.group_by("city").agg({"sales": "sum", "id": "count"})
-t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"], "price": "max"})
+t.group_by("city").size()
+t.group_by("city").count("sales")
+t.group_by("city").agg({"sales": "sum"})
+t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"]})
 ```
 
-Potential variants:
+Implemented API decisions:
 
-```python
-t.group_by("city", sort=False).agg(...)
-t.group_by("city", engine="auto").agg(...)
-t.group_by("city").count()
-t.group_by("city").sum("sales")
-```
+- `CTable.group_by(...)` returns a lightweight `CTableGroupBy` facade.
+- `CTableGroupBy` is a deferred operation builder, not a `CTable` view.
+- Terminal methods materialize a new in-memory `CTable`.
+- Aggregate result columns are suffixed as `<input>_<agg>`.
+- `CTableGroupBy.size()` means row count per group / SQL `COUNT(*)`.
+- `CTableGroupBy.count(column)` means non-null count / SQL `COUNT(column)`.
+- `CTableGroupBy.agg({"col": "count"})` is equivalent to `CTableGroupBy.count("col")`.
+- `sort=False` is the fast default; `sort=True` sorts output by group keys.
+- `dropna=True` is the default; `dropna=False` keeps null/NaN key groups.
+- No top-level `CTable.size()` or `CTable.count()` was added.
 
-The result should be a new in-memory `CTable` initially.  Persistent output can
-be added later via an `out=`/`urlpath=` option if useful.
+### Phase 1: Python/NumPy implementation
 
-Output column naming should be predictable, for example:
+Implemented files:
 
 ```text
-city, sales_sum, id_count
-country, city, sales_sum, sales_mean, price_max
+src/blosc2/ctable.py      # CTable.group_by()
+src/blosc2/groupby.py     # CTableGroupBy and NumPy fallback engine
 ```
 
-For a single aggregation on a column, decide whether to preserve the original
-column name or always suffix it.  Always suffixing is less ambiguous.
-
-## Supported MVP semantics
-
-Start with:
-
-- Group keys:
-  - fixed-width scalar columns: bool, signed/unsigned ints, floats, datetimes/timedeltas;
-  - dictionary-encoded string columns via integer codes.
-- Aggregations:
-  - `count` / `size`;
-  - `sum`;
-  - `min`;
-  - `max`;
-  - `mean` implemented as `sum + count` during accumulation.
-- Respect live rows (`_valid_rows`) and views.
-- Read only required columns.
-
-Defer initially:
-
-- list columns;
-- vlstring/vlbytes/object/struct scalar columns, except dictionary columns;
-- arbitrary Python aggregators;
-- group-by over computed columns, unless they can be chunk-evaluated cleanly;
-- disk spilling for very high cardinality;
-- parallel hash aggregation.
-
-## Baseline algorithm: chunked hash aggregation
-
-The default implementation should be a chunked hash group-reduce:
-
-```text
-global_accumulator = hash table: group_key -> aggregate state
-
-for each row chunk:
-    read/decompress key column chunk(s)
-    read/decompress aggregation value column chunk(s)
-    read/decompress valid-row mask chunk
-    apply live-row mask
+Implemented functionality:
 
-    build local grouping keys
-    compute local partial aggregates
-    merge local partial aggregates into global_accumulator
+- Chunked, columnar traversal.
+- Reads only group keys, aggregation value columns, and `_valid_rows`.
+- Handles live rows, views, and deleted rows.
+- Supports fixed-width scalar keys and dictionary-encoded string keys.
+- Dictionary keys group by codes and decode only for result materialization.
+- Supports `size`, `count`, `sum`, `mean`, `min`, `max`.
+- Supports multi-key group-by via structured NumPy keys.
+- Supports empty inputs.
+- Falls back to the generic NumPy path for unsupported optimized cases.
 
-finalize aggregate state
-materialize group keys and aggregate columns into a result CTable
-```
+### Phase 1 benchmark harness
 
-The important point is that the global hash table is proportional to the number
-of groups, not to the number of rows:
+Implemented:
 
 ```text
-memory ~= O(number_of_groups * (key_size + aggregate_state_size + hash overhead))
+bench/ctable/groupby.py
 ```
 
-The global accumulator should normally live uncompressed in memory.  It is
-accessed for every chunk merge, so compressing it would likely dominate runtime.
-The compressed-aware aspect is in the input traversal: compressed CTable columns
-are decompressed only one bounded chunk at a time.
-
-## Columnar chunk traversal
-
-Use synchronized physical row ranges.  For each range:
-
-```python
-valid = np.asarray(self._valid_rows[start:stop])
-key1 = np.asarray(self._cols[key1_name][start:stop])
-value = np.asarray(self._cols[value_name][start:stop])
-
-key1 = key1[valid]
-value = value[valid]
-```
-
-Where possible, align chunk ranges with the physical chunks of `_valid_rows` or
-input columns to improve decompression locality.  The exact chunk size should be
-configurable internally; a reasonable default can be based on CTable/NDArray
-chunk sizes, with a cap to avoid excessive temporaries.
-
-For dictionary columns, read codes instead of decoded strings:
-
-```python
-codes = np.asarray(dict_col.codes[start:stop], dtype=np.int32)
-```
-
-Decode codes only when materializing the final result.
-
-## NumPy MVP local grouping
+The benchmark can vary:
 
-For a single key:
+- row count;
+- group cardinality;
+- key dtype via `--key-dtype int32|int64|float32|float64`;
+- dictionary keys via `--dictionary`;
+- operation via `--op size|count|sum|mean|min|max`;
+- sorted output;
+- chunk size;
+- optional persistent `urlpath`;
+- optional pandas comparison.
 
-```python
-unique_keys, inverse = np.unique(keys, return_inverse=True)
-partial_sum = np.bincount(inverse, weights=values)
-partial_count = np.bincount(inverse)
-```
-
-For min/max use `np.minimum.at` / `np.maximum.at` into arrays initialized with
-appropriate identity values.
-
-For multiple fixed-width keys, build a structured array per chunk:
+### Phase 2: optimized paths
 
-```python
-keys = np.empty(n, dtype=[("k0", key0.dtype), ("k1", key1.dtype)])
-keys["k0"] = key0
-keys["k1"] = key1
+Implemented dense NumPy and Cython fast paths for the main benchmark-driven
+cases.
 
-unique_keys, inverse = np.unique(keys, return_inverse=True)
-```
+Optimized cases currently include:
 
-This is simple and should be the initial correctness path.  Costs to be aware of:
+- compact non-negative integer/dictionary-code single keys in Python/NumPy dense mode;
+- `int32 key + float64 sum` in Cython;
+- dictionary-code key + `float64 sum` in Cython;
+- integral `float64 key + float64 sum` in Cython;
+- integral `float32 key + float64 sum` in Cython.
 
-- structured key array allocation and copy per chunk;
-- `np.unique` is generally sort-based;
-- `return_inverse=True` allocates one integer per live row in the chunk;
-- aggregations are separate passes over the inverse.
+These paths avoid the original per-chunk `np.unique(..., return_inverse=True)`
+and Python dictionary merge overhead for compact single-key sum workloads.
 
-These costs are acceptable for the MVP because they are bounded by chunk size.
+Representative benchmark improvements observed during implementation:
 
-## Global accumulator design
+```text
+50M rows, 5k int32 groups, float64 sum:
+  generic/early path: ~0.47 s
+  Cython dense path:  ~0.20–0.22 s
 
-For the Python MVP, a dictionary is adequate:
+50M rows, 5k float64 integral groups, float64 sum:
+  generic path:       ~5.51 s
+  Cython dense path:  ~0.27–0.29 s
 
-```python
-acc: dict[group_key, AggregateState]
+50M rows, 5k float32 integral groups, float64 sum:
+  Cython dense path:  ~0.24–0.25 s
 ```
 
-Where `group_key` is:
+### Phase 3: separate Cython extension
 
-- a Python scalar for single numeric/dictionary keys;
-- a tuple for multi-column keys;
-- a normalized representation for null-aware keys when nullable support is added.
-
-`AggregateState` can store arrays or small Python objects with fields like:
+Implemented:
 
 ```text
-count
-sum
-min
-max
-mean_sum
-mean_count
+src/blosc2/groupby_ext.pyx
 ```
 
-For `mean`, keep `sum` and `count` and divide only during finalization.  For
-multiple aggregations over the same input column, share state when possible
-(e.g. `mean` and `sum` can reuse the same sum).
+Build integration:
 
-For better performance after the API stabilizes, replace parts of this with a
-NumPy-backed accumulator or Cython state object.
+- `CMakeLists.txt` builds, links, and installs `groupby_ext`.
+- Group-by kernels were removed from `indexing_ext.pyx`.
+- `src/blosc2/groupby.py` imports `blosc2.groupby_ext` for optimized kernels.
 
-## Index-aware paths
+Rationale:
 
-Indexes are optional accelerators.
+- Group-by kernels are analytics/query execution code, not indexing internals.
+- A dedicated extension keeps separation of concerns cleaner as optimized paths grow.
 
-### FULL index on a single group key
+### Documentation
 
-A FULL index stores sorted values and positions.  For a single grouping key,
-this can make group-by a sorted scan:
+Implemented user-facing documentation in:
 
 ```text
-obtain sorted positions from FULL index
-scan rows in key order
-detect group boundaries
-reduce contiguous runs
-```
-
-Benefits:
-
-- no hash table needed for the grouping key;
-- no sort needed at query time;
-- output is naturally sorted by key.
-
-This is most useful for:
-
-```python
-t.create_index("city", kind=blosc2.IndexKind.FULL)
-t.group_by("city").agg(...)
+doc/reference/ctable.rst
 ```
 
-Caveats:
+Documented:
 
-- only directly helps single-key group-by;
-- for multi-key group-by, a single-column FULL index only partially helps;
-- stale indexes must be ignored or rebuilt;
-- views/deleted rows still require intersecting with `_valid_rows`.
+- `CTable.group_by()`;
+- returned `CTableGroupBy` object;
+- `size()`, `count()`, `agg()`;
+- examples for row counts, non-null counts, and sums.
 
-### Bucket/segment indexes
+### Tests
 
-The default predicate indexes are useful before group-by, not usually during it:
+Implemented/extended:
 
-```python
-t.where("year == 2024").group_by("city")
+```text
+tests/ctable/test_groupby.py
 ```
 
-The index accelerates `where()`, reducing rows scanned by group-by.  It does not
-by itself provide grouped order.
-
-## Existing `indexing_ext` sort helpers
-
-`indexing_ext.pyx` contains:
-
-- `keysort_values_positions(values, positions)`;
-- `keysort_keys_indices(keys, indices)`.
-
-These sort a 1-D scalar key array in-place while carrying an `int64` side array.
-They are useful for sort/index oriented paths, especially:
-
-- building/reusing FULL indexes;
-- single-key sort-based group-by;
-- dictionary-code group-by where codes are scalar integers.
+Coverage includes:
 
-They are not the main primitive for hash-based group-reduce because hash
-aggregation does not require sorted keys.  They also do not directly support
-multi-column keys, variable-length strings, or fused aggregation.
-
-## Compressed-aware `group_reduce` primitive
+- `size()` row counts;
+- `count(column)` non-null counts;
+- `agg()` with `sum`, `mean`, `min`, `max`, `count`;
+- `agg({"*": "size"})`;
+- multi-key group-by;
+- dictionary string keys;
+- views and deleted rows;
+- empty tables;
+- `dropna=True` / `dropna=False` behavior;
+- bad engine rejection;
+- optimized int32/dictionary/float32/float64 sum variants;
+- fallback for non-integral float keys;
+- fallback for NaN float-key group when `dropna=False`.
 
-Longer term, introduce a lower-level primitive used by `CTable.group_by()`:
+Validation during implementation:
 
-```python
-blosc2.group_reduce(
-    keys=[key_ndarray1, key_ndarray2],
-    values=[value_ndarray1],
-    aggs={"value": ["sum", "count"]},
-    mask=valid_rows,
-    chunk_size=None,
-    engine="auto",
-)
+```text
+pytest tests/ctable/test_groupby.py -q
+pytest tests/ctable -q
 ```
 
-However, the first implementation can live under an internal module, e.g.
-`blosc2.groupby`, before becoming public.
+The full CTable suite passed after Phase 3.
 
-The primitive should be compressed-aware in traversal, not necessarily operate
-on compressed bytes directly.  General key comparison/grouping still needs
-values.  The intended execution is:
+## Current design summary
 
-```text
-read compressed NDArray slices -> NumPy buffers -> local group/reduce -> merge
-```
+The implementation now has three execution layers:
 
-This avoids full-column materialization while keeping the hot loop simple.
+1. Generic chunked NumPy path:
+   - supports the broadest set of Phase-1 semantics;
+   - uses per-chunk local grouping and merges partials globally.
+2. Dense NumPy single-key path:
+   - for compact non-negative integer/dictionary-code keys;
+   - uses dense accumulator arrays where possible.
+3. Cython single-key sum kernels:
+   - for the most important compact/integral key + `float64 sum` cases;
+   - lives in `groupby_ext.pyx`.
 
-## Cython optimization plan
+All optimized paths are conservative and fall back to the generic engine when
+unsupported data or semantics are encountered.
 
-### Phase 1: Python/NumPy only
+## Deferred / future work
 
-Files:
+### Integer-key Cython coverage
 
-```text
-src/blosc2/ctable.py     # public API / GroupBy facade
-src/blosc2/groupby.py    # internal implementation and NumPy engine
-```
+Current Cython integer coverage is focused on `int32` keys.  Future work should
+replace this with fused-type or equivalent kernels covering:
 
-Focus on correctness, tests, API shape, and an early benchmark harness.  The
-benchmark should be added in Phase 1, before any Cython work, so that later
-optimization decisions are driven by numbers rather than intuition.  At minimum,
-add one reusable script under `bench/` that can generate or open a CTable and
-compare:
+- `int8`, `uint8`;
+- `int16`, `uint16`;
+- `int32`, `uint32`;
+- `int64`, `uint64` with compact-range checks.
 
-- chunked NumPy hash group-by;
-- single-key sort/scan group-by where practical;
-- dictionary-code grouping;
-- pandas or DuckDB on an equivalent in-memory/external dataset for rough context.
+For dense group-by, the key range matters more than the dtype.  Smaller integer
+types are naturally compact and should be low-risk fast paths.
 
-The initial benchmark does not need to be exhaustive, but it should record row
-count, cardinality, chunk size, compression parameters, elapsed time, peak memory
-if easy to capture, and whether the input is in-memory, `.b2d`, or `.b2z`.
+### More Cython aggregations
 
-### Phase 2: optimized kernels in `indexing_ext.pyx`
+Current Cython kernels primarily accelerate single-key `float64 sum`.
+Future kernels should cover:
 
-To avoid adding a third extension too early, place initial Cython kernels in
-`src/blosc2/indexing_ext.pyx` under a clearly separated section:
+- `size`;
+- `count`;
+- `mean` via sum/count;
+- `min`;
+- `max`;
+- multiple aggregations in a single fused pass;
+- multiple value columns.
 
-```cython
-# ----------------------------------------------------------------------
-# Group-reduce kernels
-# ----------------------------------------------------------------------
-```
+### Arbitrary float-key hash table
 
-Initial kernels should target high-value simple cases:
+Current float Cython fast paths handle integral float32/float64 keys only.  A
+true float-key hash table would support arbitrary float keys without sorting or
+`np.unique`.
 
-- single `int32`/`int64` key;
-- dictionary-code keys (`int32`);
-- numeric value columns;
-- `count`, `sum`, `min`, `max`, maybe `mean` via sum/count.
+Required semantic decisions/handling:
 
-The Python layer remains responsible for:
+- `dropna=True`: skip NaN keys;
+- `dropna=False`: all NaN keys should form one group;
+- `+0.0` and `-0.0` should likely be the same group;
+- infinities are valid groups;
+- nullable float sentinels must be normalized consistently.
 
-- CTable schema validation;
-- chunk iteration;
-- decompression into NumPy buffers;
-- final result CTable construction;
-- fallback to NumPy for unsupported dtypes.
+### Multi-key Cython hash path
 
-The Cython layer consumes NumPy buffers and updates a hash accumulator or returns
-chunk partial aggregates.
+The generic NumPy path supports multi-key grouping via structured arrays.  Future
+Cython work could hash directly across multiple key arrays, avoiding structured
+key packing, sort-based unique, inverse arrays, and Python merge overhead.
 
-### Phase 3: split to `groupby_ext.pyx` if it grows
+### FULL-index sorted group-by path
 
-If the optimized path grows to include multi-column hash tables, nullable key
-semantics, multiple aggregate state layouts, spilling, or parallel execution,
-move it to a dedicated extension:
+A FULL index on a single grouping key can provide sorted positions.  A future
+sorted-scan group-by path could:
 
 ```text
-src/blosc2/groupby_ext.pyx
+read sorted positions from FULL index
+scan contiguous key runs
+reduce each run
+emit sorted groups naturally
 ```
 
-This is cleaner long-term than overloading `indexing_ext.pyx` indefinitely.
-Avoid putting this functionality in `blosc2_ext.pyx`; group-reduce is a
-higher-level analytics/query primitive, not core compression/NDArray machinery.
+This would be especially useful for high-cardinality single-key group-by and
+for users requesting `sort=True`.
 
-## What custom Cython buys over structured NumPy keys
+### Public `blosc2.group_reduce()`
 
-NumPy structured dtype is a good MVP, but a custom Cython hash reducer can avoid
-several costs:
+Keep lower-level group-reduce machinery internal for now.  Consider exposing a
+public `blosc2.group_reduce()` only after:
 
-- no temporary packed structured key array;
-- no sort-based `np.unique` for every chunk;
-- no `inverse` array of length equal to the chunk;
-- factorization and aggregation can be fused in one pass;
-- multiple aggregations can be updated together;
-- direct processing of CTable's columnar SoA layout;
-- easier future per-thread hash tables and merges.
-
-A typical optimized loop is:
-
-```text
-for i in range(n):
-    key = key_columns[i]
-    slot = hash_lookup_or_insert(key)
-    acc_sum[slot] += value[i]
-    acc_count[slot] += 1
-    acc_min[slot] = min(acc_min[slot], value[i])
-```
-
-For multi-column keys, the Cython path can hash directly across multiple arrays
-without packing them into a structured array first.
+- aggregation semantics are stable;
+- null/NaN behavior is fully documented;
+- output representation is clear;
+- benchmarks show usefulness outside `CTable.group_by()`.
 
-## High-cardinality strategy
+### High-cardinality and memory strategy
 
-Hash aggregation can become memory-heavy when the number of groups approaches
-the number of rows.  Add safeguards and future alternatives:
+Future safeguards/features:
 
 - estimate cardinality from early chunks;
 - expose/keep an internal memory limit;
-- fall back to sort-based group-by when cardinality is too high;
-- use FULL index if available;
-- later: partitioned hash group-by with spill-to-disk.
+- fall back to sort-based grouping when cardinality is too high;
+- use FULL indexes when available;
+- eventually implement partitioned hash group-by with spill-to-disk.
 
-For the MVP, document that very high-cardinality group-by may require memory
-proportional to output cardinality.
+### Parallel execution
 
-## Null and NaN semantics
+Potential future optimization:
 
-Define before finalizing the API:
+- per-thread local accumulators;
+- merge accumulators at chunk or partition boundaries;
+- coordinate with Blosc2 decompression threading to avoid oversubscription.
 
-- Should null sentinel values form their own group, be skipped, or be controlled
-  by `dropna=`?
-- Should float NaNs group together?  NumPy `unique` behavior and hash behavior
-  must be made consistent.
-- Nullable booleans/dictionary null codes need explicit handling.
+### Additional API conveniences
 
-Suggested default, matching common dataframe behavior:
+Potential future user conveniences:
 
 ```python
-t.group_by("key", dropna=True)  # default? skip null keys
-t.group_by("key", dropna=False)  # include null group
+t.group_by("city").sum("sales")
+t.group_by("city").mean("sales")
+t.group_by("city").min("sales")
+t.group_by("city").max("sales")
 ```
 
-But this should be aligned with existing CTable nullable semantics.
-
-## Documentation
-
-Add user-facing docstrings and Sphinx documentation for the new group-by API:
-
-- `CTable.group_by()` docstring with parameters such as `keys`, `sort`,
-  `dropna`, `engine`, and `chunk_size` if exposed;
-- the returned `GroupBy`/`CTableGroupBy` facade docstring, documenting that it
-  is a deferred operation builder, not a `CTable` view;
-- `GroupBy.size()`, `GroupBy.count()`, and `GroupBy.agg()` docstrings;
-- examples in the CTable documentation showing row counts, non-null counts,
-  sums/means, dictionary string grouping, and optional sorted output.
+Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are
+clearly justified outside group-by.
 
-The class may be described as "the object returned by `CTable.group_by()`" and
-need not encourage direct construction.
+### Persistent output
 
-## Tests
+The current result is an in-memory `CTable`.  Future work may add an `out=` or
+`urlpath=` option for persistent grouped output.
 
-Add tests under `tests/ctable/`, covering:
-
-- single-key count/sum/min/max/mean;
-- multi-key group-by;
-- dictionary string key grouping;
-- views and deleted rows;
-- empty table and all-filtered view;
-- different numeric dtypes and bool keys;
-- nullable key behavior once specified;
-- result schema and output column names;
-- consistency with a reference Python/pandas-like implementation;
-- chunk-size variation to ensure chunk-boundary independence;
-- optional FULL-index path returns same results as hash path.
-
-For deterministic tests, sort result rows before comparison unless the API
-guarantees output order.
-
-## Benchmark plan
-
-Add a small but useful benchmark during Phase 1.  This is important because it
-sets the baseline for the NumPy implementation and identifies which Cython
-kernels are worth writing first.
-
-Benchmarks should include:
-
-- low-cardinality single key, e.g. 10 groups over 100M rows;
-- medium cardinality, e.g. 100k groups;
-- high cardinality, near unique keys;
-- dictionary string columns grouped by codes;
-- multi-column keys;
-- multiple aggregations over one value column;
-- multiple value columns;
-- with and without FULL index;
-- persistent `.b2d`/`.b2z` inputs.
-
-Compare:
-
-- Python/NumPy chunked implementation;
-- Cython hash path when available;
-- sort-based path using existing keysort helpers;
-- pandas/duckdb for sanity, where feasible.
-
-## Open decisions and recommended defaults
-
-### Public API and result column names
-
-Recommendation: use a small `GroupBy` facade and an explicit `.agg()` method:
-
-```python
-t.group_by("city").agg({"sales": "sum"})
-t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"], "price": "max"})
-```
+## Related untracked files reviewed
 
-Always suffix aggregate output columns as `<input>_<agg>`:
+During cleanup, these untracked files were reviewed and found non-duplicative:
 
 ```text
-city, sales_sum
-country, city, sales_sum, sales_mean, price_max
-```
-
-This avoids ambiguity and remains stable when users later request multiple
-aggregations on the same input column.  Convenience methods should include at least `GroupBy.size()` and
-`GroupBy.count(column)` early:
-
-```python
-t.group_by("city").size()  # row count per group / COUNT(*)
-t.group_by("city").count("sales")  # non-null sales count / COUNT(sales)
-```
-
-Additional conveniences like `.sum()`, `.mean()`, `.min()`, and `.max()` can be
-added after `.agg()` is stable.
-
-### Output order
-
-Recommendation: make output order configurable, with hash insertion order as the
-fast default and sorted output as an option:
-
-```python
-t.group_by("city", sort=False).agg(...)  # default: fastest
-t.group_by("city", sort=True).agg(...)  # sort by group keys
-```
-
-When a single-key FULL index is used, sorted output can be produced naturally.
-Tests should not depend on default order unless explicitly testing order.
-
-### Null and NaN grouping semantics
-
-Recommendation: provide `dropna=` and default to `True`, matching common
-dataframe behavior:
-
-```python
-t.group_by("key", dropna=True)  # skip rows with null/NaN keys
-t.group_by("key", dropna=False)  # include a null/NaN group
+tests/ctable/test_nested_append.py
+bench/ctable/bench_nested_filter_index.py
 ```
 
-For `dropna=False`, all NaNs in a floating key should belong to one group, and
-nullable sentinels/dictionary null codes should belong to one null group.  The
-NumPy and Cython engines must normalize these cases consistently.
-
-### `size` vs `count`
-
-Recommendation: support both, with distinct meanings, scoped to group-by rather
-than as new top-level `CTable.size()` / `CTable.count()` methods:
-
-- `GroupBy.size()`: number of rows in the group, independent of value-column
-  nulls; equivalent to SQL `COUNT(*)` and pandas `groupby(...).size()`;
-- `GroupBy.count(column)`: number of non-null values for a specific value
-  column; equivalent to SQL `COUNT(column)` and pandas `groupby(...)[column].count()`;
-- `count` aggregation, e.g. `GroupBy.agg({"sales": "count"})`, should be an
-  equivalent spelling for `GroupBy.count("sales")`.
-
-Prefer `size()` over `len()` for the MVP.  Although `len` resembles Python's
-`len()`, `size()` follows pandas group-by terminology and avoids suggesting that
-it returns a single scalar length.  A `len()` alias can be considered later if
-there is demand.
-
-For non-nullable columns, `count(col)` equals `size`.  For nullable columns,
-`count(col)` excludes null sentinels/NaNs according to the column null policy.
-The MVP can implement `GroupBy.size()` first and add nullable-aware `count` as
-nullable aggregate semantics mature.
-
-### Public `blosc2.group_reduce()` exposure
-
-Recommendation: keep `group_reduce` internal at first, e.g. in
-`blosc2.groupby`, until the API and semantics settle through `CTable.group_by()`.
-Expose a public `blosc2.group_reduce()` only after:
-
-- aggregation semantics are stable;
-- null/NaN behavior is documented;
-- output representation is clear;
-- benchmarks show it is useful outside CTable.
-
-### Cython extension placement
-
-Recommendation: start optimized kernels in `indexing_ext.pyx` only for Phase 2,
-under a clearly marked group-reduce section, to avoid build-system churn while
-validating the approach.  If the code grows beyond a few focused kernels or needs
-its own persistent state classes, move it to `groupby_ext.pyx`.  Do not place it
-in `blosc2_ext.pyx`.
+They cover direct nested append/extend correctness and nested flat-vs-dotted
+performance comparisons, respectively, and are worth keeping/adding separately.

From f491f5d472196bca7dac80f5e1920613d85ce0be Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 12:13:49 +0200
Subject: [PATCH 08/17] Fused integer-key kernels and more Cython aggregations

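Add a dense Cython fast path for single-key group-by on compact
integer or dictionary-code keys.

Implemented:

- fused dense integer-key Cython kernels covering `int8`, `uint8`,
  `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64` keys;
- dense integer/dictionary-code Cython path for `size`, `count`, `sum`,
  `mean`, `min`, and `max`;
- float64 value kernels that skip NaN values when NaN is the value
  column's null value;
- int64 value kernels for integer/bool `sum`, `min`, and `max`
  (integer/bool `mean` goes through the float64 mean kernel);
- shared key-presence tracking so groups with all-null values are still
  emitted correctly for `count` and nullable float aggregations.

The dense path returns control to the existing implementations when it
meets negative non-null keys or a non-compact key range.  The benchmark
script now accepts all of the integer key dtypes above via `--key-dtype`.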
---
 bench/ctable/groupby.py      |  25 ++-
 plans/ctable-groupby.md      |  40 ++--
 src/blosc2/groupby.py        | 297 ++++++++++++++++++++++++++++
 src/blosc2/groupby_ext.pyx   | 374 ++++++++++++++++++++++++++++++++++-
 tests/ctable/test_groupby.py |  68 ++++++-
 5 files changed, 777 insertions(+), 27 deletions(-)

diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py
index 68c03551..14bf4b06 100644
--- a/bench/ctable/groupby.py
+++ b/bench/ctable/groupby.py
@@ -32,18 +32,12 @@ class Row:
             key: str = blosc2.field(blosc2.dictionary())
             value: float = blosc2.field(blosc2.float64())
 
-    elif key_dtype == "int32":
+    elif key_dtype in {"int8", "uint8", "int16", "uint16", "int32", "uint32", "int64", "uint64"}:
+        key_spec = getattr(blosc2, key_dtype)()
 
         @dataclasses.dataclass
         class Row:
-            key: int = blosc2.field(blosc2.int32())
-            value: float = blosc2.field(blosc2.float64())
-
-    elif key_dtype == "int64":
-
-        @dataclasses.dataclass
-        class Row:
-            key: int = blosc2.field(blosc2.int64())
+            key: int = blosc2.field(key_spec)
             value: float = blosc2.field(blosc2.float64())
 
     elif key_dtype == "float32":
@@ -87,7 +81,18 @@ def main() -> None:
     parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key")
     parser.add_argument(
         "--key-dtype",
-        choices=["int32", "int64", "float32", "float64"],
+        choices=[
+            "int8",
+            "uint8",
+            "int16",
+            "uint16",
+            "int32",
+            "uint32",
+            "int64",
+            "uint64",
+            "float32",
+            "float64",
+        ],
         default="int32",
         help="Physical dtype for non-dictionary keys. Float keys are generated from group codes cast to float.",
     )
diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
index 95e82c0e..69eb7547 100644
--- a/plans/ctable-groupby.md
+++ b/plans/ctable-groupby.md
@@ -121,6 +121,19 @@ Rationale:
 - Group-by kernels are analytics/query execution code, not indexing internals.
 - A dedicated extension keeps separation of concerns cleaner as optimized paths grow.
 
+### Phase 4: fused integer-key kernels and more Cython aggregations
+
+Implemented:
+
+- fused dense integer-key Cython kernels covering `int8`, `uint8`,
+  `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64` keys;
+- dense integer/dictionary-code Cython path for `size`, `count`, `sum`,
+  `mean`, `min`, and `max`;
+- float64 value kernels with NaN-null skipping where applicable;
+- int64 value kernels for integer/bool `sum`, `min`, and `max`;
+- shared key-presence tracking so groups with all-null values are still
+  emitted correctly for `count` and nullable float aggregations.
+
 ### Documentation
 
 Implemented user-facing documentation in:
@@ -190,29 +203,26 @@ unsupported data or semantics are encountered.
 
 ### Integer-key Cython coverage
 
-Current Cython integer coverage is focused on `int32` keys.  Future work should
-replace this with fused-type or equivalent kernels covering:
-
-- `int8`, `uint8`;
-- `int16`, `uint16`;
-- `int32`, `uint32`;
-- `int64`, `uint64` with compact-range checks.
-
-For dense group-by, the key range matters more than the dtype.  Smaller integer
-types are naturally compact and should be low-risk fast paths.
+Completed for dense compact single-key group-by with fused kernels covering
+`int8`, `uint8`, `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64`.
+The dense path still falls back for negative non-null keys and non-compact key
+ranges.
 
 ### More Cython aggregations
 
-Current Cython kernels primarily accelerate single-key `float64 sum`.
-Future kernels should cover:
+Completed for dense compact integer/dictionary-code single keys:
 
 - `size`;
 - `count`;
+- `sum`;
 - `mean` via sum/count;
 - `min`;
-- `max`;
-- multiple aggregations in a single fused pass;
-- multiple value columns.
+- `max`.
+
+Remaining possible extensions in this area:
+
+- fuse multiple aggregations/value columns into one Cython pass;
+- broaden value-type coverage beyond float64/int64 normalized kernels.
 
 ### Arbitrary float-key hash table
 
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index dfeda79d..9222fbd4 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -175,6 +175,9 @@ def _validate_value_column(self, name: str) -> None:
 
     def _execute(self, specs: list[_AggSpec]):
         self._validate_output_names(specs)
+        fast = self._try_execute_cython_dense_int_key(specs)
+        if fast is not None:
+            return fast
         fast = self._try_execute_cython_i32_f64_sum(specs)
         if fast is not None:
             return fast
@@ -224,6 +227,300 @@ def _execute(self, specs: list[_AggSpec]):
         rows = self._final_rows(acc, key_values, specs)
         return self._build_result(rows, specs)
 
+    def _try_execute_cython_dense_int_key(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython fast path for one compact integer/dictionary key and dense aggregations."""
+        if len(self.keys) != 1:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        key_is_dict = self.table._is_dictionary_column(key_info)
+        if key_is_dict:
+            key_arr = self.table._cols[key_name].codes
+            key_dtype = np.dtype(np.int32)
+            skip_key_null = self.dropna
+            key_null = int(key_info.spec.null_code)
+        else:
+            key_arr = self.table._cols[key_name]
+            key_dtype = getattr(key_info.spec, "dtype", None)
+            if key_dtype is None:
+                return None
+            key_dtype = np.dtype(key_dtype)
+            if key_dtype.kind not in "biu":
+                return None
+            key_null_value = getattr(key_info.spec, "null_value", None)
+            skip_key_null = self.dropna and key_null_value is not None
+            key_null = 0 if key_null_value is None else int(key_null_value)
+
+        try:
+            from blosc2 import groupby_ext
+        except ImportError:
+            return None
+
+        descriptors = []
+        for spec in specs:
+            desc: dict[str, Any] = {"spec": spec, "op": spec.op}
+            if spec.op == "size":
+                kernel = getattr(groupby_ext, "groupby_dense_int_size_checked", None)
+                if kernel is None:
+                    return None
+                desc.update({"kernel": kernel, "state_kind": "counts"})
+                descriptors.append(desc)
+                continue
+
+            if spec.input_col is None:
+                return None
+            value_info = self.table._schema.columns_by_name[spec.input_col]
+            value_dtype = getattr(value_info.spec, "dtype", None)
+            if value_dtype is None:
+                return None
+            value_dtype = np.dtype(value_dtype)
+            null_value = getattr(value_info.spec, "null_value", None)
+
+            if spec.op == "count":
+                kernel = getattr(groupby_ext, "groupby_dense_int_count_checked", None)
+                if kernel is None:
+                    return None
+                desc.update({"kernel": kernel, "state_kind": "counts", "value_dtype": value_dtype})
+            elif spec.op in {"sum", "mean", "min", "max"}:
+                if value_dtype.kind == "f":
+                    skip_nan = isinstance(null_value, float) and math.isnan(null_value)
+                    if null_value is not None and not skip_nan:
+                        return None
+                    suffix = "sum" if spec.op == "sum" else spec.op
+                    kernel = getattr(groupby_ext, f"groupby_dense_int_f64_{suffix}_checked", None)
+                    if kernel is None:
+                        return None
+                    desc.update(
+                        {
+                            "kernel": kernel,
+                            "value_dtype": np.float64,
+                            "value_kind": "f64",
+                            "skip_nan": skip_nan,
+                        }
+                    )
+                elif value_dtype.kind in "biu":
+                    if null_value is not None:
+                        return None
+                    if spec.op == "mean":
+                        kernel = getattr(groupby_ext, "groupby_dense_int_f64_mean_checked", None)
+                        if kernel is None:
+                            return None
+                        desc.update(
+                            {
+                                "kernel": kernel,
+                                "value_dtype": np.float64,
+                                "value_kind": "f64",
+                                "skip_nan": False,
+                            }
+                        )
+                    else:
+                        kernel = getattr(groupby_ext, f"groupby_dense_int_i64_{spec.op}_checked", None)
+                        if kernel is None:
+                            return None
+                        desc.update(
+                            {
+                                "kernel": kernel,
+                                "value_dtype": np.int64,
+                                "value_kind": "i64",
+                                "skip_nan": False,
+                            }
+                        )
+                else:
+                    return None
+                if spec.op in {"sum", "min", "max"}:
+                    desc["state_kind"] = "value_present" if spec.op == "sum" else "extreme"
+                elif spec.op == "mean":
+                    desc["state_kind"] = "mean"
+            else:
+                return None
+            descriptors.append(desc)
+
+        compact_limit = 10_000_000
+        keys_present = np.zeros(0, dtype=bool)
+        states: dict[str, Any] = {}
+        for desc in descriptors:
+            spec = desc["spec"]
+            if desc["state_kind"] == "counts":
+                states[spec.output_col] = np.zeros(0, dtype=np.int64)
+            elif desc["state_kind"] == "mean":
+                states[spec.output_col] = (np.zeros(0, dtype=np.float64), np.zeros(0, dtype=np.int64))
+            elif desc["state_kind"] == "value_present" or desc["state_kind"] == "extreme":
+                dtype = np.float64 if desc["value_kind"] == "f64" else np.int64
+                states[spec.output_col] = (np.zeros(0, dtype=dtype), np.zeros(0, dtype=bool))
+
+        def ensure_size(size: int) -> bool:
+            nonlocal keys_present, states
+            if size > compact_limit:
+                return False
+            if size <= len(keys_present):
+                return True
+            old = len(keys_present)
+            keys_present = np.pad(keys_present, (0, size - old), constant_values=False)
+            for desc in descriptors:
+                spec = desc["spec"]
+                state = states[spec.output_col]
+                if desc["state_kind"] == "counts":
+                    states[spec.output_col] = np.pad(state, (0, size - old), constant_values=0)
+                else:
+                    first, second = state
+                    states[spec.output_col] = (
+                        np.pad(first, (0, size - old), constant_values=0),
+                        np.pad(
+                            second, (0, size - old), constant_values=False if second.dtype == np.bool_ else 0
+                        ),
+                    )
+            return True
+
+        def call_checked(kernel, *args) -> bool:
+            return int(kernel(*args)) == 0
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+            keys = np.asarray(key_arr[start:stop], dtype=np.int8 if key_dtype.kind == "b" else key_dtype)
+            keys = np.ascontiguousarray(keys)
+            valid = np.ascontiguousarray(valid)
+            live = valid.copy()
+            if skip_key_null:
+                live &= keys != key_null
+            if not np.any(live):
+                continue
+            live_keys = keys[live]
+            if np.min(live_keys) < 0:
+                return None
+            max_key = int(np.max(live_keys))
+            if not ensure_size(max_key + 1):
+                return None
+
+            for desc in descriptors:
+                spec = desc["spec"]
+                state = states[spec.output_col]
+                if spec.op == "size":
+                    if not call_checked(
+                        desc["kernel"], keys, valid, state, keys_present, skip_key_null, key_null
+                    ):
+                        return None
+                elif spec.op == "count":
+                    values = np.asarray(self.table._cols[spec.input_col][start:stop])
+                    values_valid = np.ascontiguousarray(
+                        ~self._null_mask(spec.input_col, values, is_key=False)
+                    )
+                    if not call_checked(
+                        desc["kernel"],
+                        keys,
+                        valid,
+                        values_valid,
+                        state,
+                        keys_present,
+                        skip_key_null,
+                        key_null,
+                    ):
+                        return None
+                elif spec.op == "sum":
+                    values = np.asarray(
+                        self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"]
+                    )
+                    values = np.ascontiguousarray(values)
+                    sums, value_present = state
+                    args = (
+                        keys,
+                        values,
+                        valid,
+                        sums,
+                        value_present,
+                        keys_present,
+                        skip_key_null,
+                        key_null,
+                    )
+                    if desc["value_kind"] == "f64":
+                        args = (*args, desc["skip_nan"])
+                    if not call_checked(desc["kernel"], *args):
+                        return None
+                elif spec.op == "mean":
+                    values = np.asarray(
+                        self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"]
+                    )
+                    values = np.ascontiguousarray(values)
+                    sums, counts = state
+                    if not call_checked(
+                        desc["kernel"],
+                        keys,
+                        values,
+                        valid,
+                        sums,
+                        counts,
+                        keys_present,
+                        skip_key_null,
+                        key_null,
+                        desc["skip_nan"],
+                    ):
+                        return None
+                elif spec.op in {"min", "max"}:
+                    values = np.asarray(
+                        self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"]
+                    )
+                    values = np.ascontiguousarray(values)
+                    extremes, has_value = state
+                    args = (
+                        keys,
+                        values,
+                        valid,
+                        extremes,
+                        has_value,
+                        keys_present,
+                        skip_key_null,
+                        key_null,
+                    )
+                    if desc["value_kind"] == "f64":
+                        args = (*args, desc["skip_nan"])
+                    if not call_checked(desc["kernel"], *args):
+                        return None
+
+        group_codes = np.nonzero(keys_present)[0]
+        if self.sort and key_is_dict:
+            group_codes = np.array(
+                sorted(
+                    group_codes,
+                    key=lambda code: _sortable_key_part(self.table._cols[key_name].decode(int(code))),
+                ),
+                dtype=group_codes.dtype,
+            )
+
+        rows = []
+        for code in group_codes:
+            key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else _python_scalar(code)
+            row = {key_name: key_value}
+            for desc in descriptors:
+                spec = desc["spec"]
+                state = states[spec.output_col]
+                if spec.op in {"size", "count"}:
+                    row[spec.output_col] = int(state[code])
+                elif spec.op == "sum":
+                    sums, value_present = state
+                    row[spec.output_col] = (
+                        _python_scalar(sums[code])
+                        if value_present[code]
+                        else _null_output_value(self._result_spec_for_agg(spec))
+                    )
+                elif spec.op == "mean":
+                    sums, counts = state
+                    row[spec.output_col] = (
+                        math.nan if counts[code] == 0 else float(sums[code]) / int(counts[code])
+                    )
+                elif spec.op in {"min", "max"}:
+                    extremes, has_value = state
+                    row[spec.output_col] = (
+                        _python_scalar(extremes[code])
+                        if has_value[code]
+                        else _null_output_value(self._result_spec_for_agg(spec))
+                    )
+            rows.append(row)
+        return self._build_result(rows, specs)
+
     def _try_execute_cython_i32_f64_sum(self, specs: list[_AggSpec]):  # noqa: C901
         """Cython fast path for one int32 key and one non-null float64 sum."""
         if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort:
diff --git a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx
index c621d6fc..d8475f87 100644
--- a/src/blosc2/groupby_ext.pyx
+++ b/src/blosc2/groupby_ext.pyx
@@ -11,7 +11,7 @@
 import numpy as np
 cimport numpy as np
 
-from libc.stdint cimport int32_t, int64_t
+from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t
 
 
 # ----------------------------------------------------------------------
@@ -345,3 +345,375 @@ def groupby_dense_f32_integral_key_f64_sum_checked(
                     sums_view[key_i] += value
                     present_view[key_i] = 1
     return ret
+
+
+# ----------------------------------------------------------------------
+# Fused integer-key dense kernels
+# ----------------------------------------------------------------------
+
+ctypedef fused dense_int_key_t:  # key element type accepted by the dense integer-key kernels in this section
+    int8_t
+    uint8_t
+    int16_t
+    uint16_t
+    int32_t
+    uint32_t
+    int64_t
+    uint64_t
+
+
+cdef inline int _dense_int_key_scan(  # read-only pre-pass over the keys; the outcome is written to ret[0]
+    dense_int_key_t[:] keys_view,
+    np.npy_bool[:] valid_view,  # rows whose entry is false are ignored
+    Py_ssize_t n,  # number of rows to scan
+    Py_ssize_t nstates,  # number of dense state slots the caller currently has
+    bint skip_key_null,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null,
+    int* ret,  # out: 0 = keys fit in nstates, -1 = negative or too-large key, >0 = max key + 1
+) noexcept nogil:
+    cdef Py_ssize_t i
+    cdef int64_t key
+    cdef int64_t max_key = -1  # stays -1 when no row survives the valid/null filters
+    ret[0] = 0
+    for i in range(n):
+        if not valid_view[i]:
+            continue
+        key = <int64_t>keys_view[i]
+        if skip_key_null and key == key_null:
+            continue
+        if key < 0:  # a negative key cannot be used as a dense state index
+            ret[0] = -1
+            return 0
+        if key > max_key:
+            max_key = key
+    if max_key < 0:
+        ret[0] = 0
+    elif max_key >= nstates:  # largest key does not fit the current state arrays
+        if max_key > 2147483646:  # presumably guards <int>max_key + 1 against overflowing a 32-bit int
+            ret[0] = -1
+        else:
+            ret[0] = <int>max_key + 1
+    return 0  # the return value itself is always 0; callers read the outcome from ret
+
+
+def groupby_dense_int_size_checked(
+    dense_int_key_t[:] keys,
+    np.npy_bool[:] valid,  # rows whose entry is false are ignored
+    int64_t[:] counts,  # in/out: counts[k] is incremented once per selected row with key k
+    np.npy_bool[:] keys_present,  # in/out: set to 1 for every key value that was seen
+    bint skip_key_null=False,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null=0,
+):
+    """Checked dense integer-key ``size`` kernel for all integer key widths."""
+    if keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys and valid must have the same length")
+    if counts.shape[0] != keys_present.shape[0]:
+        raise ValueError("counts and keys_present must have the same length")
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = counts.shape[0]
+    cdef Py_ssize_t i
+    cdef int64_t key
+    cdef int ret  # outcome of the key scan; 0 means every selected key is in [0, nstates)
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # accumulate only after the scan accepted all keys
+            for i in range(n):
+                if not valid[i]:
+                    continue
+                key = <int64_t>keys[i]
+                if skip_key_null and key == key_null:
+                    continue
+                counts[key] += 1
+                keys_present[key] = 1
+    return ret  # 0 = states updated; -1 or a positive required size = states left untouched
+
+
+def groupby_dense_int_count_checked(
+    dense_int_key_t[:] keys,
+    np.npy_bool[:] valid,  # rows whose entry is false are ignored
+    np.npy_bool[:] values_valid,  # per-row flag: true when the row's value counts as non-null
+    int64_t[:] counts,  # in/out: counts[k] is incremented for each selected row with a valid value
+    np.npy_bool[:] keys_present,  # in/out: set to 1 for every key value seen, even with no valid value
+    bint skip_key_null=False,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null=0,
+):
+    """Checked dense integer-key non-null count kernel."""
+    if keys.shape[0] != valid.shape[0] or keys.shape[0] != values_valid.shape[0]:
+        raise ValueError("keys, valid and values_valid must have the same length")
+    if counts.shape[0] != keys_present.shape[0]:
+        raise ValueError("counts and keys_present must have the same length")
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = counts.shape[0]
+    cdef Py_ssize_t i
+    cdef int64_t key
+    cdef int ret  # outcome of the key scan; 0 means every selected key is in [0, nstates)
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # accumulate only after the scan accepted all keys
+            for i in range(n):
+                if not valid[i]:
+                    continue
+                key = <int64_t>keys[i]
+                if skip_key_null and key == key_null:
+                    continue
+                keys_present[key] = 1  # the group exists even if none of its values is counted
+                if values_valid[i]:
+                    counts[key] += 1
+    return ret  # 0 = states updated; -1 or a positive required size = states left untouched
+
+
+def groupby_dense_int_f64_sum_checked(
+    dense_int_key_t[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,  # rows whose entry is false are ignored
+    double[:] sums,  # in/out: sums[k] accumulates the values of key k
+    np.npy_bool[:] value_present,  # in/out: set to 1 once key k received at least one summed value
+    np.npy_bool[:] keys_present,  # in/out: set to 1 for every key value seen
+    bint skip_key_null=False,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null=0,
+    bint skip_value_nan=False,  # when true, NaN values are not added to the sum
+):
+    """Checked dense integer-key float64 sum kernel."""
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.shape[0] != value_present.shape[0] or sums.shape[0] != keys_present.shape[0]:
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef int64_t key
+    cdef double value
+    cdef int ret  # outcome of the key scan; 0 means every selected key is in [0, nstates)
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # accumulate only after the scan accepted all keys
+            for i in range(n):
+                if not valid[i]:
+                    continue
+                key = <int64_t>keys[i]
+                if skip_key_null and key == key_null:
+                    continue
+                keys_present[key] = 1  # marked before the NaN check, so all-NaN groups still exist
+                value = values[i]
+                if skip_value_nan and value != value:  # value != value is true only for NaN
+                    continue
+                sums[key] += value
+                value_present[key] = 1
+    return ret  # 0 = states updated; -1 or a positive required size = states left untouched
+
+
+def groupby_dense_int_i64_sum_checked(
+    dense_int_key_t[:] keys,
+    int64_t[:] values,
+    np.npy_bool[:] valid,  # rows whose entry is false are ignored
+    int64_t[:] sums,  # in/out: sums[k] accumulates the values of key k
+    np.npy_bool[:] value_present,  # in/out: set to 1 once key k received at least one value
+    np.npy_bool[:] keys_present,  # in/out: set to 1 for every key value seen
+    bint skip_key_null=False,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null=0,
+):
+    """Checked dense integer-key int64 sum kernel."""
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.shape[0] != value_present.shape[0] or sums.shape[0] != keys_present.shape[0]:
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef int64_t key
+    cdef int ret  # outcome of the key scan; 0 means every selected key is in [0, nstates)
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # accumulate only after the scan accepted all keys
+            for i in range(n):
+                if not valid[i]:
+                    continue
+                key = <int64_t>keys[i]
+                if skip_key_null and key == key_null:
+                    continue
+                keys_present[key] = 1
+                sums[key] += values[i]  # NOTE(review): no overflow check on the int64 accumulator
+                value_present[key] = 1
+    return ret  # 0 = states updated; -1 or a positive required size = states left untouched
+
+
+def groupby_dense_int_f64_mean_checked(
+    dense_int_key_t[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,  # rows whose entry is false are ignored
+    double[:] sums,  # in/out: sums[k] accumulates the values of key k
+    int64_t[:] counts,  # in/out: counts[k] is the number of values added to sums[k]
+    np.npy_bool[:] keys_present,  # in/out: set to 1 for every key value seen
+    bint skip_key_null=False,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null=0,
+    bint skip_value_nan=False,  # when true, NaN values are neither summed nor counted
+):
+    """Checked dense integer-key float64 mean state kernel."""
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.shape[0] != counts.shape[0] or sums.shape[0] != keys_present.shape[0]:
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef int64_t key
+    cdef double value
+    cdef int ret  # outcome of the key scan; 0 means every selected key is in [0, nstates)
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # accumulate only after the scan accepted all keys
+            for i in range(n):
+                if not valid[i]:
+                    continue
+                key = <int64_t>keys[i]
+                if skip_key_null and key == key_null:
+                    continue
+                keys_present[key] = 1  # marked before the NaN check, so all-NaN groups still exist
+                value = values[i]
+                if skip_value_nan and value != value:  # value != value is true only for NaN
+                    continue
+                sums[key] += value  # only sum and count are stored; no division happens in this kernel
+                counts[key] += 1
+    return ret  # 0 = states updated; -1 or a positive required size = states left untouched
+
+
+def groupby_dense_int_f64_min_checked(
+    dense_int_key_t[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,  # rows whose entry is false are ignored
+    double[:] mins,  # in/out: mins[k] is the smallest value seen for key k (meaningful when has_value[k])
+    np.npy_bool[:] has_value,  # in/out: set to 1 once key k received at least one value
+    np.npy_bool[:] keys_present,  # in/out: set to 1 for every key value seen
+    bint skip_key_null=False,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null=0,
+    bint skip_value_nan=False,  # when true, NaN values never become the minimum
+):
+    """Checked dense integer-key float64 min kernel; raises ValueError on mismatched array lengths."""
+    cdef Py_ssize_t i, n = keys.shape[0], nstates = mins.shape[0]
+    if not (n == values.shape[0] == valid.shape[0] and nstates == has_value.shape[0] == keys_present.shape[0]):
+        raise ValueError("keys, values and valid (and the state arrays) must have the same length")
+    cdef int64_t key
+    cdef double value
+    cdef int ret  # outcome of the key scan; 0 means every selected key is in [0, nstates)
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # accumulate only after the scan accepted all keys
+            for i in range(n):
+                if not valid[i]:
+                    continue
+                key = <int64_t>keys[i]
+                if skip_key_null and key == key_null:
+                    continue
+                keys_present[key] = 1  # marked before the NaN check, so all-NaN groups still exist
+                value = values[i]
+                if skip_value_nan and value != value:  # value != value is true only for NaN
+                    continue
+                if not has_value[key] or value < mins[key]:
+                    mins[key] = value
+                has_value[key] = 1
+    return ret  # 0 = states updated; -1 or a positive required size = states left untouched
+
+
+def groupby_dense_int_f64_max_checked(
+    dense_int_key_t[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,  # rows whose entry is false are ignored
+    double[:] maxs,  # in/out: maxs[k] is the largest value seen for key k (meaningful when has_value[k])
+    np.npy_bool[:] has_value,  # in/out: set to 1 once key k received at least one value
+    np.npy_bool[:] keys_present,  # in/out: set to 1 for every key value seen
+    bint skip_key_null=False,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null=0,
+    bint skip_value_nan=False,  # when true, NaN values never become the maximum
+):
+    """Checked dense integer-key float64 max kernel; raises ValueError on mismatched array lengths."""
+    cdef Py_ssize_t i, n = keys.shape[0], nstates = maxs.shape[0]
+    if not (n == values.shape[0] == valid.shape[0] and nstates == has_value.shape[0] == keys_present.shape[0]):
+        raise ValueError("keys, values and valid (and the state arrays) must have the same length")
+    cdef int64_t key
+    cdef double value
+    cdef int ret  # outcome of the key scan; 0 means every selected key is in [0, nstates)
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # accumulate only after the scan accepted all keys
+            for i in range(n):
+                if not valid[i]:
+                    continue
+                key = <int64_t>keys[i]
+                if skip_key_null and key == key_null:
+                    continue
+                keys_present[key] = 1  # marked before the NaN check, so all-NaN groups still exist
+                value = values[i]
+                if skip_value_nan and value != value:  # value != value is true only for NaN
+                    continue
+                if not has_value[key] or value > maxs[key]:
+                    maxs[key] = value
+                has_value[key] = 1
+    return ret  # 0 = states updated; -1 or a positive required size = states left untouched
+
+
+def groupby_dense_int_i64_min_checked(
+    dense_int_key_t[:] keys,
+    int64_t[:] values,
+    np.npy_bool[:] valid,  # rows whose entry is false are ignored
+    int64_t[:] mins,  # in/out: mins[k] is the smallest value seen for key k (meaningful when has_value[k])
+    np.npy_bool[:] has_value,  # in/out: set to 1 once key k received at least one value
+    np.npy_bool[:] keys_present,  # in/out: set to 1 for every key value seen
+    bint skip_key_null=False,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null=0,
+):
+    """Checked dense integer-key int64 min kernel; raises ValueError on mismatched array lengths."""
+    cdef Py_ssize_t i, n = keys.shape[0], nstates = mins.shape[0]
+    if not (n == values.shape[0] == valid.shape[0] and nstates == has_value.shape[0] == keys_present.shape[0]):
+        raise ValueError("keys, values and valid (and the state arrays) must have the same length")
+    cdef int64_t key
+    cdef int64_t value
+    cdef int ret  # outcome of the key scan; 0 means every selected key is in [0, nstates)
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # accumulate only after the scan accepted all keys
+            for i in range(n):
+                if not valid[i]:
+                    continue
+                key = <int64_t>keys[i]
+                if skip_key_null and key == key_null:
+                    continue
+                keys_present[key] = 1
+                value = values[i]
+                if not has_value[key] or value < mins[key]:
+                    mins[key] = value
+                has_value[key] = 1
+    return ret  # 0 = states updated; -1 or a positive required size = states left untouched
+
+
+def groupby_dense_int_i64_max_checked(
+    dense_int_key_t[:] keys,
+    int64_t[:] values,
+    np.npy_bool[:] valid,  # rows whose entry is false are ignored
+    int64_t[:] maxs,  # in/out: maxs[k] is the largest value seen for key k (meaningful when has_value[k])
+    np.npy_bool[:] has_value,  # in/out: set to 1 once key k received at least one value
+    np.npy_bool[:] keys_present,  # in/out: set to 1 for every key value seen
+    bint skip_key_null=False,  # when true, rows whose key equals key_null are ignored
+    int64_t key_null=0,
+):
+    """Checked dense integer-key int64 max kernel; raises ValueError on mismatched array lengths."""
+    cdef Py_ssize_t i, n = keys.shape[0], nstates = maxs.shape[0]
+    if not (n == values.shape[0] == valid.shape[0] and nstates == has_value.shape[0] == keys_present.shape[0]):
+        raise ValueError("keys, values and valid (and the state arrays) must have the same length")
+    cdef int64_t key
+    cdef int64_t value
+    cdef int ret  # outcome of the key scan; 0 means every selected key is in [0, nstates)
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # accumulate only after the scan accepted all keys
+            for i in range(n):
+                if not valid[i]:
+                    continue
+                key = <int64_t>keys[i]
+                if skip_key_null and key == key_null:
+                    continue
+                keys_present[key] = 1
+                value = values[i]
+                if not has_value[key] or value > maxs[key]:
+                    maxs[key] = value
+                has_value[key] = 1
+    return ret
diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py
index 306d6522..ec4f2185 100644
--- a/tests/ctable/test_groupby.py
+++ b/tests/ctable/test_groupby.py
@@ -5,7 +5,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #######################################################################
 
-from dataclasses import dataclass
+from dataclasses import dataclass, make_dataclass
 
 import numpy as np
 import pytest
@@ -225,3 +225,69 @@ def test_groupby_rejects_bad_engine():
 
     with pytest.raises(ValueError):
         t.group_by("city", engine="cython")
+
+
+@pytest.mark.parametrize(
+    ("schema_factory", "values"),
+    [  # one case per integer key dtype; the key values are identical in every case
+        (blosc2.int8, [0, 2, 1, 2, 0]),
+        (blosc2.uint8, [0, 2, 1, 2, 0]),
+        (blosc2.int16, [0, 2, 1, 2, 0]),
+        (blosc2.uint16, [0, 2, 1, 2, 0]),
+        (blosc2.int32, [0, 2, 1, 2, 0]),
+        (blosc2.uint32, [0, 2, 1, 2, 0]),
+        (blosc2.int64, [0, 2, 1, 2, 0]),
+        (blosc2.uint64, [0, 2, 1, 2, 0]),
+    ],
+)
+def test_groupby_cython_fused_integer_key_dtypes(schema_factory, values):
+    row_type = make_dataclass(
+        f"FusedKey{schema_factory.__name__}Row",  # distinct dataclass name per parametrized dtype
+        [
+            ("key", int, blosc2.field(schema_factory())),
+            ("value", int, blosc2.field(blosc2.int32())),
+        ],
+    )
+    t = CTable(row_type, new_data=list(zip(values, [1, 10, 2, 3, 4], strict=True)))
+
+    out = t.group_by("key", sort=True).agg({"value": "sum"})
+
+    assert rows(out) == [(0, 5), (1, 2), (2, 13)]  # key 0: 1 + 4, key 1: 2, key 2: 10 + 3
+
+
+def test_groupby_cython_integer_key_more_integer_aggs():
+    row_type = make_dataclass(
+        "IntKeyMoreIntegerAggsRow",
+        [
+            ("key", int, blosc2.field(blosc2.int16())),
+            ("value", int, blosc2.field(blosc2.int32())),
+        ],
+    )
+    t = CTable(row_type, new_data=[(0, 5), (1, 10), (0, -2), (1, 20), (2, 7)])
+
+    out = t.group_by("key", sort=True).agg({"*": "size", "value": ["count", "sum", "mean", "min", "max"]})
+
+    assert rows(out) == [(0, 2, 2, 3, 1.5, -2, 5), (1, 2, 2, 30, 15.0, 10, 20), (2, 1, 1, 7, 7.0, 7, 7)]  # key, size, count, sum, mean, min, max
+
+
+def test_groupby_cython_integer_key_nullable_float_aggs():
+    row_type = make_dataclass(
+        "IntKeyNullableFloatAggsRow",
+        [
+            ("key", int, blosc2.field(blosc2.uint16())),
+            ("value", float, blosc2.field(blosc2.float64(nullable=True))),
+        ],
+    )
+    t = CTable(row_type, new_data=[(0, 1.5), (1, np.nan), (0, 2.5), (1, np.nan), (2, 10.0)])  # key 1 only has NaN values
+
+    out = t.group_by("key", sort=True).agg({"value": ["count", "sum", "mean", "min", "max"]})
+
+    got = rows(out)  # each row: (key, count, sum, mean, min, max)
+    assert got[0] == (0, 2, 4.0, 2.0, 1.5, 2.5)
+    assert got[1][0] == 1
+    assert got[1][1] == 0  # count: no non-null value for key 1
+    assert np.isnan(got[1][2])  # sum of an all-null group
+    assert np.isnan(got[1][3])  # mean of an all-null group
+    assert np.isnan(got[1][4])  # min of an all-null group
+    assert np.isnan(got[1][5])  # max of an all-null group
+    assert got[2] == (2, 1, 10.0, 10.0, 10.0, 10.0)

From 03b3583eeba653a397ec7a51014e454a071b8183 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 12:31:39 +0200
Subject: [PATCH 09/17] Implemented arbitrary float hash table for floating
 point acceleration

---
 bench/ctable/groupby.py      |   9 +-
 plans/ctable-groupby.md      |  24 ++--
 src/blosc2/groupby.py        | 132 +++++++++++++++++++++
 src/blosc2/groupby_ext.pyx   | 221 +++++++++++++++++++++++++++++++++++
 tests/ctable/test_groupby.py |  28 +++++
 5 files changed, 404 insertions(+), 10 deletions(-)

diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py
index 14bf4b06..dfa7d863 100644
--- a/bench/ctable/groupby.py
+++ b/bench/ctable/groupby.py
@@ -5,6 +5,7 @@
 --------
 python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --op sum
 python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --key-dtype float64 --op sum
+# float key dtypes generate non-integral repeated labels to exercise the float hash path
 python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas
 """
 
@@ -67,7 +68,11 @@ def make_data(nrows: int, ngroups: int, dictionary: bool, key_dtype: str, seed:
     if dictionary:
         keys = np.asarray([f"k{code}" for code in key_codes], dtype=object)
     elif key_dtype in {"float32", "float64"}:
-        keys = key_codes.astype(np.dtype(key_dtype))
+        # Use non-integral, repeated float labels by default so float-key
+        # benchmarks exercise the arbitrary-float hash path instead of the
+        # dense integral-float fast path.
+        labels = key_codes.astype(np.float64) + 0.25
+        keys = labels.astype(np.dtype(key_dtype))
     else:
         keys = key_codes.astype(np.dtype(key_dtype), copy=False)
     return keys, values
@@ -94,7 +99,7 @@ def main() -> None:
             "float64",
         ],
         default="int32",
-        help="Physical dtype for non-dictionary keys. Float keys are generated from group codes cast to float.",
+        help="Physical dtype for non-dictionary keys. Float keys are generated as non-integral repeated labels.",
     )
     parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum")
     parser.add_argument("--sort", action="store_true")
diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
index 69eb7547..b0f91276 100644
--- a/plans/ctable-groupby.md
+++ b/plans/ctable-groupby.md
@@ -226,17 +226,25 @@ Remaining possible extensions in this area:
 
 ### Arbitrary float-key hash table
 
-Current float Cython fast paths handle integral float32/float64 keys only.  A
-true float-key hash table would support arbitrary float keys without sorting or
-`np.unique`.
+Implemented a conservative Cython open-addressing hash path for single
+`float32`/`float64` keys with float value aggregations.  It supports `size`,
+`count`, `sum`, `mean`, `min`, and `max` for supported single-value-column
+queries and falls back otherwise.
 
-Required semantic decisions/handling:
+Implemented semantics:
 
 - `dropna=True`: skip NaN keys;
-- `dropna=False`: all NaN keys should form one group;
-- `+0.0` and `-0.0` should likely be the same group;
-- infinities are valid groups;
-- nullable float sentinels must be normalized consistently.
+- `dropna=False`: all NaN keys form one group;
+- `+0.0` and `-0.0` are normalized into the same group;
+- infinities are valid groups through regular float bit hashing;
+- NaN-null float values are skipped for value aggregations.
+
+Remaining possible extensions:
+
+- support non-float value columns in the hash path without normalizing through
+  float64;
+- fuse multiple value columns directly in one hash-table pass;
+- add explicit memory/cardinality safeguards for very high-cardinality floats.
 
 ### Multi-key Cython hash path
 
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index 9222fbd4..28a6a03a 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -182,6 +182,9 @@ def _execute(self, specs: list[_AggSpec]):
         if fast is not None:
             return fast
         fast = self._try_execute_cython_float_integral_key_f64_sum(specs)
+        if fast is not None:
+            return fast
+        fast = self._try_execute_cython_float_hash(specs)
         if fast is not None:
             return fast
         fast = self._try_execute_dense_single_int_key(specs)
@@ -596,6 +599,135 @@ def ensure_size(size: int) -> bool:
             rows.append({key_name: key_value, spec.output_col: float(sums[code])})
         return self._build_result(rows, specs)
 
+    def _try_execute_cython_float_hash(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython hash path for one arbitrary float key.
+
+        Covers float32/float64 keys that are not suitable for dense integral-key
+        indexing.  Value reductions (sum/mean/min/max) require a float value column.
+        Returns the built result, or ``None`` when this path does not apply.
+        """
+        if len(self.keys) != 1:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        if self.table._is_dictionary_column(key_info):
+            return None
+        key_dtype = getattr(key_info.spec, "dtype", None)
+        if key_dtype not in {np.dtype(np.float32), np.dtype(np.float64)}:
+            return None
+
+        value_cols = {s.input_col for s in specs if s.input_col is not None}
+        if len(value_cols) > 1:
+            return None
+        value_col = next(iter(value_cols), None)
+        # Loop-invariant: true when any spec needs the value column as float64 data.
+        needs_f64_values = any(s.op in {"sum", "mean", "min", "max"} for s in specs)
+        if value_col is not None:
+            value_info = self.table._schema.columns_by_name[value_col]
+            value_dtype = getattr(value_info.spec, "dtype", None)
+            # Count can operate on any fixed-width value column via values_valid,
+            # but other reductions in this hash kernel normalize values to f64.
+            if needs_f64_values:
+                if value_dtype is None or np.dtype(value_dtype).kind != "f":
+                    return None
+                null_value = getattr(value_info.spec, "null_value", None)
+                nullable_nan_value = isinstance(null_value, float) and math.isnan(null_value)
+                if null_value is not None and not nullable_nan_value:
+                    return None
+
+        try:
+            from blosc2 import groupby_ext
+        except ImportError:
+            return None
+        kernel = getattr(groupby_ext, "groupby_hash_f64_f64", None)
+        if kernel is None:
+            return None
+
+        acc: dict[Any, dict[str, _AggState]] = {}
+        key_values: dict[Any, tuple[Any, ...]] = {}
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+            keys = np.ascontiguousarray(np.asarray(self.table._cols[key_name][start:stop], dtype=np.float64))
+            if value_col is None:
+                values = np.zeros(len(keys), dtype=np.float64)  # zero-filled: never pass uninitialized memory
+                values_valid = np.zeros(len(keys), dtype=bool)
+                has_values = False
+            else:
+                raw_values = np.asarray(self.table._cols[value_col][start:stop])
+                if needs_f64_values:
+                    values = np.ascontiguousarray(raw_values.astype(np.float64, copy=False))
+                else:
+                    values = np.zeros(len(keys), dtype=np.float64)  # count-only: placeholder must be deterministic
+                values_valid = np.ascontiguousarray(~self._null_mask(value_col, raw_values, is_key=False))
+                has_values = True
+
+            (
+                chunk_keys,
+                row_counts,
+                value_counts,
+                sums,
+                mins,
+                maxs,
+                has_value,
+            ) = kernel(keys, values, np.ascontiguousarray(valid), values_valid, has_values, self.dropna)
+
+            for i, key in enumerate(chunk_keys):  # merge this chunk's per-group partials into acc
+                key_scalar = np.asarray(key, dtype=key_dtype).item()
+                norm_key = _normalize_key_part(float(key_scalar))
+                states = acc.setdefault(norm_key, {})
+                key_values.setdefault(norm_key, (key_scalar,))
+                for spec in specs:
+                    state = states.setdefault(spec.output_col, _AggState(spec.op))
+                    if spec.op == "size":
+                        state.value = (0 if state.value is None else state.value) + int(row_counts[i])
+                    elif spec.op == "count":
+                        state.value = (0 if state.value is None else state.value) + int(value_counts[i])
+                    elif spec.op == "sum" or spec.op == "mean":
+                        if has_value[i]:
+                            state.value = (0.0 if state.value is None else state.value) + float(sums[i])
+                            state.count += int(value_counts[i])
+                    elif spec.op == "min":
+                        if has_value[i]:
+                            value = float(mins[i])
+                            if state.count == 0 or value < state.value:
+                                state.value = value
+                            state.count += 1  # for min/max, count only records that a value was seen
+                    elif spec.op == "max" and has_value[i]:
+                        value = float(maxs[i])
+                        if state.count == 0 or value > state.value:
+                            state.value = value
+                        state.count += 1
+
+        # Hash-table iteration order is intentionally not exposed.  Emit float
+        # hash groups in key order for deterministic results and compatibility
+        # with the previous NumPy fallback behavior for these cases.
+        ordered_keys = list(acc)
+        ordered_keys.sort(
+            key=lambda k: tuple(
+                (1, "") if isinstance(v, float) and math.isnan(v) else (0, v) for v in key_values[k]
+            )
+        )
+        rows = []
+        for norm_key in ordered_keys:
+            row = dict(zip(self.keys, key_values[norm_key], strict=True))
+            states = acc[norm_key]
+            for spec in specs:
+                state = states[spec.output_col]
+                if spec.op == "mean":
+                    row[spec.output_col] = math.nan if state.count == 0 else state.value / state.count
+                elif spec.op in {"sum", "min", "max"} and state.count == 0:
+                    row[spec.output_col] = _null_output_value(self._result_spec_for_agg(spec))
+                else:
+                    row[spec.output_col] = 0 if state.value is None else state.value
+            rows.append(row)
+        return self._build_result(rows, specs)
+
     def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]):  # noqa: C901
         """Cython fast path for integral float32/float64 keys and one non-null float64 sum."""
         if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort:
diff --git a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx
index d8475f87..03b18894 100644
--- a/src/blosc2/groupby_ext.pyx
+++ b/src/blosc2/groupby_ext.pyx
@@ -12,6 +12,8 @@ import numpy as np
 cimport numpy as np
 
 from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t
+from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy
 
 
 # ----------------------------------------------------------------------
@@ -717,3 +719,222 @@ def groupby_dense_int_i64_max_checked(
                     maxs[key] = value
                 has_value[key] = 1
     return ret
+
+
+# ----------------------------------------------------------------------
+# Arbitrary float-key hash kernels
+# ----------------------------------------------------------------------
+
+cdef inline uint64_t _f64_bits(double value) noexcept:  # returns the raw 64-bit pattern of value
+    cdef uint64_t bits
+    memcpy(&bits, &value, sizeof(double))  # byte copy rather than a numeric cast, so the bit pattern is kept
+    return bits
+
+
+cdef inline uint64_t _mix_u64(uint64_t x) noexcept:  # xor-shift/multiply mixer applied to key bits before probing
+    x ^= x >> 30
+    x *= <uint64_t>0xbf58476d1ce4e5b9  # shifts (30, 27, 31) and multipliers match the SplitMix64 finalizer
+    x ^= x >> 27
+    x *= <uint64_t>0x94d049bb133111eb
+    x ^= x >> 31
+    return x
+
+
+def groupby_hash_f64_f64(
+    double[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,
+    np.npy_bool[:] values_valid,
+    bint has_values,
+    bint dropna=True,
+):
+    """Hash arbitrary float64 keys and accumulate float64 group states.
+
+    Returns ``(keys, row_counts, value_counts, sums, mins, maxs, has_value)``.
+    NaN keys are skipped when ``dropna`` is true; otherwise all NaN bit-patterns
+    are normalized into one NaN group.  ``+0.0`` and ``-0.0`` are normalized into
+    the same zero group.
+    """
+    if keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys and valid must have the same length")
+    if has_values and (values.shape[0] != keys.shape[0] or values_valid.shape[0] != keys.shape[0]):
+        raise ValueError("values, values_valid and keys must have the same length")
+
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t cap = 1024
+    cdef Py_ssize_t used_count = 0
+    cdef Py_ssize_t i, pos, old_pos, out_pos
+    cdef uint64_t mask = <uint64_t>cap - 1
+    cdef uint64_t bits, h, old_bits
+    cdef double key, value
+    cdef double nan_value = float("nan")
+    cdef uint64_t nan_bits = <uint64_t>0x7ff8000000000000
+    cdef bint value_ok
+
+    cdef uint64_t* table_bits = <uint64_t*>malloc(cap * sizeof(uint64_t))
+    cdef np.npy_bool* table_used = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+    cdef double* table_keys = <double*>malloc(cap * sizeof(double))
+    cdef int64_t* row_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef int64_t* value_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef double* sums = <double*>malloc(cap * sizeof(double))
+    cdef double* mins = <double*>malloc(cap * sizeof(double))
+    cdef double* maxs = <double*>malloc(cap * sizeof(double))
+    cdef np.npy_bool* has_value = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+
+    cdef uint64_t* new_bits
+    cdef np.npy_bool* new_used
+    cdef double* new_keys
+    cdef int64_t* new_row_counts
+    cdef int64_t* new_value_counts
+    cdef double* new_sums
+    cdef double* new_mins
+    cdef double* new_maxs
+    cdef np.npy_bool* new_has_value
+    cdef Py_ssize_t old_cap
+    cdef uint64_t new_mask
+
+    if (
+        table_bits == NULL
+        or table_used == NULL
+        or table_keys == NULL
+        or row_counts == NULL
+        or value_counts == NULL
+        or sums == NULL
+        or mins == NULL
+        or maxs == NULL
+        or has_value == NULL
+    ):
+        free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts)
+        free(sums); free(mins); free(maxs); free(has_value)
+        raise MemoryError()
+
+    for i in range(cap):
+        table_used[i] = 0
+
+    try:
+        for i in range(n):
+            if not valid[i]:
+                continue
+            key = keys[i]
+            if key != key:
+                if dropna:
+                    continue
+                bits = nan_bits
+                key = nan_value
+            elif key == 0.0:
+                key = 0.0
+                bits = 0
+            else:
+                bits = _f64_bits(key)
+
+            if (used_count + 1) * 2 >= cap:
+                old_cap = cap
+                cap *= 2
+                mask = <uint64_t>cap - 1
+                new_bits = <uint64_t*>malloc(cap * sizeof(uint64_t))
+                new_used = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+                new_keys = <double*>malloc(cap * sizeof(double))
+                new_row_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_value_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_sums = <double*>malloc(cap * sizeof(double))
+                new_mins = <double*>malloc(cap * sizeof(double))
+                new_maxs = <double*>malloc(cap * sizeof(double))
+                new_has_value = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+                if (
+                    new_bits == NULL
+                    or new_used == NULL
+                    or new_keys == NULL
+                    or new_row_counts == NULL
+                    or new_value_counts == NULL
+                    or new_sums == NULL
+                    or new_mins == NULL
+                    or new_maxs == NULL
+                    or new_has_value == NULL
+                ):
+                    free(new_bits); free(new_used); free(new_keys); free(new_row_counts); free(new_value_counts)
+                    free(new_sums); free(new_mins); free(new_maxs); free(new_has_value)
+                    raise MemoryError()
+                for pos in range(cap):
+                    new_used[pos] = 0
+                for old_pos in range(old_cap):
+                    if not table_used[old_pos]:
+                        continue
+                    old_bits = table_bits[old_pos]
+                    h = _mix_u64(old_bits)
+                    pos = <Py_ssize_t>(h & mask)
+                    while new_used[pos]:
+                        pos = <Py_ssize_t>((pos + 1) & mask)
+                    new_used[pos] = 1
+                    new_bits[pos] = old_bits
+                    new_keys[pos] = table_keys[old_pos]
+                    new_row_counts[pos] = row_counts[old_pos]
+                    new_value_counts[pos] = value_counts[old_pos]
+                    new_sums[pos] = sums[old_pos]
+                    new_mins[pos] = mins[old_pos]
+                    new_maxs[pos] = maxs[old_pos]
+                    new_has_value[pos] = has_value[old_pos]
+                free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts)
+                free(sums); free(mins); free(maxs); free(has_value)
+                table_bits = new_bits
+                table_used = new_used
+                table_keys = new_keys
+                row_counts = new_row_counts
+                value_counts = new_value_counts
+                sums = new_sums
+                mins = new_mins
+                maxs = new_maxs
+                has_value = new_has_value
+
+            h = _mix_u64(bits)
+            pos = <Py_ssize_t>(h & mask)
+            while table_used[pos] and table_bits[pos] != bits:
+                pos = <Py_ssize_t>((pos + 1) & mask)
+            if not table_used[pos]:
+                table_used[pos] = 1
+                table_bits[pos] = bits
+                table_keys[pos] = key
+                row_counts[pos] = 0
+                value_counts[pos] = 0
+                sums[pos] = 0.0
+                mins[pos] = 0.0
+                maxs[pos] = 0.0
+                has_value[pos] = 0
+                used_count += 1
+
+            row_counts[pos] += 1
+            if has_values:
+                value_ok = values_valid[i]
+                if value_ok:
+                    value = values[i]
+                    value_counts[pos] += 1
+                    sums[pos] += value
+                    if not has_value[pos] or value < mins[pos]:
+                        mins[pos] = value
+                    if not has_value[pos] or value > maxs[pos]:
+                        maxs[pos] = value
+                    has_value[pos] = 1
+
+        out_keys = np.empty(used_count, dtype=np.float64)
+        out_row_counts = np.empty(used_count, dtype=np.int64)
+        out_value_counts = np.empty(used_count, dtype=np.int64)
+        out_sums = np.empty(used_count, dtype=np.float64)
+        out_mins = np.empty(used_count, dtype=np.float64)
+        out_maxs = np.empty(used_count, dtype=np.float64)
+        out_has_value = np.empty(used_count, dtype=bool)
+
+        out_pos = 0
+        for pos in range(cap):
+            if not table_used[pos]:
+                continue
+            out_keys[out_pos] = table_keys[pos]
+            out_row_counts[out_pos] = row_counts[pos]
+            out_value_counts[out_pos] = value_counts[pos]
+            out_sums[out_pos] = sums[pos]
+            out_mins[out_pos] = mins[pos]
+            out_maxs[out_pos] = maxs[pos]
+            out_has_value[out_pos] = has_value[pos]
+            out_pos += 1
+        return out_keys, out_row_counts, out_value_counts, out_sums, out_mins, out_maxs, out_has_value
+    finally:
+        free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts)
+        free(sums); free(mins); free(maxs); free(has_value)
diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py
index ec4f2185..b085c92f 100644
--- a/tests/ctable/test_groupby.py
+++ b/tests/ctable/test_groupby.py
@@ -291,3 +291,31 @@ def test_groupby_cython_integer_key_nullable_float_aggs():
     assert np.isnan(got[1][4])
     assert np.isnan(got[1][5])
     assert got[2] == (2, 1, 10.0, 10.0, 10.0, 10.0)
+
+
+def test_groupby_cython_arbitrary_float_key_aggs():
+    t = CTable(
+        Float64KeyRow,
+        new_data=[(0.5, 1.0), (1.25, 10.0), (0.5, 3.0), (-2.5, 4.0), (1.25, 2.0)],
+    )
+
+    out = t.group_by("key").agg({"value": ["count", "sum", "mean", "min", "max"]})
+
+    assert rows(out) == [
+        (-2.5, 1, 4.0, 4.0, 4.0, 4.0),
+        (0.5, 2, 4.0, 2.0, 1.0, 3.0),
+        (1.25, 2, 12.0, 6.0, 2.0, 10.0),
+    ]
+
+
+def test_groupby_cython_arbitrary_float_key_nan_and_signed_zero():
+    t = CTable(Float64KeyRow, new_data=[(-0.0, 1.0), (0.0, 2.0), (np.nan, 3.0), (np.nan, 4.0)])
+
+    dropped = t.group_by("key").agg({"value": "sum"})
+    kept = t.group_by("key", dropna=False).agg({"value": "sum"})
+
+    assert rows(dropped) == [(0.0, 3.0)]
+    got = rows(kept)
+    assert got[0] == (0.0, 3.0)
+    assert np.isnan(got[1][0])
+    assert got[1][1] == 7.0

From 840a469dd804f12ee3b8c2817fa2cd118feb6c91 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 12:44:49 +0200
Subject: [PATCH 10/17] Add Cython group-by fast paths

Implement fused integer-key dense kernels for all integer widths, add
Cython aggregations beyond sum, introduce arbitrary float-key hashing,
and add a conservative two-key integer/dictionary hash path. Extend
group-by benchmarks and tests for the new optimized cases.
---
 bench/ctable/groupby.py      | 108 +++++++++++++-------
 plans/ctable-groupby.md      |  18 +++-
 src/blosc2/groupby.py        | 136 ++++++++++++++++++++++++++
 src/blosc2/groupby_ext.pyx   | 184 +++++++++++++++++++++++++++++++++++
 tests/ctable/test_groupby.py |  39 ++++++++
 5 files changed, 446 insertions(+), 39 deletions(-)

diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py
index dfa7d863..41929563 100644
--- a/bench/ctable/groupby.py
+++ b/bench/ctable/groupby.py
@@ -7,6 +7,7 @@
 python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --key-dtype float64 --op sum
 # float key dtypes generate non-integral repeated labels to exercise the float hash path
 python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas
+python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --groups2 100 --multi-key --op sum
 """
 
 from __future__ import annotations
@@ -25,35 +26,57 @@ def parse_int(text: str) -> int:
     return int(text.replace("_", ""))
 
 
-def build_row_type(dictionary: bool, key_dtype: str):
-    if dictionary:
+def build_row_type(dictionary: bool, key_dtype: str, multi_key: bool):
+    if dictionary and multi_key:
 
         @dataclasses.dataclass
         class Row:
-            key: str = blosc2.field(blosc2.dictionary())
+            key0: str = blosc2.field(blosc2.dictionary())
+            key1: int = blosc2.field(blosc2.int32())
             value: float = blosc2.field(blosc2.float64())
 
-    elif key_dtype in {"int8", "uint8", "int16", "uint16", "int32", "uint32", "int64", "uint64"}:
-        key_spec = getattr(blosc2, key_dtype)()
+    elif dictionary:
 
         @dataclasses.dataclass
         class Row:
-            key: int = blosc2.field(key_spec)
+            key: str = blosc2.field(blosc2.dictionary())
             value: float = blosc2.field(blosc2.float64())
 
-    elif key_dtype == "float32":
+    elif key_dtype in {"int8", "uint8", "int16", "uint16", "int32", "uint32", "int64", "uint64"}:
+        key_spec = getattr(blosc2, key_dtype)()
 
-        @dataclasses.dataclass
-        class Row:
-            key: float = blosc2.field(blosc2.float32())
-            value: float = blosc2.field(blosc2.float64())
+        if multi_key:
 
-    elif key_dtype == "float64":
+            @dataclasses.dataclass
+            class Row:
+                key0: int = blosc2.field(key_spec)
+                key1: int = blosc2.field(key_spec)
+                value: float = blosc2.field(blosc2.float64())
 
-        @dataclasses.dataclass
-        class Row:
-            key: float = blosc2.field(blosc2.float64())
-            value: float = blosc2.field(blosc2.float64())
+        else:
+
+            @dataclasses.dataclass
+            class Row:
+                key: int = blosc2.field(key_spec)
+                value: float = blosc2.field(blosc2.float64())
+
+    elif key_dtype in {"float32", "float64"}:
+        key_spec = blosc2.float32() if key_dtype == "float32" else blosc2.float64()
+
+        if multi_key:
+
+            @dataclasses.dataclass
+            class Row:
+                key0: float = blosc2.field(key_spec)
+                key1: float = blosc2.field(key_spec)
+                value: float = blosc2.field(blosc2.float64())
+
+        else:
+
+            @dataclasses.dataclass
+            class Row:
+                key: float = blosc2.field(key_spec)
+                value: float = blosc2.field(blosc2.float64())
 
     else:  # pragma: no cover - argparse choices prevent this
         raise ValueError(f"unsupported key dtype {key_dtype!r}")
@@ -61,27 +84,37 @@ class Row:
     return Row
 
 
-def make_data(nrows: int, ngroups: int, dictionary: bool, key_dtype: str, seed: int):
-    rng = np.random.default_rng(seed)
-    key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32)
-    values = rng.random(nrows, dtype=np.float64)
+def make_key_data(key_codes: np.ndarray, dictionary: bool, key_dtype: str):
     if dictionary:
-        keys = np.asarray([f"k{code}" for code in key_codes], dtype=object)
-    elif key_dtype in {"float32", "float64"}:
+        return np.asarray([f"k{code}" for code in key_codes], dtype=object)
+    if key_dtype in {"float32", "float64"}:
         # Use non-integral, repeated float labels by default so float-key
         # benchmarks exercise the arbitrary-float hash path instead of the
         # dense integral-float fast path.
         labels = key_codes.astype(np.float64) + 0.25
-        keys = labels.astype(np.dtype(key_dtype))
-    else:
-        keys = key_codes.astype(np.dtype(key_dtype), copy=False)
-    return keys, values
+        return labels.astype(np.dtype(key_dtype))
+    return key_codes.astype(np.dtype(key_dtype), copy=False)
+
+
+def make_data(nrows: int, ngroups: int, ngroups2: int, dictionary: bool, key_dtype: str, multi_key: bool, seed: int):
+    rng = np.random.default_rng(seed)
+    key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32)
+    values = rng.random(nrows, dtype=np.float64)
+    if not multi_key:
+        return {"key": make_key_data(key_codes, dictionary, key_dtype), "value": values}
+
+    key2_codes = rng.integers(0, ngroups2, size=nrows, dtype=np.int32)
+    key0 = make_key_data(key_codes, dictionary, key_dtype)
+    key1_dtype = "int32" if dictionary else key_dtype
+    key1 = make_key_data(key2_codes, False, key1_dtype)
+    return {"key0": key0, "key1": key1, "value": values}
 
 
 def main() -> None:
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("--rows", type=parse_int, default=10_000_000)
     parser.add_argument("--groups", type=parse_int, default=1_000)
+    parser.add_argument("--groups2", type=parse_int, default=None, help="Number of groups for key1 with --multi-key")
     parser.add_argument("--chunk-size", type=parse_int, default=None)
     parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key")
     parser.add_argument(
@@ -102,32 +135,35 @@ def main() -> None:
         help="Physical dtype for non-dictionary keys. Float keys are generated as non-integral repeated labels.",
     )
     parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum")
+    parser.add_argument("--multi-key", action="store_true", help="Group by two keys: key0 and key1")
     parser.add_argument("--sort", action="store_true")
     parser.add_argument("--pandas", action="store_true", help="Also run a pandas comparison if available")
     parser.add_argument("--urlpath", type=Path, default=None, help="Optional persistent CTable path")
     parser.add_argument("--seed", type=int, default=0)
     args = parser.parse_args()
 
+    groups2 = args.groups if args.groups2 is None else args.groups2
     print(
-        f"rows={args.rows:,} groups={args.groups:,} dictionary={args.dictionary} "
-        f"key_dtype={args.key_dtype} op={args.op} sort={args.sort} "
+        f"rows={args.rows:,} groups={args.groups:,} groups2={groups2:,} multi_key={args.multi_key} "
+        f"dictionary={args.dictionary} key_dtype={args.key_dtype} op={args.op} sort={args.sort} "
         f"chunk_size={args.chunk_size} urlpath={args.urlpath}"
     )
 
-    keys, values = make_data(args.rows, args.groups, args.dictionary, args.key_dtype, args.seed)
-    Row = build_row_type(args.dictionary, args.key_dtype)
+    data = make_data(args.rows, args.groups, groups2, args.dictionary, args.key_dtype, args.multi_key, args.seed)
+    Row = build_row_type(args.dictionary, args.key_dtype, args.multi_key)
 
     kwargs = {}
     if args.urlpath is not None:
         kwargs.update(urlpath=str(args.urlpath), mode="w")
 
     t0 = time.perf_counter()
-    table = blosc2.CTable(Row, new_data={"key": keys, "value": values}, expected_size=args.rows, **kwargs)
+    table = blosc2.CTable(Row, new_data=data, expected_size=args.rows, **kwargs)
     build_time = time.perf_counter() - t0
     print(f"ctable_build_seconds={build_time:.6f}")
 
     t0 = time.perf_counter()
-    gb = table.group_by("key", sort=args.sort, chunk_size=args.chunk_size)
+    group_keys = ["key0", "key1"] if args.multi_key else "key"
+    gb = table.group_by(group_keys, sort=args.sort, chunk_size=args.chunk_size)
     if args.op == "size":
         out = gb.size()
     elif args.op == "count":
@@ -144,14 +180,14 @@ def main() -> None:
         except ImportError:
             print("pandas_unavailable=true")
         else:
-            df = pd.DataFrame({"key": keys, "value": values})
+            df = pd.DataFrame(data)
             t0 = time.perf_counter()
             if args.op == "size":
-                pdf = df.groupby("key", sort=args.sort).size()
+                pdf = df.groupby(group_keys, sort=args.sort).size()
             elif args.op == "count":
-                pdf = df.groupby("key", sort=args.sort)["value"].count()
+                pdf = df.groupby(group_keys, sort=args.sort)["value"].count()
             else:
-                pdf = df.groupby("key", sort=args.sort)["value"].agg(args.op)
+                pdf = df.groupby(group_keys, sort=args.sort)["value"].agg(args.op)
             pandas_elapsed = time.perf_counter() - t0
             print(f"pandas_groupby_seconds={pandas_elapsed:.6f}")
             print(f"pandas_result_rows={len(pdf):,}")
diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
index b0f91276..75a71873 100644
--- a/plans/ctable-groupby.md
+++ b/plans/ctable-groupby.md
@@ -248,9 +248,21 @@ Remaining possible extensions:
 
 ### Multi-key Cython hash path
 
-The generic NumPy path supports multi-key grouping via structured arrays.  Future
-Cython work could hash directly across multiple key arrays, avoiding structured
-key packing, sort-based unique, inverse arrays, and Python merge overhead.
+Implemented a conservative Cython hash path for two-key group-by when both keys
+are integer or dictionary-code-backed columns.  The path normalizes keys to
+`int64`, hashes `(key0, key1)` directly, and supports `size`, `count`, `sum`,
+`mean`, `min`, and `max` for supported float value reductions.  This avoids
+structured-array packing and per-chunk `np.unique` for common two-key
+categorical/integer workloads.
+
+Remaining possible extensions:
+
+- support more than two key columns;
+- support float/string fixed-width key components directly;
+- support non-float value columns without normalizing value reductions through
+  float64;
+- fuse/merge multi-key states across chunks fully in Cython rather than via the
+  existing Python accumulator merge.
 
 ### FULL-index sorted group-by path
 
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index 28a6a03a..773b0a30 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -176,6 +176,9 @@ def _validate_value_column(self, name: str) -> None:
     def _execute(self, specs: list[_AggSpec]):
         self._validate_output_names(specs)
         fast = self._try_execute_cython_dense_int_key(specs)
+        if fast is not None:
+            return fast
+        fast = self._try_execute_cython_two_int_key_hash(specs)
         if fast is not None:
             return fast
         fast = self._try_execute_cython_i32_f64_sum(specs)
@@ -230,6 +233,139 @@ def _execute(self, specs: list[_AggSpec]):
         rows = self._final_rows(acc, key_values, specs)
         return self._build_result(rows, specs)
 
+    def _try_execute_cython_two_int_key_hash(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython hash path for two integer/dictionary-code keys."""
+        if len(self.keys) != 2:
+            return None
+
+        key_arrays = []
+        key_is_dict = []
+        key_nulls = []
+        skip_key_nulls = []
+        for key_name in self.keys:
+            key_info = self.table._schema.columns_by_name[key_name]
+            if self.table._is_dictionary_column(key_info):
+                key_arrays.append(self.table._cols[key_name].codes)
+                key_is_dict.append(True)
+                key_nulls.append(int(key_info.spec.null_code))
+                skip_key_nulls.append(self.dropna)
+                continue
+            key_dtype = getattr(key_info.spec, "dtype", None)
+            if key_dtype is None or np.dtype(key_dtype).kind not in "biu":
+                return None
+            null_value = getattr(key_info.spec, "null_value", None)
+            if null_value is not None and not self.dropna:
+                return None
+            key_arrays.append(self.table._cols[key_name])
+            key_is_dict.append(False)
+            key_nulls.append(0 if null_value is None else int(null_value))
+            skip_key_nulls.append(self.dropna and null_value is not None)
+
+        value_cols = {s.input_col for s in specs if s.input_col is not None}
+        if len(value_cols) > 1:
+            return None
+        value_col = next(iter(value_cols), None)
+        if value_col is not None and any(s.op in {"sum", "mean", "min", "max"} for s in specs):
+            value_info = self.table._schema.columns_by_name[value_col]
+            value_dtype = getattr(value_info.spec, "dtype", None)
+            if value_dtype is None or np.dtype(value_dtype).kind != "f":
+                return None
+            null_value = getattr(value_info.spec, "null_value", None)
+            if null_value is not None and not (isinstance(null_value, float) and math.isnan(null_value)):
+                return None
+
+        try:
+            from blosc2 import groupby_ext
+        except ImportError:
+            return None
+        kernel = getattr(groupby_ext, "groupby_hash_i64x2_f64", None)
+        if kernel is None:
+            return None
+
+        acc: dict[Any, dict[str, _AggState]] = {}
+        key_values: dict[Any, tuple[Any, ...]] = {}
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+            key_chunks = [np.asarray(arr[start:stop], dtype=np.int64) for arr in key_arrays]
+            live = valid.copy()
+            for key_chunk, skip_null, null_value in zip(key_chunks, skip_key_nulls, key_nulls, strict=True):
+                if skip_null:
+                    live &= key_chunk != null_value
+            if not np.any(live):
+                continue
+
+            if value_col is None:
+                values = np.empty(len(valid), dtype=np.float64)
+                values_valid = np.zeros(len(valid), dtype=bool)
+                has_values = False
+            else:
+                raw_values = np.asarray(self.table._cols[value_col][start:stop])
+                values = np.ascontiguousarray(raw_values.astype(np.float64, copy=False))
+                values_valid = np.ascontiguousarray(~self._null_mask(value_col, raw_values, is_key=False))
+                has_values = True
+
+            (
+                out_k0,
+                out_k1,
+                row_counts,
+                value_counts,
+                sums,
+                mins,
+                maxs,
+                has_value,
+            ) = kernel(
+                np.ascontiguousarray(key_chunks[0]),
+                np.ascontiguousarray(key_chunks[1]),
+                values,
+                np.ascontiguousarray(live),
+                values_valid,
+                has_values,
+            )
+
+            for i, (code0, code1) in enumerate(zip(out_k0, out_k1, strict=True)):
+                display = []
+                norm_parts = []
+                for key_pos, code in enumerate((int(code0), int(code1))):
+                    if key_is_dict[key_pos]:
+                        value = self.table._cols[self.keys[key_pos]].decode(code)
+                    else:
+                        value = code
+                    display.append(value)
+                    norm_parts.append(_normalize_key_part(value))
+                norm_key = tuple(norm_parts)
+                states = acc.setdefault(norm_key, {})
+                key_values.setdefault(norm_key, tuple(display))
+                for spec in specs:
+                    state = states.setdefault(spec.output_col, _AggState(spec.op))
+                    if spec.op == "size":
+                        state.value = (0 if state.value is None else state.value) + int(row_counts[i])
+                    elif spec.op == "count":
+                        state.value = (0 if state.value is None else state.value) + int(value_counts[i])
+                    elif spec.op in {"sum", "mean"}:
+                        if has_value[i]:
+                            state.value = (0.0 if state.value is None else state.value) + float(sums[i])
+                            state.count += int(value_counts[i])
+                    elif spec.op == "min":
+                        if has_value[i]:
+                            value = float(mins[i])
+                            if state.count == 0 or value < state.value:
+                                state.value = value
+                            state.count += 1
+                    elif spec.op == "max" and has_value[i]:
+                        value = float(maxs[i])
+                        if state.count == 0 or value > state.value:
+                            state.value = value
+                        state.count += 1
+
+        rows = self._final_rows(acc, key_values, specs)
+        return self._build_result(rows, specs)
+
     def _try_execute_cython_dense_int_key(self, specs: list[_AggSpec]):  # noqa: C901
         """Cython fast path for one compact integer/dictionary key and dense aggregations."""
         if len(self.keys) != 1:
diff --git a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx
index 03b18894..ae5ffd8a 100644
--- a/src/blosc2/groupby_ext.pyx
+++ b/src/blosc2/groupby_ext.pyx
@@ -938,3 +938,187 @@ def groupby_hash_f64_f64(
     finally:
         free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts)
         free(sums); free(mins); free(maxs); free(has_value)
+
+
+def groupby_hash_i64x2_f64(
+    int64_t[:] key0,
+    int64_t[:] key1,
+    double[:] values,
+    np.npy_bool[:] valid,
+    np.npy_bool[:] values_valid,
+    bint has_values,
+):
+    """Hash two int64-normalized keys and accumulate float64 group states."""
+    if key0.shape[0] != key1.shape[0] or key0.shape[0] != valid.shape[0]:
+        raise ValueError("key0, key1 and valid must have the same length")
+    if has_values and (values.shape[0] != key0.shape[0] or values_valid.shape[0] != key0.shape[0]):
+        raise ValueError("values, values_valid and keys must have the same length")
+
+    cdef Py_ssize_t n = key0.shape[0]
+    cdef Py_ssize_t cap = 1024
+    cdef Py_ssize_t used_count = 0
+    cdef Py_ssize_t i, pos, old_pos, out_pos
+    cdef uint64_t mask = <uint64_t>cap - 1
+    cdef uint64_t h
+    cdef int64_t k0, k1
+    cdef double value
+    cdef bint value_ok
+
+    cdef int64_t* table_k0 = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef int64_t* table_k1 = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef np.npy_bool* table_used = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+    cdef int64_t* row_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef int64_t* value_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef double* sums = <double*>malloc(cap * sizeof(double))
+    cdef double* mins = <double*>malloc(cap * sizeof(double))
+    cdef double* maxs = <double*>malloc(cap * sizeof(double))
+    cdef np.npy_bool* has_value = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+
+    cdef int64_t* new_k0
+    cdef int64_t* new_k1
+    cdef np.npy_bool* new_used
+    cdef int64_t* new_row_counts
+    cdef int64_t* new_value_counts
+    cdef double* new_sums
+    cdef double* new_mins
+    cdef double* new_maxs
+    cdef np.npy_bool* new_has_value
+    cdef Py_ssize_t old_cap
+
+    if (
+        table_k0 == NULL
+        or table_k1 == NULL
+        or table_used == NULL
+        or row_counts == NULL
+        or value_counts == NULL
+        or sums == NULL
+        or mins == NULL
+        or maxs == NULL
+        or has_value == NULL
+    ):
+        free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts)
+        free(sums); free(mins); free(maxs); free(has_value)
+        raise MemoryError()
+
+    for i in range(cap):
+        table_used[i] = 0
+
+    try:
+        for i in range(n):
+            if not valid[i]:
+                continue
+            k0 = key0[i]
+            k1 = key1[i]
+
+            if (used_count + 1) * 2 >= cap:
+                old_cap = cap
+                cap *= 2
+                mask = <uint64_t>cap - 1
+                new_k0 = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_k1 = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_used = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+                new_row_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_value_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_sums = <double*>malloc(cap * sizeof(double))
+                new_mins = <double*>malloc(cap * sizeof(double))
+                new_maxs = <double*>malloc(cap * sizeof(double))
+                new_has_value = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+                if (
+                    new_k0 == NULL
+                    or new_k1 == NULL
+                    or new_used == NULL
+                    or new_row_counts == NULL
+                    or new_value_counts == NULL
+                    or new_sums == NULL
+                    or new_mins == NULL
+                    or new_maxs == NULL
+                    or new_has_value == NULL
+                ):
+                    free(new_k0); free(new_k1); free(new_used); free(new_row_counts); free(new_value_counts)
+                    free(new_sums); free(new_mins); free(new_maxs); free(new_has_value)
+                    raise MemoryError()
+                for pos in range(cap):
+                    new_used[pos] = 0
+                for old_pos in range(old_cap):
+                    if not table_used[old_pos]:
+                        continue
+                    h = _mix_u64(<uint64_t>table_k0[old_pos]) ^ _mix_u64(<uint64_t>table_k1[old_pos] + <uint64_t>0x9e3779b97f4a7c15)
+                    pos = <Py_ssize_t>(h & mask)
+                    while new_used[pos]:
+                        pos = <Py_ssize_t>((pos + 1) & mask)
+                    new_used[pos] = 1
+                    new_k0[pos] = table_k0[old_pos]
+                    new_k1[pos] = table_k1[old_pos]
+                    new_row_counts[pos] = row_counts[old_pos]
+                    new_value_counts[pos] = value_counts[old_pos]
+                    new_sums[pos] = sums[old_pos]
+                    new_mins[pos] = mins[old_pos]
+                    new_maxs[pos] = maxs[old_pos]
+                    new_has_value[pos] = has_value[old_pos]
+                free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts)
+                free(sums); free(mins); free(maxs); free(has_value)
+                table_k0 = new_k0
+                table_k1 = new_k1
+                table_used = new_used
+                row_counts = new_row_counts
+                value_counts = new_value_counts
+                sums = new_sums
+                mins = new_mins
+                maxs = new_maxs
+                has_value = new_has_value
+
+            h = _mix_u64(<uint64_t>k0) ^ _mix_u64(<uint64_t>k1 + <uint64_t>0x9e3779b97f4a7c15)
+            pos = <Py_ssize_t>(h & mask)
+            while table_used[pos] and (table_k0[pos] != k0 or table_k1[pos] != k1):
+                pos = <Py_ssize_t>((pos + 1) & mask)
+            if not table_used[pos]:
+                table_used[pos] = 1
+                table_k0[pos] = k0
+                table_k1[pos] = k1
+                row_counts[pos] = 0
+                value_counts[pos] = 0
+                sums[pos] = 0.0
+                mins[pos] = 0.0
+                maxs[pos] = 0.0
+                has_value[pos] = 0
+                used_count += 1
+
+            row_counts[pos] += 1
+            if has_values:
+                value_ok = values_valid[i]
+                if value_ok:
+                    value = values[i]
+                    value_counts[pos] += 1
+                    sums[pos] += value
+                    if not has_value[pos] or value < mins[pos]:
+                        mins[pos] = value
+                    if not has_value[pos] or value > maxs[pos]:
+                        maxs[pos] = value
+                    has_value[pos] = 1
+
+        out_k0 = np.empty(used_count, dtype=np.int64)
+        out_k1 = np.empty(used_count, dtype=np.int64)
+        out_row_counts = np.empty(used_count, dtype=np.int64)
+        out_value_counts = np.empty(used_count, dtype=np.int64)
+        out_sums = np.empty(used_count, dtype=np.float64)
+        out_mins = np.empty(used_count, dtype=np.float64)
+        out_maxs = np.empty(used_count, dtype=np.float64)
+        out_has_value = np.empty(used_count, dtype=bool)
+
+        out_pos = 0
+        for pos in range(cap):
+            if not table_used[pos]:
+                continue
+            out_k0[out_pos] = table_k0[pos]
+            out_k1[out_pos] = table_k1[pos]
+            out_row_counts[out_pos] = row_counts[pos]
+            out_value_counts[out_pos] = value_counts[pos]
+            out_sums[out_pos] = sums[pos]
+            out_mins[out_pos] = mins[pos]
+            out_maxs[out_pos] = maxs[pos]
+            out_has_value[out_pos] = has_value[pos]
+            out_pos += 1
+        return out_k0, out_k1, out_row_counts, out_value_counts, out_sums, out_mins, out_maxs, out_has_value
+    finally:
+        free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts)
+        free(sums); free(mins); free(maxs); free(has_value)
diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py
index b085c92f..b920c99a 100644
--- a/tests/ctable/test_groupby.py
+++ b/tests/ctable/test_groupby.py
@@ -319,3 +319,42 @@ def test_groupby_cython_arbitrary_float_key_nan_and_signed_zero():
     assert got[0] == (0.0, 3.0)
     assert np.isnan(got[1][0])
     assert got[1][1] == 7.0
+
+
+@dataclass
+class TwoIntKeyFloatRow:  # schema with two integer keys of different signedness
+    key0: int = blosc2.field(blosc2.int16())  # signed 16-bit key
+    key1: int = blosc2.field(blosc2.uint16())  # unsigned 16-bit key
+    value: float = blosc2.field(blosc2.float64(nullable=True), default=0.0)  # nullable aggregated value
+
+
+def test_groupby_cython_two_integer_key_hash_aggs():
+    t = CTable(
+        TwoIntKeyFloatRow,  # int16 + uint16 keys, nullable float64 value
+        new_data=[(0, 1, 1.0), (0, 1, 3.0), (0, 2, 10.0), (1, 1, np.nan), (1, 1, 5.0)],
+    )
+
+    out = t.group_by(["key0", "key1"], sort=True).agg(
+        {"*": "size", "value": ["count", "sum", "mean", "min", "max"]}
+    )
+
+    assert rows(out) == [  # expected columns: key0, key1, size, count, sum, mean, min, max
+        (0, 1, 2, 2, 4.0, 2.0, 1.0, 3.0),
+        (0, 2, 1, 1, 10.0, 10.0, 10.0, 10.0),
+        (1, 1, 2, 1, 5.0, 5.0, 5.0, 5.0),  # the NaN row counts towards size (2) but not count (1)
+    ]
+
+
+@dataclass
+class DictIntKeyFloatRow:  # schema mixing a dictionary-spec string key with an integer key
+    key0: str = blosc2.field(blosc2.dictionary())  # string key declared via blosc2.dictionary()
+    key1: int = blosc2.field(blosc2.int32())
+    value: float = blosc2.field(blosc2.float64())
+
+
+def test_groupby_cython_dictionary_integer_key_hash():
+    t = CTable(DictIntKeyFloatRow, new_data=[("b", 2, 1.0), ("a", 1, 2.0), ("b", 2, 3.0)])
+
+    out = t.group_by(["key0", "key1"], sort=True).agg({"value": "sum"})  # string key + int32 key
+
+    assert rows(out) == [("a", 1, 2.0), ("b", 2, 4.0)]  # "a" first; both ("b", 2) rows are merged

From 5df37372ce246330cbe1f36eea03f2d11e32de78 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 13:06:10 +0200
Subject: [PATCH 11/17] Aborted the full index sorted group_by path, and an
 explanation on why

---
 plans/ctable-groupby.md | 51 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
index 75a71873..59fd6f07 100644
--- a/plans/ctable-groupby.md
+++ b/plans/ctable-groupby.md
@@ -266,18 +266,59 @@ Remaining possible extensions:
 
 ### FULL-index sorted group-by path
 
-A FULL index on a single grouping key can provide sorted positions.  A future
-sorted-scan group-by path could:
+A FULL index on a single grouping key can provide sorted positions.  A prototype
+Python/NumPy sorted-scan path was implemented and then reverted after
+benchmarking because it was not competitive with the existing dense/hash paths.
+
+Prototype behavior:
 
 ```text
-read sorted positions from FULL index
+read sorted values/positions from FULL sidecars
 scan contiguous key runs
+respect _valid_rows
 reduce each run
 emit sorted groups naturally
 ```
 
-This would be especially useful for high-cardinality single-key group-by and
-for users requesting `sort=True`.
+Observed benchmark results on 50M rows / 5k compact groups:
+
+```text
+float64 key, sum, sort=True, FULL index:
+  index build: ~6.2 s
+  group_by:    ~104 s
+
+int64 key, sum, sort=True, FULL index:
+  index build: ~5.5 s
+  group_by:    ~102 s
+
+int64 key, size, sort=True, FULL index:
+  index build: ~5.5 s
+  group_by:    ~0.45 s
+
+int64 key, size, sort=False, no FULL index:
+  group_by:    ~0.14 s
+```
+
+Why the prototype was slow:
+
+- value aggregations required many scattered gathers from the original value
+  column, one gathered position set per key run;
+- scattered value access is much less cache/compression friendly than the
+  existing sequential dense/hash scans;
+- the implementation still had Python-level run processing and result merging;
+- FULL index build cost is substantial unless the index already exists and can
+  be reused many times;
+- compact integer-key workloads are already ideal for dense accumulator arrays.
+
+Recommendation:
+
+- keep this deferred for now;
+- do not reintroduce a Python-level FULL-index value-aggregation path;
+- revisit only with a block-aware/Cython reducer that batches sorted positions
+  by physical chunks/blocks, or as part of a broader high-cardinality/sparse-key
+  strategy;
+- if revisited, benchmark primarily against high-cardinality non-compact keys
+  and already-existing FULL indexes, not compact dense-key workloads.
 
 ### Public `blosc2.group_reduce()`
 

From 0a459a515c27297a24fc1ef363f4b1d40f5317e2 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 13:29:38 +0200
Subject: [PATCH 12/17] Added a new public blosc2.group_reduce() that is apt
 for NDArray instances

---
 doc/reference/reduction_functions.rst |   2 +
 plans/ctable-groupby.md               |  43 +++-
 src/blosc2/__init__.py                |   4 +-
 src/blosc2/groupby.py                 | 297 ++++++++++++++++++++++++++
 tests/test_group_reduce.py            |  75 +++++++
 5 files changed, 414 insertions(+), 7 deletions(-)
 create mode 100644 tests/test_group_reduce.py

diff --git a/doc/reference/reduction_functions.rst b/doc/reference/reduction_functions.rst
index 4c21c150..5122807b 100644
--- a/doc/reference/reduction_functions.rst
+++ b/doc/reference/reduction_functions.rst
@@ -14,6 +14,7 @@ Reduction operations can be used with any of :ref:`NDArray <NDArray>`, :ref:`C2A
     argmax
     argmin
     count_nonzero
+    group_reduce
     cumulative_prod
     cumulative_sum
     max
@@ -31,6 +32,7 @@ Reduction operations can be used with any of :ref:`NDArray <NDArray>`, :ref:`C2A
 .. autofunction:: blosc2.argmax
 .. autofunction:: blosc2.argmin
 .. autofunction:: blosc2.count_nonzero
+.. autofunction:: blosc2.group_reduce
 .. autofunction:: blosc2.cumulative_prod
 .. autofunction:: blosc2.cumulative_sum
 .. autofunction:: blosc2.max
diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
index 59fd6f07..44b4d2f5 100644
--- a/plans/ctable-groupby.md
+++ b/plans/ctable-groupby.md
@@ -322,13 +322,44 @@ Recommendation:
 
 ### Public `blosc2.group_reduce()`
 
-Keep lower-level group-reduce machinery internal for now.  Consider exposing a
-public `blosc2.group_reduce()` only after:
+Implemented a conservative public `blosc2.group_reduce()` array API for
+single-key grouped reductions without requiring a `CTable`.
 
-- aggregation semantics are stable;
-- null/NaN behavior is fully documented;
-- output representation is clear;
-- benchmarks show usefulness outside `CTable.group_by()`.
+Implemented API:
+
+```python
+groups, result = blosc2.group_reduce(
+    keys, values=None, op="size", sort=False, dropna=True
+)
+```
+
+Implemented operations:
+
+- `size`;
+- `count`;
+- `sum`;
+- `mean`;
+- `min`;
+- `max`.
+
+Implemented semantics:
+
+- returns plain NumPy arrays `(groups, result)`;
+- `size` counts rows and does not require values;
+- `count` counts non-NaN values;
+- `dropna=True` skips NaN float keys;
+- `dropna=False` keeps one normalized NaN group;
+- `+0.0` and `-0.0` are normalized by the float hash path;
+- optimized dense integer and arbitrary-float hash paths are used
+  opportunistically, with a NumPy/Python fallback.
+
+Remaining possible extensions:
+
+- multi-key public API;
+- multiple aggregations in one call;
+- multiple value columns;
+- NDArray/chunked execution without eager NumPy conversion;
+- optional CTable/persistent output.
 
 ### High-cardinality and memory strategy
 
diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py
index 29ed2024..ee258655 100644
--- a/src/blosc2/__init__.py
+++ b/src/blosc2/__init__.py
@@ -628,7 +628,7 @@ def _raise(exc):
 # Note: bool, bytes, string shadow builtins in the blosc2 namespace by design —
 # they are schema spec constructors (b2.bool(), b2.bytes(), etc.).
 from .ctable import DEFAULT_NULL_POLICY, Column, CTable, NullPolicy, get_null_policy, null_policy
-from .groupby import CTableGroupBy
+from .groupby import CTableGroupBy, group_reduce
 from .ndarray import (
     abs,
     acos,
@@ -802,6 +802,8 @@ def _raise(exc):
     "uint64",
     "vlbytes",
     "vlstring",
+    # Grouped reductions
+    "group_reduce",
     # Classes
     "C2Array",
     "CParams",
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index 773b0a30..60ef8bbf 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -1427,3 +1427,300 @@ def _null_output_value(spec: SchemaSpec):
     if dtype is not None and dtype.kind == "S":
         return b""
     return None
+
+
+# ----------------------------------------------------------------------
+# Public array-oriented grouped reductions
+# ----------------------------------------------------------------------
+
+
+def group_reduce(keys, values=None, op: AggName = "size", *, sort: bool = False, dropna: bool = True):
+    """Group *keys* and reduce *values* with *op*.
+
+    This is a lower-level, NumPy-style grouped reduction primitive.  It tries
+    Blosc2's optimized group-reduce kernels first and falls back to a generic
+    Python loop, so no :class:`blosc2.CTable` is required.
+
+    Parameters
+    ----------
+    keys : array-like
+        One-dimensional grouping keys; other shapes raise ``ValueError``.
+    values : array-like, optional
+        One-dimensional values to reduce, with the same length as *keys*.
+        Required for every op except ``"size"``, where it is ignored.
+    op : {"size", "count", "sum", "mean", "min", "max"}, default: "size"
+        Reduction operation.  ``"size"`` counts rows per group, while
+        ``"count"`` counts non-NaN values per group.
+    sort : bool, default: False
+        If true, sort output groups by key (a NaN group sorts last).  With
+        ``sort=False`` output order is implementation dependent.
+    dropna : bool, default: True
+        If true, skip NaN float keys.  If false, all NaN keys form one group.
+
+    Returns
+    -------
+    groups, result : numpy.ndarray, numpy.ndarray
+        Group keys (in the dtype of *keys*) and the per-group reduced values.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import blosc2
+    >>> keys = np.array([1, 2, 1, 2, 1])
+    >>> values = np.array([10., 20., 30., 40., 50.])
+    >>> groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True)
+    >>> groups
+    array([1, 2])
+    >>> sums
+    array([90., 60.])
+    """
+    if op not in {"size", "count", "sum", "mean", "min", "max"}:
+        raise ValueError(f"unsupported group_reduce operation {op!r}")
+
+    keys_arr = np.asarray(keys)
+    if keys_arr.ndim != 1:
+        raise ValueError("keys must be a 1-D array")
+
+    if op == "size":
+        values_arr = None  # "size" only counts rows, so *values* is never inspected
+    else:
+        if values is None:
+            raise ValueError(f"values are required for group_reduce op {op!r}")
+        values_arr = np.asarray(values)
+        if values_arr.ndim != 1:
+            raise ValueError("values must be a 1-D array")
+        if len(values_arr) != len(keys_arr):
+            raise ValueError("keys and values must have the same length")
+
+    if len(keys_arr) == 0:  # empty input: empty groups plus an empty result of the right dtype
+        return keys_arr.copy(), np.empty(0, dtype=_result_dtype(values_arr, op))
+
+    fast = _try_dense_integer(keys_arr, values_arr, op, sort=sort)  # None when not applicable
+    if fast is not None:
+        return fast
+
+    fast = _try_float_hash(keys_arr, values_arr, op, sort=sort, dropna=dropna)  # None when not applicable
+    if fast is not None:
+        return fast
+
+    return _group_reduce_numpy(keys_arr, values_arr, op, sort=sort, dropna=dropna)  # generic fallback
+
+
+def _try_dense_integer(keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool):  # noqa: C901
+    # Dense fast path for bool / compact non-negative integer keys; None means "not applicable".
+    key_dtype = np.dtype(keys.dtype)
+    if key_dtype.kind == "b":
+        keys = keys.astype(np.int8, copy=False)
+    elif key_dtype.kind not in "iu":
+        return None
+    keys = np.ascontiguousarray(keys)
+    if len(keys) == 0 or np.min(keys) < 0:
+        return None
+    max_key = int(np.max(keys))
+    # Accumulators are indexed by key value, so bound their size.
+    if max_key + 1 > 10_000_000:
+        return None
+
+    try:
+        from blosc2 import groupby_ext
+    except ImportError:
+        return None
+
+    valid = np.ones(len(keys), dtype=bool)  # no row mask here: every row takes part
+    keys_present = np.zeros(max_key + 1, dtype=bool)
+
+    if op == "size":
+        counts = np.zeros(max_key + 1, dtype=np.int64)
+        groupby_ext.groupby_dense_int_size_checked(keys, valid, counts, keys_present, False, 0)
+        codes = np.nonzero(keys_present)[0]  # computed once and reused for labels and counts
+        result = counts[codes]
+        return _maybe_sort(codes.astype(key_dtype), result, sort)
+
+    assert values is not None  # group_reduce() supplies values for every op other than "size"
+    value_dtype = np.dtype(values.dtype)
+    if op == "count":
+        counts = np.zeros(max_key + 1, dtype=np.int64)
+        values_valid = _values_valid(values)
+        groupby_ext.groupby_dense_int_count_checked(
+            keys, valid, np.ascontiguousarray(values_valid), counts, keys_present, False, 0
+        )
+        codes = np.nonzero(keys_present)[0]
+        # Groups whose values are all NaN are still reported, with a count of 0.
+        result = counts[codes]
+        return _maybe_sort(codes.astype(key_dtype), result, sort)
+
+    if op == "mean" or value_dtype.kind == "f":
+        vals = np.ascontiguousarray(values.astype(np.float64, copy=False))
+        skip_nan = value_dtype.kind == "f"  # only float inputs can contain NaN
+        if op == "sum":
+            sums = np.zeros(max_key + 1, dtype=np.float64)
+            present = np.zeros(max_key + 1, dtype=bool)
+            groupby_ext.groupby_dense_int_f64_sum_checked(
+                keys, vals, valid, sums, present, keys_present, False, 0, skip_nan
+            )
+            codes = np.nonzero(keys_present)[0]
+            result = sums[codes]
+            result[~present[codes]] = np.nan  # groups without any valid value report NaN
+        elif op == "mean":
+            sums = np.zeros(max_key + 1, dtype=np.float64)
+            counts = np.zeros(max_key + 1, dtype=np.int64)
+            groupby_ext.groupby_dense_int_f64_mean_checked(
+                keys, vals, valid, sums, counts, keys_present, False, 0, skip_nan
+            )
+            codes = np.nonzero(keys_present)[0]
+            result = np.full(len(codes), np.nan, dtype=np.float64)
+            ok = counts[codes] > 0  # avoid dividing by zero for all-NaN groups
+            result[ok] = sums[codes][ok] / counts[codes][ok]
+        elif op in {"min", "max"}:
+            state = np.zeros(max_key + 1, dtype=np.float64)
+            has_value = np.zeros(max_key + 1, dtype=bool)
+            kernel = getattr(groupby_ext, f"groupby_dense_int_f64_{op}_checked")
+            kernel(keys, vals, valid, state, has_value, keys_present, False, 0, skip_nan)
+            codes = np.nonzero(keys_present)[0]
+            result = state[codes]
+            result[~has_value[codes]] = np.nan
+        else:  # pragma: no cover
+            return None
+        return _maybe_sort(codes.astype(key_dtype), result, sort)
+
+    if value_dtype.kind not in "biu" or value_dtype == np.uint64:
+        return None  # uint64 can exceed int64 and would wrap in the cast below; use the generic path
+    vals_i64 = np.ascontiguousarray(values.astype(np.int64, copy=False))
+    state = np.zeros(max_key + 1, dtype=np.int64)
+    present = np.zeros(max_key + 1, dtype=bool)
+    kernel = getattr(groupby_ext, f"groupby_dense_int_i64_{op}_checked", None)
+    if kernel is None:
+        return None
+    kernel(keys, vals_i64, valid, state, present, keys_present, False, 0)
+    codes = np.nonzero(keys_present)[0]
+    return _maybe_sort(codes.astype(key_dtype), state[codes], sort)
+
+
+def _try_float_hash(keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool, dropna: bool):
+    # Hash fast path for float keys; *dropna* is forwarded to the Cython kernel.
+    # Returns None when it does not apply so the caller can fall back to the generic path.
+    key_dtype = np.dtype(keys.dtype)
+    if key_dtype.kind != "f":
+        return None
+    # Values are cast to float64 below: accept floats, plus bool/int values when only counting.
+    if values is not None and np.dtype(values.dtype).kind not in ("biuf" if op == "count" else "f"):
+        return None
+    try:
+        from blosc2 import groupby_ext
+    except ImportError:
+        return None
+
+    keys_f64 = np.ascontiguousarray(keys.astype(np.float64, copy=False))
+    valid = np.ones(len(keys_f64), dtype=bool)  # no row mask here: every row takes part
+    if values is None:
+        values_f64 = np.empty(len(keys_f64), dtype=np.float64)  # placeholder, flagged unused below
+        values_valid = np.zeros(len(keys_f64), dtype=bool)
+        has_values = False
+    else:
+        values_f64 = np.ascontiguousarray(np.asarray(values, dtype=np.float64))
+        values_valid = np.ascontiguousarray(_values_valid(values))
+        has_values = True
+
+    groups, row_counts, value_counts, sums, mins, maxs, has_value = groupby_ext.groupby_hash_f64_f64(
+        keys_f64, values_f64, valid, values_valid, has_values, dropna
+    )
+    groups = groups.astype(key_dtype, copy=False)  # report labels in the caller's float dtype
+    if op == "size":
+        result = row_counts
+    elif op == "count":
+        result = value_counts
+    elif op == "sum":
+        result = sums.copy()
+        result[~has_value] = np.nan
+    elif op == "mean":
+        result = np.full(len(groups), np.nan, dtype=np.float64)
+        ok = value_counts > 0  # avoid dividing by zero for groups without valid values
+        result[ok] = sums[ok] / value_counts[ok]
+    elif op in {"min", "max"}:
+        result = (mins if op == "min" else maxs).copy()
+        result[~has_value] = np.nan  # groups without any valid value report NaN
+    else:  # pragma: no cover
+        return None
+    return _maybe_sort(groups, result, sort)
+
+
+def _group_reduce_numpy(  # noqa: C901
+    keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool, dropna: bool
+):
+    acc: dict[object, list] = {}  # normalized key -> [rows, count, total, min, max]
+    display: dict[object, object] = {}  # normalized key -> first-seen key value, used for output
+    for i, key in enumerate(keys):
+        key_item = _python_scalar(key)
+        if isinstance(key_item, float) and math.isnan(key_item):
+            if dropna:
+                continue
+            norm_key = _NAN_KEY  # NaN != NaN, so all NaN keys share one sentinel
+        else:
+            norm_key = key_item
+        display.setdefault(norm_key, key_item)
+        state = acc.setdefault(norm_key, [0, 0, 0, None, None])  # int total: see the sum note below
+        state[0] += 1
+        if values is None:
+            continue
+        value = _python_scalar(values[i])
+        if isinstance(value, float) and math.isnan(value):
+            continue  # NaN values count as rows but not as values
+        state[1] += 1
+        if op in {"sum", "mean"}:
+            state[2] += value  # int seed keeps integer sums exact; floats promote on the first add
+        elif op == "min" and (state[3] is None or value < state[3]):
+            state[3] = value
+        elif op == "max" and (state[4] is None or value > state[4]):
+            state[4] = value
+
+    order = list(acc)  # first-seen order unless sorting is requested
+    if sort:
+        order.sort(key=lambda k: (1, "") if k is _NAN_KEY else (0, display[k]))  # NaN group last
+    groups = np.asarray([display[k] for k in order], dtype=keys.dtype)
+    result = []
+    for k in order:
+        rows, count, total, min_value, max_value = acc[k]
+        if op == "size":
+            result.append(rows)
+        elif op == "count":
+            result.append(count)
+        elif op == "sum":
+            result.append(total if count else _null_value_for(values))
+        elif op == "mean":
+            result.append(math.nan if count == 0 else total / count)
+        elif op == "min":
+            result.append(min_value if count else _null_value_for(values))
+        elif op == "max":
+            result.append(max_value if count else _null_value_for(values))
+    return groups, np.asarray(result, dtype=_result_dtype(values, op))
+
+
+def _maybe_sort(groups: np.ndarray, result: np.ndarray, sort: bool):
+    if not (sort and len(groups)):
+        return groups, result  # sorting not requested, or nothing to sort: pass inputs through
+    perm = np.argsort(groups, kind="stable")  # stable permutation that orders the group keys
+    return groups[perm], result[perm]
+
+
+def _values_valid(values: np.ndarray) -> np.ndarray:
+    arr = np.asarray(values)
+    if arr.dtype.kind != "f":
+        return np.ones(len(arr), dtype=bool)  # non-float dtypes are treated as fully valid
+    return ~np.isnan(arr)  # floats: NaN marks a missing value
+
+
+def _result_dtype(values: np.ndarray | None, op: str):
+    # Counting reductions are always int64; means and value-less calls are float64.
+    if op in {"size", "count"}:
+        return np.int64
+    if values is None or op == "mean":
+        return np.float64
+    value_dtype = np.dtype(values.dtype)
+    widen = op == "sum" and value_dtype.kind in "biu"  # bool/integer sums are reported as int64
+    return np.int64 if widen else value_dtype
+
+
+def _null_value_for(values: np.ndarray | None):
+    # Filler for groups without any valid value: 0 for integer values, NaN otherwise.
+    is_integer = values is not None and np.dtype(values.dtype).kind in "iu"
+    return 0 if is_integer else math.nan
diff --git a/tests/test_group_reduce.py b/tests/test_group_reduce.py
new file mode 100644
index 00000000..856c25ef
--- /dev/null
+++ b/tests/test_group_reduce.py
@@ -0,0 +1,75 @@
+import numpy as np
+import pytest
+
+import blosc2
+
+
+def test_group_reduce_size_and_sum_integer_keys():
+    keys = np.array([2, 1, 2, 1, 2], dtype=np.int16)
+    values = np.array([10, 1, 30, 3, 50], dtype=np.int32)
+
+    groups, sizes = blosc2.group_reduce(keys, op="size", sort=True)  # "size" needs no values
+    groups2, sums = blosc2.group_reduce(keys, values, op="sum", sort=True)
+
+    assert groups.dtype == keys.dtype  # group labels keep the dtype of the keys
+    np.testing.assert_array_equal(groups, np.array([1, 2], dtype=np.int16))
+    np.testing.assert_array_equal(sizes, np.array([2, 3]))
+    np.testing.assert_array_equal(groups2, np.array([1, 2], dtype=np.int16))
+    np.testing.assert_array_equal(sums, np.array([4, 90]))  # 1 + 3 and 10 + 30 + 50
+
+
+def test_group_reduce_integer_keys_float_aggs_with_nan_values():
+    keys = np.array([0, 1, 0, 1, 2], dtype=np.uint16)
+    values = np.array([1.0, np.nan, 3.0, np.nan, 10.0])  # key 1 only ever sees NaN values
+
+    groups, counts = blosc2.group_reduce(keys, values, op="count", sort=True)
+    _, means = blosc2.group_reduce(keys, values, op="mean", sort=True)
+    _, mins = blosc2.group_reduce(keys, values, op="min", sort=True)
+    _, maxs = blosc2.group_reduce(keys, values, op="max", sort=True)
+
+    np.testing.assert_array_equal(groups, np.array([0, 1, 2], dtype=np.uint16))
+    np.testing.assert_array_equal(counts, np.array([2, 0, 1]))  # the all-NaN group is kept, count 0
+    assert means[0] == 2.0
+    assert np.isnan(means[1])  # no valid values -> NaN
+    assert means[2] == 10.0
+    assert mins[0] == 1.0
+    assert np.isnan(mins[1])
+    assert mins[2] == 10.0
+    assert maxs[0] == 3.0
+    assert np.isnan(maxs[1])
+    assert maxs[2] == 10.0
+
+
+def test_group_reduce_arbitrary_float_keys_and_nan_key_group():
+    keys = np.array([0.5, np.nan, 0.5, -0.0, 0.0, np.nan])
+    values = np.array([1.0, 2.0, 3.0, 10.0, 20.0, 5.0])
+
+    groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True, dropna=False)
+
+    assert groups[0] == 0.0  # -0.0 and 0.0 fall into a single group
+    assert sums[0] == 30.0
+    assert groups[1] == 0.5
+    assert sums[1] == 4.0
+    assert np.isnan(groups[2])  # dropna=False keeps one NaN group, sorted last
+    assert sums[2] == 7.0  # 2.0 + 5.0 from both NaN-key rows
+
+
+def test_group_reduce_dropna_default_skips_nan_keys():
+    keys = np.array([1.0, np.nan, 1.0])
+    values = np.array([2.0, 10.0, 3.0])
+
+    groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True)  # dropna is left at its default
+
+    np.testing.assert_array_equal(groups, np.array([1.0]))
+    np.testing.assert_array_equal(sums, np.array([5.0]))  # the NaN-key row (10.0) is excluded
+
+
+def test_group_reduce_rejects_bad_inputs():
+    with pytest.raises(ValueError):  # keys must be 1-D
+        blosc2.group_reduce(np.ones((2, 2)), op="size")
+    with pytest.raises(ValueError):  # values are missing for "sum"
+        blosc2.group_reduce(np.arange(3), op="sum")
+    with pytest.raises(ValueError):  # keys and values differ in length
+        blosc2.group_reduce(np.arange(3), np.arange(2), op="sum")
+    with pytest.raises(ValueError):  # unknown operation name
+        blosc2.group_reduce(np.arange(3), op="bad")

From 0185e796f11434ec3c84443bbf0461b50cf889bc Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 13:30:22 +0200
Subject: [PATCH 13/17] Add forgotten test unit

---
 tests/ctable/test_object_spec.py | 65 ++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 tests/ctable/test_object_spec.py

diff --git a/tests/ctable/test_object_spec.py b/tests/ctable/test_object_spec.py
new file mode 100644
index 00000000..30fa7258
--- /dev/null
+++ b/tests/ctable/test_object_spec.py
@@ -0,0 +1,65 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+"""Tests for schema-less CTable object columns."""
+
+from dataclasses import dataclass
+
+import pytest
+
+import blosc2
+from blosc2 import CTable
+
+
+@dataclass
+class ObjectRow:  # minimal schema with an integer id and one object column
+    id: int = blosc2.field(blosc2.int32())
+    payload: object = blosc2.field(blosc2.object(nullable=True))  # nullable object column
+
+
+def test_object_column_heterogeneous_values():
+    t = CTable(ObjectRow)
+    t.append([1, {"kind": "dict", "values": [1, 2]}])
+    t.append([2, ("tuple", 3)])  # a tuple payload in the same column as the dict above
+    t.append([3, None])  # ObjectRow declares payload as nullable
+
+    assert t["payload"][:] == [{"kind": "dict", "values": [1, 2]}, ("tuple", 3), None]
+    assert t["payload"].is_varlen_scalar
+
+
+def test_object_column_persistence(tmp_path):
+    path = tmp_path / "objects.b2d"
+    t = CTable(ObjectRow, urlpath=str(path), mode="w")
+    t.extend([[1, {"x": 1}], [2, ["a", "b"]], [3, None]])
+    t.close()  # close the writer before reopening the same path
+
+    reopened = CTable.open(str(path), mode="r")  # reopen read-only
+    assert reopened["payload"][:] == [{"x": 1}, ["a", "b"], None]
+
+
+def test_object_column_to_arrow_raises():
+    t = CTable(ObjectRow)
+    t.append([1, {"x": 1}])
+    with pytest.raises(TypeError, match="ObjectSpec columns"):  # error message must name ObjectSpec columns
+        t.to_arrow()
+
+
+def test_object_column_rejects_none_when_not_nullable():
+    @dataclass
+    class StrictObjectRow:
+        payload: object = blosc2.field(blosc2.object())  # no nullable=True here
+
+    t = CTable(StrictObjectRow)
+    with pytest.raises(TypeError, match="not nullable"):
+        t.append([None])
+
+
+def test_object_column_rejects_non_msgpack_value_on_flush():
+    t = CTable(ObjectRow)
+    t.append([1, {"not-msgpack": {1, 2, 3}}])  # the set is the offending value; append itself succeeds
+    with pytest.raises(TypeError):  # NOTE(review): presumably close() triggers the flush - confirm
+        t.close()

From 98bb1400d68c37ec8e3789bb05b794505859bad5 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 13:35:39 +0200
Subject: [PATCH 14/17] Implemented group-by convenience methods (sum, mean,
 min, max)

---
 doc/reference/ctable.rst     |  9 ++++++---
 plans/ctable-groupby.md      |  5 ++++-
 src/blosc2/groupby.py        | 28 ++++++++++++++++++++++++++++
 tests/ctable/test_groupby.py | 17 +++++++++++++++++
 4 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst
index 2ed012d0..8ccf74b8 100644
--- a/doc/reference/ctable.rst
+++ b/doc/reference/ctable.rst
@@ -251,17 +251,20 @@ Group-by reductions
 
 :meth:`CTable.group_by` returns a lightweight deferred group-by object.  It is
 not a table view; methods such as :meth:`~blosc2.CTableGroupBy.size`,
-:meth:`~blosc2.CTableGroupBy.count`, and
+:meth:`~blosc2.CTableGroupBy.count`, :meth:`~blosc2.CTableGroupBy.sum`, and
 :meth:`~blosc2.CTableGroupBy.agg` materialize a new :class:`CTable` with
 one row per group::
 
     by_city = t.group_by("city", sort=True)
     counts = by_city.size()                  # row count per city / COUNT(*)
     non_null = by_city.count("sales")        # non-null sales count / COUNT(sales)
-    totals = by_city.agg({"sales": "sum"})
+    totals = by_city.sum("sales")            # equivalent to agg({"sales": "sum"})
+    means = by_city.mean("sales")
+    mins = by_city.min("sales")
+    maxs = by_city.max("sales")
 
 .. autoclass:: CTableGroupBy
-    :members: size, count, agg
+    :members: size, count, sum, mean, min, max, agg
 
 
 Mutations
diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
index 44b4d2f5..308c47b8 100644
--- a/plans/ctable-groupby.md
+++ b/plans/ctable-groupby.md
@@ -381,7 +381,7 @@ Potential future optimization:
 
 ### Additional API conveniences
 
-Potential future user conveniences:
+Implemented group-by convenience methods:
 
 ```python
 t.group_by("city").sum("sales")
@@ -390,6 +390,9 @@ t.group_by("city").min("sales")
 t.group_by("city").max("sales")
 ```
 
+These are equivalent to `agg({column: op})` and complement the already-existing
+`size()` and `count(column)` group-by methods.
+
 Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are
 clearly justified outside group-by.
 
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index 60ef8bbf..c5f14221 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -110,6 +110,34 @@ def count(self, column: str):
         col = self.table._logical_to_physical_name(column)
         return self._execute([_AggSpec(col, "count", f"{col}_count")])
 
+    def sum(self, column: str):
+        """Return the sum of *column* for each group.
+
+        Shorthand for ``group_by(...).agg({column: "sum"})``; see :meth:`agg`.
+        """
+        return self.agg({column: "sum"})
+
+    def mean(self, column: str):
+        """Return the mean of *column* for each group.
+
+        Shorthand for ``group_by(...).agg({column: "mean"})``; see :meth:`agg`.
+        """
+        return self.agg({column: "mean"})
+
+    def min(self, column: str):
+        """Return the minimum of *column* for each group.
+
+        Shorthand for ``group_by(...).agg({column: "min"})``; see :meth:`agg`.
+        """
+        return self.agg({column: "min"})
+
+    def max(self, column: str):
+        """Return the maximum of *column* for each group.
+
+        Shorthand for ``group_by(...).agg({column: "max"})``; see :meth:`agg`.
+        """
+        return self.agg({column: "max"})
+
     def agg(self, aggregations: Mapping[str, str | Sequence[str]]):
         """Aggregate value columns per group.
 
diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py
index b920c99a..ae884c42 100644
--- a/tests/ctable/test_groupby.py
+++ b/tests/ctable/test_groupby.py
@@ -358,3 +358,20 @@ def test_groupby_cython_dictionary_integer_key_hash():
     out = t.group_by(["key0", "key1"], sort=True).agg({"value": "sum"})
 
     assert rows(out) == [("a", 1, 2.0), ("b", 2, 4.0)]
+
+
+def test_groupby_convenience_numeric_methods():
+    t = CTable(SalesRow, new_data=DATA)  # SalesRow / DATA are not defined in this hunk
+
+    assert rows(t.group_by("city", sort=True).sum("qty")) == rows(  # each shortcut must match agg()
+        t.group_by("city", sort=True).agg({"qty": "sum"})
+    )
+    assert rows(t.group_by("city", sort=True).mean("qty")) == rows(
+        t.group_by("city", sort=True).agg({"qty": "mean"})
+    )
+    assert rows(t.group_by("city", sort=True).min("qty")) == rows(
+        t.group_by("city", sort=True).agg({"qty": "min"})
+    )
+    assert rows(t.group_by("city", sort=True).max("qty")) == rows(
+        t.group_by("city", sort=True).agg({"qty": "max"})
+    )

From 9f964c9ebcddca829aabf89979f57ac2acddb712 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 13:42:17 +0200
Subject: [PATCH 15/17] Updated plan with current state

---
 plans/ctable-groupby.md | 395 ++++++++++++++++++++--------------------
 1 file changed, 197 insertions(+), 198 deletions(-)

diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
index 308c47b8..522fba47 100644
--- a/plans/ctable-groupby.md
+++ b/plans/ctable-groupby.md
@@ -1,12 +1,12 @@
 # CTable `group_by` implementation plan — status
 
 This document started as the implementation plan for `CTable.group_by()`.  The
-initial plan has now been executed through Phase 3.  The remaining sections
-record what was completed and what is future work.
+core API and several optimized execution paths are now implemented.  The first
+section records completed work; the final section lists remaining future work.
 
 ## Completed
 
-### Public API
+### Public `CTable.group_by()` API
 
 Implemented:
 
@@ -30,13 +30,27 @@ Implemented API decisions:
 - `dropna=True` is the default; `dropna=False` keeps null/NaN key groups.
 - No top-level `CTable.size()` or `CTable.count()` was added.
 
-### Phase 1: Python/NumPy implementation
+### Convenience group-by methods
+
+Implemented group-by convenience methods:
+
+```python
+t.group_by("city").sum("sales")
+t.group_by("city").mean("sales")
+t.group_by("city").min("sales")
+t.group_by("city").max("sales")
+```
+
+These are equivalent to `agg({column: op})` and complement `size()` and
+`count(column)`.
+
+### Generic Python/NumPy implementation
 
 Implemented files:
 
 ```text
 src/blosc2/ctable.py      # CTable.group_by()
-src/blosc2/groupby.py     # CTableGroupBy and NumPy fallback engine
+src/blosc2/groupby.py     # CTableGroupBy, NumPy fallback, public group_reduce()
 ```
 
 Implemented functionality:
@@ -51,9 +65,9 @@ Implemented functionality:
 - Supports empty inputs.
 - Falls back to the generic NumPy path for unsupported optimized cases.
 
-### Phase 1 benchmark harness
+### Benchmark harness
 
-Implemented:
+Implemented/extended:
 
 ```text
 bench/ctable/groupby.py
@@ -63,31 +77,65 @@ The benchmark can vary:
 
 - row count;
 - group cardinality;
-- key dtype via `--key-dtype int32|int64|float32|float64`;
+- key dtype via `--key-dtype`, including signed integer, unsigned integer, and float dtypes;
 - dictionary keys via `--dictionary`;
 - operation via `--op size|count|sum|mean|min|max`;
 - sorted output;
 - chunk size;
+- multi-key mode via `--multi-key` and `--groups2`;
 - optional persistent `urlpath`;
 - optional pandas comparison.
 
-### Phase 2: optimized paths
+Float key benchmarks now generate non-integral repeated labels by default so
+`float32`/`float64` runs exercise the arbitrary-float hash path instead of the
+integral-float dense path.
 
-Implemented dense NumPy and Cython fast paths for the main benchmark-driven
-cases.
+### Dedicated Cython extension
 
-Optimized cases currently include:
+Implemented:
+
+```text
+src/blosc2/groupby_ext.pyx
+```
 
-- compact non-negative integer/dictionary-code single keys in Python/NumPy dense mode;
-- `int32 key + float64 sum` in Cython;
-- dictionary-code key + `float64 sum` in Cython;
-- integral `float64 key + float64 sum` in Cython;
-- integral `float32 key + float64 sum` in Cython.
+Build integration:
 
-These paths avoid the original per-chunk `np.unique(..., return_inverse=True)`
-and Python dictionary merge overhead for compact single-key sum workloads.
+- `CMakeLists.txt` builds, links, and installs `groupby_ext`.
+- Group-by kernels were removed from `indexing_ext.pyx`.
+- `src/blosc2/groupby.py` imports `blosc2.groupby_ext` for optimized kernels.
 
-Representative benchmark improvements observed during implementation:
+Rationale:
+
+- Group-by kernels are analytics/query execution code, not indexing internals.
+- A dedicated extension keeps separation of concerns cleaner as optimized paths grow.
+
+### Dense integer-key Cython coverage
+
+Implemented fused dense integer-key Cython kernels covering:
+
+- `int8`, `uint8`;
+- `int16`, `uint16`;
+- `int32`, `uint32`;
+- `int64`, `uint64`.
+
+Implemented a dense integer/dictionary-code Cython path for:
+
+- `size`;
+- `count`;
+- `sum`;
+- `mean` via sum/count;
+- `min`;
+- `max`.
+
+Additional details:
+
+- Uses compact dense accumulator arrays.
+- Falls back for negative non-null keys and non-compact key ranges.
+- Supports float64 value kernels with NaN-null skipping where applicable.
+- Supports int64-normalized integer/bool value kernels for `sum`, `min`, and `max`.
+- Tracks key presence separately so groups with all-null values are emitted correctly.
+
+Representative benchmark improvements observed during earlier optimization:
 
 ```text
 50M rows, 5k int32 groups, float64 sum:
@@ -102,52 +150,96 @@ Representative benchmark improvements observed during implementation:
   Cython dense path:  ~0.24–0.25 s
 ```
 
-### Phase 3: separate Cython extension
+### Arbitrary float-key hash path
 
-Implemented:
+Implemented a conservative Cython open-addressing hash path for single
+`float32`/`float64` keys with float value aggregations.
 
-```text
-src/blosc2/groupby_ext.pyx
-```
+Implemented operations:
 
-Build integration:
+- `size`;
+- `count`;
+- `sum`;
+- `mean`;
+- `min`;
+- `max`.
 
-- `CMakeLists.txt` builds, links, and installs `groupby_ext`.
-- Group-by kernels were removed from `indexing_ext.pyx`.
-- `src/blosc2/groupby.py` imports `blosc2.groupby_ext` for optimized kernels.
+Implemented semantics:
 
-Rationale:
+- `dropna=True`: skip NaN keys;
+- `dropna=False`: all NaN keys form one group;
+- `+0.0` and `-0.0` are normalized into the same group;
+- infinities are valid groups through regular float bit hashing;
+- NaN-null float values are skipped for value aggregations.
 
-- Group-by kernels are analytics/query execution code, not indexing internals.
-- A dedicated extension keeps separation of concerns cleaner as optimized paths grow.
+### Two-key Cython hash path
 
-### Phase 4: fused integer-key kernels and more Cython aggregations
+Implemented a conservative Cython hash path for two-key group-by when both keys
+are integer or dictionary-code-backed columns.
 
-Implemented:
+Implemented behavior:
+
+- normalizes keys to `int64`;
+- hashes `(key0, key1)` directly;
+- supports `size`, `count`, `sum`, `mean`, `min`, and `max` for supported float
+  value reductions;
+- avoids structured-array packing and per-chunk `np.unique` for common two-key
+  categorical/integer workloads;
+- falls back for unsupported cases.
+
+Benchmarks showed that this path is functionally useful but still leaves room
+for future optimization: partial states are merged in Python, and the generic
+hash kernel maintains more state than a specialized one-operation kernel needs.
+
+### Public `blosc2.group_reduce()`
+
+Implemented a conservative public array API for single-key grouped reductions
+without requiring a `CTable`.
+
+Implemented API:
+
+```python
+groups, result = blosc2.group_reduce(
+    keys, values=None, op="size", sort=False, dropna=True
+)
+```
+
+Implemented operations:
+
+- `size`;
+- `count`;
+- `sum`;
+- `mean`;
+- `min`;
+- `max`.
 
-- fused dense integer-key Cython kernels covering `int8`, `uint8`,
-  `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64` keys;
-- dense integer/dictionary-code Cython path for `size`, `count`, `sum`,
-  `mean`, `min`, and `max`;
-- float64 value kernels with NaN-null skipping where applicable;
-- int64 value kernels for integer/bool `sum`, `min`, and `max`;
-- shared key-presence tracking so groups with all-null values are still
-  emitted correctly for `count` and nullable float aggregations.
+Implemented semantics:
+
+- returns plain NumPy arrays `(groups, result)`;
+- `size` counts rows and does not require values;
+- `count` counts non-NaN values;
+- `dropna=True` skips NaN float keys;
+- `dropna=False` keeps one normalized NaN group;
+- `+0.0` and `-0.0` are normalized by the float hash path;
+- optimized dense integer and arbitrary-float hash paths are used
+  opportunistically, with a NumPy/Python fallback.
 
 ### Documentation
 
-Implemented user-facing documentation in:
+Implemented/updated user-facing documentation in:
 
 ```text
 doc/reference/ctable.rst
+doc/reference/reduction_functions.rst
 ```
 
 Documented:
 
 - `CTable.group_by()`;
 - returned `CTableGroupBy` object;
-- `size()`, `count()`, `agg()`;
-- examples for row counts, non-null counts, and sums.
+- `size()`, `count()`, `sum()`, `mean()`, `min()`, `max()`, `agg()`;
+- examples for row counts, non-null counts, and grouped reductions;
+- public `blosc2.group_reduce()`.
 
 ### Tests
 
@@ -155,6 +247,7 @@ Implemented/extended:
 
 ```text
 tests/ctable/test_groupby.py
+tests/test_group_reduce.py
 ```
 
 Coverage includes:
@@ -162,6 +255,7 @@ Coverage includes:
 - `size()` row counts;
 - `count(column)` non-null counts;
 - `agg()` with `sum`, `mean`, `min`, `max`, `count`;
+- convenience `sum`, `mean`, `min`, `max` methods;
 - `agg({"*": "size"})`;
 - multi-key group-by;
 - dictionary string keys;
@@ -169,106 +263,66 @@ Coverage includes:
 - empty tables;
 - `dropna=True` / `dropna=False` behavior;
 - bad engine rejection;
-- optimized int32/dictionary/float32/float64 sum variants;
-- fallback for non-integral float keys;
-- fallback for NaN float-key group when `dropna=False`.
-
-Validation during implementation:
-
-```text
-pytest tests/ctable/test_groupby.py -q
-pytest tests/ctable -q
-```
-
-The full CTable suite passed after Phase 3.
+- optimized integer/dictionary/float variants;
+- arbitrary float-key hash behavior;
+- public `group_reduce()` behavior and input validation.
 
 ## Current design summary
 
-The implementation now has three execution layers:
+The implementation now has these execution layers:
 
 1. Generic chunked NumPy path:
-   - supports the broadest set of Phase-1 semantics;
-   - uses per-chunk local grouping and merges partials globally.
+   - broadest semantics;
+   - per-chunk local grouping and global merge.
 2. Dense NumPy single-key path:
-   - for compact non-negative integer/dictionary-code keys;
-   - uses dense accumulator arrays where possible.
-3. Cython single-key sum kernels:
-   - for the most important compact/integral key + `float64 sum` cases;
-   - lives in `groupby_ext.pyx`.
+   - compact non-negative integer/dictionary-code keys;
+   - dense accumulator arrays.
+3. Cython dense integer-key path:
+   - fused integer key dtypes;
+   - `size`, `count`, `sum`, `mean`, `min`, `max`.
+4. Cython integral-float dense path:
+   - integral `float32`/`float64` keys for selected dense cases.
+5. Cython arbitrary-float hash path:
+   - non-integral `float32`/`float64` keys;
+   - normalized NaN and signed-zero semantics.
+6. Cython two-key hash path:
+   - two integer/dictionary-code-backed keys;
+   - float value reductions.
+7. Public array-level `blosc2.group_reduce()`:
+   - uses optimized kernels opportunistically without requiring a `CTable`.
 
 All optimized paths are conservative and fall back to the generic engine when
 unsupported data or semantics are encountered.
 
-## Deferred / future work
-
-### Integer-key Cython coverage
-
-Completed for dense compact single-key group-by with fused kernels covering
-`int8`, `uint8`, `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64`.
-The dense path still falls back for negative non-null keys and non-compact key
-ranges.
+## Future work
 
-### More Cython aggregations
-
-Completed for dense compact integer/dictionary-code single keys:
-
-- `size`;
-- `count`;
-- `sum`;
-- `mean` via sum/count;
-- `min`;
-- `max`.
+### Fuse multiple aggregations/value columns in Cython
 
-Remaining possible extensions in this area:
+Current optimized paths often run separate kernels or maintain generic state.
+Future work could:
 
-- fuse multiple aggregations/value columns into one Cython pass;
+- fuse multiple aggregations in a single pass;
+- support multiple value columns directly;
+- specialize kernels by requested operation so, for example, a `sum` workload
+  does not maintain min/max state;
 - broaden value-type coverage beyond float64/int64 normalized kernels.
 
-### Arbitrary float-key hash table
-
-Implemented a conservative Cython open-addressing hash path for single
-`float32`/`float64` keys with float value aggregations.  It supports `size`,
-`count`, `sum`, `mean`, `min`, and `max` for supported single-value-column
-queries and falls back otherwise.
-
-Implemented semantics:
-
-- `dropna=True`: skip NaN keys;
-- `dropna=False`: all NaN keys form one group;
-- `+0.0` and `-0.0` are normalized into the same group;
-- infinities are valid groups through regular float bit hashing;
-- NaN-null float values are skipped for value aggregations.
-
-Remaining possible extensions:
-
-- support non-float value columns in the hash path without normalizing through
-  float64;
-- fuse multiple value columns directly in one hash-table pass;
-- add explicit memory/cardinality safeguards for very high-cardinality floats.
+### Extend multi-key optimized paths
 
-### Multi-key Cython hash path
-
-Implemented a conservative Cython hash path for two-key group-by when both keys
-are integer or dictionary-code-backed columns.  The path normalizes keys to
-`int64`, hashes `(key0, key1)` directly, and supports `size`, `count`, `sum`,
-`mean`, `min`, and `max` for supported float value reductions.  This avoids
-structured-array packing and per-chunk `np.unique` for common two-key
-categorical/integer workloads.
-
-Remaining possible extensions:
+Current Cython multi-key support is intentionally narrow.
+Future work could:
 
 - support more than two key columns;
-- support float/string fixed-width key components directly;
-- support non-float value columns without normalizing value reductions through
-  float64;
-- fuse/merge multi-key states across chunks fully in Cython rather than via the
-  existing Python accumulator merge.
+- support float key components directly;
+- support fixed-width string/bytes key components directly;
+- support non-float value columns without normalizing reductions through float64;
+- merge multi-key states fully in Cython instead of via Python accumulators;
+- add a dense two-integer-key path for compact Cartesian key domains.
 
-### FULL-index sorted group-by path
+### Revisit FULL-index sorted group-by only with a better design
 
-A FULL index on a single grouping key can provide sorted positions.  A prototype
-Python/NumPy sorted-scan path was implemented and then reverted after
-benchmarking because it was not competitive with the existing dense/hash paths.
+A Python/NumPy FULL-index sorted-scan prototype was implemented and reverted
+after benchmarking because it was not competitive with existing dense/hash paths.
 
 Prototype behavior:
 
@@ -303,8 +357,8 @@ Why the prototype was slow:
 
 - value aggregations required many scattered gathers from the original value
   column, one gathered position set per key run;
-- scattered value access is much less cache/compression friendly than the
-  existing sequential dense/hash scans;
+- scattered value access is much less cache/compression friendly than existing
+  sequential dense/hash scans;
 - the implementation still had Python-level run processing and result merging;
 - FULL index build cost is substantial unless the index already exists and can
   be reused many times;
@@ -312,54 +366,13 @@ Why the prototype was slow:
 
 Recommendation:
 
-- keep this deferred for now;
+- keep this deferred;
 - do not reintroduce a Python-level FULL-index value-aggregation path;
 - revisit only with a block-aware/Cython reducer that batches sorted positions
   by physical chunks/blocks, or as part of a broader high-cardinality/sparse-key
   strategy;
-- if revisited, benchmark primarily against high-cardinality non-compact keys
-  and already-existing FULL indexes, not compact dense-key workloads.
-
-### Public `blosc2.group_reduce()`
-
-Implemented a conservative public `blosc2.group_reduce()` array API for
-single-key grouped reductions without requiring a `CTable`.
-
-Implemented API:
-
-```python
-groups, result = blosc2.group_reduce(
-    keys, values=None, op="size", sort=False, dropna=True
-)
-```
-
-Implemented operations:
-
-- `size`;
-- `count`;
-- `sum`;
-- `mean`;
-- `min`;
-- `max`.
-
-Implemented semantics:
-
-- returns plain NumPy arrays `(groups, result)`;
-- `size` counts rows and does not require values;
-- `count` counts non-NaN values;
-- `dropna=True` skips NaN float keys;
-- `dropna=False` keeps one normalized NaN group;
-- `+0.0` and `-0.0` are normalized by the float hash path;
-- optimized dense integer and arbitrary-float hash paths are used
-  opportunistically, with a NumPy/Python fallback.
-
-Remaining possible extensions:
-
-- multi-key public API;
-- multiple aggregations in one call;
-- multiple value columns;
-- NDArray/chunked execution without eager NumPy conversion;
-- optional CTable/persistent output.
+- if revisited, benchmark primarily against high-cardinality non-compact keys
+  and already-existing FULL indexes, not compact dense-key workloads.
 
 ### High-cardinality and memory strategy
 
@@ -368,7 +381,7 @@ Future safeguards/features:
 - estimate cardinality from early chunks;
 - expose/keep an internal memory limit;
 - fall back to sort-based grouping when cardinality is too high;
-- use FULL indexes when available;
+- possibly use FULL indexes when available and demonstrably beneficial;
 - eventually implement partitioned hash group-by with spill-to-disk.
 
 ### Parallel execution
@@ -379,36 +392,22 @@ Potential future optimization:
 - merge accumulators at chunk or partition boundaries;
 - coordinate with Blosc2 decompression threading to avoid oversubscription.
 
-### Additional API conveniences
-
-Implemented group-by convenience methods:
-
-```python
-t.group_by("city").sum("sales")
-t.group_by("city").mean("sales")
-t.group_by("city").min("sales")
-t.group_by("city").max("sales")
-```
+### Extend public `blosc2.group_reduce()`
 
-These are equivalent to `agg({column: op})` and complement the already-existing
-`size()` and `count(column)` group-by methods.
+Remaining possible extensions:
 
-Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are
-clearly justified outside group-by.
+- multi-key public API;
+- multiple aggregations in one call;
+- multiple value columns;
+- NDArray/chunked execution without eager NumPy conversion;
+- optional CTable/persistent output.
 
 ### Persistent output
 
-The current result is an in-memory `CTable`.  Future work may add an `out=` or
-`urlpath=` option for persistent grouped output.
+The current `CTable.group_by()` result is an in-memory `CTable`.  Future work may
+add an `out=` or `urlpath=` option for persistent grouped output.
 
-## Related untracked files reviewed
+### Top-level CTable count/size semantics
 
-During cleanup, these untracked files were reviewed and found non-duplicative:
-
-```text
-tests/ctable/test_nested_append.py
-bench/ctable/bench_nested_filter_index.py
-```
-
-They cover direct nested append/extend correctness and nested flat-vs-dotted
-performance comparisons, respectively, and are worth keeping/adding separately.
+Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are
+clearly justified outside group-by.

From 74b6ee246b7bca20fd28a9ea3fecb49fc512d4e9 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 13:48:24 +0200
Subject: [PATCH 16/17] Implemented persistent grouped output via `urlpath=`

---
 doc/reference/ctable.rst     |  5 ++++
 plans/ctable-groupby.md      | 30 ++++++++++++++++++++----
 src/blosc2/groupby.py        | 45 ++++++++++++++++++++++--------------
 tests/ctable/test_groupby.py | 23 ++++++++++++++++++
 4 files changed, 81 insertions(+), 22 deletions(-)

diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst
index 8ccf74b8..ad6f5b9c 100644
--- a/doc/reference/ctable.rst
+++ b/doc/reference/ctable.rst
@@ -263,6 +263,11 @@ one row per group::
     mins = by_city.min("sales")
     maxs = by_city.max("sales")
 
+Grouped results are in-memory by default.  Pass ``urlpath=`` to a terminal
+method to write the result as a persistent :class:`CTable`::
+
+    totals = by_city.sum("sales", urlpath="sales_by_city.b2d")
+
 .. autoclass:: CTableGroupBy
     :members: size, count, sum, mean, min, max, agg
 
diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
index 522fba47..4587f748 100644
--- a/plans/ctable-groupby.md
+++ b/plans/ctable-groupby.md
@@ -21,7 +21,9 @@ Implemented API decisions:
 
 - `CTable.group_by(...)` returns a lightweight `CTableGroupBy` facade.
 - `CTableGroupBy` is a deferred operation builder, not a `CTable` view.
-- Terminal methods materialize a new in-memory `CTable`.
+- Terminal methods materialize a new `CTable`.
+- Results are in-memory by default and persistent when terminal methods receive
+  `urlpath=`.
 - Aggregate result columns are suffixed as `<input>_<agg>`.
 - `GroupBy.size()` means row count per group / SQL `COUNT(*)`.
 - `GroupBy.count(column)` means non-null count / SQL `COUNT(column)`.
@@ -44,6 +46,22 @@ t.group_by("city").max("sales")
 These are equivalent to `agg({column: op})` and complement `size()` and
 `count(column)`.
 
+### Persistent grouped output
+
+Implemented `urlpath=` on group-by terminal methods for persistent grouped
+output:
+
+```python
+t.group_by("city").size(urlpath="counts.b2d")
+t.group_by("city").count("sales", urlpath="sales_count.b2d")
+t.group_by("city").sum("sales", urlpath="sales_sum.b2d")
+t.group_by("city").agg({"sales": "mean"}, urlpath="sales_mean.b2d")
+```
+
+The result remains an in-memory `CTable` when `urlpath` is omitted.  When
+`urlpath` is supplied, the result is written with `mode="w"` semantics (any
+existing table at that path is overwritten) and returned as a persistent `CTable`.
+
 ### Generic Python/NumPy implementation
 
 Implemented files:
@@ -238,6 +256,7 @@ Documented:
 - `CTable.group_by()`;
 - returned `CTableGroupBy` object;
 - `size()`, `count()`, `sum()`, `mean()`, `min()`, `max()`, `agg()`;
+- persistent grouped output via `urlpath=`;
 - examples for row counts, non-null counts, and grouped reductions;
 - public `blosc2.group_reduce()`.
 
@@ -265,7 +284,8 @@ Coverage includes:
 - bad engine rejection;
 - optimized integer/dictionary/float variants;
 - arbitrary float-key hash behavior;
-- public `group_reduce()` behavior and input validation.
+- public `group_reduce()` behavior and input validation;
+- persistent grouped output via `urlpath=`.
 
 ## Current design summary
 
@@ -402,10 +422,10 @@ Remaining possible extensions:
 - NDArray/chunked execution without eager NumPy conversion;
 - optional CTable/persistent output.
 
-### Persistent output
+### Output storage controls
 
-The current `CTable.group_by()` result is an in-memory `CTable`.  Future work may
-add an `out=` or `urlpath=` option for persistent grouped output.
+Future extensions may add a more general `out=` parameter or expose additional
+storage/cparams controls for grouped output.
 
 ### Top-level CTable count/size semantics
 
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
index c5f14221..8e245548 100644
--- a/src/blosc2/groupby.py
+++ b/src/blosc2/groupby.py
@@ -93,52 +93,53 @@ def __init__(
             if table._is_list_column(col_info) or table._is_varlen_scalar_column(col_info):
                 raise TypeError(f"Cannot group by variable-length/list column {name!r} in Phase 1")
 
-    def size(self):
+    def size(self, *, urlpath: str | None = None):
         """Return row counts per group as a new :class:`CTable`.
 
         This is equivalent to SQL ``COUNT(*)``: it counts rows in each group and
-        is independent of null values in non-key columns.
+        is independent of null values in non-key columns.  If *urlpath* is
+        provided, the result is written as a persistent CTable at that path.
         """
-        return self._execute([_AggSpec(None, "size", "size")])
+        return self._execute([_AggSpec(None, "size", "size")], urlpath=urlpath)
 
-    def count(self, column: str):
+    def count(self, column: str, *, urlpath: str | None = None):
         """Return non-null value counts for *column* per group.
 
         This is equivalent to SQL ``COUNT(column)`` and to
         ``group_by(...).agg({column: "count"})``.
         """
         col = self.table._logical_to_physical_name(column)
-        return self._execute([_AggSpec(col, "count", f"{col}_count")])
+        return self._execute([_AggSpec(col, "count", f"{col}_count")], urlpath=urlpath)
 
-    def sum(self, column: str):
+    def sum(self, column: str, *, urlpath: str | None = None):
         """Return sums of *column* per group.
 
         This is equivalent to ``group_by(...).agg({column: "sum"})``.
         """
-        return self.agg({column: "sum"})
+        return self.agg({column: "sum"}, urlpath=urlpath)
 
-    def mean(self, column: str):
+    def mean(self, column: str, *, urlpath: str | None = None):
         """Return means of *column* per group.
 
         This is equivalent to ``group_by(...).agg({column: "mean"})``.
         """
-        return self.agg({column: "mean"})
+        return self.agg({column: "mean"}, urlpath=urlpath)
 
-    def min(self, column: str):
+    def min(self, column: str, *, urlpath: str | None = None):
         """Return minimum values of *column* per group.
 
         This is equivalent to ``group_by(...).agg({column: "min"})``.
         """
-        return self.agg({column: "min"})
+        return self.agg({column: "min"}, urlpath=urlpath)
 
-    def max(self, column: str):
+    def max(self, column: str, *, urlpath: str | None = None):
         """Return maximum values of *column* per group.
 
         This is equivalent to ``group_by(...).agg({column: "max"})``.
         """
-        return self.agg({column: "max"})
+        return self.agg({column: "max"}, urlpath=urlpath)
 
-    def agg(self, aggregations: Mapping[str, str | Sequence[str]]):
+    def agg(self, aggregations: Mapping[str, str | Sequence[str]], *, urlpath: str | None = None):
         """Aggregate value columns per group.
 
         Parameters
@@ -150,7 +151,7 @@ def agg(self, aggregations: Mapping[str, str | Sequence[str]]):
             ``{"*": "size"``}.
         """
         specs = self._normalize_aggs(aggregations)
-        return self._execute(specs)
+        return self._execute(specs, urlpath=urlpath)
 
     def _normalize_aggs(self, aggregations: Mapping[str, str | Sequence[str]]) -> list[_AggSpec]:
         if not isinstance(aggregations, Mapping) or not aggregations:
@@ -201,8 +202,16 @@ def _validate_value_column(self, name: str) -> None:
         if self.table._is_dictionary_column(col_info):
             raise TypeError(f"Cannot aggregate dictionary column {name!r} in Phase 1")
 
-    def _execute(self, specs: list[_AggSpec]):
+    def _execute(self, specs: list[_AggSpec], *, urlpath: str | None = None):
         self._validate_output_names(specs)
+        old_result_urlpath = getattr(self, "_result_urlpath", None)
+        self._result_urlpath = urlpath
+        try:
+            return self._execute_with_result_target(specs)
+        finally:
+            self._result_urlpath = old_result_urlpath
+
+    def _execute_with_result_target(self, specs: list[_AggSpec]):
         fast = self._try_execute_cython_dense_int_key(specs)
         if fast is not None:
             return fast
@@ -1327,7 +1336,9 @@ def _build_result(self, rows: list[dict[str, Any]], specs: list[_AggSpec]):
             fields.append((name, _python_type_for_spec(schema_specs[name]), b2_field(schema_specs[name])))
         row_type = dataclasses.make_dataclass("CTableGroupByRow", fields)
         data = {name: [row[name] for row in rows] for name in columns}
-        return CTable(row_type, new_data=data, expected_size=max(len(rows), 1), validate=False)
+        urlpath = getattr(self, "_result_urlpath", None)
+        kwargs = {"urlpath": str(urlpath), "mode": "w"} if urlpath is not None else {}
+        return CTable(row_type, new_data=data, expected_size=max(len(rows), 1), validate=False, **kwargs)
 
     def _validate_output_names(self, specs: list[_AggSpec]) -> None:
         names = self.keys + [s.output_col for s in specs]
diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py
index ae884c42..fec1fca4 100644
--- a/tests/ctable/test_groupby.py
+++ b/tests/ctable/test_groupby.py
@@ -375,3 +375,26 @@ def test_groupby_convenience_numeric_methods():
     assert rows(t.group_by("city", sort=True).max("qty")) == rows(
         t.group_by("city", sort=True).agg({"qty": "max"})
     )
+
+
+def test_groupby_persistent_output_urlpath(tmp_path):
+    t = CTable(SalesRow, new_data=DATA)
+    path = tmp_path / "grouped.b2d"
+
+    out = t.group_by("city", sort=True).agg({"qty": "sum"}, urlpath=path)
+    out.close()
+
+    reopened = CTable.open(str(path), mode="r")
+    assert reopened.col_names == ["city", "qty_sum"]
+    assert rows(reopened) == [("Berlin", 6), ("Paris", 7), ("Rome", 8)]
+
+
+def test_groupby_persistent_output_urlpath_on_convenience_method(tmp_path):
+    t = CTable(SalesRow, new_data=DATA)
+    path = tmp_path / "grouped_mean.b2d"
+
+    out = t.group_by("city", sort=True).mean("qty", urlpath=path)
+    out.close()
+
+    reopened = CTable.open(str(path), mode="r")
+    assert rows(reopened) == [("Berlin", 6.0), ("Paris", 7 / 3), ("Rome", 4.0)]

From 4e137133382b5098044fa847b261d6adee533cd8 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 15 May 2026 14:28:45 +0200
Subject: [PATCH 17/17] Protect tests when pyarrow is not installed

---
 tests/ctable/test_object_spec.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/ctable/test_object_spec.py b/tests/ctable/test_object_spec.py
index 30fa7258..9b6154dc 100644
--- a/tests/ctable/test_object_spec.py
+++ b/tests/ctable/test_object_spec.py
@@ -42,6 +42,7 @@ def test_object_column_persistence(tmp_path):
 
 
 def test_object_column_to_arrow_raises():
+    pytest.importorskip("pyarrow")
     t = CTable(ObjectRow)
     t.append([1, {"x": 1}])
     with pytest.raises(TypeError, match="ObjectSpec columns"):