diff --git a/CMakeLists.txt b/CMakeLists.txt
index 734a4fea..ff4425a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,6 +51,13 @@ add_custom_command(
   DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/indexing_ext.pyx"
   VERBATIM)
 
+add_custom_command(
+  OUTPUT groupby_ext.c
+  COMMAND Python::Interpreter -m cython
+          "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/groupby_ext.pyx" --output-file groupby_ext.c
+  DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/groupby_ext.pyx"
+  VERBATIM)
+
 # ...and add it to the target
 Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI)
 target_sources(blosc2_ext PRIVATE src/blosc2/matmul_kernels.c)
@@ -59,10 +66,12 @@ if(UNIX)
   target_link_libraries(blosc2_ext PRIVATE ${CMAKE_DL_LIBS})
 endif()
 Python_add_library(indexing_ext MODULE indexing_ext.c WITH_SOABI)
+Python_add_library(groupby_ext MODULE groupby_ext.c WITH_SOABI)
 
 # We need to link against NumPy
 target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 target_link_libraries(indexing_ext PRIVATE Python::NumPy)
+target_link_libraries(groupby_ext PRIVATE Python::NumPy)
 
 # Fetch and build miniexpr library
 include(FetchContent)
@@ -99,6 +108,7 @@ endif()
 
 target_compile_features(blosc2_ext PRIVATE c_std_11)
 target_compile_features(indexing_ext PRIVATE c_std_11)
+target_compile_features(groupby_ext PRIVATE c_std_11)
 if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang")
     execute_process(
         COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir
@@ -173,7 +183,7 @@ endif()
 
 # Python extension -> site-packages/blosc2
 install(
-  TARGETS blosc2_ext indexing_ext
+  TARGETS blosc2_ext indexing_ext groupby_ext
   LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/blosc2
 )
 
diff --git a/bench/ctable/bench_nested_filter_index.py b/bench/ctable/bench_nested_filter_index.py
new file mode 100644
index 00000000..71d44112
--- /dev/null
+++ b/bench/ctable/bench_nested_filter_index.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+"""Benchmark nested leaf filter/index performance vs flat columns.
+
+Compares a CTable with flat column names against an equivalent one that uses
+dotted nested column names (physically stored under hierarchical _cols/ paths).
+Both tables hold the same data; each filter/index/aggregate operation is timed
+on both to show the overhead (or absence thereof) introduced by the nested layout.
+"""
+
+from __future__ import annotations
+
+import argparse
+import gc
+import time
+from dataclasses import dataclass
+
+import numpy as np
+
+import blosc2
+
+
+# ---------------------------------------------------------------------------
+# Schema helpers
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class FlatRow:
+    trip_begin_lon: float = blosc2.field(blosc2.float64())
+    trip_begin_lat: float = blosc2.field(blosc2.float64())
+    trip_end_lon: float = blosc2.field(blosc2.float64())
+    trip_end_lat: float = blosc2.field(blosc2.float64())
+    payment_fare: float = blosc2.field(blosc2.float64(ge=0))
+
+
+@dataclass
+class NestedRow:
+    """Same physical columns as FlatRow but accessed via dotted names after creation."""
+
+    trip_begin_lon: float = blosc2.field(blosc2.float64())
+    trip_begin_lat: float = blosc2.field(blosc2.float64())
+    trip_end_lon: float = blosc2.field(blosc2.float64())
+    trip_end_lat: float = blosc2.field(blosc2.float64())
+    payment_fare: float = blosc2.field(blosc2.float64(ge=0))
+
+
+def _build_data(n: int) -> dict:
+    rng = np.random.default_rng(42)
+    return {
+        "trip_begin_lon": rng.uniform(-88.0, -87.5, n).astype(np.float64),
+        "trip_begin_lat": rng.uniform(41.6, 42.0, n).astype(np.float64),
+        "trip_end_lon": rng.uniform(-88.0, -87.5, n).astype(np.float64),
+        "trip_end_lat": rng.uniform(41.6, 42.0, n).astype(np.float64),
+        "payment_fare": rng.uniform(3.0, 50.0, n).astype(np.float64),
+    }
+
+
+def _build_flat(data: dict, n: int) -> "blosc2.CTable":
+    t = blosc2.CTable(FlatRow, expected_size=n)
+    t.extend(data)
+    return t
+
+
+def _build_nested(data: dict, n: int) -> "blosc2.CTable":
+    t = blosc2.CTable(NestedRow, expected_size=n)
+    t.extend(data)
+    # Rename to dotted nested names
+    t.rename_column("trip_begin_lon", "trip.begin.lon")
+    t.rename_column("trip_begin_lat", "trip.begin.lat")
+    t.rename_column("trip_end_lon", "trip.end.lon")
+    t.rename_column("trip_end_lat", "trip.end.lat")
+    t.rename_column("payment_fare", "payment.fare")
+    return t
+
+
+# ---------------------------------------------------------------------------
+# Timing helper
+# ---------------------------------------------------------------------------
+
+
+def _timeit(fn, repeats: int = 5) -> float:
+    gc.collect()
+    times = []
+    for _ in range(repeats):
+        t0 = time.perf_counter()
+        fn()
+        times.append(time.perf_counter() - t0)
+    return min(times)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    """Time the same queries on a flat and a dotted-name CTable and print a table.
+
+    The ``where`` rows labelled "full scan" run before ``create_index`` is called.
+    The fare indexes are built right before the index-accelerated row and stay
+    in place for every operation timed after it.
+    """
+    p = argparse.ArgumentParser(description="Benchmark nested vs flat column filter/index/aggregate")
+    p.add_argument("--rows", type=int, default=1_000_000, help="Number of rows (default: 1M)")
+    p.add_argument("--repeats", type=int, default=5, help="Timing repeats (default: 5)")
+    args = p.parse_args()
+
+    N = args.rows
+    R = max(args.repeats, 1)  # _timeit takes min() over the runs, so it needs at least one
+
+    print(f"Building tables with {N:,} rows …")
+    data = _build_data(N)
+
+    # Both tables are loaded from the same underscore-named arrays; each builder
+    # receives its own dict wrapping them.
+    tf = _build_flat(dict(data), N)
+    tn = _build_nested(dict(data), N)
+    print(f"  flat   col_names: {tf.col_names}")
+    print(f"  nested col_names: {tn.col_names}")
+    print()
+
+    header = f"{'Operation':<45} {'flat (ms)':>12} {'nested (ms)':>13} {'ratio':>8}"
+    print(header)
+    print("-" * len(header))
+
+    def bench(label, flat_fn, nested_fn):
+        """Print the best-of-R time in ms for each callable and the nested/flat ratio."""
+        t_flat = _timeit(flat_fn, R) * 1000
+        t_nested = _timeit(nested_fn, R) * 1000
+        ratio = t_nested / t_flat if t_flat > 0 else float("nan")
+        print(f"{label:<45} {t_flat:>12.3f} {t_nested:>13.3f} {ratio:>8.3f}x")
+
+    bench(
+        "where (string expr, full scan)",
+        lambda: tf.where("payment_fare > 20"),
+        lambda: tn.where("payment.fare > 20"),
+    )
+
+    bench(
+        "where (string expr, full scan, nrows)",
+        lambda: tf.where("payment_fare > 20").nrows,
+        lambda: tn.where("payment.fare > 20").nrows,
+    )
+
+    bench(
+        "where (LazyExpr, full scan)",
+        lambda: tf.where(tf["payment_fare"] > 20),
+        lambda: tn.where(tn["payment.fare"] > 20),
+    )
+
+    # Build the fare indexes only now: the ``where`` rows above ran before any
+    # create_index call, so this row is the first one that can use an index.
+    tf.create_index("payment_fare")
+    tn.create_index("payment.fare")
+
+    bench(
+        "where (auto index-accelerated, nrows)",
+        lambda: tf.where("payment_fare > 20").nrows,
+        lambda: tn.where("payment.fare > 20").nrows,
+    )
+
+    bench(
+        "column mean (full scan)",
+        lambda: tf["payment_fare"].mean(),
+        lambda: tn["payment.fare"].mean(),
+    )
+
+    bench(
+        "column sum (full scan)",
+        lambda: tf["payment_fare"].sum(),
+        lambda: tn["payment.fare"].sum(),
+    )
+
+    bench(
+        "column min (full scan)",
+        lambda: tf["trip_begin_lon"].min(),
+        lambda: tn["trip.begin.lon"].min(),
+    )
+
+    bench(
+        "multi-column where (string expr, nrows)",
+        lambda: tf.where("trip_begin_lon > -87.7 and payment_fare > 10").nrows,
+        lambda: tn.where("trip.begin.lon > -87.7 and payment.fare > 10").nrows,
+    )
+
+    bench(
+        "sort_by (single leaf)",
+        lambda: tf.sort_by("payment_fare"),
+        lambda: tn.sort_by("payment.fare"),
+    )
+
+    print()
+    print("ratio < 1 means nested is faster; ratio > 1 means flat is faster.")
+    print("Ratios close to 1.0 indicate the nested path adds negligible overhead.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py
new file mode 100644
index 00000000..41929563
--- /dev/null
+++ b/bench/ctable/groupby.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+"""Phase-1 CTable group_by benchmark.
+
+Examples
+--------
+python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --op sum
+python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --key-dtype float64 --op sum
+# float key dtypes generate non-integral repeated labels to exercise the float hash path
+python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas
+python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --groups2 100 --multi-key --op sum
+"""
+
+from __future__ import annotations
+
+import argparse
+import dataclasses
+import time
+from pathlib import Path
+
+import numpy as np
+
+import blosc2
+
+
+def parse_int(text: str) -> int:
+    return int(text.replace("_", ""))
+
+
+def build_row_type(dictionary: bool, key_dtype: str, multi_key: bool):
+    if dictionary and multi_key:
+
+        @dataclasses.dataclass
+        class Row:
+            key0: str = blosc2.field(blosc2.dictionary())
+            key1: int = blosc2.field(blosc2.int32())
+            value: float = blosc2.field(blosc2.float64())
+
+    elif dictionary:
+
+        @dataclasses.dataclass
+        class Row:
+            key: str = blosc2.field(blosc2.dictionary())
+            value: float = blosc2.field(blosc2.float64())
+
+    elif key_dtype in {"int8", "uint8", "int16", "uint16", "int32", "uint32", "int64", "uint64"}:
+        key_spec = getattr(blosc2, key_dtype)()
+
+        if multi_key:
+
+            @dataclasses.dataclass
+            class Row:
+                key0: int = blosc2.field(key_spec)
+                key1: int = blosc2.field(key_spec)
+                value: float = blosc2.field(blosc2.float64())
+
+        else:
+
+            @dataclasses.dataclass
+            class Row:
+                key: int = blosc2.field(key_spec)
+                value: float = blosc2.field(blosc2.float64())
+
+    elif key_dtype in {"float32", "float64"}:
+        key_spec = blosc2.float32() if key_dtype == "float32" else blosc2.float64()
+
+        if multi_key:
+
+            @dataclasses.dataclass
+            class Row:
+                key0: float = blosc2.field(key_spec)
+                key1: float = blosc2.field(key_spec)
+                value: float = blosc2.field(blosc2.float64())
+
+        else:
+
+            @dataclasses.dataclass
+            class Row:
+                key: float = blosc2.field(key_spec)
+                value: float = blosc2.field(blosc2.float64())
+
+    else:  # pragma: no cover - argparse choices prevent this
+        raise ValueError(f"unsupported key dtype {key_dtype!r}")
+
+    return Row
+
+
+def make_key_data(key_codes: np.ndarray, dictionary: bool, key_dtype: str):
+    """Turn integer group codes (non-negative, as make_data draws them) into key labels."""
+    if dictionary:
+        return np.asarray([f"k{code}" for code in key_codes], dtype=object)
+    if key_dtype in {"float32", "float64"}:
+        # +0.25 makes labels non-integral: hash path, not the dense integral-float path.
+        return (key_codes.astype(np.float64) + 0.25).astype(np.dtype(key_dtype))
+    if key_codes.size and int(key_codes.max()) > np.iinfo(np.dtype(key_dtype)).max:
+        raise ValueError(f"group codes exceed the {key_dtype} range; lower --groups/--groups2")
+    return key_codes.astype(np.dtype(key_dtype), copy=False)
+
+
+def make_data(nrows: int, ngroups: int, ngroups2: int, dictionary: bool, key_dtype: str, multi_key: bool, seed: int):
+    rng = np.random.default_rng(seed)
+    key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32)
+    values = rng.random(nrows, dtype=np.float64)
+    if not multi_key:
+        return {"key": make_key_data(key_codes, dictionary, key_dtype), "value": values}
+
+    key2_codes = rng.integers(0, ngroups2, size=nrows, dtype=np.int32)
+    key0 = make_key_data(key_codes, dictionary, key_dtype)
+    key1_dtype = "int32" if dictionary else key_dtype
+    key1 = make_key_data(key2_codes, False, key1_dtype)
+    return {"key0": key0, "key1": key1, "value": values}
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--rows", type=parse_int, default=10_000_000)
+    parser.add_argument("--groups", type=parse_int, default=1_000)
+    parser.add_argument("--groups2", type=parse_int, default=None, help="Number of groups for key1 with --multi-key")
+    parser.add_argument("--chunk-size", type=parse_int, default=None)
+    parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key")
+    parser.add_argument(
+        "--key-dtype",
+        choices=[
+            "int8",
+            "uint8",
+            "int16",
+            "uint16",
+            "int32",
+            "uint32",
+            "int64",
+            "uint64",
+            "float32",
+            "float64",
+        ],
+        default="int32",
+        help="Physical dtype for non-dictionary keys. Float keys are generated as non-integral repeated labels.",
+    )
+    parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum")
+    parser.add_argument("--multi-key", action="store_true", help="Group by two keys: key0 and key1")
+    parser.add_argument("--sort", action="store_true")
+    parser.add_argument("--pandas", action="store_true", help="Also run a pandas comparison if available")
+    parser.add_argument("--urlpath", type=Path, default=None, help="Optional persistent CTable path")
+    parser.add_argument("--seed", type=int, default=0)
+    args = parser.parse_args()
+
+    groups2 = args.groups if args.groups2 is None else args.groups2
+    print(
+        f"rows={args.rows:,} groups={args.groups:,} groups2={groups2:,} multi_key={args.multi_key} "
+        f"dictionary={args.dictionary} key_dtype={args.key_dtype} op={args.op} sort={args.sort} "
+        f"chunk_size={args.chunk_size} urlpath={args.urlpath}"
+    )
+
+    data = make_data(args.rows, args.groups, groups2, args.dictionary, args.key_dtype, args.multi_key, args.seed)
+    Row = build_row_type(args.dictionary, args.key_dtype, args.multi_key)
+
+    kwargs = {}
+    if args.urlpath is not None:
+        kwargs.update(urlpath=str(args.urlpath), mode="w")
+
+    t0 = time.perf_counter()
+    table = blosc2.CTable(Row, new_data=data, expected_size=args.rows, **kwargs)
+    build_time = time.perf_counter() - t0
+    print(f"ctable_build_seconds={build_time:.6f}")
+
+    t0 = time.perf_counter()
+    group_keys = ["key0", "key1"] if args.multi_key else "key"
+    gb = table.group_by(group_keys, sort=args.sort, chunk_size=args.chunk_size)
+    if args.op == "size":
+        out = gb.size()
+    elif args.op == "count":
+        out = gb.count("value")
+    else:
+        out = gb.agg({"value": args.op})
+    elapsed = time.perf_counter() - t0
+    print(f"ctable_groupby_seconds={elapsed:.6f}")
+    print(f"result_rows={out.nrows:,}")
+
+    if args.pandas:
+        try:
+            import pandas as pd
+        except ImportError:
+            print("pandas_unavailable=true")
+        else:
+            df = pd.DataFrame(data)
+            t0 = time.perf_counter()
+            if args.op == "size":
+                pdf = df.groupby(group_keys, sort=args.sort).size()
+            elif args.op == "count":
+                pdf = df.groupby(group_keys, sort=args.sort)["value"].count()
+            else:
+                pdf = df.groupby(group_keys, sort=args.sort)["value"].agg(args.op)
+            pandas_elapsed = time.perf_counter() - t0
+            print(f"pandas_groupby_seconds={pandas_elapsed:.6f}")
+            print(f"pandas_result_rows={len(pdf):,}")
+
+    table.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst
index 12e99ea0..ad6f5b9c 100644
--- a/doc/reference/ctable.rst
+++ b/doc/reference/ctable.rst
@@ -233,6 +233,7 @@ When a NumPy structured array is needed, materialize explicitly::
     CTable.sample
     CTable.sort_by
     CTable.iter_sorted
+    CTable.group_by
 
 .. automethod:: CTable.where
 .. automethod:: CTable.view
@@ -242,6 +243,33 @@ When a NumPy structured array is needed, materialize explicitly::
 .. automethod:: CTable.sample
 .. automethod:: CTable.sort_by
 .. automethod:: CTable.iter_sorted
+.. automethod:: CTable.group_by
+
+
+Group-by reductions
+-------------------
+
+:meth:`CTable.group_by` returns a lightweight deferred group-by object.  It is
+not a table view; methods such as :meth:`~blosc2.CTableGroupBy.size`,
+:meth:`~blosc2.CTableGroupBy.count`, :meth:`~blosc2.CTableGroupBy.sum`, and
+:meth:`~blosc2.CTableGroupBy.agg` materialize a new :class:`CTable` with
+one row per group::
+
+    by_city = t.group_by("city", sort=True)
+    counts = by_city.size()                  # row count per city / COUNT(*)
+    non_null = by_city.count("sales")        # non-null sales count / COUNT(sales)
+    totals = by_city.sum("sales")            # equivalent to agg({"sales": "sum"})
+    means = by_city.mean("sales")
+    mins = by_city.min("sales")
+    maxs = by_city.max("sales")
+
+Grouped results are in-memory by default.  Pass ``urlpath=`` to a terminal
+method to write the result as a persistent :class:`CTable`::
+
+    totals = by_city.sum("sales", urlpath="sales_by_city.b2d")
+
+.. autoclass:: CTableGroupBy
+    :members: size, count, sum, mean, min, max, agg
 
 
 Mutations
diff --git a/doc/reference/reduction_functions.rst b/doc/reference/reduction_functions.rst
index 4c21c150..5122807b 100644
--- a/doc/reference/reduction_functions.rst
+++ b/doc/reference/reduction_functions.rst
@@ -14,6 +14,7 @@ Reduction operations can be used with any of :ref:`NDArray <NDArray>`, :ref:`C2A
     argmax
     argmin
     count_nonzero
+    group_reduce
     cumulative_prod
     cumulative_sum
     max
@@ -31,6 +32,7 @@ Reduction operations can be used with any of :ref:`NDArray <NDArray>`, :ref:`C2A
 .. autofunction:: blosc2.argmax
 .. autofunction:: blosc2.argmin
 .. autofunction:: blosc2.count_nonzero
+.. autofunction:: blosc2.group_reduce
 .. autofunction:: blosc2.cumulative_prod
 .. autofunction:: blosc2.cumulative_sum
 .. autofunction:: blosc2.max
diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md
new file mode 100644
index 00000000..4587f748
--- /dev/null
+++ b/plans/ctable-groupby.md
@@ -0,0 +1,433 @@
+# CTable `group_by` implementation plan — status
+
+This document started as the implementation plan for `CTable.group_by()`.  The
+core API and several optimized execution paths are now implemented.  It records
+completed work, summarizes the current design, and lists remaining future work.
+
+## Completed
+
+### Public `CTable.group_by()` API
+
+Implemented:
+
+```python
+t.group_by("city").size()
+t.group_by("city").count("sales")
+t.group_by("city").agg({"sales": "sum"})
+t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"]})
+```
+
+Implemented API decisions:
+
+- `CTable.group_by(...)` returns a lightweight `CTableGroupBy` facade.
+- `CTableGroupBy` is a deferred operation builder, not a `CTable` view.
+- Terminal methods materialize a new `CTable`.
+- Results are in-memory by default and persistent when terminal methods receive
+  `urlpath=`.
+- Aggregate result columns are suffixed as `<input>_<agg>`.
+- `CTableGroupBy.size()` means row count per group / SQL `COUNT(*)`.
+- `CTableGroupBy.count(column)` means non-null count / SQL `COUNT(column)`.
+- `CTableGroupBy.agg({"col": "count"})` is equivalent to `CTableGroupBy.count("col")`.
+- `sort=False` is the fast default; `sort=True` sorts output by group keys.
+- `dropna=True` is the default; `dropna=False` keeps null/NaN key groups.
+- No top-level `CTable.size()` or `CTable.count()` was added.
+
+### Convenience group-by methods
+
+Implemented group-by convenience methods:
+
+```python
+t.group_by("city").sum("sales")
+t.group_by("city").mean("sales")
+t.group_by("city").min("sales")
+t.group_by("city").max("sales")
+```
+
+These are equivalent to `agg({column: op})` and complement `size()` and
+`count(column)`.
+
+### Persistent grouped output
+
+Implemented `urlpath=` on group-by terminal methods for persistent grouped
+output:
+
+```python
+t.group_by("city").size(urlpath="counts.b2d")
+t.group_by("city").count("sales", urlpath="sales_count.b2d")
+t.group_by("city").sum("sales", urlpath="sales_sum.b2d")
+t.group_by("city").agg({"sales": "mean"}, urlpath="sales_mean.b2d")
+```
+
+The result remains an in-memory `CTable` when `urlpath` is omitted.  When
+`urlpath` is supplied, the grouped result is written with `mode="w"` semantics
+and returned as the newly created persistent `CTable`.
+
+### Generic Python/NumPy implementation
+
+Implemented files:
+
+```text
+src/blosc2/ctable.py      # CTable.group_by()
+src/blosc2/groupby.py     # CTableGroupBy, NumPy fallback, public group_reduce()
+```
+
+Implemented functionality:
+
+- Chunked, columnar traversal.
+- Reads only group keys, aggregation value columns, and `_valid_rows`.
+- Handles live rows, views, and deleted rows.
+- Supports fixed-width scalar keys and dictionary-encoded string keys.
+- Dictionary keys group by codes and decode only for result materialization.
+- Supports `size`, `count`, `sum`, `mean`, `min`, `max`.
+- Supports multi-key group-by via structured NumPy keys.
+- Supports empty inputs.
+- Falls back to the generic NumPy path for unsupported optimized cases.
+
+### Benchmark harness
+
+Implemented/extended:
+
+```text
+bench/ctable/groupby.py
+```
+
+The benchmark can vary:
+
+- row count;
+- group cardinality;
+- key dtype via `--key-dtype` including integer, unsigned integer, and float dtypes;
+- dictionary keys via `--dictionary`;
+- operation via `--op size|count|sum|mean|min|max`;
+- sorted output;
+- chunk size;
+- multi-key mode via `--multi-key` and `--groups2`;
+- optional persistent `urlpath`;
+- optional pandas comparison.
+
+Float key benchmarks now generate non-integral repeated labels by default so
+`float32`/`float64` runs exercise the arbitrary-float hash path instead of the
+integral-float dense path.
+
+### Dedicated Cython extension
+
+Implemented:
+
+```text
+src/blosc2/groupby_ext.pyx
+```
+
+Build integration:
+
+- `CMakeLists.txt` builds, links, and installs `groupby_ext`.
+- Group-by kernels were removed from `indexing_ext.pyx`.
+- `src/blosc2/groupby.py` imports `blosc2.groupby_ext` for optimized kernels.
+
+Rationale:
+
+- Group-by kernels are analytics/query execution code, not indexing internals.
+- A dedicated extension keeps separation of concerns cleaner as optimized paths grow.
+
+### Dense integer-key Cython coverage
+
+Implemented fused dense integer-key Cython kernels covering:
+
+- `int8`, `uint8`;
+- `int16`, `uint16`;
+- `int32`, `uint32`;
+- `int64`, `uint64`.
+
+Implemented dense integer/dictionary-code Cython path for:
+
+- `size`;
+- `count`;
+- `sum`;
+- `mean` via sum/count;
+- `min`;
+- `max`.
+
+Additional details:
+
+- Uses compact dense accumulator arrays.
+- Falls back for negative non-null keys and non-compact key ranges.
+- Supports float64 value kernels with NaN-null skipping where applicable.
+- Supports int64-normalized integer/bool value kernels for `sum`, `min`, and `max`.
+- Tracks key presence separately so groups with all-null values are emitted correctly.
+
+Representative benchmark improvements observed during earlier optimization:
+
+```text
+50M rows, 5k int32 groups, float64 sum:
+  generic/early path: ~0.47 s
+  Cython dense path:  ~0.20–0.22 s
+
+50M rows, 5k float64 integral groups, float64 sum:
+  generic path:       ~5.51 s
+  Cython dense path:  ~0.27–0.29 s
+
+50M rows, 5k float32 integral groups, float64 sum:
+  Cython dense path:  ~0.24–0.25 s
+```
+
+### Arbitrary float-key hash path
+
+Implemented a conservative Cython open-addressing hash path for single
+`float32`/`float64` keys with float value aggregations.
+
+Implemented operations:
+
+- `size`;
+- `count`;
+- `sum`;
+- `mean`;
+- `min`;
+- `max`.
+
+Implemented semantics:
+
+- `dropna=True`: skip NaN keys;
+- `dropna=False`: all NaN keys form one group;
+- `+0.0` and `-0.0` are normalized into the same group;
+- infinities are valid groups through regular float bit hashing;
+- NaN-null float values are skipped for value aggregations.
+
+### Two-key Cython hash path
+
+Implemented a conservative Cython hash path for two-key group-by when both keys
+are integer or dictionary-code-backed columns.
+
+Implemented behavior:
+
+- normalizes keys to `int64`;
+- hashes `(key0, key1)` directly;
+- supports `size`, `count`, `sum`, `mean`, `min`, and `max` for supported float
+  value reductions;
+- avoids structured-array packing and per-chunk `np.unique` for common two-key
+  categorical/integer workloads;
+- falls back for unsupported cases.
+
+Benchmarks showed this is functionally useful but still leaves room for future
+optimization because partial states are merged in Python and the generic hash
+kernel maintains more state than a specialized one-operation kernel needs.
+
+### Public `blosc2.group_reduce()`
+
+Implemented a conservative public array API for single-key grouped reductions
+without requiring a `CTable`.
+
+Implemented API:
+
+```python
+groups, result = blosc2.group_reduce(
+    keys, values=None, op="size", sort=False, dropna=True
+)
+```
+
+Implemented operations:
+
+- `size`;
+- `count`;
+- `sum`;
+- `mean`;
+- `min`;
+- `max`.
+
+Implemented semantics:
+
+- returns plain NumPy arrays `(groups, result)`;
+- `size` counts rows and does not require values;
+- `count` counts non-NaN values;
+- `dropna=True` skips NaN float keys;
+- `dropna=False` keeps one normalized NaN group;
+- `+0.0` and `-0.0` are normalized by the float hash path;
+- optimized dense integer and arbitrary-float hash paths are used
+  opportunistically, with a NumPy/Python fallback.
+
+### Documentation
+
+Implemented/updated user-facing documentation in:
+
+```text
+doc/reference/ctable.rst
+doc/reference/reduction_functions.rst
+```
+
+Documented:
+
+- `CTable.group_by()`;
+- returned `CTableGroupBy` object;
+- `size()`, `count()`, `sum()`, `mean()`, `min()`, `max()`, `agg()`;
+- persistent grouped output via `urlpath=`;
+- examples for row counts, non-null counts, and grouped reductions;
+- public `blosc2.group_reduce()`.
+
+### Tests
+
+Implemented/extended:
+
+```text
+tests/ctable/test_groupby.py
+tests/test_group_reduce.py
+```
+
+Coverage includes:
+
+- `size()` row counts;
+- `count(column)` non-null counts;
+- `agg()` with `sum`, `mean`, `min`, `max`, `count`;
+- convenience `sum`, `mean`, `min`, `max` methods;
+- `agg({"*": "size"})`;
+- multi-key group-by;
+- dictionary string keys;
+- views and deleted rows;
+- empty tables;
+- `dropna=True` / `dropna=False` behavior;
+- bad engine rejection;
+- optimized integer/dictionary/float variants;
+- arbitrary float-key hash behavior;
+- public `group_reduce()` behavior and input validation;
+- persistent grouped output via `urlpath=`.
+
+## Current design summary
+
+The implementation now has these execution layers:
+
+1. Generic chunked NumPy path:
+   - broadest semantics;
+   - per-chunk local grouping and global merge.
+2. Dense NumPy single-key path:
+   - compact non-negative integer/dictionary-code keys;
+   - dense accumulator arrays.
+3. Cython dense integer-key path:
+   - fused integer key dtypes;
+   - `size`, `count`, `sum`, `mean`, `min`, `max`.
+4. Cython integral-float dense path:
+   - integral `float32`/`float64` keys for selected dense cases.
+5. Cython arbitrary-float hash path:
+   - non-integral `float32`/`float64` keys;
+   - normalized NaN and signed-zero semantics.
+6. Cython two-key hash path:
+   - two integer/dictionary-code-backed keys;
+   - float value reductions.
+7. Public array-level `blosc2.group_reduce()`:
+   - uses optimized kernels opportunistically without requiring a `CTable`.
+
+All optimized paths are conservative and fall back to the generic engine when
+unsupported data or semantics are encountered.
+
+## Future work
+
+### Fuse multiple aggregations/value columns in Cython
+
+Current optimized paths often run separate kernels or maintain generic state.
+Future work could:
+
+- fuse multiple aggregations in a single pass;
+- support multiple value columns directly;
+- specialize kernels by requested operation so, for example, a `sum` workload
+  does not maintain min/max state;
+- broaden value-type coverage beyond the float64 and int64-normalized kernels.
+
+### Extend multi-key optimized paths
+
+Current Cython multi-key support is intentionally narrow.
+Future work could:
+
+- support more than two key columns;
+- support float key components directly;
+- support fixed-width string/bytes key components directly;
+- support non-float value columns without normalizing reductions through float64;
+- merge multi-key states fully in Cython instead of via Python accumulators;
+- add a dense two-integer-key path for compact Cartesian key domains.
+
+### Revisit FULL-index sorted group-by only with a better design
+
+A Python/NumPy FULL-index sorted-scan prototype was implemented and reverted
+after benchmarking because it was not competitive with existing dense/hash paths.
+
+Prototype behavior:
+
+```text
+read sorted values/positions from FULL sidecars
+scan contiguous key runs
+respect _valid_rows
+reduce each run
+emit sorted groups naturally
+```
+
+Observed benchmark results on 50M rows / 5k compact groups:
+
+```text
+float64 key, sum, sort=True, FULL index:
+  index build: ~6.2 s
+  group_by:    ~104 s
+
+int64 key, sum, sort=True, FULL index:
+  index build: ~5.5 s
+  group_by:    ~102 s
+
+int64 key, size, sort=True, FULL index:
+  index build: ~5.5 s
+  group_by:    ~0.45 s
+
+int64 key, size, sort=False, no FULL index:
+  group_by:    ~0.14 s
+```
+
+Why the prototype was slow:
+
+- value aggregations required many scattered gathers from the original value
+  column, one gathered position set per key run;
+- scattered value access is much less cache/compression friendly than existing
+  sequential dense/hash scans;
+- the implementation still had Python-level run processing and result merging;
+- FULL index build cost is substantial unless the index already exists and can
+  be reused many times;
+- compact integer-key workloads are already ideal for dense accumulator arrays.
+
+Recommendation:
+
+- keep this deferred;
+- do not reintroduce a Python-level FULL-index value-aggregation path;
+- revisit only with a block-aware/Cython reducer that batches sorted positions
+  by physical chunks/blocks, or as part of a broader high-cardinality/sparse-key
+  strategy;
+- benchmark primarily against high-cardinality non-compact keys and
+  already-existing FULL indexes, not compact dense-key workloads.
+
+### High-cardinality and memory strategy
+
+Future safeguards/features:
+
+- estimate cardinality from early chunks;
+- expose/keep an internal memory limit;
+- fall back to sort-based grouping when cardinality is too high;
+- possibly use FULL indexes when available and demonstrably beneficial;
+- eventually implement partitioned hash group-by with spill-to-disk.
+
+### Parallel execution
+
+Potential future optimization:
+
+- per-thread local accumulators;
+- merge accumulators at chunk or partition boundaries;
+- coordinate with Blosc2 decompression threading to avoid oversubscription.
+
+### Extend public `blosc2.group_reduce()`
+
+Remaining possible extensions:
+
+- multi-key public API;
+- multiple aggregations in one call;
+- multiple value columns;
+- NDArray/chunked execution without eager NumPy conversion;
+- optional CTable/persistent output.
+
+### Output storage controls
+
+Future extensions may add a more general `out=` parameter or expose additional
+storage/cparams controls for grouped output.
+
+### Top-level CTable count/size semantics
+
+Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are
+clearly justified outside group-by.
diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py
index 8a587c06..ee258655 100644
--- a/src/blosc2/__init__.py
+++ b/src/blosc2/__init__.py
@@ -628,6 +628,7 @@ def _raise(exc):
 # Note: bool, bytes, string shadow builtins in the blosc2 namespace by design —
 # they are schema spec constructors (b2.bool(), b2.bytes(), etc.).
 from .ctable import DEFAULT_NULL_POLICY, Column, CTable, NullPolicy, get_null_policy, null_policy
+from .groupby import CTableGroupBy, group_reduce
 from .ndarray import (
     abs,
     acos,
@@ -801,9 +802,12 @@ def _raise(exc):
     "uint64",
     "vlbytes",
     "vlstring",
+    # Grouped reductions
+    "group_reduce",
     # Classes
     "C2Array",
     "CParams",
+    "CTableGroupBy",
     "Batch",
     "BatchArray",
     # Enums
diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py
index 1f80dc3d..56a89583 100644
--- a/src/blosc2/ctable.py
+++ b/src/blosc2/ctable.py
@@ -20,7 +20,7 @@
 import re
 import shutil
 from collections import namedtuple
-from collections.abc import Iterable, Mapping
+from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import MISSING, dataclass
 from dataclasses import field as dataclass_field
 from textwrap import TextWrapper
@@ -2338,12 +2338,15 @@ def _init_columns(
                 )
                 continue
             if self._is_dictionary_column(col):
-                self._cols[col.name] = storage.create_dictionary_column(
+                dict_col = storage.create_dictionary_column(
                     col.name,
                     spec=col.spec,
                     cparams=col_storage.get("cparams"),
                     dparams=col_storage.get("dparams"),
                 )
+                if len(dict_col.codes) < expected_size:
+                    dict_col.resize((expected_size,))
+                self._cols[col.name] = dict_col
                 continue
             # Recompute chunks/blocks using the actual dtype so that wide
             # string columns (e.g. U183642) don't produce multi-GB chunks.
@@ -3482,6 +3485,47 @@ def select(self, cols: list[str]) -> CTable:
         obj._col_widths = {name: self._col_widths[name] for name in cols if name in self._col_widths}
         return obj
 
+    def group_by(
+        self,
+        keys: str | Sequence[str],
+        *,
+        sort: bool = False,
+        dropna: bool = True,
+        engine: str = "auto",
+        chunk_size: int | None = None,
+    ):
+        """Return a deferred group-by object for this table.
+
+        Parameters
+        ----------
+        keys:
+            Column name or sequence of column names to group by.
+        sort:
+            If ``True``, sort the result by the group keys.  The default
+            ``False`` preserves the hash aggregation order and is usually
+            faster.
+        dropna:
+            If ``True`` (default), rows with null/NaN group keys are skipped.
+            If ``False``, null/NaN keys form their own group.
+        engine:
+            Execution engine.  Phase 1 accepts only ``"auto"`` (Cython fast paths
+            when applicable, otherwise the chunked NumPy implementation).
+        chunk_size:
+            Optional number of physical rows processed per chunk.
+
+        Returns
+        -------
+        CTableGroupBy
+            A lightweight deferred operation builder.  Call methods such as
+            ``.size()``, ``.count(column)`` or ``.agg({...})`` to materialize a
+            grouped result as a new :class:`CTable`.
+        """
+        if engine != "auto":
+            raise ValueError("Only engine='auto' is supported for group_by() in Phase 1")
+        from blosc2.groupby import CTableGroupBy
+
+        return CTableGroupBy(self, keys, sort=sort, dropna=dropna, engine=engine, chunk_size=chunk_size)
+
     def describe(self) -> None:
         """Print a per-column statistical summary.
 
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py
new file mode 100644
index 00000000..8e245548
--- /dev/null
+++ b/src/blosc2/groupby.py
@@ -0,0 +1,1765 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+"""Group-by support for :class:`blosc2.CTable`.
+
+This module contains the Phase-1 implementation: chunked NumPy code plus optional
+Cython fast paths.  It is deliberately columnar: only grouping columns,
+aggregation columns, and the live-row mask are read from the source table.
+"""
+
+from __future__ import annotations
+
+import copy
+import dataclasses
+import math
+import re
+from collections.abc import Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Literal
+
+import numpy as np
+
+from blosc2.schema import DictionarySpec, SchemaSpec, float64, int64
+from blosc2.schema import bool as b2_bool
+from blosc2.schema import field as b2_field
+
+if TYPE_CHECKING:  # pragma: no cover
+    from blosc2.ctable import CTable
+
+
+AggName = Literal["size", "count", "sum", "mean", "min", "max"]
+
+_IDENTIFIER_RE = re.compile(r"^[A-Za-z_]\w*$")
+_NAN_KEY = ("__blosc2_groupby_nan__",)
+
+
+@dataclasses.dataclass
+class _AggSpec:
+    """One requested aggregation: input column, operation and output column name."""
+
+    # Physical name of the value column; ``None`` for the row-count ("size") aggregation.
+    input_col: str | None
+    # Aggregation operation to apply.
+    op: AggName
+    # Name under which the aggregated values are stored in each result row.
+    output_col: str
+
+
+@dataclasses.dataclass
+class _AggState:
+    """Running state of one aggregation for one group, updated chunk by chunk."""
+
+    op: AggName
+    # Partial result accumulated so far; ``None`` until the first update.
+    value: Any = None
+    # Bookkeeping counter.  In the two-key hash path it is the number of summed
+    # values for sum/mean and the number of updates for min/max, so zero means
+    # "no value seen yet".  NOTE(review): confirm the other paths use it the same way.
+    count: int = 0
+
+
+class CTableGroupBy:
+    """Deferred group-by operation returned by :meth:`CTable.group_by`.
+
+    The object stores the source table, grouping keys, and execution options.
+    It is not a :class:`CTable` view and does not materialize grouped data until
+    a terminal method such as :meth:`size`, :meth:`count`, or :meth:`agg` is
+    called.
+    """
+
+    def __init__(
+        self,
+        table: CTable,
+        keys: str | Sequence[str],
+        *,
+        sort: bool = False,
+        dropna: bool = True,
+        engine: str = "auto",
+        chunk_size: int | None = None,
+    ) -> None:
+        if isinstance(keys, str):
+            keys = [keys]
+        else:
+            keys = list(keys)
+        if not keys:
+            raise ValueError("group_by() requires at least one key column")
+
+        self.table = table
+        self.keys = [table._logical_to_physical_name(k) for k in keys]
+        self.sort = bool(sort)
+        self.dropna = bool(dropna)
+        self.engine = engine
+        self.chunk_size = chunk_size
+
+        for name in self.keys:
+            if name in table._computed_cols:
+                raise NotImplementedError("group_by() over computed columns is not supported yet")
+            if name not in table._cols:
+                raise KeyError(f"No column named {name!r}. Available: {table.col_names}")
+            col_info = table._schema.columns_by_name[name]
+            if table._is_list_column(col_info) or table._is_varlen_scalar_column(col_info):
+                raise TypeError(f"Cannot group by variable-length/list column {name!r} in Phase 1")
+
+    def size(self, *, urlpath: str | None = None):
+        """Return row counts per group as a new :class:`CTable`.
+
+        This is equivalent to SQL ``COUNT(*)``: it counts rows in each group and
+        is independent of null values in non-key columns.  The counts are stored
+        in an output column named ``size``.  If *urlpath* is provided, the result
+        is written as a persistent CTable at that path.
+        """
+        return self._execute([_AggSpec(None, "size", "size")], urlpath=urlpath)
+
+    def count(self, column: str, *, urlpath: str | None = None):
+        """Return non-null value counts for *column* per group.
+
+        This is equivalent to SQL ``COUNT(column)`` and to
+        ``group_by(...).agg({column: "count"})``.
+        """
+        col = self.table._logical_to_physical_name(column)
+        return self._execute([_AggSpec(col, "count", f"{col}_count")], urlpath=urlpath)
+
+    def sum(self, column: str, *, urlpath: str | None = None):
+        """Return sums of *column* per group.
+
+        This is equivalent to ``group_by(...).agg({column: "sum"})``.  A column
+        whose dtype is not boolean, integer or float raises :class:`TypeError`.
+        """
+        return self.agg({column: "sum"}, urlpath=urlpath)
+
+    def mean(self, column: str, *, urlpath: str | None = None):
+        """Return means of *column* per group.
+
+        This is equivalent to ``group_by(...).agg({column: "mean"})``.  A column
+        whose dtype is not boolean, integer or float raises :class:`TypeError`.
+        """
+        return self.agg({column: "mean"}, urlpath=urlpath)
+
+    def min(self, column: str, *, urlpath: str | None = None):
+        """Return minimum values of *column* per group.
+
+        This is equivalent to ``group_by(...).agg({column: "min"})``.  Complex
+        columns raise :class:`TypeError`.
+        """
+        return self.agg({column: "min"}, urlpath=urlpath)
+
+    def max(self, column: str, *, urlpath: str | None = None):
+        """Return maximum values of *column* per group.
+
+        This is equivalent to ``group_by(...).agg({column: "max"})``.  Complex
+        columns raise :class:`TypeError`.
+        """
+        return self.agg({column: "max"}, urlpath=urlpath)
+
+    def agg(self, aggregations: Mapping[str, str | Sequence[str]], *, urlpath: str | None = None):
+        """Aggregate value columns per group.
+
+        Parameters
+        ----------
+        aggregations:
+            Mapping from input column name to an aggregation name or list of
+            names.  Supported operations in Phase 1 are ``"count"``, ``"sum"``,
+            ``"mean"``, ``"min"``, ``"max"`` and the special row-count spelling
+            ``{"*": "size"}``.
+        urlpath:
+            Optional result target, forwarded to the executor.
+
+        Raises
+        ------
+        ValueError
+            If *aggregations* is empty or not a mapping, a column has no
+            operations, an operation is unsupported, or two outputs would share
+            a column name.
+        KeyError, TypeError, NotImplementedError
+            If a value column is unknown, has an unsupported kind/dtype for the
+            requested operation, or is a computed column.
+        """
+        specs = self._normalize_aggs(aggregations)
+        return self._execute(specs, urlpath=urlpath)
+
+    def _normalize_aggs(self, aggregations: Mapping[str, str | Sequence[str]]) -> list[_AggSpec]:
+        if not isinstance(aggregations, Mapping) or not aggregations:
+            raise ValueError("agg() requires a non-empty mapping")
+        specs: list[_AggSpec] = []
+        for col_name, ops in aggregations.items():
+            if isinstance(ops, str):
+                op_list = [ops]
+            else:
+                op_list = list(ops)
+            if not op_list:
+                raise ValueError(f"No aggregations specified for column {col_name!r}")
+
+            if col_name == "*":
+                for op in op_list:
+                    if op != "size":
+                        raise ValueError("Only the 'size' aggregation is supported for '*' input")
+                    specs.append(_AggSpec(None, "size", "size"))
+                continue
+
+            physical = self.table._logical_to_physical_name(col_name)
+            self._validate_value_column(physical)
+            for op in op_list:
+                if op not in {"count", "sum", "mean", "min", "max"}:
+                    raise ValueError(f"Unsupported aggregation {op!r}")
+                self._validate_agg_for_column(physical, op)
+                specs.append(_AggSpec(physical, op, f"{physical}_{op}"))
+        output_names = [s.output_col for s in specs]
+        if len(output_names) != len(set(output_names)):
+            raise ValueError("Aggregation output column names must be unique")
+        return specs
+
+    def _validate_agg_for_column(self, name: str, op: str) -> None:
+        dtype = getattr(self.table._schema.columns_by_name[name].spec, "dtype", None)
+        if op in {"sum", "mean"} and dtype is not None and dtype.kind not in "biuf":
+            raise TypeError(f"Aggregation {op!r} is not supported for column {name!r} with dtype {dtype}")
+        if op in {"min", "max"} and dtype is not None and dtype.kind == "c":
+            raise TypeError(f"Aggregation {op!r} is not supported for complex column {name!r}")
+
+    def _validate_value_column(self, name: str) -> None:
+        if name in self.table._computed_cols:
+            raise NotImplementedError("group_by() aggregations over computed columns are not supported yet")
+        if name not in self.table._cols:
+            raise KeyError(f"No column named {name!r}. Available: {self.table.col_names}")
+        col_info = self.table._schema.columns_by_name[name]
+        if self.table._is_list_column(col_info) or self.table._is_varlen_scalar_column(col_info):
+            raise TypeError(f"Cannot aggregate variable-length/list column {name!r} in Phase 1")
+        if self.table._is_dictionary_column(col_info):
+            raise TypeError(f"Cannot aggregate dictionary column {name!r} in Phase 1")
+
+    def _execute(self, specs: list[_AggSpec], *, urlpath: str | None = None):
+        """Run *specs* with *urlpath* installed as the temporary result target.
+
+        The output names are checked first (``_validate_output_names``).  The
+        previous ``_result_urlpath`` is restored when execution finishes,
+        whether it returns or raises.
+        """
+        self._validate_output_names(specs)
+        # ``__init__`` does not set ``_result_urlpath``, hence the getattr default.
+        old_result_urlpath = getattr(self, "_result_urlpath", None)
+        self._result_urlpath = urlpath
+        try:
+            return self._execute_with_result_target(specs)
+        finally:
+            self._result_urlpath = old_result_urlpath
+
+    def _execute_with_result_target(self, specs: list[_AggSpec]):
+        fast = self._try_execute_cython_dense_int_key(specs)
+        if fast is not None:
+            return fast
+        fast = self._try_execute_cython_two_int_key_hash(specs)
+        if fast is not None:
+            return fast
+        fast = self._try_execute_cython_i32_f64_sum(specs)
+        if fast is not None:
+            return fast
+        fast = self._try_execute_cython_float_integral_key_f64_sum(specs)
+        if fast is not None:
+            return fast
+        fast = self._try_execute_cython_float_hash(specs)
+        if fast is not None:
+            return fast
+        fast = self._try_execute_dense_single_int_key(specs)
+        if fast is not None:
+            return fast
+
+        acc: dict[Any, dict[str, _AggState]] = {}
+        key_values: dict[Any, tuple[Any, ...]] = {}
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        value_cols = sorted({s.input_col for s in specs if s.input_col is not None})
+
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+
+            raw_keys = [self._read_key_chunk(name, start, stop) for name in self.keys]
+            live_mask = valid.copy()
+            if self.dropna:
+                for name, values in zip(self.keys, raw_keys, strict=True):
+                    live_mask &= ~self._null_mask(name, values, is_key=True)
+            if not np.any(live_mask):
+                continue
+
+            keys_live = [np.asarray(values)[live_mask] for values in raw_keys]
+            n_live = len(keys_live[0])
+            if n_live == 0:
+                continue
+
+            unique_keys, inverse = self._factorize_keys(keys_live)
+            value_chunks = {
+                name: np.asarray(self.table._cols[name][start:stop])[live_mask] for name in value_cols
+            }
+
+            partials = self._compute_partials(specs, unique_keys, inverse, value_chunks)
+            display_keys = self._display_keys(unique_keys)
+            normalized_keys = self._normalized_keys(display_keys)
+            self._merge_partials(acc, key_values, normalized_keys, display_keys, partials, specs)
+
+        rows = self._final_rows(acc, key_values, specs)
+        return self._build_result(rows, specs)
+
+    def _try_execute_cython_two_int_key_hash(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython hash path for two integer/dictionary-code keys."""
+        if len(self.keys) != 2:
+            return None
+
+        key_arrays = []
+        key_is_dict = []
+        key_nulls = []
+        skip_key_nulls = []
+        for key_name in self.keys:
+            key_info = self.table._schema.columns_by_name[key_name]
+            if self.table._is_dictionary_column(key_info):
+                key_arrays.append(self.table._cols[key_name].codes)
+                key_is_dict.append(True)
+                key_nulls.append(int(key_info.spec.null_code))
+                skip_key_nulls.append(self.dropna)
+                continue
+            key_dtype = getattr(key_info.spec, "dtype", None)
+            if key_dtype is None or np.dtype(key_dtype).kind not in "biu":
+                return None
+            null_value = getattr(key_info.spec, "null_value", None)
+            if null_value is not None and not self.dropna:
+                return None
+            key_arrays.append(self.table._cols[key_name])
+            key_is_dict.append(False)
+            key_nulls.append(0 if null_value is None else int(null_value))
+            skip_key_nulls.append(self.dropna and null_value is not None)
+
+        value_cols = {s.input_col for s in specs if s.input_col is not None}
+        if len(value_cols) > 1:
+            return None
+        value_col = next(iter(value_cols), None)
+        if value_col is not None and any(s.op in {"sum", "mean", "min", "max"} for s in specs):
+            value_info = self.table._schema.columns_by_name[value_col]
+            value_dtype = getattr(value_info.spec, "dtype", None)
+            if value_dtype is None or np.dtype(value_dtype).kind != "f":
+                return None
+            null_value = getattr(value_info.spec, "null_value", None)
+            if null_value is not None and not (isinstance(null_value, float) and math.isnan(null_value)):
+                return None
+
+        try:
+            from blosc2 import groupby_ext
+        except ImportError:
+            return None
+        kernel = getattr(groupby_ext, "groupby_hash_i64x2_f64", None)
+        if kernel is None:
+            return None
+
+        acc: dict[Any, dict[str, _AggState]] = {}
+        key_values: dict[Any, tuple[Any, ...]] = {}
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+            key_chunks = [np.asarray(arr[start:stop], dtype=np.int64) for arr in key_arrays]
+            live = valid.copy()
+            for key_chunk, skip_null, null_value in zip(key_chunks, skip_key_nulls, key_nulls, strict=True):
+                if skip_null:
+                    live &= key_chunk != null_value
+            if not np.any(live):
+                continue
+
+            if value_col is None:
+                values = np.empty(len(valid), dtype=np.float64)
+                values_valid = np.zeros(len(valid), dtype=bool)
+                has_values = False
+            else:
+                raw_values = np.asarray(self.table._cols[value_col][start:stop])
+                values = np.ascontiguousarray(raw_values.astype(np.float64, copy=False))
+                values_valid = np.ascontiguousarray(~self._null_mask(value_col, raw_values, is_key=False))
+                has_values = True
+
+            (
+                out_k0,
+                out_k1,
+                row_counts,
+                value_counts,
+                sums,
+                mins,
+                maxs,
+                has_value,
+            ) = kernel(
+                np.ascontiguousarray(key_chunks[0]),
+                np.ascontiguousarray(key_chunks[1]),
+                values,
+                np.ascontiguousarray(live),
+                values_valid,
+                has_values,
+            )
+
+            for i, (code0, code1) in enumerate(zip(out_k0, out_k1, strict=True)):
+                display = []
+                norm_parts = []
+                for key_pos, code in enumerate((int(code0), int(code1))):
+                    if key_is_dict[key_pos]:
+                        value = self.table._cols[self.keys[key_pos]].decode(code)
+                    else:
+                        value = code
+                    display.append(value)
+                    norm_parts.append(_normalize_key_part(value))
+                norm_key = tuple(norm_parts)
+                states = acc.setdefault(norm_key, {})
+                key_values.setdefault(norm_key, tuple(display))
+                for spec in specs:
+                    state = states.setdefault(spec.output_col, _AggState(spec.op))
+                    if spec.op == "size":
+                        state.value = (0 if state.value is None else state.value) + int(row_counts[i])
+                    elif spec.op == "count":
+                        state.value = (0 if state.value is None else state.value) + int(value_counts[i])
+                    elif spec.op in {"sum", "mean"}:
+                        if has_value[i]:
+                            state.value = (0.0 if state.value is None else state.value) + float(sums[i])
+                            state.count += int(value_counts[i])
+                    elif spec.op == "min":
+                        if has_value[i]:
+                            value = float(mins[i])
+                            if state.count == 0 or value < state.value:
+                                state.value = value
+                            state.count += 1
+                    elif spec.op == "max" and has_value[i]:
+                        value = float(maxs[i])
+                        if state.count == 0 or value > state.value:
+                            state.value = value
+                        state.count += 1
+
+        rows = self._final_rows(acc, key_values, specs)
+        return self._build_result(rows, specs)
+
+    def _try_execute_cython_dense_int_key(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython fast path for one compact integer/dictionary key and dense aggregations."""
+        if len(self.keys) != 1:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        key_is_dict = self.table._is_dictionary_column(key_info)
+        if key_is_dict:
+            key_arr = self.table._cols[key_name].codes
+            key_dtype = np.dtype(np.int32)
+            skip_key_null = self.dropna
+            key_null = int(key_info.spec.null_code)
+        else:
+            key_arr = self.table._cols[key_name]
+            key_dtype = getattr(key_info.spec, "dtype", None)
+            if key_dtype is None:
+                return None
+            key_dtype = np.dtype(key_dtype)
+            if key_dtype.kind not in "biu":
+                return None
+            key_null_value = getattr(key_info.spec, "null_value", None)
+            skip_key_null = self.dropna and key_null_value is not None
+            key_null = 0 if key_null_value is None else int(key_null_value)
+
+        try:
+            from blosc2 import groupby_ext
+        except ImportError:
+            return None
+
+        descriptors = []
+        for spec in specs:
+            desc: dict[str, Any] = {"spec": spec, "op": spec.op}
+            if spec.op == "size":
+                kernel = getattr(groupby_ext, "groupby_dense_int_size_checked", None)
+                if kernel is None:
+                    return None
+                desc.update({"kernel": kernel, "state_kind": "counts"})
+                descriptors.append(desc)
+                continue
+
+            if spec.input_col is None:
+                return None
+            value_info = self.table._schema.columns_by_name[spec.input_col]
+            value_dtype = getattr(value_info.spec, "dtype", None)
+            if value_dtype is None:
+                return None
+            value_dtype = np.dtype(value_dtype)
+            null_value = getattr(value_info.spec, "null_value", None)
+
+            if spec.op == "count":
+                kernel = getattr(groupby_ext, "groupby_dense_int_count_checked", None)
+                if kernel is None:
+                    return None
+                desc.update({"kernel": kernel, "state_kind": "counts", "value_dtype": value_dtype})
+            elif spec.op in {"sum", "mean", "min", "max"}:
+                if value_dtype.kind == "f":
+                    skip_nan = isinstance(null_value, float) and math.isnan(null_value)
+                    if null_value is not None and not skip_nan:
+                        return None
+                    suffix = "sum" if spec.op == "sum" else spec.op
+                    kernel = getattr(groupby_ext, f"groupby_dense_int_f64_{suffix}_checked", None)
+                    if kernel is None:
+                        return None
+                    desc.update(
+                        {
+                            "kernel": kernel,
+                            "value_dtype": np.float64,
+                            "value_kind": "f64",
+                            "skip_nan": skip_nan,
+                        }
+                    )
+                elif value_dtype.kind in "biu":
+                    if null_value is not None:
+                        return None
+                    if spec.op == "mean":
+                        kernel = getattr(groupby_ext, "groupby_dense_int_f64_mean_checked", None)
+                        if kernel is None:
+                            return None
+                        desc.update(
+                            {
+                                "kernel": kernel,
+                                "value_dtype": np.float64,
+                                "value_kind": "f64",
+                                "skip_nan": False,
+                            }
+                        )
+                    else:
+                        kernel = getattr(groupby_ext, f"groupby_dense_int_i64_{spec.op}_checked", None)
+                        if kernel is None:
+                            return None
+                        desc.update(
+                            {
+                                "kernel": kernel,
+                                "value_dtype": np.int64,
+                                "value_kind": "i64",
+                                "skip_nan": False,
+                            }
+                        )
+                else:
+                    return None
+                if spec.op in {"sum", "min", "max"}:
+                    desc["state_kind"] = "value_present" if spec.op == "sum" else "extreme"
+                elif spec.op == "mean":
+                    desc["state_kind"] = "mean"
+            else:
+                return None
+            descriptors.append(desc)
+
+        compact_limit = 10_000_000
+        keys_present = np.zeros(0, dtype=bool)
+        states: dict[str, Any] = {}
+        for desc in descriptors:
+            spec = desc["spec"]
+            if desc["state_kind"] == "counts":
+                states[spec.output_col] = np.zeros(0, dtype=np.int64)
+            elif desc["state_kind"] == "mean":
+                states[spec.output_col] = (np.zeros(0, dtype=np.float64), np.zeros(0, dtype=np.int64))
+            elif desc["state_kind"] == "value_present" or desc["state_kind"] == "extreme":
+                dtype = np.float64 if desc["value_kind"] == "f64" else np.int64
+                states[spec.output_col] = (np.zeros(0, dtype=dtype), np.zeros(0, dtype=bool))
+
+        def ensure_size(size: int) -> bool:
+            nonlocal keys_present, states
+            if size > compact_limit:
+                return False
+            if size <= len(keys_present):
+                return True
+            old = len(keys_present)
+            keys_present = np.pad(keys_present, (0, size - old), constant_values=False)
+            for desc in descriptors:
+                spec = desc["spec"]
+                state = states[spec.output_col]
+                if desc["state_kind"] == "counts":
+                    states[spec.output_col] = np.pad(state, (0, size - old), constant_values=0)
+                else:
+                    first, second = state
+                    states[spec.output_col] = (
+                        np.pad(first, (0, size - old), constant_values=0),
+                        np.pad(
+                            second, (0, size - old), constant_values=False if second.dtype == np.bool_ else 0
+                        ),
+                    )
+            return True
+
+        def call_checked(kernel, *args) -> bool:
+            return int(kernel(*args)) == 0
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+            keys = np.asarray(key_arr[start:stop], dtype=np.int8 if key_dtype.kind == "b" else key_dtype)
+            keys = np.ascontiguousarray(keys)
+            valid = np.ascontiguousarray(valid)
+            live = valid.copy()
+            if skip_key_null:
+                live &= keys != key_null
+            if not np.any(live):
+                continue
+            live_keys = keys[live]
+            if np.min(live_keys) < 0:
+                return None
+            max_key = int(np.max(live_keys))
+            if not ensure_size(max_key + 1):
+                return None
+
+            for desc in descriptors:
+                spec = desc["spec"]
+                state = states[spec.output_col]
+                if spec.op == "size":
+                    if not call_checked(
+                        desc["kernel"], keys, valid, state, keys_present, skip_key_null, key_null
+                    ):
+                        return None
+                elif spec.op == "count":
+                    values = np.asarray(self.table._cols[spec.input_col][start:stop])
+                    values_valid = np.ascontiguousarray(
+                        ~self._null_mask(spec.input_col, values, is_key=False)
+                    )
+                    if not call_checked(
+                        desc["kernel"],
+                        keys,
+                        valid,
+                        values_valid,
+                        state,
+                        keys_present,
+                        skip_key_null,
+                        key_null,
+                    ):
+                        return None
+                elif spec.op == "sum":
+                    values = np.asarray(
+                        self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"]
+                    )
+                    values = np.ascontiguousarray(values)
+                    sums, value_present = state
+                    args = (
+                        keys,
+                        values,
+                        valid,
+                        sums,
+                        value_present,
+                        keys_present,
+                        skip_key_null,
+                        key_null,
+                    )
+                    if desc["value_kind"] == "f64":
+                        args = (*args, desc["skip_nan"])
+                    if not call_checked(desc["kernel"], *args):
+                        return None
+                elif spec.op == "mean":
+                    values = np.asarray(
+                        self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"]
+                    )
+                    values = np.ascontiguousarray(values)
+                    sums, counts = state
+                    if not call_checked(
+                        desc["kernel"],
+                        keys,
+                        values,
+                        valid,
+                        sums,
+                        counts,
+                        keys_present,
+                        skip_key_null,
+                        key_null,
+                        desc["skip_nan"],
+                    ):
+                        return None
+                elif spec.op in {"min", "max"}:
+                    values = np.asarray(
+                        self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"]
+                    )
+                    values = np.ascontiguousarray(values)
+                    extremes, has_value = state
+                    args = (
+                        keys,
+                        values,
+                        valid,
+                        extremes,
+                        has_value,
+                        keys_present,
+                        skip_key_null,
+                        key_null,
+                    )
+                    if desc["value_kind"] == "f64":
+                        args = (*args, desc["skip_nan"])
+                    if not call_checked(desc["kernel"], *args):
+                        return None
+
+        group_codes = np.nonzero(keys_present)[0]
+        if self.sort and key_is_dict:
+            group_codes = np.array(
+                sorted(
+                    group_codes,
+                    key=lambda code: _sortable_key_part(self.table._cols[key_name].decode(int(code))),
+                ),
+                dtype=group_codes.dtype,
+            )
+
+        rows = []
+        for code in group_codes:
+            key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else _python_scalar(code)
+            row = {key_name: key_value}
+            for desc in descriptors:
+                spec = desc["spec"]
+                state = states[spec.output_col]
+                if spec.op in {"size", "count"}:
+                    row[spec.output_col] = int(state[code])
+                elif spec.op == "sum":
+                    sums, value_present = state
+                    row[spec.output_col] = (
+                        _python_scalar(sums[code])
+                        if value_present[code]
+                        else _null_output_value(self._result_spec_for_agg(spec))
+                    )
+                elif spec.op == "mean":
+                    sums, counts = state
+                    row[spec.output_col] = (
+                        math.nan if counts[code] == 0 else float(sums[code]) / int(counts[code])
+                    )
+                elif spec.op in {"min", "max"}:
+                    extremes, has_value = state
+                    row[spec.output_col] = (
+                        _python_scalar(extremes[code])
+                        if has_value[code]
+                        else _null_output_value(self._result_spec_for_agg(spec))
+                    )
+            rows.append(row)
+        return self._build_result(rows, specs)
+
+    def _try_execute_cython_i32_f64_sum(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython fast path for one int32 key and one non-null float64 sum.
+
+        Applies only to a single key with a single ``sum`` aggregation and
+        ``sort`` disabled.  The key is either a dictionary column (grouped by its
+        integer codes) or a plain int32 column; the value column must be float64
+        without a ``null_value`` sentinel.
+
+        Returns the table built by ``_build_result``, or ``None`` when the
+        request is unsupported, ``blosc2.groupby_ext`` or its kernel is missing,
+        the kernel reports ``-1``, or more than ``compact_limit`` accumulator
+        slots would be required.
+        """
+        if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort:
+            return None
+        spec = specs[0]
+        if spec.input_col is None:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        value_info = self.table._schema.columns_by_name[spec.input_col]
+        if self.table._is_dictionary_column(key_info):
+            # Dictionary keys are grouped by their codes; the spec's null_code
+            # is the value skipped when dropna is set.
+            key_arr = self.table._cols[key_name].codes
+            key_is_dict = True
+            key_null = int(key_info.spec.null_code)
+            skip_key_null = self.dropna
+        else:
+            key_arr = self.table._cols[key_name]
+            key_is_dict = False
+            key_dtype = getattr(key_info.spec, "dtype", None)
+            if key_dtype != np.dtype(np.int32):
+                return None
+            # Without a null sentinel there is nothing to skip; 0 is only a
+            # placeholder because skip_key_null is then False.
+            key_null_value = getattr(key_info.spec, "null_value", None)
+            skip_key_null = self.dropna and key_null_value is not None
+            key_null = 0 if key_null_value is None else int(key_null_value)
+        value_dtype = getattr(value_info.spec, "dtype", None)
+        if value_dtype != np.dtype(np.float64) or getattr(value_info.spec, "null_value", None) is not None:
+            return None
+        # The extension is optional: a missing module or kernel means "not applicable".
+        try:
+            from blosc2 import groupby_ext
+        except ImportError:
+            return None
+        kernel = getattr(groupby_ext, "groupby_dense_i32_f64_sum_checked", None)
+        if kernel is None:
+            return None
+
+        # Accumulators are indexed by key value (or dictionary code).  They start
+        # empty and are grown on demand, never beyond compact_limit entries.
+        compact_limit = 10_000_000
+        sums = np.zeros(0, dtype=np.float64)
+        present = np.zeros(0, dtype=bool)
+
+        def ensure_size(size: int) -> bool:
+            # Grow both accumulators to ``size`` slots; False means the limit is exceeded.
+            nonlocal sums, present
+            if size > compact_limit:
+                return False
+            if size <= len(sums):
+                return True
+            old = len(sums)
+            sums = np.pad(sums, (0, size - old), constant_values=0)
+            present = np.pad(present, (0, size - old), constant_values=False)
+            return True
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            # Chunks without any live row contribute nothing.
+            if not np.any(valid):
+                continue
+            keys = np.asarray(key_arr[start:stop], dtype=np.int32)
+            values = np.asarray(self.table._cols[spec.input_col][start:stop], dtype=np.float64)
+            # NOTE(review): the trailing False is presumably the kernel's skip-NaN
+            # switch -- confirm against groupby_ext.
+            status = int(kernel(keys, values, valid, sums, present, skip_key_null, key_null, False))
+            # -1 makes this path give up (presumably the kernel found keys it
+            # cannot index, e.g. negative ones -- TODO confirm).
+            if status == -1:
+                return None
+            # A positive status is used as the accumulator length the kernel
+            # needs: grow, then retry the same chunk once.
+            if status > 0:
+                if not ensure_size(status):
+                    return None
+                status = int(kernel(keys, values, valid, sums, present, skip_key_null, key_null, False))
+                if status != 0:
+                    return None
+
+        # Emit one row per observed slot, in ascending key/code order.
+        rows = []
+        for code in np.nonzero(present)[0]:
+            key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else int(code)
+            rows.append({key_name: key_value, spec.output_col: float(sums[code])})
+        return self._build_result(rows, specs)
+
+    def _try_execute_cython_float_hash(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython hash path for one arbitrary float key.
+
+        This covers float32/float64 keys that are not suitable for dense
+        integral-key indexing.  It currently supports float value columns for
+        value reductions and falls back for unsupported mixed/multi-column cases.
+
+        Returns the table built by ``_build_result``, or ``None`` when the
+        request is unsupported (several keys, dictionary or non-float key, more
+        than one value column, a non-float value column for sum/mean/min/max, a
+        non-NaN ``null_value`` sentinel) or when ``blosc2.groupby_ext`` or its
+        ``groupby_hash_f64_f64`` kernel is unavailable.
+        """
+        if len(self.keys) != 1:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        if self.table._is_dictionary_column(key_info):
+            return None
+        key_dtype = getattr(key_info.spec, "dtype", None)
+        if key_dtype not in {np.dtype(np.float32), np.dtype(np.float64)}:
+            return None
+
+        # The kernel receives a single value array, so all aggregations must
+        # read the same input column (or none at all, e.g. only ``size``).
+        value_cols = {s.input_col for s in specs if s.input_col is not None}
+        if len(value_cols) > 1:
+            return None
+        value_col = next(iter(value_cols), None)
+        value_dtype = None
+        nullable_nan_value = False
+        if value_col is not None:
+            value_info = self.table._schema.columns_by_name[value_col]
+            value_dtype = getattr(value_info.spec, "dtype", None)
+            # Count can operate on any fixed-width value column via values_valid,
+            # but other reductions in this hash kernel normalize values to f64.
+            if any(s.op in {"sum", "mean", "min", "max"} for s in specs):
+                if value_dtype is None or np.dtype(value_dtype).kind != "f":
+                    return None
+                # Only "no sentinel" or a NaN sentinel is accepted here; any
+                # other null_value makes this path bail out.
+                null_value = getattr(value_info.spec, "null_value", None)
+                nullable_nan_value = isinstance(null_value, float) and math.isnan(null_value)
+                if null_value is not None and not nullable_nan_value:
+                    return None
+
+        # The extension is optional: a missing module or kernel means "not applicable".
+        try:
+            from blosc2 import groupby_ext
+        except ImportError:
+            return None
+        kernel = getattr(groupby_ext, "groupby_hash_f64_f64", None)
+        if kernel is None:
+            return None
+
+        # acc: normalized key -> {output column -> running _AggState}
+        # key_values: normalized key -> 1-tuple holding the key value to display
+        acc: dict[Any, dict[str, _AggState]] = {}
+        key_values: dict[Any, tuple[Any, ...]] = {}
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            # Chunks without any live row contribute nothing.
+            if not np.any(valid):
+                continue
+            # Keys are always handed to the kernel as float64, even for float32 columns.
+            keys = np.ascontiguousarray(np.asarray(self.table._cols[key_name][start:stop], dtype=np.float64))
+            if value_col is None:
+                # No value column: pass placeholder arrays and has_values=False.
+                values = np.empty(len(keys), dtype=np.float64)
+                values_valid = np.zeros(len(keys), dtype=bool)
+                has_values = False
+            else:
+                raw_values = np.asarray(self.table._cols[value_col][start:stop])
+                if any(s.op in {"sum", "mean", "min", "max"} for s in specs):
+                    values = np.ascontiguousarray(raw_values.astype(np.float64, copy=False))
+                else:
+                    # Count-only: the values are never read for a result, only
+                    # the validity mask below matters.
+                    values = np.empty(len(keys), dtype=np.float64)
+                values_valid = np.ascontiguousarray(~self._null_mask(value_col, raw_values, is_key=False))
+                has_values = True
+
+            # The kernel returns the chunk's keys plus parallel per-key arrays,
+            # all indexed by the same position ``i`` below.
+            (
+                chunk_keys,
+                row_counts,
+                value_counts,
+                sums,
+                mins,
+                maxs,
+                has_value,
+            ) = kernel(keys, values, np.ascontiguousarray(valid), values_valid, has_values, self.dropna)
+
+            for i, key in enumerate(chunk_keys):
+                # Convert back to the column's own dtype before taking the Python
+                # scalar, so float32 keys are reported at float32 precision.
+                key_scalar = np.asarray(key, dtype=key_dtype).item()
+                norm_key = _normalize_key_part(float(key_scalar))
+                states = acc.setdefault(norm_key, {})
+                key_values.setdefault(norm_key, (key_scalar,))
+                for spec in specs:
+                    state = states.setdefault(spec.output_col, _AggState(spec.op))
+                    if spec.op == "size":
+                        state.value = (0 if state.value is None else state.value) + int(row_counts[i])
+                    elif spec.op == "count":
+                        state.value = (0 if state.value is None else state.value) + int(value_counts[i])
+                    elif spec.op == "sum" or spec.op == "mean":
+                        # Both keep a running sum plus the number of contributing
+                        # values; mean divides when the rows are emitted.
+                        if has_value[i]:
+                            state.value = (0.0 if state.value is None else state.value) + float(sums[i])
+                            state.count += int(value_counts[i])
+                    elif spec.op == "min":
+                        if has_value[i]:
+                            value = float(mins[i])
+                            if state.count == 0 or value < state.value:
+                                state.value = value
+                            # For min/max, count only records that a value was seen.
+                            state.count += 1
+                    elif spec.op == "max" and has_value[i]:
+                        value = float(maxs[i])
+                        if state.count == 0 or value > state.value:
+                            state.value = value
+                        state.count += 1
+
+        # Hash-table iteration order is intentionally not exposed.  Emit float
+        # hash groups in key order for deterministic results and compatibility
+        # with the previous NumPy fallback behavior for these cases.
+        ordered_keys = list(acc)
+        # NaN keys get the sort key (1, ""), which places them after every
+        # regular key, whose sort key is (0, value).
+        ordered_keys.sort(
+            key=lambda k: tuple(
+                (1, "") if isinstance(v, float) and math.isnan(v) else (0, v) for v in key_values[k]
+            )
+        )
+        rows = []
+        for norm_key in ordered_keys:
+            row = dict(zip(self.keys, key_values[norm_key], strict=True))
+            states = acc[norm_key]
+            for spec in specs:
+                state = states[spec.output_col]
+                if spec.op == "mean":
+                    row[spec.output_col] = math.nan if state.count == 0 else state.value / state.count
+                elif spec.op in {"sum", "min", "max"} and state.count == 0:
+                    # No non-null value was seen for this group.
+                    row[spec.output_col] = _null_output_value(self._result_spec_for_agg(spec))
+                else:
+                    row[spec.output_col] = 0 if state.value is None else state.value
+            rows.append(row)
+        return self._build_result(rows, specs)
+
+    def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]):  # noqa: C901
+        """Cython fast path for integral float32/float64 keys and one non-null float64 sum.
+
+        Applies only to a single float32/float64 key with a single ``sum``
+        aggregation over a float64 column that has no ``null_value`` sentinel,
+        and only when ``sort`` is disabled.
+
+        Returns the table built by ``_build_result``, or ``None`` when the
+        request is unsupported, ``blosc2.groupby_ext`` or the dtype-specific
+        kernel is missing, the kernel reports ``-1``, or more than
+        ``compact_limit`` accumulator slots would be required.
+        """
+        if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort:
+            return None
+        spec = specs[0]
+        if spec.input_col is None:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        value_info = self.table._schema.columns_by_name[spec.input_col]
+        key_dtype = getattr(key_info.spec, "dtype", None)
+        value_dtype = getattr(value_info.spec, "dtype", None)
+        if key_dtype not in {np.dtype(np.float32), np.dtype(np.float64)} or value_dtype != np.dtype(
+            np.float64
+        ):
+            return None
+        if getattr(value_info.spec, "null_value", None) is not None:
+            return None
+        # The fast path can skip NaNs.  If dropna=False and NaNs are present,
+        # the Cython kernel reports unsupported and we fall back to generic
+        # grouping, which can materialize a NaN group.
+        # NOTE(review): a non-NaN null_value sentinel on the key column is not
+        # inspected here -- confirm the caller rules that case out.
+        skip_key_nan = self.dropna
+        # The extension is optional: a missing module or kernel means "not applicable".
+        try:
+            from blosc2 import groupby_ext
+        except ImportError:
+            return None
+        # One kernel per key dtype, so the key chunk is passed without conversion.
+        kernel_name = (
+            "groupby_dense_f32_integral_key_f64_sum_checked"
+            if key_dtype == np.dtype(np.float32)
+            else "groupby_dense_f64_integral_key_f64_sum_checked"
+        )
+        kernel = getattr(groupby_ext, kernel_name, None)
+        if kernel is None:
+            return None
+
+        # Accumulators are indexed by the integral key value.  They start empty
+        # and are grown on demand, never beyond compact_limit entries.
+        compact_limit = 10_000_000
+        sums = np.zeros(0, dtype=np.float64)
+        present = np.zeros(0, dtype=bool)
+
+        def ensure_size(size: int) -> bool:
+            # Grow both accumulators to ``size`` slots; False means the limit is exceeded.
+            nonlocal sums, present
+            if size > compact_limit:
+                return False
+            if size <= len(sums):
+                return True
+            old = len(sums)
+            sums = np.pad(sums, (0, size - old), constant_values=0)
+            present = np.pad(present, (0, size - old), constant_values=False)
+            return True
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            # Chunks without any live row contribute nothing.
+            if not np.any(valid):
+                continue
+            keys = np.asarray(self.table._cols[key_name][start:stop], dtype=key_dtype)
+            values = np.asarray(self.table._cols[spec.input_col][start:stop], dtype=np.float64)
+            # NOTE(review): the trailing False is presumably the kernel's
+            # skip-NaN-values switch -- confirm against groupby_ext.
+            status = int(kernel(keys, values, valid, sums, present, skip_key_nan, False))
+            # -1 makes this path give up (see the NaN note above for one such case).
+            if status == -1:
+                return None
+            # A positive status is used as the accumulator length the kernel
+            # needs: grow, then retry the same chunk once.
+            if status > 0:
+                if not ensure_size(status):
+                    return None
+                status = int(kernel(keys, values, valid, sums, present, skip_key_nan, False))
+                if status != 0:
+                    return None
+
+        # Slot index == key value, reported back as a Python float, in ascending order.
+        rows = [
+            {key_name: float(code), spec.output_col: float(sums[code])} for code in np.nonzero(present)[0]
+        ]
+        return self._build_result(rows, specs)
+
+    def _try_execute_dense_single_int_key(self, specs: list[_AggSpec]):  # noqa: C901
+        """Fast path for one dense integer/dictionary-code key.
+
+        This avoids per-chunk ``np.unique`` and Python dictionary merging.  It is
+        intentionally conservative: keys must be non-negative and the observed
+        key range must stay reasonably compact.
+
+        Every aggregation keeps NumPy accumulators indexed directly by key value
+        (or dictionary code); they are grown on demand up to ``compact_limit``
+        slots.
+
+        Parameters
+        ----------
+        specs : list[_AggSpec]
+            Aggregations to compute; ``size``, ``count``, ``sum``, ``mean``,
+            ``min`` and ``max`` are handled.
+
+        Returns
+        -------
+        The table produced by ``_build_result``, or ``None`` when this path does
+        not apply: several keys, a non-integer key dtype, a ``min``/``max`` value
+        dtype outside ``biufmM``, a negative key, or a key range that would need
+        more than ``compact_limit`` slots.
+
+        Notes
+        -----
+        * Keys are cast to ``np.intp`` once their range has been validated,
+          because ``np.bincount`` rejects ``uint64`` input under its safe-casting
+          rule.
+        * ``sum`` records whether a group saw any non-null value and emits the
+          null output value otherwise, matching ``min``/``max`` in this method
+          and the generic ``_final_rows`` path.
+        * With ``sort=True`` groups of a dictionary key are ordered by decoded
+          key rather than by code; plain integer keys are already ascending.
+        """
+        if len(self.keys) != 1:
+            return None
+        key_name = self.keys[0]
+        key_info = self.table._schema.columns_by_name[key_name]
+        key_is_dict = self.table._is_dictionary_column(key_info)
+        key_dtype = np.dtype(np.int32) if key_is_dict else getattr(key_info.spec, "dtype", None)
+        if key_dtype is None or key_dtype.kind not in "biu":
+            return None
+        # min/max use ufunc.at, which is only attempted for plain numeric and
+        # datetime-like value columns.
+        for spec in specs:
+            if spec.op in {"min", "max"} and spec.input_col is not None:
+                dtype = getattr(self.table._schema.columns_by_name[spec.input_col].spec, "dtype", None)
+                if dtype is None or np.dtype(dtype).kind not in "biufmM":
+                    return None
+
+        compact_limit = 10_000_000
+        present = np.zeros(0, dtype=bool)
+        # Per output column:
+        #   size/count   -> int64 counters
+        #   sum          -> (sums, has_value)
+        #   mean         -> (sums, counts)
+        #   min/max      -> (extremes, has_value)
+        states: dict[str, Any] = {}
+        for spec in specs:
+            if spec.op in {"size", "count"}:
+                states[spec.output_col] = np.zeros(0, dtype=np.int64)
+            elif spec.op == "sum":
+                out_dtype = np.int64
+                if spec.input_col is not None:
+                    dtype = np.dtype(self.table._schema.columns_by_name[spec.input_col].spec.dtype)
+                    out_dtype = np.float64 if dtype.kind == "f" else np.int64
+                states[spec.output_col] = (np.zeros(0, dtype=out_dtype), np.zeros(0, dtype=bool))
+            elif spec.op == "mean":
+                states[spec.output_col] = (np.zeros(0, dtype=np.float64), np.zeros(0, dtype=np.int64))
+            elif spec.op in {"min", "max"}:
+                assert spec.input_col is not None
+                dtype = np.dtype(self.table._schema.columns_by_name[spec.input_col].spec.dtype)
+                identity = _max_identity(dtype) if spec.op == "min" else _min_identity(dtype)
+                states[spec.output_col] = (np.full(0, identity, dtype=dtype), np.zeros(0, dtype=bool))
+
+        def ensure_size(size: int) -> bool:
+            # Grow every accumulator to ``size`` slots; False means the limit is exceeded.
+            nonlocal present
+            if size > compact_limit:
+                return False
+            if size <= len(present):
+                return True
+            grow = (0, size - len(present))
+            present = np.pad(present, grow, constant_values=False)
+            for spec in specs:
+                state = states[spec.output_col]
+                if spec.op in {"size", "count"}:
+                    states[spec.output_col] = np.pad(state, grow, constant_values=0)
+                elif spec.op == "sum":
+                    sums, has = state
+                    states[spec.output_col] = (
+                        np.pad(sums, grow, constant_values=0),
+                        np.pad(has, grow, constant_values=False),
+                    )
+                elif spec.op == "mean":
+                    sums, counts = state
+                    states[spec.output_col] = (
+                        np.pad(sums, grow, constant_values=0),
+                        np.pad(counts, grow, constant_values=0),
+                    )
+                elif spec.op in {"min", "max"}:
+                    values, has = state
+                    dtype = values.dtype
+                    identity = _max_identity(dtype) if spec.op == "min" else _min_identity(dtype)
+                    states[spec.output_col] = (
+                        np.pad(values, grow, constant_values=identity),
+                        np.pad(has, grow, constant_values=False),
+                    )
+            return True
+
+        phys_len = len(self.table._valid_rows)
+        chunk_size = self._chunk_size()
+        value_cols = sorted({s.input_col for s in specs if s.input_col is not None})
+        for start in range(0, phys_len, chunk_size):
+            stop = min(start + chunk_size, phys_len)
+            valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool)
+            if not np.any(valid):
+                continue
+            raw_keys = self._read_key_chunk(key_name, start, stop)
+            live_mask = valid.copy()
+            if self.dropna:
+                live_mask &= ~self._null_mask(key_name, raw_keys, is_key=True)
+            if not np.any(live_mask):
+                continue
+            keys = np.asarray(raw_keys[live_mask])
+            # Validate the range on the original dtype first, so huge unsigned
+            # keys are rejected before any cast could wrap them.
+            if int(np.min(keys)) < 0:
+                return None
+            if not ensure_size(int(np.max(keys)) + 1):
+                return None
+            # np.bincount only accepts input that casts safely to intp, which
+            # excludes uint64; the values are now known to fit.
+            keys = keys.astype(np.intp, copy=False)
+            present[keys] = True
+            value_chunks = {
+                name: np.asarray(self.table._cols[name][start:stop])[live_mask] for name in value_cols
+            }
+
+            for spec in specs:
+                if spec.op == "size":
+                    states[spec.output_col] += np.bincount(keys, minlength=len(present)).astype(np.int64)
+                    continue
+                assert spec.input_col is not None
+                values = value_chunks[spec.input_col]
+                non_null = ~self._null_mask(spec.input_col, values, is_key=False)
+                if spec.op == "count":
+                    states[spec.output_col] += np.bincount(
+                        keys, weights=non_null.astype(np.int64), minlength=len(present)
+                    ).astype(np.int64)
+                elif spec.op == "sum":
+                    sums, has_sum = states[spec.output_col]
+                    if values.dtype.kind in "biu":
+                        # Integer sums stay exact in int64 instead of going
+                        # through bincount's float64 weights.
+                        np.add.at(sums, keys[non_null], values[non_null].astype(np.int64, copy=False))
+                    else:
+                        sums += np.bincount(
+                            keys, weights=np.where(non_null, values, 0), minlength=len(present)
+                        ).astype(sums.dtype, copy=False)
+                    has_sum[keys[non_null]] = True
+                elif spec.op == "mean":
+                    sums, counts = states[spec.output_col]
+                    sums += np.bincount(keys, weights=np.where(non_null, values, 0), minlength=len(present))
+                    counts += np.bincount(
+                        keys, weights=non_null.astype(np.int64), minlength=len(present)
+                    ).astype(np.int64)
+                elif spec.op in {"min", "max"}:
+                    values_state, has_state = states[spec.output_col]
+                    if spec.op == "min":
+                        np.minimum.at(values_state, keys[non_null], values[non_null])
+                    else:
+                        np.maximum.at(values_state, keys[non_null], values[non_null])
+                    has_state[keys[non_null]] = True
+
+        group_codes = np.nonzero(present)[0]
+        if self.sort and key_is_dict:
+            # Dictionary codes carry no ordering; sort by the decoded key instead.
+            group_codes = np.array(
+                sorted(
+                    group_codes,
+                    key=lambda code: _sortable_key_part(self.table._cols[key_name].decode(int(code))),
+                ),
+                dtype=group_codes.dtype,
+            )
+
+        rows = []
+        for code in group_codes:
+            key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else _python_scalar(code)
+            row = {key_name: key_value}
+            for spec in specs:
+                state = states[spec.output_col]
+                if spec.op == "mean":
+                    sums, counts = state
+                    row[spec.output_col] = (
+                        math.nan if counts[code] == 0 else float(sums[code]) / int(counts[code])
+                    )
+                elif spec.op in {"sum", "min", "max"}:
+                    values_state, has_state = state
+                    row[spec.output_col] = (
+                        _python_scalar(values_state[code])
+                        if has_state[code]
+                        else _null_output_value(self._result_spec_for_agg(spec))
+                    )
+                else:
+                    row[spec.output_col] = _python_scalar(state[code])
+            rows.append(row)
+        return self._build_result(rows, specs)
+
+    def _chunk_size(self) -> int:
+        """Return the number of physical rows to process per iteration.
+
+        An explicit ``self.chunk_size`` wins and must be positive.  Otherwise
+        the first entry of the ``chunks`` attribute of ``table._valid_rows`` is
+        used (clamped to at least 1), and 65536 when no such layout is exposed.
+
+        Raises
+        ------
+        ValueError
+            If ``self.chunk_size`` is set but not positive.
+        """
+        requested = self.chunk_size
+        if requested is None:
+            layout = getattr(self.table._valid_rows, "chunks", None)
+            if not layout:
+                return 65536
+            return max(int(layout[0]), 1)
+        if requested <= 0:
+            raise ValueError("chunk_size must be positive")
+        return int(requested)
+
+    def _read_key_chunk(self, name: str, start: int, stop: int) -> np.ndarray:
+        """Read rows ``start:stop`` of key column ``name`` as a NumPy array.
+
+        Dictionary columns yield their int32 codes; any other column yields its
+        stored values unchanged.
+        """
+        table = self.table
+        is_dictionary = table._is_dictionary_column(table._schema.columns_by_name[name])
+        column = table._cols[name]
+        if not is_dictionary:
+            return np.asarray(column[start:stop])
+        return np.asarray(column.codes[start:stop], dtype=np.int32)
+
+    def _factorize_keys(
+        self, keys_live: list[np.ndarray]
+    ) -> tuple[np.ndarray | list[np.ndarray], np.ndarray]:
+        """Factorize one or more aligned key arrays into unique keys and row codes.
+
+        A single key array is passed to ``np.unique`` directly.  Several key
+        arrays are first packed into one structured array with fields ``k0``,
+        ``k1``, ... so that each row is treated as one composite key.
+
+        Returns ``(unique, inverse)`` as produced by
+        ``np.unique(..., return_inverse=True)``.
+        """
+        if len(keys_live) != 1:
+            record_dtype = [(f"k{pos}", column.dtype) for pos, column in enumerate(keys_live)]
+            target = np.empty(len(keys_live[0]), dtype=record_dtype)
+            for pos, column in enumerate(keys_live):
+                target[f"k{pos}"] = column
+        else:
+            target = keys_live[0]
+        unique, inverse = np.unique(target, return_inverse=True)
+        return unique, inverse
+
+    def _display_keys(self, unique_keys: np.ndarray | list[np.ndarray]) -> list[tuple[Any, ...]]:
+        """Convert factorized keys into tuples of user-facing key values.
+
+        Dictionary-column codes are decoded through the column's ``decode``;
+        every other key part is converted with ``_python_scalar``.
+
+        Parameters
+        ----------
+        unique_keys
+            For a single key, an array-like of key values.  For several keys, a
+            structured array with fields ``k0``, ``k1``, ... in ``self.keys``
+            order.
+
+        Returns
+        -------
+        list[tuple]
+            One tuple per unique key, holding one entry per key column.
+        """
+        table = self.table
+
+        def converter_for(name: str):
+            # Resolve the column kind once per key column instead of once per
+            # value: the schema does not change while the keys are converted.
+            col_info = table._schema.columns_by_name[name]
+            if table._is_dictionary_column(col_info):
+                column = table._cols[name]
+                return lambda raw: column.decode(int(raw))
+            return _python_scalar
+
+        if len(self.keys) == 1:
+            convert = converter_for(self.keys[0])
+            return [(convert(value),) for value in np.asarray(unique_keys)]
+
+        assert isinstance(unique_keys, np.ndarray)
+        converters = [(f"k{i}", converter_for(name)) for i, name in enumerate(self.keys)]
+        return [tuple(convert(row[field]) for field, convert in converters) for row in unique_keys]
+
+    def _normalized_keys(self, display_keys: list[tuple[Any, ...]]) -> list[Any]:
+        """Map display-key tuples to the keys used in the accumulator dictionaries.
+
+        Each part goes through ``_normalize_key_part``.  A one-part key collapses
+        to that bare normalized part; any other key stays a tuple.
+        """
+
+        def collapse(key: tuple[Any, ...]) -> Any:
+            parts = tuple(_normalize_key_part(part) for part in key)
+            if len(parts) == 1:
+                return parts[0]
+            return parts
+
+        return [collapse(key) for key in display_keys]
+
+    def _compute_partials(
+        self,
+        specs: list[_AggSpec],
+        unique_keys: np.ndarray | list[np.ndarray],
+        inverse: np.ndarray,
+        value_chunks: dict[str, np.ndarray],
+    ) -> dict[str, Any]:
+        """Compute per-group partial aggregates for one chunk.
+
+        ``inverse`` assigns every live row to a position in ``unique_keys``;
+        ``value_chunks`` maps each input column name to that chunk's values,
+        aligned with ``inverse``.
+
+        Returns a dict keyed by output column:
+
+        * ``size`` / ``count`` -> int64 array of per-group counts
+        * ``sum`` / ``mean`` -> ``(sums, non_null_counts)``
+        * ``min`` / ``max`` -> whatever ``_minmax_partials`` returns
+        """
+        n_groups = len(unique_keys)
+
+        def tally(mask: np.ndarray) -> np.ndarray:
+            # Number of rows per group for which ``mask`` is true.
+            weighted = np.bincount(inverse, weights=mask.astype(np.int64), minlength=n_groups)
+            return weighted.astype(np.int64)
+
+        partials: dict[str, Any] = {}
+        for spec in specs:
+            op = spec.op
+            target = spec.output_col
+            if op == "size":
+                partials[target] = np.bincount(inverse, minlength=n_groups).astype(np.int64)
+                continue
+
+            assert spec.input_col is not None
+            values = value_chunks[spec.input_col]
+            non_null = ~self._null_mask(spec.input_col, values, is_key=False)
+
+            if op == "count":
+                partials[target] = tally(non_null)
+            elif op in {"sum", "mean"}:
+                counts = tally(non_null)
+                if op == "sum" and values.dtype.kind in "biu":
+                    # Integer sums are accumulated in int64 rather than through
+                    # bincount's floating-point weights.
+                    sums = np.zeros(n_groups, dtype=np.int64)
+                    np.add.at(sums, inverse[non_null], values[non_null].astype(np.int64, copy=False))
+                else:
+                    sums = np.bincount(inverse, weights=np.where(non_null, values, 0), minlength=n_groups)
+                partials[target] = (sums, counts)
+            elif op in {"min", "max"}:
+                partials[target] = self._minmax_partials(op, inverse, values, non_null, n_groups)
+        return partials
+
+    def _minmax_partials(
+        self, op: AggName, inverse: np.ndarray, values: np.ndarray, non_null: np.ndarray, n_groups: int
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """Compute per-group minima or maxima for one chunk.
+
+        Returns ``(extremes, has_value)``, where ``has_value[g]`` tells whether
+        group ``g`` received at least one non-null value; ``extremes[g]`` is
+        meaningful only in that case.
+
+        Value dtypes of kind ``biufcmM`` are reduced with
+        ``np.minimum.at`` / ``np.maximum.at``, starting from the identity given
+        by ``_max_identity`` / ``_min_identity``.  Any other dtype is reduced
+        with a plain Python loop using ``<`` / ``>``.
+        """
+        if values.dtype.kind not in "biufcmM":
+            best = np.empty(n_groups, dtype=values.dtype)
+            seen = np.zeros(n_groups, dtype=bool)
+            for group, value, ok in zip(inverse, values, non_null, strict=True):
+                if not ok:
+                    continue
+                if seen[group]:
+                    improves = value < best[group] if op == "min" else value > best[group]
+                    if not improves:
+                        continue
+                best[group] = value
+                seen[group] = True
+            return best, seen
+
+        if op == "min":
+            reducer, seed = np.minimum, _max_identity(values.dtype)
+        else:
+            reducer, seed = np.maximum, _min_identity(values.dtype)
+        out = np.full(n_groups, seed, dtype=values.dtype)
+        reducer.at(out, inverse[non_null], values[non_null])
+        has_value = np.bincount(inverse, weights=non_null.astype(np.int64), minlength=n_groups) > 0
+        return out, has_value
+
+    def _merge_partials(
+        self,
+        acc: dict[Any, dict[str, _AggState]],
+        key_values: dict[Any, tuple[Any, ...]],
+        normalized_keys: list[Any],
+        display_keys: list[tuple[Any, ...]],
+        partials: dict[str, Any],
+        specs: list[_AggSpec],
+    ) -> None:
+        """Fold one chunk's partial aggregates into the running accumulators.
+
+        ``acc`` maps a normalized key to its per-output ``_AggState`` objects and
+        ``key_values`` remembers the first display tuple seen for each key; both
+        are updated in place.  Position ``i`` of ``normalized_keys``,
+        ``display_keys`` and every partial array refers to the same group.
+        """
+        for idx in range(len(normalized_keys)):
+            norm_key = normalized_keys[idx]
+            states = acc.setdefault(norm_key, {})
+            key_values.setdefault(norm_key, display_keys[idx])
+            for spec in specs:
+                op = spec.op
+                state = states.setdefault(spec.output_col, _AggState(op))
+                partial = partials[spec.output_col]
+
+                if op in {"size", "count"}:
+                    running = 0 if state.value is None else state.value
+                    state.value = running + int(partial[idx])
+                    continue
+
+                if op in {"sum", "mean"}:
+                    sums, counts = partial
+                    # Groups without a non-null value leave the state untouched,
+                    # so state.count == 0 later means "no value seen".
+                    if counts[idx] > 0:
+                        if op == "sum":
+                            state.value = (0 if state.value is None else state.value) + _python_scalar(
+                                sums[idx]
+                            )
+                        else:
+                            state.value = (0.0 if state.value is None else state.value) + float(sums[idx])
+                        state.count += int(counts[idx])
+                    continue
+
+                if op in {"min", "max"}:
+                    values, has_value = partial
+                    if not has_value[idx]:
+                        continue
+                    candidate = _python_scalar(values[idx])
+                    if state.count == 0:
+                        replace = True
+                    elif op == "min":
+                        replace = candidate < state.value
+                    else:
+                        replace = candidate > state.value
+                    if replace:
+                        state.value = candidate
+                    # For min/max, count only records that a value was seen.
+                    state.count += 1
+
+    def _final_rows(
+        self,
+        acc: dict[Any, dict[str, _AggState]],
+        key_values: dict[Any, tuple[Any, ...]],
+        specs: list[_AggSpec],
+    ) -> list[dict[str, Any]]:
+        """Turn the accumulated states into one result row per group.
+
+        Groups are emitted in ``acc`` insertion order, or ordered by their
+        sortable display key when ``self.sort`` is set.  Each row holds the key
+        columns followed by one entry per aggregation:
+
+        * ``mean`` -> ``value / count``, or NaN when no value was seen
+        * ``sum`` / ``min`` / ``max`` with no value seen -> the null output value
+        * otherwise -> the accumulated value, with ``None`` reported as ``0``
+        """
+        if self.sort:
+            ordered = sorted(acc, key=lambda k: tuple(_sortable_key_part(part) for part in key_values[k]))
+        else:
+            ordered = list(acc)
+
+        def finalize(spec: _AggSpec, state: _AggState) -> Any:
+            if spec.op == "mean":
+                return math.nan if state.count == 0 else state.value / state.count
+            if state.count == 0 and spec.op in {"sum", "min", "max"}:
+                return _null_output_value(self._result_spec_for_agg(spec))
+            return 0 if state.value is None else state.value
+
+        rows = []
+        for norm_key in ordered:
+            row = dict(zip(self.keys, key_values[norm_key], strict=True))
+            per_output = acc[norm_key]
+            for spec in specs:
+                row[spec.output_col] = finalize(spec, per_output[spec.output_col])
+            rows.append(row)
+        return rows
+
+    def _build_result(self, rows: list[dict[str, Any]], specs: list[_AggSpec]):
+        from blosc2.ctable import CTable
+
+        columns = self.keys + [spec.output_col for spec in specs]
+        schema_specs = {name: self._result_spec_for_key(name) for name in self.keys}
+        for spec in specs:
+            schema_specs[spec.output_col] = self._result_spec_for_agg(spec)
+
+        fields = []
+        for name in columns:
+            fields.append((name, _python_type_for_spec(schema_specs[name]), b2_field(schema_specs[name])))
+        row_type = dataclasses.make_dataclass("CTableGroupByRow", fields)
+        data = {name: [row[name] for row in rows] for name in columns}
+        urlpath = getattr(self, "_result_urlpath", None)
+        kwargs = {"urlpath": str(urlpath), "mode": "w"} if urlpath is not None else {}
+        return CTable(row_type, new_data=data, expected_size=max(len(rows), 1), validate=False, **kwargs)
+
+    def _validate_output_names(self, specs: list[_AggSpec]) -> None:
+        names = self.keys + [s.output_col for s in specs]
+        bad = [name for name in names if not _IDENTIFIER_RE.match(name)]
+        if bad:
+            raise NotImplementedError(
+                "Phase-1 group_by() result columns must be valid Python identifiers; "
+                f"unsupported names: {bad!r}"
+            )
+        if len(names) != len(set(names)):
+            raise ValueError("Group-by result column names would not be unique")
+
+    def _result_spec_for_key(self, name: str) -> SchemaSpec:
+        return copy.deepcopy(self.table._schema.columns_by_name[name].spec)  # deep copy of the key's spec
+
+    def _result_spec_for_agg(self, spec: _AggSpec) -> SchemaSpec:
+        if spec.op in {"size", "count"}:
+            return int64()
+        if spec.op == "mean":
+            return float64()
+        assert spec.input_col is not None
+        input_spec = self.table._schema.columns_by_name[spec.input_col].spec
+        dtype = getattr(input_spec, "dtype", None)
+        if spec.op == "sum":
+            if dtype is not None and dtype.kind in "iu":
+                return int64()
+            if dtype is not None and dtype.kind == "b":
+                return int64()
+            if dtype is not None and dtype.kind == "f":
+                return float64()
+        return copy.deepcopy(input_spec)
+
+    def _null_mask(self, name: str, values: np.ndarray, *, is_key: bool) -> np.ndarray:
+        col_info = self.table._schema.columns_by_name[name]
+        spec = col_info.spec
+        if isinstance(spec, DictionarySpec):
+            mask = values == np.int32(spec.null_code)
+            return mask if is_key or getattr(spec, "nullable", False) else np.zeros(len(values), dtype=bool)
+        null_value = getattr(spec, "null_value", None)
+        mask = np.zeros(len(values), dtype=bool)
+        # For keys, treat all NaNs as missing so dropna behaves predictably.
+        # For values, only nullable NaN sentinels are skipped.
+        if values.dtype.kind == "f" and (
+            is_key or (isinstance(null_value, float) and math.isnan(null_value))
+        ):
+            mask |= np.isnan(values)
+        if null_value is not None and not (isinstance(null_value, float) and math.isnan(null_value)):
+            mask |= values == null_value
+        return mask
+
+
+def _normalize_key_part(value: Any) -> Any:
+    if isinstance(value, float) and math.isnan(value):
+        return _NAN_KEY
+    return value
+
+
+def _sortable_key_part(value: Any) -> tuple[int, Any]:
+    if value is None:
+        return (0, "")
+    if isinstance(value, float) and math.isnan(value):
+        return (0, "")
+    return (1, value)
+
+
+def _python_scalar(value: Any) -> Any:
+    if isinstance(value, np.generic):
+        return value.item()
+    return value
+
+
+def _python_type_for_spec(spec: SchemaSpec):
+    if isinstance(spec, DictionarySpec):
+        return str
+    if isinstance(spec, b2_bool):
+        return bool
+    dtype = getattr(spec, "dtype", None)
+    if dtype is not None:
+        if dtype.kind in "iu":
+            return int
+        if dtype.kind == "f":
+            return float
+        if dtype.kind == "b":
+            return bool
+        if dtype.kind in "US":
+            return str if dtype.kind == "U" else bytes
+    return getattr(spec, "python_type", object)
+
+
+def _max_identity(dtype: np.dtype):
+    dtype = np.dtype(dtype)
+    if dtype.kind in "iu":
+        return np.iinfo(dtype).max
+    if dtype.kind == "f":
+        return np.inf
+    if dtype.kind in "mM":
+        return np.iinfo(np.int64).max
+    return None
+
+
+def _min_identity(dtype: np.dtype):
+    dtype = np.dtype(dtype)
+    if dtype.kind in "iu":
+        return np.iinfo(dtype).min
+    if dtype.kind == "f":
+        return -np.inf
+    if dtype.kind in "mM":
+        return np.iinfo(np.int64).min
+    return None
+
+
+def _null_output_value(spec: SchemaSpec):
+    dtype = getattr(spec, "dtype", None)
+    null_value = getattr(spec, "null_value", None)
+    if null_value is not None:
+        return null_value
+    if dtype is not None and dtype.kind == "f":
+        return math.nan
+    if dtype is not None and dtype.kind in "iu":
+        return 0
+    if dtype is not None and dtype.kind == "b":
+        return False
+    if dtype is not None and dtype.kind == "U":
+        return ""
+    if dtype is not None and dtype.kind == "S":
+        return b""
+    return None
+
+
+# ----------------------------------------------------------------------
+# Public array-oriented grouped reductions
+# ----------------------------------------------------------------------
+
+
+def group_reduce(keys, values=None, op: AggName = "size", *, sort: bool = False, dropna: bool = True):
+    """Group *keys* and reduce *values* with *op*.
+
+    This is a lower-level, NumPy-style grouped reduction primitive.  It exposes
+    Blosc2's optimized group-reduce kernels for plain array-like inputs without
+    requiring a :class:`blosc2.CTable`.
+
+    Parameters
+    ----------
+    keys : array-like
+        One-dimensional grouping keys.
+    values : array-like, optional
+        One-dimensional values to reduce.  Required for ``"count"``, ``"sum"``,
+        ``"mean"``, ``"min"`` and ``"max"``.  Ignored for ``"size"``.
+    op : {"size", "count", "sum", "mean", "min", "max"}, default: "size"
+        Reduction operation.  ``"size"`` counts rows per group, while
+        ``"count"`` counts non-NaN values per group.
+    sort : bool, default: False
+        If true, sort output groups by key.  With ``sort=False`` output order is
+        implementation dependent.
+    dropna : bool, default: True
+        If true, skip NaN float keys.  If false, all NaN keys form one group.
+
+    Returns
+    -------
+    groups, result : numpy.ndarray, numpy.ndarray
+        Group keys and reduced values.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import blosc2
+    >>> keys = np.array([1, 2, 1, 2, 1])
+    >>> values = np.array([10., 20., 30., 40., 50.])
+    >>> groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True)
+    >>> groups
+    array([1, 2])
+    >>> sums
+    array([90., 60.])
+    """
+    if op not in {"size", "count", "sum", "mean", "min", "max"}:
+        raise ValueError(f"unsupported group_reduce operation {op!r}")
+
+    keys_arr = np.asarray(keys)
+    if keys_arr.ndim != 1:
+        raise ValueError("keys must be a 1-D array")
+
+    if op == "size":
+        values_arr = None
+    else:
+        if values is None:
+            raise ValueError(f"values are required for group_reduce op {op!r}")
+        values_arr = np.asarray(values)
+        if values_arr.ndim != 1:
+            raise ValueError("values must be a 1-D array")
+        if len(values_arr) != len(keys_arr):
+            raise ValueError("keys and values must have the same length")
+
+    if len(keys_arr) == 0:
+        return keys_arr.copy(), np.empty(0, dtype=_result_dtype(values_arr, op))
+
+    fast = _try_dense_integer(keys_arr, values_arr, op, sort=sort)
+    if fast is not None:
+        return fast
+
+    fast = _try_float_hash(keys_arr, values_arr, op, sort=sort, dropna=dropna)
+    if fast is not None:
+        return fast
+
+    return _group_reduce_numpy(keys_arr, values_arr, op, sort=sort, dropna=dropna)
+
+
+def _try_dense_integer(keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool):  # noqa: C901
+    key_dtype = np.dtype(keys.dtype)
+    if key_dtype.kind == "b":
+        keys = keys.astype(np.int8, copy=False)
+    elif key_dtype.kind not in "iu":
+        return None
+    keys = np.ascontiguousarray(keys)
+    if len(keys) == 0:
+        return None
+    if np.min(keys) < 0:
+        return None
+    max_key = int(np.max(keys))
+    if max_key + 1 > 10_000_000:
+        return None
+
+    try:
+        from blosc2 import groupby_ext
+    except ImportError:
+        return None
+
+    valid = np.ones(len(keys), dtype=bool)
+    keys_present = np.zeros(max_key + 1, dtype=bool)
+
+    if op == "size":
+        counts = np.zeros(max_key + 1, dtype=np.int64)
+        groupby_ext.groupby_dense_int_size_checked(keys, valid, counts, keys_present, False, 0)
+        groups = np.nonzero(keys_present)[0].astype(key_dtype if key_dtype.kind != "b" else np.bool_)
+        result = counts[np.nonzero(keys_present)[0]]
+        return _maybe_sort(groups, result, sort)
+
+    assert values is not None
+    value_dtype = np.dtype(values.dtype)
+    if op == "count":
+        counts = np.zeros(max_key + 1, dtype=np.int64)
+        values_valid = _values_valid(values)
+        groupby_ext.groupby_dense_int_count_checked(
+            keys, valid, np.ascontiguousarray(values_valid), counts, keys_present, False, 0
+        )
+        codes = np.nonzero(keys_present)[0]
+        return _maybe_sort(
+            codes.astype(key_dtype if key_dtype.kind != "b" else np.bool_), counts[codes], sort
+        )
+
+    if op == "mean" or value_dtype.kind == "f":
+        vals = np.ascontiguousarray(values.astype(np.float64, copy=False))
+        skip_nan = value_dtype.kind == "f"
+        if op == "sum":
+            sums = np.zeros(max_key + 1, dtype=np.float64)
+            present = np.zeros(max_key + 1, dtype=bool)
+            groupby_ext.groupby_dense_int_f64_sum_checked(
+                keys, vals, valid, sums, present, keys_present, False, 0, skip_nan
+            )
+            codes = np.nonzero(keys_present)[0]
+            result = sums[codes]
+            result[~present[codes]] = np.nan
+        elif op == "mean":
+            sums = np.zeros(max_key + 1, dtype=np.float64)
+            counts = np.zeros(max_key + 1, dtype=np.int64)
+            groupby_ext.groupby_dense_int_f64_mean_checked(
+                keys, vals, valid, sums, counts, keys_present, False, 0, skip_nan
+            )
+            codes = np.nonzero(keys_present)[0]
+            result = np.full(len(codes), np.nan, dtype=np.float64)
+            ok = counts[codes] > 0
+            result[ok] = sums[codes][ok] / counts[codes][ok]
+        elif op in {"min", "max"}:
+            state = np.zeros(max_key + 1, dtype=np.float64)
+            has_value = np.zeros(max_key + 1, dtype=bool)
+            kernel = getattr(groupby_ext, f"groupby_dense_int_f64_{op}_checked")
+            kernel(keys, vals, valid, state, has_value, keys_present, False, 0, skip_nan)
+            codes = np.nonzero(keys_present)[0]
+            result = state[codes]
+            result[~has_value[codes]] = np.nan
+        else:  # pragma: no cover
+            return None
+        return _maybe_sort(codes.astype(key_dtype if key_dtype.kind != "b" else np.bool_), result, sort)
+
+    if value_dtype.kind not in "biu":
+        return None
+    vals_i64 = np.ascontiguousarray(values.astype(np.int64, copy=False))
+    state = np.zeros(max_key + 1, dtype=np.int64)
+    present = np.zeros(max_key + 1, dtype=bool)
+    kernel = getattr(groupby_ext, f"groupby_dense_int_i64_{op}_checked", None)
+    if kernel is None:
+        return None
+    kernel(keys, vals_i64, valid, state, present, keys_present, False, 0)
+    codes = np.nonzero(keys_present)[0]
+    return _maybe_sort(codes.astype(key_dtype if key_dtype.kind != "b" else np.bool_), state[codes], sort)
+
+
+def _try_float_hash(keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool, dropna: bool):
+    key_dtype = np.dtype(keys.dtype)
+    if key_dtype.kind != "f":
+        return None
+    if values is not None and np.dtype(values.dtype).kind != "f" and op != "count":
+        return None
+    try:
+        from blosc2 import groupby_ext
+    except ImportError:
+        return None
+
+    keys_f64 = np.ascontiguousarray(keys.astype(np.float64, copy=False))
+    valid = np.ones(len(keys_f64), dtype=bool)
+    if values is None:
+        values_f64 = np.empty(len(keys_f64), dtype=np.float64)
+        values_valid = np.zeros(len(keys_f64), dtype=bool)
+        has_values = False
+    else:
+        values_f64 = np.ascontiguousarray(np.asarray(values, dtype=np.float64))
+        values_valid = np.ascontiguousarray(_values_valid(values))
+        has_values = True
+
+    groups, row_counts, value_counts, sums, mins, maxs, has_value = groupby_ext.groupby_hash_f64_f64(
+        keys_f64, values_f64, valid, values_valid, has_values, dropna
+    )
+    groups = groups.astype(key_dtype, copy=False)
+    if op == "size":
+        result = row_counts
+    elif op == "count":
+        result = value_counts
+    elif op == "sum":
+        result = sums.copy()
+        result[~has_value] = np.nan
+    elif op == "mean":
+        result = np.full(len(groups), np.nan, dtype=np.float64)
+        ok = value_counts > 0
+        result[ok] = sums[ok] / value_counts[ok]
+    elif op == "min":
+        result = mins.copy()
+        result[~has_value] = np.nan
+    elif op == "max":
+        result = maxs.copy()
+        result[~has_value] = np.nan
+    else:  # pragma: no cover
+        return None
+    return _maybe_sort(groups, result, sort)
+
+
+def _group_reduce_numpy(  # noqa: C901
+    keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool, dropna: bool
+):
+    acc: dict[object, list] = {}
+    display: dict[object, object] = {}
+    for i, key in enumerate(keys):
+        key_item = _python_scalar(key)
+        if isinstance(key_item, float) and math.isnan(key_item):
+            if dropna:
+                continue
+            norm_key = _NAN_KEY
+        else:
+            norm_key = key_item
+        display.setdefault(norm_key, key_item)
+        state = acc.setdefault(norm_key, [0, 0, 0.0, None, None])
+        state[0] += 1
+        if values is None:
+            continue
+        value = _python_scalar(values[i])
+        if isinstance(value, float) and math.isnan(value):
+            continue
+        state[1] += 1
+        if op in {"sum", "mean"}:
+            state[2] += value
+        elif op == "min" and (state[3] is None or value < state[3]):
+            state[3] = value
+        elif op == "max" and (state[4] is None or value > state[4]):
+            state[4] = value
+
+    order = list(acc)
+    if sort:
+        order.sort(key=lambda k: (1, "") if k is _NAN_KEY else (0, display[k]))
+    groups = np.asarray([display[k] for k in order], dtype=keys.dtype)
+    result = []
+    for k in order:
+        rows, count, total, min_value, max_value = acc[k]
+        if op == "size":
+            result.append(rows)
+        elif op == "count":
+            result.append(count)
+        elif op == "sum":
+            result.append(total if count else _null_value_for(values))
+        elif op == "mean":
+            result.append(math.nan if count == 0 else total / count)
+        elif op == "min":
+            result.append(min_value if count else _null_value_for(values))
+        elif op == "max":
+            result.append(max_value if count else _null_value_for(values))
+    return groups, np.asarray(result, dtype=_result_dtype(values, op))
+
+
+def _maybe_sort(groups: np.ndarray, result: np.ndarray, sort: bool):
+    if sort and len(groups):
+        order = np.argsort(groups, kind="stable")
+        return groups[order], result[order]
+    return groups, result
+
+
+def _values_valid(values: np.ndarray) -> np.ndarray:
+    values = np.asarray(values)
+    if values.dtype.kind == "f":
+        return ~np.isnan(values)
+    return np.ones(len(values), dtype=bool)
+
+
+def _result_dtype(values: np.ndarray | None, op: str):
+    if op in {"size", "count"}:
+        return np.int64
+    if op == "mean" or values is None:
+        return np.float64
+    dtype = np.dtype(values.dtype)
+    if op == "sum" and dtype.kind in "biu":
+        return np.int64
+    return dtype
+
+
+def _null_value_for(values: np.ndarray | None):
+    if values is not None and np.dtype(values.dtype).kind in "iu":
+        return 0
+    return math.nan
diff --git a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx
new file mode 100644
index 00000000..ae5ffd8a
--- /dev/null
+++ b/src/blosc2/groupby_ext.pyx
@@ -0,0 +1,1124 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+# cython: boundscheck=False, wraparound=False, initializedcheck=False
+
+"""Cython group-reduce kernels for CTable group_by()."""
+
+import numpy as np
+cimport numpy as np
+
+from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t
+from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy
+
+
+# ----------------------------------------------------------------------
+# Group-reduce kernels
+# ----------------------------------------------------------------------
+
+def groupby_dense_i32_f64_sum(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_null=False,
+    int32_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Add ``values`` into ``sums``, using dense int32 keys as slot indices.
+
+    Low-level CTable group-by helper.  *keys*, *values* and *valid* are the
+    equal-length 1-D arrays of one chunk; *sums* and *present* are the dense
+    per-group state, addressed directly by key value.  Rows whose key lies
+    outside ``[0, len(sums))`` are skipped without raising.
+    """
+    if not (keys.ndim == values.ndim == valid.ndim == 1):
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if not (keys.shape[0] == values.shape[0] == valid.shape[0]):
+        raise ValueError("keys, values and valid must have the same length")
+    if not (sums.ndim == present.ndim == 1):
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype("int32"):
+        raise TypeError("keys must have dtype int32")
+    if values.dtype != np.dtype("float64"):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype("bool"):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype("float64"):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype("bool"):
+        raise TypeError("present must have dtype bool")
+
+    cdef int32_t[:] keys_mv = keys
+    cdef double[:] values_mv = values
+    cdef np.npy_bool[:] valid_mv = valid
+    cdef double[:] sums_mv = sums
+    cdef np.npy_bool[:] present_mv = present
+    cdef Py_ssize_t nrows = keys.shape[0]
+    cdef Py_ssize_t nslots = sums.shape[0]
+    cdef Py_ssize_t row
+    cdef int32_t k
+    cdef double v
+
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    with nogil:
+        for row in range(nrows):
+            if not valid_mv[row]:
+                continue
+            k = keys_mv[row]
+            # Skip the null key (when requested) and any key that has no slot
+            # in the state arrays.
+            if (skip_key_null and k == key_null) or k < 0 or k >= nslots:
+                continue
+            v = values_mv[row]
+            # v != v holds only for NaN.
+            if not (skip_value_nan and v != v):
+                sums_mv[k] += v
+                present_mv[k] = 1
+    return None
+
+
+def groupby_dense_i32_f64_sum_checked(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_null=False,
+    int32_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Checked dense int32/float64 sum kernel.
+
+    Returns ``0`` on success, ``-1`` if a negative non-null key is found (or if
+    ``max_key + 1`` would not fit in a C ``int``), or ``max_key + 1`` when the
+    dense state arrays need to be grown.  State is only mutated when ``0`` is returned.
+    """
+    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys, values and valid must have the same length")
+    if sums.ndim != 1 or present.ndim != 1:
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype(np.int32):
+        raise TypeError("keys must have dtype int32")
+    if values.dtype != np.dtype(np.float64):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype(np.bool_):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype(np.float64):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype(np.bool_):
+        raise TypeError("present must have dtype bool")
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    cdef int32_t[:] keys_view = keys
+    cdef double[:] values_view = values
+    cdef np.npy_bool[:] valid_view = valid
+    cdef double[:] sums_view = sums
+    cdef np.npy_bool[:] present_view = present
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t nstates = sums.shape[0]
+    cdef Py_ssize_t i
+    cdef int32_t key
+    cdef int32_t max_key = -1
+    cdef int ret = 0
+    cdef double value
+
+    with nogil:
+        for i in range(n):  # pass 1: validate the keys and find the largest one
+            if not valid_view[i]:
+                continue
+            key = keys_view[i]
+            if skip_key_null and key == key_null:
+                continue
+            if key < 0:
+                ret = -1
+                break
+            if key > max_key:
+                max_key = key
+        if ret == 0 and max_key >= 0:
+            if max_key >= nstates and max_key == 2147483647:
+                ret = -1  # max_key + 1 does not fit in the C int return value
+            elif max_key >= nstates:
+                ret = <int>max_key + 1
+            else:
+                for i in range(n):  # pass 2: every key is a valid slot, accumulate
+                    if not valid_view[i]:
+                        continue
+                    key = keys_view[i]
+                    if skip_key_null and key == key_null:
+                        continue
+                    value = values_view[i]
+                    if skip_value_nan and value != value:  # value != value only for NaN
+                        continue
+                    sums_view[key] += value
+                    present_view[key] = 1
+    return ret
+
+
+def groupby_dense_f64_integral_key_f64_sum_checked(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_nan=True,
+    bint skip_value_nan=False,
+):
+    """Checked dense sum kernel for float64 keys that hold integral values.
+
+    Fast path for keys that are finite, non-negative and exactly integral.
+    Returns ``0`` on success, ``-1`` when some key cannot be handled, or
+    ``max_key + 1`` when the dense state arrays must be grown first.  The
+    state arrays are only written when ``0`` is returned.
+    """
+    if not (keys.ndim == values.ndim == valid.ndim == 1):
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if not (keys.shape[0] == values.shape[0] == valid.shape[0]):
+        raise ValueError("keys, values and valid must have the same length")
+    if not (sums.ndim == present.ndim == 1):
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype("float64"):
+        raise TypeError("keys must have dtype float64")
+    if values.dtype != np.dtype("float64"):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype("bool"):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype("float64"):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype("bool"):
+        raise TypeError("present must have dtype bool")
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    cdef double[:] keys_mv = keys
+    cdef double[:] values_mv = values
+    cdef np.npy_bool[:] valid_mv = valid
+    cdef double[:] sums_mv = sums
+    cdef np.npy_bool[:] present_mv = present
+    cdef Py_ssize_t nrows = keys.shape[0]
+    cdef Py_ssize_t nslots = sums.shape[0]
+    cdef Py_ssize_t row
+    cdef double kf
+    cdef int64_t ki
+    cdef int64_t top = -1
+    cdef int status = 0
+    cdef double v
+
+    with nogil:
+        # Pass 1: validate every usable key and find the largest one.
+        for row in range(nrows):
+            if not valid_mv[row]:
+                continue
+            kf = keys_mv[row]
+            if kf != kf and skip_key_nan:
+                continue
+            # NaN (when not skippable), negative or too-large keys abort the scan
+            # before the cast, whose result would otherwise be undefined.
+            if kf != kf or kf < 0.0 or kf > 9223372036854774784.0:
+                status = -1
+                break
+            ki = <int64_t>kf
+            # A key with a fractional part cannot be used as an index.
+            if kf != <double>ki:
+                status = -1
+                break
+            if ki > top:
+                top = ki
+        if status == 0 and top >= nslots:
+            # State arrays are too small: report the needed size if it fits in a C int.
+            if top > 2147483646:
+                status = -1
+            else:
+                status = <int>top + 1
+        elif status == 0 and top >= 0:
+            # Pass 2: pass 1 accepted every usable key, so accumulate.
+            for row in range(nrows):
+                if not valid_mv[row]:
+                    continue
+                kf = keys_mv[row]
+                if kf != kf:
+                    if skip_key_nan:
+                        continue
+                    status = -1
+                    break
+                v = values_mv[row]
+                if skip_value_nan and v != v:
+                    continue
+                ki = <int64_t>kf
+                sums_mv[ki] += v
+                present_mv[ki] = 1
+    return status
+
+
+def groupby_dense_f32_integral_key_f64_sum_checked(
+    np.ndarray keys,
+    np.ndarray values,
+    np.ndarray valid,
+    np.ndarray sums,
+    np.ndarray present,
+    bint skip_key_nan=True,
+    bint skip_value_nan=False,
+):
+    """Checked dense float32-integral-key/float64 sum kernel.
+
+    Returns ``0`` on success, ``-1`` for a key that cannot be handled, or
+    ``max_key + 1`` when the state arrays must grow; they are only written on ``0``.
+    """
+    if not (keys.ndim == values.ndim == valid.ndim == 1):
+        raise ValueError("keys, values and valid must be 1-D arrays")
+    if not (keys.shape[0] == values.shape[0] == valid.shape[0]):
+        raise ValueError("keys, values and valid must have the same length")
+    if not (sums.ndim == present.ndim == 1):
+        raise ValueError("sums and present must be 1-D arrays")
+    if keys.dtype != np.dtype("float32"):
+        raise TypeError("keys must have dtype float32")
+    if values.dtype != np.dtype("float64"):
+        raise TypeError("values must have dtype float64")
+    if valid.dtype != np.dtype("bool"):
+        raise TypeError("valid must have dtype bool")
+    if sums.dtype != np.dtype("float64"):
+        raise TypeError("sums must have dtype float64")
+    if present.dtype != np.dtype("bool"):
+        raise TypeError("present must have dtype bool")
+    if present.shape[0] != sums.shape[0]:
+        raise ValueError("present and sums must have the same length")
+
+    cdef float[:] keys_mv = keys
+    cdef double[:] values_mv = values
+    cdef np.npy_bool[:] valid_mv = valid
+    cdef double[:] sums_mv = sums
+    cdef np.npy_bool[:] present_mv = present
+    cdef Py_ssize_t nrows = keys.shape[0]
+    cdef Py_ssize_t nslots = sums.shape[0]
+    cdef Py_ssize_t row
+    cdef float kf
+    cdef int64_t ki
+    cdef int64_t top = -1
+    cdef int status = 0
+    cdef double v
+
+    with nogil:
+        # Pass 1: validate every usable key and find the largest one.
+        for row in range(nrows):
+            if not valid_mv[row]:
+                continue
+            kf = keys_mv[row]
+            if kf != kf and skip_key_nan:
+                continue
+            if kf != kf or kf < 0.0 or kf > 16777216.0:
+                status = -1
+                break
+            ki = <int64_t>kf
+            if kf != <float>ki:
+                status = -1
+                break
+            if ki > top:
+                top = ki
+        if status == 0 and top >= nslots:
+            if top > 2147483646:
+                status = -1
+            else:
+                status = <int>top + 1
+        elif status == 0 and top >= 0:
+            # Pass 2: pass 1 accepted every usable key, so accumulate.
+            for row in range(nrows):
+                if not valid_mv[row]:
+                    continue
+                kf = keys_mv[row]
+                if kf != kf:
+                    if skip_key_nan:
+                        continue
+                    status = -1
+                    break
+                v = values_mv[row]
+                if skip_value_nan and v != v:
+                    continue
+                ki = <int64_t>kf
+                sums_mv[ki] += v
+                present_mv[ki] = 1
+    return status
+
+
+# ----------------------------------------------------------------------
+# Fused integer-key dense kernels
+# ----------------------------------------------------------------------
+
+ctypedef fused dense_int_key_t:  # key element types accepted by the fused dense kernels
+    int8_t
+    uint8_t
+    int16_t
+    uint16_t
+    int32_t
+    uint32_t
+    int64_t
+    uint64_t
+
+
+cdef inline int _dense_int_key_scan(
+    dense_int_key_t[:] keys_view,
+    np.npy_bool[:] valid_view,
+    Py_ssize_t n,
+    Py_ssize_t nstates,
+    bint skip_key_null,
+    int64_t key_null,
+    int* ret,
+) noexcept nogil:
+    # Validation pass shared by the fused dense kernels.  The outcome goes to ret[0]:
+    # 0 when every usable key already has a slot, -1 for a key that cannot be handled,
+    # otherwise the number of slots (max_key + 1) the state arrays must grow to.
+    # The function's own return value is always 0.
+    cdef Py_ssize_t row
+    cdef int64_t k
+    cdef int64_t top = -1
+    ret[0] = 0
+    for row in range(n):
+        if valid_view[row]:
+            k = <int64_t>keys_view[row]
+            if not (skip_key_null and k == key_null):
+                if k < 0:
+                    ret[0] = -1
+                    return 0
+                if k > top:
+                    top = k
+    if top >= nstates:
+        if top > 2147483646:  # top + 1 would not fit in a C int
+            ret[0] = -1
+        else:
+            ret[0] = <int>top + 1
+    return 0
+
+
+def groupby_dense_int_size_checked(
+    dense_int_key_t[:] keys,
+    np.npy_bool[:] valid,
+    int64_t[:] counts,
+    np.npy_bool[:] keys_present,
+    bint skip_key_null=False,
+    int64_t key_null=0,
+):
+    """Checked dense integer-key ``size`` kernel for all integer key widths."""
+    if keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys and valid must have the same length")
+    if counts.shape[0] != keys_present.shape[0]:
+        raise ValueError("counts and keys_present must have the same length")
+    cdef Py_ssize_t nrows = keys.shape[0]
+    cdef Py_ssize_t nslots = counts.shape[0]
+    cdef Py_ssize_t row
+    cdef int64_t k
+    cdef int status
+    with nogil:
+        # Validate first: the state arrays are only written when the scan reports 0.
+        _dense_int_key_scan(keys, valid, nrows, nslots, skip_key_null, key_null, &status)
+        if status == 0:
+            for row in range(nrows):
+                if valid[row]:
+                    k = <int64_t>keys[row]
+                    # Every valid row with a non-skipped key counts once.
+                    if not (skip_key_null and k == key_null):
+                        counts[k] += 1
+                        keys_present[k] = 1
+    return status
+
+
+def groupby_dense_int_count_checked(
+    dense_int_key_t[:] keys,
+    np.npy_bool[:] valid,
+    np.npy_bool[:] values_valid,
+    int64_t[:] counts,
+    np.npy_bool[:] keys_present,
+    bint skip_key_null=False,
+    int64_t key_null=0,
+):
+    """Count, per dense integer key, the accepted rows whose ``values_valid`` flag is set.
+
+    Every accepted row marks ``keys_present[key]``, even when its value flag is
+    false.  Returns the ``_dense_int_key_scan`` status; state is written only on 0.
+    """
+    if not (keys.shape[0] == valid.shape[0] and keys.shape[0] == values_valid.shape[0]):
+        raise ValueError("keys, valid and values_valid must have the same length")
+    if keys_present.shape[0] != counts.shape[0]:
+        raise ValueError("counts and keys_present must have the same length")
+    cdef Py_ssize_t row, nrows = keys.shape[0], nslots = counts.shape[0]
+    cdef int64_t slot
+    cdef int status
+    with nogil:
+        _dense_int_key_scan(keys, valid, nrows, nslots, skip_key_null, key_null, &status)
+        if status == 0:
+            for row in range(nrows):
+                if valid[row]:
+                    slot = <int64_t>keys[row]
+                    if not (skip_key_null and slot == key_null):
+                        keys_present[slot] = 1
+                        if values_valid[row]:
+                            counts[slot] += 1
+    return status
+
+
+def groupby_dense_int_f64_sum_checked(
+    dense_int_key_t[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,
+    double[:] sums,
+    np.npy_bool[:] value_present,
+    np.npy_bool[:] keys_present,
+    bint skip_key_null=False,
+    int64_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Accumulate float64 ``values`` into ``sums[key]`` for dense integer keys.
+
+    An accepted row always marks ``keys_present[key]``; its value is added (and
+    ``value_present[key]`` set) unless ``skip_value_nan`` is on and the value is NaN.
+    Returns the ``_dense_int_key_scan`` status; state is written only when it is 0.
+    """
+    if not (keys.shape[0] == values.shape[0] and keys.shape[0] == valid.shape[0]):
+        raise ValueError("keys, values and valid must have the same length")
+    if not (sums.shape[0] == value_present.shape[0] and sums.shape[0] == keys_present.shape[0]):
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t row, nrows = keys.shape[0], nslots = sums.shape[0]
+    cdef int64_t slot
+    cdef double sample
+    cdef int status
+    with nogil:
+        _dense_int_key_scan(keys, valid, nrows, nslots, skip_key_null, key_null, &status)
+        if status == 0:
+            for row in range(nrows):
+                if valid[row]:
+                    slot = <int64_t>keys[row]
+                    if not (skip_key_null and slot == key_null):
+                        keys_present[slot] = 1
+                        sample = values[row]
+                        if not (skip_value_nan and sample != sample):
+                            sums[slot] += sample
+                            value_present[slot] = 1
+    return status
+
+
+def groupby_dense_int_i64_sum_checked(
+    dense_int_key_t[:] keys,
+    int64_t[:] values,
+    np.npy_bool[:] valid,
+    int64_t[:] sums,
+    np.npy_bool[:] value_present,
+    np.npy_bool[:] keys_present,
+    bint skip_key_null=False,
+    int64_t key_null=0,
+):
+    """Accumulate int64 ``values`` into ``sums[key]`` for dense integer keys.
+
+    Every accepted row sets both ``keys_present[key]`` and ``value_present[key]``.
+    Returns the ``_dense_int_key_scan`` status; state is written only when it is 0.
+    """
+    if not (keys.shape[0] == values.shape[0] and keys.shape[0] == valid.shape[0]):
+        raise ValueError("keys, values and valid must have the same length")
+    if not (sums.shape[0] == value_present.shape[0] and sums.shape[0] == keys_present.shape[0]):
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t row, nrows = keys.shape[0], nslots = sums.shape[0]
+    cdef int64_t slot
+    cdef int status
+    with nogil:
+        _dense_int_key_scan(keys, valid, nrows, nslots, skip_key_null, key_null, &status)
+        if status == 0:
+            for row in range(nrows):
+                if valid[row]:
+                    slot = <int64_t>keys[row]
+                    if not (skip_key_null and slot == key_null):
+                        keys_present[slot] = 1
+                        sums[slot] += values[row]
+                        value_present[slot] = 1
+    return status
+
+
+def groupby_dense_int_f64_mean_checked(
+    dense_int_key_t[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,
+    double[:] sums,
+    int64_t[:] counts,
+    np.npy_bool[:] keys_present,
+    bint skip_key_null=False,
+    int64_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Update the running (sum, count) pair that a float64 mean is derived from.
+
+    An accepted row always marks ``keys_present[key]``; ``sums``/``counts`` advance
+    unless ``skip_value_nan`` is on and the value is NaN.  No division happens here.
+    Returns the ``_dense_int_key_scan`` status; state is written only when it is 0.
+    """
+    if not (keys.shape[0] == values.shape[0] and keys.shape[0] == valid.shape[0]):
+        raise ValueError("keys, values and valid must have the same length")
+    if not (sums.shape[0] == counts.shape[0] and sums.shape[0] == keys_present.shape[0]):
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t row, nrows = keys.shape[0], nslots = sums.shape[0]
+    cdef int64_t slot
+    cdef double sample
+    cdef int status
+    with nogil:
+        _dense_int_key_scan(keys, valid, nrows, nslots, skip_key_null, key_null, &status)
+        if status == 0:
+            for row in range(nrows):
+                if valid[row]:
+                    slot = <int64_t>keys[row]
+                    if not (skip_key_null and slot == key_null):
+                        keys_present[slot] = 1
+                        sample = values[row]
+                        if not (skip_value_nan and sample != sample):
+                            sums[slot] += sample
+                            counts[slot] += 1
+    return status
+
+
+def groupby_dense_int_f64_min_checked(
+    dense_int_key_t[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,
+    double[:] mins,
+    np.npy_bool[:] has_value,
+    np.npy_bool[:] keys_present,
+    bint skip_key_null=False,
+    int64_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Checked dense integer-key float64 min kernel; raises ValueError on mismatched array lengths."""
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:  # rows are indexed up to len(keys)
+        raise ValueError("keys, values and valid must have the same length")
+    if mins.shape[0] != has_value.shape[0] or mins.shape[0] != keys_present.shape[0]:  # the scan only bounds keys by len(mins)
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t i, n = keys.shape[0], nstates = mins.shape[0]
+    cdef int64_t key
+    cdef double value
+    cdef int ret
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # any other status leaves the state arrays untouched
+            for i in range(n):
+                key = <int64_t>keys[i]
+                if not valid[i] or (skip_key_null and key == key_null):
+                    continue
+                keys_present[key] = 1  # the key counts as seen even if its value is skipped below
+                value = values[i]
+                if skip_value_nan and value != value:  # value != value is true only for NaN
+                    continue
+                if not has_value[key] or value < mins[key]:  # NOTE(review): an unskipped NaN is kept only when it is the key's first value
+                    mins[key] = value
+                has_value[key] = 1
+    return ret  # status from _dense_int_key_scan: 0 = states updated, otherwise nothing was written
+
+
+def groupby_dense_int_f64_max_checked(
+    dense_int_key_t[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,
+    double[:] maxs,
+    np.npy_bool[:] has_value,
+    np.npy_bool[:] keys_present,
+    bint skip_key_null=False,
+    int64_t key_null=0,
+    bint skip_value_nan=False,
+):
+    """Checked dense integer-key float64 max kernel; raises ValueError on mismatched array lengths."""
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:  # rows are indexed up to len(keys)
+        raise ValueError("keys, values and valid must have the same length")
+    if maxs.shape[0] != has_value.shape[0] or maxs.shape[0] != keys_present.shape[0]:  # the scan only bounds keys by len(maxs)
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t i, n = keys.shape[0], nstates = maxs.shape[0]
+    cdef int64_t key
+    cdef double value
+    cdef int ret
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # any other status leaves the state arrays untouched
+            for i in range(n):
+                key = <int64_t>keys[i]
+                if not valid[i] or (skip_key_null and key == key_null):
+                    continue
+                keys_present[key] = 1  # the key counts as seen even if its value is skipped below
+                value = values[i]
+                if skip_value_nan and value != value:  # value != value is true only for NaN
+                    continue
+                if not has_value[key] or value > maxs[key]:  # NOTE(review): an unskipped NaN is kept only when it is the key's first value
+                    maxs[key] = value
+                has_value[key] = 1
+    return ret  # status from _dense_int_key_scan: 0 = states updated, otherwise nothing was written
+
+
+def groupby_dense_int_i64_min_checked(
+    dense_int_key_t[:] keys,
+    int64_t[:] values,
+    np.npy_bool[:] valid,
+    int64_t[:] mins,
+    np.npy_bool[:] has_value,
+    np.npy_bool[:] keys_present,
+    bint skip_key_null=False,
+    int64_t key_null=0,
+):
+    """Checked dense integer-key int64 min kernel; raises ValueError on mismatched array lengths."""
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:  # rows are indexed up to len(keys)
+        raise ValueError("keys, values and valid must have the same length")
+    if mins.shape[0] != has_value.shape[0] or mins.shape[0] != keys_present.shape[0]:  # the scan only bounds keys by len(mins)
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t i, n = keys.shape[0], nstates = mins.shape[0]
+    cdef int64_t key
+    cdef int64_t value
+    cdef int ret
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # any other status leaves the state arrays untouched
+            for i in range(n):
+                key = <int64_t>keys[i]
+                if not valid[i] or (skip_key_null and key == key_null):
+                    continue
+                keys_present[key] = 1
+                value = values[i]
+                if not has_value[key] or value < mins[key]:  # the first value for a key always seeds the minimum
+                    mins[key] = value
+                has_value[key] = 1
+    return ret  # status from _dense_int_key_scan: 0 = states updated, otherwise nothing was written
+
+
+def groupby_dense_int_i64_max_checked(
+    dense_int_key_t[:] keys,
+    int64_t[:] values,
+    np.npy_bool[:] valid,
+    int64_t[:] maxs,
+    np.npy_bool[:] has_value,
+    np.npy_bool[:] keys_present,
+    bint skip_key_null=False,
+    int64_t key_null=0,
+):
+    """Checked dense integer-key int64 max kernel; raises ValueError on mismatched array lengths."""
+    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:  # rows are indexed up to len(keys)
+        raise ValueError("keys, values and valid must have the same length")
+    if maxs.shape[0] != has_value.shape[0] or maxs.shape[0] != keys_present.shape[0]:  # the scan only bounds keys by len(maxs)
+        raise ValueError("state arrays must have the same length")
+    cdef Py_ssize_t i, n = keys.shape[0], nstates = maxs.shape[0]
+    cdef int64_t key
+    cdef int64_t value
+    cdef int ret
+    with nogil:
+        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
+        if ret == 0:  # any other status leaves the state arrays untouched
+            for i in range(n):
+                key = <int64_t>keys[i]
+                if not valid[i] or (skip_key_null and key == key_null):
+                    continue
+                keys_present[key] = 1
+                value = values[i]
+                if not has_value[key] or value > maxs[key]:  # the first value for a key always seeds the maximum
+                    maxs[key] = value
+                has_value[key] = 1
+    return ret  # status from _dense_int_key_scan: 0 = states updated, otherwise nothing was written
+
+
+# ----------------------------------------------------------------------
+# Arbitrary float-key hash kernels
+# ----------------------------------------------------------------------
+
+cdef inline uint64_t _f64_bits(double value) noexcept:
+    cdef uint64_t raw  # receives the bytes of ``value`` unchanged (bit pattern, not a numeric conversion)
+    memcpy(&raw, &value, sizeof(uint64_t))
+    return raw
+
+
+cdef inline uint64_t _mix_u64(uint64_t x) noexcept:
+    # 64-bit bit mixer: three xor-shift steps (30, 27 and 31 bits) with an unsigned
+    # multiply after each of the first two.  Shifts and constants are those of the
+    # SplitMix64 output function.
+    x = (x ^ (x >> 30)) * <uint64_t>0xbf58476d1ce4e5b9
+    x = (x ^ (x >> 27)) * <uint64_t>0x94d049bb133111eb
+    return x ^ (x >> 31)
+
+
+def groupby_hash_f64_f64(
+    double[:] keys,
+    double[:] values,
+    np.npy_bool[:] valid,
+    np.npy_bool[:] values_valid,
+    bint has_values,
+    bint dropna=True,
+):
+    """Hash arbitrary float64 keys and accumulate float64 group states.
+
+    Returns ``(keys, row_counts, value_counts, sums, mins, maxs, has_value)``, one
+    entry per distinct key in hash-table slot order; value states are only updated
+    when ``has_values`` is set and ``values_valid[i]`` is true.  NaN keys are dropped
+    when ``dropna`` is true, else all NaNs share one group; ``+0.0``/``-0.0`` share one.
+    """
+    if keys.shape[0] != valid.shape[0]:
+        raise ValueError("keys and valid must have the same length")
+    if has_values and (values.shape[0] != keys.shape[0] or values_valid.shape[0] != keys.shape[0]):  # values are not length-checked when has_values is false
+        raise ValueError("values, values_valid and keys must have the same length")
+
+    cdef Py_ssize_t n = keys.shape[0]
+    cdef Py_ssize_t cap = 1024  # table capacity; only ever doubled, so it stays a power of two
+    cdef Py_ssize_t used_count = 0  # number of occupied slots == number of distinct groups
+    cdef Py_ssize_t i, pos, old_pos, out_pos
+    cdef uint64_t mask = <uint64_t>cap - 1  # ``h & mask`` maps a hash to a slot index
+    cdef uint64_t bits, h, old_bits
+    cdef double key, value
+    cdef double nan_value = float("nan")
+    cdef uint64_t nan_bits = <uint64_t>0x7ff8000000000000  # single bit pattern under which every NaN key is stored
+    cdef bint value_ok
+
+    cdef uint64_t* table_bits = <uint64_t*>malloc(cap * sizeof(uint64_t))  # normalized key bits, used for hashing and equality
+    cdef np.npy_bool* table_used = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))  # slot occupancy flags
+    cdef double* table_keys = <double*>malloc(cap * sizeof(double))  # normalized key value reported in the output
+    cdef int64_t* row_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef int64_t* value_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef double* sums = <double*>malloc(cap * sizeof(double))
+    cdef double* mins = <double*>malloc(cap * sizeof(double))
+    cdef double* maxs = <double*>malloc(cap * sizeof(double))
+    cdef np.npy_bool* has_value = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+
+    cdef uint64_t* new_bits
+    cdef np.npy_bool* new_used
+    cdef double* new_keys
+    cdef int64_t* new_row_counts
+    cdef int64_t* new_value_counts
+    cdef double* new_sums
+    cdef double* new_mins
+    cdef double* new_maxs
+    cdef np.npy_bool* new_has_value
+    cdef Py_ssize_t old_cap
+    cdef uint64_t new_mask  # NOTE(review): declared but never used in this function
+
+    if (
+        table_bits == NULL
+        or table_used == NULL
+        or table_keys == NULL
+        or row_counts == NULL
+        or value_counts == NULL
+        or sums == NULL
+        or mins == NULL
+        or maxs == NULL
+        or has_value == NULL
+    ):
+        free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts)  # free(NULL) is a no-op, so all nine can be freed
+        free(sums); free(mins); free(maxs); free(has_value)
+        raise MemoryError()
+
+    for i in range(cap):
+        table_used[i] = 0  # only the occupancy flags are initialized; other arrays are set when a slot is claimed
+
+    try:
+        for i in range(n):
+            if not valid[i]:
+                continue
+            key = keys[i]
+            if key != key:  # true only for NaN
+                if dropna:
+                    continue
+                bits = nan_bits
+                key = nan_value
+            elif key == 0.0:  # matches both +0.0 and -0.0
+                key = 0.0
+                bits = 0
+            else:
+                bits = _f64_bits(key)
+
+            if (used_count + 1) * 2 >= cap:  # grow before a possible insert so fewer than half the slots are ever occupied
+                old_cap = cap
+                cap *= 2
+                mask = <uint64_t>cap - 1
+                new_bits = <uint64_t*>malloc(cap * sizeof(uint64_t))
+                new_used = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+                new_keys = <double*>malloc(cap * sizeof(double))
+                new_row_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_value_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_sums = <double*>malloc(cap * sizeof(double))
+                new_mins = <double*>malloc(cap * sizeof(double))
+                new_maxs = <double*>malloc(cap * sizeof(double))
+                new_has_value = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+                if (
+                    new_bits == NULL
+                    or new_used == NULL
+                    or new_keys == NULL
+                    or new_row_counts == NULL
+                    or new_value_counts == NULL
+                    or new_sums == NULL
+                    or new_mins == NULL
+                    or new_maxs == NULL
+                    or new_has_value == NULL
+                ):
+                    free(new_bits); free(new_used); free(new_keys); free(new_row_counts); free(new_value_counts)
+                    free(new_sums); free(new_mins); free(new_maxs); free(new_has_value)
+                    raise MemoryError()  # the old tables are still assigned; the finally block releases them
+                for pos in range(cap):
+                    new_used[pos] = 0
+                for old_pos in range(old_cap):  # re-insert every occupied slot into the larger table
+                    if not table_used[old_pos]:
+                        continue
+                    old_bits = table_bits[old_pos]
+                    h = _mix_u64(old_bits)
+                    pos = <Py_ssize_t>(h & mask)
+                    while new_used[pos]:  # linear probing; stored keys are distinct, so no equality test is needed
+                        pos = <Py_ssize_t>((pos + 1) & mask)
+                    new_used[pos] = 1
+                    new_bits[pos] = old_bits
+                    new_keys[pos] = table_keys[old_pos]
+                    new_row_counts[pos] = row_counts[old_pos]
+                    new_value_counts[pos] = value_counts[old_pos]
+                    new_sums[pos] = sums[old_pos]
+                    new_mins[pos] = mins[old_pos]
+                    new_maxs[pos] = maxs[old_pos]
+                    new_has_value[pos] = has_value[old_pos]
+                free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts)
+                free(sums); free(mins); free(maxs); free(has_value)
+                table_bits = new_bits  # from here on the finally block frees the new arrays
+                table_used = new_used
+                table_keys = new_keys
+                row_counts = new_row_counts
+                value_counts = new_value_counts
+                sums = new_sums
+                mins = new_mins
+                maxs = new_maxs
+                has_value = new_has_value
+
+            h = _mix_u64(bits)
+            pos = <Py_ssize_t>(h & mask)
+            while table_used[pos] and table_bits[pos] != bits:  # probe linearly until the key or a free slot is found
+                pos = <Py_ssize_t>((pos + 1) & mask)
+            if not table_used[pos]:  # first occurrence of this key: claim the slot with empty state
+                table_used[pos] = 1
+                table_bits[pos] = bits
+                table_keys[pos] = key
+                row_counts[pos] = 0
+                value_counts[pos] = 0
+                sums[pos] = 0.0
+                mins[pos] = 0.0  # placeholder; overwritten by the first valid value (has_value[pos] is still 0)
+                maxs[pos] = 0.0  # placeholder, same as mins
+                has_value[pos] = 0
+                used_count += 1
+
+            row_counts[pos] += 1  # counts every accepted row, with or without a valid value
+            if has_values:
+                value_ok = values_valid[i]
+                if value_ok:
+                    value = values[i]
+                    value_counts[pos] += 1
+                    sums[pos] += value
+                    if not has_value[pos] or value < mins[pos]:
+                        mins[pos] = value
+                    if not has_value[pos] or value > maxs[pos]:
+                        maxs[pos] = value
+                    has_value[pos] = 1
+
+        out_keys = np.empty(used_count, dtype=np.float64)
+        out_row_counts = np.empty(used_count, dtype=np.int64)
+        out_value_counts = np.empty(used_count, dtype=np.int64)
+        out_sums = np.empty(used_count, dtype=np.float64)
+        out_mins = np.empty(used_count, dtype=np.float64)
+        out_maxs = np.empty(used_count, dtype=np.float64)
+        out_has_value = np.empty(used_count, dtype=bool)
+
+        out_pos = 0
+        for pos in range(cap):  # compact the occupied slots into the output arrays, in slot order
+            if not table_used[pos]:
+                continue
+            out_keys[out_pos] = table_keys[pos]
+            out_row_counts[out_pos] = row_counts[pos]
+            out_value_counts[out_pos] = value_counts[pos]
+            out_sums[out_pos] = sums[pos]
+            out_mins[out_pos] = mins[pos]
+            out_maxs[out_pos] = maxs[pos]
+            out_has_value[out_pos] = has_value[pos]
+            out_pos += 1
+        return out_keys, out_row_counts, out_value_counts, out_sums, out_mins, out_maxs, out_has_value
+    finally:
+        free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts)  # runs on both the return and the error path
+        free(sums); free(mins); free(maxs); free(has_value)
+
+
+def groupby_hash_i64x2_f64(
+    int64_t[:] key0,
+    int64_t[:] key1,
+    double[:] values,
+    np.npy_bool[:] valid,
+    np.npy_bool[:] values_valid,
+    bint has_values,
+):
+    """Hash two int64-normalized keys and accumulate float64 group states; returns ``(k0, k1, row_counts, value_counts, sums, mins, maxs, has_value)``."""
+    if key0.shape[0] != key1.shape[0] or key0.shape[0] != valid.shape[0]:
+        raise ValueError("key0, key1 and valid must have the same length")
+    if has_values and (values.shape[0] != key0.shape[0] or values_valid.shape[0] != key0.shape[0]):  # values are not length-checked when has_values is false
+        raise ValueError("values, values_valid and keys must have the same length")
+
+    cdef Py_ssize_t n = key0.shape[0]
+    cdef Py_ssize_t cap = 1024  # table capacity; only ever doubled, so it stays a power of two
+    cdef Py_ssize_t used_count = 0  # number of occupied slots == number of distinct (key0, key1) pairs
+    cdef Py_ssize_t i, pos, old_pos, out_pos
+    cdef uint64_t mask = <uint64_t>cap - 1  # ``h & mask`` maps a hash to a slot index
+    cdef uint64_t h
+    cdef int64_t k0, k1
+    cdef double value
+    cdef bint value_ok
+
+    cdef int64_t* table_k0 = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef int64_t* table_k1 = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef np.npy_bool* table_used = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))  # slot occupancy flags
+    cdef int64_t* row_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef int64_t* value_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+    cdef double* sums = <double*>malloc(cap * sizeof(double))
+    cdef double* mins = <double*>malloc(cap * sizeof(double))
+    cdef double* maxs = <double*>malloc(cap * sizeof(double))
+    cdef np.npy_bool* has_value = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+
+    cdef int64_t* new_k0
+    cdef int64_t* new_k1
+    cdef np.npy_bool* new_used
+    cdef int64_t* new_row_counts
+    cdef int64_t* new_value_counts
+    cdef double* new_sums
+    cdef double* new_mins
+    cdef double* new_maxs
+    cdef np.npy_bool* new_has_value
+    cdef Py_ssize_t old_cap
+
+    if (
+        table_k0 == NULL
+        or table_k1 == NULL
+        or table_used == NULL
+        or row_counts == NULL
+        or value_counts == NULL
+        or sums == NULL
+        or mins == NULL
+        or maxs == NULL
+        or has_value == NULL
+    ):
+        free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts)  # free(NULL) is a no-op, so all nine can be freed
+        free(sums); free(mins); free(maxs); free(has_value)
+        raise MemoryError()
+
+    for i in range(cap):
+        table_used[i] = 0  # only the occupancy flags are initialized; other arrays are set when a slot is claimed
+
+    try:
+        for i in range(n):
+            if not valid[i]:
+                continue
+            k0 = key0[i]
+            k1 = key1[i]
+
+            if (used_count + 1) * 2 >= cap:  # grow before a possible insert so fewer than half the slots are ever occupied
+                old_cap = cap
+                cap *= 2
+                mask = <uint64_t>cap - 1
+                new_k0 = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_k1 = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_used = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+                new_row_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_value_counts = <int64_t*>malloc(cap * sizeof(int64_t))
+                new_sums = <double*>malloc(cap * sizeof(double))
+                new_mins = <double*>malloc(cap * sizeof(double))
+                new_maxs = <double*>malloc(cap * sizeof(double))
+                new_has_value = <np.npy_bool*>malloc(cap * sizeof(np.npy_bool))
+                if (
+                    new_k0 == NULL
+                    or new_k1 == NULL
+                    or new_used == NULL
+                    or new_row_counts == NULL
+                    or new_value_counts == NULL
+                    or new_sums == NULL
+                    or new_mins == NULL
+                    or new_maxs == NULL
+                    or new_has_value == NULL
+                ):
+                    free(new_k0); free(new_k1); free(new_used); free(new_row_counts); free(new_value_counts)
+                    free(new_sums); free(new_mins); free(new_maxs); free(new_has_value)
+                    raise MemoryError()  # the old tables are still assigned; the finally block releases them
+                for pos in range(cap):
+                    new_used[pos] = 0
+                for old_pos in range(old_cap):  # re-insert every occupied slot into the larger table
+                    if not table_used[old_pos]:
+                        continue
+                    h = _mix_u64(<uint64_t>table_k0[old_pos]) ^ _mix_u64(<uint64_t>table_k1[old_pos] + <uint64_t>0x9e3779b97f4a7c15)  # must match the lookup hash below
+                    pos = <Py_ssize_t>(h & mask)
+                    while new_used[pos]:  # linear probing; stored pairs are distinct, so no equality test is needed
+                        pos = <Py_ssize_t>((pos + 1) & mask)
+                    new_used[pos] = 1
+                    new_k0[pos] = table_k0[old_pos]
+                    new_k1[pos] = table_k1[old_pos]
+                    new_row_counts[pos] = row_counts[old_pos]
+                    new_value_counts[pos] = value_counts[old_pos]
+                    new_sums[pos] = sums[old_pos]
+                    new_mins[pos] = mins[old_pos]
+                    new_maxs[pos] = maxs[old_pos]
+                    new_has_value[pos] = has_value[old_pos]
+                free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts)
+                free(sums); free(mins); free(maxs); free(has_value)
+                table_k0 = new_k0  # from here on the finally block frees the new arrays
+                table_k1 = new_k1
+                table_used = new_used
+                row_counts = new_row_counts
+                value_counts = new_value_counts
+                sums = new_sums
+                mins = new_mins
+                maxs = new_maxs
+                has_value = new_has_value
+
+            h = _mix_u64(<uint64_t>k0) ^ _mix_u64(<uint64_t>k1 + <uint64_t>0x9e3779b97f4a7c15)  # the offset on k1 makes (a, b) and (b, a) hash differently
+            pos = <Py_ssize_t>(h & mask)
+            while table_used[pos] and (table_k0[pos] != k0 or table_k1[pos] != k1):  # probe linearly until the pair or a free slot is found
+                pos = <Py_ssize_t>((pos + 1) & mask)
+            if not table_used[pos]:  # first occurrence of this pair: claim the slot with empty state
+                table_used[pos] = 1
+                table_k0[pos] = k0
+                table_k1[pos] = k1
+                row_counts[pos] = 0
+                value_counts[pos] = 0
+                sums[pos] = 0.0
+                mins[pos] = 0.0  # placeholder; overwritten by the first valid value (has_value[pos] is still 0)
+                maxs[pos] = 0.0  # placeholder, same as mins
+                has_value[pos] = 0
+                used_count += 1
+
+            row_counts[pos] += 1  # counts every accepted row, with or without a valid value
+            if has_values:
+                value_ok = values_valid[i]
+                if value_ok:
+                    value = values[i]
+                    value_counts[pos] += 1
+                    sums[pos] += value
+                    if not has_value[pos] or value < mins[pos]:
+                        mins[pos] = value
+                    if not has_value[pos] or value > maxs[pos]:
+                        maxs[pos] = value
+                    has_value[pos] = 1
+
+        out_k0 = np.empty(used_count, dtype=np.int64)
+        out_k1 = np.empty(used_count, dtype=np.int64)
+        out_row_counts = np.empty(used_count, dtype=np.int64)
+        out_value_counts = np.empty(used_count, dtype=np.int64)
+        out_sums = np.empty(used_count, dtype=np.float64)
+        out_mins = np.empty(used_count, dtype=np.float64)
+        out_maxs = np.empty(used_count, dtype=np.float64)
+        out_has_value = np.empty(used_count, dtype=bool)
+
+        out_pos = 0
+        for pos in range(cap):  # compact the occupied slots into the output arrays, in slot order
+            if not table_used[pos]:
+                continue
+            out_k0[out_pos] = table_k0[pos]
+            out_k1[out_pos] = table_k1[pos]
+            out_row_counts[out_pos] = row_counts[pos]
+            out_value_counts[out_pos] = value_counts[pos]
+            out_sums[out_pos] = sums[pos]
+            out_mins[out_pos] = mins[pos]
+            out_maxs[out_pos] = maxs[pos]
+            out_has_value[out_pos] = has_value[pos]
+            out_pos += 1
+        return out_k0, out_k1, out_row_counts, out_value_counts, out_sums, out_mins, out_maxs, out_has_value
+    finally:
+        free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts)  # runs on both the return and the error path
+        free(sums); free(mins); free(maxs); free(has_value)
diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx
index 759f5980..91072bea 100644
--- a/src/blosc2/indexing_ext.pyx
+++ b/src/blosc2/indexing_ext.pyx
@@ -2495,3 +2495,6 @@ def keysort_keys_indices(np.ndarray keys, np.ndarray indices):
         return None
     _keysort_ndarray(keys, indices)
     return None
+
+
+# ----------------------------------------------------------------------
diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py
new file mode 100644
index 00000000..fec1fca4
--- /dev/null
+++ b/tests/ctable/test_groupby.py
@@ -0,0 +1,400 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from dataclasses import dataclass, make_dataclass
+
+import numpy as np
+import pytest
+
+import blosc2
+from blosc2 import CTable
+
+
+@dataclass
+class SalesRow:  # row schema used with the DATA fixture: (city, category, sales, qty)
+    city: str = blosc2.field(blosc2.string(max_length=16))
+    category: int = blosc2.field(blosc2.int32())
+    sales: float = blosc2.field(blosc2.float64(nullable=True), default=0.0)  # nullable measure
+    qty: int = blosc2.field(blosc2.int32(), default=0)
+
+
+DATA = [  # tuples follow the SalesRow field order: (city, category, sales, qty)
+    ("Paris", 1, 10.0, 1),
+    ("Paris", 1, np.nan, 2),  # NaN sales
+    ("Rome", 1, 20.0, 3),
+    ("Paris", 2, 30.0, 4),
+    ("Rome", 1, 40.0, 5),
+    ("Berlin", 2, np.nan, 6),  # Berlin's only row has NaN sales
+]
+
+
+def col(table, name):  # the first `table.nrows` entries of column `name`, as a list
+    return list(table._cols[name][: table.nrows])
+
+
+def rows(table):  # one tuple per row index, fields ordered as in table.col_names
+    return [tuple(table._cols[field][idx] for field in table.col_names) for idx in range(table.nrows)]
+
+
+def test_groupby_size_counts_rows_per_group():
+    """size() reports how many rows of DATA fall into each city group."""
+    table = CTable(SalesRow, new_data=DATA)
+    grouped = table.group_by("city", sort=True).size()
+
+    assert grouped.col_names == ["city", "size"]
+    assert rows(grouped) == [("Berlin", 1), ("Paris", 3), ("Rome", 2)]
+
+
+def test_groupby_count_counts_non_null_values():
+    """count("sales") tallies only non-NaN sales, so Berlin (NaN only) gets 0."""
+    table = CTable(SalesRow, new_data=DATA)
+    counted = table.group_by("city", sort=True).count("sales")
+
+    assert counted.col_names == ["city", "sales_count"]
+    assert rows(counted) == [("Berlin", 0), ("Paris", 2), ("Rome", 2)]
+
+
+def test_groupby_agg_numeric_reductions():
+    """agg() with a list of reductions yields one "<column>_<op>" output per op."""
+    table = CTable(SalesRow, new_data=DATA)
+    res = table.group_by("city", sort=True).agg({"sales": ["sum", "mean", "min", "max", "count"]})
+
+    assert res.col_names == ["city", "sales_sum", "sales_mean", "sales_min", "sales_max", "sales_count"]
+    records = rows(res)
+    berlin = records[0]
+    assert berlin[0] == "Berlin"
+    # Berlin's single sales value is NaN: sum/mean/min/max are NaN and the count is 0.
+    for idx in (1, 2, 3, 4):
+        assert np.isnan(berlin[idx])
+    assert berlin[5] == 0
+    assert records[1] == ("Paris", 40.0, 20.0, 10.0, 30.0, 2)
+    assert records[2] == ("Rome", 60.0, 30.0, 20.0, 40.0, 2)
+
+
+def test_groupby_multi_key_size():
+    """Grouping by a list of keys yields one row per distinct (city, category) pair."""
+    table = CTable(SalesRow, new_data=DATA)
+    sizes = table.group_by(["city", "category"], sort=True).size()
+
+    assert rows(sizes) == [("Berlin", 2, 1), ("Paris", 1, 2), ("Paris", 2, 1), ("Rome", 1, 2)]
+
+
+def test_groupby_respects_views_and_deleted_rows():
+    """Only rows that survive delete(0) and the qty >= 3 view are counted."""
+    table = CTable(SalesRow, new_data=DATA)
+    table.delete(0)
+    filtered = table.where("qty >= 3")
+    sizes = filtered.group_by("city", sort=True).size()
+
+    assert rows(sizes) == [("Berlin", 1), ("Paris", 1), ("Rome", 2)]
+
+
+@dataclass
+class DictRow:  # a dictionary-typed key column plus an int32 measure
+    city: str = blosc2.field(blosc2.dictionary())
+    sales: int = blosc2.field(blosc2.int32())
+
+
+def test_groupby_dictionary_key_groups_by_decoded_value():
+    """A dictionary-typed key column is grouped and reported by its string values."""
+    table = CTable(DictRow, new_data=[("Paris", 10), ("Rome", 20), ("Paris", 30)])
+    summed = table.group_by("city", sort=True).agg({"sales": "sum"})
+
+    assert summed.col_names == ["city", "sales_sum"]
+    assert rows(summed) == [("Paris", 40), ("Rome", 20)]
+
+
+def test_groupby_dictionary_key_beyond_default_code_capacity():
+    """5000 rows alternating between two cities collapse into two groups of 2500."""
+    cities = ("Paris", "Rome")
+    table = CTable(DictRow, new_data=[(cities[i % 2], 1) for i in range(5000)])
+    sizes = table.group_by("city", sort=True).size()
+
+    assert rows(sizes) == [("Paris", 2500), ("Rome", 2500)]
+
+
+def test_groupby_dropna_key_default_and_false():
+    """Rows with a None key are dropped by default and kept when dropna=False."""
+    table = CTable(DictRow, new_data=[("Paris", 10), (None, 20), ("Paris", 30)])
+    without_null = table.group_by("city", sort=True).size()
+    with_null = table.group_by("city", sort=True, dropna=False).size()
+
+    assert rows(without_null) == [("Paris", 2)]
+    assert rows(with_null) == [(None, 1), ("Paris", 2)]
+
+
+def test_groupby_agg_star_size():
+    """agg({"*": "size"}) yields the number of rows in each city group."""
+    table = CTable(SalesRow, new_data=DATA)
+    sizes = table.group_by("city", sort=True).agg({"*": "size"})
+
+    assert rows(sizes) == [("Berlin", 1), ("Paris", 3), ("Rome", 2)]
+
+
+def test_groupby_empty_table_returns_empty_result():
+    """Grouping a table with no rows gives zero rows but keeps the output columns."""
+    table = CTable(SalesRow)
+    sizes = table.group_by("city").size()
+
+    assert sizes.nrows == 0
+    assert sizes.col_names == ["city", "size"]
+
+
+@dataclass
+class Int32FloatRow:  # int32 key, float64 value
+    key: int = blosc2.field(blosc2.int32())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@dataclass
+class Float64KeyRow:  # float64 key, float64 value
+    key: float = blosc2.field(blosc2.float64())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@dataclass
+class Float32KeyRow:  # float32 key, float64 value
+    key: float = blosc2.field(blosc2.float32())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@dataclass
+class DictFloatRow:  # dictionary-typed string key, float64 value
+    key: str = blosc2.field(blosc2.dictionary())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@pytest.mark.parametrize(
+    ("row_type", "data", "expected"),
+    [
+        (
+            Int32FloatRow,  # int32 keys
+            [(0, 1.5), (2, 10.0), (1, 2.5), (2, 3.0), (0, 4.0)],
+            [(0, 5.5), (1, 2.5), (2, 13.0)],
+        ),
+        (
+            Float64KeyRow,  # float64 keys holding whole numbers
+            [(0.0, 1.5), (2.0, 10.0), (1.0, 2.5), (2.0, 3.0), (0.0, 4.0)],
+            [(0.0, 5.5), (1.0, 2.5), (2.0, 13.0)],
+        ),
+        (
+            Float32KeyRow,  # float32 keys holding whole numbers
+            [(0.0, 1.5), (2.0, 10.0), (1.0, 2.5), (2.0, 3.0), (0.0, 4.0)],
+            [(0.0, 5.5), (1.0, 2.5), (2.0, 13.0)],
+        ),
+        (
+            DictFloatRow,  # dictionary keys
+            [("a", 1.5), ("c", 10.0), ("b", 2.5), ("c", 3.0), ("a", 4.0)],
+            [("a", 5.5), ("c", 13.0), ("b", 2.5)],  # a, c, b: the order first seen in data
+        ),
+    ],
+)
+def test_groupby_fast_path_sum_variants(row_type, data, expected):
+    """Per-key sums of "value" without sort=True, for several key column types."""
+    table = CTable(row_type, new_data=data)
+    summed = table.group_by("key").agg({"value": "sum"})
+
+    assert rows(summed) == expected
+
+
+def test_groupby_float_integral_fast_path_falls_back_for_non_integral_keys():
+    """Float keys with fractional parts (0.5, 1.5) are still grouped and summed."""
+    table = CTable(Float64KeyRow, new_data=[(0.5, 1.0), (1.5, 2.0), (0.5, 3.0)])
+    summed = table.group_by("key").agg({"value": "sum"})
+
+    assert rows(summed) == [(0.5, 4.0), (1.5, 2.0)]
+
+
+def test_groupby_float_integral_fast_path_falls_back_for_nan_group_when_kept():
+    """With dropna=False the NaN key gets its own group, listed after the 0.0 group."""
+    table = CTable(Float64KeyRow, new_data=[(0.0, 1.0), (np.nan, 2.0), (0.0, 3.0)])
+    summed = table.group_by("key", dropna=False).agg({"value": "sum"})
+
+    records = rows(summed)
+    assert records[0] == (0.0, 4.0)
+    assert np.isnan(records[1][0])
+    assert records[1][1] == 2.0
+
+
+def test_groupby_rejects_bad_engine():
+    """group_by() raises ValueError when called with engine="cython"."""
+    table = CTable(SalesRow, new_data=DATA)
+    with pytest.raises(ValueError):
+        table.group_by("city", engine="cython")
+
+
+@pytest.mark.parametrize(
+    ("schema_factory", "values"),
+    [
+        (blosc2.int8, [0, 2, 1, 2, 0]),
+        (blosc2.uint8, [0, 2, 1, 2, 0]),
+        (blosc2.int16, [0, 2, 1, 2, 0]),
+        (blosc2.uint16, [0, 2, 1, 2, 0]),
+        (blosc2.int32, [0, 2, 1, 2, 0]),
+        (blosc2.uint32, [0, 2, 1, 2, 0]),
+        (blosc2.int64, [0, 2, 1, 2, 0]),
+        (blosc2.uint64, [0, 2, 1, 2, 0]),
+    ],
+)
+def test_groupby_cython_fused_integer_key_dtypes(schema_factory, values):
+    """Every integer key type must produce the same sorted per-key sums."""
+    row_type = make_dataclass(
+        f"FusedKey{schema_factory.__name__}Row",
+        [
+            ("key", int, blosc2.field(schema_factory())),
+            ("value", int, blosc2.field(blosc2.int32())),
+        ],
+    )
+    table = CTable(row_type, new_data=list(zip(values, [1, 10, 2, 3, 4], strict=True)))
+    summed = table.group_by("key", sort=True).agg({"value": "sum"})
+
+    assert rows(summed) == [(0, 5), (1, 2), (2, 13)]
+
+
+def test_groupby_cython_integer_key_more_integer_aggs():
+    """size plus count/sum/mean/min/max over int32 values grouped by an int16 key."""
+    row_type = make_dataclass(
+        "IntKeyMoreIntegerAggsRow",
+        [
+            ("key", int, blosc2.field(blosc2.int16())),
+            ("value", int, blosc2.field(blosc2.int32())),
+        ],
+    )
+    table = CTable(row_type, new_data=[(0, 5), (1, 10), (0, -2), (1, 20), (2, 7)])
+    spec = {"*": "size", "value": ["count", "sum", "mean", "min", "max"]}
+    res = table.group_by("key", sort=True).agg(spec)
+    assert rows(res) == [(0, 2, 2, 3, 1.5, -2, 5), (1, 2, 2, 30, 15.0, 10, 20), (2, 1, 1, 7, 7.0, 7, 7)]
+
+
+def test_groupby_cython_integer_key_nullable_float_aggs():
+    """Key 1 only has NaN values: its count is 0 and its float reductions are NaN."""
+    row_type = make_dataclass(
+        "IntKeyNullableFloatAggsRow",
+        [
+            ("key", int, blosc2.field(blosc2.uint16())),
+            ("value", float, blosc2.field(blosc2.float64(nullable=True))),
+        ],
+    )
+    table = CTable(row_type, new_data=[(0, 1.5), (1, np.nan), (0, 2.5), (1, np.nan), (2, 10.0)])
+    res = table.group_by("key", sort=True).agg({"value": ["count", "sum", "mean", "min", "max"]})
+
+    records = rows(res)
+    assert records[0] == (0, 2, 4.0, 2.0, 1.5, 2.5)
+    null_group = records[1]
+    assert null_group[0] == 1
+    assert null_group[1] == 0
+    # Columns 2..5 hold sum, mean, min and max, all NaN for this group.
+    for idx in (2, 3, 4, 5):
+        assert np.isnan(null_group[idx])
+    assert records[2] == (2, 1, 10.0, 10.0, 10.0, 10.0)
+
+
+def test_groupby_cython_arbitrary_float_key_aggs():
+    """Fractional float keys: groups are expected in ascending key order."""
+    table = CTable(
+        Float64KeyRow,
+        new_data=[(0.5, 1.0), (1.25, 10.0), (0.5, 3.0), (-2.5, 4.0), (1.25, 2.0)],
+    )
+    res = table.group_by("key").agg({"value": ["count", "sum", "mean", "min", "max"]})
+
+    assert rows(res) == [
+        (-2.5, 1, 4.0, 4.0, 4.0, 4.0),
+        (0.5, 2, 4.0, 2.0, 1.0, 3.0),
+        (1.25, 2, 12.0, 6.0, 2.0, 10.0),
+    ]
+
+
+def test_groupby_cython_arbitrary_float_key_nan_and_signed_zero():
+    """-0.0 and 0.0 share one group; NaN keys form a group only with dropna=False."""
+    table = CTable(Float64KeyRow, new_data=[(-0.0, 1.0), (0.0, 2.0), (np.nan, 3.0), (np.nan, 4.0)])
+    without_nan = table.group_by("key").agg({"value": "sum"})
+    with_nan = table.group_by("key", dropna=False).agg({"value": "sum"})
+
+    assert rows(without_nan) == [(0.0, 3.0)]
+    records = rows(with_nan)
+    assert records[0] == (0.0, 3.0)
+    assert np.isnan(records[1][0])
+    assert records[1][1] == 7.0
+
+
+@dataclass
+class TwoIntKeyFloatRow:  # int16 + uint16 key pair with a nullable float64 value
+    key0: int = blosc2.field(blosc2.int16())
+    key1: int = blosc2.field(blosc2.uint16())
+    value: float = blosc2.field(blosc2.float64(nullable=True), default=0.0)
+
+
+def test_groupby_cython_two_integer_key_hash_aggs():
+    """Pair (1, 1) has a NaN value: size counts it (2) while count ignores it (1)."""
+    table = CTable(
+        TwoIntKeyFloatRow,
+        new_data=[(0, 1, 1.0), (0, 1, 3.0), (0, 2, 10.0), (1, 1, np.nan), (1, 1, 5.0)],
+    )
+    res = table.group_by(["key0", "key1"], sort=True).agg(
+        {"*": "size", "value": ["count", "sum", "mean", "min", "max"]}
+    )
+
+    assert rows(res) == [
+        (0, 1, 2, 2, 4.0, 2.0, 1.0, 3.0),
+        (0, 2, 1, 1, 10.0, 10.0, 10.0, 10.0),
+        (1, 1, 2, 1, 5.0, 5.0, 5.0, 5.0),
+    ]
+
+
+@dataclass
+class DictIntKeyFloatRow:  # dictionary key + int32 key with a float64 value
+    key0: str = blosc2.field(blosc2.dictionary())
+    key1: int = blosc2.field(blosc2.int32())
+    value: float = blosc2.field(blosc2.float64())
+
+
+def test_groupby_cython_dictionary_integer_key_hash():
+    """A dictionary key combined with an int32 key groups on the (key0, key1) pair."""
+    table = CTable(DictIntKeyFloatRow, new_data=[("b", 2, 1.0), ("a", 1, 2.0), ("b", 2, 3.0)])
+    summed = table.group_by(["key0", "key1"], sort=True).agg({"value": "sum"})
+
+    assert rows(summed) == [("a", 1, 2.0), ("b", 2, 4.0)]
+
+
+def test_groupby_convenience_numeric_methods():
+    """Check the per-reduction shortcuts against agg().
+
+    Each of sum/mean/min/max called on a group-by must return the same rows
+    as agg({"qty": <op>}) on an identically built group-by.
+    """
+    table = CTable(SalesRow, new_data=DATA)
+
+    def by_city():
+        return table.group_by("city", sort=True)
+
+    assert rows(by_city().sum("qty")) == rows(by_city().agg({"qty": "sum"}))
+    assert rows(by_city().mean("qty")) == rows(by_city().agg({"qty": "mean"}))
+    assert rows(by_city().min("qty")) == rows(by_city().agg({"qty": "min"}))
+    assert rows(by_city().max("qty")) == rows(by_city().agg({"qty": "max"}))
+
+
+def test_groupby_persistent_output_urlpath(tmp_path):
+    """agg(..., urlpath=...) writes a result that can be reopened read-only."""
+    table = CTable(SalesRow, new_data=DATA)
+    target = tmp_path / "grouped.b2d"
+    stored = table.group_by("city", sort=True).agg({"qty": "sum"}, urlpath=target)
+    stored.close()
+
+    reopened = CTable.open(str(target), mode="r")
+    assert reopened.col_names == ["city", "qty_sum"]
+    assert rows(reopened) == [("Berlin", 6), ("Paris", 7), ("Rome", 8)]
+
+
+def test_groupby_persistent_output_urlpath_on_convenience_method(tmp_path):
+    """The mean() shortcut also accepts urlpath and its result can be reopened."""
+    table = CTable(SalesRow, new_data=DATA)
+    target = tmp_path / "grouped_mean.b2d"
+    stored = table.group_by("city", sort=True).mean("qty", urlpath=target)
+    stored.close()
+
+    reopened = CTable.open(str(target), mode="r")
+    assert rows(reopened) == [("Berlin", 6.0), ("Paris", 7 / 3), ("Rome", 4.0)]
diff --git a/tests/ctable/test_nested_append.py b/tests/ctable/test_nested_append.py
new file mode 100644
index 00000000..7be94a6e
--- /dev/null
+++ b/tests/ctable/test_nested_append.py
@@ -0,0 +1,96 @@
+"""Tests for Ph 3.1: append/extend with nested dict rows on tables with dotted column names."""
+
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import blosc2
+
+
+@dataclass
+class FlatTrip:  # flat schema; _make_nested_table() renames these columns to dotted names
+    trip_begin_lon: float
+    trip_begin_lat: float
+    payment_fare: float
+
+
+def _make_nested_table():
+    """Build an empty FlatTrip CTable and rename its columns to dotted names."""
+    table = blosc2.CTable(FlatTrip)
+    dotted_names = ("trip.begin.lon", "trip.begin.lat", "payment.fare")
+    for dotted in dotted_names:
+        table.rename_column(dotted.replace(".", "_"), dotted)
+    return table
+
+
+def test_append_nested_dict():
+    """Fully nested dict rows passed to append() land in the dotted columns."""
+    table = _make_nested_table()
+    for lon, lat, fare in [(1.0, 2.0, 10.0), (3.0, 4.0, 20.0)]:
+        table.append({"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}})
+
+    assert table.nrows == 2
+    np.testing.assert_array_almost_equal(table["trip.begin.lon"][:], [1.0, 3.0])
+    np.testing.assert_array_almost_equal(table["trip.begin.lat"][:], [2.0, 4.0])
+    np.testing.assert_array_almost_equal(table["payment.fare"][:], [10.0, 20.0])
+
+
+def test_append_flat_dotted_dict_unchanged():
+    """A dict that already uses flat dotted keys is still accepted by append()."""
+    table = _make_nested_table()
+    flat_row = {"trip.begin.lon": 5.0, "trip.begin.lat": 6.0, "payment.fare": 30.0}
+    table.append(flat_row)
+    assert table.nrows == 1
+    assert table["trip.begin.lon"][0] == pytest.approx(5.0)
+
+
+def test_extend_list_of_nested_dicts():
+    """extend() accepts a list of nested dict rows and fills the dotted columns."""
+    table = _make_nested_table()
+    triples = [(1.0, 2.0, 10.0), (3.0, 4.0, 20.0), (5.0, 6.0, 30.0)]
+    nested_rows = [
+        {"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}}
+        for lon, lat, fare in triples
+    ]
+    table.extend(nested_rows)
+
+    assert table.nrows == 3
+    np.testing.assert_array_almost_equal(table["trip.begin.lon"][:], [1.0, 3.0, 5.0])
+    np.testing.assert_array_almost_equal(table["payment.fare"][:], [10.0, 20.0, 30.0])
+
+
+def test_extend_nested_dict_of_arrays():
+    """extend() accepts a nested dict whose leaves are per-column value lists."""
+    table = _make_nested_table()
+    columns = {
+        "trip": {"begin": {"lon": [1.0, 2.0, 3.0], "lat": [4.0, 5.0, 6.0]}},
+        "payment": {"fare": [10.0, 20.0, 30.0]},
+    }
+    table.extend(columns)
+
+    assert table.nrows == 3
+    np.testing.assert_array_almost_equal(table["trip.begin.lon"][:], [1.0, 2.0, 3.0])
+    np.testing.assert_array_almost_equal(table["trip.begin.lat"][:], [4.0, 5.0, 6.0])
+    np.testing.assert_array_almost_equal(table["payment.fare"][:], [10.0, 20.0, 30.0])
+
+
+def test_append_nested_dict_where_and_attribute_access():
+    """Rows appended as nested dicts are visible to where() and to attribute access."""
+    table = _make_nested_table()
+    for lon, lat, fare in [(1.0, 2.0, 5.0), (3.0, 4.0, 15.0), (5.0, 6.0, 25.0)]:
+        table.append({"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}})
+
+    pricey = table.where("payment.fare > 10")
+    assert pricey.nrows == 2
+    assert table.trip.begin.lon.max() == pytest.approx(5.0)
+
+
+def test_nested_dotted_string_where_in_aggregate():
+    """A where= string on an aggregate may refer to a dotted nested column name."""
+    table = _make_nested_table()
+    for lon, lat, fare in [(1.0, 2.0, 5.0), (3.0, 4.0, 15.0), (5.0, 6.0, 25.0)]:
+        table.append({"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}})
+
+    assert table.trip.begin.lon.sum(where="payment.fare > 10") == pytest.approx(8.0)
diff --git a/tests/ctable/test_object_spec.py b/tests/ctable/test_object_spec.py
new file mode 100644
index 00000000..9b6154dc
--- /dev/null
+++ b/tests/ctable/test_object_spec.py
@@ -0,0 +1,66 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+"""Tests for schema-less CTable object columns."""
+
+from dataclasses import dataclass
+
+import pytest
+
+import blosc2
+from blosc2 import CTable
+
+
+@dataclass
+class ObjectRow:  # int32 id plus a nullable object payload column
+    id: int = blosc2.field(blosc2.int32())
+    payload: object = blosc2.field(blosc2.object(nullable=True))
+
+
+def test_object_column_heterogeneous_values():
+    """An object column stores a dict, a tuple and None side by side."""
+    table = CTable(ObjectRow)
+    table.append([1, {"kind": "dict", "values": [1, 2]}])
+    table.append([2, ("tuple", 3)])
+    table.append([3, None])
+    assert table["payload"][:] == [{"kind": "dict", "values": [1, 2]}, ("tuple", 3), None]
+    assert table["payload"].is_varlen_scalar
+
+
+def test_object_column_persistence(tmp_path):
+    """Object payloads written to a urlpath read back equal after reopening."""
+    target = tmp_path / "objects.b2d"
+    table = CTable(ObjectRow, urlpath=str(target), mode="w")
+    table.extend([[1, {"x": 1}], [2, ["a", "b"]], [3, None]])
+    table.close()
+    reopened = CTable.open(str(target), mode="r")
+    assert reopened["payload"][:] == [{"x": 1}, ["a", "b"], None]
+
+
+def test_object_column_to_arrow_raises():  # to_arrow() must raise TypeError here
+    pytest.importorskip("pyarrow")
+    table = CTable(ObjectRow)
+    table.append([1, {"x": 1}])
+    with pytest.raises(TypeError, match="ObjectSpec columns"):
+        table.to_arrow()
+
+
+def test_object_column_rejects_none_when_not_nullable():  # object() without nullable=True
+    @dataclass
+    class StrictObjectRow:
+        payload: object = blosc2.field(blosc2.object())
+
+    strict_table = CTable(StrictObjectRow)
+    with pytest.raises(TypeError, match="not nullable"):
+        strict_table.append([None])
+
+
+def test_object_column_rejects_non_msgpack_value_on_flush():  # TypeError expected at close()
+    table = CTable(ObjectRow)
+    table.append([1, {"not-msgpack": {1, 2, 3}}])
+    with pytest.raises(TypeError):
+        table.close()
diff --git a/tests/test_group_reduce.py b/tests/test_group_reduce.py
new file mode 100644
index 00000000..856c25ef
--- /dev/null
+++ b/tests/test_group_reduce.py
@@ -0,0 +1,75 @@
+import numpy as np
+import pytest
+
+import blosc2
+
+
+def test_group_reduce_size_and_sum_integer_keys():
+    """op="size" and op="sum" on int16 keys return the sorted distinct keys."""
+    keys = np.array([2, 1, 2, 1, 2], dtype=np.int16)
+    values = np.array([10, 1, 30, 3, 50], dtype=np.int32)
+    size_groups, sizes = blosc2.group_reduce(keys, op="size", sort=True)
+    sum_groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True)
+
+    assert size_groups.dtype == keys.dtype
+    np.testing.assert_array_equal(size_groups, np.array([1, 2], dtype=np.int16))
+    np.testing.assert_array_equal(sizes, np.array([2, 3]))
+    np.testing.assert_array_equal(sum_groups, np.array([1, 2], dtype=np.int16))
+    np.testing.assert_array_equal(sums, np.array([4, 90]))
+
+
+def test_group_reduce_integer_keys_float_aggs_with_nan_values():
+    """count/mean/min/max over float values that contain NaN, keyed by uint16.
+
+    Key 1 only has NaN values: its count is 0 and its mean/min/max are NaN.
+    """
+    keys = np.array([0, 1, 0, 1, 2], dtype=np.uint16)
+    values = np.array([1.0, np.nan, 3.0, np.nan, 10.0])
+
+    groups, counts = blosc2.group_reduce(keys, values, op="count", sort=True)
+    _, means = blosc2.group_reduce(keys, values, op="mean", sort=True)
+    _, mins = blosc2.group_reduce(keys, values, op="min", sort=True)
+    _, maxs = blosc2.group_reduce(keys, values, op="max", sort=True)
+
+    np.testing.assert_array_equal(groups, np.array([0, 1, 2], dtype=np.uint16))
+    np.testing.assert_array_equal(counts, np.array([2, 0, 1]))
+    # Per reduction: the listed value for key 0, NaN for key 1, and 10.0 for key 2.
+    for reduced, key0_value in ((means, 2.0), (mins, 1.0), (maxs, 3.0)):
+        assert reduced[0] == key0_value
+        assert np.isnan(reduced[1])
+        assert reduced[2] == 10.0
+
+
+def test_group_reduce_arbitrary_float_keys_and_nan_key_group():
+    """-0.0/0.0 share a group and, with dropna=False, NaN keys form the last group."""
+    keys = np.array([0.5, np.nan, 0.5, -0.0, 0.0, np.nan])
+    values = np.array([1.0, 2.0, 3.0, 10.0, 20.0, 5.0])
+    groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True, dropna=False)
+
+    # Finite keys come first: the merged zero group (10 + 20), then 0.5 (1 + 3).
+    assert (groups[0], sums[0]) == (0.0, 30.0)
+    assert (groups[1], sums[1]) == (0.5, 4.0)
+    # Both NaN keys end up in one group (2 + 5).
+    assert np.isnan(groups[2])
+    assert sums[2] == 7.0
+
+
+def test_group_reduce_dropna_default_skips_nan_keys():
+    """Without dropna=False the row whose key is NaN contributes to no group."""
+    keys = np.array([1.0, np.nan, 1.0])
+    values = np.array([2.0, 10.0, 3.0])
+    kept_keys, kept_sums = blosc2.group_reduce(keys, values, op="sum", sort=True)
+
+    np.testing.assert_array_equal(kept_keys, np.array([1.0]))
+    np.testing.assert_array_equal(kept_sums, np.array([5.0]))
+
+
+def test_group_reduce_rejects_bad_inputs():  # every call below must raise ValueError
+    for args, op in (
+        ((np.ones((2, 2)),), "size"),  # 2-D keys array
+        ((np.arange(3),), "sum"),  # "sum" requested without a values array
+        ((np.arange(3), np.arange(2)), "sum"),  # 3 keys but only 2 values
+        ((np.arange(3),), "bad"),  # op name "bad"
+    ):
+        with pytest.raises(ValueError):
+            blosc2.group_reduce(*args, op=op)