From cfdd1b6d501aefd6f6366a93d307ce3aeaf11e66 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 07:02:39 +0200 Subject: [PATCH 01/17] Initial basic functionality (phase 1 of plan completed) --- bench/ctable/groupby.py | 119 ++++++++ doc/reference/ctable.rst | 20 ++ plans/ctable-groupby.md | 574 +++++++++++++++++++++++++++++++++++ src/blosc2/__init__.py | 2 + src/blosc2/ctable.py | 48 ++- src/blosc2/groupby.py | 548 +++++++++++++++++++++++++++++++++ tests/ctable/test_groupby.py | 151 +++++++++ 7 files changed, 1460 insertions(+), 2 deletions(-) create mode 100644 bench/ctable/groupby.py create mode 100644 plans/ctable-groupby.md create mode 100644 src/blosc2/groupby.py create mode 100644 tests/ctable/test_groupby.py diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py new file mode 100644 index 00000000..fb601eb3 --- /dev/null +++ b/bench/ctable/groupby.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +"""Phase-1 CTable group_by benchmark. + +Examples +-------- +python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --op sum +python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas +""" + +from __future__ import annotations + +import argparse +import dataclasses +import time +from pathlib import Path + +import numpy as np + +import blosc2 + + +def parse_int(text: str) -> int: + return int(text.replace("_", "")) + + +def build_row_type(dictionary: bool): + if dictionary: + + @dataclasses.dataclass + class Row: + key: str = blosc2.field(blosc2.dictionary()) + value: float = blosc2.field(blosc2.float64()) + + else: + + @dataclasses.dataclass + class Row: + key: int = blosc2.field(blosc2.int32()) + value: float = blosc2.field(blosc2.float64()) + + return Row + + +def make_data(nrows: int, ngroups: int, dictionary: bool, seed: int): + rng = np.random.default_rng(seed) + key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32) + values = rng.random(nrows, dtype=np.float64) + if dictionary: + keys = 
np.asarray([f"k{code}" for code in key_codes], dtype=object) + else: + keys = key_codes + return keys, values + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--rows", type=parse_int, default=10_000_000) + parser.add_argument("--groups", type=parse_int, default=1_000) + parser.add_argument("--chunk-size", type=parse_int, default=None) + parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key") + parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum") + parser.add_argument("--sort", action="store_true") + parser.add_argument("--pandas", action="store_true", help="Also run a pandas comparison if available") + parser.add_argument("--urlpath", type=Path, default=None, help="Optional persistent CTable path") + parser.add_argument("--seed", type=int, default=0) + args = parser.parse_args() + + print( + f"rows={args.rows:,} groups={args.groups:,} dictionary={args.dictionary} " + f"op={args.op} sort={args.sort} chunk_size={args.chunk_size} urlpath={args.urlpath}" + ) + + keys, values = make_data(args.rows, args.groups, args.dictionary, args.seed) + Row = build_row_type(args.dictionary) + + kwargs = {} + if args.urlpath is not None: + kwargs.update(urlpath=str(args.urlpath), mode="w") + + t0 = time.perf_counter() + table = blosc2.CTable(Row, new_data={"key": keys, "value": values}, expected_size=args.rows, **kwargs) + build_time = time.perf_counter() - t0 + print(f"ctable_build_seconds={build_time:.6f}") + + t0 = time.perf_counter() + gb = table.group_by("key", sort=args.sort, chunk_size=args.chunk_size) + if args.op == "size": + out = gb.size() + elif args.op == "count": + out = gb.count("value") + else: + out = gb.agg({"value": args.op}) + elapsed = time.perf_counter() - t0 + print(f"ctable_groupby_seconds={elapsed:.6f}") + print(f"result_rows={out.nrows:,}") + + if args.pandas: + try: + import pandas as pd + except ImportError: + 
print("pandas_unavailable=true") + else: + df = pd.DataFrame({"key": keys, "value": values}) + t0 = time.perf_counter() + if args.op == "size": + pdf = df.groupby("key", sort=args.sort).size() + elif args.op == "count": + pdf = df.groupby("key", sort=args.sort)["value"].count() + else: + pdf = df.groupby("key", sort=args.sort)["value"].agg(args.op) + pandas_elapsed = time.perf_counter() - t0 + print(f"pandas_groupby_seconds={pandas_elapsed:.6f}") + print(f"pandas_result_rows={len(pdf):,}") + + table.close() + + +if __name__ == "__main__": + main() diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst index 12e99ea0..2ed012d0 100644 --- a/doc/reference/ctable.rst +++ b/doc/reference/ctable.rst @@ -233,6 +233,7 @@ When a NumPy structured array is needed, materialize explicitly:: CTable.sample CTable.sort_by CTable.iter_sorted + CTable.group_by .. automethod:: CTable.where .. automethod:: CTable.view @@ -242,6 +243,25 @@ When a NumPy structured array is needed, materialize explicitly:: .. automethod:: CTable.sample .. automethod:: CTable.sort_by .. automethod:: CTable.iter_sorted +.. automethod:: CTable.group_by + + +Group-by reductions +------------------- + +:meth:`CTable.group_by` returns a lightweight deferred group-by object. It is +not a table view; methods such as :meth:`~blosc2.CTableGroupBy.size`, +:meth:`~blosc2.CTableGroupBy.count`, and +:meth:`~blosc2.CTableGroupBy.agg` materialize a new :class:`CTable` with +one row per group:: + + by_city = t.group_by("city", sort=True) + counts = by_city.size() # row count per city / COUNT(*) + non_null = by_city.count("sales") # non-null sales count / COUNT(sales) + totals = by_city.agg({"sales": "sum"}) + +.. 
autoclass:: CTableGroupBy + :members: size, count, agg Mutations diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md new file mode 100644 index 00000000..b86ac078 --- /dev/null +++ b/plans/ctable-groupby.md @@ -0,0 +1,574 @@ +# CTable `group_by` implementation plan + +## Goals + +Add a `CTable.group_by()` facility that is efficient for columnar, compressed +CTable storage while keeping the first implementation simple and correct. The +long-term goal is to expose a compressed-aware group-reduce primitive that can +power `CTable.group_by()` and possibly other analytics APIs. + +Key design principles: + +- Stay columnar: read only grouping columns, aggregation columns, and the live-row mask. +- Keep memory bounded: process the table chunk-by-chunk; never require materializing all rows. +- Use indexes opportunistically, but do not require them. +- Start with a NumPy implementation, then add Cython kernels for hot paths. +- Keep compressed input columns compressed between chunks; only chunk slices become NumPy buffers. + +## Proposed user API + +Initial high-level API could be: + +```python +t.group_by("city").agg({"sales": "sum", "id": "count"}) +t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"], "price": "max"}) +``` + +Potential variants: + +```python +t.group_by("city", sort=False).agg(...) +t.group_by("city", engine="auto").agg(...) +t.group_by("city").count() +t.group_by("city").sum("sales") +``` + +The result should be a new in-memory `CTable` initially. Persistent output can +be added later via an `out=`/`urlpath=` option if useful. + +Output column naming should be predictable, for example: + +```text +city, sales_sum, id_count +country, city, sales_sum, sales_mean, price_max +``` + +For a single aggregation on a column, decide whether to preserve the original +column name or always suffix it. Always suffixing is less ambiguous. 
+ +## Supported MVP semantics + +Start with: + +- Group keys: + - fixed-width scalar columns: bool, signed/unsigned ints, floats, datetimes/timedeltas; + - dictionary-encoded string columns via integer codes. +- Aggregations: + - `count` / `size`; + - `sum`; + - `min`; + - `max`; + - `mean` implemented as `sum + count` during accumulation. +- Respect live rows (`_valid_rows`) and views. +- Read only required columns. + +Defer initially: + +- list columns; +- vlstring/vlbytes/object/struct scalar columns, except dictionary columns; +- arbitrary Python aggregators; +- group-by over computed columns, unless they can be chunk-evaluated cleanly; +- disk spilling for very high cardinality; +- parallel hash aggregation. + +## Baseline algorithm: chunked hash aggregation + +The default implementation should be a chunked hash group-reduce: + +```text +global_accumulator = hash table: group_key -> aggregate state + +for each row chunk: + read/decompress key column chunk(s) + read/decompress aggregation value column chunk(s) + read/decompress valid-row mask chunk + apply live-row mask + + build local grouping keys + compute local partial aggregates + merge local partial aggregates into global_accumulator + +finalize aggregate state +materialize group keys and aggregate columns into a result CTable +``` + +The important point is that the global hash table is proportional to the number +of groups, not to the number of rows: + +```text +memory ~= O(number_of_groups * (key_size + aggregate_state_size + hash overhead)) +``` + +The global accumulator should normally live uncompressed in memory. It is +accessed for every chunk merge, so compressing it would likely dominate runtime. +The compressed-aware aspect is in the input traversal: compressed CTable columns +are decompressed only one bounded chunk at a time. + +## Columnar chunk traversal + +Use synchronized physical row ranges. 
For each range: + +```python +valid = np.asarray(self._valid_rows[start:stop]) +key1 = np.asarray(self._cols[key1_name][start:stop]) +value = np.asarray(self._cols[value_name][start:stop]) + +key1 = key1[valid] +value = value[valid] +``` + +Where possible, align chunk ranges with the physical chunks of `_valid_rows` or +input columns to improve decompression locality. The exact chunk size should be +configurable internally; a reasonable default can be based on CTable/NDArray +chunk sizes, with a cap to avoid excessive temporaries. + +For dictionary columns, read codes instead of decoded strings: + +```python +codes = np.asarray(dict_col.codes[start:stop], dtype=np.int32) +``` + +Decode codes only when materializing the final result. + +## NumPy MVP local grouping + +For a single key: + +```python +unique_keys, inverse = np.unique(keys, return_inverse=True) +partial_sum = np.bincount(inverse, weights=values) +partial_count = np.bincount(inverse) +``` + +For min/max use `np.minimum.at` / `np.maximum.at` into arrays initialized with +appropriate identity values. + +For multiple fixed-width keys, build a structured array per chunk: + +```python +keys = np.empty(n, dtype=[("k0", key0.dtype), ("k1", key1.dtype)]) +keys["k0"] = key0 +keys["k1"] = key1 + +unique_keys, inverse = np.unique(keys, return_inverse=True) +``` + +This is simple and should be the initial correctness path. Costs to be aware of: + +- structured key array allocation and copy per chunk; +- `np.unique` is generally sort-based; +- `return_inverse=True` allocates one integer per live row in the chunk; +- aggregations are separate passes over the inverse. + +These costs are acceptable for the MVP because they are bounded by chunk size. 
+ +## Global accumulator design + +For the Python MVP, a dictionary is adequate: + +```python +acc: dict[group_key, AggregateState] +``` + +Where `group_key` is: + +- a Python scalar for single numeric/dictionary keys; +- a tuple for multi-column keys; +- a normalized representation for null-aware keys when nullable support is added. + +`AggregateState` can store arrays or small Python objects with fields like: + +```text +count +sum +min +max +mean_sum +mean_count +``` + +For `mean`, keep `sum` and `count` and divide only during finalization. For +multiple aggregations over the same input column, share state when possible +(e.g. `mean` and `sum` can reuse the same sum). + +For better performance after the API stabilizes, replace parts of this with a +NumPy-backed accumulator or Cython state object. + +## Index-aware paths + +Indexes are optional accelerators. + +### FULL index on a single group key + +A FULL index stores sorted values and positions. For a single grouping key, +this can make group-by a sorted scan: + +```text +obtain sorted positions from FULL index +scan rows in key order +detect group boundaries +reduce contiguous runs +``` + +Benefits: + +- no hash table needed for the grouping key; +- no sort needed at query time; +- output is naturally sorted by key. + +This is most useful for: + +```python +t.create_index("city", kind=blosc2.IndexKind.FULL) +t.group_by("city").agg(...) +``` + +Caveats: + +- only directly helps single-key group-by; +- for multi-key group-by, a single-column FULL index only partially helps; +- stale indexes must be ignored or rebuilt; +- views/deleted rows still require intersecting with `_valid_rows`. + +### Bucket/segment indexes + +The default predicate indexes are useful before group-by, not usually during it: + +```python +t.where("year == 2024").group_by("city") +``` + +The index accelerates `where()`, reducing rows scanned by group-by. It does not +by itself provide grouped order. 
+ +## Existing `indexing_ext` sort helpers + +`indexing_ext.pyx` contains: + +- `keysort_values_positions(values, positions)`; +- `keysort_keys_indices(keys, indices)`. + +These sort a 1-D scalar key array in-place while carrying an `int64` side array. +They are useful for sort/index oriented paths, especially: + +- building/reusing FULL indexes; +- single-key sort-based group-by; +- dictionary-code group-by where codes are scalar integers. + +They are not the main primitive for hash-based group-reduce because hash +aggregation does not require sorted keys. They also do not directly support +multi-column keys, variable-length strings, or fused aggregation. + +## Compressed-aware `group_reduce` primitive + +Longer term, introduce a lower-level primitive used by `CTable.group_by()`: + +```python +blosc2.group_reduce( + keys=[key_ndarray1, key_ndarray2], + values=[value_ndarray1], + aggs={"value": ["sum", "count"]}, + mask=valid_rows, + chunk_size=None, + engine="auto", +) +``` + +However, the first implementation can live under an internal module, e.g. +`blosc2.groupby`, before becoming public. + +The primitive should be compressed-aware in traversal, not necessarily operate +on compressed bytes directly. General key comparison/grouping still needs +values. The intended execution is: + +```text +read compressed NDArray slices -> NumPy buffers -> local group/reduce -> merge +``` + +This avoids full-column materialization while keeping the hot loop simple. + +## Cython optimization plan + +### Phase 1: Python/NumPy only + +Files: + +```text +src/blosc2/ctable.py # public API / GroupBy facade +src/blosc2/groupby.py # internal implementation and NumPy engine +``` + +Focus on correctness, tests, API shape, and an early benchmark harness. The +benchmark should be added in Phase 1, before any Cython work, so that later +optimization decisions are driven by numbers rather than intuition. 
At minimum, +add one reusable script under `bench/` that can generate or open a CTable and +compare: + +- chunked NumPy hash group-by; +- single-key sort/scan group-by where practical; +- dictionary-code grouping; +- pandas or DuckDB on an equivalent in-memory/external dataset for rough context. + +The initial benchmark does not need to be exhaustive, but it should record row +count, cardinality, chunk size, compression parameters, elapsed time, peak memory +if easy to capture, and whether the input is in-memory, `.b2d`, or `.b2z`. + +### Phase 2: optimized kernels in `indexing_ext.pyx` + +To avoid adding a third extension too early, place initial Cython kernels in +`src/blosc2/indexing_ext.pyx` under a clearly separated section: + +```cython +# ---------------------------------------------------------------------- +# Group-reduce kernels +# ---------------------------------------------------------------------- +``` + +Initial kernels should target high-value simple cases: + +- single `int32`/`int64` key; +- dictionary-code keys (`int32`); +- numeric value columns; +- `count`, `sum`, `min`, `max`, maybe `mean` via sum/count. + +The Python layer remains responsible for: + +- CTable schema validation; +- chunk iteration; +- decompression into NumPy buffers; +- final result CTable construction; +- fallback to NumPy for unsupported dtypes. + +The Cython layer consumes NumPy buffers and updates a hash accumulator or returns +chunk partial aggregates. + +### Phase 3: split to `groupby_ext.pyx` if it grows + +If the optimized path grows to include multi-column hash tables, nullable key +semantics, multiple aggregate state layouts, spilling, or parallel execution, +move it to a dedicated extension: + +```text +src/blosc2/groupby_ext.pyx +``` + +This is cleaner long-term than overloading `indexing_ext.pyx` indefinitely. +Avoid putting this functionality in `blosc2_ext.pyx`; group-reduce is a +higher-level analytics/query primitive, not core compression/NDArray machinery. 
+ +## What custom Cython buys over structured NumPy keys + +NumPy structured dtype is a good MVP, but a custom Cython hash reducer can avoid +several costs: + +- no temporary packed structured key array; +- no sort-based `np.unique` for every chunk; +- no `inverse` array of length equal to the chunk; +- factorization and aggregation can be fused in one pass; +- multiple aggregations can be updated together; +- direct processing of CTable's columnar SoA layout; +- easier future per-thread hash tables and merges. + +A typical optimized loop is: + +```text +for i in range(n): + key = key_columns[i] + slot = hash_lookup_or_insert(key) + acc_sum[slot] += value[i] + acc_count[slot] += 1 + acc_min[slot] = min(acc_min[slot], value[i]) +``` + +For multi-column keys, the Cython path can hash directly across multiple arrays +without packing them into a structured array first. + +## High-cardinality strategy + +Hash aggregation can become memory-heavy when the number of groups approaches +the number of rows. Add safeguards and future alternatives: + +- estimate cardinality from early chunks; +- expose/keep an internal memory limit; +- fall back to sort-based group-by when cardinality is too high; +- use FULL index if available; +- later: partitioned hash group-by with spill-to-disk. + +For the MVP, document that very high-cardinality group-by may require memory +proportional to output cardinality. + +## Null and NaN semantics + +Define before finalizing the API: + +- Should null sentinel values form their own group, be skipped, or be controlled + by `dropna=`? +- Should float NaNs group together? NumPy `unique` behavior and hash behavior + must be made consistent. +- Nullable booleans/dictionary null codes need explicit handling. + +Suggested default, matching common dataframe behavior: + +```python +t.group_by("key", dropna=True) # default? 
skip null keys +t.group_by("key", dropna=False) # include null group +``` + +But this should be aligned with existing CTable nullable semantics. + +## Documentation + +Add user-facing docstrings and Sphinx documentation for the new group-by API: + +- `CTable.group_by()` docstring with parameters such as `keys`, `sort`, + `dropna`, `engine`, and `chunk_size` if exposed; +- the returned `GroupBy`/`CTableGroupBy` facade docstring, documenting that it + is a deferred operation builder, not a `CTable` view; +- `GroupBy.size()`, `GroupBy.count()`, and `GroupBy.agg()` docstrings; +- examples in the CTable documentation showing row counts, non-null counts, + sums/means, dictionary string grouping, and optional sorted output. + +The class may be described as "the object returned by `CTable.group_by()`" and +need not encourage direct construction. + +## Tests + +Add tests under `tests/ctable/`, covering: + +- single-key count/sum/min/max/mean; +- multi-key group-by; +- dictionary string key grouping; +- views and deleted rows; +- empty table and all-filtered view; +- different numeric dtypes and bool keys; +- nullable key behavior once specified; +- result schema and output column names; +- consistency with a reference Python/pandas-like implementation; +- chunk-size variation to ensure chunk-boundary independence; +- optional FULL-index path returns same results as hash path. + +For deterministic tests, sort result rows before comparison unless the API +guarantees output order. + +## Benchmark plan + +Add a small but useful benchmark during Phase 1. This is important because it +sets the baseline for the NumPy implementation and identifies which Cython +kernels are worth writing first. + +Benchmarks should include: + +- low-cardinality single key, e.g. 10 groups over 100M rows; +- medium cardinality, e.g. 
100k groups; +- high cardinality, near unique keys; +- dictionary string columns grouped by codes; +- multi-column keys; +- multiple aggregations over one value column; +- multiple value columns; +- with and without FULL index; +- persistent `.b2d`/`.b2z` inputs. + +Compare: + +- Python/NumPy chunked implementation; +- Cython hash path when available; +- sort-based path using existing keysort helpers; +- pandas/duckdb for sanity, where feasible. + +## Open decisions and recommended defaults + +### Public API and result column names + +Recommendation: use a small `GroupBy` facade and an explicit `.agg()` method: + +```python +t.group_by("city").agg({"sales": "sum"}) +t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"], "price": "max"}) +``` + +Always suffix aggregate output columns as `<column>_<aggregation>`: + +```text +city, sales_sum +country, city, sales_sum, sales_mean, price_max +``` + +This avoids ambiguity and remains stable when users later request multiple +aggregations on the same input column. Convenience methods should include at least `GroupBy.size()` and +`GroupBy.count(column)` early: + +```python +t.group_by("city").size() # row count per group / COUNT(*) +t.group_by("city").count("sales") # non-null sales count / COUNT(sales) +``` + +Additional conveniences like `.sum()`, `.mean()`, `.min()`, and `.max()` can be +added after `.agg()` is stable. + +### Output order + +Recommendation: make output order configurable, with hash insertion order as the +fast default and sorted output as an option: + +```python +t.group_by("city", sort=False).agg(...) # default: fastest +t.group_by("city", sort=True).agg(...) # sort by group keys +``` + +When a single-key FULL index is used, sorted output can be produced naturally. +Tests should not depend on default order unless explicitly testing order. 
+ +### Null and NaN grouping semantics + +Recommendation: provide `dropna=` and default to `True`, matching common +dataframe behavior: + +```python +t.group_by("key", dropna=True) # skip rows with null/NaN keys +t.group_by("key", dropna=False) # include a null/NaN group +``` + +For `dropna=False`, all NaNs in a floating key should belong to one group, and +nullable sentinels/dictionary null codes should belong to one null group. The +NumPy and Cython engines must normalize these cases consistently. + +### `size` vs `count` + +Recommendation: support both, with distinct meanings, scoped to group-by rather +than as new top-level `CTable.size()` / `CTable.count()` methods: + +- `GroupBy.size()`: number of rows in the group, independent of value-column + nulls; equivalent to SQL `COUNT(*)` and pandas `groupby(...).size()`; +- `GroupBy.count(column)`: number of non-null values for a specific value + column; equivalent to SQL `COUNT(column)` and pandas `groupby(...)[column].count()`; +- `count` aggregation, e.g. `GroupBy.agg({"sales": "count"})`, should be an + equivalent spelling for `GroupBy.count("sales")`. + +Prefer `size()` over `len()` for the MVP. Although `len` resembles Python's +`len()`, `size()` follows pandas group-by terminology and avoids suggesting that +it returns a single scalar length. A `len()` alias can be considered later if +there is demand. + +For non-nullable columns, `count(col)` equals `size`. For nullable columns, +`count(col)` excludes null sentinels/NaNs according to the column null policy. +The MVP can implement `GroupBy.size()` first and add nullable-aware `count` as +nullable aggregate semantics mature. + +### Public `blosc2.group_reduce()` exposure + +Recommendation: keep `group_reduce` internal at first, e.g. in +`blosc2.groupby`, until the API and semantics settle through `CTable.group_by()`. 
+Expose a public `blosc2.group_reduce()` only after: + +- aggregation semantics are stable; +- null/NaN behavior is documented; +- output representation is clear; +- benchmarks show it is useful outside CTable. + +### Cython extension placement + +Recommendation: start optimized kernels in `indexing_ext.pyx` only for Phase 2, +under a clearly marked group-reduce section, to avoid build-system churn while +validating the approach. If the code grows beyond a few focused kernels or needs +its own persistent state classes, move it to `groupby_ext.pyx`. Do not place it +in `blosc2_ext.pyx`. diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 8a587c06..29ed2024 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -628,6 +628,7 @@ def _raise(exc): # Note: bool, bytes, string shadow builtins in the blosc2 namespace by design — # they are schema spec constructors (b2.bool(), b2.bytes(), etc.). from .ctable import DEFAULT_NULL_POLICY, Column, CTable, NullPolicy, get_null_policy, null_policy +from .groupby import CTableGroupBy from .ndarray import ( abs, acos, @@ -804,6 +805,7 @@ def _raise(exc): # Classes "C2Array", "CParams", + "CTableGroupBy", "Batch", "BatchArray", # Enums diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index 1f80dc3d..56a89583 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -20,7 +20,7 @@ import re import shutil from collections import namedtuple -from collections.abc import Iterable, Mapping +from collections.abc import Iterable, Mapping, Sequence from dataclasses import MISSING, dataclass from dataclasses import field as dataclass_field from textwrap import TextWrapper @@ -2338,12 +2338,15 @@ def _init_columns( ) continue if self._is_dictionary_column(col): - self._cols[col.name] = storage.create_dictionary_column( + dict_col = storage.create_dictionary_column( col.name, spec=col.spec, cparams=col_storage.get("cparams"), dparams=col_storage.get("dparams"), ) + if len(dict_col.codes) < 
expected_size: + dict_col.resize((expected_size,)) + self._cols[col.name] = dict_col continue # Recompute chunks/blocks using the actual dtype so that wide # string columns (e.g. U183642) don't produce multi-GB chunks. @@ -3482,6 +3485,47 @@ def select(self, cols: list[str]) -> CTable: obj._col_widths = {name: self._col_widths[name] for name in cols if name in self._col_widths} return obj + def group_by( + self, + keys: str | Sequence[str], + *, + sort: bool = False, + dropna: bool = True, + engine: str = "auto", + chunk_size: int | None = None, + ): + """Return a deferred group-by object for this table. + + Parameters + ---------- + keys: + Column name or sequence of column names to group by. + sort: + If ``True``, sort the result by the group keys. The default + ``False`` preserves the hash aggregation order and is usually + faster. + dropna: + If ``True`` (default), rows with null/NaN group keys are skipped. + If ``False``, null/NaN keys form their own group. + engine: + Execution engine. Phase 1 accepts ``"auto"`` and uses the NumPy + chunked implementation. + chunk_size: + Optional number of physical rows processed per chunk. + + Returns + ------- + CTableGroupBy + A lightweight deferred operation builder. Call methods such as + ``.size()``, ``.count(column)`` or ``.agg({...})`` to materialize a + grouped result as a new :class:`CTable`. + """ + if engine != "auto": + raise ValueError("Only engine='auto' is supported for group_by() in Phase 1") + from blosc2.groupby import CTableGroupBy + + return CTableGroupBy(self, keys, sort=sort, dropna=dropna, engine=engine, chunk_size=chunk_size) + def describe(self) -> None: """Print a per-column statistical summary. 
diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py new file mode 100644 index 00000000..6dd6874e --- /dev/null +++ b/src/blosc2/groupby.py @@ -0,0 +1,548 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Group-by support for :class:`blosc2.CTable`. + +This module contains the Phase-1, NumPy-based implementation. It is deliberately +chunked and columnar: only grouping columns, aggregation columns, and the +live-row mask are read from the source table. +""" + +from __future__ import annotations + +import copy +import dataclasses +import math +import re +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Any, Literal + +import numpy as np + +from blosc2.schema import DictionarySpec, SchemaSpec, float64, int64 +from blosc2.schema import bool as b2_bool +from blosc2.schema import field as b2_field + +if TYPE_CHECKING: # pragma: no cover + from blosc2.ctable import CTable + + +AggName = Literal["size", "count", "sum", "mean", "min", "max"] + +_IDENTIFIER_RE = re.compile(r"^[A-Za-z_]\w*$") +_NAN_KEY = ("__blosc2_groupby_nan__",) + + +@dataclasses.dataclass +class _AggSpec: + input_col: str | None + op: AggName + output_col: str + + +@dataclasses.dataclass +class _AggState: + op: AggName + value: Any = None + count: int = 0 + + +class CTableGroupBy: + """Deferred group-by operation returned by :meth:`CTable.group_by`. + + The object stores the source table, grouping keys, and execution options. + It is not a :class:`CTable` view and does not materialize grouped data until + a terminal method such as :meth:`size`, :meth:`count`, or :meth:`agg` is + called. 
+ """ + + def __init__( + self, + table: CTable, + keys: str | Sequence[str], + *, + sort: bool = False, + dropna: bool = True, + engine: str = "auto", + chunk_size: int | None = None, + ) -> None: + if isinstance(keys, str): + keys = [keys] + else: + keys = list(keys) + if not keys: + raise ValueError("group_by() requires at least one key column") + + self.table = table + self.keys = [table._logical_to_physical_name(k) for k in keys] + self.sort = bool(sort) + self.dropna = bool(dropna) + self.engine = engine + self.chunk_size = chunk_size + + for name in self.keys: + if name in table._computed_cols: + raise NotImplementedError("group_by() over computed columns is not supported yet") + if name not in table._cols: + raise KeyError(f"No column named {name!r}. Available: {table.col_names}") + col_info = table._schema.columns_by_name[name] + if table._is_list_column(col_info) or table._is_varlen_scalar_column(col_info): + raise TypeError(f"Cannot group by variable-length/list column {name!r} in Phase 1") + + def size(self): + """Return row counts per group as a new :class:`CTable`. + + This is equivalent to SQL ``COUNT(*)``: it counts rows in each group and + is independent of null values in non-key columns. + """ + return self._execute([_AggSpec(None, "size", "size")]) + + def count(self, column: str): + """Return non-null value counts for *column* per group. + + This is equivalent to SQL ``COUNT(column)`` and to + ``group_by(...).agg({column: "count"})``. + """ + col = self.table._logical_to_physical_name(column) + return self._execute([_AggSpec(col, "count", f"{col}_count")]) + + def agg(self, aggregations: Mapping[str, str | Sequence[str]]): + """Aggregate value columns per group. + + Parameters + ---------- + aggregations: + Mapping from input column name to an aggregation name or list of + names. Supported operations in Phase 1 are ``"count"``, ``"sum"``, + ``"mean"``, ``"min"``, ``"max"`` and the special row-count spelling + ``{"*": "size"}``. 
+ """ + specs = self._normalize_aggs(aggregations) + return self._execute(specs) + + def _normalize_aggs(self, aggregations: Mapping[str, str | Sequence[str]]) -> list[_AggSpec]: + if not isinstance(aggregations, Mapping) or not aggregations: + raise ValueError("agg() requires a non-empty mapping") + specs: list[_AggSpec] = [] + for col_name, ops in aggregations.items(): + if isinstance(ops, str): + op_list = [ops] + else: + op_list = list(ops) + if not op_list: + raise ValueError(f"No aggregations specified for column {col_name!r}") + + if col_name == "*": + for op in op_list: + if op != "size": + raise ValueError("Only the 'size' aggregation is supported for '*' input") + specs.append(_AggSpec(None, "size", "size")) + continue + + physical = self.table._logical_to_physical_name(col_name) + self._validate_value_column(physical) + for op in op_list: + if op not in {"count", "sum", "mean", "min", "max"}: + raise ValueError(f"Unsupported aggregation {op!r}") + self._validate_agg_for_column(physical, op) + specs.append(_AggSpec(physical, op, f"{physical}_{op}")) + output_names = [s.output_col for s in specs] + if len(output_names) != len(set(output_names)): + raise ValueError("Aggregation output column names must be unique") + return specs + + def _validate_agg_for_column(self, name: str, op: str) -> None: + dtype = getattr(self.table._schema.columns_by_name[name].spec, "dtype", None) + if op in {"sum", "mean"} and dtype is not None and dtype.kind not in "biuf": + raise TypeError(f"Aggregation {op!r} is not supported for column {name!r} with dtype {dtype}") + if op in {"min", "max"} and dtype is not None and dtype.kind == "c": + raise TypeError(f"Aggregation {op!r} is not supported for complex column {name!r}") + + def _validate_value_column(self, name: str) -> None: + if name in self.table._computed_cols: + raise NotImplementedError("group_by() aggregations over computed columns are not supported yet") + if name not in self.table._cols: + raise KeyError(f"No column 
named {name!r}. Available: {self.table.col_names}") + col_info = self.table._schema.columns_by_name[name] + if self.table._is_list_column(col_info) or self.table._is_varlen_scalar_column(col_info): + raise TypeError(f"Cannot aggregate variable-length/list column {name!r} in Phase 1") + if self.table._is_dictionary_column(col_info): + raise TypeError(f"Cannot aggregate dictionary column {name!r} in Phase 1") + + def _execute(self, specs: list[_AggSpec]): + self._validate_output_names(specs) + acc: dict[Any, dict[str, _AggState]] = {} + key_values: dict[Any, tuple[Any, ...]] = {} + + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + value_cols = sorted({s.input_col for s in specs if s.input_col is not None}) + + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + + raw_keys = [self._read_key_chunk(name, start, stop) for name in self.keys] + live_mask = valid.copy() + if self.dropna: + for name, values in zip(self.keys, raw_keys, strict=True): + live_mask &= ~self._null_mask(name, values, is_key=True) + if not np.any(live_mask): + continue + + keys_live = [np.asarray(values)[live_mask] for values in raw_keys] + n_live = len(keys_live[0]) + if n_live == 0: + continue + + unique_keys, inverse = self._factorize_keys(keys_live) + value_chunks = { + name: np.asarray(self.table._cols[name][start:stop])[live_mask] for name in value_cols + } + + partials = self._compute_partials(specs, unique_keys, inverse, value_chunks) + display_keys = self._display_keys(unique_keys) + normalized_keys = self._normalized_keys(display_keys) + self._merge_partials(acc, key_values, normalized_keys, display_keys, partials, specs) + + rows = self._final_rows(acc, key_values, specs) + return self._build_result(rows, specs) + + def _chunk_size(self) -> int: + if self.chunk_size is not None: + if self.chunk_size <= 0: + raise 
ValueError("chunk_size must be positive") + return int(self.chunk_size) + chunks = getattr(self.table._valid_rows, "chunks", None) + if chunks: + return max(int(chunks[0]), 1) + return 65536 + + def _read_key_chunk(self, name: str, start: int, stop: int) -> np.ndarray: + col_info = self.table._schema.columns_by_name[name] + if self.table._is_dictionary_column(col_info): + return np.asarray(self.table._cols[name].codes[start:stop], dtype=np.int32) + return np.asarray(self.table._cols[name][start:stop]) + + def _factorize_keys( + self, keys_live: list[np.ndarray] + ) -> tuple[np.ndarray | list[np.ndarray], np.ndarray]: + if len(keys_live) == 1: + unique, inverse = np.unique(keys_live[0], return_inverse=True) + return unique, inverse + + dtype = [(f"k{i}", arr.dtype) for i, arr in enumerate(keys_live)] + packed = np.empty(len(keys_live[0]), dtype=dtype) + for i, arr in enumerate(keys_live): + packed[f"k{i}"] = arr + unique, inverse = np.unique(packed, return_inverse=True) + return unique, inverse + + def _display_keys(self, unique_keys: np.ndarray | list[np.ndarray]) -> list[tuple[Any, ...]]: + if len(self.keys) == 1: + name = self.keys[0] + col_info = self.table._schema.columns_by_name[name] + values = [] + for value in np.asarray(unique_keys): + if self.table._is_dictionary_column(col_info): + values.append((self.table._cols[name].decode(int(value)),)) + else: + values.append((_python_scalar(value),)) + return values + + result = [] + assert isinstance(unique_keys, np.ndarray) + for row in unique_keys: + vals = [] + for i, name in enumerate(self.keys): + value = row[f"k{i}"] + col_info = self.table._schema.columns_by_name[name] + if self.table._is_dictionary_column(col_info): + vals.append(self.table._cols[name].decode(int(value))) + else: + vals.append(_python_scalar(value)) + result.append(tuple(vals)) + return result + + def _normalized_keys(self, display_keys: list[tuple[Any, ...]]) -> list[Any]: + normalized = [] + for key in display_keys: + norm = 
tuple(_normalize_key_part(v) for v in key) + normalized.append(norm[0] if len(norm) == 1 else norm) + return normalized + + def _compute_partials( + self, + specs: list[_AggSpec], + unique_keys: np.ndarray | list[np.ndarray], + inverse: np.ndarray, + value_chunks: dict[str, np.ndarray], + ) -> dict[str, Any]: + n_groups = len(unique_keys) + partials: dict[str, Any] = {} + for spec in specs: + if spec.op == "size": + partials[spec.output_col] = np.bincount(inverse, minlength=n_groups).astype(np.int64) + continue + + assert spec.input_col is not None + values = value_chunks[spec.input_col] + non_null = ~self._null_mask(spec.input_col, values, is_key=False) + + if spec.op == "count": + partials[spec.output_col] = np.bincount( + inverse, weights=non_null.astype(np.int64), minlength=n_groups + ).astype(np.int64) + elif spec.op in {"sum", "mean"}: + counts = np.bincount(inverse, weights=non_null.astype(np.int64), minlength=n_groups).astype( + np.int64 + ) + if spec.op == "sum" and values.dtype.kind in "biu": + sums = np.zeros(n_groups, dtype=np.int64) + np.add.at(sums, inverse[non_null], values[non_null].astype(np.int64, copy=False)) + else: + weights = np.where(non_null, values, 0) + sums = np.bincount(inverse, weights=weights, minlength=n_groups) + partials[spec.output_col] = (sums, counts) + elif spec.op in {"min", "max"}: + partials[spec.output_col] = self._minmax_partials( + spec.op, inverse, values, non_null, n_groups + ) + return partials + + def _minmax_partials( + self, op: AggName, inverse: np.ndarray, values: np.ndarray, non_null: np.ndarray, n_groups: int + ) -> tuple[np.ndarray, np.ndarray]: + if values.dtype.kind in "biufcmM": + if op == "min": + identity = _max_identity(values.dtype) + out = np.full(n_groups, identity, dtype=values.dtype) + np.minimum.at(out, inverse[non_null], values[non_null]) + else: + identity = _min_identity(values.dtype) + out = np.full(n_groups, identity, dtype=values.dtype) + np.maximum.at(out, inverse[non_null], values[non_null]) 
+ else: + out = np.empty(n_groups, dtype=values.dtype) + has = np.zeros(n_groups, dtype=bool) + for group, value, ok in zip(inverse, values, non_null, strict=True): + if not ok: + continue + if not has[group] or (value < out[group] if op == "min" else value > out[group]): + out[group] = value + has[group] = True + return out, has + has_value = np.bincount(inverse, weights=non_null.astype(np.int64), minlength=n_groups) > 0 + return out, has_value + + def _merge_partials( + self, + acc: dict[Any, dict[str, _AggState]], + key_values: dict[Any, tuple[Any, ...]], + normalized_keys: list[Any], + display_keys: list[tuple[Any, ...]], + partials: dict[str, Any], + specs: list[_AggSpec], + ) -> None: + for i, norm_key in enumerate(normalized_keys): + states = acc.setdefault(norm_key, {}) + key_values.setdefault(norm_key, display_keys[i]) + for spec in specs: + state = states.setdefault(spec.output_col, _AggState(spec.op)) + partial = partials[spec.output_col] + if spec.op in {"size", "count"}: + state.value = (0 if state.value is None else state.value) + int(partial[i]) + elif spec.op == "sum": + sums, counts = partial + if counts[i] > 0: + state.value = (0 if state.value is None else state.value) + _python_scalar(sums[i]) + state.count += int(counts[i]) + elif spec.op == "mean": + sums, counts = partial + if counts[i] > 0: + state.value = (0.0 if state.value is None else state.value) + float(sums[i]) + state.count += int(counts[i]) + elif spec.op in {"min", "max"}: + values, has_value = partial + if has_value[i]: + value = _python_scalar(values[i]) + if ( + state.count == 0 + or (spec.op == "min" and value < state.value) + or (spec.op == "max" and value > state.value) + ): + state.value = value + state.count += 1 + + def _final_rows( + self, + acc: dict[Any, dict[str, _AggState]], + key_values: dict[Any, tuple[Any, ...]], + specs: list[_AggSpec], + ) -> list[dict[str, Any]]: + keys = list(acc) + if self.sort: + keys.sort(key=lambda k: tuple(_sortable_key_part(v) for v in 
key_values[k])) + + rows = [] + for norm_key in keys: + row = dict(zip(self.keys, key_values[norm_key], strict=True)) + states = acc[norm_key] + for spec in specs: + state = states[spec.output_col] + if spec.op == "mean": + row[spec.output_col] = math.nan if state.count == 0 else state.value / state.count + elif spec.op in {"sum", "min", "max"} and state.count == 0: + row[spec.output_col] = _null_output_value(self._result_spec_for_agg(spec)) + else: + row[spec.output_col] = 0 if state.value is None else state.value + rows.append(row) + return rows + + def _build_result(self, rows: list[dict[str, Any]], specs: list[_AggSpec]): + from blosc2.ctable import CTable + + columns = self.keys + [spec.output_col for spec in specs] + schema_specs = {name: self._result_spec_for_key(name) for name in self.keys} + for spec in specs: + schema_specs[spec.output_col] = self._result_spec_for_agg(spec) + + fields = [] + for name in columns: + fields.append((name, _python_type_for_spec(schema_specs[name]), b2_field(schema_specs[name]))) + row_type = dataclasses.make_dataclass("CTableGroupByRow", fields) + data = {name: [row[name] for row in rows] for name in columns} + return CTable(row_type, new_data=data, expected_size=max(len(rows), 1), validate=False) + + def _validate_output_names(self, specs: list[_AggSpec]) -> None: + names = self.keys + [s.output_col for s in specs] + bad = [name for name in names if not _IDENTIFIER_RE.match(name)] + if bad: + raise NotImplementedError( + "Phase-1 group_by() result columns must be valid Python identifiers; " + f"unsupported names: {bad!r}" + ) + if len(names) != len(set(names)): + raise ValueError("Group-by result column names would not be unique") + + def _result_spec_for_key(self, name: str) -> SchemaSpec: + return copy.deepcopy(self.table._schema.columns_by_name[name].spec) + + def _result_spec_for_agg(self, spec: _AggSpec) -> SchemaSpec: + if spec.op in {"size", "count"}: + return int64() + if spec.op == "mean": + return float64() + assert 
spec.input_col is not None + input_spec = self.table._schema.columns_by_name[spec.input_col].spec + dtype = getattr(input_spec, "dtype", None) + if spec.op == "sum": + if dtype is not None and dtype.kind in "iu": + return int64() + if dtype is not None and dtype.kind == "b": + return int64() + if dtype is not None and dtype.kind == "f": + return float64() + return copy.deepcopy(input_spec) + + def _null_mask(self, name: str, values: np.ndarray, *, is_key: bool) -> np.ndarray: + col_info = self.table._schema.columns_by_name[name] + spec = col_info.spec + if isinstance(spec, DictionarySpec): + mask = values == np.int32(spec.null_code) + return mask if is_key or getattr(spec, "nullable", False) else np.zeros(len(values), dtype=bool) + null_value = getattr(spec, "null_value", None) + mask = np.zeros(len(values), dtype=bool) + # For keys, treat all NaNs as missing so dropna behaves predictably. + # For values, only nullable NaN sentinels are skipped. + if values.dtype.kind == "f" and ( + is_key or (isinstance(null_value, float) and math.isnan(null_value)) + ): + mask |= np.isnan(values) + if null_value is not None and not (isinstance(null_value, float) and math.isnan(null_value)): + mask |= values == null_value + return mask + + +def _normalize_key_part(value: Any) -> Any: + if isinstance(value, float) and math.isnan(value): + return _NAN_KEY + return value + + +def _sortable_key_part(value: Any) -> tuple[int, Any]: + if value is None: + return (0, "") + if isinstance(value, float) and math.isnan(value): + return (0, "") + return (1, value) + + +def _python_scalar(value: Any) -> Any: + if isinstance(value, np.generic): + return value.item() + return value + + +def _python_type_for_spec(spec: SchemaSpec): + if isinstance(spec, DictionarySpec): + return str + if isinstance(spec, b2_bool): + return bool + dtype = getattr(spec, "dtype", None) + if dtype is not None: + if dtype.kind in "iu": + return int + if dtype.kind == "f": + return float + if dtype.kind == "b": + return 
bool + if dtype.kind in "US": + return str if dtype.kind == "U" else bytes + return getattr(spec, "python_type", object) + + +def _max_identity(dtype: np.dtype): + dtype = np.dtype(dtype) + if dtype.kind in "iu": + return np.iinfo(dtype).max + if dtype.kind == "f": + return np.inf + if dtype.kind in "mM": + return np.iinfo(np.int64).max + return None + + +def _min_identity(dtype: np.dtype): + dtype = np.dtype(dtype) + if dtype.kind in "iu": + return np.iinfo(dtype).min + if dtype.kind == "f": + return -np.inf + if dtype.kind in "mM": + return np.iinfo(np.int64).min + return None + + +def _null_output_value(spec: SchemaSpec): + dtype = getattr(spec, "dtype", None) + null_value = getattr(spec, "null_value", None) + if null_value is not None: + return null_value + if dtype is not None and dtype.kind == "f": + return math.nan + if dtype is not None and dtype.kind in "iu": + return 0 + if dtype is not None and dtype.kind == "b": + return False + if dtype is not None and dtype.kind == "U": + return "" + if dtype is not None and dtype.kind == "S": + return b"" + return None diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py new file mode 100644 index 00000000..12394fb4 --- /dev/null +++ b/tests/ctable/test_groupby.py @@ -0,0 +1,151 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable + + +@dataclass +class SalesRow: + city: str = blosc2.field(blosc2.string(max_length=16)) + category: int = blosc2.field(blosc2.int32()) + sales: float = blosc2.field(blosc2.float64(nullable=True), default=0.0) + qty: int = blosc2.field(blosc2.int32(), default=0) + + +DATA = [ + ("Paris", 1, 10.0, 1), + ("Paris", 1, np.nan, 2), + ("Rome", 1, 20.0, 3), + ("Paris", 2, 30.0, 4), + ("Rome", 1, 40.0, 5), + ("Berlin", 2, np.nan, 6), +] + + +def col(table, name): + return list(table._cols[name][: table.nrows]) + + +def rows(table): + return [tuple(table._cols[name][i] for name in table.col_names) for i in range(table.nrows)] + + +def test_groupby_size_counts_rows_per_group(): + t = CTable(SalesRow, new_data=DATA) + + out = t.group_by("city", sort=True).size() + + assert out.col_names == ["city", "size"] + assert rows(out) == [("Berlin", 1), ("Paris", 3), ("Rome", 2)] + + +def test_groupby_count_counts_non_null_values(): + t = CTable(SalesRow, new_data=DATA) + + out = t.group_by("city", sort=True).count("sales") + + assert out.col_names == ["city", "sales_count"] + assert rows(out) == [("Berlin", 0), ("Paris", 2), ("Rome", 2)] + + +def test_groupby_agg_numeric_reductions(): + t = CTable(SalesRow, new_data=DATA) + + out = t.group_by("city", sort=True).agg({"sales": ["sum", "mean", "min", "max", "count"]}) + + assert out.col_names == ["city", "sales_sum", "sales_mean", "sales_min", "sales_max", "sales_count"] + got = rows(out) + assert got[0][0] == "Berlin" + assert np.isnan(got[0][1]) + assert np.isnan(got[0][2]) + assert np.isnan(got[0][3]) + assert np.isnan(got[0][4]) + assert got[0][5] == 0 + assert got[1] == ("Paris", 40.0, 20.0, 10.0, 30.0, 2) + assert got[2] == ("Rome", 60.0, 30.0, 20.0, 40.0, 2) + + +def test_groupby_multi_key_size(): + 
t = CTable(SalesRow, new_data=DATA) + + out = t.group_by(["city", "category"], sort=True).size() + + assert rows(out) == [("Berlin", 2, 1), ("Paris", 1, 2), ("Paris", 2, 1), ("Rome", 1, 2)] + + +def test_groupby_respects_views_and_deleted_rows(): + t = CTable(SalesRow, new_data=DATA) + t.delete(0) + view = t.where("qty >= 3") + + out = view.group_by("city", sort=True).size() + + assert rows(out) == [("Berlin", 1), ("Paris", 1), ("Rome", 2)] + + +@dataclass +class DictRow: + city: str = blosc2.field(blosc2.dictionary()) + sales: int = blosc2.field(blosc2.int32()) + + +def test_groupby_dictionary_key_groups_by_decoded_value(): + t = CTable(DictRow, new_data=[("Paris", 10), ("Rome", 20), ("Paris", 30)]) + + out = t.group_by("city", sort=True).agg({"sales": "sum"}) + + assert out.col_names == ["city", "sales_sum"] + assert rows(out) == [("Paris", 40), ("Rome", 20)] + + +def test_groupby_dictionary_key_beyond_default_code_capacity(): + data = [("Paris" if i % 2 == 0 else "Rome", 1) for i in range(5000)] + t = CTable(DictRow, new_data=data) + + out = t.group_by("city", sort=True).size() + + assert rows(out) == [("Paris", 2500), ("Rome", 2500)] + + +def test_groupby_dropna_key_default_and_false(): + t = CTable(DictRow, new_data=[("Paris", 10), (None, 20), ("Paris", 30)]) + + dropped = t.group_by("city", sort=True).size() + kept = t.group_by("city", sort=True, dropna=False).size() + + assert rows(dropped) == [("Paris", 2)] + assert rows(kept) == [(None, 1), ("Paris", 2)] + + +def test_groupby_agg_star_size(): + t = CTable(SalesRow, new_data=DATA) + + out = t.group_by("city", sort=True).agg({"*": "size"}) + + assert rows(out) == [("Berlin", 1), ("Paris", 3), ("Rome", 2)] + + +def test_groupby_empty_table_returns_empty_result(): + t = CTable(SalesRow) + + out = t.group_by("city").size() + + assert out.nrows == 0 + assert out.col_names == ["city", "size"] + + +def test_groupby_rejects_bad_engine(): + t = CTable(SalesRow, new_data=DATA) + + with pytest.raises(ValueError): + 
t.group_by("city", engine="cython") From 110f3e91687e246060e836331c4f2f3a8d08ce4d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 07:10:03 +0200 Subject: [PATCH 02/17] Fast path for one dense integer/dictionary-code key (8x speedup) --- src/blosc2/groupby.py | 161 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index 6dd6874e..af4eb9f2 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -175,6 +175,10 @@ def _validate_value_column(self, name: str) -> None: def _execute(self, specs: list[_AggSpec]): self._validate_output_names(specs) + fast = self._try_execute_dense_single_int_key(specs) + if fast is not None: + return fast + acc: dict[Any, dict[str, _AggState]] = {} key_values: dict[Any, tuple[Any, ...]] = {} @@ -214,6 +218,163 @@ def _execute(self, specs: list[_AggSpec]): rows = self._final_rows(acc, key_values, specs) return self._build_result(rows, specs) + def _try_execute_dense_single_int_key(self, specs: list[_AggSpec]): # noqa: C901 + """Fast path for one dense integer/dictionary-code key. + + This avoids per-chunk ``np.unique`` and Python dictionary merging. It is + intentionally conservative: keys must be non-negative and the observed + key range must stay reasonably compact. 
+ """ + if len(self.keys) != 1: + return None + key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + key_is_dict = self.table._is_dictionary_column(key_info) + key_dtype = np.dtype(np.int32) if key_is_dict else getattr(key_info.spec, "dtype", None) + if key_dtype is None or key_dtype.kind not in "biu": + return None + if any(spec.op in {"min", "max"} and spec.input_col is not None for spec in specs): + for spec in specs: + if spec.op in {"min", "max"} and spec.input_col is not None: + dtype = getattr(self.table._schema.columns_by_name[spec.input_col].spec, "dtype", None) + if dtype is None or np.dtype(dtype).kind not in "biufmM": + return None + + compact_limit = 10_000_000 + present = np.zeros(0, dtype=bool) + states: dict[str, Any] = {} + for spec in specs: + if spec.op in {"size", "count"}: + states[spec.output_col] = np.zeros(0, dtype=np.int64) + elif spec.op == "sum": + out_dtype = np.int64 + if spec.input_col is not None: + dtype = np.dtype(self.table._schema.columns_by_name[spec.input_col].spec.dtype) + out_dtype = np.float64 if dtype.kind == "f" else np.int64 + states[spec.output_col] = np.zeros(0, dtype=out_dtype) + elif spec.op == "mean": + states[spec.output_col] = (np.zeros(0, dtype=np.float64), np.zeros(0, dtype=np.int64)) + elif spec.op in {"min", "max"}: + assert spec.input_col is not None + dtype = np.dtype(self.table._schema.columns_by_name[spec.input_col].spec.dtype) + identity = _max_identity(dtype) if spec.op == "min" else _min_identity(dtype) + states[spec.output_col] = (np.full(0, identity, dtype=dtype), np.zeros(0, dtype=bool)) + + def ensure_size(size: int) -> bool: + nonlocal present, states + if size > compact_limit: + return False + if size <= len(present): + return True + old = len(present) + present = np.pad(present, (0, size - old), constant_values=False) + for spec in specs: + state = states[spec.output_col] + if spec.op in {"size", "count", "sum"}: + states[spec.output_col] = np.pad(state, (0, size - 
old), constant_values=0) + elif spec.op == "mean": + sums, counts = state + states[spec.output_col] = ( + np.pad(sums, (0, size - old), constant_values=0), + np.pad(counts, (0, size - old), constant_values=0), + ) + elif spec.op in {"min", "max"}: + values, has = state + dtype = values.dtype + identity = _max_identity(dtype) if spec.op == "min" else _min_identity(dtype) + states[spec.output_col] = ( + np.pad(values, (0, size - old), constant_values=identity), + np.pad(has, (0, size - old), constant_values=False), + ) + return True + + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + value_cols = sorted({s.input_col for s in specs if s.input_col is not None}) + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + raw_keys = self._read_key_chunk(key_name, start, stop) + live_mask = valid.copy() + if self.dropna: + live_mask &= ~self._null_mask(key_name, raw_keys, is_key=True) + if not np.any(live_mask): + continue + keys = np.asarray(raw_keys[live_mask]) + if keys.dtype.kind == "b": + keys = keys.astype(np.int8, copy=False) + if len(keys) == 0: + continue + min_key = int(np.min(keys)) + if min_key < 0: + return None + max_key = int(np.max(keys)) + if not ensure_size(max_key + 1): + return None + present[keys] = True + value_chunks = { + name: np.asarray(self.table._cols[name][start:stop])[live_mask] for name in value_cols + } + + for spec in specs: + if spec.op == "size": + states[spec.output_col] += np.bincount(keys, minlength=len(present)).astype(np.int64) + continue + assert spec.input_col is not None + values = value_chunks[spec.input_col] + non_null = ~self._null_mask(spec.input_col, values, is_key=False) + if spec.op == "count": + states[spec.output_col] += np.bincount( + keys, weights=non_null.astype(np.int64), minlength=len(present) + ).astype(np.int64) + elif spec.op == "sum": + state = 
states[spec.output_col] + if values.dtype.kind in "biu": + np.add.at(state, keys[non_null], values[non_null].astype(np.int64, copy=False)) + else: + state += np.bincount( + keys, weights=np.where(non_null, values, 0), minlength=len(present) + ).astype(state.dtype, copy=False) + elif spec.op == "mean": + sums, counts = states[spec.output_col] + sums += np.bincount(keys, weights=np.where(non_null, values, 0), minlength=len(present)) + counts += np.bincount( + keys, weights=non_null.astype(np.int64), minlength=len(present) + ).astype(np.int64) + elif spec.op in {"min", "max"}: + values_state, has_state = states[spec.output_col] + if spec.op == "min": + np.minimum.at(values_state, keys[non_null], values[non_null]) + else: + np.maximum.at(values_state, keys[non_null], values[non_null]) + has_state[keys[non_null]] = True + + group_codes = np.nonzero(present)[0] + rows = [] + for code in group_codes: + key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else _python_scalar(code) + row = {key_name: key_value} + for spec in specs: + state = states[spec.output_col] + if spec.op == "mean": + sums, counts = state + row[spec.output_col] = ( + math.nan if counts[code] == 0 else float(sums[code]) / int(counts[code]) + ) + elif spec.op in {"min", "max"}: + values_state, has_state = state + row[spec.output_col] = ( + _python_scalar(values_state[code]) + if has_state[code] + else _null_output_value(self._result_spec_for_agg(spec)) + ) + else: + row[spec.output_col] = _python_scalar(state[code]) + rows.append(row) + return self._build_result(rows, specs) + def _chunk_size(self) -> int: if self.chunk_size is not None: if self.chunk_size <= 0: From 40e58bd3b95b4db56c7743241c8771d90582aa47 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 07:22:48 +0200 Subject: [PATCH 03/17] Optimizing thing in cython (phase 2 going on). More than 2x speedup. 
--- src/blosc2/groupby.py | 78 +++++++++++++++++++ src/blosc2/indexing_ext.pyx | 150 ++++++++++++++++++++++++++++++++++++ 2 files changed, 228 insertions(+) diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index af4eb9f2..b3b1572e 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -175,6 +175,9 @@ def _validate_value_column(self, name: str) -> None: def _execute(self, specs: list[_AggSpec]): self._validate_output_names(specs) + fast = self._try_execute_cython_i32_f64_sum(specs) + if fast is not None: + return fast fast = self._try_execute_dense_single_int_key(specs) if fast is not None: return fast @@ -218,6 +221,81 @@ def _execute(self, specs: list[_AggSpec]): rows = self._final_rows(acc, key_values, specs) return self._build_result(rows, specs) + def _try_execute_cython_i32_f64_sum(self, specs: list[_AggSpec]): # noqa: C901 + """Cython fast path for one int32 key and one non-null float64 sum.""" + if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort: + return None + spec = specs[0] + if spec.input_col is None: + return None + key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + value_info = self.table._schema.columns_by_name[spec.input_col] + if self.table._is_dictionary_column(key_info): + key_arr = self.table._cols[key_name].codes + key_is_dict = True + key_null = int(key_info.spec.null_code) + skip_key_null = self.dropna + else: + key_arr = self.table._cols[key_name] + key_is_dict = False + key_dtype = getattr(key_info.spec, "dtype", None) + if key_dtype != np.dtype(np.int32): + return None + key_null_value = getattr(key_info.spec, "null_value", None) + skip_key_null = self.dropna and key_null_value is not None + key_null = 0 if key_null_value is None else int(key_null_value) + value_dtype = getattr(value_info.spec, "dtype", None) + if value_dtype != np.dtype(np.float64) or getattr(value_info.spec, "null_value", None) is not None: + return None + try: + from blosc2 import 
indexing_ext + except ImportError: + return None + kernel = getattr(indexing_ext, "groupby_dense_i32_f64_sum_checked", None) + if kernel is None: + return None + + compact_limit = 10_000_000 + sums = np.zeros(0, dtype=np.float64) + present = np.zeros(0, dtype=bool) + + def ensure_size(size: int) -> bool: + nonlocal sums, present + if size > compact_limit: + return False + if size <= len(sums): + return True + old = len(sums) + sums = np.pad(sums, (0, size - old), constant_values=0) + present = np.pad(present, (0, size - old), constant_values=False) + return True + + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + keys = np.asarray(key_arr[start:stop], dtype=np.int32) + values = np.asarray(self.table._cols[spec.input_col][start:stop], dtype=np.float64) + status = int(kernel(keys, values, valid, sums, present, skip_key_null, key_null, False)) + if status == -1: + return None + if status > 0: + if not ensure_size(status): + return None + status = int(kernel(keys, values, valid, sums, present, skip_key_null, key_null, False)) + if status != 0: + return None + + rows = [] + for code in np.nonzero(present)[0]: + key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else int(code) + rows.append({key_name: key_value, spec.output_col: float(sums[code])}) + return self._build_result(rows, specs) + def _try_execute_dense_single_int_key(self, specs: list[_AggSpec]): # noqa: C901 """Fast path for one dense integer/dictionary-code key. 
diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx index 759f5980..e1202edd 100644 --- a/src/blosc2/indexing_ext.pyx +++ b/src/blosc2/indexing_ext.pyx @@ -2495,3 +2495,153 @@ def keysort_keys_indices(np.ndarray keys, np.ndarray indices): return None _keysort_ndarray(keys, indices) return None + + +# ---------------------------------------------------------------------- +# Group-reduce kernels +# ---------------------------------------------------------------------- + +def groupby_dense_i32_f64_sum( + np.ndarray keys, + np.ndarray values, + np.ndarray valid, + np.ndarray sums, + np.ndarray present, + bint skip_key_null=False, + int32_t key_null=0, + bint skip_value_nan=False, +): + """Accumulate ``sum(values)`` by dense int32 keys. + + This is a low-level CTable group-by helper. *keys*, *values*, and *valid* + are same-length 1-D chunk arrays. *sums* and *present* are dense group + state arrays indexed directly by key value. Keys must be non-negative and + already fit in the state arrays. 
+ """ + if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: + raise ValueError("keys, values and valid must be 1-D arrays") + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.ndim != 1 or present.ndim != 1: + raise ValueError("sums and present must be 1-D arrays") + if keys.dtype != np.dtype(np.int32): + raise TypeError("keys must have dtype int32") + if values.dtype != np.dtype(np.float64): + raise TypeError("values must have dtype float64") + if valid.dtype != np.dtype(np.bool_): + raise TypeError("valid must have dtype bool") + if sums.dtype != np.dtype(np.float64): + raise TypeError("sums must have dtype float64") + if present.dtype != np.dtype(np.bool_): + raise TypeError("present must have dtype bool") + + cdef int32_t[:] keys_view = keys + cdef double[:] values_view = values + cdef np.npy_bool[:] valid_view = valid + cdef double[:] sums_view = sums + cdef np.npy_bool[:] present_view = present + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef int32_t key + cdef double value + + if present.shape[0] != sums.shape[0]: + raise ValueError("present and sums must have the same length") + + with nogil: + for i in range(n): + if not valid_view[i]: + continue + key = keys_view[i] + if skip_key_null and key == key_null: + continue + if key < 0 or key >= nstates: + continue + value = values_view[i] + if skip_value_nan and value != value: + continue + sums_view[key] += value + present_view[key] = 1 + return None + + + +def groupby_dense_i32_f64_sum_checked( + np.ndarray keys, + np.ndarray values, + np.ndarray valid, + np.ndarray sums, + np.ndarray present, + bint skip_key_null=False, + int32_t key_null=0, + bint skip_value_nan=False, +): + """Checked dense int32/float64 sum kernel. 
+ + Returns ``0`` on success, ``-1`` if a negative non-null key is found, or + ``max_key + 1`` when the dense state arrays need to be grown. The state is + not mutated unless the function returns ``0``. + """ + if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: + raise ValueError("keys, values and valid must be 1-D arrays") + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.ndim != 1 or present.ndim != 1: + raise ValueError("sums and present must be 1-D arrays") + if keys.dtype != np.dtype(np.int32): + raise TypeError("keys must have dtype int32") + if values.dtype != np.dtype(np.float64): + raise TypeError("values must have dtype float64") + if valid.dtype != np.dtype(np.bool_): + raise TypeError("valid must have dtype bool") + if sums.dtype != np.dtype(np.float64): + raise TypeError("sums must have dtype float64") + if present.dtype != np.dtype(np.bool_): + raise TypeError("present must have dtype bool") + if present.shape[0] != sums.shape[0]: + raise ValueError("present and sums must have the same length") + + cdef int32_t[:] keys_view = keys + cdef double[:] values_view = values + cdef np.npy_bool[:] valid_view = valid + cdef double[:] sums_view = sums + cdef np.npy_bool[:] present_view = present + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef int32_t key + cdef int32_t max_key = -1 + cdef int ret = 0 + cdef double value + + with nogil: + for i in range(n): + if not valid_view[i]: + continue + key = keys_view[i] + if skip_key_null and key == key_null: + continue + if key < 0: + ret = -1 + break + if key > max_key: + max_key = key + if ret == 0: + if max_key < 0: + ret = 0 + elif max_key >= nstates: + ret = max_key + 1 + else: + for i in range(n): + if not valid_view[i]: + continue + key = keys_view[i] + if skip_key_null and key == key_null: + continue + value = values_view[i] + if skip_value_nan 
and value != value: + continue + sums_view[key] += value + present_view[key] = 1 + return ret From 4bbd843730049daff57ce391cd2eef878f24c6f0 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 07:36:48 +0200 Subject: [PATCH 04/17] Acceleration path for float32/64 as groupby keys. 25x speedup. --- bench/ctable/groupby.py | 48 ++++++++-- src/blosc2/groupby.py | 77 +++++++++++++++ src/blosc2/indexing_ext.pyx | 184 ++++++++++++++++++++++++++++++++++++ 3 files changed, 302 insertions(+), 7 deletions(-) diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py index fb601eb3..68c03551 100644 --- a/bench/ctable/groupby.py +++ b/bench/ctable/groupby.py @@ -4,6 +4,7 @@ Examples -------- python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --op sum +python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --key-dtype float64 --op sum python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas """ @@ -23,7 +24,7 @@ def parse_int(text: str) -> int: return int(text.replace("_", "")) -def build_row_type(dictionary: bool): +def build_row_type(dictionary: bool, key_dtype: str): if dictionary: @dataclasses.dataclass @@ -31,24 +32,50 @@ class Row: key: str = blosc2.field(blosc2.dictionary()) value: float = blosc2.field(blosc2.float64()) - else: + elif key_dtype == "int32": @dataclasses.dataclass class Row: key: int = blosc2.field(blosc2.int32()) value: float = blosc2.field(blosc2.float64()) + elif key_dtype == "int64": + + @dataclasses.dataclass + class Row: + key: int = blosc2.field(blosc2.int64()) + value: float = blosc2.field(blosc2.float64()) + + elif key_dtype == "float32": + + @dataclasses.dataclass + class Row: + key: float = blosc2.field(blosc2.float32()) + value: float = blosc2.field(blosc2.float64()) + + elif key_dtype == "float64": + + @dataclasses.dataclass + class Row: + key: float = blosc2.field(blosc2.float64()) + value: float = blosc2.field(blosc2.float64()) + + else: # pragma: no cover - argparse choices 
prevent this + raise ValueError(f"unsupported key dtype {key_dtype!r}") + return Row -def make_data(nrows: int, ngroups: int, dictionary: bool, seed: int): +def make_data(nrows: int, ngroups: int, dictionary: bool, key_dtype: str, seed: int): rng = np.random.default_rng(seed) key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32) values = rng.random(nrows, dtype=np.float64) if dictionary: keys = np.asarray([f"k{code}" for code in key_codes], dtype=object) + elif key_dtype in {"float32", "float64"}: + keys = key_codes.astype(np.dtype(key_dtype)) else: - keys = key_codes + keys = key_codes.astype(np.dtype(key_dtype), copy=False) return keys, values @@ -58,6 +85,12 @@ def main() -> None: parser.add_argument("--groups", type=parse_int, default=1_000) parser.add_argument("--chunk-size", type=parse_int, default=None) parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key") + parser.add_argument( + "--key-dtype", + choices=["int32", "int64", "float32", "float64"], + default="int32", + help="Physical dtype for non-dictionary keys. 
Float keys are generated from group codes cast to float.", + ) parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum") parser.add_argument("--sort", action="store_true") parser.add_argument("--pandas", action="store_true", help="Also run a pandas comparison if available") @@ -67,11 +100,12 @@ def main() -> None: print( f"rows={args.rows:,} groups={args.groups:,} dictionary={args.dictionary} " - f"op={args.op} sort={args.sort} chunk_size={args.chunk_size} urlpath={args.urlpath}" + f"key_dtype={args.key_dtype} op={args.op} sort={args.sort} " + f"chunk_size={args.chunk_size} urlpath={args.urlpath}" ) - keys, values = make_data(args.rows, args.groups, args.dictionary, args.seed) - Row = build_row_type(args.dictionary) + keys, values = make_data(args.rows, args.groups, args.dictionary, args.key_dtype, args.seed) + Row = build_row_type(args.dictionary, args.key_dtype) kwargs = {} if args.urlpath is not None: diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index b3b1572e..170edab8 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -176,6 +176,9 @@ def _validate_value_column(self, name: str) -> None: def _execute(self, specs: list[_AggSpec]): self._validate_output_names(specs) fast = self._try_execute_cython_i32_f64_sum(specs) + if fast is not None: + return fast + fast = self._try_execute_cython_float_integral_key_f64_sum(specs) if fast is not None: return fast fast = self._try_execute_dense_single_int_key(specs) @@ -296,6 +299,80 @@ def ensure_size(size: int) -> bool: rows.append({key_name: key_value, spec.output_col: float(sums[code])}) return self._build_result(rows, specs) + def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]): # noqa: C901 + """Cython fast path for integral float32/float64 keys and one non-null float64 sum.""" + if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort: + return None + spec = specs[0] + if spec.input_col is None: + 
return None + key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + value_info = self.table._schema.columns_by_name[spec.input_col] + key_dtype = getattr(key_info.spec, "dtype", None) + value_dtype = getattr(value_info.spec, "dtype", None) + if key_dtype not in {np.dtype(np.float32), np.dtype(np.float64)} or value_dtype != np.dtype( + np.float64 + ): + return None + if getattr(value_info.spec, "null_value", None) is not None: + return None + # The fast path can skip NaNs. If dropna=False and NaNs are present, + # the Cython kernel reports unsupported and we fall back to generic + # grouping, which can materialize a NaN group. + skip_key_nan = self.dropna + try: + from blosc2 import indexing_ext + except ImportError: + return None + kernel_name = ( + "groupby_dense_f32_integral_key_f64_sum_checked" + if key_dtype == np.dtype(np.float32) + else "groupby_dense_f64_integral_key_f64_sum_checked" + ) + kernel = getattr(indexing_ext, kernel_name, None) + if kernel is None: + return None + + compact_limit = 10_000_000 + sums = np.zeros(0, dtype=np.float64) + present = np.zeros(0, dtype=bool) + + def ensure_size(size: int) -> bool: + nonlocal sums, present + if size > compact_limit: + return False + if size <= len(sums): + return True + old = len(sums) + sums = np.pad(sums, (0, size - old), constant_values=0) + present = np.pad(present, (0, size - old), constant_values=False) + return True + + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + keys = np.asarray(self.table._cols[key_name][start:stop], dtype=key_dtype) + values = np.asarray(self.table._cols[spec.input_col][start:stop], dtype=np.float64) + status = int(kernel(keys, values, valid, sums, present, skip_key_nan, False)) + if status == -1: + return None + if status > 0: + 
if not ensure_size(status): + return None + status = int(kernel(keys, values, valid, sums, present, skip_key_nan, False)) + if status != 0: + return None + + rows = [ + {key_name: float(code), spec.output_col: float(sums[code])} for code in np.nonzero(present)[0] + ] + return self._build_result(rows, specs) + def _try_execute_dense_single_int_key(self, specs: list[_AggSpec]): # noqa: C901 """Fast path for one dense integer/dictionary-code key. diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx index e1202edd..8c479dcd 100644 --- a/src/blosc2/indexing_ext.pyx +++ b/src/blosc2/indexing_ext.pyx @@ -2645,3 +2645,187 @@ def groupby_dense_i32_f64_sum_checked( sums_view[key] += value present_view[key] = 1 return ret + + +def groupby_dense_f64_integral_key_f64_sum_checked( + np.ndarray keys, + np.ndarray values, + np.ndarray valid, + np.ndarray sums, + np.ndarray present, + bint skip_key_nan=True, + bint skip_value_nan=False, +): + """Checked dense float64-integral-key/float64 sum kernel. + + Fast path for float keys that are exactly integral, finite and + non-negative. Returns ``0`` on success, ``-1`` if a key cannot be handled, + or ``max_key + 1`` when dense state arrays need to be grown. The state is + not mutated unless the function returns ``0``. 
+ """ + if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: + raise ValueError("keys, values and valid must be 1-D arrays") + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.ndim != 1 or present.ndim != 1: + raise ValueError("sums and present must be 1-D arrays") + if keys.dtype != np.dtype(np.float64): + raise TypeError("keys must have dtype float64") + if values.dtype != np.dtype(np.float64): + raise TypeError("values must have dtype float64") + if valid.dtype != np.dtype(np.bool_): + raise TypeError("valid must have dtype bool") + if sums.dtype != np.dtype(np.float64): + raise TypeError("sums must have dtype float64") + if present.dtype != np.dtype(np.bool_): + raise TypeError("present must have dtype bool") + if present.shape[0] != sums.shape[0]: + raise ValueError("present and sums must have the same length") + + cdef double[:] keys_view = keys + cdef double[:] values_view = values + cdef np.npy_bool[:] valid_view = valid + cdef double[:] sums_view = sums + cdef np.npy_bool[:] present_view = present + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef double key_f + cdef int64_t key_i + cdef int64_t max_key = -1 + cdef int ret = 0 + cdef double value + + with nogil: + for i in range(n): + if not valid_view[i]: + continue + key_f = keys_view[i] + if key_f != key_f: + if skip_key_nan: + continue + ret = -1 + break + if key_f < 0.0 or key_f > 9223372036854774784.0: + ret = -1 + break + key_i = key_f + if key_f != key_i: + ret = -1 + break + if key_i > max_key: + max_key = key_i + if ret == 0: + if max_key < 0: + ret = 0 + elif max_key >= nstates: + if max_key > 2147483646: + ret = -1 + else: + ret = max_key + 1 + else: + for i in range(n): + if not valid_view[i]: + continue + key_f = keys_view[i] + if key_f != key_f: + if skip_key_nan: + continue + ret = -1 + break + key_i = key_f + value = 
values_view[i] + if skip_value_nan and value != value: + continue + sums_view[key_i] += value + present_view[key_i] = 1 + return ret + + +def groupby_dense_f32_integral_key_f64_sum_checked( + np.ndarray keys, + np.ndarray values, + np.ndarray valid, + np.ndarray sums, + np.ndarray present, + bint skip_key_nan=True, + bint skip_value_nan=False, +): + """Checked dense float32-integral-key/float64 sum kernel.""" + if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: + raise ValueError("keys, values and valid must be 1-D arrays") + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.ndim != 1 or present.ndim != 1: + raise ValueError("sums and present must be 1-D arrays") + if keys.dtype != np.dtype(np.float32): + raise TypeError("keys must have dtype float32") + if values.dtype != np.dtype(np.float64): + raise TypeError("values must have dtype float64") + if valid.dtype != np.dtype(np.bool_): + raise TypeError("valid must have dtype bool") + if sums.dtype != np.dtype(np.float64): + raise TypeError("sums must have dtype float64") + if present.dtype != np.dtype(np.bool_): + raise TypeError("present must have dtype bool") + if present.shape[0] != sums.shape[0]: + raise ValueError("present and sums must have the same length") + + cdef float[:] keys_view = keys + cdef double[:] values_view = values + cdef np.npy_bool[:] valid_view = valid + cdef double[:] sums_view = sums + cdef np.npy_bool[:] present_view = present + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef float key_f + cdef int64_t key_i + cdef int64_t max_key = -1 + cdef int ret = 0 + cdef double value + + with nogil: + for i in range(n): + if not valid_view[i]: + continue + key_f = keys_view[i] + if key_f != key_f: + if skip_key_nan: + continue + ret = -1 + break + if key_f < 0.0 or key_f > 16777216.0: + ret = -1 + break + key_i = key_f + if key_f != 
key_i: + ret = -1 + break + if key_i > max_key: + max_key = key_i + if ret == 0: + if max_key < 0: + ret = 0 + elif max_key >= nstates: + if max_key > 2147483646: + ret = -1 + else: + ret = max_key + 1 + else: + for i in range(n): + if not valid_view[i]: + continue + key_f = keys_view[i] + if key_f != key_f: + if skip_key_nan: + continue + ret = -1 + break + key_i = key_f + value = values_view[i] + if skip_value_nan and value != value: + continue + sums_view[key_i] += value + present_view[key_i] = 1 + return ret From 02e284b2690e3628e73d23211d2fc5bf9389b233 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 07:54:33 +0200 Subject: [PATCH 05/17] Factorized all groupby accelerations in groupby_ext.pyx --- CMakeLists.txt | 12 +- src/blosc2/groupby.py | 8 +- src/blosc2/groupby_ext.pyx | 347 +++++++++++++++++++++++++++++++++++ src/blosc2/indexing_ext.pyx | 331 --------------------------------- tests/ctable/test_groupby.py | 76 ++++++++ 5 files changed, 438 insertions(+), 336 deletions(-) create mode 100644 src/blosc2/groupby_ext.pyx diff --git a/CMakeLists.txt b/CMakeLists.txt index 734a4fea..ff4425a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,13 @@ add_custom_command( DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/indexing_ext.pyx" VERBATIM) +add_custom_command( + OUTPUT groupby_ext.c + COMMAND Python::Interpreter -m cython + "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/groupby_ext.pyx" --output-file groupby_ext.c + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/groupby_ext.pyx" + VERBATIM) + # ...and add it to the target Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI) target_sources(blosc2_ext PRIVATE src/blosc2/matmul_kernels.c) @@ -59,10 +66,12 @@ if(UNIX) target_link_libraries(blosc2_ext PRIVATE ${CMAKE_DL_LIBS}) endif() Python_add_library(indexing_ext MODULE indexing_ext.c WITH_SOABI) +Python_add_library(groupby_ext MODULE groupby_ext.c WITH_SOABI) # We need to link against NumPy target_link_libraries(blosc2_ext 
PRIVATE Python::NumPy) target_link_libraries(indexing_ext PRIVATE Python::NumPy) +target_link_libraries(groupby_ext PRIVATE Python::NumPy) # Fetch and build miniexpr library include(FetchContent) @@ -99,6 +108,7 @@ endif() target_compile_features(blosc2_ext PRIVATE c_std_11) target_compile_features(indexing_ext PRIVATE c_std_11) +target_compile_features(groupby_ext PRIVATE c_std_11) if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang") execute_process( COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir @@ -173,7 +183,7 @@ endif() # Python extension -> site-packages/blosc2 install( - TARGETS blosc2_ext indexing_ext + TARGETS blosc2_ext indexing_ext groupby_ext LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/blosc2 ) diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index 170edab8..dfeda79d 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -252,10 +252,10 @@ def _try_execute_cython_i32_f64_sum(self, specs: list[_AggSpec]): # noqa: C901 if value_dtype != np.dtype(np.float64) or getattr(value_info.spec, "null_value", None) is not None: return None try: - from blosc2 import indexing_ext + from blosc2 import groupby_ext except ImportError: return None - kernel = getattr(indexing_ext, "groupby_dense_i32_f64_sum_checked", None) + kernel = getattr(groupby_ext, "groupby_dense_i32_f64_sum_checked", None) if kernel is None: return None @@ -322,7 +322,7 @@ def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]): # grouping, which can materialize a NaN group. 
skip_key_nan = self.dropna try: - from blosc2 import indexing_ext + from blosc2 import groupby_ext except ImportError: return None kernel_name = ( @@ -330,7 +330,7 @@ def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]): if key_dtype == np.dtype(np.float32) else "groupby_dense_f64_integral_key_f64_sum_checked" ) - kernel = getattr(indexing_ext, kernel_name, None) + kernel = getattr(groupby_ext, kernel_name, None) if kernel is None: return None diff --git a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx new file mode 100644 index 00000000..c621d6fc --- /dev/null +++ b/src/blosc2/groupby_ext.pyx @@ -0,0 +1,347 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### +# cython: boundscheck=False, wraparound=False, initializedcheck=False + +"""Cython group-reduce kernels for CTable group_by().""" + +import numpy as np +cimport numpy as np + +from libc.stdint cimport int32_t, int64_t + + +# ---------------------------------------------------------------------- +# Group-reduce kernels +# ---------------------------------------------------------------------- + +def groupby_dense_i32_f64_sum( + np.ndarray keys, + np.ndarray values, + np.ndarray valid, + np.ndarray sums, + np.ndarray present, + bint skip_key_null=False, + int32_t key_null=0, + bint skip_value_nan=False, +): + """Accumulate ``sum(values)`` by dense int32 keys. + + This is a low-level CTable group-by helper. *keys*, *values*, and *valid* + are same-length 1-D chunk arrays. *sums* and *present* are dense group + state arrays indexed directly by key value. Keys must be non-negative and + already fit in the state arrays. 
+ """ + if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: + raise ValueError("keys, values and valid must be 1-D arrays") + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.ndim != 1 or present.ndim != 1: + raise ValueError("sums and present must be 1-D arrays") + if keys.dtype != np.dtype(np.int32): + raise TypeError("keys must have dtype int32") + if values.dtype != np.dtype(np.float64): + raise TypeError("values must have dtype float64") + if valid.dtype != np.dtype(np.bool_): + raise TypeError("valid must have dtype bool") + if sums.dtype != np.dtype(np.float64): + raise TypeError("sums must have dtype float64") + if present.dtype != np.dtype(np.bool_): + raise TypeError("present must have dtype bool") + + cdef int32_t[:] keys_view = keys + cdef double[:] values_view = values + cdef np.npy_bool[:] valid_view = valid + cdef double[:] sums_view = sums + cdef np.npy_bool[:] present_view = present + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef int32_t key + cdef double value + + if present.shape[0] != sums.shape[0]: + raise ValueError("present and sums must have the same length") + + with nogil: + for i in range(n): + if not valid_view[i]: + continue + key = keys_view[i] + if skip_key_null and key == key_null: + continue + if key < 0 or key >= nstates: + continue + value = values_view[i] + if skip_value_nan and value != value: + continue + sums_view[key] += value + present_view[key] = 1 + return None + + +def groupby_dense_i32_f64_sum_checked( + np.ndarray keys, + np.ndarray values, + np.ndarray valid, + np.ndarray sums, + np.ndarray present, + bint skip_key_null=False, + int32_t key_null=0, + bint skip_value_nan=False, +): + """Checked dense int32/float64 sum kernel. 
+ + Returns ``0`` on success, ``-1`` if a negative non-null key is found, or + ``max_key + 1`` when the dense state arrays need to be grown. The state is + not mutated unless the function returns ``0``. + """ + if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: + raise ValueError("keys, values and valid must be 1-D arrays") + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.ndim != 1 or present.ndim != 1: + raise ValueError("sums and present must be 1-D arrays") + if keys.dtype != np.dtype(np.int32): + raise TypeError("keys must have dtype int32") + if values.dtype != np.dtype(np.float64): + raise TypeError("values must have dtype float64") + if valid.dtype != np.dtype(np.bool_): + raise TypeError("valid must have dtype bool") + if sums.dtype != np.dtype(np.float64): + raise TypeError("sums must have dtype float64") + if present.dtype != np.dtype(np.bool_): + raise TypeError("present must have dtype bool") + if present.shape[0] != sums.shape[0]: + raise ValueError("present and sums must have the same length") + + cdef int32_t[:] keys_view = keys + cdef double[:] values_view = values + cdef np.npy_bool[:] valid_view = valid + cdef double[:] sums_view = sums + cdef np.npy_bool[:] present_view = present + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef int32_t key + cdef int32_t max_key = -1 + cdef int ret = 0 + cdef double value + + with nogil: + for i in range(n): + if not valid_view[i]: + continue + key = keys_view[i] + if skip_key_null and key == key_null: + continue + if key < 0: + ret = -1 + break + if key > max_key: + max_key = key + if ret == 0: + if max_key < 0: + ret = 0 + elif max_key >= nstates: + ret = max_key + 1 + else: + for i in range(n): + if not valid_view[i]: + continue + key = keys_view[i] + if skip_key_null and key == key_null: + continue + value = values_view[i] + if skip_value_nan 
and value != value: + continue + sums_view[key] += value + present_view[key] = 1 + return ret + + +def groupby_dense_f64_integral_key_f64_sum_checked( + np.ndarray keys, + np.ndarray values, + np.ndarray valid, + np.ndarray sums, + np.ndarray present, + bint skip_key_nan=True, + bint skip_value_nan=False, +): + """Checked dense float64-integral-key/float64 sum kernel. + + Fast path for float keys that are exactly integral, finite and + non-negative. Returns ``0`` on success, ``-1`` if a key cannot be handled, + or ``max_key + 1`` when the dense state arrays need to be grown. The state is + not mutated unless the function returns ``0``. + """ + if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: + raise ValueError("keys, values and valid must be 1-D arrays") + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.ndim != 1 or present.ndim != 1: + raise ValueError("sums and present must be 1-D arrays") + if keys.dtype != np.dtype(np.float64): + raise TypeError("keys must have dtype float64") + if values.dtype != np.dtype(np.float64): + raise TypeError("values must have dtype float64") + if valid.dtype != np.dtype(np.bool_): + raise TypeError("valid must have dtype bool") + if sums.dtype != np.dtype(np.float64): + raise TypeError("sums must have dtype float64") + if present.dtype != np.dtype(np.bool_): + raise TypeError("present must have dtype bool") + if present.shape[0] != sums.shape[0]: + raise ValueError("present and sums must have the same length") + + cdef double[:] keys_view = keys + cdef double[:] values_view = values + cdef np.npy_bool[:] valid_view = valid + cdef double[:] sums_view = sums + cdef np.npy_bool[:] present_view = present + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef double key_f + cdef int64_t key_i + cdef int64_t max_key = -1 + cdef int ret = 0 + cdef double value + + with nogil: + 
for i in range(n): + if not valid_view[i]: + continue + key_f = keys_view[i] + if key_f != key_f: + if skip_key_nan: + continue + ret = -1 + break + if key_f < 0.0 or key_f > 9223372036854774784.0: + ret = -1 + break + key_i = key_f + if key_f != key_i: + ret = -1 + break + if key_i > max_key: + max_key = key_i + if ret == 0: + if max_key < 0: + ret = 0 + elif max_key >= nstates: + if max_key > 2147483646: + ret = -1 + else: + ret = max_key + 1 + else: + for i in range(n): + if not valid_view[i]: + continue + key_f = keys_view[i] + if key_f != key_f: + if skip_key_nan: + continue + ret = -1 + break + key_i = key_f + value = values_view[i] + if skip_value_nan and value != value: + continue + sums_view[key_i] += value + present_view[key_i] = 1 + return ret + + +def groupby_dense_f32_integral_key_f64_sum_checked( + np.ndarray keys, + np.ndarray values, + np.ndarray valid, + np.ndarray sums, + np.ndarray present, + bint skip_key_nan=True, + bint skip_value_nan=False, +): + """Checked dense float32-integral-key/float64 sum kernel.""" + if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: + raise ValueError("keys, values and valid must be 1-D arrays") + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.ndim != 1 or present.ndim != 1: + raise ValueError("sums and present must be 1-D arrays") + if keys.dtype != np.dtype(np.float32): + raise TypeError("keys must have dtype float32") + if values.dtype != np.dtype(np.float64): + raise TypeError("values must have dtype float64") + if valid.dtype != np.dtype(np.bool_): + raise TypeError("valid must have dtype bool") + if sums.dtype != np.dtype(np.float64): + raise TypeError("sums must have dtype float64") + if present.dtype != np.dtype(np.bool_): + raise TypeError("present must have dtype bool") + if present.shape[0] != sums.shape[0]: + raise ValueError("present and sums must have the same length") + + cdef float[:] 
keys_view = keys + cdef double[:] values_view = values + cdef np.npy_bool[:] valid_view = valid + cdef double[:] sums_view = sums + cdef np.npy_bool[:] present_view = present + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef float key_f + cdef int64_t key_i + cdef int64_t max_key = -1 + cdef int ret = 0 + cdef double value + + with nogil: + for i in range(n): + if not valid_view[i]: + continue + key_f = keys_view[i] + if key_f != key_f: + if skip_key_nan: + continue + ret = -1 + break + if key_f < 0.0 or key_f > 16777216.0: + ret = -1 + break + key_i = key_f + if key_f != key_i: + ret = -1 + break + if key_i > max_key: + max_key = key_i + if ret == 0: + if max_key < 0: + ret = 0 + elif max_key >= nstates: + if max_key > 2147483646: + ret = -1 + else: + ret = max_key + 1 + else: + for i in range(n): + if not valid_view[i]: + continue + key_f = keys_view[i] + if key_f != key_f: + if skip_key_nan: + continue + ret = -1 + break + key_i = key_f + value = values_view[i] + if skip_value_nan and value != value: + continue + sums_view[key_i] += value + present_view[key_i] = 1 + return ret diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx index 8c479dcd..91072bea 100644 --- a/src/blosc2/indexing_ext.pyx +++ b/src/blosc2/indexing_ext.pyx @@ -2498,334 +2498,3 @@ def keysort_keys_indices(np.ndarray keys, np.ndarray indices): # ---------------------------------------------------------------------- -# Group-reduce kernels -# ---------------------------------------------------------------------- - -def groupby_dense_i32_f64_sum( - np.ndarray keys, - np.ndarray values, - np.ndarray valid, - np.ndarray sums, - np.ndarray present, - bint skip_key_null=False, - int32_t key_null=0, - bint skip_value_nan=False, -): - """Accumulate ``sum(values)`` by dense int32 keys. - - This is a low-level CTable group-by helper. *keys*, *values*, and *valid* - are same-length 1-D chunk arrays. 
*sums* and *present* are dense group - state arrays indexed directly by key value. Keys must be non-negative and - already fit in the state arrays. - """ - if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: - raise ValueError("keys, values and valid must be 1-D arrays") - if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: - raise ValueError("keys, values and valid must have the same length") - if sums.ndim != 1 or present.ndim != 1: - raise ValueError("sums and present must be 1-D arrays") - if keys.dtype != np.dtype(np.int32): - raise TypeError("keys must have dtype int32") - if values.dtype != np.dtype(np.float64): - raise TypeError("values must have dtype float64") - if valid.dtype != np.dtype(np.bool_): - raise TypeError("valid must have dtype bool") - if sums.dtype != np.dtype(np.float64): - raise TypeError("sums must have dtype float64") - if present.dtype != np.dtype(np.bool_): - raise TypeError("present must have dtype bool") - - cdef int32_t[:] keys_view = keys - cdef double[:] values_view = values - cdef np.npy_bool[:] valid_view = valid - cdef double[:] sums_view = sums - cdef np.npy_bool[:] present_view = present - cdef Py_ssize_t n = keys.shape[0] - cdef Py_ssize_t nstates = sums.shape[0] - cdef Py_ssize_t i - cdef int32_t key - cdef double value - - if present.shape[0] != sums.shape[0]: - raise ValueError("present and sums must have the same length") - - with nogil: - for i in range(n): - if not valid_view[i]: - continue - key = keys_view[i] - if skip_key_null and key == key_null: - continue - if key < 0 or key >= nstates: - continue - value = values_view[i] - if skip_value_nan and value != value: - continue - sums_view[key] += value - present_view[key] = 1 - return None - - - -def groupby_dense_i32_f64_sum_checked( - np.ndarray keys, - np.ndarray values, - np.ndarray valid, - np.ndarray sums, - np.ndarray present, - bint skip_key_null=False, - int32_t key_null=0, - bint skip_value_nan=False, -): - """Checked dense 
int32/float64 sum kernel. - - Returns ``0`` on success, ``-1`` if a negative non-null key is found, or - ``max_key + 1`` when the dense state arrays need to be grown. The state is - not mutated unless the function returns ``0``. - """ - if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: - raise ValueError("keys, values and valid must be 1-D arrays") - if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: - raise ValueError("keys, values and valid must have the same length") - if sums.ndim != 1 or present.ndim != 1: - raise ValueError("sums and present must be 1-D arrays") - if keys.dtype != np.dtype(np.int32): - raise TypeError("keys must have dtype int32") - if values.dtype != np.dtype(np.float64): - raise TypeError("values must have dtype float64") - if valid.dtype != np.dtype(np.bool_): - raise TypeError("valid must have dtype bool") - if sums.dtype != np.dtype(np.float64): - raise TypeError("sums must have dtype float64") - if present.dtype != np.dtype(np.bool_): - raise TypeError("present must have dtype bool") - if present.shape[0] != sums.shape[0]: - raise ValueError("present and sums must have the same length") - - cdef int32_t[:] keys_view = keys - cdef double[:] values_view = values - cdef np.npy_bool[:] valid_view = valid - cdef double[:] sums_view = sums - cdef np.npy_bool[:] present_view = present - cdef Py_ssize_t n = keys.shape[0] - cdef Py_ssize_t nstates = sums.shape[0] - cdef Py_ssize_t i - cdef int32_t key - cdef int32_t max_key = -1 - cdef int ret = 0 - cdef double value - - with nogil: - for i in range(n): - if not valid_view[i]: - continue - key = keys_view[i] - if skip_key_null and key == key_null: - continue - if key < 0: - ret = -1 - break - if key > max_key: - max_key = key - if ret == 0: - if max_key < 0: - ret = 0 - elif max_key >= nstates: - ret = max_key + 1 - else: - for i in range(n): - if not valid_view[i]: - continue - key = keys_view[i] - if skip_key_null and key == key_null: - continue - value = 
values_view[i] - if skip_value_nan and value != value: - continue - sums_view[key] += value - present_view[key] = 1 - return ret - - -def groupby_dense_f64_integral_key_f64_sum_checked( - np.ndarray keys, - np.ndarray values, - np.ndarray valid, - np.ndarray sums, - np.ndarray present, - bint skip_key_nan=True, - bint skip_value_nan=False, -): - """Checked dense float64-integral-key/float64 sum kernel. - - Fast path for float keys that are exactly integral, finite and - non-negative. Returns ``0`` on success, ``-1`` if a key cannot be handled, - or ``max_key + 1`` when dense state arrays need to be grown. The state is - not mutated unless the function returns ``0``. - """ - if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: - raise ValueError("keys, values and valid must be 1-D arrays") - if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: - raise ValueError("keys, values and valid must have the same length") - if sums.ndim != 1 or present.ndim != 1: - raise ValueError("sums and present must be 1-D arrays") - if keys.dtype != np.dtype(np.float64): - raise TypeError("keys must have dtype float64") - if values.dtype != np.dtype(np.float64): - raise TypeError("values must have dtype float64") - if valid.dtype != np.dtype(np.bool_): - raise TypeError("valid must have dtype bool") - if sums.dtype != np.dtype(np.float64): - raise TypeError("sums must have dtype float64") - if present.dtype != np.dtype(np.bool_): - raise TypeError("present must have dtype bool") - if present.shape[0] != sums.shape[0]: - raise ValueError("present and sums must have the same length") - - cdef double[:] keys_view = keys - cdef double[:] values_view = values - cdef np.npy_bool[:] valid_view = valid - cdef double[:] sums_view = sums - cdef np.npy_bool[:] present_view = present - cdef Py_ssize_t n = keys.shape[0] - cdef Py_ssize_t nstates = sums.shape[0] - cdef Py_ssize_t i - cdef double key_f - cdef int64_t key_i - cdef int64_t max_key = -1 - cdef int ret = 0 - cdef 
double value - - with nogil: - for i in range(n): - if not valid_view[i]: - continue - key_f = keys_view[i] - if key_f != key_f: - if skip_key_nan: - continue - ret = -1 - break - if key_f < 0.0 or key_f > 9223372036854774784.0: - ret = -1 - break - key_i = key_f - if key_f != key_i: - ret = -1 - break - if key_i > max_key: - max_key = key_i - if ret == 0: - if max_key < 0: - ret = 0 - elif max_key >= nstates: - if max_key > 2147483646: - ret = -1 - else: - ret = max_key + 1 - else: - for i in range(n): - if not valid_view[i]: - continue - key_f = keys_view[i] - if key_f != key_f: - if skip_key_nan: - continue - ret = -1 - break - key_i = key_f - value = values_view[i] - if skip_value_nan and value != value: - continue - sums_view[key_i] += value - present_view[key_i] = 1 - return ret - - -def groupby_dense_f32_integral_key_f64_sum_checked( - np.ndarray keys, - np.ndarray values, - np.ndarray valid, - np.ndarray sums, - np.ndarray present, - bint skip_key_nan=True, - bint skip_value_nan=False, -): - """Checked dense float32-integral-key/float64 sum kernel.""" - if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1: - raise ValueError("keys, values and valid must be 1-D arrays") - if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: - raise ValueError("keys, values and valid must have the same length") - if sums.ndim != 1 or present.ndim != 1: - raise ValueError("sums and present must be 1-D arrays") - if keys.dtype != np.dtype(np.float32): - raise TypeError("keys must have dtype float32") - if values.dtype != np.dtype(np.float64): - raise TypeError("values must have dtype float64") - if valid.dtype != np.dtype(np.bool_): - raise TypeError("valid must have dtype bool") - if sums.dtype != np.dtype(np.float64): - raise TypeError("sums must have dtype float64") - if present.dtype != np.dtype(np.bool_): - raise TypeError("present must have dtype bool") - if present.shape[0] != sums.shape[0]: - raise ValueError("present and sums must have the same 
length") - - cdef float[:] keys_view = keys - cdef double[:] values_view = values - cdef np.npy_bool[:] valid_view = valid - cdef double[:] sums_view = sums - cdef np.npy_bool[:] present_view = present - cdef Py_ssize_t n = keys.shape[0] - cdef Py_ssize_t nstates = sums.shape[0] - cdef Py_ssize_t i - cdef float key_f - cdef int64_t key_i - cdef int64_t max_key = -1 - cdef int ret = 0 - cdef double value - - with nogil: - for i in range(n): - if not valid_view[i]: - continue - key_f = keys_view[i] - if key_f != key_f: - if skip_key_nan: - continue - ret = -1 - break - if key_f < 0.0 or key_f > 16777216.0: - ret = -1 - break - key_i = key_f - if key_f != key_i: - ret = -1 - break - if key_i > max_key: - max_key = key_i - if ret == 0: - if max_key < 0: - ret = 0 - elif max_key >= nstates: - if max_key > 2147483646: - ret = -1 - else: - ret = max_key + 1 - else: - for i in range(n): - if not valid_view[i]: - continue - key_f = keys_view[i] - if key_f != key_f: - if skip_key_nan: - continue - ret = -1 - break - key_i = key_f - value = values_view[i] - if skip_value_nan and value != value: - continue - sums_view[key_i] += value - present_view[key_i] = 1 - return ret diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py index 12394fb4..306d6522 100644 --- a/tests/ctable/test_groupby.py +++ b/tests/ctable/test_groupby.py @@ -144,6 +144,82 @@ def test_groupby_empty_table_returns_empty_result(): assert out.col_names == ["city", "size"] +@dataclass +class Int32FloatRow: + key: int = blosc2.field(blosc2.int32()) + value: float = blosc2.field(blosc2.float64()) + + +@dataclass +class Float64KeyRow: + key: float = blosc2.field(blosc2.float64()) + value: float = blosc2.field(blosc2.float64()) + + +@dataclass +class Float32KeyRow: + key: float = blosc2.field(blosc2.float32()) + value: float = blosc2.field(blosc2.float64()) + + +@dataclass +class DictFloatRow: + key: str = blosc2.field(blosc2.dictionary()) + value: float = blosc2.field(blosc2.float64()) + + 
+@pytest.mark.parametrize( + ("row_type", "data", "expected"), + [ + ( + Int32FloatRow, + [(0, 1.5), (2, 10.0), (1, 2.5), (2, 3.0), (0, 4.0)], + [(0, 5.5), (1, 2.5), (2, 13.0)], + ), + ( + Float64KeyRow, + [(0.0, 1.5), (2.0, 10.0), (1.0, 2.5), (2.0, 3.0), (0.0, 4.0)], + [(0.0, 5.5), (1.0, 2.5), (2.0, 13.0)], + ), + ( + Float32KeyRow, + [(0.0, 1.5), (2.0, 10.0), (1.0, 2.5), (2.0, 3.0), (0.0, 4.0)], + [(0.0, 5.5), (1.0, 2.5), (2.0, 13.0)], + ), + ( + DictFloatRow, + [("a", 1.5), ("c", 10.0), ("b", 2.5), ("c", 3.0), ("a", 4.0)], + [("a", 5.5), ("c", 13.0), ("b", 2.5)], + ), + ], +) +def test_groupby_fast_path_sum_variants(row_type, data, expected): + t = CTable(row_type, new_data=data) + + out = t.group_by("key").agg({"value": "sum"}) + + assert rows(out) == expected + + +def test_groupby_float_integral_fast_path_falls_back_for_non_integral_keys(): + t = CTable(Float64KeyRow, new_data=[(0.5, 1.0), (1.5, 2.0), (0.5, 3.0)]) + + out = t.group_by("key").agg({"value": "sum"}) + + assert rows(out) == [(0.5, 4.0), (1.5, 2.0)] + + +def test_groupby_float_integral_fast_path_falls_back_for_nan_group_when_kept(): + t = CTable(Float64KeyRow, new_data=[(0.0, 1.0), (np.nan, 2.0), (0.0, 3.0)]) + + out = t.group_by("key", dropna=False).agg({"value": "sum"}) + + got = rows(out) + assert got[0] == (0.0, 4.0) + assert np.isnan(got[1][0]) + assert got[1][1] == 2.0 + + def test_groupby_rejects_bad_engine(): t = CTable(SalesRow, new_data=DATA) From e414af33156ed4e8d03854cace6491f4e850819e Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 08:02:27 +0200 Subject: [PATCH 06/17] Add new tests and bench files --- bench/ctable/bench_nested_filter_index.py | 205 ++++++++++++++++++++++ tests/ctable/test_nested_append.py | 96 ++++++++++ 2 files changed, 301 insertions(+) create mode 100644 bench/ctable/bench_nested_filter_index.py create mode 100644 tests/ctable/test_nested_append.py diff --git a/bench/ctable/bench_nested_filter_index.py 
b/bench/ctable/bench_nested_filter_index.py new file mode 100644 index 00000000..71d44112 --- /dev/null +++ b/bench/ctable/bench_nested_filter_index.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Benchmark nested leaf filter/index performance vs flat columns. + +Compares a CTable with flat column names against an equivalent one that uses +dotted nested column names (physically stored under hierarchical _cols/ paths). +Both tables hold the same data; each filter/index/aggregate operation is timed +on both to show the overhead (or absence thereof) introduced by the nested layout. +""" + +from __future__ import annotations + +import argparse +import gc +import time +from dataclasses import dataclass + +import numpy as np + +import blosc2 + + +# --------------------------------------------------------------------------- +# Schema helpers +# --------------------------------------------------------------------------- + + +@dataclass +class FlatRow: + trip_begin_lon: float = blosc2.field(blosc2.float64()) + trip_begin_lat: float = blosc2.field(blosc2.float64()) + trip_end_lon: float = blosc2.field(blosc2.float64()) + trip_end_lat: float = blosc2.field(blosc2.float64()) + payment_fare: float = blosc2.field(blosc2.float64(ge=0)) + + +@dataclass +class NestedRow: + """Same physical columns as FlatRow but accessed via dotted names after creation.""" + + trip_begin_lon: float = blosc2.field(blosc2.float64()) + trip_begin_lat: float = blosc2.field(blosc2.float64()) + trip_end_lon: float = blosc2.field(blosc2.float64()) + trip_end_lat: float = blosc2.field(blosc2.float64()) + payment_fare: float = blosc2.field(blosc2.float64(ge=0)) + + +def _build_data(n: int) -> dict: + rng = np.random.default_rng(42) + return { 
+ "trip_begin_lon": rng.uniform(-88.0, -87.5, n).astype(np.float64), + "trip_begin_lat": rng.uniform(41.6, 42.0, n).astype(np.float64), + "trip_end_lon": rng.uniform(-88.0, -87.5, n).astype(np.float64), + "trip_end_lat": rng.uniform(41.6, 42.0, n).astype(np.float64), + "payment_fare": rng.uniform(3.0, 50.0, n).astype(np.float64), + } + + +def _build_flat(data: dict, n: int) -> "blosc2.CTable": + t = blosc2.CTable(FlatRow, expected_size=n) + t.extend(data) + return t + + +def _build_nested(data: dict, n: int) -> "blosc2.CTable": + t = blosc2.CTable(NestedRow, expected_size=n) + t.extend(data) + # Rename to dotted nested names + t.rename_column("trip_begin_lon", "trip.begin.lon") + t.rename_column("trip_begin_lat", "trip.begin.lat") + t.rename_column("trip_end_lon", "trip.end.lon") + t.rename_column("trip_end_lat", "trip.end.lat") + t.rename_column("payment_fare", "payment.fare") + return t + + +# --------------------------------------------------------------------------- +# Timing helper +# --------------------------------------------------------------------------- + + +def _timeit(fn, repeats: int = 5) -> float: + gc.collect() + times = [] + for _ in range(repeats): + t0 = time.perf_counter() + fn() + times.append(time.perf_counter() - t0) + return min(times) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + p = argparse.ArgumentParser(description="Benchmark nested vs flat column filter/index/aggregate") + p.add_argument("--rows", type=int, default=1_000_000, help="Number of rows (default: 1M)") + p.add_argument("--repeats", type=int, default=5, help="Timing repeats (default: 5)") + args = p.parse_args() + + N = args.rows + R = args.repeats + + print(f"Building tables with {N:,} rows …") + data = _build_data(N) + flat_data = data.copy() # flat uses underscore names + nested_data = { + "trip_begin_lon": 
data["trip_begin_lon"], + "trip_begin_lat": data["trip_begin_lat"], + "trip_end_lon": data["trip_end_lon"], + "trip_end_lat": data["trip_end_lat"], + "payment_fare": data["payment_fare"], + } + + tf = _build_flat(flat_data, N) + tn = _build_nested(nested_data, N) + print(f" flat col_names: {tf.col_names}") + print(f" nested col_names: {tn.col_names}") + print() + + # Build indexes on the fare column for index-accelerated queries + print("Building indexes …") + tf.create_index("payment_fare") + tn.create_index("payment.fare") + print() + + header = f"{'Operation':<45} {'flat (ms)':>12} {'nested (ms)':>13} {'ratio':>8}" + print(header) + print("-" * len(header)) + + def bench(label, flat_fn, nested_fn): + t_flat = _timeit(flat_fn, R) * 1000 + t_nested = _timeit(nested_fn, R) * 1000 + ratio = t_nested / t_flat if t_flat > 0 else float("nan") + print(f"{label:<45} {t_flat:>12.3f} {t_nested:>13.3f} {ratio:>8.3f}x") + + bench( + "where (string expr, full scan)", + lambda: tf.where("payment_fare > 20"), + lambda: tn.where("payment.fare > 20"), + ) + + bench( + "where (string expr, full scan, nrows)", + lambda: tf.where("payment_fare > 20").nrows, + lambda: tn.where("payment.fare > 20").nrows, + ) + + bench( + "where (LazyExpr, full scan)", + lambda: tf.where(tf["payment_fare"] > 20), + lambda: tn.where(tn["payment.fare"] > 20), + ) + + bench( + "where (auto index-accelerated, nrows)", + lambda: tf.where("payment_fare > 20").nrows, + lambda: tn.where("payment.fare > 20").nrows, + ) + + bench( + "column mean (full scan)", + lambda: tf["payment_fare"].mean(), + lambda: tn["payment.fare"].mean(), + ) + + bench( + "column sum (full scan)", + lambda: tf["payment_fare"].sum(), + lambda: tn["payment.fare"].sum(), + ) + + bench( + "column min (full scan)", + lambda: tf["trip_begin_lon"].min(), + lambda: tn["trip.begin.lon"].min(), + ) + + bench( + "multi-column where (string expr, nrows)", + lambda: tf.where("trip_begin_lon > -87.7 and payment_fare > 10").nrows, + lambda: 
tn.where("trip.begin.lon > -87.7 and payment.fare > 10").nrows, + ) + + bench( + "sort_by (single leaf)", + lambda: tf.sort_by("payment_fare"), + lambda: tn.sort_by("payment.fare"), + ) + + print() + print("ratio < 1 means nested is faster; ratio > 1 means flat is faster.") + print("Ratios close to 1.0 indicate the nested path adds negligible overhead.") + + +if __name__ == "__main__": + main() diff --git a/tests/ctable/test_nested_append.py b/tests/ctable/test_nested_append.py new file mode 100644 index 00000000..7be94a6e --- /dev/null +++ b/tests/ctable/test_nested_append.py @@ -0,0 +1,96 @@ +"""Tests for Ph 3.1: append/extend with nested dict rows on tables with dotted column names.""" + +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 + + +@dataclass +class FlatTrip: + trip_begin_lon: float + trip_begin_lat: float + payment_fare: float + + +def _make_nested_table(): + """Create a CTable with dotted (nested) column names via rename.""" + t = blosc2.CTable(FlatTrip) + t.rename_column("trip_begin_lon", "trip.begin.lon") + t.rename_column("trip_begin_lat", "trip.begin.lat") + t.rename_column("payment_fare", "payment.fare") + return t + + +def test_append_nested_dict(): + """append() accepts a fully-nested dict and flattens it to dotted keys.""" + t = _make_nested_table() + t.append({"trip": {"begin": {"lon": 1.0, "lat": 2.0}}, "payment": {"fare": 10.0}}) + t.append({"trip": {"begin": {"lon": 3.0, "lat": 4.0}}, "payment": {"fare": 20.0}}) + + assert t.nrows == 2 + np.testing.assert_array_almost_equal(t["trip.begin.lon"][:], [1.0, 3.0]) + np.testing.assert_array_almost_equal(t["trip.begin.lat"][:], [2.0, 4.0]) + np.testing.assert_array_almost_equal(t["payment.fare"][:], [10.0, 20.0]) + + +def test_append_flat_dotted_dict_unchanged(): + """append() with already-flat dotted keys continues to work.""" + t = _make_nested_table() + t.append({"trip.begin.lon": 5.0, "trip.begin.lat": 6.0, "payment.fare": 30.0}) + + assert t.nrows == 1 
+ assert t["trip.begin.lon"][0] == pytest.approx(5.0) + + +def test_extend_list_of_nested_dicts(): + """extend() with a list of nested dicts flattens each row.""" + t = _make_nested_table() + rows = [ + {"trip": {"begin": {"lon": 1.0, "lat": 2.0}}, "payment": {"fare": 10.0}}, + {"trip": {"begin": {"lon": 3.0, "lat": 4.0}}, "payment": {"fare": 20.0}}, + {"trip": {"begin": {"lon": 5.0, "lat": 6.0}}, "payment": {"fare": 30.0}}, + ] + t.extend(rows) + + assert t.nrows == 3 + np.testing.assert_array_almost_equal(t["trip.begin.lon"][:], [1.0, 3.0, 5.0]) + np.testing.assert_array_almost_equal(t["payment.fare"][:], [10.0, 20.0, 30.0]) + + +def test_extend_nested_dict_of_arrays(): + """extend() with a nested dict-of-arrays flattens the outer dict to dotted keys.""" + t = _make_nested_table() + t.extend( + { + "trip": {"begin": {"lon": [1.0, 2.0, 3.0], "lat": [4.0, 5.0, 6.0]}}, + "payment": {"fare": [10.0, 20.0, 30.0]}, + } + ) + + assert t.nrows == 3 + np.testing.assert_array_almost_equal(t["trip.begin.lon"][:], [1.0, 2.0, 3.0]) + np.testing.assert_array_almost_equal(t["trip.begin.lat"][:], [4.0, 5.0, 6.0]) + np.testing.assert_array_almost_equal(t["payment.fare"][:], [10.0, 20.0, 30.0]) + + +def test_append_nested_dict_where_and_attribute_access(): + """append() with nested dicts integrates correctly with where() and attribute proxy.""" + t = _make_nested_table() + for lon, lat, fare in [(1.0, 2.0, 5.0), (3.0, 4.0, 15.0), (5.0, 6.0, 25.0)]: + t.append({"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}}) + + view = t.where("payment.fare > 10") + assert view.nrows == 2 + assert t.trip.begin.lon.max() == pytest.approx(5.0) + + +def test_nested_dotted_string_where_in_aggregate(): + """Aggregate where= strings accept dotted nested column names.""" + t = _make_nested_table() + for lon, lat, fare in [(1.0, 2.0, 5.0), (3.0, 4.0, 15.0), (5.0, 6.0, 25.0)]: + t.append({"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}}) + + assert 
t.trip.begin.lon.sum(where="payment.fare > 10") == pytest.approx(8.0) From eadd91a995e9eb980fe62010995f3a6bb41cb885 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 08:08:30 +0200 Subject: [PATCH 07/17] Updated plan --- plans/ctable-groupby.md | 667 ++++++++++++---------------------------- 1 file changed, 201 insertions(+), 466 deletions(-) diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md index b86ac078..95e82c0e 100644 --- a/plans/ctable-groupby.md +++ b/plans/ctable-groupby.md @@ -1,574 +1,309 @@ -# CTable `group_by` implementation plan +# CTable `group_by` implementation plan — status -## Goals +This document started as the implementation plan for `CTable.group_by()`. The +initial plan has now been executed through Phase 3. The remaining sections +record what was completed and what is future work. -Add a `CTable.group_by()` facility that is efficient for columnar, compressed -CTable storage while keeping the first implementation simple and correct. The -long-term goal is to expose a compressed-aware group-reduce primitive that can -power `CTable.group_by()` and possibly other analytics APIs. +## Completed -Key design principles: +### Public API -- Stay columnar: read only grouping columns, aggregation columns, and the live-row mask. -- Keep memory bounded: process the table chunk-by-chunk; never require materializing all rows. -- Use indexes opportunistically, but do not require them. -- Start with a NumPy implementation, then add Cython kernels for hot paths. -- Keep compressed input columns compressed between chunks; only chunk slices become NumPy buffers. 
- -## Proposed user API - -Initial high-level API could be: +Implemented: ```python -t.group_by("city").agg({"sales": "sum", "id": "count"}) -t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"], "price": "max"}) +t.group_by("city").size() +t.group_by("city").count("sales") +t.group_by("city").agg({"sales": "sum"}) +t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"]}) ``` -Potential variants: +Implemented API decisions: -```python -t.group_by("city", sort=False).agg(...) -t.group_by("city", engine="auto").agg(...) -t.group_by("city").count() -t.group_by("city").sum("sales") -``` +- `CTable.group_by(...)` returns a lightweight `CTableGroupBy` facade. +- `CTableGroupBy` is a deferred operation builder, not a `CTable` view. +- Terminal methods materialize a new in-memory `CTable`. +- Aggregate result columns are suffixed as `<column>_<aggregation>`. +- `GroupBy.size()` means row count per group / SQL `COUNT(*)`. +- `GroupBy.count(column)` means non-null count / SQL `COUNT(column)`. +- `GroupBy.agg({"col": "count"})` is equivalent to `GroupBy.count("col")`. +- `sort=False` is the fast default; `sort=True` sorts output by group keys. +- `dropna=True` is the default; `dropna=False` keeps null/NaN key groups. +- No top-level `CTable.size()` or `CTable.count()` was added. -The result should be a new in-memory `CTable` initially. Persistent output can -be added later via an `out=`/`urlpath=` option if useful. +### Phase 1: Python/NumPy implementation -Output column naming should be predictable, for example: +Implemented files: ```text -city, sales_sum, id_count -country, city, sales_sum, sales_mean, price_max +src/blosc2/ctable.py # CTable.group_by() +src/blosc2/groupby.py # CTableGroupBy and NumPy fallback engine ``` -For a single aggregation on a column, decide whether to preserve the original -column name or always suffix it. Always suffixing is less ambiguous. 
- -## Supported MVP semantics - -Start with: - -- Group keys: - - fixed-width scalar columns: bool, signed/unsigned ints, floats, datetimes/timedeltas; - - dictionary-encoded string columns via integer codes. -- Aggregations: - - `count` / `size`; - - `sum`; - - `min`; - - `max`; - - `mean` implemented as `sum + count` during accumulation. -- Respect live rows (`_valid_rows`) and views. -- Read only required columns. - -Defer initially: - -- list columns; -- vlstring/vlbytes/object/struct scalar columns, except dictionary columns; -- arbitrary Python aggregators; -- group-by over computed columns, unless they can be chunk-evaluated cleanly; -- disk spilling for very high cardinality; -- parallel hash aggregation. - -## Baseline algorithm: chunked hash aggregation - -The default implementation should be a chunked hash group-reduce: - -```text -global_accumulator = hash table: group_key -> aggregate state - -for each row chunk: - read/decompress key column chunk(s) - read/decompress aggregation value column chunk(s) - read/decompress valid-row mask chunk - apply live-row mask +Implemented functionality: - build local grouping keys - compute local partial aggregates - merge local partial aggregates into global_accumulator +- Chunked, columnar traversal. +- Reads only group keys, aggregation value columns, and `_valid_rows`. +- Handles live rows, views, and deleted rows. +- Supports fixed-width scalar keys and dictionary-encoded string keys. +- Dictionary keys group by codes and decode only for result materialization. +- Supports `size`, `count`, `sum`, `mean`, `min`, `max`. +- Supports multi-key group-by via structured NumPy keys. +- Supports empty inputs. +- Falls back to the generic NumPy path for unsupported optimized cases. 
-finalize aggregate state -materialize group keys and aggregate columns into a result CTable -``` +### Phase 1 benchmark harness -The important point is that the global hash table is proportional to the number -of groups, not to the number of rows: +Implemented: ```text -memory ~= O(number_of_groups * (key_size + aggregate_state_size + hash overhead)) +bench/ctable/groupby.py ``` -The global accumulator should normally live uncompressed in memory. It is -accessed for every chunk merge, so compressing it would likely dominate runtime. -The compressed-aware aspect is in the input traversal: compressed CTable columns -are decompressed only one bounded chunk at a time. - -## Columnar chunk traversal - -Use synchronized physical row ranges. For each range: - -```python -valid = np.asarray(self._valid_rows[start:stop]) -key1 = np.asarray(self._cols[key1_name][start:stop]) -value = np.asarray(self._cols[value_name][start:stop]) - -key1 = key1[valid] -value = value[valid] -``` - -Where possible, align chunk ranges with the physical chunks of `_valid_rows` or -input columns to improve decompression locality. The exact chunk size should be -configurable internally; a reasonable default can be based on CTable/NDArray -chunk sizes, with a cap to avoid excessive temporaries. - -For dictionary columns, read codes instead of decoded strings: - -```python -codes = np.asarray(dict_col.codes[start:stop], dtype=np.int32) -``` - -Decode codes only when materializing the final result. - -## NumPy MVP local grouping +The benchmark can vary: -For a single key: +- row count; +- group cardinality; +- key dtype via `--key-dtype int32|int64|float32|float64`; +- dictionary keys via `--dictionary`; +- operation via `--op size|count|sum|mean|min|max`; +- sorted output; +- chunk size; +- optional persistent `urlpath`; +- optional pandas comparison. 
-```python -unique_keys, inverse = np.unique(keys, return_inverse=True) -partial_sum = np.bincount(inverse, weights=values) -partial_count = np.bincount(inverse) -``` - -For min/max use `np.minimum.at` / `np.maximum.at` into arrays initialized with -appropriate identity values. - -For multiple fixed-width keys, build a structured array per chunk: +### Phase 2: optimized paths -```python -keys = np.empty(n, dtype=[("k0", key0.dtype), ("k1", key1.dtype)]) -keys["k0"] = key0 -keys["k1"] = key1 +Implemented dense NumPy and Cython fast paths for the main benchmark-driven +cases. -unique_keys, inverse = np.unique(keys, return_inverse=True) -``` +Optimized cases currently include: -This is simple and should be the initial correctness path. Costs to be aware of: +- compact non-negative integer/dictionary-code single keys in Python/NumPy dense mode; +- `int32 key + float64 sum` in Cython; +- dictionary-code key + `float64 sum` in Cython; +- integral `float64 key + float64 sum` in Cython; +- integral `float32 key + float64 sum` in Cython. -- structured key array allocation and copy per chunk; -- `np.unique` is generally sort-based; -- `return_inverse=True` allocates one integer per live row in the chunk; -- aggregations are separate passes over the inverse. +These paths avoid the original per-chunk `np.unique(..., return_inverse=True)` +and Python dictionary merge overhead for compact single-key sum workloads. -These costs are acceptable for the MVP because they are bounded by chunk size. 
+Representative benchmark improvements observed during implementation: -## Global accumulator design +```text +50M rows, 5k int32 groups, float64 sum: + generic/early path: ~0.47 s + Cython dense path: ~0.20–0.22 s -For the Python MVP, a dictionary is adequate: +50M rows, 5k float64 integral groups, float64 sum: + generic path: ~5.51 s + Cython dense path: ~0.27–0.29 s -```python -acc: dict[group_key, AggregateState] +50M rows, 5k float32 integral groups, float64 sum: + Cython dense path: ~0.24–0.25 s ``` -Where `group_key` is: +### Phase 3: separate Cython extension -- a Python scalar for single numeric/dictionary keys; -- a tuple for multi-column keys; -- a normalized representation for null-aware keys when nullable support is added. - -`AggregateState` can store arrays or small Python objects with fields like: +Implemented: ```text -count -sum -min -max -mean_sum -mean_count +src/blosc2/groupby_ext.pyx ``` -For `mean`, keep `sum` and `count` and divide only during finalization. For -multiple aggregations over the same input column, share state when possible -(e.g. `mean` and `sum` can reuse the same sum). +Build integration: -For better performance after the API stabilizes, replace parts of this with a -NumPy-backed accumulator or Cython state object. +- `CMakeLists.txt` builds, links, and installs `groupby_ext`. +- Group-by kernels were removed from `indexing_ext.pyx`. +- `src/blosc2/groupby.py` imports `blosc2.groupby_ext` for optimized kernels. -## Index-aware paths +Rationale: -Indexes are optional accelerators. +- Group-by kernels are analytics/query execution code, not indexing internals. +- A dedicated extension keeps separation of concerns cleaner as optimized paths grow. -### FULL index on a single group key +### Documentation -A FULL index stores sorted values and positions. 
For a single grouping key, -this can make group-by a sorted scan: +Implemented user-facing documentation in: ```text -obtain sorted positions from FULL index -scan rows in key order -detect group boundaries -reduce contiguous runs -``` - -Benefits: - -- no hash table needed for the grouping key; -- no sort needed at query time; -- output is naturally sorted by key. - -This is most useful for: - -```python -t.create_index("city", kind=blosc2.IndexKind.FULL) -t.group_by("city").agg(...) +doc/reference/ctable.rst ``` -Caveats: +Documented: -- only directly helps single-key group-by; -- for multi-key group-by, a single-column FULL index only partially helps; -- stale indexes must be ignored or rebuilt; -- views/deleted rows still require intersecting with `_valid_rows`. +- `CTable.group_by()`; +- returned `CTableGroupBy` object; +- `size()`, `count()`, `agg()`; +- examples for row counts, non-null counts, and sums. -### Bucket/segment indexes +### Tests -The default predicate indexes are useful before group-by, not usually during it: +Implemented/extended: -```python -t.where("year == 2024").group_by("city") +```text +tests/ctable/test_groupby.py ``` -The index accelerates `where()`, reducing rows scanned by group-by. It does not -by itself provide grouped order. - -## Existing `indexing_ext` sort helpers - -`indexing_ext.pyx` contains: - -- `keysort_values_positions(values, positions)`; -- `keysort_keys_indices(keys, indices)`. - -These sort a 1-D scalar key array in-place while carrying an `int64` side array. -They are useful for sort/index oriented paths, especially: - -- building/reusing FULL indexes; -- single-key sort-based group-by; -- dictionary-code group-by where codes are scalar integers. +Coverage includes: -They are not the main primitive for hash-based group-reduce because hash -aggregation does not require sorted keys. They also do not directly support -multi-column keys, variable-length strings, or fused aggregation. 
- -## Compressed-aware `group_reduce` primitive +- `size()` row counts; +- `count(column)` non-null counts; +- `agg()` with `sum`, `mean`, `min`, `max`, `count`; +- `agg({"*": "size"})`; +- multi-key group-by; +- dictionary string keys; +- views and deleted rows; +- empty tables; +- `dropna=True` / `dropna=False` behavior; +- bad engine rejection; +- optimized int32/dictionary/float32/float64 sum variants; +- fallback for non-integral float keys; +- fallback for NaN float-key group when `dropna=False`. -Longer term, introduce a lower-level primitive used by `CTable.group_by()`: +Validation during implementation: -```python -blosc2.group_reduce( - keys=[key_ndarray1, key_ndarray2], - values=[value_ndarray1], - aggs={"value": ["sum", "count"]}, - mask=valid_rows, - chunk_size=None, - engine="auto", -) +```text +pytest tests/ctable/test_groupby.py -q +pytest tests/ctable -q ``` -However, the first implementation can live under an internal module, e.g. -`blosc2.groupby`, before becoming public. +The full CTable suite passed after Phase 3. -The primitive should be compressed-aware in traversal, not necessarily operate -on compressed bytes directly. General key comparison/grouping still needs -values. The intended execution is: +## Current design summary -```text -read compressed NDArray slices -> NumPy buffers -> local group/reduce -> merge -``` +The implementation now has three execution layers: -This avoids full-column materialization while keeping the hot loop simple. +1. Generic chunked NumPy path: + - supports the broadest set of Phase-1 semantics; + - uses per-chunk local grouping and merges partials globally. +2. Dense NumPy single-key path: + - for compact non-negative integer/dictionary-code keys; + - uses dense accumulator arrays where possible. +3. Cython single-key sum kernels: + - for the most important compact/integral key + `float64 sum` cases; + - lives in `groupby_ext.pyx`. 
-## Cython optimization plan +All optimized paths are conservative and fall back to the generic engine when +unsupported data or semantics are encountered. -### Phase 1: Python/NumPy only +## Deferred / future work -Files: +### Integer-key Cython coverage -```text -src/blosc2/ctable.py # public API / GroupBy facade -src/blosc2/groupby.py # internal implementation and NumPy engine -``` +Current Cython integer coverage is focused on `int32` keys. Future work should +replace this with fused-type or equivalent kernels covering: -Focus on correctness, tests, API shape, and an early benchmark harness. The -benchmark should be added in Phase 1, before any Cython work, so that later -optimization decisions are driven by numbers rather than intuition. At minimum, -add one reusable script under `bench/` that can generate or open a CTable and -compare: +- `int8`, `uint8`; +- `int16`, `uint16`; +- `int32`, `uint32`; +- `int64`, `uint64` with compact-range checks. -- chunked NumPy hash group-by; -- single-key sort/scan group-by where practical; -- dictionary-code grouping; -- pandas or DuckDB on an equivalent in-memory/external dataset for rough context. +For dense group-by, the key range matters more than the dtype. Smaller integer +types are naturally compact and should be low-risk fast paths. -The initial benchmark does not need to be exhaustive, but it should record row -count, cardinality, chunk size, compression parameters, elapsed time, peak memory -if easy to capture, and whether the input is in-memory, `.b2d`, or `.b2z`. +### More Cython aggregations -### Phase 2: optimized kernels in `indexing_ext.pyx` +Current Cython kernels primarily accelerate single-key `float64 sum`. 
+Future kernels should cover: -To avoid adding a third extension too early, place initial Cython kernels in -`src/blosc2/indexing_ext.pyx` under a clearly separated section: +- `size`; +- `count`; +- `mean` via sum/count; +- `min`; +- `max`; +- multiple aggregations in a single fused pass; +- multiple value columns. -```cython -# ---------------------------------------------------------------------- -# Group-reduce kernels -# ---------------------------------------------------------------------- -``` +### Arbitrary float-key hash table -Initial kernels should target high-value simple cases: +Current float Cython fast paths handle integral float32/float64 keys only. A +true float-key hash table would support arbitrary float keys without sorting or +`np.unique`. -- single `int32`/`int64` key; -- dictionary-code keys (`int32`); -- numeric value columns; -- `count`, `sum`, `min`, `max`, maybe `mean` via sum/count. +Required semantic decisions/handling: -The Python layer remains responsible for: +- `dropna=True`: skip NaN keys; +- `dropna=False`: all NaN keys should form one group; +- `+0.0` and `-0.0` should likely be the same group; +- infinities are valid groups; +- nullable float sentinels must be normalized consistently. -- CTable schema validation; -- chunk iteration; -- decompression into NumPy buffers; -- final result CTable construction; -- fallback to NumPy for unsupported dtypes. +### Multi-key Cython hash path -The Cython layer consumes NumPy buffers and updates a hash accumulator or returns -chunk partial aggregates. +The generic NumPy path supports multi-key grouping via structured arrays. Future +Cython work could hash directly across multiple key arrays, avoiding structured +key packing, sort-based unique, inverse arrays, and Python merge overhead. 
-### Phase 3: split to `groupby_ext.pyx` if it grows +### FULL-index sorted group-by path -If the optimized path grows to include multi-column hash tables, nullable key -semantics, multiple aggregate state layouts, spilling, or parallel execution, -move it to a dedicated extension: +A FULL index on a single grouping key can provide sorted positions. A future +sorted-scan group-by path could: ```text -src/blosc2/groupby_ext.pyx +read sorted positions from FULL index +scan contiguous key runs +reduce each run +emit sorted groups naturally ``` -This is cleaner long-term than overloading `indexing_ext.pyx` indefinitely. -Avoid putting this functionality in `blosc2_ext.pyx`; group-reduce is a -higher-level analytics/query primitive, not core compression/NDArray machinery. +This would be especially useful for high-cardinality single-key group-by and +for users requesting `sort=True`. -## What custom Cython buys over structured NumPy keys +### Public `blosc2.group_reduce()` -NumPy structured dtype is a good MVP, but a custom Cython hash reducer can avoid -several costs: +Keep lower-level group-reduce machinery internal for now. Consider exposing a +public `blosc2.group_reduce()` only after: -- no temporary packed structured key array; -- no sort-based `np.unique` for every chunk; -- no `inverse` array of length equal to the chunk; -- factorization and aggregation can be fused in one pass; -- multiple aggregations can be updated together; -- direct processing of CTable's columnar SoA layout; -- easier future per-thread hash tables and merges. - -A typical optimized loop is: - -```text -for i in range(n): - key = key_columns[i] - slot = hash_lookup_or_insert(key) - acc_sum[slot] += value[i] - acc_count[slot] += 1 - acc_min[slot] = min(acc_min[slot], value[i]) -``` - -For multi-column keys, the Cython path can hash directly across multiple arrays -without packing them into a structured array first. 
+- aggregation semantics are stable; +- null/NaN behavior is fully documented; +- output representation is clear; +- benchmarks show usefulness outside `CTable.group_by()`. -## High-cardinality strategy +### High-cardinality and memory strategy -Hash aggregation can become memory-heavy when the number of groups approaches -the number of rows. Add safeguards and future alternatives: +Future safeguards/features: - estimate cardinality from early chunks; - expose/keep an internal memory limit; -- fall back to sort-based group-by when cardinality is too high; -- use FULL index if available; -- later: partitioned hash group-by with spill-to-disk. +- fall back to sort-based grouping when cardinality is too high; +- use FULL indexes when available; +- eventually implement partitioned hash group-by with spill-to-disk. -For the MVP, document that very high-cardinality group-by may require memory -proportional to output cardinality. +### Parallel execution -## Null and NaN semantics +Potential future optimization: -Define before finalizing the API: +- per-thread local accumulators; +- merge accumulators at chunk or partition boundaries; +- coordinate with Blosc2 decompression threading to avoid oversubscription. -- Should null sentinel values form their own group, be skipped, or be controlled - by `dropna=`? -- Should float NaNs group together? NumPy `unique` behavior and hash behavior - must be made consistent. -- Nullable booleans/dictionary null codes need explicit handling. +### Additional API conveniences -Suggested default, matching common dataframe behavior: +Potential future user conveniences: ```python -t.group_by("key", dropna=True) # default? skip null keys -t.group_by("key", dropna=False) # include null group +t.group_by("city").sum("sales") +t.group_by("city").mean("sales") +t.group_by("city").min("sales") +t.group_by("city").max("sales") ``` -But this should be aligned with existing CTable nullable semantics. 
- -## Documentation - -Add user-facing docstrings and Sphinx documentation for the new group-by API: - -- `CTable.group_by()` docstring with parameters such as `keys`, `sort`, - `dropna`, `engine`, and `chunk_size` if exposed; -- the returned `GroupBy`/`CTableGroupBy` facade docstring, documenting that it - is a deferred operation builder, not a `CTable` view; -- `GroupBy.size()`, `GroupBy.count()`, and `GroupBy.agg()` docstrings; -- examples in the CTable documentation showing row counts, non-null counts, - sums/means, dictionary string grouping, and optional sorted output. +Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are +clearly justified outside group-by. -The class may be described as "the object returned by `CTable.group_by()`" and -need not encourage direct construction. +### Persistent output -## Tests +The current result is an in-memory `CTable`. Future work may add an `out=` or +`urlpath=` option for persistent grouped output. -Add tests under `tests/ctable/`, covering: - -- single-key count/sum/min/max/mean; -- multi-key group-by; -- dictionary string key grouping; -- views and deleted rows; -- empty table and all-filtered view; -- different numeric dtypes and bool keys; -- nullable key behavior once specified; -- result schema and output column names; -- consistency with a reference Python/pandas-like implementation; -- chunk-size variation to ensure chunk-boundary independence; -- optional FULL-index path returns same results as hash path. - -For deterministic tests, sort result rows before comparison unless the API -guarantees output order. - -## Benchmark plan - -Add a small but useful benchmark during Phase 1. This is important because it -sets the baseline for the NumPy implementation and identifies which Cython -kernels are worth writing first. - -Benchmarks should include: - -- low-cardinality single key, e.g. 10 groups over 100M rows; -- medium cardinality, e.g. 
100k groups; -- high cardinality, near unique keys; -- dictionary string columns grouped by codes; -- multi-column keys; -- multiple aggregations over one value column; -- multiple value columns; -- with and without FULL index; -- persistent `.b2d`/`.b2z` inputs. - -Compare: - -- Python/NumPy chunked implementation; -- Cython hash path when available; -- sort-based path using existing keysort helpers; -- pandas/duckdb for sanity, where feasible. - -## Open decisions and recommended defaults - -### Public API and result column names - -Recommendation: use a small `GroupBy` facade and an explicit `.agg()` method: - -```python -t.group_by("city").agg({"sales": "sum"}) -t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"], "price": "max"}) -``` +## Related untracked files reviewed -Always suffix aggregate output columns as `<column>_<op>`: +During cleanup, these untracked files were reviewed and found non-duplicative: ```text -city, sales_sum -country, city, sales_sum, sales_mean, price_max -``` - -This avoids ambiguity and remains stable when users later request multiple -aggregations on the same input column. Convenience methods should include at least `GroupBy.size()` and -`GroupBy.count(column)` early: - -```python -t.group_by("city").size() # row count per group / COUNT(*) -t.group_by("city").count("sales") # non-null sales count / COUNT(sales) -``` - -Additional conveniences like `.sum()`, `.mean()`, `.min()`, and `.max()` can be -added after `.agg()` is stable. - -### Output order - -Recommendation: make output order configurable, with hash insertion order as the -fast default and sorted output as an option: - -```python -t.group_by("city", sort=False).agg(...) # default: fastest -t.group_by("city", sort=True).agg(...) # sort by group keys -``` - -When a single-key FULL index is used, sorted output can be produced naturally. -Tests should not depend on default order unless explicitly testing order.
- -### Null and NaN grouping semantics - -Recommendation: provide `dropna=` and default to `True`, matching common -dataframe behavior: - -```python -t.group_by("key", dropna=True) # skip rows with null/NaN keys -t.group_by("key", dropna=False) # include a null/NaN group +tests/ctable/test_nested_append.py +bench/ctable/bench_nested_filter_index.py ``` -For `dropna=False`, all NaNs in a floating key should belong to one group, and -nullable sentinels/dictionary null codes should belong to one null group. The -NumPy and Cython engines must normalize these cases consistently. - -### `size` vs `count` - -Recommendation: support both, with distinct meanings, scoped to group-by rather -than as new top-level `CTable.size()` / `CTable.count()` methods: - -- `GroupBy.size()`: number of rows in the group, independent of value-column - nulls; equivalent to SQL `COUNT(*)` and pandas `groupby(...).size()`; -- `GroupBy.count(column)`: number of non-null values for a specific value - column; equivalent to SQL `COUNT(column)` and pandas `groupby(...)[column].count()`; -- `count` aggregation, e.g. `GroupBy.agg({"sales": "count"})`, should be an - equivalent spelling for `GroupBy.count("sales")`. - -Prefer `size()` over `len()` for the MVP. Although `len` resembles Python's -`len()`, `size()` follows pandas group-by terminology and avoids suggesting that -it returns a single scalar length. A `len()` alias can be considered later if -there is demand. - -For non-nullable columns, `count(col)` equals `size`. For nullable columns, -`count(col)` excludes null sentinels/NaNs according to the column null policy. -The MVP can implement `GroupBy.size()` first and add nullable-aware `count` as -nullable aggregate semantics mature. - -### Public `blosc2.group_reduce()` exposure - -Recommendation: keep `group_reduce` internal at first, e.g. in -`blosc2.groupby`, until the API and semantics settle through `CTable.group_by()`. 
-Expose a public `blosc2.group_reduce()` only after: - -- aggregation semantics are stable; -- null/NaN behavior is documented; -- output representation is clear; -- benchmarks show it is useful outside CTable. - -### Cython extension placement - -Recommendation: start optimized kernels in `indexing_ext.pyx` only for Phase 2, -under a clearly marked group-reduce section, to avoid build-system churn while -validating the approach. If the code grows beyond a few focused kernels or needs -its own persistent state classes, move it to `groupby_ext.pyx`. Do not place it -in `blosc2_ext.pyx`. +They cover direct nested append/extend correctness and nested flat-vs-dotted +performance comparisons, respectively, and are worth keeping/adding separately. From f491f5d472196bca7dac80f5e1920613d85ce0be Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 12:13:49 +0200 Subject: [PATCH 08/17] Fused integer-key kernels and more Cython aggregations Implemented: - fused dense integer-key Cython kernels covering `int8`, `uint8`, `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64` keys; - dense integer/dictionary-code Cython path for `size`, `count`, `sum`, `mean`, `min`, and `max`; - float64 value kernels with NaN-null skipping where applicable; - int64 value kernels for integer/bool `sum`, `min`, and `max`; - shared key-presence tracking so groups with all-null values are still emitted correctly for `count` and nullable float aggregations. 
--- bench/ctable/groupby.py | 25 ++- plans/ctable-groupby.md | 40 ++-- src/blosc2/groupby.py | 297 ++++++++++++++++++++++++++++ src/blosc2/groupby_ext.pyx | 374 ++++++++++++++++++++++++++++++++++- tests/ctable/test_groupby.py | 68 ++++++- 5 files changed, 777 insertions(+), 27 deletions(-) diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py index 68c03551..14bf4b06 100644 --- a/bench/ctable/groupby.py +++ b/bench/ctable/groupby.py @@ -32,18 +32,12 @@ class Row: key: str = blosc2.field(blosc2.dictionary()) value: float = blosc2.field(blosc2.float64()) - elif key_dtype == "int32": + elif key_dtype in {"int8", "uint8", "int16", "uint16", "int32", "uint32", "int64", "uint64"}: + key_spec = getattr(blosc2, key_dtype)() @dataclasses.dataclass class Row: - key: int = blosc2.field(blosc2.int32()) - value: float = blosc2.field(blosc2.float64()) - - elif key_dtype == "int64": - - @dataclasses.dataclass - class Row: - key: int = blosc2.field(blosc2.int64()) + key: int = blosc2.field(key_spec) value: float = blosc2.field(blosc2.float64()) elif key_dtype == "float32": @@ -87,7 +81,18 @@ def main() -> None: parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key") parser.add_argument( "--key-dtype", - choices=["int32", "int64", "float32", "float64"], + choices=[ + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float32", + "float64", + ], default="int32", help="Physical dtype for non-dictionary keys. Float keys are generated from group codes cast to float.", ) diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md index 95e82c0e..69eb7547 100644 --- a/plans/ctable-groupby.md +++ b/plans/ctable-groupby.md @@ -121,6 +121,19 @@ Rationale: - Group-by kernels are analytics/query execution code, not indexing internals. - A dedicated extension keeps separation of concerns cleaner as optimized paths grow. 
+### Phase 4: fused integer-key kernels and more Cython aggregations + +Implemented: + +- fused dense integer-key Cython kernels covering `int8`, `uint8`, + `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64` keys; +- dense integer/dictionary-code Cython path for `size`, `count`, `sum`, + `mean`, `min`, and `max`; +- float64 value kernels with NaN-null skipping where applicable; +- int64 value kernels for integer/bool `sum`, `min`, and `max`; +- shared key-presence tracking so groups with all-null values are still + emitted correctly for `count` and nullable float aggregations. + ### Documentation Implemented user-facing documentation in: @@ -190,29 +203,26 @@ unsupported data or semantics are encountered. ### Integer-key Cython coverage -Current Cython integer coverage is focused on `int32` keys. Future work should -replace this with fused-type or equivalent kernels covering: - -- `int8`, `uint8`; -- `int16`, `uint16`; -- `int32`, `uint32`; -- `int64`, `uint64` with compact-range checks. - -For dense group-by, the key range matters more than the dtype. Smaller integer -types are naturally compact and should be low-risk fast paths. +Completed for dense compact single-key group-by with fused kernels covering +`int8`, `uint8`, `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64`. +The dense path still falls back for negative non-null keys and non-compact key +ranges. ### More Cython aggregations -Current Cython kernels primarily accelerate single-key `float64 sum`. -Future kernels should cover: +Completed for dense compact integer/dictionary-code single keys: - `size`; - `count`; +- `sum`; - `mean` via sum/count; - `min`; -- `max`; -- multiple aggregations in a single fused pass; -- multiple value columns. +- `max`. + +Remaining possible extensions in this area: + +- fuse multiple aggregations/value columns into one Cython pass; +- broaden value-type coverage beyond float64/int64 normalized kernels. 
### Arbitrary float-key hash table diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index dfeda79d..9222fbd4 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -175,6 +175,9 @@ def _validate_value_column(self, name: str) -> None: def _execute(self, specs: list[_AggSpec]): self._validate_output_names(specs) + fast = self._try_execute_cython_dense_int_key(specs) + if fast is not None: + return fast fast = self._try_execute_cython_i32_f64_sum(specs) if fast is not None: return fast @@ -224,6 +227,300 @@ def _execute(self, specs: list[_AggSpec]): rows = self._final_rows(acc, key_values, specs) return self._build_result(rows, specs) + def _try_execute_cython_dense_int_key(self, specs: list[_AggSpec]): # noqa: C901 + """Cython fast path for one compact integer/dictionary key and dense aggregations.""" + if len(self.keys) != 1: + return None + key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + key_is_dict = self.table._is_dictionary_column(key_info) + if key_is_dict: + key_arr = self.table._cols[key_name].codes + key_dtype = np.dtype(np.int32) + skip_key_null = self.dropna + key_null = int(key_info.spec.null_code) + else: + key_arr = self.table._cols[key_name] + key_dtype = getattr(key_info.spec, "dtype", None) + if key_dtype is None: + return None + key_dtype = np.dtype(key_dtype) + if key_dtype.kind not in "biu": + return None + key_null_value = getattr(key_info.spec, "null_value", None) + skip_key_null = self.dropna and key_null_value is not None + key_null = 0 if key_null_value is None else int(key_null_value) + + try: + from blosc2 import groupby_ext + except ImportError: + return None + + descriptors = [] + for spec in specs: + desc: dict[str, Any] = {"spec": spec, "op": spec.op} + if spec.op == "size": + kernel = getattr(groupby_ext, "groupby_dense_int_size_checked", None) + if kernel is None: + return None + desc.update({"kernel": kernel, "state_kind": "counts"}) + descriptors.append(desc) + continue + + if 
spec.input_col is None: + return None + value_info = self.table._schema.columns_by_name[spec.input_col] + value_dtype = getattr(value_info.spec, "dtype", None) + if value_dtype is None: + return None + value_dtype = np.dtype(value_dtype) + null_value = getattr(value_info.spec, "null_value", None) + + if spec.op == "count": + kernel = getattr(groupby_ext, "groupby_dense_int_count_checked", None) + if kernel is None: + return None + desc.update({"kernel": kernel, "state_kind": "counts", "value_dtype": value_dtype}) + elif spec.op in {"sum", "mean", "min", "max"}: + if value_dtype.kind == "f": + skip_nan = isinstance(null_value, float) and math.isnan(null_value) + if null_value is not None and not skip_nan: + return None + suffix = "sum" if spec.op == "sum" else spec.op + kernel = getattr(groupby_ext, f"groupby_dense_int_f64_{suffix}_checked", None) + if kernel is None: + return None + desc.update( + { + "kernel": kernel, + "value_dtype": np.float64, + "value_kind": "f64", + "skip_nan": skip_nan, + } + ) + elif value_dtype.kind in "biu": + if null_value is not None: + return None + if spec.op == "mean": + kernel = getattr(groupby_ext, "groupby_dense_int_f64_mean_checked", None) + if kernel is None: + return None + desc.update( + { + "kernel": kernel, + "value_dtype": np.float64, + "value_kind": "f64", + "skip_nan": False, + } + ) + else: + kernel = getattr(groupby_ext, f"groupby_dense_int_i64_{spec.op}_checked", None) + if kernel is None: + return None + desc.update( + { + "kernel": kernel, + "value_dtype": np.int64, + "value_kind": "i64", + "skip_nan": False, + } + ) + else: + return None + if spec.op in {"sum", "min", "max"}: + desc["state_kind"] = "value_present" if spec.op == "sum" else "extreme" + elif spec.op == "mean": + desc["state_kind"] = "mean" + else: + return None + descriptors.append(desc) + + compact_limit = 10_000_000 + keys_present = np.zeros(0, dtype=bool) + states: dict[str, Any] = {} + for desc in descriptors: + spec = desc["spec"] + if 
desc["state_kind"] == "counts": + states[spec.output_col] = np.zeros(0, dtype=np.int64) + elif desc["state_kind"] == "mean": + states[spec.output_col] = (np.zeros(0, dtype=np.float64), np.zeros(0, dtype=np.int64)) + elif desc["state_kind"] == "value_present" or desc["state_kind"] == "extreme": + dtype = np.float64 if desc["value_kind"] == "f64" else np.int64 + states[spec.output_col] = (np.zeros(0, dtype=dtype), np.zeros(0, dtype=bool)) + + def ensure_size(size: int) -> bool: + nonlocal keys_present, states + if size > compact_limit: + return False + if size <= len(keys_present): + return True + old = len(keys_present) + keys_present = np.pad(keys_present, (0, size - old), constant_values=False) + for desc in descriptors: + spec = desc["spec"] + state = states[spec.output_col] + if desc["state_kind"] == "counts": + states[spec.output_col] = np.pad(state, (0, size - old), constant_values=0) + else: + first, second = state + states[spec.output_col] = ( + np.pad(first, (0, size - old), constant_values=0), + np.pad( + second, (0, size - old), constant_values=False if second.dtype == np.bool_ else 0 + ), + ) + return True + + def call_checked(kernel, *args) -> bool: + return int(kernel(*args)) == 0 + + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + keys = np.asarray(key_arr[start:stop], dtype=np.int8 if key_dtype.kind == "b" else key_dtype) + keys = np.ascontiguousarray(keys) + valid = np.ascontiguousarray(valid) + live = valid.copy() + if skip_key_null: + live &= keys != key_null + if not np.any(live): + continue + live_keys = keys[live] + if np.min(live_keys) < 0: + return None + max_key = int(np.max(live_keys)) + if not ensure_size(max_key + 1): + return None + + for desc in descriptors: + spec = desc["spec"] + state = states[spec.output_col] + if 
spec.op == "size": + if not call_checked( + desc["kernel"], keys, valid, state, keys_present, skip_key_null, key_null + ): + return None + elif spec.op == "count": + values = np.asarray(self.table._cols[spec.input_col][start:stop]) + values_valid = np.ascontiguousarray( + ~self._null_mask(spec.input_col, values, is_key=False) + ) + if not call_checked( + desc["kernel"], + keys, + valid, + values_valid, + state, + keys_present, + skip_key_null, + key_null, + ): + return None + elif spec.op == "sum": + values = np.asarray( + self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"] + ) + values = np.ascontiguousarray(values) + sums, value_present = state + args = ( + keys, + values, + valid, + sums, + value_present, + keys_present, + skip_key_null, + key_null, + ) + if desc["value_kind"] == "f64": + args = (*args, desc["skip_nan"]) + if not call_checked(desc["kernel"], *args): + return None + elif spec.op == "mean": + values = np.asarray( + self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"] + ) + values = np.ascontiguousarray(values) + sums, counts = state + if not call_checked( + desc["kernel"], + keys, + values, + valid, + sums, + counts, + keys_present, + skip_key_null, + key_null, + desc["skip_nan"], + ): + return None + elif spec.op in {"min", "max"}: + values = np.asarray( + self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"] + ) + values = np.ascontiguousarray(values) + extremes, has_value = state + args = ( + keys, + values, + valid, + extremes, + has_value, + keys_present, + skip_key_null, + key_null, + ) + if desc["value_kind"] == "f64": + args = (*args, desc["skip_nan"]) + if not call_checked(desc["kernel"], *args): + return None + + group_codes = np.nonzero(keys_present)[0] + if self.sort and key_is_dict: + group_codes = np.array( + sorted( + group_codes, + key=lambda code: _sortable_key_part(self.table._cols[key_name].decode(int(code))), + ), + dtype=group_codes.dtype, + ) + + rows = [] + for code in 
group_codes: + key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else _python_scalar(code) + row = {key_name: key_value} + for desc in descriptors: + spec = desc["spec"] + state = states[spec.output_col] + if spec.op in {"size", "count"}: + row[spec.output_col] = int(state[code]) + elif spec.op == "sum": + sums, value_present = state + row[spec.output_col] = ( + _python_scalar(sums[code]) + if value_present[code] + else _null_output_value(self._result_spec_for_agg(spec)) + ) + elif spec.op == "mean": + sums, counts = state + row[spec.output_col] = ( + math.nan if counts[code] == 0 else float(sums[code]) / int(counts[code]) + ) + elif spec.op in {"min", "max"}: + extremes, has_value = state + row[spec.output_col] = ( + _python_scalar(extremes[code]) + if has_value[code] + else _null_output_value(self._result_spec_for_agg(spec)) + ) + rows.append(row) + return self._build_result(rows, specs) + def _try_execute_cython_i32_f64_sum(self, specs: list[_AggSpec]): # noqa: C901 """Cython fast path for one int32 key and one non-null float64 sum.""" if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort: diff --git a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx index c621d6fc..d8475f87 100644 --- a/src/blosc2/groupby_ext.pyx +++ b/src/blosc2/groupby_ext.pyx @@ -11,7 +11,7 @@ import numpy as np cimport numpy as np -from libc.stdint cimport int32_t, int64_t +from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t # ---------------------------------------------------------------------- @@ -345,3 +345,375 @@ def groupby_dense_f32_integral_key_f64_sum_checked( sums_view[key_i] += value present_view[key_i] = 1 return ret + + +# ---------------------------------------------------------------------- +# Fused integer-key dense kernels +# ---------------------------------------------------------------------- + +ctypedef fused dense_int_key_t: + int8_t + uint8_t + int16_t + uint16_t + 
int32_t + uint32_t + int64_t + uint64_t + + +cdef inline int _dense_int_key_scan( + dense_int_key_t[:] keys_view, + np.npy_bool[:] valid_view, + Py_ssize_t n, + Py_ssize_t nstates, + bint skip_key_null, + int64_t key_null, + int* ret, +) noexcept nogil: + cdef Py_ssize_t i + cdef int64_t key + cdef int64_t max_key = -1 + ret[0] = 0 + for i in range(n): + if not valid_view[i]: + continue + key = keys_view[i] + if skip_key_null and key == key_null: + continue + if key < 0: + ret[0] = -1 + return 0 + if key > max_key: + max_key = key + if max_key < 0: + ret[0] = 0 + elif max_key >= nstates: + if max_key > 2147483646: + ret[0] = -1 + else: + ret[0] = max_key + 1 + return 0 + + +def groupby_dense_int_size_checked( + dense_int_key_t[:] keys, + np.npy_bool[:] valid, + int64_t[:] counts, + np.npy_bool[:] keys_present, + bint skip_key_null=False, + int64_t key_null=0, +): + """Checked dense integer-key ``size`` kernel for all integer key widths.""" + if keys.shape[0] != valid.shape[0]: + raise ValueError("keys and valid must have the same length") + if counts.shape[0] != keys_present.shape[0]: + raise ValueError("counts and keys_present must have the same length") + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = counts.shape[0] + cdef Py_ssize_t i + cdef int64_t key + cdef int ret + with nogil: + _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret) + if ret == 0: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if skip_key_null and key == key_null: + continue + counts[key] += 1 + keys_present[key] = 1 + return ret + + +def groupby_dense_int_count_checked( + dense_int_key_t[:] keys, + np.npy_bool[:] valid, + np.npy_bool[:] values_valid, + int64_t[:] counts, + np.npy_bool[:] keys_present, + bint skip_key_null=False, + int64_t key_null=0, +): + """Checked dense integer-key non-null count kernel.""" + if keys.shape[0] != valid.shape[0] or keys.shape[0] != values_valid.shape[0]: + raise ValueError("keys, valid and 
values_valid must have the same length") + if counts.shape[0] != keys_present.shape[0]: + raise ValueError("counts and keys_present must have the same length") + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = counts.shape[0] + cdef Py_ssize_t i + cdef int64_t key + cdef int ret + with nogil: + _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret) + if ret == 0: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if skip_key_null and key == key_null: + continue + keys_present[key] = 1 + if values_valid[i]: + counts[key] += 1 + return ret + + +def groupby_dense_int_f64_sum_checked( + dense_int_key_t[:] keys, + double[:] values, + np.npy_bool[:] valid, + double[:] sums, + np.npy_bool[:] value_present, + np.npy_bool[:] keys_present, + bint skip_key_null=False, + int64_t key_null=0, + bint skip_value_nan=False, +): + """Checked dense integer-key float64 sum kernel.""" + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.shape[0] != value_present.shape[0] or sums.shape[0] != keys_present.shape[0]: + raise ValueError("state arrays must have the same length") + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef int64_t key + cdef double value + cdef int ret + with nogil: + _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret) + if ret == 0: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if skip_key_null and key == key_null: + continue + keys_present[key] = 1 + value = values[i] + if skip_value_nan and value != value: + continue + sums[key] += value + value_present[key] = 1 + return ret + + +def groupby_dense_int_i64_sum_checked( + dense_int_key_t[:] keys, + int64_t[:] values, + np.npy_bool[:] valid, + int64_t[:] sums, + np.npy_bool[:] value_present, + np.npy_bool[:] keys_present, + bint skip_key_null=False, + int64_t 
key_null=0, +): + """Checked dense integer-key int64 sum kernel.""" + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.shape[0] != value_present.shape[0] or sums.shape[0] != keys_present.shape[0]: + raise ValueError("state arrays must have the same length") + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef int64_t key + cdef int ret + with nogil: + _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret) + if ret == 0: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if skip_key_null and key == key_null: + continue + keys_present[key] = 1 + sums[key] += values[i] + value_present[key] = 1 + return ret + + +def groupby_dense_int_f64_mean_checked( + dense_int_key_t[:] keys, + double[:] values, + np.npy_bool[:] valid, + double[:] sums, + int64_t[:] counts, + np.npy_bool[:] keys_present, + bint skip_key_null=False, + int64_t key_null=0, + bint skip_value_nan=False, +): + """Checked dense integer-key float64 mean state kernel.""" + if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]: + raise ValueError("keys, values and valid must have the same length") + if sums.shape[0] != counts.shape[0] or sums.shape[0] != keys_present.shape[0]: + raise ValueError("state arrays must have the same length") + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = sums.shape[0] + cdef Py_ssize_t i + cdef int64_t key + cdef double value + cdef int ret + with nogil: + _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret) + if ret == 0: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if skip_key_null and key == key_null: + continue + keys_present[key] = 1 + value = values[i] + if skip_value_nan and value != value: + continue + sums[key] += value + counts[key] += 1 + return ret + + +def groupby_dense_int_f64_min_checked( + 
dense_int_key_t[:] keys, + double[:] values, + np.npy_bool[:] valid, + double[:] mins, + np.npy_bool[:] has_value, + np.npy_bool[:] keys_present, + bint skip_key_null=False, + int64_t key_null=0, + bint skip_value_nan=False, +): + """Checked dense integer-key float64 min kernel.""" + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = mins.shape[0] + cdef Py_ssize_t i + cdef int64_t key + cdef double value + cdef int ret + with nogil: + _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret) + if ret == 0: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if skip_key_null and key == key_null: + continue + keys_present[key] = 1 + value = values[i] + if skip_value_nan and value != value: + continue + if not has_value[key] or value < mins[key]: + mins[key] = value + has_value[key] = 1 + return ret + + +def groupby_dense_int_f64_max_checked( + dense_int_key_t[:] keys, + double[:] values, + np.npy_bool[:] valid, + double[:] maxs, + np.npy_bool[:] has_value, + np.npy_bool[:] keys_present, + bint skip_key_null=False, + int64_t key_null=0, + bint skip_value_nan=False, +): + """Checked dense integer-key float64 max kernel.""" + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = maxs.shape[0] + cdef Py_ssize_t i + cdef int64_t key + cdef double value + cdef int ret + with nogil: + _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret) + if ret == 0: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if skip_key_null and key == key_null: + continue + keys_present[key] = 1 + value = values[i] + if skip_value_nan and value != value: + continue + if not has_value[key] or value > maxs[key]: + maxs[key] = value + has_value[key] = 1 + return ret + + +def groupby_dense_int_i64_min_checked( + dense_int_key_t[:] keys, + int64_t[:] values, + np.npy_bool[:] valid, + int64_t[:] mins, + np.npy_bool[:] has_value, + np.npy_bool[:] keys_present, + bint skip_key_null=False, + int64_t key_null=0, 
+): + """Checked dense integer-key int64 min kernel.""" + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = mins.shape[0] + cdef Py_ssize_t i + cdef int64_t key + cdef int64_t value + cdef int ret + with nogil: + _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret) + if ret == 0: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if skip_key_null and key == key_null: + continue + keys_present[key] = 1 + value = values[i] + if not has_value[key] or value < mins[key]: + mins[key] = value + has_value[key] = 1 + return ret + + +def groupby_dense_int_i64_max_checked( + dense_int_key_t[:] keys, + int64_t[:] values, + np.npy_bool[:] valid, + int64_t[:] maxs, + np.npy_bool[:] has_value, + np.npy_bool[:] keys_present, + bint skip_key_null=False, + int64_t key_null=0, +): + """Checked dense integer-key int64 max kernel.""" + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t nstates = maxs.shape[0] + cdef Py_ssize_t i + cdef int64_t key + cdef int64_t value + cdef int ret + with nogil: + _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret) + if ret == 0: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if skip_key_null and key == key_null: + continue + keys_present[key] = 1 + value = values[i] + if not has_value[key] or value > maxs[key]: + maxs[key] = value + has_value[key] = 1 + return ret diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py index 306d6522..ec4f2185 100644 --- a/tests/ctable/test_groupby.py +++ b/tests/ctable/test_groupby.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: BSD-3-Clause ####################################################################### -from dataclasses import dataclass +from dataclasses import dataclass, make_dataclass import numpy as np import pytest @@ -225,3 +225,69 @@ def test_groupby_rejects_bad_engine(): with pytest.raises(ValueError): t.group_by("city", engine="cython") + + +@pytest.mark.parametrize( + 
("schema_factory", "values"), + [ + (blosc2.int8, [0, 2, 1, 2, 0]), + (blosc2.uint8, [0, 2, 1, 2, 0]), + (blosc2.int16, [0, 2, 1, 2, 0]), + (blosc2.uint16, [0, 2, 1, 2, 0]), + (blosc2.int32, [0, 2, 1, 2, 0]), + (blosc2.uint32, [0, 2, 1, 2, 0]), + (blosc2.int64, [0, 2, 1, 2, 0]), + (blosc2.uint64, [0, 2, 1, 2, 0]), + ], +) +def test_groupby_cython_fused_integer_key_dtypes(schema_factory, values): + row_type = make_dataclass( + f"FusedKey{schema_factory.__name__}Row", + [ + ("key", int, blosc2.field(schema_factory())), + ("value", int, blosc2.field(blosc2.int32())), + ], + ) + t = CTable(row_type, new_data=list(zip(values, [1, 10, 2, 3, 4], strict=True))) + + out = t.group_by("key", sort=True).agg({"value": "sum"}) + + assert rows(out) == [(0, 5), (1, 2), (2, 13)] + + +def test_groupby_cython_integer_key_more_integer_aggs(): + row_type = make_dataclass( + "IntKeyMoreIntegerAggsRow", + [ + ("key", int, blosc2.field(blosc2.int16())), + ("value", int, blosc2.field(blosc2.int32())), + ], + ) + t = CTable(row_type, new_data=[(0, 5), (1, 10), (0, -2), (1, 20), (2, 7)]) + + out = t.group_by("key", sort=True).agg({"*": "size", "value": ["count", "sum", "mean", "min", "max"]}) + + assert rows(out) == [(0, 2, 2, 3, 1.5, -2, 5), (1, 2, 2, 30, 15.0, 10, 20), (2, 1, 1, 7, 7.0, 7, 7)] + + +def test_groupby_cython_integer_key_nullable_float_aggs(): + row_type = make_dataclass( + "IntKeyNullableFloatAggsRow", + [ + ("key", int, blosc2.field(blosc2.uint16())), + ("value", float, blosc2.field(blosc2.float64(nullable=True))), + ], + ) + t = CTable(row_type, new_data=[(0, 1.5), (1, np.nan), (0, 2.5), (1, np.nan), (2, 10.0)]) + + out = t.group_by("key", sort=True).agg({"value": ["count", "sum", "mean", "min", "max"]}) + + got = rows(out) + assert got[0] == (0, 2, 4.0, 2.0, 1.5, 2.5) + assert got[1][0] == 1 + assert got[1][1] == 0 + assert np.isnan(got[1][2]) + assert np.isnan(got[1][3]) + assert np.isnan(got[1][4]) + assert np.isnan(got[1][5]) + assert got[2] == (2, 1, 10.0, 10.0, 10.0, 
10.0) From 03b3583eeba653a397ec7a51014e454a071b8183 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 12:31:39 +0200 Subject: [PATCH 09/17] Implemented arbitrary float hash table for floating point acceleration --- bench/ctable/groupby.py | 9 +- plans/ctable-groupby.md | 24 ++-- src/blosc2/groupby.py | 132 +++++++++++++++++++++ src/blosc2/groupby_ext.pyx | 221 +++++++++++++++++++++++++++++++++++ tests/ctable/test_groupby.py | 28 +++++ 5 files changed, 404 insertions(+), 10 deletions(-) diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py index 14bf4b06..dfa7d863 100644 --- a/bench/ctable/groupby.py +++ b/bench/ctable/groupby.py @@ -5,6 +5,7 @@ -------- python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --op sum python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --key-dtype float64 --op sum +# float key dtypes generate non-integral repeated labels to exercise the float hash path python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas """ @@ -67,7 +68,11 @@ def make_data(nrows: int, ngroups: int, dictionary: bool, key_dtype: str, seed: if dictionary: keys = np.asarray([f"k{code}" for code in key_codes], dtype=object) elif key_dtype in {"float32", "float64"}: - keys = key_codes.astype(np.dtype(key_dtype)) + # Use non-integral, repeated float labels by default so float-key + # benchmarks exercise the arbitrary-float hash path instead of the + # dense integral-float fast path. + labels = key_codes.astype(np.float64) + 0.25 + keys = labels.astype(np.dtype(key_dtype)) else: keys = key_codes.astype(np.dtype(key_dtype), copy=False) return keys, values @@ -94,7 +99,7 @@ def main() -> None: "float64", ], default="int32", - help="Physical dtype for non-dictionary keys. Float keys are generated from group codes cast to float.", + help="Physical dtype for non-dictionary keys. 
Float keys are generated as non-integral repeated labels.", ) parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum") parser.add_argument("--sort", action="store_true") diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md index 69eb7547..b0f91276 100644 --- a/plans/ctable-groupby.md +++ b/plans/ctable-groupby.md @@ -226,17 +226,25 @@ Remaining possible extensions in this area: ### Arbitrary float-key hash table -Current float Cython fast paths handle integral float32/float64 keys only. A -true float-key hash table would support arbitrary float keys without sorting or -`np.unique`. +Implemented a conservative Cython open-addressing hash path for single +`float32`/`float64` keys with float value aggregations. It supports `size`, +`count`, `sum`, `mean`, `min`, and `max` for supported single-value-column +queries and falls back otherwise. -Required semantic decisions/handling: +Implemented semantics: - `dropna=True`: skip NaN keys; -- `dropna=False`: all NaN keys should form one group; -- `+0.0` and `-0.0` should likely be the same group; -- infinities are valid groups; -- nullable float sentinels must be normalized consistently. +- `dropna=False`: all NaN keys form one group; +- `+0.0` and `-0.0` are normalized into the same group; +- infinities are valid groups through regular float bit hashing; +- NaN-null float values are skipped for value aggregations. + +Remaining possible extensions: + +- support non-float value columns in the hash path without normalizing through + float64; +- fuse multiple value columns directly in one hash-table pass; +- add explicit memory/cardinality safeguards for very high-cardinality floats. 
### Multi-key Cython hash path diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index 9222fbd4..28a6a03a 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -182,6 +182,9 @@ def _execute(self, specs: list[_AggSpec]): if fast is not None: return fast fast = self._try_execute_cython_float_integral_key_f64_sum(specs) + if fast is not None: + return fast + fast = self._try_execute_cython_float_hash(specs) if fast is not None: return fast fast = self._try_execute_dense_single_int_key(specs) @@ -596,6 +599,135 @@ def ensure_size(size: int) -> bool: rows.append({key_name: key_value, spec.output_col: float(sums[code])}) return self._build_result(rows, specs) + def _try_execute_cython_float_hash(self, specs: list[_AggSpec]): # noqa: C901 + """Cython hash path for one arbitrary float key. + + This covers float32/float64 keys that are not suitable for dense + integral-key indexing. It currently supports float value columns for + value reductions and falls back for unsupported mixed/multi-column cases. + """ + if len(self.keys) != 1: + return None + key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + if self.table._is_dictionary_column(key_info): + return None + key_dtype = getattr(key_info.spec, "dtype", None) + if key_dtype not in {np.dtype(np.float32), np.dtype(np.float64)}: + return None + + value_cols = {s.input_col for s in specs if s.input_col is not None} + if len(value_cols) > 1: + return None + value_col = next(iter(value_cols), None) + value_dtype = None + nullable_nan_value = False + if value_col is not None: + value_info = self.table._schema.columns_by_name[value_col] + value_dtype = getattr(value_info.spec, "dtype", None) + # Count can operate on any fixed-width value column via values_valid, + # but other reductions in this hash kernel normalize values to f64. 
+ if any(s.op in {"sum", "mean", "min", "max"} for s in specs): + if value_dtype is None or np.dtype(value_dtype).kind != "f": + return None + null_value = getattr(value_info.spec, "null_value", None) + nullable_nan_value = isinstance(null_value, float) and math.isnan(null_value) + if null_value is not None and not nullable_nan_value: + return None + + try: + from blosc2 import groupby_ext + except ImportError: + return None + kernel = getattr(groupby_ext, "groupby_hash_f64_f64", None) + if kernel is None: + return None + + acc: dict[Any, dict[str, _AggState]] = {} + key_values: dict[Any, tuple[Any, ...]] = {} + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + keys = np.ascontiguousarray(np.asarray(self.table._cols[key_name][start:stop], dtype=np.float64)) + if value_col is None: + values = np.empty(len(keys), dtype=np.float64) + values_valid = np.zeros(len(keys), dtype=bool) + has_values = False + else: + raw_values = np.asarray(self.table._cols[value_col][start:stop]) + if any(s.op in {"sum", "mean", "min", "max"} for s in specs): + values = np.ascontiguousarray(raw_values.astype(np.float64, copy=False)) + else: + values = np.empty(len(keys), dtype=np.float64) + values_valid = np.ascontiguousarray(~self._null_mask(value_col, raw_values, is_key=False)) + has_values = True + + ( + chunk_keys, + row_counts, + value_counts, + sums, + mins, + maxs, + has_value, + ) = kernel(keys, values, np.ascontiguousarray(valid), values_valid, has_values, self.dropna) + + for i, key in enumerate(chunk_keys): + key_scalar = np.asarray(key, dtype=key_dtype).item() + norm_key = _normalize_key_part(float(key_scalar)) + states = acc.setdefault(norm_key, {}) + key_values.setdefault(norm_key, (key_scalar,)) + for spec in specs: + state = 
states.setdefault(spec.output_col, _AggState(spec.op)) + if spec.op == "size": + state.value = (0 if state.value is None else state.value) + int(row_counts[i]) + elif spec.op == "count": + state.value = (0 if state.value is None else state.value) + int(value_counts[i]) + elif spec.op == "sum" or spec.op == "mean": + if has_value[i]: + state.value = (0.0 if state.value is None else state.value) + float(sums[i]) + state.count += int(value_counts[i]) + elif spec.op == "min": + if has_value[i]: + value = float(mins[i]) + if state.count == 0 or value < state.value: + state.value = value + state.count += 1 + elif spec.op == "max" and has_value[i]: + value = float(maxs[i]) + if state.count == 0 or value > state.value: + state.value = value + state.count += 1 + + # Hash-table iteration order is intentionally not exposed. Emit float + # hash groups in key order for deterministic results and compatibility + # with the previous NumPy fallback behavior for these cases. + ordered_keys = list(acc) + ordered_keys.sort( + key=lambda k: tuple( + (1, "") if isinstance(v, float) and math.isnan(v) else (0, v) for v in key_values[k] + ) + ) + rows = [] + for norm_key in ordered_keys: + row = dict(zip(self.keys, key_values[norm_key], strict=True)) + states = acc[norm_key] + for spec in specs: + state = states[spec.output_col] + if spec.op == "mean": + row[spec.output_col] = math.nan if state.count == 0 else state.value / state.count + elif spec.op in {"sum", "min", "max"} and state.count == 0: + row[spec.output_col] = _null_output_value(self._result_spec_for_agg(spec)) + else: + row[spec.output_col] = 0 if state.value is None else state.value + rows.append(row) + return self._build_result(rows, specs) + def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]): # noqa: C901 """Cython fast path for integral float32/float64 keys and one non-null float64 sum.""" if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort: diff --git 
a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx index d8475f87..03b18894 100644 --- a/src/blosc2/groupby_ext.pyx +++ b/src/blosc2/groupby_ext.pyx @@ -12,6 +12,8 @@ import numpy as np cimport numpy as np from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy # ---------------------------------------------------------------------- @@ -717,3 +719,222 @@ def groupby_dense_int_i64_max_checked( maxs[key] = value has_value[key] = 1 return ret + + +# ---------------------------------------------------------------------- +# Arbitrary float-key hash kernels +# ---------------------------------------------------------------------- + +cdef inline uint64_t _f64_bits(double value) noexcept: + cdef uint64_t bits + memcpy(&bits, &value, sizeof(double)) + return bits + + +cdef inline uint64_t _mix_u64(uint64_t x) noexcept: + x ^= x >> 30 + x *= 0xbf58476d1ce4e5b9 + x ^= x >> 27 + x *= 0x94d049bb133111eb + x ^= x >> 31 + return x + + +def groupby_hash_f64_f64( + double[:] keys, + double[:] values, + np.npy_bool[:] valid, + np.npy_bool[:] values_valid, + bint has_values, + bint dropna=True, +): + """Hash arbitrary float64 keys and accumulate float64 group states. + + Returns ``(keys, row_counts, value_counts, sums, mins, maxs, has_value)``. + NaN keys are skipped when ``dropna`` is true; otherwise all NaN bit-patterns + are normalized into one NaN group. ``+0.0`` and ``-0.0`` are normalized into + the same zero group. 
+ """ + if keys.shape[0] != valid.shape[0]: + raise ValueError("keys and valid must have the same length") + if has_values and (values.shape[0] != keys.shape[0] or values_valid.shape[0] != keys.shape[0]): + raise ValueError("values, values_valid and keys must have the same length") + + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t cap = 1024 + cdef Py_ssize_t used_count = 0 + cdef Py_ssize_t i, pos, old_pos, out_pos + cdef uint64_t mask = cap - 1 + cdef uint64_t bits, h, old_bits + cdef double key, value + cdef double nan_value = float("nan") + cdef uint64_t nan_bits = 0x7ff8000000000000 + cdef bint value_ok + + cdef uint64_t* table_bits = malloc(cap * sizeof(uint64_t)) + cdef np.npy_bool* table_used = malloc(cap * sizeof(np.npy_bool)) + cdef double* table_keys = malloc(cap * sizeof(double)) + cdef int64_t* row_counts = malloc(cap * sizeof(int64_t)) + cdef int64_t* value_counts = malloc(cap * sizeof(int64_t)) + cdef double* sums = malloc(cap * sizeof(double)) + cdef double* mins = malloc(cap * sizeof(double)) + cdef double* maxs = malloc(cap * sizeof(double)) + cdef np.npy_bool* has_value = malloc(cap * sizeof(np.npy_bool)) + + cdef uint64_t* new_bits + cdef np.npy_bool* new_used + cdef double* new_keys + cdef int64_t* new_row_counts + cdef int64_t* new_value_counts + cdef double* new_sums + cdef double* new_mins + cdef double* new_maxs + cdef np.npy_bool* new_has_value + cdef Py_ssize_t old_cap + cdef uint64_t new_mask + + if ( + table_bits == NULL + or table_used == NULL + or table_keys == NULL + or row_counts == NULL + or value_counts == NULL + or sums == NULL + or mins == NULL + or maxs == NULL + or has_value == NULL + ): + free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) + raise MemoryError() + + for i in range(cap): + table_used[i] = 0 + + try: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if key != key: + if dropna: + continue + bits = 
nan_bits + key = nan_value + elif key == 0.0: + key = 0.0 + bits = 0 + else: + bits = _f64_bits(key) + + if (used_count + 1) * 2 >= cap: + old_cap = cap + cap *= 2 + mask = cap - 1 + new_bits = malloc(cap * sizeof(uint64_t)) + new_used = malloc(cap * sizeof(np.npy_bool)) + new_keys = malloc(cap * sizeof(double)) + new_row_counts = malloc(cap * sizeof(int64_t)) + new_value_counts = malloc(cap * sizeof(int64_t)) + new_sums = malloc(cap * sizeof(double)) + new_mins = malloc(cap * sizeof(double)) + new_maxs = malloc(cap * sizeof(double)) + new_has_value = malloc(cap * sizeof(np.npy_bool)) + if ( + new_bits == NULL + or new_used == NULL + or new_keys == NULL + or new_row_counts == NULL + or new_value_counts == NULL + or new_sums == NULL + or new_mins == NULL + or new_maxs == NULL + or new_has_value == NULL + ): + free(new_bits); free(new_used); free(new_keys); free(new_row_counts); free(new_value_counts) + free(new_sums); free(new_mins); free(new_maxs); free(new_has_value) + raise MemoryError() + for pos in range(cap): + new_used[pos] = 0 + for old_pos in range(old_cap): + if not table_used[old_pos]: + continue + old_bits = table_bits[old_pos] + h = _mix_u64(old_bits) + pos = (h & mask) + while new_used[pos]: + pos = ((pos + 1) & mask) + new_used[pos] = 1 + new_bits[pos] = old_bits + new_keys[pos] = table_keys[old_pos] + new_row_counts[pos] = row_counts[old_pos] + new_value_counts[pos] = value_counts[old_pos] + new_sums[pos] = sums[old_pos] + new_mins[pos] = mins[old_pos] + new_maxs[pos] = maxs[old_pos] + new_has_value[pos] = has_value[old_pos] + free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) + table_bits = new_bits + table_used = new_used + table_keys = new_keys + row_counts = new_row_counts + value_counts = new_value_counts + sums = new_sums + mins = new_mins + maxs = new_maxs + has_value = new_has_value + + h = _mix_u64(bits) + pos = (h & mask) + while table_used[pos] 
and table_bits[pos] != bits: + pos = ((pos + 1) & mask) + if not table_used[pos]: + table_used[pos] = 1 + table_bits[pos] = bits + table_keys[pos] = key + row_counts[pos] = 0 + value_counts[pos] = 0 + sums[pos] = 0.0 + mins[pos] = 0.0 + maxs[pos] = 0.0 + has_value[pos] = 0 + used_count += 1 + + row_counts[pos] += 1 + if has_values: + value_ok = values_valid[i] + if value_ok: + value = values[i] + value_counts[pos] += 1 + sums[pos] += value + if not has_value[pos] or value < mins[pos]: + mins[pos] = value + if not has_value[pos] or value > maxs[pos]: + maxs[pos] = value + has_value[pos] = 1 + + out_keys = np.empty(used_count, dtype=np.float64) + out_row_counts = np.empty(used_count, dtype=np.int64) + out_value_counts = np.empty(used_count, dtype=np.int64) + out_sums = np.empty(used_count, dtype=np.float64) + out_mins = np.empty(used_count, dtype=np.float64) + out_maxs = np.empty(used_count, dtype=np.float64) + out_has_value = np.empty(used_count, dtype=bool) + + out_pos = 0 + for pos in range(cap): + if not table_used[pos]: + continue + out_keys[out_pos] = table_keys[pos] + out_row_counts[out_pos] = row_counts[pos] + out_value_counts[out_pos] = value_counts[pos] + out_sums[out_pos] = sums[pos] + out_mins[out_pos] = mins[pos] + out_maxs[out_pos] = maxs[pos] + out_has_value[out_pos] = has_value[pos] + out_pos += 1 + return out_keys, out_row_counts, out_value_counts, out_sums, out_mins, out_maxs, out_has_value + finally: + free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py index ec4f2185..b085c92f 100644 --- a/tests/ctable/test_groupby.py +++ b/tests/ctable/test_groupby.py @@ -291,3 +291,31 @@ def test_groupby_cython_integer_key_nullable_float_aggs(): assert np.isnan(got[1][4]) assert np.isnan(got[1][5]) assert got[2] == (2, 1, 10.0, 10.0, 10.0, 10.0) + + +def 
test_groupby_cython_arbitrary_float_key_aggs(): + t = CTable( + Float64KeyRow, + new_data=[(0.5, 1.0), (1.25, 10.0), (0.5, 3.0), (-2.5, 4.0), (1.25, 2.0)], + ) + + out = t.group_by("key").agg({"value": ["count", "sum", "mean", "min", "max"]}) + + assert rows(out) == [ + (-2.5, 1, 4.0, 4.0, 4.0, 4.0), + (0.5, 2, 4.0, 2.0, 1.0, 3.0), + (1.25, 2, 12.0, 6.0, 2.0, 10.0), + ] + + +def test_groupby_cython_arbitrary_float_key_nan_and_signed_zero(): + t = CTable(Float64KeyRow, new_data=[(-0.0, 1.0), (0.0, 2.0), (np.nan, 3.0), (np.nan, 4.0)]) + + dropped = t.group_by("key").agg({"value": "sum"}) + kept = t.group_by("key", dropna=False).agg({"value": "sum"}) + + assert rows(dropped) == [(0.0, 3.0)] + got = rows(kept) + assert got[0] == (0.0, 3.0) + assert np.isnan(got[1][0]) + assert got[1][1] == 7.0 From 840a469dd804f12ee3b8c2817fa2cd118feb6c91 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 12:44:49 +0200 Subject: [PATCH 10/17] Add Cython group-by fast paths Implement fused integer-key dense kernels for all integer widths, add Cython aggregations beyond sum, introduce arbitrary float-key hashing, and add a conservative two-key integer/dictionary hash path. Extend group-by benchmarks and tests for the new optimized cases. 
--- bench/ctable/groupby.py | 108 +++++++++++++------- plans/ctable-groupby.md | 18 +++- src/blosc2/groupby.py | 136 ++++++++++++++++++++++++++ src/blosc2/groupby_ext.pyx | 184 +++++++++++++++++++++++++++++++++++ tests/ctable/test_groupby.py | 39 ++++++++ 5 files changed, 446 insertions(+), 39 deletions(-) diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py index dfa7d863..41929563 100644 --- a/bench/ctable/groupby.py +++ b/bench/ctable/groupby.py @@ -7,6 +7,7 @@ python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --key-dtype float64 --op sum # float key dtypes generate non-integral repeated labels to exercise the float hash path python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas +python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --groups2 100 --multi-key --op sum """ from __future__ import annotations @@ -25,35 +26,57 @@ def parse_int(text: str) -> int: return int(text.replace("_", "")) -def build_row_type(dictionary: bool, key_dtype: str): - if dictionary: +def build_row_type(dictionary: bool, key_dtype: str, multi_key: bool): + if dictionary and multi_key: @dataclasses.dataclass class Row: - key: str = blosc2.field(blosc2.dictionary()) + key0: str = blosc2.field(blosc2.dictionary()) + key1: int = blosc2.field(blosc2.int32()) value: float = blosc2.field(blosc2.float64()) - elif key_dtype in {"int8", "uint8", "int16", "uint16", "int32", "uint32", "int64", "uint64"}: - key_spec = getattr(blosc2, key_dtype)() + elif dictionary: @dataclasses.dataclass class Row: - key: int = blosc2.field(key_spec) + key: str = blosc2.field(blosc2.dictionary()) value: float = blosc2.field(blosc2.float64()) - elif key_dtype == "float32": + elif key_dtype in {"int8", "uint8", "int16", "uint16", "int32", "uint32", "int64", "uint64"}: + key_spec = getattr(blosc2, key_dtype)() - @dataclasses.dataclass - class Row: - key: float = blosc2.field(blosc2.float32()) - value: float = blosc2.field(blosc2.float64()) + if multi_key: 
- elif key_dtype == "float64": + @dataclasses.dataclass + class Row: + key0: int = blosc2.field(key_spec) + key1: int = blosc2.field(key_spec) + value: float = blosc2.field(blosc2.float64()) - @dataclasses.dataclass - class Row: - key: float = blosc2.field(blosc2.float64()) - value: float = blosc2.field(blosc2.float64()) + else: + + @dataclasses.dataclass + class Row: + key: int = blosc2.field(key_spec) + value: float = blosc2.field(blosc2.float64()) + + elif key_dtype in {"float32", "float64"}: + key_spec = blosc2.float32() if key_dtype == "float32" else blosc2.float64() + + if multi_key: + + @dataclasses.dataclass + class Row: + key0: float = blosc2.field(key_spec) + key1: float = blosc2.field(key_spec) + value: float = blosc2.field(blosc2.float64()) + + else: + + @dataclasses.dataclass + class Row: + key: float = blosc2.field(key_spec) + value: float = blosc2.field(blosc2.float64()) else: # pragma: no cover - argparse choices prevent this raise ValueError(f"unsupported key dtype {key_dtype!r}") @@ -61,27 +84,37 @@ class Row: return Row -def make_data(nrows: int, ngroups: int, dictionary: bool, key_dtype: str, seed: int): - rng = np.random.default_rng(seed) - key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32) - values = rng.random(nrows, dtype=np.float64) +def make_key_data(key_codes: np.ndarray, dictionary: bool, key_dtype: str): if dictionary: - keys = np.asarray([f"k{code}" for code in key_codes], dtype=object) - elif key_dtype in {"float32", "float64"}: + return np.asarray([f"k{code}" for code in key_codes], dtype=object) + if key_dtype in {"float32", "float64"}: # Use non-integral, repeated float labels by default so float-key # benchmarks exercise the arbitrary-float hash path instead of the # dense integral-float fast path. 
labels = key_codes.astype(np.float64) + 0.25 - keys = labels.astype(np.dtype(key_dtype)) - else: - keys = key_codes.astype(np.dtype(key_dtype), copy=False) - return keys, values + return labels.astype(np.dtype(key_dtype)) + return key_codes.astype(np.dtype(key_dtype), copy=False) + + +def make_data(nrows: int, ngroups: int, ngroups2: int, dictionary: bool, key_dtype: str, multi_key: bool, seed: int): + rng = np.random.default_rng(seed) + key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32) + values = rng.random(nrows, dtype=np.float64) + if not multi_key: + return {"key": make_key_data(key_codes, dictionary, key_dtype), "value": values} + + key2_codes = rng.integers(0, ngroups2, size=nrows, dtype=np.int32) + key0 = make_key_data(key_codes, dictionary, key_dtype) + key1_dtype = "int32" if dictionary else key_dtype + key1 = make_key_data(key2_codes, False, key1_dtype) + return {"key0": key0, "key1": key1, "value": values} def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--rows", type=parse_int, default=10_000_000) parser.add_argument("--groups", type=parse_int, default=1_000) + parser.add_argument("--groups2", type=parse_int, default=None, help="Number of groups for key1 with --multi-key") parser.add_argument("--chunk-size", type=parse_int, default=None) parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key") parser.add_argument( @@ -102,32 +135,35 @@ def main() -> None: help="Physical dtype for non-dictionary keys. 
Float keys are generated as non-integral repeated labels.", ) parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum") + parser.add_argument("--multi-key", action="store_true", help="Group by two keys: key0 and key1") parser.add_argument("--sort", action="store_true") parser.add_argument("--pandas", action="store_true", help="Also run a pandas comparison if available") parser.add_argument("--urlpath", type=Path, default=None, help="Optional persistent CTable path") parser.add_argument("--seed", type=int, default=0) args = parser.parse_args() + groups2 = args.groups if args.groups2 is None else args.groups2 print( - f"rows={args.rows:,} groups={args.groups:,} dictionary={args.dictionary} " - f"key_dtype={args.key_dtype} op={args.op} sort={args.sort} " + f"rows={args.rows:,} groups={args.groups:,} groups2={groups2:,} multi_key={args.multi_key} " + f"dictionary={args.dictionary} key_dtype={args.key_dtype} op={args.op} sort={args.sort} " f"chunk_size={args.chunk_size} urlpath={args.urlpath}" ) - keys, values = make_data(args.rows, args.groups, args.dictionary, args.key_dtype, args.seed) - Row = build_row_type(args.dictionary, args.key_dtype) + data = make_data(args.rows, args.groups, groups2, args.dictionary, args.key_dtype, args.multi_key, args.seed) + Row = build_row_type(args.dictionary, args.key_dtype, args.multi_key) kwargs = {} if args.urlpath is not None: kwargs.update(urlpath=str(args.urlpath), mode="w") t0 = time.perf_counter() - table = blosc2.CTable(Row, new_data={"key": keys, "value": values}, expected_size=args.rows, **kwargs) + table = blosc2.CTable(Row, new_data=data, expected_size=args.rows, **kwargs) build_time = time.perf_counter() - t0 print(f"ctable_build_seconds={build_time:.6f}") t0 = time.perf_counter() - gb = table.group_by("key", sort=args.sort, chunk_size=args.chunk_size) + group_keys = ["key0", "key1"] if args.multi_key else "key" + gb = table.group_by(group_keys, sort=args.sort, 
chunk_size=args.chunk_size) if args.op == "size": out = gb.size() elif args.op == "count": @@ -144,14 +180,14 @@ def main() -> None: except ImportError: print("pandas_unavailable=true") else: - df = pd.DataFrame({"key": keys, "value": values}) + df = pd.DataFrame(data) t0 = time.perf_counter() if args.op == "size": - pdf = df.groupby("key", sort=args.sort).size() + pdf = df.groupby(group_keys, sort=args.sort).size() elif args.op == "count": - pdf = df.groupby("key", sort=args.sort)["value"].count() + pdf = df.groupby(group_keys, sort=args.sort)["value"].count() else: - pdf = df.groupby("key", sort=args.sort)["value"].agg(args.op) + pdf = df.groupby(group_keys, sort=args.sort)["value"].agg(args.op) pandas_elapsed = time.perf_counter() - t0 print(f"pandas_groupby_seconds={pandas_elapsed:.6f}") print(f"pandas_result_rows={len(pdf):,}") diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md index b0f91276..75a71873 100644 --- a/plans/ctable-groupby.md +++ b/plans/ctable-groupby.md @@ -248,9 +248,21 @@ Remaining possible extensions: ### Multi-key Cython hash path -The generic NumPy path supports multi-key grouping via structured arrays. Future -Cython work could hash directly across multiple key arrays, avoiding structured -key packing, sort-based unique, inverse arrays, and Python merge overhead. +Implemented a conservative Cython hash path for two-key group-by when both keys +are integer or dictionary-code-backed columns. The path normalizes keys to +`int64`, hashes `(key0, key1)` directly, and supports `size`, `count`, `sum`, +`mean`, `min`, and `max` for supported float value reductions. This avoids +structured-array packing and per-chunk `np.unique` for common two-key +categorical/integer workloads. 
+ +Remaining possible extensions: + +- support more than two key columns; +- support float/string fixed-width key components directly; +- support non-float value columns without normalizing value reductions through + float64; +- fuse/merge multi-key states across chunks fully in Cython rather than via the + existing Python accumulator merge. ### FULL-index sorted group-by path diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index 28a6a03a..773b0a30 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -176,6 +176,9 @@ def _validate_value_column(self, name: str) -> None: def _execute(self, specs: list[_AggSpec]): self._validate_output_names(specs) fast = self._try_execute_cython_dense_int_key(specs) + if fast is not None: + return fast + fast = self._try_execute_cython_two_int_key_hash(specs) if fast is not None: return fast fast = self._try_execute_cython_i32_f64_sum(specs) @@ -230,6 +233,139 @@ def _execute(self, specs: list[_AggSpec]): rows = self._final_rows(acc, key_values, specs) return self._build_result(rows, specs) + def _try_execute_cython_two_int_key_hash(self, specs: list[_AggSpec]): # noqa: C901 + """Cython hash path for two integer/dictionary-code keys.""" + if len(self.keys) != 2: + return None + + key_arrays = [] + key_is_dict = [] + key_nulls = [] + skip_key_nulls = [] + for key_name in self.keys: + key_info = self.table._schema.columns_by_name[key_name] + if self.table._is_dictionary_column(key_info): + key_arrays.append(self.table._cols[key_name].codes) + key_is_dict.append(True) + key_nulls.append(int(key_info.spec.null_code)) + skip_key_nulls.append(self.dropna) + continue + key_dtype = getattr(key_info.spec, "dtype", None) + if key_dtype is None or np.dtype(key_dtype).kind not in "biu": + return None + null_value = getattr(key_info.spec, "null_value", None) + if null_value is not None and not self.dropna: + return None + key_arrays.append(self.table._cols[key_name]) + key_is_dict.append(False) + key_nulls.append(0 if 
null_value is None else int(null_value)) + skip_key_nulls.append(self.dropna and null_value is not None) + + value_cols = {s.input_col for s in specs if s.input_col is not None} + if len(value_cols) > 1: + return None + value_col = next(iter(value_cols), None) + if value_col is not None and any(s.op in {"sum", "mean", "min", "max"} for s in specs): + value_info = self.table._schema.columns_by_name[value_col] + value_dtype = getattr(value_info.spec, "dtype", None) + if value_dtype is None or np.dtype(value_dtype).kind != "f": + return None + null_value = getattr(value_info.spec, "null_value", None) + if null_value is not None and not (isinstance(null_value, float) and math.isnan(null_value)): + return None + + try: + from blosc2 import groupby_ext + except ImportError: + return None + kernel = getattr(groupby_ext, "groupby_hash_i64x2_f64", None) + if kernel is None: + return None + + acc: dict[Any, dict[str, _AggState]] = {} + key_values: dict[Any, tuple[Any, ...]] = {} + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + key_chunks = [np.asarray(arr[start:stop], dtype=np.int64) for arr in key_arrays] + live = valid.copy() + for key_chunk, skip_null, null_value in zip(key_chunks, skip_key_nulls, key_nulls, strict=True): + if skip_null: + live &= key_chunk != null_value + if not np.any(live): + continue + + if value_col is None: + values = np.empty(len(valid), dtype=np.float64) + values_valid = np.zeros(len(valid), dtype=bool) + has_values = False + else: + raw_values = np.asarray(self.table._cols[value_col][start:stop]) + values = np.ascontiguousarray(raw_values.astype(np.float64, copy=False)) + values_valid = np.ascontiguousarray(~self._null_mask(value_col, raw_values, is_key=False)) + has_values = True + + ( + out_k0, + out_k1, + row_counts, + 
value_counts, + sums, + mins, + maxs, + has_value, + ) = kernel( + np.ascontiguousarray(key_chunks[0]), + np.ascontiguousarray(key_chunks[1]), + values, + np.ascontiguousarray(live), + values_valid, + has_values, + ) + + for i, (code0, code1) in enumerate(zip(out_k0, out_k1, strict=True)): + display = [] + norm_parts = [] + for key_pos, code in enumerate((int(code0), int(code1))): + if key_is_dict[key_pos]: + value = self.table._cols[self.keys[key_pos]].decode(code) + else: + value = code + display.append(value) + norm_parts.append(_normalize_key_part(value)) + norm_key = tuple(norm_parts) + states = acc.setdefault(norm_key, {}) + key_values.setdefault(norm_key, tuple(display)) + for spec in specs: + state = states.setdefault(spec.output_col, _AggState(spec.op)) + if spec.op == "size": + state.value = (0 if state.value is None else state.value) + int(row_counts[i]) + elif spec.op == "count": + state.value = (0 if state.value is None else state.value) + int(value_counts[i]) + elif spec.op in {"sum", "mean"}: + if has_value[i]: + state.value = (0.0 if state.value is None else state.value) + float(sums[i]) + state.count += int(value_counts[i]) + elif spec.op == "min": + if has_value[i]: + value = float(mins[i]) + if state.count == 0 or value < state.value: + state.value = value + state.count += 1 + elif spec.op == "max" and has_value[i]: + value = float(maxs[i]) + if state.count == 0 or value > state.value: + state.value = value + state.count += 1 + + rows = self._final_rows(acc, key_values, specs) + return self._build_result(rows, specs) + def _try_execute_cython_dense_int_key(self, specs: list[_AggSpec]): # noqa: C901 """Cython fast path for one compact integer/dictionary key and dense aggregations.""" if len(self.keys) != 1: diff --git a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx index 03b18894..ae5ffd8a 100644 --- a/src/blosc2/groupby_ext.pyx +++ b/src/blosc2/groupby_ext.pyx @@ -938,3 +938,187 @@ def groupby_hash_f64_f64( finally: free(table_bits); 
free(table_used); free(table_keys); free(row_counts); free(value_counts) free(sums); free(mins); free(maxs); free(has_value) + + +def groupby_hash_i64x2_f64( + int64_t[:] key0, + int64_t[:] key1, + double[:] values, + np.npy_bool[:] valid, + np.npy_bool[:] values_valid, + bint has_values, +): + """Hash two int64-normalized keys and accumulate float64 group states.""" + if key0.shape[0] != key1.shape[0] or key0.shape[0] != valid.shape[0]: + raise ValueError("key0, key1 and valid must have the same length") + if has_values and (values.shape[0] != key0.shape[0] or values_valid.shape[0] != key0.shape[0]): + raise ValueError("values, values_valid and keys must have the same length") + + cdef Py_ssize_t n = key0.shape[0] + cdef Py_ssize_t cap = 1024 + cdef Py_ssize_t used_count = 0 + cdef Py_ssize_t i, pos, old_pos, out_pos + cdef uint64_t mask = cap - 1 + cdef uint64_t h + cdef int64_t k0, k1 + cdef double value + cdef bint value_ok + + cdef int64_t* table_k0 = malloc(cap * sizeof(int64_t)) + cdef int64_t* table_k1 = malloc(cap * sizeof(int64_t)) + cdef np.npy_bool* table_used = malloc(cap * sizeof(np.npy_bool)) + cdef int64_t* row_counts = malloc(cap * sizeof(int64_t)) + cdef int64_t* value_counts = malloc(cap * sizeof(int64_t)) + cdef double* sums = malloc(cap * sizeof(double)) + cdef double* mins = malloc(cap * sizeof(double)) + cdef double* maxs = malloc(cap * sizeof(double)) + cdef np.npy_bool* has_value = malloc(cap * sizeof(np.npy_bool)) + + cdef int64_t* new_k0 + cdef int64_t* new_k1 + cdef np.npy_bool* new_used + cdef int64_t* new_row_counts + cdef int64_t* new_value_counts + cdef double* new_sums + cdef double* new_mins + cdef double* new_maxs + cdef np.npy_bool* new_has_value + cdef Py_ssize_t old_cap + + if ( + table_k0 == NULL + or table_k1 == NULL + or table_used == NULL + or row_counts == NULL + or value_counts == NULL + or sums == NULL + or mins == NULL + or maxs == NULL + or has_value == NULL + ): + free(table_k0); free(table_k1); free(table_used); 
free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) + raise MemoryError() + + for i in range(cap): + table_used[i] = 0 + + try: + for i in range(n): + if not valid[i]: + continue + k0 = key0[i] + k1 = key1[i] + + if (used_count + 1) * 2 >= cap: + old_cap = cap + cap *= 2 + mask = cap - 1 + new_k0 = malloc(cap * sizeof(int64_t)) + new_k1 = malloc(cap * sizeof(int64_t)) + new_used = malloc(cap * sizeof(np.npy_bool)) + new_row_counts = malloc(cap * sizeof(int64_t)) + new_value_counts = malloc(cap * sizeof(int64_t)) + new_sums = malloc(cap * sizeof(double)) + new_mins = malloc(cap * sizeof(double)) + new_maxs = malloc(cap * sizeof(double)) + new_has_value = malloc(cap * sizeof(np.npy_bool)) + if ( + new_k0 == NULL + or new_k1 == NULL + or new_used == NULL + or new_row_counts == NULL + or new_value_counts == NULL + or new_sums == NULL + or new_mins == NULL + or new_maxs == NULL + or new_has_value == NULL + ): + free(new_k0); free(new_k1); free(new_used); free(new_row_counts); free(new_value_counts) + free(new_sums); free(new_mins); free(new_maxs); free(new_has_value) + raise MemoryError() + for pos in range(cap): + new_used[pos] = 0 + for old_pos in range(old_cap): + if not table_used[old_pos]: + continue + h = _mix_u64(table_k0[old_pos]) ^ _mix_u64(table_k1[old_pos] + 0x9e3779b97f4a7c15) + pos = (h & mask) + while new_used[pos]: + pos = ((pos + 1) & mask) + new_used[pos] = 1 + new_k0[pos] = table_k0[old_pos] + new_k1[pos] = table_k1[old_pos] + new_row_counts[pos] = row_counts[old_pos] + new_value_counts[pos] = value_counts[old_pos] + new_sums[pos] = sums[old_pos] + new_mins[pos] = mins[old_pos] + new_maxs[pos] = maxs[old_pos] + new_has_value[pos] = has_value[old_pos] + free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) + table_k0 = new_k0 + table_k1 = new_k1 + table_used = new_used + row_counts = new_row_counts + value_counts = 
new_value_counts + sums = new_sums + mins = new_mins + maxs = new_maxs + has_value = new_has_value + + h = _mix_u64(k0) ^ _mix_u64(k1 + 0x9e3779b97f4a7c15) + pos = (h & mask) + while table_used[pos] and (table_k0[pos] != k0 or table_k1[pos] != k1): + pos = ((pos + 1) & mask) + if not table_used[pos]: + table_used[pos] = 1 + table_k0[pos] = k0 + table_k1[pos] = k1 + row_counts[pos] = 0 + value_counts[pos] = 0 + sums[pos] = 0.0 + mins[pos] = 0.0 + maxs[pos] = 0.0 + has_value[pos] = 0 + used_count += 1 + + row_counts[pos] += 1 + if has_values: + value_ok = values_valid[i] + if value_ok: + value = values[i] + value_counts[pos] += 1 + sums[pos] += value + if not has_value[pos] or value < mins[pos]: + mins[pos] = value + if not has_value[pos] or value > maxs[pos]: + maxs[pos] = value + has_value[pos] = 1 + + out_k0 = np.empty(used_count, dtype=np.int64) + out_k1 = np.empty(used_count, dtype=np.int64) + out_row_counts = np.empty(used_count, dtype=np.int64) + out_value_counts = np.empty(used_count, dtype=np.int64) + out_sums = np.empty(used_count, dtype=np.float64) + out_mins = np.empty(used_count, dtype=np.float64) + out_maxs = np.empty(used_count, dtype=np.float64) + out_has_value = np.empty(used_count, dtype=bool) + + out_pos = 0 + for pos in range(cap): + if not table_used[pos]: + continue + out_k0[out_pos] = table_k0[pos] + out_k1[out_pos] = table_k1[pos] + out_row_counts[out_pos] = row_counts[pos] + out_value_counts[out_pos] = value_counts[pos] + out_sums[out_pos] = sums[pos] + out_mins[out_pos] = mins[pos] + out_maxs[out_pos] = maxs[pos] + out_has_value[out_pos] = has_value[pos] + out_pos += 1 + return out_k0, out_k1, out_row_counts, out_value_counts, out_sums, out_mins, out_maxs, out_has_value + finally: + free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py index b085c92f..b920c99a 100644 --- 
a/tests/ctable/test_groupby.py +++ b/tests/ctable/test_groupby.py @@ -319,3 +319,42 @@ def test_groupby_cython_arbitrary_float_key_nan_and_signed_zero(): assert got[0] == (0.0, 3.0) assert np.isnan(got[1][0]) assert got[1][1] == 7.0 + + +@dataclass +class TwoIntKeyFloatRow: + key0: int = blosc2.field(blosc2.int16()) + key1: int = blosc2.field(blosc2.uint16()) + value: float = blosc2.field(blosc2.float64(nullable=True), default=0.0) + + +def test_groupby_cython_two_integer_key_hash_aggs(): + t = CTable( + TwoIntKeyFloatRow, + new_data=[(0, 1, 1.0), (0, 1, 3.0), (0, 2, 10.0), (1, 1, np.nan), (1, 1, 5.0)], + ) + + out = t.group_by(["key0", "key1"], sort=True).agg( + {"*": "size", "value": ["count", "sum", "mean", "min", "max"]} + ) + + assert rows(out) == [ + (0, 1, 2, 2, 4.0, 2.0, 1.0, 3.0), + (0, 2, 1, 1, 10.0, 10.0, 10.0, 10.0), + (1, 1, 2, 1, 5.0, 5.0, 5.0, 5.0), + ] + + +@dataclass +class DictIntKeyFloatRow: + key0: str = blosc2.field(blosc2.dictionary()) + key1: int = blosc2.field(blosc2.int32()) + value: float = blosc2.field(blosc2.float64()) + + +def test_groupby_cython_dictionary_integer_key_hash(): + t = CTable(DictIntKeyFloatRow, new_data=[("b", 2, 1.0), ("a", 1, 2.0), ("b", 2, 3.0)]) + + out = t.group_by(["key0", "key1"], sort=True).agg({"value": "sum"}) + + assert rows(out) == [("a", 1, 2.0), ("b", 2, 4.0)] From 5df37372ce246330cbe1f36eea03f2d11e32de78 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 13:06:10 +0200 Subject: [PATCH 11/17] Aborted the full index sorted group_by path, and an explanation on why --- plans/ctable-groupby.md | 51 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md index 75a71873..59fd6f07 100644 --- a/plans/ctable-groupby.md +++ b/plans/ctable-groupby.md @@ -266,18 +266,59 @@ Remaining possible extensions: ### FULL-index sorted group-by path -A FULL index on a single grouping key can provide sorted positions. 
A future -sorted-scan group-by path could: +A FULL index on a single grouping key can provide sorted positions. A prototype +Python/NumPy sorted-scan path was implemented and then reverted after +benchmarking because it was not competitive with the existing dense/hash paths. + +Prototype behavior: ```text -read sorted positions from FULL index +read sorted values/positions from FULL sidecars scan contiguous key runs +respect _valid_rows reduce each run emit sorted groups naturally ``` -This would be especially useful for high-cardinality single-key group-by and -for users requesting `sort=True`. +Observed benchmark results on 50M rows / 5k compact groups: + +```text +float64 key, sum, sort=True, FULL index: + index build: ~6.2 s + group_by: ~104 s + +int64 key, sum, sort=True, FULL index: + index build: ~5.5 s + group_by: ~102 s + +int64 key, size, sort=True, FULL index: + index build: ~5.5 s + group_by: ~0.45 s + +int64 key, size, sort=False, no FULL index: + group_by: ~0.14 s +``` + +Why the prototype was slow: + +- value aggregations required many scattered gathers from the original value + column, one gathered position set per key run; +- scattered value access is much less cache/compression friendly than the + existing sequential dense/hash scans; +- the implementation still had Python-level run processing and result merging; +- FULL index build cost is substantial unless the index already exists and can + be reused many times; +- compact integer-key workloads are already ideal for dense accumulator arrays. + +Recommendation: + +- keep this deferred for now; +- do not reintroduce a Python-level FULL-index value-aggregation path; +- revisit only with a block-aware/Cython reducer that batches sorted positions + by physical chunks/blocks, or as part of a broader high-cardinality/sparse-key + strategy; +- if revisited, benchmark primarily against high-cardinality non-compact keys + and already-existing FULL indexes, not compact dense-key workloads. 
### Public `blosc2.group_reduce()` From 0a459a515c27297a24fc1ef363f4b1d40f5317e2 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 13:29:38 +0200 Subject: [PATCH 12/17] Added a new public blosc2.group_reduce() that is apt for NDArray instances --- doc/reference/reduction_functions.rst | 2 + plans/ctable-groupby.md | 43 +++- src/blosc2/__init__.py | 4 +- src/blosc2/groupby.py | 297 ++++++++++++++++++++++++++ tests/test_group_reduce.py | 75 +++++++ 5 files changed, 414 insertions(+), 7 deletions(-) create mode 100644 tests/test_group_reduce.py diff --git a/doc/reference/reduction_functions.rst b/doc/reference/reduction_functions.rst index 4c21c150..5122807b 100644 --- a/doc/reference/reduction_functions.rst +++ b/doc/reference/reduction_functions.rst @@ -14,6 +14,7 @@ Reduction operations can be used with any of :ref:`NDArray `, :ref:`C2A argmax argmin count_nonzero + group_reduce cumulative_prod cumulative_sum max @@ -31,6 +32,7 @@ Reduction operations can be used with any of :ref:`NDArray `, :ref:`C2A .. autofunction:: blosc2.argmax .. autofunction:: blosc2.argmin .. autofunction:: blosc2.count_nonzero +.. autofunction:: blosc2.group_reduce .. autofunction:: blosc2.cumulative_prod .. autofunction:: blosc2.cumulative_sum .. autofunction:: blosc2.max diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md index 59fd6f07..44b4d2f5 100644 --- a/plans/ctable-groupby.md +++ b/plans/ctable-groupby.md @@ -322,13 +322,44 @@ Recommendation: ### Public `blosc2.group_reduce()` -Keep lower-level group-reduce machinery internal for now. Consider exposing a -public `blosc2.group_reduce()` only after: +Implemented a conservative public `blosc2.group_reduce()` array API for +single-key grouped reductions without requiring a `CTable`. -- aggregation semantics are stable; -- null/NaN behavior is fully documented; -- output representation is clear; -- benchmarks show usefulness outside `CTable.group_by()`. 
+Implemented API: + +```python +groups, result = blosc2.group_reduce( + keys, values=None, op="size", sort=False, dropna=True +) +``` + +Implemented operations: + +- `size`; +- `count`; +- `sum`; +- `mean`; +- `min`; +- `max`. + +Implemented semantics: + +- returns plain NumPy arrays `(groups, result)`; +- `size` counts rows and does not require values; +- `count` counts non-NaN values; +- `dropna=True` skips NaN float keys; +- `dropna=False` keeps one normalized NaN group; +- `+0.0` and `-0.0` are normalized by the float hash path; +- optimized dense integer and arbitrary-float hash paths are used + opportunistically, with a NumPy/Python fallback. + +Remaining possible extensions: + +- multi-key public API; +- multiple aggregations in one call; +- multiple value columns; +- NDArray/chunked execution without eager NumPy conversion; +- optional CTable/persistent output. ### High-cardinality and memory strategy diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 29ed2024..ee258655 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -628,7 +628,7 @@ def _raise(exc): # Note: bool, bytes, string shadow builtins in the blosc2 namespace by design — # they are schema spec constructors (b2.bool(), b2.bytes(), etc.). 
from .ctable import DEFAULT_NULL_POLICY, Column, CTable, NullPolicy, get_null_policy, null_policy -from .groupby import CTableGroupBy +from .groupby import CTableGroupBy, group_reduce from .ndarray import ( abs, acos, @@ -802,6 +802,8 @@ def _raise(exc): "uint64", "vlbytes", "vlstring", + # Grouped reductions + "group_reduce", # Classes "C2Array", "CParams", diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index 773b0a30..60ef8bbf 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -1427,3 +1427,300 @@ def _null_output_value(spec: SchemaSpec): if dtype is not None and dtype.kind == "S": return b"" return None + + +# ---------------------------------------------------------------------- +# Public array-oriented grouped reductions +# ---------------------------------------------------------------------- + + +def group_reduce(keys, values=None, op: AggName = "size", *, sort: bool = False, dropna: bool = True): + """Group *keys* and reduce *values* with *op*. + + This is a lower-level, NumPy-style grouped reduction primitive. It exposes + Blosc2's optimized group-reduce kernels for plain array-like inputs without + requiring a :class:`blosc2.CTable`. + + Parameters + ---------- + keys : array-like + One-dimensional grouping keys. + values : array-like, optional + One-dimensional values to reduce. Required for ``"count"``, ``"sum"``, + ``"mean"``, ``"min"`` and ``"max"``. Ignored for ``"size"``. + op : {"size", "count", "sum", "mean", "min", "max"}, default: "size" + Reduction operation. ``"size"`` counts rows per group, while + ``"count"`` counts non-NaN values per group. + sort : bool, default: False + If true, sort output groups by key. With ``sort=False`` output order is + implementation dependent. + dropna : bool, default: True + If true, skip NaN float keys. If false, all NaN keys form one group. + + Returns + ------- + groups, result : numpy.ndarray, numpy.ndarray + Group keys and reduced values. 
+ + Examples + -------- + >>> import numpy as np + >>> import blosc2 + >>> keys = np.array([1, 2, 1, 2, 1]) + >>> values = np.array([10., 20., 30., 40., 50.]) + >>> groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True) + >>> groups + array([1, 2]) + >>> sums + array([90., 60.]) + """ + if op not in {"size", "count", "sum", "mean", "min", "max"}: + raise ValueError(f"unsupported group_reduce operation {op!r}") + + keys_arr = np.asarray(keys) + if keys_arr.ndim != 1: + raise ValueError("keys must be a 1-D array") + + if op == "size": + values_arr = None + else: + if values is None: + raise ValueError(f"values are required for group_reduce op {op!r}") + values_arr = np.asarray(values) + if values_arr.ndim != 1: + raise ValueError("values must be a 1-D array") + if len(values_arr) != len(keys_arr): + raise ValueError("keys and values must have the same length") + + if len(keys_arr) == 0: + return keys_arr.copy(), np.empty(0, dtype=_result_dtype(values_arr, op)) + + fast = _try_dense_integer(keys_arr, values_arr, op, sort=sort) + if fast is not None: + return fast + + fast = _try_float_hash(keys_arr, values_arr, op, sort=sort, dropna=dropna) + if fast is not None: + return fast + + return _group_reduce_numpy(keys_arr, values_arr, op, sort=sort, dropna=dropna) + + +def _try_dense_integer(keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool): # noqa: C901 + key_dtype = np.dtype(keys.dtype) + if key_dtype.kind == "b": + keys = keys.astype(np.int8, copy=False) + elif key_dtype.kind not in "iu": + return None + keys = np.ascontiguousarray(keys) + if len(keys) == 0: + return None + if np.min(keys) < 0: + return None + max_key = int(np.max(keys)) + if max_key + 1 > 10_000_000: + return None + + try: + from blosc2 import groupby_ext + except ImportError: + return None + + valid = np.ones(len(keys), dtype=bool) + keys_present = np.zeros(max_key + 1, dtype=bool) + + if op == "size": + counts = np.zeros(max_key + 1, dtype=np.int64) + 
groupby_ext.groupby_dense_int_size_checked(keys, valid, counts, keys_present, False, 0) + groups = np.nonzero(keys_present)[0].astype(key_dtype if key_dtype.kind != "b" else np.bool_) + result = counts[np.nonzero(keys_present)[0]] + return _maybe_sort(groups, result, sort) + + assert values is not None + value_dtype = np.dtype(values.dtype) + if op == "count": + counts = np.zeros(max_key + 1, dtype=np.int64) + values_valid = _values_valid(values) + groupby_ext.groupby_dense_int_count_checked( + keys, valid, np.ascontiguousarray(values_valid), counts, keys_present, False, 0 + ) + codes = np.nonzero(keys_present)[0] + return _maybe_sort( + codes.astype(key_dtype if key_dtype.kind != "b" else np.bool_), counts[codes], sort + ) + + if op == "mean" or value_dtype.kind == "f": + vals = np.ascontiguousarray(values.astype(np.float64, copy=False)) + skip_nan = value_dtype.kind == "f" + if op == "sum": + sums = np.zeros(max_key + 1, dtype=np.float64) + present = np.zeros(max_key + 1, dtype=bool) + groupby_ext.groupby_dense_int_f64_sum_checked( + keys, vals, valid, sums, present, keys_present, False, 0, skip_nan + ) + codes = np.nonzero(keys_present)[0] + result = sums[codes] + result[~present[codes]] = np.nan + elif op == "mean": + sums = np.zeros(max_key + 1, dtype=np.float64) + counts = np.zeros(max_key + 1, dtype=np.int64) + groupby_ext.groupby_dense_int_f64_mean_checked( + keys, vals, valid, sums, counts, keys_present, False, 0, skip_nan + ) + codes = np.nonzero(keys_present)[0] + result = np.full(len(codes), np.nan, dtype=np.float64) + ok = counts[codes] > 0 + result[ok] = sums[codes][ok] / counts[codes][ok] + elif op in {"min", "max"}: + state = np.zeros(max_key + 1, dtype=np.float64) + has_value = np.zeros(max_key + 1, dtype=bool) + kernel = getattr(groupby_ext, f"groupby_dense_int_f64_{op}_checked") + kernel(keys, vals, valid, state, has_value, keys_present, False, 0, skip_nan) + codes = np.nonzero(keys_present)[0] + result = state[codes] + result[~has_value[codes]] 
= np.nan + else: # pragma: no cover + return None + return _maybe_sort(codes.astype(key_dtype if key_dtype.kind != "b" else np.bool_), result, sort) + + if value_dtype.kind not in "biu": + return None + vals_i64 = np.ascontiguousarray(values.astype(np.int64, copy=False)) + state = np.zeros(max_key + 1, dtype=np.int64) + present = np.zeros(max_key + 1, dtype=bool) + kernel = getattr(groupby_ext, f"groupby_dense_int_i64_{op}_checked", None) + if kernel is None: + return None + kernel(keys, vals_i64, valid, state, present, keys_present, False, 0) + codes = np.nonzero(keys_present)[0] + return _maybe_sort(codes.astype(key_dtype if key_dtype.kind != "b" else np.bool_), state[codes], sort) + + +def _try_float_hash(keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool, dropna: bool): + key_dtype = np.dtype(keys.dtype) + if key_dtype.kind != "f": + return None + if values is not None and np.dtype(values.dtype).kind != "f" and op != "count": + return None + try: + from blosc2 import groupby_ext + except ImportError: + return None + + keys_f64 = np.ascontiguousarray(keys.astype(np.float64, copy=False)) + valid = np.ones(len(keys_f64), dtype=bool) + if values is None: + values_f64 = np.empty(len(keys_f64), dtype=np.float64) + values_valid = np.zeros(len(keys_f64), dtype=bool) + has_values = False + else: + values_f64 = np.ascontiguousarray(np.asarray(values, dtype=np.float64)) + values_valid = np.ascontiguousarray(_values_valid(values)) + has_values = True + + groups, row_counts, value_counts, sums, mins, maxs, has_value = groupby_ext.groupby_hash_f64_f64( + keys_f64, values_f64, valid, values_valid, has_values, dropna + ) + groups = groups.astype(key_dtype, copy=False) + if op == "size": + result = row_counts + elif op == "count": + result = value_counts + elif op == "sum": + result = sums.copy() + result[~has_value] = np.nan + elif op == "mean": + result = np.full(len(groups), np.nan, dtype=np.float64) + ok = value_counts > 0 + result[ok] = sums[ok] / 
value_counts[ok] + elif op == "min": + result = mins.copy() + result[~has_value] = np.nan + elif op == "max": + result = maxs.copy() + result[~has_value] = np.nan + else: # pragma: no cover + return None + return _maybe_sort(groups, result, sort) + + +def _group_reduce_numpy( # noqa: C901 + keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool, dropna: bool +): + acc: dict[object, list] = {} + display: dict[object, object] = {} + for i, key in enumerate(keys): + key_item = _python_scalar(key) + if isinstance(key_item, float) and math.isnan(key_item): + if dropna: + continue + norm_key = _NAN_KEY + else: + norm_key = key_item + display.setdefault(norm_key, key_item) + state = acc.setdefault(norm_key, [0, 0, 0.0, None, None]) + state[0] += 1 + if values is None: + continue + value = _python_scalar(values[i]) + if isinstance(value, float) and math.isnan(value): + continue + state[1] += 1 + if op in {"sum", "mean"}: + state[2] += value + elif op == "min" and (state[3] is None or value < state[3]): + state[3] = value + elif op == "max" and (state[4] is None or value > state[4]): + state[4] = value + + order = list(acc) + if sort: + order.sort(key=lambda k: (1, "") if k is _NAN_KEY else (0, display[k])) + groups = np.asarray([display[k] for k in order], dtype=keys.dtype) + result = [] + for k in order: + rows, count, total, min_value, max_value = acc[k] + if op == "size": + result.append(rows) + elif op == "count": + result.append(count) + elif op == "sum": + result.append(total if count else _null_value_for(values)) + elif op == "mean": + result.append(math.nan if count == 0 else total / count) + elif op == "min": + result.append(min_value if count else _null_value_for(values)) + elif op == "max": + result.append(max_value if count else _null_value_for(values)) + return groups, np.asarray(result, dtype=_result_dtype(values, op)) + + +def _maybe_sort(groups: np.ndarray, result: np.ndarray, sort: bool): + if sort and len(groups): + order = np.argsort(groups, 
kind="stable") + return groups[order], result[order] + return groups, result + + +def _values_valid(values: np.ndarray) -> np.ndarray: + values = np.asarray(values) + if values.dtype.kind == "f": + return ~np.isnan(values) + return np.ones(len(values), dtype=bool) + + +def _result_dtype(values: np.ndarray | None, op: str): + if op in {"size", "count"}: + return np.int64 + if op == "mean" or values is None: + return np.float64 + dtype = np.dtype(values.dtype) + if op == "sum" and dtype.kind in "biu": + return np.int64 + return dtype + + +def _null_value_for(values: np.ndarray | None): + if values is not None and np.dtype(values.dtype).kind in "iu": + return 0 + return math.nan diff --git a/tests/test_group_reduce.py b/tests/test_group_reduce.py new file mode 100644 index 00000000..856c25ef --- /dev/null +++ b/tests/test_group_reduce.py @@ -0,0 +1,75 @@ +import numpy as np +import pytest + +import blosc2 + + +def test_group_reduce_size_and_sum_integer_keys(): + keys = np.array([2, 1, 2, 1, 2], dtype=np.int16) + values = np.array([10, 1, 30, 3, 50], dtype=np.int32) + + groups, sizes = blosc2.group_reduce(keys, op="size", sort=True) + groups2, sums = blosc2.group_reduce(keys, values, op="sum", sort=True) + + assert groups.dtype == keys.dtype + np.testing.assert_array_equal(groups, np.array([1, 2], dtype=np.int16)) + np.testing.assert_array_equal(sizes, np.array([2, 3])) + np.testing.assert_array_equal(groups2, np.array([1, 2], dtype=np.int16)) + np.testing.assert_array_equal(sums, np.array([4, 90])) + + +def test_group_reduce_integer_keys_float_aggs_with_nan_values(): + keys = np.array([0, 1, 0, 1, 2], dtype=np.uint16) + values = np.array([1.0, np.nan, 3.0, np.nan, 10.0]) + + groups, counts = blosc2.group_reduce(keys, values, op="count", sort=True) + _, means = blosc2.group_reduce(keys, values, op="mean", sort=True) + _, mins = blosc2.group_reduce(keys, values, op="min", sort=True) + _, maxs = blosc2.group_reduce(keys, values, op="max", sort=True) + + 
np.testing.assert_array_equal(groups, np.array([0, 1, 2], dtype=np.uint16)) + np.testing.assert_array_equal(counts, np.array([2, 0, 1])) + assert means[0] == 2.0 + assert np.isnan(means[1]) + assert means[2] == 10.0 + assert mins[0] == 1.0 + assert np.isnan(mins[1]) + assert mins[2] == 10.0 + assert maxs[0] == 3.0 + assert np.isnan(maxs[1]) + assert maxs[2] == 10.0 + + +def test_group_reduce_arbitrary_float_keys_and_nan_key_group(): + keys = np.array([0.5, np.nan, 0.5, -0.0, 0.0, np.nan]) + values = np.array([1.0, 2.0, 3.0, 10.0, 20.0, 5.0]) + + groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True, dropna=False) + + assert groups[0] == 0.0 + assert sums[0] == 30.0 + assert groups[1] == 0.5 + assert sums[1] == 4.0 + assert np.isnan(groups[2]) + assert sums[2] == 7.0 + + +def test_group_reduce_dropna_default_skips_nan_keys(): + keys = np.array([1.0, np.nan, 1.0]) + values = np.array([2.0, 10.0, 3.0]) + + groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True) + + np.testing.assert_array_equal(groups, np.array([1.0])) + np.testing.assert_array_equal(sums, np.array([5.0])) + + +def test_group_reduce_rejects_bad_inputs(): + with pytest.raises(ValueError): + blosc2.group_reduce(np.ones((2, 2)), op="size") + with pytest.raises(ValueError): + blosc2.group_reduce(np.arange(3), op="sum") + with pytest.raises(ValueError): + blosc2.group_reduce(np.arange(3), np.arange(2), op="sum") + with pytest.raises(ValueError): + blosc2.group_reduce(np.arange(3), op="bad") From 0185e796f11434ec3c84443bbf0461b50cf889bc Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 13:30:22 +0200 Subject: [PATCH 13/17] Add forgotten test unit --- tests/ctable/test_object_spec.py | 65 ++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tests/ctable/test_object_spec.py diff --git a/tests/ctable/test_object_spec.py b/tests/ctable/test_object_spec.py new file mode 100644 index 00000000..30fa7258 --- /dev/null +++ 
b/tests/ctable/test_object_spec.py @@ -0,0 +1,65 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Tests for schema-less CTable object columns.""" + +from dataclasses import dataclass + +import pytest + +import blosc2 +from blosc2 import CTable + + +@dataclass +class ObjectRow: + id: int = blosc2.field(blosc2.int32()) + payload: object = blosc2.field(blosc2.object(nullable=True)) + + +def test_object_column_heterogeneous_values(): + t = CTable(ObjectRow) + t.append([1, {"kind": "dict", "values": [1, 2]}]) + t.append([2, ("tuple", 3)]) + t.append([3, None]) + + assert t["payload"][:] == [{"kind": "dict", "values": [1, 2]}, ("tuple", 3), None] + assert t["payload"].is_varlen_scalar + + +def test_object_column_persistence(tmp_path): + path = tmp_path / "objects.b2d" + t = CTable(ObjectRow, urlpath=str(path), mode="w") + t.extend([[1, {"x": 1}], [2, ["a", "b"]], [3, None]]) + t.close() + + reopened = CTable.open(str(path), mode="r") + assert reopened["payload"][:] == [{"x": 1}, ["a", "b"], None] + + +def test_object_column_to_arrow_raises(): + t = CTable(ObjectRow) + t.append([1, {"x": 1}]) + with pytest.raises(TypeError, match="ObjectSpec columns"): + t.to_arrow() + + +def test_object_column_rejects_none_when_not_nullable(): + @dataclass + class StrictObjectRow: + payload: object = blosc2.field(blosc2.object()) + + t = CTable(StrictObjectRow) + with pytest.raises(TypeError, match="not nullable"): + t.append([None]) + + +def test_object_column_rejects_non_msgpack_value_on_flush(): + t = CTable(ObjectRow) + t.append([1, {"not-msgpack": {1, 2, 3}}]) + with pytest.raises(TypeError): + t.close() From 98bb1400d68c37ec8e3789bb05b794505859bad5 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 13:35:39 +0200 Subject: [PATCH 14/17] 
Implemented group-by convenience methods (sum, mean, min, max) --- doc/reference/ctable.rst | 9 ++++++--- plans/ctable-groupby.md | 5 ++++- src/blosc2/groupby.py | 28 ++++++++++++++++++++++++++++ tests/ctable/test_groupby.py | 17 +++++++++++++++++ 4 files changed, 55 insertions(+), 4 deletions(-) diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst index 2ed012d0..8ccf74b8 100644 --- a/doc/reference/ctable.rst +++ b/doc/reference/ctable.rst @@ -251,17 +251,20 @@ Group-by reductions :meth:`CTable.group_by` returns a lightweight deferred group-by object. It is not a table view; methods such as :meth:`~blosc2.CTableGroupBy.size`, -:meth:`~blosc2.CTableGroupBy.count`, and +:meth:`~blosc2.CTableGroupBy.count`, :meth:`~blosc2.CTableGroupBy.sum`, and :meth:`~blosc2.CTableGroupBy.agg` materialize a new :class:`CTable` with one row per group:: by_city = t.group_by("city", sort=True) counts = by_city.size() # row count per city / COUNT(*) non_null = by_city.count("sales") # non-null sales count / COUNT(sales) - totals = by_city.agg({"sales": "sum"}) + totals = by_city.sum("sales") # equivalent to agg({"sales": "sum"}) + means = by_city.mean("sales") + mins = by_city.min("sales") + maxs = by_city.max("sales") .. autoclass:: CTableGroupBy - :members: size, count, agg + :members: size, count, sum, mean, min, max, agg Mutations diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md index 44b4d2f5..308c47b8 100644 --- a/plans/ctable-groupby.md +++ b/plans/ctable-groupby.md @@ -381,7 +381,7 @@ Potential future optimization: ### Additional API conveniences -Potential future user conveniences: +Implemented group-by convenience methods: ```python t.group_by("city").sum("sales") @@ -390,6 +390,9 @@ t.group_by("city").min("sales") t.group_by("city").max("sales") ``` +These are equivalent to `agg({column: op})` and complement the already-existing +`size()` and `count(column)` group-by methods. 
+ Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are clearly justified outside group-by. diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index 60ef8bbf..c5f14221 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -110,6 +110,34 @@ def count(self, column: str): col = self.table._logical_to_physical_name(column) return self._execute([_AggSpec(col, "count", f"{col}_count")]) + def sum(self, column: str): + """Return sums of *column* per group. + + This is equivalent to ``group_by(...).agg({column: "sum"})``. + """ + return self.agg({column: "sum"}) + + def mean(self, column: str): + """Return means of *column* per group. + + This is equivalent to ``group_by(...).agg({column: "mean"})``. + """ + return self.agg({column: "mean"}) + + def min(self, column: str): + """Return minimum values of *column* per group. + + This is equivalent to ``group_by(...).agg({column: "min"})``. + """ + return self.agg({column: "min"}) + + def max(self, column: str): + """Return maximum values of *column* per group. + + This is equivalent to ``group_by(...).agg({column: "max"})``. + """ + return self.agg({column: "max"}) + def agg(self, aggregations: Mapping[str, str | Sequence[str]]): """Aggregate value columns per group. 
diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py index b920c99a..ae884c42 100644 --- a/tests/ctable/test_groupby.py +++ b/tests/ctable/test_groupby.py @@ -358,3 +358,20 @@ def test_groupby_cython_dictionary_integer_key_hash(): out = t.group_by(["key0", "key1"], sort=True).agg({"value": "sum"}) assert rows(out) == [("a", 1, 2.0), ("b", 2, 4.0)] + + +def test_groupby_convenience_numeric_methods(): + t = CTable(SalesRow, new_data=DATA) + + assert rows(t.group_by("city", sort=True).sum("qty")) == rows( + t.group_by("city", sort=True).agg({"qty": "sum"}) + ) + assert rows(t.group_by("city", sort=True).mean("qty")) == rows( + t.group_by("city", sort=True).agg({"qty": "mean"}) + ) + assert rows(t.group_by("city", sort=True).min("qty")) == rows( + t.group_by("city", sort=True).agg({"qty": "min"}) + ) + assert rows(t.group_by("city", sort=True).max("qty")) == rows( + t.group_by("city", sort=True).agg({"qty": "max"}) + ) From 9f964c9ebcddca829aabf89979f57ac2acddb712 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 13:42:17 +0200 Subject: [PATCH 15/17] Updated plan with current state --- plans/ctable-groupby.md | 395 ++++++++++++++++++++-------------------- 1 file changed, 197 insertions(+), 198 deletions(-) diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md index 308c47b8..522fba47 100644 --- a/plans/ctable-groupby.md +++ b/plans/ctable-groupby.md @@ -1,12 +1,12 @@ # CTable `group_by` implementation plan — status This document started as the implementation plan for `CTable.group_by()`. The -initial plan has now been executed through Phase 3. The remaining sections -record what was completed and what is future work. +core API and several optimized execution paths are now implemented. The first +section records completed work; the final section lists remaining future work. 
## Completed -### Public API +### Public `CTable.group_by()` API Implemented: @@ -30,13 +30,27 @@ Implemented API decisions: - `dropna=True` is the default; `dropna=False` keeps null/NaN key groups. - No top-level `CTable.size()` or `CTable.count()` was added. -### Phase 1: Python/NumPy implementation +### Convenience group-by methods + +Implemented group-by convenience methods: + +```python +t.group_by("city").sum("sales") +t.group_by("city").mean("sales") +t.group_by("city").min("sales") +t.group_by("city").max("sales") +``` + +These are equivalent to `agg({column: op})` and complement `size()` and +`count(column)`. + +### Generic Python/NumPy implementation Implemented files: ```text src/blosc2/ctable.py # CTable.group_by() -src/blosc2/groupby.py # CTableGroupBy and NumPy fallback engine +src/blosc2/groupby.py # CTableGroupBy, NumPy fallback, public group_reduce() ``` Implemented functionality: @@ -51,9 +65,9 @@ Implemented functionality: - Supports empty inputs. - Falls back to the generic NumPy path for unsupported optimized cases. -### Phase 1 benchmark harness +### Benchmark harness -Implemented: +Implemented/extended: ```text bench/ctable/groupby.py @@ -63,31 +77,65 @@ The benchmark can vary: - row count; - group cardinality; -- key dtype via `--key-dtype int32|int64|float32|float64`; +- key dtype via `--key-dtype` including integer, unsigned integer, and float dtypes; - dictionary keys via `--dictionary`; - operation via `--op size|count|sum|mean|min|max`; - sorted output; - chunk size; +- multi-key mode via `--multi-key` and `--groups2`; - optional persistent `urlpath`; - optional pandas comparison. -### Phase 2: optimized paths +Float key benchmarks now generate non-integral repeated labels by default so +`float32`/`float64` runs exercise the arbitrary-float hash path instead of the +integral-float dense path. -Implemented dense NumPy and Cython fast paths for the main benchmark-driven -cases. 
+### Dedicated Cython extension -Optimized cases currently include: +Implemented: + +```text +src/blosc2/groupby_ext.pyx +``` -- compact non-negative integer/dictionary-code single keys in Python/NumPy dense mode; -- `int32 key + float64 sum` in Cython; -- dictionary-code key + `float64 sum` in Cython; -- integral `float64 key + float64 sum` in Cython; -- integral `float32 key + float64 sum` in Cython. +Build integration: -These paths avoid the original per-chunk `np.unique(..., return_inverse=True)` -and Python dictionary merge overhead for compact single-key sum workloads. +- `CMakeLists.txt` builds, links, and installs `groupby_ext`. +- Group-by kernels were removed from `indexing_ext.pyx`. +- `src/blosc2/groupby.py` imports `blosc2.groupby_ext` for optimized kernels. -Representative benchmark improvements observed during implementation: +Rationale: + +- Group-by kernels are analytics/query execution code, not indexing internals. +- A dedicated extension keeps separation of concerns cleaner as optimized paths grow. + +### Dense integer-key Cython coverage + +Implemented fused dense integer-key Cython kernels covering: + +- `int8`, `uint8`; +- `int16`, `uint16`; +- `int32`, `uint32`; +- `int64`, `uint64`. + +Implemented dense integer/dictionary-code Cython path for: + +- `size`; +- `count`; +- `sum`; +- `mean` via sum/count; +- `min`; +- `max`. + +Additional details: + +- Uses compact dense accumulator arrays. +- Falls back for negative non-null keys and non-compact key ranges. +- Supports float64 value kernels with NaN-null skipping where applicable. +- Supports int64-normalized integer/bool value kernels for `sum`, `min`, and `max`. +- Tracks key presence separately so groups with all-null values are emitted correctly. 
+ +Representative benchmark improvements observed during earlier optimization: ```text 50M rows, 5k int32 groups, float64 sum: @@ -102,52 +150,96 @@ Representative benchmark improvements observed during implementation: Cython dense path: ~0.24–0.25 s ``` -### Phase 3: separate Cython extension +### Arbitrary float-key hash path -Implemented: +Implemented a conservative Cython open-addressing hash path for single +`float32`/`float64` keys with float value aggregations. -```text -src/blosc2/groupby_ext.pyx -``` +Implemented operations: -Build integration: +- `size`; +- `count`; +- `sum`; +- `mean`; +- `min`; +- `max`. -- `CMakeLists.txt` builds, links, and installs `groupby_ext`. -- Group-by kernels were removed from `indexing_ext.pyx`. -- `src/blosc2/groupby.py` imports `blosc2.groupby_ext` for optimized kernels. +Implemented semantics: -Rationale: +- `dropna=True`: skip NaN keys; +- `dropna=False`: all NaN keys form one group; +- `+0.0` and `-0.0` are normalized into the same group; +- infinities are valid groups through regular float bit hashing; +- NaN-null float values are skipped for value aggregations. -- Group-by kernels are analytics/query execution code, not indexing internals. -- A dedicated extension keeps separation of concerns cleaner as optimized paths grow. +### Two-key Cython hash path -### Phase 4: fused integer-key kernels and more Cython aggregations +Implemented a conservative Cython hash path for two-key group-by when both keys +are integer or dictionary-code-backed columns. -Implemented: +Implemented behavior: + +- normalizes keys to `int64`; +- hashes `(key0, key1)` directly; +- supports `size`, `count`, `sum`, `mean`, `min`, and `max` for supported float + value reductions; +- avoids structured-array packing and per-chunk `np.unique` for common two-key + categorical/integer workloads; +- falls back for unsupported cases. 
+ +Benchmarks showed this is functionally useful but still leaves room for future +optimization because partial states are merged in Python and the generic hash +kernel maintains more state than a specialized one-operation kernel needs. + +### Public `blosc2.group_reduce()` + +Implemented a conservative public array API for single-key grouped reductions +without requiring a `CTable`. + +Implemented API: + +```python +groups, result = blosc2.group_reduce( + keys, values=None, op="size", sort=False, dropna=True +) +``` + +Implemented operations: + +- `size`; +- `count`; +- `sum`; +- `mean`; +- `min`; +- `max`. -- fused dense integer-key Cython kernels covering `int8`, `uint8`, - `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64` keys; -- dense integer/dictionary-code Cython path for `size`, `count`, `sum`, - `mean`, `min`, and `max`; -- float64 value kernels with NaN-null skipping where applicable; -- int64 value kernels for integer/bool `sum`, `min`, and `max`; -- shared key-presence tracking so groups with all-null values are still - emitted correctly for `count` and nullable float aggregations. +Implemented semantics: + +- returns plain NumPy arrays `(groups, result)`; +- `size` counts rows and does not require values; +- `count` counts non-NaN values; +- `dropna=True` skips NaN float keys; +- `dropna=False` keeps one normalized NaN group; +- `+0.0` and `-0.0` are normalized by the float hash path; +- optimized dense integer and arbitrary-float hash paths are used + opportunistically, with a NumPy/Python fallback. ### Documentation -Implemented user-facing documentation in: +Implemented/updated user-facing documentation in: ```text doc/reference/ctable.rst +doc/reference/reduction_functions.rst ``` Documented: - `CTable.group_by()`; - returned `CTableGroupBy` object; -- `size()`, `count()`, `agg()`; -- examples for row counts, non-null counts, and sums. 
+- `size()`, `count()`, `sum()`, `mean()`, `min()`, `max()`, `agg()`; +- examples for row counts, non-null counts, and grouped reductions; +- public `blosc2.group_reduce()`. ### Tests @@ -155,6 +247,7 @@ Implemented/extended: ```text tests/ctable/test_groupby.py +tests/test_group_reduce.py ``` Coverage includes: @@ -162,6 +255,7 @@ Coverage includes: - `size()` row counts; - `count(column)` non-null counts; - `agg()` with `sum`, `mean`, `min`, `max`, `count`; +- convenience `sum`, `mean`, `min`, `max` methods; - `agg({"*": "size"})`; - multi-key group-by; - dictionary string keys; @@ -169,106 +263,66 @@ Coverage includes: - empty tables; - `dropna=True` / `dropna=False` behavior; - bad engine rejection; -- optimized int32/dictionary/float32/float64 sum variants; -- fallback for non-integral float keys; -- fallback for NaN float-key group when `dropna=False`. - -Validation during implementation: - -```text -pytest tests/ctable/test_groupby.py -q -pytest tests/ctable -q -``` - -The full CTable suite passed after Phase 3. +- optimized integer/dictionary/float variants; +- arbitrary float-key hash behavior; +- public `group_reduce()` behavior and input validation. ## Current design summary -The implementation now has three execution layers: +The implementation now has these execution layers: 1. Generic chunked NumPy path: - - supports the broadest set of Phase-1 semantics; - - uses per-chunk local grouping and merges partials globally. + - broadest semantics; + - per-chunk local grouping and global merge. 2. Dense NumPy single-key path: - - for compact non-negative integer/dictionary-code keys; - - uses dense accumulator arrays where possible. -3. Cython single-key sum kernels: - - for the most important compact/integral key + `float64 sum` cases; - - lives in `groupby_ext.pyx`. + - compact non-negative integer/dictionary-code keys; + - dense accumulator arrays. +3. 
Cython dense integer-key path: + - fused integer key dtypes; + - `size`, `count`, `sum`, `mean`, `min`, `max`. +4. Cython integral-float dense path: + - integral `float32`/`float64` keys for selected dense cases. +5. Cython arbitrary-float hash path: + - non-integral `float32`/`float64` keys; + - normalized NaN and signed-zero semantics. +6. Cython two-key hash path: + - two integer/dictionary-code-backed keys; + - float value reductions. +7. Public array-level `blosc2.group_reduce()`: + - uses optimized kernels opportunistically without requiring a `CTable`. All optimized paths are conservative and fall back to the generic engine when unsupported data or semantics are encountered. -## Deferred / future work - -### Integer-key Cython coverage - -Completed for dense compact single-key group-by with fused kernels covering -`int8`, `uint8`, `int16`, `uint16`, `int32`, `uint32`, `int64`, and `uint64`. -The dense path still falls back for negative non-null keys and non-compact key -ranges. +## Future work -### More Cython aggregations - -Completed for dense compact integer/dictionary-code single keys: - -- `size`; -- `count`; -- `sum`; -- `mean` via sum/count; -- `min`; -- `max`. +### Fuse multiple aggregations/value columns in Cython -Remaining possible extensions in this area: +Current optimized paths often run separate kernels or maintain generic state. +Future work could: -- fuse multiple aggregations/value columns into one Cython pass; +- fuse multiple aggregations in a single pass; +- support multiple value columns directly; +- specialize kernels by requested operation so, for example, a `sum` workload + does not maintain min/max state; - broaden value-type coverage beyond float64/int64 normalized kernels. -### Arbitrary float-key hash table - -Implemented a conservative Cython open-addressing hash path for single -`float32`/`float64` keys with float value aggregations. 
It supports `size`, -`count`, `sum`, `mean`, `min`, and `max` for supported single-value-column -queries and falls back otherwise. - -Implemented semantics: - -- `dropna=True`: skip NaN keys; -- `dropna=False`: all NaN keys form one group; -- `+0.0` and `-0.0` are normalized into the same group; -- infinities are valid groups through regular float bit hashing; -- NaN-null float values are skipped for value aggregations. - -Remaining possible extensions: - -- support non-float value columns in the hash path without normalizing through - float64; -- fuse multiple value columns directly in one hash-table pass; -- add explicit memory/cardinality safeguards for very high-cardinality floats. +### Extend multi-key optimized paths -### Multi-key Cython hash path - -Implemented a conservative Cython hash path for two-key group-by when both keys -are integer or dictionary-code-backed columns. The path normalizes keys to -`int64`, hashes `(key0, key1)` directly, and supports `size`, `count`, `sum`, -`mean`, `min`, and `max` for supported float value reductions. This avoids -structured-array packing and per-chunk `np.unique` for common two-key -categorical/integer workloads. - -Remaining possible extensions: +Current Cython multi-key support is intentionally narrow. +Future work could: - support more than two key columns; -- support float/string fixed-width key components directly; -- support non-float value columns without normalizing value reductions through - float64; -- fuse/merge multi-key states across chunks fully in Cython rather than via the - existing Python accumulator merge. +- support float key components directly; +- support fixed-width string/bytes key components directly; +- support non-float value columns without normalizing reductions through float64; +- merge multi-key states fully in Cython instead of via Python accumulators; +- add a dense two-integer-key path for compact Cartesian key domains. 
-### FULL-index sorted group-by path +### Revisit FULL-index sorted group-by only with a better design -A FULL index on a single grouping key can provide sorted positions. A prototype -Python/NumPy sorted-scan path was implemented and then reverted after -benchmarking because it was not competitive with the existing dense/hash paths. +A Python/NumPy FULL-index sorted-scan prototype was implemented and reverted +after benchmarking because it was not competitive with existing dense/hash paths. Prototype behavior: @@ -303,8 +357,8 @@ Why the prototype was slow: - value aggregations required many scattered gathers from the original value column, one gathered position set per key run; -- scattered value access is much less cache/compression friendly than the - existing sequential dense/hash scans; +- scattered value access is much less cache/compression friendly than existing + sequential dense/hash scans; - the implementation still had Python-level run processing and result merging; - FULL index build cost is substantial unless the index already exists and can be reused many times; @@ -312,54 +366,13 @@ Why the prototype was slow: Recommendation: -- keep this deferred for now; +- keep this deferred; - do not reintroduce a Python-level FULL-index value-aggregation path; - revisit only with a block-aware/Cython reducer that batches sorted positions by physical chunks/blocks, or as part of a broader high-cardinality/sparse-key strategy; -- if revisited, benchmark primarily against high-cardinality non-compact keys - and already-existing FULL indexes, not compact dense-key workloads. - -### Public `blosc2.group_reduce()` - -Implemented a conservative public `blosc2.group_reduce()` array API for -single-key grouped reductions without requiring a `CTable`. 
- -Implemented API: - -```python -groups, result = blosc2.group_reduce( - keys, values=None, op="size", sort=False, dropna=True -) -``` - -Implemented operations: - -- `size`; -- `count`; -- `sum`; -- `mean`; -- `min`; -- `max`. - -Implemented semantics: - -- returns plain NumPy arrays `(groups, result)`; -- `size` counts rows and does not require values; -- `count` counts non-NaN values; -- `dropna=True` skips NaN float keys; -- `dropna=False` keeps one normalized NaN group; -- `+0.0` and `-0.0` are normalized by the float hash path; -- optimized dense integer and arbitrary-float hash paths are used - opportunistically, with a NumPy/Python fallback. - -Remaining possible extensions: - -- multi-key public API; -- multiple aggregations in one call; -- multiple value columns; -- NDArray/chunked execution without eager NumPy conversion; -- optional CTable/persistent output. +- benchmark primarily against high-cardinality non-compact keys and + already-existing FULL indexes, not compact dense-key workloads. ### High-cardinality and memory strategy @@ -368,7 +381,7 @@ Future safeguards/features: - estimate cardinality from early chunks; - expose/keep an internal memory limit; - fall back to sort-based grouping when cardinality is too high; -- use FULL indexes when available; +- possibly use FULL indexes when available and demonstrably beneficial; - eventually implement partitioned hash group-by with spill-to-disk. ### Parallel execution @@ -379,36 +392,22 @@ Potential future optimization: - merge accumulators at chunk or partition boundaries; - coordinate with Blosc2 decompression threading to avoid oversubscription. 
-### Additional API conveniences - -Implemented group-by convenience methods: - -```python -t.group_by("city").sum("sales") -t.group_by("city").mean("sales") -t.group_by("city").min("sales") -t.group_by("city").max("sales") -``` +### Extend public `blosc2.group_reduce()` -These are equivalent to `agg({column: op})` and complement the already-existing -`size()` and `count(column)` group-by methods. +Remaining possible extensions: -Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are -clearly justified outside group-by. +- multi-key public API; +- multiple aggregations in one call; +- multiple value columns; +- NDArray/chunked execution without eager NumPy conversion; +- optional CTable/persistent output. ### Persistent output -The current result is an in-memory `CTable`. Future work may add an `out=` or -`urlpath=` option for persistent grouped output. +The current `CTable.group_by()` result is an in-memory `CTable`. Future work may +add an `out=` or `urlpath=` option for persistent grouped output. -## Related untracked files reviewed +### Top-level CTable count/size semantics -During cleanup, these untracked files were reviewed and found non-duplicative: - -```text -tests/ctable/test_nested_append.py -bench/ctable/bench_nested_filter_index.py -``` - -They cover direct nested append/extend correctness and nested flat-vs-dotted -performance comparisons, respectively, and are worth keeping/adding separately. +Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are +clearly justified outside group-by. 
From 74b6ee246b7bca20fd28a9ea3fecb49fc512d4e9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 13:48:24 +0200 Subject: [PATCH 16/17] Implemented persistent grouped output via `urlpath=` --- doc/reference/ctable.rst | 5 ++++ plans/ctable-groupby.md | 30 ++++++++++++++++++++---- src/blosc2/groupby.py | 45 ++++++++++++++++++++++-------------- tests/ctable/test_groupby.py | 23 ++++++++++++++++++ 4 files changed, 81 insertions(+), 22 deletions(-) diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst index 8ccf74b8..ad6f5b9c 100644 --- a/doc/reference/ctable.rst +++ b/doc/reference/ctable.rst @@ -263,6 +263,11 @@ one row per group:: mins = by_city.min("sales") maxs = by_city.max("sales") +Grouped results are in-memory by default. Pass ``urlpath=`` to a terminal +method to write the result as a persistent :class:`CTable`:: + + totals = by_city.sum("sales", urlpath="sales_by_city.b2d") + .. autoclass:: CTableGroupBy :members: size, count, sum, mean, min, max, agg diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md index 522fba47..4587f748 100644 --- a/plans/ctable-groupby.md +++ b/plans/ctable-groupby.md @@ -21,7 +21,9 @@ Implemented API decisions: - `CTable.group_by(...)` returns a lightweight `CTableGroupBy` facade. - `CTableGroupBy` is a deferred operation builder, not a `CTable` view. -- Terminal methods materialize a new in-memory `CTable`. +- Terminal methods materialize a new `CTable`. +- Results are in-memory by default and persistent when terminal methods receive + `urlpath=`. - Aggregate result columns are suffixed as `<column>_<op>`. - `GroupBy.size()` means row count per group / SQL `COUNT(*)`. - `GroupBy.count(column)` means non-null count / SQL `COUNT(column)`. @@ -44,6 +46,22 @@ t.group_by("city").max("sales") These are equivalent to `agg({column: op})` and complement `size()` and `count(column)`. 
+### Persistent grouped output + +Implemented `urlpath=` on group-by terminal methods for persistent grouped +output: + +```python +t.group_by("city").size(urlpath="counts.b2d") +t.group_by("city").count("sales", urlpath="sales_count.b2d") +t.group_by("city").sum("sales", urlpath="sales_sum.b2d") +t.group_by("city").agg({"sales": "mean"}, urlpath="sales_mean.b2d") +``` + +The result remains an in-memory `CTable` when `urlpath` is omitted. When +`urlpath` is supplied, the grouped result is written with `mode="w"` semantics +and returned as the newly created persistent `CTable`. + ### Generic Python/NumPy implementation Implemented files: @@ -238,6 +256,7 @@ Documented: - `CTable.group_by()`; - returned `CTableGroupBy` object; - `size()`, `count()`, `sum()`, `mean()`, `min()`, `max()`, `agg()`; +- persistent grouped output via `urlpath=`; - examples for row counts, non-null counts, and grouped reductions; - public `blosc2.group_reduce()`. @@ -265,7 +284,8 @@ Coverage includes: - bad engine rejection; - optimized integer/dictionary/float variants; - arbitrary float-key hash behavior; -- public `group_reduce()` behavior and input validation. +- public `group_reduce()` behavior and input validation; +- persistent grouped output via `urlpath=`. ## Current design summary @@ -402,10 +422,10 @@ Remaining possible extensions: - NDArray/chunked execution without eager NumPy conversion; - optional CTable/persistent output. -### Persistent output +### Output storage controls -The current `CTable.group_by()` result is an in-memory `CTable`. Future work may -add an `out=` or `urlpath=` option for persistent grouped output. +Future extensions may add a more general `out=` parameter or expose additional +storage/cparams controls for grouped output. 
### Top-level CTable count/size semantics diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py index c5f14221..8e245548 100644 --- a/src/blosc2/groupby.py +++ b/src/blosc2/groupby.py @@ -93,52 +93,53 @@ def __init__( if table._is_list_column(col_info) or table._is_varlen_scalar_column(col_info): raise TypeError(f"Cannot group by variable-length/list column {name!r} in Phase 1") - def size(self): + def size(self, *, urlpath: str | None = None): """Return row counts per group as a new :class:`CTable`. This is equivalent to SQL ``COUNT(*)``: it counts rows in each group and - is independent of null values in non-key columns. + is independent of null values in non-key columns. If *urlpath* is + provided, the result is written as a persistent CTable at that path. """ - return self._execute([_AggSpec(None, "size", "size")]) + return self._execute([_AggSpec(None, "size", "size")], urlpath=urlpath) - def count(self, column: str): + def count(self, column: str, *, urlpath: str | None = None): """Return non-null value counts for *column* per group. This is equivalent to SQL ``COUNT(column)`` and to ``group_by(...).agg({column: "count"})``. """ col = self.table._logical_to_physical_name(column) - return self._execute([_AggSpec(col, "count", f"{col}_count")]) + return self._execute([_AggSpec(col, "count", f"{col}_count")], urlpath=urlpath) - def sum(self, column: str): + def sum(self, column: str, *, urlpath: str | None = None): """Return sums of *column* per group. This is equivalent to ``group_by(...).agg({column: "sum"})``. """ - return self.agg({column: "sum"}) + return self.agg({column: "sum"}, urlpath=urlpath) - def mean(self, column: str): + def mean(self, column: str, *, urlpath: str | None = None): """Return means of *column* per group. This is equivalent to ``group_by(...).agg({column: "mean"})``. 
""" - return self.agg({column: "mean"}) + return self.agg({column: "mean"}, urlpath=urlpath) - def min(self, column: str): + def min(self, column: str, *, urlpath: str | None = None): """Return minimum values of *column* per group. This is equivalent to ``group_by(...).agg({column: "min"})``. """ - return self.agg({column: "min"}) + return self.agg({column: "min"}, urlpath=urlpath) - def max(self, column: str): + def max(self, column: str, *, urlpath: str | None = None): """Return maximum values of *column* per group. This is equivalent to ``group_by(...).agg({column: "max"})``. """ - return self.agg({column: "max"}) + return self.agg({column: "max"}, urlpath=urlpath) - def agg(self, aggregations: Mapping[str, str | Sequence[str]]): + def agg(self, aggregations: Mapping[str, str | Sequence[str]], *, urlpath: str | None = None): """Aggregate value columns per group. Parameters @@ -150,7 +151,7 @@ def agg(self, aggregations: Mapping[str, str | Sequence[str]]): ``{"*": "size"``}. """ specs = self._normalize_aggs(aggregations) - return self._execute(specs) + return self._execute(specs, urlpath=urlpath) def _normalize_aggs(self, aggregations: Mapping[str, str | Sequence[str]]) -> list[_AggSpec]: if not isinstance(aggregations, Mapping) or not aggregations: @@ -201,8 +202,16 @@ def _validate_value_column(self, name: str) -> None: if self.table._is_dictionary_column(col_info): raise TypeError(f"Cannot aggregate dictionary column {name!r} in Phase 1") - def _execute(self, specs: list[_AggSpec]): + def _execute(self, specs: list[_AggSpec], *, urlpath: str | None = None): self._validate_output_names(specs) + old_result_urlpath = getattr(self, "_result_urlpath", None) + self._result_urlpath = urlpath + try: + return self._execute_with_result_target(specs) + finally: + self._result_urlpath = old_result_urlpath + + def _execute_with_result_target(self, specs: list[_AggSpec]): fast = self._try_execute_cython_dense_int_key(specs) if fast is not None: return fast @@ -1327,7 
+1336,9 @@ def _build_result(self, rows: list[dict[str, Any]], specs: list[_AggSpec]): fields.append((name, _python_type_for_spec(schema_specs[name]), b2_field(schema_specs[name]))) row_type = dataclasses.make_dataclass("CTableGroupByRow", fields) data = {name: [row[name] for row in rows] for name in columns} - return CTable(row_type, new_data=data, expected_size=max(len(rows), 1), validate=False) + urlpath = getattr(self, "_result_urlpath", None) + kwargs = {"urlpath": str(urlpath), "mode": "w"} if urlpath is not None else {} + return CTable(row_type, new_data=data, expected_size=max(len(rows), 1), validate=False, **kwargs) def _validate_output_names(self, specs: list[_AggSpec]) -> None: names = self.keys + [s.output_col for s in specs] diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py index ae884c42..fec1fca4 100644 --- a/tests/ctable/test_groupby.py +++ b/tests/ctable/test_groupby.py @@ -375,3 +375,26 @@ def test_groupby_convenience_numeric_methods(): assert rows(t.group_by("city", sort=True).max("qty")) == rows( t.group_by("city", sort=True).agg({"qty": "max"}) ) + + +def test_groupby_persistent_output_urlpath(tmp_path): + t = CTable(SalesRow, new_data=DATA) + path = tmp_path / "grouped.b2d" + + out = t.group_by("city", sort=True).agg({"qty": "sum"}, urlpath=path) + out.close() + + reopened = CTable.open(str(path), mode="r") + assert reopened.col_names == ["city", "qty_sum"] + assert rows(reopened) == [("Berlin", 6), ("Paris", 7), ("Rome", 8)] + + +def test_groupby_persistent_output_urlpath_on_convenience_method(tmp_path): + t = CTable(SalesRow, new_data=DATA) + path = tmp_path / "grouped_mean.b2d" + + out = t.group_by("city", sort=True).mean("qty", urlpath=path) + out.close() + + reopened = CTable.open(str(path), mode="r") + assert rows(reopened) == [("Berlin", 6.0), ("Paris", 7 / 3), ("Rome", 4.0)] From 4e137133382b5098044fa847b261d6adee533cd8 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 15 May 2026 14:28:45 +0200 Subject: 
[PATCH 17/17] Protect tests when pyarrow is not installed --- tests/ctable/test_object_spec.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ctable/test_object_spec.py b/tests/ctable/test_object_spec.py index 30fa7258..9b6154dc 100644 --- a/tests/ctable/test_object_spec.py +++ b/tests/ctable/test_object_spec.py @@ -42,6 +42,7 @@ def test_object_column_persistence(tmp_path): def test_object_column_to_arrow_raises(): + pytest.importorskip("pyarrow") t = CTable(ObjectRow) t.append([1, {"x": 1}]) with pytest.raises(TypeError, match="ObjectSpec columns"):