diff --git a/CMakeLists.txt b/CMakeLists.txt index 734a4fea..ff4425a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,13 @@ add_custom_command( DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/indexing_ext.pyx" VERBATIM) +add_custom_command( + OUTPUT groupby_ext.c + COMMAND Python::Interpreter -m cython + "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/groupby_ext.pyx" --output-file groupby_ext.c + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/groupby_ext.pyx" + VERBATIM) + # ...and add it to the target Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI) target_sources(blosc2_ext PRIVATE src/blosc2/matmul_kernels.c) @@ -59,10 +66,12 @@ if(UNIX) target_link_libraries(blosc2_ext PRIVATE ${CMAKE_DL_LIBS}) endif() Python_add_library(indexing_ext MODULE indexing_ext.c WITH_SOABI) +Python_add_library(groupby_ext MODULE groupby_ext.c WITH_SOABI) # We need to link against NumPy target_link_libraries(blosc2_ext PRIVATE Python::NumPy) target_link_libraries(indexing_ext PRIVATE Python::NumPy) +target_link_libraries(groupby_ext PRIVATE Python::NumPy) # Fetch and build miniexpr library include(FetchContent) @@ -99,6 +108,7 @@ endif() target_compile_features(blosc2_ext PRIVATE c_std_11) target_compile_features(indexing_ext PRIVATE c_std_11) +target_compile_features(groupby_ext PRIVATE c_std_11) if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang") execute_process( COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir @@ -173,7 +183,7 @@ endif() # Python extension -> site-packages/blosc2 install( - TARGETS blosc2_ext indexing_ext + TARGETS blosc2_ext indexing_ext groupby_ext LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/blosc2 ) diff --git a/bench/ctable/bench_nested_filter_index.py b/bench/ctable/bench_nested_filter_index.py new file mode 100644 index 00000000..71d44112 --- /dev/null +++ b/bench/ctable/bench_nested_filter_index.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python +####################################################################### +# Copyright (c) 
2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Benchmark nested leaf filter/index performance vs flat columns. + +Compares a CTable with flat column names against an equivalent one that uses +dotted nested column names (physically stored under hierarchical _cols/ paths). +Both tables hold the same data; each filter/index/aggregate operation is timed +on both to show the overhead (or absence thereof) introduced by the nested layout. +""" + +from __future__ import annotations + +import argparse +import gc +import time +from dataclasses import dataclass + +import numpy as np + +import blosc2 + + +# --------------------------------------------------------------------------- +# Schema helpers +# --------------------------------------------------------------------------- + + +@dataclass +class FlatRow: + trip_begin_lon: float = blosc2.field(blosc2.float64()) + trip_begin_lat: float = blosc2.field(blosc2.float64()) + trip_end_lon: float = blosc2.field(blosc2.float64()) + trip_end_lat: float = blosc2.field(blosc2.float64()) + payment_fare: float = blosc2.field(blosc2.float64(ge=0)) + + +@dataclass +class NestedRow: + """Same physical columns as FlatRow but accessed via dotted names after creation.""" + + trip_begin_lon: float = blosc2.field(blosc2.float64()) + trip_begin_lat: float = blosc2.field(blosc2.float64()) + trip_end_lon: float = blosc2.field(blosc2.float64()) + trip_end_lat: float = blosc2.field(blosc2.float64()) + payment_fare: float = blosc2.field(blosc2.float64(ge=0)) + + +def _build_data(n: int) -> dict: + rng = np.random.default_rng(42) + return { + "trip_begin_lon": rng.uniform(-88.0, -87.5, n).astype(np.float64), + "trip_begin_lat": rng.uniform(41.6, 42.0, n).astype(np.float64), + "trip_end_lon": rng.uniform(-88.0, -87.5, n).astype(np.float64), + "trip_end_lat": rng.uniform(41.6, 42.0, n).astype(np.float64), + 
"payment_fare": rng.uniform(3.0, 50.0, n).astype(np.float64), + } + + +def _build_flat(data: dict, n: int) -> "blosc2.CTable": + t = blosc2.CTable(FlatRow, expected_size=n) + t.extend(data) + return t + + +def _build_nested(data: dict, n: int) -> "blosc2.CTable": + t = blosc2.CTable(NestedRow, expected_size=n) + t.extend(data) + # Rename to dotted nested names + t.rename_column("trip_begin_lon", "trip.begin.lon") + t.rename_column("trip_begin_lat", "trip.begin.lat") + t.rename_column("trip_end_lon", "trip.end.lon") + t.rename_column("trip_end_lat", "trip.end.lat") + t.rename_column("payment_fare", "payment.fare") + return t + + +# --------------------------------------------------------------------------- +# Timing helper +# --------------------------------------------------------------------------- + + +def _timeit(fn, repeats: int = 5) -> float: + gc.collect() + times = [] + for _ in range(repeats): + t0 = time.perf_counter() + fn() + times.append(time.perf_counter() - t0) + return min(times) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + p = argparse.ArgumentParser(description="Benchmark nested vs flat column filter/index/aggregate") + p.add_argument("--rows", type=int, default=1_000_000, help="Number of rows (default: 1M)") + p.add_argument("--repeats", type=int, default=5, help="Timing repeats (default: 5)") + args = p.parse_args() + + N = args.rows + R = args.repeats + + print(f"Building tables with {N:,} rows …") + data = _build_data(N) + flat_data = data.copy() # flat uses underscore names + nested_data = { + "trip_begin_lon": data["trip_begin_lon"], + "trip_begin_lat": data["trip_begin_lat"], + "trip_end_lon": data["trip_end_lon"], + "trip_end_lat": data["trip_end_lat"], + "payment_fare": data["payment_fare"], + } + + tf = _build_flat(flat_data, N) + tn = _build_nested(nested_data, N) + print(f" flat 
col_names: {tf.col_names}") + print(f" nested col_names: {tn.col_names}") + print() + + # Build indexes on the fare column for index-accelerated queries + print("Building indexes …") + tf.create_index("payment_fare") + tn.create_index("payment.fare") + print() + + header = f"{'Operation':<45} {'flat (ms)':>12} {'nested (ms)':>13} {'ratio':>8}" + print(header) + print("-" * len(header)) + + def bench(label, flat_fn, nested_fn): + t_flat = _timeit(flat_fn, R) * 1000 + t_nested = _timeit(nested_fn, R) * 1000 + ratio = t_nested / t_flat if t_flat > 0 else float("nan") + print(f"{label:<45} {t_flat:>12.3f} {t_nested:>13.3f} {ratio:>8.3f}x") + + bench( + "where (string expr, full scan)", + lambda: tf.where("payment_fare > 20"), + lambda: tn.where("payment.fare > 20"), + ) + + bench( + "where (string expr, full scan, nrows)", + lambda: tf.where("payment_fare > 20").nrows, + lambda: tn.where("payment.fare > 20").nrows, + ) + + bench( + "where (LazyExpr, full scan)", + lambda: tf.where(tf["payment_fare"] > 20), + lambda: tn.where(tn["payment.fare"] > 20), + ) + + bench( + "where (auto index-accelerated, nrows)", + lambda: tf.where("payment_fare > 20").nrows, + lambda: tn.where("payment.fare > 20").nrows, + ) + + bench( + "column mean (full scan)", + lambda: tf["payment_fare"].mean(), + lambda: tn["payment.fare"].mean(), + ) + + bench( + "column sum (full scan)", + lambda: tf["payment_fare"].sum(), + lambda: tn["payment.fare"].sum(), + ) + + bench( + "column min (full scan)", + lambda: tf["trip_begin_lon"].min(), + lambda: tn["trip.begin.lon"].min(), + ) + + bench( + "multi-column where (string expr, nrows)", + lambda: tf.where("trip_begin_lon > -87.7 and payment_fare > 10").nrows, + lambda: tn.where("trip.begin.lon > -87.7 and payment.fare > 10").nrows, + ) + + bench( + "sort_by (single leaf)", + lambda: tf.sort_by("payment_fare"), + lambda: tn.sort_by("payment.fare"), + ) + + print() + print("ratio < 1 means nested is faster; ratio > 1 means flat is faster.") + 
print("Ratios close to 1.0 indicate the nested path adds negligible overhead.") + + +if __name__ == "__main__": + main() diff --git a/bench/ctable/groupby.py b/bench/ctable/groupby.py new file mode 100644 index 00000000..41929563 --- /dev/null +++ b/bench/ctable/groupby.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +"""Phase-1 CTable group_by benchmark. + +Examples +-------- +python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --op sum +python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --key-dtype float64 --op sum +# float key dtypes generate non-integral repeated labels to exercise the float hash path +python bench/ctable/groupby.py --rows 1_000_000 --groups 100 --dictionary --pandas +python bench/ctable/groupby.py --rows 10_000_000 --groups 1_000 --groups2 100 --multi-key --op sum +""" + +from __future__ import annotations + +import argparse +import dataclasses +import time +from pathlib import Path + +import numpy as np + +import blosc2 + + +def parse_int(text: str) -> int: + return int(text.replace("_", "")) + + +def build_row_type(dictionary: bool, key_dtype: str, multi_key: bool): + if dictionary and multi_key: + + @dataclasses.dataclass + class Row: + key0: str = blosc2.field(blosc2.dictionary()) + key1: int = blosc2.field(blosc2.int32()) + value: float = blosc2.field(blosc2.float64()) + + elif dictionary: + + @dataclasses.dataclass + class Row: + key: str = blosc2.field(blosc2.dictionary()) + value: float = blosc2.field(blosc2.float64()) + + elif key_dtype in {"int8", "uint8", "int16", "uint16", "int32", "uint32", "int64", "uint64"}: + key_spec = getattr(blosc2, key_dtype)() + + if multi_key: + + @dataclasses.dataclass + class Row: + key0: int = blosc2.field(key_spec) + key1: int = blosc2.field(key_spec) + value: float = blosc2.field(blosc2.float64()) + + else: + + @dataclasses.dataclass + class Row: + key: int = blosc2.field(key_spec) + value: float = blosc2.field(blosc2.float64()) + + elif key_dtype in {"float32", "float64"}: + 
key_spec = blosc2.float32() if key_dtype == "float32" else blosc2.float64() + + if multi_key: + + @dataclasses.dataclass + class Row: + key0: float = blosc2.field(key_spec) + key1: float = blosc2.field(key_spec) + value: float = blosc2.field(blosc2.float64()) + + else: + + @dataclasses.dataclass + class Row: + key: float = blosc2.field(key_spec) + value: float = blosc2.field(blosc2.float64()) + + else: # pragma: no cover - argparse choices prevent this + raise ValueError(f"unsupported key dtype {key_dtype!r}") + + return Row + + +def make_key_data(key_codes: np.ndarray, dictionary: bool, key_dtype: str): + if dictionary: + return np.asarray([f"k{code}" for code in key_codes], dtype=object) + if key_dtype in {"float32", "float64"}: + # Use non-integral, repeated float labels by default so float-key + # benchmarks exercise the arbitrary-float hash path instead of the + # dense integral-float fast path. + labels = key_codes.astype(np.float64) + 0.25 + return labels.astype(np.dtype(key_dtype)) + return key_codes.astype(np.dtype(key_dtype), copy=False) + + +def make_data(nrows: int, ngroups: int, ngroups2: int, dictionary: bool, key_dtype: str, multi_key: bool, seed: int): + rng = np.random.default_rng(seed) + key_codes = rng.integers(0, ngroups, size=nrows, dtype=np.int32) + values = rng.random(nrows, dtype=np.float64) + if not multi_key: + return {"key": make_key_data(key_codes, dictionary, key_dtype), "value": values} + + key2_codes = rng.integers(0, ngroups2, size=nrows, dtype=np.int32) + key0 = make_key_data(key_codes, dictionary, key_dtype) + key1_dtype = "int32" if dictionary else key_dtype + key1 = make_key_data(key2_codes, False, key1_dtype) + return {"key0": key0, "key1": key1, "value": values} + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--rows", type=parse_int, default=10_000_000) + parser.add_argument("--groups", type=parse_int, default=1_000) + parser.add_argument("--groups2", type=parse_int, 
default=None, help="Number of groups for key1 with --multi-key") + parser.add_argument("--chunk-size", type=parse_int, default=None) + parser.add_argument("--dictionary", action="store_true", help="Use a dictionary-encoded string key") + parser.add_argument( + "--key-dtype", + choices=[ + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float32", + "float64", + ], + default="int32", + help="Physical dtype for non-dictionary keys. Float keys are generated as non-integral repeated labels.", + ) + parser.add_argument("--op", choices=["size", "count", "sum", "mean", "min", "max"], default="sum") + parser.add_argument("--multi-key", action="store_true", help="Group by two keys: key0 and key1") + parser.add_argument("--sort", action="store_true") + parser.add_argument("--pandas", action="store_true", help="Also run a pandas comparison if available") + parser.add_argument("--urlpath", type=Path, default=None, help="Optional persistent CTable path") + parser.add_argument("--seed", type=int, default=0) + args = parser.parse_args() + + groups2 = args.groups if args.groups2 is None else args.groups2 + print( + f"rows={args.rows:,} groups={args.groups:,} groups2={groups2:,} multi_key={args.multi_key} " + f"dictionary={args.dictionary} key_dtype={args.key_dtype} op={args.op} sort={args.sort} " + f"chunk_size={args.chunk_size} urlpath={args.urlpath}" + ) + + data = make_data(args.rows, args.groups, groups2, args.dictionary, args.key_dtype, args.multi_key, args.seed) + Row = build_row_type(args.dictionary, args.key_dtype, args.multi_key) + + kwargs = {} + if args.urlpath is not None: + kwargs.update(urlpath=str(args.urlpath), mode="w") + + t0 = time.perf_counter() + table = blosc2.CTable(Row, new_data=data, expected_size=args.rows, **kwargs) + build_time = time.perf_counter() - t0 + print(f"ctable_build_seconds={build_time:.6f}") + + t0 = time.perf_counter() + group_keys = ["key0", "key1"] if args.multi_key else "key" + gb = 
table.group_by(group_keys, sort=args.sort, chunk_size=args.chunk_size) + if args.op == "size": + out = gb.size() + elif args.op == "count": + out = gb.count("value") + else: + out = gb.agg({"value": args.op}) + elapsed = time.perf_counter() - t0 + print(f"ctable_groupby_seconds={elapsed:.6f}") + print(f"result_rows={out.nrows:,}") + + if args.pandas: + try: + import pandas as pd + except ImportError: + print("pandas_unavailable=true") + else: + df = pd.DataFrame(data) + t0 = time.perf_counter() + if args.op == "size": + pdf = df.groupby(group_keys, sort=args.sort).size() + elif args.op == "count": + pdf = df.groupby(group_keys, sort=args.sort)["value"].count() + else: + pdf = df.groupby(group_keys, sort=args.sort)["value"].agg(args.op) + pandas_elapsed = time.perf_counter() - t0 + print(f"pandas_groupby_seconds={pandas_elapsed:.6f}") + print(f"pandas_result_rows={len(pdf):,}") + + table.close() + + +if __name__ == "__main__": + main() diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst index 12e99ea0..ad6f5b9c 100644 --- a/doc/reference/ctable.rst +++ b/doc/reference/ctable.rst @@ -233,6 +233,7 @@ When a NumPy structured array is needed, materialize explicitly:: CTable.sample CTable.sort_by CTable.iter_sorted + CTable.group_by .. automethod:: CTable.where .. automethod:: CTable.view @@ -242,6 +243,33 @@ When a NumPy structured array is needed, materialize explicitly:: .. automethod:: CTable.sample .. automethod:: CTable.sort_by .. automethod:: CTable.iter_sorted +.. automethod:: CTable.group_by + + +Group-by reductions +------------------- + +:meth:`CTable.group_by` returns a lightweight deferred group-by object. 
It is +not a table view; methods such as :meth:`~blosc2.CTableGroupBy.size`, +:meth:`~blosc2.CTableGroupBy.count`, :meth:`~blosc2.CTableGroupBy.sum`, and +:meth:`~blosc2.CTableGroupBy.agg` materialize a new :class:`CTable` with +one row per group:: + + by_city = t.group_by("city", sort=True) + counts = by_city.size() # row count per city / COUNT(*) + non_null = by_city.count("sales") # non-null sales count / COUNT(sales) + totals = by_city.sum("sales") # equivalent to agg({"sales": "sum"}) + means = by_city.mean("sales") + mins = by_city.min("sales") + maxs = by_city.max("sales") + +Grouped results are in-memory by default. Pass ``urlpath=`` to a terminal +method to write the result as a persistent :class:`CTable`:: + + totals = by_city.sum("sales", urlpath="sales_by_city.b2d") + +.. autoclass:: CTableGroupBy + :members: size, count, sum, mean, min, max, agg Mutations diff --git a/doc/reference/reduction_functions.rst b/doc/reference/reduction_functions.rst index 4c21c150..5122807b 100644 --- a/doc/reference/reduction_functions.rst +++ b/doc/reference/reduction_functions.rst @@ -14,6 +14,7 @@ Reduction operations can be used with any of :ref:`NDArray `, :ref:`C2A argmax argmin count_nonzero + group_reduce cumulative_prod cumulative_sum max @@ -31,6 +32,7 @@ Reduction operations can be used with any of :ref:`NDArray `, :ref:`C2A .. autofunction:: blosc2.argmax .. autofunction:: blosc2.argmin .. autofunction:: blosc2.count_nonzero +.. autofunction:: blosc2.group_reduce .. autofunction:: blosc2.cumulative_prod .. autofunction:: blosc2.cumulative_sum .. autofunction:: blosc2.max diff --git a/plans/ctable-groupby.md b/plans/ctable-groupby.md new file mode 100644 index 00000000..4587f748 --- /dev/null +++ b/plans/ctable-groupby.md @@ -0,0 +1,433 @@ +# CTable `group_by` implementation plan — status + +This document started as the implementation plan for `CTable.group_by()`. The +core API and several optimized execution paths are now implemented. 
The first +section records completed work; the final section lists remaining future work. + +## Completed + +### Public `CTable.group_by()` API + +Implemented: + +```python +t.group_by("city").size() +t.group_by("city").count("sales") +t.group_by("city").agg({"sales": "sum"}) +t.group_by(["country", "city"]).agg({"sales": ["sum", "mean"]}) +``` + +Implemented API decisions: + +- `CTable.group_by(...)` returns a lightweight `CTableGroupBy` facade. +- `CTableGroupBy` is a deferred operation builder, not a `CTable` view. +- Terminal methods materialize a new `CTable`. +- Results are in-memory by default and persistent when terminal methods receive + `urlpath=`. +- Aggregate result columns are named `{column}_{op}`. +- `CTableGroupBy.size()` means row count per group / SQL `COUNT(*)`. +- `CTableGroupBy.count(column)` means non-null count / SQL `COUNT(column)`. +- `CTableGroupBy.agg({"col": "count"})` is equivalent to `CTableGroupBy.count("col")`. +- `sort=False` is the fast default; `sort=True` sorts output by group keys. +- `dropna=True` is the default; `dropna=False` keeps null/NaN key groups. +- No top-level `CTable.size()` or `CTable.count()` was added. + +### Convenience group-by methods + +Implemented group-by convenience methods: + +```python +t.group_by("city").sum("sales") +t.group_by("city").mean("sales") +t.group_by("city").min("sales") +t.group_by("city").max("sales") +``` + +These are equivalent to `agg({column: op})` and complement `size()` and +`count(column)`. + +### Persistent grouped output + +Implemented `urlpath=` on group-by terminal methods for persistent grouped +output: + +```python +t.group_by("city").size(urlpath="counts.b2d") +t.group_by("city").count("sales", urlpath="sales_count.b2d") +t.group_by("city").sum("sales", urlpath="sales_sum.b2d") +t.group_by("city").agg({"sales": "mean"}, urlpath="sales_mean.b2d") +``` + +The result remains an in-memory `CTable` when `urlpath` is omitted. 
When +`urlpath` is supplied, the grouped result is written with `mode="w"` semantics +and returned as the newly created persistent `CTable`. + +### Generic Python/NumPy implementation + +Implemented files: + +```text +src/blosc2/ctable.py # CTable.group_by() +src/blosc2/groupby.py # CTableGroupBy, NumPy fallback, public group_reduce() +``` + +Implemented functionality: + +- Chunked, columnar traversal. +- Reads only group keys, aggregation value columns, and `_valid_rows`. +- Handles live rows, views, and deleted rows. +- Supports fixed-width scalar keys and dictionary-encoded string keys. +- Dictionary keys group by codes and decode only for result materialization. +- Supports `size`, `count`, `sum`, `mean`, `min`, `max`. +- Supports multi-key group-by via structured NumPy keys. +- Supports empty inputs. +- Falls back to the generic NumPy path for unsupported optimized cases. + +### Benchmark harness + +Implemented/extended: + +```text +bench/ctable/groupby.py +``` + +The benchmark can vary: + +- row count; +- group cardinality; +- key dtype via `--key-dtype` including integer, unsigned integer, and float dtypes; +- dictionary keys via `--dictionary`; +- operation via `--op size|count|sum|mean|min|max`; +- sorted output; +- chunk size; +- multi-key mode via `--multi-key` and `--groups2`; +- optional persistent `urlpath`; +- optional pandas comparison. + +Float key benchmarks now generate non-integral repeated labels by default so +`float32`/`float64` runs exercise the arbitrary-float hash path instead of the +integral-float dense path. + +### Dedicated Cython extension + +Implemented: + +```text +src/blosc2/groupby_ext.pyx +``` + +Build integration: + +- `CMakeLists.txt` builds, links, and installs `groupby_ext`. +- Group-by kernels were removed from `indexing_ext.pyx`. +- `src/blosc2/groupby.py` imports `blosc2.groupby_ext` for optimized kernels. + +Rationale: + +- Group-by kernels are analytics/query execution code, not indexing internals. 
+- A dedicated extension keeps separation of concerns cleaner as optimized paths grow. + +### Dense integer-key Cython coverage + +Implemented fused dense integer-key Cython kernels covering: + +- `int8`, `uint8`; +- `int16`, `uint16`; +- `int32`, `uint32`; +- `int64`, `uint64`. + +Implemented dense integer/dictionary-code Cython path for: + +- `size`; +- `count`; +- `sum`; +- `mean` via sum/count; +- `min`; +- `max`. + +Additional details: + +- Uses compact dense accumulator arrays. +- Falls back for negative non-null keys and non-compact key ranges. +- Supports float64 value kernels with NaN-null skipping where applicable. +- Supports int64-normalized integer/bool value kernels for `sum`, `min`, and `max`. +- Tracks key presence separately so groups with all-null values are emitted correctly. + +Representative benchmark improvements observed during earlier optimization: + +```text +50M rows, 5k int32 groups, float64 sum: + generic/early path: ~0.47 s + Cython dense path: ~0.20–0.22 s + +50M rows, 5k float64 integral groups, float64 sum: + generic path: ~5.51 s + Cython dense path: ~0.27–0.29 s + +50M rows, 5k float32 integral groups, float64 sum: + Cython dense path: ~0.24–0.25 s +``` + +### Arbitrary float-key hash path + +Implemented a conservative Cython open-addressing hash path for single +`float32`/`float64` keys with float value aggregations. + +Implemented operations: + +- `size`; +- `count`; +- `sum`; +- `mean`; +- `min`; +- `max`. + +Implemented semantics: + +- `dropna=True`: skip NaN keys; +- `dropna=False`: all NaN keys form one group; +- `+0.0` and `-0.0` are normalized into the same group; +- infinities are valid groups through regular float bit hashing; +- NaN-null float values are skipped for value aggregations. + +### Two-key Cython hash path + +Implemented a conservative Cython hash path for two-key group-by when both keys +are integer or dictionary-code-backed columns. 
+ +Implemented behavior: + +- normalizes keys to `int64`; +- hashes `(key0, key1)` directly; +- supports `size`, `count`, `sum`, `mean`, `min`, and `max` for supported float + value reductions; +- avoids structured-array packing and per-chunk `np.unique` for common two-key + categorical/integer workloads; +- falls back for unsupported cases. + +Benchmarks showed this is functionally useful but still leaves room for future +optimization because partial states are merged in Python and the generic hash +kernel maintains more state than a specialized one-operation kernel needs. + +### Public `blosc2.group_reduce()` + +Implemented a conservative public array API for single-key grouped reductions +without requiring a `CTable`. + +Implemented API: + +```python +groups, result = blosc2.group_reduce( + keys, values=None, op="size", sort=False, dropna=True +) +``` + +Implemented operations: + +- `size`; +- `count`; +- `sum`; +- `mean`; +- `min`; +- `max`. + +Implemented semantics: + +- returns plain NumPy arrays `(groups, result)`; +- `size` counts rows and does not require values; +- `count` counts non-NaN values; +- `dropna=True` skips NaN float keys; +- `dropna=False` keeps one normalized NaN group; +- `+0.0` and `-0.0` are normalized by the float hash path; +- optimized dense integer and arbitrary-float hash paths are used + opportunistically, with a NumPy/Python fallback. + +### Documentation + +Implemented/updated user-facing documentation in: + +```text +doc/reference/ctable.rst +doc/reference/reduction_functions.rst +``` + +Documented: + +- `CTable.group_by()`; +- returned `CTableGroupBy` object; +- `size()`, `count()`, `sum()`, `mean()`, `min()`, `max()`, `agg()`; +- persistent grouped output via `urlpath=`; +- examples for row counts, non-null counts, and grouped reductions; +- public `blosc2.group_reduce()`. 
+ +### Tests + +Implemented/extended: + +```text +tests/ctable/test_groupby.py +tests/test_group_reduce.py +``` + +Coverage includes: + +- `size()` row counts; +- `count(column)` non-null counts; +- `agg()` with `sum`, `mean`, `min`, `max`, `count`; +- convenience `sum`, `mean`, `min`, `max` methods; +- `agg({"*": "size"})`; +- multi-key group-by; +- dictionary string keys; +- views and deleted rows; +- empty tables; +- `dropna=True` / `dropna=False` behavior; +- bad engine rejection; +- optimized integer/dictionary/float variants; +- arbitrary float-key hash behavior; +- public `group_reduce()` behavior and input validation; +- persistent grouped output via `urlpath=`. + +## Current design summary + +The implementation now has these execution layers: + +1. Generic chunked NumPy path: + - broadest semantics; + - per-chunk local grouping and global merge. +2. Dense NumPy single-key path: + - compact non-negative integer/dictionary-code keys; + - dense accumulator arrays. +3. Cython dense integer-key path: + - fused integer key dtypes; + - `size`, `count`, `sum`, `mean`, `min`, `max`. +4. Cython integral-float dense path: + - integral `float32`/`float64` keys for selected dense cases. +5. Cython arbitrary-float hash path: + - non-integral `float32`/`float64` keys; + - normalized NaN and signed-zero semantics. +6. Cython two-key hash path: + - two integer/dictionary-code-backed keys; + - float value reductions. +7. Public array-level `blosc2.group_reduce()`: + - uses optimized kernels opportunistically without requiring a `CTable`. + +All optimized paths are conservative and fall back to the generic engine when +unsupported data or semantics are encountered. + +## Future work + +### Fuse multiple aggregations/value columns in Cython + +Current optimized paths often run separate kernels or maintain generic state. 
+Future work could: + +- fuse multiple aggregations in a single pass; +- support multiple value columns directly; +- specialize kernels by requested operation so, for example, a `sum` workload + does not maintain min/max state; +- broaden value-type coverage beyond float64/int64 normalized kernels. + +### Extend multi-key optimized paths + +Current Cython multi-key support is intentionally narrow. +Future work could: + +- support more than two key columns; +- support float key components directly; +- support fixed-width string/bytes key components directly; +- support non-float value columns without normalizing reductions through float64; +- merge multi-key states fully in Cython instead of via Python accumulators; +- add a dense two-integer-key path for compact Cartesian key domains. + +### Revisit FULL-index sorted group-by only with a better design + +A Python/NumPy FULL-index sorted-scan prototype was implemented and reverted +after benchmarking because it was not competitive with existing dense/hash paths. 
+ +Prototype behavior: + +```text +read sorted values/positions from FULL sidecars +scan contiguous key runs +respect _valid_rows +reduce each run +emit sorted groups naturally +``` + +Observed benchmark results on 50M rows / 5k compact groups: + +```text +float64 key, sum, sort=True, FULL index: + index build: ~6.2 s + group_by: ~104 s + +int64 key, sum, sort=True, FULL index: + index build: ~5.5 s + group_by: ~102 s + +int64 key, size, sort=True, FULL index: + index build: ~5.5 s + group_by: ~0.45 s + +int64 key, size, sort=False, no FULL index: + group_by: ~0.14 s +``` + +Why the prototype was slow: + +- value aggregations required many scattered gathers from the original value + column, one gathered position set per key run; +- scattered value access is much less cache/compression friendly than existing + sequential dense/hash scans; +- the implementation still had Python-level run processing and result merging; +- FULL index build cost is substantial unless the index already exists and can + be reused many times; +- compact integer-key workloads are already ideal for dense accumulator arrays. + +Recommendation: + +- keep this deferred; +- do not reintroduce a Python-level FULL-index value-aggregation path; +- revisit only with a block-aware/Cython reducer that batches sorted positions + by physical chunks/blocks, or as part of a broader high-cardinality/sparse-key + strategy; +- benchmark primarily against high-cardinality non-compact keys and + already-existing FULL indexes, not compact dense-key workloads. + +### High-cardinality and memory strategy + +Future safeguards/features: + +- estimate cardinality from early chunks; +- expose/keep an internal memory limit; +- fall back to sort-based grouping when cardinality is too high; +- possibly use FULL indexes when available and demonstrably beneficial; +- eventually implement partitioned hash group-by with spill-to-disk. 
+ +### Parallel execution + +Potential future optimization: + +- per-thread local accumulators; +- merge accumulators at chunk or partition boundaries; +- coordinate with Blosc2 decompression threading to avoid oversubscription. + +### Extend public `blosc2.group_reduce()` + +Remaining possible extensions: + +- multi-key public API; +- multiple aggregations in one call; +- multiple value columns; +- NDArray/chunked execution without eager NumPy conversion; +- optional CTable/persistent output. + +### Output storage controls + +Future extensions may add a more general `out=` parameter or expose additional +storage/cparams controls for grouped output. + +### Top-level CTable count/size semantics + +Do not add top-level `CTable.size()` / `CTable.count()` until their semantics are +clearly justified outside group-by. diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 8a587c06..ee258655 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -628,6 +628,7 @@ def _raise(exc): # Note: bool, bytes, string shadow builtins in the blosc2 namespace by design — # they are schema spec constructors (b2.bool(), b2.bytes(), etc.). 
from .ctable import DEFAULT_NULL_POLICY, Column, CTable, NullPolicy, get_null_policy, null_policy +from .groupby import CTableGroupBy, group_reduce from .ndarray import ( abs, acos, @@ -801,9 +802,12 @@ def _raise(exc): "uint64", "vlbytes", "vlstring", + # Grouped reductions + "group_reduce", # Classes "C2Array", "CParams", + "CTableGroupBy", "Batch", "BatchArray", # Enums diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index 1f80dc3d..56a89583 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -20,7 +20,7 @@ import re import shutil from collections import namedtuple -from collections.abc import Iterable, Mapping +from collections.abc import Iterable, Mapping, Sequence from dataclasses import MISSING, dataclass from dataclasses import field as dataclass_field from textwrap import TextWrapper @@ -2338,12 +2338,15 @@ def _init_columns( ) continue if self._is_dictionary_column(col): - self._cols[col.name] = storage.create_dictionary_column( + dict_col = storage.create_dictionary_column( col.name, spec=col.spec, cparams=col_storage.get("cparams"), dparams=col_storage.get("dparams"), ) + if len(dict_col.codes) < expected_size: + dict_col.resize((expected_size,)) + self._cols[col.name] = dict_col continue # Recompute chunks/blocks using the actual dtype so that wide # string columns (e.g. U183642) don't produce multi-GB chunks. @@ -3482,6 +3485,47 @@ def select(self, cols: list[str]) -> CTable: obj._col_widths = {name: self._col_widths[name] for name in cols if name in self._col_widths} return obj + def group_by( + self, + keys: str | Sequence[str], + *, + sort: bool = False, + dropna: bool = True, + engine: str = "auto", + chunk_size: int | None = None, + ): + """Return a deferred group-by object for this table. + + Parameters + ---------- + keys: + Column name or sequence of column names to group by. + sort: + If ``True``, sort the result by the group keys. The default + ``False`` preserves the hash aggregation order and is usually + faster. 
+ dropna: + If ``True`` (default), rows with null/NaN group keys are skipped. + If ``False``, null/NaN keys form their own group. + engine: + Execution engine. Phase 1 accepts ``"auto"`` and uses the NumPy + chunked implementation. + chunk_size: + Optional number of physical rows processed per chunk. + + Returns + ------- + CTableGroupBy + A lightweight deferred operation builder. Call methods such as + ``.size()``, ``.count(column)`` or ``.agg({...})`` to materialize a + grouped result as a new :class:`CTable`. + """ + if engine != "auto": + raise ValueError("Only engine='auto' is supported for group_by() in Phase 1") + from blosc2.groupby import CTableGroupBy + + return CTableGroupBy(self, keys, sort=sort, dropna=dropna, engine=engine, chunk_size=chunk_size) + def describe(self) -> None: """Print a per-column statistical summary. diff --git a/src/blosc2/groupby.py b/src/blosc2/groupby.py new file mode 100644 index 00000000..8e245548 --- /dev/null +++ b/src/blosc2/groupby.py @@ -0,0 +1,1765 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Group-by support for :class:`blosc2.CTable`. + +This module contains the Phase-1, NumPy-based implementation. It is deliberately +chunked and columnar: only grouping columns, aggregation columns, and the +live-row mask are read from the source table. 
+""" + +from __future__ import annotations + +import copy +import dataclasses +import math +import re +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Any, Literal + +import numpy as np + +from blosc2.schema import DictionarySpec, SchemaSpec, float64, int64 +from blosc2.schema import bool as b2_bool +from blosc2.schema import field as b2_field + +if TYPE_CHECKING: # pragma: no cover + from blosc2.ctable import CTable + + +AggName = Literal["size", "count", "sum", "mean", "min", "max"] + +_IDENTIFIER_RE = re.compile(r"^[A-Za-z_]\w*$") +_NAN_KEY = ("__blosc2_groupby_nan__",) + + +@dataclasses.dataclass +class _AggSpec: + input_col: str | None + op: AggName + output_col: str + + +@dataclasses.dataclass +class _AggState: + op: AggName + value: Any = None + count: int = 0 + + +class CTableGroupBy: + """Deferred group-by operation returned by :meth:`CTable.group_by`. + + The object stores the source table, grouping keys, and execution options. + It is not a :class:`CTable` view and does not materialize grouped data until + a terminal method such as :meth:`size`, :meth:`count`, or :meth:`agg` is + called. + """ + + def __init__( + self, + table: CTable, + keys: str | Sequence[str], + *, + sort: bool = False, + dropna: bool = True, + engine: str = "auto", + chunk_size: int | None = None, + ) -> None: + if isinstance(keys, str): + keys = [keys] + else: + keys = list(keys) + if not keys: + raise ValueError("group_by() requires at least one key column") + + self.table = table + self.keys = [table._logical_to_physical_name(k) for k in keys] + self.sort = bool(sort) + self.dropna = bool(dropna) + self.engine = engine + self.chunk_size = chunk_size + + for name in self.keys: + if name in table._computed_cols: + raise NotImplementedError("group_by() over computed columns is not supported yet") + if name not in table._cols: + raise KeyError(f"No column named {name!r}. 
Available: {table.col_names}") + col_info = table._schema.columns_by_name[name] + if table._is_list_column(col_info) or table._is_varlen_scalar_column(col_info): + raise TypeError(f"Cannot group by variable-length/list column {name!r} in Phase 1") + + def size(self, *, urlpath: str | None = None): + """Return row counts per group as a new :class:`CTable`. + + This is equivalent to SQL ``COUNT(*)``: it counts rows in each group and + is independent of null values in non-key columns. If *urlpath* is + provided, the result is written as a persistent CTable at that path. + """ + return self._execute([_AggSpec(None, "size", "size")], urlpath=urlpath) + + def count(self, column: str, *, urlpath: str | None = None): + """Return non-null value counts for *column* per group. + + This is equivalent to SQL ``COUNT(column)``. It builds the same + ``<physical column>_count`` output spec as + ``group_by(...).agg({column: "count"})`` and forwards *urlpath* to the + executor in the same way as :meth:`size`. + """ + # NOTE(review): unlike agg(), this does not call _validate_value_column(), + # so the column checks performed there are skipped here -- confirm this + # difference is intended. + col = self.table._logical_to_physical_name(column) + return self._execute([_AggSpec(col, "count", f"{col}_count")], urlpath=urlpath) + + def sum(self, column: str, *, urlpath: str | None = None): + """Return sums of *column* per group. + + This is equivalent to ``group_by(...).agg({column: "sum"})``. + """ + return self.agg({column: "sum"}, urlpath=urlpath) + + def mean(self, column: str, *, urlpath: str | None = None): + """Return means of *column* per group. + + This is equivalent to ``group_by(...).agg({column: "mean"})``. + """ + return self.agg({column: "mean"}, urlpath=urlpath) + + def min(self, column: str, *, urlpath: str | None = None): + """Return minimum values of *column* per group. + + This is equivalent to ``group_by(...).agg({column: "min"})``. + """ + return self.agg({column: "min"}, urlpath=urlpath) + + def max(self, column: str, *, urlpath: str | None = None): + """Return maximum values of *column* per group. + + This is equivalent to ``group_by(...).agg({column: "max"})``. 
+ """ + return self.agg({column: "max"}, urlpath=urlpath) + + def agg(self, aggregations: Mapping[str, str | Sequence[str]], *, urlpath: str | None = None): + """Aggregate value columns per group. + + Parameters + ---------- + aggregations: + Mapping from input column name to an aggregation name or list of + names. Supported operations in Phase 1 are ``"count"``, ``"sum"``, + ``"mean"``, ``"min"``, ``"max"`` and the special row-count spelling + ``{"*": "size"}``. Each result column is named + ``<physical column>_<op>``; the row count column is named ``size``. + urlpath: + Forwarded unchanged to ``_execute`` together with the normalized + aggregation specs. + + Raises + ------ + ValueError + If *aggregations* is not a non-empty mapping, a column lists no + operations, an operation is unsupported, or two output column + names collide. + """ + specs = self._normalize_aggs(aggregations) + return self._execute(specs, urlpath=urlpath) + + def _normalize_aggs(self, aggregations: Mapping[str, str | Sequence[str]]) -> list[_AggSpec]: + """Validate *aggregations* and flatten it into a list of ``_AggSpec``. + + The ``"*"`` key accepts only ``"size"``. Any other key is mapped to its + physical column name and validated, and every requested operation + becomes one spec whose output column is ``<physical>_<op>``. + """ + if not isinstance(aggregations, Mapping) or not aggregations: + raise ValueError("agg() requires a non-empty mapping") + specs: list[_AggSpec] = [] + for col_name, ops in aggregations.items(): + if isinstance(ops, str): + op_list = [ops] + else: + op_list = list(ops) + if not op_list: + raise ValueError(f"No aggregations specified for column {col_name!r}") + + if col_name == "*": + for op in op_list: + if op != "size": + raise ValueError("Only the 'size' aggregation is supported for '*' input") + specs.append(_AggSpec(None, "size", "size")) + continue + + physical = self.table._logical_to_physical_name(col_name) + self._validate_value_column(physical) + for op in op_list: + if op not in {"count", "sum", "mean", "min", "max"}: + raise ValueError(f"Unsupported aggregation {op!r}") + self._validate_agg_for_column(physical, op) + specs.append(_AggSpec(physical, op, f"{physical}_{op}")) + output_names = [s.output_col for s in specs] + if len(output_names) != len(set(output_names)): + raise ValueError("Aggregation output column names must be unique") + return specs + + def _validate_agg_for_column(self, name: str, op: str) -> None: + dtype = getattr(self.table._schema.columns_by_name[name].spec, "dtype", None) + if op in {"sum", "mean"} and dtype is not None and dtype.kind not in "biuf": + raise 
+ TypeError(f"Aggregation {op!r} is not supported for column {name!r} with dtype {dtype}") + if op in {"min", "max"} and dtype is not None and dtype.kind == "c": + raise TypeError(f"Aggregation {op!r} is not supported for complex column {name!r}") + + def _validate_value_column(self, name: str) -> None: + """Raise if physical column *name* cannot be used as an aggregation input. + + Rejects computed columns, names missing from ``table._cols``, + list/variable-length columns and dictionary columns. + """ + if name in self.table._computed_cols: + raise NotImplementedError("group_by() aggregations over computed columns are not supported yet") + if name not in self.table._cols: + raise KeyError(f"No column named {name!r}. Available: {self.table.col_names}") + col_info = self.table._schema.columns_by_name[name] + if self.table._is_list_column(col_info) or self.table._is_varlen_scalar_column(col_info): + raise TypeError(f"Cannot aggregate variable-length/list column {name!r} in Phase 1") + if self.table._is_dictionary_column(col_info): + raise TypeError(f"Cannot aggregate dictionary column {name!r} in Phase 1") + + def _execute(self, specs: list[_AggSpec], *, urlpath: str | None = None): + """Run *specs* with *urlpath* installed as ``self._result_urlpath``. + + The attribute is set only for the duration of the call and restored to + its previous value afterwards, even if execution raises. + """ + self._validate_output_names(specs) + old_result_urlpath = getattr(self, "_result_urlpath", None) + self._result_urlpath = urlpath + try: + return self._execute_with_result_target(specs) + finally: + self._result_urlpath = old_result_urlpath + + def _execute_with_result_target(self, specs: list[_AggSpec]): + # Try each specialised path in turn; a path returns None when it does + # not apply, and the first non-None result is returned as-is. + fast = self._try_execute_cython_dense_int_key(specs) + if fast is not None: + return fast + fast = self._try_execute_cython_two_int_key_hash(specs) + if fast is not None: + return fast + fast = self._try_execute_cython_i32_f64_sum(specs) + if fast is not None: + return fast + fast = self._try_execute_cython_float_integral_key_f64_sum(specs) + if fast is not None: + return fast + fast = self._try_execute_cython_float_hash(specs) + if fast is not None: + return fast + fast = self._try_execute_dense_single_int_key(specs) + if fast is not None: + return fast + + acc: dict[Any, dict[str, _AggState]] = {} + key_values: dict[Any, tuple[Any, ...]] = {} + + phys_len = 
len(self.table._valid_rows) + chunk_size = self._chunk_size() + value_cols = sorted({s.input_col for s in specs if s.input_col is not None}) + + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + + raw_keys = [self._read_key_chunk(name, start, stop) for name in self.keys] + live_mask = valid.copy() + if self.dropna: + for name, values in zip(self.keys, raw_keys, strict=True): + live_mask &= ~self._null_mask(name, values, is_key=True) + if not np.any(live_mask): + continue + + keys_live = [np.asarray(values)[live_mask] for values in raw_keys] + n_live = len(keys_live[0]) + if n_live == 0: + continue + + unique_keys, inverse = self._factorize_keys(keys_live) + value_chunks = { + name: np.asarray(self.table._cols[name][start:stop])[live_mask] for name in value_cols + } + + partials = self._compute_partials(specs, unique_keys, inverse, value_chunks) + display_keys = self._display_keys(unique_keys) + normalized_keys = self._normalized_keys(display_keys) + self._merge_partials(acc, key_values, normalized_keys, display_keys, partials, specs) + + rows = self._final_rows(acc, key_values, specs) + return self._build_result(rows, specs) + + def _try_execute_cython_two_int_key_hash(self, specs: list[_AggSpec]): # noqa: C901 + """Cython hash path for two integer/dictionary-code keys.""" + if len(self.keys) != 2: + return None + + key_arrays = [] + key_is_dict = [] + key_nulls = [] + skip_key_nulls = [] + for key_name in self.keys: + key_info = self.table._schema.columns_by_name[key_name] + if self.table._is_dictionary_column(key_info): + key_arrays.append(self.table._cols[key_name].codes) + key_is_dict.append(True) + key_nulls.append(int(key_info.spec.null_code)) + skip_key_nulls.append(self.dropna) + continue + key_dtype = getattr(key_info.spec, "dtype", None) + if key_dtype is None or np.dtype(key_dtype).kind not in "biu": + return 
None + null_value = getattr(key_info.spec, "null_value", None) + if null_value is not None and not self.dropna: + return None + key_arrays.append(self.table._cols[key_name]) + key_is_dict.append(False) + key_nulls.append(0 if null_value is None else int(null_value)) + skip_key_nulls.append(self.dropna and null_value is not None) + + value_cols = {s.input_col for s in specs if s.input_col is not None} + if len(value_cols) > 1: + return None + value_col = next(iter(value_cols), None) + if value_col is not None and any(s.op in {"sum", "mean", "min", "max"} for s in specs): + value_info = self.table._schema.columns_by_name[value_col] + value_dtype = getattr(value_info.spec, "dtype", None) + if value_dtype is None or np.dtype(value_dtype).kind != "f": + return None + null_value = getattr(value_info.spec, "null_value", None) + if null_value is not None and not (isinstance(null_value, float) and math.isnan(null_value)): + return None + + try: + from blosc2 import groupby_ext + except ImportError: + return None + kernel = getattr(groupby_ext, "groupby_hash_i64x2_f64", None) + if kernel is None: + return None + + acc: dict[Any, dict[str, _AggState]] = {} + key_values: dict[Any, tuple[Any, ...]] = {} + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + key_chunks = [np.asarray(arr[start:stop], dtype=np.int64) for arr in key_arrays] + live = valid.copy() + for key_chunk, skip_null, null_value in zip(key_chunks, skip_key_nulls, key_nulls, strict=True): + if skip_null: + live &= key_chunk != null_value + if not np.any(live): + continue + + if value_col is None: + values = np.empty(len(valid), dtype=np.float64) + values_valid = np.zeros(len(valid), dtype=bool) + has_values = False + else: + raw_values = np.asarray(self.table._cols[value_col][start:stop]) + values 
= np.ascontiguousarray(raw_values.astype(np.float64, copy=False)) + values_valid = np.ascontiguousarray(~self._null_mask(value_col, raw_values, is_key=False)) + has_values = True + + ( + out_k0, + out_k1, + row_counts, + value_counts, + sums, + mins, + maxs, + has_value, + ) = kernel( + np.ascontiguousarray(key_chunks[0]), + np.ascontiguousarray(key_chunks[1]), + values, + np.ascontiguousarray(live), + values_valid, + has_values, + ) + + for i, (code0, code1) in enumerate(zip(out_k0, out_k1, strict=True)): + display = [] + norm_parts = [] + for key_pos, code in enumerate((int(code0), int(code1))): + if key_is_dict[key_pos]: + value = self.table._cols[self.keys[key_pos]].decode(code) + else: + value = code + display.append(value) + norm_parts.append(_normalize_key_part(value)) + norm_key = tuple(norm_parts) + states = acc.setdefault(norm_key, {}) + key_values.setdefault(norm_key, tuple(display)) + for spec in specs: + state = states.setdefault(spec.output_col, _AggState(spec.op)) + if spec.op == "size": + state.value = (0 if state.value is None else state.value) + int(row_counts[i]) + elif spec.op == "count": + state.value = (0 if state.value is None else state.value) + int(value_counts[i]) + elif spec.op in {"sum", "mean"}: + if has_value[i]: + state.value = (0.0 if state.value is None else state.value) + float(sums[i]) + state.count += int(value_counts[i]) + elif spec.op == "min": + if has_value[i]: + value = float(mins[i]) + if state.count == 0 or value < state.value: + state.value = value + state.count += 1 + elif spec.op == "max" and has_value[i]: + value = float(maxs[i]) + if state.count == 0 or value > state.value: + state.value = value + state.count += 1 + + rows = self._final_rows(acc, key_values, specs) + return self._build_result(rows, specs) + + def _try_execute_cython_dense_int_key(self, specs: list[_AggSpec]): # noqa: C901 + """Cython fast path for one compact integer/dictionary key and dense aggregations.""" + if len(self.keys) != 1: + return None + 
key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + key_is_dict = self.table._is_dictionary_column(key_info) + if key_is_dict: + key_arr = self.table._cols[key_name].codes + key_dtype = np.dtype(np.int32) + skip_key_null = self.dropna + key_null = int(key_info.spec.null_code) + else: + key_arr = self.table._cols[key_name] + key_dtype = getattr(key_info.spec, "dtype", None) + if key_dtype is None: + return None + key_dtype = np.dtype(key_dtype) + if key_dtype.kind not in "biu": + return None + key_null_value = getattr(key_info.spec, "null_value", None) + skip_key_null = self.dropna and key_null_value is not None + key_null = 0 if key_null_value is None else int(key_null_value) + + try: + from blosc2 import groupby_ext + except ImportError: + return None + + descriptors = [] + for spec in specs: + desc: dict[str, Any] = {"spec": spec, "op": spec.op} + if spec.op == "size": + kernel = getattr(groupby_ext, "groupby_dense_int_size_checked", None) + if kernel is None: + return None + desc.update({"kernel": kernel, "state_kind": "counts"}) + descriptors.append(desc) + continue + + if spec.input_col is None: + return None + value_info = self.table._schema.columns_by_name[spec.input_col] + value_dtype = getattr(value_info.spec, "dtype", None) + if value_dtype is None: + return None + value_dtype = np.dtype(value_dtype) + null_value = getattr(value_info.spec, "null_value", None) + + if spec.op == "count": + kernel = getattr(groupby_ext, "groupby_dense_int_count_checked", None) + if kernel is None: + return None + desc.update({"kernel": kernel, "state_kind": "counts", "value_dtype": value_dtype}) + elif spec.op in {"sum", "mean", "min", "max"}: + if value_dtype.kind == "f": + skip_nan = isinstance(null_value, float) and math.isnan(null_value) + if null_value is not None and not skip_nan: + return None + suffix = "sum" if spec.op == "sum" else spec.op + kernel = getattr(groupby_ext, f"groupby_dense_int_f64_{suffix}_checked", None) + if kernel is 
None: + return None + desc.update( + { + "kernel": kernel, + "value_dtype": np.float64, + "value_kind": "f64", + "skip_nan": skip_nan, + } + ) + elif value_dtype.kind in "biu": + if null_value is not None: + return None + if spec.op == "mean": + kernel = getattr(groupby_ext, "groupby_dense_int_f64_mean_checked", None) + if kernel is None: + return None + desc.update( + { + "kernel": kernel, + "value_dtype": np.float64, + "value_kind": "f64", + "skip_nan": False, + } + ) + else: + kernel = getattr(groupby_ext, f"groupby_dense_int_i64_{spec.op}_checked", None) + if kernel is None: + return None + desc.update( + { + "kernel": kernel, + "value_dtype": np.int64, + "value_kind": "i64", + "skip_nan": False, + } + ) + else: + return None + if spec.op in {"sum", "min", "max"}: + desc["state_kind"] = "value_present" if spec.op == "sum" else "extreme" + elif spec.op == "mean": + desc["state_kind"] = "mean" + else: + return None + descriptors.append(desc) + + compact_limit = 10_000_000 + keys_present = np.zeros(0, dtype=bool) + states: dict[str, Any] = {} + for desc in descriptors: + spec = desc["spec"] + if desc["state_kind"] == "counts": + states[spec.output_col] = np.zeros(0, dtype=np.int64) + elif desc["state_kind"] == "mean": + states[spec.output_col] = (np.zeros(0, dtype=np.float64), np.zeros(0, dtype=np.int64)) + elif desc["state_kind"] == "value_present" or desc["state_kind"] == "extreme": + dtype = np.float64 if desc["value_kind"] == "f64" else np.int64 + states[spec.output_col] = (np.zeros(0, dtype=dtype), np.zeros(0, dtype=bool)) + + def ensure_size(size: int) -> bool: + nonlocal keys_present, states + if size > compact_limit: + return False + if size <= len(keys_present): + return True + old = len(keys_present) + keys_present = np.pad(keys_present, (0, size - old), constant_values=False) + for desc in descriptors: + spec = desc["spec"] + state = states[spec.output_col] + if desc["state_kind"] == "counts": + states[spec.output_col] = np.pad(state, (0, size - old), 
constant_values=0) + else: + first, second = state + states[spec.output_col] = ( + np.pad(first, (0, size - old), constant_values=0), + np.pad( + second, (0, size - old), constant_values=False if second.dtype == np.bool_ else 0 + ), + ) + return True + + def call_checked(kernel, *args) -> bool: + return int(kernel(*args)) == 0 + + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + keys = np.asarray(key_arr[start:stop], dtype=np.int8 if key_dtype.kind == "b" else key_dtype) + keys = np.ascontiguousarray(keys) + valid = np.ascontiguousarray(valid) + live = valid.copy() + if skip_key_null: + live &= keys != key_null + if not np.any(live): + continue + live_keys = keys[live] + if np.min(live_keys) < 0: + return None + max_key = int(np.max(live_keys)) + if not ensure_size(max_key + 1): + return None + + for desc in descriptors: + spec = desc["spec"] + state = states[spec.output_col] + if spec.op == "size": + if not call_checked( + desc["kernel"], keys, valid, state, keys_present, skip_key_null, key_null + ): + return None + elif spec.op == "count": + values = np.asarray(self.table._cols[spec.input_col][start:stop]) + values_valid = np.ascontiguousarray( + ~self._null_mask(spec.input_col, values, is_key=False) + ) + if not call_checked( + desc["kernel"], + keys, + valid, + values_valid, + state, + keys_present, + skip_key_null, + key_null, + ): + return None + elif spec.op == "sum": + values = np.asarray( + self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"] + ) + values = np.ascontiguousarray(values) + sums, value_present = state + args = ( + keys, + values, + valid, + sums, + value_present, + keys_present, + skip_key_null, + key_null, + ) + if desc["value_kind"] == "f64": + args = (*args, desc["skip_nan"]) + if not 
call_checked(desc["kernel"], *args): + return None + elif spec.op == "mean": + values = np.asarray( + self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"] + ) + values = np.ascontiguousarray(values) + sums, counts = state + if not call_checked( + desc["kernel"], + keys, + values, + valid, + sums, + counts, + keys_present, + skip_key_null, + key_null, + desc["skip_nan"], + ): + return None + elif spec.op in {"min", "max"}: + values = np.asarray( + self.table._cols[spec.input_col][start:stop], dtype=desc["value_dtype"] + ) + values = np.ascontiguousarray(values) + extremes, has_value = state + args = ( + keys, + values, + valid, + extremes, + has_value, + keys_present, + skip_key_null, + key_null, + ) + if desc["value_kind"] == "f64": + args = (*args, desc["skip_nan"]) + if not call_checked(desc["kernel"], *args): + return None + + group_codes = np.nonzero(keys_present)[0] + if self.sort and key_is_dict: + group_codes = np.array( + sorted( + group_codes, + key=lambda code: _sortable_key_part(self.table._cols[key_name].decode(int(code))), + ), + dtype=group_codes.dtype, + ) + + rows = [] + for code in group_codes: + key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else _python_scalar(code) + row = {key_name: key_value} + for desc in descriptors: + spec = desc["spec"] + state = states[spec.output_col] + if spec.op in {"size", "count"}: + row[spec.output_col] = int(state[code]) + elif spec.op == "sum": + sums, value_present = state + row[spec.output_col] = ( + _python_scalar(sums[code]) + if value_present[code] + else _null_output_value(self._result_spec_for_agg(spec)) + ) + elif spec.op == "mean": + sums, counts = state + row[spec.output_col] = ( + math.nan if counts[code] == 0 else float(sums[code]) / int(counts[code]) + ) + elif spec.op in {"min", "max"}: + extremes, has_value = state + row[spec.output_col] = ( + _python_scalar(extremes[code]) + if has_value[code] + else _null_output_value(self._result_spec_for_agg(spec)) + ) 
+ rows.append(row) + return self._build_result(rows, specs) + + def _try_execute_cython_i32_f64_sum(self, specs: list[_AggSpec]): # noqa: C901 + """Cython fast path for one int32 key and one non-null float64 sum.""" + if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort: + return None + spec = specs[0] + if spec.input_col is None: + return None + key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + value_info = self.table._schema.columns_by_name[spec.input_col] + if self.table._is_dictionary_column(key_info): + key_arr = self.table._cols[key_name].codes + key_is_dict = True + key_null = int(key_info.spec.null_code) + skip_key_null = self.dropna + else: + key_arr = self.table._cols[key_name] + key_is_dict = False + key_dtype = getattr(key_info.spec, "dtype", None) + if key_dtype != np.dtype(np.int32): + return None + key_null_value = getattr(key_info.spec, "null_value", None) + skip_key_null = self.dropna and key_null_value is not None + key_null = 0 if key_null_value is None else int(key_null_value) + value_dtype = getattr(value_info.spec, "dtype", None) + if value_dtype != np.dtype(np.float64) or getattr(value_info.spec, "null_value", None) is not None: + return None + try: + from blosc2 import groupby_ext + except ImportError: + return None + kernel = getattr(groupby_ext, "groupby_dense_i32_f64_sum_checked", None) + if kernel is None: + return None + + compact_limit = 10_000_000 + sums = np.zeros(0, dtype=np.float64) + present = np.zeros(0, dtype=bool) + + def ensure_size(size: int) -> bool: + nonlocal sums, present + if size > compact_limit: + return False + if size <= len(sums): + return True + old = len(sums) + sums = np.pad(sums, (0, size - old), constant_values=0) + present = np.pad(present, (0, size - old), constant_values=False) + return True + + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, 
phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + keys = np.asarray(key_arr[start:stop], dtype=np.int32) + values = np.asarray(self.table._cols[spec.input_col][start:stop], dtype=np.float64) + status = int(kernel(keys, values, valid, sums, present, skip_key_null, key_null, False)) + if status == -1: + return None + if status > 0: + if not ensure_size(status): + return None + status = int(kernel(keys, values, valid, sums, present, skip_key_null, key_null, False)) + if status != 0: + return None + + rows = [] + for code in np.nonzero(present)[0]: + key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else int(code) + rows.append({key_name: key_value, spec.output_col: float(sums[code])}) + return self._build_result(rows, specs) + + def _try_execute_cython_float_hash(self, specs: list[_AggSpec]): # noqa: C901 + """Cython hash path for one arbitrary float key. + + This covers float32/float64 keys that are not suitable for dense + integral-key indexing. It currently supports float value columns for + value reductions and falls back for unsupported mixed/multi-column cases. + """ + if len(self.keys) != 1: + return None + key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + if self.table._is_dictionary_column(key_info): + return None + key_dtype = getattr(key_info.spec, "dtype", None) + if key_dtype not in {np.dtype(np.float32), np.dtype(np.float64)}: + return None + + value_cols = {s.input_col for s in specs if s.input_col is not None} + if len(value_cols) > 1: + return None + value_col = next(iter(value_cols), None) + value_dtype = None + nullable_nan_value = False + if value_col is not None: + value_info = self.table._schema.columns_by_name[value_col] + value_dtype = getattr(value_info.spec, "dtype", None) + # Count can operate on any fixed-width value column via values_valid, + # but other reductions in this hash kernel normalize values to f64. 
+ if any(s.op in {"sum", "mean", "min", "max"} for s in specs): + if value_dtype is None or np.dtype(value_dtype).kind != "f": + return None + null_value = getattr(value_info.spec, "null_value", None) + nullable_nan_value = isinstance(null_value, float) and math.isnan(null_value) + if null_value is not None and not nullable_nan_value: + return None + + try: + from blosc2 import groupby_ext + except ImportError: + return None + kernel = getattr(groupby_ext, "groupby_hash_f64_f64", None) + if kernel is None: + return None + + acc: dict[Any, dict[str, _AggState]] = {} + key_values: dict[Any, tuple[Any, ...]] = {} + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + keys = np.ascontiguousarray(np.asarray(self.table._cols[key_name][start:stop], dtype=np.float64)) + if value_col is None: + values = np.empty(len(keys), dtype=np.float64) + values_valid = np.zeros(len(keys), dtype=bool) + has_values = False + else: + raw_values = np.asarray(self.table._cols[value_col][start:stop]) + if any(s.op in {"sum", "mean", "min", "max"} for s in specs): + values = np.ascontiguousarray(raw_values.astype(np.float64, copy=False)) + else: + values = np.empty(len(keys), dtype=np.float64) + values_valid = np.ascontiguousarray(~self._null_mask(value_col, raw_values, is_key=False)) + has_values = True + + ( + chunk_keys, + row_counts, + value_counts, + sums, + mins, + maxs, + has_value, + ) = kernel(keys, values, np.ascontiguousarray(valid), values_valid, has_values, self.dropna) + + for i, key in enumerate(chunk_keys): + key_scalar = np.asarray(key, dtype=key_dtype).item() + norm_key = _normalize_key_part(float(key_scalar)) + states = acc.setdefault(norm_key, {}) + key_values.setdefault(norm_key, (key_scalar,)) + for spec in specs: + state = 
states.setdefault(spec.output_col, _AggState(spec.op)) + if spec.op == "size": + state.value = (0 if state.value is None else state.value) + int(row_counts[i]) + elif spec.op == "count": + state.value = (0 if state.value is None else state.value) + int(value_counts[i]) + elif spec.op == "sum" or spec.op == "mean": + if has_value[i]: + state.value = (0.0 if state.value is None else state.value) + float(sums[i]) + state.count += int(value_counts[i]) + elif spec.op == "min": + if has_value[i]: + value = float(mins[i]) + if state.count == 0 or value < state.value: + state.value = value + state.count += 1 + elif spec.op == "max" and has_value[i]: + value = float(maxs[i]) + if state.count == 0 or value > state.value: + state.value = value + state.count += 1 + + # Hash-table iteration order is intentionally not exposed. Emit float + # hash groups in key order for deterministic results and compatibility + # with the previous NumPy fallback behavior for these cases. + ordered_keys = list(acc) + ordered_keys.sort( + key=lambda k: tuple( + (1, "") if isinstance(v, float) and math.isnan(v) else (0, v) for v in key_values[k] + ) + ) + rows = [] + for norm_key in ordered_keys: + row = dict(zip(self.keys, key_values[norm_key], strict=True)) + states = acc[norm_key] + for spec in specs: + state = states[spec.output_col] + if spec.op == "mean": + row[spec.output_col] = math.nan if state.count == 0 else state.value / state.count + elif spec.op in {"sum", "min", "max"} and state.count == 0: + row[spec.output_col] = _null_output_value(self._result_spec_for_agg(spec)) + else: + row[spec.output_col] = 0 if state.value is None else state.value + rows.append(row) + return self._build_result(rows, specs) + + def _try_execute_cython_float_integral_key_f64_sum(self, specs: list[_AggSpec]): # noqa: C901 + """Cython fast path for integral float32/float64 keys and one non-null float64 sum.""" + if len(self.keys) != 1 or len(specs) != 1 or specs[0].op != "sum" or self.sort: + return None + spec 
= specs[0] + if spec.input_col is None: + return None + key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + value_info = self.table._schema.columns_by_name[spec.input_col] + key_dtype = getattr(key_info.spec, "dtype", None) + value_dtype = getattr(value_info.spec, "dtype", None) + if key_dtype not in {np.dtype(np.float32), np.dtype(np.float64)} or value_dtype != np.dtype( + np.float64 + ): + return None + if getattr(value_info.spec, "null_value", None) is not None: + return None + # The fast path can skip NaNs. If dropna=False and NaNs are present, + # the Cython kernel reports unsupported and we fall back to generic + # grouping, which can materialize a NaN group. + skip_key_nan = self.dropna + try: + from blosc2 import groupby_ext + except ImportError: + return None + kernel_name = ( + "groupby_dense_f32_integral_key_f64_sum_checked" + if key_dtype == np.dtype(np.float32) + else "groupby_dense_f64_integral_key_f64_sum_checked" + ) + kernel = getattr(groupby_ext, kernel_name, None) + if kernel is None: + return None + + compact_limit = 10_000_000 + sums = np.zeros(0, dtype=np.float64) + present = np.zeros(0, dtype=bool) + + def ensure_size(size: int) -> bool: + nonlocal sums, present + if size > compact_limit: + return False + if size <= len(sums): + return True + old = len(sums) + sums = np.pad(sums, (0, size - old), constant_values=0) + present = np.pad(present, (0, size - old), constant_values=False) + return True + + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + keys = np.asarray(self.table._cols[key_name][start:stop], dtype=key_dtype) + values = np.asarray(self.table._cols[spec.input_col][start:stop], dtype=np.float64) + status = int(kernel(keys, values, valid, sums, present, skip_key_nan, False)) + if status 
== -1: + return None + if status > 0: + if not ensure_size(status): + return None + status = int(kernel(keys, values, valid, sums, present, skip_key_nan, False)) + if status != 0: + return None + + rows = [ + {key_name: float(code), spec.output_col: float(sums[code])} for code in np.nonzero(present)[0] + ] + return self._build_result(rows, specs) + + def _try_execute_dense_single_int_key(self, specs: list[_AggSpec]): # noqa: C901 + """Fast path for one dense integer/dictionary-code key. + + This avoids per-chunk ``np.unique`` and Python dictionary merging. It is + intentionally conservative: keys must be non-negative and the observed + key range must stay reasonably compact. + """ + if len(self.keys) != 1: + return None + key_name = self.keys[0] + key_info = self.table._schema.columns_by_name[key_name] + key_is_dict = self.table._is_dictionary_column(key_info) + key_dtype = np.dtype(np.int32) if key_is_dict else getattr(key_info.spec, "dtype", None) + if key_dtype is None or key_dtype.kind not in "biu": + return None + if any(spec.op in {"min", "max"} and spec.input_col is not None for spec in specs): + for spec in specs: + if spec.op in {"min", "max"} and spec.input_col is not None: + dtype = getattr(self.table._schema.columns_by_name[spec.input_col].spec, "dtype", None) + if dtype is None or np.dtype(dtype).kind not in "biufmM": + return None + + compact_limit = 10_000_000 + present = np.zeros(0, dtype=bool) + states: dict[str, Any] = {} + for spec in specs: + if spec.op in {"size", "count"}: + states[spec.output_col] = np.zeros(0, dtype=np.int64) + elif spec.op == "sum": + out_dtype = np.int64 + if spec.input_col is not None: + dtype = np.dtype(self.table._schema.columns_by_name[spec.input_col].spec.dtype) + out_dtype = np.float64 if dtype.kind == "f" else np.int64 + states[spec.output_col] = np.zeros(0, dtype=out_dtype) + elif spec.op == "mean": + states[spec.output_col] = (np.zeros(0, dtype=np.float64), np.zeros(0, dtype=np.int64)) + elif spec.op in {"min", 
"max"}: + assert spec.input_col is not None + dtype = np.dtype(self.table._schema.columns_by_name[spec.input_col].spec.dtype) + identity = _max_identity(dtype) if spec.op == "min" else _min_identity(dtype) + states[spec.output_col] = (np.full(0, identity, dtype=dtype), np.zeros(0, dtype=bool)) + + def ensure_size(size: int) -> bool: + nonlocal present, states + if size > compact_limit: + return False + if size <= len(present): + return True + old = len(present) + present = np.pad(present, (0, size - old), constant_values=False) + for spec in specs: + state = states[spec.output_col] + if spec.op in {"size", "count", "sum"}: + states[spec.output_col] = np.pad(state, (0, size - old), constant_values=0) + elif spec.op == "mean": + sums, counts = state + states[spec.output_col] = ( + np.pad(sums, (0, size - old), constant_values=0), + np.pad(counts, (0, size - old), constant_values=0), + ) + elif spec.op in {"min", "max"}: + values, has = state + dtype = values.dtype + identity = _max_identity(dtype) if spec.op == "min" else _min_identity(dtype) + states[spec.output_col] = ( + np.pad(values, (0, size - old), constant_values=identity), + np.pad(has, (0, size - old), constant_values=False), + ) + return True + + phys_len = len(self.table._valid_rows) + chunk_size = self._chunk_size() + value_cols = sorted({s.input_col for s in specs if s.input_col is not None}) + for start in range(0, phys_len, chunk_size): + stop = min(start + chunk_size, phys_len) + valid = np.asarray(self.table._valid_rows[start:stop], dtype=bool) + if not np.any(valid): + continue + raw_keys = self._read_key_chunk(key_name, start, stop) + live_mask = valid.copy() + if self.dropna: + live_mask &= ~self._null_mask(key_name, raw_keys, is_key=True) + if not np.any(live_mask): + continue + keys = np.asarray(raw_keys[live_mask]) + if keys.dtype.kind == "b": + keys = keys.astype(np.int8, copy=False) + if len(keys) == 0: + continue + min_key = int(np.min(keys)) + if min_key < 0: + return None + max_key = 
int(np.max(keys)) + if not ensure_size(max_key + 1): + return None + present[keys] = True + value_chunks = { + name: np.asarray(self.table._cols[name][start:stop])[live_mask] for name in value_cols + } + + for spec in specs: + if spec.op == "size": + states[spec.output_col] += np.bincount(keys, minlength=len(present)).astype(np.int64) + continue + assert spec.input_col is not None + values = value_chunks[spec.input_col] + non_null = ~self._null_mask(spec.input_col, values, is_key=False) + if spec.op == "count": + states[spec.output_col] += np.bincount( + keys, weights=non_null.astype(np.int64), minlength=len(present) + ).astype(np.int64) + elif spec.op == "sum": + state = states[spec.output_col] + if values.dtype.kind in "biu": + np.add.at(state, keys[non_null], values[non_null].astype(np.int64, copy=False)) + else: + state += np.bincount( + keys, weights=np.where(non_null, values, 0), minlength=len(present) + ).astype(state.dtype, copy=False) + elif spec.op == "mean": + sums, counts = states[spec.output_col] + sums += np.bincount(keys, weights=np.where(non_null, values, 0), minlength=len(present)) + counts += np.bincount( + keys, weights=non_null.astype(np.int64), minlength=len(present) + ).astype(np.int64) + elif spec.op in {"min", "max"}: + values_state, has_state = states[spec.output_col] + if spec.op == "min": + np.minimum.at(values_state, keys[non_null], values[non_null]) + else: + np.maximum.at(values_state, keys[non_null], values[non_null]) + has_state[keys[non_null]] = True + + group_codes = np.nonzero(present)[0] + rows = [] + for code in group_codes: + key_value = self.table._cols[key_name].decode(int(code)) if key_is_dict else _python_scalar(code) + row = {key_name: key_value} + for spec in specs: + state = states[spec.output_col] + if spec.op == "mean": + sums, counts = state + row[spec.output_col] = ( + math.nan if counts[code] == 0 else float(sums[code]) / int(counts[code]) + ) + elif spec.op in {"min", "max"}: + values_state, has_state = state + 
row[spec.output_col] = ( + _python_scalar(values_state[code]) + if has_state[code] + else _null_output_value(self._result_spec_for_agg(spec)) + ) + else: + row[spec.output_col] = _python_scalar(state[code]) + rows.append(row) + return self._build_result(rows, specs) + + def _chunk_size(self) -> int: + if self.chunk_size is not None: + if self.chunk_size <= 0: + raise ValueError("chunk_size must be positive") + return int(self.chunk_size) + chunks = getattr(self.table._valid_rows, "chunks", None) + if chunks: + return max(int(chunks[0]), 1) + return 65536 + + def _read_key_chunk(self, name: str, start: int, stop: int) -> np.ndarray: + col_info = self.table._schema.columns_by_name[name] + if self.table._is_dictionary_column(col_info): + return np.asarray(self.table._cols[name].codes[start:stop], dtype=np.int32) + return np.asarray(self.table._cols[name][start:stop]) + + def _factorize_keys( + self, keys_live: list[np.ndarray] + ) -> tuple[np.ndarray | list[np.ndarray], np.ndarray]: + if len(keys_live) == 1: + unique, inverse = np.unique(keys_live[0], return_inverse=True) + return unique, inverse + + dtype = [(f"k{i}", arr.dtype) for i, arr in enumerate(keys_live)] + packed = np.empty(len(keys_live[0]), dtype=dtype) + for i, arr in enumerate(keys_live): + packed[f"k{i}"] = arr + unique, inverse = np.unique(packed, return_inverse=True) + return unique, inverse + + def _display_keys(self, unique_keys: np.ndarray | list[np.ndarray]) -> list[tuple[Any, ...]]: + if len(self.keys) == 1: + name = self.keys[0] + col_info = self.table._schema.columns_by_name[name] + values = [] + for value in np.asarray(unique_keys): + if self.table._is_dictionary_column(col_info): + values.append((self.table._cols[name].decode(int(value)),)) + else: + values.append((_python_scalar(value),)) + return values + + result = [] + assert isinstance(unique_keys, np.ndarray) + for row in unique_keys: + vals = [] + for i, name in enumerate(self.keys): + value = row[f"k{i}"] + col_info = 
self.table._schema.columns_by_name[name] + if self.table._is_dictionary_column(col_info): + vals.append(self.table._cols[name].decode(int(value))) + else: + vals.append(_python_scalar(value)) + result.append(tuple(vals)) + return result + + def _normalized_keys(self, display_keys: list[tuple[Any, ...]]) -> list[Any]: + normalized = [] + for key in display_keys: + norm = tuple(_normalize_key_part(v) for v in key) + normalized.append(norm[0] if len(norm) == 1 else norm) + return normalized + + def _compute_partials( + self, + specs: list[_AggSpec], + unique_keys: np.ndarray | list[np.ndarray], + inverse: np.ndarray, + value_chunks: dict[str, np.ndarray], + ) -> dict[str, Any]: + n_groups = len(unique_keys) + partials: dict[str, Any] = {} + for spec in specs: + if spec.op == "size": + partials[spec.output_col] = np.bincount(inverse, minlength=n_groups).astype(np.int64) + continue + + assert spec.input_col is not None + values = value_chunks[spec.input_col] + non_null = ~self._null_mask(spec.input_col, values, is_key=False) + + if spec.op == "count": + partials[spec.output_col] = np.bincount( + inverse, weights=non_null.astype(np.int64), minlength=n_groups + ).astype(np.int64) + elif spec.op in {"sum", "mean"}: + counts = np.bincount(inverse, weights=non_null.astype(np.int64), minlength=n_groups).astype( + np.int64 + ) + if spec.op == "sum" and values.dtype.kind in "biu": + sums = np.zeros(n_groups, dtype=np.int64) + np.add.at(sums, inverse[non_null], values[non_null].astype(np.int64, copy=False)) + else: + weights = np.where(non_null, values, 0) + sums = np.bincount(inverse, weights=weights, minlength=n_groups) + partials[spec.output_col] = (sums, counts) + elif spec.op in {"min", "max"}: + partials[spec.output_col] = self._minmax_partials( + spec.op, inverse, values, non_null, n_groups + ) + return partials + + def _minmax_partials( + self, op: AggName, inverse: np.ndarray, values: np.ndarray, non_null: np.ndarray, n_groups: int + ) -> tuple[np.ndarray, np.ndarray]: 
+ if values.dtype.kind in "biufcmM": + if op == "min": + identity = _max_identity(values.dtype) + out = np.full(n_groups, identity, dtype=values.dtype) + np.minimum.at(out, inverse[non_null], values[non_null]) + else: + identity = _min_identity(values.dtype) + out = np.full(n_groups, identity, dtype=values.dtype) + np.maximum.at(out, inverse[non_null], values[non_null]) + else: + out = np.empty(n_groups, dtype=values.dtype) + has = np.zeros(n_groups, dtype=bool) + for group, value, ok in zip(inverse, values, non_null, strict=True): + if not ok: + continue + if not has[group] or (value < out[group] if op == "min" else value > out[group]): + out[group] = value + has[group] = True + return out, has + has_value = np.bincount(inverse, weights=non_null.astype(np.int64), minlength=n_groups) > 0 + return out, has_value + + def _merge_partials( + self, + acc: dict[Any, dict[str, _AggState]], + key_values: dict[Any, tuple[Any, ...]], + normalized_keys: list[Any], + display_keys: list[tuple[Any, ...]], + partials: dict[str, Any], + specs: list[_AggSpec], + ) -> None: + for i, norm_key in enumerate(normalized_keys): + states = acc.setdefault(norm_key, {}) + key_values.setdefault(norm_key, display_keys[i]) + for spec in specs: + state = states.setdefault(spec.output_col, _AggState(spec.op)) + partial = partials[spec.output_col] + if spec.op in {"size", "count"}: + state.value = (0 if state.value is None else state.value) + int(partial[i]) + elif spec.op == "sum": + sums, counts = partial + if counts[i] > 0: + state.value = (0 if state.value is None else state.value) + _python_scalar(sums[i]) + state.count += int(counts[i]) + elif spec.op == "mean": + sums, counts = partial + if counts[i] > 0: + state.value = (0.0 if state.value is None else state.value) + float(sums[i]) + state.count += int(counts[i]) + elif spec.op in {"min", "max"}: + values, has_value = partial + if has_value[i]: + value = _python_scalar(values[i]) + if ( + state.count == 0 + or (spec.op == "min" and value < 
state.value) + or (spec.op == "max" and value > state.value) + ): + state.value = value + state.count += 1 + + def _final_rows( + self, + acc: dict[Any, dict[str, _AggState]], + key_values: dict[Any, tuple[Any, ...]], + specs: list[_AggSpec], + ) -> list[dict[str, Any]]: + keys = list(acc) + if self.sort: + keys.sort(key=lambda k: tuple(_sortable_key_part(v) for v in key_values[k])) + + rows = [] + for norm_key in keys: + row = dict(zip(self.keys, key_values[norm_key], strict=True)) + states = acc[norm_key] + for spec in specs: + state = states[spec.output_col] + if spec.op == "mean": + row[spec.output_col] = math.nan if state.count == 0 else state.value / state.count + elif spec.op in {"sum", "min", "max"} and state.count == 0: + row[spec.output_col] = _null_output_value(self._result_spec_for_agg(spec)) + else: + row[spec.output_col] = 0 if state.value is None else state.value + rows.append(row) + return rows + + def _build_result(self, rows: list[dict[str, Any]], specs: list[_AggSpec]): + from blosc2.ctable import CTable + + columns = self.keys + [spec.output_col for spec in specs] + schema_specs = {name: self._result_spec_for_key(name) for name in self.keys} + for spec in specs: + schema_specs[spec.output_col] = self._result_spec_for_agg(spec) + + fields = [] + for name in columns: + fields.append((name, _python_type_for_spec(schema_specs[name]), b2_field(schema_specs[name]))) + row_type = dataclasses.make_dataclass("CTableGroupByRow", fields) + data = {name: [row[name] for row in rows] for name in columns} + urlpath = getattr(self, "_result_urlpath", None) + kwargs = {"urlpath": str(urlpath), "mode": "w"} if urlpath is not None else {} + return CTable(row_type, new_data=data, expected_size=max(len(rows), 1), validate=False, **kwargs) + + def _validate_output_names(self, specs: list[_AggSpec]) -> None: + names = self.keys + [s.output_col for s in specs] + bad = [name for name in names if not _IDENTIFIER_RE.match(name)] + if bad: + raise NotImplementedError( + 
"Phase-1 group_by() result columns must be valid Python identifiers; " + f"unsupported names: {bad!r}" + ) + if len(names) != len(set(names)): + raise ValueError("Group-by result column names would not be unique") + + def _result_spec_for_key(self, name: str) -> SchemaSpec: + return copy.deepcopy(self.table._schema.columns_by_name[name].spec) + + def _result_spec_for_agg(self, spec: _AggSpec) -> SchemaSpec: + if spec.op in {"size", "count"}: + return int64() + if spec.op == "mean": + return float64() + assert spec.input_col is not None + input_spec = self.table._schema.columns_by_name[spec.input_col].spec + dtype = getattr(input_spec, "dtype", None) + if spec.op == "sum": + if dtype is not None and dtype.kind in "iu": + return int64() + if dtype is not None and dtype.kind == "b": + return int64() + if dtype is not None and dtype.kind == "f": + return float64() + return copy.deepcopy(input_spec) + + def _null_mask(self, name: str, values: np.ndarray, *, is_key: bool) -> np.ndarray: + col_info = self.table._schema.columns_by_name[name] + spec = col_info.spec + if isinstance(spec, DictionarySpec): + mask = values == np.int32(spec.null_code) + return mask if is_key or getattr(spec, "nullable", False) else np.zeros(len(values), dtype=bool) + null_value = getattr(spec, "null_value", None) + mask = np.zeros(len(values), dtype=bool) + # For keys, treat all NaNs as missing so dropna behaves predictably. + # For values, only nullable NaN sentinels are skipped. 
+ if values.dtype.kind == "f" and ( + is_key or (isinstance(null_value, float) and math.isnan(null_value)) + ): + mask |= np.isnan(values) + if null_value is not None and not (isinstance(null_value, float) and math.isnan(null_value)): + mask |= values == null_value + return mask + + +def _normalize_key_part(value: Any) -> Any: + if isinstance(value, float) and math.isnan(value): + return _NAN_KEY + return value + + +def _sortable_key_part(value: Any) -> tuple[int, Any]: + if value is None: + return (0, "") + if isinstance(value, float) and math.isnan(value): + return (0, "") + return (1, value) + + +def _python_scalar(value: Any) -> Any: + if isinstance(value, np.generic): + return value.item() + return value + + +def _python_type_for_spec(spec: SchemaSpec): + if isinstance(spec, DictionarySpec): + return str + if isinstance(spec, b2_bool): + return bool + dtype = getattr(spec, "dtype", None) + if dtype is not None: + if dtype.kind in "iu": + return int + if dtype.kind == "f": + return float + if dtype.kind == "b": + return bool + if dtype.kind in "US": + return str if dtype.kind == "U" else bytes + return getattr(spec, "python_type", object) + + +def _max_identity(dtype: np.dtype): + dtype = np.dtype(dtype) + if dtype.kind in "iu": + return np.iinfo(dtype).max + if dtype.kind == "f": + return np.inf + if dtype.kind in "mM": + return np.iinfo(np.int64).max + return None + + +def _min_identity(dtype: np.dtype): + dtype = np.dtype(dtype) + if dtype.kind in "iu": + return np.iinfo(dtype).min + if dtype.kind == "f": + return -np.inf + if dtype.kind in "mM": + return np.iinfo(np.int64).min + return None + + +def _null_output_value(spec: SchemaSpec): + dtype = getattr(spec, "dtype", None) + null_value = getattr(spec, "null_value", None) + if null_value is not None: + return null_value + if dtype is not None and dtype.kind == "f": + return math.nan + if dtype is not None and dtype.kind in "iu": + return 0 + if dtype is not None and dtype.kind == "b": + return False + if 
dtype is not None and dtype.kind == "U": + return "" + if dtype is not None and dtype.kind == "S": + return b"" + return None + + +# ---------------------------------------------------------------------- +# Public array-oriented grouped reductions +# ---------------------------------------------------------------------- + + +def group_reduce(keys, values=None, op: AggName = "size", *, sort: bool = False, dropna: bool = True): + """Group *keys* and reduce *values* with *op*. + + This is a lower-level, NumPy-style grouped reduction primitive. It exposes + Blosc2's optimized group-reduce kernels for plain array-like inputs without + requiring a :class:`blosc2.CTable`. + + Parameters + ---------- + keys : array-like + One-dimensional grouping keys. + values : array-like, optional + One-dimensional values to reduce. Required for ``"count"``, ``"sum"``, + ``"mean"``, ``"min"`` and ``"max"``. Ignored for ``"size"``. + op : {"size", "count", "sum", "mean", "min", "max"}, default: "size" + Reduction operation. ``"size"`` counts rows per group, while + ``"count"`` counts non-NaN values per group. + sort : bool, default: False + If true, sort output groups by key. With ``sort=False`` output order is + implementation dependent. + dropna : bool, default: True + If true, skip NaN float keys. If false, all NaN keys form one group. + + Returns + ------- + groups, result : numpy.ndarray, numpy.ndarray + Group keys and reduced values. 
+ + Examples + -------- + >>> import numpy as np + >>> import blosc2 + >>> keys = np.array([1, 2, 1, 2, 1]) + >>> values = np.array([10., 20., 30., 40., 50.]) + >>> groups, sums = blosc2.group_reduce(keys, values, op="sum", sort=True) + >>> groups + array([1, 2]) + >>> sums + array([90., 60.]) + """ + if op not in {"size", "count", "sum", "mean", "min", "max"}: + raise ValueError(f"unsupported group_reduce operation {op!r}") + + keys_arr = np.asarray(keys) + if keys_arr.ndim != 1: + raise ValueError("keys must be a 1-D array") + + if op == "size": + values_arr = None + else: + if values is None: + raise ValueError(f"values are required for group_reduce op {op!r}") + values_arr = np.asarray(values) + if values_arr.ndim != 1: + raise ValueError("values must be a 1-D array") + if len(values_arr) != len(keys_arr): + raise ValueError("keys and values must have the same length") + + if len(keys_arr) == 0: + return keys_arr.copy(), np.empty(0, dtype=_result_dtype(values_arr, op)) + + fast = _try_dense_integer(keys_arr, values_arr, op, sort=sort) + if fast is not None: + return fast + + fast = _try_float_hash(keys_arr, values_arr, op, sort=sort, dropna=dropna) + if fast is not None: + return fast + + return _group_reduce_numpy(keys_arr, values_arr, op, sort=sort, dropna=dropna) + + +def _try_dense_integer(keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool): # noqa: C901 + key_dtype = np.dtype(keys.dtype) + if key_dtype.kind == "b": + keys = keys.astype(np.int8, copy=False) + elif key_dtype.kind not in "iu": + return None + keys = np.ascontiguousarray(keys) + if len(keys) == 0: + return None + if np.min(keys) < 0: + return None + max_key = int(np.max(keys)) + if max_key + 1 > 10_000_000: + return None + + try: + from blosc2 import groupby_ext + except ImportError: + return None + + valid = np.ones(len(keys), dtype=bool) + keys_present = np.zeros(max_key + 1, dtype=bool) + + if op == "size": + counts = np.zeros(max_key + 1, dtype=np.int64) + 
groupby_ext.groupby_dense_int_size_checked(keys, valid, counts, keys_present, False, 0) + groups = np.nonzero(keys_present)[0].astype(key_dtype if key_dtype.kind != "b" else np.bool_) + result = counts[np.nonzero(keys_present)[0]] + return _maybe_sort(groups, result, sort) + + assert values is not None + value_dtype = np.dtype(values.dtype) + if op == "count": + counts = np.zeros(max_key + 1, dtype=np.int64) + values_valid = _values_valid(values) + groupby_ext.groupby_dense_int_count_checked( + keys, valid, np.ascontiguousarray(values_valid), counts, keys_present, False, 0 + ) + codes = np.nonzero(keys_present)[0] + return _maybe_sort( + codes.astype(key_dtype if key_dtype.kind != "b" else np.bool_), counts[codes], sort + ) + + if op == "mean" or value_dtype.kind == "f": + vals = np.ascontiguousarray(values.astype(np.float64, copy=False)) + skip_nan = value_dtype.kind == "f" + if op == "sum": + sums = np.zeros(max_key + 1, dtype=np.float64) + present = np.zeros(max_key + 1, dtype=bool) + groupby_ext.groupby_dense_int_f64_sum_checked( + keys, vals, valid, sums, present, keys_present, False, 0, skip_nan + ) + codes = np.nonzero(keys_present)[0] + result = sums[codes] + result[~present[codes]] = np.nan + elif op == "mean": + sums = np.zeros(max_key + 1, dtype=np.float64) + counts = np.zeros(max_key + 1, dtype=np.int64) + groupby_ext.groupby_dense_int_f64_mean_checked( + keys, vals, valid, sums, counts, keys_present, False, 0, skip_nan + ) + codes = np.nonzero(keys_present)[0] + result = np.full(len(codes), np.nan, dtype=np.float64) + ok = counts[codes] > 0 + result[ok] = sums[codes][ok] / counts[codes][ok] + elif op in {"min", "max"}: + state = np.zeros(max_key + 1, dtype=np.float64) + has_value = np.zeros(max_key + 1, dtype=bool) + kernel = getattr(groupby_ext, f"groupby_dense_int_f64_{op}_checked") + kernel(keys, vals, valid, state, has_value, keys_present, False, 0, skip_nan) + codes = np.nonzero(keys_present)[0] + result = state[codes] + result[~has_value[codes]] 
= np.nan + else: # pragma: no cover + return None + return _maybe_sort(codes.astype(key_dtype if key_dtype.kind != "b" else np.bool_), result, sort) + + if value_dtype.kind not in "biu": + return None + vals_i64 = np.ascontiguousarray(values.astype(np.int64, copy=False)) + state = np.zeros(max_key + 1, dtype=np.int64) + present = np.zeros(max_key + 1, dtype=bool) + kernel = getattr(groupby_ext, f"groupby_dense_int_i64_{op}_checked", None) + if kernel is None: + return None + kernel(keys, vals_i64, valid, state, present, keys_present, False, 0) + codes = np.nonzero(keys_present)[0] + return _maybe_sort(codes.astype(key_dtype if key_dtype.kind != "b" else np.bool_), state[codes], sort) + + +def _try_float_hash(keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool, dropna: bool): + key_dtype = np.dtype(keys.dtype) + if key_dtype.kind != "f": + return None + if values is not None and np.dtype(values.dtype).kind != "f" and op != "count": + return None + try: + from blosc2 import groupby_ext + except ImportError: + return None + + keys_f64 = np.ascontiguousarray(keys.astype(np.float64, copy=False)) + valid = np.ones(len(keys_f64), dtype=bool) + if values is None: + values_f64 = np.empty(len(keys_f64), dtype=np.float64) + values_valid = np.zeros(len(keys_f64), dtype=bool) + has_values = False + else: + values_f64 = np.ascontiguousarray(np.asarray(values, dtype=np.float64)) + values_valid = np.ascontiguousarray(_values_valid(values)) + has_values = True + + groups, row_counts, value_counts, sums, mins, maxs, has_value = groupby_ext.groupby_hash_f64_f64( + keys_f64, values_f64, valid, values_valid, has_values, dropna + ) + groups = groups.astype(key_dtype, copy=False) + if op == "size": + result = row_counts + elif op == "count": + result = value_counts + elif op == "sum": + result = sums.copy() + result[~has_value] = np.nan + elif op == "mean": + result = np.full(len(groups), np.nan, dtype=np.float64) + ok = value_counts > 0 + result[ok] = sums[ok] / 
value_counts[ok] + elif op == "min": + result = mins.copy() + result[~has_value] = np.nan + elif op == "max": + result = maxs.copy() + result[~has_value] = np.nan + else: # pragma: no cover + return None + return _maybe_sort(groups, result, sort) + + +def _group_reduce_numpy( # noqa: C901 + keys: np.ndarray, values: np.ndarray | None, op: str, *, sort: bool, dropna: bool +): + acc: dict[object, list] = {} + display: dict[object, object] = {} + for i, key in enumerate(keys): + key_item = _python_scalar(key) + if isinstance(key_item, float) and math.isnan(key_item): + if dropna: + continue + norm_key = _NAN_KEY + else: + norm_key = key_item + display.setdefault(norm_key, key_item) + state = acc.setdefault(norm_key, [0, 0, 0.0, None, None]) + state[0] += 1 + if values is None: + continue + value = _python_scalar(values[i]) + if isinstance(value, float) and math.isnan(value): + continue + state[1] += 1 + if op in {"sum", "mean"}: + state[2] += value + elif op == "min" and (state[3] is None or value < state[3]): + state[3] = value + elif op == "max" and (state[4] is None or value > state[4]): + state[4] = value + + order = list(acc) + if sort: + order.sort(key=lambda k: (1, "") if k is _NAN_KEY else (0, display[k])) + groups = np.asarray([display[k] for k in order], dtype=keys.dtype) + result = [] + for k in order: + rows, count, total, min_value, max_value = acc[k] + if op == "size": + result.append(rows) + elif op == "count": + result.append(count) + elif op == "sum": + result.append(total if count else _null_value_for(values)) + elif op == "mean": + result.append(math.nan if count == 0 else total / count) + elif op == "min": + result.append(min_value if count else _null_value_for(values)) + elif op == "max": + result.append(max_value if count else _null_value_for(values)) + return groups, np.asarray(result, dtype=_result_dtype(values, op)) + + +def _maybe_sort(groups: np.ndarray, result: np.ndarray, sort: bool): + if sort and len(groups): + order = np.argsort(groups, 
kind="stable") + return groups[order], result[order] + return groups, result + + +def _values_valid(values: np.ndarray) -> np.ndarray: + values = np.asarray(values) + if values.dtype.kind == "f": + return ~np.isnan(values) + return np.ones(len(values), dtype=bool) + + +def _result_dtype(values: np.ndarray | None, op: str): + if op in {"size", "count"}: + return np.int64 + if op == "mean" or values is None: + return np.float64 + dtype = np.dtype(values.dtype) + if op == "sum" and dtype.kind in "biu": + return np.int64 + return dtype + + +def _null_value_for(values: np.ndarray | None): + if values is not None and np.dtype(values.dtype).kind in "iu": + return 0 + return math.nan diff --git a/src/blosc2/groupby_ext.pyx b/src/blosc2/groupby_ext.pyx new file mode 100644 index 00000000..ae5ffd8a --- /dev/null +++ b/src/blosc2/groupby_ext.pyx @@ -0,0 +1,1124 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### +# cython: boundscheck=False, wraparound=False, initializedcheck=False + +"""Cython group-reduce kernels for CTable group_by().""" + +import numpy as np +cimport numpy as np + +from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy + + +# ---------------------------------------------------------------------- +# Group-reduce kernels +# ---------------------------------------------------------------------- + +def groupby_dense_i32_f64_sum( + np.ndarray keys, + np.ndarray values, + np.ndarray valid, + np.ndarray sums, + np.ndarray present, + bint skip_key_null=False, + int32_t key_null=0, + bint skip_value_nan=False, +): + """Accumulate ``sum(values)`` by dense int32 keys. + + This is a low-level CTable group-by helper. 
def groupby_dense_i32_f64_sum_checked(
    np.ndarray keys,
    np.ndarray values,
    np.ndarray valid,
    np.ndarray sums,
    np.ndarray present,
    bint skip_key_null=False,
    int32_t key_null=0,
    bint skip_value_nan=False,
):
    """Checked dense int32/float64 sum kernel.

    *keys*, *values* and *valid* are same-length 1-D arrays (int32, float64,
    bool).  *sums* (float64) and *present* (bool) are same-length 1-D state
    arrays indexed directly by key value.

    Rows with ``valid[i] == False`` are ignored, as are rows whose key equals
    *key_null* when *skip_key_null* is set.  NaN values are skipped when
    *skip_value_nan* is set.

    Returns
    -------
    int
        ``0`` on success; ``-1`` if a negative non-null key is found or the
        largest key is too big for ``max_key + 1`` to fit in a C ``int``;
        otherwise ``max_key + 1``, the minimum state length required, when
        the state arrays need to be grown.  The state is not mutated unless
        the function returns ``0``.

    Raises
    ------
    ValueError
        If an array is not 1-D or the lengths are inconsistent.
    TypeError
        If an array has an unexpected dtype.
    """
    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
        raise ValueError("keys, values and valid must be 1-D arrays")
    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
        raise ValueError("keys, values and valid must have the same length")
    if sums.ndim != 1 or present.ndim != 1:
        raise ValueError("sums and present must be 1-D arrays")
    if keys.dtype != np.dtype(np.int32):
        raise TypeError("keys must have dtype int32")
    if values.dtype != np.dtype(np.float64):
        raise TypeError("values must have dtype float64")
    if valid.dtype != np.dtype(np.bool_):
        raise TypeError("valid must have dtype bool")
    if sums.dtype != np.dtype(np.float64):
        raise TypeError("sums must have dtype float64")
    if present.dtype != np.dtype(np.bool_):
        raise TypeError("present must have dtype bool")
    if present.shape[0] != sums.shape[0]:
        raise ValueError("present and sums must have the same length")

    cdef int32_t[:] keys_view = keys
    cdef double[:] values_view = values
    cdef np.npy_bool[:] valid_view = valid
    cdef double[:] sums_view = sums
    cdef np.npy_bool[:] present_view = present
    cdef Py_ssize_t n = keys.shape[0]
    cdef Py_ssize_t nstates = sums.shape[0]
    cdef Py_ssize_t i
    cdef int32_t key
    cdef int32_t max_key = -1
    cdef int ret = 0
    cdef double value

    with nogil:
        # Pass 1: validate every participating key and find the largest one,
        # without touching the state arrays.
        for i in range(n):
            if not valid_view[i]:
                continue
            key = keys_view[i]
            if skip_key_null and key == key_null:
                continue
            if key < 0:
                ret = -1
                break
            if key > max_key:
                max_key = key
        # max_key < 0 means no row participates: nothing to do, ret stays 0.
        if ret == 0 and max_key >= 0:
            if max_key >= nstates:
                # ``max_key + 1`` would overflow a C int for INT32_MAX; report
                # the key as unsupported, like the other checked kernels do.
                if max_key > 2147483646:
                    ret = -1
                else:
                    ret = max_key + 1
            else:
                # Pass 2: every key is known to index inside the state arrays.
                for i in range(n):
                    if not valid_view[i]:
                        continue
                    key = keys_view[i]
                    if skip_key_null and key == key_null:
                        continue
                    value = values_view[i]
                    if skip_value_nan and value != value:
                        continue
                    sums_view[key] += value
                    present_view[key] = 1
    return ret
def groupby_dense_f32_integral_key_f64_sum_checked(
    np.ndarray keys,
    np.ndarray values,
    np.ndarray valid,
    np.ndarray sums,
    np.ndarray present,
    bint skip_key_nan=True,
    bint skip_value_nan=False,
):
    """Checked dense float32-integral-key/float64 sum kernel.

    Fast path for float32 keys that are exactly integral and lie in
    ``[0, 16777216]``.  *keys* (float32), *values* (float64) and *valid*
    (bool) are same-length 1-D arrays; *sums* (float64) and *present* (bool)
    are same-length 1-D state arrays indexed by the integral key value.

    NaN keys are skipped when *skip_key_nan* is set and make the kernel fail
    otherwise.  NaN values are skipped when *skip_value_nan* is set.

    Returns
    -------
    int
        ``0`` on success, ``-1`` if a key cannot be handled (negative,
        greater than 16777216, non-integral, or NaN while *skip_key_nan* is
        false), or ``max_key + 1`` when the state arrays need to be grown.
        The state is not mutated unless the function returns ``0``.

    Raises
    ------
    ValueError
        If an array is not 1-D or the lengths are inconsistent.
    TypeError
        If an array has an unexpected dtype.
    """
    if keys.ndim != 1 or values.ndim != 1 or valid.ndim != 1:
        raise ValueError("keys, values and valid must be 1-D arrays")
    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
        raise ValueError("keys, values and valid must have the same length")
    if sums.ndim != 1 or present.ndim != 1:
        raise ValueError("sums and present must be 1-D arrays")
    if keys.dtype != np.dtype(np.float32):
        raise TypeError("keys must have dtype float32")
    if values.dtype != np.dtype(np.float64):
        raise TypeError("values must have dtype float64")
    if valid.dtype != np.dtype(np.bool_):
        raise TypeError("valid must have dtype bool")
    if sums.dtype != np.dtype(np.float64):
        raise TypeError("sums must have dtype float64")
    if present.dtype != np.dtype(np.bool_):
        raise TypeError("present must have dtype bool")
    if present.shape[0] != sums.shape[0]:
        raise ValueError("present and sums must have the same length")

    cdef float[:] keys_view = keys
    cdef double[:] values_view = values
    cdef np.npy_bool[:] valid_view = valid
    cdef double[:] sums_view = sums
    cdef np.npy_bool[:] present_view = present
    cdef Py_ssize_t n = keys.shape[0]
    cdef Py_ssize_t nstates = sums.shape[0]
    cdef Py_ssize_t i
    cdef float key_f
    cdef int64_t key_i
    cdef int64_t max_key = -1
    cdef int ret = 0
    cdef double value

    with nogil:
        # Pass 1: validate every participating key and track the largest one.
        for i in range(n):
            if not valid_view[i]:
                continue
            key_f = keys_view[i]
            if key_f != key_f:  # NaN key
                if skip_key_nan:
                    continue
                ret = -1
                break
            # Also rejects +/-inf.
            if key_f < 0.0 or key_f > 16777216.0:
                ret = -1
                break
            key_i = <int64_t>key_f
            if key_f != <float>key_i:  # non-integral key
                ret = -1
                break
            if key_i > max_key:
                max_key = key_i
        # max_key < 0 means no row participates: nothing to do, ret stays 0.
        if ret == 0 and max_key >= 0:
            if max_key >= nstates:
                # Pass 1 caps keys at 16777216, so max_key + 1 always fits in
                # a C int and no overflow guard is needed here.
                ret = <int>(max_key + 1)
            else:
                # Pass 2: pass 1 succeeded, so a NaN key can only appear here
                # when skip_key_nan is set; every other key is a validated
                # in-range integer.
                for i in range(n):
                    if not valid_view[i]:
                        continue
                    key_f = keys_view[i]
                    if key_f != key_f:
                        continue
                    key_i = <int64_t>key_f
                    value = values_view[i]
                    if skip_value_nan and value != value:
                        continue
                    sums_view[key_i] += value
                    present_view[key_i] = 1
    return ret
def groupby_dense_int_size_checked(
    dense_int_key_t[:] keys,
    np.npy_bool[:] valid,
    int64_t[:] counts,
    np.npy_bool[:] keys_present,
    bint skip_key_null=False,
    int64_t key_null=0,
):
    """Count rows per group for dense integer keys of any width.

    Returns the status produced by ``_dense_int_key_scan``; *counts* and
    *keys_present* are only updated when that status is ``0``.
    """
    cdef Py_ssize_t nrows = keys.shape[0]
    cdef Py_ssize_t capacity = counts.shape[0]
    cdef Py_ssize_t row
    cdef int64_t group
    cdef int status
    if nrows != valid.shape[0]:
        raise ValueError("keys and valid must have the same length")
    if capacity != keys_present.shape[0]:
        raise ValueError("counts and keys_present must have the same length")
    with nogil:
        _dense_int_key_scan(keys, valid, nrows, capacity, skip_key_null, key_null, &status)
        if status == 0:
            for row in range(nrows):
                if valid[row]:
                    group = <int64_t>keys[row]
                    if not (skip_key_null and group == key_null):
                        counts[group] += 1
                        keys_present[group] = 1
    return status


def groupby_dense_int_count_checked(
    dense_int_key_t[:] keys,
    np.npy_bool[:] valid,
    np.npy_bool[:] values_valid,
    int64_t[:] counts,
    np.npy_bool[:] keys_present,
    bint skip_key_null=False,
    int64_t key_null=0,
):
    """Count non-null values per group for dense integer keys.

    Every participating row marks its group in *keys_present*; *counts* is
    incremented only where ``values_valid`` is true.  Returns the status
    produced by ``_dense_int_key_scan``; state is only updated when it is
    ``0``.
    """
    cdef Py_ssize_t nrows = keys.shape[0]
    cdef Py_ssize_t capacity = counts.shape[0]
    cdef Py_ssize_t row
    cdef int64_t group
    cdef int status
    if nrows != valid.shape[0] or nrows != values_valid.shape[0]:
        raise ValueError("keys, valid and values_valid must have the same length")
    if capacity != keys_present.shape[0]:
        raise ValueError("counts and keys_present must have the same length")
    with nogil:
        _dense_int_key_scan(keys, valid, nrows, capacity, skip_key_null, key_null, &status)
        if status == 0:
            for row in range(nrows):
                if valid[row]:
                    group = <int64_t>keys[row]
                    if not (skip_key_null and group == key_null):
                        keys_present[group] = 1
                        if values_valid[row]:
                            counts[group] += 1
    return status


def groupby_dense_int_f64_sum_checked(
    dense_int_key_t[:] keys,
    double[:] values,
    np.npy_bool[:] valid,
    double[:] sums,
    np.npy_bool[:] value_present,
    np.npy_bool[:] keys_present,
    bint skip_key_null=False,
    int64_t key_null=0,
    bint skip_value_nan=False,
):
    """Accumulate float64 sums per group for dense integer keys.

    Every participating row marks its group in *keys_present*; a row only
    contributes to *sums* / *value_present* when its value is not skipped as
    NaN.  Returns the status produced by ``_dense_int_key_scan``; state is
    only updated when it is ``0``.
    """
    cdef Py_ssize_t nrows = keys.shape[0]
    cdef Py_ssize_t capacity = sums.shape[0]
    cdef Py_ssize_t row
    cdef int64_t group
    cdef double item
    cdef int status
    if nrows != values.shape[0] or nrows != valid.shape[0]:
        raise ValueError("keys, values and valid must have the same length")
    if capacity != value_present.shape[0] or capacity != keys_present.shape[0]:
        raise ValueError("state arrays must have the same length")
    with nogil:
        _dense_int_key_scan(keys, valid, nrows, capacity, skip_key_null, key_null, &status)
        if status == 0:
            for row in range(nrows):
                if valid[row]:
                    group = <int64_t>keys[row]
                    if not (skip_key_null and group == key_null):
                        keys_present[group] = 1
                        item = values[row]
                        # ``item != item`` is the NaN test.
                        if not (skip_value_nan and item != item):
                            sums[group] += item
                            value_present[group] = 1
    return status
def groupby_dense_int_f64_mean_checked(
    dense_int_key_t[:] keys,
    double[:] values,
    np.npy_bool[:] valid,
    double[:] sums,
    int64_t[:] counts,
    np.npy_bool[:] keys_present,
    bint skip_key_null=False,
    int64_t key_null=0,
    bint skip_value_nan=False,
):
    """Checked dense integer-key float64 mean state kernel.

    Accumulates the per-group running sum (*sums*) and value count
    (*counts*); the division is left to the caller.  Every participating row
    marks its group in *keys_present*.  Returns the status produced by
    ``_dense_int_key_scan``; state is only updated when it is ``0``.
    """
    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
        raise ValueError("keys, values and valid must have the same length")
    if sums.shape[0] != counts.shape[0] or sums.shape[0] != keys_present.shape[0]:
        raise ValueError("state arrays must have the same length")
    cdef Py_ssize_t n = keys.shape[0]
    cdef Py_ssize_t nstates = sums.shape[0]
    cdef Py_ssize_t i
    cdef int64_t key
    cdef double value
    cdef int ret
    with nogil:
        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
        if ret == 0:
            for i in range(n):
                if not valid[i]:
                    continue
                key = keys[i]
                if skip_key_null and key == key_null:
                    continue
                keys_present[key] = 1
                value = values[i]
                if skip_value_nan and value != value:
                    continue
                sums[key] += value
                counts[key] += 1
    return ret


def groupby_dense_int_f64_min_checked(
    dense_int_key_t[:] keys,
    double[:] values,
    np.npy_bool[:] valid,
    double[:] mins,
    np.npy_bool[:] has_value,
    np.npy_bool[:] keys_present,
    bint skip_key_null=False,
    int64_t key_null=0,
    bint skip_value_nan=False,
):
    """Checked dense integer-key float64 min kernel.

    *mins* holds the running minimum per group and is only meaningful where
    *has_value* is set.  Every participating row marks its group in
    *keys_present*.  Returns the status produced by ``_dense_int_key_scan``;
    state is only updated when it is ``0``.

    Raises ``ValueError`` when the row arrays or the state arrays differ in
    length.
    """
    # Bounds checking is disabled for this module, so mismatched lengths
    # must be rejected up front, as the sum/mean kernels do.
    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
        raise ValueError("keys, values and valid must have the same length")
    if mins.shape[0] != has_value.shape[0] or mins.shape[0] != keys_present.shape[0]:
        raise ValueError("state arrays must have the same length")
    cdef Py_ssize_t n = keys.shape[0]
    cdef Py_ssize_t nstates = mins.shape[0]
    cdef Py_ssize_t i
    cdef int64_t key
    cdef double value
    cdef int ret
    with nogil:
        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
        if ret == 0:
            for i in range(n):
                if not valid[i]:
                    continue
                key = keys[i]
                if skip_key_null and key == key_null:
                    continue
                keys_present[key] = 1
                value = values[i]
                if skip_value_nan and value != value:
                    continue
                if not has_value[key] or value < mins[key]:
                    mins[key] = value
                has_value[key] = 1
    return ret


def groupby_dense_int_f64_max_checked(
    dense_int_key_t[:] keys,
    double[:] values,
    np.npy_bool[:] valid,
    double[:] maxs,
    np.npy_bool[:] has_value,
    np.npy_bool[:] keys_present,
    bint skip_key_null=False,
    int64_t key_null=0,
    bint skip_value_nan=False,
):
    """Checked dense integer-key float64 max kernel.

    *maxs* holds the running maximum per group and is only meaningful where
    *has_value* is set.  Every participating row marks its group in
    *keys_present*.  Returns the status produced by ``_dense_int_key_scan``;
    state is only updated when it is ``0``.

    Raises ``ValueError`` when the row arrays or the state arrays differ in
    length.
    """
    # Bounds checking is disabled for this module, so mismatched lengths
    # must be rejected up front, as the sum/mean kernels do.
    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
        raise ValueError("keys, values and valid must have the same length")
    if maxs.shape[0] != has_value.shape[0] or maxs.shape[0] != keys_present.shape[0]:
        raise ValueError("state arrays must have the same length")
    cdef Py_ssize_t n = keys.shape[0]
    cdef Py_ssize_t nstates = maxs.shape[0]
    cdef Py_ssize_t i
    cdef int64_t key
    cdef double value
    cdef int ret
    with nogil:
        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
        if ret == 0:
            for i in range(n):
                if not valid[i]:
                    continue
                key = keys[i]
                if skip_key_null and key == key_null:
                    continue
                keys_present[key] = 1
                value = values[i]
                if skip_value_nan and value != value:
                    continue
                if not has_value[key] or value > maxs[key]:
                    maxs[key] = value
                has_value[key] = 1
    return ret


def groupby_dense_int_i64_min_checked(
    dense_int_key_t[:] keys,
    int64_t[:] values,
    np.npy_bool[:] valid,
    int64_t[:] mins,
    np.npy_bool[:] has_value,
    np.npy_bool[:] keys_present,
    bint skip_key_null=False,
    int64_t key_null=0,
):
    """Checked dense integer-key int64 min kernel.

    *mins* holds the running minimum per group and is only meaningful where
    *has_value* is set.  Every participating row marks its group in
    *keys_present*.  Returns the status produced by ``_dense_int_key_scan``;
    state is only updated when it is ``0``.

    Raises ``ValueError`` when the row arrays or the state arrays differ in
    length.
    """
    # Bounds checking is disabled for this module, so mismatched lengths
    # must be rejected up front, as the sum/mean kernels do.
    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
        raise ValueError("keys, values and valid must have the same length")
    if mins.shape[0] != has_value.shape[0] or mins.shape[0] != keys_present.shape[0]:
        raise ValueError("state arrays must have the same length")
    cdef Py_ssize_t n = keys.shape[0]
    cdef Py_ssize_t nstates = mins.shape[0]
    cdef Py_ssize_t i
    cdef int64_t key
    cdef int64_t value
    cdef int ret
    with nogil:
        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
        if ret == 0:
            for i in range(n):
                if not valid[i]:
                    continue
                key = keys[i]
                if skip_key_null and key == key_null:
                    continue
                keys_present[key] = 1
                value = values[i]
                if not has_value[key] or value < mins[key]:
                    mins[key] = value
                has_value[key] = 1
    return ret


def groupby_dense_int_i64_max_checked(
    dense_int_key_t[:] keys,
    int64_t[:] values,
    np.npy_bool[:] valid,
    int64_t[:] maxs,
    np.npy_bool[:] has_value,
    np.npy_bool[:] keys_present,
    bint skip_key_null=False,
    int64_t key_null=0,
):
    """Checked dense integer-key int64 max kernel.

    *maxs* holds the running maximum per group and is only meaningful where
    *has_value* is set.  Every participating row marks its group in
    *keys_present*.  Returns the status produced by ``_dense_int_key_scan``;
    state is only updated when it is ``0``.

    Raises ``ValueError`` when the row arrays or the state arrays differ in
    length.
    """
    # Bounds checking is disabled for this module, so mismatched lengths
    # must be rejected up front, as the sum/mean kernels do.
    if keys.shape[0] != values.shape[0] or keys.shape[0] != valid.shape[0]:
        raise ValueError("keys, values and valid must have the same length")
    if maxs.shape[0] != has_value.shape[0] or maxs.shape[0] != keys_present.shape[0]:
        raise ValueError("state arrays must have the same length")
    cdef Py_ssize_t n = keys.shape[0]
    cdef Py_ssize_t nstates = maxs.shape[0]
    cdef Py_ssize_t i
    cdef int64_t key
    cdef int64_t value
    cdef int ret
    with nogil:
        _dense_int_key_scan(keys, valid, n, nstates, skip_key_null, key_null, &ret)
        if ret == 0:
            for i in range(n):
                if not valid[i]:
                    continue
                key = keys[i]
                if skip_key_null and key == key_null:
                    continue
                keys_present[key] = 1
                value = values[i]
                if not has_value[key] or value > maxs[key]:
                    maxs[key] = value
                has_value[key] = 1
    return ret


# ----------------------------------------------------------------------
# Arbitrary float-key hash kernels
# ----------------------------------------------------------------------

cdef inline uint64_t _f64_bits(double value) noexcept:
    """Return the raw 64-bit pattern of *value* (byte copy via memcpy)."""
    cdef uint64_t bits
    memcpy(&bits, &value, sizeof(double))
    return bits


cdef inline uint64_t _mix_u64(uint64_t x) noexcept:
    """Scramble *x* with three xor-shift steps and two odd multiplications."""
    x ^= x >> 30
    x *= 0xbf58476d1ce4e5b9
    x ^= x >> 27
    x *= 0x94d049bb133111eb
    x ^= x >> 31
    return x
+ """ + if keys.shape[0] != valid.shape[0]: + raise ValueError("keys and valid must have the same length") + if has_values and (values.shape[0] != keys.shape[0] or values_valid.shape[0] != keys.shape[0]): + raise ValueError("values, values_valid and keys must have the same length") + + cdef Py_ssize_t n = keys.shape[0] + cdef Py_ssize_t cap = 1024 + cdef Py_ssize_t used_count = 0 + cdef Py_ssize_t i, pos, old_pos, out_pos + cdef uint64_t mask = cap - 1 + cdef uint64_t bits, h, old_bits + cdef double key, value + cdef double nan_value = float("nan") + cdef uint64_t nan_bits = 0x7ff8000000000000 + cdef bint value_ok + + cdef uint64_t* table_bits = malloc(cap * sizeof(uint64_t)) + cdef np.npy_bool* table_used = malloc(cap * sizeof(np.npy_bool)) + cdef double* table_keys = malloc(cap * sizeof(double)) + cdef int64_t* row_counts = malloc(cap * sizeof(int64_t)) + cdef int64_t* value_counts = malloc(cap * sizeof(int64_t)) + cdef double* sums = malloc(cap * sizeof(double)) + cdef double* mins = malloc(cap * sizeof(double)) + cdef double* maxs = malloc(cap * sizeof(double)) + cdef np.npy_bool* has_value = malloc(cap * sizeof(np.npy_bool)) + + cdef uint64_t* new_bits + cdef np.npy_bool* new_used + cdef double* new_keys + cdef int64_t* new_row_counts + cdef int64_t* new_value_counts + cdef double* new_sums + cdef double* new_mins + cdef double* new_maxs + cdef np.npy_bool* new_has_value + cdef Py_ssize_t old_cap + cdef uint64_t new_mask + + if ( + table_bits == NULL + or table_used == NULL + or table_keys == NULL + or row_counts == NULL + or value_counts == NULL + or sums == NULL + or mins == NULL + or maxs == NULL + or has_value == NULL + ): + free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) + raise MemoryError() + + for i in range(cap): + table_used[i] = 0 + + try: + for i in range(n): + if not valid[i]: + continue + key = keys[i] + if key != key: + if dropna: + continue + bits = 
nan_bits + key = nan_value + elif key == 0.0: + key = 0.0 + bits = 0 + else: + bits = _f64_bits(key) + + if (used_count + 1) * 2 >= cap: + old_cap = cap + cap *= 2 + mask = cap - 1 + new_bits = malloc(cap * sizeof(uint64_t)) + new_used = malloc(cap * sizeof(np.npy_bool)) + new_keys = malloc(cap * sizeof(double)) + new_row_counts = malloc(cap * sizeof(int64_t)) + new_value_counts = malloc(cap * sizeof(int64_t)) + new_sums = malloc(cap * sizeof(double)) + new_mins = malloc(cap * sizeof(double)) + new_maxs = malloc(cap * sizeof(double)) + new_has_value = malloc(cap * sizeof(np.npy_bool)) + if ( + new_bits == NULL + or new_used == NULL + or new_keys == NULL + or new_row_counts == NULL + or new_value_counts == NULL + or new_sums == NULL + or new_mins == NULL + or new_maxs == NULL + or new_has_value == NULL + ): + free(new_bits); free(new_used); free(new_keys); free(new_row_counts); free(new_value_counts) + free(new_sums); free(new_mins); free(new_maxs); free(new_has_value) + raise MemoryError() + for pos in range(cap): + new_used[pos] = 0 + for old_pos in range(old_cap): + if not table_used[old_pos]: + continue + old_bits = table_bits[old_pos] + h = _mix_u64(old_bits) + pos = (h & mask) + while new_used[pos]: + pos = ((pos + 1) & mask) + new_used[pos] = 1 + new_bits[pos] = old_bits + new_keys[pos] = table_keys[old_pos] + new_row_counts[pos] = row_counts[old_pos] + new_value_counts[pos] = value_counts[old_pos] + new_sums[pos] = sums[old_pos] + new_mins[pos] = mins[old_pos] + new_maxs[pos] = maxs[old_pos] + new_has_value[pos] = has_value[old_pos] + free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) + table_bits = new_bits + table_used = new_used + table_keys = new_keys + row_counts = new_row_counts + value_counts = new_value_counts + sums = new_sums + mins = new_mins + maxs = new_maxs + has_value = new_has_value + + h = _mix_u64(bits) + pos = (h & mask) + while table_used[pos] 
and table_bits[pos] != bits: + pos = ((pos + 1) & mask) + if not table_used[pos]: + table_used[pos] = 1 + table_bits[pos] = bits + table_keys[pos] = key + row_counts[pos] = 0 + value_counts[pos] = 0 + sums[pos] = 0.0 + mins[pos] = 0.0 + maxs[pos] = 0.0 + has_value[pos] = 0 + used_count += 1 + + row_counts[pos] += 1 + if has_values: + value_ok = values_valid[i] + if value_ok: + value = values[i] + value_counts[pos] += 1 + sums[pos] += value + if not has_value[pos] or value < mins[pos]: + mins[pos] = value + if not has_value[pos] or value > maxs[pos]: + maxs[pos] = value + has_value[pos] = 1 + + out_keys = np.empty(used_count, dtype=np.float64) + out_row_counts = np.empty(used_count, dtype=np.int64) + out_value_counts = np.empty(used_count, dtype=np.int64) + out_sums = np.empty(used_count, dtype=np.float64) + out_mins = np.empty(used_count, dtype=np.float64) + out_maxs = np.empty(used_count, dtype=np.float64) + out_has_value = np.empty(used_count, dtype=bool) + + out_pos = 0 + for pos in range(cap): + if not table_used[pos]: + continue + out_keys[out_pos] = table_keys[pos] + out_row_counts[out_pos] = row_counts[pos] + out_value_counts[out_pos] = value_counts[pos] + out_sums[out_pos] = sums[pos] + out_mins[out_pos] = mins[pos] + out_maxs[out_pos] = maxs[pos] + out_has_value[out_pos] = has_value[pos] + out_pos += 1 + return out_keys, out_row_counts, out_value_counts, out_sums, out_mins, out_maxs, out_has_value + finally: + free(table_bits); free(table_used); free(table_keys); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) + + +def groupby_hash_i64x2_f64( + int64_t[:] key0, + int64_t[:] key1, + double[:] values, + np.npy_bool[:] valid, + np.npy_bool[:] values_valid, + bint has_values, +): + """Hash two int64-normalized keys and accumulate float64 group states.""" + if key0.shape[0] != key1.shape[0] or key0.shape[0] != valid.shape[0]: + raise ValueError("key0, key1 and valid must have the same length") + if has_values and 
(values.shape[0] != key0.shape[0] or values_valid.shape[0] != key0.shape[0]): + raise ValueError("values, values_valid and keys must have the same length") + + cdef Py_ssize_t n = key0.shape[0] + cdef Py_ssize_t cap = 1024 + cdef Py_ssize_t used_count = 0 + cdef Py_ssize_t i, pos, old_pos, out_pos + cdef uint64_t mask = cap - 1 + cdef uint64_t h + cdef int64_t k0, k1 + cdef double value + cdef bint value_ok + + cdef int64_t* table_k0 = malloc(cap * sizeof(int64_t)) + cdef int64_t* table_k1 = malloc(cap * sizeof(int64_t)) + cdef np.npy_bool* table_used = malloc(cap * sizeof(np.npy_bool)) + cdef int64_t* row_counts = malloc(cap * sizeof(int64_t)) + cdef int64_t* value_counts = malloc(cap * sizeof(int64_t)) + cdef double* sums = malloc(cap * sizeof(double)) + cdef double* mins = malloc(cap * sizeof(double)) + cdef double* maxs = malloc(cap * sizeof(double)) + cdef np.npy_bool* has_value = malloc(cap * sizeof(np.npy_bool)) + + cdef int64_t* new_k0 + cdef int64_t* new_k1 + cdef np.npy_bool* new_used + cdef int64_t* new_row_counts + cdef int64_t* new_value_counts + cdef double* new_sums + cdef double* new_mins + cdef double* new_maxs + cdef np.npy_bool* new_has_value + cdef Py_ssize_t old_cap + + if ( + table_k0 == NULL + or table_k1 == NULL + or table_used == NULL + or row_counts == NULL + or value_counts == NULL + or sums == NULL + or mins == NULL + or maxs == NULL + or has_value == NULL + ): + free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) + raise MemoryError() + + for i in range(cap): + table_used[i] = 0 + + try: + for i in range(n): + if not valid[i]: + continue + k0 = key0[i] + k1 = key1[i] + + if (used_count + 1) * 2 >= cap: + old_cap = cap + cap *= 2 + mask = cap - 1 + new_k0 = malloc(cap * sizeof(int64_t)) + new_k1 = malloc(cap * sizeof(int64_t)) + new_used = malloc(cap * sizeof(np.npy_bool)) + new_row_counts = malloc(cap * sizeof(int64_t)) + new_value_counts = 
malloc(cap * sizeof(int64_t)) + new_sums = malloc(cap * sizeof(double)) + new_mins = malloc(cap * sizeof(double)) + new_maxs = malloc(cap * sizeof(double)) + new_has_value = malloc(cap * sizeof(np.npy_bool)) + if ( + new_k0 == NULL + or new_k1 == NULL + or new_used == NULL + or new_row_counts == NULL + or new_value_counts == NULL + or new_sums == NULL + or new_mins == NULL + or new_maxs == NULL + or new_has_value == NULL + ): + free(new_k0); free(new_k1); free(new_used); free(new_row_counts); free(new_value_counts) + free(new_sums); free(new_mins); free(new_maxs); free(new_has_value) + raise MemoryError() + for pos in range(cap): + new_used[pos] = 0 + for old_pos in range(old_cap): + if not table_used[old_pos]: + continue + h = _mix_u64(table_k0[old_pos]) ^ _mix_u64(table_k1[old_pos] + 0x9e3779b97f4a7c15) + pos = (h & mask) + while new_used[pos]: + pos = ((pos + 1) & mask) + new_used[pos] = 1 + new_k0[pos] = table_k0[old_pos] + new_k1[pos] = table_k1[old_pos] + new_row_counts[pos] = row_counts[old_pos] + new_value_counts[pos] = value_counts[old_pos] + new_sums[pos] = sums[old_pos] + new_mins[pos] = mins[old_pos] + new_maxs[pos] = maxs[old_pos] + new_has_value[pos] = has_value[old_pos] + free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) + table_k0 = new_k0 + table_k1 = new_k1 + table_used = new_used + row_counts = new_row_counts + value_counts = new_value_counts + sums = new_sums + mins = new_mins + maxs = new_maxs + has_value = new_has_value + + h = _mix_u64(k0) ^ _mix_u64(k1 + 0x9e3779b97f4a7c15) + pos = (h & mask) + while table_used[pos] and (table_k0[pos] != k0 or table_k1[pos] != k1): + pos = ((pos + 1) & mask) + if not table_used[pos]: + table_used[pos] = 1 + table_k0[pos] = k0 + table_k1[pos] = k1 + row_counts[pos] = 0 + value_counts[pos] = 0 + sums[pos] = 0.0 + mins[pos] = 0.0 + maxs[pos] = 0.0 + has_value[pos] = 0 + used_count += 1 + + row_counts[pos] += 1 + if 
has_values: + value_ok = values_valid[i] + if value_ok: + value = values[i] + value_counts[pos] += 1 + sums[pos] += value + if not has_value[pos] or value < mins[pos]: + mins[pos] = value + if not has_value[pos] or value > maxs[pos]: + maxs[pos] = value + has_value[pos] = 1 + + out_k0 = np.empty(used_count, dtype=np.int64) + out_k1 = np.empty(used_count, dtype=np.int64) + out_row_counts = np.empty(used_count, dtype=np.int64) + out_value_counts = np.empty(used_count, dtype=np.int64) + out_sums = np.empty(used_count, dtype=np.float64) + out_mins = np.empty(used_count, dtype=np.float64) + out_maxs = np.empty(used_count, dtype=np.float64) + out_has_value = np.empty(used_count, dtype=bool) + + out_pos = 0 + for pos in range(cap): + if not table_used[pos]: + continue + out_k0[out_pos] = table_k0[pos] + out_k1[out_pos] = table_k1[pos] + out_row_counts[out_pos] = row_counts[pos] + out_value_counts[out_pos] = value_counts[pos] + out_sums[out_pos] = sums[pos] + out_mins[out_pos] = mins[pos] + out_maxs[out_pos] = maxs[pos] + out_has_value[out_pos] = has_value[pos] + out_pos += 1 + return out_k0, out_k1, out_row_counts, out_value_counts, out_sums, out_mins, out_maxs, out_has_value + finally: + free(table_k0); free(table_k1); free(table_used); free(row_counts); free(value_counts) + free(sums); free(mins); free(maxs); free(has_value) diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx index 759f5980..91072bea 100644 --- a/src/blosc2/indexing_ext.pyx +++ b/src/blosc2/indexing_ext.pyx @@ -2495,3 +2495,6 @@ def keysort_keys_indices(np.ndarray keys, np.ndarray indices): return None _keysort_ndarray(keys, indices) return None + + +# ---------------------------------------------------------------------- diff --git a/tests/ctable/test_groupby.py b/tests/ctable/test_groupby.py new file mode 100644 index 00000000..fec1fca4 --- /dev/null +++ b/tests/ctable/test_groupby.py @@ -0,0 +1,400 @@ +####################################################################### +# 
Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from dataclasses import dataclass, make_dataclass
+
+import numpy as np
+import pytest
+
+import blosc2
+from blosc2 import CTable
+
+
+@dataclass
+class SalesRow:
+    city: str = blosc2.field(blosc2.string(max_length=16))
+    category: int = blosc2.field(blosc2.int32())
+    sales: float = blosc2.field(blosc2.float64(nullable=True), default=0.0)
+    qty: int = blosc2.field(blosc2.int32(), default=0)
+
+
+DATA = [  # one tuple per row, in SalesRow field order: (city, category, sales, qty)
+    ("Paris", 1, 10.0, 1),
+    ("Paris", 1, np.nan, 2),
+    ("Rome", 1, 20.0, 3),
+    ("Paris", 2, 30.0, 4),
+    ("Rome", 1, 40.0, 5),
+    ("Berlin", 2, np.nan, 6),
+]
+
+
+def col(table, name):
+    return list(table._cols[name][: table.nrows])  # first `nrows` stored values of one column
+
+
+def rows(table):
+    return [tuple(table._cols[name][i] for name in table.col_names) for i in range(table.nrows)]
+
+
+def test_groupby_size_counts_rows_per_group():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by("city", sort=True).size()
+
+    assert out.col_names == ["city", "size"]
+    assert rows(out) == [("Berlin", 1), ("Paris", 3), ("Rome", 2)]
+
+
+def test_groupby_count_counts_non_null_values():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by("city", sort=True).count("sales")
+
+    assert out.col_names == ["city", "sales_count"]
+    assert rows(out) == [("Berlin", 0), ("Paris", 2), ("Rome", 2)]  # NaN sales are not counted
+
+
+def test_groupby_agg_numeric_reductions():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by("city", sort=True).agg({"sales": ["sum", "mean", "min", "max", "count"]})
+
+    assert out.col_names == ["city", "sales_sum", "sales_mean", "sales_min", "sales_max", "sales_count"]
+    got = rows(out)
+    assert got[0][0] == "Berlin"  # Berlin's only sales value is NaN, so every reduction is NaN
+    assert np.isnan(got[0][1])
+    assert np.isnan(got[0][2])
+    assert np.isnan(got[0][3])
+    assert np.isnan(got[0][4])
+    assert got[0][5] == 0
+    assert got[1] == ("Paris", 40.0, 20.0, 10.0, 30.0, 2)
+    assert got[2] == ("Rome", 60.0, 30.0, 20.0, 40.0, 2)
+
+
+def test_groupby_multi_key_size():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by(["city", "category"], sort=True).size()
+
+    assert rows(out) == [("Berlin", 2, 1), ("Paris", 1, 2), ("Paris", 2, 1), ("Rome", 1, 2)]
+
+
+def test_groupby_respects_views_and_deleted_rows():
+    t = CTable(SalesRow, new_data=DATA)
+    t.delete(2)  # drop ("Rome", 1, 20.0, 3): it passes the filter below, so the delete is observable
+    view = t.where("qty >= 3")
+
+    out = view.group_by("city", sort=True).size()
+
+    assert rows(out) == [("Berlin", 1), ("Paris", 1), ("Rome", 1)]
+
+
+@dataclass
+class DictRow:
+    city: str = blosc2.field(blosc2.dictionary())
+    sales: int = blosc2.field(blosc2.int32())
+
+
+def test_groupby_dictionary_key_groups_by_decoded_value():
+    t = CTable(DictRow, new_data=[("Paris", 10), ("Rome", 20), ("Paris", 30)])
+
+    out = t.group_by("city", sort=True).agg({"sales": "sum"})
+
+    assert out.col_names == ["city", "sales_sum"]
+    assert rows(out) == [("Paris", 40), ("Rome", 20)]
+
+
+def test_groupby_dictionary_key_beyond_default_code_capacity():
+    data = [("Paris" if i % 2 == 0 else "Rome", 1) for i in range(5000)]
+    t = CTable(DictRow, new_data=data)
+
+    out = t.group_by("city", sort=True).size()
+
+    assert rows(out) == [("Paris", 2500), ("Rome", 2500)]
+
+
+def test_groupby_dropna_key_default_and_false():
+    t = CTable(DictRow, new_data=[("Paris", 10), (None, 20), ("Paris", 30)])
+
+    dropped = t.group_by("city", sort=True).size()
+    kept = t.group_by("city", sort=True, dropna=False).size()
+
+    assert rows(dropped) == [("Paris", 2)]
+    assert rows(kept) == [(None, 1), ("Paris", 2)]
+
+
+def test_groupby_agg_star_size():
+    t = CTable(SalesRow, new_data=DATA)
+
+    out = t.group_by("city", sort=True).agg({"*": "size"})
+
+    assert rows(out) == [("Berlin", 1), ("Paris", 3), ("Rome", 2)]
+
+
+def test_groupby_empty_table_returns_empty_result():
+    t = CTable(SalesRow)
+
+    out = t.group_by("city").size()
+
+    assert out.nrows == 0
+    assert out.col_names == ["city", "size"]
+
+
+@dataclass
+class Int32FloatRow:
+    key: int = blosc2.field(blosc2.int32())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@dataclass
+class Float64KeyRow:
+    key: float = blosc2.field(blosc2.float64())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@dataclass
+class Float32KeyRow:
+    key: float = blosc2.field(blosc2.float32())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@dataclass
+class DictFloatRow:
+    key: str = blosc2.field(blosc2.dictionary())
+    value: float = blosc2.field(blosc2.float64())
+
+
+@pytest.mark.parametrize(
+    ("row_type", "data", "expected"),
+    [
+        (
+            Int32FloatRow,
+            [(0, 1.5), (2, 10.0), (1, 2.5), (2, 3.0), (0, 4.0)],
+            [(0, 5.5), (1, 2.5), (2, 13.0)],
+        ),
+        (
+            Float64KeyRow,
+            [(0.0, 1.5), (2.0, 10.0), (1.0, 2.5), (2.0, 3.0), (0.0, 4.0)],
+            [(0.0, 5.5), (1.0, 2.5), (2.0, 13.0)],
+        ),
+        (
+            Float32KeyRow,
+            [(0.0, 1.5), (2.0, 10.0), (1.0, 2.5), (2.0, 3.0), (0.0, 4.0)],
+            [(0.0, 5.5), (1.0, 2.5), (2.0, 13.0)],
+        ),
+        (
+            DictFloatRow,
+            [("a", 1.5), ("c", 10.0), ("b", 2.5), ("c", 3.0), ("a", 4.0)],
+            [("a", 5.5), ("c", 13.0), ("b", 2.5)],
+        ),
+    ],
+)
+def test_groupby_fast_path_sum_variants(row_type, data, expected):
+    t = CTable(row_type, new_data=data)
+
+    out = t.group_by("key").agg({"value": "sum"})  # no sort=True: `expected` pins the output order
+
+    assert rows(out) == expected
+
+
+def test_groupby_float_integral_fast_path_falls_back_for_non_integral_keys():
+    t = CTable(Float64KeyRow, new_data=[(0.5, 1.0), (1.5, 2.0), (0.5, 3.0)])
+
+    out = t.group_by("key").agg({"value": "sum"})
+
+    assert rows(out) == [(0.5, 4.0), (1.5, 2.0)]
+
+
+def test_groupby_float_integral_fast_path_falls_back_for_nan_group_when_kept():
+    t = CTable(Float64KeyRow, new_data=[(0.0, 1.0), (np.nan, 2.0), (0.0, 3.0)])
+
+    out = t.group_by("key", dropna=False).agg({"value": "sum"})
+
+    got = rows(out)
+    assert got[0] == (0.0, 4.0)
+    assert np.isnan(got[1][0])  # NaN != NaN, so the NaN key cannot be checked by tuple equality
+    assert got[1][1] == 2.0
+
+
+def test_groupby_rejects_bad_engine():
+    t = CTable(SalesRow, new_data=DATA)
+
+    with pytest.raises(ValueError):
+        t.group_by("city", engine="cython")
+
+
+@pytest.mark.parametrize(
+    ("schema_factory", "values"),
+    [
+        (blosc2.int8, [0, 2, 1, 2, 0]),
+        (blosc2.uint8, [0, 2, 1, 2, 0]),
+        (blosc2.int16, [0, 2, 1, 2, 0]),
+        (blosc2.uint16, [0, 2, 1, 2, 0]),
+        (blosc2.int32, [0, 2, 1, 2, 0]),
+        (blosc2.uint32, [0, 2, 1, 2, 0]),
+        (blosc2.int64, [0, 2, 1, 2, 0]),
+        (blosc2.uint64, [0, 2, 1, 2, 0]),
+    ],
+)
+def test_groupby_cython_fused_integer_key_dtypes(schema_factory, values):
+    row_type = make_dataclass(
+        f"FusedKey{schema_factory.__name__}Row",
+        [
+            ("key", int, blosc2.field(schema_factory())),
+            ("value", int, blosc2.field(blosc2.int32())),
+        ],
+    )
+    t = CTable(row_type, new_data=list(zip(values, [1, 10, 2, 3, 4], strict=True)))
+
+    out = t.group_by("key", sort=True).agg({"value": "sum"})
+
+    assert rows(out) == [(0, 5), (1, 2), (2, 13)]
+
+
+def test_groupby_cython_integer_key_more_integer_aggs():
+    row_type = make_dataclass(
+        "IntKeyMoreIntegerAggsRow",
+        [
+            ("key", int, blosc2.field(blosc2.int16())),
+            ("value", int, blosc2.field(blosc2.int32())),
+        ],
+    )
+    t = CTable(row_type, new_data=[(0, 5), (1, 10), (0, -2), (1, 20), (2, 7)])
+
+    out = t.group_by("key", sort=True).agg({"*": "size", "value": ["count", "sum", "mean", "min", "max"]})
+
+    assert rows(out) == [(0, 2, 2, 3, 1.5, -2, 5), (1, 2, 2, 30, 15.0, 10, 20), (2, 1, 1, 7, 7.0, 7, 7)]
+
+
+def test_groupby_cython_integer_key_nullable_float_aggs():
+    row_type = make_dataclass(
+        "IntKeyNullableFloatAggsRow",
+        [
+            ("key", int, blosc2.field(blosc2.uint16())),
+            ("value", float, blosc2.field(blosc2.float64(nullable=True))),
+        ],
+    )
+    t = CTable(row_type, new_data=[(0, 1.5), (1, np.nan), (0, 2.5), (1, np.nan), (2, 10.0)])
+
+    out = t.group_by("key", sort=True).agg({"value": ["count", "sum", "mean", "min", "max"]})
+
+    got = rows(out)
+    assert got[0] == (0, 2, 4.0, 2.0, 1.5, 2.5)
+    assert got[1][0] == 1  # key 1 holds only NaN values: count is 0, the other reductions are NaN
+    assert got[1][1] == 0
+    assert np.isnan(got[1][2])
+    assert np.isnan(got[1][3])
+    assert np.isnan(got[1][4])
+    assert np.isnan(got[1][5])
+    assert got[2] == (2, 1, 10.0, 10.0, 10.0, 10.0)
+
+
+def test_groupby_cython_arbitrary_float_key_aggs():
+    t = CTable(
+        Float64KeyRow,
+        new_data=[(0.5, 1.0), (1.25, 10.0), (0.5, 3.0), (-2.5, 4.0), (1.25, 2.0)],
+    )
+
+    out = t.group_by("key").agg({"value": ["count", "sum", "mean", "min", "max"]})
+
+    assert rows(out) == [
+        (-2.5, 1, 4.0, 4.0, 4.0, 4.0),
+        (0.5, 2, 4.0, 2.0, 1.0, 3.0),
+        (1.25, 2, 12.0, 6.0, 2.0, 10.0),
+    ]
+
+
+def test_groupby_cython_arbitrary_float_key_nan_and_signed_zero():
+    t = CTable(Float64KeyRow, new_data=[(-0.0, 1.0), (0.0, 2.0), (np.nan, 3.0), (np.nan, 4.0)])
+
+    dropped = t.group_by("key").agg({"value": "sum"})
+    kept = t.group_by("key", dropna=False).agg({"value": "sum"})
+
+    assert rows(dropped) == [(0.0, 3.0)]  # -0.0 and 0.0 fall into one group; NaN keys dropped
+    got = rows(kept)
+    assert got[0] == (0.0, 3.0)
+    assert np.isnan(got[1][0])
+    assert got[1][1] == 7.0
+
+
+@dataclass
+class TwoIntKeyFloatRow:
+    key0: int = blosc2.field(blosc2.int16())
+    key1: int = blosc2.field(blosc2.uint16())
+    value: float = blosc2.field(blosc2.float64(nullable=True), default=0.0)
+
+
+def test_groupby_cython_two_integer_key_hash_aggs():
+    t = CTable(
+        TwoIntKeyFloatRow,
+        new_data=[(0, 1, 1.0), (0, 1, 3.0), (0, 2, 10.0), (1, 1, np.nan), (1, 1, 5.0)],
+    )
+
+    out = t.group_by(["key0", "key1"], sort=True).agg(
+        {"*": "size", "value": ["count", "sum", "mean", "min", "max"]}
+    )
+
+    assert rows(out) == [
+        (0, 1, 2, 2, 4.0, 2.0, 1.0, 3.0),
+        (0, 2, 1, 1, 10.0, 10.0, 10.0, 10.0),
+        (1, 1, 2, 1, 5.0, 5.0, 5.0, 5.0),
+    ]
+
+
+@dataclass
+class DictIntKeyFloatRow:
+    key0: str = blosc2.field(blosc2.dictionary())
+    key1: int = blosc2.field(blosc2.int32())
+    value: float = blosc2.field(blosc2.float64())
+
+
+def test_groupby_cython_dictionary_integer_key_hash():
+    t = CTable(DictIntKeyFloatRow, new_data=[("b", 2, 1.0), ("a", 1, 2.0), ("b", 2, 3.0)])
+
+    out = t.group_by(["key0", "key1"], sort=True).agg({"value": "sum"})
+
+    assert rows(out) == [("a", 1, 2.0), ("b", 2, 4.0)]
+
+
+def test_groupby_convenience_numeric_methods():
+    t = CTable(SalesRow, new_data=DATA)
+
+    assert rows(t.group_by("city", sort=True).sum("qty")) == rows(
+        t.group_by("city", sort=True).agg({"qty": "sum"})
+    )
+    assert rows(t.group_by("city", sort=True).mean("qty")) == rows(
+        t.group_by("city", sort=True).agg({"qty": "mean"})
+    )
+    assert rows(t.group_by("city", sort=True).min("qty")) == rows(
+        t.group_by("city", sort=True).agg({"qty": "min"})
+    )
+    assert rows(t.group_by("city", sort=True).max("qty")) == rows(
+        t.group_by("city", sort=True).agg({"qty": "max"})
+    )
+
+
+def test_groupby_persistent_output_urlpath(tmp_path):
+    t = CTable(SalesRow, new_data=DATA)
+    path = tmp_path / "grouped.b2d"
+
+    out = t.group_by("city", sort=True).agg({"qty": "sum"}, urlpath=path)
+    out.close()  # close the result before reopening the same path below
+
+    reopened = CTable.open(str(path), mode="r")
+    assert reopened.col_names == ["city", "qty_sum"]
+    assert rows(reopened) == [("Berlin", 6), ("Paris", 7), ("Rome", 8)]
+
+
+def test_groupby_persistent_output_urlpath_on_convenience_method(tmp_path):
+    t = CTable(SalesRow, new_data=DATA)
+    path = tmp_path / "grouped_mean.b2d"
+
+    out = t.group_by("city", sort=True).mean("qty", urlpath=path)
+    out.close()
+
+    reopened = CTable.open(str(path), mode="r")  # 7 / 3 is inexact in binary, hence approx below
+    assert rows(reopened) == [("Berlin", 6.0), ("Paris", pytest.approx(7 / 3)), ("Rome", 4.0)]
diff --git a/tests/ctable/test_nested_append.py b/tests/ctable/test_nested_append.py
new file mode 100644
index 00000000..7be94a6e
--- /dev/null
+++ b/tests/ctable/test_nested_append.py
@@ -0,0 +1,96 @@
+"""Tests for Ph 3.1: append/extend with nested dict rows on tables with dotted column names."""
+
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import blosc2
+
+
+@dataclass
+class FlatTrip:
+    trip_begin_lon: float
+    trip_begin_lat: float
+    payment_fare: float
+
+
+def _make_nested_table():
+    """Create a CTable with dotted (nested) column names via rename."""
+    t = blosc2.CTable(FlatTrip)
+    t.rename_column("trip_begin_lon", "trip.begin.lon")
+    t.rename_column("trip_begin_lat", "trip.begin.lat")
+    t.rename_column("payment_fare", "payment.fare")
+    return t
+
+
+def test_append_nested_dict():
+    """append() accepts a fully-nested dict and flattens it to dotted keys."""
+    t = _make_nested_table()
+    t.append({"trip": {"begin": {"lon": 1.0, "lat": 2.0}}, "payment": {"fare": 10.0}})
+    t.append({"trip": {"begin": {"lon": 3.0, "lat": 4.0}}, "payment": {"fare": 20.0}})
+
+    assert t.nrows == 2
+    np.testing.assert_allclose(t["trip.begin.lon"][:], [1.0, 3.0])
+    np.testing.assert_allclose(t["trip.begin.lat"][:], [2.0, 4.0])
+    np.testing.assert_allclose(t["payment.fare"][:], [10.0, 20.0])
+
+
+def test_append_flat_dotted_dict_unchanged():
+    """append() with already-flat dotted keys continues to work."""
+    t = _make_nested_table()
+    t.append({"trip.begin.lon": 5.0, "trip.begin.lat": 6.0, "payment.fare": 30.0})
+
+    assert t.nrows == 1
+    assert t["trip.begin.lon"][0] == pytest.approx(5.0)
+
+
+def test_extend_list_of_nested_dicts():
+    """extend() with a list of nested dicts flattens each row."""
+    t = _make_nested_table()
+    rows = [
+        {"trip": {"begin": {"lon": 1.0, "lat": 2.0}}, "payment": {"fare": 10.0}},
+        {"trip": {"begin": {"lon": 3.0, "lat": 4.0}}, "payment": {"fare": 20.0}},
+        {"trip": {"begin": {"lon": 5.0, "lat": 6.0}}, "payment": {"fare": 30.0}},
+    ]
+    t.extend(rows)
+
+    assert t.nrows == 3
+    np.testing.assert_allclose(t["trip.begin.lon"][:], [1.0, 3.0, 5.0])
+    np.testing.assert_allclose(t["payment.fare"][:], [10.0, 20.0, 30.0])
+
+
+def test_extend_nested_dict_of_arrays():
+    """extend() with a nested dict-of-arrays flattens the outer dict to dotted keys."""
+    t = _make_nested_table()
+    t.extend(
+        {
+            "trip": {"begin": {"lon": [1.0, 2.0, 3.0], "lat": [4.0, 5.0, 6.0]}},
+            "payment": {"fare": [10.0, 20.0, 30.0]},
+        }
+    )
+
+    assert t.nrows == 3
+    np.testing.assert_allclose(t["trip.begin.lon"][:], [1.0, 2.0, 3.0])
+    np.testing.assert_allclose(t["trip.begin.lat"][:], [4.0, 5.0, 6.0])
+    np.testing.assert_allclose(t["payment.fare"][:], [10.0, 20.0, 30.0])
+
+
+def test_append_nested_dict_where_and_attribute_access():
+    """append() with nested dicts integrates correctly with where() and attribute proxy."""
+    t = _make_nested_table()
+    for lon, lat, fare in [(1.0, 2.0, 5.0), (3.0, 4.0, 15.0), (5.0, 6.0, 25.0)]:
+        t.append({"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}})
+
+    view = t.where("payment.fare > 10")
+    assert view.nrows == 2
+    assert t.trip.begin.lon.max() == pytest.approx(5.0)
+
+
+def test_nested_dotted_string_where_in_aggregate():
+    """Aggregate where= strings accept dotted nested column names."""
+    t = _make_nested_table()
+    for lon, lat, fare in [(1.0, 2.0, 5.0), (3.0, 4.0, 15.0), (5.0, 6.0, 25.0)]:
+        t.append({"trip": {"begin": {"lon": lon, "lat": lat}}, "payment": {"fare": fare}})
+
+    assert t.trip.begin.lon.sum(where="payment.fare > 10") == pytest.approx(8.0)
diff --git a/tests/ctable/test_object_spec.py b/tests/ctable/test_object_spec.py
new file mode 100644
index 00000000..9b6154dc
--- /dev/null
+++ b/tests/ctable/test_object_spec.py
@@ -0,0 +1,66 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+"""Tests for schema-less CTable object columns."""
+
+from dataclasses import dataclass
+
+import pytest
+
+import blosc2
+from blosc2 import CTable
+
+
+@dataclass
+class ObjectRow:
+    id: int = blosc2.field(blosc2.int32())
+    payload: object = blosc2.field(blosc2.object(nullable=True))
+
+
+def test_object_column_heterogeneous_values():
+    table = CTable(ObjectRow)
+    table.append([1, {"kind": "dict", "values": [1, 2]}])
+    table.append([2, ("tuple", 3)])
+    table.append([3, None])
+
+    assert table["payload"][:] == [{"kind": "dict", "values": [1, 2]}, ("tuple", 3), None]
+    assert table["payload"].is_varlen_scalar
+
+
+def test_object_column_persistence(tmp_path):
+    store = tmp_path / "objects.b2d"
+    table = CTable(ObjectRow, urlpath=str(store), mode="w")
+    table.extend([[1, {"x": 1}], [2, ["a", "b"]], [3, None]])
+    table.close()  # close the writer before the same path is reopened read-only
+
+    loaded = CTable.open(str(store), mode="r")
+    assert loaded["payload"][:] == [{"x": 1}, ["a", "b"], None]
+
+
+def test_object_column_to_arrow_raises():
+    pytest.importorskip("pyarrow")  # skipped when pyarrow is not installed
+    table = CTable(ObjectRow)
+    table.append([1, {"x": 1}])
+    with pytest.raises(TypeError, match="ObjectSpec columns"):
+        table.to_arrow()
+
+
+def test_object_column_rejects_none_when_not_nullable():
+    @dataclass
+    class StrictObjectRow:
+        payload: object = blosc2.field(blosc2.object())
+
+    table = CTable(StrictObjectRow)
+    with pytest.raises(TypeError, match="not nullable"):
+        table.append([None])
+
+
+def test_object_column_rejects_non_msgpack_value_on_flush():
+    table = CTable(ObjectRow)
+    table.append([1, {"not-msgpack": {1, 2, 3}}])  # append itself is expected to succeed
+    with pytest.raises(TypeError):
+        table.close()
diff --git a/tests/test_group_reduce.py b/tests/test_group_reduce.py
new file mode 100644
index 00000000..856c25ef
--- /dev/null
+++ b/tests/test_group_reduce.py
@@ -0,0 +1,75 @@
+import numpy as np
+import pytest
+
+import blosc2
+
+
+def test_group_reduce_size_and_sum_integer_keys():
+    key_arr = np.array([2, 1, 2, 1, 2], dtype=np.int16)
+    val_arr = np.array([10, 1, 30, 3, 50], dtype=np.int32)
+
+    uniq, n_rows = blosc2.group_reduce(key_arr, op="size", sort=True)
+    uniq_again, totals = blosc2.group_reduce(key_arr, val_arr, op="sum", sort=True)
+
+    assert uniq.dtype == key_arr.dtype  # group labels keep the key dtype
+    np.testing.assert_array_equal(uniq, np.array([1, 2], dtype=np.int16))
+    np.testing.assert_array_equal(n_rows, np.array([2, 3]))
+    np.testing.assert_array_equal(uniq_again, np.array([1, 2], dtype=np.int16))
+    np.testing.assert_array_equal(totals, np.array([4, 90]))
+
+
+def test_group_reduce_integer_keys_float_aggs_with_nan_values():
+    key_arr = np.array([0, 1, 0, 1, 2], dtype=np.uint16)
+    val_arr = np.array([1.0, np.nan, 3.0, np.nan, 10.0])
+
+    uniq, n_valid = blosc2.group_reduce(key_arr, val_arr, op="count", sort=True)
+    _, avgs = blosc2.group_reduce(key_arr, val_arr, op="mean", sort=True)
+    _, lows = blosc2.group_reduce(key_arr, val_arr, op="min", sort=True)
+    _, highs = blosc2.group_reduce(key_arr, val_arr, op="max", sort=True)
+
+    np.testing.assert_array_equal(uniq, np.array([0, 1, 2], dtype=np.uint16))
+    np.testing.assert_array_equal(n_valid, np.array([2, 0, 1]))  # key 1 has only NaN values
+    assert avgs[0] == 2.0
+    assert np.isnan(avgs[1])
+    assert avgs[2] == 10.0
+    assert lows[0] == 1.0
+    assert np.isnan(lows[1])
+    assert lows[2] == 10.0
+    assert highs[0] == 3.0
+    assert np.isnan(highs[1])
+    assert highs[2] == 10.0
+
+
+def test_group_reduce_arbitrary_float_keys_and_nan_key_group():
+    key_arr = np.array([0.5, np.nan, 0.5, -0.0, 0.0, np.nan])
+    val_arr = np.array([1.0, 2.0, 3.0, 10.0, 20.0, 5.0])
+
+    uniq, totals = blosc2.group_reduce(key_arr, val_arr, op="sum", sort=True, dropna=False)
+
+    assert uniq[0] == 0.0  # -0.0 and 0.0 are expected to share one group
+    assert totals[0] == 30.0
+    assert uniq[1] == 0.5
+    assert totals[1] == 4.0
+    assert np.isnan(uniq[2])  # with dropna=False the NaN keys form the last group
+    assert totals[2] == 7.0
+
+
+def test_group_reduce_dropna_default_skips_nan_keys():
+    key_arr = np.array([1.0, np.nan, 1.0])
+    val_arr = np.array([2.0, 10.0, 3.0])
+
+    uniq, totals = blosc2.group_reduce(key_arr, val_arr, op="sum", sort=True)
+
+    np.testing.assert_array_equal(uniq, np.array([1.0]))
+    np.testing.assert_array_equal(totals, np.array([5.0]))
+
+
+def test_group_reduce_rejects_bad_inputs():
+    with pytest.raises(ValueError):  # 2-D keys
+        blosc2.group_reduce(np.ones((2, 2)), op="size")
+    with pytest.raises(ValueError):  # "sum" without a values array
+        blosc2.group_reduce(np.arange(3), op="sum")
+    with pytest.raises(ValueError):  # keys and values of different length
+        blosc2.group_reduce(np.arange(3), np.arange(2), op="sum")
+    with pytest.raises(ValueError):  # unknown op name
+        blosc2.group_reduce(np.arange(3), op="bad")