From d1bc0f43fd113bfd809360b68fa318c1cb8f1c10 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 16 Mar 2026 17:48:00 +0100 Subject: [PATCH 01/34] Initial implementation for BatchArray --- CMakeLists.txt | 2 +- src/blosc2/__init__.py | 9 + src/blosc2/batch_array.py | 431 ++++++++++++++++++++++++++++++++++++++ src/blosc2/blosc2_ext.pyx | 127 +++++++++++ src/blosc2/core.py | 12 +- src/blosc2/dict_store.py | 24 ++- src/blosc2/embed_store.py | 14 +- src/blosc2/schunk.py | 6 + src/blosc2/tree_store.py | 10 +- tests/test_batch_array.py | 369 ++++++++++++++++++++++++++++++++ 10 files changed, 981 insertions(+), 23 deletions(-) create mode 100644 src/blosc2/batch_array.py create mode 100644 tests/test_batch_array.py diff --git a/CMakeLists.txt b/CMakeLists.txt index b748c794..ed72f8a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG 25197eb96d05318c939b3252a6b373ccd6ae49fe # variable-length chunks support in schunks + GIT_TAG 6bed0534d61652cb1e62a3e7be7283f333dfaaf7 # variable-length chunks support in schunks # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index e0e8f9ac..66455881 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -37,6 +37,11 @@ from .version import __array_api_version__, __version__ +_PACKAGE_DIR = str(Path(__file__).resolve().parent) +if _PACKAGE_DIR in __path__: + __path__.remove(_PACKAGE_DIR) +__path__.insert(0, _PACKAGE_DIR) + def _configure_libtcc_runtime_path(): """Best-effort configuration so miniexpr can find bundled libtcc at runtime.""" @@ -530,6 +535,7 @@ def _raise(exc): from .embed_store import EmbedStore, estore_from_cframe from .dict_store import DictStore from .tree_store import TreeStore +from .batch_array import Batch, BatchArray, batcharray_from_cframe from .vlarray import 
VLArray, vlarray_from_cframe from .c2array import c2context, C2Array, URLPath @@ -714,6 +720,8 @@ def _raise(exc): # Classes "C2Array", "CParams", + "Batch", + "BatchArray", # Enums "Codec", "DParams", @@ -936,6 +944,7 @@ def _raise(exc): "validate_expr", "var", "vecdot", + "batcharray_from_cframe", "vlarray_from_cframe", "where", "zeros", diff --git a/src/blosc2/batch_array.py b/src/blosc2/batch_array.py new file mode 100644 index 00000000..0376d398 --- /dev/null +++ b/src/blosc2/batch_array.py @@ -0,0 +1,431 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import copy +import pathlib +from collections.abc import Iterator, Sequence +from dataclasses import asdict +from typing import Any + +import blosc2 +from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb + +_BATCHARRAY_META = {"version": 1, "serializer": "msgpack", "format": "vlblocks"} + + +def _check_serialized_size(buffer: bytes) -> None: + if len(buffer) > blosc2.MAX_BUFFERSIZE: + raise ValueError(f"Serialized objects cannot be larger than {blosc2.MAX_BUFFERSIZE} bytes") + + +class Batch(Sequence[Any]): + """A lazy sequence of Python objects stored in one BatchArray chunk.""" + + def __init__(self, parent: BatchArray, nchunk: int, lazychunk: bytes) -> None: + self._parent = parent + self._nchunk = nchunk + self._lazychunk = lazychunk + self._payloads: list[bytes] | None = None + self._nbytes, self._cbytes, self._nblocks = blosc2.get_cbuffer_sizes(lazychunk) + + def _normalize_index(self, index: int) -> int: + if not isinstance(index, int): + raise TypeError("Batch indices must be integers") + if index < 0: + index += len(self) + if index < 0 or index >= len(self): + raise IndexError("Batch index out of range") + return index + + def 
_decode_payloads(self) -> list[bytes]: + if self._payloads is None: + self._payloads = self._parent._decode_payloads(self._nchunk) + return self._payloads + + def __getitem__(self, index: int | slice) -> Any | list[Any]: + payloads = self._decode_payloads() + if isinstance(index, slice): + return [msgpack_unpackb(payload) for payload in payloads[index]] + index = self._normalize_index(index) + return msgpack_unpackb(payloads[index]) + + def __len__(self) -> int: + return self._nblocks + + def __iter__(self) -> Iterator[Any]: + for i in range(len(self)): + yield self[i] + + @property + def lazychunk(self) -> bytes: + return self._lazychunk + + @property + def nbytes(self) -> int: + return self._nbytes + + @property + def cbytes(self) -> int: + return self._cbytes + + @property + def cratio(self) -> float: + return self._nbytes / self._cbytes + + def __repr__(self) -> str: + return f"Batch(len={len(self)}, nbytes={self.nbytes}, cbytes={self.cbytes})" + + +class BatchArray: + """A batched variable-length array backed by an :class:`blosc2.SChunk`.""" + + @staticmethod + def _set_typesize_one(cparams: blosc2.CParams | dict | None) -> blosc2.CParams | dict: + if cparams is None: + cparams = blosc2.CParams() + elif isinstance(cparams, blosc2.CParams): + cparams = copy.deepcopy(cparams) + else: + cparams = dict(cparams) + + if isinstance(cparams, blosc2.CParams): + cparams.typesize = 1 + else: + cparams["typesize"] = 1 + return cparams + + @staticmethod + def _coerce_storage(storage: blosc2.Storage | dict | None, kwargs: dict[str, Any]) -> blosc2.Storage: + if storage is not None: + storage_keys = set(blosc2.Storage.__annotations__) + storage_kwargs = storage_keys.intersection(kwargs) + if storage_kwargs: + unexpected = ", ".join(sorted(storage_kwargs)) + raise AttributeError( + f"Cannot pass both `storage` and other kwargs already included in Storage: {unexpected}" + ) + if isinstance(storage, blosc2.Storage): + return copy.deepcopy(storage) + return 
blosc2.Storage(**storage) + + storage_kwargs = { + name: kwargs.pop(name) for name in list(blosc2.Storage.__annotations__) if name in kwargs + } + return blosc2.Storage(**storage_kwargs) + + @staticmethod + def _validate_storage(storage: blosc2.Storage) -> None: + if storage.mmap_mode not in (None, "r"): + raise ValueError("For BatchArray containers, mmap_mode must be None or 'r'") + if storage.mmap_mode == "r" and storage.mode != "r": + raise ValueError("For BatchArray containers, mmap_mode='r' requires mode='r'") + + def _attach_schunk(self, schunk: blosc2.SChunk) -> None: + self.schunk = schunk + self.urlpath = schunk.urlpath + self.mode = schunk.mode + self.mmap_mode = getattr(schunk, "mmap_mode", None) + self._validate_tag() + + def _maybe_open_existing(self, storage: blosc2.Storage) -> bool: + urlpath = storage.urlpath + if urlpath is None or storage.mode not in ("r", "a") or not pathlib.Path(urlpath).exists(): + return False + + schunk = blosc2.blosc2_ext.open(urlpath, mode=storage.mode, offset=0, mmap_mode=storage.mmap_mode) + self._attach_schunk(schunk) + return True + + def _make_storage(self) -> blosc2.Storage: + meta = {name: self.meta[name] for name in self.meta} + return blosc2.Storage( + contiguous=self.schunk.contiguous, + urlpath=self.urlpath, + mode=self.mode, + mmap_mode=self.mmap_mode, + meta=meta, + ) + + def __init__( + self, + chunksize: int | None = None, + _from_schunk: blosc2.SChunk | None = None, + **kwargs: Any, + ) -> None: + if _from_schunk is not None: + if chunksize is not None: + raise ValueError("Cannot pass `chunksize` together with `_from_schunk`") + if kwargs: + unexpected = ", ".join(sorted(kwargs)) + raise ValueError(f"Cannot pass {unexpected} together with `_from_schunk`") + self._attach_schunk(_from_schunk) + return + + cparams = kwargs.pop("cparams", None) + dparams = kwargs.pop("dparams", None) + storage = kwargs.pop("storage", None) + storage = self._coerce_storage(storage, kwargs) + + if kwargs: + unexpected = ", 
".join(sorted(kwargs)) + raise ValueError(f"Unsupported BatchArray keyword argument(s): {unexpected}") + + self._validate_storage(storage) + cparams = self._set_typesize_one(cparams) + + if dparams is None: + dparams = blosc2.DParams() + + if self._maybe_open_existing(storage): + return + + fixed_meta = dict(storage.meta or {}) + fixed_meta["batcharray"] = dict(_BATCHARRAY_META) + storage.meta = fixed_meta + if chunksize is None: + chunksize = -1 + schunk = blosc2.SChunk( + chunksize=chunksize, data=None, cparams=cparams, dparams=dparams, storage=storage + ) + self._attach_schunk(schunk) + + def _validate_tag(self) -> None: + if "batcharray" not in self.schunk.meta: + raise ValueError("The supplied SChunk is not tagged as a BatchArray") + + def _check_writable(self) -> None: + if self.mode == "r": + raise ValueError("Cannot modify a BatchArray opened in read-only mode") + + def _normalize_index(self, index: int) -> int: + if not isinstance(index, int): + raise TypeError("BatchArray indices must be integers") + if index < 0: + index += len(self) + if index < 0 or index >= len(self): + raise IndexError("BatchArray index out of range") + return index + + def _normalize_insert_index(self, index: int) -> int: + if not isinstance(index, int): + raise TypeError("BatchArray indices must be integers") + if index < 0: + index += len(self) + if index < 0: + return 0 + if index > len(self): + return len(self) + return index + + def _slice_indices(self, index: slice) -> list[int]: + return list(range(*index.indices(len(self)))) + + def _copy_meta(self) -> dict[str, Any]: + return {name: self.meta[name] for name in self.meta} + + def _normalize_batch(self, value: object) -> list[Any]: + if isinstance(value, (str, bytes, bytearray, memoryview)): + raise TypeError("BatchArray entries must be sequences of Python objects") + if not isinstance(value, Sequence): + raise TypeError("BatchArray entries must be sequences of Python objects") + values = list(value) + if len(values) == 0: + 
raise ValueError("BatchArray entries cannot be empty") + return values + + def _serialize_batch(self, value: object) -> list[bytes]: + payloads = [] + for item in self._normalize_batch(value): + payload = msgpack_packb(item) + _check_serialized_size(payload) + payloads.append(payload) + return payloads + + def _vl_cparams_kwargs(self) -> dict[str, Any]: + return asdict(self.schunk.cparams) + + def _vl_dparams_kwargs(self) -> dict[str, Any]: + return asdict(self.schunk.dparams) + + def _compress_batch(self, payloads: list[bytes]) -> bytes: + return blosc2.blosc2_ext.vlcompress(payloads, **self._vl_cparams_kwargs()) + + def _decode_payloads(self, nchunk: int) -> list[bytes]: + return blosc2.blosc2_ext.vldecompress(self.schunk.get_chunk(nchunk), **self._vl_dparams_kwargs()) + + def _get_batch(self, index: int) -> Batch: + return Batch(self, index, self.schunk.get_lazychunk(index)) + + def append(self, value: object) -> int: + """Append one batch and return the new number of entries.""" + self._check_writable() + chunk = self._compress_batch(self._serialize_batch(value)) + return self.schunk.append_chunk(chunk) + + def insert(self, index: int, value: object) -> int: + """Insert one batch at ``index`` and return the new number of entries.""" + self._check_writable() + index = self._normalize_insert_index(index) + chunk = self._compress_batch(self._serialize_batch(value)) + return self.schunk.insert_chunk(index, chunk) + + def delete(self, index: int | slice) -> int: + """Delete the batch at ``index`` and return the new number of entries.""" + self._check_writable() + if isinstance(index, slice): + for idx in reversed(self._slice_indices(index)): + self.schunk.delete_chunk(idx) + return len(self) + index = self._normalize_index(index) + return self.schunk.delete_chunk(index) + + def pop(self, index: int = -1) -> list[Any]: + """Remove and return the batch at ``index``.""" + self._check_writable() + if isinstance(index, slice): + raise NotImplementedError("Slicing is not 
supported for BatchArray") + index = self._normalize_index(index) + value = self[index][:] + self.schunk.delete_chunk(index) + return value + + def extend(self, values: object) -> None: + """Append all batches from an iterable.""" + self._check_writable() + for value in values: + chunk = self._compress_batch(self._serialize_batch(value)) + self.schunk.append_chunk(chunk) + + def clear(self) -> None: + """Remove all entries from the container.""" + self._check_writable() + storage = self._make_storage() + if storage.urlpath is not None: + blosc2.remove_urlpath(storage.urlpath) + schunk = blosc2.SChunk( + chunksize=-1, + data=None, + cparams=copy.deepcopy(self.cparams), + dparams=copy.deepcopy(self.dparams), + storage=storage, + ) + self._attach_schunk(schunk) + + def __getitem__(self, index: int | slice) -> Batch | list[Batch]: + if isinstance(index, slice): + return [self[i] for i in self._slice_indices(index)] + index = self._normalize_index(index) + return self._get_batch(index) + + def __setitem__(self, index: int | slice, value: object) -> None: + if isinstance(index, slice): + self._check_writable() + indices = self._slice_indices(index) + values = list(value) + step = 1 if index.step is None else index.step + if step == 1: + start = self._normalize_insert_index(0 if index.start is None else index.start) + for idx in reversed(indices): + self.schunk.delete_chunk(idx) + for offset, item in enumerate(values): + chunk = self._compress_batch(self._serialize_batch(item)) + self.schunk.insert_chunk(start + offset, chunk) + return + if len(values) != len(indices): + raise ValueError( + f"attempt to assign sequence of size {len(values)} to extended slice of size {len(indices)}" + ) + for idx, item in zip(indices, values, strict=True): + chunk = self._compress_batch(self._serialize_batch(item)) + self.schunk.update_chunk(idx, chunk) + return + self._check_writable() + index = self._normalize_index(index) + chunk = self._compress_batch(self._serialize_batch(value)) + 
self.schunk.update_chunk(index, chunk) + + def __delitem__(self, index: int | slice) -> None: + self.delete(index) + + def __len__(self) -> int: + return self.schunk.nchunks + + def __iter__(self) -> Iterator[Batch]: + for i in range(len(self)): + yield self[i] + + @property + def meta(self): + return self.schunk.meta + + @property + def vlmeta(self): + return self.schunk.vlmeta + + @property + def cparams(self): + return self.schunk.cparams + + @property + def dparams(self): + return self.schunk.dparams + + @property + def chunksize(self) -> int: + return self.schunk.chunksize + + @property + def nbytes(self) -> int: + return self.schunk.nbytes + + @property + def cbytes(self) -> int: + return self.schunk.cbytes + + @property + def cratio(self) -> float: + return self.schunk.cratio + + def to_cframe(self) -> bytes: + return self.schunk.to_cframe() + + def copy(self, **kwargs: Any) -> BatchArray: + """Create a copy of the container with optional constructor overrides.""" + if "meta" in kwargs: + raise ValueError("meta should not be passed to copy") + + kwargs["cparams"] = kwargs.get("cparams", copy.deepcopy(self.cparams)) + kwargs["dparams"] = kwargs.get("dparams", copy.deepcopy(self.dparams)) + kwargs["chunksize"] = kwargs.get("chunksize", -1) + + if "storage" not in kwargs: + kwargs["meta"] = self._copy_meta() + kwargs["contiguous"] = kwargs.get("contiguous", self.schunk.contiguous) + if "urlpath" in kwargs and "mode" not in kwargs: + kwargs["mode"] = "w" + + out = BatchArray(**kwargs) + out.extend(self) + return out + + def __enter__(self) -> BatchArray: + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + return False + + def __repr__(self) -> str: + return f"BatchArray(len={len(self)}, urlpath={self.urlpath!r})" + + +def batcharray_from_cframe(cframe: bytes, copy: bool = True) -> BatchArray: + """Deserialize a CFrame buffer into a :class:`BatchArray`.""" + + schunk = blosc2.schunk_from_cframe(cframe, copy=copy) + return 
BatchArray(_from_schunk=schunk) diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 604be0bd..edd41837 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -279,9 +279,17 @@ cdef extern from "blosc2.h": blosc2_context * context, const void * src, int32_t srcsize, void * dest, int32_t destsize) nogil + int blosc2_vlcompress_ctx( + blosc2_context * context, const void * const * srcs, const int32_t * srcsizes, + int32_t nblocks, void * dest, int32_t destsize) nogil + int blosc2_decompress_ctx(blosc2_context * context, const void * src, int32_t srcsize, void * dest, int32_t destsize) nogil + int blosc2_vldecompress_ctx(blosc2_context* context, const void* src, + int32_t srcsize, void** dests, + int32_t* destsizes, int32_t maxblocks) + int blosc2_getitem_ctx(blosc2_context* context, const void* src, int32_t srcsize, int start, int nitems, void* dest, int32_t destsize) nogil @@ -1095,6 +1103,7 @@ def compress2(src, **kwargs): return dest[:size] cdef create_dparams_from_kwargs(blosc2_dparams *dparams, kwargs, blosc2_cparams* cparams=NULL): + memcpy(dparams, &BLOSC2_DPARAMS_DEFAULTS, sizeof(BLOSC2_DPARAMS_DEFAULTS)) dparams.nthreads = kwargs.get('nthreads', blosc2.nthreads) dparams.schunk = NULL dparams.postfilter = NULL @@ -1154,6 +1163,124 @@ def decompress2(src, dst=None, **kwargs): raise ValueError("Error while decompressing, check the src data and/or the dparams") +def vlcompress(srcs, **kwargs): + cdef blosc2_cparams cparams + create_cparams_from_kwargs(&cparams, kwargs) + + cdef Py_ssize_t nblocks = len(srcs) + if nblocks <= 0: + raise ValueError("At least one block is required") + + cdef blosc2_context *cctx = NULL + cdef Py_buffer *buffers = calloc(nblocks, sizeof(Py_buffer)) + cdef const void **src_ptrs = malloc(nblocks * sizeof(void *)) + cdef int32_t *srcsizes = malloc(nblocks * sizeof(int32_t)) + cdef Py_ssize_t acquired = 0 + cdef Py_ssize_t i + cdef int64_t total_nbytes = 0 + cdef int32_t len_dest + cdef int size + 
cdef Py_ssize_t release_i + cdef void *_dest + if buffers == NULL or src_ptrs == NULL or srcsizes == NULL: + free(buffers) + free(src_ptrs) + free(srcsizes) + raise MemoryError() + + try: + for i in range(nblocks): + PyObject_GetBuffer(srcs[i], &buffers[i], PyBUF_SIMPLE) + acquired += 1 + if buffers[i].len <= 0: + raise ValueError("Each VL block must have at least one byte") + src_ptrs[i] = buffers[i].buf + srcsizes[i] = buffers[i].len + total_nbytes += buffers[i].len + + # VL blocks can carry enough per-block framing that the simple + # total_nbytes + global_overhead estimate is too small for many tiny + # buffers. Budget one max-overhead chunk per block as a conservative + # upper bound for the temporary destination. + len_dest = (total_nbytes + BLOSC2_MAX_OVERHEAD * (nblocks + 1) + 64) + dest = PyBytes_FromStringAndSize(NULL, len_dest) + if dest is None: + raise MemoryError() + _dest = dest + cctx = blosc2_create_cctx(cparams) + if cctx == NULL: + raise RuntimeError("Could not create the compression context") + if RELEASEGIL: + with nogil: + size = blosc2_vlcompress_ctx(cctx, src_ptrs, srcsizes, nblocks, _dest, len_dest) + else: + size = blosc2_vlcompress_ctx(cctx, src_ptrs, srcsizes, nblocks, _dest, len_dest) + finally: + if cctx != NULL: + blosc2_free_ctx(cctx) + for release_i in range(acquired): + PyBuffer_Release(&buffers[release_i]) + free(buffers) + free(src_ptrs) + free(srcsizes) + + if size < 0: + raise RuntimeError("Could not compress the data") + elif size == 0: + del dest + raise RuntimeError("The result could not fit ") + return dest[:size] + + +def vldecompress(src, **kwargs): + cdef blosc2_dparams dparams + create_dparams_from_kwargs(&dparams, kwargs) + + cdef blosc2_context *dctx = blosc2_create_dctx(dparams) + if dctx == NULL: + raise RuntimeError("Could not create decompression context") + + cdef const uint8_t[:] typed_view_src + mem_view_src = memoryview(src) + typed_view_src = mem_view_src.cast('B') + _check_comp_length('src', 
typed_view_src.nbytes) + cdef int32_t nbytes + cdef int32_t cbytes + cdef int32_t nblocks + blosc2_cbuffer_sizes(&typed_view_src[0], &nbytes, &cbytes, &nblocks) + if nblocks <= 0: + blosc2_free_ctx(dctx) + raise ValueError("Chunk does not contain VL blocks") + + cdef void **dests = calloc(nblocks, sizeof(void *)) + cdef int32_t *destsizes = malloc(nblocks * sizeof(int32_t)) + cdef int32_t rc + cdef int32_t i + cdef list out = [] + if dests == NULL or destsizes == NULL: + blosc2_free_ctx(dctx) + free(dests) + free(destsizes) + raise MemoryError() + + try: + rc = blosc2_vldecompress_ctx(dctx, &typed_view_src[0], cbytes, dests, destsizes, nblocks) + if rc < 0: + raise RuntimeError("Could not decompress the data") + for i in range(rc): + out.append(PyBytes_FromStringAndSize(dests[i], destsizes[i])) + free(dests[i]) + dests[i] = NULL + return out + finally: + for i in range(nblocks): + if dests[i] != NULL: + free(dests[i]) + free(dests) + free(destsizes) + blosc2_free_ctx(dctx) + + cdef create_storage(blosc2_storage *storage, kwargs): contiguous = kwargs.get('contiguous', blosc2.storage_dflts['contiguous']) storage.contiguous = contiguous diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 5526a7f2..fc8749ad 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1918,9 +1918,9 @@ def ndarray_from_cframe(cframe: bytes | str, copy: bool = False) -> blosc2.NDArr def from_cframe( cframe: bytes | str, copy: bool = True -) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk | blosc2.VLArray: - """Create a :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk ` - or :ref:`VLArray ` instance +) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk | blosc2.BatchArray | blosc2.VLArray: + """Create a :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk `, + :ref:`BatchArray ` or :ref:`VLArray ` instance from a contiguous frame buffer. 
Parameters @@ -1937,8 +1937,8 @@ def from_cframe( Returns ------- - out: :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk ` - or :ref:`VLArray ` + out: :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk `, + :ref:`BatchArray ` or :ref:`VLArray ` A new instance of the appropriate type containing the data passed. See Also @@ -1952,6 +1952,8 @@ def from_cframe( # Check the metalayer to determine the type if "b2embed" in schunk.meta: return blosc2.estore_from_cframe(cframe, copy=copy) + if "batcharray" in schunk.meta: + return blosc2.batcharray_from_cframe(cframe, copy=copy) if "vlarray" in schunk.meta: return blosc2.vlarray_from_cframe(cframe, copy=copy) if "b2nd" in schunk.meta: diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 1cb6dd3c..65bab76e 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -249,24 +249,26 @@ def estore(self) -> EmbedStore: return self._estore @staticmethod - def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray) -> int: - if isinstance(value, blosc2.VLArray): + def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray) -> int: + if isinstance(value, (blosc2.VLArray, blosc2.BatchArray)): return value.schunk.nbytes return value.nbytes @staticmethod - def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray) -> bool: - return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray)) and bool( + def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray) -> bool: + return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray, blosc2.BatchArray)) and bool( getattr(value, "urlpath", None) ) @staticmethod - def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray) -> str: + def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray) -> str: if isinstance(value, blosc2.NDArray): return ".b2nd" return ".b2f" - def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) -> None: 
+ def __setitem__( + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray + ) -> None: """Add a node to the DictStore.""" if isinstance(value, np.ndarray): value = blosc2.asarray(value, cparams=self.cparams, dparams=self.dparams) @@ -292,7 +294,7 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) - if hasattr(value, "save"): value.save(urlpath=dest_path) else: - # SChunk and VLArray can both be persisted via their cframe. + # SChunk, VLArray and BatchArray can all be persisted via their cframe. with open(dest_path, "wb") as f: f.write(value.to_cframe()) else: @@ -310,7 +312,9 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) - value = blosc2.from_cframe(value.to_cframe()) self._estore[key] = value - def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | C2Array: + def __getitem__( + self, key: str + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray | C2Array: """Retrieve a node from the DictStore.""" # Check map_tree first if key in self.map_tree: @@ -340,7 +344,9 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | C2 # Fall back to EmbedStore return self._estore[key] - def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | blosc2.VLArray | C2Array | Any: + def get( + self, key: str, default: Any = None + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray | C2Array | Any: """Retrieve a node, or default if not found.""" try: return self[key] diff --git a/src/blosc2/embed_store.py b/src/blosc2/embed_store.py index b03d892e..e1c8b0d2 100644 --- a/src/blosc2/embed_store.py +++ b/src/blosc2/embed_store.py @@ -173,7 +173,9 @@ def _ensure_capacity(self, needed_bytes: int) -> None: new_size = max(required_size, int(self._store.shape[0] * 1.5)) self._store.resize((new_size,)) - def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) -> None: + def __setitem__( 
+ self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray + ) -> None: """Add a node to the embed store.""" if self.mode == "r": raise ValueError("Cannot set items in read-only mode.") @@ -196,7 +198,7 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) - self._embed_map[key] = {"offset": offset, "length": data_len} self._save_metadata() - def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray: + def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray: """Retrieve a node from the embed store.""" if key not in self._embed_map: raise KeyError(f"Key '{key}' not found in the embed store.") @@ -212,7 +214,9 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray: # Use from_cframe so we can deserialize either an NDArray or an SChunk return blosc2.from_cframe(serialized_data, copy=True) - def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | blosc2.VLArray | Any: + def get( + self, key: str, default: Any = None + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray | Any: """Retrieve a node, or default if not found.""" return self[key] if key in self._embed_map else default @@ -239,12 +243,12 @@ def keys(self) -> KeysView[str]: """Return all keys.""" return self._embed_map.keys() - def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray]: + def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray]: """Iterate over all values.""" for key in self._embed_map: yield self[key] - def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray]]: + def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray]]: """Iterate over (key, value) pairs.""" for key in self._embed_map: yield key, self[key] diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 5421ae41..1acfc043 100644 --- 
a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -1621,6 +1621,11 @@ def _process_opened_object(res): return VLArray(_from_schunk=getattr(res, "schunk", res)) + if "batcharray" in meta: + from blosc2.batch_array import BatchArray + + return BatchArray(_from_schunk=getattr(res, "schunk", res)) + if isinstance(res, blosc2.NDArray) and "LazyArray" in res.schunk.meta: return blosc2._open_lazyarray(res) else: @@ -1632,6 +1637,7 @@ def open( ) -> ( blosc2.SChunk | blosc2.NDArray + | blosc2.BatchArray | blosc2.VLArray | blosc2.C2Array | blosc2.LazyArray diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index a96c11a4..82b8b3de 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -226,7 +226,9 @@ def _validate_key(self, key: str) -> str: return key - def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) -> None: + def __setitem__( + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray + ) -> None: """Add a node with hierarchical key validation. Parameters @@ -268,7 +270,9 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) - full_key = self._translate_key_to_full(key) super().__setitem__(full_key, value) - def __getitem__(self, key: str) -> NDArray | C2Array | SChunk | blosc2.VLArray | TreeStore: + def __getitem__( + self, key: str + ) -> NDArray | C2Array | SChunk | blosc2.VLArray | blosc2.BatchArray | TreeStore: """Retrieve a node or subtree view. If the key points to a subtree (intermediate path with children), @@ -282,7 +286,7 @@ def __getitem__(self, key: str) -> NDArray | C2Array | SChunk | blosc2.VLArray | Returns ------- - out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or TreeStore + out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or blosc2.BatchArray or TreeStore The stored array/chunk if key is a leaf node, or a TreeStore subtree view if key is an intermediate path with children. 
diff --git a/tests/test_batch_array.py b/tests/test_batch_array.py new file mode 100644 index 00000000..9ddd498e --- /dev/null +++ b/tests/test_batch_array.py @@ -0,0 +1,369 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import pytest + +import blosc2 +from blosc2._msgpack_utils import msgpack_packb + +BATCHES = [ + [b"bytes\x00payload", "plain text", 42], + [{"nested": [1, 2]}, None], + [(1, 2, "three"), 3.5, True, {"rows": [[], ["nested"]]}], +] + + +def _make_payload(seed, size): + base = bytes((seed + i) % 251 for i in range(251)) + reps = size // len(base) + 1 + return (base * reps)[:size] + + +def _storage(contiguous, urlpath, mode="w"): + return blosc2.Storage(contiguous=contiguous, urlpath=urlpath, mode=mode) + + +@pytest.mark.parametrize( + ("contiguous", "urlpath"), + [ + (False, None), + (True, None), + (True, "test_batcharray.b2frame"), + (False, "test_batcharray_s.b2frame"), + ], +) +def test_batcharray_roundtrip(contiguous, urlpath): + blosc2.remove_urlpath(urlpath) + + barray = blosc2.BatchArray(storage=_storage(contiguous, urlpath)) + assert barray.meta["batcharray"]["serializer"] == "msgpack" + + for i, batch in enumerate(BATCHES, start=1): + assert barray.append(batch) == i + + assert len(barray) == len(BATCHES) + assert [batch[:] for batch in barray] == BATCHES + + batch0 = barray[0] + assert isinstance(batch0, blosc2.Batch) + assert len(batch0) == len(BATCHES[0]) + assert batch0[1] == BATCHES[0][1] + assert batch0[:] == BATCHES[0] + assert isinstance(batch0.lazychunk, bytes) + assert batch0.nbytes > 0 + assert batch0.cbytes > 0 + assert batch0.cratio > 0 + + expected = list(BATCHES) + expected[1] = ["updated", {"tuple": (7, 8)}] + expected[-1] = ["tiny"] + barray[1] = expected[1] + barray[-1] = expected[-1] + assert 
barray.insert(0, ["head", 0]) == len(expected) + 1 + expected.insert(0, ["head", 0]) + assert barray.insert(-1, ["between", {"k": 5}]) == len(expected) + 1 + expected.insert(-1, ["between", {"k": 5}]) + assert barray.insert(999, ["tail"]) == len(expected) + 1 + expected.insert(999, ["tail"]) + assert barray.delete(2) == len(expected) - 1 + del expected[2] + del barray[-2] + del expected[-2] + assert [batch[:] for batch in barray] == expected + + if urlpath is not None: + reopened = blosc2.open(urlpath, mode="r") + assert isinstance(reopened, blosc2.BatchArray) + assert [batch[:] for batch in reopened] == expected + with pytest.raises(ValueError): + reopened.append(["nope"]) + with pytest.raises(ValueError): + reopened[0] = ["nope"] + with pytest.raises(ValueError): + reopened.insert(0, ["nope"]) + with pytest.raises(ValueError): + reopened.delete(0) + with pytest.raises(ValueError): + del reopened[0] + with pytest.raises(ValueError): + reopened.extend([["nope"]]) + with pytest.raises(ValueError): + reopened.pop() + with pytest.raises(ValueError): + reopened.clear() + + reopened_rw = blosc2.open(urlpath, mode="a") + reopened_rw[0] = ["changed"] + expected[0] = ["changed"] + assert [batch[:] for batch in reopened_rw] == expected + + if contiguous: + reopened_mmap = blosc2.open(urlpath, mode="r", mmap_mode="r") + assert isinstance(reopened_mmap, blosc2.BatchArray) + assert [batch[:] for batch in reopened_mmap] == expected + + blosc2.remove_urlpath(urlpath) + + +def test_batcharray_from_cframe(): + barray = blosc2.BatchArray() + barray.extend(BATCHES) + barray.insert(1, ["inserted", True]) + del barray[3] + expected = list(BATCHES) + expected.insert(1, ["inserted", True]) + del expected[3] + + restored = blosc2.from_cframe(barray.to_cframe()) + assert isinstance(restored, blosc2.BatchArray) + assert [batch[:] for batch in restored] == expected + + restored2 = blosc2.batcharray_from_cframe(barray.to_cframe()) + assert isinstance(restored2, blosc2.BatchArray) + assert 
[batch[:] for batch in restored2] == expected + + +def test_vlcompress_small_blocks_roundtrip(): + values = [ + {"value": None}, + {"value": []}, + {"value": []}, + {"value": ["en:salt"]}, + {"value": []}, + {"value": ["en:sugar", "en:flour"]}, + {"value": None}, + {"value": []}, + {"value": ["en:water", "en:yeast", "en:oil"]}, + {"value": []}, + {"value": []}, + {"value": ["en:acid", "en:color", "en:preservative", "en:spice"]}, + {"value": None}, + {"value": []}, + {"value": ["en:a", "en:b", "en:c", "en:d", "en:e", "en:f"]}, + {"value": []}, + {"value": []}, + {"value": None}, + {"value": ["en:x"]}, + {"value": []}, + ] + payloads = [msgpack_packb(value) for value in values] + + chunk = blosc2.blosc2_ext.vlcompress( + payloads, + codec=blosc2.Codec.ZSTD, + clevel=5, + typesize=1, + nthreads=1, + ) + out = blosc2.blosc2_ext.vldecompress(chunk, nthreads=1) + + assert out == payloads + + +def test_batcharray_constructor_kwargs(): + urlpath = "test_batcharray_kwargs.b2frame" + blosc2.remove_urlpath(urlpath) + + barray = blosc2.BatchArray(urlpath=urlpath, mode="w", contiguous=True) + barray.extend(BATCHES) + + reopened = blosc2.BatchArray(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") + assert [batch[:] for batch in reopened] == BATCHES + + blosc2.remove_urlpath(urlpath) + + +@pytest.mark.parametrize( + ("contiguous", "urlpath"), + [ + (False, None), + (True, None), + (True, "test_batcharray_list_ops.b2frame"), + (False, "test_batcharray_list_ops_s.b2frame"), + ], +) +def test_batcharray_list_like_ops(contiguous, urlpath): + blosc2.remove_urlpath(urlpath) + + barray = blosc2.BatchArray(storage=_storage(contiguous, urlpath)) + barray.extend([[1, 2], [3], [4, 5, 6]]) + assert [batch[:] for batch in barray] == [[1, 2], [3], [4, 5, 6]] + assert barray.pop() == [4, 5, 6] + assert barray.pop(0) == [1, 2] + assert [batch[:] for batch in barray] == [[3]] + + barray.clear() + assert len(barray) == 0 + assert [batch[:] for batch in barray] == [] + + 
barray.extend([["a"], ["b", "c"]]) + assert [batch[:] for batch in barray] == [["a"], ["b", "c"]] + + if urlpath is not None: + reopened = blosc2.open(urlpath, mode="r") + assert [batch[:] for batch in reopened] == [["a"], ["b", "c"]] + + blosc2.remove_urlpath(urlpath) + + +@pytest.mark.parametrize( + ("contiguous", "urlpath"), + [ + (False, None), + (True, None), + (True, "test_batcharray_slices.b2frame"), + (False, "test_batcharray_slices_s.b2frame"), + ], +) +def test_batcharray_slices(contiguous, urlpath): + blosc2.remove_urlpath(urlpath) + + expected = [[i, i + 100] for i in range(8)] + barray = blosc2.BatchArray(storage=_storage(contiguous, urlpath)) + barray.extend(expected) + + assert [batch[:] for batch in barray[1:6:2]] == expected[1:6:2] + assert [batch[:] for batch in barray[::-2]] == expected[::-2] + + barray[2:5] = [["a"], ["b", "c"]] + expected[2:5] = [["a"], ["b", "c"]] + assert [batch[:] for batch in barray] == expected + + barray[1:6:2] = [[100], [101], [102]] + expected[1:6:2] = [[100], [101], [102]] + assert [batch[:] for batch in barray] == expected + + del barray[::3] + del expected[::3] + assert [batch[:] for batch in barray] == expected + + if urlpath is not None: + reopened = blosc2.open(urlpath, mode="r") + assert [batch[:] for batch in reopened[::2]] == expected[::2] + with pytest.raises(ValueError): + reopened[1:3] = [[9]] + with pytest.raises(ValueError): + del reopened[::2] + + blosc2.remove_urlpath(urlpath) + + +def test_batcharray_slice_errors(): + barray = blosc2.BatchArray() + barray.extend([[0], [1], [2], [3]]) + + with pytest.raises(ValueError, match="extended slice"): + barray[::2] = [[9]] + with pytest.raises(TypeError): + barray[1:2] = 3 + with pytest.raises(ValueError): + _ = barray[::0] + + +def test_batcharray_copy(): + urlpath = "test_batcharray_copy.b2frame" + copy_path = "test_batcharray_copy_out.b2frame" + blosc2.remove_urlpath(urlpath) + blosc2.remove_urlpath(copy_path) + + original = blosc2.BatchArray(urlpath=urlpath, 
mode="w", contiguous=True) + original.extend(BATCHES) + original.insert(1, ["copy", True]) + + copied = original.copy( + urlpath=copy_path, contiguous=False, cparams={"codec": blosc2.Codec.LZ4, "clevel": 5} + ) + assert [batch[:] for batch in copied] == [batch[:] for batch in original] + assert copied.urlpath == copy_path + assert copied.schunk.contiguous is False + assert copied.cparams.codec == blosc2.Codec.LZ4 + assert copied.cparams.clevel == 5 + + inmem = original.copy() + assert [batch[:] for batch in inmem] == [batch[:] for batch in original] + assert inmem.urlpath is None + + with pytest.raises(ValueError, match="meta should not be passed to copy"): + original.copy(meta={}) + + blosc2.remove_urlpath(urlpath) + blosc2.remove_urlpath(copy_path) + + +@pytest.mark.parametrize(("contiguous", "nthreads"), [(False, 2), (True, 4)]) +def test_batcharray_multithreaded_inner_vl(contiguous, nthreads): + batches = [] + for batch_id in range(24): + batch = [] + for obj_id, size in enumerate( + (13, 1024 + batch_id * 17, 70_000 + batch_id * 13, 250_000 + batch_id * 101) + ): + batch.append( + { + "batch": batch_id, + "obj": obj_id, + "size": size, + "payload": _make_payload(batch_id + obj_id, size), + } + ) + batches.append(batch) + + barray = blosc2.BatchArray( + storage=blosc2.Storage(contiguous=contiguous), + cparams=blosc2.CParams(typesize=1, nthreads=nthreads, codec=blosc2.Codec.ZSTD, clevel=5), + dparams=blosc2.DParams(nthreads=nthreads), + ) + barray.extend(batches) + + assert [batch[:] for batch in barray] == batches + assert [barray[i][:] for i in range(len(barray))] == batches + + +def test_batcharray_validation_errors(): + barray = blosc2.BatchArray() + + with pytest.raises(TypeError): + barray.append("value") + with pytest.raises(ValueError): + barray.append([]) + with pytest.raises(TypeError): + barray.insert("0", ["bad"]) + with pytest.raises(IndexError): + barray.delete(3) + with pytest.raises(IndexError): + blosc2.BatchArray().pop() + barray.extend([[1]]) 
+ with pytest.raises(NotImplementedError): + barray.pop(slice(0, 1)) + + +def test_batcharray_in_embed_store(): + estore = blosc2.EmbedStore() + barray = blosc2.BatchArray() + barray.extend(BATCHES) + + estore["/batch"] = barray + restored = estore["/batch"] + assert isinstance(restored, blosc2.BatchArray) + assert [batch[:] for batch in restored] == BATCHES + + +def test_batcharray_in_dict_store(): + path = "test_batcharray_store.b2z" + blosc2.remove_urlpath(path) + + with blosc2.DictStore(path, mode="w", threshold=1) as dstore: + barray = blosc2.BatchArray() + barray.extend(BATCHES) + dstore["/batch"] = barray + + with blosc2.DictStore(path, mode="r") as dstore: + restored = dstore["/batch"] + assert isinstance(restored, blosc2.BatchArray) + assert [batch[:] for batch in restored] == BATCHES + + blosc2.remove_urlpath(path) From 37837d9f611c4c9d97669225e25296e21428dcb2 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 17 Mar 2026 07:02:33 +0100 Subject: [PATCH 02/34] Add .info to BatchArray and VLArray; fancier .info for others too --- src/blosc2/batch_array.py | 46 +++++++++++++++++++++++++- src/blosc2/c2array.py | 6 ++-- src/blosc2/info.py | 16 +++++++++ src/blosc2/ndarray.py | 6 ++-- src/blosc2/schunk.py | 6 ++-- src/blosc2/vlarray.py | 62 ++++++++++++++++++++++++++++++++++- tests/ndarray/test_ndarray.py | 12 +++++++ tests/test_batch_array.py | 28 ++++++++++++++++ tests/test_schunk.py | 13 ++++++++ tests/test_vlarray.py | 27 +++++++++++++++ 10 files changed, 211 insertions(+), 11 deletions(-) diff --git a/src/blosc2/batch_array.py b/src/blosc2/batch_array.py index 0376d398..5dc8cb52 100644 --- a/src/blosc2/batch_array.py +++ b/src/blosc2/batch_array.py @@ -15,6 +15,7 @@ import blosc2 from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb +from blosc2.info import InfoReporter, format_nbytes_info _BATCHARRAY_META = {"version": 1, "serializer": "msgpack", "format": "vlblocks"} @@ -128,7 +129,6 @@ def _validate_storage(storage: blosc2.Storage) -> 
None: def _attach_schunk(self, schunk: blosc2.SChunk) -> None: self.schunk = schunk - self.urlpath = schunk.urlpath self.mode = schunk.mode self.mmap_mode = getattr(schunk, "mmap_mode", None) self._validate_tag() @@ -262,6 +262,13 @@ def _decode_payloads(self, nchunk: int) -> list[bytes]: def _get_batch(self, index: int) -> Batch: return Batch(self, index, self.schunk.get_lazychunk(index)) + def _batch_lengths(self) -> list[int]: + lengths = [] + for i in range(len(self)): + _, _, nblocks = blosc2.get_cbuffer_sizes(self.schunk.get_lazychunk(i)) + lengths.append(nblocks) + return lengths + def append(self, value: object) -> int: """Append one batch and return the new number of entries.""" self._check_writable() @@ -380,6 +387,10 @@ def dparams(self): def chunksize(self) -> int: return self.schunk.chunksize + @property + def typesize(self) -> int: + return self.schunk.typesize + @property def nbytes(self) -> int: return self.schunk.nbytes @@ -392,6 +403,39 @@ def cbytes(self) -> int: def cratio(self) -> float: return self.schunk.cratio + @property + def urlpath(self) -> str | None: + return self.schunk.urlpath + + @property + def contiguous(self) -> bool: + return self.schunk.contiguous + + @property + def info(self) -> InfoReporter: + """Print information about this BatchArray.""" + return InfoReporter(self) + + @property + def info_items(self) -> list: + """A list of tuples with summary information about this BatchArray.""" + batch_lengths = self._batch_lengths() + nitems = sum(batch_lengths) + avg_batch_len = nitems / len(batch_lengths) if batch_lengths else 0.0 + return [ + ("type", f"{self.__class__.__name__}"), + ("nbatches", len(self)), + ("nitems", nitems), + ("batch_len_min", min(batch_lengths) if batch_lengths else 0), + ("batch_len_max", max(batch_lengths) if batch_lengths else 0), + ("batch_len_avg", f"{avg_batch_len:.2f}"), + ("nbytes", format_nbytes_info(self.nbytes)), + ("cbytes", format_nbytes_info(self.cbytes)), + ("cratio", f"{self.cratio:.2f}"), + 
("cparams", self.cparams), + ("dparams", self.dparams), + ] + def to_cframe(self) -> bytes: return self.schunk.to_cframe() diff --git a/src/blosc2/c2array.py b/src/blosc2/c2array.py index e8556ba4..11f7f6cb 100644 --- a/src/blosc2/c2array.py +++ b/src/blosc2/c2array.py @@ -18,7 +18,7 @@ import requests import blosc2 -from blosc2.info import InfoReporter +from blosc2.info import InfoReporter, format_nbytes_info _subscriber_data = { "urlbase": os.environ.get("BLOSC_C2URLBASE"), @@ -424,8 +424,8 @@ def info_items(self) -> list: items += [("chunks", self.chunks)] items += [("blocks", self.blocks)] items += [("dtype", self.dtype)] - items += [("nbytes", self.nbytes)] - items += [("cbytes", self.cbytes)] + items += [("nbytes", format_nbytes_info(self.nbytes))] + items += [("cbytes", format_nbytes_info(self.cbytes))] items += [("cratio", f"{self.cratio:.2f}")] items += [("cparams", self.cparams)] # items += [("dparams", self.dparams)] diff --git a/src/blosc2/info.py b/src/blosc2/info.py index 4ac629da..ef1e3011 100644 --- a/src/blosc2/info.py +++ b/src/blosc2/info.py @@ -10,6 +10,22 @@ from textwrap import TextWrapper +def format_nbytes_human(nbytes: int) -> str: + units = ("B", "KiB", "MiB", "GiB", "TiB", "PiB") + value = float(nbytes) + for unit in units: + if value < 1024.0 or unit == units[-1]: + if unit == "B": + return f"{nbytes} B" + return f"{value:.2f} {unit}" + value /= 1024.0 + return None + + +def format_nbytes_info(nbytes: int) -> str: + return f"{nbytes} ({format_nbytes_human(nbytes)})" + + def info_text_report_(items: list) -> str: with io.StringIO() as buf: print(items, file=buf) diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index bc396622..4c35cef6 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -29,7 +29,7 @@ import blosc2 from blosc2 import SpecialValue, blosc2_ext, compute_chunks_blocks -from blosc2.info import InfoReporter +from blosc2.info import InfoReporter, format_nbytes_info from blosc2.schunk import SChunk from 
.linalg import matmul @@ -3838,8 +3838,8 @@ def info_items(self) -> list: items += [("chunks", self.chunks)] items += [("blocks", self.blocks)] items += [("dtype", self.dtype)] - items += [("nbytes", self.nbytes)] - items += [("cbytes", self.cbytes)] + items += [("nbytes", format_nbytes_info(self.nbytes))] + items += [("cbytes", format_nbytes_info(self.cbytes))] items += [("cratio", f"{self.cratio:.2f}")] items += [("cparams", self.cparams)] items += [("dparams", self.dparams)] diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 1acfc043..a422928a 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -20,7 +20,7 @@ import blosc2 from blosc2 import SpecialValue, blosc2_ext from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb -from blosc2.info import InfoReporter +from blosc2.info import InfoReporter, format_nbytes_info class vlmeta(MutableMapping, blosc2_ext.vlmeta): @@ -491,8 +491,8 @@ def info_items(self) -> list: items += [("chunksize", self.chunksize)] items += [("blocksize", self.blocksize)] items += [("typesize", self.typesize)] - items += [("nbytes", self.nbytes)] - items += [("cbytes", self.cbytes)] + items += [("nbytes", format_nbytes_info(self.nbytes))] + items += [("cbytes", format_nbytes_info(self.cbytes))] items += [("cratio", f"{self.cratio:.2f}")] items += [("cparams", self.cparams)] items += [("dparams", self.dparams)] diff --git a/src/blosc2/vlarray.py b/src/blosc2/vlarray.py index d7d885e2..7f2d6445 100644 --- a/src/blosc2/vlarray.py +++ b/src/blosc2/vlarray.py @@ -13,6 +13,7 @@ import blosc2 from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb +from blosc2.info import InfoReporter, format_nbytes_info if TYPE_CHECKING: from collections.abc import Iterator @@ -73,7 +74,6 @@ def _validate_storage(storage: blosc2.Storage) -> None: def _attach_schunk(self, schunk: SChunk) -> None: self.schunk = schunk - self.urlpath = schunk.urlpath self.mode = schunk.mode self.mmap_mode = getattr(schunk, "mmap_mode", None) 
self._validate_tag() @@ -174,6 +174,15 @@ def _slice_indices(self, index: slice) -> list[int]: def _copy_meta(self) -> dict[str, Any]: return {name: self.meta[name] for name in self.meta} + def _item_size_stats(self) -> tuple[list[int], list[int]]: + item_nbytes = [] + chunk_cbytes = [] + for i in range(len(self)): + nbytes, cbytes, _ = blosc2.get_cbuffer_sizes(self.schunk.get_lazychunk(i)) + item_nbytes.append(nbytes) + chunk_cbytes.append(cbytes) + return item_nbytes, chunk_cbytes + def _serialize(self, value: Any) -> bytes: payload = msgpack_packb(value) _check_serialized_size(payload) @@ -301,6 +310,57 @@ def dparams(self): def chunksize(self) -> int: return self.schunk.chunksize + @property + def typesize(self) -> int: + return self.schunk.typesize + + @property + def nbytes(self) -> int: + return self.schunk.nbytes + + @property + def cbytes(self) -> int: + return self.schunk.cbytes + + @property + def cratio(self) -> float: + return self.schunk.cratio + + @property + def urlpath(self) -> str | None: + return self.schunk.urlpath + + @property + def contiguous(self) -> bool: + return self.schunk.contiguous + + @property + def info(self) -> InfoReporter: + """Print information about this VLArray.""" + return InfoReporter(self) + + @property + def info_items(self) -> list: + """A list of tuples with summary information about this VLArray.""" + item_nbytes, chunk_cbytes = self._item_size_stats() + avg_item_nbytes = sum(item_nbytes) / len(item_nbytes) if item_nbytes else 0.0 + avg_chunk_cbytes = sum(chunk_cbytes) / len(chunk_cbytes) if chunk_cbytes else 0.0 + return [ + ("type", f"{self.__class__.__name__}"), + ("entries", len(self)), + ("item_nbytes_min", min(item_nbytes) if item_nbytes else 0), + ("item_nbytes_max", max(item_nbytes) if item_nbytes else 0), + ("item_nbytes_avg", f"{avg_item_nbytes:.2f}"), + ("chunk_cbytes_min", min(chunk_cbytes) if chunk_cbytes else 0), + ("chunk_cbytes_max", max(chunk_cbytes) if chunk_cbytes else 0), + ("chunk_cbytes_avg", 
f"{avg_chunk_cbytes:.2f}"), + ("nbytes", format_nbytes_info(self.nbytes)), + ("cbytes", format_nbytes_info(self.cbytes)), + ("cratio", f"{self.cratio:.2f}"), + ("cparams", self.cparams), + ("dparams", self.dparams), + ] + def to_cframe(self) -> bytes: return self.schunk.to_cframe() diff --git a/tests/ndarray/test_ndarray.py b/tests/ndarray/test_ndarray.py index 8c9b45f7..b557bb65 100644 --- a/tests/ndarray/test_ndarray.py +++ b/tests/ndarray/test_ndarray.py @@ -103,6 +103,18 @@ def test_asarray(a): np.testing.assert_allclose(a, b[:]) +def test_ndarray_info_has_human_sizes(): + array = blosc2.asarray(np.arange(16, dtype=np.int32)) + + items = dict(array.info_items) + assert "(" in items["nbytes"] + assert "(" in items["cbytes"] + + text = repr(array.info) + assert "nbytes" in text + assert "cbytes" in text + + @pytest.mark.parametrize( ("shape", "newshape", "chunks", "blocks"), [ diff --git a/tests/test_batch_array.py b/tests/test_batch_array.py index 9ddd498e..7d584a36 100644 --- a/tests/test_batch_array.py +++ b/tests/test_batch_array.py @@ -127,6 +127,34 @@ def test_batcharray_from_cframe(): assert [batch[:] for batch in restored2] == expected +def test_batcharray_info(): + barray = blosc2.BatchArray() + barray.extend(BATCHES) + + assert barray.typesize == 1 + assert barray.contiguous == barray.schunk.contiguous + assert barray.urlpath == barray.schunk.urlpath + + items = dict(barray.info_items) + assert items["type"] == "BatchArray" + assert items["nbatches"] == len(BATCHES) + assert items["nitems"] == sum(len(batch) for batch in BATCHES) + assert items["batch_len_min"] == 2 + assert items["batch_len_max"] == 4 + assert items["batch_len_avg"] == "3.00" + assert "urlpath" not in items + assert "contiguous" not in items + assert "typesize" not in items + assert "(" in items["nbytes"] + assert "(" in items["cbytes"] + assert "B)" in items["nbytes"] or "KiB)" in items["nbytes"] or "MiB)" in items["nbytes"] + + text = repr(barray.info) + assert "type" in text + 
assert "BatchArray" in text + assert "batch_len_avg" in text + + def test_vlcompress_small_blocks_roundtrip(): values = [ {"value": None}, diff --git a/tests/test_schunk.py b/tests/test_schunk.py index 539e495f..db7c087e 100644 --- a/tests/test_schunk.py +++ b/tests/test_schunk.py @@ -186,6 +186,19 @@ def test_schunk(contiguous, urlpath, mode, mmap_mode, nbytes, cparams, dparams, blosc2.remove_urlpath(urlpath) +def test_schunk_info_has_human_sizes(): + schunk = blosc2.SChunk(chunksize=32) + schunk.append_data(b"a" * 32) + + items = dict(schunk.info_items) + assert "(" in items["nbytes"] + assert "(" in items["cbytes"] + + text = repr(schunk.info) + assert "nbytes" in text + assert "cbytes" in text + + @pytest.mark.parametrize( ("urlpath", "contiguous", "mode", "mmap_mode"), [ diff --git a/tests/test_vlarray.py b/tests/test_vlarray.py index 3a25fa78..f9834f45 100644 --- a/tests/test_vlarray.py +++ b/tests/test_vlarray.py @@ -117,6 +117,33 @@ def test_vlarray_from_cframe(): assert list(restored2) == expected +def test_vlarray_info(): + vlarray = blosc2.VLArray() + vlarray.extend(VALUES) + + assert vlarray.typesize == 1 + assert vlarray.contiguous == vlarray.schunk.contiguous + assert vlarray.urlpath == vlarray.schunk.urlpath + + items = dict(vlarray.info_items) + assert items["type"] == "VLArray" + assert items["entries"] == len(VALUES) + assert items["item_nbytes_min"] > 0 + assert items["item_nbytes_max"] >= items["item_nbytes_min"] + assert items["chunk_cbytes_min"] > 0 + assert items["chunk_cbytes_max"] >= items["chunk_cbytes_min"] + assert "urlpath" not in items + assert "contiguous" not in items + assert "typesize" not in items + assert "(" in items["nbytes"] + assert "(" in items["cbytes"] + + text = repr(vlarray.info) + assert "type" in text + assert "VLArray" in text + assert "item_nbytes_avg" in text + + def test_vlarray_constructor_kwargs(): urlpath = "test_vlarray_kwargs.b2frame" blosc2.remove_urlpath(urlpath) From f3ef3619a0bfb7a8a0fcc48cd41b196400a7e22a 
Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 17 Mar 2026 07:04:33 +0100 Subject: [PATCH 03/34] Update to latest c-blosc2 in vlblocks branch --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ed72f8a5..7b70ce5e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG 6bed0534d61652cb1e62a3e7be7283f333dfaaf7 # variable-length chunks support in schunks + GIT_TAG b6c9357e76913d484918fa3e3fdec61df7510aa9 # variable-length chunks support in schunks # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) From 65ea44b64591599e901f74145ce5046970be79f4 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 17 Mar 2026 08:08:16 +0100 Subject: [PATCH 04/34] Update to latest c-blosc2 in vlblocks branch --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b70ce5e..7b5aa350 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG b6c9357e76913d484918fa3e3fdec61df7510aa9 # variable-length chunks support in schunks + GIT_TAG 98fa458177be2755aac62f46573dc102baa26739 # variable-length chunks support in schunks # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) From 4172c3fa3d7e4b4f8eec6c69a9a6754148c1c52a Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 17 Mar 2026 08:11:45 +0100 Subject: [PATCH 05/34] Enable dicts by default in BatchArray and VLArray when using zstd --- src/blosc2/batch_array.py | 10 ++++++++++ src/blosc2/vlarray.py | 10 ++++++++++ tests/test_batch_array.py | 22 ++++++++++++++++++++++ tests/test_vlarray.py | 22 ++++++++++++++++++++++ 4 files changed, 64 insertions(+) diff --git 
a/src/blosc2/batch_array.py b/src/blosc2/batch_array.py index 5dc8cb52..108428e5 100644 --- a/src/blosc2/batch_array.py +++ b/src/blosc2/batch_array.py @@ -88,17 +88,27 @@ class BatchArray: @staticmethod def _set_typesize_one(cparams: blosc2.CParams | dict | None) -> blosc2.CParams | dict: + auto_use_dict = cparams is None if cparams is None: cparams = blosc2.CParams() elif isinstance(cparams, blosc2.CParams): cparams = copy.deepcopy(cparams) else: cparams = dict(cparams) + auto_use_dict = "use_dict" not in cparams if isinstance(cparams, blosc2.CParams): cparams.typesize = 1 + if auto_use_dict and cparams.codec == blosc2.Codec.ZSTD and cparams.clevel > 0: + # BatchArray stores many small serialized payloads, where Zstd dicts help materially. + cparams.use_dict = True else: cparams["typesize"] = 1 + codec = cparams.get("codec", blosc2.Codec.ZSTD) + clevel = cparams.get("clevel", 5) + if auto_use_dict and codec == blosc2.Codec.ZSTD and clevel > 0: + # BatchArray stores many small serialized payloads, where Zstd dicts help materially. + cparams["use_dict"] = True return cparams @staticmethod diff --git a/src/blosc2/vlarray.py b/src/blosc2/vlarray.py index 7f2d6445..18c737a6 100644 --- a/src/blosc2/vlarray.py +++ b/src/blosc2/vlarray.py @@ -33,17 +33,27 @@ class VLArray: @staticmethod def _set_typesize_one(cparams: blosc2.CParams | dict | None) -> blosc2.CParams | dict: + auto_use_dict = cparams is None if cparams is None: cparams = blosc2.CParams() elif isinstance(cparams, blosc2.CParams): cparams = copy.deepcopy(cparams) else: cparams = dict(cparams) + auto_use_dict = "use_dict" not in cparams if isinstance(cparams, blosc2.CParams): cparams.typesize = 1 + if auto_use_dict and cparams.codec == blosc2.Codec.ZSTD and cparams.clevel > 0: + # VLArray stores many small serialized payloads, where Zstd dicts help materially. 
+ cparams.use_dict = True else: cparams["typesize"] = 1 + codec = cparams.get("codec", blosc2.Codec.ZSTD) + clevel = cparams.get("clevel", 5) + if auto_use_dict and codec == blosc2.Codec.ZSTD and clevel > 0: + # VLArray stores many small serialized payloads, where Zstd dicts help materially. + cparams["use_dict"] = True return cparams @staticmethod diff --git a/tests/test_batch_array.py b/tests/test_batch_array.py index 7d584a36..48f5fce0 100644 --- a/tests/test_batch_array.py +++ b/tests/test_batch_array.py @@ -155,6 +155,28 @@ def test_batcharray_info(): assert "batch_len_avg" in text +def test_batcharray_zstd_uses_dict_by_default(): + barray = blosc2.BatchArray() + assert barray.cparams.codec == blosc2.Codec.ZSTD + assert barray.cparams.use_dict is True + + +def test_batcharray_respects_explicit_use_dict_and_non_zstd(): + barray = blosc2.BatchArray(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) + assert barray.cparams.codec == blosc2.Codec.LZ4 + assert barray.cparams.use_dict is False + + barray = blosc2.BatchArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) + assert barray.cparams.codec == blosc2.Codec.ZSTD + assert barray.cparams.use_dict is False + + barray = blosc2.BatchArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 5, "use_dict": False}) + assert barray.cparams.use_dict is False + + barray = blosc2.BatchArray(cparams=blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=5, use_dict=False)) + assert barray.cparams.use_dict is False + + def test_vlcompress_small_blocks_roundtrip(): values = [ {"value": None}, diff --git a/tests/test_vlarray.py b/tests/test_vlarray.py index f9834f45..2c792e10 100644 --- a/tests/test_vlarray.py +++ b/tests/test_vlarray.py @@ -144,6 +144,28 @@ def test_vlarray_info(): assert "item_nbytes_avg" in text +def test_vlarray_zstd_uses_dict_by_default(): + vlarray = blosc2.VLArray() + assert vlarray.cparams.codec == blosc2.Codec.ZSTD + assert vlarray.cparams.use_dict is True + + +def 
test_vlarray_respects_explicit_use_dict_and_non_zstd(): + vlarray = blosc2.VLArray(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) + assert vlarray.cparams.codec == blosc2.Codec.LZ4 + assert vlarray.cparams.use_dict is False + + vlarray = blosc2.VLArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) + assert vlarray.cparams.codec == blosc2.Codec.ZSTD + assert vlarray.cparams.use_dict is False + + vlarray = blosc2.VLArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 5, "use_dict": False}) + assert vlarray.cparams.use_dict is False + + vlarray = blosc2.VLArray(cparams=blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=5, use_dict=False)) + assert vlarray.cparams.use_dict is False + + def test_vlarray_constructor_kwargs(): urlpath = "test_vlarray_kwargs.b2frame" blosc2.remove_urlpath(urlpath) From e799739c8b5cd199ce585d6461b998a67f479aa2 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 17 Mar 2026 11:30:21 +0100 Subject: [PATCH 06/34] Update to latest c-blosc2 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b5aa350..db897448 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG 98fa458177be2755aac62f46573dc102baa26739 # variable-length chunks support in schunks + GIT_TAG 11144a703a85c2b224e8b77fc1769a2a46881cc0 # variable-length chunks/blocks # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) From e640474548976dd44611be36757bf8088579dc94 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 17 Mar 2026 13:18:05 +0100 Subject: [PATCH 07/34] Update to latest c-blosc2 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index db897448..98584fe5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ else() include(FetchContent) 
FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG 11144a703a85c2b224e8b77fc1769a2a46881cc0 # variable-length chunks/blocks + GIT_TAG 7f6f1204784404099c767371245bd12cd4570c7c # variable-length chunks/blocks # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) From 07ec9d2abc04f533afa01aa250423d8d1a666c61 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 17 Mar 2026 13:52:52 +0100 Subject: [PATCH 08/34] Update the nthreads to the default value in this machine --- src/blosc2/blosc2_ext.pyx | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index edd41837..4c574015 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -8,6 +8,7 @@ #cython: language_level=3 import os +import dataclasses import ast import atexit import pathlib @@ -2571,7 +2572,8 @@ def open(urlpath, mode, offset, **kwargs): if mode != "w" and kwargs is not None: check_schunk_params(schunk, kwargs) cparams = kwargs.get("cparams") - # For reading with the default number of threads + # nthreads is not stored in the frame; apply the live global when the caller + # did not supply an explicit cparams — symmetric with the DParams default below. 
dparams = kwargs.get("dparams", blosc2.DParams()) if is_ndarray: @@ -2579,6 +2581,8 @@ def open(urlpath, mode, offset, **kwargs): _array=PyCapsule_New(array, "b2nd_array_t*", NULL)) if cparams is not None: res.schunk.cparams = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) + else: + res.schunk.cparams = dataclasses.replace(res.schunk.cparams, nthreads=blosc2.nthreads) if dparams is not None: res.schunk.dparams = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) res.schunk.mode = mode @@ -2587,6 +2591,8 @@ def open(urlpath, mode, offset, **kwargs): mode=mode, **kwargs) if cparams is not None: res.cparams = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) + else: + res.cparams = dataclasses.replace(res.cparams, nthreads=blosc2.nthreads) if dparams is not None: res.dparams = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) From 4fe44e8d18038ac61d452b7f53db0651e01eb44f Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 13:41:40 +0100 Subject: [PATCH 09/34] New meanings for .chunksize and .blocksize properties --- CMakeLists.txt | 6 +- src/blosc2/batch_array.py | 183 +++++++++++++++++++++++++++++--------- tests/test_batch_array.py | 81 ++++++++++------- 3 files changed, 194 insertions(+), 76 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 98584fe5..6f053288 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,9 +118,9 @@ else() set(BLOSC_INSTALL ON) include(FetchContent) FetchContent_Declare(blosc2 - GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG 7f6f1204784404099c767371245bd12cd4570c7c # variable-length chunks/blocks - # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 + #GIT_REPOSITORY https://github.com/Blosc/c-blosc2 + #GIT_TAG 7f6f1204784404099c767371245bd12cd4570c7c # variable-length chunks/blocks + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) 
include_directories("${blosc2_SOURCE_DIR}/include") diff --git a/src/blosc2/batch_array.py b/src/blosc2/batch_array.py index 108428e5..2bd486ad 100644 --- a/src/blosc2/batch_array.py +++ b/src/blosc2/batch_array.py @@ -17,7 +17,8 @@ from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb from blosc2.info import InfoReporter, format_nbytes_info -_BATCHARRAY_META = {"version": 1, "serializer": "msgpack", "format": "vlblocks"} +_BATCHARRAY_META = {"version": 2, "serializer": "msgpack", "format": "batched_vlblocks"} +_BATCHARRAY_LAYOUT_KEY = "batcharray" def _check_serialized_size(buffer: bytes) -> None: @@ -32,7 +33,7 @@ def __init__(self, parent: BatchArray, nchunk: int, lazychunk: bytes) -> None: self._parent = parent self._nchunk = nchunk self._lazychunk = lazychunk - self._payloads: list[bytes] | None = None + self._blocks: list[list[Any]] | None = None self._nbytes, self._cbytes, self._nblocks = blosc2.get_cbuffer_sizes(lazychunk) def _normalize_index(self, index: int) -> int: @@ -44,20 +45,28 @@ def _normalize_index(self, index: int) -> int: raise IndexError("Batch index out of range") return index - def _decode_payloads(self) -> list[bytes]: - if self._payloads is None: - self._payloads = self._parent._decode_payloads(self._nchunk) - return self._payloads + def _decode_blocks(self) -> list[list[Any]]: + if self._blocks is None: + self._blocks = self._parent._decode_blocks(self._nchunk) + return self._blocks def __getitem__(self, index: int | slice) -> Any | list[Any]: - payloads = self._decode_payloads() + blocks = self._decode_blocks() if isinstance(index, slice): - return [msgpack_unpackb(payload) for payload in payloads[index]] + flat_items = [item for block in blocks for item in block] + return flat_items[index] index = self._normalize_index(index) - return msgpack_unpackb(payloads[index]) + blocksize = self._parent.blocksize + if blocksize is None: + raise RuntimeError("BatchArray blocksize is not initialized") + block_index, item_index = 
divmod(index, blocksize) + return blocks[block_index][item_index] def __len__(self) -> int: - return self._nblocks + chunksize = self._parent.chunksize + if chunksize is None: + return self._nblocks + return chunksize def __iter__(self) -> Iterator[Any]: for i in range(len(self)): @@ -142,6 +151,7 @@ def _attach_schunk(self, schunk: blosc2.SChunk) -> None: self.mode = schunk.mode self.mmap_mode = getattr(schunk, "mmap_mode", None) self._validate_tag() + self._load_layout() def _maybe_open_existing(self, storage: blosc2.Storage) -> bool: urlpath = storage.urlpath @@ -165,18 +175,21 @@ def _make_storage(self) -> blosc2.Storage: def __init__( self, chunksize: int | None = None, + blocksize: int | None = None, _from_schunk: blosc2.SChunk | None = None, **kwargs: Any, ) -> None: + self._chunksize: int | None = chunksize + self._blocksize: int | None = blocksize + self._layout_format: str | None = None if _from_schunk is not None: - if chunksize is not None: - raise ValueError("Cannot pass `chunksize` together with `_from_schunk`") + if chunksize is not None or blocksize is not None: + raise ValueError("Cannot pass `chunksize` or `blocksize` together with `_from_schunk`") if kwargs: unexpected = ", ".join(sorted(kwargs)) raise ValueError(f"Cannot pass {unexpected} together with `_from_schunk`") self._attach_schunk(_from_schunk) return - cparams = kwargs.pop("cparams", None) dparams = kwargs.pop("dparams", None) storage = kwargs.pop("storage", None) @@ -198,17 +211,48 @@ def __init__( fixed_meta = dict(storage.meta or {}) fixed_meta["batcharray"] = dict(_BATCHARRAY_META) storage.meta = fixed_meta - if chunksize is None: - chunksize = -1 - schunk = blosc2.SChunk( - chunksize=chunksize, data=None, cparams=cparams, dparams=dparams, storage=storage - ) + schunk = blosc2.SChunk(chunksize=-1, data=None, cparams=cparams, dparams=dparams, storage=storage) self._attach_schunk(schunk) + if self._chunksize is not None or self._blocksize is not None: + self._store_layout() def 
_validate_tag(self) -> None: if "batcharray" not in self.schunk.meta: raise ValueError("The supplied SChunk is not tagged as a BatchArray") + def _load_layout(self) -> None: + layout = None + self._layout_format = None + if _BATCHARRAY_LAYOUT_KEY in self.vlmeta: + layout = self.vlmeta[_BATCHARRAY_LAYOUT_KEY] + if isinstance(layout, dict): + self._chunksize = layout.get("chunksize") + self._blocksize = layout.get("blocksize") + self._layout_format = layout.get("format", "batched_vlblocks") + return + if len(self) == 0: + return + # Legacy fallback: one object per VL block. + first_nbytes, _, nblocks = blosc2.get_cbuffer_sizes(self.schunk.get_lazychunk(0)) + if nblocks <= 0 or first_nbytes <= 0: + return + self._chunksize = nblocks + self._blocksize = 1 + self._layout_format = "legacy_vlblocks" + self._store_layout() + + def _store_layout(self) -> None: + if self._chunksize is None or self.mode == "r": + return + layout = { + "version": 1, + "chunksize": self._chunksize, + "blocksize": self._blocksize, + "format": self._layout_format or "batched_vlblocks", + "sizing_policy": "l2_cache_prefix", + } + self.vlmeta[_BATCHARRAY_LAYOUT_KEY] = layout + def _check_writable(self) -> None: if self.mode == "r": raise ValueError("Cannot modify a BatchArray opened in read-only mode") @@ -249,13 +293,42 @@ def _normalize_batch(self, value: object) -> list[Any]: raise ValueError("BatchArray entries cannot be empty") return values - def _serialize_batch(self, value: object) -> list[bytes]: - payloads = [] - for item in self._normalize_batch(value): - payload = msgpack_packb(item) - _check_serialized_size(payload) - payloads.append(payload) - return payloads + def _ensure_layout_for_batch(self, batch: list[Any]) -> None: + if self._chunksize is None: + self._chunksize = len(batch) + if len(batch) != self._chunksize: + raise ValueError(f"BatchArray entries must contain exactly {self._chunksize} objects") + if self._blocksize is None: + payload_sizes = [len(msgpack_packb(item)) for 
item in batch] + self._blocksize = self._guess_blocksize(payload_sizes) + self._store_layout() + + def _guess_blocksize(self, payload_sizes: list[int]) -> int: + if not payload_sizes: + raise ValueError("BatchArray entries cannot be empty") + l2_cache_size = blosc2.cpu_info.get("l2_cache_size") + if not isinstance(l2_cache_size, int) or l2_cache_size <= 0: + return len(payload_sizes) + total = 0 + count = 0 + for payload_size in payload_sizes: + if count > 0 and total + payload_size > l2_cache_size: + break + total += payload_size + count += 1 + if count == 0: + count = 1 + return min(count, len(payload_sizes)) + + def _serialize_batch(self, value: object) -> list[Any]: + batch = self._normalize_batch(value) + self._ensure_layout_for_batch(batch) + return batch + + def _serialize_block(self, items: list[Any]) -> bytes: + payload = msgpack_packb(items) + _check_serialized_size(payload) + return payload def _vl_cparams_kwargs(self) -> dict[str, Any]: return asdict(self.schunk.cparams) @@ -263,33 +336,44 @@ def _vl_cparams_kwargs(self) -> dict[str, Any]: def _vl_dparams_kwargs(self) -> dict[str, Any]: return asdict(self.schunk.dparams) - def _compress_batch(self, payloads: list[bytes]) -> bytes: - return blosc2.blosc2_ext.vlcompress(payloads, **self._vl_cparams_kwargs()) + def _compress_batch(self, batch: list[Any]) -> bytes: + if self._blocksize is None: + raise RuntimeError("BatchArray blocksize is not initialized") + blocks = [ + self._serialize_block(batch[i : i + self._blocksize]) + for i in range(0, len(batch), self._blocksize) + ] + return blosc2.blosc2_ext.vlcompress(blocks, **self._vl_cparams_kwargs()) - def _decode_payloads(self, nchunk: int) -> list[bytes]: - return blosc2.blosc2_ext.vldecompress(self.schunk.get_chunk(nchunk), **self._vl_dparams_kwargs()) + def _decode_blocks(self, nchunk: int) -> list[list[Any]]: + block_payloads = blosc2.blosc2_ext.vldecompress( + self.schunk.get_chunk(nchunk), **self._vl_dparams_kwargs() + ) + if self._layout_format == 
"legacy_vlblocks": + return [[msgpack_unpackb(payload)] for payload in block_payloads] + return [msgpack_unpackb(payload) for payload in block_payloads] def _get_batch(self, index: int) -> Batch: return Batch(self, index, self.schunk.get_lazychunk(index)) def _batch_lengths(self) -> list[int]: - lengths = [] - for i in range(len(self)): - _, _, nblocks = blosc2.get_cbuffer_sizes(self.schunk.get_lazychunk(i)) - lengths.append(nblocks) - return lengths + if self.chunksize is not None: + return [self.chunksize for _ in range(len(self))] + return [len(self[i]) for i in range(len(self))] def append(self, value: object) -> int: """Append one batch and return the new number of entries.""" self._check_writable() - chunk = self._compress_batch(self._serialize_batch(value)) + batch = self._serialize_batch(value) + chunk = self._compress_batch(batch) return self.schunk.append_chunk(chunk) def insert(self, index: int, value: object) -> int: """Insert one batch at ``index`` and return the new number of entries.""" self._check_writable() index = self._normalize_insert_index(index) - chunk = self._compress_batch(self._serialize_batch(value)) + batch = self._serialize_batch(value) + chunk = self._compress_batch(batch) return self.schunk.insert_chunk(index, chunk) def delete(self, index: int | slice) -> int: @@ -316,7 +400,8 @@ def extend(self, values: object) -> None: """Append all batches from an iterable.""" self._check_writable() for value in values: - chunk = self._compress_batch(self._serialize_batch(value)) + batch = self._serialize_batch(value) + chunk = self._compress_batch(batch) self.schunk.append_chunk(chunk) def clear(self) -> None: @@ -333,6 +418,7 @@ def clear(self) -> None: storage=storage, ) self._attach_schunk(schunk) + self._store_layout() def __getitem__(self, index: int | slice) -> Batch | list[Batch]: if isinstance(index, slice): @@ -351,7 +437,8 @@ def __setitem__(self, index: int | slice, value: object) -> None: for idx in reversed(indices): 
self.schunk.delete_chunk(idx) for offset, item in enumerate(values): - chunk = self._compress_batch(self._serialize_batch(item)) + batch = self._serialize_batch(item) + chunk = self._compress_batch(batch) self.schunk.insert_chunk(start + offset, chunk) return if len(values) != len(indices): @@ -359,12 +446,14 @@ def __setitem__(self, index: int | slice, value: object) -> None: f"attempt to assign sequence of size {len(values)} to extended slice of size {len(indices)}" ) for idx, item in zip(indices, values, strict=True): - chunk = self._compress_batch(self._serialize_batch(item)) + batch = self._serialize_batch(item) + chunk = self._compress_batch(batch) self.schunk.update_chunk(idx, chunk) return self._check_writable() index = self._normalize_index(index) - chunk = self._compress_batch(self._serialize_batch(value)) + batch = self._serialize_batch(value) + chunk = self._compress_batch(batch) self.schunk.update_chunk(index, chunk) def __delitem__(self, index: int | slice) -> None: @@ -395,7 +484,11 @@ def dparams(self): @property def chunksize(self) -> int: - return self.schunk.chunksize + return self._chunksize + + @property + def blocksize(self) -> int: + return self._blocksize @property def typesize(self) -> int: @@ -435,6 +528,8 @@ def info_items(self) -> list: return [ ("type", f"{self.__class__.__name__}"), ("nbatches", len(self)), + ("chunksize", self.chunksize), + ("blocksize", self.blocksize), ("nitems", nitems), ("batch_len_min", min(batch_lengths) if batch_lengths else 0), ("batch_len_max", max(batch_lengths) if batch_lengths else 0), @@ -456,7 +551,8 @@ def copy(self, **kwargs: Any) -> BatchArray: kwargs["cparams"] = kwargs.get("cparams", copy.deepcopy(self.cparams)) kwargs["dparams"] = kwargs.get("dparams", copy.deepcopy(self.dparams)) - kwargs["chunksize"] = kwargs.get("chunksize", -1) + kwargs["chunksize"] = kwargs.get("chunksize", self.chunksize) + kwargs["blocksize"] = kwargs.get("blocksize", self.blocksize) if "storage" not in kwargs: 
kwargs["meta"] = self._copy_meta() @@ -465,6 +561,9 @@ def copy(self, **kwargs: Any) -> BatchArray: kwargs["mode"] = "w" out = BatchArray(**kwargs) + if "storage" not in kwargs and len(self.vlmeta) > 0: + for key, value in self.vlmeta.getall().items(): + out.vlmeta[key] = value out.extend(self) return out diff --git a/tests/test_batch_array.py b/tests/test_batch_array.py index 48f5fce0..9ec692fb 100644 --- a/tests/test_batch_array.py +++ b/tests/test_batch_array.py @@ -12,8 +12,8 @@ BATCHES = [ [b"bytes\x00payload", "plain text", 42], - [{"nested": [1, 2]}, None], - [(1, 2, "three"), 3.5, True, {"rows": [[], ["nested"]]}], + [{"nested": [1, 2]}, None, {"tail": True}], + [(1, 2, "three"), 3.5, True], ] @@ -46,7 +46,12 @@ def test_batcharray_roundtrip(contiguous, urlpath): assert barray.append(batch) == i assert len(barray) == len(BATCHES) + assert barray.chunksize == len(BATCHES[0]) + assert barray.blocksize is not None + assert 1 <= barray.blocksize <= barray.chunksize assert [batch[:] for batch in barray] == BATCHES + with pytest.raises(ValueError): + barray.append([1, 2]) batch0 = barray[0] assert isinstance(batch0, blosc2.Batch) @@ -59,16 +64,16 @@ def test_batcharray_roundtrip(contiguous, urlpath): assert batch0.cratio > 0 expected = list(BATCHES) - expected[1] = ["updated", {"tuple": (7, 8)}] - expected[-1] = ["tiny"] + expected[1] = ["updated", {"tuple": (7, 8)}, 99] + expected[-1] = ["tiny", False, "x"] barray[1] = expected[1] barray[-1] = expected[-1] - assert barray.insert(0, ["head", 0]) == len(expected) + 1 - expected.insert(0, ["head", 0]) - assert barray.insert(-1, ["between", {"k": 5}]) == len(expected) + 1 - expected.insert(-1, ["between", {"k": 5}]) - assert barray.insert(999, ["tail"]) == len(expected) + 1 - expected.insert(999, ["tail"]) + assert barray.insert(0, ["head", 0, "x"]) == len(expected) + 1 + expected.insert(0, ["head", 0, "x"]) + assert barray.insert(-1, ["between", {"k": 5}, None]) == len(expected) + 1 + expected.insert(-1, 
["between", {"k": 5}, None]) + assert barray.insert(999, ["tail", 1, 2]) == len(expected) + 1 + expected.insert(999, ["tail", 1, 2]) assert barray.delete(2) == len(expected) - 1 del expected[2] del barray[-2] @@ -78,6 +83,8 @@ def test_batcharray_roundtrip(contiguous, urlpath): if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") assert isinstance(reopened, blosc2.BatchArray) + assert reopened.chunksize == barray.chunksize + assert reopened.blocksize == barray.blocksize assert [batch[:] for batch in reopened] == expected with pytest.raises(ValueError): reopened.append(["nope"]) @@ -97,8 +104,8 @@ def test_batcharray_roundtrip(contiguous, urlpath): reopened.clear() reopened_rw = blosc2.open(urlpath, mode="a") - reopened_rw[0] = ["changed"] - expected[0] = ["changed"] + reopened_rw[0] = ["changed", "batch", 0] + expected[0] = ["changed", "batch", 0] assert [batch[:] for batch in reopened_rw] == expected if contiguous: @@ -112,10 +119,10 @@ def test_batcharray_roundtrip(contiguous, urlpath): def test_batcharray_from_cframe(): barray = blosc2.BatchArray() barray.extend(BATCHES) - barray.insert(1, ["inserted", True]) + barray.insert(1, ["inserted", True, None]) del barray[3] expected = list(BATCHES) - expected.insert(1, ["inserted", True]) + expected.insert(1, ["inserted", True, None]) del expected[3] restored = blosc2.from_cframe(barray.to_cframe()) @@ -138,9 +145,11 @@ def test_batcharray_info(): items = dict(barray.info_items) assert items["type"] == "BatchArray" assert items["nbatches"] == len(BATCHES) + assert items["chunksize"] == len(BATCHES[0]) + assert items["blocksize"] == barray.blocksize assert items["nitems"] == sum(len(batch) for batch in BATCHES) - assert items["batch_len_min"] == 2 - assert items["batch_len_max"] == 4 + assert items["batch_len_min"] == 3 + assert items["batch_len_max"] == 3 assert items["batch_len_avg"] == "3.00" assert "urlpath" not in items assert "contiguous" not in items @@ -161,6 +170,14 @@ def 
test_batcharray_zstd_uses_dict_by_default(): assert barray.cparams.use_dict is True +def test_batcharray_explicit_chunksize_blocksize(): + barray = blosc2.BatchArray(chunksize=3, blocksize=2) + assert barray.chunksize == 3 + assert barray.blocksize == 2 + barray.append([1, 2, 3]) + assert [batch[:] for batch in barray] == [[1, 2, 3]] + + def test_batcharray_respects_explicit_use_dict_and_non_zstd(): barray = blosc2.BatchArray(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) assert barray.cparams.codec == blosc2.Codec.LZ4 @@ -240,22 +257,22 @@ def test_batcharray_list_like_ops(contiguous, urlpath): blosc2.remove_urlpath(urlpath) barray = blosc2.BatchArray(storage=_storage(contiguous, urlpath)) - barray.extend([[1, 2], [3], [4, 5, 6]]) - assert [batch[:] for batch in barray] == [[1, 2], [3], [4, 5, 6]] - assert barray.pop() == [4, 5, 6] - assert barray.pop(0) == [1, 2] - assert [batch[:] for batch in barray] == [[3]] + barray.extend([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + assert [batch[:] for batch in barray] == [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + assert barray.pop() == [7, 8, 9] + assert barray.pop(0) == [1, 2, 3] + assert [batch[:] for batch in barray] == [[4, 5, 6]] barray.clear() assert len(barray) == 0 assert [batch[:] for batch in barray] == [] - barray.extend([["a"], ["b", "c"]]) - assert [batch[:] for batch in barray] == [["a"], ["b", "c"]] + barray.extend([["a", "b", "c"], ["d", "e", "f"]]) + assert [batch[:] for batch in barray] == [["a", "b", "c"], ["d", "e", "f"]] if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") - assert [batch[:] for batch in reopened] == [["a"], ["b", "c"]] + assert [batch[:] for batch in reopened] == [["a", "b", "c"], ["d", "e", "f"]] blosc2.remove_urlpath(urlpath) @@ -272,19 +289,19 @@ def test_batcharray_list_like_ops(contiguous, urlpath): def test_batcharray_slices(contiguous, urlpath): blosc2.remove_urlpath(urlpath) - expected = [[i, i + 100] for i in range(8)] + expected = [[i, i + 100, i + 200] for i in range(8)] 
barray = blosc2.BatchArray(storage=_storage(contiguous, urlpath)) barray.extend(expected) assert [batch[:] for batch in barray[1:6:2]] == expected[1:6:2] assert [batch[:] for batch in barray[::-2]] == expected[::-2] - barray[2:5] = [["a"], ["b", "c"]] - expected[2:5] = [["a"], ["b", "c"]] + barray[2:5] = [["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]] + expected[2:5] = [["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]] assert [batch[:] for batch in barray] == expected - barray[1:6:2] = [[100], [101], [102]] - expected[1:6:2] = [[100], [101], [102]] + barray[1:6:2] = [[100, 101, 102], [103, 104, 105], [106, 107, 108]] + expected[1:6:2] = [[100, 101, 102], [103, 104, 105], [106, 107, 108]] assert [batch[:] for batch in barray] == expected del barray[::3] @@ -322,7 +339,7 @@ def test_batcharray_copy(): original = blosc2.BatchArray(urlpath=urlpath, mode="w", contiguous=True) original.extend(BATCHES) - original.insert(1, ["copy", True]) + original.insert(1, ["copy", True, 123]) copied = original.copy( urlpath=copy_path, contiguous=False, cparams={"codec": blosc2.Codec.LZ4, "clevel": 5} @@ -386,7 +403,9 @@ def test_batcharray_validation_errors(): barray.delete(3) with pytest.raises(IndexError): blosc2.BatchArray().pop() - barray.extend([[1]]) + barray.extend([[1, 2, 3]]) + with pytest.raises(ValueError): + barray.append([2, 3]) with pytest.raises(NotImplementedError): barray.pop(slice(0, 1)) From bad841173fa92bc856a91274a4ba1cde92940d2c Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 13:45:01 +0100 Subject: [PATCH 10/34] Fix for newer versions of torch --- tests/ndarray/test_setitem.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ndarray/test_setitem.py b/tests/ndarray/test_setitem.py index 02a0a336..bde27317 100644 --- a/tests/ndarray/test_setitem.py +++ b/tests/ndarray/test_setitem.py @@ -66,7 +66,8 @@ def test_setitem_torch_proxy(shape, chunks, blocks, slices, dtype): dtype_ = {np.float32: torch.float32, np.int32: 
torch.int32, np.float64: torch.float64}[dtype] val = torch.ones(slice_shape, dtype=dtype_) a[slices] = val - nparray[slices] = val + # Make the expected assignment explicit so NumPy does not rely on torch.__array__(). + nparray[slices] = val.numpy() np.testing.assert_almost_equal(a[...], nparray) From 4fb6882cef59afe54e5ca9781a3bc2dd362bd3e3 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 14:00:09 +0100 Subject: [PATCH 11/34] BatchArray -> ObjectArray --- src/blosc2/__init__.py | 6 +- src/blosc2/core.py | 10 +- src/blosc2/dict_store.py | 18 +-- src/blosc2/embed_store.py | 10 +- .../{batch_array.py => object_array.py} | 74 ++++++------ src/blosc2/schunk.py | 8 +- src/blosc2/tree_store.py | 6 +- ...st_batch_array.py => test_object_array.py} | 110 +++++++++--------- 8 files changed, 121 insertions(+), 121 deletions(-) rename src/blosc2/{batch_array.py => object_array.py} (89%) rename tests/{test_batch_array.py => test_object_array.py} (79%) diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 66455881..1f058757 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -535,7 +535,7 @@ def _raise(exc): from .embed_store import EmbedStore, estore_from_cframe from .dict_store import DictStore from .tree_store import TreeStore -from .batch_array import Batch, BatchArray, batcharray_from_cframe +from .object_array import Batch, ObjectArray, objectarray_from_cframe from .vlarray import VLArray, vlarray_from_cframe from .c2array import c2context, C2Array, URLPath @@ -721,7 +721,7 @@ def _raise(exc): "C2Array", "CParams", "Batch", - "BatchArray", + "ObjectArray", # Enums "Codec", "DParams", @@ -944,7 +944,7 @@ def _raise(exc): "validate_expr", "var", "vecdot", - "batcharray_from_cframe", + "objectarray_from_cframe", "vlarray_from_cframe", "where", "zeros", diff --git a/src/blosc2/core.py b/src/blosc2/core.py index fc8749ad..6e37b139 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1918,9 +1918,9 @@ def 
ndarray_from_cframe(cframe: bytes | str, copy: bool = False) -> blosc2.NDArr def from_cframe( cframe: bytes | str, copy: bool = True -) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk | blosc2.BatchArray | blosc2.VLArray: +) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk | blosc2.ObjectArray | blosc2.VLArray: """Create a :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk `, - :ref:`BatchArray ` or :ref:`VLArray ` instance + :ref:`ObjectArray ` or :ref:`VLArray ` instance from a contiguous frame buffer. Parameters @@ -1938,7 +1938,7 @@ def from_cframe( Returns ------- out: :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk `, - :ref:`BatchArray ` or :ref:`VLArray ` + :ref:`ObjectArray ` or :ref:`VLArray ` A new instance of the appropriate type containing the data passed. See Also @@ -1952,8 +1952,8 @@ def from_cframe( # Check the metalayer to determine the type if "b2embed" in schunk.meta: return blosc2.estore_from_cframe(cframe, copy=copy) - if "batcharray" in schunk.meta: - return blosc2.batcharray_from_cframe(cframe, copy=copy) + if "objectarray" in schunk.meta: + return blosc2.objectarray_from_cframe(cframe, copy=copy) if "vlarray" in schunk.meta: return blosc2.vlarray_from_cframe(cframe, copy=copy) if "b2nd" in schunk.meta: diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 65bab76e..8ed2fac9 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -249,25 +249,25 @@ def estore(self) -> EmbedStore: return self._estore @staticmethod - def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray) -> int: - if isinstance(value, (blosc2.VLArray, blosc2.BatchArray)): + def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray) -> int: + if isinstance(value, (blosc2.VLArray, blosc2.ObjectArray)): return value.schunk.nbytes return value.nbytes @staticmethod - def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray) -> bool: - return 
isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray, blosc2.BatchArray)) and bool( + def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray) -> bool: + return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray, blosc2.ObjectArray)) and bool( getattr(value, "urlpath", None) ) @staticmethod - def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray) -> str: + def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray) -> str: if isinstance(value, blosc2.NDArray): return ".b2nd" return ".b2f" def __setitem__( - self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray ) -> None: """Add a node to the DictStore.""" if isinstance(value, np.ndarray): @@ -294,7 +294,7 @@ def __setitem__( if hasattr(value, "save"): value.save(urlpath=dest_path) else: - # SChunk, VLArray and BatchArray can all be persisted via their cframe. + # SChunk, VLArray and ObjectArray can all be persisted via their cframe. 
with open(dest_path, "wb") as f: f.write(value.to_cframe()) else: @@ -314,7 +314,7 @@ def __setitem__( def __getitem__( self, key: str - ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray | C2Array: + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray | C2Array: """Retrieve a node from the DictStore.""" # Check map_tree first if key in self.map_tree: @@ -346,7 +346,7 @@ def __getitem__( def get( self, key: str, default: Any = None - ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray | C2Array | Any: + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray | C2Array | Any: """Retrieve a node, or default if not found.""" try: return self[key] diff --git a/src/blosc2/embed_store.py b/src/blosc2/embed_store.py index e1c8b0d2..e54967fb 100644 --- a/src/blosc2/embed_store.py +++ b/src/blosc2/embed_store.py @@ -174,7 +174,7 @@ def _ensure_capacity(self, needed_bytes: int) -> None: self._store.resize((new_size,)) def __setitem__( - self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray ) -> None: """Add a node to the embed store.""" if self.mode == "r": @@ -198,7 +198,7 @@ def __setitem__( self._embed_map[key] = {"offset": offset, "length": data_len} self._save_metadata() - def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray: + def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray: """Retrieve a node from the embed store.""" if key not in self._embed_map: raise KeyError(f"Key '{key}' not found in the embed store.") @@ -216,7 +216,7 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | bl def get( self, key: str, default: Any = None - ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray | Any: + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray | Any: """Retrieve a 
node, or default if not found.""" return self[key] if key in self._embed_map else default @@ -243,12 +243,12 @@ def keys(self) -> KeysView[str]: """Return all keys.""" return self._embed_map.keys() - def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray]: + def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray]: """Iterate over all values.""" for key in self._embed_map: yield self[key] - def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchArray]]: + def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray]]: """Iterate over (key, value) pairs.""" for key in self._embed_map: yield key, self[key] diff --git a/src/blosc2/batch_array.py b/src/blosc2/object_array.py similarity index 89% rename from src/blosc2/batch_array.py rename to src/blosc2/object_array.py index 2bd486ad..decffc18 100644 --- a/src/blosc2/batch_array.py +++ b/src/blosc2/object_array.py @@ -17,8 +17,8 @@ from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb from blosc2.info import InfoReporter, format_nbytes_info -_BATCHARRAY_META = {"version": 2, "serializer": "msgpack", "format": "batched_vlblocks"} -_BATCHARRAY_LAYOUT_KEY = "batcharray" +_OBJECTARRAY_META = {"version": 2, "serializer": "msgpack", "format": "batched_vlblocks"} +_OBJECTARRAY_LAYOUT_KEY = "objectarray" def _check_serialized_size(buffer: bytes) -> None: @@ -27,9 +27,9 @@ def _check_serialized_size(buffer: bytes) -> None: class Batch(Sequence[Any]): - """A lazy sequence of Python objects stored in one BatchArray chunk.""" + """A lazy sequence of Python objects stored in one ObjectArray chunk.""" - def __init__(self, parent: BatchArray, nchunk: int, lazychunk: bytes) -> None: + def __init__(self, parent: ObjectArray, nchunk: int, lazychunk: bytes) -> None: self._parent = parent self._nchunk = nchunk self._lazychunk = lazychunk @@ -58,7 +58,7 @@ def __getitem__(self, 
index: int | slice) -> Any | list[Any]: index = self._normalize_index(index) blocksize = self._parent.blocksize if blocksize is None: - raise RuntimeError("BatchArray blocksize is not initialized") + raise RuntimeError("ObjectArray blocksize is not initialized") block_index, item_index = divmod(index, blocksize) return blocks[block_index][item_index] @@ -92,7 +92,7 @@ def __repr__(self) -> str: return f"Batch(len={len(self)}, nbytes={self.nbytes}, cbytes={self.cbytes})" -class BatchArray: +class ObjectArray: """A batched variable-length array backed by an :class:`blosc2.SChunk`.""" @staticmethod @@ -109,14 +109,14 @@ def _set_typesize_one(cparams: blosc2.CParams | dict | None) -> blosc2.CParams | if isinstance(cparams, blosc2.CParams): cparams.typesize = 1 if auto_use_dict and cparams.codec == blosc2.Codec.ZSTD and cparams.clevel > 0: - # BatchArray stores many small serialized payloads, where Zstd dicts help materially. + # ObjectArray stores many small serialized payloads, where Zstd dicts help materially. cparams.use_dict = True else: cparams["typesize"] = 1 codec = cparams.get("codec", blosc2.Codec.ZSTD) clevel = cparams.get("clevel", 5) if auto_use_dict and codec == blosc2.Codec.ZSTD and clevel > 0: - # BatchArray stores many small serialized payloads, where Zstd dicts help materially. + # ObjectArray stores many small serialized payloads, where Zstd dicts help materially. 
cparams["use_dict"] = True return cparams @@ -142,9 +142,9 @@ def _coerce_storage(storage: blosc2.Storage | dict | None, kwargs: dict[str, Any @staticmethod def _validate_storage(storage: blosc2.Storage) -> None: if storage.mmap_mode not in (None, "r"): - raise ValueError("For BatchArray containers, mmap_mode must be None or 'r'") + raise ValueError("For ObjectArray containers, mmap_mode must be None or 'r'") if storage.mmap_mode == "r" and storage.mode != "r": - raise ValueError("For BatchArray containers, mmap_mode='r' requires mode='r'") + raise ValueError("For ObjectArray containers, mmap_mode='r' requires mode='r'") def _attach_schunk(self, schunk: blosc2.SChunk) -> None: self.schunk = schunk @@ -197,7 +197,7 @@ def __init__( if kwargs: unexpected = ", ".join(sorted(kwargs)) - raise ValueError(f"Unsupported BatchArray keyword argument(s): {unexpected}") + raise ValueError(f"Unsupported ObjectArray keyword argument(s): {unexpected}") self._validate_storage(storage) cparams = self._set_typesize_one(cparams) @@ -209,7 +209,7 @@ def __init__( return fixed_meta = dict(storage.meta or {}) - fixed_meta["batcharray"] = dict(_BATCHARRAY_META) + fixed_meta["objectarray"] = dict(_OBJECTARRAY_META) storage.meta = fixed_meta schunk = blosc2.SChunk(chunksize=-1, data=None, cparams=cparams, dparams=dparams, storage=storage) self._attach_schunk(schunk) @@ -217,14 +217,14 @@ def __init__( self._store_layout() def _validate_tag(self) -> None: - if "batcharray" not in self.schunk.meta: - raise ValueError("The supplied SChunk is not tagged as a BatchArray") + if "objectarray" not in self.schunk.meta: + raise ValueError("The supplied SChunk is not tagged as an ObjectArray") def _load_layout(self) -> None: layout = None self._layout_format = None - if _BATCHARRAY_LAYOUT_KEY in self.vlmeta: - layout = self.vlmeta[_BATCHARRAY_LAYOUT_KEY] + if _OBJECTARRAY_LAYOUT_KEY in self.vlmeta: + layout = self.vlmeta[_OBJECTARRAY_LAYOUT_KEY] if isinstance(layout, dict): self._chunksize = 
layout.get("chunksize") self._blocksize = layout.get("blocksize") @@ -251,24 +251,24 @@ def _store_layout(self) -> None: "format": self._layout_format or "batched_vlblocks", "sizing_policy": "l2_cache_prefix", } - self.vlmeta[_BATCHARRAY_LAYOUT_KEY] = layout + self.vlmeta[_OBJECTARRAY_LAYOUT_KEY] = layout def _check_writable(self) -> None: if self.mode == "r": - raise ValueError("Cannot modify a BatchArray opened in read-only mode") + raise ValueError("Cannot modify an ObjectArray opened in read-only mode") def _normalize_index(self, index: int) -> int: if not isinstance(index, int): - raise TypeError("BatchArray indices must be integers") + raise TypeError("ObjectArray indices must be integers") if index < 0: index += len(self) if index < 0 or index >= len(self): - raise IndexError("BatchArray index out of range") + raise IndexError("ObjectArray index out of range") return index def _normalize_insert_index(self, index: int) -> int: if not isinstance(index, int): - raise TypeError("BatchArray indices must be integers") + raise TypeError("ObjectArray indices must be integers") if index < 0: index += len(self) if index < 0: @@ -285,19 +285,19 @@ def _copy_meta(self) -> dict[str, Any]: def _normalize_batch(self, value: object) -> list[Any]: if isinstance(value, (str, bytes, bytearray, memoryview)): - raise TypeError("BatchArray entries must be sequences of Python objects") + raise TypeError("ObjectArray entries must be sequences of Python objects") if not isinstance(value, Sequence): - raise TypeError("BatchArray entries must be sequences of Python objects") + raise TypeError("ObjectArray entries must be sequences of Python objects") values = list(value) if len(values) == 0: - raise ValueError("BatchArray entries cannot be empty") + raise ValueError("ObjectArray entries cannot be empty") return values def _ensure_layout_for_batch(self, batch: list[Any]) -> None: if self._chunksize is None: self._chunksize = len(batch) if len(batch) != self._chunksize: - raise 
ValueError(f"BatchArray entries must contain exactly {self._chunksize} objects") + raise ValueError(f"ObjectArray entries must contain exactly {self._chunksize} objects") if self._blocksize is None: payload_sizes = [len(msgpack_packb(item)) for item in batch] self._blocksize = self._guess_blocksize(payload_sizes) @@ -305,7 +305,7 @@ def _ensure_layout_for_batch(self, batch: list[Any]) -> None: def _guess_blocksize(self, payload_sizes: list[int]) -> int: if not payload_sizes: - raise ValueError("BatchArray entries cannot be empty") + raise ValueError("ObjectArray entries cannot be empty") l2_cache_size = blosc2.cpu_info.get("l2_cache_size") if not isinstance(l2_cache_size, int) or l2_cache_size <= 0: return len(payload_sizes) @@ -338,7 +338,7 @@ def _vl_dparams_kwargs(self) -> dict[str, Any]: def _compress_batch(self, batch: list[Any]) -> bytes: if self._blocksize is None: - raise RuntimeError("BatchArray blocksize is not initialized") + raise RuntimeError("ObjectArray blocksize is not initialized") blocks = [ self._serialize_block(batch[i : i + self._blocksize]) for i in range(0, len(batch), self._blocksize) @@ -390,7 +390,7 @@ def pop(self, index: int = -1) -> list[Any]: """Remove and return the batch at ``index``.""" self._check_writable() if isinstance(index, slice): - raise NotImplementedError("Slicing is not supported for BatchArray") + raise NotImplementedError("Slicing is not supported for ObjectArray") index = self._normalize_index(index) value = self[index][:] self.schunk.delete_chunk(index) @@ -516,12 +516,12 @@ def contiguous(self) -> bool: @property def info(self) -> InfoReporter: - """Print information about this BatchArray.""" + """Print information about this ObjectArray.""" return InfoReporter(self) @property def info_items(self) -> list: - """A list of tuples with summary information about this BatchArray.""" + """A list of tuples with summary information about this ObjectArray.""" batch_lengths = self._batch_lengths() nitems = sum(batch_lengths) 
avg_batch_len = nitems / len(batch_lengths) if batch_lengths else 0.0 @@ -544,7 +544,7 @@ def info_items(self) -> list: def to_cframe(self) -> bytes: return self.schunk.to_cframe() - def copy(self, **kwargs: Any) -> BatchArray: + def copy(self, **kwargs: Any) -> ObjectArray: """Create a copy of the container with optional constructor overrides.""" if "meta" in kwargs: raise ValueError("meta should not be passed to copy") @@ -560,25 +560,25 @@ def copy(self, **kwargs: Any) -> BatchArray: if "urlpath" in kwargs and "mode" not in kwargs: kwargs["mode"] = "w" - out = BatchArray(**kwargs) + out = ObjectArray(**kwargs) if "storage" not in kwargs and len(self.vlmeta) > 0: for key, value in self.vlmeta.getall().items(): out.vlmeta[key] = value out.extend(self) return out - def __enter__(self) -> BatchArray: + def __enter__(self) -> ObjectArray: return self def __exit__(self, exc_type, exc_val, exc_tb) -> bool: return False def __repr__(self) -> str: - return f"BatchArray(len={len(self)}, urlpath={self.urlpath!r})" + return f"ObjectArray(len={len(self)}, urlpath={self.urlpath!r})" -def batcharray_from_cframe(cframe: bytes, copy: bool = True) -> BatchArray: - """Deserialize a CFrame buffer into a :class:`BatchArray`.""" +def objectarray_from_cframe(cframe: bytes, copy: bool = True) -> ObjectArray: + """Deserialize a CFrame buffer into a :class:`ObjectArray`.""" schunk = blosc2.schunk_from_cframe(cframe, copy=copy) - return BatchArray(_from_schunk=schunk) + return ObjectArray(_from_schunk=schunk) diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index a422928a..50428dd0 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -1621,10 +1621,10 @@ def _process_opened_object(res): return VLArray(_from_schunk=getattr(res, "schunk", res)) - if "batcharray" in meta: - from blosc2.batch_array import BatchArray + if "objectarray" in meta: + from blosc2.object_array import ObjectArray - return BatchArray(_from_schunk=getattr(res, "schunk", res)) + return 
ObjectArray(_from_schunk=getattr(res, "schunk", res)) if isinstance(res, blosc2.NDArray) and "LazyArray" in res.schunk.meta: return blosc2._open_lazyarray(res) @@ -1637,7 +1637,7 @@ def open( ) -> ( blosc2.SChunk | blosc2.NDArray - | blosc2.BatchArray + | blosc2.ObjectArray | blosc2.VLArray | blosc2.C2Array | blosc2.LazyArray diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index 82b8b3de..3efcc19f 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -227,7 +227,7 @@ def _validate_key(self, key: str) -> str: return key def __setitem__( - self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray ) -> None: """Add a node with hierarchical key validation. @@ -272,7 +272,7 @@ def __setitem__( def __getitem__( self, key: str - ) -> NDArray | C2Array | SChunk | blosc2.VLArray | blosc2.BatchArray | TreeStore: + ) -> NDArray | C2Array | SChunk | blosc2.VLArray | blosc2.ObjectArray | TreeStore: """Retrieve a node or subtree view. If the key points to a subtree (intermediate path with children), @@ -286,7 +286,7 @@ def __getitem__( Returns ------- - out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or blosc2.BatchArray or TreeStore + out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or blosc2.ObjectArray or TreeStore The stored array/chunk if key is a leaf node, or a TreeStore subtree view if key is an intermediate path with children. 
diff --git a/tests/test_batch_array.py b/tests/test_object_array.py similarity index 79% rename from tests/test_batch_array.py rename to tests/test_object_array.py index 9ec692fb..4215801f 100644 --- a/tests/test_batch_array.py +++ b/tests/test_object_array.py @@ -32,15 +32,15 @@ def _storage(contiguous, urlpath, mode="w"): [ (False, None), (True, None), - (True, "test_batcharray.b2frame"), - (False, "test_batcharray_s.b2frame"), + (True, "test_objectarray.b2frame"), + (False, "test_objectarray_s.b2frame"), ], ) -def test_batcharray_roundtrip(contiguous, urlpath): +def test_objectarray_roundtrip(contiguous, urlpath): blosc2.remove_urlpath(urlpath) - barray = blosc2.BatchArray(storage=_storage(contiguous, urlpath)) - assert barray.meta["batcharray"]["serializer"] == "msgpack" + barray = blosc2.ObjectArray(storage=_storage(contiguous, urlpath)) + assert barray.meta["objectarray"]["serializer"] == "msgpack" for i, batch in enumerate(BATCHES, start=1): assert barray.append(batch) == i @@ -82,7 +82,7 @@ def test_batcharray_roundtrip(contiguous, urlpath): if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") - assert isinstance(reopened, blosc2.BatchArray) + assert isinstance(reopened, blosc2.ObjectArray) assert reopened.chunksize == barray.chunksize assert reopened.blocksize == barray.blocksize assert [batch[:] for batch in reopened] == expected @@ -110,14 +110,14 @@ def test_batcharray_roundtrip(contiguous, urlpath): if contiguous: reopened_mmap = blosc2.open(urlpath, mode="r", mmap_mode="r") - assert isinstance(reopened_mmap, blosc2.BatchArray) + assert isinstance(reopened_mmap, blosc2.ObjectArray) assert [batch[:] for batch in reopened_mmap] == expected blosc2.remove_urlpath(urlpath) -def test_batcharray_from_cframe(): - barray = blosc2.BatchArray() +def test_objectarray_from_cframe(): + barray = blosc2.ObjectArray() barray.extend(BATCHES) barray.insert(1, ["inserted", True, None]) del barray[3] @@ -126,16 +126,16 @@ def test_batcharray_from_cframe(): del 
expected[3] restored = blosc2.from_cframe(barray.to_cframe()) - assert isinstance(restored, blosc2.BatchArray) + assert isinstance(restored, blosc2.ObjectArray) assert [batch[:] for batch in restored] == expected - restored2 = blosc2.batcharray_from_cframe(barray.to_cframe()) - assert isinstance(restored2, blosc2.BatchArray) + restored2 = blosc2.objectarray_from_cframe(barray.to_cframe()) + assert isinstance(restored2, blosc2.ObjectArray) assert [batch[:] for batch in restored2] == expected -def test_batcharray_info(): - barray = blosc2.BatchArray() +def test_objectarray_info(): + barray = blosc2.ObjectArray() barray.extend(BATCHES) assert barray.typesize == 1 @@ -143,7 +143,7 @@ def test_batcharray_info(): assert barray.urlpath == barray.schunk.urlpath items = dict(barray.info_items) - assert items["type"] == "BatchArray" + assert items["type"] == "ObjectArray" assert items["nbatches"] == len(BATCHES) assert items["chunksize"] == len(BATCHES[0]) assert items["blocksize"] == barray.blocksize @@ -160,37 +160,37 @@ def test_batcharray_info(): text = repr(barray.info) assert "type" in text - assert "BatchArray" in text + assert "ObjectArray" in text assert "batch_len_avg" in text -def test_batcharray_zstd_uses_dict_by_default(): - barray = blosc2.BatchArray() +def test_objectarray_zstd_uses_dict_by_default(): + barray = blosc2.ObjectArray() assert barray.cparams.codec == blosc2.Codec.ZSTD assert barray.cparams.use_dict is True -def test_batcharray_explicit_chunksize_blocksize(): - barray = blosc2.BatchArray(chunksize=3, blocksize=2) +def test_objectarray_explicit_chunksize_blocksize(): + barray = blosc2.ObjectArray(chunksize=3, blocksize=2) assert barray.chunksize == 3 assert barray.blocksize == 2 barray.append([1, 2, 3]) assert [batch[:] for batch in barray] == [[1, 2, 3]] -def test_batcharray_respects_explicit_use_dict_and_non_zstd(): - barray = blosc2.BatchArray(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) +def 
test_objectarray_respects_explicit_use_dict_and_non_zstd(): + barray = blosc2.ObjectArray(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) assert barray.cparams.codec == blosc2.Codec.LZ4 assert barray.cparams.use_dict is False - barray = blosc2.BatchArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) + barray = blosc2.ObjectArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) assert barray.cparams.codec == blosc2.Codec.ZSTD assert barray.cparams.use_dict is False - barray = blosc2.BatchArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 5, "use_dict": False}) + barray = blosc2.ObjectArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 5, "use_dict": False}) assert barray.cparams.use_dict is False - barray = blosc2.BatchArray(cparams=blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=5, use_dict=False)) + barray = blosc2.ObjectArray(cparams=blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=5, use_dict=False)) assert barray.cparams.use_dict is False @@ -231,14 +231,14 @@ def test_vlcompress_small_blocks_roundtrip(): assert out == payloads -def test_batcharray_constructor_kwargs(): - urlpath = "test_batcharray_kwargs.b2frame" +def test_objectarray_constructor_kwargs(): + urlpath = "test_objectarray_kwargs.b2frame" blosc2.remove_urlpath(urlpath) - barray = blosc2.BatchArray(urlpath=urlpath, mode="w", contiguous=True) + barray = blosc2.ObjectArray(urlpath=urlpath, mode="w", contiguous=True) barray.extend(BATCHES) - reopened = blosc2.BatchArray(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") + reopened = blosc2.ObjectArray(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") assert [batch[:] for batch in reopened] == BATCHES blosc2.remove_urlpath(urlpath) @@ -249,14 +249,14 @@ def test_batcharray_constructor_kwargs(): [ (False, None), (True, None), - (True, "test_batcharray_list_ops.b2frame"), - (False, "test_batcharray_list_ops_s.b2frame"), + (True, "test_objectarray_list_ops.b2frame"), + (False, "test_objectarray_list_ops_s.b2frame"), ], ) -def 
test_batcharray_list_like_ops(contiguous, urlpath): +def test_objectarray_list_like_ops(contiguous, urlpath): blosc2.remove_urlpath(urlpath) - barray = blosc2.BatchArray(storage=_storage(contiguous, urlpath)) + barray = blosc2.ObjectArray(storage=_storage(contiguous, urlpath)) barray.extend([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) assert [batch[:] for batch in barray] == [[1, 2, 3], [4, 5, 6], [7, 8, 9]] assert barray.pop() == [7, 8, 9] @@ -282,15 +282,15 @@ def test_batcharray_list_like_ops(contiguous, urlpath): [ (False, None), (True, None), - (True, "test_batcharray_slices.b2frame"), - (False, "test_batcharray_slices_s.b2frame"), + (True, "test_objectarray_slices.b2frame"), + (False, "test_objectarray_slices_s.b2frame"), ], ) -def test_batcharray_slices(contiguous, urlpath): +def test_objectarray_slices(contiguous, urlpath): blosc2.remove_urlpath(urlpath) expected = [[i, i + 100, i + 200] for i in range(8)] - barray = blosc2.BatchArray(storage=_storage(contiguous, urlpath)) + barray = blosc2.ObjectArray(storage=_storage(contiguous, urlpath)) barray.extend(expected) assert [batch[:] for batch in barray[1:6:2]] == expected[1:6:2] @@ -319,8 +319,8 @@ def test_batcharray_slices(contiguous, urlpath): blosc2.remove_urlpath(urlpath) -def test_batcharray_slice_errors(): - barray = blosc2.BatchArray() +def test_objectarray_slice_errors(): + barray = blosc2.ObjectArray() barray.extend([[0], [1], [2], [3]]) with pytest.raises(ValueError, match="extended slice"): @@ -331,13 +331,13 @@ def test_batcharray_slice_errors(): _ = barray[::0] -def test_batcharray_copy(): - urlpath = "test_batcharray_copy.b2frame" - copy_path = "test_batcharray_copy_out.b2frame" +def test_objectarray_copy(): + urlpath = "test_objectarray_copy.b2frame" + copy_path = "test_objectarray_copy_out.b2frame" blosc2.remove_urlpath(urlpath) blosc2.remove_urlpath(copy_path) - original = blosc2.BatchArray(urlpath=urlpath, mode="w", contiguous=True) + original = blosc2.ObjectArray(urlpath=urlpath, mode="w", 
contiguous=True) original.extend(BATCHES) original.insert(1, ["copy", True, 123]) @@ -362,7 +362,7 @@ def test_batcharray_copy(): @pytest.mark.parametrize(("contiguous", "nthreads"), [(False, 2), (True, 4)]) -def test_batcharray_multithreaded_inner_vl(contiguous, nthreads): +def test_objectarray_multithreaded_inner_vl(contiguous, nthreads): batches = [] for batch_id in range(24): batch = [] @@ -379,7 +379,7 @@ def test_batcharray_multithreaded_inner_vl(contiguous, nthreads): ) batches.append(batch) - barray = blosc2.BatchArray( + barray = blosc2.ObjectArray( storage=blosc2.Storage(contiguous=contiguous), cparams=blosc2.CParams(typesize=1, nthreads=nthreads, codec=blosc2.Codec.ZSTD, clevel=5), dparams=blosc2.DParams(nthreads=nthreads), @@ -390,8 +390,8 @@ def test_batcharray_multithreaded_inner_vl(contiguous, nthreads): assert [barray[i][:] for i in range(len(barray))] == batches -def test_batcharray_validation_errors(): - barray = blosc2.BatchArray() +def test_objectarray_validation_errors(): + barray = blosc2.ObjectArray() with pytest.raises(TypeError): barray.append("value") @@ -402,7 +402,7 @@ def test_batcharray_validation_errors(): with pytest.raises(IndexError): barray.delete(3) with pytest.raises(IndexError): - blosc2.BatchArray().pop() + blosc2.ObjectArray().pop() barray.extend([[1, 2, 3]]) with pytest.raises(ValueError): barray.append([2, 3]) @@ -410,29 +410,29 @@ def test_batcharray_validation_errors(): barray.pop(slice(0, 1)) -def test_batcharray_in_embed_store(): +def test_objectarray_in_embed_store(): estore = blosc2.EmbedStore() - barray = blosc2.BatchArray() + barray = blosc2.ObjectArray() barray.extend(BATCHES) estore["/batch"] = barray restored = estore["/batch"] - assert isinstance(restored, blosc2.BatchArray) + assert isinstance(restored, blosc2.ObjectArray) assert [batch[:] for batch in restored] == BATCHES -def test_batcharray_in_dict_store(): - path = "test_batcharray_store.b2z" +def test_objectarray_in_dict_store(): + path = 
"test_objectarray_store.b2z" blosc2.remove_urlpath(path) with blosc2.DictStore(path, mode="w", threshold=1) as dstore: - barray = blosc2.BatchArray() + barray = blosc2.ObjectArray() barray.extend(BATCHES) dstore["/batch"] = barray with blosc2.DictStore(path, mode="r") as dstore: restored = dstore["/batch"] - assert isinstance(restored, blosc2.BatchArray) + assert isinstance(restored, blosc2.ObjectArray) assert [batch[:] for batch in restored] == BATCHES blosc2.remove_urlpath(path) From bf1b935c0b26b22c8bcfd02076942267ca62de4d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 17:05:00 +0100 Subject: [PATCH 12/34] Remove legacy fallback --- src/blosc2/object_array.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/blosc2/object_array.py b/src/blosc2/object_array.py index decffc18..21091ac7 100644 --- a/src/blosc2/object_array.py +++ b/src/blosc2/object_array.py @@ -232,14 +232,7 @@ def _load_layout(self) -> None: return if len(self) == 0: return - # Legacy fallback: one object per VL block. 
- first_nbytes, _, nblocks = blosc2.get_cbuffer_sizes(self.schunk.get_lazychunk(0)) - if nblocks <= 0 or first_nbytes <= 0: - return - self._chunksize = nblocks - self._blocksize = 1 - self._layout_format = "legacy_vlblocks" - self._store_layout() + raise ValueError("ObjectArray layout metadata is missing") def _store_layout(self) -> None: if self._chunksize is None or self.mode == "r": @@ -349,8 +342,6 @@ def _decode_blocks(self, nchunk: int) -> list[list[Any]]: block_payloads = blosc2.blosc2_ext.vldecompress( self.schunk.get_chunk(nchunk), **self._vl_dparams_kwargs() ) - if self._layout_format == "legacy_vlblocks": - return [[msgpack_unpackb(payload)] for payload in block_payloads] return [msgpack_unpackb(payload) for payload in block_payloads] def _get_batch(self, index: int) -> Batch: From 93378e50b26c2b2512a3e69f1782cca3578d9e53 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 17:06:00 +0100 Subject: [PATCH 13/34] Update to latest c-blosc2 --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f053288..133e06dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,9 +118,9 @@ else() set(BLOSC_INSTALL ON) include(FetchContent) FetchContent_Declare(blosc2 - #GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - #GIT_TAG 7f6f1204784404099c767371245bd12cd4570c7c # variable-length chunks/blocks - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 + GIT_REPOSITORY https://github.com/Blosc/c-blosc2 + GIT_TAG fb31a6ab43db2a26ba6d1c690166988b680c3fd7 # variable-length chunks/blocks + # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) include_directories("${blosc2_SOURCE_DIR}/include") From 99be79772bcf32455cf4b9807a764d0dad18a1b9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 17:25:39 +0100 Subject: [PATCH 14/34] ObjectArray -> ObjectStore --- src/blosc2/__init__.py | 5 +- src/blosc2/core.py | 10 +- 
src/blosc2/dict_store.py | 18 +-- src/blosc2/embed_store.py | 10 +- .../{object_array.py => object_store.py} | 69 +++++------ src/blosc2/schunk.py | 8 +- src/blosc2/tree_store.py | 6 +- ...t_object_array.py => test_object_store.py} | 110 +++++++++--------- 8 files changed, 114 insertions(+), 122 deletions(-) rename src/blosc2/{object_array.py => object_store.py} (90%) rename tests/{test_object_array.py => test_object_store.py} (81%) diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 1f058757..e750d9d1 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -535,7 +535,7 @@ def _raise(exc): from .embed_store import EmbedStore, estore_from_cframe from .dict_store import DictStore from .tree_store import TreeStore -from .object_array import Batch, ObjectArray, objectarray_from_cframe +from .object_store import Batch, ObjectStore from .vlarray import VLArray, vlarray_from_cframe from .c2array import c2context, C2Array, URLPath @@ -721,7 +721,7 @@ def _raise(exc): "C2Array", "CParams", "Batch", - "ObjectArray", + "ObjectStore", # Enums "Codec", "DParams", @@ -944,7 +944,6 @@ def _raise(exc): "validate_expr", "var", "vecdot", - "objectarray_from_cframe", "vlarray_from_cframe", "where", "zeros", diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 6e37b139..c37fb1b1 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1918,9 +1918,9 @@ def ndarray_from_cframe(cframe: bytes | str, copy: bool = False) -> blosc2.NDArr def from_cframe( cframe: bytes | str, copy: bool = True -) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk | blosc2.ObjectArray | blosc2.VLArray: +) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk | blosc2.ObjectStore | blosc2.VLArray: """Create a :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk `, - :ref:`ObjectArray ` or :ref:`VLArray ` instance + :ref:`ObjectStore ` or :ref:`VLArray ` instance from a contiguous frame buffer. 
Parameters @@ -1938,7 +1938,7 @@ def from_cframe( Returns ------- out: :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk `, - :ref:`ObjectArray ` or :ref:`VLArray ` + :ref:`ObjectStore ` or :ref:`VLArray ` A new instance of the appropriate type containing the data passed. See Also @@ -1952,8 +1952,8 @@ def from_cframe( # Check the metalayer to determine the type if "b2embed" in schunk.meta: return blosc2.estore_from_cframe(cframe, copy=copy) - if "objectarray" in schunk.meta: - return blosc2.objectarray_from_cframe(cframe, copy=copy) + if "objectstore" in schunk.meta: + return blosc2.ObjectStore(_from_schunk=schunk_from_cframe(cframe, copy=copy)) if "vlarray" in schunk.meta: return blosc2.vlarray_from_cframe(cframe, copy=copy) if "b2nd" in schunk.meta: diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 8ed2fac9..018c9fe8 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -249,25 +249,25 @@ def estore(self) -> EmbedStore: return self._estore @staticmethod - def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray) -> int: - if isinstance(value, (blosc2.VLArray, blosc2.ObjectArray)): + def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore) -> int: + if isinstance(value, (blosc2.VLArray, blosc2.ObjectStore)): return value.schunk.nbytes return value.nbytes @staticmethod - def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray) -> bool: - return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray, blosc2.ObjectArray)) and bool( + def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore) -> bool: + return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray, blosc2.ObjectStore)) and bool( getattr(value, "urlpath", None) ) @staticmethod - def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray) -> str: + def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray 
| blosc2.ObjectStore) -> str: if isinstance(value, blosc2.NDArray): return ".b2nd" return ".b2f" def __setitem__( - self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore ) -> None: """Add a node to the DictStore.""" if isinstance(value, np.ndarray): @@ -294,7 +294,7 @@ def __setitem__( if hasattr(value, "save"): value.save(urlpath=dest_path) else: - # SChunk, VLArray and ObjectArray can all be persisted via their cframe. + # SChunk, VLArray and ObjectStore can all be persisted via their cframe. with open(dest_path, "wb") as f: f.write(value.to_cframe()) else: @@ -314,7 +314,7 @@ def __setitem__( def __getitem__( self, key: str - ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray | C2Array: + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore | C2Array: """Retrieve a node from the DictStore.""" # Check map_tree first if key in self.map_tree: @@ -346,7 +346,7 @@ def __getitem__( def get( self, key: str, default: Any = None - ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray | C2Array | Any: + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore | C2Array | Any: """Retrieve a node, or default if not found.""" try: return self[key] diff --git a/src/blosc2/embed_store.py b/src/blosc2/embed_store.py index e54967fb..7a062b52 100644 --- a/src/blosc2/embed_store.py +++ b/src/blosc2/embed_store.py @@ -174,7 +174,7 @@ def _ensure_capacity(self, needed_bytes: int) -> None: self._store.resize((new_size,)) def __setitem__( - self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore ) -> None: """Add a node to the embed store.""" if self.mode == "r": @@ -198,7 +198,7 @@ def __setitem__( self._embed_map[key] = {"offset": offset, "length": data_len} self._save_metadata() - def __getitem__(self, key: 
str) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray: + def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore: """Retrieve a node from the embed store.""" if key not in self._embed_map: raise KeyError(f"Key '{key}' not found in the embed store.") @@ -216,7 +216,7 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | bl def get( self, key: str, default: Any = None - ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray | Any: + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore | Any: """Retrieve a node, or default if not found.""" return self[key] if key in self._embed_map else default @@ -243,12 +243,12 @@ def keys(self) -> KeysView[str]: """Return all keys.""" return self._embed_map.keys() - def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray]: + def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore]: """Iterate over all values.""" for key in self._embed_map: yield self[key] - def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectArray]]: + def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore]]: """Iterate over (key, value) pairs.""" for key in self._embed_map: yield key, self[key] diff --git a/src/blosc2/object_array.py b/src/blosc2/object_store.py similarity index 90% rename from src/blosc2/object_array.py rename to src/blosc2/object_store.py index 21091ac7..f7ed8b47 100644 --- a/src/blosc2/object_array.py +++ b/src/blosc2/object_store.py @@ -18,7 +18,7 @@ from blosc2.info import InfoReporter, format_nbytes_info _OBJECTARRAY_META = {"version": 2, "serializer": "msgpack", "format": "batched_vlblocks"} -_OBJECTARRAY_LAYOUT_KEY = "objectarray" +_OBJECTARRAY_LAYOUT_KEY = "objectstore" def _check_serialized_size(buffer: bytes) -> None: @@ -27,9 +27,9 @@ def _check_serialized_size(buffer: 
bytes) -> None: class Batch(Sequence[Any]): - """A lazy sequence of Python objects stored in one ObjectArray chunk.""" + """A lazy sequence of Python objects stored in one ObjectStore chunk.""" - def __init__(self, parent: ObjectArray, nchunk: int, lazychunk: bytes) -> None: + def __init__(self, parent: ObjectStore, nchunk: int, lazychunk: bytes) -> None: self._parent = parent self._nchunk = nchunk self._lazychunk = lazychunk @@ -58,7 +58,7 @@ def __getitem__(self, index: int | slice) -> Any | list[Any]: index = self._normalize_index(index) blocksize = self._parent.blocksize if blocksize is None: - raise RuntimeError("ObjectArray blocksize is not initialized") + raise RuntimeError("ObjectStore blocksize is not initialized") block_index, item_index = divmod(index, blocksize) return blocks[block_index][item_index] @@ -92,7 +92,7 @@ def __repr__(self) -> str: return f"Batch(len={len(self)}, nbytes={self.nbytes}, cbytes={self.cbytes})" -class ObjectArray: +class ObjectStore: """A batched variable-length array backed by an :class:`blosc2.SChunk`.""" @staticmethod @@ -109,14 +109,14 @@ def _set_typesize_one(cparams: blosc2.CParams | dict | None) -> blosc2.CParams | if isinstance(cparams, blosc2.CParams): cparams.typesize = 1 if auto_use_dict and cparams.codec == blosc2.Codec.ZSTD and cparams.clevel > 0: - # ObjectArray stores many small serialized payloads, where Zstd dicts help materially. + # ObjectStore stores many small serialized payloads, where Zstd dicts help materially. cparams.use_dict = True else: cparams["typesize"] = 1 codec = cparams.get("codec", blosc2.Codec.ZSTD) clevel = cparams.get("clevel", 5) if auto_use_dict and codec == blosc2.Codec.ZSTD and clevel > 0: - # ObjectArray stores many small serialized payloads, where Zstd dicts help materially. + # ObjectStore stores many small serialized payloads, where Zstd dicts help materially. 
cparams["use_dict"] = True return cparams @@ -142,9 +142,9 @@ def _coerce_storage(storage: blosc2.Storage | dict | None, kwargs: dict[str, Any @staticmethod def _validate_storage(storage: blosc2.Storage) -> None: if storage.mmap_mode not in (None, "r"): - raise ValueError("For ObjectArray containers, mmap_mode must be None or 'r'") + raise ValueError("For ObjectStore containers, mmap_mode must be None or 'r'") if storage.mmap_mode == "r" and storage.mode != "r": - raise ValueError("For ObjectArray containers, mmap_mode='r' requires mode='r'") + raise ValueError("For ObjectStore containers, mmap_mode='r' requires mode='r'") def _attach_schunk(self, schunk: blosc2.SChunk) -> None: self.schunk = schunk @@ -197,7 +197,7 @@ def __init__( if kwargs: unexpected = ", ".join(sorted(kwargs)) - raise ValueError(f"Unsupported ObjectArray keyword argument(s): {unexpected}") + raise ValueError(f"Unsupported ObjectStore keyword argument(s): {unexpected}") self._validate_storage(storage) cparams = self._set_typesize_one(cparams) @@ -209,7 +209,7 @@ def __init__( return fixed_meta = dict(storage.meta or {}) - fixed_meta["objectarray"] = dict(_OBJECTARRAY_META) + fixed_meta["objectstore"] = dict(_OBJECTARRAY_META) storage.meta = fixed_meta schunk = blosc2.SChunk(chunksize=-1, data=None, cparams=cparams, dparams=dparams, storage=storage) self._attach_schunk(schunk) @@ -217,8 +217,8 @@ def __init__( self._store_layout() def _validate_tag(self) -> None: - if "objectarray" not in self.schunk.meta: - raise ValueError("The supplied SChunk is not tagged as an ObjectArray") + if "objectstore" not in self.schunk.meta: + raise ValueError("The supplied SChunk is not tagged as an ObjectStore") def _load_layout(self) -> None: layout = None @@ -232,7 +232,7 @@ def _load_layout(self) -> None: return if len(self) == 0: return - raise ValueError("ObjectArray layout metadata is missing") + raise ValueError("ObjectStore layout metadata is missing") def _store_layout(self) -> None: if self._chunksize 
is None or self.mode == "r": @@ -248,20 +248,20 @@ def _store_layout(self) -> None: def _check_writable(self) -> None: if self.mode == "r": - raise ValueError("Cannot modify an ObjectArray opened in read-only mode") + raise ValueError("Cannot modify an ObjectStore opened in read-only mode") def _normalize_index(self, index: int) -> int: if not isinstance(index, int): - raise TypeError("ObjectArray indices must be integers") + raise TypeError("ObjectStore indices must be integers") if index < 0: index += len(self) if index < 0 or index >= len(self): - raise IndexError("ObjectArray index out of range") + raise IndexError("ObjectStore index out of range") return index def _normalize_insert_index(self, index: int) -> int: if not isinstance(index, int): - raise TypeError("ObjectArray indices must be integers") + raise TypeError("ObjectStore indices must be integers") if index < 0: index += len(self) if index < 0: @@ -278,19 +278,19 @@ def _copy_meta(self) -> dict[str, Any]: def _normalize_batch(self, value: object) -> list[Any]: if isinstance(value, (str, bytes, bytearray, memoryview)): - raise TypeError("ObjectArray entries must be sequences of Python objects") + raise TypeError("ObjectStore entries must be sequences of Python objects") if not isinstance(value, Sequence): - raise TypeError("ObjectArray entries must be sequences of Python objects") + raise TypeError("ObjectStore entries must be sequences of Python objects") values = list(value) if len(values) == 0: - raise ValueError("ObjectArray entries cannot be empty") + raise ValueError("ObjectStore entries cannot be empty") return values def _ensure_layout_for_batch(self, batch: list[Any]) -> None: if self._chunksize is None: self._chunksize = len(batch) if len(batch) != self._chunksize: - raise ValueError(f"ObjectArray entries must contain exactly {self._chunksize} objects") + raise ValueError(f"ObjectStore entries must contain exactly {self._chunksize} objects") if self._blocksize is None: payload_sizes = 
[len(msgpack_packb(item)) for item in batch] self._blocksize = self._guess_blocksize(payload_sizes) @@ -298,7 +298,7 @@ def _ensure_layout_for_batch(self, batch: list[Any]) -> None: def _guess_blocksize(self, payload_sizes: list[int]) -> int: if not payload_sizes: - raise ValueError("ObjectArray entries cannot be empty") + raise ValueError("ObjectStore entries cannot be empty") l2_cache_size = blosc2.cpu_info.get("l2_cache_size") if not isinstance(l2_cache_size, int) or l2_cache_size <= 0: return len(payload_sizes) @@ -331,7 +331,7 @@ def _vl_dparams_kwargs(self) -> dict[str, Any]: def _compress_batch(self, batch: list[Any]) -> bytes: if self._blocksize is None: - raise RuntimeError("ObjectArray blocksize is not initialized") + raise RuntimeError("ObjectStore blocksize is not initialized") blocks = [ self._serialize_block(batch[i : i + self._blocksize]) for i in range(0, len(batch), self._blocksize) @@ -381,7 +381,7 @@ def pop(self, index: int = -1) -> list[Any]: """Remove and return the batch at ``index``.""" self._check_writable() if isinstance(index, slice): - raise NotImplementedError("Slicing is not supported for ObjectArray") + raise NotImplementedError("Slicing is not supported for ObjectStore") index = self._normalize_index(index) value = self[index][:] self.schunk.delete_chunk(index) @@ -507,12 +507,12 @@ def contiguous(self) -> bool: @property def info(self) -> InfoReporter: - """Print information about this ObjectArray.""" + """Print information about this ObjectStore.""" return InfoReporter(self) @property def info_items(self) -> list: - """A list of tuples with summary information about this ObjectArray.""" + """A list of tuples with summary information about this ObjectStore.""" batch_lengths = self._batch_lengths() nitems = sum(batch_lengths) avg_batch_len = nitems / len(batch_lengths) if batch_lengths else 0.0 @@ -535,7 +535,7 @@ def info_items(self) -> list: def to_cframe(self) -> bytes: return self.schunk.to_cframe() - def copy(self, **kwargs: 
Any) -> ObjectArray: + def copy(self, **kwargs: Any) -> ObjectStore: """Create a copy of the container with optional constructor overrides.""" if "meta" in kwargs: raise ValueError("meta should not be passed to copy") @@ -551,25 +551,18 @@ def copy(self, **kwargs: Any) -> ObjectArray: if "urlpath" in kwargs and "mode" not in kwargs: kwargs["mode"] = "w" - out = ObjectArray(**kwargs) + out = ObjectStore(**kwargs) if "storage" not in kwargs and len(self.vlmeta) > 0: for key, value in self.vlmeta.getall().items(): out.vlmeta[key] = value out.extend(self) return out - def __enter__(self) -> ObjectArray: + def __enter__(self) -> ObjectStore: return self def __exit__(self, exc_type, exc_val, exc_tb) -> bool: return False def __repr__(self) -> str: - return f"ObjectArray(len={len(self)}, urlpath={self.urlpath!r})" - - -def objectarray_from_cframe(cframe: bytes, copy: bool = True) -> ObjectArray: - """Deserialize a CFrame buffer into a :class:`ObjectArray`.""" - - schunk = blosc2.schunk_from_cframe(cframe, copy=copy) - return ObjectArray(_from_schunk=schunk) + return f"ObjectStore(len={len(self)}, urlpath={self.urlpath!r})" diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 50428dd0..aca106d1 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -1621,10 +1621,10 @@ def _process_opened_object(res): return VLArray(_from_schunk=getattr(res, "schunk", res)) - if "objectarray" in meta: - from blosc2.object_array import ObjectArray + if "objectstore" in meta: + from blosc2.object_store import ObjectStore - return ObjectArray(_from_schunk=getattr(res, "schunk", res)) + return ObjectStore(_from_schunk=getattr(res, "schunk", res)) if isinstance(res, blosc2.NDArray) and "LazyArray" in res.schunk.meta: return blosc2._open_lazyarray(res) @@ -1637,7 +1637,7 @@ def open( ) -> ( blosc2.SChunk | blosc2.NDArray - | blosc2.ObjectArray + | blosc2.ObjectStore | blosc2.VLArray | blosc2.C2Array | blosc2.LazyArray diff --git a/src/blosc2/tree_store.py 
b/src/blosc2/tree_store.py index 3efcc19f..9287b4fc 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -227,7 +227,7 @@ def _validate_key(self, key: str) -> str: return key def __setitem__( - self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectArray + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore ) -> None: """Add a node with hierarchical key validation. @@ -272,7 +272,7 @@ def __setitem__( def __getitem__( self, key: str - ) -> NDArray | C2Array | SChunk | blosc2.VLArray | blosc2.ObjectArray | TreeStore: + ) -> NDArray | C2Array | SChunk | blosc2.VLArray | blosc2.ObjectStore | TreeStore: """Retrieve a node or subtree view. If the key points to a subtree (intermediate path with children), @@ -286,7 +286,7 @@ def __getitem__( Returns ------- - out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or blosc2.ObjectArray or TreeStore + out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or blosc2.ObjectStore or TreeStore The stored array/chunk if key is a leaf node, or a TreeStore subtree view if key is an intermediate path with children. 
diff --git a/tests/test_object_array.py b/tests/test_object_store.py similarity index 81% rename from tests/test_object_array.py rename to tests/test_object_store.py index 4215801f..b8d1112d 100644 --- a/tests/test_object_array.py +++ b/tests/test_object_store.py @@ -32,15 +32,15 @@ def _storage(contiguous, urlpath, mode="w"): [ (False, None), (True, None), - (True, "test_objectarray.b2frame"), - (False, "test_objectarray_s.b2frame"), + (True, "test_objectstore.b2frame"), + (False, "test_objectstore_s.b2frame"), ], ) -def test_objectarray_roundtrip(contiguous, urlpath): +def test_objectstore_roundtrip(contiguous, urlpath): blosc2.remove_urlpath(urlpath) - barray = blosc2.ObjectArray(storage=_storage(contiguous, urlpath)) - assert barray.meta["objectarray"]["serializer"] == "msgpack" + barray = blosc2.ObjectStore(storage=_storage(contiguous, urlpath)) + assert barray.meta["objectstore"]["serializer"] == "msgpack" for i, batch in enumerate(BATCHES, start=1): assert barray.append(batch) == i @@ -82,7 +82,7 @@ def test_objectarray_roundtrip(contiguous, urlpath): if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") - assert isinstance(reopened, blosc2.ObjectArray) + assert isinstance(reopened, blosc2.ObjectStore) assert reopened.chunksize == barray.chunksize assert reopened.blocksize == barray.blocksize assert [batch[:] for batch in reopened] == expected @@ -110,14 +110,14 @@ def test_objectarray_roundtrip(contiguous, urlpath): if contiguous: reopened_mmap = blosc2.open(urlpath, mode="r", mmap_mode="r") - assert isinstance(reopened_mmap, blosc2.ObjectArray) + assert isinstance(reopened_mmap, blosc2.ObjectStore) assert [batch[:] for batch in reopened_mmap] == expected blosc2.remove_urlpath(urlpath) -def test_objectarray_from_cframe(): - barray = blosc2.ObjectArray() +def test_objectstore_from_cframe(): + barray = blosc2.ObjectStore() barray.extend(BATCHES) barray.insert(1, ["inserted", True, None]) del barray[3] @@ -126,16 +126,16 @@ def 
test_objectarray_from_cframe(): del expected[3] restored = blosc2.from_cframe(barray.to_cframe()) - assert isinstance(restored, blosc2.ObjectArray) + assert isinstance(restored, blosc2.ObjectStore) assert [batch[:] for batch in restored] == expected - restored2 = blosc2.objectarray_from_cframe(barray.to_cframe()) - assert isinstance(restored2, blosc2.ObjectArray) + restored2 = blosc2.from_cframe(barray.to_cframe()) + assert isinstance(restored2, blosc2.ObjectStore) assert [batch[:] for batch in restored2] == expected -def test_objectarray_info(): - barray = blosc2.ObjectArray() +def test_objectstore_info(): + barray = blosc2.ObjectStore() barray.extend(BATCHES) assert barray.typesize == 1 @@ -143,7 +143,7 @@ def test_objectarray_info(): assert barray.urlpath == barray.schunk.urlpath items = dict(barray.info_items) - assert items["type"] == "ObjectArray" + assert items["type"] == "ObjectStore" assert items["nbatches"] == len(BATCHES) assert items["chunksize"] == len(BATCHES[0]) assert items["blocksize"] == barray.blocksize @@ -160,37 +160,37 @@ def test_objectarray_info(): text = repr(barray.info) assert "type" in text - assert "ObjectArray" in text + assert "ObjectStore" in text assert "batch_len_avg" in text -def test_objectarray_zstd_uses_dict_by_default(): - barray = blosc2.ObjectArray() +def test_objectstore_zstd_uses_dict_by_default(): + barray = blosc2.ObjectStore() assert barray.cparams.codec == blosc2.Codec.ZSTD assert barray.cparams.use_dict is True -def test_objectarray_explicit_chunksize_blocksize(): - barray = blosc2.ObjectArray(chunksize=3, blocksize=2) +def test_objectstore_explicit_chunksize_blocksize(): + barray = blosc2.ObjectStore(chunksize=3, blocksize=2) assert barray.chunksize == 3 assert barray.blocksize == 2 barray.append([1, 2, 3]) assert [batch[:] for batch in barray] == [[1, 2, 3]] -def test_objectarray_respects_explicit_use_dict_and_non_zstd(): - barray = blosc2.ObjectArray(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) +def 
test_objectstore_respects_explicit_use_dict_and_non_zstd(): + barray = blosc2.ObjectStore(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) assert barray.cparams.codec == blosc2.Codec.LZ4 assert barray.cparams.use_dict is False - barray = blosc2.ObjectArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) + barray = blosc2.ObjectStore(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) assert barray.cparams.codec == blosc2.Codec.ZSTD assert barray.cparams.use_dict is False - barray = blosc2.ObjectArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 5, "use_dict": False}) + barray = blosc2.ObjectStore(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 5, "use_dict": False}) assert barray.cparams.use_dict is False - barray = blosc2.ObjectArray(cparams=blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=5, use_dict=False)) + barray = blosc2.ObjectStore(cparams=blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=5, use_dict=False)) assert barray.cparams.use_dict is False @@ -231,14 +231,14 @@ def test_vlcompress_small_blocks_roundtrip(): assert out == payloads -def test_objectarray_constructor_kwargs(): - urlpath = "test_objectarray_kwargs.b2frame" +def test_objectstore_constructor_kwargs(): + urlpath = "test_objectstore_kwargs.b2frame" blosc2.remove_urlpath(urlpath) - barray = blosc2.ObjectArray(urlpath=urlpath, mode="w", contiguous=True) + barray = blosc2.ObjectStore(urlpath=urlpath, mode="w", contiguous=True) barray.extend(BATCHES) - reopened = blosc2.ObjectArray(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") + reopened = blosc2.ObjectStore(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") assert [batch[:] for batch in reopened] == BATCHES blosc2.remove_urlpath(urlpath) @@ -249,14 +249,14 @@ def test_objectarray_constructor_kwargs(): [ (False, None), (True, None), - (True, "test_objectarray_list_ops.b2frame"), - (False, "test_objectarray_list_ops_s.b2frame"), + (True, "test_objectstore_list_ops.b2frame"), + (False, "test_objectstore_list_ops_s.b2frame"), ], 
) -def test_objectarray_list_like_ops(contiguous, urlpath): +def test_objectstore_list_like_ops(contiguous, urlpath): blosc2.remove_urlpath(urlpath) - barray = blosc2.ObjectArray(storage=_storage(contiguous, urlpath)) + barray = blosc2.ObjectStore(storage=_storage(contiguous, urlpath)) barray.extend([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) assert [batch[:] for batch in barray] == [[1, 2, 3], [4, 5, 6], [7, 8, 9]] assert barray.pop() == [7, 8, 9] @@ -282,15 +282,15 @@ def test_objectarray_list_like_ops(contiguous, urlpath): [ (False, None), (True, None), - (True, "test_objectarray_slices.b2frame"), - (False, "test_objectarray_slices_s.b2frame"), + (True, "test_objectstore_slices.b2frame"), + (False, "test_objectstore_slices_s.b2frame"), ], ) -def test_objectarray_slices(contiguous, urlpath): +def test_objectstore_slices(contiguous, urlpath): blosc2.remove_urlpath(urlpath) expected = [[i, i + 100, i + 200] for i in range(8)] - barray = blosc2.ObjectArray(storage=_storage(contiguous, urlpath)) + barray = blosc2.ObjectStore(storage=_storage(contiguous, urlpath)) barray.extend(expected) assert [batch[:] for batch in barray[1:6:2]] == expected[1:6:2] @@ -319,8 +319,8 @@ def test_objectarray_slices(contiguous, urlpath): blosc2.remove_urlpath(urlpath) -def test_objectarray_slice_errors(): - barray = blosc2.ObjectArray() +def test_objectstore_slice_errors(): + barray = blosc2.ObjectStore() barray.extend([[0], [1], [2], [3]]) with pytest.raises(ValueError, match="extended slice"): @@ -331,13 +331,13 @@ def test_objectarray_slice_errors(): _ = barray[::0] -def test_objectarray_copy(): - urlpath = "test_objectarray_copy.b2frame" - copy_path = "test_objectarray_copy_out.b2frame" +def test_objectstore_copy(): + urlpath = "test_objectstore_copy.b2frame" + copy_path = "test_objectstore_copy_out.b2frame" blosc2.remove_urlpath(urlpath) blosc2.remove_urlpath(copy_path) - original = blosc2.ObjectArray(urlpath=urlpath, mode="w", contiguous=True) + original = 
blosc2.ObjectStore(urlpath=urlpath, mode="w", contiguous=True) original.extend(BATCHES) original.insert(1, ["copy", True, 123]) @@ -362,7 +362,7 @@ def test_objectarray_copy(): @pytest.mark.parametrize(("contiguous", "nthreads"), [(False, 2), (True, 4)]) -def test_objectarray_multithreaded_inner_vl(contiguous, nthreads): +def test_objectstore_multithreaded_inner_vl(contiguous, nthreads): batches = [] for batch_id in range(24): batch = [] @@ -379,7 +379,7 @@ def test_objectarray_multithreaded_inner_vl(contiguous, nthreads): ) batches.append(batch) - barray = blosc2.ObjectArray( + barray = blosc2.ObjectStore( storage=blosc2.Storage(contiguous=contiguous), cparams=blosc2.CParams(typesize=1, nthreads=nthreads, codec=blosc2.Codec.ZSTD, clevel=5), dparams=blosc2.DParams(nthreads=nthreads), @@ -390,8 +390,8 @@ def test_objectarray_multithreaded_inner_vl(contiguous, nthreads): assert [barray[i][:] for i in range(len(barray))] == batches -def test_objectarray_validation_errors(): - barray = blosc2.ObjectArray() +def test_objectstore_validation_errors(): + barray = blosc2.ObjectStore() with pytest.raises(TypeError): barray.append("value") @@ -402,7 +402,7 @@ def test_objectarray_validation_errors(): with pytest.raises(IndexError): barray.delete(3) with pytest.raises(IndexError): - blosc2.ObjectArray().pop() + blosc2.ObjectStore().pop() barray.extend([[1, 2, 3]]) with pytest.raises(ValueError): barray.append([2, 3]) @@ -410,29 +410,29 @@ def test_objectarray_validation_errors(): barray.pop(slice(0, 1)) -def test_objectarray_in_embed_store(): +def test_objectstore_in_embed_store(): estore = blosc2.EmbedStore() - barray = blosc2.ObjectArray() + barray = blosc2.ObjectStore() barray.extend(BATCHES) estore["/batch"] = barray restored = estore["/batch"] - assert isinstance(restored, blosc2.ObjectArray) + assert isinstance(restored, blosc2.ObjectStore) assert [batch[:] for batch in restored] == BATCHES -def test_objectarray_in_dict_store(): - path = "test_objectarray_store.b2z" +def 
test_objectstore_in_dict_store(): + path = "test_objectstore_store.b2z" blosc2.remove_urlpath(path) with blosc2.DictStore(path, mode="w", threshold=1) as dstore: - barray = blosc2.ObjectArray() + barray = blosc2.ObjectStore() barray.extend(BATCHES) dstore["/batch"] = barray with blosc2.DictStore(path, mode="r") as dstore: restored = dstore["/batch"] - assert isinstance(restored, blosc2.ObjectArray) + assert isinstance(restored, blosc2.ObjectStore) assert [batch[:] for batch in restored] == BATCHES blosc2.remove_urlpath(path) From 244c86d74bf3eef9244108eaac26717e02a6158f Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 20:06:45 +0100 Subject: [PATCH 15/34] chunksize -> batchsize --- src/blosc2/object_store.py | 52 +++++++++++++++++++++++--------------- tests/test_object_store.py | 17 ++++++++----- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/src/blosc2/object_store.py b/src/blosc2/object_store.py index f7ed8b47..9a5171e4 100644 --- a/src/blosc2/object_store.py +++ b/src/blosc2/object_store.py @@ -63,10 +63,10 @@ def __getitem__(self, index: int | slice) -> Any | list[Any]: return blocks[block_index][item_index] def __len__(self) -> int: - chunksize = self._parent.chunksize - if chunksize is None: + batchsize = self._parent.batchsize + if batchsize is None: return self._nblocks - return chunksize + return batchsize def __iter__(self) -> Iterator[Any]: for i in range(len(self)): @@ -174,17 +174,21 @@ def _make_storage(self) -> blosc2.Storage: def __init__( self, - chunksize: int | None = None, + batchsize: int | None = None, blocksize: int | None = None, _from_schunk: blosc2.SChunk | None = None, **kwargs: Any, ) -> None: - self._chunksize: int | None = chunksize + if "chunksize" in kwargs: + if batchsize is not None: + raise ValueError("Cannot pass both `batchsize` and `chunksize`") + batchsize = kwargs.pop("chunksize") + self._batchsize: int | None = batchsize self._blocksize: int | None = blocksize self._layout_format: str | 
None = None if _from_schunk is not None: - if chunksize is not None or blocksize is not None: - raise ValueError("Cannot pass `chunksize` or `blocksize` together with `_from_schunk`") + if batchsize is not None or blocksize is not None: + raise ValueError("Cannot pass `batchsize` or `blocksize` together with `_from_schunk`") if kwargs: unexpected = ", ".join(sorted(kwargs)) raise ValueError(f"Cannot pass {unexpected} together with `_from_schunk`") @@ -213,7 +217,7 @@ def __init__( storage.meta = fixed_meta schunk = blosc2.SChunk(chunksize=-1, data=None, cparams=cparams, dparams=dparams, storage=storage) self._attach_schunk(schunk) - if self._chunksize is not None or self._blocksize is not None: + if self._batchsize is not None or self._blocksize is not None: self._store_layout() def _validate_tag(self) -> None: @@ -226,7 +230,7 @@ def _load_layout(self) -> None: if _OBJECTARRAY_LAYOUT_KEY in self.vlmeta: layout = self.vlmeta[_OBJECTARRAY_LAYOUT_KEY] if isinstance(layout, dict): - self._chunksize = layout.get("chunksize") + self._batchsize = layout.get("batchsize", layout.get("chunksize")) self._blocksize = layout.get("blocksize") self._layout_format = layout.get("format", "batched_vlblocks") return @@ -235,11 +239,11 @@ def _load_layout(self) -> None: raise ValueError("ObjectStore layout metadata is missing") def _store_layout(self) -> None: - if self._chunksize is None or self.mode == "r": + if self._batchsize is None or self.mode == "r": return layout = { "version": 1, - "chunksize": self._chunksize, + "batchsize": self._batchsize, "blocksize": self._blocksize, "format": self._layout_format or "batched_vlblocks", "sizing_policy": "l2_cache_prefix", @@ -287,10 +291,10 @@ def _normalize_batch(self, value: object) -> list[Any]: return values def _ensure_layout_for_batch(self, batch: list[Any]) -> None: - if self._chunksize is None: - self._chunksize = len(batch) - if len(batch) != self._chunksize: - raise ValueError(f"ObjectStore entries must contain exactly 
{self._chunksize} objects") + if self._batchsize is None: + self._batchsize = len(batch) + if len(batch) != self._batchsize: + raise ValueError(f"ObjectStore entries must contain exactly {self._batchsize} objects") if self._blocksize is None: payload_sizes = [len(msgpack_packb(item)) for item in batch] self._blocksize = self._guess_blocksize(payload_sizes) @@ -348,8 +352,8 @@ def _get_batch(self, index: int) -> Batch: return Batch(self, index, self.schunk.get_lazychunk(index)) def _batch_lengths(self) -> list[int]: - if self.chunksize is not None: - return [self.chunksize for _ in range(len(self))] + if self.batchsize is not None: + return [self.batchsize for _ in range(len(self))] return [len(self[i]) for i in range(len(self))] def append(self, value: object) -> int: @@ -475,7 +479,11 @@ def dparams(self): @property def chunksize(self) -> int: - return self._chunksize + return self._batchsize + + @property + def batchsize(self) -> int: + return self._batchsize @property def blocksize(self) -> int: @@ -519,7 +527,7 @@ def info_items(self) -> list: return [ ("type", f"{self.__class__.__name__}"), ("nbatches", len(self)), - ("chunksize", self.chunksize), + ("batchsize", self.batchsize), ("blocksize", self.blocksize), ("nitems", nitems), ("batch_len_min", min(batch_lengths) if batch_lengths else 0), @@ -539,10 +547,14 @@ def copy(self, **kwargs: Any) -> ObjectStore: """Create a copy of the container with optional constructor overrides.""" if "meta" in kwargs: raise ValueError("meta should not be passed to copy") + if "chunksize" in kwargs: + if "batchsize" in kwargs: + raise ValueError("Cannot pass both `batchsize` and `chunksize` to copy") + kwargs["batchsize"] = kwargs.pop("chunksize") kwargs["cparams"] = kwargs.get("cparams", copy.deepcopy(self.cparams)) kwargs["dparams"] = kwargs.get("dparams", copy.deepcopy(self.dparams)) - kwargs["chunksize"] = kwargs.get("chunksize", self.chunksize) + kwargs["batchsize"] = kwargs.get("batchsize", self.batchsize) 
kwargs["blocksize"] = kwargs.get("blocksize", self.blocksize) if "storage" not in kwargs: diff --git a/tests/test_object_store.py b/tests/test_object_store.py index b8d1112d..1e67bb6a 100644 --- a/tests/test_object_store.py +++ b/tests/test_object_store.py @@ -46,9 +46,9 @@ def test_objectstore_roundtrip(contiguous, urlpath): assert barray.append(batch) == i assert len(barray) == len(BATCHES) - assert barray.chunksize == len(BATCHES[0]) + assert barray.batchsize == len(BATCHES[0]) assert barray.blocksize is not None - assert 1 <= barray.blocksize <= barray.chunksize + assert 1 <= barray.blocksize <= barray.batchsize assert [batch[:] for batch in barray] == BATCHES with pytest.raises(ValueError): barray.append([1, 2]) @@ -83,7 +83,7 @@ def test_objectstore_roundtrip(contiguous, urlpath): if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") assert isinstance(reopened, blosc2.ObjectStore) - assert reopened.chunksize == barray.chunksize + assert reopened.batchsize == barray.batchsize assert reopened.blocksize == barray.blocksize assert [batch[:] for batch in reopened] == expected with pytest.raises(ValueError): @@ -145,7 +145,7 @@ def test_objectstore_info(): items = dict(barray.info_items) assert items["type"] == "ObjectStore" assert items["nbatches"] == len(BATCHES) - assert items["chunksize"] == len(BATCHES[0]) + assert items["batchsize"] == len(BATCHES[0]) assert items["blocksize"] == barray.blocksize assert items["nitems"] == sum(len(batch) for batch in BATCHES) assert items["batch_len_min"] == 3 @@ -170,13 +170,18 @@ def test_objectstore_zstd_uses_dict_by_default(): assert barray.cparams.use_dict is True -def test_objectstore_explicit_chunksize_blocksize(): - barray = blosc2.ObjectStore(chunksize=3, blocksize=2) +def test_objectstore_explicit_batchsize_blocksize(): + barray = blosc2.ObjectStore(batchsize=3, blocksize=2) + assert barray.batchsize == 3 assert barray.chunksize == 3 assert barray.blocksize == 2 barray.append([1, 2, 3]) assert [batch[:] 
for batch in barray] == [[1, 2, 3]] + legacy = blosc2.ObjectStore(chunksize=3, blocksize=2) + assert legacy.batchsize == 3 + assert legacy.chunksize == 3 + def test_objectstore_respects_explicit_use_dict_and_non_zstd(): barray = blosc2.ObjectStore(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) From e999d1d20d2beaf5701311262f2ff51c9f9acfab Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 20:09:10 +0100 Subject: [PATCH 16/34] batchsize is always constant --- src/blosc2/object_store.py | 12 +----------- tests/test_object_store.py | 5 +---- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/src/blosc2/object_store.py b/src/blosc2/object_store.py index 9a5171e4..b6b54d14 100644 --- a/src/blosc2/object_store.py +++ b/src/blosc2/object_store.py @@ -351,11 +351,6 @@ def _decode_blocks(self, nchunk: int) -> list[list[Any]]: def _get_batch(self, index: int) -> Batch: return Batch(self, index, self.schunk.get_lazychunk(index)) - def _batch_lengths(self) -> list[int]: - if self.batchsize is not None: - return [self.batchsize for _ in range(len(self))] - return [len(self[i]) for i in range(len(self))] - def append(self, value: object) -> int: """Append one batch and return the new number of entries.""" self._check_writable() @@ -521,18 +516,13 @@ def info(self) -> InfoReporter: @property def info_items(self) -> list: """A list of tuples with summary information about this ObjectStore.""" - batch_lengths = self._batch_lengths() - nitems = sum(batch_lengths) - avg_batch_len = nitems / len(batch_lengths) if batch_lengths else 0.0 + nitems = len(self) * self.batchsize if self.batchsize is not None else 0 return [ ("type", f"{self.__class__.__name__}"), ("nbatches", len(self)), ("batchsize", self.batchsize), ("blocksize", self.blocksize), ("nitems", nitems), - ("batch_len_min", min(batch_lengths) if batch_lengths else 0), - ("batch_len_max", max(batch_lengths) if batch_lengths else 0), - ("batch_len_avg", f"{avg_batch_len:.2f}"), ("nbytes", 
format_nbytes_info(self.nbytes)), ("cbytes", format_nbytes_info(self.cbytes)), ("cratio", f"{self.cratio:.2f}"), diff --git a/tests/test_object_store.py b/tests/test_object_store.py index 1e67bb6a..63190614 100644 --- a/tests/test_object_store.py +++ b/tests/test_object_store.py @@ -148,9 +148,6 @@ def test_objectstore_info(): assert items["batchsize"] == len(BATCHES[0]) assert items["blocksize"] == barray.blocksize assert items["nitems"] == sum(len(batch) for batch in BATCHES) - assert items["batch_len_min"] == 3 - assert items["batch_len_max"] == 3 - assert items["batch_len_avg"] == "3.00" assert "urlpath" not in items assert "contiguous" not in items assert "typesize" not in items @@ -161,7 +158,7 @@ def test_objectstore_info(): text = repr(barray.info) assert "type" in text assert "ObjectStore" in text - assert "batch_len_avg" in text + assert "batchsize" in text def test_objectstore_zstd_uses_dict_by_default(): From afc84365dcdc5d7fc81c5b3858987a28ca253a03 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 20:16:13 +0100 Subject: [PATCH 17/34] ObjectStore -> BatchStore --- src/blosc2/__init__.py | 4 +- .../{object_store.py => batch_store.py} | 85 ++++++------- src/blosc2/core.py | 10 +- src/blosc2/dict_store.py | 18 +-- src/blosc2/embed_store.py | 10 +- src/blosc2/schunk.py | 8 +- src/blosc2/tree_store.py | 6 +- ...st_object_store.py => test_batch_store.py} | 113 +++++++++--------- 8 files changed, 118 insertions(+), 136 deletions(-) rename src/blosc2/{object_store.py => batch_store.py} (86%) rename tests/{test_object_store.py => test_batch_store.py} (79%) diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index e750d9d1..e32b2f48 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -535,7 +535,7 @@ def _raise(exc): from .embed_store import EmbedStore, estore_from_cframe from .dict_store import DictStore from .tree_store import TreeStore -from .object_store import Batch, ObjectStore +from .batch_store import Batch, 
BatchStore from .vlarray import VLArray, vlarray_from_cframe from .c2array import c2context, C2Array, URLPath @@ -721,7 +721,7 @@ def _raise(exc): "C2Array", "CParams", "Batch", - "ObjectStore", + "BatchStore", # Enums "Codec", "DParams", diff --git a/src/blosc2/object_store.py b/src/blosc2/batch_store.py similarity index 86% rename from src/blosc2/object_store.py rename to src/blosc2/batch_store.py index b6b54d14..cb7e31fb 100644 --- a/src/blosc2/object_store.py +++ b/src/blosc2/batch_store.py @@ -17,8 +17,8 @@ from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb from blosc2.info import InfoReporter, format_nbytes_info -_OBJECTARRAY_META = {"version": 2, "serializer": "msgpack", "format": "batched_vlblocks"} -_OBJECTARRAY_LAYOUT_KEY = "objectstore" +_BATCHSTORE_META = {"version": 2, "serializer": "msgpack", "format": "batched_vlblocks"} +_BATCHSTORE_LAYOUT_KEY = "batchstore" def _check_serialized_size(buffer: bytes) -> None: @@ -27,9 +27,9 @@ def _check_serialized_size(buffer: bytes) -> None: class Batch(Sequence[Any]): - """A lazy sequence of Python objects stored in one ObjectStore chunk.""" + """A lazy sequence of Python objects stored in one BatchStore chunk.""" - def __init__(self, parent: ObjectStore, nchunk: int, lazychunk: bytes) -> None: + def __init__(self, parent: BatchStore, nchunk: int, lazychunk: bytes) -> None: self._parent = parent self._nchunk = nchunk self._lazychunk = lazychunk @@ -58,7 +58,7 @@ def __getitem__(self, index: int | slice) -> Any | list[Any]: index = self._normalize_index(index) blocksize = self._parent.blocksize if blocksize is None: - raise RuntimeError("ObjectStore blocksize is not initialized") + raise RuntimeError("BatchStore blocksize is not initialized") block_index, item_index = divmod(index, blocksize) return blocks[block_index][item_index] @@ -92,7 +92,7 @@ def __repr__(self) -> str: return f"Batch(len={len(self)}, nbytes={self.nbytes}, cbytes={self.cbytes})" -class ObjectStore: +class BatchStore: """A batched 
variable-length array backed by an :class:`blosc2.SChunk`.""" @staticmethod @@ -109,14 +109,14 @@ def _set_typesize_one(cparams: blosc2.CParams | dict | None) -> blosc2.CParams | if isinstance(cparams, blosc2.CParams): cparams.typesize = 1 if auto_use_dict and cparams.codec == blosc2.Codec.ZSTD and cparams.clevel > 0: - # ObjectStore stores many small serialized payloads, where Zstd dicts help materially. + # BatchStore stores many small serialized payloads, where Zstd dicts help materially. cparams.use_dict = True else: cparams["typesize"] = 1 codec = cparams.get("codec", blosc2.Codec.ZSTD) clevel = cparams.get("clevel", 5) if auto_use_dict and codec == blosc2.Codec.ZSTD and clevel > 0: - # ObjectStore stores many small serialized payloads, where Zstd dicts help materially. + # BatchStore stores many small serialized payloads, where Zstd dicts help materially. cparams["use_dict"] = True return cparams @@ -142,9 +142,9 @@ def _coerce_storage(storage: blosc2.Storage | dict | None, kwargs: dict[str, Any @staticmethod def _validate_storage(storage: blosc2.Storage) -> None: if storage.mmap_mode not in (None, "r"): - raise ValueError("For ObjectStore containers, mmap_mode must be None or 'r'") + raise ValueError("For BatchStore containers, mmap_mode must be None or 'r'") if storage.mmap_mode == "r" and storage.mode != "r": - raise ValueError("For ObjectStore containers, mmap_mode='r' requires mode='r'") + raise ValueError("For BatchStore containers, mmap_mode='r' requires mode='r'") def _attach_schunk(self, schunk: blosc2.SChunk) -> None: self.schunk = schunk @@ -179,10 +179,6 @@ def __init__( _from_schunk: blosc2.SChunk | None = None, **kwargs: Any, ) -> None: - if "chunksize" in kwargs: - if batchsize is not None: - raise ValueError("Cannot pass both `batchsize` and `chunksize`") - batchsize = kwargs.pop("chunksize") self._batchsize: int | None = batchsize self._blocksize: int | None = blocksize self._layout_format: str | None = None @@ -201,7 +197,7 @@ def __init__( 
if kwargs: unexpected = ", ".join(sorted(kwargs)) - raise ValueError(f"Unsupported ObjectStore keyword argument(s): {unexpected}") + raise ValueError(f"Unsupported BatchStore keyword argument(s): {unexpected}") self._validate_storage(storage) cparams = self._set_typesize_one(cparams) @@ -213,7 +209,7 @@ def __init__( return fixed_meta = dict(storage.meta or {}) - fixed_meta["objectstore"] = dict(_OBJECTARRAY_META) + fixed_meta["batchstore"] = dict(_BATCHSTORE_META) storage.meta = fixed_meta schunk = blosc2.SChunk(chunksize=-1, data=None, cparams=cparams, dparams=dparams, storage=storage) self._attach_schunk(schunk) @@ -221,22 +217,22 @@ def __init__( self._store_layout() def _validate_tag(self) -> None: - if "objectstore" not in self.schunk.meta: - raise ValueError("The supplied SChunk is not tagged as an ObjectStore") + if "batchstore" not in self.schunk.meta: + raise ValueError("The supplied SChunk is not tagged as a BatchStore") def _load_layout(self) -> None: layout = None self._layout_format = None - if _OBJECTARRAY_LAYOUT_KEY in self.vlmeta: - layout = self.vlmeta[_OBJECTARRAY_LAYOUT_KEY] + if _BATCHSTORE_LAYOUT_KEY in self.vlmeta: + layout = self.vlmeta[_BATCHSTORE_LAYOUT_KEY] if isinstance(layout, dict): - self._batchsize = layout.get("batchsize", layout.get("chunksize")) + self._batchsize = layout["batchsize"] self._blocksize = layout.get("blocksize") self._layout_format = layout.get("format", "batched_vlblocks") return if len(self) == 0: return - raise ValueError("ObjectStore layout metadata is missing") + raise ValueError("BatchStore layout metadata is missing") def _store_layout(self) -> None: if self._batchsize is None or self.mode == "r": @@ -248,24 +244,24 @@ def _store_layout(self) -> None: "format": self._layout_format or "batched_vlblocks", "sizing_policy": "l2_cache_prefix", } - self.vlmeta[_OBJECTARRAY_LAYOUT_KEY] = layout + self.vlmeta[_BATCHSTORE_LAYOUT_KEY] = layout def _check_writable(self) -> None: if self.mode == "r": - raise 
ValueError("Cannot modify an ObjectStore opened in read-only mode") + raise ValueError("Cannot modify a BatchStore opened in read-only mode") def _normalize_index(self, index: int) -> int: if not isinstance(index, int): - raise TypeError("ObjectStore indices must be integers") + raise TypeError("BatchStore indices must be integers") if index < 0: index += len(self) if index < 0 or index >= len(self): - raise IndexError("ObjectStore index out of range") + raise IndexError("BatchStore index out of range") return index def _normalize_insert_index(self, index: int) -> int: if not isinstance(index, int): - raise TypeError("ObjectStore indices must be integers") + raise TypeError("BatchStore indices must be integers") if index < 0: index += len(self) if index < 0: @@ -282,19 +278,19 @@ def _copy_meta(self) -> dict[str, Any]: def _normalize_batch(self, value: object) -> list[Any]: if isinstance(value, (str, bytes, bytearray, memoryview)): - raise TypeError("ObjectStore entries must be sequences of Python objects") + raise TypeError("BatchStore entries must be sequences of Python objects") if not isinstance(value, Sequence): - raise TypeError("ObjectStore entries must be sequences of Python objects") + raise TypeError("BatchStore entries must be sequences of Python objects") values = list(value) if len(values) == 0: - raise ValueError("ObjectStore entries cannot be empty") + raise ValueError("BatchStore entries cannot be empty") return values def _ensure_layout_for_batch(self, batch: list[Any]) -> None: if self._batchsize is None: self._batchsize = len(batch) if len(batch) != self._batchsize: - raise ValueError(f"ObjectStore entries must contain exactly {self._batchsize} objects") + raise ValueError(f"BatchStore entries must contain exactly {self._batchsize} objects") if self._blocksize is None: payload_sizes = [len(msgpack_packb(item)) for item in batch] self._blocksize = self._guess_blocksize(payload_sizes) @@ -302,7 +298,7 @@ def _ensure_layout_for_batch(self, batch: 
list[Any]) -> None: def _guess_blocksize(self, payload_sizes: list[int]) -> int: if not payload_sizes: - raise ValueError("ObjectStore entries cannot be empty") + raise ValueError("BatchStore entries cannot be empty") l2_cache_size = blosc2.cpu_info.get("l2_cache_size") if not isinstance(l2_cache_size, int) or l2_cache_size <= 0: return len(payload_sizes) @@ -335,7 +331,7 @@ def _vl_dparams_kwargs(self) -> dict[str, Any]: def _compress_batch(self, batch: list[Any]) -> bytes: if self._blocksize is None: - raise RuntimeError("ObjectStore blocksize is not initialized") + raise RuntimeError("BatchStore blocksize is not initialized") blocks = [ self._serialize_block(batch[i : i + self._blocksize]) for i in range(0, len(batch), self._blocksize) @@ -380,7 +376,7 @@ def pop(self, index: int = -1) -> list[Any]: """Remove and return the batch at ``index``.""" self._check_writable() if isinstance(index, slice): - raise NotImplementedError("Slicing is not supported for ObjectStore") + raise NotImplementedError("Slicing is not supported for BatchStore") index = self._normalize_index(index) value = self[index][:] self.schunk.delete_chunk(index) @@ -472,10 +468,6 @@ def cparams(self): def dparams(self): return self.schunk.dparams - @property - def chunksize(self) -> int: - return self._batchsize - @property def batchsize(self) -> int: return self._batchsize @@ -510,12 +502,12 @@ def contiguous(self) -> bool: @property def info(self) -> InfoReporter: - """Print information about this ObjectStore.""" + """Print information about this BatchStore.""" return InfoReporter(self) @property def info_items(self) -> list: - """A list of tuples with summary information about this ObjectStore.""" + """A list of tuples with summary information about this BatchStore.""" nitems = len(self) * self.batchsize if self.batchsize is not None else 0 return [ ("type", f"{self.__class__.__name__}"), @@ -533,15 +525,10 @@ def info_items(self) -> list: def to_cframe(self) -> bytes: return 
self.schunk.to_cframe() - def copy(self, **kwargs: Any) -> ObjectStore: + def copy(self, **kwargs: Any) -> BatchStore: """Create a copy of the container with optional constructor overrides.""" if "meta" in kwargs: raise ValueError("meta should not be passed to copy") - if "chunksize" in kwargs: - if "batchsize" in kwargs: - raise ValueError("Cannot pass both `batchsize` and `chunksize` to copy") - kwargs["batchsize"] = kwargs.pop("chunksize") - kwargs["cparams"] = kwargs.get("cparams", copy.deepcopy(self.cparams)) kwargs["dparams"] = kwargs.get("dparams", copy.deepcopy(self.dparams)) kwargs["batchsize"] = kwargs.get("batchsize", self.batchsize) @@ -553,18 +540,18 @@ def copy(self, **kwargs: Any) -> ObjectStore: if "urlpath" in kwargs and "mode" not in kwargs: kwargs["mode"] = "w" - out = ObjectStore(**kwargs) + out = BatchStore(**kwargs) if "storage" not in kwargs and len(self.vlmeta) > 0: for key, value in self.vlmeta.getall().items(): out.vlmeta[key] = value out.extend(self) return out - def __enter__(self) -> ObjectStore: + def __enter__(self) -> BatchStore: return self def __exit__(self, exc_type, exc_val, exc_tb) -> bool: return False def __repr__(self) -> str: - return f"ObjectStore(len={len(self)}, urlpath={self.urlpath!r})" + return f"BatchStore(len={len(self)}, urlpath={self.urlpath!r})" diff --git a/src/blosc2/core.py b/src/blosc2/core.py index c37fb1b1..d574a21e 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1918,9 +1918,9 @@ def ndarray_from_cframe(cframe: bytes | str, copy: bool = False) -> blosc2.NDArr def from_cframe( cframe: bytes | str, copy: bool = True -) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk | blosc2.ObjectStore | blosc2.VLArray: +) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk | blosc2.BatchStore | blosc2.VLArray: """Create a :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk `, - :ref:`ObjectStore ` or :ref:`VLArray ` instance + :ref:`BatchStore ` or :ref:`VLArray ` instance from a contiguous frame buffer. 
Parameters @@ -1938,7 +1938,7 @@ def from_cframe( Returns ------- out: :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk `, - :ref:`ObjectStore ` or :ref:`VLArray ` + :ref:`BatchStore ` or :ref:`VLArray ` A new instance of the appropriate type containing the data passed. See Also @@ -1952,8 +1952,8 @@ def from_cframe( # Check the metalayer to determine the type if "b2embed" in schunk.meta: return blosc2.estore_from_cframe(cframe, copy=copy) - if "objectstore" in schunk.meta: - return blosc2.ObjectStore(_from_schunk=schunk_from_cframe(cframe, copy=copy)) + if "batchstore" in schunk.meta: + return blosc2.BatchStore(_from_schunk=schunk_from_cframe(cframe, copy=copy)) if "vlarray" in schunk.meta: return blosc2.vlarray_from_cframe(cframe, copy=copy) if "b2nd" in schunk.meta: diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 018c9fe8..0cb5ef63 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -249,25 +249,25 @@ def estore(self) -> EmbedStore: return self._estore @staticmethod - def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore) -> int: - if isinstance(value, (blosc2.VLArray, blosc2.ObjectStore)): + def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore) -> int: + if isinstance(value, (blosc2.VLArray, blosc2.BatchStore)): return value.schunk.nbytes return value.nbytes @staticmethod - def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore) -> bool: - return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray, blosc2.ObjectStore)) and bool( + def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore) -> bool: + return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray, blosc2.BatchStore)) and bool( getattr(value, "urlpath", None) ) @staticmethod - def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore) -> str: + def _external_ext(value: blosc2.Array | SChunk | 
blosc2.VLArray | blosc2.BatchStore) -> str: if isinstance(value, blosc2.NDArray): return ".b2nd" return ".b2f" def __setitem__( - self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore ) -> None: """Add a node to the DictStore.""" if isinstance(value, np.ndarray): @@ -294,7 +294,7 @@ def __setitem__( if hasattr(value, "save"): value.save(urlpath=dest_path) else: - # SChunk, VLArray and ObjectStore can all be persisted via their cframe. + # SChunk, VLArray and BatchStore can all be persisted via their cframe. with open(dest_path, "wb") as f: f.write(value.to_cframe()) else: @@ -314,7 +314,7 @@ def __setitem__( def __getitem__( self, key: str - ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore | C2Array: + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore | C2Array: """Retrieve a node from the DictStore.""" # Check map_tree first if key in self.map_tree: @@ -346,7 +346,7 @@ def __getitem__( def get( self, key: str, default: Any = None - ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore | C2Array | Any: + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore | C2Array | Any: """Retrieve a node, or default if not found.""" try: return self[key] diff --git a/src/blosc2/embed_store.py b/src/blosc2/embed_store.py index 7a062b52..2497cad2 100644 --- a/src/blosc2/embed_store.py +++ b/src/blosc2/embed_store.py @@ -174,7 +174,7 @@ def _ensure_capacity(self, needed_bytes: int) -> None: self._store.resize((new_size,)) def __setitem__( - self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore ) -> None: """Add a node to the embed store.""" if self.mode == "r": @@ -198,7 +198,7 @@ def __setitem__( self._embed_map[key] = {"offset": offset, "length": data_len} self._save_metadata() - def 
__getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore: + def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore: """Retrieve a node from the embed store.""" if key not in self._embed_map: raise KeyError(f"Key '{key}' not found in the embed store.") @@ -216,7 +216,7 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | bl def get( self, key: str, default: Any = None - ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore | Any: + ) -> blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore | Any: """Retrieve a node, or default if not found.""" return self[key] if key in self._embed_map else default @@ -243,12 +243,12 @@ def keys(self) -> KeysView[str]: """Return all keys.""" return self._embed_map.keys() - def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore]: + def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore]: """Iterate over all values.""" for key in self._embed_map: yield self[key] - def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.ObjectStore]]: + def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore]]: """Iterate over (key, value) pairs.""" for key in self._embed_map: yield key, self[key] diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index aca106d1..b39524c6 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -1621,10 +1621,10 @@ def _process_opened_object(res): return VLArray(_from_schunk=getattr(res, "schunk", res)) - if "objectstore" in meta: - from blosc2.object_store import ObjectStore + if "batchstore" in meta: + from blosc2.batch_store import BatchStore - return ObjectStore(_from_schunk=getattr(res, "schunk", res)) + return BatchStore(_from_schunk=getattr(res, "schunk", res)) if isinstance(res, blosc2.NDArray) and "LazyArray" in res.schunk.meta: 
return blosc2._open_lazyarray(res) @@ -1637,7 +1637,7 @@ def open( ) -> ( blosc2.SChunk | blosc2.NDArray - | blosc2.ObjectStore + | blosc2.BatchStore | blosc2.VLArray | blosc2.C2Array | blosc2.LazyArray diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index 9287b4fc..7f4fe6ba 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -227,7 +227,7 @@ def _validate_key(self, key: str) -> str: return key def __setitem__( - self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.ObjectStore + self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore ) -> None: """Add a node with hierarchical key validation. @@ -272,7 +272,7 @@ def __setitem__( def __getitem__( self, key: str - ) -> NDArray | C2Array | SChunk | blosc2.VLArray | blosc2.ObjectStore | TreeStore: + ) -> NDArray | C2Array | SChunk | blosc2.VLArray | blosc2.BatchStore | TreeStore: """Retrieve a node or subtree view. If the key points to a subtree (intermediate path with children), @@ -286,7 +286,7 @@ def __getitem__( Returns ------- - out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or blosc2.ObjectStore or TreeStore + out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or blosc2.BatchStore or TreeStore The stored array/chunk if key is a leaf node, or a TreeStore subtree view if key is an intermediate path with children. 
diff --git a/tests/test_object_store.py b/tests/test_batch_store.py similarity index 79% rename from tests/test_object_store.py rename to tests/test_batch_store.py index 63190614..45ff9195 100644 --- a/tests/test_object_store.py +++ b/tests/test_batch_store.py @@ -32,15 +32,15 @@ def _storage(contiguous, urlpath, mode="w"): [ (False, None), (True, None), - (True, "test_objectstore.b2frame"), - (False, "test_objectstore_s.b2frame"), + (True, "test_batchstore.b2frame"), + (False, "test_batchstore_s.b2frame"), ], ) -def test_objectstore_roundtrip(contiguous, urlpath): +def test_batchstore_roundtrip(contiguous, urlpath): blosc2.remove_urlpath(urlpath) - barray = blosc2.ObjectStore(storage=_storage(contiguous, urlpath)) - assert barray.meta["objectstore"]["serializer"] == "msgpack" + barray = blosc2.BatchStore(storage=_storage(contiguous, urlpath)) + assert barray.meta["batchstore"]["serializer"] == "msgpack" for i, batch in enumerate(BATCHES, start=1): assert barray.append(batch) == i @@ -82,7 +82,7 @@ def test_objectstore_roundtrip(contiguous, urlpath): if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") - assert isinstance(reopened, blosc2.ObjectStore) + assert isinstance(reopened, blosc2.BatchStore) assert reopened.batchsize == barray.batchsize assert reopened.blocksize == barray.blocksize assert [batch[:] for batch in reopened] == expected @@ -110,14 +110,14 @@ def test_objectstore_roundtrip(contiguous, urlpath): if contiguous: reopened_mmap = blosc2.open(urlpath, mode="r", mmap_mode="r") - assert isinstance(reopened_mmap, blosc2.ObjectStore) + assert isinstance(reopened_mmap, blosc2.BatchStore) assert [batch[:] for batch in reopened_mmap] == expected blosc2.remove_urlpath(urlpath) -def test_objectstore_from_cframe(): - barray = blosc2.ObjectStore() +def test_batchstore_from_cframe(): + barray = blosc2.BatchStore() barray.extend(BATCHES) barray.insert(1, ["inserted", True, None]) del barray[3] @@ -126,16 +126,16 @@ def test_objectstore_from_cframe(): 
del expected[3] restored = blosc2.from_cframe(barray.to_cframe()) - assert isinstance(restored, blosc2.ObjectStore) + assert isinstance(restored, blosc2.BatchStore) assert [batch[:] for batch in restored] == expected restored2 = blosc2.from_cframe(barray.to_cframe()) - assert isinstance(restored2, blosc2.ObjectStore) + assert isinstance(restored2, blosc2.BatchStore) assert [batch[:] for batch in restored2] == expected -def test_objectstore_info(): - barray = blosc2.ObjectStore() +def test_batchstore_info(): + barray = blosc2.BatchStore() barray.extend(BATCHES) assert barray.typesize == 1 @@ -143,7 +143,7 @@ def test_objectstore_info(): assert barray.urlpath == barray.schunk.urlpath items = dict(barray.info_items) - assert items["type"] == "ObjectStore" + assert items["type"] == "BatchStore" assert items["nbatches"] == len(BATCHES) assert items["batchsize"] == len(BATCHES[0]) assert items["blocksize"] == barray.blocksize @@ -157,42 +157,37 @@ def test_objectstore_info(): text = repr(barray.info) assert "type" in text - assert "ObjectStore" in text + assert "BatchStore" in text assert "batchsize" in text -def test_objectstore_zstd_uses_dict_by_default(): - barray = blosc2.ObjectStore() +def test_batchstore_zstd_uses_dict_by_default(): + barray = blosc2.BatchStore() assert barray.cparams.codec == blosc2.Codec.ZSTD assert barray.cparams.use_dict is True -def test_objectstore_explicit_batchsize_blocksize(): - barray = blosc2.ObjectStore(batchsize=3, blocksize=2) +def test_batchstore_explicit_batchsize_blocksize(): + barray = blosc2.BatchStore(batchsize=3, blocksize=2) assert barray.batchsize == 3 - assert barray.chunksize == 3 assert barray.blocksize == 2 barray.append([1, 2, 3]) assert [batch[:] for batch in barray] == [[1, 2, 3]] - legacy = blosc2.ObjectStore(chunksize=3, blocksize=2) - assert legacy.batchsize == 3 - assert legacy.chunksize == 3 - -def test_objectstore_respects_explicit_use_dict_and_non_zstd(): - barray = blosc2.ObjectStore(cparams={"codec": 
blosc2.Codec.LZ4, "clevel": 5}) +def test_batchstore_respects_explicit_use_dict_and_non_zstd(): + barray = blosc2.BatchStore(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) assert barray.cparams.codec == blosc2.Codec.LZ4 assert barray.cparams.use_dict is False - barray = blosc2.ObjectStore(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) + barray = blosc2.BatchStore(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) assert barray.cparams.codec == blosc2.Codec.ZSTD assert barray.cparams.use_dict is False - barray = blosc2.ObjectStore(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 5, "use_dict": False}) + barray = blosc2.BatchStore(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 5, "use_dict": False}) assert barray.cparams.use_dict is False - barray = blosc2.ObjectStore(cparams=blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=5, use_dict=False)) + barray = blosc2.BatchStore(cparams=blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=5, use_dict=False)) assert barray.cparams.use_dict is False @@ -233,14 +228,14 @@ def test_vlcompress_small_blocks_roundtrip(): assert out == payloads -def test_objectstore_constructor_kwargs(): - urlpath = "test_objectstore_kwargs.b2frame" +def test_batchstore_constructor_kwargs(): + urlpath = "test_batchstore_kwargs.b2frame" blosc2.remove_urlpath(urlpath) - barray = blosc2.ObjectStore(urlpath=urlpath, mode="w", contiguous=True) + barray = blosc2.BatchStore(urlpath=urlpath, mode="w", contiguous=True) barray.extend(BATCHES) - reopened = blosc2.ObjectStore(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") + reopened = blosc2.BatchStore(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") assert [batch[:] for batch in reopened] == BATCHES blosc2.remove_urlpath(urlpath) @@ -251,14 +246,14 @@ def test_objectstore_constructor_kwargs(): [ (False, None), (True, None), - (True, "test_objectstore_list_ops.b2frame"), - (False, "test_objectstore_list_ops_s.b2frame"), + (True, "test_batchstore_list_ops.b2frame"), + (False, 
"test_batchstore_list_ops_s.b2frame"), ], ) -def test_objectstore_list_like_ops(contiguous, urlpath): +def test_batchstore_list_like_ops(contiguous, urlpath): blosc2.remove_urlpath(urlpath) - barray = blosc2.ObjectStore(storage=_storage(contiguous, urlpath)) + barray = blosc2.BatchStore(storage=_storage(contiguous, urlpath)) barray.extend([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) assert [batch[:] for batch in barray] == [[1, 2, 3], [4, 5, 6], [7, 8, 9]] assert barray.pop() == [7, 8, 9] @@ -284,15 +279,15 @@ def test_objectstore_list_like_ops(contiguous, urlpath): [ (False, None), (True, None), - (True, "test_objectstore_slices.b2frame"), - (False, "test_objectstore_slices_s.b2frame"), + (True, "test_batchstore_slices.b2frame"), + (False, "test_batchstore_slices_s.b2frame"), ], ) -def test_objectstore_slices(contiguous, urlpath): +def test_batchstore_slices(contiguous, urlpath): blosc2.remove_urlpath(urlpath) expected = [[i, i + 100, i + 200] for i in range(8)] - barray = blosc2.ObjectStore(storage=_storage(contiguous, urlpath)) + barray = blosc2.BatchStore(storage=_storage(contiguous, urlpath)) barray.extend(expected) assert [batch[:] for batch in barray[1:6:2]] == expected[1:6:2] @@ -321,8 +316,8 @@ def test_objectstore_slices(contiguous, urlpath): blosc2.remove_urlpath(urlpath) -def test_objectstore_slice_errors(): - barray = blosc2.ObjectStore() +def test_batchstore_slice_errors(): + barray = blosc2.BatchStore() barray.extend([[0], [1], [2], [3]]) with pytest.raises(ValueError, match="extended slice"): @@ -333,13 +328,13 @@ def test_objectstore_slice_errors(): _ = barray[::0] -def test_objectstore_copy(): - urlpath = "test_objectstore_copy.b2frame" - copy_path = "test_objectstore_copy_out.b2frame" +def test_batchstore_copy(): + urlpath = "test_batchstore_copy.b2frame" + copy_path = "test_batchstore_copy_out.b2frame" blosc2.remove_urlpath(urlpath) blosc2.remove_urlpath(copy_path) - original = blosc2.ObjectStore(urlpath=urlpath, mode="w", contiguous=True) + original = 
blosc2.BatchStore(urlpath=urlpath, mode="w", contiguous=True) original.extend(BATCHES) original.insert(1, ["copy", True, 123]) @@ -364,7 +359,7 @@ def test_objectstore_copy(): @pytest.mark.parametrize(("contiguous", "nthreads"), [(False, 2), (True, 4)]) -def test_objectstore_multithreaded_inner_vl(contiguous, nthreads): +def test_batchstore_multithreaded_inner_vl(contiguous, nthreads): batches = [] for batch_id in range(24): batch = [] @@ -381,7 +376,7 @@ def test_objectstore_multithreaded_inner_vl(contiguous, nthreads): ) batches.append(batch) - barray = blosc2.ObjectStore( + barray = blosc2.BatchStore( storage=blosc2.Storage(contiguous=contiguous), cparams=blosc2.CParams(typesize=1, nthreads=nthreads, codec=blosc2.Codec.ZSTD, clevel=5), dparams=blosc2.DParams(nthreads=nthreads), @@ -392,8 +387,8 @@ def test_objectstore_multithreaded_inner_vl(contiguous, nthreads): assert [barray[i][:] for i in range(len(barray))] == batches -def test_objectstore_validation_errors(): - barray = blosc2.ObjectStore() +def test_batchstore_validation_errors(): + barray = blosc2.BatchStore() with pytest.raises(TypeError): barray.append("value") @@ -404,7 +399,7 @@ def test_objectstore_validation_errors(): with pytest.raises(IndexError): barray.delete(3) with pytest.raises(IndexError): - blosc2.ObjectStore().pop() + blosc2.BatchStore().pop() barray.extend([[1, 2, 3]]) with pytest.raises(ValueError): barray.append([2, 3]) @@ -412,29 +407,29 @@ def test_objectstore_validation_errors(): barray.pop(slice(0, 1)) -def test_objectstore_in_embed_store(): +def test_batchstore_in_embed_store(): estore = blosc2.EmbedStore() - barray = blosc2.ObjectStore() + barray = blosc2.BatchStore() barray.extend(BATCHES) estore["/batch"] = barray restored = estore["/batch"] - assert isinstance(restored, blosc2.ObjectStore) + assert isinstance(restored, blosc2.BatchStore) assert [batch[:] for batch in restored] == BATCHES -def test_objectstore_in_dict_store(): - path = "test_objectstore_store.b2z" +def 
test_batchstore_in_dict_store(): + path = "test_batchstore_store.b2z" blosc2.remove_urlpath(path) with blosc2.DictStore(path, mode="w", threshold=1) as dstore: - barray = blosc2.ObjectStore() + barray = blosc2.BatchStore() barray.extend(BATCHES) dstore["/batch"] = barray with blosc2.DictStore(path, mode="r") as dstore: restored = dstore["/batch"] - assert isinstance(restored, blosc2.ObjectStore) + assert isinstance(restored, blosc2.BatchStore) assert [batch[:] for batch in restored] == BATCHES blosc2.remove_urlpath(path) From be41f852fb498fe10468127c09464b43fcfef70f Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 18 Mar 2026 20:21:24 +0100 Subject: [PATCH 18/34] More consistent naming --- src/blosc2/batch_store.py | 44 +++++++++++++++++++-------------------- tests/test_batch_store.py | 6 +++--- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index cb7e31fb..b9b222c9 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -27,14 +27,14 @@ def _check_serialized_size(buffer: bytes) -> None: class Batch(Sequence[Any]): - """A lazy sequence of Python objects stored in one BatchStore chunk.""" + """A lazy sequence of Python objects stored in one BatchStore batch.""" - def __init__(self, parent: BatchStore, nchunk: int, lazychunk: bytes) -> None: + def __init__(self, parent: BatchStore, nbatch: int, lazybatch: bytes) -> None: self._parent = parent - self._nchunk = nchunk - self._lazychunk = lazychunk + self._nbatch = nbatch + self._lazybatch = lazybatch self._blocks: list[list[Any]] | None = None - self._nbytes, self._cbytes, self._nblocks = blosc2.get_cbuffer_sizes(lazychunk) + self._nbytes, self._cbytes, self._nblocks = blosc2.get_cbuffer_sizes(lazybatch) def _normalize_index(self, index: int) -> int: if not isinstance(index, int): @@ -47,7 +47,7 @@ def _normalize_index(self, index: int) -> int: def _decode_blocks(self) -> list[list[Any]]: if self._blocks is None: - 
self._blocks = self._parent._decode_blocks(self._nchunk) + self._blocks = self._parent._decode_blocks(self._nbatch) return self._blocks def __getitem__(self, index: int | slice) -> Any | list[Any]: @@ -73,8 +73,8 @@ def __iter__(self) -> Iterator[Any]: yield self[i] @property - def lazychunk(self) -> bytes: - return self._lazychunk + def lazybatch(self) -> bytes: + return self._lazybatch @property def nbytes(self) -> int: @@ -338,9 +338,9 @@ def _compress_batch(self, batch: list[Any]) -> bytes: ] return blosc2.blosc2_ext.vlcompress(blocks, **self._vl_cparams_kwargs()) - def _decode_blocks(self, nchunk: int) -> list[list[Any]]: + def _decode_blocks(self, nbatch: int) -> list[list[Any]]: block_payloads = blosc2.blosc2_ext.vldecompress( - self.schunk.get_chunk(nchunk), **self._vl_dparams_kwargs() + self.schunk.get_chunk(nbatch), **self._vl_dparams_kwargs() ) return [msgpack_unpackb(payload) for payload in block_payloads] @@ -351,16 +351,16 @@ def append(self, value: object) -> int: """Append one batch and return the new number of entries.""" self._check_writable() batch = self._serialize_batch(value) - chunk = self._compress_batch(batch) - return self.schunk.append_chunk(chunk) + batch_payload = self._compress_batch(batch) + return self.schunk.append_chunk(batch_payload) def insert(self, index: int, value: object) -> int: """Insert one batch at ``index`` and return the new number of entries.""" self._check_writable() index = self._normalize_insert_index(index) batch = self._serialize_batch(value) - chunk = self._compress_batch(batch) - return self.schunk.insert_chunk(index, chunk) + batch_payload = self._compress_batch(batch) + return self.schunk.insert_chunk(index, batch_payload) def delete(self, index: int | slice) -> int: """Delete the batch at ``index`` and return the new number of entries.""" @@ -387,8 +387,8 @@ def extend(self, values: object) -> None: self._check_writable() for value in values: batch = self._serialize_batch(value) - chunk = 
self._compress_batch(batch) - self.schunk.append_chunk(chunk) + batch_payload = self._compress_batch(batch) + self.schunk.append_chunk(batch_payload) def clear(self) -> None: """Remove all entries from the container.""" @@ -424,8 +424,8 @@ def __setitem__(self, index: int | slice, value: object) -> None: self.schunk.delete_chunk(idx) for offset, item in enumerate(values): batch = self._serialize_batch(item) - chunk = self._compress_batch(batch) - self.schunk.insert_chunk(start + offset, chunk) + batch_payload = self._compress_batch(batch) + self.schunk.insert_chunk(start + offset, batch_payload) return if len(values) != len(indices): raise ValueError( @@ -433,14 +433,14 @@ def __setitem__(self, index: int | slice, value: object) -> None: ) for idx, item in zip(indices, values, strict=True): batch = self._serialize_batch(item) - chunk = self._compress_batch(batch) - self.schunk.update_chunk(idx, chunk) + batch_payload = self._compress_batch(batch) + self.schunk.update_chunk(idx, batch_payload) return self._check_writable() index = self._normalize_index(index) batch = self._serialize_batch(value) - chunk = self._compress_batch(batch) - self.schunk.update_chunk(index, chunk) + batch_payload = self._compress_batch(batch) + self.schunk.update_chunk(index, batch_payload) def __delitem__(self, index: int | slice) -> None: self.delete(index) diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index 45ff9195..1399626f 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -58,7 +58,7 @@ def test_batchstore_roundtrip(contiguous, urlpath): assert len(batch0) == len(BATCHES[0]) assert batch0[1] == BATCHES[0][1] assert batch0[:] == BATCHES[0] - assert isinstance(batch0.lazychunk, bytes) + assert isinstance(batch0.lazybatch, bytes) assert batch0.nbytes > 0 assert batch0.cbytes > 0 assert batch0.cratio > 0 @@ -216,14 +216,14 @@ def test_vlcompress_small_blocks_roundtrip(): ] payloads = [msgpack_packb(value) for value in values] - chunk = 
blosc2.blosc2_ext.vlcompress( + batch_payload = blosc2.blosc2_ext.vlcompress( payloads, codec=blosc2.Codec.ZSTD, clevel=5, typesize=1, nthreads=1, ) - out = blosc2.blosc2_ext.vldecompress(chunk, nthreads=1) + out = blosc2.blosc2_ext.vldecompress(batch_payload, nthreads=1) assert out == payloads From c6540e6cfcb1d448bddecf2c6217318237ff9969 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 19 Mar 2026 06:37:05 +0100 Subject: [PATCH 19/34] batchsize is not immutable anymore (and neither blocksize) --- src/blosc2/batch_store.py | 114 ++++++++++++-------------------------- tests/test_batch_store.py | 34 ++++++------ 2 files changed, 52 insertions(+), 96 deletions(-) diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index b9b222c9..c30ef325 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -9,6 +9,7 @@ import copy import pathlib +import statistics from collections.abc import Iterator, Sequence from dataclasses import asdict from typing import Any @@ -17,8 +18,7 @@ from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb from blosc2.info import InfoReporter, format_nbytes_info -_BATCHSTORE_META = {"version": 2, "serializer": "msgpack", "format": "batched_vlblocks"} -_BATCHSTORE_LAYOUT_KEY = "batchstore" +_BATCHSTORE_META = {"version": 1, "serializer": "msgpack"} def _check_serialized_size(buffer: bytes) -> None: @@ -33,7 +33,7 @@ def __init__(self, parent: BatchStore, nbatch: int, lazybatch: bytes) -> None: self._parent = parent self._nbatch = nbatch self._lazybatch = lazybatch - self._blocks: list[list[Any]] | None = None + self._items: list[Any] | None = None self._nbytes, self._cbytes, self._nblocks = blosc2.get_cbuffer_sizes(lazybatch) def _normalize_index(self, index: int) -> int: @@ -45,28 +45,21 @@ def _normalize_index(self, index: int) -> int: raise IndexError("Batch index out of range") return index - def _decode_blocks(self) -> list[list[Any]]: - if self._blocks is None: - self._blocks = 
self._parent._decode_blocks(self._nbatch) - return self._blocks + def _decode_items(self) -> list[Any]: + if self._items is None: + blocks = self._parent._decode_blocks(self._nbatch) + self._items = [item for block in blocks for item in block] + return self._items def __getitem__(self, index: int | slice) -> Any | list[Any]: - blocks = self._decode_blocks() + items = self._decode_items() if isinstance(index, slice): - flat_items = [item for block in blocks for item in block] - return flat_items[index] + return items[index] index = self._normalize_index(index) - blocksize = self._parent.blocksize - if blocksize is None: - raise RuntimeError("BatchStore blocksize is not initialized") - block_index, item_index = divmod(index, blocksize) - return blocks[block_index][item_index] + return items[index] def __len__(self) -> int: - batchsize = self._parent.batchsize - if batchsize is None: - return self._nblocks - return batchsize + return len(self._decode_items()) def __iter__(self) -> Iterator[Any]: for i in range(len(self)): @@ -151,7 +144,6 @@ def _attach_schunk(self, schunk: blosc2.SChunk) -> None: self.mode = schunk.mode self.mmap_mode = getattr(schunk, "mmap_mode", None) self._validate_tag() - self._load_layout() def _maybe_open_existing(self, storage: blosc2.Storage) -> bool: urlpath = storage.urlpath @@ -174,17 +166,14 @@ def _make_storage(self) -> blosc2.Storage: def __init__( self, - batchsize: int | None = None, - blocksize: int | None = None, + blocksize_max: int | None = None, _from_schunk: blosc2.SChunk | None = None, **kwargs: Any, ) -> None: - self._batchsize: int | None = batchsize - self._blocksize: int | None = blocksize - self._layout_format: str | None = None + if blocksize_max is not None and blocksize_max <= 0: + raise ValueError("blocksize_max must be a positive integer") + self._blocksize_max: int | None = blocksize_max if _from_schunk is not None: - if batchsize is not None or blocksize is not None: - raise ValueError("Cannot pass `batchsize` or 
`blocksize` together with `_from_schunk`") if kwargs: unexpected = ", ".join(sorted(kwargs)) raise ValueError(f"Cannot pass {unexpected} together with `_from_schunk`") @@ -213,39 +202,11 @@ def __init__( storage.meta = fixed_meta schunk = blosc2.SChunk(chunksize=-1, data=None, cparams=cparams, dparams=dparams, storage=storage) self._attach_schunk(schunk) - if self._batchsize is not None or self._blocksize is not None: - self._store_layout() def _validate_tag(self) -> None: if "batchstore" not in self.schunk.meta: raise ValueError("The supplied SChunk is not tagged as a BatchStore") - def _load_layout(self) -> None: - layout = None - self._layout_format = None - if _BATCHSTORE_LAYOUT_KEY in self.vlmeta: - layout = self.vlmeta[_BATCHSTORE_LAYOUT_KEY] - if isinstance(layout, dict): - self._batchsize = layout["batchsize"] - self._blocksize = layout.get("blocksize") - self._layout_format = layout.get("format", "batched_vlblocks") - return - if len(self) == 0: - return - raise ValueError("BatchStore layout metadata is missing") - - def _store_layout(self) -> None: - if self._batchsize is None or self.mode == "r": - return - layout = { - "version": 1, - "batchsize": self._batchsize, - "blocksize": self._blocksize, - "format": self._layout_format or "batched_vlblocks", - "sizing_policy": "l2_cache_prefix", - } - self.vlmeta[_BATCHSTORE_LAYOUT_KEY] = layout - def _check_writable(self) -> None: if self.mode == "r": raise ValueError("Cannot modify a BatchStore opened in read-only mode") @@ -287,14 +248,9 @@ def _normalize_batch(self, value: object) -> list[Any]: return values def _ensure_layout_for_batch(self, batch: list[Any]) -> None: - if self._batchsize is None: - self._batchsize = len(batch) - if len(batch) != self._batchsize: - raise ValueError(f"BatchStore entries must contain exactly {self._batchsize} objects") - if self._blocksize is None: + if self._blocksize_max is None: payload_sizes = [len(msgpack_packb(item)) for item in batch] - self._blocksize = 
self._guess_blocksize(payload_sizes) - self._store_layout() + self._blocksize_max = self._guess_blocksize(payload_sizes) def _guess_blocksize(self, payload_sizes: list[int]) -> int: if not payload_sizes: @@ -330,11 +286,11 @@ def _vl_dparams_kwargs(self) -> dict[str, Any]: return asdict(self.schunk.dparams) def _compress_batch(self, batch: list[Any]) -> bytes: - if self._blocksize is None: - raise RuntimeError("BatchStore blocksize is not initialized") + if self._blocksize_max is None: + raise RuntimeError("BatchStore blocksize_max is not initialized") blocks = [ - self._serialize_block(batch[i : i + self._blocksize]) - for i in range(0, len(batch), self._blocksize) + self._serialize_block(batch[i : i + self._blocksize_max]) + for i in range(0, len(batch), self._blocksize_max) ] return blosc2.blosc2_ext.vlcompress(blocks, **self._vl_cparams_kwargs()) @@ -404,7 +360,6 @@ def clear(self) -> None: storage=storage, ) self._attach_schunk(schunk) - self._store_layout() def __getitem__(self, index: int | slice) -> Batch | list[Batch]: if isinstance(index, slice): @@ -469,12 +424,8 @@ def dparams(self): return self.schunk.dparams @property - def batchsize(self) -> int: - return self._batchsize - - @property - def blocksize(self) -> int: - return self._blocksize + def blocksize_max(self) -> int | None: + return self._blocksize_max @property def typesize(self) -> int: @@ -508,13 +459,19 @@ def info(self) -> InfoReporter: @property def info_items(self) -> list: """A list of tuples with summary information about this BatchStore.""" - nitems = len(self) * self.batchsize if self.batchsize is not None else 0 + batch_sizes = [len(batch) for batch in self] + if batch_sizes: + batch_stats = ( + f"mean={statistics.fmean(batch_sizes):.2f}, max={max(batch_sizes)}, min={min(batch_sizes)}" + ) + else: + batch_stats = "n/a" return [ ("type", f"{self.__class__.__name__}"), ("nbatches", len(self)), - ("batchsize", self.batchsize), - ("blocksize", self.blocksize), - ("nitems", nitems), + 
("batch stats", batch_stats), + ("blocksize_max", self.blocksize_max), + ("nitems", sum(batch_sizes)), ("nbytes", format_nbytes_info(self.nbytes)), ("cbytes", format_nbytes_info(self.cbytes)), ("cratio", f"{self.cratio:.2f}"), @@ -531,8 +488,7 @@ def copy(self, **kwargs: Any) -> BatchStore: raise ValueError("meta should not be passed to copy") kwargs["cparams"] = kwargs.get("cparams", copy.deepcopy(self.cparams)) kwargs["dparams"] = kwargs.get("dparams", copy.deepcopy(self.dparams)) - kwargs["batchsize"] = kwargs.get("batchsize", self.batchsize) - kwargs["blocksize"] = kwargs.get("blocksize", self.blocksize) + kwargs["blocksize_max"] = kwargs.get("blocksize_max", self.blocksize_max) if "storage" not in kwargs: kwargs["meta"] = self._copy_meta() diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index 1399626f..dbd54cde 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -46,12 +46,11 @@ def test_batchstore_roundtrip(contiguous, urlpath): assert barray.append(batch) == i assert len(barray) == len(BATCHES) - assert barray.batchsize == len(BATCHES[0]) - assert barray.blocksize is not None - assert 1 <= barray.blocksize <= barray.batchsize + assert barray.blocksize_max is not None + assert 1 <= barray.blocksize_max <= len(BATCHES[0]) assert [batch[:] for batch in barray] == BATCHES - with pytest.raises(ValueError): - barray.append([1, 2]) + assert barray.append([1, 2]) == len(BATCHES) + 1 + assert [batch[:] for batch in barray][-1] == [1, 2] batch0 = barray[0] assert isinstance(batch0, blosc2.Batch) @@ -64,6 +63,7 @@ def test_batchstore_roundtrip(contiguous, urlpath): assert batch0.cratio > 0 expected = list(BATCHES) + expected.append([1, 2]) expected[1] = ["updated", {"tuple": (7, 8)}, 99] expected[-1] = ["tiny", False, "x"] barray[1] = expected[1] @@ -83,8 +83,7 @@ def test_batchstore_roundtrip(contiguous, urlpath): if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") assert isinstance(reopened, blosc2.BatchStore) - 
assert reopened.batchsize == barray.batchsize - assert reopened.blocksize == barray.blocksize + assert reopened.blocksize_max is None assert [batch[:] for batch in reopened] == expected with pytest.raises(ValueError): reopened.append(["nope"]) @@ -145,8 +144,8 @@ def test_batchstore_info(): items = dict(barray.info_items) assert items["type"] == "BatchStore" assert items["nbatches"] == len(BATCHES) - assert items["batchsize"] == len(BATCHES[0]) - assert items["blocksize"] == barray.blocksize + assert items["batch stats"].startswith("mean=") + assert items["blocksize_max"] == barray.blocksize_max assert items["nitems"] == sum(len(batch) for batch in BATCHES) assert "urlpath" not in items assert "contiguous" not in items @@ -158,7 +157,8 @@ def test_batchstore_info(): text = repr(barray.info) assert "type" in text assert "BatchStore" in text - assert "batchsize" in text + assert "batch stats" in text + assert "blocksize_max" in text def test_batchstore_zstd_uses_dict_by_default(): @@ -167,12 +167,12 @@ def test_batchstore_zstd_uses_dict_by_default(): assert barray.cparams.use_dict is True -def test_batchstore_explicit_batchsize_blocksize(): - barray = blosc2.BatchStore(batchsize=3, blocksize=2) - assert barray.batchsize == 3 - assert barray.blocksize == 2 +def test_batchstore_explicit_blocksize_max(): + barray = blosc2.BatchStore(blocksize_max=2) + assert barray.blocksize_max == 2 barray.append([1, 2, 3]) - assert [batch[:] for batch in barray] == [[1, 2, 3]] + barray.append([4]) + assert [batch[:] for batch in barray] == [[1, 2, 3], [4]] def test_batchstore_respects_explicit_use_dict_and_non_zstd(): @@ -401,8 +401,8 @@ def test_batchstore_validation_errors(): with pytest.raises(IndexError): blosc2.BatchStore().pop() barray.extend([[1, 2, 3]]) - with pytest.raises(ValueError): - barray.append([2, 3]) + assert barray.append([2, 3]) == 2 + assert [batch[:] for batch in barray] == [[1, 2, 3], [2, 3]] with pytest.raises(NotImplementedError): barray.pop(slice(0, 1)) From 
e60e31c454c7a5978d7b5c69ffa637e14846b88f Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 19 Mar 2026 07:29:55 +0100 Subject: [PATCH 20/34] Implemented block-only reads for improved random read access --- CMakeLists.txt | 2 +- bench/batch_store.py | 151 ++++++++++++++++++++++++++++++++++++++ examples/batch_store.py | 68 +++++++++++++++++ src/blosc2/batch_store.py | 27 ++++--- src/blosc2/blosc2_ext.pyx | 49 +++++++++++++ src/blosc2/schunk.py | 4 + tests/test_batch_store.py | 51 ++++++++++--- 7 files changed, 328 insertions(+), 24 deletions(-) create mode 100644 bench/batch_store.py create mode 100644 examples/batch_store.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 133e06dc..3ed060c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG fb31a6ab43db2a26ba6d1c690166988b680c3fd7 # variable-length chunks/blocks + GIT_TAG c0f5416f55662fccad861aa0387e965f73f644b4 # variable-length chunks/blocks # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) diff --git a/bench/batch_store.py b/bench/batch_store.py new file mode 100644 index 00000000..67cc25ed --- /dev/null +++ b/bench/batch_store.py @@ -0,0 +1,151 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import random +import statistics +import time + +import blosc2 + + +URLPATH = "bench_batch_store.b2b" +NBATCHES = 10_000 +OBJECTS_PER_BATCH = 100 +TOTAL_OBJECTS = NBATCHES * OBJECTS_PER_BATCH +BLOCKSIZE_MAX = 32 +N_RANDOM_READS = 1_000 + + +def make_rgb(batch_index: int, item_index: int) -> dict[str, int]: + global_index = batch_index * OBJECTS_PER_BATCH + item_index + return { + "red": batch_index, + "green": item_index, + "blue": global_index, + } + + +def make_batch(batch_index: int) -> list[dict[str, int]]: + return [make_rgb(batch_index, item_index) for item_index in range(OBJECTS_PER_BATCH)] + + +def expected_entry(batch_index: int, item_index: int) -> dict[str, int]: + return { + "red": batch_index, + "green": item_index, + "blue": batch_index * OBJECTS_PER_BATCH + item_index, + } + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Benchmark BatchStore single-entry reads.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--codec", type=str, default="ZSTD", choices=[codec.name for codec in blosc2.Codec]) + parser.add_argument("--clevel", type=int, default=5) + parser.add_argument("--use-dict", action="store_true", help="Enable dictionaries for ZSTD/LZ4 codecs.") + parser.add_argument("--in-mem", action="store_true", help="Keep the BatchStore purely in memory.") + return parser + + +def build_store(codec: blosc2.Codec, clevel: int, use_dict: bool, in_mem: bool) -> blosc2.BatchStore | None: + if in_mem: + storage = blosc2.Storage(mode="w") + store = blosc2.BatchStore( + storage=storage, + blocksize_max=BLOCKSIZE_MAX, + cparams={ + "codec": codec, + "clevel": clevel, + "use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4), + }, + ) + for batch_index in range(NBATCHES): + 
store.append(make_batch(batch_index)) + return store + + blosc2.remove_urlpath(URLPATH) + storage = blosc2.Storage(urlpath=URLPATH, mode="w", contiguous=True) + cparams = { + "codec": codec, + "clevel": clevel, + "use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4), + } + with blosc2.BatchStore(storage=storage, blocksize_max=BLOCKSIZE_MAX, cparams=cparams) as store: + for batch_index in range(NBATCHES): + store.append(make_batch(batch_index)) + return None + + +def measure_random_reads(store: blosc2.BatchStore) -> tuple[list[tuple[int, int, int, dict[str, int]]], list[int]]: + rng = random.Random(2024) + samples: list[tuple[int, int, int, dict[str, int]]] = [] + timings_ns: list[int] = [] + + for _ in range(N_RANDOM_READS): + batch_index = rng.randrange(len(store)) + item_index = rng.randrange(OBJECTS_PER_BATCH) + t0 = time.perf_counter_ns() + value = store[batch_index][item_index] + timings_ns.append(time.perf_counter_ns() - t0) + if value != expected_entry(batch_index, item_index): + raise RuntimeError(f"Value mismatch at batch={batch_index}, item={item_index}") + samples.append((timings_ns[-1], batch_index, item_index, value)) + + return samples, timings_ns + + +def main() -> None: + parser = build_parser() + args = parser.parse_args() + codec = blosc2.Codec[args.codec] + use_dict = args.use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4) + + mode_label = "in-memory" if args.in_mem else "persistent" + article = "an" if args.in_mem else "a" + print(f"Building {article} {mode_label} BatchStore with 1,000,000 RGB dicts and timing 1,000 random scalar reads...") + print(f" codec: {codec.name}") + print(f" clevel: {args.clevel}") + print(f" use_dict: {use_dict}") + print(f" in_mem: {args.in_mem}") + t0 = time.perf_counter() + store = build_store(codec=codec, clevel=args.clevel, use_dict=use_dict, in_mem=args.in_mem) + build_time_s = time.perf_counter() - t0 + if args.in_mem: + assert store is not None + read_store = store + else: + 
read_store = blosc2.BatchStore(urlpath=URLPATH, mode="r", contiguous=True, blocksize_max=BLOCKSIZE_MAX) + samples, timings_ns = measure_random_reads(read_store) + + print() + print("BatchStore benchmark") + print(f" build time: {build_time_s:.3f} s") + print(f" batches: {len(read_store)}") + print(f" objects: {TOTAL_OBJECTS}") + print(f" blocksize_max: {read_store.blocksize_max}") + print() + print(read_store.info) + print(f"Random scalar reads: {N_RANDOM_READS}") + print(f" mean: {statistics.fmean(timings_ns) / 1_000:.2f} us") + print(f" max: {max(timings_ns) / 1_000:.2f} us") + print(f" min: {min(timings_ns) / 1_000:.2f} us") + print("Sample reads:") + for timing_ns, batch_index, item_index, value in samples[:5]: + print(f" {timing_ns / 1_000:.2f} us -> read_store[{batch_index}][{item_index}] = {value}") + if args.in_mem: + print("BatchStore kept in memory") + else: + print(f"BatchStore file at: {read_store.urlpath}") + + +if __name__ == "__main__": + main() diff --git a/examples/batch_store.py b/examples/batch_store.py new file mode 100644 index 00000000..5a127576 --- /dev/null +++ b/examples/batch_store.py @@ -0,0 +1,68 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import random + +import blosc2 + +URLPATH = "example_batch_store.b2b" +NBATCHES = 100 +OBJECTS_PER_BATCH = 100 +BLOCKSIZE_MAX = 32 +N_RANDOM_SAMPLES = 5 + + +def make_rgb(batch_index: int, item_index: int) -> dict[str, int]: + global_index = batch_index * OBJECTS_PER_BATCH + item_index + return { + "red": batch_index, + "green": item_index, + "blue": global_index, + } + + +def make_batch(batch_index: int) -> list[dict[str, int]]: + return [make_rgb(batch_index, item_index) for item_index in range(OBJECTS_PER_BATCH)] + + +def main() -> None: + # Start clean so the example is reproducible when run multiple times. + blosc2.remove_urlpath(URLPATH) + + storage = blosc2.Storage(urlpath=URLPATH, mode="w", contiguous=True) + with blosc2.BatchStore(storage=storage, blocksize_max=BLOCKSIZE_MAX) as store: + for batch_index in range(NBATCHES): + store.append(make_batch(batch_index)) + + total_objects = sum(len(batch) for batch in store) + print("Created BatchStore") + print(f" batches: {len(store)}") + print(f" objects: {total_objects}") + print(f" blocksize_max: {store.blocksize_max}") + + # Reopen with the same blocksize_max hint so scalar reads can use the + # VL-block path instead of decoding the entire batch. 
+ reopened = blosc2.BatchStore(urlpath=URLPATH, mode="r", contiguous=True, blocksize_max=BLOCKSIZE_MAX) + + print() + print(reopened.info) + + sample_rng = random.Random(2024) + print("Random scalar reads:") + for _ in range(N_RANDOM_SAMPLES): + batch_index = sample_rng.randrange(len(reopened)) + item_index = sample_rng.randrange(OBJECTS_PER_BATCH) + value = reopened[batch_index][item_index] + print(f" reopened[{batch_index}][{item_index}] -> {value}") + + print(f"BatchStore file at: {reopened.urlpath}") + + +if __name__ == "__main__": + main() diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index c30ef325..86579cc6 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -52,9 +52,24 @@ def _decode_items(self) -> list[Any]: return self._items def __getitem__(self, index: int | slice) -> Any | list[Any]: - items = self._decode_items() if isinstance(index, slice): + items = self._decode_items() + return items[index] + if index < 0: + items = self._decode_items() + index = self._normalize_index(index) return items[index] + blocksize_max = self._parent.blocksize_max + if blocksize_max is not None: + block_index, item_index = divmod(index, blocksize_max) + if block_index >= self._nblocks: + raise IndexError("Batch index out of range") + block = msgpack_unpackb(self._parent.schunk.get_vlblock(self._nbatch, block_index)) + try: + return block[item_index] + except IndexError as exc: + raise IndexError("Batch index out of range") from exc + items = self._decode_items() index = self._normalize_index(index) return items[index] @@ -90,27 +105,17 @@ class BatchStore: @staticmethod def _set_typesize_one(cparams: blosc2.CParams | dict | None) -> blosc2.CParams | dict: - auto_use_dict = cparams is None if cparams is None: cparams = blosc2.CParams() elif isinstance(cparams, blosc2.CParams): cparams = copy.deepcopy(cparams) else: cparams = dict(cparams) - auto_use_dict = "use_dict" not in cparams if isinstance(cparams, blosc2.CParams): 
cparams.typesize = 1 - if auto_use_dict and cparams.codec == blosc2.Codec.ZSTD and cparams.clevel > 0: - # BatchStore stores many small serialized payloads, where Zstd dicts help materially. - cparams.use_dict = True else: cparams["typesize"] = 1 - codec = cparams.get("codec", blosc2.Codec.ZSTD) - clevel = cparams.get("clevel", 5) - if auto_use_dict and codec == blosc2.Codec.ZSTD and clevel > 0: - # BatchStore stores many small serialized payloads, where Zstd dicts help materially. - cparams["use_dict"] = True return cparams @staticmethod diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 4c574015..7b6791fa 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -390,6 +390,8 @@ cdef extern from "blosc2.h": c_bool *needs_free) nogil int blosc2_schunk_get_lazychunk(blosc2_schunk *schunk, int64_t nchunk, uint8_t ** chunk, c_bool *needs_free) nogil + int blosc2_schunk_get_vlblock(blosc2_schunk *schunk, int64_t nchunk, int32_t nblock, + uint8_t **dest, int32_t *destsize) int blosc2_schunk_get_slice_buffer(blosc2_schunk *schunk, int64_t start, int64_t stop, void *buffer) int blosc2_schunk_set_slice_buffer(blosc2_schunk *schunk, int64_t start, int64_t stop, void *buffer) int blosc2_schunk_get_cparams(blosc2_schunk *schunk, blosc2_cparams** cparams) @@ -416,6 +418,9 @@ cdef extern from "blosc2.h": uint8_t **content, int32_t *content_len) int blosc2_vlmeta_delete(blosc2_schunk *schunk, const char *name) int blosc2_vlmeta_get_names(blosc2_schunk *schunk, char **names) + int blosc2_vldecompress_block_ctx(blosc2_context* context, const void* src, + int32_t srcsize, int32_t nblock, uint8_t** dest, + int32_t* destsize) int blosc1_get_blocksize() @@ -1282,6 +1287,40 @@ def vldecompress(src, **kwargs): blosc2_free_ctx(dctx) +def vldecompress_block(src, int32_t nblock, **kwargs): + cdef blosc2_dparams dparams + create_dparams_from_kwargs(&dparams, kwargs) + + cdef blosc2_context *dctx = blosc2_create_dctx(dparams) + if dctx == NULL: + raise 
RuntimeError("Could not create decompression context") + + cdef const uint8_t[:] typed_view_src + mem_view_src = memoryview(src) + typed_view_src = mem_view_src.cast('B') + _check_comp_length('src', typed_view_src.nbytes) + + cdef uint8_t *dest = NULL + cdef int32_t destsize = 0 + cdef int32_t rc + try: + rc = blosc2_vldecompress_block_ctx( + dctx, + &typed_view_src[0], + typed_view_src.nbytes, + nblock, + &dest, + &destsize, + ) + if rc < 0: + raise RuntimeError("Could not decompress the block") + return PyBytes_FromStringAndSize(dest, destsize) + finally: + if dest != NULL: + free(dest) + blosc2_free_ctx(dctx) + + cdef create_storage(blosc2_storage *storage, kwargs): contiguous = kwargs.get('contiguous', blosc2.storage_dflts['contiguous']) storage.contiguous = contiguous @@ -1687,6 +1726,16 @@ cdef class SChunk: free(chunk) return ret_chunk + def get_vlblock(self, nchunk, nblock): + cdef uint8_t *block + cdef int32_t destsize + cbytes = blosc2_schunk_get_vlblock(self.schunk, nchunk, nblock, &block, &destsize) + if cbytes < 0: + raise RuntimeError("Error while getting the vlblock") + ret_block = PyBytes_FromStringAndSize(block, destsize) + free(block) + return ret_block + def delete_chunk(self, nchunk): rc = blosc2_schunk_delete_chunk(self.schunk, nchunk) if rc < 0: diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index b39524c6..55b4acdf 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -674,6 +674,10 @@ def get_chunk(self, nchunk: int) -> bytes: """ return super().get_chunk(nchunk) + def get_vlblock(self, nchunk: int, nblock: int) -> bytes: + """Return the decompressed payload of one VL block from a chunk.""" + return super().get_vlblock(nchunk, nblock) + def delete_chunk(self, nchunk: int) -> int: """Delete the specified chunk from the SChunk. 
diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index dbd54cde..13a4b981 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -8,7 +8,7 @@ import pytest import blosc2 -from blosc2._msgpack_utils import msgpack_packb +from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb BATCHES = [ [b"bytes\x00payload", "plain text", 42], @@ -32,8 +32,8 @@ def _storage(contiguous, urlpath, mode="w"): [ (False, None), (True, None), - (True, "test_batchstore.b2frame"), - (False, "test_batchstore_s.b2frame"), + (True, "test_batchstore.b2b"), + (False, "test_batchstore_s.b2b"), ], ) def test_batchstore_roundtrip(contiguous, urlpath): @@ -161,10 +161,10 @@ def test_batchstore_info(): assert "blocksize_max" in text -def test_batchstore_zstd_uses_dict_by_default(): +def test_batchstore_zstd_does_not_use_dict_by_default(): barray = blosc2.BatchStore() assert barray.cparams.codec == blosc2.Codec.ZSTD - assert barray.cparams.use_dict is True + assert barray.cparams.use_dict is False def test_batchstore_explicit_blocksize_max(): @@ -175,6 +175,33 @@ def test_batchstore_explicit_blocksize_max(): assert [batch[:] for batch in barray] == [[1, 2, 3], [4]] +def test_batchstore_get_vlblock_and_scalar_access(): + urlpath = "test_batchstore_vlblock.b2b" + blosc2.remove_urlpath(urlpath) + + batch = [0, 1, 2, 3, 4] + barray = blosc2.BatchStore(storage=_storage(True, urlpath), blocksize_max=2) + barray.append(batch) + + assert barray.blocksize_max == 2 + assert msgpack_unpackb(barray.schunk.get_vlblock(0, 0)) == batch[:2] + assert msgpack_unpackb(barray.schunk.get_vlblock(0, 1)) == batch[2:4] + assert msgpack_unpackb(barray.schunk.get_vlblock(0, 2)) == batch[4:] + + assert barray[0][0] == 0 + assert barray[0][2] == 2 + assert barray[0][4] == 4 + + reopened = blosc2.open(urlpath, mode="r") + assert isinstance(reopened, blosc2.BatchStore) + assert reopened[0][0] == 0 + assert reopened[0][2] == 2 + assert reopened[0][4] == 4 + assert 
msgpack_unpackb(reopened.schunk.get_vlblock(0, 1)) == batch[2:4] + + blosc2.remove_urlpath(urlpath) + + def test_batchstore_respects_explicit_use_dict_and_non_zstd(): barray = blosc2.BatchStore(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) assert barray.cparams.codec == blosc2.Codec.LZ4 @@ -229,7 +256,7 @@ def test_vlcompress_small_blocks_roundtrip(): def test_batchstore_constructor_kwargs(): - urlpath = "test_batchstore_kwargs.b2frame" + urlpath = "test_batchstore_kwargs.b2b" blosc2.remove_urlpath(urlpath) barray = blosc2.BatchStore(urlpath=urlpath, mode="w", contiguous=True) @@ -246,8 +273,8 @@ def test_batchstore_constructor_kwargs(): [ (False, None), (True, None), - (True, "test_batchstore_list_ops.b2frame"), - (False, "test_batchstore_list_ops_s.b2frame"), + (True, "test_batchstore_list_ops.b2b"), + (False, "test_batchstore_list_ops_s.b2b"), ], ) def test_batchstore_list_like_ops(contiguous, urlpath): @@ -279,8 +306,8 @@ def test_batchstore_list_like_ops(contiguous, urlpath): [ (False, None), (True, None), - (True, "test_batchstore_slices.b2frame"), - (False, "test_batchstore_slices_s.b2frame"), + (True, "test_batchstore_slices.b2b"), + (False, "test_batchstore_slices_s.b2b"), ], ) def test_batchstore_slices(contiguous, urlpath): @@ -329,8 +356,8 @@ def test_batchstore_slice_errors(): def test_batchstore_copy(): - urlpath = "test_batchstore_copy.b2frame" - copy_path = "test_batchstore_copy_out.b2frame" + urlpath = "test_batchstore_copy.b2b" + copy_path = "test_batchstore_copy_out.b2b" blosc2.remove_urlpath(urlpath) blosc2.remove_urlpath(copy_path) From 4eed97c7832dc3f871def45095ca2a3da450ba70 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 19 Mar 2026 07:34:17 +0100 Subject: [PATCH 21/34] New cache for the last block read --- src/blosc2/batch_store.py | 12 +++++++++++- tests/test_batch_store.py | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index 
86579cc6..b3de30a2 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -34,6 +34,8 @@ def __init__(self, parent: BatchStore, nbatch: int, lazybatch: bytes) -> None: self._nbatch = nbatch self._lazybatch = lazybatch self._items: list[Any] | None = None + self._cached_block_index: int | None = None + self._cached_block: list[Any] | None = None self._nbytes, self._cbytes, self._nblocks = blosc2.get_cbuffer_sizes(lazybatch) def _normalize_index(self, index: int) -> int: @@ -51,6 +53,14 @@ def _decode_items(self) -> list[Any]: self._items = [item for block in blocks for item in block] return self._items + def _get_block(self, block_index: int) -> list[Any]: + if self._cached_block_index == block_index and self._cached_block is not None: + return self._cached_block + block = msgpack_unpackb(self._parent.schunk.get_vlblock(self._nbatch, block_index)) + self._cached_block_index = block_index + self._cached_block = block + return block + def __getitem__(self, index: int | slice) -> Any | list[Any]: if isinstance(index, slice): items = self._decode_items() @@ -64,7 +74,7 @@ def __getitem__(self, index: int | slice) -> Any | list[Any]: block_index, item_index = divmod(index, blocksize_max) if block_index >= self._nblocks: raise IndexError("Batch index out of range") - block = msgpack_unpackb(self._parent.schunk.get_vlblock(self._nbatch, block_index)) + block = self._get_block(block_index) try: return block[item_index] except IndexError as exc: diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index 13a4b981..de56ac58 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -202,6 +202,30 @@ def test_batchstore_get_vlblock_and_scalar_access(): blosc2.remove_urlpath(urlpath) +def test_batchstore_scalar_reads_cache_vlblocks(): + barray = blosc2.BatchStore(blocksize_max=2) + barray.append([0, 1, 2, 3, 4]) + + batch = barray[0] + original_get_vlblock = barray.schunk.get_vlblock + calls = [] + + def wrapped_get_vlblock(nchunk, 
nblock): + calls.append((nchunk, nblock)) + return original_get_vlblock(nchunk, nblock) + + barray.schunk.get_vlblock = wrapped_get_vlblock + try: + assert batch[0] == 0 + assert batch[1] == 1 + assert batch[0] == 0 + assert batch[2] == 2 + assert batch[3] == 3 + assert calls == [(0, 0), (0, 1)] + finally: + barray.schunk.get_vlblock = original_get_vlblock + + def test_batchstore_respects_explicit_use_dict_and_non_zstd(): barray = blosc2.BatchStore(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) assert barray.cparams.codec == blosc2.Codec.LZ4 From f51c430e2b3e42573ae5cd6bcc453fa2bf60e118 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 19 Mar 2026 07:44:27 +0100 Subject: [PATCH 22/34] New iter_objects for iterating over objects in batch store --- bench/batch_store.py | 10 +++++++ src/blosc2/batch_store.py | 13 ++++++--- tests/test_batch_store.py | 57 ++++++++++++++++++++++----------------- 3 files changed, 53 insertions(+), 27 deletions(-) diff --git a/bench/batch_store.py b/bench/batch_store.py index 67cc25ed..abc5fbe0 100644 --- a/bench/batch_store.py +++ b/bench/batch_store.py @@ -125,6 +125,13 @@ def main() -> None: else: read_store = blosc2.BatchStore(urlpath=URLPATH, mode="r", contiguous=True, blocksize_max=BLOCKSIZE_MAX) samples, timings_ns = measure_random_reads(read_store) + t0 = time.perf_counter() + checksum = 0 + nobjects = 0 + for obj in read_store.iter_objects(): + checksum += obj["blue"] + nobjects += 1 + iter_time_s = time.perf_counter() - t0 print() print("BatchStore benchmark") @@ -138,6 +145,9 @@ def main() -> None: print(f" mean: {statistics.fmean(timings_ns) / 1_000:.2f} us") print(f" max: {max(timings_ns) / 1_000:.2f} us") print(f" min: {min(timings_ns) / 1_000:.2f} us") + print(f"Object iteration via iter_objects(): {iter_time_s:.3f} s") + print(f" per object: {iter_time_s * 1_000_000 / nobjects:.2f} us") + print(f" checksum: {checksum}") print("Sample reads:") for timing_ns, batch_index, item_index, value in samples[:5]: print(f" 
{timing_ns / 1_000:.2f} us -> read_store[{batch_index}][{item_index}] = {value}") diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index b3de30a2..55f5adf6 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -418,10 +418,17 @@ def __delitem__(self, index: int | slice) -> None: def __len__(self) -> int: return self.schunk.nchunks - def __iter__(self) -> Iterator[Batch]: + def iter_batches(self) -> Iterator[Batch]: for i in range(len(self)): yield self[i] + def iter_objects(self) -> Iterator[Any]: + for batch in self.iter_batches(): + yield from batch + + def __iter__(self) -> Iterator[Batch]: + yield from self.iter_batches() + @property def meta(self): return self.schunk.meta @@ -474,7 +481,7 @@ def info(self) -> InfoReporter: @property def info_items(self) -> list: """A list of tuples with summary information about this BatchStore.""" - batch_sizes = [len(batch) for batch in self] + batch_sizes = [len(batch) for batch in self.iter_batches()] if batch_sizes: batch_stats = ( f"mean={statistics.fmean(batch_sizes):.2f}, max={max(batch_sizes)}, min={min(batch_sizes)}" @@ -515,7 +522,7 @@ def copy(self, **kwargs: Any) -> BatchStore: if "storage" not in kwargs and len(self.vlmeta) > 0: for key, value in self.vlmeta.getall().items(): out.vlmeta[key] = value - out.extend(self) + out.extend(self.iter_batches()) return out def __enter__(self) -> BatchStore: diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index de56ac58..daed6145 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -48,9 +48,9 @@ def test_batchstore_roundtrip(contiguous, urlpath): assert len(barray) == len(BATCHES) assert barray.blocksize_max is not None assert 1 <= barray.blocksize_max <= len(BATCHES[0]) - assert [batch[:] for batch in barray] == BATCHES + assert [batch[:] for batch in barray.iter_batches()] == BATCHES assert barray.append([1, 2]) == len(BATCHES) + 1 - assert [batch[:] for batch in barray][-1] == [1, 2] + assert 
[batch[:] for batch in barray.iter_batches()][-1] == [1, 2] batch0 = barray[0] assert isinstance(batch0, blosc2.Batch) @@ -78,13 +78,13 @@ def test_batchstore_roundtrip(contiguous, urlpath): del expected[2] del barray[-2] del expected[-2] - assert [batch[:] for batch in barray] == expected + assert [batch[:] for batch in barray.iter_batches()] == expected if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") assert isinstance(reopened, blosc2.BatchStore) assert reopened.blocksize_max is None - assert [batch[:] for batch in reopened] == expected + assert [batch[:] for batch in reopened.iter_batches()] == expected with pytest.raises(ValueError): reopened.append(["nope"]) with pytest.raises(ValueError): @@ -105,12 +105,12 @@ def test_batchstore_roundtrip(contiguous, urlpath): reopened_rw = blosc2.open(urlpath, mode="a") reopened_rw[0] = ["changed", "batch", 0] expected[0] = ["changed", "batch", 0] - assert [batch[:] for batch in reopened_rw] == expected + assert [batch[:] for batch in reopened_rw.iter_batches()] == expected if contiguous: reopened_mmap = blosc2.open(urlpath, mode="r", mmap_mode="r") assert isinstance(reopened_mmap, blosc2.BatchStore) - assert [batch[:] for batch in reopened_mmap] == expected + assert [batch[:] for batch in reopened_mmap.iter_batches()] == expected blosc2.remove_urlpath(urlpath) @@ -126,11 +126,11 @@ def test_batchstore_from_cframe(): restored = blosc2.from_cframe(barray.to_cframe()) assert isinstance(restored, blosc2.BatchStore) - assert [batch[:] for batch in restored] == expected + assert [batch[:] for batch in restored.iter_batches()] == expected restored2 = blosc2.from_cframe(barray.to_cframe()) assert isinstance(restored2, blosc2.BatchStore) - assert [batch[:] for batch in restored2] == expected + assert [batch[:] for batch in restored2.iter_batches()] == expected def test_batchstore_info(): @@ -172,7 +172,7 @@ def test_batchstore_explicit_blocksize_max(): assert barray.blocksize_max == 2 barray.append([1, 2, 3]) 
barray.append([4]) - assert [batch[:] for batch in barray] == [[1, 2, 3], [4]] + assert [batch[:] for batch in barray.iter_batches()] == [[1, 2, 3], [4]] def test_batchstore_get_vlblock_and_scalar_access(): @@ -226,6 +226,15 @@ def wrapped_get_vlblock(nchunk, nblock): barray.schunk.get_vlblock = original_get_vlblock +def test_batchstore_iter_objects(): + barray = blosc2.BatchStore(blocksize_max=2) + batches = [[1, 2, 3], [4], [5, 6]] + barray.extend(batches) + + assert [batch[:] for batch in barray] == batches + assert list(barray.iter_objects()) == [1, 2, 3, 4, 5, 6] + + def test_batchstore_respects_explicit_use_dict_and_non_zstd(): barray = blosc2.BatchStore(cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) assert barray.cparams.codec == blosc2.Codec.LZ4 @@ -287,7 +296,7 @@ def test_batchstore_constructor_kwargs(): barray.extend(BATCHES) reopened = blosc2.BatchStore(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") - assert [batch[:] for batch in reopened] == BATCHES + assert [batch[:] for batch in reopened.iter_batches()] == BATCHES blosc2.remove_urlpath(urlpath) @@ -306,21 +315,21 @@ def test_batchstore_list_like_ops(contiguous, urlpath): barray = blosc2.BatchStore(storage=_storage(contiguous, urlpath)) barray.extend([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - assert [batch[:] for batch in barray] == [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + assert [batch[:] for batch in barray.iter_batches()] == [[1, 2, 3], [4, 5, 6], [7, 8, 9]] assert barray.pop() == [7, 8, 9] assert barray.pop(0) == [1, 2, 3] - assert [batch[:] for batch in barray] == [[4, 5, 6]] + assert [batch[:] for batch in barray.iter_batches()] == [[4, 5, 6]] barray.clear() assert len(barray) == 0 - assert [batch[:] for batch in barray] == [] + assert [batch[:] for batch in barray.iter_batches()] == [] barray.extend([["a", "b", "c"], ["d", "e", "f"]]) - assert [batch[:] for batch in barray] == [["a", "b", "c"], ["d", "e", "f"]] + assert [batch[:] for batch in barray.iter_batches()] == [["a", "b", "c"], 
["d", "e", "f"]] if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") - assert [batch[:] for batch in reopened] == [["a", "b", "c"], ["d", "e", "f"]] + assert [batch[:] for batch in reopened.iter_batches()] == [["a", "b", "c"], ["d", "e", "f"]] blosc2.remove_urlpath(urlpath) @@ -346,15 +355,15 @@ def test_batchstore_slices(contiguous, urlpath): barray[2:5] = [["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]] expected[2:5] = [["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]] - assert [batch[:] for batch in barray] == expected + assert [batch[:] for batch in barray.iter_batches()] == expected barray[1:6:2] = [[100, 101, 102], [103, 104, 105], [106, 107, 108]] expected[1:6:2] = [[100, 101, 102], [103, 104, 105], [106, 107, 108]] - assert [batch[:] for batch in barray] == expected + assert [batch[:] for batch in barray.iter_batches()] == expected del barray[::3] del expected[::3] - assert [batch[:] for batch in barray] == expected + assert [batch[:] for batch in barray.iter_batches()] == expected if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") @@ -392,14 +401,14 @@ def test_batchstore_copy(): copied = original.copy( urlpath=copy_path, contiguous=False, cparams={"codec": blosc2.Codec.LZ4, "clevel": 5} ) - assert [batch[:] for batch in copied] == [batch[:] for batch in original] + assert [batch[:] for batch in copied.iter_batches()] == [batch[:] for batch in original.iter_batches()] assert copied.urlpath == copy_path assert copied.schunk.contiguous is False assert copied.cparams.codec == blosc2.Codec.LZ4 assert copied.cparams.clevel == 5 inmem = original.copy() - assert [batch[:] for batch in inmem] == [batch[:] for batch in original] + assert [batch[:] for batch in inmem.iter_batches()] == [batch[:] for batch in original.iter_batches()] assert inmem.urlpath is None with pytest.raises(ValueError, match="meta should not be passed to copy"): @@ -434,7 +443,7 @@ def test_batchstore_multithreaded_inner_vl(contiguous, nthreads): ) 
barray.extend(batches) - assert [batch[:] for batch in barray] == batches + assert [batch[:] for batch in barray.iter_batches()] == batches assert [barray[i][:] for i in range(len(barray))] == batches @@ -453,7 +462,7 @@ def test_batchstore_validation_errors(): blosc2.BatchStore().pop() barray.extend([[1, 2, 3]]) assert barray.append([2, 3]) == 2 - assert [batch[:] for batch in barray] == [[1, 2, 3], [2, 3]] + assert [batch[:] for batch in barray.iter_batches()] == [[1, 2, 3], [2, 3]] with pytest.raises(NotImplementedError): barray.pop(slice(0, 1)) @@ -466,7 +475,7 @@ def test_batchstore_in_embed_store(): estore["/batch"] = barray restored = estore["/batch"] assert isinstance(restored, blosc2.BatchStore) - assert [batch[:] for batch in restored] == BATCHES + assert [batch[:] for batch in restored.iter_batches()] == BATCHES def test_batchstore_in_dict_store(): @@ -481,6 +490,6 @@ def test_batchstore_in_dict_store(): with blosc2.DictStore(path, mode="r") as dstore: restored = dstore["/batch"] assert isinstance(restored, blosc2.BatchStore) - assert [batch[:] for batch in restored] == BATCHES + assert [batch[:] for batch in restored.iter_batches()] == BATCHES blosc2.remove_urlpath(path) From ab8b49586194980d7eca045488d3df58279da1f7 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 19 Mar 2026 20:20:51 +0100 Subject: [PATCH 23/34] Recognize .b2b extension as BatchStore in DictStore --- src/blosc2/dict_store.py | 17 ++++++++++++----- src/blosc2/tree_store.py | 11 +++++++++-- tests/test_tree_store.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 0cb5ef63..17f50ac6 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -36,6 +36,8 @@ class DictStore: are stored as .b2nd files. - blosc2.SChunk: super-chunks. When persisted externally they are stored as .b2f files. + - blosc2.BatchStore: batched variable-length containers. 
When persisted + externally they are stored as .b2b files. - blosc2.C2Array: columnar containers. These are always kept inside the embedded store (never externalized). - numpy.ndarray: converted to blosc2.NDArray on assignment. @@ -91,7 +93,7 @@ class DictStore: Notes ----- - External persistence uses the following file extensions: - .b2nd for NDArray and .b2f for SChunk. + .b2nd for NDArray, .b2f for SChunk, and .b2b for BatchStore. """ def __init__( @@ -181,8 +183,11 @@ def _init_read_mode(self, dparams: blosc2.DParams | None = None): dparams=dparams, ) for filepath in self.offsets: - if filepath.endswith((".b2nd", ".b2f")): - key = "/" + filepath[: -5 if filepath.endswith(".b2nd") else -4] + if filepath.endswith((".b2nd", ".b2f", ".b2b")): + if filepath.endswith(".b2nd"): + key = "/" + filepath[:-5] + else: + key = "/" + filepath[:-4] self.map_tree[key] = filepath else: # .b2d if not os.path.isdir(self.localpath): @@ -228,14 +233,14 @@ def _update_map_tree(self): for root, _, files in os.walk(self.working_dir): for file in files: filepath = os.path.join(root, file) - if filepath.endswith((".b2nd", ".b2f")): + if filepath.endswith((".b2nd", ".b2f", ".b2b")): # Convert filename to key: remove extension and ensure starts with / rel_path = os.path.relpath(filepath, self.working_dir) # Normalize path separators to forward slashes for cross-platform consistency rel_path = rel_path.replace(os.sep, "/") if rel_path.endswith(".b2nd"): key = rel_path[:-5] - elif rel_path.endswith(".b2f"): + elif rel_path.endswith(".b2b") or rel_path.endswith(".b2f"): key = rel_path[:-4] else: continue @@ -264,6 +269,8 @@ def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.Ba def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore) -> str: if isinstance(value, blosc2.NDArray): return ".b2nd" + if isinstance(value, blosc2.BatchStore): + return ".b2b" return ".b2f" def __setitem__( diff --git a/src/blosc2/tree_store.py 
b/src/blosc2/tree_store.py index 7f4fe6ba..9be6672f 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -668,8 +668,15 @@ def _persist_vlmeta(self) -> None: """ if hasattr(self, "_vlmeta_key"): vlmeta_key = self._vlmeta_key - # Only embedded case is expected; handle it safely. - if hasattr(self, "_estore") and vlmeta_key in self._estore: + if vlmeta_key in self.map_tree: + filepath = self.map_tree[vlmeta_key] + dest_path = os.path.join(self.working_dir, filepath) + parent_dir = os.path.dirname(dest_path) + if parent_dir and not os.path.exists(parent_dir): + os.makedirs(parent_dir, exist_ok=True) + with open(dest_path, "wb") as f: + f.write(self._vlmeta.to_cframe()) + elif hasattr(self, "_estore") and vlmeta_key in self._estore: # Replace the stored snapshot with contextlib.suppress(KeyError): del self._estore[vlmeta_key] diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index 5da45f64..bfbf791c 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -654,6 +654,34 @@ def test_external_vlarray_support(): os.remove("test_vlarray_external.b2z") +def test_external_batchstore_support(tmp_path): + store_path = tmp_path / "test_batchstore_external.b2d" + + with TreeStore(str(store_path), mode="w", threshold=0) as tstore: + bstore = blosc2.BatchStore(blocksize_max=2) + bstore.extend([[{"id": 1}, {"id": 2}], [{"id": 3}]]) + tstore["/data/batchstore"] = bstore + + batchstore_path = store_path / "data" / "batchstore.b2b" + assert batchstore_path.exists() + + with TreeStore(str(store_path), mode="r") as tstore: + retrieved = tstore["/data/batchstore"] + assert isinstance(retrieved, blosc2.BatchStore) + assert [batch[:] for batch in retrieved] == [[{"id": 1}, {"id": 2}], [{"id": 3}]] + + +def test_treestore_vlmeta_externalized_b2d(tmp_path): + store_path = tmp_path / "test_vlmeta_externalized.b2d" + + with TreeStore(str(store_path), mode="w", threshold=0) as tstore: + tstore["/data"] = np.array([1, 2, 3]) + 
tstore.vlmeta["schema_manifest"] = {"version": 1, "fields": {"a": {"kind": "fixed"}}} + + with TreeStore(str(store_path), mode="r") as tstore: + assert tstore.vlmeta["schema_manifest"] == {"version": 1, "fields": {"a": {"kind": "fixed"}}} + + def test_walk_topdown_argument_ordering(): """Ensure walk supports topdown argument mimicking os.walk order semantics.""" with TreeStore("test_walk_topdown.b2z", mode="w") as tstore: From d21ed20c958a9a33795b79abbede47588a07c9b0 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 19 Mar 2026 20:38:00 +0100 Subject: [PATCH 24/34] blocksize_max -> max_blocksize. also, this is persisted in metalayer now. --- bench/batch_store.py | 8 ++--- examples/batch_store.py | 8 ++--- src/blosc2/batch_store.py | 63 +++++++++++++++++++++++++++------------ tests/test_batch_store.py | 25 ++++++++-------- tests/test_tree_store.py | 2 +- 5 files changed, 66 insertions(+), 40 deletions(-) diff --git a/bench/batch_store.py b/bench/batch_store.py index abc5fbe0..ca3d83b0 100644 --- a/bench/batch_store.py +++ b/bench/batch_store.py @@ -61,7 +61,7 @@ def build_store(codec: blosc2.Codec, clevel: int, use_dict: bool, in_mem: bool) storage = blosc2.Storage(mode="w") store = blosc2.BatchStore( storage=storage, - blocksize_max=BLOCKSIZE_MAX, + max_blocksize=BLOCKSIZE_MAX, cparams={ "codec": codec, "clevel": clevel, @@ -79,7 +79,7 @@ def build_store(codec: blosc2.Codec, clevel: int, use_dict: bool, in_mem: bool) "clevel": clevel, "use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4), } - with blosc2.BatchStore(storage=storage, blocksize_max=BLOCKSIZE_MAX, cparams=cparams) as store: + with blosc2.BatchStore(storage=storage, max_blocksize=BLOCKSIZE_MAX, cparams=cparams) as store: for batch_index in range(NBATCHES): store.append(make_batch(batch_index)) return None @@ -123,7 +123,7 @@ def main() -> None: assert store is not None read_store = store else: - read_store = blosc2.BatchStore(urlpath=URLPATH, mode="r", contiguous=True, 
blocksize_max=BLOCKSIZE_MAX) + read_store = blosc2.BatchStore(urlpath=URLPATH, mode="r", contiguous=True, max_blocksize=BLOCKSIZE_MAX) samples, timings_ns = measure_random_reads(read_store) t0 = time.perf_counter() checksum = 0 @@ -138,7 +138,7 @@ def main() -> None: print(f" build time: {build_time_s:.3f} s") print(f" batches: {len(read_store)}") print(f" objects: {TOTAL_OBJECTS}") - print(f" blocksize_max: {read_store.blocksize_max}") + print(f" max_blocksize: {read_store.max_blocksize}") print() print(read_store.info) print(f"Random scalar reads: {N_RANDOM_READS}") diff --git a/examples/batch_store.py b/examples/batch_store.py index 5a127576..4a387af5 100644 --- a/examples/batch_store.py +++ b/examples/batch_store.py @@ -36,7 +36,7 @@ def main() -> None: blosc2.remove_urlpath(URLPATH) storage = blosc2.Storage(urlpath=URLPATH, mode="w", contiguous=True) - with blosc2.BatchStore(storage=storage, blocksize_max=BLOCKSIZE_MAX) as store: + with blosc2.BatchStore(storage=storage, max_blocksize=BLOCKSIZE_MAX) as store: for batch_index in range(NBATCHES): store.append(make_batch(batch_index)) @@ -44,11 +44,11 @@ def main() -> None: print("Created BatchStore") print(f" batches: {len(store)}") print(f" objects: {total_objects}") - print(f" blocksize_max: {store.blocksize_max}") + print(f" max_blocksize: {store.max_blocksize}") - # Reopen with the same blocksize_max hint so scalar reads can use the + # Reopen with the same max_blocksize hint so scalar reads can use the # VL-block path instead of decoding the entire batch. 
- reopened = blosc2.BatchStore(urlpath=URLPATH, mode="r", contiguous=True, blocksize_max=BLOCKSIZE_MAX) + reopened = blosc2.BatchStore(urlpath=URLPATH, mode="r", contiguous=True, max_blocksize=BLOCKSIZE_MAX) print() print(reopened.info) diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index 55f5adf6..78f1b364 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -18,7 +18,7 @@ from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb from blosc2.info import InfoReporter, format_nbytes_info -_BATCHSTORE_META = {"version": 1, "serializer": "msgpack"} +_BATCHSTORE_META = {"version": 1, "serializer": "msgpack", "max_blocksize": None} def _check_serialized_size(buffer: bytes) -> None: @@ -69,9 +69,9 @@ def __getitem__(self, index: int | slice) -> Any | list[Any]: items = self._decode_items() index = self._normalize_index(index) return items[index] - blocksize_max = self._parent.blocksize_max - if blocksize_max is not None: - block_index, item_index = divmod(index, blocksize_max) + max_blocksize = self._parent.max_blocksize + if max_blocksize is not None: + block_index, item_index = divmod(index, max_blocksize) if block_index >= self._nblocks: raise IndexError("Batch index out of range") block = self._get_block(block_index) @@ -158,6 +158,11 @@ def _attach_schunk(self, schunk: blosc2.SChunk) -> None: self.schunk = schunk self.mode = schunk.mode self.mmap_mode = getattr(schunk, "mmap_mode", None) + try: + batchstore_meta = self.schunk.meta["batchstore"] + except KeyError: + batchstore_meta = {} + self._max_blocksize = batchstore_meta.get("max_blocksize", self._max_blocksize) self._validate_tag() def _maybe_open_existing(self, storage: blosc2.Storage) -> bool: @@ -181,13 +186,13 @@ def _make_storage(self) -> blosc2.Storage: def __init__( self, - blocksize_max: int | None = None, + max_blocksize: int | None = None, _from_schunk: blosc2.SChunk | None = None, **kwargs: Any, ) -> None: - if blocksize_max is not None and 
blocksize_max <= 0: - raise ValueError("blocksize_max must be a positive integer") - self._blocksize_max: int | None = blocksize_max + if max_blocksize is not None and max_blocksize <= 0: + raise ValueError("max_blocksize must be a positive integer") + self._max_blocksize: int | None = max_blocksize if _from_schunk is not None: if kwargs: unexpected = ", ".join(sorted(kwargs)) @@ -213,7 +218,7 @@ def __init__( return fixed_meta = dict(storage.meta or {}) - fixed_meta["batchstore"] = dict(_BATCHSTORE_META) + fixed_meta["batchstore"] = {**_BATCHSTORE_META, "max_blocksize": self._max_blocksize} storage.meta = fixed_meta schunk = blosc2.SChunk(chunksize=-1, data=None, cparams=cparams, dparams=dparams, storage=storage) self._attach_schunk(schunk) @@ -263,9 +268,29 @@ def _normalize_batch(self, value: object) -> list[Any]: return values def _ensure_layout_for_batch(self, batch: list[Any]) -> None: - if self._blocksize_max is None: + if self._max_blocksize is None: payload_sizes = [len(msgpack_packb(item)) for item in batch] - self._blocksize_max = self._guess_blocksize(payload_sizes) + self._max_blocksize = self._guess_blocksize(payload_sizes) + self._persist_max_blocksize() + + def _persist_max_blocksize(self) -> None: + if self._max_blocksize is None or len(self) > 0: + return + storage = self._make_storage() + fixed_meta = dict(storage.meta or {}) + fixed_meta["batchstore"] = { + **dict(fixed_meta.get("batchstore", {})), + "max_blocksize": self._max_blocksize, + } + storage.meta = fixed_meta + schunk = blosc2.SChunk( + chunksize=-1, + data=None, + cparams=copy.deepcopy(self.cparams), + dparams=copy.deepcopy(self.dparams), + storage=storage, + ) + self._attach_schunk(schunk) def _guess_blocksize(self, payload_sizes: list[int]) -> int: if not payload_sizes: @@ -301,11 +326,11 @@ def _vl_dparams_kwargs(self) -> dict[str, Any]: return asdict(self.schunk.dparams) def _compress_batch(self, batch: list[Any]) -> bytes: - if self._blocksize_max is None: - raise 
RuntimeError("BatchStore blocksize_max is not initialized") + if self._max_blocksize is None: + raise RuntimeError("BatchStore max_blocksize is not initialized") blocks = [ - self._serialize_block(batch[i : i + self._blocksize_max]) - for i in range(0, len(batch), self._blocksize_max) + self._serialize_block(batch[i : i + self._max_blocksize]) + for i in range(0, len(batch), self._max_blocksize) ] return blosc2.blosc2_ext.vlcompress(blocks, **self._vl_cparams_kwargs()) @@ -446,8 +471,8 @@ def dparams(self): return self.schunk.dparams @property - def blocksize_max(self) -> int | None: - return self._blocksize_max + def max_blocksize(self) -> int | None: + return self._max_blocksize @property def typesize(self) -> int: @@ -492,7 +517,7 @@ def info_items(self) -> list: ("type", f"{self.__class__.__name__}"), ("nbatches", len(self)), ("batch stats", batch_stats), - ("blocksize_max", self.blocksize_max), + ("max_blocksize", self.max_blocksize), ("nitems", sum(batch_sizes)), ("nbytes", format_nbytes_info(self.nbytes)), ("cbytes", format_nbytes_info(self.cbytes)), @@ -510,7 +535,7 @@ def copy(self, **kwargs: Any) -> BatchStore: raise ValueError("meta should not be passed to copy") kwargs["cparams"] = kwargs.get("cparams", copy.deepcopy(self.cparams)) kwargs["dparams"] = kwargs.get("dparams", copy.deepcopy(self.dparams)) - kwargs["blocksize_max"] = kwargs.get("blocksize_max", self.blocksize_max) + kwargs["max_blocksize"] = kwargs.get("max_blocksize", self.max_blocksize) if "storage" not in kwargs: kwargs["meta"] = self._copy_meta() diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index daed6145..42486257 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -46,8 +46,8 @@ def test_batchstore_roundtrip(contiguous, urlpath): assert barray.append(batch) == i assert len(barray) == len(BATCHES) - assert barray.blocksize_max is not None - assert 1 <= barray.blocksize_max <= len(BATCHES[0]) + assert barray.max_blocksize is not None + assert 1 
<= barray.max_blocksize <= len(BATCHES[0]) assert [batch[:] for batch in barray.iter_batches()] == BATCHES assert barray.append([1, 2]) == len(BATCHES) + 1 assert [batch[:] for batch in barray.iter_batches()][-1] == [1, 2] @@ -83,7 +83,7 @@ def test_batchstore_roundtrip(contiguous, urlpath): if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") assert isinstance(reopened, blosc2.BatchStore) - assert reopened.blocksize_max is None + assert reopened.max_blocksize == barray.max_blocksize assert [batch[:] for batch in reopened.iter_batches()] == expected with pytest.raises(ValueError): reopened.append(["nope"]) @@ -145,7 +145,7 @@ def test_batchstore_info(): assert items["type"] == "BatchStore" assert items["nbatches"] == len(BATCHES) assert items["batch stats"].startswith("mean=") - assert items["blocksize_max"] == barray.blocksize_max + assert items["max_blocksize"] == barray.max_blocksize assert items["nitems"] == sum(len(batch) for batch in BATCHES) assert "urlpath" not in items assert "contiguous" not in items @@ -158,7 +158,7 @@ def test_batchstore_info(): assert "type" in text assert "BatchStore" in text assert "batch stats" in text - assert "blocksize_max" in text + assert "max_blocksize" in text def test_batchstore_zstd_does_not_use_dict_by_default(): @@ -167,9 +167,9 @@ def test_batchstore_zstd_does_not_use_dict_by_default(): assert barray.cparams.use_dict is False -def test_batchstore_explicit_blocksize_max(): - barray = blosc2.BatchStore(blocksize_max=2) - assert barray.blocksize_max == 2 +def test_batchstore_explicit_max_blocksize(): + barray = blosc2.BatchStore(max_blocksize=2) + assert barray.max_blocksize == 2 barray.append([1, 2, 3]) barray.append([4]) assert [batch[:] for batch in barray.iter_batches()] == [[1, 2, 3], [4]] @@ -180,10 +180,10 @@ def test_batchstore_get_vlblock_and_scalar_access(): blosc2.remove_urlpath(urlpath) batch = [0, 1, 2, 3, 4] - barray = blosc2.BatchStore(storage=_storage(True, urlpath), blocksize_max=2) + barray = 
blosc2.BatchStore(storage=_storage(True, urlpath), max_blocksize=2) barray.append(batch) - assert barray.blocksize_max == 2 + assert barray.max_blocksize == 2 assert msgpack_unpackb(barray.schunk.get_vlblock(0, 0)) == batch[:2] assert msgpack_unpackb(barray.schunk.get_vlblock(0, 1)) == batch[2:4] assert msgpack_unpackb(barray.schunk.get_vlblock(0, 2)) == batch[4:] @@ -194,6 +194,7 @@ def test_batchstore_get_vlblock_and_scalar_access(): reopened = blosc2.open(urlpath, mode="r") assert isinstance(reopened, blosc2.BatchStore) + assert reopened.max_blocksize == 2 assert reopened[0][0] == 0 assert reopened[0][2] == 2 assert reopened[0][4] == 4 @@ -203,7 +204,7 @@ def test_batchstore_get_vlblock_and_scalar_access(): def test_batchstore_scalar_reads_cache_vlblocks(): - barray = blosc2.BatchStore(blocksize_max=2) + barray = blosc2.BatchStore(max_blocksize=2) barray.append([0, 1, 2, 3, 4]) batch = barray[0] @@ -227,7 +228,7 @@ def wrapped_get_vlblock(nchunk, nblock): def test_batchstore_iter_objects(): - barray = blosc2.BatchStore(blocksize_max=2) + barray = blosc2.BatchStore(max_blocksize=2) batches = [[1, 2, 3], [4], [5, 6]] barray.extend(batches) diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index bfbf791c..27b86452 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -658,7 +658,7 @@ def test_external_batchstore_support(tmp_path): store_path = tmp_path / "test_batchstore_external.b2d" with TreeStore(str(store_path), mode="w", threshold=0) as tstore: - bstore = blosc2.BatchStore(blocksize_max=2) + bstore = blosc2.BatchStore(max_blocksize=2) bstore.extend([[{"id": 1}, {"id": 2}], [{"id": 3}]]) tstore["/data/batchstore"] = bstore From 9c177bd0226e341d0a9b76968d39404c6e931b16 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 19 Mar 2026 21:07:02 +0100 Subject: [PATCH 25/34] Adapt max_blocksize depending on the clevel --- bench/batch_store.py | 8 ++++---- src/blosc2/batch_store.py | 14 +++++++++++--- src/blosc2/storage.py | 4 +++- 
tests/test_batch_store.py | 25 +++++++++++++++++++++++++ tests/test_vlarray.py | 4 ++++ 5 files changed, 47 insertions(+), 8 deletions(-) diff --git a/bench/batch_store.py b/bench/batch_store.py index ca3d83b0..9c1f992b 100644 --- a/bench/batch_store.py +++ b/bench/batch_store.py @@ -51,7 +51,7 @@ def build_parser() -> argparse.ArgumentParser: ) parser.add_argument("--codec", type=str, default="ZSTD", choices=[codec.name for codec in blosc2.Codec]) parser.add_argument("--clevel", type=int, default=5) - parser.add_argument("--use-dict", action="store_true", help="Enable dictionaries for ZSTD/LZ4 codecs.") + parser.add_argument("--use-dict", action="store_true", help="Enable dictionaries for ZSTD/LZ4/LZ4HC codecs.") parser.add_argument("--in-mem", action="store_true", help="Keep the BatchStore purely in memory.") return parser @@ -65,7 +65,7 @@ def build_store(codec: blosc2.Codec, clevel: int, use_dict: bool, in_mem: bool) cparams={ "codec": codec, "clevel": clevel, - "use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4), + "use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4, blosc2.Codec.LZ4HC), }, ) for batch_index in range(NBATCHES): @@ -77,7 +77,7 @@ def build_store(codec: blosc2.Codec, clevel: int, use_dict: bool, in_mem: bool) cparams = { "codec": codec, "clevel": clevel, - "use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4), + "use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4, blosc2.Codec.LZ4HC), } with blosc2.BatchStore(storage=storage, max_blocksize=BLOCKSIZE_MAX, cparams=cparams) as store: for batch_index in range(NBATCHES): @@ -107,7 +107,7 @@ def main() -> None: parser = build_parser() args = parser.parse_args() codec = blosc2.Codec[args.codec] - use_dict = args.use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4) + use_dict = args.use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4, blosc2.Codec.LZ4HC) mode_label = "in-memory" if args.in_mem else "persistent" 
article = "an" if args.in_mem else "a" diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index 78f1b364..71a08e81 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -295,13 +295,21 @@ def _persist_max_blocksize(self) -> None: def _guess_blocksize(self, payload_sizes: list[int]) -> int: if not payload_sizes: raise ValueError("BatchStore entries cannot be empty") - l2_cache_size = blosc2.cpu_info.get("l2_cache_size") - if not isinstance(l2_cache_size, int) or l2_cache_size <= 0: + clevel = self.cparams.clevel + if clevel == 9: + return len(payload_sizes) + if 0 < clevel < 6: + budget = blosc2.cpu_info.get("l1_data_cache_size") + elif 6 <= clevel < 9: + budget = blosc2.cpu_info.get("l2_cache_size") + else: + return len(payload_sizes) + if not isinstance(budget, int) or budget <= 0: return len(payload_sizes) total = 0 count = 0 for payload_size in payload_sizes: - if count > 0 and total + payload_size > l2_cache_size: + if count > 0 and total + payload_size > budget: break total += payload_size count += 1 diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 43835118..0015aea9 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -46,7 +46,9 @@ class CParams: (maximum compression). Default is 1. use_dict: bool Whether to use dictionaries when compressing - (only for :py:obj:`blosc2.Codec.ZSTD `). Default is `False`. + (supported for :py:obj:`blosc2.Codec.ZSTD `, + :py:obj:`blosc2.Codec.LZ4 `, and + :py:obj:`blosc2.Codec.LZ4HC `). Default is `False`. typesize: int The data type size, ranging from 1 to 255. Default is 8. 
nthreads: int diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index 42486257..29a2b701 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -241,6 +241,10 @@ def test_batchstore_respects_explicit_use_dict_and_non_zstd(): assert barray.cparams.codec == blosc2.Codec.LZ4 assert barray.cparams.use_dict is False + barray = blosc2.BatchStore(cparams={"codec": blosc2.Codec.LZ4HC, "clevel": 1, "use_dict": True}) + assert barray.cparams.codec == blosc2.Codec.LZ4HC + assert barray.cparams.use_dict is True + barray = blosc2.BatchStore(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) assert barray.cparams.codec == blosc2.Codec.ZSTD assert barray.cparams.use_dict is False @@ -252,6 +256,27 @@ def test_batchstore_respects_explicit_use_dict_and_non_zstd(): assert barray.cparams.use_dict is False +def test_batchstore_guess_max_blocksize_uses_l1_for_low_clevel(monkeypatch): + monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 100) + monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 1000) + barray = blosc2.BatchStore(cparams={"clevel": 5}) + assert barray._guess_blocksize([30, 30, 30, 30]) == 3 + + +def test_batchstore_guess_max_blocksize_uses_l2_for_mid_clevel(monkeypatch): + monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 100) + monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 150) + barray = blosc2.BatchStore(cparams={"clevel": 6}) + assert barray._guess_blocksize([60, 60, 60, 60]) == 2 + + +def test_batchstore_guess_max_blocksize_uses_full_batch_for_clevel_9(monkeypatch): + monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 1) + monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 1) + barray = blosc2.BatchStore(cparams={"clevel": 9}) + assert barray._guess_blocksize([100, 100, 100, 100]) == 4 + + def test_vlcompress_small_blocks_roundtrip(): values = [ {"value": None}, diff --git a/tests/test_vlarray.py b/tests/test_vlarray.py index 2c792e10..0c1f01f3 100644 --- a/tests/test_vlarray.py +++ 
b/tests/test_vlarray.py @@ -155,6 +155,10 @@ def test_vlarray_respects_explicit_use_dict_and_non_zstd(): assert vlarray.cparams.codec == blosc2.Codec.LZ4 assert vlarray.cparams.use_dict is False + vlarray = blosc2.VLArray(cparams={"codec": blosc2.Codec.LZ4HC, "clevel": 1, "use_dict": True}) + assert vlarray.cparams.codec == blosc2.Codec.LZ4HC + assert vlarray.cparams.use_dict is True + vlarray = blosc2.VLArray(cparams={"codec": blosc2.Codec.ZSTD, "clevel": 0}) assert vlarray.cparams.codec == blosc2.Codec.ZSTD assert vlarray.cparams.use_dict is False From a65cb690c6507d5dd619f7a1bac276cbd6a5d2a1 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 20 Mar 2026 08:13:46 +0100 Subject: [PATCH 26/34] Revamped BatchStore. Add arrow as an optional serializer, and much more, including docs. --- bench/batch_store.py | 29 +- .../tutorials/12.batchstore.ipynb | 392 ++++++++++++++++++ doc/reference/batch_store.rst | 93 +++++ doc/reference/classes.rst | 2 + pyproject.toml | 6 + src/blosc2/batch_store.py | 373 +++++++++++++++-- tests/test_batch_store.py | 134 ++++-- 7 files changed, 945 insertions(+), 84 deletions(-) create mode 100644 doc/getting_started/tutorials/12.batchstore.ipynb create mode 100644 doc/reference/batch_store.rst diff --git a/bench/batch_store.py b/bench/batch_store.py index 9c1f992b..7b84370d 100644 --- a/bench/batch_store.py +++ b/bench/batch_store.py @@ -51,17 +51,21 @@ def build_parser() -> argparse.ArgumentParser: ) parser.add_argument("--codec", type=str, default="ZSTD", choices=[codec.name for codec in blosc2.Codec]) parser.add_argument("--clevel", type=int, default=5) + parser.add_argument("--serializer", type=str, default="msgpack", choices=["msgpack", "arrow"]) parser.add_argument("--use-dict", action="store_true", help="Enable dictionaries for ZSTD/LZ4/LZ4HC codecs.") parser.add_argument("--in-mem", action="store_true", help="Keep the BatchStore purely in memory.") return parser -def build_store(codec: blosc2.Codec, clevel: int, use_dict: bool, 
in_mem: bool) -> blosc2.BatchStore | None: +def build_store( + codec: blosc2.Codec, clevel: int, use_dict: bool, serializer: str, in_mem: bool +) -> blosc2.BatchStore | None: if in_mem: storage = blosc2.Storage(mode="w") store = blosc2.BatchStore( storage=storage, max_blocksize=BLOCKSIZE_MAX, + serializer=serializer, cparams={ "codec": codec, "clevel": clevel, @@ -79,7 +83,9 @@ def build_store(codec: blosc2.Codec, clevel: int, use_dict: bool, in_mem: bool) "clevel": clevel, "use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4, blosc2.Codec.LZ4HC), } - with blosc2.BatchStore(storage=storage, max_blocksize=BLOCKSIZE_MAX, cparams=cparams) as store: + with blosc2.BatchStore( + storage=storage, max_blocksize=BLOCKSIZE_MAX, serializer=serializer, cparams=cparams + ) as store: for batch_index in range(NBATCHES): store.append(make_batch(batch_index)) return None @@ -114,10 +120,13 @@ def main() -> None: print(f"Building {article} {mode_label} BatchStore with 1,000,000 RGB dicts and timing 1,000 random scalar reads...") print(f" codec: {codec.name}") print(f" clevel: {args.clevel}") + print(f" serializer: {args.serializer}") print(f" use_dict: {use_dict}") print(f" in_mem: {args.in_mem}") t0 = time.perf_counter() - store = build_store(codec=codec, clevel=args.clevel, use_dict=use_dict, in_mem=args.in_mem) + store = build_store( + codec=codec, clevel=args.clevel, use_dict=use_dict, serializer=args.serializer, in_mem=args.in_mem + ) build_time_s = time.perf_counter() - t0 if args.in_mem: assert store is not None @@ -127,17 +136,17 @@ def main() -> None: samples, timings_ns = measure_random_reads(read_store) t0 = time.perf_counter() checksum = 0 - nobjects = 0 - for obj in read_store.iter_objects(): - checksum += obj["blue"] - nobjects += 1 + nitems = 0 + for item in read_store.iter_items(): + checksum += item["blue"] + nitems += 1 iter_time_s = time.perf_counter() - t0 print() print("BatchStore benchmark") print(f" build time: {build_time_s:.3f} s") print(f" 
batches: {len(read_store)}") - print(f" objects: {TOTAL_OBJECTS}") + print(f" items: {TOTAL_OBJECTS}") print(f" max_blocksize: {read_store.max_blocksize}") print() print(read_store.info) @@ -145,8 +154,8 @@ def main() -> None: print(f" mean: {statistics.fmean(timings_ns) / 1_000:.2f} us") print(f" max: {max(timings_ns) / 1_000:.2f} us") print(f" min: {min(timings_ns) / 1_000:.2f} us") - print(f"Object iteration via iter_objects(): {iter_time_s:.3f} s") - print(f" per object: {iter_time_s * 1_000_000 / nobjects:.2f} us") + print(f"Item iteration via iter_items(): {iter_time_s:.3f} s") + print(f" per item: {iter_time_s * 1_000_000 / nitems:.2f} us") print(f" checksum: {checksum}") print("Sample reads:") for timing_ns, batch_index, item_index, value in samples[:5]: diff --git a/doc/getting_started/tutorials/12.batchstore.ipynb b/doc/getting_started/tutorials/12.batchstore.ipynb new file mode 100644 index 00000000..b898c455 --- /dev/null +++ b/doc/getting_started/tutorials/12.batchstore.ipynb @@ -0,0 +1,392 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2c822501cae3b91d", + "metadata": {}, + "source": [ + "# Working with BatchStore\n", + "\n", + "A `BatchStore` is a batch-oriented container for variable-length Python items backed by a single `SChunk`. Each batch is stored in one compressed chunk, and each chunk may contain one or more internal variable-length blocks.\n", + "\n", + "This makes `BatchStore` a good fit when data arrives naturally in batches and you want efficient batch append/update operations together with occasional item-level access inside each batch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "be8591f8f86952e8", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T07:06:05.876287Z", + "start_time": "2026-03-20T07:06:05.661801Z" + } + }, + "outputs": [], + "source": [ + "import blosc2\n", + "\n", + "\n", + "def show(label, value):\n", + " print(f\"{label}: {value}\")\n", + "\n", + "\n", + "urlpath = \"batchstore_tutorial.b2b\"\n", + "copy_path = \"batchstore_tutorial_copy.b2b\"\n", + "blosc2.remove_urlpath(urlpath)\n", + "blosc2.remove_urlpath(copy_path)" + ] + }, + { + "cell_type": "markdown", + "id": "dda38c56e3e63ec1", + "metadata": {}, + "source": [ + "## Creating and populating a BatchStore\n", + "\n", + "A `BatchStore` is indexed by batch. Batches can be appended one by one with `append()` or in bulk with `extend()`. Here we set a small `max_blocksize` just so the internal block structure is easy to observe in `.info`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f8c8a2b7692e7228", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T07:06:05.904277Z", + "start_time": "2026-03-20T07:06:05.882545Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Batches: [[{'name': 'alpha', 'count': 1}, {'name': 'beta', 'count': 2}, {'name': 'gamma', 'count': 3}], [{'name': 'delta', 'count': 4}, {'name': 'epsilon', 'count': 5}], [{'name': 'zeta', 'count': 6}], [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}]]\n", + "Number of batches: 4\n" + ] + } + ], + "source": [ + "store = blosc2.BatchStore(urlpath=urlpath, mode=\"w\", contiguous=True, max_blocksize=2)\n", + "store.append(\n", + " [\n", + " {\"name\": \"alpha\", \"count\": 1},\n", + " {\"name\": \"beta\", \"count\": 2},\n", + " {\"name\": \"gamma\", \"count\": 3},\n", + " ]\n", + ")\n", + "store.append(\n", + " [\n", + " {\"name\": \"delta\", \"count\": 4},\n", + " {\"name\": \"epsilon\", \"count\": 5},\n", + " ]\n", + ")\n", + "store.extend(\n", + " 
[\n", + " [{\"name\": \"zeta\", \"count\": 6}],\n", + " [{\"name\": \"eta\", \"count\": 7}, {\"name\": \"theta\", \"count\": 8}],\n", + " ]\n", + ")\n", + "\n", + "show(\"Batches\", [batch[:] for batch in store])\n", + "show(\"Number of batches\", len(store))" + ] + }, + { + "cell_type": "markdown", + "id": "f57fc5cf2cbaa9ba", + "metadata": {}, + "source": [ + "## Batch and item access\n", + "\n", + "Indexing the store returns a batch. Indexing a batch returns an item inside that batch. Flat item-wise traversal is available through `iter_items()`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "20861d3e348f9df1", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T07:06:05.924634Z", + "start_time": "2026-03-20T07:06:05.905576Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First batch: [{'name': 'alpha', 'count': 1}, {'name': 'beta', 'count': 2}, {'name': 'gamma', 'count': 3}]\n", + "Second item in first batch: {'name': 'beta', 'count': 2}\n", + "Slice of second batch: [{'name': 'delta', 'count': 4}]\n", + "All items: [{'name': 'alpha', 'count': 1}, {'name': 'beta', 'count': 2}, {'name': 'gamma', 'count': 3}, {'name': 'delta', 'count': 4}, {'name': 'epsilon', 'count': 5}, {'name': 'zeta', 'count': 6}, {'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}]\n" + ] + } + ], + "source": [ + "show(\"First batch\", store[0][:])\n", + "show(\"Second item in first batch\", store[0][1])\n", + "show(\"Slice of second batch\", store[1][:1])\n", + "show(\"All items\", list(store.iter_items()))" + ] + }, + { + "cell_type": "markdown", + "id": "eba42acee73bffe3", + "metadata": {}, + "source": [ + "## Updating, inserting, and deleting batches\n", + "\n", + "Mutation is batch-oriented too: you overwrite, insert, delete, and pop whole batches." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df556f6da8adc369", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T07:06:05.945986Z", + "start_time": "2026-03-20T07:06:05.925866Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Popped batch: [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}]\n", + "After updates: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n" + ] + } + ], + "source": [ + "store[1] = [\n", + " {\"name\": \"delta*\", \"count\": 40},\n", + " {\"name\": \"epsilon*\", \"count\": 50},\n", + "]\n", + "store.insert(1, [{\"name\": \"between\", \"count\": 99}])\n", + "removed = store.pop()\n", + "del store[0]\n", + "\n", + "show(\"Popped batch\", removed)\n", + "show(\"After updates\", [batch[:] for batch in store])" + ] + }, + { + "cell_type": "markdown", + "id": "e48791c431156e56", + "metadata": {}, + "source": [ + "## Iteration and summary info\n", + "\n", + "Iterating a `BatchStore` yields batches. The `.info` summary reports both batch-level and internal block-level statistics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b32d72a68d83673e", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T07:06:05.965086Z", + "start_time": "2026-03-20T07:06:05.947144Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Batches via iteration: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n", + "type : BatchStore\n", + "serializer : msgpack\n", + "nbatches : 3 (items per batch: mean=1.33, max=2, min=1)\n", + "nblocks : 3 (items per block: mean=1.33, max=2, min=1)\n", + "nitems : 4\n", + "nbytes : 84 (84 B)\n", + "cbytes : 468 (468 B)\n", + "cratio : 0.18\n", + "cparams : CParams(codec=, codec_meta=0, clevel=5, use_dict=False, typesize=1,\n", + " : nthreads=8, blocksize=0, splitmode=,\n", + " : filters=[, , ,\n", + " : , , ], filters_meta=[0,\n", + " : 0, 0, 0, 0, 0], tuner=)\n", + "dparams : DParams(nthreads=8)\n", + "\n" + ] + } + ], + "source": [ + "show(\"Batches via iteration\", [batch[:] for batch in store])\n", + "print(store.info)" + ] + }, + { + "cell_type": "markdown", + "id": "1d6abe8fe87d3663", + "metadata": {}, + "source": [ + "## Copying and changing storage settings\n", + "\n", + "Like other Blosc2 containers, `BatchStore.copy()` can write a new persistent store while changing storage or compression settings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "45f878b8f4414a3b", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T07:06:05.990783Z", + "start_time": "2026-03-20T07:06:05.965791Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Copied batches: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n", + "Copy serializer: msgpack\n", + "Copy codec: Codec.LZ4\n" + ] + } + ], + "source": [ + "store_copy = store.copy(\n", + " urlpath=copy_path,\n", + " contiguous=False,\n", + " cparams={\"codec\": blosc2.Codec.LZ4, \"clevel\": 5},\n", + ")\n", + "\n", + "show(\"Copied batches\", [batch[:] for batch in store_copy])\n", + "show(\"Copy serializer\", store_copy.serializer)\n", + "show(\"Copy codec\", store_copy.cparams.codec)" + ] + }, + { + "cell_type": "markdown", + "id": "19c51a629db1209", + "metadata": {}, + "source": [ + "## Round-tripping through cframes and reopening from disk\n", + "\n", + "Tagged persistent stores automatically reopen as `BatchStore`, and a serialized cframe buffer does too." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fd4957093f509bd4", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T07:06:06.025738Z", + "start_time": "2026-03-20T07:06:05.999799Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "from_cframe type: BatchStore\n", + "from_cframe batches: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n", + "Reopened type: BatchStore\n", + "Reopened batches: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n" + ] + } + ], + "source": [ + "cframe = store.to_cframe()\n", + "restored = blosc2.from_cframe(cframe)\n", + "show(\"from_cframe type\", type(restored).__name__)\n", + "show(\"from_cframe batches\", [batch[:] for batch in restored])\n", + "\n", + "reopened = blosc2.open(urlpath, mode=\"r\", mmap_mode=\"r\")\n", + "show(\"Reopened type\", type(reopened).__name__)\n", + "show(\"Reopened batches\", [batch[:] for batch in reopened])" + ] + }, + { + "cell_type": "markdown", + "id": "dc362a1cab78d016", + "metadata": {}, + "source": [ + "## Clearing and reusing a store\n", + "\n", + "Calling `clear()` resets the backing storage so the container remains ready for new batches." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2214b2be1bfb5bc7", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T07:06:06.050975Z", + "start_time": "2026-03-20T07:06:06.034152Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After clear + extend: [[{'name': 'fresh', 'count': 1}], [{'name': 'again', 'count': 2}, {'name': 'done', 'count': 3}]]\n" + ] + } + ], + "source": [ + "scratch = store.copy()\n", + "scratch.clear()\n", + "scratch.extend(\n", + " [\n", + " [{\"name\": \"fresh\", \"count\": 1}],\n", + " [{\"name\": \"again\", \"count\": 2}, {\"name\": \"done\", \"count\": 3}],\n", + " ]\n", + ")\n", + "show(\"After clear + extend\", [batch[:] for batch in scratch])\n", + "\n", + "blosc2.remove_urlpath(urlpath)\n", + "blosc2.remove_urlpath(copy_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "27c47e4fd1332b48", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T07:06:06.061727Z", + "start_time": "2026-03-20T07:06:06.051400Z" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/reference/batch_store.rst b/doc/reference/batch_store.rst new file mode 100644 index 00000000..6b1edc8d --- /dev/null +++ b/doc/reference/batch_store.rst @@ -0,0 +1,93 @@ +.. _BatchStore: + +BatchStore +========== + +Overview +-------- +BatchStore is a batch-oriented container for variable-length Python items +backed by a single Blosc2 ``SChunk``. 
+ +Each batch is stored in one compressed chunk: + +- batches contain one or more Python items +- each chunk may contain one or more internal variable-length blocks +- the store itself is indexed by batch +- item-wise traversal is available via :meth:`BatchStore.iter_items` + +BatchStore is a good fit when data arrives naturally in batches and you want: + +- efficient batch append/update operations +- persistent ``.b2b`` stores +- item-level reads inside a batch +- compact summary information about batches and internal blocks via ``.info`` + +Serializer support +------------------ + +BatchStore currently supports two serializers: + +- ``"msgpack"``: the default and general-purpose choice for Python items +- ``"arrow"``: optional and requires ``pyarrow``; mainly useful when data is + already Arrow-shaped before ingestion + +Quick example +------------- + +.. code-block:: python + + import blosc2 + + store = blosc2.BatchStore(urlpath="example_batch_store.b2b", mode="w", contiguous=True) + store.append([{"red": 1, "green": 2, "blue": 3}, {"red": 4, "green": 5, "blue": 6}]) + store.append([{"red": 7, "green": 8, "blue": 9}]) + + print(store[0]) # first batch + print(store[0][1]) # second item in first batch + print(list(store.iter_items())) + + reopened = blosc2.open("example_batch_store.b2b", mode="r") + print(type(reopened).__name__) + print(reopened.info) + +.. note:: + BatchStore is batch-oriented by design. ``store[i]`` returns a batch, not a + single item. Use :meth:`BatchStore.iter_items` for flat item-wise traversal. + +.. currentmodule:: blosc2 + +.. autoclass:: BatchStore + + Constructors + ------------ + .. automethod:: __init__ + + Batch Interface + --------------- + .. automethod:: __getitem__ + .. automethod:: __setitem__ + .. automethod:: __delitem__ + .. automethod:: __len__ + .. automethod:: __iter__ + .. automethod:: iter_items + + Mutation + -------- + .. automethod:: append + .. automethod:: extend + .. automethod:: insert + .. automethod:: pop + .. 
automethod:: delete + .. automethod:: clear + .. automethod:: copy + + Context Manager + --------------- + .. automethod:: __enter__ + .. automethod:: __exit__ + + Public Members + -------------- + .. automethod:: to_cframe + +.. autoclass:: Batch diff --git a/doc/reference/classes.rst b/doc/reference/classes.rst index 84af533c..83733b2f 100644 --- a/doc/reference/classes.rst +++ b/doc/reference/classes.rst @@ -16,6 +16,7 @@ Main Classes DictStore TreeStore EmbedStore + BatchStore VLArray Proxy ProxySource @@ -34,6 +35,7 @@ Main Classes dict_store tree_store embed_store + batch_store vlarray proxy proxysource diff --git a/pyproject.toml b/pyproject.toml index 81c8d52a..c25612d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,12 @@ dependencies = [ "requests", ] version = "4.1.3.dev0" + +[project.optional-dependencies] +recommended = [ + "pyarrow", +] + [project.entry-points."array_api"] blosc2 = "blosc2" diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index 71a08e81..f0736b3e 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -12,13 +12,16 @@ import statistics from collections.abc import Iterator, Sequence from dataclasses import asdict +from functools import lru_cache from typing import Any import blosc2 from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb from blosc2.info import InfoReporter, format_nbytes_info -_BATCHSTORE_META = {"version": 1, "serializer": "msgpack", "max_blocksize": None} +_BATCHSTORE_META = {"version": 1, "serializer": "msgpack", "max_blocksize": None, "arrow_schema": None} +_SUPPORTED_SERIALIZERS = {"msgpack", "arrow"} +_BATCHSTORE_VLMETA_KEY = "_batch_store_metadata" def _check_serialized_size(buffer: bytes) -> None: @@ -27,7 +30,15 @@ def _check_serialized_size(buffer: bytes) -> None: class Batch(Sequence[Any]): - """A lazy sequence of Python objects stored in one BatchStore batch.""" + """A lazy sequence representing one batch in a :class:`BatchStore`. 
+ + ``Batch`` provides sequence-style access to the items stored in a single + batch. Integer indexing can use block-local reads when possible, while + slicing materializes the full batch into Python items. + + Batch instances are normally obtained via :class:`BatchStore` indexing or + iteration rather than constructed directly. + """ def __init__(self, parent: BatchStore, nbatch: int, lazybatch: bytes) -> None: self._parent = parent @@ -56,7 +67,7 @@ def _decode_items(self) -> list[Any]: def _get_block(self, block_index: int) -> list[Any]: if self._cached_block_index == block_index and self._cached_block is not None: return self._cached_block - block = msgpack_unpackb(self._parent.schunk.get_vlblock(self._nbatch, block_index)) + block = self._parent._deserialize_block(self._parent.schunk.get_vlblock(self._nbatch, block_index)) self._cached_block_index = block_index self._cached_block = block return block @@ -84,6 +95,9 @@ def __getitem__(self, index: int | slice) -> Any | list[Any]: return items[index] def __len__(self) -> int: + batch_length = self._parent._batch_length(self._nbatch) + if batch_length is not None: + return batch_length return len(self._decode_items()) def __iter__(self) -> Iterator[Any]: @@ -111,7 +125,40 @@ def __repr__(self) -> str: class BatchStore: - """A batched variable-length array backed by an :class:`blosc2.SChunk`.""" + """A batched container for variable-length Python items. + + BatchStore stores data as a sequence of *batches*, where each batch contains + one or more Python items. Each batch is stored in one compressed chunk, and + each chunk is internally split into one or more variable-length blocks for + efficient item access. 
+ + The main abstraction is batch-oriented: + + - indexing the store returns batches + - iterating the store yields batches + - :meth:`iter_items` provides flat item-wise traversal + + BatchStore is a good fit when: + + - data arrives naturally in batches + - batch-level append/update operations are important + - occasional item-level reads are needed inside a batch + + Parameters + ---------- + max_blocksize : int, optional + Maximum number of items stored in each internal variable-length block. + If not provided, a value is inferred from the first batch. + serializer : {"msgpack", "arrow"}, optional + Serializer used for batch payloads. ``"msgpack"`` is the default and is + the general-purpose choice for Python items. ``"arrow"`` is optional and + requires ``pyarrow``. + _from_schunk : blosc2.SChunk, optional + Internal hook used when reopening an already-tagged BatchStore. + **kwargs + Storage, compression, and decompression arguments accepted by the + constructor. + """ @staticmethod def _set_typesize_one(cparams: blosc2.CParams | dict | None) -> blosc2.CParams | dict: @@ -162,7 +209,11 @@ def _attach_schunk(self, schunk: blosc2.SChunk) -> None: batchstore_meta = self.schunk.meta["batchstore"] except KeyError: batchstore_meta = {} + self._serializer = batchstore_meta.get("serializer", self._serializer) self._max_blocksize = batchstore_meta.get("max_blocksize", self._max_blocksize) + self._arrow_schema = batchstore_meta.get("arrow_schema", self._arrow_schema) + self._arrow_schema_obj = None + self._batch_lengths = self._load_batch_lengths() self._validate_tag() def _maybe_open_existing(self, storage: blosc2.Storage) -> bool: @@ -187,12 +238,25 @@ def _make_storage(self) -> blosc2.Storage: def __init__( self, max_blocksize: int | None = None, + serializer: str = "msgpack", _from_schunk: blosc2.SChunk | None = None, **kwargs: Any, ) -> None: + """Create a new BatchStore or reopen an existing one. 
+ + When a persistent ``urlpath`` points to an existing BatchStore and the + mode is ``"r"`` or ``"a"``, the container is reopened automatically. + Otherwise a new empty store is created. + """ if max_blocksize is not None and max_blocksize <= 0: raise ValueError("max_blocksize must be a positive integer") + if serializer not in _SUPPORTED_SERIALIZERS: + raise ValueError(f"Unsupported BatchStore serializer: {serializer!r}") self._max_blocksize: int | None = max_blocksize + self._serializer = serializer + self._arrow_schema: bytes | None = None + self._arrow_schema_obj = None + self._batch_lengths: list[int] | None = None if _from_schunk is not None: if kwargs: unexpected = ", ".join(sorted(kwargs)) @@ -218,7 +282,12 @@ def __init__( return fixed_meta = dict(storage.meta or {}) - fixed_meta["batchstore"] = {**_BATCHSTORE_META, "max_blocksize": self._max_blocksize} + fixed_meta["batchstore"] = { + **_BATCHSTORE_META, + "serializer": self._serializer, + "max_blocksize": self._max_blocksize, + "arrow_schema": self._arrow_schema, + } storage.meta = fixed_meta schunk = blosc2.SChunk(chunksize=-1, data=None, cparams=cparams, dparams=dparams, storage=storage) self._attach_schunk(schunk) @@ -226,6 +295,20 @@ def __init__( def _validate_tag(self) -> None: if "batchstore" not in self.schunk.meta: raise ValueError("The supplied SChunk is not tagged as a BatchStore") + if self._serializer not in _SUPPORTED_SERIALIZERS: + raise ValueError(f"Unsupported BatchStore serializer in metadata: {self._serializer!r}") + if self._serializer == "arrow": + self._require_pyarrow() + + @staticmethod + @lru_cache(maxsize=1) + def _require_pyarrow(): + try: + import pyarrow as pa + import pyarrow.ipc as pa_ipc + except ImportError as exc: + raise ImportError("BatchStore serializer='arrow' requires pyarrow") from exc + return pa, pa_ipc def _check_writable(self) -> None: if self.mode == "r": @@ -257,7 +340,72 @@ def _slice_indices(self, index: slice) -> list[int]: def _copy_meta(self) -> 
dict[str, Any]: return {name: self.meta[name] for name in self.meta} - def _normalize_batch(self, value: object) -> list[Any]: + def _load_batch_lengths(self) -> list[int] | None: + try: + metadata = self.schunk.vlmeta[_BATCHSTORE_VLMETA_KEY] + except KeyError: + return None + batch_lengths = metadata.get("batch_lengths") + if not isinstance(batch_lengths, list): + return None + return [int(length) for length in batch_lengths] + + def _persist_batch_lengths(self) -> None: + if self._batch_lengths is None: + return + self.schunk.vlmeta[_BATCHSTORE_VLMETA_KEY] = {"batch_lengths": list(self._batch_lengths)} + + def _get_batch_lengths(self) -> list[int] | None: + return self._batch_lengths + + def _ensure_batch_lengths(self) -> list[int]: + if self._batch_lengths is None: + self._batch_lengths = [] + return self._batch_lengths + + def _batch_length(self, index: int) -> int | None: + if self._batch_lengths is None: + return None + return self._batch_lengths[index] + + def _block_sizes_from_batch_length(self, batch_length: int, nblocks: int) -> list[int]: + if self._max_blocksize is None or nblocks <= 0: + return [] + full_blocks, remainder = divmod(batch_length, self._max_blocksize) + block_sizes = [self._max_blocksize] * full_blocks + if remainder: + block_sizes.append(remainder) + if not block_sizes and batch_length > 0: + block_sizes.append(batch_length) + if len(block_sizes) != nblocks: + return [] + return block_sizes + + def _get_block_sizes(self, batch_sizes: list[int]) -> list[int] | None: + if self._max_blocksize is None: + return None + block_sizes: list[int] = [] + for index, batch_length in enumerate(batch_sizes): + lazychunk = self.schunk.get_lazychunk(index) + _, _, nblocks = blosc2.get_cbuffer_sizes(lazychunk) + sizes = self._block_sizes_from_batch_length(batch_length, nblocks) + if not sizes: + return None + block_sizes.extend(sizes) + return block_sizes + + def _total_nblocks(self) -> int: + total = 0 + for index in range(len(self)): + lazychunk = 
self.schunk.get_lazychunk(index) + _, _, nblocks = blosc2.get_cbuffer_sizes(lazychunk) + total += nblocks + return total + + def _user_vlmeta_items(self) -> dict[str, Any]: + return {key: value for key, value in self.vlmeta.getall().items() if key != _BATCHSTORE_VLMETA_KEY} + + def _normalize_msgpack_batch(self, value: object) -> list[Any]: if isinstance(value, (str, bytes, bytearray, memoryview)): raise TypeError("BatchStore entries must be sequences of Python objects") if not isinstance(value, Sequence): @@ -267,20 +415,85 @@ def _normalize_batch(self, value: object) -> list[Any]: raise ValueError("BatchStore entries cannot be empty") return values - def _ensure_layout_for_batch(self, batch: list[Any]) -> None: + def _normalize_arrow_batch(self, value: object): + pa, _ = self._require_pyarrow() + if isinstance(value, pa.ChunkedArray): + value = value.combine_chunks() + elif isinstance(value, pa.RecordBatch): + if value.num_columns != 1: + raise TypeError("Arrow RecordBatch inputs for BatchStore must have exactly one column") + value = value.column(0) + elif not isinstance(value, pa.Array): + if isinstance(value, (str, bytes, bytearray, memoryview)): + raise TypeError("BatchStore entries must be Arrow arrays or sequences of Python objects") + if not isinstance(value, Sequence): + raise TypeError("BatchStore entries must be Arrow arrays or sequences of Python objects") + value = pa.array(list(value)) + if len(value) == 0: + raise ValueError("BatchStore entries cannot be empty") + self._ensure_arrow_schema(value) + return value + + def _ensure_arrow_schema(self, batch) -> None: + if self._serializer != "arrow": + return + pa, _ = self._require_pyarrow() + schema = pa.schema([pa.field("values", batch.type)]) + if self._arrow_schema is None: + self._arrow_schema = schema.serialize().to_pybytes() + self._arrow_schema_obj = schema + return + existing_schema = self._get_arrow_schema() + if not existing_schema.equals(schema): + raise TypeError("All Arrow batches in a 
BatchStore must share the same schema") + + def _get_arrow_schema(self): + if self._serializer != "arrow": + return None + if self._arrow_schema is None: + raise RuntimeError("Arrow schema is not initialized") + if self._arrow_schema_obj is None: + pa, pa_ipc = self._require_pyarrow() + self._arrow_schema_obj = pa_ipc.read_schema(pa.BufferReader(self._arrow_schema)) + return self._arrow_schema_obj + + def _normalize_batch(self, value: object) -> Any: + if self._serializer == "arrow": + return self._normalize_arrow_batch(value) + return self._normalize_msgpack_batch(value) + + def _batch_len(self, batch: Any) -> int: + return len(batch) + + def _payload_sizes_for_batch(self, batch: Any) -> list[int]: + if self._serializer == "arrow": + total_size = batch.get_total_buffer_size() + avg_size = max(1, total_size // max(1, len(batch))) + return [avg_size] * len(batch) + return [len(msgpack_packb(item)) for item in batch] + + def _ensure_layout_for_batch(self, batch: Any) -> None: + layout_changed = False if self._max_blocksize is None: - payload_sizes = [len(msgpack_packb(item)) for item in batch] + payload_sizes = self._payload_sizes_for_batch(batch) self._max_blocksize = self._guess_blocksize(payload_sizes) - self._persist_max_blocksize() - - def _persist_max_blocksize(self) -> None: - if self._max_blocksize is None or len(self) > 0: + layout_changed = True + if self._serializer == "arrow" and self._arrow_schema is not None: + layout_changed = layout_changed or len(self) == 0 + if layout_changed: + self._persist_layout_metadata() + + def _persist_layout_metadata(self) -> None: + if len(self) > 0: return + batch_lengths = None if self._batch_lengths is None else list(self._batch_lengths) storage = self._make_storage() fixed_meta = dict(storage.meta or {}) fixed_meta["batchstore"] = { **dict(fixed_meta.get("batchstore", {})), "max_blocksize": self._max_blocksize, + "serializer": self._serializer, + "arrow_schema": self._arrow_schema, } storage.meta = fixed_meta schunk = 
blosc2.SChunk( @@ -291,6 +504,8 @@ def _persist_max_blocksize(self) -> None: storage=storage, ) self._attach_schunk(schunk) + if batch_lengths is not None and self._batch_lengths is None: + self._batch_lengths = batch_lengths def _guess_blocksize(self, payload_sizes: list[int]) -> int: if not payload_sizes: @@ -317,28 +532,53 @@ def _guess_blocksize(self, payload_sizes: list[int]) -> int: count = 1 return min(count, len(payload_sizes)) - def _serialize_batch(self, value: object) -> list[Any]: + def _serialize_batch(self, value: object) -> Any: batch = self._normalize_batch(value) self._ensure_layout_for_batch(batch) return batch - def _serialize_block(self, items: list[Any]) -> bytes: + def _serialize_msgpack_block(self, items: list[Any]) -> bytes: payload = msgpack_packb(items) _check_serialized_size(payload) return payload + def _serialize_arrow_block(self, items) -> bytes: + pa, _ = self._require_pyarrow() + batch = pa.record_batch([items], schema=self._get_arrow_schema()) + payload = batch.serialize().to_pybytes() + _check_serialized_size(payload) + return payload + + def _serialize_block(self, items: Any) -> bytes: + if self._serializer == "arrow": + return self._serialize_arrow_block(items) + return self._serialize_msgpack_block(items) + + def _deserialize_msgpack_block(self, payload: bytes) -> list[Any]: + return msgpack_unpackb(payload) + + def _deserialize_arrow_block(self, payload: bytes) -> list[Any]: + pa, pa_ipc = self._require_pyarrow() + batch = pa_ipc.read_record_batch(pa.BufferReader(payload), self._get_arrow_schema()) + return batch.column(0).to_pylist() + + def _deserialize_block(self, payload: bytes) -> list[Any]: + if self._serializer == "arrow": + return self._deserialize_arrow_block(payload) + return self._deserialize_msgpack_block(payload) + def _vl_cparams_kwargs(self) -> dict[str, Any]: return asdict(self.schunk.cparams) def _vl_dparams_kwargs(self) -> dict[str, Any]: return asdict(self.schunk.dparams) - def _compress_batch(self, batch: 
list[Any]) -> bytes: + def _compress_batch(self, batch: Any) -> bytes: if self._max_blocksize is None: raise RuntimeError("BatchStore max_blocksize is not initialized") blocks = [ self._serialize_block(batch[i : i + self._max_blocksize]) - for i in range(0, len(batch), self._max_blocksize) + for i in range(0, self._batch_len(batch), self._max_blocksize) ] return blosc2.blosc2_ext.vlcompress(blocks, **self._vl_cparams_kwargs()) @@ -346,53 +586,70 @@ def _decode_blocks(self, nbatch: int) -> list[list[Any]]: block_payloads = blosc2.blosc2_ext.vldecompress( self.schunk.get_chunk(nbatch), **self._vl_dparams_kwargs() ) - return [msgpack_unpackb(payload) for payload in block_payloads] + return [self._deserialize_block(payload) for payload in block_payloads] def _get_batch(self, index: int) -> Batch: return Batch(self, index, self.schunk.get_lazychunk(index)) def append(self, value: object) -> int: - """Append one batch and return the new number of entries.""" + """Append one batch and return the new number of batches.""" self._check_writable() batch = self._serialize_batch(value) batch_payload = self._compress_batch(batch) - return self.schunk.append_chunk(batch_payload) + length = self._batch_len(batch) + new_len = self.schunk.append_chunk(batch_payload) + self._ensure_batch_lengths().append(length) + self._persist_batch_lengths() + return new_len def insert(self, index: int, value: object) -> int: - """Insert one batch at ``index`` and return the new number of entries.""" + """Insert one batch at ``index`` and return the new number of batches.""" self._check_writable() index = self._normalize_insert_index(index) batch = self._serialize_batch(value) batch_payload = self._compress_batch(batch) - return self.schunk.insert_chunk(index, batch_payload) + length = self._batch_len(batch) + new_len = self.schunk.insert_chunk(index, batch_payload) + self._ensure_batch_lengths().insert(index, length) + self._persist_batch_lengths() + return new_len def delete(self, index: int | 
slice) -> int: - """Delete the batch at ``index`` and return the new number of entries.""" + """Delete the batch at ``index`` and return the new number of batches.""" self._check_writable() if isinstance(index, slice): for idx in reversed(self._slice_indices(index)): self.schunk.delete_chunk(idx) + if self._batch_lengths is not None: + del self._batch_lengths[idx] + self._persist_batch_lengths() return len(self) index = self._normalize_index(index) - return self.schunk.delete_chunk(index) + new_len = self.schunk.delete_chunk(index) + if self._batch_lengths is not None: + del self._batch_lengths[index] + self._persist_batch_lengths() + return new_len def pop(self, index: int = -1) -> list[Any]: - """Remove and return the batch at ``index``.""" + """Remove and return the batch at ``index`` as a Python list.""" self._check_writable() if isinstance(index, slice): raise NotImplementedError("Slicing is not supported for BatchStore") index = self._normalize_index(index) value = self[index][:] - self.schunk.delete_chunk(index) + self.delete(index) return value def extend(self, values: object) -> None: - """Append all batches from an iterable.""" + """Append all batches from an iterable of batches.""" self._check_writable() for value in values: batch = self._serialize_batch(value) batch_payload = self._compress_batch(batch) self.schunk.append_chunk(batch_payload) + self._ensure_batch_lengths().append(self._batch_len(batch)) + self._persist_batch_lengths() def clear(self) -> None: """Remove all entries from the container.""" @@ -408,8 +665,11 @@ def clear(self) -> None: storage=storage, ) self._attach_schunk(schunk) + self._batch_lengths = [] + self._persist_batch_lengths() def __getitem__(self, index: int | slice) -> Batch | list[Batch]: + """Return one batch or a list of batches.""" if isinstance(index, slice): return [self[i] for i in self._slice_indices(index)] index = self._normalize_index(index) @@ -425,10 +685,14 @@ def __setitem__(self, index: int | slice, value: 
object) -> None: start = self._normalize_insert_index(0 if index.start is None else index.start) for idx in reversed(indices): self.schunk.delete_chunk(idx) + if self._batch_lengths is not None: + del self._batch_lengths[idx] for offset, item in enumerate(values): batch = self._serialize_batch(item) batch_payload = self._compress_batch(batch) self.schunk.insert_chunk(start + offset, batch_payload) + self._ensure_batch_lengths().insert(start + offset, self._batch_len(batch)) + self._persist_batch_lengths() return if len(values) != len(indices): raise ValueError( @@ -438,29 +702,34 @@ def __setitem__(self, index: int | slice, value: object) -> None: batch = self._serialize_batch(item) batch_payload = self._compress_batch(batch) self.schunk.update_chunk(idx, batch_payload) + if self._batch_lengths is not None: + self._batch_lengths[idx] = self._batch_len(batch) + self._persist_batch_lengths() return self._check_writable() index = self._normalize_index(index) batch = self._serialize_batch(value) batch_payload = self._compress_batch(batch) self.schunk.update_chunk(index, batch_payload) + if self._batch_lengths is not None: + self._batch_lengths[index] = self._batch_len(batch) + self._persist_batch_lengths() def __delitem__(self, index: int | slice) -> None: self.delete(index) def __len__(self) -> int: + """Return the number of batches stored in the container.""" return self.schunk.nchunks - def iter_batches(self) -> Iterator[Batch]: - for i in range(len(self)): - yield self[i] - - def iter_objects(self) -> Iterator[Any]: - for batch in self.iter_batches(): + def iter_items(self) -> Iterator[Any]: + """Iterate over all items across all batches in order.""" + for batch in self: yield from batch def __iter__(self) -> Iterator[Batch]: - yield from self.iter_batches() + for i in range(len(self)): + yield self[i] @property def meta(self): @@ -508,24 +777,35 @@ def contiguous(self) -> bool: @property def info(self) -> InfoReporter: - """Print information about this 
BatchStore.""" + """Return an info reporter with a compact summary of the store.""" return InfoReporter(self) @property def info_items(self) -> list: - """A list of tuples with summary information about this BatchStore.""" - batch_sizes = [len(batch) for batch in self.iter_batches()] + """Return summary information as ``(name, value)`` pairs.""" + batch_sizes = self._get_batch_lengths() + if batch_sizes is None: + batch_sizes = [len(batch) for batch in self] + block_sizes = self._get_block_sizes(batch_sizes) if batch_sizes: batch_stats = ( f"mean={statistics.fmean(batch_sizes):.2f}, max={max(batch_sizes)}, min={min(batch_sizes)}" ) + nbatches_value = f"{len(self)} (items per batch: {batch_stats})" else: - batch_stats = "n/a" + nbatches_value = f"{len(self)} (items per batch: n/a)" + if block_sizes: + block_stats = ( + f"mean={statistics.fmean(block_sizes):.2f}, max={max(block_sizes)}, min={min(block_sizes)}" + ) + nblocks_value = f"{self._total_nblocks()} (items per block: {block_stats})" + else: + nblocks_value = f"{self._total_nblocks()} (items per block: n/a)" return [ ("type", f"{self.__class__.__name__}"), - ("nbatches", len(self)), - ("batch stats", batch_stats), - ("max_blocksize", self.max_blocksize), + ("serializer", self.serializer), + ("nbatches", nbatches_value), + ("nblocks", nblocks_value), ("nitems", sum(batch_sizes)), ("nbytes", format_nbytes_info(self.nbytes)), ("cbytes", format_nbytes_info(self.cbytes)), @@ -535,15 +815,17 @@ def info_items(self) -> list: ] def to_cframe(self) -> bytes: + """Serialize the full store to a Blosc2 cframe buffer.""" return self.schunk.to_cframe() def copy(self, **kwargs: Any) -> BatchStore: - """Create a copy of the container with optional constructor overrides.""" + """Create a copy of the store with optional constructor overrides.""" if "meta" in kwargs: raise ValueError("meta should not be passed to copy") kwargs["cparams"] = kwargs.get("cparams", copy.deepcopy(self.cparams)) kwargs["dparams"] = 
kwargs.get("dparams", copy.deepcopy(self.dparams)) kwargs["max_blocksize"] = kwargs.get("max_blocksize", self.max_blocksize) + kwargs["serializer"] = kwargs.get("serializer", self.serializer) if "storage" not in kwargs: kwargs["meta"] = self._copy_meta() @@ -553,9 +835,9 @@ def copy(self, **kwargs: Any) -> BatchStore: out = BatchStore(**kwargs) if "storage" not in kwargs and len(self.vlmeta) > 0: - for key, value in self.vlmeta.getall().items(): + for key, value in self._user_vlmeta_items().items(): out.vlmeta[key] = value - out.extend(self.iter_batches()) + out.extend(self) return out def __enter__(self) -> BatchStore: @@ -566,3 +848,8 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> bool: def __repr__(self) -> str: return f"BatchStore(len={len(self)}, urlpath={self.urlpath!r})" + + @property + def serializer(self) -> str: + """Serializer name used for batch payloads.""" + return self._serializer diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index 29a2b701..c9f520ed 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -48,9 +48,9 @@ def test_batchstore_roundtrip(contiguous, urlpath): assert len(barray) == len(BATCHES) assert barray.max_blocksize is not None assert 1 <= barray.max_blocksize <= len(BATCHES[0]) - assert [batch[:] for batch in barray.iter_batches()] == BATCHES + assert [batch[:] for batch in barray] == BATCHES assert barray.append([1, 2]) == len(BATCHES) + 1 - assert [batch[:] for batch in barray.iter_batches()][-1] == [1, 2] + assert [batch[:] for batch in barray][-1] == [1, 2] batch0 = barray[0] assert isinstance(batch0, blosc2.Batch) @@ -78,13 +78,13 @@ def test_batchstore_roundtrip(contiguous, urlpath): del expected[2] del barray[-2] del expected[-2] - assert [batch[:] for batch in barray.iter_batches()] == expected + assert [batch[:] for batch in barray] == expected if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") assert isinstance(reopened, blosc2.BatchStore) assert 
reopened.max_blocksize == barray.max_blocksize - assert [batch[:] for batch in reopened.iter_batches()] == expected + assert [batch[:] for batch in reopened] == expected with pytest.raises(ValueError): reopened.append(["nope"]) with pytest.raises(ValueError): @@ -105,12 +105,40 @@ def test_batchstore_roundtrip(contiguous, urlpath): reopened_rw = blosc2.open(urlpath, mode="a") reopened_rw[0] = ["changed", "batch", 0] expected[0] = ["changed", "batch", 0] - assert [batch[:] for batch in reopened_rw.iter_batches()] == expected + assert [batch[:] for batch in reopened_rw] == expected if contiguous: reopened_mmap = blosc2.open(urlpath, mode="r", mmap_mode="r") assert isinstance(reopened_mmap, blosc2.BatchStore) - assert [batch[:] for batch in reopened_mmap.iter_batches()] == expected + assert [batch[:] for batch in reopened_mmap] == expected + + blosc2.remove_urlpath(urlpath) + + +def test_batchstore_arrow_ipc_roundtrip(): + pa = pytest.importorskip("pyarrow") + urlpath = "test_batchstore_arrow_ipc.b2b" + blosc2.remove_urlpath(urlpath) + + barray = blosc2.BatchStore(storage=_storage(True, urlpath), serializer="arrow") + assert barray.serializer == "arrow" + assert barray.meta["batchstore"]["serializer"] == "arrow" + + batch1 = pa.array([[1, 2], None, [3]]) + batch2 = pa.array([[4], [5, 6]]) + barray.append(batch1) + barray.append(batch2) + + assert barray[0][:] == [[1, 2], None, [3]] + assert barray[1][:] == [[4], [5, 6]] + assert barray.meta["batchstore"]["arrow_schema"] is not None + + reopened = blosc2.open(urlpath, mode="r") + assert isinstance(reopened, blosc2.BatchStore) + assert reopened.serializer == "arrow" + assert reopened.meta["batchstore"]["serializer"] == "arrow" + assert reopened[0][:] == [[1, 2], None, [3]] + assert reopened[1][:] == [[4], [5, 6]] blosc2.remove_urlpath(urlpath) @@ -126,11 +154,11 @@ def test_batchstore_from_cframe(): restored = blosc2.from_cframe(barray.to_cframe()) assert isinstance(restored, blosc2.BatchStore) - assert [batch[:] for 
batch in restored.iter_batches()] == expected + assert [batch[:] for batch in restored] == expected restored2 = blosc2.from_cframe(barray.to_cframe()) assert isinstance(restored2, blosc2.BatchStore) - assert [batch[:] for batch in restored2.iter_batches()] == expected + assert [batch[:] for batch in restored2] == expected def test_batchstore_info(): @@ -143,9 +171,9 @@ def test_batchstore_info(): items = dict(barray.info_items) assert items["type"] == "BatchStore" - assert items["nbatches"] == len(BATCHES) - assert items["batch stats"].startswith("mean=") - assert items["max_blocksize"] == barray.max_blocksize + assert items["serializer"] == "msgpack" + assert items["nbatches"].startswith(f"{len(BATCHES)} (items per batch: mean=") + assert items["nblocks"].startswith(str(len(BATCHES))) assert items["nitems"] == sum(len(batch) for batch in BATCHES) assert "urlpath" not in items assert "contiguous" not in items @@ -156,9 +184,53 @@ def test_batchstore_info(): text = repr(barray.info) assert "type" in text + assert "serializer" in text assert "BatchStore" in text - assert "batch stats" in text - assert "max_blocksize" in text + assert "items per batch" in text + assert "items per block" in text + + +def test_batchstore_info_uses_persisted_batch_lengths(): + barray = blosc2.BatchStore() + barray.extend(BATCHES) + + assert barray.vlmeta["_batch_store_metadata"]["batch_lengths"] == [len(batch) for batch in BATCHES] + + def fail_decode(*args, **kwargs): + raise AssertionError( + "info() should not deserialize batches when batch_lengths metadata is available" + ) + + original_decode_blocks = barray._decode_blocks + barray._decode_blocks = fail_decode + try: + items = dict(barray.info_items) + finally: + barray._decode_blocks = original_decode_blocks + + assert items["nitems"] == sum(len(batch) for batch in BATCHES) + assert "items per batch: mean=" in items["nbatches"] + + +def test_batchstore_info_reports_exact_block_stats_from_lazy_chunks(): + barray = 
blosc2.BatchStore(max_blocksize=2) + barray.extend([[1, 2, 3, 4, 5], [6, 7], [8]]) + + items = dict(barray.info_items) + assert items["nblocks"] == "5 (items per block: mean=1.60, max=2, min=1)" + + +def test_batchstore_pop_keeps_batch_lengths_metadata_in_sync(): + barray = blosc2.BatchStore(max_blocksize=2) + barray.extend([[1, 2, 3], [4, 5], [6]]) + + removed = barray.pop(1) + + assert removed == [4, 5] + assert [batch[:] for batch in barray] == [[1, 2, 3], [6]] + assert barray.vlmeta["_batch_store_metadata"]["batch_lengths"] == [3, 1] + items = dict(barray.info_items) + assert items["nbatches"].startswith("2 (items per batch: mean=2.00") def test_batchstore_zstd_does_not_use_dict_by_default(): @@ -172,7 +244,7 @@ def test_batchstore_explicit_max_blocksize(): assert barray.max_blocksize == 2 barray.append([1, 2, 3]) barray.append([4]) - assert [batch[:] for batch in barray.iter_batches()] == [[1, 2, 3], [4]] + assert [batch[:] for batch in barray] == [[1, 2, 3], [4]] def test_batchstore_get_vlblock_and_scalar_access(): @@ -227,13 +299,13 @@ def wrapped_get_vlblock(nchunk, nblock): barray.schunk.get_vlblock = original_get_vlblock -def test_batchstore_iter_objects(): +def test_batchstore_iter_items(): barray = blosc2.BatchStore(max_blocksize=2) batches = [[1, 2, 3], [4], [5, 6]] barray.extend(batches) assert [batch[:] for batch in barray] == batches - assert list(barray.iter_objects()) == [1, 2, 3, 4, 5, 6] + assert list(barray.iter_items()) == [1, 2, 3, 4, 5, 6] def test_batchstore_respects_explicit_use_dict_and_non_zstd(): @@ -322,7 +394,7 @@ def test_batchstore_constructor_kwargs(): barray.extend(BATCHES) reopened = blosc2.BatchStore(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") - assert [batch[:] for batch in reopened.iter_batches()] == BATCHES + assert [batch[:] for batch in reopened] == BATCHES blosc2.remove_urlpath(urlpath) @@ -341,21 +413,21 @@ def test_batchstore_list_like_ops(contiguous, urlpath): barray = 
blosc2.BatchStore(storage=_storage(contiguous, urlpath)) barray.extend([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - assert [batch[:] for batch in barray.iter_batches()] == [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + assert [batch[:] for batch in barray] == [[1, 2, 3], [4, 5, 6], [7, 8, 9]] assert barray.pop() == [7, 8, 9] assert barray.pop(0) == [1, 2, 3] - assert [batch[:] for batch in barray.iter_batches()] == [[4, 5, 6]] + assert [batch[:] for batch in barray] == [[4, 5, 6]] barray.clear() assert len(barray) == 0 - assert [batch[:] for batch in barray.iter_batches()] == [] + assert [batch[:] for batch in barray] == [] barray.extend([["a", "b", "c"], ["d", "e", "f"]]) - assert [batch[:] for batch in barray.iter_batches()] == [["a", "b", "c"], ["d", "e", "f"]] + assert [batch[:] for batch in barray] == [["a", "b", "c"], ["d", "e", "f"]] if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") - assert [batch[:] for batch in reopened.iter_batches()] == [["a", "b", "c"], ["d", "e", "f"]] + assert [batch[:] for batch in reopened] == [["a", "b", "c"], ["d", "e", "f"]] blosc2.remove_urlpath(urlpath) @@ -381,15 +453,15 @@ def test_batchstore_slices(contiguous, urlpath): barray[2:5] = [["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]] expected[2:5] = [["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]] - assert [batch[:] for batch in barray.iter_batches()] == expected + assert [batch[:] for batch in barray] == expected barray[1:6:2] = [[100, 101, 102], [103, 104, 105], [106, 107, 108]] expected[1:6:2] = [[100, 101, 102], [103, 104, 105], [106, 107, 108]] - assert [batch[:] for batch in barray.iter_batches()] == expected + assert [batch[:] for batch in barray] == expected del barray[::3] del expected[::3] - assert [batch[:] for batch in barray.iter_batches()] == expected + assert [batch[:] for batch in barray] == expected if urlpath is not None: reopened = blosc2.open(urlpath, mode="r") @@ -427,14 +499,14 @@ def test_batchstore_copy(): copied = original.copy( urlpath=copy_path, 
contiguous=False, cparams={"codec": blosc2.Codec.LZ4, "clevel": 5} ) - assert [batch[:] for batch in copied.iter_batches()] == [batch[:] for batch in original.iter_batches()] + assert [batch[:] for batch in copied] == [batch[:] for batch in original] assert copied.urlpath == copy_path assert copied.schunk.contiguous is False assert copied.cparams.codec == blosc2.Codec.LZ4 assert copied.cparams.clevel == 5 inmem = original.copy() - assert [batch[:] for batch in inmem.iter_batches()] == [batch[:] for batch in original.iter_batches()] + assert [batch[:] for batch in inmem] == [batch[:] for batch in original] assert inmem.urlpath is None with pytest.raises(ValueError, match="meta should not be passed to copy"): @@ -469,7 +541,7 @@ def test_batchstore_multithreaded_inner_vl(contiguous, nthreads): ) barray.extend(batches) - assert [batch[:] for batch in barray.iter_batches()] == batches + assert [batch[:] for batch in barray] == batches assert [barray[i][:] for i in range(len(barray))] == batches @@ -488,7 +560,7 @@ def test_batchstore_validation_errors(): blosc2.BatchStore().pop() barray.extend([[1, 2, 3]]) assert barray.append([2, 3]) == 2 - assert [batch[:] for batch in barray.iter_batches()] == [[1, 2, 3], [2, 3]] + assert [batch[:] for batch in barray] == [[1, 2, 3], [2, 3]] with pytest.raises(NotImplementedError): barray.pop(slice(0, 1)) @@ -501,7 +573,7 @@ def test_batchstore_in_embed_store(): estore["/batch"] = barray restored = estore["/batch"] assert isinstance(restored, blosc2.BatchStore) - assert [batch[:] for batch in restored.iter_batches()] == BATCHES + assert [batch[:] for batch in restored] == BATCHES def test_batchstore_in_dict_store(): @@ -516,6 +588,6 @@ def test_batchstore_in_dict_store(): with blosc2.DictStore(path, mode="r") as dstore: restored = dstore["/batch"] assert isinstance(restored, blosc2.BatchStore) - assert [batch[:] for batch in restored.iter_batches()] == BATCHES + assert [batch[:] for batch in restored] == BATCHES 
blosc2.remove_urlpath(path) From 0ab340d592862a02f2e9770e606574baa831779e Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 20 Mar 2026 08:38:40 +0100 Subject: [PATCH 27/34] Use metadata-based DictStore discovery and warn on leaf mismatches --- src/blosc2/dict_store.py | 159 ++++++++++++++++++++++++++++++--------- tests/test_dict_store.py | 88 ++++++++++++++++++++++ tests/test_tree_store.py | 39 ++++++++++ 3 files changed, 249 insertions(+), 37 deletions(-) diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 17f50ac6..81b577b4 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -10,6 +10,7 @@ import os import shutil import tempfile +import warnings import zipfile from typing import TYPE_CHECKING, Any @@ -94,6 +95,9 @@ class DictStore: ----- - External persistence uses the following file extensions: .b2nd for NDArray, .b2f for SChunk, and .b2b for BatchStore. + These suffixes are a naming convention for newly written leaves; when + reopening an existing store, leaf typing is resolved from object + metadata instead of trusting the suffix alone. """ def __init__( @@ -112,7 +116,7 @@ def __init__( """ See :class:`DictStore` for full documentation of parameters. 
""" - self.localpath = localpath if isinstance(localpath, (str, bytes)) else str(localpath) + self.localpath = localpath if isinstance(localpath, str | bytes) else str(localpath) if not self.localpath.endswith((".b2z", ".b2d")): raise ValueError(f"localpath must have a .b2z or .b2d extension; you passed: {self.localpath}") if mode not in ("r", "w", "a"): @@ -182,13 +186,7 @@ def _init_read_mode(self, dparams: blosc2.DParams | None = None): mmap_mode=self.mmap_mode, dparams=dparams, ) - for filepath in self.offsets: - if filepath.endswith((".b2nd", ".b2f", ".b2b")): - if filepath.endswith(".b2nd"): - key = "/" + filepath[:-5] - else: - key = "/" + filepath[:-4] - self.map_tree[key] = filepath + self._update_map_tree_from_offsets() else: # .b2d if not os.path.isdir(self.localpath): raise FileNotFoundError(f"Directory {self.localpath} does not exist for reading.") @@ -204,6 +202,90 @@ def _init_read_mode(self, dparams: blosc2.DParams | None = None): self._estore = EmbedStore(_from_schunk=schunk) self.storage.meta = self._estore.storage.meta + @staticmethod + def _logical_key_from_relpath(rel_path: str) -> str: + """Map an external leaf path to its logical tree key.""" + rel_path = rel_path.replace(os.sep, "/") + key = os.path.splitext(rel_path)[0] + if not key.startswith("/"): + key = "/" + key + return key + + @staticmethod + def _expected_ext_from_kind(kind: str) -> str: + """Return the canonical write-time suffix for a supported external leaf kind.""" + if kind == "ndarray": + return ".b2nd" + if kind == "batchstore": + return ".b2b" + return ".b2f" + + @classmethod + def _opened_external_kind( + cls, + opened: blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore | C2Array | Any, + rel_path: str, + ) -> str | None: + """Return the supported external leaf kind for an already opened object.""" + processed = _process_opened_object(opened) + if isinstance(processed, blosc2.BatchStore): + kind = "batchstore" + elif isinstance(processed, blosc2.VLArray): + kind = 
"vlarray" + elif isinstance(processed, blosc2.NDArray): + kind = "ndarray" + elif isinstance(processed, SChunk): + kind = "schunk" + else: + warnings.warn( + f"Ignoring unsupported Blosc2 object at '{rel_path}' during DictStore discovery: " + f"{type(processed).__name__}", + UserWarning, + stacklevel=2, + ) + return None + + expected_ext = cls._expected_ext_from_kind(kind) + found_ext = os.path.splitext(rel_path)[1] + if found_ext != expected_ext: + warnings.warn( + f"External leaf '{rel_path}' uses extension '{found_ext}' but metadata resolves to " + f"{type(processed).__name__}; expected '{expected_ext}'.", + UserWarning, + stacklevel=2, + ) + return kind + + def _probe_external_leaf_path(self, rel_path: str) -> bool: + """Return whether a working-dir file is a supported external leaf.""" + urlpath = os.path.join(self.working_dir, rel_path) + try: + opened = blosc2.blosc2_ext.open( + urlpath, + mode="r", + offset=0, + mmap_mode=self.mmap_mode, + dparams=self.dparams, + ) + except Exception: + return False + return self._opened_external_kind(opened, rel_path) is not None + + def _probe_external_leaf_offset(self, filepath: str) -> bool: + """Return whether a zip member is a supported external leaf.""" + offset = self.offsets[filepath]["offset"] + try: + opened = blosc2.blosc2_ext.open( + self.b2z_path, + mode="r", + offset=offset, + mmap_mode=self.mmap_mode, + dparams=self.dparams, + ) + except Exception: + return False + return self._opened_external_kind(opened, filepath) is not None + def _init_write_append_mode( self, cparams: blosc2.CParams | None, @@ -229,24 +311,23 @@ def _init_write_append_mode( self._update_map_tree() def _update_map_tree(self): - # Build map_tree from .b2nd and .b2f files in working dir + # Build map_tree from supported external leaves in working dir. 
for root, _, files in os.walk(self.working_dir): for file in files: filepath = os.path.join(root, file) - if filepath.endswith((".b2nd", ".b2f", ".b2b")): - # Convert filename to key: remove extension and ensure starts with / - rel_path = os.path.relpath(filepath, self.working_dir) - # Normalize path separators to forward slashes for cross-platform consistency - rel_path = rel_path.replace(os.sep, "/") - if rel_path.endswith(".b2nd"): - key = rel_path[:-5] - elif rel_path.endswith(".b2b") or rel_path.endswith(".b2f"): - key = rel_path[:-4] - else: - continue - if not key.startswith("/"): - key = "/" + key - self.map_tree[key] = rel_path + if os.path.abspath(filepath) == os.path.abspath(self.estore_path): + continue + rel_path = os.path.relpath(filepath, self.working_dir).replace(os.sep, "/") + if self._probe_external_leaf_path(rel_path): + self.map_tree[self._logical_key_from_relpath(rel_path)] = rel_path + + def _update_map_tree_from_offsets(self): + """Build map_tree from supported external leaves in a zip store.""" + for filepath in self.offsets: + if filepath == "embed.b2e": + continue + if self._probe_external_leaf_offset(filepath): + self.map_tree[self._logical_key_from_relpath(filepath)] = filepath @property def estore(self) -> EmbedStore: @@ -255,13 +336,13 @@ def estore(self) -> EmbedStore: @staticmethod def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore) -> int: - if isinstance(value, (blosc2.VLArray, blosc2.BatchStore)): + if isinstance(value, blosc2.VLArray | blosc2.BatchStore): return value.schunk.nbytes return value.nbytes @staticmethod def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore) -> bool: - return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray, blosc2.BatchStore)) and bool( + return isinstance(value, blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore) and bool( getattr(value, "urlpath", None) ) @@ -406,12 +487,14 @@ def values(self) -> 
Iterator[blosc2.NDArray | SChunk | C2Array]: if self.is_zip_store: if filepath in self.offsets: offset = self.offsets[filepath]["offset"] - yield blosc2.blosc2_ext.open( - self.b2z_path, - mode="r", - offset=offset, - mmap_mode=self.mmap_mode, - dparams=self.dparams, + yield _process_opened_object( + blosc2.blosc2_ext.open( + self.b2z_path, + mode="r", + offset=offset, + mmap_mode=self.mmap_mode, + dparams=self.dparams, + ) ) else: urlpath = os.path.join(self.working_dir, filepath) @@ -438,12 +521,14 @@ def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | C2Array]]: offset = self.offsets[filepath]["offset"] yield ( key, - blosc2.blosc2_ext.open( - self.b2z_path, - mode="r", - offset=offset, - mmap_mode=self.mmap_mode, - dparams=self.dparams, + _process_opened_object( + blosc2.blosc2_ext.open( + self.b2z_path, + mode="r", + offset=offset, + mmap_mode=self.mmap_mode, + dparams=self.dparams, + ) ), ) else: diff --git a/tests/test_dict_store.py b/tests/test_dict_store.py index 74122424..6a157f72 100644 --- a/tests/test_dict_store.py +++ b/tests/test_dict_store.py @@ -16,6 +16,22 @@ from blosc2.dict_store import DictStore +def _rename_store_member(store_path, old_name, new_name): + """Rename an external leaf inside a .b2d/.b2z store without changing its contents.""" + if str(store_path).endswith(".b2d"): + old_path = os.path.join(store_path, old_name.replace("/", os.sep)) + new_path = os.path.join(store_path, new_name.replace("/", os.sep)) + os.rename(old_path, new_path) + return + + tmp_zip = f"{store_path}.tmp" + with zipfile.ZipFile(store_path, "r") as src, zipfile.ZipFile(tmp_zip, "w", zipfile.ZIP_STORED) as dst: + for info in src.infolist(): + arcname = new_name if info.filename == old_name else info.filename + dst.writestr(arcname, src.read(info.filename), compress_type=zipfile.ZIP_STORED) + os.replace(tmp_zip, store_path) + + @pytest.fixture(params=["b2d", "b2z"]) def populated_dict_store(request): """Create and populate a DictStore for tests. 
@@ -266,6 +282,78 @@ def test_external_vlarray_file_and_reopen(tmp_path): assert value.vlmeta["description"] == "External VLArray" +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_metadata_discovery_reopens_renamed_external_ndarray(storage_type, tmp_path): + path = tmp_path / f"test_renamed_ndarray.{storage_type}" + ext_path = tmp_path / "renamed_array_source.b2nd" + + with DictStore(str(path), mode="w", threshold=None) as dstore: + arr_external = blosc2.arange(5, urlpath=str(ext_path), mode="w") + arr_external.vlmeta["description"] = "Renamed NDArray" + dstore["/dir1/node3"] = arr_external + + old_name = "dir1/node3.b2nd" + new_name = "dir1/node3.weird" + _rename_store_member(str(path), old_name, new_name) + + with pytest.warns(UserWarning, match=r"node3\.weird'.*NDArray.*expected '\.b2nd'"): + dstore_read = DictStore(str(path), mode="r") + with dstore_read: + assert dstore_read.map_tree["/dir1/node3"] == new_name + node3 = dstore_read["/dir1/node3"] + assert isinstance(node3, blosc2.NDArray) + assert np.array_equal(node3[:], np.arange(5)) + assert node3.vlmeta["description"] == "Renamed NDArray" + + +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_metadata_discovery_reopens_renamed_external_vlarray(storage_type, tmp_path): + path = tmp_path / f"test_renamed_vlarray.{storage_type}" + ext_path = tmp_path / "renamed_vlarray_source.b2frame" + values = ["alpha", {"nested": True}, None, (1, 2, 3)] + + vlarray = blosc2.VLArray(urlpath=str(ext_path), mode="w", contiguous=True) + vlarray.extend(values) + vlarray.vlmeta["description"] = "Renamed VLArray" + + with DictStore(str(path), mode="w", threshold=None) as dstore: + dstore["/dir1/vlarray_ext"] = vlarray + + old_name = "dir1/vlarray_ext.b2f" + new_name = "dir1/vlarray_ext.renamed" + _rename_store_member(str(path), old_name, new_name) + + with pytest.warns(UserWarning, match=r"vlarray_ext\.renamed'.*VLArray.*expected '\.b2f'"): + dstore_read = DictStore(str(path), mode="r") + with 
dstore_read: + assert dstore_read.map_tree["/dir1/vlarray_ext"] == new_name + value = dstore_read["/dir1/vlarray_ext"] + assert isinstance(value, blosc2.VLArray) + assert list(value) == values + assert value.vlmeta["description"] == "Renamed VLArray" + + +def test_metadata_discovery_warns_and_skips_unsupported_blosc2_leaf(tmp_path): + path = tmp_path / "test_unsupported_lazyexpr.b2d" + + with DictStore(str(path), mode="w") as dstore: + dstore["/embedded"] = np.arange(3) + + a = blosc2.asarray(np.arange(5), urlpath=str(tmp_path / "a.b2nd"), mode="w") + b = blosc2.asarray(np.arange(5), urlpath=str(tmp_path / "b.b2nd"), mode="w") + expr = a + b + expr_path = path / "unsupported_lazyexpr.b2nd" + expr.save(str(expr_path)) + + with pytest.warns( + UserWarning, match=r"Ignoring unsupported Blosc2 object.*unsupported_lazyexpr\.b2nd.*LazyExpr" + ): + dstore_read = DictStore(str(path), mode="r") + with dstore_read: + assert "/unsupported_lazyexpr" not in dstore_read + assert "/embedded" in dstore_read + + def _digest_value(value): """Return a bytes digest of a stored value.""" if isinstance(value, blosc2.SChunk): diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index 27b86452..780d6cf5 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -7,6 +7,7 @@ import os import shutil +import zipfile import numpy as np import pytest @@ -15,6 +16,22 @@ from blosc2.tree_store import TreeStore +def _rename_store_member(store_path, old_name, new_name): + """Rename an external leaf inside a .b2d/.b2z store without changing its contents.""" + if str(store_path).endswith(".b2d"): + old_path = os.path.join(store_path, old_name.replace("/", os.sep)) + new_path = os.path.join(store_path, new_name.replace("/", os.sep)) + os.rename(old_path, new_path) + return + + tmp_zip = f"{store_path}.tmp" + with zipfile.ZipFile(store_path, "r") as src, zipfile.ZipFile(tmp_zip, "w", zipfile.ZIP_STORED) as dst: + for info in src.infolist(): + arcname = new_name if 
info.filename == old_name else info.filename + dst.writestr(arcname, src.read(info.filename), compress_type=zipfile.ZIP_STORED) + os.replace(tmp_zip, store_path) + + @pytest.fixture(params=["b2d", "b2z"]) def populated_tree_store(request): """A fixture that creates and populates a TreeStore.""" @@ -671,6 +688,28 @@ def test_external_batchstore_support(tmp_path): assert [batch[:] for batch in retrieved] == [[{"id": 1}, {"id": 2}], [{"id": 3}]] +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_metadata_discovery_reopens_renamed_batchstore_leaf(storage_type, tmp_path): + store_path = tmp_path / f"test_batchstore_renamed.{storage_type}" + + with TreeStore(str(store_path), mode="w", threshold=0) as tstore: + bstore = blosc2.BatchStore(max_blocksize=2) + bstore.extend([[{"id": 1}, {"id": 2}], [{"id": 3}]]) + tstore["/data/batchstore"] = bstore + + old_name = "data/batchstore.b2b" + new_name = "data/batchstore.odd" + _rename_store_member(str(store_path), old_name, new_name) + + with pytest.warns(UserWarning, match=r"batchstore\.odd'.*BatchStore.*expected '\.b2b'"): + tstore = TreeStore(str(store_path), mode="r") + with tstore: + assert tstore.map_tree["/data/batchstore"] == new_name + retrieved = tstore["/data/batchstore"] + assert isinstance(retrieved, blosc2.BatchStore) + assert [batch[:] for batch in retrieved] == [[{"id": 1}, {"id": 2}], [{"id": 3}]] + + def test_treestore_vlmeta_externalized_b2d(tmp_path): store_path = tmp_path / "test_vlmeta_externalized.b2d" From 79cd7ab00f85c03ccc8827858575a9431ac3ba5d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 20 Mar 2026 11:36:30 +0100 Subject: [PATCH 28/34] Add a BatchStore.items accessor --- .../tutorials/12.batchstore.ipynb | 189 ++++++++++++++---- examples/batch_store.py | 5 + src/blosc2/batch_store.py | 68 +++++++ src/blosc2/schunk.py | 10 +- tests/test_batch_store.py | 57 ++++++ 5 files changed, 285 insertions(+), 44 deletions(-) diff --git a/doc/getting_started/tutorials/12.batchstore.ipynb 
b/doc/getting_started/tutorials/12.batchstore.ipynb index b898c455..52bcc660 100644 --- a/doc/getting_started/tutorials/12.batchstore.ipynb +++ b/doc/getting_started/tutorials/12.batchstore.ipynb @@ -18,8 +18,14 @@ "id": "be8591f8f86952e8", "metadata": { "ExecuteTime": { - "end_time": "2026-03-20T07:06:05.876287Z", - "start_time": "2026-03-20T07:06:05.661801Z" + "end_time": "2026-03-20T10:24:10.190550Z", + "start_time": "2026-03-20T10:24:10.014859Z" + }, + "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.329739Z", + "iopub.status.busy": "2026-03-20T10:23:51.329437Z", + "iopub.status.idle": "2026-03-20T10:23:51.556056Z", + "shell.execute_reply": "2026-03-20T10:23:51.555614Z" } }, "outputs": [], @@ -53,8 +59,14 @@ "id": "f8c8a2b7692e7228", "metadata": { "ExecuteTime": { - "end_time": "2026-03-20T07:06:05.904277Z", - "start_time": "2026-03-20T07:06:05.882545Z" + "end_time": "2026-03-20T10:24:10.211954Z", + "start_time": "2026-03-20T10:24:10.191296Z" + }, + "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.557338Z", + "iopub.status.busy": "2026-03-20T10:23:51.557245Z", + "iopub.status.idle": "2026-03-20T10:23:51.564920Z", + "shell.execute_reply": "2026-03-20T10:23:51.564578Z" } }, "outputs": [ @@ -62,8 +74,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Batches: [[{'name': 'alpha', 'count': 1}, {'name': 'beta', 'count': 2}, {'name': 'gamma', 'count': 3}], [{'name': 'delta', 'count': 4}, {'name': 'epsilon', 'count': 5}], [{'name': 'zeta', 'count': 6}], [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}]]\n", - "Number of batches: 4\n" + "Batches: [[{'name': 'alpha', 'count': 1}, {'name': 'beta', 'count': 2}, {'name': 'gamma', 'count': 3}], [{'name': 'delta', 'count': 4}, {'name': 'epsilon', 'count': 5}], [{'name': 'zeta', 'count': 6}], [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}], [{'name': 'iota', 'count': 9}, {'name': 'kappa', 'count': 10}, {'name': 'lambda', 'count': 11}]]\n", + "Number of batches: 5\n" ] } 
], @@ -86,6 +98,11 @@ " [\n", " [{\"name\": \"zeta\", \"count\": 6}],\n", " [{\"name\": \"eta\", \"count\": 7}, {\"name\": \"theta\", \"count\": 8}],\n", + " [\n", + " {\"name\": \"iota\", \"count\": 9},\n", + " {\"name\": \"kappa\", \"count\": 10},\n", + " {\"name\": \"lambda\", \"count\": 11},\n", + " ],\n", " ]\n", ")\n", "\n", @@ -109,8 +126,14 @@ "id": "20861d3e348f9df1", "metadata": { "ExecuteTime": { - "end_time": "2026-03-20T07:06:05.924634Z", - "start_time": "2026-03-20T07:06:05.905576Z" + "end_time": "2026-03-20T10:24:10.229980Z", + "start_time": "2026-03-20T10:24:10.213198Z" + }, + "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.566000Z", + "iopub.status.busy": "2026-03-20T10:23:51.565919Z", + "iopub.status.idle": "2026-03-20T10:23:51.569765Z", + "shell.execute_reply": "2026-03-20T10:23:51.569439Z" } }, "outputs": [ @@ -121,7 +144,7 @@ "First batch: [{'name': 'alpha', 'count': 1}, {'name': 'beta', 'count': 2}, {'name': 'gamma', 'count': 3}]\n", "Second item in first batch: {'name': 'beta', 'count': 2}\n", "Slice of second batch: [{'name': 'delta', 'count': 4}]\n", - "All items: [{'name': 'alpha', 'count': 1}, {'name': 'beta', 'count': 2}, {'name': 'gamma', 'count': 3}, {'name': 'delta', 'count': 4}, {'name': 'epsilon', 'count': 5}, {'name': 'zeta', 'count': 6}, {'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}]\n" + "All items: [{'name': 'alpha', 'count': 1}, {'name': 'beta', 'count': 2}, {'name': 'gamma', 'count': 3}, {'name': 'delta', 'count': 4}, {'name': 'epsilon', 'count': 5}, {'name': 'zeta', 'count': 6}, {'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}, {'name': 'iota', 'count': 9}, {'name': 'kappa', 'count': 10}, {'name': 'lambda', 'count': 11}]\n" ] } ], @@ -148,8 +171,14 @@ "id": "df556f6da8adc369", "metadata": { "ExecuteTime": { - "end_time": "2026-03-20T07:06:05.945986Z", - "start_time": "2026-03-20T07:06:05.925866Z" + "end_time": "2026-03-20T10:24:10.259055Z", + "start_time": "2026-03-20T10:24:10.231589Z" + }, 
+ "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.570823Z", + "iopub.status.busy": "2026-03-20T10:23:51.570763Z", + "iopub.status.idle": "2026-03-20T10:23:51.577607Z", + "shell.execute_reply": "2026-03-20T10:23:51.577269Z" } }, "outputs": [ @@ -157,8 +186,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Popped batch: [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}]\n", - "After updates: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n" + "Popped batch: [{'name': 'zeta', 'count': 6}]\n", + "After updates: [[{'name': 'alpha*', 'count': 10}, {'name': 'beta*', 'count': 20}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'between-a', 'count': 99}, {'name': 'between-b', 'count': 100}], [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}], [{'name': 'iota', 'count': 9}, {'name': 'kappa', 'count': 10}, {'name': 'lambda', 'count': 11}]]\n" ] } ], @@ -167,9 +196,10 @@ " {\"name\": \"delta*\", \"count\": 40},\n", " {\"name\": \"epsilon*\", \"count\": 50},\n", "]\n", - "store.insert(1, [{\"name\": \"between\", \"count\": 99}])\n", - "removed = store.pop()\n", + "store.insert(2, [{\"name\": \"between-a\", \"count\": 99}, {\"name\": \"between-b\", \"count\": 100}])\n", + "removed = store.pop(3)\n", "del store[0]\n", + "store.insert(0, [{\"name\": \"alpha*\", \"count\": 10}, {\"name\": \"beta*\", \"count\": 20}])\n", "\n", "show(\"Popped batch\", removed)\n", "show(\"After updates\", [batch[:] for batch in store])" @@ -191,8 +221,14 @@ "id": "b32d72a68d83673e", "metadata": { "ExecuteTime": { - "end_time": "2026-03-20T07:06:05.965086Z", - "start_time": "2026-03-20T07:06:05.947144Z" + "end_time": "2026-03-20T10:24:10.300526Z", + "start_time": "2026-03-20T10:24:10.259712Z" + }, + "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.578504Z", + "iopub.status.busy": "2026-03-20T10:23:51.578433Z", + 
"iopub.status.idle": "2026-03-20T10:23:51.581563Z", + "shell.execute_reply": "2026-03-20T10:23:51.581191Z" } }, "outputs": [ @@ -200,21 +236,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Batches via iteration: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n", + "Batches via iteration: [[{'name': 'alpha*', 'count': 10}, {'name': 'beta*', 'count': 20}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'between-a', 'count': 99}, {'name': 'between-b', 'count': 100}], [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}], [{'name': 'iota', 'count': 9}, {'name': 'kappa', 'count': 10}, {'name': 'lambda', 'count': 11}]]\n", "type : BatchStore\n", "serializer : msgpack\n", - "nbatches : 3 (items per batch: mean=1.33, max=2, min=1)\n", - "nblocks : 3 (items per block: mean=1.33, max=2, min=1)\n", - "nitems : 4\n", - "nbytes : 84 (84 B)\n", - "cbytes : 468 (468 B)\n", - "cratio : 0.18\n", + "nbatches : 5 (items per batch: mean=2.20, max=3, min=2)\n", + "nblocks : 6 (items per block: mean=1.83, max=2, min=1)\n", + "nitems : 11\n", + "nbytes : 226 (226 B)\n", + "cbytes : 680 (680 B)\n", + "cratio : 0.33\n", "cparams : CParams(codec=, codec_meta=0, clevel=5, use_dict=False, typesize=1,\n", - " : nthreads=8, blocksize=0, splitmode=,\n", + " : nthreads=12, blocksize=0, splitmode=,\n", " : filters=[, , ,\n", " : , , ], filters_meta=[0,\n", " : 0, 0, 0, 0, 0], tuner=)\n", - "dparams : DParams(nthreads=8)\n", + "dparams : DParams(nthreads=12)\n", "\n" ] } @@ -240,8 +276,14 @@ "id": "45f878b8f4414a3b", "metadata": { "ExecuteTime": { - "end_time": "2026-03-20T07:06:05.990783Z", - "start_time": "2026-03-20T07:06:05.965791Z" + "end_time": "2026-03-20T10:24:10.334099Z", + "start_time": "2026-03-20T10:24:10.301619Z" + }, + "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.582437Z", + "iopub.status.busy": "2026-03-20T10:23:51.582372Z", 
+ "iopub.status.idle": "2026-03-20T10:23:51.590494Z", + "shell.execute_reply": "2026-03-20T10:23:51.590186Z" } }, "outputs": [ @@ -249,7 +291,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Copied batches: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n", + "Copied batches: [[{'name': 'alpha*', 'count': 10}, {'name': 'beta*', 'count': 20}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'between-a', 'count': 99}, {'name': 'between-b', 'count': 100}], [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}], [{'name': 'iota', 'count': 9}, {'name': 'kappa', 'count': 10}, {'name': 'lambda', 'count': 11}]]\n", "Copy serializer: msgpack\n", "Copy codec: Codec.LZ4\n" ] @@ -283,8 +325,14 @@ "id": "fd4957093f509bd4", "metadata": { "ExecuteTime": { - "end_time": "2026-03-20T07:06:06.025738Z", - "start_time": "2026-03-20T07:06:05.999799Z" + "end_time": "2026-03-20T10:24:10.359063Z", + "start_time": "2026-03-20T10:24:10.343012Z" + }, + "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.591475Z", + "iopub.status.busy": "2026-03-20T10:23:51.591415Z", + "iopub.status.idle": "2026-03-20T10:23:51.594839Z", + "shell.execute_reply": "2026-03-20T10:23:51.594553Z" } }, "outputs": [ @@ -293,9 +341,9 @@ "output_type": "stream", "text": [ "from_cframe type: BatchStore\n", - "from_cframe batches: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n", + "from_cframe batches: [[{'name': 'alpha*', 'count': 10}, {'name': 'beta*', 'count': 20}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'between-a', 'count': 99}, {'name': 'between-b', 'count': 100}], [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}], [{'name': 'iota', 'count': 9}, {'name': 'kappa', 'count': 10}, {'name': 'lambda', 'count': 11}]]\n", "Reopened 
type: BatchStore\n", - "Reopened batches: [[{'name': 'between', 'count': 99}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'zeta', 'count': 6}]]\n" + "Reopened batches: [[{'name': 'alpha*', 'count': 10}, {'name': 'beta*', 'count': 20}], [{'name': 'delta*', 'count': 40}, {'name': 'epsilon*', 'count': 50}], [{'name': 'between-a', 'count': 99}, {'name': 'between-b', 'count': 100}], [{'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}], [{'name': 'iota', 'count': 9}, {'name': 'kappa', 'count': 10}, {'name': 'lambda', 'count': 11}]]\n" ] } ], @@ -326,8 +374,14 @@ "id": "2214b2be1bfb5bc7", "metadata": { "ExecuteTime": { - "end_time": "2026-03-20T07:06:06.050975Z", - "start_time": "2026-03-20T07:06:06.034152Z" + "end_time": "2026-03-20T10:24:10.386442Z", + "start_time": "2026-03-20T10:24:10.365740Z" + }, + "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.595854Z", + "iopub.status.busy": "2026-03-20T10:23:51.595778Z", + "iopub.status.idle": "2026-03-20T10:23:51.601478Z", + "shell.execute_reply": "2026-03-20T10:23:51.601232Z" } }, "outputs": [ @@ -348,24 +402,73 @@ " [{\"name\": \"again\", \"count\": 2}, {\"name\": \"done\", \"count\": 3}],\n", " ]\n", ")\n", - "show(\"After clear + extend\", [batch[:] for batch in scratch])\n", + "show(\"After clear + extend\", [batch[:] for batch in scratch])" + ] + }, + { + "cell_type": "markdown", + "id": "8d8f9df58a46c4c1", + "metadata": {}, + "source": [ + "## Flat item access with `.items`\n", "\n", - "blosc2.remove_urlpath(urlpath)\n", - "blosc2.remove_urlpath(copy_path)" + "The main `BatchStore` API remains batch-oriented, but the `.items` accessor offers a read-only flat view across all items. Integer indexing returns one item and slicing returns a Python list." 
] }, { "cell_type": "code", - "execution_count": 8, - "id": "27c47e4fd1332b48", + "execution_count": 9, + "id": "4f5c4e5a1b8f92d4", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-20T10:24:10.403443Z", + "start_time": "2026-03-20T10:24:10.387808Z" + }, + "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.602502Z", + "iopub.status.busy": "2026-03-20T10:23:51.602451Z", + "iopub.status.idle": "2026-03-20T10:23:51.606267Z", + "shell.execute_reply": "2026-03-20T10:23:51.605893Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Flat item 0: {'name': 'alpha*', 'count': 10}\n", + "Flat item 6: {'name': 'eta', 'count': 7}\n", + "Flat slice 3:8: [{'name': 'epsilon*', 'count': 50}, {'name': 'between-a', 'count': 99}, {'name': 'between-b', 'count': 100}, {'name': 'eta', 'count': 7}, {'name': 'theta', 'count': 8}]\n" + ] + } + ], + "source": [ + "show(\"Flat item 0\", store.items[0])\n", + "show(\"Flat item 6\", store.items[6])\n", + "show(\"Flat slice 3:8\", store.items[3:8])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2a355a3fc8673692", "metadata": { "ExecuteTime": { - "end_time": "2026-03-20T07:06:06.061727Z", - "start_time": "2026-03-20T07:06:06.051400Z" + "end_time": "2026-03-20T10:24:10.420064Z", + "start_time": "2026-03-20T10:24:10.403926Z" + }, + "execution": { + "iopub.execute_input": "2026-03-20T10:23:51.607247Z", + "iopub.status.busy": "2026-03-20T10:23:51.607185Z", + "iopub.status.idle": "2026-03-20T10:23:51.608877Z", + "shell.execute_reply": "2026-03-20T10:23:51.608598Z" } }, "outputs": [], - "source": [] + "source": [ + "blosc2.remove_urlpath(urlpath)\n", + "blosc2.remove_urlpath(copy_path)" + ] } ], "metadata": { diff --git a/examples/batch_store.py b/examples/batch_store.py index 4a387af5..9a809fec 100644 --- a/examples/batch_store.py +++ b/examples/batch_store.py @@ -61,6 +61,11 @@ def main() -> None: value = reopened[batch_index][item_index] print(f" 
reopened[{batch_index}][{item_index}] -> {value}") + print() + print("Flat item reads via .items:") + print(f" reopened.items[0] -> {reopened.items[0]}") + print(f" reopened.items[150:153] -> {reopened.items[150:153]}") + print(f"BatchStore file at: {reopened.urlpath}") diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index f0736b3e..cf90ba78 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -15,6 +15,8 @@ from functools import lru_cache from typing import Any +import numpy as np + import blosc2 from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb from blosc2.info import InfoReporter, format_nbytes_info @@ -124,6 +126,19 @@ def __repr__(self) -> str: return f"Batch(len={len(self)}, nbytes={self.nbytes}, cbytes={self.cbytes})" +class BatchStoreItems(Sequence[Any]): + """A read-only flat view over the items stored in a :class:`BatchStore`.""" + + def __init__(self, parent: BatchStore) -> None: + self._parent = parent + + def __getitem__(self, index: int | slice) -> Any | list[Any]: + return self._parent._get_flat_item(index) + + def __len__(self) -> int: + return self._parent._get_total_item_count() + + class BatchStore: """A batched container for variable-length Python items. 
@@ -214,6 +229,8 @@ def _attach_schunk(self, schunk: blosc2.SChunk) -> None: self._arrow_schema = batchstore_meta.get("arrow_schema", self._arrow_schema) self._arrow_schema_obj = None self._batch_lengths = self._load_batch_lengths() + self._items = BatchStoreItems(self) + self._item_prefix_sums: np.ndarray | None = None self._validate_tag() def _maybe_open_existing(self, storage: blosc2.Storage) -> bool: @@ -363,11 +380,49 @@ def _ensure_batch_lengths(self) -> list[int]: self._batch_lengths = [] return self._batch_lengths + def _load_or_compute_batch_lengths(self) -> list[int]: + if self._batch_lengths is None: + self._batch_lengths = [len(self._get_batch(i)) for i in range(len(self))] + if self.mode != "r": + self._persist_batch_lengths() + return self._batch_lengths + def _batch_length(self, index: int) -> int | None: if self._batch_lengths is None: return None return self._batch_lengths[index] + def _invalidate_item_cache(self) -> None: + self._item_prefix_sums = None + + def _get_item_prefix_sums(self) -> np.ndarray: + if self._item_prefix_sums is None: + batch_lengths = np.asarray(self._load_or_compute_batch_lengths(), dtype=np.int64) + prefix_sums = np.empty(len(batch_lengths) + 1, dtype=np.int64) + prefix_sums[0] = 0 + prefix_sums[1:] = np.cumsum(batch_lengths, dtype=np.int64) + self._item_prefix_sums = prefix_sums + return self._item_prefix_sums + + def _get_total_item_count(self) -> int: + return int(self._get_item_prefix_sums()[-1]) + + def _get_flat_item(self, index: int | slice) -> Any | list[Any]: + if isinstance(index, slice): + return [self._get_flat_item(i) for i in range(*index.indices(self._get_total_item_count()))] + if not isinstance(index, int): + raise TypeError("BatchStore item indices must be integers") + nitems = self._get_total_item_count() + if index < 0: + index += nitems + if index < 0 or index >= nitems: + raise IndexError("BatchStore item index out of range") + + prefix_sums = self._get_item_prefix_sums() + batch_index = 
int(np.searchsorted(prefix_sums, index, side="right") - 1) + item_index = int(index - prefix_sums[batch_index]) + return self[batch_index][item_index] + def _block_sizes_from_batch_length(self, batch_length: int, nblocks: int) -> list[int]: if self._max_blocksize is None or nblocks <= 0: return [] @@ -600,6 +655,7 @@ def append(self, value: object) -> int: new_len = self.schunk.append_chunk(batch_payload) self._ensure_batch_lengths().append(length) self._persist_batch_lengths() + self._invalidate_item_cache() return new_len def insert(self, index: int, value: object) -> int: @@ -612,6 +668,7 @@ def insert(self, index: int, value: object) -> int: new_len = self.schunk.insert_chunk(index, batch_payload) self._ensure_batch_lengths().insert(index, length) self._persist_batch_lengths() + self._invalidate_item_cache() return new_len def delete(self, index: int | slice) -> int: @@ -623,12 +680,14 @@ def delete(self, index: int | slice) -> int: if self._batch_lengths is not None: del self._batch_lengths[idx] self._persist_batch_lengths() + self._invalidate_item_cache() return len(self) index = self._normalize_index(index) new_len = self.schunk.delete_chunk(index) if self._batch_lengths is not None: del self._batch_lengths[index] self._persist_batch_lengths() + self._invalidate_item_cache() return new_len def pop(self, index: int = -1) -> list[Any]: @@ -650,6 +709,7 @@ def extend(self, values: object) -> None: self.schunk.append_chunk(batch_payload) self._ensure_batch_lengths().append(self._batch_len(batch)) self._persist_batch_lengths() + self._invalidate_item_cache() def clear(self) -> None: """Remove all entries from the container.""" @@ -667,6 +727,7 @@ def clear(self) -> None: self._attach_schunk(schunk) self._batch_lengths = [] self._persist_batch_lengths() + self._invalidate_item_cache() def __getitem__(self, index: int | slice) -> Batch | list[Batch]: """Return one batch or a list of batches.""" @@ -693,6 +754,7 @@ def __setitem__(self, index: int | slice, value: 
object) -> None: self.schunk.insert_chunk(start + offset, batch_payload) self._ensure_batch_lengths().insert(start + offset, self._batch_len(batch)) self._persist_batch_lengths() + self._invalidate_item_cache() return if len(values) != len(indices): raise ValueError( @@ -705,6 +767,7 @@ def __setitem__(self, index: int | slice, value: object) -> None: if self._batch_lengths is not None: self._batch_lengths[idx] = self._batch_len(batch) self._persist_batch_lengths() + self._invalidate_item_cache() return self._check_writable() index = self._normalize_index(index) @@ -714,6 +777,7 @@ def __setitem__(self, index: int | slice, value: object) -> None: if self._batch_lengths is not None: self._batch_lengths[index] = self._batch_len(batch) self._persist_batch_lengths() + self._invalidate_item_cache() def __delitem__(self, index: int | slice) -> None: self.delete(index) @@ -751,6 +815,10 @@ def dparams(self): def max_blocksize(self) -> int | None: return self._max_blocksize + @property + def items(self) -> BatchStoreItems: + return self._items + @property def typesize(self) -> int: return self.schunk.typesize diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 55b4acdf..5da599ca 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -676,7 +676,15 @@ def get_chunk(self, nchunk: int) -> bytes: def get_vlblock(self, nchunk: int, nblock: int) -> bytes: """Return the decompressed payload of one VL block from a chunk.""" - return super().get_vlblock(nchunk, nblock) + get_vlblock = getattr(super(), "get_vlblock", None) + if get_vlblock is not None: + return get_vlblock(nchunk, nblock) + + block_payloads = blosc2_ext.vldecompress(self.get_chunk(nchunk), **asdict(self.dparams)) + try: + return block_payloads[nblock] + except IndexError as exc: + raise IndexError("VL block index out of range") from exc def delete_chunk(self, nchunk: int) -> int: """Delete the specified chunk from the SChunk. 
diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index c9f520ed..a567f625 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -486,6 +486,63 @@ def test_batchstore_slice_errors(): _ = barray[::0] +@pytest.mark.parametrize( + ("contiguous", "urlpath"), + [ + (False, None), + (True, None), + (True, "test_batchstore_items.b2b"), + (False, "test_batchstore_items_s.b2b"), + ], +) +def test_batchstore_items_accessor(contiguous, urlpath): + blosc2.remove_urlpath(urlpath) + + batches = [["a", "b"], [10, 11, 12], [{"x": 1}], [None, True]] + flat = [item for batch in batches for item in batch] + barray = blosc2.BatchStore(storage=_storage(contiguous, urlpath), max_blocksize=2) + barray.extend(batches) + + assert len(barray.items) == len(flat) + assert barray.items[0] == flat[0] + assert barray.items[3] == flat[3] + assert barray.items[-1] == flat[-1] + assert barray.items[1:6] == flat[1:6] + assert barray.items[::-2] == flat[::-2] + + barray.append(["tail0", "tail1"]) + flat.extend(["tail0", "tail1"]) + assert len(barray.items) == len(flat) + assert barray.items[-2:] == flat[-2:] + + barray.insert(1, ["mid0", "mid1"]) + flat[2:2] = ["mid0", "mid1"] + assert barray.items[:] == flat + + barray[2] = ["replaced"] + batch_start = len(batches[0]) + 2 + flat[batch_start : batch_start + 3] = ["replaced"] + assert barray.items[:] == flat + + del barray[0] + del flat[:2] + assert barray.items[:] == flat + + with pytest.raises(IndexError, match="item index out of range"): + _ = barray.items[len(flat)] + with pytest.raises(TypeError, match="item indices must be integers"): + _ = barray.items[1.5] + with pytest.raises(ValueError): + _ = barray.items[::0] + + if urlpath is not None: + reopened = blosc2.open(urlpath, mode="r") + assert reopened.items[:] == flat + assert reopened.items[2] == flat[2] + + blosc2.remove_urlpath(urlpath) + + def test_batchstore_copy(): urlpath = "test_batchstore_copy.b2b" copy_path = "test_batchstore_copy_out.b2b" From 
c6cccdd35f0080003a995e5513e9bfd6a7e41e38 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 20 Mar 2026 11:46:11 +0100 Subject: [PATCH 29/34] Undo an unnecessary workaround --- src/blosc2/schunk.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 5da599ca..55b4acdf 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -676,15 +676,7 @@ def get_chunk(self, nchunk: int) -> bytes: def get_vlblock(self, nchunk: int, nblock: int) -> bytes: """Return the decompressed payload of one VL block from a chunk.""" - get_vlblock = getattr(super(), "get_vlblock", None) - if get_vlblock is not None: - return get_vlblock(nchunk, nblock) - - block_payloads = blosc2_ext.vldecompress(self.get_chunk(nchunk), **asdict(self.dparams)) - try: - return block_payloads[nblock] - except IndexError as exc: - raise IndexError("VL block index out of range") from exc + return super().get_vlblock(nchunk, nblock) def delete_chunk(self, nchunk: int) -> int: """Delete the specified chunk from the SChunk. 
From 20a958ca7fbf9bdc3d34147f82d694a46dd7f3c9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 20 Mar 2026 12:05:25 +0100 Subject: [PATCH 30/34] Start using L2 cache size for clevel==5 --- src/blosc2/batch_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index cf90ba78..cd71684f 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -568,9 +568,9 @@ def _guess_blocksize(self, payload_sizes: list[int]) -> int: clevel = self.cparams.clevel if clevel == 9: return len(payload_sizes) - if 0 < clevel < 6: + if 0 < clevel < 5: budget = blosc2.cpu_info.get("l1_data_cache_size") - elif 6 <= clevel < 9: + elif 5 <= clevel < 9: budget = blosc2.cpu_info.get("l2_cache_size") else: return len(payload_sizes) From c8c16327b29ec38b2360bf9642ced9c5b701945c Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 20 Mar 2026 13:35:27 +0100 Subject: [PATCH 31/34] Allow using a filename in .b2z as a single argument --- src/blosc2/dict_store.py | 15 ++++++--- tests/test_dict_store.py | 68 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 5 deletions(-) diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 81b577b4..6fe9e7ee 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -85,7 +85,7 @@ class DictStore: >>> schunk.append_data(b"abcd") 4 >>> dstore["/dir1/schunk1"] = schunk # externalized as .b2f if above threshold - >>> dstore.to_b2z() # persist to the zip file; external files are copied in + >>> dstore.to_b2z(filename="my_dstore.b2z") # persist to the zip file; external files are copied in >>> print(sorted(dstore.keys())) ['/dir1/node3', '/dir1/schunk1', '/node1', '/node2'] >>> print(dstore["/node1"][:])) @@ -555,14 +555,19 @@ def to_b2z(self, overwrite=False, filename=None) -> os.PathLike[Any] | str: If True, overwrite the existing b2z file if it exists. Default is False. 
filename : str, optional If provided, use this filename instead of the default b2z file path. + Keyword use is recommended for clarity. Returns ------- filename : str The absolute path to the created b2z file. """ - if self.mode == "r": - raise ValueError("Cannot call to_b2z() on a DictStore opened in read mode.") + if isinstance(overwrite, str | os.PathLike) and filename is None: + filename = overwrite + overwrite = False + + if self.mode == "r" and self.is_zip_store: + raise ValueError("Cannot call to_b2z() on a .b2z DictStore opened in read mode.") b2z_path = self.b2z_path if filename is None else filename if not b2z_path.endswith(".b2z"): @@ -582,7 +587,7 @@ def to_b2z(self, overwrite=False, filename=None) -> os.PathLike[Any] | str: # Sort filepaths by file size from largest to smallest filepaths.sort(key=os.path.getsize, reverse=True) - with zipfile.ZipFile(self.b2z_path, "w", zipfile.ZIP_STORED) as zf: + with zipfile.ZipFile(b2z_path, "w", zipfile.ZIP_STORED) as zf: # Write all files (except estore_path) first (sorted by size) for filepath in filepaths: arcname = os.path.relpath(filepath, self.working_dir) @@ -591,7 +596,7 @@ def to_b2z(self, overwrite=False, filename=None) -> os.PathLike[Any] | str: if os.path.exists(self.estore_path): arcname = os.path.relpath(self.estore_path, self.working_dir) zf.write(self.estore_path, arcname) - return os.path.abspath(self.b2z_path) + return os.path.abspath(b2z_path) def _get_zip_offsets(self) -> dict[str, dict[str, int]]: """Get offset and length of all files in the zip archive.""" diff --git a/tests/test_dict_store.py b/tests/test_dict_store.py index 6a157f72..337ace30 100644 --- a/tests/test_dict_store.py +++ b/tests/test_dict_store.py @@ -114,6 +114,74 @@ def test_to_b2z_and_reopen(populated_dict_store): assert np.all(dstore_read["/nodeB"][:] == np.arange(6)) +def test_to_b2z_from_readonly_b2d(): + b2d_path = "test_to_b2z_from_readonly.b2d" + b2z_path = "test_to_b2z_from_readonly.b2z" + + if 
os.path.exists(b2d_path): + shutil.rmtree(b2d_path) + if os.path.exists(b2z_path): + os.remove(b2z_path) + + with DictStore(b2d_path, mode="w") as dstore: + dstore["/nodeA"] = np.arange(5) + dstore["/nodeB"] = np.arange(6) + + with DictStore(b2d_path, mode="r") as dstore: + packed = dstore.to_b2z(filename=b2z_path) + assert packed.endswith(b2z_path) + + with DictStore(b2z_path, mode="r") as dstore: + assert np.all(dstore["/nodeA"][:] == np.arange(5)) + assert np.all(dstore["/nodeB"][:] == np.arange(6)) + + shutil.rmtree(b2d_path) + os.remove(b2z_path) + + +def test_to_b2z_accepts_positional_filename(): + b2d_path = "test_to_b2z_positional_filename.b2d" + b2z_path = "test_to_b2z_positional_filename.b2z" + + if os.path.exists(b2d_path): + shutil.rmtree(b2d_path) + if os.path.exists(b2z_path): + os.remove(b2z_path) + + with DictStore(b2d_path, mode="w") as dstore: + dstore["/nodeA"] = np.arange(5) + + with DictStore(b2d_path, mode="r") as dstore: + packed = dstore.to_b2z(b2z_path) + assert packed.endswith(b2z_path) + + with DictStore(b2z_path, mode="r") as dstore: + assert np.all(dstore["/nodeA"][:] == np.arange(5)) + + shutil.rmtree(b2d_path) + os.remove(b2z_path) + + +def test_to_b2z_from_readonly_b2z_raises(): + b2z_path = "test_to_b2z_readonly_zip.b2z" + out_path = "test_to_b2z_readonly_zip_out.b2z" + + for path in (b2z_path, out_path): + if os.path.exists(path): + os.remove(path) + + with DictStore(b2z_path, mode="w") as dstore: + dstore["/nodeA"] = np.arange(5) + + with ( + DictStore(b2z_path, mode="r") as dstore, + pytest.raises(ValueError, match=r"\.b2z DictStore opened in read mode"), + ): + dstore.to_b2z(filename=out_path) + + os.remove(b2z_path) + + def test_map_tree_precedence(populated_dict_store): dstore, path = populated_dict_store # Create external file and add to dstore From 5a1cd0fa31c0f34717c40ea15941c23b999fd846 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 20 Mar 2026 14:43:55 +0100 Subject: [PATCH 32/34] Adapt test to new blocksize 
thresholds --- tests/test_batch_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index a567f625..b5299a0c 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -328,11 +328,11 @@ def test_batchstore_respects_explicit_use_dict_and_non_zstd(): assert barray.cparams.use_dict is False -def test_batchstore_guess_max_blocksize_uses_l1_for_low_clevel(monkeypatch): +def test_batchstore_guess_max_blocksize_uses_l2_for_clevel_5(monkeypatch): monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 100) monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 1000) barray = blosc2.BatchStore(cparams={"clevel": 5}) - assert barray._guess_blocksize([30, 30, 30, 30]) == 3 + assert barray._guess_blocksize([30, 30, 30, 30]) == 4 def test_batchstore_guess_max_blocksize_uses_l2_for_mid_clevel(monkeypatch): From 62dc7172e3e45bada0dfe2cc9372992181af04fd Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 20 Mar 2026 18:28:24 +0100 Subject: [PATCH 33/34] Fix BatchStore metadata preservation paths Preserve user vlmeta when BatchStore recreates its empty backing SChunk during initial layout inference, avoid persisting empty batch_lengths metadata that breaks vlmeta.getall() on empty stores, and keep user meta/vlmeta when copy(storage=...) is used. Add BatchStore regression tests covering: - vlmeta preservation during inferred layout initialization - clear()/delete-last on empty stores - metadata preservation on copy(storage=...) 
--- src/blosc2/batch_store.py | 24 ++++++++++--- tests/test_batch_store.py | 72 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 5 deletions(-) diff --git a/src/blosc2/batch_store.py b/src/blosc2/batch_store.py index cd71684f..984f043e 100644 --- a/src/blosc2/batch_store.py +++ b/src/blosc2/batch_store.py @@ -370,6 +370,10 @@ def _load_batch_lengths(self) -> list[int] | None: def _persist_batch_lengths(self) -> None: if self._batch_lengths is None: return + if len(self._batch_lengths) == 0: + if _BATCHSTORE_VLMETA_KEY in self.vlmeta: + del self.vlmeta[_BATCHSTORE_VLMETA_KEY] + return self.schunk.vlmeta[_BATCHSTORE_VLMETA_KEY] = {"batch_lengths": list(self._batch_lengths)} def _get_batch_lengths(self) -> list[int] | None: @@ -542,6 +546,7 @@ def _persist_layout_metadata(self) -> None: if len(self) > 0: return batch_lengths = None if self._batch_lengths is None else list(self._batch_lengths) + user_vlmeta = self._user_vlmeta_items() if len(self.vlmeta) > 0 else {} storage = self._make_storage() fixed_meta = dict(storage.meta or {}) fixed_meta["batchstore"] = { @@ -559,6 +564,8 @@ def _persist_layout_metadata(self) -> None: storage=storage, ) self._attach_schunk(schunk) + for key, value in user_vlmeta.items(): + self.vlmeta[key] = value if batch_lengths is not None and self._batch_lengths is None: self._batch_lengths = batch_lengths @@ -894,17 +901,24 @@ def copy(self, **kwargs: Any) -> BatchStore: kwargs["dparams"] = kwargs.get("dparams", copy.deepcopy(self.dparams)) kwargs["max_blocksize"] = kwargs.get("max_blocksize", self.max_blocksize) kwargs["serializer"] = kwargs.get("serializer", self.serializer) - - if "storage" not in kwargs: + user_vlmeta = self._user_vlmeta_items() if len(self.vlmeta) > 0 else {} + + if "storage" in kwargs: + storage = self._coerce_storage(kwargs["storage"], {}) + fixed_meta = self._copy_meta() + if storage.meta is not None: + fixed_meta.update(storage.meta) + storage.meta = fixed_meta + kwargs["storage"] = 
storage + else: kwargs["meta"] = self._copy_meta() kwargs["contiguous"] = kwargs.get("contiguous", self.schunk.contiguous) if "urlpath" in kwargs and "mode" not in kwargs: kwargs["mode"] = "w" out = BatchStore(**kwargs) - if "storage" not in kwargs and len(self.vlmeta) > 0: - for key, value in self._user_vlmeta_items().items(): - out.vlmeta[key] = value + for key, value in user_vlmeta.items(): + out.vlmeta[key] = value out.extend(self) return out diff --git a/tests/test_batch_store.py b/tests/test_batch_store.py index b5299a0c..9ae83de5 100644 --- a/tests/test_batch_store.py +++ b/tests/test_batch_store.py @@ -143,6 +143,26 @@ def test_batchstore_arrow_ipc_roundtrip(): blosc2.remove_urlpath(urlpath) +def test_batchstore_inferred_layout_preserves_user_vlmeta(): + barray = blosc2.BatchStore() + barray.vlmeta["user"] = {"x": 1} + + barray.append([1, 2, 3]) + + assert barray.vlmeta["user"] == {"x": 1} + + +def test_batchstore_arrow_layout_persistence_preserves_user_vlmeta(): + pa = pytest.importorskip("pyarrow") + + barray = blosc2.BatchStore(serializer="arrow") + barray.vlmeta["user"] = {"x": 1} + + barray.append(pa.array([[1], [2, 3]])) + + assert barray.vlmeta["user"] == {"x": 1} + + def test_batchstore_from_cframe(): barray = blosc2.BatchStore() barray.extend(BATCHES) @@ -233,6 +253,38 @@ def test_batchstore_pop_keeps_batch_lengths_metadata_in_sync(): assert items["nbatches"].startswith("2 (items per batch: mean=2.00") +def test_batchstore_clear_keeps_empty_store_vlmeta_readable(): + urlpath = "test_batchstore_clear_empty_vlmeta.b2b" + blosc2.remove_urlpath(urlpath) + + barray = blosc2.BatchStore(urlpath=urlpath, mode="w", contiguous=True) + barray.append([1, 2, 3]) + barray.clear() + + assert barray.vlmeta.getall() == {} + + reopened = blosc2.open(urlpath, mode="r") + assert reopened.vlmeta.getall() == {} + + blosc2.remove_urlpath(urlpath) + + +def test_batchstore_delete_last_keeps_empty_store_vlmeta_readable(): + urlpath = 
"test_batchstore_delete_last_empty_vlmeta.b2b" + blosc2.remove_urlpath(urlpath) + + barray = blosc2.BatchStore(urlpath=urlpath, mode="w", contiguous=True) + barray.append([1, 2, 3]) + barray.delete(0) + + assert barray.vlmeta.getall() == {} + + reopened = blosc2.open(urlpath, mode="r") + assert reopened.vlmeta.getall() == {} + + blosc2.remove_urlpath(urlpath) + + def test_batchstore_zstd_does_not_use_dict_by_default(): barray = blosc2.BatchStore() assert barray.cparams.codec == blosc2.Codec.ZSTD @@ -573,6 +625,26 @@ def test_batchstore_copy(): blosc2.remove_urlpath(copy_path) +def test_batchstore_copy_with_storage_preserves_user_metadata(): + urlpath = "test_batchstore_copy_storage.b2b" + copy_path = "test_batchstore_copy_storage_out.b2b" + blosc2.remove_urlpath(urlpath) + blosc2.remove_urlpath(copy_path) + + original = blosc2.BatchStore(urlpath=urlpath, mode="w", contiguous=True, meta={"user_meta": {"a": 1}}) + original.vlmeta["user_vlmeta"] = {"b": 2} + original.extend(BATCHES) + + copied = original.copy(storage=blosc2.Storage(contiguous=False, urlpath=copy_path, mode="w")) + + assert [batch[:] for batch in copied] == [batch[:] for batch in original] + assert copied.meta["user_meta"] == {"a": 1} + assert copied.vlmeta["user_vlmeta"] == {"b": 2} + + blosc2.remove_urlpath(urlpath) + blosc2.remove_urlpath(copy_path) + + @pytest.mark.parametrize(("contiguous", "nthreads"), [(False, 2), (True, 4)]) def test_batchstore_multithreaded_inner_vl(contiguous, nthreads): batches = [] From b379a2c2ae643e329945916e3926956f89d8531e Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 20 Mar 2026 18:31:53 +0100 Subject: [PATCH 34/34] Update to latest c-blosc2 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ed060c9..a1a23c22 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - 
GIT_TAG c0f5416f55662fccad861aa0387e965f73f644b4 # variable-length chunks/blocks + GIT_TAG 7cec94ba9d4243ff7d7eb397ef669ec5dd501711 # variable-length chunks/blocks # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2)