From 4866f23379a61e1f0bb5d4e02d04c8d8ff7e9cb0 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Mon, 16 Mar 2026 17:52:28 -0400 Subject: [PATCH 1/9] pluggable compressor first draft Signed-off-by: Connor Tsui --- fuzz/src/array/mod.rs | 15 +- vortex-btrblocks/src/builder.rs | 206 +++--- vortex-btrblocks/src/canonical_compressor.rs | 313 ++++----- vortex-btrblocks/src/compressor/decimal.rs | 4 +- vortex-btrblocks/src/compressor/float/mod.rs | 441 ++++++------- .../src/compressor/integer/mod.rs | 594 ++++++++---------- vortex-btrblocks/src/compressor/mod.rs | 178 +----- vortex-btrblocks/src/compressor/rle.rs | 65 +- vortex-btrblocks/src/compressor/string.rs | 310 ++++----- vortex-btrblocks/src/compressor/temporal.rs | 8 +- vortex-btrblocks/src/ctx.rs | 102 +-- vortex-btrblocks/src/lib.rs | 64 +- vortex-btrblocks/src/scheme.rs | 181 +++--- vortex-btrblocks/src/stats_cache.rs | 64 ++ vortex-file/src/strategy.rs | 32 +- vortex-layout/src/layouts/compressed.rs | 5 +- vortex/src/lib.rs | 5 +- 17 files changed, 1142 insertions(+), 1445 deletions(-) create mode 100644 vortex-btrblocks/src/stats_cache.rs diff --git a/fuzz/src/array/mod.rs b/fuzz/src/array/mod.rs index 0b101b91d8f..197d5b441e0 100644 --- a/fuzz/src/array/mod.rs +++ b/fuzz/src/array/mod.rs @@ -61,9 +61,10 @@ use vortex_array::search_sorted::SearchSorted; use vortex_array::search_sorted::SearchSortedSide; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; -use vortex_btrblocks::FloatCode; -use vortex_btrblocks::IntCode; -use vortex_btrblocks::StringCode; +use vortex_btrblocks::Scheme; +use vortex_btrblocks::compressor::float; +use vortex_btrblocks::compressor::integer; +use vortex_btrblocks::compressor::string; use vortex_error::VortexExpect; use vortex_error::vortex_panic; use vortex_mask::Mask; @@ -546,9 +547,11 @@ pub fn compress_array(array: &ArrayRef, strategy: CompressorStrategy) -> ArrayRe .compress(array) .vortex_expect("BtrBlocksCompressor compress should 
succeed in fuzz test"), CompressorStrategy::Compact => BtrBlocksCompressorBuilder::default() - .include_string([StringCode::Zstd]) - .include_int([IntCode::Pco]) - .include_float([FloatCode::Pco]) + .include([ + string::ZstdScheme.id(), + integer::PcoScheme.id(), + float::PcoScheme.id(), + ]) .build() .compress(array) .vortex_expect("Compact compress should succeed in fuzz test"), diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index d329ec8c139..6b88f759f7f 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -3,156 +3,152 @@ //! Builder for configuring `BtrBlocksCompressor` instances. -use itertools::Itertools; use vortex_utils::aliases::hash_set::HashSet; use crate::BtrBlocksCompressor; -use crate::FloatCode; -use crate::IntCode; -use crate::StringCode; -use crate::compressor::float::ALL_FLOAT_SCHEMES; -use crate::compressor::float::FloatScheme; -use crate::compressor::integer::ALL_INT_SCHEMES; -use crate::compressor::integer::IntegerScheme; -use crate::compressor::string::ALL_STRING_SCHEMES; -use crate::compressor::string::StringScheme; +use crate::Scheme; +use crate::SchemeId; + +/// All available compression schemes. +/// +/// This list is order-sensitive: the builder preserves this order when constructing +/// the final scheme list, so that tie-breaking is deterministic. +pub const ALL_SCHEMES: &[&dyn Scheme] = &[ + // Integer schemes. + &crate::compressor::integer::UncompressedScheme as &dyn Scheme, + &crate::compressor::integer::ConstantScheme, + &crate::compressor::integer::FORScheme, + &crate::compressor::integer::ZigZagScheme, + &crate::compressor::integer::BitPackingScheme, + &crate::compressor::integer::SparseScheme, + &crate::compressor::integer::DictScheme, + &crate::compressor::integer::RunEndScheme, + &crate::compressor::integer::SequenceScheme, + &crate::compressor::integer::RLE_INTEGER_SCHEME, + #[cfg(feature = "pco")] + &crate::compressor::integer::PcoScheme, + // Float schemes. 
+ &crate::compressor::float::UncompressedScheme, + &crate::compressor::float::ConstantScheme, + &crate::compressor::float::ALPScheme, + &crate::compressor::float::ALPRDScheme, + &crate::compressor::float::DictScheme, + &crate::compressor::float::NullDominated, + &crate::compressor::float::RLE_FLOAT_SCHEME, + #[cfg(feature = "pco")] + &crate::compressor::float::PcoScheme, + // String schemes. + &crate::compressor::string::UncompressedScheme, + &crate::compressor::string::DictScheme, + &crate::compressor::string::FSSTScheme, + &crate::compressor::string::ConstantScheme, + &crate::compressor::string::NullDominated, + #[cfg(feature = "zstd")] + &crate::compressor::string::ZstdScheme, + #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] + &crate::compressor::string::ZstdBuffersScheme, +]; + +/// Schemes excluded by default (behind feature gates that are off or known-expensive). +const DEFAULT_EXCLUDED: &[SchemeId] = &[ + #[cfg(feature = "pco")] + SchemeId { + name: "vortex.int.pco", + }, + #[cfg(feature = "pco")] + SchemeId { + name: "vortex.float.pco", + }, + #[cfg(feature = "zstd")] + SchemeId { + name: "vortex.string.zstd", + }, + #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] + SchemeId { + name: "vortex.string.zstd_buffers", + }, +]; /// Builder for creating configured [`BtrBlocksCompressor`] instances. /// -/// Use this builder to configure which compression schemes are allowed for each data type. -/// By default, all schemes are enabled. +/// Use this builder to configure which compression schemes are allowed. +/// By default, all schemes are enabled except those in [`DEFAULT_EXCLUDED`]. /// /// # Examples /// /// ```rust -/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, IntCode, FloatCode}; +/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, Scheme}; +/// use vortex_btrblocks::compressor::integer::DictScheme; /// -/// // Default compressor - all schemes allowed +/// // Default compressor - all non-excluded schemes allowed. 
/// let compressor = BtrBlocksCompressorBuilder::default().build(); /// -/// // Exclude specific schemes +/// // Exclude specific schemes. /// let compressor = BtrBlocksCompressorBuilder::default() -/// .exclude_int([IntCode::Dict]) +/// .exclude([DictScheme.id()]) /// .build(); /// -/// // Exclude then re-include +/// // Exclude then re-include. /// let compressor = BtrBlocksCompressorBuilder::default() -/// .exclude_int([IntCode::Dict, IntCode::Rle]) -/// .include_int([IntCode::Dict]) +/// .exclude([DictScheme.id()]) +/// .include([DictScheme.id()]) /// .build(); /// ``` #[derive(Debug, Clone)] pub struct BtrBlocksCompressorBuilder { - int_schemes: HashSet<&'static dyn IntegerScheme>, - float_schemes: HashSet<&'static dyn FloatScheme>, - string_schemes: HashSet<&'static dyn StringScheme>, + schemes: HashSet<&'static dyn Scheme>, } impl Default for BtrBlocksCompressorBuilder { fn default() -> Self { + let excluded: HashSet = DEFAULT_EXCLUDED.iter().copied().collect(); Self { - int_schemes: ALL_INT_SCHEMES - .iter() - .copied() - .filter(|s| s.code() != IntCode::Pco) - .collect(), - float_schemes: ALL_FLOAT_SCHEMES - .iter() - .copied() - .filter(|s| s.code() != FloatCode::Pco) - .collect(), - string_schemes: ALL_STRING_SCHEMES + schemes: ALL_SCHEMES .iter() .copied() - .filter(|s| s.code() != StringCode::Zstd && s.code() != StringCode::ZstdBuffers) + .filter(|s| !excluded.contains(&s.id())) .collect(), } } } impl BtrBlocksCompressorBuilder { - /// Create a new builder with no encodings enabled. - pub fn empty() -> Self { - Self { - int_schemes: Default::default(), - float_schemes: Default::default(), - string_schemes: Default::default(), - } - } - - /// Excludes the specified integer compression schemes. - pub fn exclude_int(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - self.int_schemes.retain(|s| !codes.contains(&s.code())); + /// Excludes the specified compression schemes by their [`SchemeId`]. 
+ pub fn exclude(mut self, ids: impl IntoIterator) -> Self { + let ids: HashSet<_> = ids.into_iter().collect(); + self.schemes.retain(|s| !ids.contains(&s.id())); self } - /// Excludes the specified float compression schemes. - pub fn exclude_float(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - self.float_schemes.retain(|s| !codes.contains(&s.code())); - self - } - - /// Excludes the specified string compression schemes. - pub fn exclude_string(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - self.string_schemes.retain(|s| !codes.contains(&s.code())); - self - } - - /// Includes the specified integer compression schemes. - pub fn include_int(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - for scheme in ALL_INT_SCHEMES { - if codes.contains(&scheme.code()) { - self.int_schemes.insert(*scheme); + /// Includes the specified compression schemes by their [`SchemeId`]. + /// + /// Only schemes present in [`ALL_SCHEMES`] can be included. + pub fn include(mut self, ids: impl IntoIterator) -> Self { + let ids: HashSet<_> = ids.into_iter().collect(); + for scheme in ALL_SCHEMES { + if ids.contains(&scheme.id()) { + self.schemes.insert(*scheme); } } self } - /// Includes the specified float compression schemes. - pub fn include_float(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - for scheme in ALL_FLOAT_SCHEMES { - if codes.contains(&scheme.code()) { - self.float_schemes.insert(*scheme); - } - } + /// Adds a single scheme to the builder. + pub fn with_scheme(mut self, scheme: &'static dyn Scheme) -> Self { + self.schemes.insert(scheme); self } - /// Includes the specified string compression schemes. 
- pub fn include_string(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - for scheme in ALL_STRING_SCHEMES { - if codes.contains(&scheme.code()) { - self.string_schemes.insert(*scheme); - } - } - self - } - - /// Builds the configured `BtrBlocksCompressor`. + /// Builds the configured [`BtrBlocksCompressor`]. + /// + /// The resulting scheme list preserves the order of [`ALL_SCHEMES`] for deterministic + /// tie-breaking. pub fn build(self) -> BtrBlocksCompressor { - // Note we should apply the schemes in the same order, in case try conflict. - BtrBlocksCompressor { - int_schemes: self - .int_schemes - .into_iter() - .sorted_by_key(|s| s.code()) - .collect_vec(), - float_schemes: self - .float_schemes - .into_iter() - .sorted_by_key(|s| s.code()) - .collect_vec(), - string_schemes: self - .string_schemes - .into_iter() - .sorted_by_key(|s| s.code()) - .collect_vec(), - } + let schemes = ALL_SCHEMES + .iter() + .copied() + .filter(|s| self.schemes.contains(s)) + .collect(); + BtrBlocksCompressor { schemes } } } diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 410dda0b599..682af7a1c19 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -23,46 +23,19 @@ use vortex_array::arrays::listview::list_from_list_view; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::extension::datetime::TemporalMetadata; +use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; use vortex_error::VortexResult; use crate::BtrBlocksCompressorBuilder; use crate::CompressorContext; -use crate::CompressorExt; -use crate::Excludes; -use crate::FloatCompressor; -use crate::IntCode; -use crate::IntCompressor; -use crate::StringCompressor; +use crate::Scheme; +use crate::SchemeId; +use crate::StatsCache; use crate::compressor::decimal::compress_decimal; -use 
crate::compressor::float::FloatScheme; -use crate::compressor::integer::IntegerScheme; -use crate::compressor::string::StringScheme; +use crate::compressor::integer::DictScheme as IntDictScheme; use crate::compressor::temporal::compress_temporal; -/// Trait for compressors that can compress canonical arrays. -/// -/// Provides access to configured compression schemes and the ability to -/// compress canonical arrays recursively. -pub trait CanonicalCompressor { - /// Compresses a canonical array with the specified options. - fn compress_canonical( - &self, - array: Canonical, - ctx: CompressorContext, - excludes: Excludes, - ) -> VortexResult; - - /// Returns the enabled integer compression schemes. - fn int_schemes(&self) -> &[&'static dyn IntegerScheme]; - - /// Returns the enabled float compression schemes. - fn float_schemes(&self) -> &[&'static dyn FloatScheme]; - - /// Returns the enabled string compression schemes. - fn string_schemes(&self) -> &[&'static dyn StringScheme]; -} - /// The main compressor type implementing BtrBlocks-inspired compression. /// /// This compressor applies adaptive compression schemes to arrays based on their data types @@ -70,36 +43,31 @@ pub trait CanonicalCompressor { /// and chooses optimal compression schemes for primitive types. /// /// The compressor works by: -/// 1. Canonicalizing input arrays to a standard representation -/// 2. Analyzing data characteristics to choose optimal compression schemes -/// 3. Recursively compressing nested structures -/// 4. Applying type-specific compression for primitives, strings, and temporal data +/// 1. Canonicalizing input arrays to a standard representation. +/// 2. Pre-filtering schemes by [`Scheme::matches`] and excludes. +/// 3. Evaluating each matching scheme's compression ratio on a sample. +/// 4. Compressing with the best scheme and verifying the result is smaller. /// /// Use [`BtrBlocksCompressorBuilder`] to configure which compression schemes are enabled. 
/// /// # Examples /// /// ```rust -/// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, IntCode}; +/// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme}; +/// use vortex_btrblocks::compressor::integer::DictScheme; /// -/// // Default compressor - all schemes allowed +/// // Default compressor - all schemes allowed. /// let compressor = BtrBlocksCompressor::default(); /// -/// // Exclude specific schemes using the builder +/// // Exclude specific schemes using the builder. /// let compressor = BtrBlocksCompressorBuilder::default() -/// .exclude_int([IntCode::Dict]) +/// .exclude([DictScheme.id()]) /// .build(); /// ``` #[derive(Clone)] pub struct BtrBlocksCompressor { - /// Integer compressor with configured schemes. - pub int_schemes: Vec<&'static dyn IntegerScheme>, - - /// Float compressor with configured schemes. - pub float_schemes: Vec<&'static dyn FloatScheme>, - - /// String compressor with configured schemes. - pub string_schemes: Vec<&'static dyn StringScheme>, + /// The enabled compression schemes. + pub schemes: Vec<&'static dyn Scheme>, } impl Default for BtrBlocksCompressor { @@ -113,117 +81,32 @@ impl BtrBlocksCompressor { /// /// First canonicalizes and compacts the array, then applies optimal compression schemes. pub fn compress(&self, array: &ArrayRef) -> VortexResult { - // Canonicalize the array - // TODO(joe): receive `ctx` and use it. let canonical = array .clone() .execute::(&mut LEGACY_SESSION.create_execution_ctx())? .0; - // Compact it, removing any wasted space before we attempt to compress it + // Compact it, removing any wasted space before we attempt to compress it. 
let compact = canonical.compact()?; - self.compress_canonical(compact, CompressorContext::default(), Excludes::none()) - } - - pub(crate) fn integer_compressor(&self) -> IntCompressor<'_> { - IntCompressor { - btr_blocks_compressor: self, - } - } - - pub(crate) fn float_compressor(&self) -> FloatCompressor<'_> { - FloatCompressor { - btr_blocks_compressor: self, - } - } - - pub(crate) fn string_compressor(&self) -> StringCompressor<'_> { - StringCompressor { - btr_blocks_compressor: self, - } - } - - /// Compresses a [`ListArray`] by narrowing offsets and recursively compressing elements. - fn compress_list_array( - &self, - list_array: ListArray, - ctx: CompressorContext, - ) -> VortexResult { - // Reset the offsets to remove garbage data that might prevent us from narrowing our - // offsets (there could be a large amount of trailing garbage data that the current - // views do not reference at all). - let list_array = list_array.reset_offsets(true)?; - - let compressed_elems = self.compress(list_array.elements())?; - - // Note that since the type of our offsets are not encoded in our `DType`, and since - // we guarantee above that all elements are referenced by offsets, we may narrow the - // widths. - let compressed_offsets = self.compress_canonical( - Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), - ctx, - Excludes::from(&[IntCode::Dict]), - )?; - - Ok(ListArray::try_new( - compressed_elems, - compressed_offsets, - list_array.validity().clone(), - )? - .into_array()) + self.compress_canonical(compact, CompressorContext::default(), &[]) } - /// Compresses a [`ListViewArray`] by narrowing offsets/sizes and recursively compressing - /// elements. 
- fn compress_list_view_array( - &self, - list_view: ListViewArray, - ctx: CompressorContext, - ) -> VortexResult { - let compressed_elems = self.compress(list_view.elements())?; - let compressed_offsets = self.compress_canonical( - Canonical::Primitive(list_view.offsets().to_primitive().narrow()?), - ctx, - Excludes::none(), - )?; - let compressed_sizes = self.compress_canonical( - Canonical::Primitive(list_view.sizes().to_primitive().narrow()?), - ctx, - Excludes::none(), - )?; - Ok(ListViewArray::try_new( - compressed_elems, - compressed_offsets, - compressed_sizes, - list_view.validity().clone(), - )? - .into_array()) - } -} - -impl CanonicalCompressor for BtrBlocksCompressor { - /// Compresses a canonical array by dispatching to type-specific compressors. + /// Compresses a canonical array by dispatching to type-specific logic. /// - /// Recursively compresses nested structures and applies optimal schemes for each data type. - fn compress_canonical( + /// For primitives and strings this calls [`choose_and_compress`](Self::choose_and_compress). + /// For compound types it recurses into children. + pub(crate) fn compress_canonical( &self, array: Canonical, ctx: CompressorContext, - excludes: Excludes, + excludes: &[SchemeId], ) -> VortexResult { match array { Canonical::Null(null_array) => Ok(null_array.into_array()), - // TODO(aduffy): Sparse, other bool compressors. 
Canonical::Bool(bool_array) => Ok(bool_array.into_array()), Canonical::Primitive(primitive) => { - if primitive.ptype().is_int() { - self.integer_compressor() - .compress(self, &primitive, ctx, excludes.int) - } else { - self.float_compressor() - .compress(self, &primitive, ctx, excludes.float) - } + self.choose_and_compress(Canonical::Primitive(primitive), ctx, excludes) } Canonical::Decimal(decimal) => compress_decimal(self, &decimal), Canonical::Struct(struct_array) => { @@ -243,8 +126,6 @@ impl CanonicalCompressor for BtrBlocksCompressor { } Canonical::List(list_view_array) => { if list_view_array.is_zero_copy_to_list() || list_view_array.elements().is_empty() { - // Offsets are already monotonic and non-overlapping, so we - // can drop the sizes array and compress as a ListArray. let list_array = list_from_list_view(list_view_array)?; self.compress_list_array(list_array, ctx) } else { @@ -267,15 +148,13 @@ impl CanonicalCompressor for BtrBlocksCompressor { .dtype() .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) { - self.string_compressor() - .compress(self, &strings, ctx, excludes.string) + self.choose_and_compress(Canonical::VarBinView(strings), ctx, excludes) } else { - // Binary arrays do not compress + // Binary arrays do not compress. Ok(strings.into_array()) } } Canonical::Extension(ext_array) => { - // We compress Timestamp-level arrays with DateTimeParts compression if let Ok(temporal_array) = TemporalArray::try_from(ext_array.clone().into_array()) && let TemporalMetadata::Timestamp(..) = temporal_array.temporal_metadata() { @@ -301,17 +180,145 @@ impl CanonicalCompressor for BtrBlocksCompressor { } } - fn int_schemes(&self) -> &[&'static dyn IntegerScheme] { - &self.int_schemes + /// Filters eligible schemes, evaluates their compression ratios, and compresses with the + /// best one. 
+ fn choose_and_compress( + &self, + canonical: Canonical, + ctx: CompressorContext, + excludes: &[SchemeId], + ) -> VortexResult { + let eligible: Vec<&'static dyn Scheme> = self + .schemes + .iter() + .copied() + .filter(|s| s.matches(&canonical) && !excludes.contains(&s.id())) + .collect(); + + let array: ArrayRef = canonical.into(); + + // Nothing to compress if empty or all-null. + if array.is_empty() { + return Ok(array); + } + + if array.all_invalid()? { + return Ok( + ConstantArray::new(Scalar::null(array.dtype().clone()), array.len()).into_array(), + ); + } + + let before_nbytes = array.nbytes(); + let mut cache = StatsCache::new(); + + if let Some(winner) = self.choose_scheme(&eligible, &array, ctx, &mut cache, excludes)? { + let compressed = winner.compress(self, &array, ctx, &mut cache, excludes)?; + if compressed.nbytes() < before_nbytes { + return Ok(compressed); + } + } + + // No scheme improved on the original. + Ok(array) } - fn float_schemes(&self) -> &[&'static dyn FloatScheme] { - &self.float_schemes + /// Evaluates each candidate scheme and returns the one with the best compression ratio + /// (must be > 1.0). + fn choose_scheme( + &self, + schemes: &[&'static dyn Scheme], + array: &ArrayRef, + ctx: CompressorContext, + cache: &mut StatsCache, + excludes: &[SchemeId], + ) -> VortexResult> { + let mut best: Option<(&'static dyn Scheme, f64)> = None; + + for &scheme in schemes { + let ratio = self.evaluate_scheme(scheme, array, ctx, cache, excludes)?; + if is_valid_ratio(ratio) && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) { + best = Some((scheme, ratio)); + } + } + + Ok(best.map(|(s, _)| s)) } - fn string_schemes(&self) -> &[&'static dyn StringScheme] { - &self.string_schemes + /// Evaluates a single scheme's expected compression ratio with tracing. 
+ fn evaluate_scheme( + &self, + scheme: &'static dyn Scheme, + array: &ArrayRef, + ctx: CompressorContext, + cache: &mut StatsCache, + excludes: &[SchemeId], + ) -> VortexResult { + let ratio = scheme.expected_compression_ratio(self, array, ctx, cache, excludes)?; + + tracing::debug!( + scheme = %scheme.id(), + ratio, + "evaluated compression ratio" + ); + + Ok(ratio) } + + /// Compresses a [`ListArray`] by narrowing offsets and recursively compressing elements. + fn compress_list_array( + &self, + list_array: ListArray, + ctx: CompressorContext, + ) -> VortexResult { + let list_array = list_array.reset_offsets(true)?; + + let compressed_elems = self.compress(list_array.elements())?; + + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), + ctx, + &[IntDictScheme.id()], + )?; + + Ok(ListArray::try_new( + compressed_elems, + compressed_offsets, + list_array.validity().clone(), + )? + .into_array()) + } + + /// Compresses a [`ListViewArray`] by narrowing offsets/sizes and recursively compressing + /// elements. + fn compress_list_view_array( + &self, + list_view: ListViewArray, + ctx: CompressorContext, + ) -> VortexResult { + let compressed_elems = self.compress(list_view.elements())?; + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_view.offsets().to_primitive().narrow()?), + ctx, + &[], + )?; + let compressed_sizes = self.compress_canonical( + Canonical::Primitive(list_view.sizes().to_primitive().narrow()?), + ctx, + &[], + )?; + Ok(ListViewArray::try_new( + compressed_elems, + compressed_offsets, + compressed_sizes, + list_view.validity().clone(), + )? + .into_array()) + } +} + +/// Returns `true` if the ratio is a usable finite number (not NaN, infinity, or subnormal). 
+fn is_valid_ratio(ratio: f64) -> bool { + ratio.is_finite() && !ratio.is_subnormal() } #[cfg(test)] diff --git a/vortex-btrblocks/src/compressor/decimal.rs b/vortex-btrblocks/src/compressor/decimal.rs index bf738a72839..e9985c90f81 100644 --- a/vortex-btrblocks/src/compressor/decimal.rs +++ b/vortex-btrblocks/src/compressor/decimal.rs @@ -13,9 +13,7 @@ use vortex_decimal_byte_parts::DecimalBytePartsArray; use vortex_error::VortexResult; use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; use crate::CompressorContext; -use crate::Excludes; // TODO(joe): add support splitting i128/256 buffers into chunks primitive values for compression. // 2 for i128 and 4 for i256 @@ -36,7 +34,7 @@ pub fn compress_decimal( let compressed = compressor.compress_canonical( Canonical::Primitive(prim), CompressorContext::default(), - Excludes::none(), + &[], )?; DecimalBytePartsArray::try_new(compressed, decimal.decimal_dtype()).map(|d| d.into_array()) diff --git a/vortex-btrblocks/src/compressor/float/mod.rs b/vortex-btrblocks/src/compressor/float/mod.rs index 57bb4dc65f3..7bbf78f1133 100644 --- a/vortex-btrblocks/src/compressor/float/mod.rs +++ b/vortex-btrblocks/src/compressor/float/mod.rs @@ -4,10 +4,6 @@ pub(crate) mod dictionary; pub(super) mod stats; -use std::hash::Hash; -use std::hash::Hasher; - -use enum_iterator::Sequence; use vortex_alp::ALP; use vortex_alp::ALPArray; use vortex_alp::RDEncoder; @@ -19,11 +15,10 @@ use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::DictArray; use vortex_array::arrays::MaskedArray; -use vortex_array::arrays::Primitive; +use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::DictArrayParts; use vortex_array::dtype::PType; use vortex_array::scalar::Scalar; -use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityHelper; use vortex_error::VortexResult; use vortex_error::vortex_panic; @@ -34,135 +29,46 @@ use self::dictionary::dictionary_encode; pub use 
self::stats::FloatStats; use super::integer::DictScheme as IntDictScheme; use super::integer::RunEndScheme as IntRunEndScheme; +use super::integer::SequenceScheme as IntSequenceScheme; use super::integer::SparseScheme as IntSparseScheme; use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; -use crate::Compressor; use crate::CompressorContext; use crate::CompressorStats; -use crate::Excludes; use crate::GenerateStatsOptions; -use crate::IntCode; use crate::Scheme; -use crate::SchemeExt; +use crate::SchemeId; +use crate::StatsCache; use crate::compressor::patches::compress_patches; use crate::compressor::rle; use crate::compressor::rle::RLEScheme; +use crate::scheme::estimate_compression_ratio_with_sampling; -pub trait FloatScheme: Scheme + Send + Sync {} - -impl FloatScheme for T where T: Scheme + Send + Sync -{} - -impl PartialEq for dyn FloatScheme { - fn eq(&self, other: &Self) -> bool { - self.code() == other.code() - } -} - -impl Eq for dyn FloatScheme {} - -impl Hash for dyn FloatScheme { - fn hash(&self, state: &mut H) { - self.code().hash(state) - } -} - -/// All available float compression schemes. -pub const ALL_FLOAT_SCHEMES: &[&dyn FloatScheme] = &[ - &UncompressedScheme, - &ConstantScheme, - &ALPScheme, - &ALPRDScheme, - &DictScheme, - &NullDominated, - &RLE_FLOAT_SCHEME, - #[cfg(feature = "pco")] - &PcoScheme, -]; - -/// [`Compressor`] for floating-point numbers. -#[derive(Clone, Copy)] -pub struct FloatCompressor<'a> { - /// Reference to the parent compressor. 
- pub btr_blocks_compressor: &'a dyn CanonicalCompressor, -} - -impl<'a> Compressor for FloatCompressor<'a> { - type ArrayVTable = Primitive; - type SchemeType = dyn FloatScheme; - type StatsType = FloatStats; - - fn gen_stats(&self, array: &::Array) -> Self::StatsType { - if self - .btr_blocks_compressor - .float_schemes() - .iter() - .any(|s| s.code() == DictScheme.code()) - { - FloatStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - } else { - FloatStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: false, - }, - ) - } - } - - fn schemes(&self) -> &[&'static dyn FloatScheme] { - self.btr_blocks_compressor.float_schemes() - } - - fn default_scheme(&self) -> &'static Self::SchemeType { - &UncompressedScheme - } -} - -/// Unique identifier for float compression schemes. -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence, Ord, PartialOrd)] -pub enum FloatCode { - /// No compression applied. - Uncompressed, - /// Constant encoding for arrays with a single distinct value. - Constant, - /// ALP (Adaptive Lossless floating-Point) encoding. - Alp, - /// ALPRD (ALP with Right Division) encoding variant. - AlpRd, - /// Dictionary encoding for low-cardinality float values. - Dict, - /// Run-end encoding. - RunEnd, - /// RLE encoding - generic run-length encoding. - Rle, - /// Sparse encoding for null-dominated arrays. - Sparse, - /// Pco (pcodec) compression for floats. - Pco, +/// Returns `true` if the canonical form represents a floating-point primitive. +fn is_float_primitive(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Primitive(p) if !p.ptype().is_int()) } +/// Uncompressed passthrough for floating-point arrays. #[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct UncompressedScheme; +pub struct UncompressedScheme; +/// Constant encoding for arrays with a single distinct float value. 
#[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct ConstantScheme; +pub struct ConstantScheme; +/// ALP (Adaptive Lossless floating-Point) encoding. #[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct ALPScheme; +pub struct ALPScheme; +/// ALPRD (ALP with Right Division) encoding variant. #[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct ALPRDScheme; +pub struct ALPRDScheme; +/// Dictionary encoding for low-cardinality float values. #[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct DictScheme; +pub struct DictScheme; +/// Sparse encoding for null-dominated float arrays. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct NullDominated; @@ -177,17 +83,24 @@ pub struct FloatRLEConfig; impl rle::RLEConfig for FloatRLEConfig { type Stats = FloatStats; - type Code = FloatCode; - const CODE: FloatCode = FloatCode::Rle; + const SCHEME_NAME: &'static str = "vortex.float.rle"; + + fn matches(canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn generate_stats(array: &ArrayRef) -> FloatStats { + FloatStats::generate(&array.to_primitive()) + } fn compress_values( compressor: &BtrBlocksCompressor, - values: &vortex_array::arrays::PrimitiveArray, + values: &PrimitiveArray, ctx: CompressorContext, - excludes: &[FloatCode], + excludes: &[SchemeId], ) -> VortexResult { - compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes.into()) + compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes) } } @@ -195,59 +108,71 @@ impl rle::RLEConfig for FloatRLEConfig { pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); impl Scheme for UncompressedScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; + fn scheme_name(&self) -> &'static str { + "vortex.float.uncompressed" + } - fn code(&self) -> FloatCode { - FloatCode::Uncompressed + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) } fn expected_compression_ratio( &self, _compressor: 
&BtrBlocksCompressor, - _stats: &Self::StatsType, + _array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[FloatCode], + _cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { Ok(1.0) } fn compress( &self, - _btr_blocks_compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + _compressor: &BtrBlocksCompressor, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[FloatCode], + _cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { - Ok(stats.source().clone().into_array()) + Ok(array.clone()) } } impl Scheme for ConstantScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; + fn scheme_name(&self) -> &'static str { + "vortex.float.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } - fn code(&self) -> FloatCode { - FloatCode::Constant + fn is_constant(&self) -> bool { + true } fn expected_compression_ratio( &self, - _btr_blocks_compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + _compressor: &BtrBlocksCompressor, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[FloatCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { - // Never select Constant when sampling + // Never select Constant when sampling. if ctx.is_sample { return Ok(0.0); } + let stats = + cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + if stats.null_count as usize == stats.src.len() || stats.value_count == 0 { return Ok(0.0); } - // Can only have 1 distinct value + // Can only have 1 distinct value. 
if stats.distinct_values_count != 1 { return Ok(0.0); } @@ -257,11 +182,15 @@ impl Scheme for ConstantScheme { fn compress( &self, - _btr_blocks_compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + _compressor: &BtrBlocksCompressor, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[FloatCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = + cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let scalar_idx = (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); @@ -285,21 +214,26 @@ impl Scheme for ConstantScheme { } impl Scheme for ALPScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; + fn scheme_name(&self) -> &'static str { + "vortex.float.alp" + } - fn code(&self) -> FloatCode { - FloatCode::Alp + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) } fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[FloatCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { - // We don't support ALP for f16 + let stats = + cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + + // We don't support ALP for f16. 
if stats.source().ptype() == PType::F16 { return Ok(0.0); } @@ -310,16 +244,20 @@ impl Scheme for ALPScheme { return Ok(0.0); } - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &FloatStats, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[FloatCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { + let stats = + cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let alp_encoded = alp_encode(&stats.source().to_primitive(), None)?; let alp = alp_encoded.as_::(); let alp_ints = alp.encoded().to_primitive(); @@ -327,18 +265,18 @@ impl Scheme for ALPScheme { // Compress the ALP ints. // Patches are not compressed. They should be infrequent, and if they are not then we want // to keep them linear for easy indexing. - let mut int_excludes = Vec::new(); - if excludes.contains(&FloatCode::Dict) { - int_excludes.push(IntDictScheme.code()); + let mut new_excludes = Vec::new(); + if excludes.contains(&DictScheme.id()) { + new_excludes.push(IntDictScheme.id()); } - if excludes.contains(&FloatCode::RunEnd) { - int_excludes.push(IntRunEndScheme.code()); + if excludes.contains(&RLE_FLOAT_SCHEME.id()) { + new_excludes.push(IntRunEndScheme.id()); } let compressed_alp_ints = compressor.compress_canonical( Canonical::Primitive(alp_ints), ctx.descend(), - Excludes::int_only(&int_excludes), + &new_excludes, )?; let patches = alp.patches().map(compress_patches).transpose()?; @@ -348,34 +286,43 @@ impl Scheme for ALPScheme { } impl Scheme for ALPRDScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; + fn scheme_name(&self) -> &'static str { + "vortex.float.alprd" + } - fn code(&self) -> FloatCode { - FloatCode::AlpRd + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) } fn expected_compression_ratio( 
&self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[FloatCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { + let stats = + cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + if stats.source().ptype() == PType::F16 { return Ok(0.0); } - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[FloatCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = + cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let encoder = match stats.source().ptype() { PType::F32 => RDEncoder::new(stats.source().as_slice::()), PType::F64 => RDEncoder::new(stats.source().as_slice::()), @@ -395,20 +342,31 @@ impl Scheme for ALPRDScheme { } impl Scheme for DictScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; + fn scheme_name(&self) -> &'static str { + "vortex.float.dict" + } - fn code(&self) -> FloatCode { - FloatCode::Dict + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) } fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[FloatCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { + let stats = cache.get_or_insert_with::(|| { + FloatStats::generate_opts( + &array.to_primitive(), + GenerateStatsOptions { + count_distinct_values: true, + }, + ) + }); + if stats.value_count == 0 { return Ok(0.0); } @@ -419,16 +377,26 @@ impl Scheme for DictScheme { } // Take a sample and run compression on the sample to determine before/after size. 
- self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[Self::CodeType], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache.get_or_insert_with::(|| { + FloatStats::generate_opts( + &array.to_primitive(), + GenerateStatsOptions { + count_distinct_values: true, + }, + ) + }); + let dict = dictionary_encode(stats); let has_all_values_referenced = dict.has_all_values_referenced(); let DictArrayParts { codes, values, .. } = dict.into_parts(); @@ -436,17 +404,17 @@ impl Scheme for DictScheme { let compressed_codes = compressor.compress_canonical( Canonical::Primitive(codes.to_primitive()), ctx.descend(), - Excludes::int_only(&[IntCode::Dict, IntCode::Sequence]), + &[IntDictScheme.id(), IntSequenceScheme.id()], )?; assert!(values.is_canonical()); let compressed_values = compressor.compress_canonical( Canonical::Primitive(values.to_primitive()), ctx.descend(), - Excludes::from(&[FloatCode::Dict]), + &[DictScheme.id()], )?; - // SAFETY: compressing codes or values does not alter the invariants + // SAFETY: compressing codes or values does not alter the invariants. 
unsafe { Ok( DictArray::new_unchecked(compressed_codes, compressed_values) @@ -458,27 +426,32 @@ impl Scheme for DictScheme { } impl Scheme for NullDominated { - type StatsType = FloatStats; - type CodeType = FloatCode; + fn scheme_name(&self) -> &'static str { + "vortex.float.sparse" + } - fn code(&self) -> Self::CodeType { - FloatCode::Sparse + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) } fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[Self::CodeType], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. if ctx.allowed_cascading == 0 { return Ok(0.0); } + let stats = + cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + if stats.value_count == 0 { - // All nulls should use ConstantScheme + // All nulls should use ConstantScheme. return Ok(0.0); } @@ -487,33 +460,32 @@ impl Scheme for NullDominated { return Ok(stats.src.len() as f64 / stats.value_count as f64); } - // Otherwise we don't go this route + // Otherwise we don't go this route. Ok(0.0) } fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[Self::CodeType], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); - // We pass None as we only run this pathway for NULL-dominated float arrays + let stats = + cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + + // We pass None as we only run this pathway for NULL-dominated float arrays. 
let sparse_encoded = SparseArray::encode(&stats.src.clone().into_array(), None)?; if let Some(sparse) = sparse_encoded.as_opt::() { - // Compress the values - let new_excludes = [IntSparseScheme.code()]; - - // Don't attempt to compress the non-null values - let indices = sparse.patches().indices().to_primitive().narrow()?; let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices.to_primitive()), ctx.descend(), - Excludes::int_only(&new_excludes), + &[IntSparseScheme.id()], )?; SparseArray::try_new( @@ -531,20 +503,24 @@ impl Scheme for NullDominated { #[cfg(feature = "pco")] impl Scheme for PcoScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; + fn scheme_name(&self) -> &'static str { + "vortex.float.pco" + } - fn code(&self) -> FloatCode { - FloatCode::Pco + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) } fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[FloatCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = + cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); Ok(vortex_pco::PcoArray::from_primitive( stats.source(), pco::DEFAULT_COMPRESSION_LEVEL, @@ -556,12 +532,10 @@ impl Scheme for PcoScheme { #[cfg(test)] mod tests { - use std::iter; use vortex_array::DynArray; use vortex_array::IntoArray; - use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; use vortex_array::assert_arrays_eq; use vortex_array::builders::ArrayBuilder; @@ -576,20 +550,14 @@ mod tests { use super::RLE_FLOAT_SCHEME; use crate::BtrBlocksCompressor; use crate::CompressorContext; - use crate::CompressorExt; - use crate::CompressorStats; use crate::Scheme; + use crate::StatsCache; #[test] fn test_empty() -> VortexResult<()> { - // Make sure empty array compression does not fail let btr = BtrBlocksCompressor::default(); - let result = 
btr.float_compressor().compress( - &btr, - &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), - CompressorContext::default(), - &[], - )?; + let array = PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable).into_array(); + let result = btr.compress(&array)?; assert!(result.is_empty()); Ok(()) @@ -598,19 +566,13 @@ mod tests { #[test] fn test_compress() -> VortexResult<()> { let mut values = buffer_mut![1.0f32; 1024]; - // Sprinkle some other values in. for i in 0..1024 { - // Insert 2.0 at all odd positions. - // This should force dictionary encoding and exclude run-end due to the - // average run length being 1. values[i] = (i % 50) as f32; } - let floats = values.into_array().to_primitive(); + let array = values.into_array(); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &floats, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array)?; assert_eq!(compressed.len(), 1024); let display = compressed @@ -630,14 +592,18 @@ mod tests { values.extend(iter::repeat_n(3.15f32, 150)); let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let stats = super::FloatStats::generate(&array); let btr = BtrBlocksCompressor::default(); - let compressed = - RLE_FLOAT_SCHEME.compress(&btr, &stats, CompressorContext::default(), &[])?; + let mut cache = StatsCache::new(); + let compressed = RLE_FLOAT_SCHEME.compress( + &btr, + &array.into_array(), + CompressorContext::default(), + &mut cache, + &[], + )?; - let decoded = compressed; let expected = Buffer::copy_from(&values).into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); + assert_arrays_eq!(compressed.as_ref(), expected.as_ref()); Ok(()) } @@ -652,11 +618,9 @@ mod tests { array.append_value(-0.0f32); array.append_nulls(90); - let floats = array.finish_into_primitive(); + let array = array.finish_into_primitive().into_array(); let btr = BtrBlocksCompressor::default(); - let compressed 
= - btr.float_compressor() - .compress(&btr, &floats, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array)?; assert_eq!(compressed.len(), 96); let display = compressed @@ -672,8 +636,9 @@ mod tests { /// Tests to verify that each float compression scheme produces the expected encoding. #[cfg(test)] mod scheme_selection_tests { - use vortex_alp::ALP; + use vortex_array::DynArray; + use vortex_array::IntoArray; use vortex_array::arrays::Constant; use vortex_array::arrays::Dict; use vortex_array::arrays::PrimitiveArray; @@ -685,17 +650,13 @@ mod scheme_selection_tests { use vortex_error::VortexResult; use crate::BtrBlocksCompressor; - use crate::CompressorContext; - use crate::CompressorExt; #[test] fn test_constant_compressed() -> VortexResult<()> { let values: Vec = vec![42.5; 100]; let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -705,9 +666,7 @@ mod scheme_selection_tests { let values: Vec = (0..1000).map(|i| (i as f64) * 0.01).collect(); let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -720,9 +679,7 @@ mod scheme_selection_tests { .collect(); let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -736,9 +693,7 @@ mod 
scheme_selection_tests { builder.append_nulls(95); let array = builder.finish_into_primitive(); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; // Verify the compressed array preserves values. assert_eq!(compressed.len(), 100); Ok(()) diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index 58ee4f62e76..0aadd34d27f 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -4,10 +4,6 @@ pub(crate) mod dictionary; pub(super) mod stats; -use std::hash::Hash; -use std::hash::Hasher; - -use enum_iterator::Sequence; pub use stats::IntegerStats; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -16,10 +12,8 @@ use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::DictArray; use vortex_array::arrays::MaskedArray; -use vortex_array::arrays::Primitive; use vortex_array::arrays::PrimitiveArray; use vortex_array::scalar::Scalar; -use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityHelper; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -39,158 +33,54 @@ use vortex_zigzag::zigzag_encode; use self::dictionary::dictionary_encode; use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; -use crate::Compressor; use crate::CompressorContext; use crate::CompressorStats; -use crate::Excludes; -use crate::GenerateStatsOptions; use crate::Scheme; -use crate::SchemeExt; +use crate::SchemeId; +use crate::StatsCache; use crate::compressor::patches::compress_patches; use crate::compressor::rle; use crate::compressor::rle::RLEScheme; +use crate::scheme::estimate_compression_ratio_with_sampling; -/// All available integer compression schemes. 
-pub const ALL_INT_SCHEMES: &[&dyn IntegerScheme] = &[ - &ConstantScheme, - &FORScheme, - &ZigZagScheme, - &BitPackingScheme, - &SparseScheme, - &DictScheme, - &RunEndScheme, - &SequenceScheme, - &RLE_INTEGER_SCHEME, - #[cfg(feature = "pco")] - &PcoScheme, -]; - -/// [`Compressor`] for signed and unsigned integers. -#[derive(Clone, Copy)] -pub struct IntCompressor<'a> { - /// Reference to the parent compressor. - pub btr_blocks_compressor: &'a dyn CanonicalCompressor, -} - -impl<'a> Compressor for IntCompressor<'a> { - type ArrayVTable = Primitive; - type SchemeType = dyn IntegerScheme; - type StatsType = IntegerStats; - - fn schemes(&self) -> &[&'static dyn IntegerScheme] { - self.btr_blocks_compressor.int_schemes() - } - - fn default_scheme(&self) -> &'static Self::SchemeType { - &UncompressedScheme - } - - fn gen_stats(&self, array: &::Array) -> Self::StatsType { - if self - .btr_blocks_compressor - .int_schemes() - .iter() - .any(|s| s.code() == IntCode::Dict) - { - IntegerStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - } else { - IntegerStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: false, - }, - ) - } - } -} - -pub trait IntegerScheme: - Scheme + Send + Sync -{ -} - -// Auto-impl -impl IntegerScheme for T where - T: Scheme + Send + Sync -{ -} - -impl PartialEq for dyn IntegerScheme { - fn eq(&self, other: &Self) -> bool { - self.code() == other.code() - } -} - -impl Eq for dyn IntegerScheme {} - -impl Hash for dyn IntegerScheme { - fn hash(&self, state: &mut H) { - self.code().hash(state) - } -} - -/// Unique identifier for integer compression schemes. -/// -/// NOTE: Variant order matters for tie-breaking; `For` must precede `BitPacking` to avoid unnecessary patches. -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence, Ord, PartialOrd)] -pub enum IntCode { - /// No compression applied. - Uncompressed, - /// Constant encoding for arrays with a single distinct value. 
- Constant, - /// Frame of Reference encoding - subtracts minimum value then bitpacks. - For, - /// BitPacking encoding - compresses non-negative integers by reducing bit width. - BitPacking, - /// ZigZag encoding - transforms negative integers to positive for better bitpacking. - ZigZag, - /// Sparse encoding - optimizes null-dominated or single-value-dominated arrays. - Sparse, - /// Dictionary encoding - creates a dictionary of unique values. - Dict, - /// Run-end encoding - run-length encoding with end positions. - RunEnd, - /// Sequence encoding - detects sequential patterns. - Sequence, - /// RLE encoding - generic run-length encoding. - Rle, - /// Pco (pcodec) compression for integers. - Pco, +/// Returns `true` if the canonical array is a primitive with an integer ptype. +fn is_integer_primitive(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Primitive(p) if p.ptype().is_int()) } +/// No compression applied. #[derive(Debug, Copy, Clone, PartialEq, Eq)] - pub struct UncompressedScheme; +/// Constant encoding for arrays with a single distinct value. #[derive(Debug, Copy, Clone, PartialEq, Eq)] - pub struct ConstantScheme; +/// Frame of Reference encoding. #[derive(Debug, Copy, Clone, PartialEq, Eq)] - pub struct FORScheme; +/// ZigZag encoding for negative integers. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ZigZagScheme; +/// BitPacking encoding for non-negative integers. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct BitPackingScheme; +/// Sparse encoding for single-value-dominated arrays. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct SparseScheme; +/// Dictionary encoding for low-cardinality integer values. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct DictScheme; +/// Run-end encoding with end positions. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct RunEndScheme; +/// Sequence encoding for sequential patterns. 
#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct SequenceScheme; @@ -208,17 +98,24 @@ pub struct IntRLEConfig; impl rle::RLEConfig for IntRLEConfig { type Stats = IntegerStats; - type Code = IntCode; - const CODE: IntCode = IntCode::Rle; + const SCHEME_NAME: &'static str = "vortex.int.rle"; + + fn matches(canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn generate_stats(array: &ArrayRef) -> IntegerStats { + IntegerStats::generate(&array.to_primitive()) + } fn compress_values( compressor: &BtrBlocksCompressor, values: &PrimitiveArray, ctx: CompressorContext, - excludes: &[IntCode], + excludes: &[SchemeId], ) -> VortexResult { - compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes.into()) + compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes) } } @@ -226,41 +123,45 @@ impl rle::RLEConfig for IntRLEConfig { pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new(); impl Scheme for UncompressedScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.uncompressed" + } - fn code(&self) -> IntCode { - IntCode::Uncompressed + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - _stats: &IntegerStats, + _array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[IntCode], + _cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { - // no compression + // No compression. 
Ok(1.0) } fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[IntCode], + _cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { - Ok(stats.source().clone().into_array()) + Ok(array.clone()) } } impl Scheme for ConstantScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.constant" + } - fn code(&self) -> IntCode { - IntCode::Constant + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } fn is_constant(&self) -> bool { @@ -270,16 +171,20 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[IntCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { // Never yield ConstantScheme for a sample, it could be a false-positive. if ctx.is_sample { return Ok(0.0); } - // Only arrays with one distinct values can be constant compressed. + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + + // Only arrays with one distinct value can be constant compressed. 
if stats.distinct_values_count > 1 { return Ok(0.0); } @@ -290,10 +195,14 @@ impl Scheme for ConstantScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[IntCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let scalar_idx = (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); @@ -317,25 +226,30 @@ impl Scheme for ConstantScheme { } impl Scheme for FORScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.for" + } - fn code(&self) -> IntCode { - IntCode::For + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[IntCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { - // Only apply if we are not at the leaf + // Only apply if we are not at the leaf. if ctx.allowed_cascading == 0 { return Ok(0.0); } + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + // All-null cannot be FOR compressed. if stats.value_count == 0 { return Ok(0.0); @@ -346,7 +260,7 @@ impl Scheme for FORScheme { return Ok(0.0); } - // Difference between max and min + // Difference between max and min. let full_width: u32 = stats .src .ptype() @@ -355,8 +269,8 @@ impl Scheme for FORScheme { .vortex_expect("bit width must fit in u32"); let for_bw = match stats.typed.max_minus_min().checked_ilog2() { Some(l) => l + 1, - // If max-min == 0, it we should use a different compression scheme - // as we don't want to bitpack down to 0 bits. 
+ // If max-min == 0, we should use a different compression scheme as we don't want to + // bitpack down to 0 bits. None => return Ok(0.0), }; @@ -380,18 +294,14 @@ impl Scheme for FORScheme { fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[IntCode], + _cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { - let for_array = FoRArray::encode(stats.src.clone())?; + let primitive = array.to_primitive(); + let for_array = FoRArray::encode(primitive)?; let biased = for_array.encoded().to_primitive(); - let biased_stats = IntegerStats::generate_opts( - &biased, - GenerateStatsOptions { - count_distinct_values: false, - }, - ); // Immediately bitpack. If any other scheme was preferable, it would be chosen instead // of bitpacking. @@ -401,8 +311,14 @@ impl Scheme for FORScheme { is_sample: ctx.is_sample, allowed_cascading: 0, }; - let compressed = - BitPackingScheme.compress(compressor, &biased_stats, leaf_ctx, excludes)?; + let mut biased_cache = StatsCache::new(); + let compressed = BitPackingScheme.compress( + compressor, + &biased.into_array(), + leaf_ctx, + &mut biased_cache, + excludes, + )?; let for_compressed = FoRArray::try_new(compressed, for_array.reference_scalar().clone())?; for_compressed @@ -414,26 +330,31 @@ impl Scheme for FORScheme { } impl Scheme for ZigZagScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.zigzag" + } - fn code(&self) -> IntCode { - IntCode::ZigZag + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[IntCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { - // ZigZag is only useful when we cascade it with another encoding + // ZigZag is only useful 
when we cascade it with another encoding. if ctx.allowed_cascading == 0 { return Ok(0.0); } - // Don't try and compress all-null arrays + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + + // Don't try and compress all-null arrays. if stats.value_count == 0 { return Ok(0.0); } @@ -444,16 +365,20 @@ impl Scheme for ZigZagScheme { } // Run compression on a sample to see how it performs. - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[IntCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + // Zigzag encode the values, then recursively compress the inner values. let zag = zigzag_encode(stats.src.clone())?; let encoded = zag.encoded().to_primitive(); @@ -461,17 +386,17 @@ impl Scheme for ZigZagScheme { // ZigZag should be after Dict, RunEnd or Sparse. // We should only do these "container" style compressors once. 
let mut new_excludes = vec![ - ZigZagScheme.code(), - DictScheme.code(), - RunEndScheme.code(), - SparseScheme.code(), + ZigZagScheme.id(), + DictScheme.id(), + RunEndScheme.id(), + SparseScheme.id(), ]; new_excludes.extend_from_slice(excludes); let compressed = compressor.compress_canonical( Canonical::Primitive(encoded), ctx.descend(), - Excludes::int_only(&new_excludes), + &new_excludes, )?; tracing::debug!("zigzag output: {}", compressed.encoding_id()); @@ -481,40 +406,49 @@ impl Scheme for ZigZagScheme { } impl Scheme for BitPackingScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.bitpacking" + } - fn code(&self) -> IntCode { - IntCode::BitPacking + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[IntCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { - // BitPacking only works for non-negative values + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + + // BitPacking only works for non-negative values. if stats.typed.min_is_negative() { return Ok(0.0); } - // Don't compress all-null arrays + // Don't compress all-null arrays. 
if stats.value_count == 0 { return Ok(0.0); } - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[IntCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let histogram = bit_width_histogram(stats.source())?; let bw = find_best_bit_width(stats.source().ptype(), &histogram)?; // If best bw is determined to be the current bit-width, return the original array. @@ -531,28 +465,32 @@ impl Scheme for BitPackingScheme { } impl Scheme for SparseScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.sparse" + } - fn code(&self) -> IntCode { - IntCode::Sparse + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } - // We can avoid asserting the encoding tree instead. fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[IntCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. if ctx.allowed_cascading == 0 { return Ok(0.0); } + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + if stats.value_count == 0 { - // All nulls should use ConstantScheme + // All nulls should use ConstantScheme. return Ok(0.0); } @@ -565,7 +503,7 @@ impl Scheme for SparseScheme { let (_, top_count) = stats.typed.top_value_and_count(); if top_count == stats.value_count { - // top_value is the only value, should use ConstantScheme instead + // top_value is the only value, should use ConstantScheme instead. 
return Ok(0.0); } @@ -581,14 +519,19 @@ impl Scheme for SparseScheme { fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[IntCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); + + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let (top_pvalue, top_count) = stats.typed.top_value_and_count(); if top_count as usize == stats.src.len() { - // top_value is the only value, use ConstantScheme + // top_value is the only value, use ConstantScheme. return Ok(ConstantArray::new( Scalar::primitive_value( top_pvalue, @@ -610,14 +553,14 @@ impl Scheme for SparseScheme { )?; if let Some(sparse) = sparse_encoded.as_opt::() { - // Compress the values - let mut new_excludes = vec![SparseScheme.code(), IntCode::Dict]; + // Compress the values. + let mut new_excludes = vec![SparseScheme.id(), DictScheme.id()]; new_excludes.extend_from_slice(excludes); let compressed_values = compressor.compress_canonical( Canonical::Primitive(sparse.patches().values().to_primitive()), ctx.descend(), - Excludes::int_only(&new_excludes), + &new_excludes, )?; let indices = sparse.patches().indices().to_primitive().narrow()?; @@ -625,7 +568,7 @@ impl Scheme for SparseScheme { let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices), ctx.descend(), - Excludes::int_only(&new_excludes), + &new_excludes, )?; SparseArray::try_new( @@ -642,25 +585,30 @@ impl Scheme for SparseScheme { } impl Scheme for DictScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.dict" + } - fn code(&self) -> IntCode { - IntCode::Dict + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: 
CompressorContext, - _excludes: &[IntCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { // Dict should not be terminal. if ctx.allowed_cascading == 0 { return Ok(0.0); } + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + if stats.value_count == 0 { return Ok(0.0); } @@ -678,7 +626,7 @@ impl Scheme for DictScheme { let n_runs = (stats.value_count / stats.average_run_length) as usize; - // Assume that codes will either be BitPack or RLE-BitPack + // Assume that codes will either be BitPack or RLE-BitPack. let codes_size_bp = (codes_bw * stats.value_count) as usize; let codes_size_rle_bp = usize::checked_mul((codes_bw + 32) as usize, n_runs); @@ -692,29 +640,34 @@ impl Scheme for DictScheme { fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[IntCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + // TODO(aduffy): we can be more prescriptive: we know that codes will EITHER be // RLE or FOR + BP. Cascading probably wastes some time here. let dict = dictionary_encode(stats); - // Cascade the codes child - // Don't allow SequenceArray as the codes child as it merely adds extra indirection without actually compressing data. - let mut new_excludes = vec![IntCode::Dict, IntCode::Sequence]; + // Cascade the codes child. + // Don't allow SequenceArray as the codes child as it merely adds extra indirection + // without actually compressing data. 
+ let mut new_excludes = vec![DictScheme.id(), SequenceScheme.id()]; new_excludes.extend_from_slice(excludes); let compressed_codes = compressor.compress_canonical( Canonical::Primitive(dict.codes().to_primitive().narrow()?), ctx.descend(), - Excludes::int_only(&new_excludes), + &new_excludes, )?; - // SAFETY: compressing codes does not change their values + // SAFETY: compressing codes does not change their values. unsafe { Ok( DictArray::new_unchecked(compressed_codes, dict.values().clone()) @@ -726,20 +679,25 @@ impl Scheme for DictScheme { } impl Scheme for RunEndScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.runend" + } - fn code(&self) -> IntCode { - IntCode::RunEnd + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[IntCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + // If the run length is below the threshold, drop it. if stats.average_run_length < RUN_END_THRESHOLD { return Ok(0.0); @@ -750,37 +708,41 @@ impl Scheme for RunEndScheme { } // Run compression on a sample, see how it performs. - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &IntegerStats, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[IntCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); - // run-end encode the ends + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + + // Run-end encode the ends. 
let (ends, values) = runend_encode(&stats.src); - let mut new_excludes = vec![RunEndScheme.code(), DictScheme.code()]; + let mut new_excludes = vec![RunEndScheme.id(), DictScheme.id()]; new_excludes.extend_from_slice(excludes); let compressed_ends = compressor.compress_canonical( Canonical::Primitive(ends.to_primitive()), ctx.descend(), - Excludes::int_only(&new_excludes), + &new_excludes, )?; let compressed_values = compressor.compress_canonical( Canonical::Primitive(values.to_primitive()), ctx.descend(), - Excludes::int_only(&new_excludes), + &new_excludes, )?; - // SAFETY: compression doesn't affect invariants + // SAFETY: compression doesn't affect invariants. unsafe { Ok( RunEndArray::new_unchecked(compressed_ends, compressed_values, 0, stats.src.len()) @@ -791,34 +753,39 @@ impl Scheme for RunEndScheme { } impl Scheme for SequenceScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.sequence" + } - fn code(&self) -> Self::CodeType { - IntCode::Sequence + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[Self::CodeType], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + if stats.null_count > 0 { return Ok(0.0); } - // If the distinct_values_count was computed (!= u32::MAX) - // Then all values in a sequence must be unique. + // If the distinct_values_count was computed (!= u32::MAX) then all values in a sequence + // must be unique. if stats.distinct_values_count != u32::MAX && stats.distinct_values_count as usize != stats.src.len() { return Ok(0.0); } - // Since two values are required to store base and multiplier the - // compression ratio is divided by 2. 
+ // Since two values are required to store base and multiplier the compression ratio is + // divided by 2. Ok(sequence_encode(&stats.src)? .map(|_| stats.src.len() as f64 / 2.0) .unwrap_or(0.0)) @@ -827,10 +794,14 @@ impl Scheme for SequenceScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[Self::CodeType], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + if stats.null_count > 0 { vortex_bail!("sequence encoding does not support nulls"); } @@ -840,20 +811,25 @@ impl Scheme for SequenceScheme { #[cfg(feature = "pco")] impl Scheme for PcoScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; + fn scheme_name(&self) -> &'static str { + "vortex.int.pco" + } - fn code(&self) -> IntCode { - IntCode::Pco + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) } fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[IntCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + // Pco does not support I8 or U8. 
if matches!( stats.src.ptype(), @@ -862,16 +838,20 @@ impl Scheme for PcoScheme { return Ok(0.0); } - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[IntCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + Ok(vortex_pco::PcoArray::from_primitive( stats.source(), pco::DEFAULT_COMPRESSION_LEVEL, @@ -891,12 +871,10 @@ mod tests { use rand::rngs::StdRng; use vortex_array::DynArray; use vortex_array::IntoArray; - use vortex_array::ToCanonical; use vortex_array::arrays::Dict; use vortex_array::arrays::PrimitiveArray; use vortex_array::assert_arrays_eq; use vortex_array::validity::Validity; - use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; use vortex_buffer::buffer; @@ -904,26 +882,20 @@ mod tests { use vortex_sequence::Sequence; use vortex_sparse::Sparse; - use super::IntegerStats; use super::RLE_INTEGER_SCHEME; use super::SequenceScheme; use super::SparseScheme; use crate::BtrBlocksCompressor; use crate::CompressorContext; - use crate::CompressorExt; - use crate::CompressorStats; use crate::Scheme; + use crate::StatsCache; #[test] fn test_empty() -> VortexResult<()> { - // Make sure empty array compression does not fail + // Make sure empty array compression does not fail. 
let btr = BtrBlocksCompressor::default(); - let result = btr.integer_compressor().compress( - &btr, - &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), - CompressorContext::default(), - &[], - )?; + let array = PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable); + let result = btr.compress(&array.into_array())?; assert!(result.is_empty()); Ok(()) @@ -949,14 +921,8 @@ mod tests { } } - let primitive = codes.freeze().into_array().to_primitive(); let btr = BtrBlocksCompressor::default(); - let compressed = btr.integer_compressor().compress( - &btr, - &primitive, - CompressorContext::default(), - &[], - )?; + let compressed = btr.compress(&codes.freeze().into_array())?; assert!(compressed.is::()); Ok(()) } @@ -968,17 +934,19 @@ mod tests { Validity::from_iter(vec![true, true, true, true, false]), ); let btr = BtrBlocksCompressor::default(); + let array_ref = array.clone().into_array(); + let mut cache = StatsCache::new(); let compressed = SparseScheme.compress( &btr, - &IntegerStats::generate(&array), + &array_ref, CompressorContext::default(), + &mut cache, &[], )?; assert!(compressed.is::()); let decoded = compressed.clone(); let expected = - PrimitiveArray::new(buffer![189u8, 189, 189, 0, 0], array.validity().clone()) - .into_array(); + PrimitiveArray::new(buffer![189u8, 189, 189, 0, 0], array.validity()?).into_array(); assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); Ok(()) } @@ -992,17 +960,20 @@ mod tests { ]), ); let btr = BtrBlocksCompressor::default(); + let array_ref = array.clone().into_array(); + let mut cache = StatsCache::new(); let compressed = SparseScheme.compress( &btr, - &IntegerStats::generate(&array), + &array_ref, CompressorContext::default(), + &mut cache, &[], )?; assert!(compressed.is::()); let decoded = compressed.clone(); let expected = PrimitiveArray::new( buffer![0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46], - array.validity().clone(), + array.validity()?, ) .into_array(); assert_arrays_eq!(decoded.as_ref(), 
expected.as_ref()); @@ -1014,10 +985,13 @@ mod tests { let values = (0i32..20).step_by(7).collect_vec(); let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some)); let btr = BtrBlocksCompressor::default(); + let array_ref = array.into_array(); + let mut cache = StatsCache::new(); let compressed = SequenceScheme.compress( &btr, - &IntegerStats::generate(&array), + &array_ref, CompressorContext::default(), + &mut cache, &[], )?; assert!(compressed.is::()); @@ -1036,10 +1010,13 @@ mod tests { let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); + let array_ref = array.into_array(); + let mut cache = StatsCache::new(); let compressed = RLE_INTEGER_SCHEME.compress( &btr, - &IntegerStats::generate(&array), + &array_ref, CompressorContext::default(), + &mut cache, &[], )?; @@ -1077,6 +1054,7 @@ mod scheme_selection_tests { use rand::Rng; use rand::SeedableRng; use rand::rngs::StdRng; + use vortex_array::IntoArray; use vortex_array::arrays::Constant; use vortex_array::arrays::Dict; use vortex_array::arrays::PrimitiveArray; @@ -1085,24 +1063,18 @@ mod scheme_selection_tests { use vortex_error::VortexResult; use vortex_fastlanes::BitPacked; use vortex_fastlanes::FoR; - use vortex_fastlanes::RLE; use vortex_runend::RunEnd; use vortex_sequence::Sequence; use vortex_sparse::Sparse; use crate::BtrBlocksCompressor; - use crate::CompressorContext; - use crate::CompressorExt; - use crate::IntCode; #[test] fn test_constant_compressed() -> VortexResult<()> { let values: Vec = iter::repeat_n(42, 100).collect(); let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -1112,9 +1084,7 @@ mod scheme_selection_tests { let 
values: Vec = (0..1000).map(|i| 1_000_000 + ((i * 37) % 100)).collect(); let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -1124,9 +1094,7 @@ mod scheme_selection_tests { let values: Vec = (0..1000).map(|i| i % 16).collect(); let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -1143,9 +1111,7 @@ mod scheme_selection_tests { } let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -1169,9 +1135,7 @@ mod scheme_selection_tests { let array = PrimitiveArray::new(Buffer::copy_from(&codes), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -1184,9 +1148,7 @@ mod scheme_selection_tests { } let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -1196,9 +1158,7 @@ mod 
scheme_selection_tests { let values: Vec = (0..1000).map(|i| i * 7).collect(); let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); Ok(()) } @@ -1211,13 +1171,9 @@ mod scheme_selection_tests { } let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let compressed = btr.integer_compressor().compress( - &btr, - &array, - CompressorContext::default(), - &[IntCode::RunEnd], - )?; - assert!(compressed.is::()); + let compressed = btr.compress(&array.into_array())?; + eprintln!("{}", compressed.display_tree()); + assert!(compressed.is::()); Ok(()) } } diff --git a/vortex-btrblocks/src/compressor/mod.rs b/vortex-btrblocks/src/compressor/mod.rs index 5c3a31271cd..3a088f7668e 100644 --- a/vortex-btrblocks/src/compressor/mod.rs +++ b/vortex-btrblocks/src/compressor/mod.rs @@ -1,178 +1,18 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Type-specific compressor traits that drive scheme selection and compression. -//! -//! [`Compressor`] defines the interface: generate statistics for an array via -//! [`Compressor::gen_stats`], and provide available [`Scheme`]s via [`Compressor::schemes`]. -//! -//! [`CompressorExt`] is blanket-implemented for all `Compressor`s and adds the core logic: -//! -//! - [`CompressorExt::choose_scheme`] iterates all schemes, skips excluded ones, and calls -//! [`Scheme::expected_compression_ratio`] on each. It returns the scheme with the highest ratio -//! above 1.0, or falls back to the default. See the [`scheme`](crate::scheme) module for how -//! ratio estimation works. -//! 
- [`CompressorExt::compress`] generates stats, calls `choose_scheme()`, and applies the -//! result. If compression did not shrink the array, the original is returned. - -use vortex_array::ArrayRef; -use vortex_array::IntoArray; -use vortex_array::arrays::ConstantArray; -use vortex_array::scalar::Scalar; -use vortex_array::vtable::VTable; -use vortex_error::VortexResult; - -use crate::BtrBlocksCompressor; -use crate::CompressorContext; -use crate::CompressorStats; -use crate::Scheme; +//! Compression scheme implementations. pub(crate) mod decimal; -pub(crate) mod float; -pub(crate) mod integer; -mod patches; -mod rle; -pub(crate) mod string; +/// Float compression schemes. +pub mod float; +/// Integer compression schemes. +pub mod integer; +pub(crate) mod patches; +pub(crate) mod rle; +/// String compression schemes. +pub mod string; pub(crate) mod temporal; /// Maximum cascade depth for compression. pub(crate) const MAX_CASCADE: usize = 3; - -/// A compressor for a particular input type. -/// -/// This trait defines the interface for type-specific compressors that can adaptively -/// choose and apply compression schemes based on data characteristics. Compressors -/// analyze input arrays, select optimal compression schemes, and handle cascading -/// compression with multiple encoding layers. -/// -/// The compressor works by generating statistics on the input data, evaluating -/// available compression schemes, and selecting the one with the best compression ratio. -pub trait Compressor { - /// The VTable type for arrays this compressor operates on. - type ArrayVTable: VTable; - /// The compression scheme type used by this compressor. - type SchemeType: Scheme + ?Sized; - /// The statistics type used to analyze arrays for compression. - type StatsType: CompressorStats; - - /// Generates statistics for the given array to guide compression scheme selection. 
- fn gen_stats(&self, array: &::Array) -> Self::StatsType; - - /// Returns all available compression schemes for this compressor. - fn schemes(&self) -> &[&'static Self::SchemeType]; - /// Returns the default fallback compression scheme. - fn default_scheme(&self) -> &'static Self::SchemeType; -} - -/// Extension trait providing scheme selection and compression for compressors. -pub trait CompressorExt: Compressor -where - Self::SchemeType: 'static, -{ - /// Selects the best compression scheme based on expected compression ratios. - /// - /// Evaluates all available schemes against the provided statistics and returns - /// the one with the highest compression ratio. Falls back to the default scheme - /// if no scheme provides compression benefits. - #[allow(clippy::cognitive_complexity)] - fn choose_scheme( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[::CodeType], - ) -> VortexResult<&'static Self::SchemeType> { - let mut best_ratio = 1.0; - let mut best_scheme: Option<&'static Self::SchemeType> = None; - - // logging helpers - let depth = MAX_CASCADE - ctx.allowed_cascading; - - for scheme in self.schemes().iter() { - // Skip excluded schemes - if excludes.contains(&scheme.code()) { - continue; - } - - // We never choose Constant for a sample - if ctx.is_sample && scheme.is_constant() { - continue; - } - - tracing::trace!( - is_sample = ctx.is_sample, - depth, - is_constant = scheme.is_constant(), - ?scheme, - "Trying compression scheme" - ); - - let ratio = scheme.expected_compression_ratio(compressor, stats, ctx, excludes)?; - tracing::trace!( - is_sample = ctx.is_sample, - depth, - ratio, - ?scheme, - "Expected compression result" - ); - - if !(ratio.is_subnormal() || ratio.is_infinite() || ratio.is_nan()) { - if ratio > best_ratio { - best_ratio = ratio; - best_scheme = Some(*scheme); - } - } else { - tracing::trace!( - "Calculated invalid compression ratio {ratio} for scheme: {scheme:?}. 
Must not be sub-normal, infinite or nan." - ); - } - } - - tracing::trace!(depth, scheme = ?best_scheme, ratio = best_ratio, "best scheme found"); - - if let Some(best) = best_scheme { - Ok(best) - } else { - Ok(self.default_scheme()) - } - } - - /// Compresses an array using this compressor. - /// - /// Generates statistics on the input array, selects the best compression scheme, - /// and applies it. Returns the original array if compression would increase size. - fn compress( - &self, - btr_blocks_compressor: &BtrBlocksCompressor, - array: &<::ArrayVTable as VTable>::Array, - ctx: CompressorContext, - excludes: &[::CodeType], - ) -> VortexResult { - // Avoid compressing empty arrays. - if array.is_empty() { - return Ok(array.to_array()); - } - - // Avoid compressing all-null arrays. - if array.all_invalid()? { - return Ok( - ConstantArray::new(Scalar::null(array.dtype().clone()), array.len()).into_array(), - ); - } - - // Generate stats on the array directly. - let stats = self.gen_stats(array); - let best_scheme = self.choose_scheme(btr_blocks_compressor, &stats, ctx, excludes)?; - - let output = best_scheme.compress(btr_blocks_compressor, &stats, ctx, excludes)?; - if output.nbytes() < array.nbytes() { - Ok(output) - } else { - tracing::debug!("resulting tree too large: {}", output.encoding_id()); - Ok(array.to_array()) - } - } -} - -// Blanket implementation for all Compressor types with 'static SchemeType -impl CompressorExt for T where T::SchemeType: 'static {} diff --git a/vortex-btrblocks/src/compressor/rle.rs b/vortex-btrblocks/src/compressor/rle.rs index ef4b3fcb048..562bba664ad 100644 --- a/vortex-btrblocks/src/compressor/rle.rs +++ b/vortex-btrblocks/src/compressor/rle.rs @@ -2,7 +2,6 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use std::fmt::Debug; -use std::hash::Hash; use std::marker::PhantomData; use vortex_array::ArrayRef; @@ -14,43 +13,49 @@ use vortex_error::VortexResult; use vortex_fastlanes::RLEArray; use 
crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; use crate::CompressorContext; -use crate::CompressorStats; -use crate::Excludes; -use crate::IntCode; use crate::Scheme; -use crate::SchemeExt; +use crate::SchemeId; +use crate::StatsCache; +use crate::compressor::integer::DictScheme as IntDictScheme; +use crate::scheme::estimate_compression_ratio_with_sampling; /// Threshold for the average run length in an array before we consider run-length encoding. pub const RUN_LENGTH_THRESHOLD: u32 = 4; /// Trait for accessing RLE-specific statistics. pub trait RLEStats { + /// Returns the number of non-null values. fn value_count(&self) -> u32; + /// Returns the average run length. fn average_run_length(&self) -> u32; + /// Returns the underlying source array. fn source(&self) -> &PrimitiveArray; } /// Configuration trait for RLE schemes. /// /// Implement this trait to define the behavior of an RLE scheme for a specific -/// stats and code type combination. +/// stats type. pub trait RLEConfig: Debug + Send + Sync + 'static { /// The statistics type used by this RLE scheme. - type Stats: RLEStats + CompressorStats; - /// The code type used to identify schemes. - type Code: Copy + Clone + Debug + Hash + Eq + Ord; + type Stats: RLEStats + 'static; - /// The unique code identifying this RLE scheme. - const CODE: Self::Code; + /// The globally unique name for this RLE scheme. + const SCHEME_NAME: &'static str; + + /// Whether this scheme can compress the given canonical array. + fn matches(canonical: &Canonical) -> bool; + + /// Generates statistics for the given array. + fn generate_stats(array: &ArrayRef) -> Self::Stats; /// Compress the values array after RLE encoding. 
fn compress_values( compressor: &BtrBlocksCompressor, values: &PrimitiveArray, ctx: CompressorContext, - excludes: &[Self::Code], + excludes: &[SchemeId], ) -> VortexResult; } @@ -74,25 +79,29 @@ impl Default for RLEScheme { } impl Scheme for RLEScheme { - type StatsType = C::Stats; - type CodeType = C::Code; + fn scheme_name(&self) -> &'static str { + C::SCHEME_NAME + } - fn code(&self) -> C::Code { - C::CODE + fn matches(&self, canonical: &Canonical) -> bool { + C::matches(canonical) } fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[C::Code], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { // RLE is only useful when we cascade it with another encoding. if ctx.allowed_cascading == 0 { return Ok(0.0); } + let stats = cache.get_or_insert_with::(|| C::generate_stats(array)); + // Don't compress all-null or empty arrays. if stats.value_count() == 0 { return Ok(0.0); @@ -104,16 +113,18 @@ impl Scheme for RLEScheme { } // Run compression on a sample to see how it performs. - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[C::Code], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { + let stats = cache.get_or_insert_with::(|| C::generate_stats(array)); let rle_array = RLEArray::encode(RLEStats::source(stats))?; if ctx.allowed_cascading == 0 { @@ -121,7 +132,7 @@ impl Scheme for RLEScheme { } // Prevent RLE recursion. 
- let mut new_excludes = vec![self.code()]; + let mut new_excludes = vec![self.id()]; new_excludes.extend_from_slice(excludes); let compressed_values = C::compress_values( @@ -137,20 +148,20 @@ impl Scheme for RLEScheme { &rle_array.indices().to_primitive().narrow()?, compressor, ctx.descend(), - Excludes::from(&[IntCode::Dict]), + &[IntDictScheme.id()], )?; #[cfg(not(feature = "unstable_encodings"))] let compressed_indices = compressor.compress_canonical( Canonical::Primitive(rle_array.indices().to_primitive().narrow()?), ctx.descend(), - Excludes::from(&[IntCode::Dict]), + &[IntDictScheme.id()], )?; let compressed_offsets = compressor.compress_canonical( Canonical::Primitive(rle_array.values_idx_offsets().to_primitive().narrow()?), ctx.descend(), - Excludes::from(&[IntCode::Dict]), + &[IntDictScheme.id()], )?; // SAFETY: Recursive compression doesn't affect the invariants. @@ -173,7 +184,7 @@ fn try_compress_delta( primitive_array: &PrimitiveArray, compressor: &BtrBlocksCompressor, ctx: CompressorContext, - excludes: Excludes, + excludes: &[SchemeId], ) -> VortexResult { use vortex_array::VortexSessionExecute; diff --git a/vortex-btrblocks/src/compressor/string.rs b/vortex-btrblocks/src/compressor/string.rs index cac9cf969cd..48b6b439e33 100644 --- a/vortex-btrblocks/src/compressor/string.rs +++ b/vortex-btrblocks/src/compressor/string.rs @@ -1,10 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::hash::Hash; -use std::hash::Hasher; - -use enum_iterator::Sequence; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; @@ -19,8 +15,9 @@ use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinView; use vortex_array::arrays::VarBinViewArray; use vortex_array::builders::dict::dict_encode; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; use vortex_array::scalar::Scalar; -use vortex_array::vtable::VTable; use 
vortex_array::vtable::ValidityHelper; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -36,16 +33,19 @@ use super::integer::DictScheme as IntDictScheme; use super::integer::SequenceScheme as IntSequenceScheme; use super::integer::SparseScheme as IntSparseScheme; use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; -use crate::Compressor; use crate::CompressorContext; use crate::CompressorStats; -use crate::Excludes; use crate::GenerateStatsOptions; -use crate::IntCode; use crate::Scheme; -use crate::SchemeExt; +use crate::SchemeId; +use crate::StatsCache; use crate::sample::sample; +use crate::scheme::estimate_compression_ratio_with_sampling; + +/// Returns `true` if the canonical array is a UTF-8 string type. +fn is_utf8_string(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::VarBinView(v) if v.dtype().eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable))) +} /// Array of variable-length byte arrays, and relevant stats for compression. #[derive(Clone, Debug)] @@ -120,99 +120,23 @@ impl CompressorStats for StringStats { } } -/// All available string compression schemes. -pub const ALL_STRING_SCHEMES: &[&dyn StringScheme] = &[ - &UncompressedScheme, - &DictScheme, - &FSSTScheme, - &ConstantScheme, - &NullDominated, - #[cfg(feature = "zstd")] - &ZstdScheme, - #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] - &ZstdBuffersScheme, -]; - -/// [`Compressor`] for strings. -#[derive(Clone, Copy)] -pub struct StringCompressor<'a> { - /// Reference to the parent compressor. 
- pub btr_blocks_compressor: &'a dyn CanonicalCompressor, -} - -impl<'a> Compressor for StringCompressor<'a> { - type ArrayVTable = VarBinView; - type SchemeType = dyn StringScheme; - type StatsType = StringStats; - - fn gen_stats(&self, array: &::Array) -> Self::StatsType { - if self - .btr_blocks_compressor - .string_schemes() - .iter() - .any(|s| s.code() == DictScheme.code()) - { - StringStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - } else { - StringStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: false, - }, - ) - } - } - - fn schemes(&self) -> &[&'static dyn StringScheme] { - self.btr_blocks_compressor.string_schemes() - } - - fn default_scheme(&self) -> &'static Self::SchemeType { - &UncompressedScheme - } -} - -pub trait StringScheme: - Scheme + Send + Sync -{ -} - -impl StringScheme for T where - T: Scheme + Send + Sync -{ -} - -impl PartialEq for dyn StringScheme { - fn eq(&self, other: &Self) -> bool { - self.code() == other.code() - } -} - -impl Eq for dyn StringScheme {} - -impl Hash for dyn StringScheme { - fn hash(&self, state: &mut H) { - self.code().hash(state) - } -} - +/// Uncompressed string scheme (identity). #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct UncompressedScheme; +/// Dictionary encoding for low-cardinality strings. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct DictScheme; +/// FSST (Fast Static Symbol Table) compression. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FSSTScheme; +/// Constant encoding for arrays with a single distinct value. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ConstantScheme; +/// Sparse encoding for null-dominated arrays. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct NullDominated; @@ -226,39 +150,22 @@ pub struct ZstdScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ZstdBuffersScheme; -/// Unique identifier for string compression schemes. 
-#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence, Ord, PartialOrd)] -pub enum StringCode { - /// No compression applied. - Uncompressed, - /// Dictionary encoding for low-cardinality strings. - Dict, - /// FSST (Fast Static Symbol Table) compression. - Fsst, - /// Constant encoding for arrays with a single distinct value. - Constant, - /// Sparse encoding for null-dominated arrays. - Sparse, - /// Zstd compression without dictionaries. - Zstd, - /// Zstd buffer-level compression preserving array layout. - ZstdBuffers, -} - impl Scheme for UncompressedScheme { - type StatsType = StringStats; - type CodeType = StringCode; + fn scheme_name(&self) -> &'static str { + "vortex.string.uncompressed" + } - fn code(&self) -> StringCode { - StringCode::Uncompressed + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) } fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - _stats: &Self::StatsType, + _array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[StringCode], + _cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { Ok(1.0) } @@ -266,29 +173,35 @@ impl Scheme for UncompressedScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[StringCode], + _cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { - Ok(stats.source().clone().into_array()) + Ok(array.clone()) } } impl Scheme for DictScheme { - type StatsType = StringStats; - type CodeType = StringCode; + fn scheme_name(&self) -> &'static str { + "vortex.string.dict" + } - fn code(&self) -> StringCode { - StringCode::Dict + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) } fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[StringCode], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> 
VortexResult { + let stats = cache + .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + // If we don't have a sufficiently high number of distinct values, do not attempt Dict. if stats.estimated_distinct_count > stats.value_count / 2 { return Ok(0.0); @@ -299,16 +212,20 @@ impl Scheme for DictScheme { return Ok(0.0); } - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[StringCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let dict = dict_encode(&stats.source().clone().into_array())?; // If we are not allowed to cascade, do not attempt codes or values compression. @@ -316,11 +233,11 @@ impl Scheme for DictScheme { return Ok(dict.into_array()); } - // Find best compressor for codes and values separately + // Find best compressor for codes and values separately. let compressed_codes = compressor.compress_canonical( Canonical::Primitive(dict.codes().to_primitive()), ctx.descend(), - Excludes::from(&[IntDictScheme.code(), IntSequenceScheme.code()]), + &[IntDictScheme.id(), IntSequenceScheme.id()], )?; // Attempt to compress the values with non-Dict compression. @@ -328,10 +245,10 @@ impl Scheme for DictScheme { let compressed_values = compressor.compress_canonical( Canonical::VarBinView(dict.values().to_varbinview()), ctx.descend(), - Excludes::from(&[DictScheme.code()]), + &[DictScheme.id()], )?; - // SAFETY: compressing codes or values does not alter the invariants + // SAFETY: compressing codes or values does not alter the invariants. 
unsafe { Ok( DictArray::new_unchecked(compressed_codes, compressed_values) @@ -343,35 +260,40 @@ impl Scheme for DictScheme { } impl Scheme for FSSTScheme { - type StatsType = StringStats; - type CodeType = StringCode; + fn scheme_name(&self) -> &'static str { + "vortex.string.fsst" + } - fn code(&self) -> StringCode { - StringCode::Fsst + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) } fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[StringCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let fsst = { - let compressor = fsst_train_compressor(&stats.src); - fsst_compress(&stats.src, &compressor) + let compressor_fsst = fsst_train_compressor(&stats.src); + fsst_compress(&stats.src, &compressor_fsst) }; let compressed_original_lengths = compressor.compress_canonical( Canonical::Primitive(fsst.uncompressed_lengths().to_primitive().narrow()?), ctx, - Excludes::none(), + &[], )?; let compressed_codes_offsets = compressor.compress_canonical( Canonical::Primitive(fsst.codes().offsets().to_primitive().narrow()?), ctx, - Excludes::none(), + &[], )?; let compressed_codes = VarBinArray::try_new( compressed_codes_offsets, @@ -393,11 +315,12 @@ impl Scheme for FSSTScheme { } impl Scheme for ConstantScheme { - type StatsType = StringStats; - type CodeType = StringCode; + fn scheme_name(&self) -> &'static str { + "vortex.string.constant" + } - fn code(&self) -> Self::CodeType { - StringCode::Constant + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) } fn is_constant(&self) -> bool { @@ -407,14 +330,18 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - _excludes: 
&[Self::CodeType], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { if ctx.is_sample { return Ok(0.0); } + let stats = cache + .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); if stats.estimated_distinct_count > 1 || !is_constant(&stats.src.clone().into_array(), &mut ctx)? @@ -422,17 +349,21 @@ impl Scheme for ConstantScheme { return Ok(0.0); } - // Force constant is these cases + // Force constant in these cases. Ok(f64::MAX) } fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[Self::CodeType], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let scalar_idx = (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); @@ -456,27 +387,32 @@ impl Scheme for ConstantScheme { } impl Scheme for NullDominated { - type StatsType = StringStats; - type CodeType = StringCode; + fn scheme_name(&self) -> &'static str { + "vortex.string.sparse" + } - fn code(&self) -> Self::CodeType { - StringCode::Sparse + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) } fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[Self::CodeType], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. if ctx.allowed_cascading == 0 { return Ok(0.0); } + let stats = cache + .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + if stats.value_count == 0 { - // All nulls should use ConstantScheme + // All nulls should use ConstantScheme. 
return Ok(0.0); } @@ -485,31 +421,35 @@ impl Scheme for NullDominated { return Ok(stats.src.len() as f64 / stats.value_count as f64); } - // Otherwise we don't go this route + // Otherwise we don't go this route. Ok(0.0) } fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - _excludes: &[Self::CodeType], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); - // We pass None as we only run this pathway for NULL-dominated string arrays + let stats = cache + .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + + // We pass None as we only run this pathway for NULL-dominated string arrays. let sparse_encoded = SparseArray::encode(&stats.src.clone().into_array(), None)?; if let Some(sparse) = sparse_encoded.as_opt::() { - // Compress the indices only (not the values for strings) - let new_excludes = vec![IntSparseScheme.code(), IntCode::Dict]; + // Compress the indices only (not the values for strings). 
+ let new_excludes = vec![IntSparseScheme.id(), IntDictScheme.id()]; let indices = sparse.patches().indices().to_primitive().narrow()?; let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices), ctx.descend(), - Excludes::int_only(&new_excludes), + &new_excludes, )?; SparseArray::try_new( @@ -527,20 +467,25 @@ impl Scheme for NullDominated { #[cfg(feature = "zstd")] impl Scheme for ZstdScheme { - type StatsType = StringStats; - type CodeType = StringCode; + fn scheme_name(&self) -> &'static str { + "vortex.string.zstd" + } - fn code(&self) -> StringCode { - StringCode::Zstd + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) } fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[StringCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let compacted = stats.source().compact_buffers()?; Ok( vortex_zstd::ZstdArray::from_var_bin_view_without_dict(&compacted, 3, 8192)? @@ -551,20 +496,25 @@ impl Scheme for ZstdScheme { #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] impl Scheme for ZstdBuffersScheme { - type StatsType = StringStats; - type CodeType = StringCode; + fn scheme_name(&self) -> &'static str { + "vortex.string.zstd_buffers" + } - fn code(&self) -> StringCode { - StringCode::ZstdBuffers + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) } fn compress( &self, _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, _ctx: CompressorContext, - _excludes: &[StringCode], + cache: &mut StatsCache, + _excludes: &[SchemeId], ) -> VortexResult { + let stats = cache + .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + Ok( vortex_zstd::ZstdBuffersArray::compress(&stats.source().clone().into_array(), 3)? 
.into_array(), @@ -597,7 +547,8 @@ mod tests { let strings = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); let array_ref = strings.into_array(); - let compressed = BtrBlocksCompressor::default().compress(&array_ref)?; + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array_ref)?; assert_eq!(compressed.len(), 2048); let display = compressed @@ -619,7 +570,8 @@ mod tests { let strings = strings.finish_into_varbinview(); let array_ref = strings.into_array(); - let compressed = BtrBlocksCompressor::default().compress(&array_ref)?; + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array_ref)?; assert_eq!(compressed.len(), 100); let display = compressed diff --git a/vortex-btrblocks/src/compressor/temporal.rs b/vortex-btrblocks/src/compressor/temporal.rs index 6fb917be58d..b958c77c126 100644 --- a/vortex-btrblocks/src/compressor/temporal.rs +++ b/vortex-btrblocks/src/compressor/temporal.rs @@ -14,9 +14,7 @@ use vortex_datetime_parts::split_temporal; use vortex_error::VortexResult; use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; use crate::CompressorContext; -use crate::Excludes; /// Compress a temporal array into a `DateTimePartsArray`. 
pub fn compress_temporal( @@ -35,17 +33,17 @@ pub fn compress_temporal( let days = compressor.compress_canonical( Canonical::Primitive(days.to_primitive().narrow()?), ctx, - Excludes::none(), + &[], )?; let seconds = compressor.compress_canonical( Canonical::Primitive(seconds.to_primitive().narrow()?), ctx, - Excludes::none(), + &[], )?; let subseconds = compressor.compress_canonical( Canonical::Primitive(subseconds.to_primitive().narrow()?), ctx, - Excludes::none(), + &[], )?; Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) diff --git a/vortex-btrblocks/src/ctx.rs b/vortex-btrblocks/src/ctx.rs index f2cb6a37102..d346a3018b4 100644 --- a/vortex-btrblocks/src/ctx.rs +++ b/vortex-btrblocks/src/ctx.rs @@ -1,105 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Compression context types for recursive compression. +//! Compression context for recursive compression. -use crate::FloatCode; -use crate::IntCode; -use crate::MAX_CASCADE; -use crate::StringCode; - -/// Holds references to exclude lists for each compression code type. -/// -/// This struct is passed through recursive compression calls to specify -/// which schemes should be excluded at each level. -#[derive(Debug, Clone, Copy, Default)] -pub struct Excludes<'a> { - /// Integer schemes to exclude. - pub int: &'a [IntCode], - /// Float schemes to exclude. - pub float: &'a [FloatCode], - /// String schemes to exclude. - pub string: &'a [StringCode], -} - -impl<'a> Excludes<'a> { - /// Creates an empty excludes (no exclusions). - pub const fn none() -> Self { - Self { - int: &[], - float: &[], - string: &[], - } - } - - /// Creates excludes with only integer exclusions. - pub const fn int_only(int: &'a [IntCode]) -> Self { - Self { - int, - float: &[], - string: &[], - } - } - - /// Creates excludes with only float exclusions. 
- pub const fn float_only(float: &'a [FloatCode]) -> Self { - Self { - int: &[], - float, - string: &[], - } - } - - /// Creates excludes with only string exclusions. - pub const fn string_only(string: &'a [StringCode]) -> Self { - Self { - int: &[], - float: &[], - string, - } - } -} - -impl<'a> From<&'a [IntCode]> for Excludes<'a> { - fn from(int: &'a [IntCode]) -> Self { - Self::int_only(int) - } -} - -impl<'a, const N: usize> From<&'a [IntCode; N]> for Excludes<'a> { - fn from(int: &'a [IntCode; N]) -> Self { - Self::int_only(int) - } -} - -impl<'a> From<&'a [FloatCode]> for Excludes<'a> { - fn from(float: &'a [FloatCode]) -> Self { - Self::float_only(float) - } -} - -impl<'a, const N: usize> From<&'a [FloatCode; N]> for Excludes<'a> { - fn from(float: &'a [FloatCode; N]) -> Self { - Self::float_only(float) - } -} - -impl<'a> From<&'a [StringCode]> for Excludes<'a> { - fn from(string: &'a [StringCode]) -> Self { - Self::string_only(string) - } -} - -impl<'a, const N: usize> From<&'a [StringCode; N]> for Excludes<'a> { - fn from(string: &'a [StringCode; N]) -> Self { - Self::string_only(string) - } -} +use crate::compressor::MAX_CASCADE; /// Context passed through recursive compression calls. -/// -/// Bundles `is_sample` and `allowed_cascading` which always travel together. -/// Excludes are passed separately since they're type-specific. #[derive(Debug, Clone, Copy)] pub struct CompressorContext { /// Whether we're compressing a sample (for ratio estimation). @@ -118,7 +24,7 @@ impl Default for CompressorContext { } impl CompressorContext { - /// Descend one level in the cascade (decrements `allowed_cascading`). + /// Descend one level in the cascade. pub fn descend(self) -> Self { Self { allowed_cascading: self.allowed_cascading.saturating_sub(1), @@ -126,7 +32,7 @@ impl CompressorContext { } } - /// Returns a context marked as sample compression (for ratio estimation). + /// Returns a context marked as sample compression. 
pub fn as_sample(self) -> Self { Self { is_sample: true, diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 28e4eeb8dfa..1674443a13e 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -12,71 +12,69 @@ //! //! # Key Features //! -//! - **Adaptive Compression**: Automatically selects the best compression scheme based on data patterns -//! - **Type-Specific Compressors**: Specialized compression for integers, floats, strings, and temporal data -//! - **Cascaded Encoding**: Multiple compression layers can be applied for optimal results -//! - **Statistical Analysis**: Uses data sampling and statistics to predict compression ratios -//! - **Recursive Structure Handling**: Compresses nested structures like structs and lists +//! - **Adaptive Compression**: Automatically selects the best compression scheme based on data +//! patterns. +//! - **Unified Scheme Trait**: A single [`Scheme`] trait covers all data types (integers, floats, +//! strings, etc.) with a [`SchemeId`] for identity. +//! - **Cascaded Encoding**: Multiple compression layers can be applied for optimal results. +//! - **Statistical Analysis**: Uses data sampling and statistics to predict compression ratios. +//! - **Recursive Structure Handling**: Compresses nested structures like structs and lists. //! //! # How It Works //! //! [`BtrBlocksCompressor::compress()`] takes an `&ArrayRef` and returns an `ArrayRef` that may //! use a different encoding. It first canonicalizes the input, then dispatches by type. -//! Primitives go to a type-specific `Compressor` (integer, float, or string). Compound types -//! like structs and lists recurse into their fields and elements. +//! Primitives and strings go through [`choose_and_compress`], which evaluates every enabled +//! [`Scheme`] and picks the one with the best compression ratio. Compound types like structs +//! and lists recurse into their fields and elements. //! -//! 
Each type-specific compressor holds a static list of `Scheme` implementations (e.g. -//! BitPacking, ALP, Dict). There is no dynamic registry. The compressor evaluates each scheme by -//! compressing a ~1% sample and measuring the ratio, then picks the best. See `SchemeExt` for -//! details on how sampling works. +//! Each `Scheme` implementation declares whether it [`matches`](Scheme::matches) a given +//! canonical form and, if so, estimates the compression ratio (often by compressing a ~1% +//! sample). There is no dynamic registry — the set of schemes is fixed at build time via +//! [`ALL_SCHEMES`]. //! //! Schemes can produce arrays that are themselves further compressed (e.g. FoR then BitPacking), -//! up to `MAX_CASCADE` (3) layers deep. An `Excludes` set prevents the same scheme from being -//! applied twice in a chain. +//! up to [`MAX_CASCADE`](compressor::MAX_CASCADE) (3) layers deep. An excludes slice of +//! [`SchemeId`] prevents the same scheme from being applied twice in a chain. +//! +//! [`choose_and_compress`]: BtrBlocksCompressor::choose_and_compress //! //! # Example //! //! ```rust -//! use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, IntCode}; +//! use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme}; +//! use vortex_btrblocks::compressor::integer::DictScheme; //! use vortex_array::DynArray; //! -//! // Default compressor with all schemes enabled +//! // Default compressor with all schemes enabled. //! let compressor = BtrBlocksCompressor::default(); //! -//! // Configure with builder to exclude specific schemes +//! // Configure with builder to exclude specific schemes. //! let compressor = BtrBlocksCompressorBuilder::default() -//! .exclude_int([IntCode::Dict]) +//! .exclude([DictScheme.id()]) //! .build(); //! ``` //! //! 
[BtrBlocks]: https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf -pub use compressor::float::FloatCode; -use compressor::float::FloatCompressor; -pub use compressor::integer::IntCode; -use compressor::integer::IntCompressor; -pub use compressor::string::StringCode; -use compressor::string::StringCompressor; - mod builder; mod canonical_compressor; -mod compressor; +/// Compression scheme implementations. +pub mod compressor; mod ctx; mod sample; mod scheme; mod stats; +mod stats_cache; +pub use builder::ALL_SCHEMES; pub use builder::BtrBlocksCompressorBuilder; pub use canonical_compressor::BtrBlocksCompressor; -pub use canonical_compressor::CanonicalCompressor; -use compressor::Compressor; -use compressor::CompressorExt; -use compressor::MAX_CASCADE; pub use compressor::integer::IntegerStats; pub use compressor::integer::dictionary::dictionary_encode as integer_dictionary_encode; -use ctx::CompressorContext; -use ctx::Excludes; -use scheme::Scheme; -use scheme::SchemeExt; +pub use ctx::CompressorContext; +pub use scheme::Scheme; +pub use scheme::SchemeId; pub use stats::CompressorStats; pub use stats::GenerateStatsOptions; +pub use stats_cache::StatsCache; diff --git a/vortex-btrblocks/src/scheme.rs b/vortex-btrblocks/src/scheme.rs index 1b12a5930e5..d229c87ca11 100644 --- a/vortex-btrblocks/src/scheme.rs +++ b/vortex-btrblocks/src/scheme.rs @@ -1,135 +1,140 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Compression scheme traits. This is the interface each encoding implements to participate in -//! compression. -//! -//! [`Scheme`] is the core trait. Each encoding (e.g. BitPacking, ALP, Dict) implements it with -//! two key methods: [`Scheme::expected_compression_ratio`] to estimate how well it compresses -//! the data, and [`Scheme::compress`] to apply the encoding. Type-specific sub-traits -//! 
([`IntegerScheme`], [`FloatScheme`], [`StringScheme`]) bind schemes to the appropriate stats -//! and code types. -//! -//! [`SchemeExt`] provides the default ratio estimation strategy. It samples ~1% of the array -//! (minimum [`SAMPLE_SIZE`] values), compresses the sample, and returns the before/after byte -//! ratio. Schemes can override [`Scheme::expected_compression_ratio`] if they have a cheaper -//! heuristic. -//! -//! [`IntegerScheme`]: crate::compressor::integer::IntegerScheme -//! [`FloatScheme`]: crate::compressor::float::FloatScheme -//! [`StringScheme`]: crate::compressor::string::StringScheme -//! [`SAMPLE_SIZE`]: crate::stats::SAMPLE_SIZE +//! Unified compression scheme trait. +use std::fmt; use std::fmt::Debug; use std::hash::Hash; use std::hash::Hasher; use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_error::VortexResult; use crate::BtrBlocksCompressor; use crate::CompressorContext; -use crate::CompressorStats; +use crate::StatsCache; +use crate::sample::sample; use crate::sample::sample_count_approx_one_percent; use crate::stats::SAMPLE_SIZE; -/// Top-level compression scheme trait. +/// Unique identifier for a compression scheme. /// -/// Variants are specialized for each data type, e.g. see `IntegerScheme`, `FloatScheme`, etc. -pub trait Scheme: Debug { - /// Type of the stats generated by the compression scheme. - type StatsType: CompressorStats; - /// Type of the code used to uniquely identify the compression scheme. - type CodeType: Copy + Eq + Hash + Ord; +/// `SchemeId` is opaque — the only way to obtain one is through [`Scheme::id()`], which is a +/// provided method that wraps [`Scheme::scheme_name()`]. There is no public constructor. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct SchemeId { + pub(crate) name: &'static str, +} + +impl fmt::Display for SchemeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.name) + } +} + +/// Unified compression scheme trait. 
+/// +/// Implementors provide [`scheme_name`](Scheme::scheme_name) to declare their identity. The +/// [`id`](Scheme::id) method is derived automatically and cannot be meaningfully overridden by +/// external crates (since [`SchemeId`] has no public constructor). +pub trait Scheme: Debug + Send + Sync { + /// The globally unique name for this scheme (e.g. `"vortex.int.bitpacking"`). + fn scheme_name(&self) -> &'static str; + + /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name). + fn id(&self) -> SchemeId { + SchemeId { + name: self.scheme_name(), + } + } - /// Scheme unique identifier. - fn code(&self) -> Self::CodeType; + /// Whether this scheme can compress the given canonical array. + fn matches(&self, canonical: &Canonical) -> bool; - /// True if this is the singular Constant scheme for this data type. + /// True if this scheme detects constant arrays. fn is_constant(&self) -> bool { false } - /// Estimate the compression ratio for running this scheme (and its children) - /// for the given input. - /// - /// Depth is the depth in the encoding tree we've already reached before considering this - /// scheme. - /// - /// Returns the estimated compression ratio as well as the tree of compressors to use. + /// Estimate the compression ratio for this scheme on the given array. fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[Self::CodeType], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult { - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + let _ = cache; + estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) } - /// Compress the input with this scheme, yielding a new array. + /// Compress the array using this scheme. 
fn compress( &self, compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, + array: &ArrayRef, ctx: CompressorContext, - excludes: &[Self::CodeType], + cache: &mut StatsCache, + excludes: &[SchemeId], ) -> VortexResult; } -impl PartialEq for dyn Scheme { +impl PartialEq for dyn Scheme { fn eq(&self, other: &Self) -> bool { - self.code() == other.code() + self.id() == other.id() } } -impl Eq for dyn Scheme {} -impl Hash for dyn Scheme { + +impl Eq for dyn Scheme {} + +impl Hash for dyn Scheme { fn hash(&self, state: &mut H) { - self.code().hash(state) + self.id().hash(state); } } -/// Extension trait providing sampling-based compression ratio estimation for schemes. -pub trait SchemeExt: Scheme { - /// Estimates compression ratio by compressing a sample of the data. - /// - /// This method samples approximately 1% of the data (with a minimum of 1024 values) - /// and compresses it to estimate the overall compression ratio. - fn estimate_compression_ratio_with_sampling( - &self, - btr_blocks_compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[Self::CodeType], - ) -> VortexResult { - let sample = if ctx.is_sample { - stats.clone() - } else { - let source_len = stats.source().len(); - let sample_count = sample_count_approx_one_percent(source_len); - - tracing::trace!( - "Sampling {} values out of {}", - SAMPLE_SIZE as u64 * sample_count as u64, - source_len - ); - - stats.sample(SAMPLE_SIZE, sample_count) - }; - - let after = self - .compress(btr_blocks_compressor, &sample, ctx.as_sample(), excludes)? - .nbytes(); - let before = sample.source().nbytes(); - - tracing::debug!( - "estimate_compression_ratio_with_sampling(compressor={self:#?} ctx={ctx:?}) = {}", - before as f64 / after as f64 +/// Estimates compression ratio by compressing a ~1% sample of the data. 
+pub fn estimate_compression_ratio_with_sampling( + scheme: &S, + compressor: &BtrBlocksCompressor, + array: &ArrayRef, + ctx: CompressorContext, + excludes: &[SchemeId], +) -> VortexResult { + let sample_array = if ctx.is_sample { + array.clone() + } else { + let source_len = array.len(); + let sample_count = sample_count_approx_one_percent(source_len); + + tracing::trace!( + "Sampling {} values out of {}", + SAMPLE_SIZE as u64 * sample_count as u64, + source_len ); - Ok(before as f64 / after as f64) - } + sample(array, SAMPLE_SIZE, sample_count) + }; + + let mut sample_cache = StatsCache::new(); + let after = scheme + .compress( + compressor, + &sample_array, + ctx.as_sample(), + &mut sample_cache, + excludes, + )? + .nbytes(); + let before = sample_array.nbytes(); + + tracing::debug!( + "estimate_compression_ratio_with_sampling(compressor={scheme:#?} ctx={ctx:?}) = {}", + before as f64 / after as f64 + ); + + Ok(before as f64 / after as f64) } - -// Blanket implementation for all Scheme types -impl SchemeExt for T {} diff --git a/vortex-btrblocks/src/stats_cache.rs b/vortex-btrblocks/src/stats_cache.rs new file mode 100644 index 00000000000..f0121f70648 --- /dev/null +++ b/vortex-btrblocks/src/stats_cache.rs @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Per-compression-site statistics cache. +//! +//! [`StatsCache`] is a [`TypeId`]-keyed container created fresh for each [`choose_and_compress`] +//! call. It stores arbitrary stats types (e.g., [`IntegerStats`], [`FloatStats`]) so that +//! multiple schemes evaluated at the same compression site share the same computed statistics. +//! +//! [`choose_and_compress`]: crate::BtrBlocksCompressor::choose_and_compress +//! [`IntegerStats`]: crate::compressor::integer::IntegerStats +//! 
[`FloatStats`]: crate::compressor::float::FloatStats + +use std::any::Any; +use std::any::TypeId; + +use vortex_error::VortexExpect; + +/// Cache for compression statistics, keyed by concrete type. +/// +/// Schemes access stats via [`get_or_insert_with`], which returns a cached `&T` on subsequent +/// calls. The first scheme to request a given stats type triggers its computation; all later +/// schemes at the same site get the cached version. +/// +/// [`get_or_insert_with`]: StatsCache::get_or_insert_with +pub struct StatsCache { + entries: Vec<(TypeId, Box)>, +} + +impl StatsCache { + /// Creates an empty cache. + pub fn new() -> Self { + Self { + entries: Vec::new(), + } + } + + /// Returns a cached `&T`, computing and storing it on first access. + pub fn get_or_insert_with(&mut self, f: impl FnOnce() -> T) -> &T { + let type_id = TypeId::of::(); + let pos = self.entries.iter().position(|(id, _)| *id == type_id); + + if let Some(pos) = pos { + self.entries[pos] + .1 + .downcast_ref::() + .vortex_expect("TypeId mismatch in StatsCache") + } else { + self.entries.push((type_id, Box::new(f()))); + self.entries + .last() + .vortex_expect("just pushed") + .1 + .downcast_ref::() + .vortex_expect("TypeId mismatch in StatsCache") + } + } +} + +impl Default for StatsCache { + fn default() -> Self { + Self::new() + } +} diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 9645e09df34..4f52bdd917b 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -30,11 +30,13 @@ use vortex_array::session::ArrayRegistry; #[cfg(feature = "zstd")] use vortex_btrblocks::BtrBlocksCompressorBuilder; #[cfg(feature = "zstd")] -use vortex_btrblocks::FloatCode; +use vortex_btrblocks::Scheme; #[cfg(feature = "zstd")] -use vortex_btrblocks::IntCode; +use vortex_btrblocks::compressor::float; #[cfg(feature = "zstd")] -use vortex_btrblocks::StringCode; +use vortex_btrblocks::compressor::integer; +#[cfg(feature = "zstd")] +use 
vortex_btrblocks::compressor::string; use vortex_bytebool::ByteBool; use vortex_datetime_parts::DateTimeParts; use vortex_decimal_byte_parts::DecimalByteParts; @@ -195,18 +197,22 @@ impl WriteStrategyBuilder { /// GPU decompression. Without it, strings use interleaved Zstd compression. #[cfg(feature = "zstd")] pub fn with_cuda_compatible_encodings(mut self) -> Self { - let mut builder = BtrBlocksCompressorBuilder::default() - .exclude_int([IntCode::Sparse, IntCode::Rle]) - .exclude_float([FloatCode::Rle, FloatCode::Sparse]) - .exclude_string([StringCode::Dict, StringCode::Fsst]); + let mut builder = BtrBlocksCompressorBuilder::default().exclude([ + integer::SparseScheme.id(), + integer::RLE_INTEGER_SCHEME.id(), + float::RLE_FLOAT_SCHEME.id(), + float::NullDominated.id(), + string::DictScheme.id(), + string::FSSTScheme.id(), + ]); #[cfg(feature = "unstable_encodings")] { - builder = builder.include_string([StringCode::ZstdBuffers]); + builder = builder.include([string::ZstdBuffersScheme.id()]); } #[cfg(not(feature = "unstable_encodings"))] { - builder = builder.include_string([StringCode::Zstd]); + builder = builder.include([string::ZstdScheme.id()]); } self.compressor = Some(Arc::new(builder.build())); @@ -221,9 +227,11 @@ impl WriteStrategyBuilder { #[cfg(feature = "zstd")] pub fn with_compact_encodings(mut self) -> Self { let btrblocks = BtrBlocksCompressorBuilder::default() - .include_string([StringCode::Zstd]) - .include_int([IntCode::Pco]) - .include_float([FloatCode::Pco]) + .include([ + string::ZstdScheme.id(), + integer::PcoScheme.id(), + float::PcoScheme.id(), + ]) .build(); self.compressor = Some(Arc::new(btrblocks)); diff --git a/vortex-layout/src/layouts/compressed.rs b/vortex-layout/src/layouts/compressed.rs index 58ba381d415..76f684d36ec 100644 --- a/vortex-layout/src/layouts/compressed.rs +++ b/vortex-layout/src/layouts/compressed.rs @@ -11,7 +11,8 @@ use vortex_array::DynArray; use vortex_array::expr::stats::Stat; use 
vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; -use vortex_btrblocks::IntCode; +use vortex_btrblocks::Scheme; +use vortex_btrblocks::compressor::integer::DictScheme; use vortex_error::VortexResult; use vortex_io::runtime::Handle; @@ -69,7 +70,7 @@ impl CompressingStrategy { pub fn new_btrblocks(child: S, exclude_int_dict_encoding: bool) -> Self { let compressor = if exclude_int_dict_encoding { BtrBlocksCompressorBuilder::default() - .exclude_int([IntCode::Dict]) + .exclude([DictScheme.id()]) .build() } else { BtrBlocksCompressor::default() diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs index a532fc1adad..ab22ea36f4e 100644 --- a/vortex/src/lib.rs +++ b/vortex/src/lib.rs @@ -36,9 +36,8 @@ pub mod buffer { pub mod compressor { pub use vortex_btrblocks::BtrBlocksCompressor; pub use vortex_btrblocks::BtrBlocksCompressorBuilder; - pub use vortex_btrblocks::FloatCode; - pub use vortex_btrblocks::IntCode; - pub use vortex_btrblocks::StringCode; + pub use vortex_btrblocks::Scheme; + pub use vortex_btrblocks::SchemeId; } pub mod dtype { From d7e50165a0653f31a3619cb66212e61812d601c4 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Tue, 17 Mar 2026 17:50:42 -0400 Subject: [PATCH 2/9] make distinct values calc lazy + refactor typed stats Signed-off-by: Connor Tsui --- vortex-btrblocks/src/canonical_compressor.rs | 42 +++++ .../src/compressor/float/dictionary.rs | 24 ++- vortex-btrblocks/src/compressor/float/mod.rs | 26 ++- .../src/compressor/float/stats.rs | 102 ++++++---- .../src/compressor/integer/dictionary.rs | 16 +- .../src/compressor/integer/mod.rs | 90 +++++++-- .../src/compressor/integer/stats.rs | 178 ++++++++++++------ vortex-btrblocks/src/compressor/string.rs | 4 + vortex-btrblocks/src/scheme.rs | 5 + vortex-btrblocks/src/stats.rs | 10 +- 10 files changed, 353 insertions(+), 144 deletions(-) diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 
682af7a1c19..862fda7fbeb 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -17,6 +17,7 @@ use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::FixedSizeListArray; use vortex_array::arrays::ListArray; use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::Primitive; use vortex_array::arrays::StructArray; use vortex_array::arrays::TemporalArray; use vortex_array::arrays::listview::list_from_list_view; @@ -29,11 +30,16 @@ use vortex_error::VortexResult; use crate::BtrBlocksCompressorBuilder; use crate::CompressorContext; +use crate::CompressorStats; +use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeId; use crate::StatsCache; use crate::compressor::decimal::compress_decimal; +use crate::compressor::float::FloatStats; use crate::compressor::integer::DictScheme as IntDictScheme; +use crate::compressor::integer::IntegerStats; +use crate::compressor::string::StringStats; use crate::compressor::temporal::compress_temporal; /// The main compressor type implementing BtrBlocks-inspired compression. @@ -209,8 +215,44 @@ impl BtrBlocksCompressor { } let before_nbytes = array.nbytes(); + let needs_distinct = eligible.iter().any(|s| s.needs_distinct_values()); let mut cache = StatsCache::new(); + // Pre-populate the stats cache with the right `count_distinct_values` setting. + // This matches the old `gen_stats` behavior where distinct values were only computed + // when Dict was in the scheme list. 
+ if let Some(prim) = array.as_opt::() { + let prim = prim.to_primitive(); + if prim.ptype().is_int() { + cache.get_or_insert_with::(|| { + IntegerStats::generate_opts( + &prim, + GenerateStatsOptions { + count_distinct_values: needs_distinct, + }, + ) + }); + } else { + cache.get_or_insert_with::(|| { + FloatStats::generate_opts( + &prim, + GenerateStatsOptions { + count_distinct_values: needs_distinct, + }, + ) + }); + } + } else if array.as_opt::().is_some() { + cache.get_or_insert_with::(|| { + StringStats::generate_opts( + &array.to_varbinview(), + GenerateStatsOptions { + count_distinct_values: needs_distinct, + }, + ) + }); + } + if let Some(winner) = self.choose_scheme(&eligible, &array, ctx, &mut cache, excludes)? { let compressed = winner.compress(self, &array, ctx, &mut cache, excludes)?; if compressed.nbytes() < before_nbytes { diff --git a/vortex-btrblocks/src/compressor/float/dictionary.rs b/vortex-btrblocks/src/compressor/float/dictionary.rs index 33c024af4b3..c0f7d2993b3 100644 --- a/vortex-btrblocks/src/compressor/float/dictionary.rs +++ b/vortex-btrblocks/src/compressor/float/dictionary.rs @@ -12,13 +12,18 @@ use vortex_array::dtype::half::f16; use vortex_array::validity::Validity; use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; +use vortex_error::VortexExpect; -use super::stats::ErasedDistinctValues; +use super::stats::ErasedStats; use super::stats::FloatStats; macro_rules! typed_encode { ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ - let values: Buffer<$typ> = $typed.values.iter().map(|x| x.0).collect(); + let distinct = $typed.distinct.as_ref().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + let values: Buffer<$typ> = distinct.distinct_values.iter().map(|x| x.0).collect(); let max_code = values.len(); let codes = if max_code <= u8::MAX as usize { @@ -49,10 +54,10 @@ macro_rules! 
typed_encode { /// Compresses a floating-point array into a dictionary arrays according to attached stats. pub fn dictionary_encode(stats: &FloatStats) -> DictArray { let validity = stats.src.validity(); - match &stats.distinct_values { - ErasedDistinctValues::F16(typed) => typed_encode!(stats, typed, validity, f16), - ErasedDistinctValues::F32(typed) => typed_encode!(stats, typed, validity, f32), - ErasedDistinctValues::F64(typed) => typed_encode!(stats, typed, validity, f64), + match &stats.erased { + ErasedStats::F16(typed) => typed_encode!(stats, typed, validity, f16), + ErasedStats::F32(typed) => typed_encode!(stats, typed, validity, f32), + ErasedStats::F64(typed) => typed_encode!(stats, typed, validity, f64), } } @@ -118,7 +123,12 @@ mod tests { Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); let array = PrimitiveArray::new(values, validity); - let stats = FloatStats::generate(&array); + let stats = FloatStats::generate_opts( + &array, + crate::GenerateStatsOptions { + count_distinct_values: true, + }, + ); let dict_array = dictionary_encode(&stats); assert_eq!(dict_array.values().len(), 2); assert_eq!(dict_array.codes().len(), 5); diff --git a/vortex-btrblocks/src/compressor/float/mod.rs b/vortex-btrblocks/src/compressor/float/mod.rs index 7bbf78f1133..31d970bde34 100644 --- a/vortex-btrblocks/src/compressor/float/mod.rs +++ b/vortex-btrblocks/src/compressor/float/mod.rs @@ -173,11 +173,11 @@ impl Scheme for ConstantScheme { } // Can only have 1 distinct value. 
- if stats.distinct_values_count != 1 { - return Ok(0.0); + if stats.distinct_count().is_some_and(|count| count == 1) { + return Ok(stats.value_count as f64); } - Ok(stats.value_count as f64) + Ok(0.0) } fn compress( @@ -350,6 +350,10 @@ impl Scheme for DictScheme { is_float_primitive(canonical) } + fn needs_distinct_values(&self) -> bool { + true + } + fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, @@ -371,13 +375,19 @@ impl Scheme for DictScheme { return Ok(0.0); } - // If the array is high cardinality (>50% unique values) skip. - if stats.distinct_values_count > stats.value_count / 2 { - return Ok(0.0); + // If the array is high cardinality (>50% unique values), we do not want to compress as a + // dictionary. + if stats + .distinct_count() + .is_some_and(|count| count <= stats.value_count / 2) + { + // Take a sample and run compression on the sample to determine before/after size. + return estimate_compression_ratio_with_sampling( + self, compressor, array, ctx, excludes, + ); } - // Take a sample and run compression on the sample to determine before/after size. 
- estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + Ok(0.0) } fn compress( diff --git a/vortex-btrblocks/src/compressor/float/stats.rs b/vortex-btrblocks/src/compressor/float/stats.rs index 818ba40d2cf..a5e9de9e08e 100644 --- a/vortex-btrblocks/src/compressor/float/stats.rs +++ b/vortex-btrblocks/src/compressor/float/stats.rs @@ -27,43 +27,58 @@ use crate::compressor::rle::RLEStats; use crate::sample::sample; #[derive(Debug, Clone)] -pub struct DistinctValues { - pub values: HashSet, FxBuildHasher>, +pub struct DistinctInfo { + pub(super) distinct_values: HashSet, FxBuildHasher>, + distinct_count: u32, } #[derive(Debug, Clone)] -pub enum ErasedDistinctValues { - F16(DistinctValues), - F32(DistinctValues), - F64(DistinctValues), +pub struct TypedStats { + pub(super) distinct: Option>, +} + +#[derive(Debug, Clone)] +pub enum ErasedStats { + F16(TypedStats), + F32(TypedStats), + F64(TypedStats), +} + +impl ErasedStats { + /// Get the count of distinct values, if we have computed it already. + fn distinct_count(&self) -> Option { + match self { + ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count), + ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count), + ErasedStats::F64(x) => x.distinct.as_ref().map(|d| d.distinct_count), + } + } } macro_rules! impl_from_typed { - ($typ:ty, $variant:path) => { - impl From> for ErasedDistinctValues { - fn from(value: DistinctValues<$typ>) -> Self { - $variant(value) + ($T:ty, $variant:path) => { + impl From> for ErasedStats { + fn from(typed: TypedStats<$T>) -> Self { + $variant(typed) } } }; } -impl_from_typed!(f16, ErasedDistinctValues::F16); -impl_from_typed!(f32, ErasedDistinctValues::F32); -impl_from_typed!(f64, ErasedDistinctValues::F64); +impl_from_typed!(f16, ErasedStats::F16); +impl_from_typed!(f32, ErasedStats::F32); +impl_from_typed!(f64, ErasedStats::F64); /// Array of floating-point numbers and relevant stats for compression. 
#[derive(Debug, Clone)] pub struct FloatStats { - pub(crate) src: PrimitiveArray, + pub(super) src: PrimitiveArray, // cache for validity.false_count() - pub(crate) null_count: u32, + pub(super) null_count: u32, // cache for validity.true_count() - pub(crate) value_count: u32, - #[allow(dead_code)] - pub(crate) average_run_length: u32, - pub(crate) distinct_values: ErasedDistinctValues, - pub(crate) distinct_values_count: u32, + pub(super) value_count: u32, + pub(super) average_run_length: u32, + pub(super) erased: ErasedStats, } impl FloatStats { @@ -78,6 +93,11 @@ impl FloatStats { _ => vortex_panic!("cannot generate FloatStats from ptype {}", input.ptype()), } } + + /// Get the count of distinct values, if we have computed it already. + pub fn distinct_count(&self) -> Option { + self.erased.distinct_count() + } } impl CompressorStats for FloatStats { @@ -119,8 +139,8 @@ fn typed_float_stats( count_distinct_values: bool, ) -> VortexResult where - DistinctValues: Into, NativeValue: Hash + Eq, + TypedStats: Into, { // Special case: empty array if array.is_empty() { @@ -129,11 +149,7 @@ where null_count: 0, value_count: 0, average_run_length: 0, - distinct_values_count: 0, - distinct_values: DistinctValues { - values: HashSet::, FxBuildHasher>::with_hasher(FxBuildHasher), - } - .into(), + erased: TypedStats { distinct: None }.into(), }); } else if array.all_invalid()? { return Ok(FloatStats { @@ -141,11 +157,7 @@ where null_count: u32::try_from(array.len())?, value_count: 0, average_run_length: 0, - distinct_values_count: 0, - distinct_values: DistinctValues { - values: HashSet::, FxBuildHasher>::with_hasher(FxBuildHasher), - } - .into(), + erased: TypedStats { distinct: None }.into(), }); } @@ -208,7 +220,7 @@ where let null_count = u32::try_from(null_count)?; let value_count = u32::try_from(value_count)?; - let distinct_values_count = if count_distinct_values { + let distinct_count = if count_distinct_values { u32::try_from(distinct_values.len())? 
} else { u32::MAX @@ -217,11 +229,13 @@ where Ok(FloatStats { null_count, value_count, - distinct_values_count, src: array.clone(), average_run_length: value_count / runs, - distinct_values: DistinctValues { - values: distinct_values, + erased: TypedStats { + distinct: Some(DistinctInfo { + distinct_values, + distinct_count, + }), } .into(), }) @@ -243,12 +257,17 @@ mod tests { let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array(); let floats = floats.to_primitive(); - let stats = FloatStats::generate(&floats); + let stats = FloatStats::generate_opts( + &floats, + crate::GenerateStatsOptions { + count_distinct_values: true, + }, + ); assert_eq!(stats.value_count, 3); assert_eq!(stats.null_count, 0); assert_eq!(stats.average_run_length, 1); - assert_eq!(stats.distinct_values_count, 3); + assert_eq!(stats.distinct_count().unwrap(), 3); } #[test] @@ -258,11 +277,16 @@ mod tests { Validity::from_iter([false, true, true]), ); - let stats = FloatStats::generate(&floats); + let stats = FloatStats::generate_opts( + &floats, + crate::GenerateStatsOptions { + count_distinct_values: true, + }, + ); assert_eq!(stats.value_count, 2); assert_eq!(stats.null_count, 1); assert_eq!(stats.average_run_length, 1); - assert_eq!(stats.distinct_values_count, 2); + assert_eq!(stats.distinct_count().unwrap(), 2); } } diff --git a/vortex-btrblocks/src/compressor/integer/dictionary.rs b/vortex-btrblocks/src/compressor/integer/dictionary.rs index 70a29aaeedd..0106d98db1d 100644 --- a/vortex-btrblocks/src/compressor/integer/dictionary.rs +++ b/vortex-btrblocks/src/compressor/integer/dictionary.rs @@ -11,13 +11,18 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; +use vortex_error::VortexExpect; use super::IntegerStats; use super::stats::ErasedStats; macro_rules! 
typed_encode { ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ - let values: Buffer<$typ> = $typed.distinct_values.keys().map(|x| x.0).collect(); + let distinct = $typed.distinct.as_ref().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + let values: Buffer<$typ> = distinct.distinct_values.keys().map(|x| x.0).collect(); let max_code = values.len(); let codes = if max_code <= u8::MAX as usize { @@ -54,7 +59,7 @@ pub fn dictionary_encode(stats: &IntegerStats) -> DictArray { // We need to preserve the nullability somehow from the original let src_validity = stats.src.validity(); - match &stats.typed { + match &stats.erased { ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8), ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16), ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32), @@ -134,7 +139,12 @@ mod tests { Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); let array = PrimitiveArray::new(data, validity); - let stats = IntegerStats::generate(&array); + let stats = IntegerStats::generate_opts( + &array, + crate::GenerateStatsOptions { + count_distinct_values: true, + }, + ); let dict_array = dictionary_encode(&stats); assert_eq!(dict_array.values().len(), 2); assert_eq!(dict_array.codes().len(), 5); diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index 0aadd34d27f..00dce7dbc6c 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -35,6 +35,7 @@ use self::dictionary::dictionary_encode; use crate::BtrBlocksCompressor; use crate::CompressorContext; use crate::CompressorStats; +use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeId; use crate::StatsCache; @@ -185,7 +186,7 @@ impl Scheme for ConstantScheme { .get_or_insert_with::(|| 
IntegerStats::generate(&array.to_primitive())); // Only arrays with one distinct value can be constant compressed. - if stats.distinct_values_count > 1 { + if stats.distinct_count().is_none_or(|count| count > 1) { return Ok(0.0); } @@ -256,7 +257,7 @@ impl Scheme for FORScheme { } // Only apply when the min is not already zero. - if stats.typed.min_is_zero() { + if stats.erased.min_is_zero() { return Ok(0.0); } @@ -267,7 +268,7 @@ impl Scheme for FORScheme { .bit_width() .try_into() .vortex_expect("bit width must fit in u32"); - let for_bw = match stats.typed.max_minus_min().checked_ilog2() { + let for_bw = match stats.erased.max_minus_min().checked_ilog2() { Some(l) => l + 1, // If max-min == 0, we should use a different compression scheme as we don't want to // bitpack down to 0 bits. @@ -278,9 +279,9 @@ impl Scheme for FORScheme { // compared to BitPacking, don't use FOR since it has overhead (storing reference). // Only skip FOR when min >= 0, otherwise BitPacking can't apply directly. if let Some(max_log) = stats - .typed + .erased .max_ilog2() - .filter(|_| !stats.typed.min_is_negative()) + .filter(|_| !stats.erased.min_is_negative()) { let bitpack_bw = max_log + 1; if for_bw >= bitpack_bw { @@ -360,7 +361,7 @@ impl Scheme for ZigZagScheme { } // ZigZag is only useful when there are negative values. - if !stats.typed.min_is_negative() { + if !stats.erased.min_is_negative() { return Ok(0.0); } @@ -426,7 +427,7 @@ impl Scheme for BitPackingScheme { .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); // BitPacking only works for non-negative values. 
- if stats.typed.min_is_negative() { + if stats.erased.min_is_negative() { return Ok(0.0); } @@ -473,6 +474,10 @@ impl Scheme for SparseScheme { is_integer_primitive(canonical) } + fn needs_distinct_values(&self) -> bool { + true + } + fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, @@ -486,8 +491,14 @@ impl Scheme for SparseScheme { return Ok(0.0); } - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = cache.get_or_insert_with::(|| { + IntegerStats::generate_opts( + &array.to_primitive(), + GenerateStatsOptions { + count_distinct_values: true, + }, + ) + }); if stats.value_count == 0 { // All nulls should use ConstantScheme. @@ -500,7 +511,9 @@ impl Scheme for SparseScheme { } // See if the top value accounts for >= 90% of the set values. - let (_, top_count) = stats.typed.top_value_and_count(); + let (_, top_count) = stats.erased.most_frequent_value_and_count().vortex_expect( + "this must be present since `SparseScheme` declared that we need distinct values", + ); if top_count == stats.value_count { // top_value is the only value, should use ConstantScheme instead. @@ -529,7 +542,9 @@ impl Scheme for SparseScheme { let stats = cache .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); - let (top_pvalue, top_count) = stats.typed.top_value_and_count(); + let (top_pvalue, top_count) = stats.erased.most_frequent_value_and_count().vortex_expect( + "this must be present since `SparseScheme` declared that we need distinct values", + ); if top_count as usize == stats.src.len() { // top_value is the only value, use ConstantScheme. 
return Ok(ConstantArray::new( @@ -593,6 +608,10 @@ impl Scheme for DictScheme { is_integer_primitive(canonical) } + fn needs_distinct_values(&self) -> bool { + true + } + fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, @@ -606,23 +625,33 @@ impl Scheme for DictScheme { return Ok(0.0); } - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = cache.get_or_insert_with::(|| { + IntegerStats::generate_opts( + &array.to_primitive(), + GenerateStatsOptions { + count_distinct_values: true, + }, + ) + }); if stats.value_count == 0 { return Ok(0.0); } + let distinct_values_count = stats.distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + // If > 50% of the values are distinct, skip dict. - if stats.distinct_values_count > stats.value_count / 2 { + if distinct_values_count > stats.value_count / 2 { return Ok(0.0); } // Ignore nulls encoding for the estimate. We only focus on values. - let values_size = stats.source().ptype().bit_width() * stats.distinct_values_count as usize; + let values_size = stats.source().ptype().bit_width() * distinct_values_count as usize; // Assume codes are compressed RLE + BitPacking. - let codes_bw = usize::BITS - stats.distinct_values_count.leading_zeros(); + let codes_bw = usize::BITS - distinct_values_count.leading_zeros(); let n_runs = (stats.value_count / stats.average_run_length) as usize; @@ -776,10 +805,12 @@ impl Scheme for SequenceScheme { return Ok(0.0); } - // If the distinct_values_count was computed (!= u32::MAX) then all values in a sequence - // must be unique. - if stats.distinct_values_count != u32::MAX - && stats.distinct_values_count as usize != stats.src.len() + // If the distinct_values_count was computed, and not all values are unique, then this + // cannot be encoded as a sequence array. + if stats + .distinct_count() + // TODO(connor): Shouldn't this be `is_none_or`??? 
Why do things fail if not this? + .is_some_and(|count| count as usize != stats.src.len()) { return Ok(0.0); } @@ -887,6 +918,7 @@ mod tests { use super::SparseScheme; use crate::BtrBlocksCompressor; use crate::CompressorContext; + use crate::CompressorStats; use crate::Scheme; use crate::StatsCache; @@ -936,6 +968,15 @@ mod tests { let btr = BtrBlocksCompressor::default(); let array_ref = array.clone().into_array(); let mut cache = StatsCache::new(); + // SparseScheme needs distinct values. + cache.get_or_insert_with::(|| { + super::IntegerStats::generate_opts( + &array, + crate::GenerateStatsOptions { + count_distinct_values: true, + }, + ) + }); let compressed = SparseScheme.compress( &btr, &array_ref, @@ -962,6 +1003,15 @@ mod tests { let btr = BtrBlocksCompressor::default(); let array_ref = array.clone().into_array(); let mut cache = StatsCache::new(); + // SparseScheme needs distinct values. + cache.get_or_insert_with::(|| { + super::IntegerStats::generate_opts( + &array, + crate::GenerateStatsOptions { + count_distinct_values: true, + }, + ) + }); let compressed = SparseScheme.compress( &btr, &array_ref, diff --git a/vortex-btrblocks/src/compressor/integer/stats.rs b/vortex-btrblocks/src/compressor/integer/stats.rs index 111a1b7a155..4ce10e0d255 100644 --- a/vortex-btrblocks/src/compressor/integer/stats.rs +++ b/vortex-btrblocks/src/compressor/integer/stats.rs @@ -27,13 +27,36 @@ use crate::GenerateStatsOptions; use crate::compressor::rle::RLEStats; use crate::sample::sample; -#[derive(Clone, Debug)] +#[derive(Debug, Clone)] +pub struct DistinctInfo { + /// The unique values and their occurrences. + pub(super) distinct_values: HashMap, u32, FxBuildHasher>, + /// The count of unique values. + distinct_count: u32, + /// The most frequent value. + most_frequent_value: T, + /// The number of times the most frequent value occurs. 
+ top_frequency: u32, +} + +#[derive(Debug, Clone)] pub struct TypedStats { - pub min: T, - pub max: T, - pub top_value: T, - pub top_count: u32, - pub distinct_values: HashMap, u32, FxBuildHasher>, + min: T, + max: T, + pub(super) distinct: Option>, +} + +impl TypedStats { + /// Get the count of distinct values, if we have computed it already. + fn distinct_count(&self) -> Option { + Some(self.distinct.as_ref()?.distinct_count) + } + + /// Get the most commonly occurring value and its count, if we have computed it already. + fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> { + let distinct = self.distinct.as_ref()?; + Some((&distinct.most_frequent_value, distinct.top_frequency)) + } } /// Type-erased container for one of the [TypedStats] variants. @@ -116,17 +139,55 @@ impl ErasedStats { } } + /// Get the count of distinct values, if we have computed it already. + pub fn distinct_count(&self) -> Option { + match &self { + ErasedStats::U8(x) => x.distinct_count(), + ErasedStats::U16(x) => x.distinct_count(), + ErasedStats::U32(x) => x.distinct_count(), + ErasedStats::U64(x) => x.distinct_count(), + ErasedStats::I8(x) => x.distinct_count(), + ErasedStats::I16(x) => x.distinct_count(), + ErasedStats::I32(x) => x.distinct_count(), + ErasedStats::I64(x) => x.distinct_count(), + } + } + /// Get the most commonly occurring value and its count - pub fn top_value_and_count(&self) -> (PValue, u32) { + pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { match &self { - ErasedStats::U8(x) => (x.top_value.into(), x.top_count), - ErasedStats::U16(x) => (x.top_value.into(), x.top_count), - ErasedStats::U32(x) => (x.top_value.into(), x.top_count), - ErasedStats::U64(x) => (x.top_value.into(), x.top_count), - ErasedStats::I8(x) => (x.top_value.into(), x.top_count), - ErasedStats::I16(x) => (x.top_value.into(), x.top_count), - ErasedStats::I32(x) => (x.top_value.into(), x.top_count), - ErasedStats::I64(x) => (x.top_value.into(), x.top_count), + 
ErasedStats::U8(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::U16(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::U32(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::U64(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::I8(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::I16(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::I32(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::I64(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } } } } @@ -159,8 +220,7 @@ pub struct IntegerStats { // cache for validity.true_count() pub(super) value_count: u32, pub(super) average_run_length: u32, - pub(super) distinct_values_count: u32, - pub(crate) typed: ErasedStats, + pub(super) erased: ErasedStats, } impl IntegerStats { @@ -172,6 +232,16 @@ impl IntegerStats { typed_int_stats::(input, opts.count_distinct_values) }) } + + /// Get the count of distinct values, if we have computed it already. + pub fn distinct_count(&self) -> Option { + self.erased.distinct_count() + } + + /// Get the most commonly occurring value and its count, if we have computed it already. 
+ pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { + self.erased.most_frequent_value_and_count() + } } impl CompressorStats for IntegerStats { @@ -224,13 +294,10 @@ where null_count: 0, value_count: 0, average_run_length: 0, - distinct_values_count: 0, - typed: TypedStats { + erased: TypedStats { min: T::max_value(), max: T::min_value(), - top_value: T::default(), - top_count: 0, - distinct_values: HashMap::with_hasher(FxBuildHasher), + distinct: None, } .into(), }); @@ -240,13 +307,10 @@ where null_count: u32::try_from(array.len())?, value_count: 0, average_run_length: 0, - distinct_values_count: 0, - typed: TypedStats { + erased: TypedStats { min: T::max_value(), max: T::min_value(), - top_value: T::default(), - top_count: 0, - distinct_values: HashMap::with_hasher(FxBuildHasher), + distinct: None, } .into(), }); @@ -329,23 +393,7 @@ where } } - let (top_value, top_count) = if count_distinct_values { - let (&top_value, &top_count) = loop_state - .distinct_values - .iter() - .max_by_key(|&(_, &count)| count) - .vortex_expect("non-empty"); - (top_value.0, top_count) - } else { - (T::default(), 0) - }; - let runs = loop_state.runs; - let distinct_values_count = if count_distinct_values { - u32::try_from(loop_state.distinct_values.len())? 
- } else { - u32::MAX - }; let min = array .statistics() @@ -357,13 +405,23 @@ where .compute_as::(Stat::Max) .vortex_expect("max should be computed"); - let typed = TypedStats { - min, - max, - distinct_values: loop_state.distinct_values, - top_value, - top_count, - }; + let distinct = count_distinct_values.then(|| { + let (&top_value, &top_count) = loop_state + .distinct_values + .iter() + .max_by_key(|&(_, &count)| count) + .vortex_expect("we know this is non-empty"); + + DistinctInfo { + distinct_count: u32::try_from(loop_state.distinct_values.len()) + .vortex_expect("there are more than `u32::MAX` distinct values"), + most_frequent_value: top_value.0, + top_frequency: top_count, + distinct_values: loop_state.distinct_values, + } + }); + + let typed = TypedStats { min, max, distinct }; let null_count = u32::try_from(null_count)?; let value_count = u32::try_from(value_count)?; @@ -373,8 +431,7 @@ where null_count, value_count, average_run_length: value_count / runs, - distinct_values_count, - typed: typed.into(), + erased: typed.into(), }) } @@ -469,7 +526,7 @@ mod tests { fn test_naive_count_distinct_values() -> VortexResult<()> { let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable); let stats = typed_int_stats::(&array, true)?; - assert_eq!(stats.distinct_values_count, 2); + assert_eq!(stats.distinct_count().unwrap(), 2); Ok(()) } @@ -480,7 +537,7 @@ mod tests { Validity::from(BitBuffer::from(vec![true, false])), ); let stats = typed_int_stats::(&array, true)?; - assert_eq!(stats.distinct_values_count, 1); + assert_eq!(stats.distinct_count().unwrap(), 1); Ok(()) } @@ -488,7 +545,7 @@ mod tests { fn test_count_distinct_values() -> VortexResult<()> { let array = PrimitiveArray::new((0..128u8).collect::>(), Validity::NonNullable); let stats = typed_int_stats::(&array, true)?; - assert_eq!(stats.distinct_values_count, 128); + assert_eq!(stats.distinct_count().unwrap(), 128); Ok(()) } @@ -501,7 +558,7 @@ mod tests { )), ); let stats = 
typed_int_stats::(&array, true)?; - assert_eq!(stats.distinct_values_count, 64); + assert_eq!(stats.distinct_count().unwrap(), 64); Ok(()) } @@ -509,11 +566,16 @@ mod tests { fn test_integer_stats_leading_nulls() { let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true])); - let stats = IntegerStats::generate(&ints); + let stats = IntegerStats::generate_opts( + &ints, + crate::GenerateStatsOptions { + count_distinct_values: true, + }, + ); assert_eq!(stats.value_count, 2); assert_eq!(stats.null_count, 1); assert_eq!(stats.average_run_length, 1); - assert_eq!(stats.distinct_values_count, 2); + assert_eq!(stats.distinct_count().unwrap(), 2); } } diff --git a/vortex-btrblocks/src/compressor/string.rs b/vortex-btrblocks/src/compressor/string.rs index 48b6b439e33..e4de7bfcfa8 100644 --- a/vortex-btrblocks/src/compressor/string.rs +++ b/vortex-btrblocks/src/compressor/string.rs @@ -191,6 +191,10 @@ impl Scheme for DictScheme { is_utf8_string(canonical) } + fn needs_distinct_values(&self) -> bool { + true + } + fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, diff --git a/vortex-btrblocks/src/scheme.rs b/vortex-btrblocks/src/scheme.rs index d229c87ca11..96bd60adbdf 100644 --- a/vortex-btrblocks/src/scheme.rs +++ b/vortex-btrblocks/src/scheme.rs @@ -58,6 +58,11 @@ pub trait Scheme: Debug + Send + Sync { false } + /// Whether this scheme requires distinct-value statistics to be pre-computed. + fn needs_distinct_values(&self) -> bool { + false + } + /// Estimate the compression ratio for this scheme on the given array. fn expected_compression_ratio( &self, diff --git a/vortex-btrblocks/src/stats.rs b/vortex-btrblocks/src/stats.rs index b3e25cfb8d6..e3421991f1e 100644 --- a/vortex-btrblocks/src/stats.rs +++ b/vortex-btrblocks/src/stats.rs @@ -8,6 +8,7 @@ use std::fmt::Debug; use vortex_array::vtable::VTable; /// Configures how stats are generated. 
+#[derive(Default)] pub struct GenerateStatsOptions { /// Should distinct values should be counted during stats generation. pub count_distinct_values: bool, @@ -15,15 +16,6 @@ pub struct GenerateStatsOptions { // should this be scheme-specific? } -impl Default for GenerateStatsOptions { - fn default() -> Self { - Self { - count_distinct_values: true, - // count_runs: true, - } - } -} - /// The size of each sampled run. pub(crate) const SAMPLE_SIZE: u32 = 64; /// The number of sampled runs. From f78d660ef1c3c5bc198755d7b68c77249d0aae33 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Wed, 18 Mar 2026 10:32:43 -0400 Subject: [PATCH 3/9] add `ArrayAndStats` Signed-off-by: Connor Tsui clean up Signed-off-by: Connor Tsui --- vortex-btrblocks/benches/dict_encode.rs | 1 - vortex-btrblocks/benches/stats_calc.rs | 1 - vortex-btrblocks/src/builder.rs | 3 +- vortex-btrblocks/src/canonical_compressor.rs | 68 ++--- .../src/compressor/float/dictionary.rs | 1 - vortex-btrblocks/src/compressor/float/mod.rs | 118 +++------ .../src/compressor/float/stats.rs | 46 ++-- .../src/compressor/integer/dictionary.rs | 1 - .../src/compressor/integer/mod.rs | 243 ++++++------------ .../src/compressor/integer/stats.rs | 27 +- vortex-btrblocks/src/compressor/rle.rs | 16 +- vortex-btrblocks/src/compressor/string.rs | 115 ++++----- vortex-btrblocks/src/ctx.rs | 4 + vortex-btrblocks/src/lib.rs | 3 +- vortex-btrblocks/src/scheme.rs | 50 ++-- vortex-btrblocks/src/stats.rs | 53 ++-- vortex-btrblocks/src/stats_cache.rs | 111 ++++++-- 17 files changed, 362 insertions(+), 499 deletions(-) diff --git a/vortex-btrblocks/benches/dict_encode.rs b/vortex-btrblocks/benches/dict_encode.rs index 9bed0f11936..8d7c6fc6297 100644 --- a/vortex-btrblocks/benches/dict_encode.rs +++ b/vortex-btrblocks/benches/dict_encode.rs @@ -9,7 +9,6 @@ use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::builders::dict::dict_encode; use vortex_array::validity::Validity; -use 
vortex_btrblocks::CompressorStats; use vortex_btrblocks::IntegerStats; use vortex_btrblocks::integer_dictionary_encode; use vortex_buffer::BufferMut; diff --git a/vortex-btrblocks/benches/stats_calc.rs b/vortex-btrblocks/benches/stats_calc.rs index a272c16210c..b3070598d6b 100644 --- a/vortex-btrblocks/benches/stats_calc.rs +++ b/vortex-btrblocks/benches/stats_calc.rs @@ -10,7 +10,6 @@ mod benchmarks { use divan::Bencher; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; - use vortex_btrblocks::CompressorStats; use vortex_btrblocks::GenerateStatsOptions; use vortex_btrblocks::IntegerStats; use vortex_buffer::Buffer; diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 6b88f759f7f..ff3229c4a37 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -17,9 +17,10 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ // Integer schemes. &crate::compressor::integer::UncompressedScheme as &dyn Scheme, &crate::compressor::integer::ConstantScheme, + // NOTE: For must precede BitPacking to avoid unnecessary patches. 
&crate::compressor::integer::FORScheme, - &crate::compressor::integer::ZigZagScheme, &crate::compressor::integer::BitPackingScheme, + &crate::compressor::integer::ZigZagScheme, &crate::compressor::integer::SparseScheme, &crate::compressor::integer::DictScheme, &crate::compressor::integer::RunEndScheme, diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 862fda7fbeb..e56032c9c08 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -17,7 +17,6 @@ use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::FixedSizeListArray; use vortex_array::arrays::ListArray; use vortex_array::arrays::ListViewArray; -use vortex_array::arrays::Primitive; use vortex_array::arrays::StructArray; use vortex_array::arrays::TemporalArray; use vortex_array::arrays::listview::list_from_list_view; @@ -28,18 +27,14 @@ use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; use vortex_error::VortexResult; +use crate::ArrayAndStats; use crate::BtrBlocksCompressorBuilder; use crate::CompressorContext; -use crate::CompressorStats; use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeId; -use crate::StatsCache; use crate::compressor::decimal::compress_decimal; -use crate::compressor::float::FloatStats; use crate::compressor::integer::DictScheme as IntDictScheme; -use crate::compressor::integer::IntegerStats; -use crate::compressor::string::StringStats; use crate::compressor::temporal::compress_temporal; /// The main compressor type implementing BtrBlocks-inspired compression. @@ -215,53 +210,26 @@ impl BtrBlocksCompressor { } let before_nbytes = array.nbytes(); - let needs_distinct = eligible.iter().any(|s| s.needs_distinct_values()); - let mut cache = StatsCache::new(); - - // Pre-populate the stats cache with the right `count_distinct_values` setting. 
- // This matches the old `gen_stats` behavior where distinct values were only computed - // when Dict was in the scheme list. - if let Some(prim) = array.as_opt::() { - let prim = prim.to_primitive(); - if prim.ptype().is_int() { - cache.get_or_insert_with::(|| { - IntegerStats::generate_opts( - &prim, - GenerateStatsOptions { - count_distinct_values: needs_distinct, - }, - ) - }); - } else { - cache.get_or_insert_with::(|| { - FloatStats::generate_opts( - &prim, - GenerateStatsOptions { - count_distinct_values: needs_distinct, - }, - ) - }); - } - } else if array.as_opt::().is_some() { - cache.get_or_insert_with::(|| { - StringStats::generate_opts( - &array.to_varbinview(), - GenerateStatsOptions { - count_distinct_values: needs_distinct, - }, - ) + let merged_opts = eligible + .iter() + .fold(GenerateStatsOptions::default(), |acc, s| { + acc.merge(s.stats_options()) }); - } - if let Some(winner) = self.choose_scheme(&eligible, &array, ctx, &mut cache, excludes)? { - let compressed = winner.compress(self, &array, ctx, &mut cache, excludes)?; + let mut ctx = ctx; + ctx.stats_options = merged_opts; + + let mut data = ArrayAndStats::new(array, merged_opts); + + if let Some(winner) = self.choose_scheme(&eligible, &mut data, ctx, excludes)? { + let compressed = winner.compress(self, &mut data, ctx, excludes)?; if compressed.nbytes() < before_nbytes { return Ok(compressed); } } // No scheme improved on the original. 
- Ok(array) + Ok(data.into_array()) } /// Evaluates each candidate scheme and returns the one with the best compression ratio @@ -269,15 +237,14 @@ impl BtrBlocksCompressor { fn choose_scheme( &self, schemes: &[&'static dyn Scheme], - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult> { let mut best: Option<(&'static dyn Scheme, f64)> = None; for &scheme in schemes { - let ratio = self.evaluate_scheme(scheme, array, ctx, cache, excludes)?; + let ratio = self.evaluate_scheme(scheme, data, ctx, excludes)?; if is_valid_ratio(ratio) && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) { best = Some((scheme, ratio)); } @@ -290,12 +257,11 @@ impl BtrBlocksCompressor { fn evaluate_scheme( &self, scheme: &'static dyn Scheme, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let ratio = scheme.expected_compression_ratio(self, array, ctx, cache, excludes)?; + let ratio = scheme.expected_compression_ratio(self, data, ctx, excludes)?; tracing::debug!( scheme = %scheme.id(), diff --git a/vortex-btrblocks/src/compressor/float/dictionary.rs b/vortex-btrblocks/src/compressor/float/dictionary.rs index c0f7d2993b3..3370c20cf3b 100644 --- a/vortex-btrblocks/src/compressor/float/dictionary.rs +++ b/vortex-btrblocks/src/compressor/float/dictionary.rs @@ -112,7 +112,6 @@ mod tests { use vortex_buffer::buffer; use super::super::FloatStats; - use crate::CompressorStats; use crate::compressor::float::dictionary::dictionary_encode; #[test] diff --git a/vortex-btrblocks/src/compressor/float/mod.rs b/vortex-btrblocks/src/compressor/float/mod.rs index 31d970bde34..f979a4b3275 100644 --- a/vortex-btrblocks/src/compressor/float/mod.rs +++ b/vortex-btrblocks/src/compressor/float/mod.rs @@ -31,13 +31,12 @@ use super::integer::DictScheme as IntDictScheme; use super::integer::RunEndScheme as IntRunEndScheme; use 
super::integer::SequenceScheme as IntSequenceScheme; use super::integer::SparseScheme as IntSparseScheme; +use crate::ArrayAndStats; use crate::BtrBlocksCompressor; use crate::CompressorContext; -use crate::CompressorStats; use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeId; -use crate::StatsCache; use crate::compressor::patches::compress_patches; use crate::compressor::rle; use crate::compressor::rle::RLEScheme; @@ -119,9 +118,8 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - _array: &ArrayRef, + _data: &mut ArrayAndStats, _ctx: CompressorContext, - _cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { Ok(1.0) @@ -130,12 +128,11 @@ impl Scheme for UncompressedScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - _cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - Ok(array.clone()) + Ok(data.array().clone()) } } @@ -155,9 +152,8 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { // Never select Constant when sampling. 
@@ -165,8 +161,7 @@ impl Scheme for ConstantScheme { return Ok(0.0); } - let stats = - cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let stats = data.float_stats(); if stats.null_count as usize == stats.src.len() || stats.value_count == 0 { return Ok(0.0); @@ -183,13 +178,11 @@ impl Scheme for ConstantScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = - cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let stats = data.float_stats(); let scalar_idx = (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); @@ -225,13 +218,11 @@ impl Scheme for ALPScheme { fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = - cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let stats = data.float_stats(); // We don't support ALP for f16. 
if stats.source().ptype() == PType::F16 { @@ -244,19 +235,17 @@ impl Scheme for ALPScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = - cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let stats = data.float_stats(); let alp_encoded = alp_encode(&stats.source().to_primitive(), None)?; let alp = alp_encoded.as_::(); @@ -297,31 +286,27 @@ impl Scheme for ALPRDScheme { fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = - cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let stats = data.float_stats(); if stats.source().ptype() == PType::F16 { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) } fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = - cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let stats = data.float_stats(); let encoder = match stats.source().ptype() { PType::F32 => RDEncoder::new(stats.source().as_slice::()), @@ -350,26 +335,20 @@ impl Scheme for DictScheme { is_float_primitive(canonical) } - fn needs_distinct_values(&self) -> bool { - true + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } } fn 
expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = cache.get_or_insert_with::(|| { - FloatStats::generate_opts( - &array.to_primitive(), - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - }); + let stats = data.float_stats(); if stats.value_count == 0 { return Ok(0.0); @@ -382,9 +361,7 @@ impl Scheme for DictScheme { .is_some_and(|count| count <= stats.value_count / 2) { // Take a sample and run compression on the sample to determine before/after size. - return estimate_compression_ratio_with_sampling( - self, compressor, array, ctx, excludes, - ); + return estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes); } Ok(0.0) @@ -393,19 +370,11 @@ impl Scheme for DictScheme { fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache.get_or_insert_with::(|| { - FloatStats::generate_opts( - &array.to_primitive(), - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - }); + let stats = data.float_stats(); let dict = dictionary_encode(stats); let has_all_values_referenced = dict.has_all_values_referenced(); @@ -447,9 +416,8 @@ impl Scheme for NullDominated { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. @@ -457,8 +425,7 @@ impl Scheme for NullDominated { return Ok(0.0); } - let stats = - cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let stats = data.float_stats(); if stats.value_count == 0 { // All nulls should use ConstantScheme. 
@@ -477,15 +444,13 @@ impl Scheme for NullDominated { fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); - let stats = - cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let stats = data.float_stats(); // We pass None as we only run this pathway for NULL-dominated float arrays. let sparse_encoded = SparseArray::encode(&stats.src.clone().into_array(), None)?; @@ -524,13 +489,11 @@ impl Scheme for PcoScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = - cache.get_or_insert_with::(|| FloatStats::generate(&array.to_primitive())); + let stats = data.float_stats(); Ok(vortex_pco::PcoArray::from_primitive( stats.source(), pco::DEFAULT_COMPRESSION_LEVEL, @@ -558,10 +521,11 @@ mod tests { use vortex_error::VortexResult; use super::RLE_FLOAT_SCHEME; + use crate::ArrayAndStats; use crate::BtrBlocksCompressor; use crate::CompressorContext; + use crate::GenerateStatsOptions; use crate::Scheme; - use crate::StatsCache; #[test] fn test_empty() -> VortexResult<()> { @@ -603,14 +567,10 @@ mod tests { let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); - let mut cache = StatsCache::new(); - let compressed = RLE_FLOAT_SCHEME.compress( - &btr, - &array.into_array(), - CompressorContext::default(), - &mut cache, - &[], - )?; + let array_ref = array.into_array(); + let mut data = ArrayAndStats::new(array_ref.clone(), GenerateStatsOptions::default()); + let compressed = + RLE_FLOAT_SCHEME.compress(&btr, &mut data, CompressorContext::default(), &[])?; let expected = Buffer::copy_from(&values).into_array(); assert_arrays_eq!(compressed.as_ref(), 
expected.as_ref()); diff --git a/vortex-btrblocks/src/compressor/float/stats.rs b/vortex-btrblocks/src/compressor/float/stats.rs index a5e9de9e08e..9c114a113fa 100644 --- a/vortex-btrblocks/src/compressor/float/stats.rs +++ b/vortex-btrblocks/src/compressor/float/stats.rs @@ -6,9 +6,6 @@ use std::hash::Hash; use itertools::Itertools; use num_traits::Float; use rustc_hash::FxBuildHasher; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::Primitive; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::primitive::NativeValue; use vortex_array::dtype::NativePType; @@ -21,10 +18,8 @@ use vortex_error::vortex_panic; use vortex_mask::AllOr; use vortex_utils::aliases::hash_set::HashSet; -use crate::CompressorStats; use crate::GenerateStatsOptions; use crate::compressor::rle::RLEStats; -use crate::sample::sample; #[derive(Debug, Clone)] pub struct DistinctInfo { @@ -100,24 +95,23 @@ impl FloatStats { } } -impl CompressorStats for FloatStats { - type ArrayVTable = Primitive; +impl FloatStats { + /// Generates stats with default options. + pub fn generate(input: &PrimitiveArray) -> Self { + Self::generate_opts(input, GenerateStatsOptions::default()) + } - fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self { + /// Generates stats with provided options. + pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self { Self::generate_opts_fallible(input, opts) .vortex_expect("FloatStats::generate_opts should not fail") } - fn source(&self) -> &PrimitiveArray { + /// Returns the underlying source array. 
+ #[expect(clippy::same_name_method)] + pub fn source(&self) -> &PrimitiveArray { &self.src } - - fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self { - let sampled = - sample(&self.src.clone().into_array(), sample_size, sample_count).to_primitive(); - - Self::generate_opts(&sampled, opts) - } } impl RLEStats for FloatStats { @@ -220,24 +214,19 @@ where let null_count = u32::try_from(null_count)?; let value_count = u32::try_from(value_count)?; - let distinct_count = if count_distinct_values { - u32::try_from(distinct_values.len())? - } else { - u32::MAX - }; + + let distinct = count_distinct_values.then(|| DistinctInfo { + distinct_count: u32::try_from(distinct_values.len()) + .vortex_expect("more than u32::MAX distinct values"), + distinct_values, + }); Ok(FloatStats { null_count, value_count, src: array.clone(), average_run_length: value_count / runs, - erased: TypedStats { - distinct: Some(DistinctInfo { - distinct_values, - distinct_count, - }), - } - .into(), + erased: TypedStats { distinct }.into(), }) } @@ -250,7 +239,6 @@ mod tests { use vortex_buffer::buffer; use super::FloatStats; - use crate::CompressorStats; #[test] fn test_float_stats() { diff --git a/vortex-btrblocks/src/compressor/integer/dictionary.rs b/vortex-btrblocks/src/compressor/integer/dictionary.rs index 0106d98db1d..d12ea2b6233 100644 --- a/vortex-btrblocks/src/compressor/integer/dictionary.rs +++ b/vortex-btrblocks/src/compressor/integer/dictionary.rs @@ -129,7 +129,6 @@ mod tests { use super::IntegerStats; use super::dictionary_encode; - use crate::CompressorStats; #[test] fn test_dict_encode_integer_stats() { diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index 00dce7dbc6c..fdc7c6864dc 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -32,13 +32,12 @@ use vortex_zigzag::ZigZagArray; use vortex_zigzag::zigzag_encode; 
use self::dictionary::dictionary_encode; +use crate::ArrayAndStats; use crate::BtrBlocksCompressor; use crate::CompressorContext; -use crate::CompressorStats; use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeId; -use crate::StatsCache; use crate::compressor::patches::compress_patches; use crate::compressor::rle; use crate::compressor::rle::RLEScheme; @@ -135,9 +134,8 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - _array: &ArrayRef, + _data: &mut ArrayAndStats, _ctx: CompressorContext, - _cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { // No compression. @@ -147,12 +145,11 @@ impl Scheme for UncompressedScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - _cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - Ok(array.clone()) + Ok(data.array().clone()) } } @@ -172,9 +169,8 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { // Never yield ConstantScheme for a sample, it could be a false-positive. @@ -182,8 +178,7 @@ impl Scheme for ConstantScheme { return Ok(0.0); } - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); // Only arrays with one distinct value can be constant compressed. 
if stats.distinct_count().is_none_or(|count| count > 1) { @@ -196,13 +191,11 @@ impl Scheme for ConstantScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); let scalar_idx = (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); @@ -238,9 +231,8 @@ impl Scheme for FORScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { // Only apply if we are not at the leaf. @@ -248,8 +240,7 @@ impl Scheme for FORScheme { return Ok(0.0); } - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); // All-null cannot be FOR compressed. 
if stats.value_count == 0 { @@ -295,12 +286,11 @@ impl Scheme for FORScheme { fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - _cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let primitive = array.to_primitive(); + let primitive = data.array().to_primitive(); let for_array = FoRArray::encode(primitive)?; let biased = for_array.encoded().to_primitive(); @@ -311,15 +301,11 @@ impl Scheme for FORScheme { let leaf_ctx = CompressorContext { is_sample: ctx.is_sample, allowed_cascading: 0, + stats_options: ctx.stats_options, }; - let mut biased_cache = StatsCache::new(); - let compressed = BitPackingScheme.compress( - compressor, - &biased.into_array(), - leaf_ctx, - &mut biased_cache, - excludes, - )?; + let mut biased_data = ArrayAndStats::new(biased.into_array(), ctx.stats_options); + let compressed = + BitPackingScheme.compress(compressor, &mut biased_data, leaf_ctx, excludes)?; let for_compressed = FoRArray::try_new(compressed, for_array.reference_scalar().clone())?; for_compressed @@ -342,9 +328,8 @@ impl Scheme for ZigZagScheme { fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { // ZigZag is only useful when we cascade it with another encoding. @@ -352,8 +337,7 @@ impl Scheme for ZigZagScheme { return Ok(0.0); } - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); // Don't try and compress all-null arrays. if stats.value_count == 0 { @@ -366,19 +350,17 @@ impl Scheme for ZigZagScheme { } // Run compression on a sample to see how it performs. 
- estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); // Zigzag encode the values, then recursively compress the inner values. let zag = zigzag_encode(stats.src.clone())?; @@ -418,13 +400,11 @@ impl Scheme for BitPackingScheme { fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); // BitPacking only works for non-negative values. 
if stats.erased.min_is_negative() { @@ -436,19 +416,17 @@ impl Scheme for BitPackingScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) } fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); let histogram = bit_width_histogram(stats.source())?; let bw = find_best_bit_width(stats.source().ptype(), &histogram)?; @@ -474,16 +452,17 @@ impl Scheme for SparseScheme { is_integer_primitive(canonical) } - fn needs_distinct_values(&self) -> bool { - true + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } } fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. @@ -491,14 +470,12 @@ impl Scheme for SparseScheme { return Ok(0.0); } - let stats = cache.get_or_insert_with::(|| { - IntegerStats::generate_opts( - &array.to_primitive(), - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - }); + // We use `generate()` (not `generate_opts` with `count_distinct_values: true`) + // because the cache is pre-populated by `choose_and_compress` with the merged + // `stats_options` from all eligible schemes. Since this scheme declares + // `stats_options()` with `count_distinct_values: true`, the pre-populated stats + // will have distinct values computed. + let stats = data.integer_stats(); if stats.value_count == 0 { // All nulls should use ConstantScheme. 
@@ -532,15 +509,13 @@ impl Scheme for SparseScheme { fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); let (top_pvalue, top_count) = stats.erased.most_frequent_value_and_count().vortex_expect( "this must be present since `SparseScheme` declared that we need distinct values", @@ -608,16 +583,17 @@ impl Scheme for DictScheme { is_integer_primitive(canonical) } - fn needs_distinct_values(&self) -> bool { - true + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } } fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { // Dict should not be terminal. @@ -625,14 +601,7 @@ impl Scheme for DictScheme { return Ok(0.0); } - let stats = cache.get_or_insert_with::(|| { - IntegerStats::generate_opts( - &array.to_primitive(), - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - }); + let stats = data.integer_stats(); if stats.value_count == 0 { return Ok(0.0); @@ -669,15 +638,13 @@ impl Scheme for DictScheme { fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); // TODO(aduffy): we can be more prescriptive: we know that codes will EITHER be // RLE or FOR + BP. Cascading probably wastes some time here. 
@@ -719,13 +686,11 @@ impl Scheme for RunEndScheme { fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); // If the run length is below the threshold, drop it. if stats.average_run_length < RUN_END_THRESHOLD { @@ -737,21 +702,19 @@ impl Scheme for RunEndScheme { } // Run compression on a sample, see how it performs. - estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); // Run-end encode the ends. 
let (ends, values) = runend_encode(&stats.src); @@ -793,13 +756,11 @@ impl Scheme for SequenceScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); if stats.null_count > 0 { return Ok(0.0); @@ -825,13 +786,11 @@ impl Scheme for SequenceScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); if stats.null_count > 0 { vortex_bail!("sequence encoding does not support nulls"); @@ -853,13 +812,11 @@ impl Scheme for PcoScheme { fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); // Pco does not support I8 or U8. 
if matches!( @@ -869,19 +826,17 @@ impl Scheme for PcoScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) } fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| IntegerStats::generate(&array.to_primitive())); + let stats = data.integer_stats(); Ok(vortex_pco::PcoArray::from_primitive( stats.source(), @@ -916,11 +871,11 @@ mod tests { use super::RLE_INTEGER_SCHEME; use super::SequenceScheme; use super::SparseScheme; + use crate::ArrayAndStats; use crate::BtrBlocksCompressor; use crate::CompressorContext; - use crate::CompressorStats; + use crate::GenerateStatsOptions; use crate::Scheme; - use crate::StatsCache; #[test] fn test_empty() -> VortexResult<()> { @@ -966,24 +921,15 @@ mod tests { Validity::from_iter(vec![true, true, true, true, false]), ); let btr = BtrBlocksCompressor::default(); - let array_ref = array.clone().into_array(); - let mut cache = StatsCache::new(); // SparseScheme needs distinct values. 
- cache.get_or_insert_with::(|| { - super::IntegerStats::generate_opts( - &array, - crate::GenerateStatsOptions { - count_distinct_values: true, - }, - ) - }); - let compressed = SparseScheme.compress( - &btr, - &array_ref, - CompressorContext::default(), - &mut cache, - &[], - )?; + let mut data = ArrayAndStats::new( + array.clone().into_array(), + GenerateStatsOptions { + count_distinct_values: true, + }, + ); + let compressed = + SparseScheme.compress(&btr, &mut data, CompressorContext::default(), &[])?; assert!(compressed.is::()); let decoded = compressed.clone(); let expected = @@ -1001,24 +947,15 @@ mod tests { ]), ); let btr = BtrBlocksCompressor::default(); - let array_ref = array.clone().into_array(); - let mut cache = StatsCache::new(); // SparseScheme needs distinct values. - cache.get_or_insert_with::(|| { - super::IntegerStats::generate_opts( - &array, - crate::GenerateStatsOptions { - count_distinct_values: true, - }, - ) - }); - let compressed = SparseScheme.compress( - &btr, - &array_ref, - CompressorContext::default(), - &mut cache, - &[], - )?; + let mut data = ArrayAndStats::new( + array.clone().into_array(), + GenerateStatsOptions { + count_distinct_values: true, + }, + ); + let compressed = + SparseScheme.compress(&btr, &mut data, CompressorContext::default(), &[])?; assert!(compressed.is::()); let decoded = compressed.clone(); let expected = PrimitiveArray::new( @@ -1036,14 +973,9 @@ mod tests { let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some)); let btr = BtrBlocksCompressor::default(); let array_ref = array.into_array(); - let mut cache = StatsCache::new(); - let compressed = SequenceScheme.compress( - &btr, - &array_ref, - CompressorContext::default(), - &mut cache, - &[], - )?; + let mut data = ArrayAndStats::new(array_ref.clone(), GenerateStatsOptions::default()); + let compressed = + SequenceScheme.compress(&btr, &mut data, CompressorContext::default(), &[])?; assert!(compressed.is::()); let decoded = 
compressed; let expected = PrimitiveArray::from_option_iter(values.into_iter().map(Some)).into_array(); @@ -1061,14 +993,9 @@ mod tests { let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); let array_ref = array.into_array(); - let mut cache = StatsCache::new(); - let compressed = RLE_INTEGER_SCHEME.compress( - &btr, - &array_ref, - CompressorContext::default(), - &mut cache, - &[], - )?; + let mut data = ArrayAndStats::new(array_ref.clone(), GenerateStatsOptions::default()); + let compressed = + RLE_INTEGER_SCHEME.compress(&btr, &mut data, CompressorContext::default(), &[])?; let decoded = compressed; let expected = Buffer::copy_from(&values).into_array(); diff --git a/vortex-btrblocks/src/compressor/integer/stats.rs b/vortex-btrblocks/src/compressor/integer/stats.rs index 4ce10e0d255..f50fb5a548e 100644 --- a/vortex-btrblocks/src/compressor/integer/stats.rs +++ b/vortex-btrblocks/src/compressor/integer/stats.rs @@ -5,9 +5,6 @@ use std::hash::Hash; use num_traits::PrimInt; use rustc_hash::FxBuildHasher; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::Primitive; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::primitive::NativeValue; use vortex_array::dtype::IntegerPType; @@ -22,10 +19,8 @@ use vortex_error::VortexResult; use vortex_mask::AllOr; use vortex_utils::aliases::hash_map::HashMap; -use crate::CompressorStats; use crate::GenerateStatsOptions; use crate::compressor::rle::RLEStats; -use crate::sample::sample; #[derive(Debug, Clone)] pub struct DistinctInfo { @@ -244,24 +239,23 @@ impl IntegerStats { } } -impl CompressorStats for IntegerStats { - type ArrayVTable = Primitive; +impl IntegerStats { + /// Generates stats with default options. 
+ pub fn generate(input: &PrimitiveArray) -> Self { + Self::generate_opts(input, GenerateStatsOptions::default()) + } - fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self { + /// Generates stats with provided options. + pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self { Self::generate_opts_fallible(input, opts) .vortex_expect("IntegerStats::generate_opts should not fail") } - fn source(&self) -> &PrimitiveArray { + /// Returns the underlying source array. + #[expect(clippy::same_name_method)] + pub fn source(&self) -> &PrimitiveArray { &self.src } - - fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self { - let sampled = - sample(&self.src.clone().into_array(), sample_size, sample_count).to_primitive(); - - Self::generate_opts(&sampled, opts) - } } impl RLEStats for IntegerStats { @@ -520,7 +514,6 @@ mod tests { use super::IntegerStats; use super::typed_int_stats; - use crate::CompressorStats; #[test] fn test_naive_count_distinct_values() -> VortexResult<()> { diff --git a/vortex-btrblocks/src/compressor/rle.rs b/vortex-btrblocks/src/compressor/rle.rs index 562bba664ad..6e801732b37 100644 --- a/vortex-btrblocks/src/compressor/rle.rs +++ b/vortex-btrblocks/src/compressor/rle.rs @@ -12,11 +12,11 @@ use vortex_array::arrays::PrimitiveArray; use vortex_error::VortexResult; use vortex_fastlanes::RLEArray; +use crate::ArrayAndStats; use crate::BtrBlocksCompressor; use crate::CompressorContext; use crate::Scheme; use crate::SchemeId; -use crate::StatsCache; use crate::compressor::integer::DictScheme as IntDictScheme; use crate::scheme::estimate_compression_ratio_with_sampling; @@ -90,9 +90,8 @@ impl Scheme for RLEScheme { fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { // RLE is only useful when we cascade it 
with another encoding. @@ -100,7 +99,8 @@ impl Scheme for RLEScheme { return Ok(0.0); } - let stats = cache.get_or_insert_with::(|| C::generate_stats(array)); + let array = data.array().clone(); + let stats = data.get_or_insert_with::(|| C::generate_stats(&array)); // Don't compress all-null or empty arrays. if stats.value_count() == 0 { @@ -113,18 +113,18 @@ impl Scheme for RLEScheme { } // Run compression on a sample to see how it performs. - estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = cache.get_or_insert_with::(|| C::generate_stats(array)); + let array = data.array().clone(); + let stats = data.get_or_insert_with::(|| C::generate_stats(&array)); let rle_array = RLEArray::encode(RLEStats::source(stats))?; if ctx.allowed_cascading == 0 { diff --git a/vortex-btrblocks/src/compressor/string.rs b/vortex-btrblocks/src/compressor/string.rs index e4de7bfcfa8..46baa1ae349 100644 --- a/vortex-btrblocks/src/compressor/string.rs +++ b/vortex-btrblocks/src/compressor/string.rs @@ -12,7 +12,6 @@ use vortex_array::arrays::ConstantArray; use vortex_array::arrays::DictArray; use vortex_array::arrays::MaskedArray; use vortex_array::arrays::VarBinArray; -use vortex_array::arrays::VarBinView; use vortex_array::arrays::VarBinViewArray; use vortex_array::builders::dict::dict_encode; use vortex_array::dtype::DType; @@ -32,14 +31,12 @@ use vortex_utils::aliases::hash_set::HashSet; use super::integer::DictScheme as IntDictScheme; use super::integer::SequenceScheme as IntSequenceScheme; use super::integer::SparseScheme as IntSparseScheme; +use crate::ArrayAndStats; use crate::BtrBlocksCompressor; use crate::CompressorContext; -use crate::CompressorStats; use 
crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeId; -use crate::StatsCache; -use crate::sample::sample; use crate::scheme::estimate_compression_ratio_with_sampling; /// Returns `true` if the canonical array is a UTF-8 string type. @@ -51,7 +48,7 @@ fn is_utf8_string(canonical: &Canonical) -> bool { #[derive(Clone, Debug)] pub struct StringStats { src: VarBinViewArray, - estimated_distinct_count: u32, + estimated_distinct_count: Option, value_count: u32, null_count: u32, } @@ -85,39 +82,36 @@ impl StringStats { .compute_null_count() .ok_or_else(|| vortex_err!("Failed to compute null_count"))?; let value_count = input.len() - null_count; - let estimated_distinct = if opts.count_distinct_values { - estimate_distinct_count(input)? - } else { - u32::MAX - }; + let estimated_distinct_count = opts + .count_distinct_values + .then(|| estimate_distinct_count(input)) + .transpose()?; Ok(Self { src: input.clone(), value_count: u32::try_from(value_count)?, null_count: u32::try_from(null_count)?, - estimated_distinct_count: estimated_distinct, + estimated_distinct_count, }) } } -impl CompressorStats for StringStats { - type ArrayVTable = VarBinView; +impl StringStats { + /// Generates stats with default options. + pub fn generate(input: &VarBinViewArray) -> Self { + Self::generate_opts(input, GenerateStatsOptions::default()) + } - fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self { + /// Generates stats with provided options. + pub fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self { Self::generate_opts_fallible(input, opts) .vortex_expect("StringStats::generate_opts should not fail") } - fn source(&self) -> &VarBinViewArray { + /// Returns the underlying source array. 
+ pub fn source(&self) -> &VarBinViewArray { &self.src } - - fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self { - let sampled = - sample(&self.src.clone().into_array(), sample_size, sample_count).to_varbinview(); - - Self::generate_opts(&sampled, opts) - } } /// Uncompressed string scheme (identity). @@ -162,9 +156,8 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - _array: &ArrayRef, + _data: &mut ArrayAndStats, _ctx: CompressorContext, - _cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { Ok(1.0) @@ -173,12 +166,11 @@ impl Scheme for UncompressedScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - _cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - Ok(array.clone()) + Ok(data.array().clone()) } } @@ -191,23 +183,26 @@ impl Scheme for DictScheme { is_utf8_string(canonical) } - fn needs_distinct_values(&self) -> bool { - true + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } } fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let stats = data.string_stats(); // If we don't have a sufficiently high number of distinct values, do not attempt Dict. 
- if stats.estimated_distinct_count > stats.value_count / 2 { + if stats + .estimated_distinct_count + .is_none_or(|c| c > stats.value_count / 2) + { return Ok(0.0); } @@ -216,19 +211,17 @@ impl Scheme for DictScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let stats = data.string_stats(); let dict = dict_encode(&stats.source().clone().into_array())?; @@ -275,13 +268,11 @@ impl Scheme for FSSTScheme { fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let stats = data.string_stats(); let fsst = { let compressor_fsst = fsst_train_compressor(&stats.src); @@ -334,20 +325,18 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { if ctx.is_sample { return Ok(0.0); } - let stats = cache - .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let stats = data.string_stats(); let mut ctx = LEGACY_SESSION.create_execution_ctx(); - if stats.estimated_distinct_count > 1 + if stats.estimated_distinct_count.is_none_or(|c| c > 1) || !is_constant(&stats.src.clone().into_array(), &mut ctx)? 
{ return Ok(0.0); @@ -360,13 +349,11 @@ impl Scheme for ConstantScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let stats = data.string_stats(); let scalar_idx = (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); @@ -402,9 +389,8 @@ impl Scheme for NullDominated { fn expected_compression_ratio( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. @@ -412,8 +398,7 @@ impl Scheme for NullDominated { return Ok(0.0); } - let stats = cache - .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let stats = data.string_stats(); if stats.value_count == 0 { // All nulls should use ConstantScheme. @@ -432,15 +417,13 @@ impl Scheme for NullDominated { fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); - let stats = cache - .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let stats = data.string_stats(); // We pass None as we only run this pathway for NULL-dominated string arrays. 
let sparse_encoded = SparseArray::encode(&stats.src.clone().into_array(), None)?; @@ -482,13 +465,11 @@ impl Scheme for ZstdScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let stats = data.string_stats(); let compacted = stats.source().compact_buffers()?; Ok( @@ -511,13 +492,11 @@ impl Scheme for ZstdBuffersScheme { fn compress( &self, _compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, _ctx: CompressorContext, - cache: &mut StatsCache, _excludes: &[SchemeId], ) -> VortexResult { - let stats = cache - .get_or_insert_with::(|| StringStats::generate(&array.to_varbinview())); + let stats = data.string_stats(); Ok( vortex_zstd::ZstdBuffersArray::compress(&stats.source().clone().into_array(), 3)? diff --git a/vortex-btrblocks/src/ctx.rs b/vortex-btrblocks/src/ctx.rs index d346a3018b4..c4832b3aa7a 100644 --- a/vortex-btrblocks/src/ctx.rs +++ b/vortex-btrblocks/src/ctx.rs @@ -3,6 +3,7 @@ //! Compression context for recursive compression. +use crate::GenerateStatsOptions; use crate::compressor::MAX_CASCADE; /// Context passed through recursive compression calls. @@ -12,6 +13,8 @@ pub struct CompressorContext { pub is_sample: bool, /// Remaining cascade depth allowed. pub allowed_cascading: usize, + /// Merged stats options from all eligible schemes at this compression site. 
+ pub stats_options: GenerateStatsOptions, } impl Default for CompressorContext { @@ -19,6 +22,7 @@ impl Default for CompressorContext { Self { is_sample: false, allowed_cascading: MAX_CASCADE, + stats_options: GenerateStatsOptions::default(), } } } diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 1674443a13e..ff144b5e84f 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -75,6 +75,5 @@ pub use compressor::integer::dictionary::dictionary_encode as integer_dictionary pub use ctx::CompressorContext; pub use scheme::Scheme; pub use scheme::SchemeId; -pub use stats::CompressorStats; pub use stats::GenerateStatsOptions; -pub use stats_cache::StatsCache; +pub use stats_cache::ArrayAndStats; diff --git a/vortex-btrblocks/src/scheme.rs b/vortex-btrblocks/src/scheme.rs index 96bd60adbdf..e1b89bb700c 100644 --- a/vortex-btrblocks/src/scheme.rs +++ b/vortex-btrblocks/src/scheme.rs @@ -12,9 +12,10 @@ use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_error::VortexResult; +use crate::ArrayAndStats; use crate::BtrBlocksCompressor; use crate::CompressorContext; -use crate::StatsCache; +use crate::GenerateStatsOptions; use crate::sample::sample; use crate::sample::sample_count_approx_one_percent; use crate::stats::SAMPLE_SIZE; @@ -58,31 +59,36 @@ pub trait Scheme: Debug + Send + Sync { false } - /// Whether this scheme requires distinct-value statistics to be pre-computed. - fn needs_distinct_values(&self) -> bool { - false + /// Returns the stats generation options this scheme requires. The compressor merges all + /// eligible schemes' options before generating stats, so that a single stats pass satisfies + /// every scheme. + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions::default() } /// Estimate the compression ratio for this scheme on the given array. + /// + /// The `data` bundle contains the array and a pre-populated stats cache. 
Schemes access + /// stats via `data.get_or_insert_with::(|| ...)`. fn expected_compression_ratio( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult { - let _ = cache; - estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) } /// Compress the array using this scheme. + /// + /// The `data` bundle contains the array and a pre-populated stats cache. Schemes access + /// stats via `data.get_or_insert_with::(|| ...)`. fn compress( &self, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, - cache: &mut StatsCache, excludes: &[SchemeId], ) -> VortexResult; } @@ -102,17 +108,20 @@ impl Hash for dyn Scheme { } /// Estimates compression ratio by compressing a ~1% sample of the data. +/// +/// Creates a new [`ArrayAndStats`] for the sample so that stats are generated from the sample, +/// not the full array. pub fn estimate_compression_ratio_with_sampling( scheme: &S, compressor: &BtrBlocksCompressor, - array: &ArrayRef, + data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], ) -> VortexResult { let sample_array = if ctx.is_sample { - array.clone() + data.array().clone() } else { - let source_len = array.len(); + let source_len = data.array().len(); let sample_count = sample_count_approx_one_percent(source_len); tracing::trace!( @@ -121,20 +130,15 @@ pub fn estimate_compression_ratio_with_sampling( source_len ); - sample(array, SAMPLE_SIZE, sample_count) + sample(data.array(), SAMPLE_SIZE, sample_count) }; - let mut sample_cache = StatsCache::new(); + let mut sample_data = ArrayAndStats::new(sample_array, ctx.stats_options); + let after = scheme - .compress( - compressor, - &sample_array, - ctx.as_sample(), - &mut sample_cache, - excludes, - )? 
+ .compress(compressor, &mut sample_data, ctx.as_sample(), excludes)? .nbytes(); - let before = sample_array.nbytes(); + let before = sample_data.array().nbytes(); tracing::debug!( "estimate_compression_ratio_with_sampling(compressor={scheme:#?} ctx={ctx:?}) = {}", diff --git a/vortex-btrblocks/src/stats.rs b/vortex-btrblocks/src/stats.rs index e3421991f1e..cd3239cd20b 100644 --- a/vortex-btrblocks/src/stats.rs +++ b/vortex-btrblocks/src/stats.rs @@ -3,17 +3,26 @@ //! Compression statistics types. -use std::fmt::Debug; - -use vortex_array::vtable::VTable; - /// Configures how stats are generated. -#[derive(Default)] +/// +/// Each scheme declares its required options via [`Scheme::stats_options`]. The compressor +/// merges all eligible schemes' options before generating stats, so that a single stats pass +/// satisfies every scheme. +/// +/// [`Scheme::stats_options`]: crate::Scheme::stats_options +#[derive(Debug, Default, Clone, Copy)] pub struct GenerateStatsOptions { - /// Should distinct values should be counted during stats generation. + /// Whether distinct values should be counted during stats generation. pub count_distinct_values: bool, - // pub count_runs: bool, - // should this be scheme-specific? +} + +impl GenerateStatsOptions { + /// Merges two options by OR-ing each field. The result enables a stat if either input does. + pub fn merge(self, other: Self) -> Self { + Self { + count_distinct_values: self.count_distinct_values || other.count_distinct_values, + } + } } /// The size of each sampled run. @@ -25,31 +34,3 @@ pub(crate) const SAMPLE_SIZE: u32 = 64; /// The product of SAMPLE_SIZE and SAMPLE_COUNT should be (roughly) a multiple of 1024 so that /// fastlanes bitpacking of sampled vectors does not introduce (large amounts of) padding. pub(crate) const SAMPLE_COUNT: u32 = 16; - -/// Stats for the compressor. -pub trait CompressorStats: Debug + Clone { - /// The type of the underlying source array vtable. 
- type ArrayVTable: VTable; - - /// Generates stats with default options. - fn generate(input: &::Array) -> Self { - Self::generate_opts(input, GenerateStatsOptions::default()) - } - - /// Generates stats with provided options. - fn generate_opts( - input: &::Array, - opts: GenerateStatsOptions, - ) -> Self; - - /// Returns the underlying source array that statistics were generated from. - fn source(&self) -> &::Array; - - /// Sample the array with default options. - fn sample(&self, sample_size: u32, sample_count: u32) -> Self { - self.sample_opts(sample_size, sample_count, GenerateStatsOptions::default()) - } - - /// Sample the array with provided options. - fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self; -} diff --git a/vortex-btrblocks/src/stats_cache.rs b/vortex-btrblocks/src/stats_cache.rs index f0121f70648..02103ade4fb 100644 --- a/vortex-btrblocks/src/stats_cache.rs +++ b/vortex-btrblocks/src/stats_cache.rs @@ -1,42 +1,33 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Per-compression-site statistics cache. -//! -//! [`StatsCache`] is a [`TypeId`]-keyed container created fresh for each [`choose_and_compress`] -//! call. It stores arbitrary stats types (e.g., [`IntegerStats`], [`FloatStats`]) so that -//! multiple schemes evaluated at the same compression site share the same computed statistics. -//! -//! [`choose_and_compress`]: crate::BtrBlocksCompressor::choose_and_compress -//! [`IntegerStats`]: crate::compressor::integer::IntegerStats -//! [`FloatStats`]: crate::compressor::float::FloatStats +//! Per-compression-site statistics cache and the [`ArrayAndStats`] bundle. 
use std::any::Any; use std::any::TypeId; +use vortex_array::ArrayRef; +use vortex_array::ToCanonical; use vortex_error::VortexExpect; +use crate::GenerateStatsOptions; +use crate::compressor::float::FloatStats; +use crate::compressor::integer::IntegerStats; +use crate::compressor::string::StringStats; + /// Cache for compression statistics, keyed by concrete type. -/// -/// Schemes access stats via [`get_or_insert_with`], which returns a cached `&T` on subsequent -/// calls. The first scheme to request a given stats type triggers its computation; all later -/// schemes at the same site get the cached version. -/// -/// [`get_or_insert_with`]: StatsCache::get_or_insert_with -pub struct StatsCache { +struct StatsCache { entries: Vec<(TypeId, Box)>, } impl StatsCache { - /// Creates an empty cache. - pub fn new() -> Self { + fn new() -> Self { Self { entries: Vec::new(), } } - /// Returns a cached `&T`, computing and storing it on first access. - pub fn get_or_insert_with(&mut self, f: impl FnOnce() -> T) -> &T { + fn get_or_insert_with(&mut self, f: impl FnOnce() -> T) -> &T { let type_id = TypeId::of::(); let pos = self.entries.iter().position(|(id, _)| *id == type_id); @@ -57,8 +48,82 @@ impl StatsCache { } } -impl Default for StatsCache { - fn default() -> Self { - Self::new() +/// An array bundled with its lazily-computed statistics cache. +/// +/// The cache is guaranteed to correspond to the array. When a scheme creates a derived array +/// (e.g. FoR bias subtraction), it must create a new [`ArrayAndStats`] so that stale stats +/// from the original array are not reused. +/// +/// Built-in stats are accessed via typed methods ([`integer_stats`](Self::integer_stats), +/// [`float_stats`](Self::float_stats), [`string_stats`](Self::string_stats)) which generate +/// stats lazily on first access using the stored [`GenerateStatsOptions`]. Extension schemes +/// can use [`get_or_insert_with`](Self::get_or_insert_with) for custom stats types. 
+pub struct ArrayAndStats { + array: ArrayRef, + cache: StatsCache, + opts: GenerateStatsOptions, +} + +impl ArrayAndStats { + /// Creates a new bundle with the given stats generation options. + /// + /// Stats are generated lazily on first access via the typed accessor methods. + pub fn new(array: ArrayRef, opts: GenerateStatsOptions) -> Self { + Self { + array, + cache: StatsCache::new(), + opts, + } + } + + /// Creates a new bundle with default (cheapest) stats options. + pub fn without_stats(array: ArrayRef) -> Self { + Self { + array, + cache: StatsCache::new(), + opts: GenerateStatsOptions::default(), + } + } + + /// Returns a reference to the array. + pub fn array(&self) -> &ArrayRef { + &self.array + } + + /// Consumes the bundle and returns the array. + pub fn into_array(self) -> ArrayRef { + self.array + } + + /// Returns integer stats, generating them lazily on first access. + pub fn integer_stats(&mut self) -> &IntegerStats { + let array = self.array.clone(); + let opts = self.opts; + self.cache.get_or_insert_with::(|| { + IntegerStats::generate_opts(&array.to_primitive(), opts) + }) + } + + /// Returns float stats, generating them lazily on first access. + pub fn float_stats(&mut self) -> &FloatStats { + let array = self.array.clone(); + let opts = self.opts; + self.cache.get_or_insert_with::(|| { + FloatStats::generate_opts(&array.to_primitive(), opts) + }) + } + + /// Returns string stats, generating them lazily on first access. + pub fn string_stats(&mut self) -> &StringStats { + let array = self.array.clone(); + let opts = self.opts; + self.cache.get_or_insert_with::(|| { + StringStats::generate_opts(&array.to_varbinview(), opts) + }) + } + + /// For extension schemes with custom stats types. 
+ pub fn get_or_insert_with(&mut self, f: impl FnOnce() -> T) -> &T { + self.cache.get_or_insert_with::(f) } } From 8717519d1cce49382bc7704cc216963fe8ae5613 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Wed, 18 Mar 2026 14:08:41 -0400 Subject: [PATCH 4/9] move into new vortex-compressor crate Signed-off-by: Connor Tsui clean up Signed-off-by: Connor Tsui --- Cargo.lock | 18 + Cargo.toml | 2 + fuzz/src/array/mod.rs | 2 +- vortex-btrblocks/Cargo.toml | 1 + vortex-btrblocks/public-api.lock | 898 ++++++++++++++---- vortex-btrblocks/src/builder.rs | 42 +- vortex-btrblocks/src/canonical_compressor.rs | 316 +----- vortex-btrblocks/src/compressor/decimal.rs | 88 +- .../src/compressor/float/dictionary.rs | 30 +- vortex-btrblocks/src/compressor/float/mod.rs | 94 +- .../src/compressor/integer/dictionary.rs | 28 +- .../src/compressor/integer/mod.rs | 192 ++-- vortex-btrblocks/src/compressor/mod.rs | 9 +- vortex-btrblocks/src/compressor/rle.rs | 13 +- vortex-btrblocks/src/compressor/string.rs | 139 +-- vortex-btrblocks/src/compressor/temporal.rs | 137 ++- vortex-btrblocks/src/lib.rs | 37 +- vortex-btrblocks/src/scheme.rs | 149 --- vortex-compressor/Cargo.toml | 33 + vortex-compressor/public-api.lock | 431 +++++++++ vortex-compressor/src/compressor.rs | 316 ++++++ .../src/ctx.rs | 6 +- vortex-compressor/src/lib.rs | 26 + .../src/sample.rs | 46 +- vortex-compressor/src/scheme.rs | 197 ++++ .../src/stats/cache.rs | 48 +- .../src/stats/float.rs | 77 +- .../src/stats/integer.rs | 110 ++- vortex-compressor/src/stats/mod.rs | 22 + .../src/stats/options.rs | 12 +- vortex-compressor/src/stats/string.rs | 102 ++ vortex-file/src/strategy.rs | 2 +- vortex-layout/src/layouts/compressed.rs | 2 +- vortex/public-api.lock | 6 +- 34 files changed, 2569 insertions(+), 1062 deletions(-) delete mode 100644 vortex-btrblocks/src/scheme.rs create mode 100644 vortex-compressor/Cargo.toml create mode 100644 vortex-compressor/public-api.lock create mode 100644 vortex-compressor/src/compressor.rs 
rename {vortex-btrblocks => vortex-compressor}/src/ctx.rs (91%) create mode 100644 vortex-compressor/src/lib.rs rename {vortex-btrblocks => vortex-compressor}/src/sample.rs (60%) create mode 100644 vortex-compressor/src/scheme.rs rename vortex-btrblocks/src/stats_cache.rs => vortex-compressor/src/stats/cache.rs (73%) rename vortex-btrblocks/src/compressor/float/stats.rs => vortex-compressor/src/stats/float.rs (79%) rename vortex-btrblocks/src/compressor/integer/stats.rs => vortex-compressor/src/stats/integer.rs (85%) create mode 100644 vortex-compressor/src/stats/mod.rs rename vortex-btrblocks/src/stats.rs => vortex-compressor/src/stats/options.rs (68%) create mode 100644 vortex-compressor/src/stats/string.rs diff --git a/Cargo.lock b/Cargo.lock index bc08da59b90..2661a12756c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9872,6 +9872,7 @@ dependencies = [ "vortex-alp", "vortex-array", "vortex-buffer", + "vortex-compressor", "vortex-datetime-parts", "vortex-decimal-byte-parts", "vortex-error", @@ -9942,6 +9943,23 @@ dependencies = [ "vortex-session", ] +[[package]] +name = "vortex-compressor" +version = "0.1.0" +dependencies = [ + "itertools 0.14.0", + "num-traits", + "rand 0.10.0", + "rstest", + "rustc-hash", + "tracing", + "vortex-array", + "vortex-buffer", + "vortex-error", + "vortex-mask", + "vortex-utils", +] + [[package]] name = "vortex-cub" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 1f21bdf4f21..324b0b13c54 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ members = [ "vortex-proto", "vortex-array", "vortex-tensor", + "vortex-compressor", "vortex-btrblocks", "vortex-layout", "vortex-scan", @@ -260,6 +261,7 @@ vortex-array = { version = "0.1.0", path = "./vortex-array", default-features = vortex-btrblocks = { version = "0.1.0", path = "./vortex-btrblocks", default-features = false } vortex-buffer = { version = "0.1.0", path = "./vortex-buffer", default-features = false } vortex-bytebool = { version = "0.1.0", path = 
"./encodings/bytebool", default-features = false } +vortex-compressor = { version = "0.1.0", path = "./vortex-compressor", default-features = false } vortex-datafusion = { version = "0.1.0", path = "./vortex-datafusion", default-features = false } vortex-datetime-parts = { version = "0.1.0", path = "./encodings/datetime-parts", default-features = false } vortex-decimal-byte-parts = { version = "0.1.0", path = "encodings/decimal-byte-parts", default-features = false } diff --git a/fuzz/src/array/mod.rs b/fuzz/src/array/mod.rs index 197d5b441e0..d9380a57a08 100644 --- a/fuzz/src/array/mod.rs +++ b/fuzz/src/array/mod.rs @@ -61,7 +61,7 @@ use vortex_array::search_sorted::SearchSorted; use vortex_array::search_sorted::SearchSortedSide; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; -use vortex_btrblocks::Scheme; +use vortex_btrblocks::SchemeExt; use vortex_btrblocks::compressor::float; use vortex_btrblocks::compressor::integer; use vortex_btrblocks::compressor::string; diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 1c745306c4a..0b2fa7e0ca7 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -25,6 +25,7 @@ tracing = { workspace = true } vortex-alp = { workspace = true } vortex-array = { workspace = true } vortex-buffer = { workspace = true } +vortex-compressor = { workspace = true } vortex-datetime-parts = { workspace = true } vortex-decimal-byte-parts = { workspace = true } vortex-error = { workspace = true } diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index 55d23a96a26..8fe186586bd 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -1,331 +1,899 @@ pub mod vortex_btrblocks -pub enum vortex_btrblocks::FloatCode +pub use vortex_btrblocks::ArrayAndStats -pub vortex_btrblocks::FloatCode::Alp +pub use vortex_btrblocks::CascadingCompressor -pub vortex_btrblocks::FloatCode::AlpRd +pub use 
vortex_btrblocks::CompressorContext -pub vortex_btrblocks::FloatCode::Constant +pub use vortex_btrblocks::FloatStats -pub vortex_btrblocks::FloatCode::Dict +pub use vortex_btrblocks::GenerateStatsOptions -pub vortex_btrblocks::FloatCode::Pco +pub use vortex_btrblocks::IntegerStats -pub vortex_btrblocks::FloatCode::Rle +pub use vortex_btrblocks::MAX_CASCADE -pub vortex_btrblocks::FloatCode::RunEnd +pub use vortex_btrblocks::Scheme -pub vortex_btrblocks::FloatCode::Sparse +pub use vortex_btrblocks::SchemeExt -pub vortex_btrblocks::FloatCode::Uncompressed +pub use vortex_btrblocks::SchemeId -impl core::clone::Clone for vortex_btrblocks::FloatCode +pub use vortex_btrblocks::StringStats -pub fn vortex_btrblocks::FloatCode::clone(&self) -> vortex_btrblocks::FloatCode +pub use vortex_btrblocks::estimate_compression_ratio_with_sampling -impl core::cmp::Eq for vortex_btrblocks::FloatCode +pub mod vortex_btrblocks::compressor -impl core::cmp::Ord for vortex_btrblocks::FloatCode +pub mod vortex_btrblocks::compressor::decimal -pub fn vortex_btrblocks::FloatCode::cmp(&self, other: &vortex_btrblocks::FloatCode) -> core::cmp::Ordering +pub struct vortex_btrblocks::compressor::decimal::DecimalScheme -impl core::cmp::PartialEq for vortex_btrblocks::FloatCode +impl core::clone::Clone for vortex_btrblocks::compressor::decimal::DecimalScheme -pub fn vortex_btrblocks::FloatCode::eq(&self, other: &vortex_btrblocks::FloatCode) -> bool +pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::clone(&self) -> vortex_btrblocks::compressor::decimal::DecimalScheme -impl core::cmp::PartialOrd for vortex_btrblocks::FloatCode +impl core::cmp::Eq for vortex_btrblocks::compressor::decimal::DecimalScheme -pub fn vortex_btrblocks::FloatCode::partial_cmp(&self, other: &vortex_btrblocks::FloatCode) -> core::option::Option +impl core::cmp::PartialEq for vortex_btrblocks::compressor::decimal::DecimalScheme -impl core::fmt::Debug for vortex_btrblocks::FloatCode +pub fn 
vortex_btrblocks::compressor::decimal::DecimalScheme::eq(&self, other: &vortex_btrblocks::compressor::decimal::DecimalScheme) -> bool -pub fn vortex_btrblocks::FloatCode::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::fmt::Debug for vortex_btrblocks::compressor::decimal::DecimalScheme -impl core::hash::Hash for vortex_btrblocks::FloatCode +pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::FloatCode::hash<__H: core::hash::Hasher>(&self, state: &mut __H) +impl core::marker::Copy for vortex_btrblocks::compressor::decimal::DecimalScheme -impl core::marker::Copy for vortex_btrblocks::FloatCode +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::decimal::DecimalScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::FloatCode +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::decimal::DecimalScheme -impl enum_iterator::Sequence for vortex_btrblocks::FloatCode +pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub const vortex_btrblocks::FloatCode::CARDINALITY: usize +pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::FloatCode::first() -> core::option::Option +pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> 
bool -pub fn vortex_btrblocks::FloatCode::last() -> core::option::Option +pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::FloatCode::next(&self) -> core::option::Option +pub mod vortex_btrblocks::compressor::float -pub fn vortex_btrblocks::FloatCode::previous(&self) -> core::option::Option +pub use vortex_btrblocks::compressor::float::FloatStats -pub enum vortex_btrblocks::IntCode +pub struct vortex_btrblocks::compressor::float::ALPRDScheme -pub vortex_btrblocks::IntCode::BitPacking +impl core::clone::Clone for vortex_btrblocks::compressor::float::ALPRDScheme -pub vortex_btrblocks::IntCode::Constant +pub fn vortex_btrblocks::compressor::float::ALPRDScheme::clone(&self) -> vortex_btrblocks::compressor::float::ALPRDScheme -pub vortex_btrblocks::IntCode::Dict +impl core::cmp::Eq for vortex_btrblocks::compressor::float::ALPRDScheme -pub vortex_btrblocks::IntCode::For +impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::ALPRDScheme -pub vortex_btrblocks::IntCode::Pco +pub fn vortex_btrblocks::compressor::float::ALPRDScheme::eq(&self, other: &vortex_btrblocks::compressor::float::ALPRDScheme) -> bool -pub vortex_btrblocks::IntCode::Rle +impl core::fmt::Debug for vortex_btrblocks::compressor::float::ALPRDScheme -pub vortex_btrblocks::IntCode::RunEnd +pub fn vortex_btrblocks::compressor::float::ALPRDScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub vortex_btrblocks::IntCode::Sequence +impl core::marker::Copy for vortex_btrblocks::compressor::float::ALPRDScheme -pub vortex_btrblocks::IntCode::Sparse +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::ALPRDScheme -pub vortex_btrblocks::IntCode::Uncompressed +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::ALPRDScheme -pub vortex_btrblocks::IntCode::ZigZag +pub fn vortex_btrblocks::compressor::float::ALPRDScheme::compress(&self, _compressor: 
&vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -impl core::clone::Clone for vortex_btrblocks::IntCode +pub fn vortex_btrblocks::compressor::float::ALPRDScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::IntCode::clone(&self) -> vortex_btrblocks::IntCode +pub fn vortex_btrblocks::compressor::float::ALPRDScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl core::cmp::Eq for vortex_btrblocks::IntCode +pub fn vortex_btrblocks::compressor::float::ALPRDScheme::scheme_name(&self) -> &'static str -impl core::cmp::Ord for vortex_btrblocks::IntCode +pub struct vortex_btrblocks::compressor::float::ALPScheme -pub fn vortex_btrblocks::IntCode::cmp(&self, other: &vortex_btrblocks::IntCode) -> core::cmp::Ordering +impl core::clone::Clone for vortex_btrblocks::compressor::float::ALPScheme -impl core::cmp::PartialEq for vortex_btrblocks::IntCode +pub fn vortex_btrblocks::compressor::float::ALPScheme::clone(&self) -> vortex_btrblocks::compressor::float::ALPScheme -pub fn vortex_btrblocks::IntCode::eq(&self, other: &vortex_btrblocks::IntCode) -> bool +impl core::cmp::Eq for vortex_btrblocks::compressor::float::ALPScheme -impl core::cmp::PartialOrd for vortex_btrblocks::IntCode +impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::ALPScheme -pub fn vortex_btrblocks::IntCode::partial_cmp(&self, other: &vortex_btrblocks::IntCode) -> core::option::Option +pub fn vortex_btrblocks::compressor::float::ALPScheme::eq(&self, other: &vortex_btrblocks::compressor::float::ALPScheme) -> bool -impl 
core::fmt::Debug for vortex_btrblocks::IntCode +impl core::fmt::Debug for vortex_btrblocks::compressor::float::ALPScheme -pub fn vortex_btrblocks::IntCode::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::compressor::float::ALPScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::hash::Hash for vortex_btrblocks::IntCode +impl core::marker::Copy for vortex_btrblocks::compressor::float::ALPScheme -pub fn vortex_btrblocks::IntCode::hash<__H: core::hash::Hasher>(&self, state: &mut __H) +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::ALPScheme -impl core::marker::Copy for vortex_btrblocks::IntCode +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::ALPScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::IntCode +pub fn vortex_btrblocks::compressor::float::ALPScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -impl enum_iterator::Sequence for vortex_btrblocks::IntCode +pub fn vortex_btrblocks::compressor::float::ALPScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub const vortex_btrblocks::IntCode::CARDINALITY: usize +pub fn vortex_btrblocks::compressor::float::ALPScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::IntCode::first() -> core::option::Option +pub fn vortex_btrblocks::compressor::float::ALPScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::IntCode::last() -> 
core::option::Option +pub struct vortex_btrblocks::compressor::float::ConstantScheme -pub fn vortex_btrblocks::IntCode::next(&self) -> core::option::Option +impl core::clone::Clone for vortex_btrblocks::compressor::float::ConstantScheme -pub fn vortex_btrblocks::IntCode::previous(&self) -> core::option::Option +pub fn vortex_btrblocks::compressor::float::ConstantScheme::clone(&self) -> vortex_btrblocks::compressor::float::ConstantScheme -pub enum vortex_btrblocks::StringCode +impl core::cmp::Eq for vortex_btrblocks::compressor::float::ConstantScheme -pub vortex_btrblocks::StringCode::Constant +impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::ConstantScheme -pub vortex_btrblocks::StringCode::Dict +pub fn vortex_btrblocks::compressor::float::ConstantScheme::eq(&self, other: &vortex_btrblocks::compressor::float::ConstantScheme) -> bool -pub vortex_btrblocks::StringCode::Fsst +impl core::fmt::Debug for vortex_btrblocks::compressor::float::ConstantScheme -pub vortex_btrblocks::StringCode::Sparse +pub fn vortex_btrblocks::compressor::float::ConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub vortex_btrblocks::StringCode::Uncompressed +impl core::marker::Copy for vortex_btrblocks::compressor::float::ConstantScheme -pub vortex_btrblocks::StringCode::Zstd +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::ConstantScheme -pub vortex_btrblocks::StringCode::ZstdBuffers +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::ConstantScheme -impl core::clone::Clone for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::compressor::float::ConstantScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::StringCode::clone(&self) 
-> vortex_btrblocks::StringCode +pub fn vortex_btrblocks::compressor::float::ConstantScheme::detects_constant(&self) -> bool -impl core::cmp::Eq for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::compressor::float::ConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -impl core::cmp::Ord for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::compressor::float::ConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::StringCode::cmp(&self, other: &vortex_btrblocks::StringCode) -> core::cmp::Ordering +pub fn vortex_btrblocks::compressor::float::ConstantScheme::scheme_name(&self) -> &'static str -impl core::cmp::PartialEq for vortex_btrblocks::StringCode +pub struct vortex_btrblocks::compressor::float::DictScheme -pub fn vortex_btrblocks::StringCode::eq(&self, other: &vortex_btrblocks::StringCode) -> bool +impl core::clone::Clone for vortex_btrblocks::compressor::float::DictScheme -impl core::cmp::PartialOrd for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::compressor::float::DictScheme::clone(&self) -> vortex_btrblocks::compressor::float::DictScheme -pub fn vortex_btrblocks::StringCode::partial_cmp(&self, other: &vortex_btrblocks::StringCode) -> core::option::Option +impl core::cmp::Eq for vortex_btrblocks::compressor::float::DictScheme -impl core::fmt::Debug for vortex_btrblocks::StringCode +impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::DictScheme -pub fn vortex_btrblocks::StringCode::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::compressor::float::DictScheme::eq(&self, other: &vortex_btrblocks::compressor::float::DictScheme) -> bool -impl core::hash::Hash for 
vortex_btrblocks::StringCode +impl core::fmt::Debug for vortex_btrblocks::compressor::float::DictScheme -pub fn vortex_btrblocks::StringCode::hash<__H: core::hash::Hasher>(&self, state: &mut __H) +pub fn vortex_btrblocks::compressor::float::DictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::marker::Copy for vortex_btrblocks::StringCode +impl core::marker::Copy for vortex_btrblocks::compressor::float::DictScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::StringCode +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::DictScheme -impl enum_iterator::Sequence for vortex_btrblocks::StringCode +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::DictScheme -pub const vortex_btrblocks::StringCode::CARDINALITY: usize +pub fn vortex_btrblocks::compressor::float::DictScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::StringCode::first() -> core::option::Option +pub fn vortex_btrblocks::compressor::float::DictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::StringCode::last() -> core::option::Option +pub fn vortex_btrblocks::compressor::float::DictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::StringCode::next(&self) -> core::option::Option +pub fn vortex_btrblocks::compressor::float::DictScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::StringCode::previous(&self) -> 
core::option::Option +pub fn vortex_btrblocks::compressor::float::DictScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions -pub struct vortex_btrblocks::BtrBlocksCompressor +pub struct vortex_btrblocks::compressor::float::FloatRLEConfig -pub vortex_btrblocks::BtrBlocksCompressor::float_schemes: alloc::vec::Vec<&'static dyn vortex_btrblocks::compressor::float::FloatScheme> +impl core::clone::Clone for vortex_btrblocks::compressor::float::FloatRLEConfig -pub vortex_btrblocks::BtrBlocksCompressor::int_schemes: alloc::vec::Vec<&'static dyn vortex_btrblocks::compressor::integer::IntegerScheme> +pub fn vortex_btrblocks::compressor::float::FloatRLEConfig::clone(&self) -> vortex_btrblocks::compressor::float::FloatRLEConfig -pub vortex_btrblocks::BtrBlocksCompressor::string_schemes: alloc::vec::Vec<&'static dyn vortex_btrblocks::compressor::string::StringScheme> +impl core::cmp::Eq for vortex_btrblocks::compressor::float::FloatRLEConfig -impl vortex_btrblocks::BtrBlocksCompressor +impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::FloatRLEConfig -pub fn vortex_btrblocks::BtrBlocksCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult +pub fn vortex_btrblocks::compressor::float::FloatRLEConfig::eq(&self, other: &vortex_btrblocks::compressor::float::FloatRLEConfig) -> bool -impl core::clone::Clone for vortex_btrblocks::BtrBlocksCompressor +impl core::fmt::Debug for vortex_btrblocks::compressor::float::FloatRLEConfig -pub fn vortex_btrblocks::BtrBlocksCompressor::clone(&self) -> vortex_btrblocks::BtrBlocksCompressor +pub fn vortex_btrblocks::compressor::float::FloatRLEConfig::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::default::Default for vortex_btrblocks::BtrBlocksCompressor +impl core::marker::Copy for vortex_btrblocks::compressor::float::FloatRLEConfig -pub fn vortex_btrblocks::BtrBlocksCompressor::default() -> Self +impl 
core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::FloatRLEConfig -impl vortex_btrblocks::CanonicalCompressor for vortex_btrblocks::BtrBlocksCompressor +pub struct vortex_btrblocks::compressor::float::NullDominated -pub fn vortex_btrblocks::BtrBlocksCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_btrblocks::ctx::CompressorContext, excludes: vortex_btrblocks::ctx::Excludes<'_>) -> vortex_error::VortexResult +impl core::clone::Clone for vortex_btrblocks::compressor::float::NullDominated -pub fn vortex_btrblocks::BtrBlocksCompressor::float_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::float::FloatScheme] +pub fn vortex_btrblocks::compressor::float::NullDominated::clone(&self) -> vortex_btrblocks::compressor::float::NullDominated -pub fn vortex_btrblocks::BtrBlocksCompressor::int_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::integer::IntegerScheme] +impl core::cmp::Eq for vortex_btrblocks::compressor::float::NullDominated -pub fn vortex_btrblocks::BtrBlocksCompressor::string_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::string::StringScheme] +impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::NullDominated -pub struct vortex_btrblocks::BtrBlocksCompressorBuilder +pub fn vortex_btrblocks::compressor::float::NullDominated::eq(&self, other: &vortex_btrblocks::compressor::float::NullDominated) -> bool -impl vortex_btrblocks::BtrBlocksCompressorBuilder +impl core::fmt::Debug for vortex_btrblocks::compressor::float::NullDominated -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::build(self) -> vortex_btrblocks::BtrBlocksCompressor +pub fn vortex_btrblocks::compressor::float::NullDominated::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::empty() -> Self +impl core::marker::Copy for vortex_btrblocks::compressor::float::NullDominated -pub fn 
vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_float(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::NullDominated -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_int(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::NullDominated -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_string(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +pub fn vortex_btrblocks::compressor::float::NullDominated::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::include_float(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +pub fn vortex_btrblocks::compressor::float::NullDominated::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::include_int(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +pub fn vortex_btrblocks::compressor::float::NullDominated::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::include_string(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +pub fn vortex_btrblocks::compressor::float::NullDominated::scheme_name(&self) -> &'static str -impl core::clone::Clone for vortex_btrblocks::BtrBlocksCompressorBuilder +pub struct 
vortex_btrblocks::compressor::float::PcoScheme -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::clone(&self) -> vortex_btrblocks::BtrBlocksCompressorBuilder +impl core::clone::Clone for vortex_btrblocks::compressor::float::PcoScheme -impl core::default::Default for vortex_btrblocks::BtrBlocksCompressorBuilder +pub fn vortex_btrblocks::compressor::float::PcoScheme::clone(&self) -> vortex_btrblocks::compressor::float::PcoScheme -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::default() -> Self +impl core::cmp::Eq for vortex_btrblocks::compressor::float::PcoScheme -impl core::fmt::Debug for vortex_btrblocks::BtrBlocksCompressorBuilder +impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::PcoScheme -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::compressor::float::PcoScheme::eq(&self, other: &vortex_btrblocks::compressor::float::PcoScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::float::PcoScheme + +pub fn vortex_btrblocks::compressor::float::PcoScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::float::PcoScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::PcoScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::PcoScheme + +pub fn vortex_btrblocks::compressor::float::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::float::PcoScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::float::PcoScheme::scheme_name(&self) -> &'static str + +pub 
struct vortex_btrblocks::compressor::float::UncompressedScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::float::UncompressedScheme + +pub fn vortex_btrblocks::compressor::float::UncompressedScheme::clone(&self) -> vortex_btrblocks::compressor::float::UncompressedScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::float::UncompressedScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::UncompressedScheme + +pub fn vortex_btrblocks::compressor::float::UncompressedScheme::eq(&self, other: &vortex_btrblocks::compressor::float::UncompressedScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::float::UncompressedScheme + +pub fn vortex_btrblocks::compressor::float::UncompressedScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::float::UncompressedScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::UncompressedScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::UncompressedScheme + +pub fn vortex_btrblocks::compressor::float::UncompressedScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::float::UncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::float::UncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn 
vortex_btrblocks::compressor::float::UncompressedScheme::scheme_name(&self) -> &'static str + +pub const vortex_btrblocks::compressor::float::RLE_FLOAT_SCHEME: vortex_btrblocks::compressor::rle::RLEScheme + +pub mod vortex_btrblocks::compressor::integer + +pub use vortex_btrblocks::compressor::integer::IntegerStats + +pub struct vortex_btrblocks::compressor::integer::BitPackingScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::integer::BitPackingScheme + +pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::clone(&self) -> vortex_btrblocks::compressor::integer::BitPackingScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::BitPackingScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::BitPackingScheme + +pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::BitPackingScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::BitPackingScheme + +pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::integer::BitPackingScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::BitPackingScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::BitPackingScheme + +pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut 
vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::compressor::integer::ConstantScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::integer::ConstantScheme + +pub fn vortex_btrblocks::compressor::integer::ConstantScheme::clone(&self) -> vortex_btrblocks::compressor::integer::ConstantScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::ConstantScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::ConstantScheme + +pub fn vortex_btrblocks::compressor::integer::ConstantScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::ConstantScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::ConstantScheme + +pub fn vortex_btrblocks::compressor::integer::ConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::integer::ConstantScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::ConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::ConstantScheme + +pub fn vortex_btrblocks::compressor::integer::ConstantScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::ConstantScheme::detects_constant(&self) -> bool + +pub fn 
vortex_btrblocks::compressor::integer::ConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::ConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::integer::ConstantScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::compressor::integer::DictScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::integer::DictScheme + +pub fn vortex_btrblocks::compressor::integer::DictScheme::clone(&self) -> vortex_btrblocks::compressor::integer::DictScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::DictScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::DictScheme + +pub fn vortex_btrblocks::compressor::integer::DictScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::DictScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::DictScheme + +pub fn vortex_btrblocks::compressor::integer::DictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::integer::DictScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::DictScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::DictScheme + +pub fn vortex_btrblocks::compressor::integer::DictScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn 
vortex_btrblocks::compressor::integer::DictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::DictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::integer::DictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_btrblocks::compressor::integer::DictScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions + +pub struct vortex_btrblocks::compressor::integer::FORScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::integer::FORScheme + +pub fn vortex_btrblocks::compressor::integer::FORScheme::clone(&self) -> vortex_btrblocks::compressor::integer::FORScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::FORScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::FORScheme + +pub fn vortex_btrblocks::compressor::integer::FORScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::FORScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::FORScheme + +pub fn vortex_btrblocks::compressor::integer::FORScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::integer::FORScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::FORScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::FORScheme + +pub fn vortex_btrblocks::compressor::integer::FORScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, 
excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::FORScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::FORScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::integer::FORScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::compressor::integer::IntRLEConfig + +impl core::clone::Clone for vortex_btrblocks::compressor::integer::IntRLEConfig + +pub fn vortex_btrblocks::compressor::integer::IntRLEConfig::clone(&self) -> vortex_btrblocks::compressor::integer::IntRLEConfig + +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::IntRLEConfig + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::IntRLEConfig + +pub fn vortex_btrblocks::compressor::integer::IntRLEConfig::eq(&self, other: &vortex_btrblocks::compressor::integer::IntRLEConfig) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::IntRLEConfig + +pub fn vortex_btrblocks::compressor::integer::IntRLEConfig::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::integer::IntRLEConfig + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::IntRLEConfig + +pub struct vortex_btrblocks::compressor::integer::PcoScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::integer::PcoScheme + +pub fn vortex_btrblocks::compressor::integer::PcoScheme::clone(&self) -> vortex_btrblocks::compressor::integer::PcoScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::PcoScheme + +impl 
core::cmp::PartialEq for vortex_btrblocks::compressor::integer::PcoScheme + +pub fn vortex_btrblocks::compressor::integer::PcoScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::PcoScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::PcoScheme + +pub fn vortex_btrblocks::compressor::integer::PcoScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::integer::PcoScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::PcoScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::PcoScheme + +pub fn vortex_btrblocks::compressor::integer::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::PcoScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::PcoScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::integer::PcoScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::compressor::integer::RunEndScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::integer::RunEndScheme + +pub fn vortex_btrblocks::compressor::integer::RunEndScheme::clone(&self) -> vortex_btrblocks::compressor::integer::RunEndScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::RunEndScheme + +impl core::cmp::PartialEq for 
vortex_btrblocks::compressor::integer::RunEndScheme + +pub fn vortex_btrblocks::compressor::integer::RunEndScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::RunEndScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::RunEndScheme + +pub fn vortex_btrblocks::compressor::integer::RunEndScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::integer::RunEndScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::RunEndScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::RunEndScheme + +pub fn vortex_btrblocks::compressor::integer::RunEndScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::RunEndScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::RunEndScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::integer::RunEndScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::compressor::integer::SequenceScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::integer::SequenceScheme + +pub fn vortex_btrblocks::compressor::integer::SequenceScheme::clone(&self) -> vortex_btrblocks::compressor::integer::SequenceScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::SequenceScheme + +impl core::cmp::PartialEq for 
vortex_btrblocks::compressor::integer::SequenceScheme + +pub fn vortex_btrblocks::compressor::integer::SequenceScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::SequenceScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::SequenceScheme + +pub fn vortex_btrblocks::compressor::integer::SequenceScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::integer::SequenceScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::SequenceScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::SequenceScheme + +pub fn vortex_btrblocks::compressor::integer::SequenceScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::SequenceScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::SequenceScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub struct vortex_btrblocks::GenerateStatsOptions +pub fn vortex_btrblocks::compressor::integer::SequenceScheme::scheme_name(&self) -> &'static str -pub vortex_btrblocks::GenerateStatsOptions::count_distinct_values: bool +pub struct vortex_btrblocks::compressor::integer::SparseScheme -impl core::default::Default for vortex_btrblocks::GenerateStatsOptions +impl core::clone::Clone for vortex_btrblocks::compressor::integer::SparseScheme -pub fn 
vortex_btrblocks::GenerateStatsOptions::default() -> Self +pub fn vortex_btrblocks::compressor::integer::SparseScheme::clone(&self) -> vortex_btrblocks::compressor::integer::SparseScheme -pub struct vortex_btrblocks::IntegerStats +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::SparseScheme -impl core::clone::Clone for vortex_btrblocks::IntegerStats +impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::SparseScheme -pub fn vortex_btrblocks::IntegerStats::clone(&self) -> vortex_btrblocks::IntegerStats +pub fn vortex_btrblocks::compressor::integer::SparseScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::SparseScheme) -> bool -impl core::fmt::Debug for vortex_btrblocks::IntegerStats +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::SparseScheme -pub fn vortex_btrblocks::IntegerStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::compressor::integer::SparseScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl vortex_btrblocks::CompressorStats for vortex_btrblocks::IntegerStats +impl core::marker::Copy for vortex_btrblocks::compressor::integer::SparseScheme -pub type vortex_btrblocks::IntegerStats::ArrayVTable = vortex_array::arrays::primitive::vtable::Primitive +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::SparseScheme -pub fn vortex_btrblocks::IntegerStats::generate(input: &::Array) -> Self +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::SparseScheme -pub fn vortex_btrblocks::IntegerStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +pub fn vortex_btrblocks::compressor::integer::SparseScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, 
excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::IntegerStats::sample(&self, sample_size: u32, sample_count: u32) -> Self +pub fn vortex_btrblocks::compressor::integer::SparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::IntegerStats::sample_opts(&self, sample_size: u32, sample_count: u32, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +pub fn vortex_btrblocks::compressor::integer::SparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::IntegerStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray +pub fn vortex_btrblocks::compressor::integer::SparseScheme::scheme_name(&self) -> &'static str -pub trait vortex_btrblocks::CanonicalCompressor +pub fn vortex_btrblocks::compressor::integer::SparseScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions -pub fn vortex_btrblocks::CanonicalCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_btrblocks::ctx::CompressorContext, excludes: vortex_btrblocks::ctx::Excludes<'_>) -> vortex_error::VortexResult +pub struct vortex_btrblocks::compressor::integer::UncompressedScheme -pub fn vortex_btrblocks::CanonicalCompressor::float_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::float::FloatScheme] +impl core::clone::Clone for vortex_btrblocks::compressor::integer::UncompressedScheme -pub fn vortex_btrblocks::CanonicalCompressor::int_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::integer::IntegerScheme] +pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::clone(&self) -> 
vortex_btrblocks::compressor::integer::UncompressedScheme -pub fn vortex_btrblocks::CanonicalCompressor::string_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::string::StringScheme] +impl core::cmp::Eq for vortex_btrblocks::compressor::integer::UncompressedScheme -impl vortex_btrblocks::CanonicalCompressor for vortex_btrblocks::BtrBlocksCompressor +impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::UncompressedScheme -pub fn vortex_btrblocks::BtrBlocksCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_btrblocks::ctx::CompressorContext, excludes: vortex_btrblocks::ctx::Excludes<'_>) -> vortex_error::VortexResult +pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::UncompressedScheme) -> bool -pub fn vortex_btrblocks::BtrBlocksCompressor::float_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::float::FloatScheme] +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::UncompressedScheme -pub fn vortex_btrblocks::BtrBlocksCompressor::int_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::integer::IntegerScheme] +pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::BtrBlocksCompressor::string_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::string::StringScheme] +impl core::marker::Copy for vortex_btrblocks::compressor::integer::UncompressedScheme -pub trait vortex_btrblocks::CompressorStats: core::fmt::Debug + core::clone::Clone +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::UncompressedScheme -pub type vortex_btrblocks::CompressorStats::ArrayVTable: vortex_array::vtable::VTable +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::UncompressedScheme -pub fn vortex_btrblocks::CompressorStats::generate(input: 
&::Array) -> Self +pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::CompressorStats::generate_opts(input: &::Array, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult -pub fn vortex_btrblocks::CompressorStats::sample(&self, sample_size: u32, sample_count: u32) -> Self +pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::CompressorStats::sample_opts(&self, sample_size: u32, sample_count: u32, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::CompressorStats::source(&self) -> &::Array +pub struct vortex_btrblocks::compressor::integer::ZigZagScheme -impl vortex_btrblocks::CompressorStats for vortex_btrblocks::IntegerStats +impl core::clone::Clone for vortex_btrblocks::compressor::integer::ZigZagScheme -pub type vortex_btrblocks::IntegerStats::ArrayVTable = vortex_array::arrays::primitive::vtable::Primitive +pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::clone(&self) -> vortex_btrblocks::compressor::integer::ZigZagScheme -pub fn vortex_btrblocks::IntegerStats::generate(input: &::Array) -> Self +impl core::cmp::Eq for 
vortex_btrblocks::compressor::integer::ZigZagScheme -pub fn vortex_btrblocks::IntegerStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::ZigZagScheme + +pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::ZigZagScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::integer::ZigZagScheme + +pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::integer::ZigZagScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::ZigZagScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::ZigZagScheme + +pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::scheme_name(&self) -> &'static str + +pub const vortex_btrblocks::compressor::integer::RLE_INTEGER_SCHEME: vortex_btrblocks::compressor::rle::RLEScheme + +pub mod 
vortex_btrblocks::compressor::string + +pub use vortex_btrblocks::compressor::string::StringStats + +pub struct vortex_btrblocks::compressor::string::ConstantScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::string::ConstantScheme + +pub fn vortex_btrblocks::compressor::string::ConstantScheme::clone(&self) -> vortex_btrblocks::compressor::string::ConstantScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::string::ConstantScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::ConstantScheme + +pub fn vortex_btrblocks::compressor::string::ConstantScheme::eq(&self, other: &vortex_btrblocks::compressor::string::ConstantScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::string::ConstantScheme + +pub fn vortex_btrblocks::compressor::string::ConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::string::ConstantScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::ConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::ConstantScheme + +pub fn vortex_btrblocks::compressor::string::ConstantScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::string::ConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_btrblocks::compressor::string::ConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn 
vortex_btrblocks::compressor::string::ConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::string::ConstantScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::compressor::string::DictScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::string::DictScheme + +pub fn vortex_btrblocks::compressor::string::DictScheme::clone(&self) -> vortex_btrblocks::compressor::string::DictScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::string::DictScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::DictScheme + +pub fn vortex_btrblocks::compressor::string::DictScheme::eq(&self, other: &vortex_btrblocks::compressor::string::DictScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::string::DictScheme + +pub fn vortex_btrblocks::compressor::string::DictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::string::DictScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::DictScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::DictScheme + +pub fn vortex_btrblocks::compressor::string::DictScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::string::DictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn 
vortex_btrblocks::compressor::string::DictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::string::DictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_btrblocks::compressor::string::DictScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions + +pub struct vortex_btrblocks::compressor::string::FSSTScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::string::FSSTScheme + +pub fn vortex_btrblocks::compressor::string::FSSTScheme::clone(&self) -> vortex_btrblocks::compressor::string::FSSTScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::string::FSSTScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::FSSTScheme + +pub fn vortex_btrblocks::compressor::string::FSSTScheme::eq(&self, other: &vortex_btrblocks::compressor::string::FSSTScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::string::FSSTScheme + +pub fn vortex_btrblocks::compressor::string::FSSTScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::string::FSSTScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::FSSTScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::FSSTScheme + +pub fn vortex_btrblocks::compressor::string::FSSTScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::string::FSSTScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::string::FSSTScheme::scheme_name(&self) -> &'static str + +pub struct 
vortex_btrblocks::compressor::string::NullDominated + +impl core::clone::Clone for vortex_btrblocks::compressor::string::NullDominated + +pub fn vortex_btrblocks::compressor::string::NullDominated::clone(&self) -> vortex_btrblocks::compressor::string::NullDominated + +impl core::cmp::Eq for vortex_btrblocks::compressor::string::NullDominated + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::NullDominated + +pub fn vortex_btrblocks::compressor::string::NullDominated::eq(&self, other: &vortex_btrblocks::compressor::string::NullDominated) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::string::NullDominated + +pub fn vortex_btrblocks::compressor::string::NullDominated::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::string::NullDominated + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::NullDominated + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::NullDominated + +pub fn vortex_btrblocks::compressor::string::NullDominated::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::string::NullDominated::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::string::NullDominated::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::string::NullDominated::scheme_name(&self) -> &'static str + +pub struct 
vortex_btrblocks::compressor::string::UncompressedScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::string::UncompressedScheme + +pub fn vortex_btrblocks::compressor::string::UncompressedScheme::clone(&self) -> vortex_btrblocks::compressor::string::UncompressedScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::string::UncompressedScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::UncompressedScheme + +pub fn vortex_btrblocks::compressor::string::UncompressedScheme::eq(&self, other: &vortex_btrblocks::compressor::string::UncompressedScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::string::UncompressedScheme + +pub fn vortex_btrblocks::compressor::string::UncompressedScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::string::UncompressedScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::UncompressedScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::UncompressedScheme + +pub fn vortex_btrblocks::compressor::string::UncompressedScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::string::UncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::string::UncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn 
vortex_btrblocks::compressor::string::UncompressedScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::compressor::string::ZstdScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::string::ZstdScheme + +pub fn vortex_btrblocks::compressor::string::ZstdScheme::clone(&self) -> vortex_btrblocks::compressor::string::ZstdScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::string::ZstdScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::ZstdScheme + +pub fn vortex_btrblocks::compressor::string::ZstdScheme::eq(&self, other: &vortex_btrblocks::compressor::string::ZstdScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::string::ZstdScheme + +pub fn vortex_btrblocks::compressor::string::ZstdScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::string::ZstdScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::ZstdScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::ZstdScheme + +pub fn vortex_btrblocks::compressor::string::ZstdScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::string::ZstdScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::string::ZstdScheme::scheme_name(&self) -> &'static str + +pub mod vortex_btrblocks::compressor::temporal + +pub struct vortex_btrblocks::compressor::temporal::TemporalScheme + +impl core::clone::Clone for vortex_btrblocks::compressor::temporal::TemporalScheme + +pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::clone(&self) -> 
vortex_btrblocks::compressor::temporal::TemporalScheme + +impl core::cmp::Eq for vortex_btrblocks::compressor::temporal::TemporalScheme + +impl core::cmp::PartialEq for vortex_btrblocks::compressor::temporal::TemporalScheme + +pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::eq(&self, other: &vortex_btrblocks::compressor::temporal::TemporalScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::compressor::temporal::TemporalScheme + +pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::compressor::temporal::TemporalScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::temporal::TemporalScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::temporal::TemporalScheme + +pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::detects_constant(&self) -> bool + +pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::BtrBlocksCompressor(pub 
vortex_compressor::compressor::CascadingCompressor) + +impl vortex_btrblocks::BtrBlocksCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_btrblocks::BtrBlocksCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressor::clone(&self) -> vortex_btrblocks::BtrBlocksCompressor + +impl core::default::Default for vortex_btrblocks::BtrBlocksCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressor::default() -> Self + +impl core::ops::deref::Deref for vortex_btrblocks::BtrBlocksCompressor + +pub type vortex_btrblocks::BtrBlocksCompressor::Target = vortex_compressor::compressor::CascadingCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressor::deref(&self) -> &vortex_compressor::compressor::CascadingCompressor + +pub struct vortex_btrblocks::BtrBlocksCompressorBuilder + +impl vortex_btrblocks::BtrBlocksCompressorBuilder + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::build(self) -> vortex_btrblocks::BtrBlocksCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude(self, ids: impl core::iter::traits::collect::IntoIterator) -> Self + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::include(self, ids: impl core::iter::traits::collect::IntoIterator) -> Self + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::with_scheme(self, scheme: &'static dyn vortex_compressor::scheme::Scheme) -> Self + +impl core::clone::Clone for vortex_btrblocks::BtrBlocksCompressorBuilder + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::clone(&self) -> vortex_btrblocks::BtrBlocksCompressorBuilder + +impl core::default::Default for vortex_btrblocks::BtrBlocksCompressorBuilder + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::default() -> Self + +impl core::fmt::Debug for vortex_btrblocks::BtrBlocksCompressorBuilder + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> 
core::fmt::Result -pub fn vortex_btrblocks::IntegerStats::sample(&self, sample_size: u32, sample_count: u32) -> Self +pub const vortex_btrblocks::ALL_SCHEMES: &[&dyn vortex_compressor::scheme::Scheme] -pub fn vortex_btrblocks::IntegerStats::sample_opts(&self, sample_size: u32, sample_count: u32, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +pub fn vortex_btrblocks::compress_patches(patches: &vortex_array::patches::Patches) -> vortex_error::VortexResult -pub fn vortex_btrblocks::IntegerStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray +pub fn vortex_btrblocks::default_excluded() -> vortex_utils::aliases::hash_set::HashSet -pub fn vortex_btrblocks::integer_dictionary_encode(stats: &vortex_btrblocks::IntegerStats) -> vortex_array::arrays::dict::array::DictArray +pub fn vortex_btrblocks::integer_dictionary_encode(stats: &vortex_compressor::stats::integer::IntegerStats) -> vortex_array::arrays::dict::array::DictArray diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index ff3229c4a37..550a59cb697 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -6,7 +6,9 @@ use vortex_utils::aliases::hash_set::HashSet; use crate::BtrBlocksCompressor; +use crate::CascadingCompressor; use crate::Scheme; +use crate::SchemeExt; use crate::SchemeId; /// All available compression schemes. @@ -38,6 +40,10 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ &crate::compressor::float::RLE_FLOAT_SCHEME, #[cfg(feature = "pco")] &crate::compressor::float::PcoScheme, + // Decimal schemes. + &crate::compressor::decimal::DecimalScheme, + // Temporal schemes. + &crate::compressor::temporal::TemporalScheme, // String schemes. 
&crate::compressor::string::UncompressedScheme, &crate::compressor::string::DictScheme, @@ -50,35 +56,31 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ &crate::compressor::string::ZstdBuffersScheme, ]; -/// Schemes excluded by default (behind feature gates that are off or known-expensive). -const DEFAULT_EXCLUDED: &[SchemeId] = &[ +/// Returns the set of scheme IDs excluded by default (behind feature gates or known-expensive). +pub fn default_excluded() -> HashSet { + #[allow(unused_mut)] + let mut excluded = HashSet::new(); #[cfg(feature = "pco")] - SchemeId { - name: "vortex.int.pco", - }, - #[cfg(feature = "pco")] - SchemeId { - name: "vortex.float.pco", - }, + { + excluded.insert(crate::compressor::integer::PcoScheme.id()); + excluded.insert(crate::compressor::float::PcoScheme.id()); + } #[cfg(feature = "zstd")] - SchemeId { - name: "vortex.string.zstd", - }, + excluded.insert(crate::compressor::string::ZstdScheme.id()); #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] - SchemeId { - name: "vortex.string.zstd_buffers", - }, -]; + excluded.insert(crate::compressor::string::ZstdBuffersScheme.id()); + excluded +} /// Builder for creating configured [`BtrBlocksCompressor`] instances. /// /// Use this builder to configure which compression schemes are allowed. -/// By default, all schemes are enabled except those in [`DEFAULT_EXCLUDED`]. +/// By default, all schemes are enabled except those in [`default_excluded`]. /// /// # Examples /// /// ```rust -/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, Scheme}; +/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, Scheme, SchemeExt}; /// use vortex_btrblocks::compressor::integer::DictScheme; /// /// // Default compressor - all non-excluded schemes allowed. 
@@ -102,7 +104,7 @@ pub struct BtrBlocksCompressorBuilder { impl Default for BtrBlocksCompressorBuilder { fn default() -> Self { - let excluded: HashSet = DEFAULT_EXCLUDED.iter().copied().collect(); + let excluded = default_excluded(); Self { schemes: ALL_SCHEMES .iter() @@ -150,6 +152,6 @@ impl BtrBlocksCompressorBuilder { .copied() .filter(|s| self.schemes.contains(s)) .collect(); - BtrBlocksCompressor { schemes } + BtrBlocksCompressor(CascadingCompressor::new(schemes)) } } diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index e56032c9c08..3d1aea20323 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -1,60 +1,25 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Canonical array compression implementation. +//! BtrBlocks-specific compressor wrapping the generic [`CascadingCompressor`]. + +use std::ops::Deref; use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::CanonicalValidity; -use vortex_array::DynArray; -use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; -use vortex_array::ToCanonical; -use vortex_array::VortexSessionExecute; -use vortex_array::aggregate_fn::fns::is_constant::is_constant; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::ExtensionArray; -use vortex_array::arrays::FixedSizeListArray; -use vortex_array::arrays::ListArray; -use vortex_array::arrays::ListViewArray; -use vortex_array::arrays::StructArray; -use vortex_array::arrays::TemporalArray; -use vortex_array::arrays::listview::list_from_list_view; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::extension::datetime::TemporalMetadata; -use vortex_array::scalar::Scalar; -use vortex_array::vtable::ValidityHelper; use vortex_error::VortexResult; -use crate::ArrayAndStats; use crate::BtrBlocksCompressorBuilder; -use 
crate::CompressorContext; -use crate::GenerateStatsOptions; -use crate::Scheme; -use crate::SchemeId; -use crate::compressor::decimal::compress_decimal; -use crate::compressor::integer::DictScheme as IntDictScheme; -use crate::compressor::temporal::compress_temporal; +use crate::CascadingCompressor; -/// The main compressor type implementing BtrBlocks-inspired compression. -/// -/// This compressor applies adaptive compression schemes to arrays based on their data types -/// and characteristics. It recursively compresses nested structures like structs and lists, -/// and chooses optimal compression schemes for primitive types. +/// The BtrBlocks-style compressor with all built-in schemes pre-registered. /// -/// The compressor works by: -/// 1. Canonicalizing input arrays to a standard representation. -/// 2. Pre-filtering schemes by [`Scheme::matches`] and excludes. -/// 3. Evaluating each matching scheme's compression ratio on a sample. -/// 4. Compressing with the best scheme and verifying the result is smaller. -/// -/// Use [`BtrBlocksCompressorBuilder`] to configure which compression schemes are enabled. +/// This is a thin wrapper around [`CascadingCompressor`] that provides a default set of +/// compression schemes via [`BtrBlocksCompressorBuilder`]. /// /// # Examples /// /// ```rust -/// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme}; +/// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme, SchemeExt}; /// use vortex_btrblocks::compressor::integer::DictScheme; /// /// // Default compressor - all schemes allowed. @@ -66,267 +31,30 @@ use crate::compressor::temporal::compress_temporal; /// .build(); /// ``` #[derive(Clone)] -pub struct BtrBlocksCompressor { - /// The enabled compression schemes. 
- pub schemes: Vec<&'static dyn Scheme>, -} - -impl Default for BtrBlocksCompressor { - fn default() -> Self { - BtrBlocksCompressorBuilder::default().build() - } -} +pub struct BtrBlocksCompressor( + /// The underlying cascading compressor. + pub CascadingCompressor, +); impl BtrBlocksCompressor { /// Compresses an array using BtrBlocks-inspired compression. - /// - /// First canonicalizes and compacts the array, then applies optimal compression schemes. pub fn compress(&self, array: &ArrayRef) -> VortexResult { - let canonical = array - .clone() - .execute::(&mut LEGACY_SESSION.create_execution_ctx())? - .0; - - // Compact it, removing any wasted space before we attempt to compress it. - let compact = canonical.compact()?; - - self.compress_canonical(compact, CompressorContext::default(), &[]) - } - - /// Compresses a canonical array by dispatching to type-specific logic. - /// - /// For primitives and strings this calls [`choose_and_compress`](Self::choose_and_compress). - /// For compound types it recurses into children. - pub(crate) fn compress_canonical( - &self, - array: Canonical, - ctx: CompressorContext, - excludes: &[SchemeId], - ) -> VortexResult { - match array { - Canonical::Null(null_array) => Ok(null_array.into_array()), - Canonical::Bool(bool_array) => Ok(bool_array.into_array()), - Canonical::Primitive(primitive) => { - self.choose_and_compress(Canonical::Primitive(primitive), ctx, excludes) - } - Canonical::Decimal(decimal) => compress_decimal(self, &decimal), - Canonical::Struct(struct_array) => { - let fields = struct_array - .unmasked_fields() - .iter() - .map(|field| self.compress(field)) - .collect::, _>>()?; - - Ok(StructArray::try_new( - struct_array.names().clone(), - fields, - struct_array.len(), - struct_array.validity().clone(), - )? 
- .into_array()) - } - Canonical::List(list_view_array) => { - if list_view_array.is_zero_copy_to_list() || list_view_array.elements().is_empty() { - let list_array = list_from_list_view(list_view_array)?; - self.compress_list_array(list_array, ctx) - } else { - self.compress_list_view_array(list_view_array, ctx) - } - } - Canonical::FixedSizeList(fsl_array) => { - let compressed_elems = self.compress(fsl_array.elements())?; - - Ok(FixedSizeListArray::try_new( - compressed_elems, - fsl_array.list_size(), - fsl_array.validity().clone(), - fsl_array.len(), - )? - .into_array()) - } - Canonical::VarBinView(strings) => { - if strings - .dtype() - .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) - { - self.choose_and_compress(Canonical::VarBinView(strings), ctx, excludes) - } else { - // Binary arrays do not compress. - Ok(strings.into_array()) - } - } - Canonical::Extension(ext_array) => { - if let Ok(temporal_array) = TemporalArray::try_from(ext_array.clone().into_array()) - && let TemporalMetadata::Timestamp(..) = temporal_array.temporal_metadata() - { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - if is_constant(&ext_array.clone().into_array(), &mut ctx)? { - return Ok(ConstantArray::new( - temporal_array.as_ref().scalar_at(0)?, - ext_array.len(), - ) - .into_array()); - } - return compress_temporal(self, temporal_array); - } - - // Compress the underlying storage array. - let compressed_storage = self.compress(ext_array.storage_array())?; - - Ok( - ExtensionArray::new(ext_array.ext_dtype().clone(), compressed_storage) - .into_array(), - ) - } - } - } - - /// Filters eligible schemes, evaluates their compression ratios, and compresses with the - /// best one. 
- fn choose_and_compress( - &self, - canonical: Canonical, - ctx: CompressorContext, - excludes: &[SchemeId], - ) -> VortexResult { - let eligible: Vec<&'static dyn Scheme> = self - .schemes - .iter() - .copied() - .filter(|s| s.matches(&canonical) && !excludes.contains(&s.id())) - .collect(); - - let array: ArrayRef = canonical.into(); - - // Nothing to compress if empty or all-null. - if array.is_empty() { - return Ok(array); - } - - if array.all_invalid()? { - return Ok( - ConstantArray::new(Scalar::null(array.dtype().clone()), array.len()).into_array(), - ); - } - - let before_nbytes = array.nbytes(); - let merged_opts = eligible - .iter() - .fold(GenerateStatsOptions::default(), |acc, s| { - acc.merge(s.stats_options()) - }); - - let mut ctx = ctx; - ctx.stats_options = merged_opts; - - let mut data = ArrayAndStats::new(array, merged_opts); - - if let Some(winner) = self.choose_scheme(&eligible, &mut data, ctx, excludes)? { - let compressed = winner.compress(self, &mut data, ctx, excludes)?; - if compressed.nbytes() < before_nbytes { - return Ok(compressed); - } - } - - // No scheme improved on the original. - Ok(data.into_array()) + self.0.compress(array) } +} - /// Evaluates each candidate scheme and returns the one with the best compression ratio - /// (must be > 1.0). - fn choose_scheme( - &self, - schemes: &[&'static dyn Scheme], - data: &mut ArrayAndStats, - ctx: CompressorContext, - excludes: &[SchemeId], - ) -> VortexResult> { - let mut best: Option<(&'static dyn Scheme, f64)> = None; - - for &scheme in schemes { - let ratio = self.evaluate_scheme(scheme, data, ctx, excludes)?; - if is_valid_ratio(ratio) && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) { - best = Some((scheme, ratio)); - } - } - - Ok(best.map(|(s, _)| s)) - } - - /// Evaluates a single scheme's expected compression ratio with tracing. 
- fn evaluate_scheme( - &self, - scheme: &'static dyn Scheme, - data: &mut ArrayAndStats, - ctx: CompressorContext, - excludes: &[SchemeId], - ) -> VortexResult { - let ratio = scheme.expected_compression_ratio(self, data, ctx, excludes)?; - - tracing::debug!( - scheme = %scheme.id(), - ratio, - "evaluated compression ratio" - ); - - Ok(ratio) - } - - /// Compresses a [`ListArray`] by narrowing offsets and recursively compressing elements. - fn compress_list_array( - &self, - list_array: ListArray, - ctx: CompressorContext, - ) -> VortexResult { - let list_array = list_array.reset_offsets(true)?; - - let compressed_elems = self.compress(list_array.elements())?; - - let compressed_offsets = self.compress_canonical( - Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), - ctx, - &[IntDictScheme.id()], - )?; - - Ok(ListArray::try_new( - compressed_elems, - compressed_offsets, - list_array.validity().clone(), - )? - .into_array()) - } +impl Deref for BtrBlocksCompressor { + type Target = CascadingCompressor; - /// Compresses a [`ListViewArray`] by narrowing offsets/sizes and recursively compressing - /// elements. - fn compress_list_view_array( - &self, - list_view: ListViewArray, - ctx: CompressorContext, - ) -> VortexResult { - let compressed_elems = self.compress(list_view.elements())?; - let compressed_offsets = self.compress_canonical( - Canonical::Primitive(list_view.offsets().to_primitive().narrow()?), - ctx, - &[], - )?; - let compressed_sizes = self.compress_canonical( - Canonical::Primitive(list_view.sizes().to_primitive().narrow()?), - ctx, - &[], - )?; - Ok(ListViewArray::try_new( - compressed_elems, - compressed_offsets, - compressed_sizes, - list_view.validity().clone(), - )? - .into_array()) + fn deref(&self) -> &CascadingCompressor { + &self.0 } } -/// Returns `true` if the ratio is a usable finite number (not NaN, infinity, or subnormal). 
-fn is_valid_ratio(ratio: f64) -> bool { - ratio.is_finite() && !ratio.is_subnormal() +impl Default for BtrBlocksCompressor { + fn default() -> Self { + BtrBlocksCompressorBuilder::default().build() + } } #[cfg(test)] diff --git a/vortex-btrblocks/src/compressor/decimal.rs b/vortex-btrblocks/src/compressor/decimal.rs index e9985c90f81..2aaa5a4e5d2 100644 --- a/vortex-btrblocks/src/compressor/decimal.rs +++ b/vortex-btrblocks/src/compressor/decimal.rs @@ -1,10 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! Decimal compression scheme using byte-part decomposition. + use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; -use vortex_array::arrays::DecimalArray; +use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::decimal::narrowed_decimal; use vortex_array::dtype::DecimalType; @@ -12,30 +14,66 @@ use vortex_array::vtable::ValidityHelper; use vortex_decimal_byte_parts::DecimalBytePartsArray; use vortex_error::VortexResult; -use crate::BtrBlocksCompressor; +use crate::ArrayAndStats; +use crate::CascadingCompressor; use crate::CompressorContext; +use crate::Scheme; +use crate::SchemeId; + +/// Compression scheme for decimal arrays via byte-part decomposition. +/// +/// Narrows the decimal to the smallest integer type, compresses the underlying primitive, +/// and wraps the result in a [`DecimalBytePartsArray`]. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct DecimalScheme; + +impl Scheme for DecimalScheme { + fn scheme_name(&self) -> &'static str { + "vortex.decimal.byte_parts" + } + + fn matches(&self, canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Decimal(_)) + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + _excludes: &[SchemeId], + ) -> VortexResult { + // Decimal compression is almost always beneficial (narrowing + primitive compression). + // Return a moderate ratio to ensure this scheme is always selected. + Ok(2.0) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + _excludes: &[SchemeId], + ) -> VortexResult { + // TODO(joe): add support splitting i128/256 buffers into chunks of primitive values + // for compression. 2 for i128 and 4 for i256. + let decimal = data.array().clone().to_decimal(); + let decimal = narrowed_decimal(decimal); + let validity = decimal.validity(); + let prim = match decimal.values_type() { + DecimalType::I8 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), + DecimalType::I16 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), + DecimalType::I32 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), + DecimalType::I64 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), + _ => return Ok(decimal.into_array()), + }; + + let compressed = compressor.compress_canonical( + Canonical::Primitive(prim), + CompressorContext::default(), + &[], + )?; -// TODO(joe): add support splitting i128/256 buffers into chunks primitive values for compression. 
-// 2 for i128 and 4 for i256 -pub fn compress_decimal( - compressor: &BtrBlocksCompressor, - decimal: &DecimalArray, -) -> VortexResult { - let decimal = narrowed_decimal(decimal.clone()); - let validity = decimal.validity(); - let prim = match decimal.values_type() { - DecimalType::I8 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), - DecimalType::I16 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), - DecimalType::I32 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), - DecimalType::I64 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), - _ => return Ok(decimal.into_array()), - }; - - let compressed = compressor.compress_canonical( - Canonical::Primitive(prim), - CompressorContext::default(), - &[], - )?; - - DecimalBytePartsArray::try_new(compressed, decimal.decimal_dtype()).map(|d| d.into_array()) + DecimalBytePartsArray::try_new(compressed, decimal.decimal_dtype()).map(|d| d.into_array()) + } } diff --git a/vortex-btrblocks/src/compressor/float/dictionary.rs b/vortex-btrblocks/src/compressor/float/dictionary.rs index 3370c20cf3b..d5fb91c9a67 100644 --- a/vortex-btrblocks/src/compressor/float/dictionary.rs +++ b/vortex-btrblocks/src/compressor/float/dictionary.rs @@ -12,31 +12,37 @@ use vortex_array::dtype::half::f16; use vortex_array::validity::Validity; use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; +use vortex_compressor::stats::FloatErasedStats as ErasedStats; use vortex_error::VortexExpect; -use super::stats::ErasedStats; -use super::stats::FloatStats; +use super::FloatStats; macro_rules! 
typed_encode { ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ - let distinct = $typed.distinct.as_ref().vortex_expect( + let distinct = $typed.distinct().vortex_expect( "this must be present since `DictScheme` declared that we need distinct values", ); - let values: Buffer<$typ> = distinct.distinct_values.iter().map(|x| x.0).collect(); + let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect(); let max_code = values.len(); let codes = if max_code <= u8::MAX as usize { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); PrimitiveArray::new(buf, $validity.clone()).into_array() } else if max_code <= u16::MAX as usize { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); PrimitiveArray::new(buf, $validity.clone()).into_array() } else { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); PrimitiveArray::new(buf, $validity.clone()).into_array() }; @@ -53,8 +59,8 @@ macro_rules! typed_encode { /// Compresses a floating-point array into a dictionary arrays according to attached stats. 
pub fn dictionary_encode(stats: &FloatStats) -> DictArray { - let validity = stats.src.validity(); - match &stats.erased { + let validity = stats.source().validity(); + match stats.erased() { ErasedStats::F16(typed) => typed_encode!(stats, typed, validity, f16), ErasedStats::F32(typed) => typed_encode!(stats, typed, validity, f32), ErasedStats::F64(typed) => typed_encode!(stats, typed, validity, f64), diff --git a/vortex-btrblocks/src/compressor/float/mod.rs b/vortex-btrblocks/src/compressor/float/mod.rs index f979a4b3275..ea2a6e127fd 100644 --- a/vortex-btrblocks/src/compressor/float/mod.rs +++ b/vortex-btrblocks/src/compressor/float/mod.rs @@ -2,7 +2,6 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors pub(crate) mod dictionary; -pub(super) mod stats; use vortex_alp::ALP; use vortex_alp::ALPArray; @@ -20,27 +19,29 @@ use vortex_array::arrays::dict::DictArrayParts; use vortex_array::dtype::PType; use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; +pub use vortex_compressor::stats::FloatStats; use vortex_error::VortexResult; use vortex_error::vortex_panic; use vortex_sparse::Sparse; use vortex_sparse::SparseArray; use self::dictionary::dictionary_encode; -pub use self::stats::FloatStats; use super::integer::DictScheme as IntDictScheme; use super::integer::RunEndScheme as IntRunEndScheme; use super::integer::SequenceScheme as IntSequenceScheme; use super::integer::SparseScheme as IntSparseScheme; use crate::ArrayAndStats; -use crate::BtrBlocksCompressor; +use crate::CascadingCompressor; use crate::CompressorContext; use crate::GenerateStatsOptions; use crate::Scheme; +use crate::SchemeExt; use crate::SchemeId; -use crate::compressor::patches::compress_patches; +use crate::compress_patches; use crate::compressor::rle; use crate::compressor::rle::RLEScheme; -use crate::scheme::estimate_compression_ratio_with_sampling; +use crate::compressor::rle::RLEStats; +use crate::estimate_compression_ratio_with_sampling; /// Returns `true` if 
the canonical form represents a floating-point primitive. fn is_float_primitive(canonical: &Canonical) -> bool { @@ -94,7 +95,7 @@ impl rle::RLEConfig for FloatRLEConfig { } fn compress_values( - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, values: &PrimitiveArray, ctx: CompressorContext, excludes: &[SchemeId], @@ -103,6 +104,20 @@ impl rle::RLEConfig for FloatRLEConfig { } } +impl RLEStats for FloatStats { + fn value_count(&self) -> u32 { + FloatStats::value_count(self) + } + + fn average_run_length(&self) -> u32 { + FloatStats::average_run_length(self) + } + + fn source(&self) -> &PrimitiveArray { + FloatStats::source(self) + } +} + /// RLE scheme for float compression. pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); @@ -117,7 +132,7 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -127,7 +142,7 @@ impl Scheme for UncompressedScheme { fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -145,13 +160,13 @@ impl Scheme for ConstantScheme { is_float_primitive(canonical) } - fn is_constant(&self) -> bool { + fn detects_constant(&self) -> bool { true } fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -163,13 +178,13 @@ impl Scheme for ConstantScheme { let stats = data.float_stats(); - if stats.null_count as usize == stats.src.len() || stats.value_count == 0 { + if stats.null_count() as usize == stats.source().len() || stats.value_count() == 0 { return Ok(0.0); } // Can only have 1 distinct value. 
if stats.distinct_count().is_some_and(|count| count == 1) { - return Ok(stats.value_count as f64); + return Ok(stats.value_count() as f64); } Ok(0.0) @@ -177,7 +192,7 @@ impl Scheme for ConstantScheme { fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -190,16 +205,19 @@ impl Scheme for ConstantScheme { match scalar_idx { Some(idx) => { let scalar = stats.source().scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, stats.src.len()).into_array(); + let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); if !stats.source().all_valid()? { - Ok(MaskedArray::try_new(const_arr, stats.src.validity().clone())?.into_array()) + Ok( + MaskedArray::try_new(const_arr, stats.source().validity().clone())? + .into_array(), + ) } else { Ok(const_arr) } } None => Ok(ConstantArray::new( - Scalar::null(stats.src.dtype().clone()), - stats.src.len(), + Scalar::null(stats.source().dtype().clone()), + stats.source().len(), ) .into_array()), } @@ -217,7 +235,7 @@ impl Scheme for ALPScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -235,12 +253,12 @@ impl Scheme for ALPScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) } fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -285,7 +303,7 @@ impl Scheme for ALPRDScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -296,12 +314,12 @@ impl Scheme for 
ALPRDScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) } fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -343,14 +361,14 @@ impl Scheme for DictScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], ) -> VortexResult { let stats = data.float_stats(); - if stats.value_count == 0 { + if stats.value_count() == 0 { return Ok(0.0); } @@ -358,10 +376,16 @@ impl Scheme for DictScheme { // dictionary. if stats .distinct_count() - .is_some_and(|count| count <= stats.value_count / 2) + .is_some_and(|count| count <= stats.value_count() / 2) { // Take a sample and run compression on the sample to determine before/after size. - return estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes); + return estimate_compression_ratio_with_sampling( + self, + compressor, + data.array(), + ctx, + excludes, + ); } Ok(0.0) @@ -369,7 +393,7 @@ impl Scheme for DictScheme { fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -415,7 +439,7 @@ impl Scheme for NullDominated { fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -427,14 +451,14 @@ impl Scheme for NullDominated { let stats = data.float_stats(); - if stats.value_count == 0 { + if stats.value_count() == 0 { // All nulls should use ConstantScheme. return Ok(0.0); } // If the majority is null, will compress well. 
- if stats.null_count as f64 / stats.src.len() as f64 > 0.9 { - return Ok(stats.src.len() as f64 / stats.value_count as f64); + if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { + return Ok(stats.source().len() as f64 / stats.value_count() as f64); } // Otherwise we don't go this route. @@ -443,7 +467,7 @@ impl Scheme for NullDominated { fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -453,7 +477,7 @@ impl Scheme for NullDominated { let stats = data.float_stats(); // We pass None as we only run this pathway for NULL-dominated float arrays. - let sparse_encoded = SparseArray::encode(&stats.src.clone().into_array(), None)?; + let sparse_encoded = SparseArray::encode(&stats.source().clone().into_array(), None)?; if let Some(sparse) = sparse_encoded.as_opt::() { let indices = sparse.patches().indices().to_primitive().narrow()?; @@ -488,7 +512,7 @@ impl Scheme for PcoScheme { fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], diff --git a/vortex-btrblocks/src/compressor/integer/dictionary.rs b/vortex-btrblocks/src/compressor/integer/dictionary.rs index d12ea2b6233..2bbf10c224a 100644 --- a/vortex-btrblocks/src/compressor/integer/dictionary.rs +++ b/vortex-btrblocks/src/compressor/integer/dictionary.rs @@ -11,31 +11,37 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; +use vortex_compressor::stats::IntegerErasedStats as ErasedStats; use vortex_error::VortexExpect; use super::IntegerStats; -use super::stats::ErasedStats; macro_rules! 
typed_encode { ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ - let distinct = $typed.distinct.as_ref().vortex_expect( + let distinct = $typed.distinct().vortex_expect( "this must be present since `DictScheme` declared that we need distinct values", ); - let values: Buffer<$typ> = distinct.distinct_values.keys().map(|x| x.0).collect(); + let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect(); let max_code = values.len(); let codes = if max_code <= u8::MAX as usize { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); PrimitiveArray::new(buf, $validity.clone()).into_array() } else if max_code <= u16::MAX as usize { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); PrimitiveArray::new(buf, $validity.clone()).into_array() } else { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); PrimitiveArray::new(buf, $validity.clone()).into_array() }; @@ -57,9 +63,9 @@ macro_rules! 
typed_encode { )] pub fn dictionary_encode(stats: &IntegerStats) -> DictArray { // We need to preserve the nullability somehow from the original - let src_validity = stats.src.validity(); + let src_validity = stats.source().validity(); - match &stats.erased { + match stats.erased() { ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8), ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16), ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32), diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index fdc7c6864dc..86d99079265 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -2,9 +2,6 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors pub(crate) mod dictionary; -pub(super) mod stats; - -pub use stats::IntegerStats; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; @@ -15,6 +12,7 @@ use vortex_array::arrays::MaskedArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; +pub use vortex_compressor::stats::IntegerStats; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -33,15 +31,17 @@ use vortex_zigzag::zigzag_encode; use self::dictionary::dictionary_encode; use crate::ArrayAndStats; -use crate::BtrBlocksCompressor; +use crate::CascadingCompressor; use crate::CompressorContext; use crate::GenerateStatsOptions; use crate::Scheme; +use crate::SchemeExt; use crate::SchemeId; -use crate::compressor::patches::compress_patches; +use crate::compress_patches; use crate::compressor::rle; use crate::compressor::rle::RLEScheme; -use crate::scheme::estimate_compression_ratio_with_sampling; +use crate::compressor::rle::RLEStats; +use crate::estimate_compression_ratio_with_sampling; /// Returns `true` if the canonical array is a primitive 
with an integer ptype. fn is_integer_primitive(canonical: &Canonical) -> bool { @@ -110,7 +110,7 @@ impl rle::RLEConfig for IntRLEConfig { } fn compress_values( - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, values: &PrimitiveArray, ctx: CompressorContext, excludes: &[SchemeId], @@ -119,6 +119,20 @@ impl rle::RLEConfig for IntRLEConfig { } } +impl RLEStats for IntegerStats { + fn value_count(&self) -> u32 { + self.value_count() + } + + fn average_run_length(&self) -> u32 { + self.average_run_length() + } + + fn source(&self) -> &PrimitiveArray { + self.source() + } +} + /// RLE scheme for integer compression. pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new(); @@ -133,7 +147,7 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -144,7 +158,7 @@ impl Scheme for UncompressedScheme { fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -162,13 +176,13 @@ impl Scheme for ConstantScheme { is_integer_primitive(canonical) } - fn is_constant(&self) -> bool { + fn detects_constant(&self) -> bool { true } fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -185,12 +199,12 @@ impl Scheme for ConstantScheme { return Ok(0.0); } - Ok(stats.value_count as f64) + Ok(stats.value_count() as f64) } fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -203,16 +217,19 @@ impl Scheme for ConstantScheme { match scalar_idx { Some(idx) => { let scalar = stats.source().scalar_at(idx)?; - let const_arr = 
ConstantArray::new(scalar, stats.src.len()).into_array(); + let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); if !stats.source().all_valid()? { - Ok(MaskedArray::try_new(const_arr, stats.src.validity().clone())?.into_array()) + Ok( + MaskedArray::try_new(const_arr, stats.source().validity().clone())? + .into_array(), + ) } else { Ok(const_arr) } } None => Ok(ConstantArray::new( - Scalar::null(stats.src.dtype().clone()), - stats.src.len(), + Scalar::null(stats.source().dtype().clone()), + stats.source().len(), ) .into_array()), } @@ -230,7 +247,7 @@ impl Scheme for FORScheme { fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -243,23 +260,23 @@ impl Scheme for FORScheme { let stats = data.integer_stats(); // All-null cannot be FOR compressed. - if stats.value_count == 0 { + if stats.value_count() == 0 { return Ok(0.0); } // Only apply when the min is not already zero. - if stats.erased.min_is_zero() { + if stats.erased().min_is_zero() { return Ok(0.0); } // Difference between max and min. let full_width: u32 = stats - .src + .source() .ptype() .bit_width() .try_into() .vortex_expect("bit width must fit in u32"); - let for_bw = match stats.erased.max_minus_min().checked_ilog2() { + let for_bw = match stats.erased().max_minus_min().checked_ilog2() { Some(l) => l + 1, // If max-min == 0, we should use a different compression scheme as we don't want to // bitpack down to 0 bits. @@ -270,9 +287,9 @@ impl Scheme for FORScheme { // compared to BitPacking, don't use FOR since it has overhead (storing reference). // Only skip FOR when min >= 0, otherwise BitPacking can't apply directly. 
if let Some(max_log) = stats - .erased + .erased() .max_ilog2() - .filter(|_| !stats.erased.min_is_negative()) + .filter(|_| !stats.erased().min_is_negative()) { let bitpack_bw = max_log + 1; if for_bw >= bitpack_bw { @@ -285,7 +302,7 @@ impl Scheme for FORScheme { fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -327,7 +344,7 @@ impl Scheme for ZigZagScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -340,22 +357,22 @@ impl Scheme for ZigZagScheme { let stats = data.integer_stats(); // Don't try and compress all-null arrays. - if stats.value_count == 0 { + if stats.value_count() == 0 { return Ok(0.0); } // ZigZag is only useful when there are negative values. - if !stats.erased.min_is_negative() { + if !stats.erased().min_is_negative() { return Ok(0.0); } // Run compression on a sample to see how it performs. - estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) } fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -363,7 +380,7 @@ impl Scheme for ZigZagScheme { let stats = data.integer_stats(); // Zigzag encode the values, then recursively compress the inner values. - let zag = zigzag_encode(stats.src.clone())?; + let zag = zigzag_encode(stats.source().clone())?; let encoded = zag.encoded().to_primitive(); // ZigZag should be after Dict, RunEnd or Sparse. 
@@ -399,7 +416,7 @@ impl Scheme for BitPackingScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -407,21 +424,21 @@ impl Scheme for BitPackingScheme { let stats = data.integer_stats(); // BitPacking only works for non-negative values. - if stats.erased.min_is_negative() { + if stats.erased().min_is_negative() { return Ok(0.0); } // Don't compress all-null arrays. - if stats.value_count == 0 { + if stats.value_count() == 0 { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) } fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -460,7 +477,7 @@ impl Scheme for SparseScheme { fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -477,30 +494,33 @@ impl Scheme for SparseScheme { // will have distinct values computed. let stats = data.integer_stats(); - if stats.value_count == 0 { + if stats.value_count() == 0 { // All nulls should use ConstantScheme. return Ok(0.0); } // If the majority is null, will compress well. - if stats.null_count as f64 / stats.src.len() as f64 > 0.9 { - return Ok(stats.src.len() as f64 / stats.value_count as f64); + if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { + return Ok(stats.source().len() as f64 / stats.value_count() as f64); } // See if the top value accounts for >= 90% of the set values. 
- let (_, top_count) = stats.erased.most_frequent_value_and_count().vortex_expect( - "this must be present since `SparseScheme` declared that we need distinct values", - ); - - if top_count == stats.value_count { + let (_, top_count) = stats + .erased() + .most_frequent_value_and_count() + .vortex_expect( + "this must be present since `SparseScheme` declared that we need distinct values", + ); + + if top_count == stats.value_count() { // top_value is the only value, should use ConstantScheme instead. return Ok(0.0); } - let freq = top_count as f64 / stats.value_count as f64; + let freq = top_count as f64 / stats.value_count() as f64; if freq >= 0.9 { // We only store the positions of the non-top values. - return Ok(stats.value_count as f64 / (stats.value_count - top_count) as f64); + return Ok(stats.value_count() as f64 / (stats.value_count() - top_count) as f64); } Ok(0.0) @@ -508,7 +528,7 @@ impl Scheme for SparseScheme { fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -517,28 +537,31 @@ impl Scheme for SparseScheme { let stats = data.integer_stats(); - let (top_pvalue, top_count) = stats.erased.most_frequent_value_and_count().vortex_expect( - "this must be present since `SparseScheme` declared that we need distinct values", - ); - if top_count as usize == stats.src.len() { + let (top_pvalue, top_count) = stats + .erased() + .most_frequent_value_and_count() + .vortex_expect( + "this must be present since `SparseScheme` declared that we need distinct values", + ); + if top_count as usize == stats.source().len() { // top_value is the only value, use ConstantScheme. 
return Ok(ConstantArray::new( Scalar::primitive_value( top_pvalue, top_pvalue.ptype(), - stats.src.dtype().nullability(), + stats.source().dtype().nullability(), ), - stats.src.len(), + stats.source().len(), ) .into_array()); } let sparse_encoded = SparseArray::encode( - &stats.src.clone().into_array(), + &stats.source().clone().into_array(), Some(Scalar::primitive_value( top_pvalue, top_pvalue.ptype(), - stats.src.dtype().nullability(), + stats.source().dtype().nullability(), )), )?; @@ -591,7 +614,7 @@ impl Scheme for DictScheme { fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -603,7 +626,7 @@ impl Scheme for DictScheme { let stats = data.integer_stats(); - if stats.value_count == 0 { + if stats.value_count() == 0 { return Ok(0.0); } @@ -612,7 +635,7 @@ impl Scheme for DictScheme { ); // If > 50% of the values are distinct, skip dict. - if distinct_values_count > stats.value_count / 2 { + if distinct_values_count > stats.value_count() / 2 { return Ok(0.0); } @@ -622,22 +645,22 @@ impl Scheme for DictScheme { // Assume codes are compressed RLE + BitPacking. let codes_bw = usize::BITS - distinct_values_count.leading_zeros(); - let n_runs = (stats.value_count / stats.average_run_length) as usize; + let n_runs = (stats.value_count() / stats.average_run_length()) as usize; // Assume that codes will either be BitPack or RLE-BitPack. 
- let codes_size_bp = (codes_bw * stats.value_count) as usize; + let codes_size_bp = (codes_bw * stats.value_count()) as usize; let codes_size_rle_bp = usize::checked_mul((codes_bw + 32) as usize, n_runs); let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); - let before = stats.value_count as usize * stats.source().ptype().bit_width(); + let before = stats.value_count() as usize * stats.source().ptype().bit_width(); Ok(before as f64 / (values_size + codes_size) as f64) } fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -685,7 +708,7 @@ impl Scheme for RunEndScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -693,7 +716,7 @@ impl Scheme for RunEndScheme { let stats = data.integer_stats(); // If the run length is below the threshold, drop it. - if stats.average_run_length < RUN_END_THRESHOLD { + if stats.average_run_length() < RUN_END_THRESHOLD { return Ok(0.0); } @@ -702,12 +725,12 @@ impl Scheme for RunEndScheme { } // Run compression on a sample, see how it performs. - estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) } fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -717,7 +740,7 @@ impl Scheme for RunEndScheme { let stats = data.integer_stats(); // Run-end encode the ends. 
- let (ends, values) = runend_encode(&stats.src); + let (ends, values) = runend_encode(stats.source()); let mut new_excludes = vec![RunEndScheme.id(), DictScheme.id()]; new_excludes.extend_from_slice(excludes); @@ -736,10 +759,13 @@ impl Scheme for RunEndScheme { // SAFETY: compression doesn't affect invariants. unsafe { - Ok( - RunEndArray::new_unchecked(compressed_ends, compressed_values, 0, stats.src.len()) - .into_array(), + Ok(RunEndArray::new_unchecked( + compressed_ends, + compressed_values, + 0, + stats.source().len(), ) + .into_array()) } } } @@ -755,14 +781,14 @@ impl Scheme for SequenceScheme { fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); - if stats.null_count > 0 { + if stats.null_count() > 0 { return Ok(0.0); } @@ -771,31 +797,31 @@ impl Scheme for SequenceScheme { if stats .distinct_count() // TODO(connor): Shouldn't this be `is_none_or`??? Why do things fail if not this? - .is_some_and(|count| count as usize != stats.src.len()) + .is_some_and(|count| count as usize != stats.source().len()) { return Ok(0.0); } // Since two values are required to store base and multiplier the compression ratio is // divided by 2. - Ok(sequence_encode(&stats.src)? - .map(|_| stats.src.len() as f64 / 2.0) + Ok(sequence_encode(stats.source())? 
+ .map(|_| stats.source().len() as f64 / 2.0) .unwrap_or(0.0)) } fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); - if stats.null_count > 0 { + if stats.null_count() > 0 { vortex_bail!("sequence encoding does not support nulls"); } - sequence_encode(&stats.src)?.ok_or_else(|| vortex_err!("cannot sequence encode array")) + sequence_encode(stats.source())?.ok_or_else(|| vortex_err!("cannot sequence encode array")) } } @@ -811,7 +837,7 @@ impl Scheme for PcoScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -820,18 +846,18 @@ impl Scheme for PcoScheme { // Pco does not support I8 or U8. if matches!( - stats.src.ptype(), + stats.source().ptype(), vortex_array::dtype::PType::I8 | vortex_array::dtype::PType::U8 ) { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) } fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], diff --git a/vortex-btrblocks/src/compressor/mod.rs b/vortex-btrblocks/src/compressor/mod.rs index 3a088f7668e..cea3bc0ec09 100644 --- a/vortex-btrblocks/src/compressor/mod.rs +++ b/vortex-btrblocks/src/compressor/mod.rs @@ -3,7 +3,8 @@ //! Compression scheme implementations. -pub(crate) mod decimal; +/// Decimal compression schemes. +pub mod decimal; /// Float compression schemes. pub mod float; /// Integer compression schemes. @@ -12,7 +13,5 @@ pub(crate) mod patches; pub(crate) mod rle; /// String compression schemes. 
pub mod string; -pub(crate) mod temporal; - -/// Maximum cascade depth for compression. -pub(crate) const MAX_CASCADE: usize = 3; +/// Temporal compression schemes. +pub mod temporal; diff --git a/vortex-btrblocks/src/compressor/rle.rs b/vortex-btrblocks/src/compressor/rle.rs index 6e801732b37..6de217df4b4 100644 --- a/vortex-btrblocks/src/compressor/rle.rs +++ b/vortex-btrblocks/src/compressor/rle.rs @@ -13,12 +13,13 @@ use vortex_error::VortexResult; use vortex_fastlanes::RLEArray; use crate::ArrayAndStats; -use crate::BtrBlocksCompressor; +use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; +use crate::SchemeExt; use crate::SchemeId; use crate::compressor::integer::DictScheme as IntDictScheme; -use crate::scheme::estimate_compression_ratio_with_sampling; +use crate::estimate_compression_ratio_with_sampling; /// Threshold for the average run length in an array before we consider run-length encoding. pub const RUN_LENGTH_THRESHOLD: u32 = 4; @@ -52,7 +53,7 @@ pub trait RLEConfig: Debug + Send + Sync + 'static { /// Compress the values array after RLE encoding. fn compress_values( - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, values: &PrimitiveArray, ctx: CompressorContext, excludes: &[SchemeId], @@ -89,7 +90,7 @@ impl Scheme for RLEScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -113,12 +114,12 @@ impl Scheme for RLEScheme { } // Run compression on a sample to see how it performs. 
- estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) } fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], diff --git a/vortex-btrblocks/src/compressor/string.rs b/vortex-btrblocks/src/compressor/string.rs index 46baa1ae349..9e86376e68e 100644 --- a/vortex-btrblocks/src/compressor/string.rs +++ b/vortex-btrblocks/src/compressor/string.rs @@ -12,107 +12,36 @@ use vortex_array::arrays::ConstantArray; use vortex_array::arrays::DictArray; use vortex_array::arrays::MaskedArray; use vortex_array::arrays::VarBinArray; -use vortex_array::arrays::VarBinViewArray; use vortex_array::builders::dict::dict_encode; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; -use vortex_error::VortexExpect; use vortex_error::VortexResult; -use vortex_error::vortex_err; use vortex_fsst::FSSTArray; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; use vortex_sparse::Sparse; use vortex_sparse::SparseArray; -use vortex_utils::aliases::hash_set::HashSet; use super::integer::DictScheme as IntDictScheme; use super::integer::SequenceScheme as IntSequenceScheme; use super::integer::SparseScheme as IntSparseScheme; use crate::ArrayAndStats; -use crate::BtrBlocksCompressor; +use crate::CascadingCompressor; use crate::CompressorContext; use crate::GenerateStatsOptions; use crate::Scheme; +use crate::SchemeExt; use crate::SchemeId; -use crate::scheme::estimate_compression_ratio_with_sampling; +use crate::estimate_compression_ratio_with_sampling; /// Returns `true` if the canonical array is a UTF-8 string type. 
fn is_utf8_string(canonical: &Canonical) -> bool { matches!(canonical, Canonical::VarBinView(v) if v.dtype().eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable))) } -/// Array of variable-length byte arrays, and relevant stats for compression. -#[derive(Clone, Debug)] -pub struct StringStats { - src: VarBinViewArray, - estimated_distinct_count: Option, - value_count: u32, - null_count: u32, -} - -/// Estimate the number of distinct strings in the var bin view array. -fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult { - let views = strings.views(); - // Iterate the views. Two strings which are equal must have the same first 8-bytes. - // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all - // share a 4-byte prefix and have the same length. - let mut distinct = HashSet::with_capacity(views.len() / 2); - views.iter().for_each(|&view| { - #[expect( - clippy::cast_possible_truncation, - reason = "approximate uniqueness with view prefix" - )] - let len_and_prefix = view.as_u128() as u64; - distinct.insert(len_and_prefix); - }); - - Ok(u32::try_from(distinct.len())?) -} - -impl StringStats { - fn generate_opts_fallible( - input: &VarBinViewArray, - opts: GenerateStatsOptions, - ) -> VortexResult { - let null_count = input - .statistics() - .compute_null_count() - .ok_or_else(|| vortex_err!("Failed to compute null_count"))?; - let value_count = input.len() - null_count; - let estimated_distinct_count = opts - .count_distinct_values - .then(|| estimate_distinct_count(input)) - .transpose()?; - - Ok(Self { - src: input.clone(), - value_count: u32::try_from(value_count)?, - null_count: u32::try_from(null_count)?, - estimated_distinct_count, - }) - } -} - -impl StringStats { - /// Generates stats with default options. - pub fn generate(input: &VarBinViewArray) -> Self { - Self::generate_opts(input, GenerateStatsOptions::default()) - } - - /// Generates stats with provided options. 
- pub fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self { - Self::generate_opts_fallible(input, opts) - .vortex_expect("StringStats::generate_opts should not fail") - } - - /// Returns the underlying source array. - pub fn source(&self) -> &VarBinViewArray { - &self.src - } -} +pub use vortex_compressor::stats::StringStats; /// Uncompressed string scheme (identity). #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -155,7 +84,7 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -165,7 +94,7 @@ impl Scheme for UncompressedScheme { fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -191,7 +120,7 @@ impl Scheme for DictScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, excludes: &[SchemeId], @@ -200,23 +129,23 @@ impl Scheme for DictScheme { // If we don't have a sufficiently high number of distinct values, do not attempt Dict. if stats - .estimated_distinct_count - .is_none_or(|c| c > stats.value_count / 2) + .estimated_distinct_count() + .is_none_or(|c| c > stats.value_count() / 2) { return Ok(0.0); } // If array is all null, do not attempt dict. 
- if stats.value_count == 0 { + if stats.value_count() == 0 { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) } fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -267,7 +196,7 @@ impl Scheme for FSSTScheme { fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -275,8 +204,8 @@ impl Scheme for FSSTScheme { let stats = data.string_stats(); let fsst = { - let compressor_fsst = fsst_train_compressor(&stats.src); - fsst_compress(&stats.src, &compressor_fsst) + let compressor_fsst = fsst_train_compressor(stats.source()); + fsst_compress(stats.source(), &compressor_fsst) }; let compressed_original_lengths = compressor.compress_canonical( @@ -318,13 +247,13 @@ impl Scheme for ConstantScheme { is_utf8_string(canonical) } - fn is_constant(&self) -> bool { + fn detects_constant(&self) -> bool { true } fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -335,9 +264,10 @@ impl Scheme for ConstantScheme { let stats = data.string_stats(); + // TODO(connor): Put the execution context somewhere! let mut ctx = LEGACY_SESSION.create_execution_ctx(); - if stats.estimated_distinct_count.is_none_or(|c| c > 1) - || !is_constant(&stats.src.clone().into_array(), &mut ctx)? + if stats.estimated_distinct_count().is_none_or(|c| c > 1) + || !is_constant(&stats.source().clone().into_array(), &mut ctx)? 
{ return Ok(0.0); } @@ -348,7 +278,7 @@ impl Scheme for ConstantScheme { fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -361,16 +291,19 @@ impl Scheme for ConstantScheme { match scalar_idx { Some(idx) => { let scalar = stats.source().scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, stats.src.len()).into_array(); + let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); if !stats.source().all_valid()? { - Ok(MaskedArray::try_new(const_arr, stats.src.validity().clone())?.into_array()) + Ok( + MaskedArray::try_new(const_arr, stats.source().validity().clone())? + .into_array(), + ) } else { Ok(const_arr) } } None => Ok(ConstantArray::new( - Scalar::null(stats.src.dtype().clone()), - stats.src.len(), + Scalar::null(stats.source().dtype().clone()), + stats.source().len(), ) .into_array()), } @@ -388,7 +321,7 @@ impl Scheme for NullDominated { fn expected_compression_ratio( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -400,14 +333,14 @@ impl Scheme for NullDominated { let stats = data.string_stats(); - if stats.value_count == 0 { + if stats.value_count() == 0 { // All nulls should use ConstantScheme. return Ok(0.0); } // If the majority is null, will compress well. - if stats.null_count as f64 / stats.src.len() as f64 > 0.9 { - return Ok(stats.src.len() as f64 / stats.value_count as f64); + if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { + return Ok(stats.source().len() as f64 / stats.value_count() as f64); } // Otherwise we don't go this route. 
@@ -416,7 +349,7 @@ impl Scheme for NullDominated { fn compress( &self, - compressor: &BtrBlocksCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, _excludes: &[SchemeId], @@ -426,7 +359,7 @@ impl Scheme for NullDominated { let stats = data.string_stats(); // We pass None as we only run this pathway for NULL-dominated string arrays. - let sparse_encoded = SparseArray::encode(&stats.src.clone().into_array(), None)?; + let sparse_encoded = SparseArray::encode(&stats.source().clone().into_array(), None)?; if let Some(sparse) = sparse_encoded.as_opt::() { // Compress the indices only (not the values for strings). @@ -464,7 +397,7 @@ impl Scheme for ZstdScheme { fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], @@ -491,7 +424,7 @@ impl Scheme for ZstdBuffersScheme { fn compress( &self, - _compressor: &BtrBlocksCompressor, + _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, _excludes: &[SchemeId], diff --git a/vortex-btrblocks/src/compressor/temporal.rs b/vortex-btrblocks/src/compressor/temporal.rs index b958c77c126..619338aca3e 100644 --- a/vortex-btrblocks/src/compressor/temporal.rs +++ b/vortex-btrblocks/src/compressor/temporal.rs @@ -1,50 +1,121 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Specialized compressor for DateTimeParts metadata. +//! Temporal compression scheme using datetime-part decomposition. 
use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; use vortex_array::ToCanonical; +use vortex_array::VortexSessionExecute; +use vortex_array::aggregate_fn::fns::is_constant::is_constant; +use vortex_array::arrays::ConstantArray; use vortex_array::arrays::TemporalArray; +use vortex_array::dtype::extension::Matcher; +use vortex_array::extension::datetime::AnyTemporal; +use vortex_array::extension::datetime::TemporalMetadata; use vortex_datetime_parts::DateTimePartsArray; use vortex_datetime_parts::TemporalParts; use vortex_datetime_parts::split_temporal; use vortex_error::VortexResult; -use crate::BtrBlocksCompressor; +use crate::ArrayAndStats; +use crate::CascadingCompressor; use crate::CompressorContext; +use crate::Scheme; +use crate::SchemeId; -/// Compress a temporal array into a `DateTimePartsArray`. -pub fn compress_temporal( - compressor: &BtrBlocksCompressor, - array: TemporalArray, -) -> VortexResult { - let dtype = array.dtype().clone(); - let TemporalParts { - days, - seconds, - subseconds, - } = split_temporal(array)?; - - let ctx = CompressorContext::default().descend(); - - let days = compressor.compress_canonical( - Canonical::Primitive(days.to_primitive().narrow()?), - ctx, - &[], - )?; - let seconds = compressor.compress_canonical( - Canonical::Primitive(seconds.to_primitive().narrow()?), - ctx, - &[], - )?; - let subseconds = compressor.compress_canonical( - Canonical::Primitive(subseconds.to_primitive().narrow()?), - ctx, - &[], - )?; - - Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) +/// Compression scheme for temporal timestamp arrays via datetime-part decomposition. +/// +/// Splits timestamps into days, seconds, and subseconds components, compresses each +/// independently, and wraps the result in a [`DateTimePartsArray`]. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct TemporalScheme; + +impl Scheme for TemporalScheme { + fn scheme_name(&self) -> &'static str { + "vortex.ext.temporal" + } + + fn matches(&self, canonical: &Canonical) -> bool { + let Canonical::Extension(ext) = canonical else { + return false; + }; + + let ext_dtype = ext.ext_dtype(); + + matches!( + AnyTemporal::try_match(ext_dtype), + Some(TemporalMetadata::Timestamp(..)) + ) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + _excludes: &[SchemeId], + ) -> VortexResult { + // Temporal compression (splitting into parts) is almost always beneficial. + // Return a moderate ratio to ensure this scheme is selected. + Ok(2.0) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + _excludes: &[SchemeId], + ) -> VortexResult { + let array = data.array().clone(); + let ext_array = array.to_extension(); + let temporal_array = TemporalArray::try_from(ext_array.clone().into_array())?; + + // TODO(connor): Put the execution context somewhere! + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + + // Check for constant array and return early if so. 
+ let is_constant = is_constant(&ext_array.clone().into_array(), &mut ctx)?; + + if is_constant { + return Ok( + ConstantArray::new(temporal_array.as_ref().scalar_at(0)?, ext_array.len()) + .into_array(), + ); + } + + let dtype = temporal_array.dtype().clone(); + let TemporalParts { + days, + seconds, + subseconds, + } = split_temporal(temporal_array)?; + + let ctx = CompressorContext::default().descend(); + + let days = compressor.compress_canonical( + Canonical::Primitive(days.to_primitive().narrow()?), + ctx, + &[], + )?; + let seconds = compressor.compress_canonical( + Canonical::Primitive(seconds.to_primitive().narrow()?), + ctx, + &[], + )?; + let subseconds = compressor.compress_canonical( + Canonical::Primitive(subseconds.to_primitive().narrow()?), + ctx, + &[], + )?; + + Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) + } } diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index ff144b5e84f..7b16343ca81 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -24,7 +24,7 @@ //! //! [`BtrBlocksCompressor::compress()`] takes an `&ArrayRef` and returns an `ArrayRef` that may //! use a different encoding. It first canonicalizes the input, then dispatches by type. -//! Primitives and strings go through [`choose_and_compress`], which evaluates every enabled +//! Primitives and strings go through `choose_and_compress`, which evaluates every enabled //! [`Scheme`] and picks the one with the best compression ratio. Compound types like structs //! and lists recurse into their fields and elements. //! @@ -34,15 +34,13 @@ //! [`ALL_SCHEMES`]. //! //! Schemes can produce arrays that are themselves further compressed (e.g. FoR then BitPacking), -//! up to [`MAX_CASCADE`](compressor::MAX_CASCADE) (3) layers deep. An excludes slice of -//! [`SchemeId`] prevents the same scheme from being applied twice in a chain. -//! -//! [`choose_and_compress`]: BtrBlocksCompressor::choose_and_compress +//! 
up to [`MAX_CASCADE`] (3) layers deep. An excludes slice of [`SchemeId`] prevents the same +//! scheme from being applied twice in a chain. //! //! # Example //! //! ```rust -//! use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme}; +//! use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme, SchemeExt}; //! use vortex_btrblocks::compressor::integer::DictScheme; //! use vortex_array::DynArray; //! @@ -61,19 +59,24 @@ mod builder; mod canonical_compressor; /// Compression scheme implementations. pub mod compressor; -mod ctx; -mod sample; -mod scheme; -mod stats; -mod stats_cache; +// Re-export framework types from vortex-compressor for backwards compatibility. +// Btrblocks-specific exports. pub use builder::ALL_SCHEMES; pub use builder::BtrBlocksCompressorBuilder; +pub use builder::default_excluded; pub use canonical_compressor::BtrBlocksCompressor; -pub use compressor::integer::IntegerStats; pub use compressor::integer::dictionary::dictionary_encode as integer_dictionary_encode; -pub use ctx::CompressorContext; -pub use scheme::Scheme; -pub use scheme::SchemeId; -pub use stats::GenerateStatsOptions; -pub use stats_cache::ArrayAndStats; +pub use compressor::patches::compress_patches; +pub use vortex_compressor::CascadingCompressor; +pub use vortex_compressor::ctx::CompressorContext; +pub use vortex_compressor::ctx::MAX_CASCADE; +pub use vortex_compressor::scheme::Scheme; +pub use vortex_compressor::scheme::SchemeExt; +pub use vortex_compressor::scheme::SchemeId; +pub use vortex_compressor::scheme::estimate_compression_ratio_with_sampling; +pub use vortex_compressor::stats::ArrayAndStats; +pub use vortex_compressor::stats::FloatStats; +pub use vortex_compressor::stats::GenerateStatsOptions; +pub use vortex_compressor::stats::IntegerStats; +pub use vortex_compressor::stats::StringStats; diff --git a/vortex-btrblocks/src/scheme.rs b/vortex-btrblocks/src/scheme.rs deleted file mode 100644 index e1b89bb700c..00000000000 
--- a/vortex-btrblocks/src/scheme.rs +++ /dev/null @@ -1,149 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Unified compression scheme trait. - -use std::fmt; -use std::fmt::Debug; -use std::hash::Hash; -use std::hash::Hasher; - -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_error::VortexResult; - -use crate::ArrayAndStats; -use crate::BtrBlocksCompressor; -use crate::CompressorContext; -use crate::GenerateStatsOptions; -use crate::sample::sample; -use crate::sample::sample_count_approx_one_percent; -use crate::stats::SAMPLE_SIZE; - -/// Unique identifier for a compression scheme. -/// -/// `SchemeId` is opaque — the only way to obtain one is through [`Scheme::id()`], which is a -/// provided method that wraps [`Scheme::scheme_name()`]. There is no public constructor. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct SchemeId { - pub(crate) name: &'static str, -} - -impl fmt::Display for SchemeId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(self.name) - } -} - -/// Unified compression scheme trait. -/// -/// Implementors provide [`scheme_name`](Scheme::scheme_name) to declare their identity. The -/// [`id`](Scheme::id) method is derived automatically and cannot be meaningfully overridden by -/// external crates (since [`SchemeId`] has no public constructor). -pub trait Scheme: Debug + Send + Sync { - /// The globally unique name for this scheme (e.g. `"vortex.int.bitpacking"`). - fn scheme_name(&self) -> &'static str; - - /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name). - fn id(&self) -> SchemeId { - SchemeId { - name: self.scheme_name(), - } - } - - /// Whether this scheme can compress the given canonical array. - fn matches(&self, canonical: &Canonical) -> bool; - - /// True if this scheme detects constant arrays. 
- fn is_constant(&self) -> bool { - false - } - - /// Returns the stats generation options this scheme requires. The compressor merges all - /// eligible schemes' options before generating stats, so that a single stats pass satisfies - /// every scheme. - fn stats_options(&self) -> GenerateStatsOptions { - GenerateStatsOptions::default() - } - - /// Estimate the compression ratio for this scheme on the given array. - /// - /// The `data` bundle contains the array and a pre-populated stats cache. Schemes access - /// stats via `data.get_or_insert_with::(|| ...)`. - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - excludes: &[SchemeId], - ) -> VortexResult { - estimate_compression_ratio_with_sampling(self, compressor, data, ctx, excludes) - } - - /// Compress the array using this scheme. - /// - /// The `data` bundle contains the array and a pre-populated stats cache. Schemes access - /// stats via `data.get_or_insert_with::(|| ...)`. - fn compress( - &self, - compressor: &BtrBlocksCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - excludes: &[SchemeId], - ) -> VortexResult; -} - -impl PartialEq for dyn Scheme { - fn eq(&self, other: &Self) -> bool { - self.id() == other.id() - } -} - -impl Eq for dyn Scheme {} - -impl Hash for dyn Scheme { - fn hash(&self, state: &mut H) { - self.id().hash(state); - } -} - -/// Estimates compression ratio by compressing a ~1% sample of the data. -/// -/// Creates a new [`ArrayAndStats`] for the sample so that stats are generated from the sample, -/// not the full array. 
-pub fn estimate_compression_ratio_with_sampling( - scheme: &S, - compressor: &BtrBlocksCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - excludes: &[SchemeId], -) -> VortexResult { - let sample_array = if ctx.is_sample { - data.array().clone() - } else { - let source_len = data.array().len(); - let sample_count = sample_count_approx_one_percent(source_len); - - tracing::trace!( - "Sampling {} values out of {}", - SAMPLE_SIZE as u64 * sample_count as u64, - source_len - ); - - sample(data.array(), SAMPLE_SIZE, sample_count) - }; - - let mut sample_data = ArrayAndStats::new(sample_array, ctx.stats_options); - - let after = scheme - .compress(compressor, &mut sample_data, ctx.as_sample(), excludes)? - .nbytes(); - let before = sample_data.array().nbytes(); - - tracing::debug!( - "estimate_compression_ratio_with_sampling(compressor={scheme:#?} ctx={ctx:?}) = {}", - before as f64 / after as f64 - ); - - Ok(before as f64 / after as f64) -} diff --git a/vortex-compressor/Cargo.toml b/vortex-compressor/Cargo.toml new file mode 100644 index 00000000000..d4e7bf07e62 --- /dev/null +++ b/vortex-compressor/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "vortex-compressor" +authors = { workspace = true } +categories = { workspace = true } +description = "Encoding-agnostic compression framework for Vortex arrays" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = { workspace = true } +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[dependencies] +itertools = { workspace = true } +num-traits = { workspace = true } +rand = { workspace = true } +rustc-hash = { workspace = true } +tracing = { workspace = true } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } +vortex-error = { workspace = true } +vortex-mask = { workspace = true } +vortex-utils = { workspace = 
true } + +[dev-dependencies] +rstest = { workspace = true } +vortex-array = { workspace = true, features = ["_test-harness"] } + +[lints] +workspace = true diff --git a/vortex-compressor/public-api.lock b/vortex-compressor/public-api.lock new file mode 100644 index 00000000000..cfdc31c79f5 --- /dev/null +++ b/vortex-compressor/public-api.lock @@ -0,0 +1,431 @@ +pub mod vortex_compressor + +pub mod vortex_compressor::ctx + +pub struct vortex_compressor::ctx::CompressorContext + +pub vortex_compressor::ctx::CompressorContext::allowed_cascading: usize + +pub vortex_compressor::ctx::CompressorContext::is_sample: bool + +pub vortex_compressor::ctx::CompressorContext::stats_options: vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::as_sample(self) -> Self + +pub fn vortex_compressor::ctx::CompressorContext::descend(self) -> Self + +impl core::clone::Clone for vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::clone(&self) -> vortex_compressor::ctx::CompressorContext + +impl core::default::Default for vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::default() -> Self + +impl core::fmt::Debug for vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::ctx::CompressorContext + +pub const vortex_compressor::ctx::MAX_CASCADE: usize + +pub mod vortex_compressor::scheme + +pub struct vortex_compressor::scheme::SchemeId + +impl core::clone::Clone for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::clone(&self) -> vortex_compressor::scheme::SchemeId + +impl core::cmp::Eq for vortex_compressor::scheme::SchemeId + +impl core::cmp::PartialEq for vortex_compressor::scheme::SchemeId + +pub fn 
vortex_compressor::scheme::SchemeId::eq(&self, other: &vortex_compressor::scheme::SchemeId) -> bool + +impl core::fmt::Debug for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::hash<__H: core::hash::Hasher>(&self, state: &mut __H) + +impl core::marker::Copy for vortex_compressor::scheme::SchemeId + +impl core::marker::StructuralPartialEq for vortex_compressor::scheme::SchemeId + +pub trait vortex_compressor::scheme::Scheme: core::fmt::Debug + core::marker::Send + core::marker::Sync + +pub fn vortex_compressor::scheme::Scheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_compressor::scheme::Scheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::scheme::Scheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_compressor::scheme::Scheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::scheme::Scheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::scheme::Scheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub trait vortex_compressor::scheme::SchemeExt: vortex_compressor::scheme::Scheme + +pub fn 
vortex_compressor::scheme::SchemeExt::id(&self) -> vortex_compressor::scheme::SchemeId + +impl vortex_compressor::scheme::SchemeExt for T + +pub fn T::id(&self) -> vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::estimate_compression_ratio_with_sampling(scheme: &S, compressor: &vortex_compressor::CascadingCompressor, array: &vortex_array::array::ArrayRef, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub mod vortex_compressor::stats + +pub enum vortex_compressor::stats::FloatErasedStats + +pub vortex_compressor::stats::FloatErasedStats::F16(vortex_compressor::stats::FloatTypedStats) + +pub vortex_compressor::stats::FloatErasedStats::F32(vortex_compressor::stats::FloatTypedStats) + +pub vortex_compressor::stats::FloatErasedStats::F64(vortex_compressor::stats::FloatTypedStats) + +impl core::clone::Clone for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::clone(&self) -> vortex_compressor::stats::FloatErasedStats + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::fmt::Debug for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub enum vortex_compressor::stats::IntegerErasedStats + +pub 
vortex_compressor::stats::IntegerErasedStats::I16(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::I32(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::I64(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::I8(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U16(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U32(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U64(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U8(vortex_compressor::stats::IntegerTypedStats) + +impl vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::distinct_count(&self) -> core::option::Option + +pub fn vortex_compressor::stats::IntegerErasedStats::max_ilog2(&self) -> core::option::Option + +pub fn vortex_compressor::stats::IntegerErasedStats::max_minus_min(&self) -> u64 + +pub fn vortex_compressor::stats::IntegerErasedStats::min_is_negative(&self) -> bool + +pub fn vortex_compressor::stats::IntegerErasedStats::min_is_zero(&self) -> bool + +pub fn vortex_compressor::stats::IntegerErasedStats::most_frequent_value_and_count(&self) -> core::option::Option<(vortex_array::scalar::typed_view::primitive::pvalue::PValue, u32)> + +impl core::clone::Clone for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::clone(&self) -> vortex_compressor::stats::IntegerErasedStats + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn 
vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::fmt::Debug for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::ArrayAndStats + +impl vortex_compressor::stats::ArrayAndStats + +pub fn vortex_compressor::stats::ArrayAndStats::array(&self) -> &vortex_array::array::ArrayRef + +pub fn vortex_compressor::stats::ArrayAndStats::float_stats(&mut self) -> &vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::ArrayAndStats::get_or_insert_with(&mut self, f: impl core::ops::function::FnOnce() -> T) -> &T + +pub fn 
vortex_compressor::stats::ArrayAndStats::integer_stats(&mut self) -> &vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::ArrayAndStats::into_array(self) -> vortex_array::array::ArrayRef + +pub fn vortex_compressor::stats::ArrayAndStats::new(array: vortex_array::array::ArrayRef, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::ArrayAndStats::string_stats(&mut self) -> &vortex_compressor::stats::StringStats + +pub struct vortex_compressor::stats::FloatDistinctInfo + +impl vortex_compressor::stats::FloatDistinctInfo + +pub fn vortex_compressor::stats::FloatDistinctInfo::distinct_values(&self) -> &vortex_utils::aliases::hash_set::HashSet, rustc_hash::FxBuildHasher> + +impl core::clone::Clone for vortex_compressor::stats::FloatDistinctInfo + +pub fn vortex_compressor::stats::FloatDistinctInfo::clone(&self) -> vortex_compressor::stats::FloatDistinctInfo + +impl core::fmt::Debug for vortex_compressor::stats::FloatDistinctInfo + +pub fn vortex_compressor::stats::FloatDistinctInfo::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::FloatStats + +impl vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::average_run_length(&self) -> u32 + +pub fn vortex_compressor::stats::FloatStats::erased(&self) -> &vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatStats::generate(input: &vortex_array::arrays::primitive::array::PrimitiveArray) -> Self + +pub fn vortex_compressor::stats::FloatStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::FloatStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::FloatStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray + +pub fn vortex_compressor::stats::FloatStats::value_count(&self) -> u32 + 
+impl vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::distinct_count(&self) -> core::option::Option + +impl core::clone::Clone for vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::clone(&self) -> vortex_compressor::stats::FloatStats + +impl core::fmt::Debug for vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::FloatTypedStats + +impl vortex_compressor::stats::FloatTypedStats + +pub fn vortex_compressor::stats::FloatTypedStats::distinct(&self) -> core::option::Option<&vortex_compressor::stats::FloatDistinctInfo> + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::clone::Clone for vortex_compressor::stats::FloatTypedStats + +pub fn vortex_compressor::stats::FloatTypedStats::clone(&self) -> vortex_compressor::stats::FloatTypedStats + +impl core::fmt::Debug for vortex_compressor::stats::FloatTypedStats + +pub fn vortex_compressor::stats::FloatTypedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::GenerateStatsOptions + +pub vortex_compressor::stats::GenerateStatsOptions::count_distinct_values: bool + +impl vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::merge(self, other: Self) -> Self + +impl 
core::clone::Clone for vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::clone(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl core::default::Default for vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::default() -> vortex_compressor::stats::GenerateStatsOptions + +impl core::fmt::Debug for vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::stats::IntegerDistinctInfo + +impl vortex_compressor::stats::IntegerDistinctInfo + +pub fn vortex_compressor::stats::IntegerDistinctInfo::distinct_values(&self) -> &vortex_utils::aliases::hash_map::HashMap, u32, rustc_hash::FxBuildHasher> + +impl core::clone::Clone for vortex_compressor::stats::IntegerDistinctInfo + +pub fn vortex_compressor::stats::IntegerDistinctInfo::clone(&self) -> vortex_compressor::stats::IntegerDistinctInfo + +impl core::fmt::Debug for vortex_compressor::stats::IntegerDistinctInfo + +pub fn vortex_compressor::stats::IntegerDistinctInfo::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::IntegerStats + +impl vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::average_run_length(&self) -> u32 + +pub fn vortex_compressor::stats::IntegerStats::erased(&self) -> &vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerStats::generate(input: &vortex_array::arrays::primitive::array::PrimitiveArray) -> Self + +pub fn vortex_compressor::stats::IntegerStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn 
vortex_compressor::stats::IntegerStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::IntegerStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray + +pub fn vortex_compressor::stats::IntegerStats::value_count(&self) -> u32 + +impl vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::distinct_count(&self) -> core::option::Option + +pub fn vortex_compressor::stats::IntegerStats::most_frequent_value_and_count(&self) -> core::option::Option<(vortex_array::scalar::typed_view::primitive::pvalue::PValue, u32)> + +impl core::clone::Clone for vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::clone(&self) -> vortex_compressor::stats::IntegerStats + +impl core::fmt::Debug for vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::IntegerTypedStats + +impl vortex_compressor::stats::IntegerTypedStats + +pub fn vortex_compressor::stats::IntegerTypedStats::distinct(&self) -> core::option::Option<&vortex_compressor::stats::IntegerDistinctInfo> + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + 
+impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::clone::Clone for vortex_compressor::stats::IntegerTypedStats + +pub fn vortex_compressor::stats::IntegerTypedStats::clone(&self) -> vortex_compressor::stats::IntegerTypedStats + +impl core::fmt::Debug for vortex_compressor::stats::IntegerTypedStats + +pub fn vortex_compressor::stats::IntegerTypedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::StringStats + +impl vortex_compressor::stats::StringStats + +pub fn vortex_compressor::stats::StringStats::estimated_distinct_count(&self) -> core::option::Option + +pub fn vortex_compressor::stats::StringStats::generate(input: &vortex_array::arrays::varbinview::array::VarBinViewArray) -> Self + +pub fn vortex_compressor::stats::StringStats::generate_opts(input: &vortex_array::arrays::varbinview::array::VarBinViewArray, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::StringStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::StringStats::source(&self) -> &vortex_array::arrays::varbinview::array::VarBinViewArray + +pub fn vortex_compressor::stats::StringStats::value_count(&self) -> u32 + +impl 
core::clone::Clone for vortex_compressor::stats::StringStats + +pub fn vortex_compressor::stats::StringStats::clone(&self) -> vortex_compressor::stats::StringStats + +impl core::fmt::Debug for vortex_compressor::stats::StringStats + +pub fn vortex_compressor::stats::StringStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::CascadingCompressor + +pub vortex_compressor::CascadingCompressor::schemes: alloc::vec::Vec<&'static dyn vortex_compressor::scheme::Scheme> + +impl vortex_compressor::CascadingCompressor + +pub fn vortex_compressor::CascadingCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult + +pub fn vortex_compressor::CascadingCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult + +pub fn vortex_compressor::CascadingCompressor::new(schemes: alloc::vec::Vec<&'static dyn vortex_compressor::scheme::Scheme>) -> Self + +impl core::clone::Clone for vortex_compressor::CascadingCompressor + +pub fn vortex_compressor::CascadingCompressor::clone(&self) -> vortex_compressor::CascadingCompressor diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs new file mode 100644 index 00000000000..3a488a08e0b --- /dev/null +++ b/vortex-compressor/src/compressor.rs @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Cascading array compression implementation. 
+ +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::CanonicalValidity; +use vortex_array::DynArray; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::ToCanonical; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::FixedSizeListArray; +use vortex_array::arrays::ListArray; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::listview::list_from_list_view; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::ValidityHelper; +use vortex_error::VortexResult; + +use crate::ctx::CompressorContext; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::scheme::SchemeId; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +/// The main compressor type implementing cascading adaptive compression. +/// +/// This compressor applies adaptive compression [`Scheme`]s to arrays based on their data types and +/// characteristics. It recursively compresses nested structures like structs and lists, and chooses +/// optimal compression schemes for leaf types. +/// +/// The compressor works by: +/// 1. Canonicalizing input arrays to a standard representation. +/// 2. Pre-filtering schemes by [`Scheme::matches`] and excludes. +/// 3. Evaluating each matching scheme's compression ratio on a sample. +/// 4. Compressing with the best scheme and verifying the result is smaller. +#[derive(Clone)] +pub struct CascadingCompressor { + /// The enabled compression schemes. + pub schemes: Vec<&'static dyn Scheme>, +} + +impl CascadingCompressor { + /// Creates a new compressor with the given schemes. + pub fn new(schemes: Vec<&'static dyn Scheme>) -> Self { + Self { schemes } + } + + /// Compresses an array using cascading adaptive compression. 
+ /// + /// First canonicalizes and compacts the array, then applies optimal compression schemes. + /// + /// # Errors + /// + /// Returns an error if canonicalization or compression fails. + pub fn compress(&self, array: &ArrayRef) -> VortexResult { + let canonical = array + .clone() + .execute::(&mut LEGACY_SESSION.create_execution_ctx())? + .0; + + // Compact it, removing any wasted space before we attempt to compress it. + let compact = canonical.compact()?; + + self.compress_canonical(compact, CompressorContext::default(), &[]) + } + + /// Compresses a canonical array by dispatching to type-specific logic. + /// + /// # Errors + /// + /// Returns an error if compression of any sub-array fails. + pub fn compress_canonical( + &self, + array: Canonical, + ctx: CompressorContext, + excludes: &[SchemeId], + ) -> VortexResult { + match array { + Canonical::Null(null_array) => Ok(null_array.into_array()), + Canonical::Bool(bool_array) => Ok(bool_array.into_array()), + Canonical::Primitive(primitive) => { + self.choose_and_compress(Canonical::Primitive(primitive), ctx, excludes) + } + Canonical::Decimal(decimal) => { + self.choose_and_compress(Canonical::Decimal(decimal), ctx, excludes) + } + Canonical::Struct(struct_array) => { + let fields = struct_array + .unmasked_fields() + .iter() + .map(|field| self.compress(field)) + .collect::, _>>()?; + + Ok(StructArray::try_new( + struct_array.names().clone(), + fields, + struct_array.len(), + struct_array.validity().clone(), + )? 
+ .into_array()) + } + Canonical::List(list_view_array) => { + if list_view_array.is_zero_copy_to_list() || list_view_array.elements().is_empty() { + let list_array = list_from_list_view(list_view_array)?; + self.compress_list_array(list_array, ctx) + } else { + self.compress_list_view_array(list_view_array, ctx) + } + } + Canonical::FixedSizeList(fsl_array) => { + let compressed_elems = self.compress(fsl_array.elements())?; + + Ok(FixedSizeListArray::try_new( + compressed_elems, + fsl_array.list_size(), + fsl_array.validity().clone(), + fsl_array.len(), + )? + .into_array()) + } + Canonical::VarBinView(strings) => { + if strings + .dtype() + .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) + { + self.choose_and_compress(Canonical::VarBinView(strings), ctx, excludes) + } else { + // We do not compress binary arrays. + Ok(strings.into_array()) + } + } + Canonical::Extension(ext_array) => { + let before_nbytes = ext_array.as_ref().nbytes(); + + // Try scheme-based compression first. + let result = self.choose_and_compress( + Canonical::Extension(ext_array.clone()), + ctx, + excludes, + )?; + if result.nbytes() < before_nbytes { + return Ok(result); + } + + // Otherwise, fall back to compressing the underlying storage array. + let compressed_storage = self.compress(ext_array.storage_array())?; + + Ok( + ExtensionArray::new(ext_array.ext_dtype().clone(), compressed_storage) + .into_array(), + ) + } + } + } + + /// The main scheme-selection entry point for a single leaf array. + /// + /// Filters allowed schemes by [`matches`], merges their [`stats_options`] into a single + /// [`GenerateStatsOptions`], then delegates to [`choose_scheme`] to pick the winner by + /// estimated compression ratio. + /// + /// If a winner is found and its compressed output is actually smaller, that output is returned. + /// Otherwise, the original array is returned unchanged. + /// + /// Empty and all-null arrays are short-circuited before any scheme evaluation. 
+ /// + /// [`matches`]: Scheme::matches + /// [`stats_options`]: Scheme::stats_options + /// [`choose_scheme`]: Self::choose_scheme + fn choose_and_compress( + &self, + canonical: Canonical, + ctx: CompressorContext, + excludes: &[SchemeId], + ) -> VortexResult { + let eligible_schemes: Vec<&'static dyn Scheme> = self + .schemes + .iter() + .copied() + .filter(|s| s.matches(&canonical) && !excludes.contains(&s.id())) + .collect(); + + let array: ArrayRef = canonical.into(); + + // If there are no schemes that we can compress into, then just return it uncompressed. + if eligible_schemes.is_empty() { + return Ok(array); + } + + // Nothing to compress if empty or all-null. + if array.is_empty() { + return Ok(array); + } + + if array.all_invalid()? { + return Ok( + ConstantArray::new(Scalar::null(array.dtype().clone()), array.len()).into_array(), + ); + } + + let before_nbytes = array.nbytes(); + let merged_opts = eligible_schemes + .iter() + .fold(GenerateStatsOptions::default(), |acc, s| { + acc.merge(s.stats_options()) + }); + + let mut ctx = ctx; + ctx.stats_options = merged_opts; + + let mut data = ArrayAndStats::new(array, merged_opts); + + if let Some(winner) = self.choose_scheme(&eligible_schemes, &mut data, ctx, excludes)? { + let compressed = winner.compress(self, &mut data, ctx, excludes)?; + if compressed.nbytes() < before_nbytes { + return Ok(compressed); + } + } + + // No scheme improved on the original. + Ok(data.into_array()) + } + + /// Calls [`expected_compression_ratio`] on each candidate and returns the scheme with the + /// highest ratio, or `None` if no scheme exceeds 1.0. Ties are broken by registration + /// order (earlier in the list wins). 
+ /// + /// [`expected_compression_ratio`]: Scheme::expected_compression_ratio + fn choose_scheme( + &self, + schemes: &[&'static dyn Scheme], + data: &mut ArrayAndStats, + ctx: CompressorContext, + excludes: &[SchemeId], + ) -> VortexResult> { + let mut best: Option<(&'static dyn Scheme, f64)> = None; + + for &scheme in schemes { + let ratio = scheme.expected_compression_ratio(self, data, ctx, excludes)?; + + tracing::debug!(scheme = %scheme.id(), ratio, "evaluated compression ratio"); + + if is_better_ratio(ratio, &best) { + best = Some((scheme, ratio)); + } + } + + Ok(best.map(|(s, _)| s)) + } + + /// Compresses a [`ListArray`] by narrowing offsets and recursively compressing elements. + fn compress_list_array( + &self, + list_array: ListArray, + ctx: CompressorContext, + ) -> VortexResult { + let list_array = list_array.reset_offsets(true)?; + + let compressed_elems = self.compress(list_array.elements())?; + + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), + ctx, + &[], + )?; + + Ok(ListArray::try_new( + compressed_elems, + compressed_offsets, + list_array.validity().clone(), + )? + .into_array()) + } + + /// Compresses a [`ListViewArray`] by narrowing offsets/sizes and recursively compressing + /// elements. + fn compress_list_view_array( + &self, + list_view: ListViewArray, + ctx: CompressorContext, + ) -> VortexResult { + let compressed_elems = self.compress(list_view.elements())?; + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_view.offsets().to_primitive().narrow()?), + ctx, + &[], + )?; + let compressed_sizes = self.compress_canonical( + Canonical::Primitive(list_view.sizes().to_primitive().narrow()?), + ctx, + &[], + )?; + Ok(ListViewArray::try_new( + compressed_elems, + compressed_offsets, + compressed_sizes, + list_view.validity().clone(), + )? 
+ .into_array()) + } +} + +/// Returns `true` if `ratio` is a valid compression ratio (> 1.0, finite, not subnormal) that beats +/// the current best. +fn is_better_ratio(ratio: f64, best: &Option<(&'static dyn Scheme, f64)>) -> bool { + ratio.is_finite() && !ratio.is_subnormal() && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) +} diff --git a/vortex-btrblocks/src/ctx.rs b/vortex-compressor/src/ctx.rs similarity index 91% rename from vortex-btrblocks/src/ctx.rs rename to vortex-compressor/src/ctx.rs index c4832b3aa7a..b4ec616995d 100644 --- a/vortex-btrblocks/src/ctx.rs +++ b/vortex-compressor/src/ctx.rs @@ -3,8 +3,10 @@ //! Compression context for recursive compression. -use crate::GenerateStatsOptions; -use crate::compressor::MAX_CASCADE; +use crate::stats::GenerateStatsOptions; + +/// Maximum cascade depth for compression. +pub const MAX_CASCADE: usize = 3; /// Context passed through recursive compression calls. #[derive(Debug, Clone, Copy)] diff --git a/vortex-compressor/src/lib.rs b/vortex-compressor/src/lib.rs new file mode 100644 index 00000000000..8031243725f --- /dev/null +++ b/vortex-compressor/src/lib.rs @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![deny(missing_docs)] +#![warn(clippy::missing_docs_in_private_items)] +#![warn(clippy::missing_errors_doc)] +#![warn(clippy::missing_panics_doc)] +#![warn(clippy::missing_safety_doc)] + +//! Encoding-agnostic compression framework for Vortex arrays. +//! +//! This crate provides the core compression engine: the [`Scheme`](scheme::Scheme) trait, +//! sampling-based ratio estimation, cascaded compression, and statistics infrastructure for +//! deciding the best encoding scheme for an array. +//! +//! This crate contains no encoding dependencies. Batteries-included compressors are provided by +//! downstream crates like `vortex-btrblocks`, which register different encodings to the compressor. 
+ +pub mod ctx; +pub mod scheme; +pub mod stats; + +mod sample; + +mod compressor; +pub use compressor::CascadingCompressor; diff --git a/vortex-btrblocks/src/sample.rs b/vortex-compressor/src/sample.rs similarity index 60% rename from vortex-btrblocks/src/sample.rs rename to vortex-compressor/src/sample.rs index 25ff4d0f527..726115f5e2b 100644 --- a/vortex-btrblocks/src/sample.rs +++ b/vortex-compressor/src/sample.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! Sampling utilities for compression ratio estimation. + use rand::RngExt; use rand::SeedableRng; use rand::prelude::StdRng; @@ -10,10 +12,22 @@ use vortex_array::IntoArray; use vortex_array::arrays::ChunkedArray; use vortex_error::VortexExpect; -use crate::stats::SAMPLE_COUNT; -use crate::stats::SAMPLE_SIZE; +/// The size of each sampled run. +pub const SAMPLE_SIZE: u32 = 64; + +/// The number of sampled runs. +/// +/// # Warning +/// +/// The product of `SAMPLE_SIZE` and `SAMPLE_COUNT` should be (roughly) a multiple of 1024 so that +/// fastlanes bitpacking of sampled vectors does not introduce (large amounts of) padding. +pub const SAMPLE_COUNT: u32 = 16; + +/// Fixed seed for the sampling RNG, ensuring deterministic compression output. +const SAMPLE_SEED: u64 = 1234567890; -pub(crate) fn sample(input: &ArrayRef, sample_size: u32, sample_count: u32) -> ArrayRef { +/// Samples `sample_count` slices of `sample_size` elements each for compression ratio estimation. +pub fn sample(input: &ArrayRef, sample_size: u32, sample_count: u32) -> ArrayRef { if input.len() <= (sample_size as usize) * (sample_count as usize) { return input.to_array(); } @@ -22,7 +36,7 @@ pub(crate) fn sample(input: &ArrayRef, sample_size: u32, sample_count: u32) -> A input.len(), sample_size, sample_count, - &mut StdRng::seed_from_u64(1234567890u64), + &mut StdRng::seed_from_u64(SAMPLE_SEED), ); // For every slice, grab the relevant slice and repack into a new PrimitiveArray. 
@@ -34,14 +48,13 @@ pub(crate) fn sample(input: &ArrayRef, sample_size: u32, sample_count: u32) -> A .vortex_expect("slice should succeed") }) .collect(); - ChunkedArray::try_new(chunks, input.dtype().clone()) - .vortex_expect("sample slices should form valid chunked array") - .into_array() + // SAFETY: all chunks are slices of `input`, so they share its dtype. + unsafe { ChunkedArray::new_unchecked(chunks, input.dtype().clone()) }.into_array() } /// Computes the number of sample chunks to cover approximately 1% of `len` elements, /// with a minimum of `SAMPLE_SIZE * SAMPLE_COUNT` (1024) values. -pub(crate) fn sample_count_approx_one_percent(len: usize) -> u32 { +pub fn sample_count_approx_one_percent(len: usize) -> u32 { let approximately_one_percent = (len / 100) / usize::try_from(SAMPLE_SIZE).vortex_expect("SAMPLE_SIZE must fit in usize"); u32::max( @@ -55,6 +68,15 @@ pub(crate) fn sample_count_approx_one_percent(len: usize) -> u32 { ) } +/// Divides an array into `sample_count` equal partitions and picks one random contiguous +/// slice of `sample_size` elements from each partition. +/// +/// This is a stratified sampling strategy: instead of drawing all samples from one region, +/// it spreads them evenly across the array so that every part of the data is represented. +/// Each returned `(start, end)` pair is a half-open range into the original array. +/// +/// If the total number of requested samples (`sample_size * sample_count`) is greater than or +/// equal to `length`, a single slice spanning the whole array is returned. pub fn stratified_slices( length: usize, sample_size: u32, @@ -86,8 +108,12 @@ pub fn stratified_slices( .collect() } -/// Split a range of array indices into as-equal-as-possible slices. If the provided `num_partitions` doesn't -/// evenly divide into `length`, then the first `(length % num_partitions)` slices will have an extra element. 
+/// Splits `[0, length)` into `num_partitions` contiguous, non-overlapping slices of +/// approximately equal size. +/// +/// If `length` is not evenly divisible by `num_partitions`, the first +/// `length % num_partitions` slices get one extra element. Each returned `(start, end)` pair +/// is a half-open range. pub fn partition_indices(length: usize, num_partitions: u32) -> Vec<(usize, usize)> { let num_long_parts = length % num_partitions as usize; let short_step = length / num_partitions as usize; diff --git a/vortex-compressor/src/scheme.rs b/vortex-compressor/src/scheme.rs new file mode 100644 index 00000000000..f9c4b6efb9a --- /dev/null +++ b/vortex-compressor/src/scheme.rs @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Unified compression scheme trait. + +use std::fmt; +use std::fmt::Debug; +use std::hash::Hash; +use std::hash::Hasher; + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_error::VortexResult; + +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::sample::SAMPLE_SIZE; +use crate::sample::sample; +use crate::sample::sample_count_approx_one_percent; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +/// Unique identifier for a compression scheme. +/// +/// The only way to obtain a [`SchemeId`] is through [`SchemeExt::id()`], which is auto-implemented +/// for all [`Scheme`] types, wrapping [`Scheme::scheme_name()`]. There is no public constructor. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct SchemeId { + /// The scheme name. + name: &'static str, +} + +impl fmt::Display for SchemeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.name) + } +} + +/// A single compression encoding that the [`CascadingCompressor`] can select from. 
+/// +/// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a +/// given array, picks the one with the highest [`expected_compression_ratio`], and calls +/// [`compress`] on the winner. +/// +/// A key feature of this compressor is that schemes may "cascade": a scheme's [`compress`] +/// can call back into the compressor to compress child or transformed arrays, building up multiple +/// encoding layers (e.g. frame-of-reference and then bit-packing). +/// +/// # Identity +/// +/// Every scheme has a globally unique name returned by [`scheme_name`]. The [`SchemeExt::id`] +/// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] +/// used for equality, hashing, and exclude lists. +/// +/// # Implementing a scheme +/// +/// At a minimum, implementors must implement [`scheme_name`], [`matches`], and [`compress`]. +/// +/// The default [`expected_compression_ratio`] estimates the ratio by compressing a small sample. +/// Implementors should only override this method when a cheaper heuristic is available (e.g. +/// returning `f64::MAX` for constant detection or `0.0` for early rejection based on stats). +/// +/// Schemes that need statistics that may be expensive to compute should override [`stats_options`] +/// to declare what they require. Currently, this is just distinct values and frequencies, but in +/// the future we might add run lengths. The compressor merges all eligible schemes' options before +/// generating stats, so each stat is always computed at most once for a given array. +/// +/// [`scheme_name`]: Scheme::scheme_name +/// [`matches`]: Scheme::matches +/// [`compress`]: Scheme::compress +/// [`expected_compression_ratio`]: Scheme::expected_compression_ratio +/// [`stats_options`]: Scheme::stats_options +pub trait Scheme: Debug + Send + Sync { + /// The globally unique name for this scheme (e.g. `"vortex.int.bitpacking"`). 
+ fn scheme_name(&self) -> &'static str; + + /// Whether this scheme can compress the given canonical array. + fn matches(&self, canonical: &Canonical) -> bool; + + /// True if this scheme detects constant arrays. + fn detects_constant(&self) -> bool { + false + } + + /// Returns the stats generation options this scheme requires. The compressor merges all + /// eligible schemes' options before generating stats so that a single stats pass satisfies + /// every scheme. + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions::default() + } + + /// Estimate the compression ratio for this scheme on the given array. + /// + /// Schemes listed in `excludes` must not be used when cascading into the compressor. + /// + /// # Errors + /// + /// Returns an error if compression of the sample fails. + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + excludes: &[SchemeId], + ) -> VortexResult { + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + } + + /// Compress the array using this scheme. + /// + /// Schemes listed in `excludes` must not be used when cascading into the compressor. + /// + /// # Errors + /// + /// Returns an error if compression fails. + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + excludes: &[SchemeId], + ) -> VortexResult; +} + +impl PartialEq for dyn Scheme { + fn eq(&self, other: &Self) -> bool { + self.id() == other.id() + } +} + +impl Eq for dyn Scheme {} + +impl Hash for dyn Scheme { + fn hash(&self, state: &mut H) { + self.id().hash(state); + } +} + +/// Extension trait providing [`id`](SchemeExt::id) for all [`Scheme`] implementors. +/// +/// This trait is automatically implemented for every type that implements [`Scheme`]. Because the +/// blanket implementation covers all types, external crates cannot override the `id()` method. 
+pub trait SchemeExt: Scheme { + /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name). + fn id(&self) -> SchemeId { + SchemeId { + name: self.scheme_name(), + } + } +} + +impl SchemeExt for T {} + +/// Estimates compression ratio by compressing a ~1% sample of the data. +/// +/// This function will create a new [`ArrayAndStats`] for the sample so that stats are generated +/// from the sample, not the full array. +/// +/// # Errors +/// +/// Returns an error if sample compression fails. +pub fn estimate_compression_ratio_with_sampling( + scheme: &S, + compressor: &CascadingCompressor, + array: &ArrayRef, + ctx: CompressorContext, + excludes: &[SchemeId], +) -> VortexResult { + let sample_array = if ctx.is_sample { + array.clone() + } else { + let source_len = array.len(); + let sample_count = sample_count_approx_one_percent(source_len); + + tracing::trace!( + "Sampling {} values out of {}", + SAMPLE_SIZE as u64 * sample_count as u64, + source_len + ); + + sample(array, SAMPLE_SIZE, sample_count) + }; + + let mut sample_data = ArrayAndStats::new(sample_array, ctx.stats_options); + + let after = scheme + .compress(compressor, &mut sample_data, ctx.as_sample(), excludes)? 
+ .nbytes(); + let before = sample_data.array().nbytes(); + + tracing::debug!( + "estimate_compression_ratio_with_sampling(compressor={scheme:#?} ctx={ctx:?}) = {}", + before as f64 / after as f64 + ); + + Ok(before as f64 / after as f64) +} diff --git a/vortex-btrblocks/src/stats_cache.rs b/vortex-compressor/src/stats/cache.rs similarity index 73% rename from vortex-btrblocks/src/stats_cache.rs rename to vortex-compressor/src/stats/cache.rs index 02103ade4fb..bbb6522337f 100644 --- a/vortex-btrblocks/src/stats_cache.rs +++ b/vortex-compressor/src/stats/cache.rs @@ -10,23 +10,30 @@ use vortex_array::ArrayRef; use vortex_array::ToCanonical; use vortex_error::VortexExpect; -use crate::GenerateStatsOptions; -use crate::compressor::float::FloatStats; -use crate::compressor::integer::IntegerStats; -use crate::compressor::string::StringStats; +use super::FloatStats; +use super::GenerateStatsOptions; +use super::IntegerStats; +use super::StringStats; /// Cache for compression statistics, keyed by concrete type. struct StatsCache { + // TODO(connor): We could further optimize this with a `SmallVec` here. + /// The cache entries, keyed by [`TypeId`]. + /// + /// The total number of statistics types in this cache should be relatively small, so we use a + /// vector instead of a hash map. entries: Vec<(TypeId, Box)>, } impl StatsCache { + /// Creates a new empty cache. fn new() -> Self { Self { entries: Vec::new(), } } + /// Returns a cached value, computing it on first access. 
fn get_or_insert_with(&mut self, f: impl FnOnce() -> T) -> &T { let type_id = TypeId::of::(); let pos = self.entries.iter().position(|(id, _)| *id == type_id); @@ -35,7 +42,7 @@ impl StatsCache { self.entries[pos] .1 .downcast_ref::() - .vortex_expect("TypeId mismatch in StatsCache") + .vortex_expect("we just checked the TypeID") } else { self.entries.push((type_id, Box::new(f()))); self.entries @@ -43,24 +50,27 @@ impl StatsCache { .vortex_expect("just pushed") .1 .downcast_ref::() - .vortex_expect("TypeId mismatch in StatsCache") + .vortex_expect("we just checked the TypeID") } } } /// An array bundled with its lazily-computed statistics cache. /// -/// The cache is guaranteed to correspond to the array. When a scheme creates a derived array -/// (e.g. FoR bias subtraction), it must create a new [`ArrayAndStats`] so that stale stats -/// from the original array are not reused. +/// The cache is guaranteed to correspond to the array. When a scheme creates a derived array (e.g. +/// FoR bias subtraction), it must create a new [`ArrayAndStats`] so that stale stats from the +/// original array are not reused. +/// +/// Built-in stats are accessed via typed methods (`integer_stats`, `float_stats`, `string_stats`) +/// which generate stats lazily on first access using the stored [`GenerateStatsOptions`]. /// -/// Built-in stats are accessed via typed methods ([`integer_stats`](Self::integer_stats), -/// [`float_stats`](Self::float_stats), [`string_stats`](Self::string_stats)) which generate -/// stats lazily on first access using the stored [`GenerateStatsOptions`]. Extension schemes -/// can use [`get_or_insert_with`](Self::get_or_insert_with) for custom stats types. +/// Extension schemes can use `get_or_insert_with` for custom stats types. pub struct ArrayAndStats { + /// The array. array: ArrayRef, + /// The stats cache. cache: StatsCache, + /// The stats generation options. 
opts: GenerateStatsOptions, } @@ -76,15 +86,6 @@ impl ArrayAndStats { } } - /// Creates a new bundle with default (cheapest) stats options. - pub fn without_stats(array: ArrayRef) -> Self { - Self { - array, - cache: StatsCache::new(), - opts: GenerateStatsOptions::default(), - } - } - /// Returns a reference to the array. pub fn array(&self) -> &ArrayRef { &self.array @@ -99,6 +100,7 @@ impl ArrayAndStats { pub fn integer_stats(&mut self) -> &IntegerStats { let array = self.array.clone(); let opts = self.opts; + self.cache.get_or_insert_with::(|| { IntegerStats::generate_opts(&array.to_primitive(), opts) }) @@ -108,6 +110,7 @@ impl ArrayAndStats { pub fn float_stats(&mut self) -> &FloatStats { let array = self.array.clone(); let opts = self.opts; + self.cache.get_or_insert_with::(|| { FloatStats::generate_opts(&array.to_primitive(), opts) }) @@ -117,6 +120,7 @@ impl ArrayAndStats { pub fn string_stats(&mut self) -> &StringStats { let array = self.array.clone(); let opts = self.opts; + self.cache.get_or_insert_with::(|| { StringStats::generate_opts(&array.to_varbinview(), opts) }) diff --git a/vortex-btrblocks/src/compressor/float/stats.rs b/vortex-compressor/src/stats/float.rs similarity index 79% rename from vortex-btrblocks/src/compressor/float/stats.rs rename to vortex-compressor/src/stats/float.rs index 9c114a113fa..67877d7796c 100644 --- a/vortex-btrblocks/src/compressor/float/stats.rs +++ b/vortex-compressor/src/stats/float.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! Float compression statistics. + use std::hash::Hash; use itertools::Itertools; @@ -18,24 +20,46 @@ use vortex_error::vortex_panic; use vortex_mask::AllOr; use vortex_utils::aliases::hash_set::HashSet; -use crate::GenerateStatsOptions; -use crate::compressor::rle::RLEStats; +use super::GenerateStatsOptions; +/// Information about the distinct values in a float array. 
#[derive(Debug, Clone)] pub struct DistinctInfo { - pub(super) distinct_values: HashSet, FxBuildHasher>, + /// The set of distinct float values. + distinct_values: HashSet, FxBuildHasher>, + /// The count of unique values. distinct_count: u32, } +impl DistinctInfo { + /// Returns a reference to the distinct values set. + pub fn distinct_values(&self) -> &HashSet, FxBuildHasher> { + &self.distinct_values + } +} + +/// Typed statistics for a specific float type. #[derive(Debug, Clone)] pub struct TypedStats { - pub(super) distinct: Option>, + /// Distinct value information, or `None` if not computed. + distinct: Option>, } +impl TypedStats { + /// Returns the distinct value information, if computed. + pub fn distinct(&self) -> Option<&DistinctInfo> { + self.distinct.as_ref() + } +} + +/// Type-erased container for one of the [`TypedStats`] variants. #[derive(Debug, Clone)] pub enum ErasedStats { + /// Stats for `f16` arrays. F16(TypedStats), + /// Stats for `f32` arrays. F32(TypedStats), + /// Stats for `f64` arrays. F64(TypedStats), } @@ -50,6 +74,7 @@ impl ErasedStats { } } +/// Implements `From>` for [`ErasedStats`]. macro_rules! impl_from_typed { ($T:ty, $variant:path) => { impl From> for ErasedStats { @@ -67,16 +92,20 @@ impl_from_typed!(f64, ErasedStats::F64); /// Array of floating-point numbers and relevant stats for compression. #[derive(Debug, Clone)] pub struct FloatStats { - pub(super) src: PrimitiveArray, - // cache for validity.false_count() - pub(super) null_count: u32, - // cache for validity.true_count() - pub(super) value_count: u32, - pub(super) average_run_length: u32, - pub(super) erased: ErasedStats, + /// The underlying source array. + src: PrimitiveArray, + /// Cache for `validity.false_count()`. + null_count: u32, + /// Cache for `validity.true_count()`. + value_count: u32, + /// The average run length. + average_run_length: u32, + /// Type-erased typed statistics. 
+ erased: ErasedStats, } impl FloatStats { + /// Generates stats, returning an error on failure. fn generate_opts_fallible( input: &PrimitiveArray, opts: GenerateStatsOptions, @@ -108,26 +137,32 @@ impl FloatStats { } /// Returns the underlying source array. - #[expect(clippy::same_name_method)] pub fn source(&self) -> &PrimitiveArray { &self.src } -} -impl RLEStats for FloatStats { - fn value_count(&self) -> u32 { + /// Returns the number of null values. + pub fn null_count(&self) -> u32 { + self.null_count + } + + /// Returns the number of non-null values. + pub fn value_count(&self) -> u32 { self.value_count } - fn average_run_length(&self) -> u32 { + /// Returns the average run length. + pub fn average_run_length(&self) -> u32 { self.average_run_length } - fn source(&self) -> &PrimitiveArray { - &self.src + /// Returns the type-erased typed statistics. + pub fn erased(&self) -> &ErasedStats { + &self.erased } } +/// Computes typed float statistics for a specific float type. fn typed_float_stats( array: &PrimitiveArray, count_distinct_values: bool, @@ -136,7 +171,7 @@ where NativeValue: Hash + Eq, TypedStats: Into, { - // Special case: empty array + // Special case: empty array. 
if array.is_empty() { return Ok(FloatStats { src: array.clone(), @@ -247,7 +282,7 @@ mod tests { let stats = FloatStats::generate_opts( &floats, - crate::GenerateStatsOptions { + crate::stats::GenerateStatsOptions { count_distinct_values: true, }, ); @@ -267,7 +302,7 @@ mod tests { let stats = FloatStats::generate_opts( &floats, - crate::GenerateStatsOptions { + crate::stats::GenerateStatsOptions { count_distinct_values: true, }, ); diff --git a/vortex-btrblocks/src/compressor/integer/stats.rs b/vortex-compressor/src/stats/integer.rs similarity index 85% rename from vortex-btrblocks/src/compressor/integer/stats.rs rename to vortex-compressor/src/stats/integer.rs index f50fb5a548e..1f13118584b 100644 --- a/vortex-btrblocks/src/compressor/integer/stats.rs +++ b/vortex-compressor/src/stats/integer.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! Integer compression statistics. + use std::hash::Hash; use num_traits::PrimInt; @@ -19,13 +21,13 @@ use vortex_error::VortexResult; use vortex_mask::AllOr; use vortex_utils::aliases::hash_map::HashMap; -use crate::GenerateStatsOptions; -use crate::compressor::rle::RLEStats; +use super::GenerateStatsOptions; +/// Information about the distinct values in an integer array. #[derive(Debug, Clone)] pub struct DistinctInfo { /// The unique values and their occurrences. - pub(super) distinct_values: HashMap, u32, FxBuildHasher>, + distinct_values: HashMap, u32, FxBuildHasher>, /// The count of unique values. distinct_count: u32, /// The most frequent value. @@ -34,11 +36,29 @@ pub struct DistinctInfo { top_frequency: u32, } +impl DistinctInfo { + /// Returns a reference to the distinct values map. + pub fn distinct_values(&self) -> &HashMap, u32, FxBuildHasher> { + &self.distinct_values + } +} + +/// Typed statistics for a specific integer type. #[derive(Debug, Clone)] pub struct TypedStats { + /// The minimum value. min: T, + /// The maximum value. 
max: T, - pub(super) distinct: Option>, + /// Distinct value information, or `None` if not computed. + distinct: Option>, +} + +impl TypedStats { + /// Returns the distinct value information, if computed. + pub fn distinct(&self) -> Option<&DistinctInfo> { + self.distinct.as_ref() + } } impl TypedStats { @@ -54,23 +74,32 @@ impl TypedStats { } } -/// Type-erased container for one of the [TypedStats] variants. +/// Type-erased container for one of the [`TypedStats`] variants. /// /// Building the `TypedStats` is considerably faster and cheaper than building a type-erased /// set of stats. We then perform a variety of access methods on them. #[derive(Clone, Debug)] pub enum ErasedStats { + /// Stats for `u8` arrays. U8(TypedStats), + /// Stats for `u16` arrays. U16(TypedStats), + /// Stats for `u32` arrays. U32(TypedStats), + /// Stats for `u64` arrays. U64(TypedStats), + /// Stats for `i8` arrays. I8(TypedStats), + /// Stats for `i16` arrays. I16(TypedStats), + /// Stats for `i32` arrays. I32(TypedStats), + /// Stats for `i64` arrays. I64(TypedStats), } impl ErasedStats { + /// Returns `true` if the minimum value is zero. pub fn min_is_zero(&self) -> bool { match &self { ErasedStats::U8(x) => x.min == 0, @@ -84,6 +113,7 @@ impl ErasedStats { } } + /// Returns `true` if the minimum value is negative. pub fn min_is_negative(&self) -> bool { match &self { ErasedStats::U8(_) @@ -97,7 +127,7 @@ impl ErasedStats { } } - // Difference between max and min. + /// Difference between max and min. pub fn max_minus_min(&self) -> u64 { match &self { ErasedStats::U8(x) => (x.max - x.min) as u64, @@ -112,10 +142,10 @@ impl ErasedStats { } } - /// Returns the ilog2 of the max value when transmuted to unsigned, or None if zero. + /// Returns the ilog2 of the max value when transmuted to unsigned, or `None` if zero. /// /// This matches how BitPacking computes bit width: it reinterprets signed values as - /// unsigned (preserving bit pattern) and uses leading_zeros. 
For non-negative signed + /// unsigned (preserving bit pattern) and uses `leading_zeros`. For non-negative signed /// values, the transmuted value equals the original value. /// /// This is used to determine if FOR encoding would reduce bit width compared to @@ -126,7 +156,7 @@ impl ErasedStats { ErasedStats::U16(x) => x.max.checked_ilog2(), ErasedStats::U32(x) => x.max.checked_ilog2(), ErasedStats::U64(x) => x.max.checked_ilog2(), - // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior + // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior. ErasedStats::I8(x) => (x.max as u8).checked_ilog2(), ErasedStats::I16(x) => (x.max as u16).checked_ilog2(), ErasedStats::I32(x) => (x.max as u32).checked_ilog2(), @@ -148,7 +178,7 @@ impl ErasedStats { } } - /// Get the most commonly occurring value and its count + /// Get the most commonly occurring value and its count. pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { match &self { ErasedStats::U8(x) => { @@ -187,6 +217,7 @@ impl ErasedStats { } } +/// Implements `From>` for [`ErasedStats`]. macro_rules! impl_from_typed { ($T:ty, $variant:path) => { impl From> for ErasedStats { @@ -209,16 +240,20 @@ impl_from_typed!(i64, ErasedStats::I64); /// Array of integers and relevant stats for compression. #[derive(Clone, Debug)] pub struct IntegerStats { - pub(super) src: PrimitiveArray, - // cache for validity.false_count() - pub(super) null_count: u32, - // cache for validity.true_count() - pub(super) value_count: u32, - pub(super) average_run_length: u32, - pub(super) erased: ErasedStats, + /// The underlying source array. + src: PrimitiveArray, + /// Cache for `validity.false_count()`. + null_count: u32, + /// Cache for `validity.true_count()`. + value_count: u32, + /// The average run length. + average_run_length: u32, + /// Type-erased typed statistics. 
+ erased: ErasedStats, } impl IntegerStats { + /// Generates stats, returning an error on failure. fn generate_opts_fallible( input: &PrimitiveArray, opts: GenerateStatsOptions, @@ -252,26 +287,32 @@ impl IntegerStats { } /// Returns the underlying source array. - #[expect(clippy::same_name_method)] pub fn source(&self) -> &PrimitiveArray { &self.src } -} -impl RLEStats for IntegerStats { - fn value_count(&self) -> u32 { + /// Returns the number of null values. + pub fn null_count(&self) -> u32 { + self.null_count + } + + /// Returns the number of non-null values. + pub fn value_count(&self) -> u32 { self.value_count } - fn average_run_length(&self) -> u32 { + /// Returns the average run length. + pub fn average_run_length(&self) -> u32 { self.average_run_length } - fn source(&self) -> &PrimitiveArray { - &self.src + /// Returns the type-erased typed statistics. + pub fn erased(&self) -> &ErasedStats { + &self.erased } } +/// Computes typed integer statistics for a specific integer type. fn typed_int_stats( array: &PrimitiveArray, count_distinct_values: bool, @@ -281,7 +322,7 @@ where TypedStats: Into, NativeValue: Eq + Hash, { - // Special case: empty array + // Special case: empty array. if array.is_empty() { return Ok(IntegerStats { src: array.clone(), @@ -314,7 +355,7 @@ where let null_count = validity.false_count(); let value_count = validity.true_count(); - // Initialize loop state + // Initialize loop state. let head_idx = validity .first() .vortex_expect("All null masks have been handled before"); @@ -359,15 +400,15 @@ where offset += 64; match validity.true_count() { - // All nulls -> no stats to update + // All nulls -> no stats to update. 0 => continue, - // Inner loop for when validity check can be elided + // Inner loop for when validity check can be elided. 
64 => inner_loop_nonnull( chunk.try_into().ok().vortex_expect("chunk size must be 64"), count_distinct_values, &mut loop_state, ), - // Inner loop for when we need to check validity + // Inner loop for when we need to check validity. _ => inner_loop_nullable( chunk.try_into().ok().vortex_expect("chunk size must be 64"), count_distinct_values, @@ -376,7 +417,7 @@ where ), } } - // Final iteration, run naive loop + // Final iteration, run naive loop. let remainder = chunks.remainder(); inner_loop_naive( remainder, @@ -429,12 +470,17 @@ where }) } +/// Internal loop state for integer stats computation. struct LoopState { + /// The previous value seen. prev: T, + /// The run count. runs: u32, + /// The distinct values map. distinct_values: HashMap, u32, FxBuildHasher>, } +/// Inner loop for non-null chunks of 64 values. #[inline(always)] fn inner_loop_nonnull( values: &[T; 64], @@ -455,6 +501,7 @@ fn inner_loop_nonnull( } } +/// Inner loop for nullable chunks of 64 values. #[inline(always)] fn inner_loop_nullable( values: &[T; 64], @@ -478,6 +525,7 @@ fn inner_loop_nullable( } } +/// Fallback inner loop for remainder values. #[inline(always)] fn inner_loop_naive( values: &[T], @@ -561,7 +609,7 @@ mod tests { let stats = IntegerStats::generate_opts( &ints, - crate::GenerateStatsOptions { + crate::stats::GenerateStatsOptions { count_distinct_values: true, }, ); diff --git a/vortex-compressor/src/stats/mod.rs b/vortex-compressor/src/stats/mod.rs new file mode 100644 index 00000000000..e4417b66b3d --- /dev/null +++ b/vortex-compressor/src/stats/mod.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression statistics types and caching. 
+ +mod cache; +mod float; +mod integer; +mod options; +mod string; + +pub use cache::ArrayAndStats; +pub use float::DistinctInfo as FloatDistinctInfo; +pub use float::ErasedStats as FloatErasedStats; +pub use float::FloatStats; +pub use float::TypedStats as FloatTypedStats; +pub use integer::DistinctInfo as IntegerDistinctInfo; +pub use integer::ErasedStats as IntegerErasedStats; +pub use integer::IntegerStats; +pub use integer::TypedStats as IntegerTypedStats; +pub use options::GenerateStatsOptions; +pub use string::StringStats; diff --git a/vortex-btrblocks/src/stats.rs b/vortex-compressor/src/stats/options.rs similarity index 68% rename from vortex-btrblocks/src/stats.rs rename to vortex-compressor/src/stats/options.rs index cd3239cd20b..d53b69d748a 100644 --- a/vortex-btrblocks/src/stats.rs +++ b/vortex-compressor/src/stats/options.rs @@ -9,7 +9,7 @@ /// merges all eligible schemes' options before generating stats, so that a single stats pass /// satisfies every scheme. /// -/// [`Scheme::stats_options`]: crate::Scheme::stats_options +/// [`Scheme::stats_options`]: crate::scheme::Scheme::stats_options #[derive(Debug, Default, Clone, Copy)] pub struct GenerateStatsOptions { /// Whether distinct values should be counted during stats generation. @@ -24,13 +24,3 @@ impl GenerateStatsOptions { } } } - -/// The size of each sampled run. -pub(crate) const SAMPLE_SIZE: u32 = 64; -/// The number of sampled runs. -/// -/// # Warning -/// -/// The product of SAMPLE_SIZE and SAMPLE_COUNT should be (roughly) a multiple of 1024 so that -/// fastlanes bitpacking of sampled vectors does not introduce (large amounts of) padding. 
-pub(crate) const SAMPLE_COUNT: u32 = 16; diff --git a/vortex-compressor/src/stats/string.rs b/vortex-compressor/src/stats/string.rs new file mode 100644 index 00000000000..f8db9d0c4f2 --- /dev/null +++ b/vortex-compressor/src/stats/string.rs @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! String compression statistics. + +use vortex_array::arrays::VarBinViewArray; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_utils::aliases::hash_set::HashSet; + +use super::GenerateStatsOptions; + +/// Array of variable-length byte arrays, and relevant stats for compression. +#[derive(Clone, Debug)] +pub struct StringStats { + /// The underlying source array. + src: VarBinViewArray, + /// The estimated number of distinct strings, or `None` if not computed. + estimated_distinct_count: Option, + /// The number of non-null values. + value_count: u32, + /// The number of null values. + null_count: u32, +} + +/// Estimate the number of distinct strings in the var bin view array. +fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult { + let views = strings.views(); + // Iterate the views. Two strings which are equal must have the same first 8-bytes. + // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all + // share a 4-byte prefix and have the same length. + let mut distinct = HashSet::with_capacity(views.len() / 2); + views.iter().for_each(|&view| { + #[expect( + clippy::cast_possible_truncation, + reason = "approximate uniqueness with view prefix" + )] + let len_and_prefix = view.as_u128() as u64; + distinct.insert(len_and_prefix); + }); + + Ok(u32::try_from(distinct.len())?) +} + +impl StringStats { + /// Generates stats, returning an error on failure. 
+ fn generate_opts_fallible( + input: &VarBinViewArray, + opts: GenerateStatsOptions, + ) -> VortexResult { + let null_count = input + .statistics() + .compute_null_count() + .ok_or_else(|| vortex_err!("Failed to compute null_count"))?; + let value_count = input.len() - null_count; + let estimated_distinct_count = opts + .count_distinct_values + .then(|| estimate_distinct_count(input)) + .transpose()?; + + Ok(Self { + src: input.clone(), + value_count: u32::try_from(value_count)?, + null_count: u32::try_from(null_count)?, + estimated_distinct_count, + }) + } +} + +impl StringStats { + /// Generates stats with default options. + pub fn generate(input: &VarBinViewArray) -> Self { + Self::generate_opts(input, GenerateStatsOptions::default()) + } + + /// Generates stats with provided options. + pub fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self { + Self::generate_opts_fallible(input, opts) + .vortex_expect("StringStats::generate_opts should not fail") + } + + /// Returns the underlying source array. + pub fn source(&self) -> &VarBinViewArray { + &self.src + } + + /// Returns the estimated number of distinct strings, or `None` if not computed. + pub fn estimated_distinct_count(&self) -> Option { + self.estimated_distinct_count + } + + /// Returns the number of non-null values. + pub fn value_count(&self) -> u32 { + self.value_count + } + + /// Returns the number of null values. 
+ pub fn null_count(&self) -> u32 { + self.null_count + } +} diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 4f52bdd917b..adc4f08dc63 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -30,7 +30,7 @@ use vortex_array::session::ArrayRegistry; #[cfg(feature = "zstd")] use vortex_btrblocks::BtrBlocksCompressorBuilder; #[cfg(feature = "zstd")] -use vortex_btrblocks::Scheme; +use vortex_btrblocks::SchemeExt; #[cfg(feature = "zstd")] use vortex_btrblocks::compressor::float; #[cfg(feature = "zstd")] diff --git a/vortex-layout/src/layouts/compressed.rs b/vortex-layout/src/layouts/compressed.rs index 76f684d36ec..25228a12813 100644 --- a/vortex-layout/src/layouts/compressed.rs +++ b/vortex-layout/src/layouts/compressed.rs @@ -11,7 +11,7 @@ use vortex_array::DynArray; use vortex_array::expr::stats::Stat; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; -use vortex_btrblocks::Scheme; +use vortex_btrblocks::SchemeExt; use vortex_btrblocks::compressor::integer::DictScheme; use vortex_error::VortexResult; use vortex_io::runtime::Handle; diff --git a/vortex/public-api.lock b/vortex/public-api.lock index 0c8ce9d0cd9..7be026902db 100644 --- a/vortex/public-api.lock +++ b/vortex/public-api.lock @@ -22,11 +22,9 @@ pub use vortex::compressor::BtrBlocksCompressor pub use vortex::compressor::BtrBlocksCompressorBuilder -pub use vortex::compressor::FloatCode +pub use vortex::compressor::Scheme -pub use vortex::compressor::IntCode - -pub use vortex::compressor::StringCode +pub use vortex::compressor::SchemeId pub mod vortex::dtype From 0841d74aa3989b7b918241e78cb99b4b5e434ca5 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Thu, 19 Mar 2026 09:48:51 -0400 Subject: [PATCH 5/9] add execution context to compressor Signed-off-by: Connor Tsui --- Cargo.lock | 1 + vortex-array/src/executor.rs | 1 + vortex-btrblocks/src/builder.rs | 12 +- vortex-btrblocks/src/compressor/decimal.rs | 17 +- 
vortex-btrblocks/src/compressor/float/mod.rs | 106 +++++----- .../src/compressor/integer/mod.rs | 198 ++++++++++-------- vortex-btrblocks/src/compressor/rle.rs | 32 +-- vortex-btrblocks/src/compressor/string.rs | 82 +++++--- vortex-btrblocks/src/compressor/temporal.rs | 35 ++-- vortex-compressor/Cargo.toml | 1 + vortex-compressor/src/compressor.rs | 167 ++++++++++++--- vortex-compressor/src/ctx.rs | 45 +++- vortex-compressor/src/lib.rs | 1 + vortex-compressor/src/scheme.rs | 131 ++++++++++-- 14 files changed, 560 insertions(+), 269 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2661a12756c..53499bffee7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9949,6 +9949,7 @@ version = "0.1.0" dependencies = [ "itertools 0.14.0", "num-traits", + "parking_lot", "rand 0.10.0", "rstest", "rustc-hash", diff --git a/vortex-array/src/executor.rs b/vortex-array/src/executor.rs index da05450f8de..114adf355c3 100644 --- a/vortex-array/src/executor.rs +++ b/vortex-array/src/executor.rs @@ -172,6 +172,7 @@ impl dyn DynArray + '_ { /// /// Accumulates a trace of execution steps. Individual steps are logged at TRACE level for /// real-time following, and the full trace is dumped at DEBUG level when the context is dropped. +#[derive(Debug, Clone)] pub struct ExecutionCtx { id: usize, session: VortexSession, diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 550a59cb697..0786dcb3e26 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -3,6 +3,9 @@ //! Builder for configuring `BtrBlocksCompressor` instances. 
+use vortex_compressor::root_list_children; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; use vortex_utils::aliases::hash_set::HashSet; use crate::BtrBlocksCompressor; @@ -10,6 +13,7 @@ use crate::CascadingCompressor; use crate::Scheme; use crate::SchemeExt; use crate::SchemeId; +use crate::compressor::integer::DictScheme as IntDictScheme; /// All available compression schemes. /// @@ -152,6 +156,12 @@ impl BtrBlocksCompressorBuilder { .copied() .filter(|s| self.schemes.contains(s)) .collect(); - BtrBlocksCompressor(CascadingCompressor::new(schemes)) + // Root exclusion: exclude IntDict from list/listview offsets (monotonically + // increasing data where dictionary encoding is wasteful). + let root_exclusions = vec![DescendantExclusion { + excluded: IntDictScheme::ID, + children: ChildSelection::One(root_list_children::OFFSETS), + }]; + BtrBlocksCompressor(CascadingCompressor::new(schemes, root_exclusions)) } } diff --git a/vortex-btrblocks/src/compressor/decimal.rs b/vortex-btrblocks/src/compressor/decimal.rs index 2aaa5a4e5d2..de67244d3c9 100644 --- a/vortex-btrblocks/src/compressor/decimal.rs +++ b/vortex-btrblocks/src/compressor/decimal.rs @@ -18,7 +18,7 @@ use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; -use crate::SchemeId; +use crate::SchemeExt; /// Compression scheme for decimal arrays via byte-part decomposition. /// @@ -36,12 +36,15 @@ impl Scheme for DecimalScheme { matches!(canonical, Canonical::Decimal(_)) } + fn num_children(&self) -> usize { + 1 + } + fn expected_compression_ratio( &self, _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // Decimal compression is almost always beneficial (narrowing + primitive compression). // Return a moderate ratio to ensure this scheme is always selected. 
@@ -53,7 +56,6 @@ impl Scheme for DecimalScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // TODO(joe): add support splitting i128/256 buffers into chunks of primitive values // for compression. 2 for i128 and 4 for i256. @@ -68,11 +70,10 @@ impl Scheme for DecimalScheme { _ => return Ok(decimal.into_array()), }; - let compressed = compressor.compress_canonical( - Canonical::Primitive(prim), - CompressorContext::default(), - &[], - )?; + let ctx = CompressorContext::default() + .descend() + .with_scheme(self.id(), 0); + let compressed = compressor.compress_canonical(Canonical::Primitive(prim), ctx)?; DecimalBytePartsArray::try_new(compressed, decimal.decimal_dtype()).map(|d| d.into_array()) } diff --git a/vortex-btrblocks/src/compressor/float/mod.rs b/vortex-btrblocks/src/compressor/float/mod.rs index ea2a6e127fd..b959103d2f6 100644 --- a/vortex-btrblocks/src/compressor/float/mod.rs +++ b/vortex-btrblocks/src/compressor/float/mod.rs @@ -19,6 +19,8 @@ use vortex_array::arrays::dict::DictArrayParts; use vortex_array::dtype::PType; use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; pub use vortex_compressor::stats::FloatStats; use vortex_error::VortexResult; use vortex_error::vortex_panic; @@ -36,7 +38,6 @@ use crate::CompressorContext; use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeExt; -use crate::SchemeId; use crate::compress_patches; use crate::compressor::rle; use crate::compressor::rle::RLEScheme; @@ -98,9 +99,8 @@ impl rle::RLEConfig for FloatRLEConfig { compressor: &CascadingCompressor, values: &PrimitiveArray, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { - compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes) + 
compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx) } } @@ -135,7 +135,6 @@ impl Scheme for UncompressedScheme { _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { Ok(1.0) } @@ -145,7 +144,6 @@ impl Scheme for UncompressedScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { Ok(data.array().clone()) } @@ -169,7 +167,6 @@ impl Scheme for ConstantScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // Never select Constant when sampling. if ctx.is_sample { @@ -195,7 +192,6 @@ impl Scheme for ConstantScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.float_stats(); @@ -233,12 +229,15 @@ impl Scheme for ALPScheme { is_float_primitive(canonical) } + fn num_children(&self) -> usize { + 1 + } + fn expected_compression_ratio( &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let stats = data.float_stats(); @@ -253,7 +252,7 @@ impl Scheme for ALPScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } fn compress( @@ -261,7 +260,6 @@ impl Scheme for ALPScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let stats = data.float_stats(); @@ -272,18 +270,9 @@ impl Scheme for ALPScheme { // Compress the ALP ints. // Patches are not compressed. They should be infrequent, and if they are not then we want // to keep them linear for easy indexing. 
- let mut new_excludes = Vec::new(); - if excludes.contains(&DictScheme.id()) { - new_excludes.push(IntDictScheme.id()); - } - if excludes.contains(&RLE_FLOAT_SCHEME.id()) { - new_excludes.push(IntRunEndScheme.id()); - } - let compressed_alp_ints = compressor.compress_canonical( Canonical::Primitive(alp_ints), - ctx.descend(), - &new_excludes, + ctx.descend().with_scheme(self.id(), 0), )?; let patches = alp.patches().map(compress_patches).transpose()?; @@ -306,7 +295,6 @@ impl Scheme for ALPRDScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let stats = data.float_stats(); @@ -314,7 +302,7 @@ impl Scheme for ALPRDScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } fn compress( @@ -322,7 +310,6 @@ impl Scheme for ALPRDScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.float_stats(); @@ -359,12 +346,40 @@ impl Scheme for DictScheme { } } + fn num_children(&self) -> usize { + 2 + } + + fn descendant_exclusions(&self) -> &[DescendantExclusion] { + &[ + // Exclude IntDict from codes child. + DescendantExclusion { + excluded: IntDictScheme::ID, + children: ChildSelection::One(1), + }, + // Exclude IntSequence from codes child. + DescendantExclusion { + excluded: IntSequenceScheme::ID, + children: ChildSelection::One(1), + }, + // Exclude IntDict from values child (replaces ALP propagation). + DescendantExclusion { + excluded: IntDictScheme::ID, + children: ChildSelection::One(0), + }, + // Exclude IntRunEnd from values child (replaces ALP propagation). 
+ DescendantExclusion { + excluded: IntRunEndScheme::ID, + children: ChildSelection::One(0), + }, + ] + } + fn expected_compression_ratio( &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let stats = data.float_stats(); @@ -379,13 +394,7 @@ impl Scheme for DictScheme { .is_some_and(|count| count <= stats.value_count() / 2) { // Take a sample and run compression on the sample to determine before/after size. - return estimate_compression_ratio_with_sampling( - self, - compressor, - data.array(), - ctx, - excludes, - ); + return estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx); } Ok(0.0) @@ -396,7 +405,6 @@ impl Scheme for DictScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.float_stats(); @@ -404,17 +412,14 @@ impl Scheme for DictScheme { let has_all_values_referenced = dict.has_all_values_referenced(); let DictArrayParts { codes, values, .. } = dict.into_parts(); - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(codes.to_primitive()), - ctx.descend(), - &[IntDictScheme.id(), IntSequenceScheme.id()], - )?; - - assert!(values.is_canonical()); let compressed_values = compressor.compress_canonical( Canonical::Primitive(values.to_primitive()), - ctx.descend(), - &[DictScheme.id()], + ctx.clone().descend().with_scheme(self.id(), 0), + )?; + + let compressed_codes = compressor.compress_canonical( + Canonical::Primitive(codes.to_primitive()), + ctx.descend().with_scheme(self.id(), 1), )?; // SAFETY: compressing codes or values does not alter the invariants. 
@@ -437,12 +442,22 @@ impl Scheme for NullDominated { is_float_primitive(canonical) } + fn num_children(&self) -> usize { + 1 + } + + fn descendant_exclusions(&self) -> &[DescendantExclusion] { + &[DescendantExclusion { + excluded: IntSparseScheme::ID, + children: ChildSelection::All, + }] + } + fn expected_compression_ratio( &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. if ctx.allowed_cascading == 0 { @@ -470,7 +485,6 @@ impl Scheme for NullDominated { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); @@ -483,8 +497,7 @@ impl Scheme for NullDominated { let indices = sparse.patches().indices().to_primitive().narrow()?; let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices.to_primitive()), - ctx.descend(), - &[IntSparseScheme.id()], + ctx.descend().with_scheme(self.id(), 0), )?; SparseArray::try_new( @@ -515,7 +528,6 @@ impl Scheme for PcoScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.float_stats(); Ok(vortex_pco::PcoArray::from_primitive( @@ -594,7 +606,7 @@ mod tests { let array_ref = array.into_array(); let mut data = ArrayAndStats::new(array_ref.clone(), GenerateStatsOptions::default()); let compressed = - RLE_FLOAT_SCHEME.compress(&btr, &mut data, CompressorContext::default(), &[])?; + RLE_FLOAT_SCHEME.compress(&btr, &mut data, CompressorContext::default())?; let expected = Buffer::copy_from(&values).into_array(); assert_arrays_eq!(compressed.as_ref(), expected.as_ref()); diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index 86d99079265..5d5881f16c4 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ 
b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -12,6 +12,9 @@ use vortex_array::arrays::MaskedArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; +use vortex_compressor::scheme::AncestorExclusion; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; pub use vortex_compressor::stats::IntegerStats; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -72,18 +75,46 @@ pub struct BitPackingScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct SparseScheme; +impl SparseScheme { + /// Constant [`SchemeId`] for use in static exclusion rules. + pub const ID: SchemeId = SchemeId { + name: "vortex.int.sparse", + }; +} + /// Dictionary encoding for low-cardinality integer values. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct DictScheme; +impl DictScheme { + /// Constant [`SchemeId`] for use in static exclusion rules. + pub const ID: SchemeId = SchemeId { + name: "vortex.int.dict", + }; +} + /// Run-end encoding with end positions. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct RunEndScheme; +impl RunEndScheme { + /// Constant [`SchemeId`] for use in static exclusion rules. + pub const ID: SchemeId = SchemeId { + name: "vortex.int.runend", + }; +} + /// Sequence encoding for sequential patterns. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct SequenceScheme; +impl SequenceScheme { + /// Constant [`SchemeId`] for use in static exclusion rules. + pub const ID: SchemeId = SchemeId { + name: "vortex.int.sequence", + }; +} + /// Pco (pcodec) compression for integers. 
#[cfg(feature = "pco")] #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -113,9 +144,8 @@ impl rle::RLEConfig for IntRLEConfig { compressor: &CascadingCompressor, values: &PrimitiveArray, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { - compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes) + compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx) } } @@ -150,7 +180,6 @@ impl Scheme for UncompressedScheme { _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // No compression. Ok(1.0) @@ -161,7 +190,6 @@ impl Scheme for UncompressedScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { Ok(data.array().clone()) } @@ -185,7 +213,6 @@ impl Scheme for ConstantScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // Never yield ConstantScheme for a sample, it could be a false-positive. if ctx.is_sample { @@ -207,7 +234,6 @@ impl Scheme for ConstantScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); @@ -250,7 +276,6 @@ impl Scheme for FORScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // Only apply if we are not at the leaf. if ctx.allowed_cascading == 0 { @@ -305,7 +330,6 @@ impl Scheme for FORScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let primitive = data.array().to_primitive(); let for_array = FoRArray::encode(primitive)?; @@ -315,14 +339,10 @@ impl Scheme for FORScheme { // of bitpacking. 
// NOTE: we could delegate in the future if we had another downstream codec that performs // as well. - let leaf_ctx = CompressorContext { - is_sample: ctx.is_sample, - allowed_cascading: 0, - stats_options: ctx.stats_options, - }; + let mut leaf_ctx = ctx.clone(); + leaf_ctx.allowed_cascading = 0; let mut biased_data = ArrayAndStats::new(biased.into_array(), ctx.stats_options); - let compressed = - BitPackingScheme.compress(compressor, &mut biased_data, leaf_ctx, excludes)?; + let compressed = BitPackingScheme.compress(compressor, &mut biased_data, leaf_ctx)?; let for_compressed = FoRArray::try_new(compressed, for_array.reference_scalar().clone())?; for_compressed @@ -342,12 +362,32 @@ impl Scheme for ZigZagScheme { is_integer_primitive(canonical) } + fn num_children(&self) -> usize { + 1 + } + + fn descendant_exclusions(&self) -> &[DescendantExclusion] { + &[ + DescendantExclusion { + excluded: DictScheme::ID, + children: ChildSelection::All, + }, + DescendantExclusion { + excluded: RunEndScheme::ID, + children: ChildSelection::All, + }, + DescendantExclusion { + excluded: SparseScheme::ID, + children: ChildSelection::All, + }, + ] + } + fn expected_compression_ratio( &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { // ZigZag is only useful when we cascade it with another encoding. if ctx.allowed_cascading == 0 { @@ -367,7 +407,7 @@ impl Scheme for ZigZagScheme { } // Run compression on a sample to see how it performs. 
- estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } fn compress( @@ -375,7 +415,6 @@ impl Scheme for ZigZagScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); @@ -383,21 +422,8 @@ impl Scheme for ZigZagScheme { let zag = zigzag_encode(stats.source().clone())?; let encoded = zag.encoded().to_primitive(); - // ZigZag should be after Dict, RunEnd or Sparse. - // We should only do these "container" style compressors once. - let mut new_excludes = vec![ - ZigZagScheme.id(), - DictScheme.id(), - RunEndScheme.id(), - SparseScheme.id(), - ]; - new_excludes.extend_from_slice(excludes); - - let compressed = compressor.compress_canonical( - Canonical::Primitive(encoded), - ctx.descend(), - &new_excludes, - )?; + let child_ctx = ctx.descend().with_scheme(self.id(), 0); + let compressed = compressor.compress_canonical(Canonical::Primitive(encoded), child_ctx)?; tracing::debug!("zigzag output: {}", compressed.encoding_id()); @@ -419,7 +445,6 @@ impl Scheme for BitPackingScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); @@ -433,7 +458,7 @@ impl Scheme for BitPackingScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } fn compress( @@ -441,7 +466,6 @@ impl Scheme for BitPackingScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); @@ -475,12 +499,22 @@ impl Scheme for SparseScheme { } } + fn num_children(&self) -> usize { + 2 + } + + fn descendant_exclusions(&self) 
-> &[DescendantExclusion] { + &[DescendantExclusion { + excluded: DictScheme::ID, + children: ChildSelection::All, + }] + } + fn expected_compression_ratio( &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. if ctx.allowed_cascading == 0 { @@ -531,7 +565,6 @@ impl Scheme for SparseScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); @@ -566,23 +599,17 @@ impl Scheme for SparseScheme { )?; if let Some(sparse) = sparse_encoded.as_opt::() { - // Compress the values. - let mut new_excludes = vec![SparseScheme.id(), DictScheme.id()]; - new_excludes.extend_from_slice(excludes); - + let values_ctx = ctx.clone().descend().with_scheme(self.id(), 0); let compressed_values = compressor.compress_canonical( Canonical::Primitive(sparse.patches().values().to_primitive()), - ctx.descend(), - &new_excludes, + values_ctx, )?; let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = compressor.compress_canonical( - Canonical::Primitive(indices), - ctx.descend(), - &new_excludes, - )?; + let indices_ctx = ctx.descend().with_scheme(self.id(), 1); + let compressed_indices = + compressor.compress_canonical(Canonical::Primitive(indices), indices_ctx)?; SparseArray::try_new( compressed_indices, @@ -612,12 +639,15 @@ impl Scheme for DictScheme { } } + fn num_children(&self) -> usize { + 1 + } + fn expected_compression_ratio( &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // Dict should not be terminal. 
if ctx.allowed_cascading == 0 { @@ -663,7 +693,6 @@ impl Scheme for DictScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); @@ -675,15 +704,10 @@ impl Scheme for DictScheme { let dict = dictionary_encode(stats); // Cascade the codes child. - // Don't allow SequenceArray as the codes child as it merely adds extra indirection - // without actually compressing data. - let mut new_excludes = vec![DictScheme.id(), SequenceScheme.id()]; - new_excludes.extend_from_slice(excludes); - + let codes_ctx = ctx.descend().with_scheme(self.id(), 0); let compressed_codes = compressor.compress_canonical( Canonical::Primitive(dict.codes().to_primitive().narrow()?), - ctx.descend(), - &new_excludes, + codes_ctx, )?; // SAFETY: compressing codes does not change their values. @@ -706,12 +730,22 @@ impl Scheme for RunEndScheme { is_integer_primitive(canonical) } + fn num_children(&self) -> usize { + 2 + } + + fn descendant_exclusions(&self) -> &[DescendantExclusion] { + &[DescendantExclusion { + excluded: DictScheme::ID, + children: ChildSelection::All, + }] + } + fn expected_compression_ratio( &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); @@ -725,7 +759,7 @@ impl Scheme for RunEndScheme { } // Run compression on a sample, see how it performs. - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } fn compress( @@ -733,7 +767,6 @@ impl Scheme for RunEndScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); @@ -742,20 +775,13 @@ impl Scheme for RunEndScheme { // Run-end encode the ends. 
let (ends, values) = runend_encode(stats.source()); - let mut new_excludes = vec![RunEndScheme.id(), DictScheme.id()]; - new_excludes.extend_from_slice(excludes); + let values_ctx = ctx.clone().descend().with_scheme(self.id(), 0); + let compressed_values = compressor + .compress_canonical(Canonical::Primitive(values.to_primitive()), values_ctx)?; - let compressed_ends = compressor.compress_canonical( - Canonical::Primitive(ends.to_primitive()), - ctx.descend(), - &new_excludes, - )?; - - let compressed_values = compressor.compress_canonical( - Canonical::Primitive(values.to_primitive()), - ctx.descend(), - &new_excludes, - )?; + let ends_ctx = ctx.descend().with_scheme(self.id(), 1); + let compressed_ends = + compressor.compress_canonical(Canonical::Primitive(ends.to_primitive()), ends_ctx)?; // SAFETY: compression doesn't affect invariants. unsafe { @@ -779,12 +805,18 @@ impl Scheme for SequenceScheme { is_integer_primitive(canonical) } + fn ancestor_exclusions(&self) -> &[AncestorExclusion] { + &[AncestorExclusion { + ancestor: DictScheme::ID, + children: ChildSelection::All, + }] + } + fn expected_compression_ratio( &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); @@ -814,7 +846,6 @@ impl Scheme for SequenceScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); @@ -840,7 +871,6 @@ impl Scheme for PcoScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); @@ -852,7 +882,7 @@ impl Scheme for PcoScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } fn compress( @@ 
-860,7 +890,6 @@ impl Scheme for PcoScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.integer_stats(); @@ -954,8 +983,7 @@ mod tests { count_distinct_values: true, }, ); - let compressed = - SparseScheme.compress(&btr, &mut data, CompressorContext::default(), &[])?; + let compressed = SparseScheme.compress(&btr, &mut data, CompressorContext::default())?; assert!(compressed.is::()); let decoded = compressed.clone(); let expected = @@ -980,8 +1008,7 @@ mod tests { count_distinct_values: true, }, ); - let compressed = - SparseScheme.compress(&btr, &mut data, CompressorContext::default(), &[])?; + let compressed = SparseScheme.compress(&btr, &mut data, CompressorContext::default())?; assert!(compressed.is::()); let decoded = compressed.clone(); let expected = PrimitiveArray::new( @@ -1000,8 +1027,7 @@ mod tests { let btr = BtrBlocksCompressor::default(); let array_ref = array.into_array(); let mut data = ArrayAndStats::new(array_ref.clone(), GenerateStatsOptions::default()); - let compressed = - SequenceScheme.compress(&btr, &mut data, CompressorContext::default(), &[])?; + let compressed = SequenceScheme.compress(&btr, &mut data, CompressorContext::default())?; assert!(compressed.is::()); let decoded = compressed; let expected = PrimitiveArray::from_option_iter(values.into_iter().map(Some)).into_array(); @@ -1021,7 +1047,7 @@ mod tests { let array_ref = array.into_array(); let mut data = ArrayAndStats::new(array_ref.clone(), GenerateStatsOptions::default()); let compressed = - RLE_INTEGER_SCHEME.compress(&btr, &mut data, CompressorContext::default(), &[])?; + RLE_INTEGER_SCHEME.compress(&btr, &mut data, CompressorContext::default())?; let decoded = compressed; let expected = Buffer::copy_from(&values).into_array(); diff --git a/vortex-btrblocks/src/compressor/rle.rs b/vortex-btrblocks/src/compressor/rle.rs index 6de217df4b4..ae4971c87de 100644 --- 
a/vortex-btrblocks/src/compressor/rle.rs +++ b/vortex-btrblocks/src/compressor/rle.rs @@ -9,6 +9,8 @@ use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; use vortex_error::VortexResult; use vortex_fastlanes::RLEArray; @@ -17,7 +19,6 @@ use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; use crate::SchemeExt; -use crate::SchemeId; use crate::compressor::integer::DictScheme as IntDictScheme; use crate::estimate_compression_ratio_with_sampling; @@ -56,7 +57,6 @@ pub trait RLEConfig: Debug + Send + Sync + 'static { compressor: &CascadingCompressor, values: &PrimitiveArray, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult; } @@ -88,12 +88,22 @@ impl Scheme for RLEScheme { C::matches(canonical) } + fn num_children(&self) -> usize { + 3 + } + + fn descendant_exclusions(&self) -> &[DescendantExclusion] { + &[DescendantExclusion { + excluded: IntDictScheme::ID, + children: ChildSelection::Many(&[1, 2]), + }] + } + fn expected_compression_ratio( &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { // RLE is only useful when we cascade it with another encoding. if ctx.allowed_cascading == 0 { @@ -114,7 +124,7 @@ impl Scheme for RLEScheme { } // Run compression on a sample to see how it performs. 
- estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } fn compress( @@ -122,7 +132,6 @@ impl Scheme for RLEScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let array = data.array().clone(); let stats = data.get_or_insert_with::(|| C::generate_stats(&array)); @@ -132,15 +141,10 @@ impl Scheme for RLEScheme { return Ok(rle_array.into_array()); } - // Prevent RLE recursion. - let mut new_excludes = vec![self.id()]; - new_excludes.extend_from_slice(excludes); - let compressed_values = C::compress_values( compressor, &rle_array.values().to_primitive(), - ctx.descend(), - &new_excludes, + ctx.clone().descend().with_scheme(self.id(), 0), )?; // Delta in an unstable encoding, once we deem it stable we can switch over to this always. @@ -155,14 +159,12 @@ impl Scheme for RLEScheme { #[cfg(not(feature = "unstable_encodings"))] let compressed_indices = compressor.compress_canonical( Canonical::Primitive(rle_array.indices().to_primitive().narrow()?), - ctx.descend(), - &[IntDictScheme.id()], + ctx.clone().descend().with_scheme(self.id(), 1), )?; let compressed_offsets = compressor.compress_canonical( Canonical::Primitive(rle_array.values_idx_offsets().to_primitive().narrow()?), - ctx.descend(), - &[IntDictScheme.id()], + ctx.descend().with_scheme(self.id(), 2), )?; // SAFETY: Recursive compression doesn't affect the invariants. 
diff --git a/vortex-btrblocks/src/compressor/string.rs b/vortex-btrblocks/src/compressor/string.rs index 9e86376e68e..b33bae849fa 100644 --- a/vortex-btrblocks/src/compressor/string.rs +++ b/vortex-btrblocks/src/compressor/string.rs @@ -4,9 +4,7 @@ use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; use vortex_array::ToCanonical; -use vortex_array::VortexSessionExecute; use vortex_array::aggregate_fn::fns::is_constant::is_constant; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::DictArray; @@ -17,6 +15,8 @@ use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; use vortex_error::VortexResult; use vortex_fsst::FSSTArray; use vortex_fsst::fsst_compress; @@ -33,7 +33,6 @@ use crate::CompressorContext; use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeExt; -use crate::SchemeId; use crate::estimate_compression_ratio_with_sampling; /// Returns `true` if the canonical array is a UTF-8 string type. 
@@ -87,7 +86,6 @@ impl Scheme for UncompressedScheme { _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { Ok(1.0) } @@ -97,7 +95,6 @@ impl Scheme for UncompressedScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { Ok(data.array().clone()) } @@ -118,12 +115,28 @@ impl Scheme for DictScheme { } } + fn num_children(&self) -> usize { + 2 + } + + fn descendant_exclusions(&self) -> &[DescendantExclusion] { + &[ + DescendantExclusion { + excluded: IntDictScheme::ID, + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: IntSequenceScheme::ID, + children: ChildSelection::One(1), + }, + ] + } + fn expected_compression_ratio( &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let stats = data.string_stats(); @@ -140,7 +153,7 @@ impl Scheme for DictScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } fn compress( @@ -148,7 +161,6 @@ impl Scheme for DictScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.string_stats(); @@ -162,16 +174,14 @@ impl Scheme for DictScheme { // Find best compressor for codes and values separately. let compressed_codes = compressor.compress_canonical( Canonical::Primitive(dict.codes().to_primitive()), - ctx.descend(), - &[IntDictScheme.id(), IntSequenceScheme.id()], + ctx.clone().descend().with_scheme(self.id(), 1), )?; // Attempt to compress the values with non-Dict compression. // Currently this will only be FSST. 
let compressed_values = compressor.compress_canonical( Canonical::VarBinView(dict.values().to_varbinview()), - ctx.descend(), - &[DictScheme.id()], + ctx.descend().with_scheme(self.id(), 0), )?; // SAFETY: compressing codes or values does not alter the invariants. @@ -194,12 +204,15 @@ impl Scheme for FSSTScheme { is_utf8_string(canonical) } + fn num_children(&self) -> usize { + 2 + } + fn compress( &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.string_stats(); @@ -210,14 +223,12 @@ impl Scheme for FSSTScheme { let compressed_original_lengths = compressor.compress_canonical( Canonical::Primitive(fsst.uncompressed_lengths().to_primitive().narrow()?), - ctx, - &[], + ctx.clone().descend().with_scheme(self.id(), 0), )?; let compressed_codes_offsets = compressor.compress_canonical( Canonical::Primitive(fsst.codes().offsets().to_primitive().narrow()?), - ctx, - &[], + ctx.descend().with_scheme(self.id(), 1), )?; let compressed_codes = VarBinArray::try_new( compressed_codes_offsets, @@ -253,10 +264,9 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, - _compressor: &CascadingCompressor, + compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { if ctx.is_sample { return Ok(0.0); @@ -264,10 +274,11 @@ impl Scheme for ConstantScheme { let stats = data.string_stats(); - // TODO(connor): Put the execution context somewhere! - let mut ctx = LEGACY_SESSION.create_execution_ctx(); if stats.estimated_distinct_count().is_none_or(|c| c > 1) - || !is_constant(&stats.source().clone().into_array(), &mut ctx)? + || !is_constant( + &stats.source().clone().into_array(), + &mut compressor.execution_ctx(), + )? 
{ return Ok(0.0); } @@ -281,7 +292,6 @@ impl Scheme for ConstantScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.string_stats(); @@ -319,12 +329,28 @@ impl Scheme for NullDominated { is_utf8_string(canonical) } + fn num_children(&self) -> usize { + 1 + } + + fn descendant_exclusions(&self) -> &[DescendantExclusion] { + &[ + DescendantExclusion { + excluded: IntSparseScheme::ID, + children: ChildSelection::All, + }, + DescendantExclusion { + excluded: IntDictScheme::ID, + children: ChildSelection::All, + }, + ] + } + fn expected_compression_ratio( &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. if ctx.allowed_cascading == 0 { @@ -352,7 +378,6 @@ impl Scheme for NullDominated { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { assert!(ctx.allowed_cascading > 0); @@ -363,13 +388,10 @@ impl Scheme for NullDominated { if let Some(sparse) = sparse_encoded.as_opt::() { // Compress the indices only (not the values for strings). 
- let new_excludes = vec![IntSparseScheme.id(), IntDictScheme.id()]; - let indices = sparse.patches().indices().to_primitive().narrow()?; let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices), - ctx.descend(), - &new_excludes, + ctx.descend().with_scheme(self.id(), 0), )?; SparseArray::try_new( @@ -400,7 +422,6 @@ impl Scheme for ZstdScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.string_stats(); @@ -427,7 +448,6 @@ impl Scheme for ZstdBuffersScheme { _compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let stats = data.string_stats(); diff --git a/vortex-btrblocks/src/compressor/temporal.rs b/vortex-btrblocks/src/compressor/temporal.rs index 619338aca3e..e880ed3c6d3 100644 --- a/vortex-btrblocks/src/compressor/temporal.rs +++ b/vortex-btrblocks/src/compressor/temporal.rs @@ -6,9 +6,7 @@ use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; use vortex_array::ToCanonical; -use vortex_array::VortexSessionExecute; use vortex_array::aggregate_fn::fns::is_constant::is_constant; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::TemporalArray; @@ -24,7 +22,7 @@ use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; -use crate::SchemeId; +use crate::SchemeExt; /// Compression scheme for temporal timestamp arrays via datetime-part decomposition. /// @@ -55,12 +53,15 @@ impl Scheme for TemporalScheme { true } + fn num_children(&self) -> usize { + 3 + } + fn expected_compression_ratio( &self, _compressor: &CascadingCompressor, _data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { // Temporal compression (splitting into parts) is almost always beneficial. 
// Return a moderate ratio to ensure this scheme is selected. @@ -72,17 +73,16 @@ impl Scheme for TemporalScheme { compressor: &CascadingCompressor, data: &mut ArrayAndStats, _ctx: CompressorContext, - _excludes: &[SchemeId], ) -> VortexResult { let array = data.array().clone(); let ext_array = array.to_extension(); let temporal_array = TemporalArray::try_from(ext_array.clone().into_array())?; - // TODO(connor): Put the execution context somewhere! - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - // Check for constant array and return early if so. - let is_constant = is_constant(&ext_array.clone().into_array(), &mut ctx)?; + let is_constant = is_constant( + &ext_array.clone().into_array(), + &mut compressor.execution_ctx(), + )?; if is_constant { return Ok( @@ -98,22 +98,23 @@ impl Scheme for TemporalScheme { subseconds, } = split_temporal(temporal_array)?; - let ctx = CompressorContext::default().descend(); - let days = compressor.compress_canonical( Canonical::Primitive(days.to_primitive().narrow()?), - ctx, - &[], + CompressorContext::default() + .descend() + .with_scheme(self.id(), 0), )?; let seconds = compressor.compress_canonical( Canonical::Primitive(seconds.to_primitive().narrow()?), - ctx, - &[], + CompressorContext::default() + .descend() + .with_scheme(self.id(), 1), )?; let subseconds = compressor.compress_canonical( Canonical::Primitive(subseconds.to_primitive().narrow()?), - ctx, - &[], + CompressorContext::default() + .descend() + .with_scheme(self.id(), 2), )?; Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) diff --git a/vortex-compressor/Cargo.toml b/vortex-compressor/Cargo.toml index d4e7bf07e62..260c9c531f5 100644 --- a/vortex-compressor/Cargo.toml +++ b/vortex-compressor/Cargo.toml @@ -16,6 +16,7 @@ version = { workspace = true } [dependencies] itertools = { workspace = true } num-traits = { workspace = true } +parking_lot = { workspace = true } rand = { workspace = true } rustc-hash = { workspace = true 
} tracing = { workspace = true } diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs index 3a488a08e0b..473ce610f13 100644 --- a/vortex-compressor/src/compressor.rs +++ b/vortex-compressor/src/compressor.rs @@ -3,10 +3,15 @@ //! Cascading array compression implementation. +use std::sync::Arc; + +use parking_lot::Mutex; +use parking_lot::MutexGuard; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::CanonicalValidity; use vortex_array::DynArray; +use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; use vortex_array::ToCanonical; @@ -25,12 +30,29 @@ use vortex_array::vtable::ValidityHelper; use vortex_error::VortexResult; use crate::ctx::CompressorContext; +use crate::scheme::DescendantExclusion; use crate::scheme::Scheme; use crate::scheme::SchemeExt; use crate::scheme::SchemeId; use crate::stats::ArrayAndStats; use crate::stats::GenerateStatsOptions; +/// The implicit root scheme ID for the compressor's own cascading (e.g. list offset +/// compression). +pub(crate) const ROOT_SCHEME_ID: SchemeId = SchemeId { + name: "vortex.compressor.root", +}; + +/// Child indices for the compressor's list/listview compression. +pub mod root_list_children { + /// List elements child. + pub const ELEMENTS: usize = 0; + /// List/ListView offsets child. + pub const OFFSETS: usize = 1; + /// ListView sizes child. + pub const SIZES: usize = 2; +} + /// The main compressor type implementing cascading adaptive compression. /// /// This compressor applies adaptive compression [`Scheme`]s to arrays based on their data types and @@ -39,19 +61,44 @@ use crate::stats::GenerateStatsOptions; /// /// The compressor works by: /// 1. Canonicalizing input arrays to a standard representation. -/// 2. Pre-filtering schemes by [`Scheme::matches`] and excludes. +/// 2. Pre-filtering schemes by [`Scheme::matches`] and exclusion rules. /// 3. 
Evaluating each matching scheme's compression ratio on a sample. /// 4. Compressing with the best scheme and verifying the result is smaller. -#[derive(Clone)] +/// +/// No scheme may appear twice in a cascade chain. The compressor enforces this automatically +/// along with push/pull exclusion rules declared by each scheme. +#[derive(Debug, Clone)] pub struct CascadingCompressor { /// The enabled compression schemes. pub schemes: Vec<&'static dyn Scheme>, + + /// Descendant exclusion rules for the compressor's own cascading (e.g. excluding Dict from + /// list offsets). + root_exclusions: Vec, + + /// Shared execution context for array operations during compression. + /// + /// This should have low contention as we only execute arrays one at a time during compression. + ctx: Arc>, } impl CascadingCompressor { - /// Creates a new compressor with the given schemes. - pub fn new(schemes: Vec<&'static dyn Scheme>) -> Self { - Self { schemes } + /// Creates a new compressor with the given schemes and root exclusion rules. + pub fn new( + schemes: Vec<&'static dyn Scheme>, + root_exclusions: Vec, + ) -> Self { + Self { + schemes, + root_exclusions, + // TODO(connor): The caller should probably pass this in. + ctx: Arc::new(Mutex::new(LEGACY_SESSION.create_execution_ctx())), + } + } + + /// Returns a mutable borrow of the execution context. + pub fn execution_ctx(&self) -> MutexGuard<'_, ExecutionCtx> { + self.ctx.lock() } /// Compresses an array using cascading adaptive compression. @@ -64,13 +111,13 @@ impl CascadingCompressor { pub fn compress(&self, array: &ArrayRef) -> VortexResult { let canonical = array .clone() - .execute::(&mut LEGACY_SESSION.create_execution_ctx())? + .execute::(&mut self.execution_ctx())? .0; // Compact it, removing any wasted space before we attempt to compress it. 
let compact = canonical.compact()?; - self.compress_canonical(compact, CompressorContext::default(), &[]) + self.compress_canonical(compact, CompressorContext::default()) } /// Compresses a canonical array by dispatching to type-specific logic. @@ -82,16 +129,15 @@ impl CascadingCompressor { &self, array: Canonical, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { match array { Canonical::Null(null_array) => Ok(null_array.into_array()), Canonical::Bool(bool_array) => Ok(bool_array.into_array()), Canonical::Primitive(primitive) => { - self.choose_and_compress(Canonical::Primitive(primitive), ctx, excludes) + self.choose_and_compress(Canonical::Primitive(primitive), ctx) } Canonical::Decimal(decimal) => { - self.choose_and_compress(Canonical::Decimal(decimal), ctx, excludes) + self.choose_and_compress(Canonical::Decimal(decimal), ctx) } Canonical::Struct(struct_array) => { let fields = struct_array @@ -132,7 +178,7 @@ impl CascadingCompressor { .dtype() .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) { - self.choose_and_compress(Canonical::VarBinView(strings), ctx, excludes) + self.choose_and_compress(Canonical::VarBinView(strings), ctx) } else { // We do not compress binary arrays. Ok(strings.into_array()) @@ -142,11 +188,8 @@ impl CascadingCompressor { let before_nbytes = ext_array.as_ref().nbytes(); // Try scheme-based compression first. - let result = self.choose_and_compress( - Canonical::Extension(ext_array.clone()), - ctx, - excludes, - )?; + let result = + self.choose_and_compress(Canonical::Extension(ext_array.clone()), ctx)?; if result.nbytes() < before_nbytes { return Ok(result); } @@ -164,9 +207,9 @@ impl CascadingCompressor { /// The main scheme-selection entry point for a single leaf array. /// - /// Filters allowed schemes by [`matches`], merges their [`stats_options`] into a single - /// [`GenerateStatsOptions`], then delegates to [`choose_scheme`] to pick the winner by - /// estimated compression ratio. 
+ /// Filters allowed schemes by [`matches`] and exclusion rules, merges their [`stats_options`] + /// into a single [`GenerateStatsOptions`], then delegates to [`choose_scheme`] to pick the + /// winner by estimated compression ratio. /// /// If a winner is found and its compressed output is actually smaller, that output is returned. /// Otherwise, the original array is returned unchanged. @@ -180,13 +223,12 @@ impl CascadingCompressor { &self, canonical: Canonical, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let eligible_schemes: Vec<&'static dyn Scheme> = self .schemes .iter() .copied() - .filter(|s| s.matches(&canonical) && !excludes.contains(&s.id())) + .filter(|s| s.matches(&canonical) && !self.is_excluded(*s, &ctx)) .collect(); let array: ArrayRef = canonical.into(); @@ -219,8 +261,8 @@ impl CascadingCompressor { let mut data = ArrayAndStats::new(array, merged_opts); - if let Some(winner) = self.choose_scheme(&eligible_schemes, &mut data, ctx, excludes)? { - let compressed = winner.compress(self, &mut data, ctx, excludes)?; + if let Some(winner) = self.choose_scheme(&eligible_schemes, &mut data, ctx.clone())? { + let compressed = winner.compress(self, &mut data, ctx)?; if compressed.nbytes() < before_nbytes { return Ok(compressed); } @@ -240,12 +282,11 @@ impl CascadingCompressor { schemes: &[&'static dyn Scheme], data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult> { let mut best: Option<(&'static dyn Scheme, f64)> = None; for &scheme in schemes { - let ratio = scheme.expected_compression_ratio(self, data, ctx, excludes)?; + let ratio = scheme.expected_compression_ratio(self, data, ctx.clone())?; tracing::debug!(scheme = %scheme.id(), ratio, "evaluated compression ratio"); @@ -257,6 +298,55 @@ impl CascadingCompressor { Ok(best.map(|(s, _)| s)) } + /// Returns `true` if the candidate scheme should be excluded based on the cascade history + /// and exclusion rules. 
+ fn is_excluded(&self, candidate: &dyn Scheme, ctx: &CompressorContext) -> bool { + let id = candidate.id(); + let history = ctx.cascade_history(); + + // Self-exclusion: no scheme appears twice in any chain. + if history.iter().any(|(sid, _)| *sid == id) { + return true; + } + + // Push rules: check each ancestor's descendant_exclusions. + for &(ancestor_id, child_idx) in history { + // Root scheme rules. + if ancestor_id == ROOT_SCHEME_ID { + if self + .root_exclusions + .iter() + .any(|rule| rule.excluded == id && rule.children.contains(child_idx)) + { + return true; + } + continue; + } + + // Scheme-level push rules. + if let Some(ancestor) = self.schemes.iter().find(|s| s.id() == ancestor_id) + && ancestor + .descendant_exclusions() + .iter() + .any(|rule| rule.excluded == id && rule.children.contains(child_idx)) + { + return true; + } + } + + // Pull rules: candidate's ancestor_exclusions. + for rule in candidate.ancestor_exclusions() { + if history + .iter() + .any(|(sid, cidx)| *sid == rule.ancestor && rule.children.contains(*cidx)) + { + return true; + } + } + + false + } + /// Compresses a [`ListArray`] by narrowing offsets and recursively compressing elements. fn compress_list_array( &self, @@ -267,10 +357,13 @@ impl CascadingCompressor { let compressed_elems = self.compress(list_array.elements())?; + // Record the root scheme with the offsets child index so root exclusion rules apply. 
+ let offset_ctx = ctx + .descend() + .with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); let compressed_offsets = self.compress_canonical( Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), - ctx, - &[], + offset_ctx, )?; Ok(ListArray::try_new( @@ -289,16 +382,24 @@ impl CascadingCompressor { ctx: CompressorContext, ) -> VortexResult { let compressed_elems = self.compress(list_view.elements())?; + + let offset_ctx = ctx + .clone() + .descend() + .with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); let compressed_offsets = self.compress_canonical( Canonical::Primitive(list_view.offsets().to_primitive().narrow()?), - ctx, - &[], + offset_ctx, )?; + + let sizes_ctx = ctx + .descend() + .with_scheme(ROOT_SCHEME_ID, root_list_children::SIZES); let compressed_sizes = self.compress_canonical( Canonical::Primitive(list_view.sizes().to_primitive().narrow()?), - ctx, - &[], + sizes_ctx, )?; + Ok(ListViewArray::try_new( compressed_elems, compressed_offsets, @@ -309,8 +410,8 @@ impl CascadingCompressor { } } -/// Returns `true` if `ratio` is a valid compression ratio (> 1.0, finite, not subnormal) that beats -/// the current best. +/// Returns `true` if `ratio` is a valid compression ratio (> 1.0, finite, not subnormal) that +/// beats the current best. fn is_better_ratio(ratio: f64, best: &Option<(&'static dyn Scheme, f64)>) -> bool { ratio.is_finite() && !ratio.is_subnormal() && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) } diff --git a/vortex-compressor/src/ctx.rs b/vortex-compressor/src/ctx.rs index b4ec616995d..6fd33be8bd7 100644 --- a/vortex-compressor/src/ctx.rs +++ b/vortex-compressor/src/ctx.rs @@ -3,13 +3,17 @@ //! Compression context for recursive compression. +use crate::scheme::SchemeId; use crate::stats::GenerateStatsOptions; /// Maximum cascade depth for compression. pub const MAX_CASCADE: usize = 3; /// Context passed through recursive compression calls. 
-#[derive(Debug, Clone, Copy)] +/// +/// Tracks the cascade history (which schemes and child indices have been applied in the current +/// chain) so the compressor can enforce exclusion rules and prevent cycles. +#[derive(Debug, Clone)] pub struct CompressorContext { /// Whether we're compressing a sample (for ratio estimation). pub is_sample: bool, @@ -17,6 +21,13 @@ pub struct CompressorContext { pub allowed_cascading: usize, /// Merged stats options from all eligible schemes at this compression site. pub stats_options: GenerateStatsOptions, + /// The cascade chain: `(scheme_id, child_index)` pairs from root to current depth. + /// Used for self-exclusion, push rules ([`descendant_exclusions`]), and pull rules + /// ([`ancestor_exclusions`]). + /// + /// [`descendant_exclusions`]: crate::scheme::Scheme::descendant_exclusions + /// [`ancestor_exclusions`]: crate::scheme::Scheme::ancestor_exclusions + cascade_history: Vec<(SchemeId, usize)>, } impl Default for CompressorContext { @@ -25,24 +36,36 @@ impl Default for CompressorContext { is_sample: false, allowed_cascading: MAX_CASCADE, stats_options: GenerateStatsOptions::default(), + cascade_history: Vec::new(), } } } impl CompressorContext { /// Descend one level in the cascade. - pub fn descend(self) -> Self { - Self { - allowed_cascading: self.allowed_cascading.saturating_sub(1), - ..self - } + pub fn descend(mut self) -> Self { + self.allowed_cascading = self.allowed_cascading.saturating_sub(1); + self } /// Returns a context marked as sample compression. - pub fn as_sample(self) -> Self { - Self { - is_sample: true, - ..self - } + pub fn as_sample(mut self) -> Self { + self.is_sample = true; + self + } + + /// Records a scheme and its child index in the cascade chain. + /// + /// Cascading schemes should call this before delegating child arrays to the compressor. + /// The `child_index` identifies which child of the scheme is being compressed (e.g. for + /// Dict: values=0, codes=1). 
+ pub fn with_scheme(mut self, id: SchemeId, child_index: usize) -> Self { + self.cascade_history.push((id, child_index)); + self + } + + /// Returns the cascade chain of `(scheme_id, child_index)` pairs. + pub fn cascade_history(&self) -> &[(SchemeId, usize)] { + &self.cascade_history } } diff --git a/vortex-compressor/src/lib.rs b/vortex-compressor/src/lib.rs index 8031243725f..7a5d746471a 100644 --- a/vortex-compressor/src/lib.rs +++ b/vortex-compressor/src/lib.rs @@ -24,3 +24,4 @@ mod sample; mod compressor; pub use compressor::CascadingCompressor; +pub use compressor::root_list_children; diff --git a/vortex-compressor/src/scheme.rs b/vortex-compressor/src/scheme.rs index f9c4b6efb9a..5a3008cb95b 100644 --- a/vortex-compressor/src/scheme.rs +++ b/vortex-compressor/src/scheme.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Unified compression scheme trait. +//! Unified compression scheme trait and exclusion rules. use std::fmt; use std::fmt::Debug; @@ -22,12 +22,13 @@ use crate::stats::GenerateStatsOptions; /// Unique identifier for a compression scheme. /// -/// The only way to obtain a [`SchemeId`] is through [`SchemeExt::id()`], which is auto-implemented -/// for all [`Scheme`] types, wrapping [`Scheme::scheme_name()`]. There is no public constructor. +/// Typically obtained through [`SchemeExt::id()`], which is auto-implemented for all [`Scheme`] +/// types, wrapping [`Scheme::scheme_name()`]. The [`name`](SchemeId::name) field is also +/// available for constructing `SchemeId` in `const` contexts (e.g. static exclusion rules). #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct SchemeId { /// The scheme name. - name: &'static str, + pub name: &'static str, } impl fmt::Display for SchemeId { @@ -36,6 +37,58 @@ impl fmt::Display for SchemeId { } } +// --- Exclusion rule types --- + +/// Selects which children of a cascading scheme a rule applies to. 
+#[derive(Debug, Clone, Copy)] +pub enum ChildSelection { + /// Rule applies to all children. + All, + /// Rule applies to a single child. + One(usize), + /// Rule applies to multiple specific children. + Many(&'static [usize]), +} + +impl ChildSelection { + /// Returns `true` if this selection includes the given child index. + pub fn contains(&self, child_index: usize) -> bool { + match self { + ChildSelection::All => true, + ChildSelection::One(idx) => *idx == child_index, + ChildSelection::Many(indices) => indices.contains(&child_index), + } + } +} + +/// Push rule: declared by a cascading scheme to exclude another scheme from the subtree +/// rooted at the specified children. +/// +/// Use this when the declaring scheme (the ancestor) knows about the excluded scheme. +/// For example, `ZigZag` excludes `Dict` from all its children. +#[derive(Debug, Clone, Copy)] +pub struct DescendantExclusion { + /// The scheme to exclude from descendants. + pub excluded: SchemeId, + /// Which children of the declaring scheme this rule applies to. + pub children: ChildSelection, +} + +/// Pull rule: declared by a scheme to exclude itself when the specified ancestor is in +/// the cascade chain. +/// +/// Use this when the excluded scheme (the descendant) knows about the ancestor. For example, +/// `Sequence` excludes itself when `IntDict` is an ancestor on its codes child. +#[derive(Debug, Clone, Copy)] +pub struct AncestorExclusion { + /// The ancestor scheme that makes the declaring scheme ineligible. + pub ancestor: SchemeId, + /// Which children of the ancestor this rule applies to. + pub children: ChildSelection, +} + +// --- Scheme trait --- + /// A single compression encoding that the [`CascadingCompressor`] can select from. /// /// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a @@ -50,7 +103,26 @@ impl fmt::Display for SchemeId { /// /// Every scheme has a globally unique name returned by [`scheme_name`]. 
The [`SchemeExt::id`] /// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] -/// used for equality, hashing, and exclude lists. +/// used for equality, hashing, and exclusion rules. +/// +/// # Cascading and children +/// +/// Schemes that produce child arrays for further compression declare [`num_children`] > 0. Each +/// child is identified by index, with values at index 0 by convention. When cascading, schemes +/// call [`CompressorContext::with_scheme`] to record which child they are compressing. +/// +/// No scheme may appear twice in a cascade chain (enforced by the compressor). This keeps the +/// search space a DAG. +/// +/// # Exclusion rules +/// +/// Schemes declare exclusion rules to prevent incompatible scheme combinations in the cascade +/// chain: +/// +/// - [`descendant_exclusions`] (push): "exclude scheme X from my child Y's subtree." Used when +/// the declaring scheme knows about the excluded scheme. +/// - [`ancestor_exclusions`] (pull): "exclude me if ancestor X's child Y is above me." Used when +/// the declaring scheme knows about the ancestor. /// /// # Implementing a scheme /// @@ -61,8 +133,7 @@ impl fmt::Display for SchemeId { /// returning `f64::MAX` for constant detection or `0.0` for early rejection based on stats). /// /// Schemes that need statistics that may be expensive to compute should override [`stats_options`] -/// to declare what they require. Currently, this is just distinct values and frequencies, but in -/// the future we might add run lengths. The compressor merges all eligible schemes' options before +/// to declare what they require. The compressor merges all eligible schemes' options before /// generating stats, so each stat is always computed at most once for a given array. 
/// /// [`scheme_name`]: Scheme::scheme_name @@ -70,6 +141,9 @@ impl fmt::Display for SchemeId { /// [`compress`]: Scheme::compress /// [`expected_compression_ratio`]: Scheme::expected_compression_ratio /// [`stats_options`]: Scheme::stats_options +/// [`num_children`]: Scheme::num_children +/// [`descendant_exclusions`]: Scheme::descendant_exclusions +/// [`ancestor_exclusions`]: Scheme::ancestor_exclusions pub trait Scheme: Debug + Send + Sync { /// The globally unique name for this scheme (e.g. `"vortex.int.bitpacking"`). fn scheme_name(&self) -> &'static str; @@ -89,9 +163,29 @@ pub trait Scheme: Debug + Send + Sync { GenerateStatsOptions::default() } - /// Estimate the compression ratio for this scheme on the given array. + /// The number of child arrays this scheme produces when cascading. Returns 0 for leaf + /// schemes that produce a final encoded array. + fn num_children(&self) -> usize { + 0 + } + + /// Schemes to exclude from specific children's subtrees (push direction). + /// + /// Each rule says: "when I cascade through child Y, do not use scheme X anywhere in that + /// subtree." Only meaningful when [`num_children`](Scheme::num_children) > 0. + fn descendant_exclusions(&self) -> &[DescendantExclusion] { + &[] + } + + /// Ancestors that make this scheme ineligible (pull direction). /// - /// Schemes listed in `excludes` must not be used when cascading into the compressor. + /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, + /// do not try me." + fn ancestor_exclusions(&self) -> &[AncestorExclusion] { + &[] + } + + /// Estimate the compression ratio for this scheme on the given array. 
/// /// # Errors /// @@ -101,14 +195,14 @@ pub trait Scheme: Debug + Send + Sync { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx, excludes) + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } /// Compress the array using this scheme. /// - /// Schemes listed in `excludes` must not be used when cascading into the compressor. + /// Cascading schemes should call [`CompressorContext::with_scheme`] to record which child + /// they are compressing before delegating to the compressor. /// /// # Errors /// @@ -118,7 +212,6 @@ pub trait Scheme: Debug + Send + Sync { compressor: &CascadingCompressor, data: &mut ArrayAndStats, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult; } @@ -164,7 +257,6 @@ pub fn estimate_compression_ratio_with_sampling( compressor: &CascadingCompressor, array: &ArrayRef, ctx: CompressorContext, - excludes: &[SchemeId], ) -> VortexResult { let sample_array = if ctx.is_sample { array.clone() @@ -182,16 +274,15 @@ pub fn estimate_compression_ratio_with_sampling( }; let mut sample_data = ArrayAndStats::new(sample_array, ctx.stats_options); + let sample_ctx = ctx.as_sample(); let after = scheme - .compress(compressor, &mut sample_data, ctx.as_sample(), excludes)? + .compress(compressor, &mut sample_data, sample_ctx)? 
.nbytes(); let before = sample_data.array().nbytes(); + let ratio = before as f64 / after as f64; - tracing::debug!( - "estimate_compression_ratio_with_sampling(compressor={scheme:#?} ctx={ctx:?}) = {}", - before as f64 / after as f64 - ); + tracing::debug!("estimate_compression_ratio_with_sampling(compressor={scheme:#?}) = {ratio}",); - Ok(before as f64 / after as f64) + Ok(ratio) } From 7357eca39732e9faff42e36bb0f4bdf85ecbfef2 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Thu, 19 Mar 2026 12:42:47 -0400 Subject: [PATCH 6/9] move builtins into `vortex-compressor` Signed-off-by: Connor Tsui --- fuzz/src/array/mod.rs | 6 +- vortex-btrblocks/src/builder.rs | 78 ++-- vortex-btrblocks/src/canonical_compressor.rs | 2 +- vortex-btrblocks/src/lib.rs | 8 +- .../src/{compressor => schemes}/decimal.rs | 4 +- .../float/mod.rs => schemes/float.rs} | 258 +------------ .../integer/mod.rs => schemes/integer.rs} | 343 ++++-------------- .../src/{compressor => schemes}/mod.rs | 0 .../src/{compressor => schemes}/patches.rs | 0 .../src/{compressor => schemes}/rle.rs | 18 +- .../src/{compressor => schemes}/string.rs | 248 +------------ .../src/{compressor => schemes}/temporal.rs | 12 +- vortex-compressor/src/builtins/constant.rs | 216 +++++++++++ .../src/builtins/dict/float.rs | 40 +- .../src/builtins/dict/integer.rs | 50 ++- vortex-compressor/src/builtins/dict/mod.rs | 321 ++++++++++++++++ vortex-compressor/src/builtins/mod.rs | 49 +++ .../src/builtins/uncompressed.rs | 112 ++++++ vortex-compressor/src/compressor.rs | 75 ++-- vortex-compressor/src/ctx.rs | 45 ++- vortex-compressor/src/lib.rs | 1 + vortex-compressor/src/scheme.rs | 22 +- vortex-file/src/strategy.rs | 20 +- vortex-layout/src/layouts/compressed.rs | 2 +- 24 files changed, 992 insertions(+), 938 deletions(-) rename vortex-btrblocks/src/{compressor => schemes}/decimal.rs (96%) rename vortex-btrblocks/src/{compressor/float/mod.rs => schemes/float.rs} (65%) rename vortex-btrblocks/src/{compressor/integer/mod.rs => 
schemes/integer.rs} (77%) rename vortex-btrblocks/src/{compressor => schemes}/mod.rs (100%) rename vortex-btrblocks/src/{compressor => schemes}/patches.rs (100%) rename vortex-btrblocks/src/{compressor => schemes}/rle.rs (93%) rename vortex-btrblocks/src/{compressor => schemes}/string.rs (59%) rename vortex-btrblocks/src/{compressor => schemes}/temporal.rs (91%) create mode 100644 vortex-compressor/src/builtins/constant.rs rename vortex-btrblocks/src/compressor/float/dictionary.rs => vortex-compressor/src/builtins/dict/float.rs (80%) rename vortex-btrblocks/src/compressor/integer/dictionary.rs => vortex-compressor/src/builtins/dict/integer.rs (74%) create mode 100644 vortex-compressor/src/builtins/dict/mod.rs create mode 100644 vortex-compressor/src/builtins/mod.rs create mode 100644 vortex-compressor/src/builtins/uncompressed.rs diff --git a/fuzz/src/array/mod.rs b/fuzz/src/array/mod.rs index d9380a57a08..70094fed072 100644 --- a/fuzz/src/array/mod.rs +++ b/fuzz/src/array/mod.rs @@ -62,9 +62,9 @@ use vortex_array::search_sorted::SearchSortedSide; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; use vortex_btrblocks::SchemeExt; -use vortex_btrblocks::compressor::float; -use vortex_btrblocks::compressor::integer; -use vortex_btrblocks::compressor::string; +use vortex_btrblocks::schemes::float; +use vortex_btrblocks::schemes::integer; +use vortex_btrblocks::schemes::string; use vortex_error::VortexExpect; use vortex_error::vortex_panic; use vortex_mask::Mask; diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 0786dcb3e26..745e4755740 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -3,9 +3,6 @@ //! Builder for configuring `BtrBlocksCompressor` instances. 
-use vortex_compressor::root_list_children; -use vortex_compressor::scheme::ChildSelection; -use vortex_compressor::scheme::DescendantExclusion; use vortex_utils::aliases::hash_set::HashSet; use crate::BtrBlocksCompressor; @@ -13,7 +10,6 @@ use crate::CascadingCompressor; use crate::Scheme; use crate::SchemeExt; use crate::SchemeId; -use crate::compressor::integer::DictScheme as IntDictScheme; /// All available compression schemes. /// @@ -21,43 +17,43 @@ use crate::compressor::integer::DictScheme as IntDictScheme; /// the final scheme list, so that tie-breaking is deterministic. pub const ALL_SCHEMES: &[&dyn Scheme] = &[ // Integer schemes. - &crate::compressor::integer::UncompressedScheme as &dyn Scheme, - &crate::compressor::integer::ConstantScheme, + &crate::schemes::integer::UncompressedScheme as &dyn Scheme, + &crate::schemes::integer::ConstantScheme, // NOTE: For must precede BitPacking to avoid unnecessary patches. - &crate::compressor::integer::FORScheme, - &crate::compressor::integer::BitPackingScheme, - &crate::compressor::integer::ZigZagScheme, - &crate::compressor::integer::SparseScheme, - &crate::compressor::integer::DictScheme, - &crate::compressor::integer::RunEndScheme, - &crate::compressor::integer::SequenceScheme, - &crate::compressor::integer::RLE_INTEGER_SCHEME, + &crate::schemes::integer::FORScheme, + &crate::schemes::integer::BitPackingScheme, + &crate::schemes::integer::ZigZagScheme, + &crate::schemes::integer::SparseScheme, + &crate::schemes::integer::DictScheme, + &crate::schemes::integer::RunEndScheme, + &crate::schemes::integer::SequenceScheme, + &crate::schemes::integer::RLE_INTEGER_SCHEME, #[cfg(feature = "pco")] - &crate::compressor::integer::PcoScheme, + &crate::schemes::integer::PcoScheme, // Float schemes. 
- &crate::compressor::float::UncompressedScheme, - &crate::compressor::float::ConstantScheme, - &crate::compressor::float::ALPScheme, - &crate::compressor::float::ALPRDScheme, - &crate::compressor::float::DictScheme, - &crate::compressor::float::NullDominated, - &crate::compressor::float::RLE_FLOAT_SCHEME, + &crate::schemes::float::UncompressedScheme, + &crate::schemes::float::ConstantScheme, + &crate::schemes::float::ALPScheme, + &crate::schemes::float::ALPRDScheme, + &crate::schemes::float::DictScheme, + &crate::schemes::float::NullDominated, + &crate::schemes::float::RLE_FLOAT_SCHEME, #[cfg(feature = "pco")] - &crate::compressor::float::PcoScheme, + &crate::schemes::float::PcoScheme, // Decimal schemes. - &crate::compressor::decimal::DecimalScheme, + &crate::schemes::decimal::DecimalScheme, // Temporal schemes. - &crate::compressor::temporal::TemporalScheme, + &crate::schemes::temporal::TemporalScheme, // String schemes. - &crate::compressor::string::UncompressedScheme, - &crate::compressor::string::DictScheme, - &crate::compressor::string::FSSTScheme, - &crate::compressor::string::ConstantScheme, - &crate::compressor::string::NullDominated, + &crate::schemes::string::UncompressedScheme, + &crate::schemes::string::DictScheme, + &crate::schemes::string::FSSTScheme, + &crate::schemes::string::ConstantScheme, + &crate::schemes::string::NullDominated, #[cfg(feature = "zstd")] - &crate::compressor::string::ZstdScheme, + &crate::schemes::string::ZstdScheme, #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] - &crate::compressor::string::ZstdBuffersScheme, + &crate::schemes::string::ZstdBuffersScheme, ]; /// Returns the set of scheme IDs excluded by default (behind feature gates or known-expensive). 
@@ -66,13 +62,13 @@ pub fn default_excluded() -> HashSet { let mut excluded = HashSet::new(); #[cfg(feature = "pco")] { - excluded.insert(crate::compressor::integer::PcoScheme.id()); - excluded.insert(crate::compressor::float::PcoScheme.id()); + excluded.insert(crate::schemes::integer::PcoScheme.id()); + excluded.insert(crate::schemes::float::PcoScheme.id()); } #[cfg(feature = "zstd")] - excluded.insert(crate::compressor::string::ZstdScheme.id()); + excluded.insert(crate::schemes::string::ZstdScheme.id()); #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] - excluded.insert(crate::compressor::string::ZstdBuffersScheme.id()); + excluded.insert(crate::schemes::string::ZstdBuffersScheme.id()); excluded } @@ -85,7 +81,7 @@ pub fn default_excluded() -> HashSet { /// /// ```rust /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, Scheme, SchemeExt}; -/// use vortex_btrblocks::compressor::integer::DictScheme; +/// use vortex_btrblocks::schemes::integer::DictScheme; /// /// // Default compressor - all non-excluded schemes allowed. /// let compressor = BtrBlocksCompressorBuilder::default().build(); @@ -156,12 +152,6 @@ impl BtrBlocksCompressorBuilder { .copied() .filter(|s| self.schemes.contains(s)) .collect(); - // Root exclusion: exclude IntDict from list/listview offsets (monotonically - // increasing data where dictionary encoding is wasteful). 
- let root_exclusions = vec![DescendantExclusion { - excluded: IntDictScheme::ID, - children: ChildSelection::One(root_list_children::OFFSETS), - }]; - BtrBlocksCompressor(CascadingCompressor::new(schemes, root_exclusions)) + BtrBlocksCompressor(CascadingCompressor::new(schemes)) } } diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 3d1aea20323..2705d1c2527 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -20,7 +20,7 @@ use crate::CascadingCompressor; /// /// ```rust /// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme, SchemeExt}; -/// use vortex_btrblocks::compressor::integer::DictScheme; +/// use vortex_btrblocks::schemes::integer::DictScheme; /// /// // Default compressor - all schemes allowed. /// let compressor = BtrBlocksCompressor::default(); diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 7b16343ca81..3283513f0b6 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -41,7 +41,7 @@ //! //! ```rust //! use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme, SchemeExt}; -//! use vortex_btrblocks::compressor::integer::DictScheme; +//! use vortex_btrblocks::schemes::integer::DictScheme; //! use vortex_array::DynArray; //! //! // Default compressor with all schemes enabled. @@ -58,7 +58,7 @@ mod builder; mod canonical_compressor; /// Compression scheme implementations. -pub mod compressor; +pub mod schemes; // Re-export framework types from vortex-compressor for backwards compatibility. // Btrblocks-specific exports. 
@@ -66,9 +66,9 @@ pub use builder::ALL_SCHEMES; pub use builder::BtrBlocksCompressorBuilder; pub use builder::default_excluded; pub use canonical_compressor::BtrBlocksCompressor; -pub use compressor::integer::dictionary::dictionary_encode as integer_dictionary_encode; -pub use compressor::patches::compress_patches; +pub use schemes::patches::compress_patches; pub use vortex_compressor::CascadingCompressor; +pub use vortex_compressor::builtins::integer_dictionary_encode; pub use vortex_compressor::ctx::CompressorContext; pub use vortex_compressor::ctx::MAX_CASCADE; pub use vortex_compressor::scheme::Scheme; diff --git a/vortex-btrblocks/src/compressor/decimal.rs b/vortex-btrblocks/src/schemes/decimal.rs similarity index 96% rename from vortex-btrblocks/src/compressor/decimal.rs rename to vortex-btrblocks/src/schemes/decimal.rs index de67244d3c9..8ccfcc6697f 100644 --- a/vortex-btrblocks/src/compressor/decimal.rs +++ b/vortex-btrblocks/src/schemes/decimal.rs @@ -70,9 +70,7 @@ impl Scheme for DecimalScheme { _ => return Ok(decimal.into_array()), }; - let ctx = CompressorContext::default() - .descend() - .with_scheme(self.id(), 0); + let ctx = CompressorContext::default().descend_with_scheme(self.id(), 0); let compressed = compressor.compress_canonical(Canonical::Primitive(prim), ctx)?; DecimalBytePartsArray::try_new(compressed, decimal.decimal_dtype()).map(|d| d.into_array()) diff --git a/vortex-btrblocks/src/compressor/float/mod.rs b/vortex-btrblocks/src/schemes/float.rs similarity index 65% rename from vortex-btrblocks/src/compressor/float/mod.rs rename to vortex-btrblocks/src/schemes/float.rs index b959103d2f6..8279d0eb086 100644 --- a/vortex-btrblocks/src/compressor/float/mod.rs +++ b/vortex-btrblocks/src/schemes/float.rs @@ -1,8 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -pub(crate) mod dictionary; - +// Re-export builtin schemes from vortex-compressor. 
use vortex_alp::ALP; use vortex_alp::ALPArray; use vortex_alp::RDEncoder; @@ -11,14 +10,12 @@ use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::DictArray; -use vortex_array::arrays::MaskedArray; use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::dict::DictArrayParts; use vortex_array::dtype::PType; -use vortex_array::scalar::Scalar; -use vortex_array::vtable::ValidityHelper; +pub use vortex_compressor::builtins::FloatConstantScheme as ConstantScheme; +pub use vortex_compressor::builtins::FloatDictScheme as DictScheme; +pub use vortex_compressor::builtins::FloatUncompressedScheme as UncompressedScheme; +pub use vortex_compressor::builtins::is_float_primitive; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; pub use vortex_compressor::stats::FloatStats; @@ -27,35 +24,17 @@ use vortex_error::vortex_panic; use vortex_sparse::Sparse; use vortex_sparse::SparseArray; -use self::dictionary::dictionary_encode; -use super::integer::DictScheme as IntDictScheme; -use super::integer::RunEndScheme as IntRunEndScheme; -use super::integer::SequenceScheme as IntSequenceScheme; use super::integer::SparseScheme as IntSparseScheme; use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; -use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeExt; use crate::compress_patches; -use crate::compressor::rle; -use crate::compressor::rle::RLEScheme; -use crate::compressor::rle::RLEStats; use crate::estimate_compression_ratio_with_sampling; - -/// Returns `true` if the canonical form represents a floating-point primitive. -fn is_float_primitive(canonical: &Canonical) -> bool { - matches!(canonical, Canonical::Primitive(p) if !p.ptype().is_int()) -} - -/// Uncompressed passthrough for floating-point arrays. 
-#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct UncompressedScheme; - -/// Constant encoding for arrays with a single distinct float value. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct ConstantScheme; +use crate::schemes::rle; +use crate::schemes::rle::RLEScheme; +use crate::schemes::rle::RLEStats; /// ALP (Adaptive Lossless floating-Point) encoding. #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -65,10 +44,6 @@ pub struct ALPScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ALPRDScheme; -/// Dictionary encoding for low-cardinality float values. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct DictScheme; - /// Sparse encoding for null-dominated float arrays. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct NullDominated; @@ -121,105 +96,6 @@ impl RLEStats for FloatStats { /// RLE scheme for float compression. pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); -impl Scheme for UncompressedScheme { - fn scheme_name(&self) -> &'static str { - "vortex.float.uncompressed" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - _data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - Ok(1.0) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - Ok(data.array().clone()) - } -} - -impl Scheme for ConstantScheme { - fn scheme_name(&self) -> &'static str { - "vortex.float.constant" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) - } - - fn detects_constant(&self) -> bool { - true - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - // Never select Constant when sampling. 
- if ctx.is_sample { - return Ok(0.0); - } - - let stats = data.float_stats(); - - if stats.null_count() as usize == stats.source().len() || stats.value_count() == 0 { - return Ok(0.0); - } - - // Can only have 1 distinct value. - if stats.distinct_count().is_some_and(|count| count == 1) { - return Ok(stats.value_count() as f64); - } - - Ok(0.0) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - let stats = data.float_stats(); - - let scalar_idx = - (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); - - match scalar_idx { - Some(idx) => { - let scalar = stats.source().scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); - if !stats.source().all_valid()? { - Ok( - MaskedArray::try_new(const_arr, stats.source().validity().clone())? - .into_array(), - ) - } else { - Ok(const_arr) - } - } - None => Ok(ConstantArray::new( - Scalar::null(stats.source().dtype().clone()), - stats.source().len(), - ) - .into_array()), - } - } -} - impl Scheme for ALPScheme { fn scheme_name(&self) -> &'static str { "vortex.float.alp" @@ -246,7 +122,7 @@ impl Scheme for ALPScheme { return Ok(0.0); } - if ctx.allowed_cascading == 0 { + if ctx.finished_cascading() { // ALP does not compress on its own, we need to be able to cascade it with // an integer compressor. return Ok(0.0); @@ -272,7 +148,7 @@ impl Scheme for ALPScheme { // to keep them linear for easy indexing. 
let compressed_alp_ints = compressor.compress_canonical( Canonical::Primitive(alp_ints), - ctx.descend().with_scheme(self.id(), 0), + ctx.descend_with_scheme(self.id(), 0), )?; let patches = alp.patches().map(compress_patches).transpose()?; @@ -331,108 +207,6 @@ impl Scheme for ALPRDScheme { } } -impl Scheme for DictScheme { - fn scheme_name(&self) -> &'static str { - "vortex.float.dict" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_float_primitive(canonical) - } - - fn stats_options(&self) -> GenerateStatsOptions { - GenerateStatsOptions { - count_distinct_values: true, - } - } - - fn num_children(&self) -> usize { - 2 - } - - fn descendant_exclusions(&self) -> &[DescendantExclusion] { - &[ - // Exclude IntDict from codes child. - DescendantExclusion { - excluded: IntDictScheme::ID, - children: ChildSelection::One(1), - }, - // Exclude IntSequence from codes child. - DescendantExclusion { - excluded: IntSequenceScheme::ID, - children: ChildSelection::One(1), - }, - // Exclude IntDict from values child (replaces ALP propagation). - DescendantExclusion { - excluded: IntDictScheme::ID, - children: ChildSelection::One(0), - }, - // Exclude IntRunEnd from values child (replaces ALP propagation). - DescendantExclusion { - excluded: IntRunEndScheme::ID, - children: ChildSelection::One(0), - }, - ] - } - - fn expected_compression_ratio( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.float_stats(); - - if stats.value_count() == 0 { - return Ok(0.0); - } - - // If the array is high cardinality (>50% unique values), we do not want to compress as a - // dictionary. - if stats - .distinct_count() - .is_some_and(|count| count <= stats.value_count() / 2) - { - // Take a sample and run compression on the sample to determine before/after size. 
- return estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx); - } - - Ok(0.0) - } - - fn compress( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.float_stats(); - - let dict = dictionary_encode(stats); - let has_all_values_referenced = dict.has_all_values_referenced(); - let DictArrayParts { codes, values, .. } = dict.into_parts(); - - let compressed_values = compressor.compress_canonical( - Canonical::Primitive(values.to_primitive()), - ctx.clone().descend().with_scheme(self.id(), 0), - )?; - - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(codes.to_primitive()), - ctx.descend().with_scheme(self.id(), 1), - )?; - - // SAFETY: compressing codes or values does not alter the invariants. - unsafe { - Ok( - DictArray::new_unchecked(compressed_codes, compressed_values) - .set_all_values_referenced(has_all_values_referenced) - .into_array(), - ) - } - } -} - impl Scheme for NullDominated { fn scheme_name(&self) -> &'static str { "vortex.float.sparse" @@ -446,9 +220,9 @@ impl Scheme for NullDominated { 1 } - fn descendant_exclusions(&self) -> &[DescendantExclusion] { - &[DescendantExclusion { - excluded: IntSparseScheme::ID, + fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: IntSparseScheme.id(), children: ChildSelection::All, }] } @@ -460,7 +234,7 @@ impl Scheme for NullDominated { ctx: CompressorContext, ) -> VortexResult { // Only use `SparseScheme` if we can cascade. 
- if ctx.allowed_cascading == 0 { + if ctx.finished_cascading() { return Ok(0.0); } @@ -486,7 +260,7 @@ impl Scheme for NullDominated { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); + assert!(!ctx.finished_cascading()); let stats = data.float_stats(); @@ -497,7 +271,7 @@ impl Scheme for NullDominated { let indices = sparse.patches().indices().to_primitive().narrow()?; let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices.to_primitive()), - ctx.descend().with_scheme(self.id(), 0), + ctx.descend_with_scheme(self.id(), 0), )?; SparseArray::try_new( diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/schemes/integer.rs similarity index 77% rename from vortex-btrblocks/src/compressor/integer/mod.rs rename to vortex-btrblocks/src/schemes/integer.rs index 5d5881f16c4..574c8d56bbb 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/schemes/integer.rs @@ -1,17 +1,18 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -pub(crate) mod dictionary; +// Re-export builtin schemes from vortex-compressor. 
use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::DictArray; -use vortex_array::arrays::MaskedArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::scalar::Scalar; -use vortex_array::vtable::ValidityHelper; +pub use vortex_compressor::builtins::IntConstantScheme as ConstantScheme; +pub use vortex_compressor::builtins::IntDictScheme as DictScheme; +pub use vortex_compressor::builtins::IntUncompressedScheme as UncompressedScheme; +pub use vortex_compressor::builtins::is_integer_primitive; use vortex_compressor::scheme::AncestorExclusion; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; @@ -32,32 +33,17 @@ use vortex_sparse::SparseArray; use vortex_zigzag::ZigZagArray; use vortex_zigzag::zigzag_encode; -use self::dictionary::dictionary_encode; use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeExt; -use crate::SchemeId; use crate::compress_patches; -use crate::compressor::rle; -use crate::compressor::rle::RLEScheme; -use crate::compressor::rle::RLEStats; use crate::estimate_compression_ratio_with_sampling; - -/// Returns `true` if the canonical array is a primitive with an integer ptype. -fn is_integer_primitive(canonical: &Canonical) -> bool { - matches!(canonical, Canonical::Primitive(p) if p.ptype().is_int()) -} - -/// No compression applied. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct UncompressedScheme; - -/// Constant encoding for arrays with a single distinct value. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct ConstantScheme; +use crate::schemes::rle; +use crate::schemes::rle::RLEScheme; +use crate::schemes::rle::RLEStats; /// Frame of Reference encoding. 
#[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -75,46 +61,14 @@ pub struct BitPackingScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct SparseScheme; -impl SparseScheme { - /// Constant [`SchemeId`] for use in static exclusion rules. - pub const ID: SchemeId = SchemeId { - name: "vortex.int.sparse", - }; -} - -/// Dictionary encoding for low-cardinality integer values. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct DictScheme; - -impl DictScheme { - /// Constant [`SchemeId`] for use in static exclusion rules. - pub const ID: SchemeId = SchemeId { - name: "vortex.int.dict", - }; -} - /// Run-end encoding with end positions. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct RunEndScheme; -impl RunEndScheme { - /// Constant [`SchemeId`] for use in static exclusion rules. - pub const ID: SchemeId = SchemeId { - name: "vortex.int.runend", - }; -} - /// Sequence encoding for sequential patterns. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct SequenceScheme; -impl SequenceScheme { - /// Constant [`SchemeId`] for use in static exclusion rules. - pub const ID: SchemeId = SchemeId { - name: "vortex.int.sequence", - }; -} - /// Pco (pcodec) compression for integers. #[cfg(feature = "pco")] #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -166,102 +120,6 @@ impl RLEStats for IntegerStats { /// RLE scheme for integer compression. pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new(); -impl Scheme for UncompressedScheme { - fn scheme_name(&self) -> &'static str { - "vortex.int.uncompressed" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - _data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - // No compression. 
- Ok(1.0) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - Ok(data.array().clone()) - } -} - -impl Scheme for ConstantScheme { - fn scheme_name(&self) -> &'static str { - "vortex.int.constant" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) - } - - fn detects_constant(&self) -> bool { - true - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - // Never yield ConstantScheme for a sample, it could be a false-positive. - if ctx.is_sample { - return Ok(0.0); - } - - let stats = data.integer_stats(); - - // Only arrays with one distinct value can be constant compressed. - if stats.distinct_count().is_none_or(|count| count > 1) { - return Ok(0.0); - } - - Ok(stats.value_count() as f64) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - let stats = data.integer_stats(); - - let scalar_idx = - (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); - - match scalar_idx { - Some(idx) => { - let scalar = stats.source().scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); - if !stats.source().all_valid()? { - Ok( - MaskedArray::try_new(const_arr, stats.source().validity().clone())? - .into_array(), - ) - } else { - Ok(const_arr) - } - } - None => Ok(ConstantArray::new( - Scalar::null(stats.source().dtype().clone()), - stats.source().len(), - ) - .into_array()), - } - } -} - impl Scheme for FORScheme { fn scheme_name(&self) -> &'static str { "vortex.int.for" @@ -278,7 +136,7 @@ impl Scheme for FORScheme { ctx: CompressorContext, ) -> VortexResult { // Only apply if we are not at the leaf. 
- if ctx.allowed_cascading == 0 { + if ctx.finished_cascading() { return Ok(0.0); } @@ -339,9 +197,8 @@ impl Scheme for FORScheme { // of bitpacking. // NOTE: we could delegate in the future if we had another downstream codec that performs // as well. - let mut leaf_ctx = ctx.clone(); - leaf_ctx.allowed_cascading = 0; - let mut biased_data = ArrayAndStats::new(biased.into_array(), ctx.stats_options); + let leaf_ctx = ctx.clone().as_leaf(); + let mut biased_data = ArrayAndStats::new(biased.into_array(), ctx.stats_options()); let compressed = BitPackingScheme.compress(compressor, &mut biased_data, leaf_ctx)?; let for_compressed = FoRArray::try_new(compressed, for_array.reference_scalar().clone())?; @@ -366,18 +223,18 @@ impl Scheme for ZigZagScheme { 1 } - fn descendant_exclusions(&self) -> &[DescendantExclusion] { - &[ + fn descendant_exclusions(&self) -> Vec { + vec![ DescendantExclusion { - excluded: DictScheme::ID, + excluded: DictScheme.id(), children: ChildSelection::All, }, DescendantExclusion { - excluded: RunEndScheme::ID, + excluded: RunEndScheme.id(), children: ChildSelection::All, }, DescendantExclusion { - excluded: SparseScheme::ID, + excluded: SparseScheme.id(), children: ChildSelection::All, }, ] @@ -390,7 +247,7 @@ impl Scheme for ZigZagScheme { ctx: CompressorContext, ) -> VortexResult { // ZigZag is only useful when we cascade it with another encoding. 
- if ctx.allowed_cascading == 0 { + if ctx.finished_cascading() { return Ok(0.0); } @@ -422,7 +279,7 @@ impl Scheme for ZigZagScheme { let zag = zigzag_encode(stats.source().clone())?; let encoded = zag.encoded().to_primitive(); - let child_ctx = ctx.descend().with_scheme(self.id(), 0); + let child_ctx = ctx.descend_with_scheme(self.id(), 0); let compressed = compressor.compress_canonical(Canonical::Primitive(encoded), child_ctx)?; tracing::debug!("zigzag output: {}", compressed.encoding_id()); @@ -503,9 +360,9 @@ impl Scheme for SparseScheme { 2 } - fn descendant_exclusions(&self) -> &[DescendantExclusion] { - &[DescendantExclusion { - excluded: DictScheme::ID, + fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: DictScheme.id(), children: ChildSelection::All, }] } @@ -517,7 +374,7 @@ impl Scheme for SparseScheme { ctx: CompressorContext, ) -> VortexResult { // Only use `SparseScheme` if we can cascade. - if ctx.allowed_cascading == 0 { + if ctx.finished_cascading() { return Ok(0.0); } @@ -566,7 +423,7 @@ impl Scheme for SparseScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); + assert!(!ctx.finished_cascading()); let stats = data.integer_stats(); @@ -599,7 +456,7 @@ impl Scheme for SparseScheme { )?; if let Some(sparse) = sparse_encoded.as_opt::() { - let values_ctx = ctx.clone().descend().with_scheme(self.id(), 0); + let values_ctx = ctx.clone().descend_with_scheme(self.id(), 0); let compressed_values = compressor.compress_canonical( Canonical::Primitive(sparse.patches().values().to_primitive()), values_ctx, @@ -607,7 +464,7 @@ impl Scheme for SparseScheme { let indices = sparse.patches().indices().to_primitive().narrow()?; - let indices_ctx = ctx.descend().with_scheme(self.id(), 1); + let indices_ctx = ctx.descend_with_scheme(self.id(), 1); let compressed_indices = compressor.compress_canonical(Canonical::Primitive(indices), indices_ctx)?; @@ -624,103 +481,6 @@ impl 
Scheme for SparseScheme { } } -impl Scheme for DictScheme { - fn scheme_name(&self) -> &'static str { - "vortex.int.dict" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_integer_primitive(canonical) - } - - fn stats_options(&self) -> GenerateStatsOptions { - GenerateStatsOptions { - count_distinct_values: true, - } - } - - fn num_children(&self) -> usize { - 1 - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - // Dict should not be terminal. - if ctx.allowed_cascading == 0 { - return Ok(0.0); - } - - let stats = data.integer_stats(); - - if stats.value_count() == 0 { - return Ok(0.0); - } - - let distinct_values_count = stats.distinct_count().vortex_expect( - "this must be present since `DictScheme` declared that we need distinct values", - ); - - // If > 50% of the values are distinct, skip dict. - if distinct_values_count > stats.value_count() / 2 { - return Ok(0.0); - } - - // Ignore nulls encoding for the estimate. We only focus on values. - let values_size = stats.source().ptype().bit_width() * distinct_values_count as usize; - - // Assume codes are compressed RLE + BitPacking. - let codes_bw = usize::BITS - distinct_values_count.leading_zeros(); - - let n_runs = (stats.value_count() / stats.average_run_length()) as usize; - - // Assume that codes will either be BitPack or RLE-BitPack. 
- let codes_size_bp = (codes_bw * stats.value_count()) as usize; - let codes_size_rle_bp = usize::checked_mul((codes_bw + 32) as usize, n_runs); - - let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); - - let before = stats.value_count() as usize * stats.source().ptype().bit_width(); - - Ok(before as f64 / (values_size + codes_size) as f64) - } - - fn compress( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); - - let stats = data.integer_stats(); - - // TODO(aduffy): we can be more prescriptive: we know that codes will EITHER be - // RLE or FOR + BP. Cascading probably wastes some time here. - - let dict = dictionary_encode(stats); - - // Cascade the codes child. - let codes_ctx = ctx.descend().with_scheme(self.id(), 0); - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(dict.codes().to_primitive().narrow()?), - codes_ctx, - )?; - - // SAFETY: compressing codes does not change their values. - unsafe { - Ok( - DictArray::new_unchecked(compressed_codes, dict.values().clone()) - .set_all_values_referenced(dict.has_all_values_referenced()) - .into_array(), - ) - } - } -} - impl Scheme for RunEndScheme { fn scheme_name(&self) -> &'static str { "vortex.int.runend" @@ -734,13 +494,26 @@ impl Scheme for RunEndScheme { 2 } - fn descendant_exclusions(&self) -> &[DescendantExclusion] { - &[DescendantExclusion { - excluded: DictScheme::ID, + fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: DictScheme.id(), children: ChildSelection::All, }] } + fn ancestor_exclusions(&self) -> Vec { + use vortex_compressor::builtins::FloatDictScheme; + + vec![ + // Exclude from FloatDict values child (child 0). This replaces the old ALP + // conditional propagation of float RLE exclusion to integer RunEnd. 
+ AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(0), + }, + ] + } + fn expected_compression_ratio( &self, compressor: &CascadingCompressor, @@ -754,7 +527,7 @@ impl Scheme for RunEndScheme { return Ok(0.0); } - if ctx.allowed_cascading == 0 { + if ctx.finished_cascading() { return Ok(0.0); } @@ -768,18 +541,18 @@ impl Scheme for RunEndScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); + assert!(!ctx.finished_cascading()); let stats = data.integer_stats(); // Run-end encode the ends. let (ends, values) = runend_encode(stats.source()); - let values_ctx = ctx.clone().descend().with_scheme(self.id(), 0); + let values_ctx = ctx.clone().descend_with_scheme(self.id(), 0); let compressed_values = compressor .compress_canonical(Canonical::Primitive(values.to_primitive()), values_ctx)?; - let ends_ctx = ctx.descend().with_scheme(self.id(), 1); + let ends_ctx = ctx.descend_with_scheme(self.id(), 1); let compressed_ends = compressor.compress_canonical(Canonical::Primitive(ends.to_primitive()), ends_ctx)?; @@ -805,11 +578,27 @@ impl Scheme for SequenceScheme { is_integer_primitive(canonical) } - fn ancestor_exclusions(&self) -> &[AncestorExclusion] { - &[AncestorExclusion { - ancestor: DictScheme::ID, - children: ChildSelection::All, - }] + fn ancestor_exclusions(&self) -> Vec { + use vortex_compressor::builtins::FloatDictScheme; + use vortex_compressor::builtins::StringDictScheme; + + vec![ + // Exclude from IntDict codes. + AncestorExclusion { + ancestor: DictScheme.id(), + children: ChildSelection::All, + }, + // Exclude from FloatDict codes (child 1). + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(1), + }, + // Exclude from StringDict codes (child 1). 
+ AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(1), + }, + ] } fn expected_compression_ratio( diff --git a/vortex-btrblocks/src/compressor/mod.rs b/vortex-btrblocks/src/schemes/mod.rs similarity index 100% rename from vortex-btrblocks/src/compressor/mod.rs rename to vortex-btrblocks/src/schemes/mod.rs diff --git a/vortex-btrblocks/src/compressor/patches.rs b/vortex-btrblocks/src/schemes/patches.rs similarity index 100% rename from vortex-btrblocks/src/compressor/patches.rs rename to vortex-btrblocks/src/schemes/patches.rs diff --git a/vortex-btrblocks/src/compressor/rle.rs b/vortex-btrblocks/src/schemes/rle.rs similarity index 93% rename from vortex-btrblocks/src/compressor/rle.rs rename to vortex-btrblocks/src/schemes/rle.rs index ae4971c87de..8a41a981cf5 100644 --- a/vortex-btrblocks/src/compressor/rle.rs +++ b/vortex-btrblocks/src/schemes/rle.rs @@ -19,8 +19,8 @@ use crate::CascadingCompressor; use crate::CompressorContext; use crate::Scheme; use crate::SchemeExt; -use crate::compressor::integer::DictScheme as IntDictScheme; use crate::estimate_compression_ratio_with_sampling; +use crate::schemes::integer::DictScheme as IntDictScheme; /// Threshold for the average run length in an array before we consider run-length encoding. pub const RUN_LENGTH_THRESHOLD: u32 = 4; @@ -92,9 +92,9 @@ impl Scheme for RLEScheme { 3 } - fn descendant_exclusions(&self) -> &[DescendantExclusion] { - &[DescendantExclusion { - excluded: IntDictScheme::ID, + fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: IntDictScheme.id(), children: ChildSelection::Many(&[1, 2]), }] } @@ -106,7 +106,7 @@ impl Scheme for RLEScheme { ctx: CompressorContext, ) -> VortexResult { // RLE is only useful when we cascade it with another encoding. 
- if ctx.allowed_cascading == 0 { + if ctx.finished_cascading() { return Ok(0.0); } @@ -137,14 +137,14 @@ impl Scheme for RLEScheme { let stats = data.get_or_insert_with::(|| C::generate_stats(&array)); let rle_array = RLEArray::encode(RLEStats::source(stats))?; - if ctx.allowed_cascading == 0 { + if ctx.finished_cascading() { return Ok(rle_array.into_array()); } let compressed_values = C::compress_values( compressor, &rle_array.values().to_primitive(), - ctx.clone().descend().with_scheme(self.id(), 0), + ctx.clone().descend_with_scheme(self.id(), 0), )?; // Delta in an unstable encoding, once we deem it stable we can switch over to this always. @@ -159,12 +159,12 @@ impl Scheme for RLEScheme { #[cfg(not(feature = "unstable_encodings"))] let compressed_indices = compressor.compress_canonical( Canonical::Primitive(rle_array.indices().to_primitive().narrow()?), - ctx.clone().descend().with_scheme(self.id(), 1), + ctx.clone().descend_with_scheme(self.id(), 1), )?; let compressed_offsets = compressor.compress_canonical( Canonical::Primitive(rle_array.values_idx_offsets().to_primitive().narrow()?), - ctx.descend().with_scheme(self.id(), 2), + ctx.descend_with_scheme(self.id(), 2), )?; // SAFETY: Recursive compression doesn't affect the invariants. diff --git a/vortex-btrblocks/src/compressor/string.rs b/vortex-btrblocks/src/schemes/string.rs similarity index 59% rename from vortex-btrblocks/src/compressor/string.rs rename to vortex-btrblocks/src/schemes/string.rs index b33bae849fa..7a43129b6a6 100644 --- a/vortex-btrblocks/src/compressor/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -1,22 +1,20 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +// Re-export builtin schemes from vortex-compressor. 
use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; -use vortex_array::aggregate_fn::fns::is_constant::is_constant; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::DictArray; -use vortex_array::arrays::MaskedArray; use vortex_array::arrays::VarBinArray; -use vortex_array::builders::dict::dict_encode; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; +pub use vortex_compressor::builtins::StringConstantScheme as ConstantScheme; +pub use vortex_compressor::builtins::StringDictScheme as DictScheme; +pub use vortex_compressor::builtins::StringUncompressedScheme as UncompressedScheme; +pub use vortex_compressor::builtins::is_utf8_string; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; +pub use vortex_compressor::stats::StringStats; use vortex_error::VortexResult; use vortex_fsst::FSSTArray; use vortex_fsst::fsst_compress; @@ -25,39 +23,17 @@ use vortex_sparse::Sparse; use vortex_sparse::SparseArray; use super::integer::DictScheme as IntDictScheme; -use super::integer::SequenceScheme as IntSequenceScheme; use super::integer::SparseScheme as IntSparseScheme; use crate::ArrayAndStats; use crate::CascadingCompressor; use crate::CompressorContext; -use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeExt; -use crate::estimate_compression_ratio_with_sampling; - -/// Returns `true` if the canonical array is a UTF-8 string type. -fn is_utf8_string(canonical: &Canonical) -> bool { - matches!(canonical, Canonical::VarBinView(v) if v.dtype().eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable))) -} - -pub use vortex_compressor::stats::StringStats; - -/// Uncompressed string scheme (identity). -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct UncompressedScheme; - -/// Dictionary encoding for low-cardinality strings. 
-#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct DictScheme; /// FSST (Fast Static Symbol Table) compression. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FSSTScheme; -/// Constant encoding for arrays with a single distinct value. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct ConstantScheme; - /// Sparse encoding for null-dominated arrays. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct NullDominated; @@ -72,129 +48,6 @@ pub struct ZstdScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ZstdBuffersScheme; -impl Scheme for UncompressedScheme { - fn scheme_name(&self) -> &'static str { - "vortex.string.uncompressed" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) - } - - fn expected_compression_ratio( - &self, - _compressor: &CascadingCompressor, - _data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - Ok(1.0) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - Ok(data.array().clone()) - } -} - -impl Scheme for DictScheme { - fn scheme_name(&self) -> &'static str { - "vortex.string.dict" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) - } - - fn stats_options(&self) -> GenerateStatsOptions { - GenerateStatsOptions { - count_distinct_values: true, - } - } - - fn num_children(&self) -> usize { - 2 - } - - fn descendant_exclusions(&self) -> &[DescendantExclusion] { - &[ - DescendantExclusion { - excluded: IntDictScheme::ID, - children: ChildSelection::One(1), - }, - DescendantExclusion { - excluded: IntSequenceScheme::ID, - children: ChildSelection::One(1), - }, - ] - } - - fn expected_compression_ratio( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.string_stats(); - - // If we don't have a sufficiently high number of distinct values, 
do not attempt Dict. - if stats - .estimated_distinct_count() - .is_none_or(|c| c > stats.value_count() / 2) - { - return Ok(0.0); - } - - // If array is all null, do not attempt dict. - if stats.value_count() == 0 { - return Ok(0.0); - } - - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) - } - - fn compress( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - let stats = data.string_stats(); - - let dict = dict_encode(&stats.source().clone().into_array())?; - - // If we are not allowed to cascade, do not attempt codes or values compression. - if ctx.allowed_cascading == 0 { - return Ok(dict.into_array()); - } - - // Find best compressor for codes and values separately. - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(dict.codes().to_primitive()), - ctx.clone().descend().with_scheme(self.id(), 1), - )?; - - // Attempt to compress the values with non-Dict compression. - // Currently this will only be FSST. - let compressed_values = compressor.compress_canonical( - Canonical::VarBinView(dict.values().to_varbinview()), - ctx.descend().with_scheme(self.id(), 0), - )?; - - // SAFETY: compressing codes or values does not alter the invariants. 
- unsafe { - Ok( - DictArray::new_unchecked(compressed_codes, compressed_values) - .set_all_values_referenced(dict.has_all_values_referenced()) - .into_array(), - ) - } - } -} - impl Scheme for FSSTScheme { fn scheme_name(&self) -> &'static str { "vortex.string.fsst" @@ -223,12 +76,12 @@ impl Scheme for FSSTScheme { let compressed_original_lengths = compressor.compress_canonical( Canonical::Primitive(fsst.uncompressed_lengths().to_primitive().narrow()?), - ctx.clone().descend().with_scheme(self.id(), 0), + ctx.clone().descend_with_scheme(self.id(), 0), )?; let compressed_codes_offsets = compressor.compress_canonical( Canonical::Primitive(fsst.codes().offsets().to_primitive().narrow()?), - ctx.descend().with_scheme(self.id(), 1), + ctx.descend_with_scheme(self.id(), 1), )?; let compressed_codes = VarBinArray::try_new( compressed_codes_offsets, @@ -249,77 +102,6 @@ impl Scheme for FSSTScheme { } } -impl Scheme for ConstantScheme { - fn scheme_name(&self) -> &'static str { - "vortex.string.constant" - } - - fn matches(&self, canonical: &Canonical) -> bool { - is_utf8_string(canonical) - } - - fn detects_constant(&self) -> bool { - true - } - - fn expected_compression_ratio( - &self, - compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - ctx: CompressorContext, - ) -> VortexResult { - if ctx.is_sample { - return Ok(0.0); - } - - let stats = data.string_stats(); - - if stats.estimated_distinct_count().is_none_or(|c| c > 1) - || !is_constant( - &stats.source().clone().into_array(), - &mut compressor.execution_ctx(), - )? - { - return Ok(0.0); - } - - // Force constant in these cases. 
- Ok(f64::MAX) - } - - fn compress( - &self, - _compressor: &CascadingCompressor, - data: &mut ArrayAndStats, - _ctx: CompressorContext, - ) -> VortexResult { - let stats = data.string_stats(); - - let scalar_idx = - (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); - - match scalar_idx { - Some(idx) => { - let scalar = stats.source().scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); - if !stats.source().all_valid()? { - Ok( - MaskedArray::try_new(const_arr, stats.source().validity().clone())? - .into_array(), - ) - } else { - Ok(const_arr) - } - } - None => Ok(ConstantArray::new( - Scalar::null(stats.source().dtype().clone()), - stats.source().len(), - ) - .into_array()), - } - } -} - impl Scheme for NullDominated { fn scheme_name(&self) -> &'static str { "vortex.string.sparse" @@ -333,14 +115,14 @@ impl Scheme for NullDominated { 1 } - fn descendant_exclusions(&self) -> &[DescendantExclusion] { - &[ + fn descendant_exclusions(&self) -> Vec { + vec![ DescendantExclusion { - excluded: IntSparseScheme::ID, + excluded: IntSparseScheme.id(), children: ChildSelection::All, }, DescendantExclusion { - excluded: IntDictScheme::ID, + excluded: IntDictScheme.id(), children: ChildSelection::All, }, ] @@ -353,7 +135,7 @@ impl Scheme for NullDominated { ctx: CompressorContext, ) -> VortexResult { // Only use `SparseScheme` if we can cascade. 
- if ctx.allowed_cascading == 0 { + if ctx.finished_cascading() { return Ok(0.0); } @@ -379,7 +161,7 @@ impl Scheme for NullDominated { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); + assert!(!ctx.finished_cascading()); let stats = data.string_stats(); @@ -391,7 +173,7 @@ impl Scheme for NullDominated { let indices = sparse.patches().indices().to_primitive().narrow()?; let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices), - ctx.descend().with_scheme(self.id(), 0), + ctx.descend_with_scheme(self.id(), 0), )?; SparseArray::try_new( diff --git a/vortex-btrblocks/src/compressor/temporal.rs b/vortex-btrblocks/src/schemes/temporal.rs similarity index 91% rename from vortex-btrblocks/src/compressor/temporal.rs rename to vortex-btrblocks/src/schemes/temporal.rs index e880ed3c6d3..7e501f146d0 100644 --- a/vortex-btrblocks/src/compressor/temporal.rs +++ b/vortex-btrblocks/src/schemes/temporal.rs @@ -100,21 +100,15 @@ impl Scheme for TemporalScheme { let days = compressor.compress_canonical( Canonical::Primitive(days.to_primitive().narrow()?), - CompressorContext::default() - .descend() - .with_scheme(self.id(), 0), + CompressorContext::default().descend_with_scheme(self.id(), 0), )?; let seconds = compressor.compress_canonical( Canonical::Primitive(seconds.to_primitive().narrow()?), - CompressorContext::default() - .descend() - .with_scheme(self.id(), 1), + CompressorContext::default().descend_with_scheme(self.id(), 1), )?; let subseconds = compressor.compress_canonical( Canonical::Primitive(subseconds.to_primitive().narrow()?), - CompressorContext::default() - .descend() - .with_scheme(self.id(), 2), + CompressorContext::default().descend_with_scheme(self.id(), 2), )?; Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) diff --git a/vortex-compressor/src/builtins/constant.rs b/vortex-compressor/src/builtins/constant.rs new file mode 100644 index 
00000000000..178f67e3e9d --- /dev/null +++ b/vortex-compressor/src/builtins/constant.rs @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Constant encoding schemes for integer, float, and string arrays. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::aggregate_fn::fns::is_constant::is_constant; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::MaskedArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::ValidityHelper; +use vortex_error::VortexResult; + +use super::is_float_primitive; +use super::is_integer_primitive; +use super::is_utf8_string; +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::scheme::Scheme; +use crate::stats::ArrayAndStats; + +/// Constant encoding for integer arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntConstantScheme; + +impl Scheme for IntConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.integer_stats(); + + if stats.distinct_count().is_none_or(|count| count > 1) { + return Ok(0.0); + } + + Ok(stats.value_count() as f64) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let source = data.integer_stats().source().clone(); + compress_constant_primitive(&source) + } +} + +/// Constant encoding for float arrays with a single distinct value. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatConstantScheme; + +impl Scheme for FloatConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.float_stats(); + + if stats.null_count() as usize == stats.source().len() || stats.value_count() == 0 { + return Ok(0.0); + } + + if stats.distinct_count().is_some_and(|count| count == 1) { + return Ok(stats.value_count() as f64); + } + + Ok(0.0) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let source = data.float_stats().source().clone(); + compress_constant_primitive(&source) + } +} + +/// Constant encoding for string arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringConstantScheme; + +impl Scheme for StringConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.string_stats(); + + if stats.estimated_distinct_count().is_none_or(|c| c > 1) + || !is_constant( + &stats.source().clone().into_array(), + &mut compressor.execution_ctx(), + )? + { + return Ok(0.0); + } + + // Force constant in these cases. 
+ Ok(f64::MAX) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + let scalar_idx = + (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); + + match scalar_idx { + Some(idx) => { + let scalar = stats.source().scalar_at(idx)?; + let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); + if !stats.source().all_valid()? { + Ok( + MaskedArray::try_new(const_arr, stats.source().validity().clone())? + .into_array(), + ) + } else { + Ok(const_arr) + } + } + None => Ok(ConstantArray::new( + Scalar::null(stats.source().dtype().clone()), + stats.source().len(), + ) + .into_array()), + } + } +} + +/// Shared helper for compressing a constant primitive array (int or float). +fn compress_constant_primitive(source: &PrimitiveArray) -> VortexResult { + let scalar_idx = (0..source.len()).position(|idx| source.is_valid(idx).unwrap_or(false)); + + match scalar_idx { + Some(idx) => { + let scalar = source.scalar_at(idx)?; + let const_arr = ConstantArray::new(scalar, source.len()).into_array(); + if !source.all_valid()? { + Ok(MaskedArray::try_new(const_arr, source.validity().clone())?.into_array()) + } else { + Ok(const_arr) + } + } + None => { + Ok(ConstantArray::new(Scalar::null(source.dtype().clone()), source.len()).into_array()) + } + } +} diff --git a/vortex-btrblocks/src/compressor/float/dictionary.rs b/vortex-compressor/src/builtins/dict/float.rs similarity index 80% rename from vortex-btrblocks/src/compressor/float/dictionary.rs rename to vortex-compressor/src/builtins/dict/float.rs index d5fb91c9a67..d9a7af35e16 100644 --- a/vortex-btrblocks/src/compressor/float/dictionary.rs +++ b/vortex-compressor/src/builtins/dict/float.rs @@ -3,7 +3,8 @@ //! Float-specific dictionary encoding implementation. //! -//! 
Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility. +//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for +//! external compatibility. use vortex_array::IntoArray; use vortex_array::arrays::DictArray; @@ -12,11 +13,12 @@ use vortex_array::dtype::half::f16; use vortex_array::validity::Validity; use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; -use vortex_compressor::stats::FloatErasedStats as ErasedStats; use vortex_error::VortexExpect; -use super::FloatStats; +use crate::stats::FloatErasedStats; +use crate::stats::FloatStats; +/// Encodes a typed float array into a [`DictArray`] using the pre-computed distinct values. macro_rules! typed_encode { ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ let distinct = $typed.distinct().vortex_expect( @@ -52,28 +54,31 @@ macro_rules! typed_encode { }; let values = PrimitiveArray::new(values, values_validity).into_array(); - // SAFETY: enforced by the DictEncoder + // SAFETY: enforced by the DictEncoder. unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) } }}; } -/// Compresses a floating-point array into a dictionary arrays according to attached stats. +/// Compresses a floating-point array into a dictionary array according to attached stats. 
pub fn dictionary_encode(stats: &FloatStats) -> DictArray { let validity = stats.source().validity(); match stats.erased() { - ErasedStats::F16(typed) => typed_encode!(stats, typed, validity, f16), - ErasedStats::F32(typed) => typed_encode!(stats, typed, validity, f32), - ErasedStats::F64(typed) => typed_encode!(stats, typed, validity, f64), + FloatErasedStats::F16(typed) => typed_encode!(stats, typed, validity, f16), + FloatErasedStats::F32(typed) => typed_encode!(stats, typed, validity, f32), + FloatErasedStats::F64(typed) => typed_encode!(stats, typed, validity, f64), } } +/// Stateless encoder that maps values to dictionary codes via a `HashMap`. struct DictEncoder; +/// Trait for encoding values of type `T` into codes of type `I`. trait Encode { /// Using the distinct value set, turn the values into a set of codes. fn encode(distinct: &[T], values: &[T]) -> Buffer; } +/// Implements [`Encode`] for a float type using its bit representation as the hash key. macro_rules! impl_encode { ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); }; ($typ:ty, $utyp:ty, $($ityp:ty),+) => { @@ -91,12 +96,11 @@ macro_rules! impl_encode { let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); for value in values { - // Any code lookups which fail are for nulls, so their value - // does not matter. + // Any code lookups which fail are for nulls, so their value does not matter. 
output.push(codes.get(&value.to_bits()).copied().unwrap_or_default()); } - return output.freeze(); + output.freeze() } } )* @@ -117,12 +121,12 @@ mod tests { use vortex_array::validity::Validity; use vortex_buffer::buffer; - use super::super::FloatStats; - use crate::compressor::float::dictionary::dictionary_encode; + use super::dictionary_encode; + use crate::stats::FloatStats; + use crate::stats::GenerateStatsOptions; #[test] fn test_float_dict_encode() { - // Create an array that has some nulls let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32]; let validity = Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); @@ -130,7 +134,7 @@ mod tests { let stats = FloatStats::generate_opts( &array, - crate::GenerateStatsOptions { + GenerateStatsOptions { count_distinct_values: true, }, ); @@ -138,15 +142,11 @@ mod tests { assert_eq!(dict_array.values().len(), 2); assert_eq!(dict_array.codes().len(), 5); - let undict = dict_array; - - // We just use code zero but it doesn't really matter. - // We can just shove a whole validity buffer in there instead. let expected = PrimitiveArray::new( buffer![1f32, 2f32, 2f32, 1f32, 1f32], Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), ) .into_array(); - assert_arrays_eq!(undict.as_ref(), expected.as_ref()); + assert_arrays_eq!(dict_array.as_ref(), expected.as_ref()); } } diff --git a/vortex-btrblocks/src/compressor/integer/dictionary.rs b/vortex-compressor/src/builtins/dict/integer.rs similarity index 74% rename from vortex-btrblocks/src/compressor/integer/dictionary.rs rename to vortex-compressor/src/builtins/dict/integer.rs index 2bbf10c224a..00ec39ae1a9 100644 --- a/vortex-btrblocks/src/compressor/integer/dictionary.rs +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -1,9 +1,10 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! 
Dictionary compressor that reuses the unique values in the `IntegerStats`. +//! Dictionary compressor that reuses the unique values in the [`IntegerStats`]. //! -//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility. +//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted +//! for external compatibility. use vortex_array::IntoArray; use vortex_array::arrays::DictArray; @@ -11,11 +12,12 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; -use vortex_compressor::stats::IntegerErasedStats as ErasedStats; use vortex_error::VortexExpect; -use super::IntegerStats; +use crate::stats::IntegerErasedStats; +use crate::stats::IntegerStats; +/// Encodes a typed integer array into a [`DictArray`] using the pre-computed distinct values. macro_rules! typed_encode { ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ let distinct = $typed.distinct().vortex_expect( @@ -51,39 +53,41 @@ macro_rules! typed_encode { }; let values = PrimitiveArray::new(values, values_validity).into_array(); - // SAFETY: invariants enforced in DictEncoder + // SAFETY: invariants enforced in DictEncoder. unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) } }}; } -/// Compresses an integer array into a dictionary arrays according to attached stats. +/// Compresses an integer array into a dictionary array according to attached stats. 
#[expect( clippy::cognitive_complexity, reason = "complexity from match on all integer types" )] pub fn dictionary_encode(stats: &IntegerStats) -> DictArray { - // We need to preserve the nullability somehow from the original let src_validity = stats.source().validity(); match stats.erased() { - ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8), - ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16), - ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32), - ErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64), - ErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8), - ErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16), - ErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32), - ErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64), + IntegerErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8), + IntegerErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16), + IntegerErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32), + IntegerErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64), + IntegerErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8), + IntegerErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16), + IntegerErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32), + IntegerErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64), } } +/// Stateless encoder that maps values to dictionary codes via a `HashMap`. struct DictEncoder; +/// Trait for encoding values of type `T` into codes of type `I`. trait Encode { /// Using the distinct value set, turn the values into a set of codes. fn encode(distinct: &[T], values: &[T]) -> Buffer; } +/// Implements [`Encode`] for an integer type with all code width variants (u8, u16, u32). macro_rules! 
impl_encode { ($typ:ty) => { impl_encode!($typ, u8, u16, u32); }; ($typ:ty, $($ityp:ty),+) => { @@ -101,13 +105,12 @@ macro_rules! impl_encode { let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); for value in values { - // Any code lookups which fail are for nulls, so their value - // does not matter. + // Any code lookups which fail are for nulls, so their value does not matter. // SAFETY: we have exactly sized output to be as large as values. unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) }; } - return output.freeze(); + output.freeze() } } )* @@ -133,12 +136,11 @@ mod tests { use vortex_array::validity::Validity; use vortex_buffer::buffer; - use super::IntegerStats; use super::dictionary_encode; + use crate::stats::IntegerStats; #[test] fn test_dict_encode_integer_stats() { - // Create an array that has some nulls let data = buffer![100i32, 200, 100, 0, 100]; let validity = Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); @@ -146,7 +148,7 @@ mod tests { let stats = IntegerStats::generate_opts( &array, - crate::GenerateStatsOptions { + crate::stats::GenerateStatsOptions { count_distinct_values: true, }, ); @@ -154,15 +156,11 @@ mod tests { assert_eq!(dict_array.values().len(), 2); assert_eq!(dict_array.codes().len(), 5); - let undict = dict_array; - - // We just use code zero, but it doesn't really matter. - // We can just shove a whole validity buffer in there instead. 
let expected = PrimitiveArray::new( buffer![100i32, 200, 100, 100, 100], Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), ) .into_array(); - assert_arrays_eq!(undict.as_ref(), expected.as_ref()); + assert_arrays_eq!(dict_array.as_ref(), expected.as_ref()); } } diff --git a/vortex-compressor/src/builtins/dict/mod.rs b/vortex-compressor/src/builtins/dict/mod.rs new file mode 100644 index 00000000000..15c133daf7e --- /dev/null +++ b/vortex-compressor/src/builtins/dict/mod.rs @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Dictionary encoding schemes for integer, float, and string arrays. + +pub mod float; +pub mod integer; + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::dict::DictArrayParts; +use vortex_array::builders::dict::dict_encode; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use super::is_float_primitive; +use super::is_integer_primitive; +use super::is_utf8_string; +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::scheme::ChildSelection; +use crate::scheme::DescendantExclusion; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::scheme::estimate_compression_ratio_with_sampling; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +/// Dictionary encoding for low-cardinality integer values. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntDictScheme; + +impl Scheme for IntDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + fn num_children(&self) -> usize { + 1 + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + // Dict should not be terminal. + if ctx.finished_cascading() { + return Ok(0.0); + } + + let stats = data.integer_stats(); + + if stats.value_count() == 0 { + return Ok(0.0); + } + + let distinct_values_count = stats.distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + // If > 50% of the values are distinct, skip dict. + if distinct_values_count > stats.value_count() / 2 { + return Ok(0.0); + } + + // Ignore nulls encoding for the estimate. We only focus on values. + let values_size = stats.source().ptype().bit_width() * distinct_values_count as usize; + + // Assume codes are compressed RLE + BitPacking. + let codes_bw = usize::BITS - distinct_values_count.leading_zeros(); + + let n_runs = (stats.value_count() / stats.average_run_length()) as usize; + + // Assume that codes will either be BitPack or RLE-BitPack. 
+ let codes_size_bp = (codes_bw * stats.value_count()) as usize; + let codes_size_rle_bp = usize::checked_mul((codes_bw + 32) as usize, n_runs); + + let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); + + let before = stats.value_count() as usize * stats.source().ptype().bit_width(); + + Ok(before as f64 / (values_size + codes_size) as f64) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + assert!(!ctx.finished_cascading()); + + let stats = data.integer_stats(); + + let dict = integer::dictionary_encode(stats); + + // Cascade the codes child (child 0 for now — will become child 1 with values-first). + let codes_ctx = ctx.descend_with_scheme(self.id(), 0); + let compressed_codes = compressor.compress_canonical( + Canonical::Primitive(dict.codes().to_primitive().narrow()?), + codes_ctx, + )?; + + // SAFETY: compressing codes does not change their values. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, dict.values().clone()) + .set_all_values_referenced(dict.has_all_values_referenced()) + .into_array(), + ) + } + } +} + +/// Dictionary encoding for low-cardinality float values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatDictScheme; + +impl Scheme for FloatDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + fn num_children(&self) -> usize { + 2 + } + + fn descendant_exclusions(&self) -> Vec { + vec![ + // Exclude IntDict from codes child. + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + // Exclude IntDict from values child (replaces ALP conditional propagation). 
+ DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(0), + }, + // Note: IntSequenceScheme and IntRunEndScheme exclusions are expressed as pull + // rules on those schemes in vortex-btrblocks, since they can't be referenced + // from vortex-compressor. + ] + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.float_stats(); + + if stats.value_count() == 0 { + return Ok(0.0); + } + + if stats + .distinct_count() + .is_some_and(|count| count <= stats.value_count() / 2) + { + return estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx); + } + + Ok(0.0) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.float_stats(); + + let dict = float::dictionary_encode(stats); + let has_all_values_referenced = dict.has_all_values_referenced(); + let DictArrayParts { codes, values, .. } = dict.into_parts(); + + // Values = child 0. + let compressed_values = compressor.compress_canonical( + Canonical::Primitive(values.to_primitive()), + ctx.clone().descend_with_scheme(self.id(), 0), + )?; + + // Codes = child 1. + let compressed_codes = compressor.compress_canonical( + Canonical::Primitive(codes.to_primitive()), + ctx.descend_with_scheme(self.id(), 1), + )?; + + // SAFETY: compressing codes or values does not alter the invariants. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(has_all_values_referenced) + .into_array(), + ) + } + } +} + +/// Dictionary encoding for low-cardinality string values. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringDictScheme; + +impl Scheme for StringDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + fn num_children(&self) -> usize { + 2 + } + + fn descendant_exclusions(&self) -> Vec { + vec![ + // Exclude IntDict from codes child. + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + // Note: IntSequenceScheme exclusion is expressed as a pull rule on + // IntSequenceScheme in vortex-btrblocks. + ] + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + if stats + .estimated_distinct_count() + .is_none_or(|c| c > stats.value_count() / 2) + { + return Ok(0.0); + } + + if stats.value_count() == 0 { + return Ok(0.0); + } + + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + let dict = dict_encode(&stats.source().clone().into_array())?; + + // If we are not allowed to cascade, do not attempt codes or values compression. + if ctx.finished_cascading() { + return Ok(dict.into_array()); + } + + // Codes = child 1. + let compressed_codes = compressor.compress_canonical( + Canonical::Primitive(dict.codes().to_primitive()), + ctx.clone().descend_with_scheme(self.id(), 1), + )?; + + // Values = child 0. 
+ let compressed_values = compressor.compress_canonical( + Canonical::VarBinView(dict.values().to_varbinview()), + ctx.descend_with_scheme(self.id(), 0), + )?; + + // SAFETY: compressing codes or values does not alter the invariants. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(dict.has_all_values_referenced()) + .into_array(), + ) + } + } +} diff --git a/vortex-compressor/src/builtins/mod.rs b/vortex-compressor/src/builtins/mod.rs new file mode 100644 index 00000000000..b648f557941 --- /dev/null +++ b/vortex-compressor/src/builtins/mod.rs @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Built-in compression schemes that use only `vortex-array` encodings. +//! +//! These schemes produce arrays using types already in `vortex-array` ([`ConstantArray`], +//! [`DictArray`], [`MaskedArray`], etc.) and have no external encoding crate dependencies. +//! +//! [`ConstantArray`]: vortex_array::arrays::ConstantArray +//! [`DictArray`]: vortex_array::arrays::DictArray +//! [`MaskedArray`]: vortex_array::arrays::MaskedArray + +pub use constant::FloatConstantScheme; +pub use constant::IntConstantScheme; +pub use constant::StringConstantScheme; +pub use dict::FloatDictScheme; +pub use dict::IntDictScheme; +pub use dict::StringDictScheme; +pub use dict::float::dictionary_encode as float_dictionary_encode; +pub use dict::integer::dictionary_encode as integer_dictionary_encode; +pub use uncompressed::FloatUncompressedScheme; +pub use uncompressed::IntUncompressedScheme; +pub use uncompressed::StringUncompressedScheme; + +mod constant; +mod dict; +mod uncompressed; + +use vortex_array::Canonical; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; + +/// Returns `true` if the canonical array is a primitive with an integer ptype. 
+pub fn is_integer_primitive(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Primitive(p) if p.ptype().is_int()) +} + +/// Returns `true` if the canonical form represents a floating-point primitive. +pub fn is_float_primitive(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Primitive(p) if !p.ptype().is_int()) +} + +/// Returns `true` if the canonical array is a UTF-8 string type. +pub fn is_utf8_string(canonical: &Canonical) -> bool { + matches!(canonical, + Canonical::VarBinView(v) if + v.dtype().eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) + ) +} diff --git a/vortex-compressor/src/builtins/uncompressed.rs b/vortex-compressor/src/builtins/uncompressed.rs new file mode 100644 index 00000000000..2ec6b3d019e --- /dev/null +++ b/vortex-compressor/src/builtins/uncompressed.rs @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Uncompressed passthrough schemes for integer, float, and string arrays. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_error::VortexResult; + +use super::is_float_primitive; +use super::is_integer_primitive; +use super::is_utf8_string; +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::scheme::Scheme; +use crate::stats::ArrayAndStats; + +/// No compression applied to integer arrays. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntUncompressedScheme; + +impl Scheme for IntUncompressedScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.uncompressed" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + Ok(1.0) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + Ok(data.array().clone()) + } +} + +/// No compression applied to float arrays. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatUncompressedScheme; + +impl Scheme for FloatUncompressedScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.uncompressed" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + Ok(1.0) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + Ok(data.array().clone()) + } +} + +/// No compression applied to string arrays. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringUncompressedScheme; + +impl Scheme for StringUncompressedScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.uncompressed" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + Ok(1.0) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + Ok(data.array().clone()) + } +} diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs index 473ce610f13..02d3a64c305 100644 --- a/vortex-compressor/src/compressor.rs +++ b/vortex-compressor/src/compressor.rs @@ -29,7 +29,9 @@ use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; use vortex_error::VortexResult; +use crate::builtins::IntDictScheme; use crate::ctx::CompressorContext; +use crate::scheme::ChildSelection; use crate::scheme::DescendantExclusion; use crate::scheme::Scheme; use crate::scheme::SchemeExt; @@ -37,9 +39,10 @@ use crate::scheme::SchemeId; use crate::stats::ArrayAndStats; use crate::stats::GenerateStatsOptions; -/// The implicit root scheme ID for the compressor's own cascading (e.g. list offset -/// compression). -pub(crate) const ROOT_SCHEME_ID: SchemeId = SchemeId { +/// The implicit root scheme ID for the compressor's own cascading (e.g. list offset compression). +/// +/// This is the **only** [`SchemeId`] that is not auto-provided via [`SchemeExt`]. +const ROOT_SCHEME_ID: SchemeId = SchemeId { name: "vortex.compressor.root", }; @@ -83,11 +86,17 @@ pub struct CascadingCompressor { } impl CascadingCompressor { - /// Creates a new compressor with the given schemes and root exclusion rules. 
- pub fn new( - schemes: Vec<&'static dyn Scheme>, - root_exclusions: Vec, - ) -> Self { + /// Creates a new compressor with the given schemes. + /// + /// Root-level exclusion rules (e.g. excluding Dict from list offsets) are built + /// automatically. + pub fn new(schemes: Vec<&'static dyn Scheme>) -> Self { + // Root exclusion: exclude IntDict from list/listview offsets (monotonically + // increasing data where dictionary encoding is wasteful). + let root_exclusions = vec![DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(root_list_children::OFFSETS), + }]; Self { schemes, root_exclusions, @@ -256,8 +265,7 @@ impl CascadingCompressor { acc.merge(s.stats_options()) }); - let mut ctx = ctx; - ctx.stats_options = merged_opts; + let ctx = ctx.with_stats_options(merged_opts); let mut data = ArrayAndStats::new(array, merged_opts); @@ -298,32 +306,32 @@ impl CascadingCompressor { Ok(best.map(|(s, _)| s)) } - /// Returns `true` if the candidate scheme should be excluded based on the cascade history - /// and exclusion rules. + /// Returns `true` if the candidate scheme should be excluded based on the cascade history and + /// exclusion rules. fn is_excluded(&self, candidate: &dyn Scheme, ctx: &CompressorContext) -> bool { let id = candidate.id(); let history = ctx.cascade_history(); // Self-exclusion: no scheme appears twice in any chain. - if history.iter().any(|(sid, _)| *sid == id) { + if history.iter().any(|&(sid, _)| sid == id) { return true; } - // Push rules: check each ancestor's descendant_exclusions. - for &(ancestor_id, child_idx) in history { - // Root scheme rules. - if ancestor_id == ROOT_SCHEME_ID { - if self - .root_exclusions - .iter() - .any(|rule| rule.excluded == id && rule.children.contains(child_idx)) - { - return true; - } - continue; - } + let mut iter = history.iter().copied().peekable(); + + // The root entry is always first in the history (if present). Check if the root has + // excluded us. 
+ if let Some((_, child_idx)) = iter.next_if(|&(sid, _)| sid == ROOT_SCHEME_ID) + && self + .root_exclusions + .iter() + .any(|rule| rule.excluded == id && rule.children.contains(child_idx)) + { + return true; + } - // Scheme-level push rules. + // Push rules: Check if any of our ancestors have excluded us. + for (ancestor_id, child_idx) in iter { if let Some(ancestor) = self.schemes.iter().find(|s| s.id() == ancestor_id) && ancestor .descendant_exclusions() @@ -334,7 +342,7 @@ impl CascadingCompressor { } } - // Pull rules: candidate's ancestor_exclusions. + // Pull rules: Check if we have excluded ourselves because of our ancestors. for rule in candidate.ancestor_exclusions() { if history .iter() @@ -358,9 +366,7 @@ impl CascadingCompressor { let compressed_elems = self.compress(list_array.elements())?; // Record the root scheme with the offsets child index so root exclusion rules apply. - let offset_ctx = ctx - .descend() - .with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); + let offset_ctx = ctx.descend_with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); let compressed_offsets = self.compress_canonical( Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), offset_ctx, @@ -385,16 +391,13 @@ impl CascadingCompressor { let offset_ctx = ctx .clone() - .descend() - .with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); + .descend_with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); let compressed_offsets = self.compress_canonical( Canonical::Primitive(list_view.offsets().to_primitive().narrow()?), offset_ctx, )?; - let sizes_ctx = ctx - .descend() - .with_scheme(ROOT_SCHEME_ID, root_list_children::SIZES); + let sizes_ctx = ctx.descend_with_scheme(ROOT_SCHEME_ID, root_list_children::SIZES); let compressed_sizes = self.compress_canonical( Canonical::Primitive(list_view.sizes().to_primitive().narrow()?), sizes_ctx, diff --git a/vortex-compressor/src/ctx.rs b/vortex-compressor/src/ctx.rs index 6fd33be8bd7..770d48358ff 100644 --- 
a/vortex-compressor/src/ctx.rs +++ b/vortex-compressor/src/ctx.rs @@ -3,6 +3,8 @@ //! Compression context for recursive compression. +use vortex_error::VortexExpect; + use crate::scheme::SchemeId; use crate::stats::GenerateStatsOptions; @@ -16,11 +18,11 @@ pub const MAX_CASCADE: usize = 3; #[derive(Debug, Clone)] pub struct CompressorContext { /// Whether we're compressing a sample (for ratio estimation). - pub is_sample: bool, + is_sample: bool, /// Remaining cascade depth allowed. - pub allowed_cascading: usize, + allowed_cascading: usize, /// Merged stats options from all eligible schemes at this compression site. - pub stats_options: GenerateStatsOptions, + stats_options: GenerateStatsOptions, /// The cascade chain: `(scheme_id, child_index)` pairs from root to current depth. /// Used for self-exclusion, push rules ([`descendant_exclusions`]), and pull rules /// ([`ancestor_exclusions`]). @@ -42,9 +44,24 @@ impl Default for CompressorContext { } impl CompressorContext { - /// Descend one level in the cascade. - pub fn descend(mut self) -> Self { - self.allowed_cascading = self.allowed_cascading.saturating_sub(1); + /// Whether this context is for sample compression (ratio estimation). + pub fn is_sample(&self) -> bool { + self.is_sample + } + + /// Whether cascading is exhausted (no further cascade levels allowed). + pub fn finished_cascading(&self) -> bool { + self.allowed_cascading == 0 + } + + /// Returns the merged stats generation options for this compression site. + pub fn stats_options(&self) -> GenerateStatsOptions { + self.stats_options + } + + /// Returns a context with the given stats options. + pub fn with_stats_options(mut self, opts: GenerateStatsOptions) -> Self { + self.stats_options = opts; self } @@ -54,12 +71,22 @@ impl CompressorContext { self } - /// Records a scheme and its child index in the cascade chain. + /// Returns a context that disallows further cascading. 
+ pub fn as_leaf(mut self) -> Self { + self.allowed_cascading = 0; + self + } + + /// Descends one level in the cascade, recording the current scheme and which child is + /// being compressed. /// - /// Cascading schemes should call this before delegating child arrays to the compressor. /// The `child_index` identifies which child of the scheme is being compressed (e.g. for /// Dict: values=0, codes=1). - pub fn with_scheme(mut self, id: SchemeId, child_index: usize) -> Self { + pub fn descend_with_scheme(mut self, id: SchemeId, child_index: usize) -> Self { + self.allowed_cascading = self + .allowed_cascading + .checked_sub(1) + .vortex_expect("cannot descend: cascade depth exhausted"); self.cascade_history.push((id, child_index)); self } diff --git a/vortex-compressor/src/lib.rs b/vortex-compressor/src/lib.rs index 7a5d746471a..9066d9ecd86 100644 --- a/vortex-compressor/src/lib.rs +++ b/vortex-compressor/src/lib.rs @@ -16,6 +16,7 @@ //! This crate contains no encoding dependencies. Batteries-included compressors are provided by //! downstream crates like `vortex-btrblocks`, which register different encodings to the compressor. +pub mod builtins; pub mod ctx; pub mod scheme; pub mod stats; diff --git a/vortex-compressor/src/scheme.rs b/vortex-compressor/src/scheme.rs index 5a3008cb95b..14a3810a282 100644 --- a/vortex-compressor/src/scheme.rs +++ b/vortex-compressor/src/scheme.rs @@ -22,13 +22,13 @@ use crate::stats::GenerateStatsOptions; /// Unique identifier for a compression scheme. /// -/// Typically obtained through [`SchemeExt::id()`], which is auto-implemented for all [`Scheme`] -/// types, wrapping [`Scheme::scheme_name()`]. The [`name`](SchemeId::name) field is also -/// available for constructing `SchemeId` in `const` contexts (e.g. static exclusion rules). +/// The only way to obtain a [`SchemeId`] is through [`SchemeExt::id()`], which is +/// auto-implemented for all [`Scheme`] types. There is no public constructor. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct SchemeId { - /// The scheme name. - pub name: &'static str, + /// The scheme name. Only constructable within `vortex-compressor` — the only public way + /// to obtain a [`SchemeId`] is through [`SchemeExt::id()`]. + pub(super) name: &'static str, } impl fmt::Display for SchemeId { @@ -173,16 +173,16 @@ pub trait Scheme: Debug + Send + Sync { /// /// Each rule says: "when I cascade through child Y, do not use scheme X anywhere in that /// subtree." Only meaningful when [`num_children`](Scheme::num_children) > 0. - fn descendant_exclusions(&self) -> &[DescendantExclusion] { - &[] + fn descendant_exclusions(&self) -> Vec { + Vec::new() } /// Ancestors that make this scheme ineligible (pull direction). /// /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, /// do not try me." - fn ancestor_exclusions(&self) -> &[AncestorExclusion] { - &[] + fn ancestor_exclusions(&self) -> Vec { + Vec::new() } /// Estimate the compression ratio for this scheme on the given array. 
@@ -258,7 +258,7 @@ pub fn estimate_compression_ratio_with_sampling( array: &ArrayRef, ctx: CompressorContext, ) -> VortexResult { - let sample_array = if ctx.is_sample { + let sample_array = if ctx.is_sample() { array.clone() } else { let source_len = array.len(); @@ -273,7 +273,7 @@ pub fn estimate_compression_ratio_with_sampling( sample(array, SAMPLE_SIZE, sample_count) }; - let mut sample_data = ArrayAndStats::new(sample_array, ctx.stats_options); + let mut sample_data = ArrayAndStats::new(sample_array, ctx.stats_options()); let sample_ctx = ctx.as_sample(); let after = scheme diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index adc4f08dc63..534da6c659e 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -27,16 +27,6 @@ use vortex_array::arrays::VarBin; use vortex_array::arrays::VarBinView; use vortex_array::dtype::FieldPath; use vortex_array::session::ArrayRegistry; -#[cfg(feature = "zstd")] -use vortex_btrblocks::BtrBlocksCompressorBuilder; -#[cfg(feature = "zstd")] -use vortex_btrblocks::SchemeExt; -#[cfg(feature = "zstd")] -use vortex_btrblocks::compressor::float; -#[cfg(feature = "zstd")] -use vortex_btrblocks::compressor::integer; -#[cfg(feature = "zstd")] -use vortex_btrblocks::compressor::string; use vortex_bytebool::ByteBool; use vortex_datetime_parts::DateTimeParts; use vortex_decimal_byte_parts::DecimalByteParts; @@ -64,6 +54,16 @@ use vortex_sequence::Sequence; use vortex_sparse::Sparse; use vortex_utils::aliases::hash_map::HashMap; use vortex_zigzag::ZigZag; + +#[rustfmt::skip] +#[cfg(feature = "zstd")] +use vortex_btrblocks::{ + BtrBlocksCompressorBuilder, + SchemeExt, + schemes::float, + schemes::integer, + schemes::string, +}; #[cfg(feature = "zstd")] use vortex_zstd::Zstd; #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] diff --git a/vortex-layout/src/layouts/compressed.rs b/vortex-layout/src/layouts/compressed.rs index 25228a12813..981779edeb3 100644 --- 
a/vortex-layout/src/layouts/compressed.rs +++ b/vortex-layout/src/layouts/compressed.rs @@ -12,7 +12,7 @@ use vortex_array::expr::stats::Stat; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; use vortex_btrblocks::SchemeExt; -use vortex_btrblocks::compressor::integer::DictScheme; +use vortex_btrblocks::schemes::integer::DictScheme; use vortex_error::VortexResult; use vortex_io::runtime::Handle; From e4b8de51242947abad8854e1d6139eca07c71c71 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Thu, 19 Mar 2026 15:37:40 -0400 Subject: [PATCH 7/9] add `compress_child` Signed-off-by: Connor Tsui --- vortex-array/public-api.lock | 8 + vortex-btrblocks/public-api.lock | 824 +++++++------------ vortex-btrblocks/src/builder.rs | 81 +- vortex-btrblocks/src/canonical_compressor.rs | 4 +- vortex-btrblocks/src/lib.rs | 4 +- vortex-btrblocks/src/schemes/decimal.rs | 12 +- vortex-btrblocks/src/schemes/float.rs | 76 +- vortex-btrblocks/src/schemes/integer.rs | 171 ++-- vortex-btrblocks/src/schemes/mod.rs | 13 +- vortex-btrblocks/src/schemes/rle.rs | 74 +- vortex-btrblocks/src/schemes/string.rs | 60 +- vortex-btrblocks/src/schemes/temporal.rs | 26 +- vortex-compressor/public-api.lock | 658 ++++++++++++++- vortex-compressor/src/builtins/dict/mod.rs | 81 +- vortex-compressor/src/compressor.rs | 38 +- vortex-compressor/src/ctx.rs | 17 +- vortex-compressor/src/scheme.rs | 55 +- vortex-file/src/strategy.rs | 4 +- vortex-layout/src/layouts/compressed.rs | 4 +- 19 files changed, 1305 insertions(+), 905 deletions(-) diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index b72ced1ebf3..cc9eb9b9e64 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -22144,6 +22144,14 @@ pub fn vortex_array::ExecutionCtx::new(session: vortex_session::VortexSession) - pub fn vortex_array::ExecutionCtx::session(&self) -> &vortex_session::VortexSession +impl core::clone::Clone for vortex_array::ExecutionCtx + +pub fn 
vortex_array::ExecutionCtx::clone(&self) -> vortex_array::ExecutionCtx + +impl core::fmt::Debug for vortex_array::ExecutionCtx + +pub fn vortex_array::ExecutionCtx::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + impl core::fmt::Display for vortex_array::ExecutionCtx pub fn vortex_array::ExecutionCtx::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index 8fe186586bd..92e9a1472a6 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -24,827 +24,603 @@ pub use vortex_btrblocks::StringStats pub use vortex_btrblocks::estimate_compression_ratio_with_sampling -pub mod vortex_btrblocks::compressor +pub use vortex_btrblocks::integer_dictionary_encode -pub mod vortex_btrblocks::compressor::decimal +pub mod vortex_btrblocks::schemes -pub struct vortex_btrblocks::compressor::decimal::DecimalScheme +pub mod vortex_btrblocks::schemes::decimal -impl core::clone::Clone for vortex_btrblocks::compressor::decimal::DecimalScheme +pub struct vortex_btrblocks::schemes::decimal::DecimalScheme -pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::clone(&self) -> vortex_btrblocks::compressor::decimal::DecimalScheme +impl core::clone::Clone for vortex_btrblocks::schemes::decimal::DecimalScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::decimal::DecimalScheme +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::clone(&self) -> vortex_btrblocks::schemes::decimal::DecimalScheme -impl core::cmp::PartialEq for vortex_btrblocks::compressor::decimal::DecimalScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::decimal::DecimalScheme -pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::eq(&self, other: &vortex_btrblocks::compressor::decimal::DecimalScheme) -> bool +impl core::cmp::PartialEq for vortex_btrblocks::schemes::decimal::DecimalScheme -impl core::fmt::Debug for 
vortex_btrblocks::compressor::decimal::DecimalScheme +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::eq(&self, other: &vortex_btrblocks::schemes::decimal::DecimalScheme) -> bool -pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::fmt::Debug for vortex_btrblocks::schemes::decimal::DecimalScheme -impl core::marker::Copy for vortex_btrblocks::compressor::decimal::DecimalScheme +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::decimal::DecimalScheme +impl core::marker::Copy for vortex_btrblocks::schemes::decimal::DecimalScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::decimal::DecimalScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::decimal::DecimalScheme -pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::decimal::DecimalScheme -pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> 
vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::decimal::DecimalScheme::scheme_name(&self) -> &'static str +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub mod vortex_btrblocks::compressor::float +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::num_children(&self) -> usize -pub use vortex_btrblocks::compressor::float::FloatStats +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::scheme_name(&self) -> &'static str -pub struct vortex_btrblocks::compressor::float::ALPRDScheme +pub mod vortex_btrblocks::schemes::float -impl core::clone::Clone for vortex_btrblocks::compressor::float::ALPRDScheme +pub use vortex_btrblocks::schemes::float::FloatConstantScheme -pub fn vortex_btrblocks::compressor::float::ALPRDScheme::clone(&self) -> vortex_btrblocks::compressor::float::ALPRDScheme +pub use vortex_btrblocks::schemes::float::FloatDictScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::float::ALPRDScheme +pub use vortex_btrblocks::schemes::float::FloatStats -impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::ALPRDScheme +pub use vortex_btrblocks::schemes::float::FloatUncompressedScheme -pub fn vortex_btrblocks::compressor::float::ALPRDScheme::eq(&self, other: &vortex_btrblocks::compressor::float::ALPRDScheme) -> bool +pub use vortex_btrblocks::schemes::float::is_float_primitive -impl core::fmt::Debug for vortex_btrblocks::compressor::float::ALPRDScheme +pub struct vortex_btrblocks::schemes::float::ALPRDScheme 
-pub fn vortex_btrblocks::compressor::float::ALPRDScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::clone::Clone for vortex_btrblocks::schemes::float::ALPRDScheme -impl core::marker::Copy for vortex_btrblocks::compressor::float::ALPRDScheme +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::clone(&self) -> vortex_btrblocks::schemes::float::ALPRDScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::ALPRDScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::float::ALPRDScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::ALPRDScheme +impl core::cmp::PartialEq for vortex_btrblocks::schemes::float::ALPRDScheme -pub fn vortex_btrblocks::compressor::float::ALPRDScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::eq(&self, other: &vortex_btrblocks::schemes::float::ALPRDScheme) -> bool -pub fn vortex_btrblocks::compressor::float::ALPRDScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::fmt::Debug for vortex_btrblocks::schemes::float::ALPRDScheme -pub fn vortex_btrblocks::compressor::float::ALPRDScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::compressor::float::ALPRDScheme::scheme_name(&self) -> &'static str +impl core::marker::Copy for 
vortex_btrblocks::schemes::float::ALPRDScheme -pub struct vortex_btrblocks::compressor::float::ALPScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::float::ALPRDScheme -impl core::clone::Clone for vortex_btrblocks::compressor::float::ALPScheme +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::ALPRDScheme -pub fn vortex_btrblocks::compressor::float::ALPScheme::clone(&self) -> vortex_btrblocks::compressor::float::ALPScheme +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::Eq for vortex_btrblocks::compressor::float::ALPScheme +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::ALPScheme +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::compressor::float::ALPScheme::eq(&self, other: &vortex_btrblocks::compressor::float::ALPScheme) -> bool +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::scheme_name(&self) -> &'static str -impl core::fmt::Debug for vortex_btrblocks::compressor::float::ALPScheme +pub struct vortex_btrblocks::schemes::float::ALPScheme -pub fn vortex_btrblocks::compressor::float::ALPScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::clone::Clone for vortex_btrblocks::schemes::float::ALPScheme -impl core::marker::Copy for vortex_btrblocks::compressor::float::ALPScheme +pub fn 
vortex_btrblocks::schemes::float::ALPScheme::clone(&self) -> vortex_btrblocks::schemes::float::ALPScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::ALPScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::float::ALPScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::ALPScheme +impl core::cmp::PartialEq for vortex_btrblocks::schemes::float::ALPScheme -pub fn vortex_btrblocks::compressor::float::ALPScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::float::ALPScheme::eq(&self, other: &vortex_btrblocks::schemes::float::ALPScheme) -> bool -pub fn vortex_btrblocks::compressor::float::ALPScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::fmt::Debug for vortex_btrblocks::schemes::float::ALPScheme -pub fn vortex_btrblocks::compressor::float::ALPScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::float::ALPScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::compressor::float::ALPScheme::scheme_name(&self) -> &'static str +impl core::marker::Copy for vortex_btrblocks::schemes::float::ALPScheme -pub struct vortex_btrblocks::compressor::float::ConstantScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::float::ALPScheme -impl core::clone::Clone for vortex_btrblocks::compressor::float::ConstantScheme +impl vortex_compressor::scheme::Scheme for 
vortex_btrblocks::schemes::float::ALPScheme -pub fn vortex_btrblocks::compressor::float::ConstantScheme::clone(&self) -> vortex_btrblocks::compressor::float::ConstantScheme +pub fn vortex_btrblocks::schemes::float::ALPScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::Eq for vortex_btrblocks::compressor::float::ConstantScheme +pub fn vortex_btrblocks::schemes::float::ALPScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::ConstantScheme +pub fn vortex_btrblocks::schemes::float::ALPScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::compressor::float::ConstantScheme::eq(&self, other: &vortex_btrblocks::compressor::float::ConstantScheme) -> bool +pub fn vortex_btrblocks::schemes::float::ALPScheme::num_children(&self) -> usize -impl core::fmt::Debug for vortex_btrblocks::compressor::float::ConstantScheme +pub fn vortex_btrblocks::schemes::float::ALPScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::compressor::float::ConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub struct vortex_btrblocks::schemes::float::FloatRLEConfig -impl core::marker::Copy for vortex_btrblocks::compressor::float::ConstantScheme +impl core::clone::Clone for vortex_btrblocks::schemes::float::FloatRLEConfig -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::ConstantScheme +pub fn vortex_btrblocks::schemes::float::FloatRLEConfig::clone(&self) -> vortex_btrblocks::schemes::float::FloatRLEConfig -impl 
vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::ConstantScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::float::FloatRLEConfig -pub fn vortex_btrblocks::compressor::float::ConstantScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::cmp::PartialEq for vortex_btrblocks::schemes::float::FloatRLEConfig -pub fn vortex_btrblocks::compressor::float::ConstantScheme::detects_constant(&self) -> bool +pub fn vortex_btrblocks::schemes::float::FloatRLEConfig::eq(&self, other: &vortex_btrblocks::schemes::float::FloatRLEConfig) -> bool -pub fn vortex_btrblocks::compressor::float::ConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::fmt::Debug for vortex_btrblocks::schemes::float::FloatRLEConfig -pub fn vortex_btrblocks::compressor::float::ConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::float::FloatRLEConfig::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::compressor::float::ConstantScheme::scheme_name(&self) -> &'static str +impl core::marker::Copy for vortex_btrblocks::schemes::float::FloatRLEConfig -pub struct vortex_btrblocks::compressor::float::DictScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::float::FloatRLEConfig -impl core::clone::Clone for vortex_btrblocks::compressor::float::DictScheme +pub struct vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub fn 
vortex_btrblocks::compressor::float::DictScheme::clone(&self) -> vortex_btrblocks::compressor::float::DictScheme +impl core::clone::Clone for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::float::DictScheme +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::clone(&self) -> vortex_btrblocks::schemes::float::NullDominatedSparseScheme -impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::DictScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub fn vortex_btrblocks::compressor::float::DictScheme::eq(&self, other: &vortex_btrblocks::compressor::float::DictScheme) -> bool +impl core::cmp::PartialEq for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -impl core::fmt::Debug for vortex_btrblocks::compressor::float::DictScheme +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::eq(&self, other: &vortex_btrblocks::schemes::float::NullDominatedSparseScheme) -> bool -pub fn vortex_btrblocks::compressor::float::DictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::fmt::Debug for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -impl core::marker::Copy for vortex_btrblocks::compressor::float::DictScheme +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::DictScheme +impl core::marker::Copy for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::DictScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub fn vortex_btrblocks::compressor::float::DictScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut 
vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub fn vortex_btrblocks::compressor::float::DictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::float::DictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::compressor::float::DictScheme::scheme_name(&self) -> &'static str +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::float::DictScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub struct vortex_btrblocks::compressor::float::FloatRLEConfig +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::num_children(&self) -> usize -impl 
core::clone::Clone for vortex_btrblocks::compressor::float::FloatRLEConfig +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::compressor::float::FloatRLEConfig::clone(&self) -> vortex_btrblocks::compressor::float::FloatRLEConfig +pub struct vortex_btrblocks::schemes::float::PcoScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::float::FloatRLEConfig +impl core::clone::Clone for vortex_btrblocks::schemes::float::PcoScheme -impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::FloatRLEConfig +pub fn vortex_btrblocks::schemes::float::PcoScheme::clone(&self) -> vortex_btrblocks::schemes::float::PcoScheme -pub fn vortex_btrblocks::compressor::float::FloatRLEConfig::eq(&self, other: &vortex_btrblocks::compressor::float::FloatRLEConfig) -> bool +impl core::cmp::Eq for vortex_btrblocks::schemes::float::PcoScheme -impl core::fmt::Debug for vortex_btrblocks::compressor::float::FloatRLEConfig +impl core::cmp::PartialEq for vortex_btrblocks::schemes::float::PcoScheme -pub fn vortex_btrblocks::compressor::float::FloatRLEConfig::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::float::PcoScheme::eq(&self, other: &vortex_btrblocks::schemes::float::PcoScheme) -> bool -impl core::marker::Copy for vortex_btrblocks::compressor::float::FloatRLEConfig +impl core::fmt::Debug for vortex_btrblocks::schemes::float::PcoScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::FloatRLEConfig +pub fn vortex_btrblocks::schemes::float::PcoScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub struct vortex_btrblocks::compressor::float::NullDominated +impl core::marker::Copy for vortex_btrblocks::schemes::float::PcoScheme -impl core::clone::Clone for vortex_btrblocks::compressor::float::NullDominated +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::float::PcoScheme -pub 
fn vortex_btrblocks::compressor::float::NullDominated::clone(&self) -> vortex_btrblocks::compressor::float::NullDominated +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::PcoScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::float::NullDominated +pub fn vortex_btrblocks::schemes::float::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::NullDominated +pub fn vortex_btrblocks::schemes::float::PcoScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::compressor::float::NullDominated::eq(&self, other: &vortex_btrblocks::compressor::float::NullDominated) -> bool +pub fn vortex_btrblocks::schemes::float::PcoScheme::scheme_name(&self) -> &'static str -impl core::fmt::Debug for vortex_btrblocks::compressor::float::NullDominated +pub const vortex_btrblocks::schemes::float::RLE_FLOAT_SCHEME: vortex_btrblocks::schemes::rle::RLEScheme -pub fn vortex_btrblocks::compressor::float::NullDominated::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub mod vortex_btrblocks::schemes::integer -impl core::marker::Copy for vortex_btrblocks::compressor::float::NullDominated +pub use vortex_btrblocks::schemes::integer::IntConstantScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::NullDominated +pub use vortex_btrblocks::schemes::integer::IntDictScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::NullDominated +pub use vortex_btrblocks::schemes::integer::IntUncompressedScheme -pub fn vortex_btrblocks::compressor::float::NullDominated::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut 
vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub use vortex_btrblocks::schemes::integer::IntegerStats -pub fn vortex_btrblocks::compressor::float::NullDominated::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub use vortex_btrblocks::schemes::integer::is_integer_primitive -pub fn vortex_btrblocks::compressor::float::NullDominated::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub struct vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::compressor::float::NullDominated::scheme_name(&self) -> &'static str +impl core::clone::Clone for vortex_btrblocks::schemes::integer::BitPackingScheme -pub struct vortex_btrblocks::compressor::float::PcoScheme +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::clone(&self) -> vortex_btrblocks::schemes::integer::BitPackingScheme -impl core::clone::Clone for vortex_btrblocks::compressor::float::PcoScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::compressor::float::PcoScheme::clone(&self) -> vortex_btrblocks::compressor::float::PcoScheme +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::BitPackingScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::float::PcoScheme +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::BitPackingScheme) -> bool -impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::PcoScheme +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn 
vortex_btrblocks::compressor::float::PcoScheme::eq(&self, other: &vortex_btrblocks::compressor::float::PcoScheme) -> bool +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::fmt::Debug for vortex_btrblocks::compressor::float::PcoScheme +impl core::marker::Copy for vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::compressor::float::PcoScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::BitPackingScheme -impl core::marker::Copy for vortex_btrblocks::compressor::float::PcoScheme +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::BitPackingScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::PcoScheme +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::PcoScheme +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::float::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::matches(&self, canonical: 
&vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::compressor::float::PcoScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::compressor::float::PcoScheme::scheme_name(&self) -> &'static str +pub struct vortex_btrblocks::schemes::integer::FoRScheme -pub struct vortex_btrblocks::compressor::float::UncompressedScheme +impl core::clone::Clone for vortex_btrblocks::schemes::integer::FoRScheme -impl core::clone::Clone for vortex_btrblocks::compressor::float::UncompressedScheme +pub fn vortex_btrblocks::schemes::integer::FoRScheme::clone(&self) -> vortex_btrblocks::schemes::integer::FoRScheme -pub fn vortex_btrblocks::compressor::float::UncompressedScheme::clone(&self) -> vortex_btrblocks::compressor::float::UncompressedScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::FoRScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::float::UncompressedScheme +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::FoRScheme -impl core::cmp::PartialEq for vortex_btrblocks::compressor::float::UncompressedScheme +pub fn vortex_btrblocks::schemes::integer::FoRScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::FoRScheme) -> bool -pub fn vortex_btrblocks::compressor::float::UncompressedScheme::eq(&self, other: &vortex_btrblocks::compressor::float::UncompressedScheme) -> bool +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::FoRScheme -impl core::fmt::Debug for vortex_btrblocks::compressor::float::UncompressedScheme +pub fn vortex_btrblocks::schemes::integer::FoRScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::compressor::float::UncompressedScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::marker::Copy for vortex_btrblocks::schemes::integer::FoRScheme -impl core::marker::Copy 
for vortex_btrblocks::compressor::float::UncompressedScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::FoRScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::float::UncompressedScheme +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::FoRScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::float::UncompressedScheme +pub fn vortex_btrblocks::schemes::integer::FoRScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::float::UncompressedScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::FoRScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::float::UncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::FoRScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::compressor::float::UncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn 
vortex_btrblocks::schemes::integer::FoRScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::compressor::float::UncompressedScheme::scheme_name(&self) -> &'static str +pub struct vortex_btrblocks::schemes::integer::IntRLEConfig -pub const vortex_btrblocks::compressor::float::RLE_FLOAT_SCHEME: vortex_btrblocks::compressor::rle::RLEScheme +impl core::clone::Clone for vortex_btrblocks::schemes::integer::IntRLEConfig -pub mod vortex_btrblocks::compressor::integer +pub fn vortex_btrblocks::schemes::integer::IntRLEConfig::clone(&self) -> vortex_btrblocks::schemes::integer::IntRLEConfig -pub use vortex_btrblocks::compressor::integer::IntegerStats +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::IntRLEConfig -pub struct vortex_btrblocks::compressor::integer::BitPackingScheme +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::IntRLEConfig -impl core::clone::Clone for vortex_btrblocks::compressor::integer::BitPackingScheme +pub fn vortex_btrblocks::schemes::integer::IntRLEConfig::eq(&self, other: &vortex_btrblocks::schemes::integer::IntRLEConfig) -> bool -pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::clone(&self) -> vortex_btrblocks::compressor::integer::BitPackingScheme +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::IntRLEConfig -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::BitPackingScheme +pub fn vortex_btrblocks::schemes::integer::IntRLEConfig::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::BitPackingScheme +impl core::marker::Copy for vortex_btrblocks::schemes::integer::IntRLEConfig -pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::BitPackingScheme) -> bool +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::IntRLEConfig -impl core::fmt::Debug for 
vortex_btrblocks::compressor::integer::BitPackingScheme +pub struct vortex_btrblocks::schemes::integer::PcoScheme -pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::clone::Clone for vortex_btrblocks::schemes::integer::PcoScheme -impl core::marker::Copy for vortex_btrblocks::compressor::integer::BitPackingScheme +pub fn vortex_btrblocks::schemes::integer::PcoScheme::clone(&self) -> vortex_btrblocks::schemes::integer::PcoScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::BitPackingScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::PcoScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::BitPackingScheme +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::PcoScheme -pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::PcoScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::PcoScheme) -> bool -pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::PcoScheme -pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::integer::PcoScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> 
core::fmt::Result -pub fn vortex_btrblocks::compressor::integer::BitPackingScheme::scheme_name(&self) -> &'static str +impl core::marker::Copy for vortex_btrblocks::schemes::integer::PcoScheme -pub struct vortex_btrblocks::compressor::integer::ConstantScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::PcoScheme -impl core::clone::Clone for vortex_btrblocks::compressor::integer::ConstantScheme +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::PcoScheme -pub fn vortex_btrblocks::compressor::integer::ConstantScheme::clone(&self) -> vortex_btrblocks::compressor::integer::ConstantScheme +pub fn vortex_btrblocks::schemes::integer::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::ConstantScheme +pub fn vortex_btrblocks::schemes::integer::PcoScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::ConstantScheme +pub fn vortex_btrblocks::schemes::integer::PcoScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::compressor::integer::ConstantScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::ConstantScheme) -> bool +pub fn vortex_btrblocks::schemes::integer::PcoScheme::scheme_name(&self) -> &'static str -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::ConstantScheme +pub struct vortex_btrblocks::schemes::integer::RunEndScheme -pub fn vortex_btrblocks::compressor::integer::ConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> 
core::fmt::Result +impl core::clone::Clone for vortex_btrblocks::schemes::integer::RunEndScheme -impl core::marker::Copy for vortex_btrblocks::compressor::integer::ConstantScheme +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::clone(&self) -> vortex_btrblocks::schemes::integer::RunEndScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::ConstantScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::RunEndScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::ConstantScheme +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::RunEndScheme -pub fn vortex_btrblocks::compressor::integer::ConstantScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::RunEndScheme) -> bool -pub fn vortex_btrblocks::compressor::integer::ConstantScheme::detects_constant(&self) -> bool +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::RunEndScheme -pub fn vortex_btrblocks::compressor::integer::ConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::compressor::integer::ConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +impl core::marker::Copy for vortex_btrblocks::schemes::integer::RunEndScheme -pub fn 
vortex_btrblocks::compressor::integer::ConstantScheme::scheme_name(&self) -> &'static str +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::RunEndScheme -pub struct vortex_btrblocks::compressor::integer::DictScheme +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::RunEndScheme -impl core::clone::Clone for vortex_btrblocks::compressor::integer::DictScheme +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::ancestor_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::compressor::integer::DictScheme::clone(&self) -> vortex_btrblocks::compressor::integer::DictScheme +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::DictScheme +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::descendant_exclusions(&self) -> alloc::vec::Vec -impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::DictScheme +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::integer::DictScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::DictScheme) -> bool +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::DictScheme +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::num_children(&self) -> usize -pub fn vortex_btrblocks::compressor::integer::DictScheme::fmt(&self, f: &mut 
core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::scheme_name(&self) -> &'static str -impl core::marker::Copy for vortex_btrblocks::compressor::integer::DictScheme +pub struct vortex_btrblocks::schemes::integer::SequenceScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::DictScheme +impl core::clone::Clone for vortex_btrblocks::schemes::integer::SequenceScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::DictScheme +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::clone(&self) -> vortex_btrblocks::schemes::integer::SequenceScheme -pub fn vortex_btrblocks::compressor::integer::DictScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::SequenceScheme -pub fn vortex_btrblocks::compressor::integer::DictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::SequenceScheme -pub fn vortex_btrblocks::compressor::integer::DictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::SequenceScheme) -> bool -pub fn vortex_btrblocks::compressor::integer::DictScheme::scheme_name(&self) -> &'static str +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::SequenceScheme -pub fn 
vortex_btrblocks::compressor::integer::DictScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub struct vortex_btrblocks::compressor::integer::FORScheme +impl core::marker::Copy for vortex_btrblocks::schemes::integer::SequenceScheme -impl core::clone::Clone for vortex_btrblocks::compressor::integer::FORScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::SequenceScheme -pub fn vortex_btrblocks::compressor::integer::FORScheme::clone(&self) -> vortex_btrblocks::compressor::integer::FORScheme +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::SequenceScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::FORScheme +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::ancestor_exclusions(&self) -> alloc::vec::Vec -impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::FORScheme +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::integer::FORScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::FORScheme) -> bool +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::FORScheme +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn 
vortex_btrblocks::compressor::integer::FORScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::scheme_name(&self) -> &'static str -impl core::marker::Copy for vortex_btrblocks::compressor::integer::FORScheme +pub struct vortex_btrblocks::schemes::integer::SparseScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::FORScheme +impl core::clone::Clone for vortex_btrblocks::schemes::integer::SparseScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::FORScheme +pub fn vortex_btrblocks::schemes::integer::SparseScheme::clone(&self) -> vortex_btrblocks::schemes::integer::SparseScheme -pub fn vortex_btrblocks::compressor::integer::FORScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::SparseScheme -pub fn vortex_btrblocks::compressor::integer::FORScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::SparseScheme -pub fn vortex_btrblocks::compressor::integer::FORScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::integer::SparseScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::SparseScheme) -> bool -pub fn vortex_btrblocks::compressor::integer::FORScheme::scheme_name(&self) -> &'static str +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::SparseScheme -pub 
struct vortex_btrblocks::compressor::integer::IntRLEConfig +pub fn vortex_btrblocks::schemes::integer::SparseScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::clone::Clone for vortex_btrblocks::compressor::integer::IntRLEConfig +impl core::marker::Copy for vortex_btrblocks::schemes::integer::SparseScheme -pub fn vortex_btrblocks::compressor::integer::IntRLEConfig::clone(&self) -> vortex_btrblocks::compressor::integer::IntRLEConfig +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::SparseScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::IntRLEConfig +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::SparseScheme -impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::IntRLEConfig +pub fn vortex_btrblocks::schemes::integer::SparseScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::integer::IntRLEConfig::eq(&self, other: &vortex_btrblocks::compressor::integer::IntRLEConfig) -> bool +pub fn vortex_btrblocks::schemes::integer::SparseScheme::descendant_exclusions(&self) -> alloc::vec::Vec -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::IntRLEConfig +pub fn vortex_btrblocks::schemes::integer::SparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::integer::IntRLEConfig::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::integer::SparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl core::marker::Copy for 
vortex_btrblocks::compressor::integer::IntRLEConfig +pub fn vortex_btrblocks::schemes::integer::SparseScheme::num_children(&self) -> usize -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::IntRLEConfig +pub fn vortex_btrblocks::schemes::integer::SparseScheme::scheme_name(&self) -> &'static str -pub struct vortex_btrblocks::compressor::integer::PcoScheme +pub fn vortex_btrblocks::schemes::integer::SparseScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions -impl core::clone::Clone for vortex_btrblocks::compressor::integer::PcoScheme +pub struct vortex_btrblocks::schemes::integer::ZigZagScheme -pub fn vortex_btrblocks::compressor::integer::PcoScheme::clone(&self) -> vortex_btrblocks::compressor::integer::PcoScheme +impl core::clone::Clone for vortex_btrblocks::schemes::integer::ZigZagScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::PcoScheme +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::clone(&self) -> vortex_btrblocks::schemes::integer::ZigZagScheme -impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::PcoScheme +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::ZigZagScheme -pub fn vortex_btrblocks::compressor::integer::PcoScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::PcoScheme) -> bool +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::ZigZagScheme -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::PcoScheme +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::ZigZagScheme) -> bool -pub fn vortex_btrblocks::compressor::integer::PcoScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::ZigZagScheme -impl core::marker::Copy for vortex_btrblocks::compressor::integer::PcoScheme +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::fmt(&self, f: &mut 
core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::PcoScheme +impl core::marker::Copy for vortex_btrblocks::schemes::integer::ZigZagScheme -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::PcoScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::ZigZagScheme -pub fn vortex_btrblocks::compressor::integer::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::ZigZagScheme -pub fn vortex_btrblocks::compressor::integer::PcoScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::integer::PcoScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::compressor::integer::PcoScheme::scheme_name(&self) -> &'static str +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: 
vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub struct vortex_btrblocks::compressor::integer::RunEndScheme +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl core::clone::Clone for vortex_btrblocks::compressor::integer::RunEndScheme +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::num_children(&self) -> usize -pub fn vortex_btrblocks::compressor::integer::RunEndScheme::clone(&self) -> vortex_btrblocks::compressor::integer::RunEndScheme +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::scheme_name(&self) -> &'static str -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::RunEndScheme +pub const vortex_btrblocks::schemes::integer::RLE_INTEGER_SCHEME: vortex_btrblocks::schemes::rle::RLEScheme -impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::RunEndScheme +pub mod vortex_btrblocks::schemes::string -pub fn vortex_btrblocks::compressor::integer::RunEndScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::RunEndScheme) -> bool +pub use vortex_btrblocks::schemes::string::StringConstantScheme -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::RunEndScheme +pub use vortex_btrblocks::schemes::string::StringDictScheme -pub fn vortex_btrblocks::compressor::integer::RunEndScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub use vortex_btrblocks::schemes::string::StringStats -impl core::marker::Copy for vortex_btrblocks::compressor::integer::RunEndScheme +pub use vortex_btrblocks::schemes::string::StringUncompressedScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::RunEndScheme +pub use vortex_btrblocks::schemes::string::is_utf8_string -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::RunEndScheme +pub struct vortex_btrblocks::schemes::string::FSSTScheme -pub fn 
vortex_btrblocks::compressor::integer::RunEndScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::clone::Clone for vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::compressor::integer::RunEndScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::string::FSSTScheme::clone(&self) -> vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::compressor::integer::RunEndScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +impl core::cmp::Eq for vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::compressor::integer::RunEndScheme::scheme_name(&self) -> &'static str +impl core::cmp::PartialEq for vortex_btrblocks::schemes::string::FSSTScheme -pub struct vortex_btrblocks::compressor::integer::SequenceScheme +pub fn vortex_btrblocks::schemes::string::FSSTScheme::eq(&self, other: &vortex_btrblocks::schemes::string::FSSTScheme) -> bool -impl core::clone::Clone for vortex_btrblocks::compressor::integer::SequenceScheme +impl core::fmt::Debug for vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::compressor::integer::SequenceScheme::clone(&self) -> vortex_btrblocks::compressor::integer::SequenceScheme +pub fn vortex_btrblocks::schemes::string::FSSTScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::SequenceScheme +impl core::marker::Copy for vortex_btrblocks::schemes::string::FSSTScheme -impl 
core::cmp::PartialEq for vortex_btrblocks::compressor::integer::SequenceScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::compressor::integer::SequenceScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::SequenceScheme) -> bool +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::FSSTScheme -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::SequenceScheme +pub fn vortex_btrblocks::schemes::string::FSSTScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::integer::SequenceScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::string::FSSTScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl core::marker::Copy for vortex_btrblocks::compressor::integer::SequenceScheme +pub fn vortex_btrblocks::schemes::string::FSSTScheme::num_children(&self) -> usize -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::SequenceScheme +pub fn vortex_btrblocks::schemes::string::FSSTScheme::scheme_name(&self) -> &'static str -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::SequenceScheme +pub struct vortex_btrblocks::schemes::string::NullDominatedSparseScheme -pub fn vortex_btrblocks::compressor::integer::SequenceScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::clone::Clone for vortex_btrblocks::schemes::string::NullDominatedSparseScheme -pub fn 
vortex_btrblocks::compressor::integer::SequenceScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::clone(&self) -> vortex_btrblocks::schemes::string::NullDominatedSparseScheme -pub fn vortex_btrblocks::compressor::integer::SequenceScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +impl core::cmp::Eq for vortex_btrblocks::schemes::string::NullDominatedSparseScheme -pub fn vortex_btrblocks::compressor::integer::SequenceScheme::scheme_name(&self) -> &'static str +impl core::cmp::PartialEq for vortex_btrblocks::schemes::string::NullDominatedSparseScheme -pub struct vortex_btrblocks::compressor::integer::SparseScheme +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::eq(&self, other: &vortex_btrblocks::schemes::string::NullDominatedSparseScheme) -> bool -impl core::clone::Clone for vortex_btrblocks::compressor::integer::SparseScheme +impl core::fmt::Debug for vortex_btrblocks::schemes::string::NullDominatedSparseScheme -pub fn vortex_btrblocks::compressor::integer::SparseScheme::clone(&self) -> vortex_btrblocks::compressor::integer::SparseScheme +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::SparseScheme +impl core::marker::Copy for vortex_btrblocks::schemes::string::NullDominatedSparseScheme -impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::SparseScheme +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::string::NullDominatedSparseScheme -pub fn vortex_btrblocks::compressor::integer::SparseScheme::eq(&self, other: 
&vortex_btrblocks::compressor::integer::SparseScheme) -> bool +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::NullDominatedSparseScheme -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::SparseScheme +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::compressor::integer::SparseScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::descendant_exclusions(&self) -> alloc::vec::Vec -impl core::marker::Copy for vortex_btrblocks::compressor::integer::SparseScheme +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::SparseScheme +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::SparseScheme +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::num_children(&self) -> usize -pub fn vortex_btrblocks::compressor::integer::SparseScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn 
vortex_btrblocks::schemes::string::NullDominatedSparseScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::compressor::integer::SparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub struct vortex_btrblocks::schemes::string::ZstdScheme -pub fn vortex_btrblocks::compressor::integer::SparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +impl core::clone::Clone for vortex_btrblocks::schemes::string::ZstdScheme -pub fn vortex_btrblocks::compressor::integer::SparseScheme::scheme_name(&self) -> &'static str +pub fn vortex_btrblocks::schemes::string::ZstdScheme::clone(&self) -> vortex_btrblocks::schemes::string::ZstdScheme -pub fn vortex_btrblocks::compressor::integer::SparseScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions +impl core::cmp::Eq for vortex_btrblocks::schemes::string::ZstdScheme -pub struct vortex_btrblocks::compressor::integer::UncompressedScheme +impl core::cmp::PartialEq for vortex_btrblocks::schemes::string::ZstdScheme -impl core::clone::Clone for vortex_btrblocks::compressor::integer::UncompressedScheme +pub fn vortex_btrblocks::schemes::string::ZstdScheme::eq(&self, other: &vortex_btrblocks::schemes::string::ZstdScheme) -> bool -pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::clone(&self) -> vortex_btrblocks::compressor::integer::UncompressedScheme +impl core::fmt::Debug for vortex_btrblocks::schemes::string::ZstdScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::UncompressedScheme +pub fn vortex_btrblocks::schemes::string::ZstdScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::cmp::PartialEq for 
vortex_btrblocks::compressor::integer::UncompressedScheme +impl core::marker::Copy for vortex_btrblocks::schemes::string::ZstdScheme -pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::UncompressedScheme) -> bool +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::string::ZstdScheme -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::UncompressedScheme +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::ZstdScheme -pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::string::ZstdScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::marker::Copy for vortex_btrblocks::compressor::integer::UncompressedScheme +pub fn vortex_btrblocks::schemes::string::ZstdScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::UncompressedScheme +pub fn vortex_btrblocks::schemes::string::ZstdScheme::scheme_name(&self) -> &'static str -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::UncompressedScheme +pub mod vortex_btrblocks::schemes::temporal -pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub struct vortex_btrblocks::schemes::temporal::TemporalScheme -pub fn 
vortex_btrblocks::compressor::integer::UncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +impl core::clone::Clone for vortex_btrblocks::schemes::temporal::TemporalScheme -pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::clone(&self) -> vortex_btrblocks::schemes::temporal::TemporalScheme -pub fn vortex_btrblocks::compressor::integer::UncompressedScheme::scheme_name(&self) -> &'static str +impl core::cmp::Eq for vortex_btrblocks::schemes::temporal::TemporalScheme -pub struct vortex_btrblocks::compressor::integer::ZigZagScheme +impl core::cmp::PartialEq for vortex_btrblocks::schemes::temporal::TemporalScheme -impl core::clone::Clone for vortex_btrblocks::compressor::integer::ZigZagScheme +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::eq(&self, other: &vortex_btrblocks::schemes::temporal::TemporalScheme) -> bool -pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::clone(&self) -> vortex_btrblocks::compressor::integer::ZigZagScheme +impl core::fmt::Debug for vortex_btrblocks::schemes::temporal::TemporalScheme -impl core::cmp::Eq for vortex_btrblocks::compressor::integer::ZigZagScheme +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::cmp::PartialEq for vortex_btrblocks::compressor::integer::ZigZagScheme +impl core::marker::Copy for vortex_btrblocks::schemes::temporal::TemporalScheme -pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::eq(&self, other: &vortex_btrblocks::compressor::integer::ZigZagScheme) -> bool +impl core::marker::StructuralPartialEq 
for vortex_btrblocks::schemes::temporal::TemporalScheme -impl core::fmt::Debug for vortex_btrblocks::compressor::integer::ZigZagScheme +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::temporal::TemporalScheme -pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::marker::Copy for vortex_btrblocks::compressor::integer::ZigZagScheme +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::detects_constant(&self) -> bool -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::integer::ZigZagScheme +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::integer::ZigZagScheme +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::num_children(&self) -> usize -pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::expected_compression_ratio(&self, compressor: 
&vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool - -pub fn vortex_btrblocks::compressor::integer::ZigZagScheme::scheme_name(&self) -> &'static str - -pub const vortex_btrblocks::compressor::integer::RLE_INTEGER_SCHEME: vortex_btrblocks::compressor::rle::RLEScheme - -pub mod vortex_btrblocks::compressor::string - -pub use vortex_btrblocks::compressor::string::StringStats - -pub struct vortex_btrblocks::compressor::string::ConstantScheme - -impl core::clone::Clone for vortex_btrblocks::compressor::string::ConstantScheme - -pub fn vortex_btrblocks::compressor::string::ConstantScheme::clone(&self) -> vortex_btrblocks::compressor::string::ConstantScheme - -impl core::cmp::Eq for vortex_btrblocks::compressor::string::ConstantScheme - -impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::ConstantScheme - -pub fn vortex_btrblocks::compressor::string::ConstantScheme::eq(&self, other: &vortex_btrblocks::compressor::string::ConstantScheme) -> bool - -impl core::fmt::Debug for vortex_btrblocks::compressor::string::ConstantScheme - -pub fn vortex_btrblocks::compressor::string::ConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::marker::Copy for vortex_btrblocks::compressor::string::ConstantScheme - -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::ConstantScheme - -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::ConstantScheme - -pub fn vortex_btrblocks::compressor::string::ConstantScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: 
vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::string::ConstantScheme::detects_constant(&self) -> bool - -pub fn vortex_btrblocks::compressor::string::ConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::string::ConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool - -pub fn vortex_btrblocks::compressor::string::ConstantScheme::scheme_name(&self) -> &'static str - -pub struct vortex_btrblocks::compressor::string::DictScheme - -impl core::clone::Clone for vortex_btrblocks::compressor::string::DictScheme - -pub fn vortex_btrblocks::compressor::string::DictScheme::clone(&self) -> vortex_btrblocks::compressor::string::DictScheme - -impl core::cmp::Eq for vortex_btrblocks::compressor::string::DictScheme - -impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::DictScheme - -pub fn vortex_btrblocks::compressor::string::DictScheme::eq(&self, other: &vortex_btrblocks::compressor::string::DictScheme) -> bool - -impl core::fmt::Debug for vortex_btrblocks::compressor::string::DictScheme - -pub fn vortex_btrblocks::compressor::string::DictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::marker::Copy for vortex_btrblocks::compressor::string::DictScheme - -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::DictScheme - -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::DictScheme - -pub fn vortex_btrblocks::compressor::string::DictScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut 
vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::string::DictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::string::DictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool - -pub fn vortex_btrblocks::compressor::string::DictScheme::scheme_name(&self) -> &'static str - -pub fn vortex_btrblocks::compressor::string::DictScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions - -pub struct vortex_btrblocks::compressor::string::FSSTScheme - -impl core::clone::Clone for vortex_btrblocks::compressor::string::FSSTScheme - -pub fn vortex_btrblocks::compressor::string::FSSTScheme::clone(&self) -> vortex_btrblocks::compressor::string::FSSTScheme - -impl core::cmp::Eq for vortex_btrblocks::compressor::string::FSSTScheme - -impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::FSSTScheme - -pub fn vortex_btrblocks::compressor::string::FSSTScheme::eq(&self, other: &vortex_btrblocks::compressor::string::FSSTScheme) -> bool - -impl core::fmt::Debug for vortex_btrblocks::compressor::string::FSSTScheme - -pub fn vortex_btrblocks::compressor::string::FSSTScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::marker::Copy for vortex_btrblocks::compressor::string::FSSTScheme - -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::FSSTScheme - -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::FSSTScheme - -pub fn vortex_btrblocks::compressor::string::FSSTScheme::compress(&self, 
compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::string::FSSTScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool - -pub fn vortex_btrblocks::compressor::string::FSSTScheme::scheme_name(&self) -> &'static str - -pub struct vortex_btrblocks::compressor::string::NullDominated - -impl core::clone::Clone for vortex_btrblocks::compressor::string::NullDominated - -pub fn vortex_btrblocks::compressor::string::NullDominated::clone(&self) -> vortex_btrblocks::compressor::string::NullDominated - -impl core::cmp::Eq for vortex_btrblocks::compressor::string::NullDominated - -impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::NullDominated - -pub fn vortex_btrblocks::compressor::string::NullDominated::eq(&self, other: &vortex_btrblocks::compressor::string::NullDominated) -> bool - -impl core::fmt::Debug for vortex_btrblocks::compressor::string::NullDominated - -pub fn vortex_btrblocks::compressor::string::NullDominated::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::marker::Copy for vortex_btrblocks::compressor::string::NullDominated - -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::NullDominated - -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::NullDominated - -pub fn vortex_btrblocks::compressor::string::NullDominated::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::string::NullDominated::expected_compression_ratio(&self, _compressor: 
&vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::string::NullDominated::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool - -pub fn vortex_btrblocks::compressor::string::NullDominated::scheme_name(&self) -> &'static str - -pub struct vortex_btrblocks::compressor::string::UncompressedScheme - -impl core::clone::Clone for vortex_btrblocks::compressor::string::UncompressedScheme - -pub fn vortex_btrblocks::compressor::string::UncompressedScheme::clone(&self) -> vortex_btrblocks::compressor::string::UncompressedScheme - -impl core::cmp::Eq for vortex_btrblocks::compressor::string::UncompressedScheme - -impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::UncompressedScheme - -pub fn vortex_btrblocks::compressor::string::UncompressedScheme::eq(&self, other: &vortex_btrblocks::compressor::string::UncompressedScheme) -> bool - -impl core::fmt::Debug for vortex_btrblocks::compressor::string::UncompressedScheme - -pub fn vortex_btrblocks::compressor::string::UncompressedScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::marker::Copy for vortex_btrblocks::compressor::string::UncompressedScheme - -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::UncompressedScheme - -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::UncompressedScheme - -pub fn vortex_btrblocks::compressor::string::UncompressedScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn 
vortex_btrblocks::compressor::string::UncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::string::UncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool - -pub fn vortex_btrblocks::compressor::string::UncompressedScheme::scheme_name(&self) -> &'static str - -pub struct vortex_btrblocks::compressor::string::ZstdScheme - -impl core::clone::Clone for vortex_btrblocks::compressor::string::ZstdScheme - -pub fn vortex_btrblocks::compressor::string::ZstdScheme::clone(&self) -> vortex_btrblocks::compressor::string::ZstdScheme - -impl core::cmp::Eq for vortex_btrblocks::compressor::string::ZstdScheme - -impl core::cmp::PartialEq for vortex_btrblocks::compressor::string::ZstdScheme - -pub fn vortex_btrblocks::compressor::string::ZstdScheme::eq(&self, other: &vortex_btrblocks::compressor::string::ZstdScheme) -> bool - -impl core::fmt::Debug for vortex_btrblocks::compressor::string::ZstdScheme - -pub fn vortex_btrblocks::compressor::string::ZstdScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::marker::Copy for vortex_btrblocks::compressor::string::ZstdScheme - -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::string::ZstdScheme - -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::string::ZstdScheme - -pub fn vortex_btrblocks::compressor::string::ZstdScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn 
vortex_btrblocks::compressor::string::ZstdScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool - -pub fn vortex_btrblocks::compressor::string::ZstdScheme::scheme_name(&self) -> &'static str - -pub mod vortex_btrblocks::compressor::temporal - -pub struct vortex_btrblocks::compressor::temporal::TemporalScheme - -impl core::clone::Clone for vortex_btrblocks::compressor::temporal::TemporalScheme - -pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::clone(&self) -> vortex_btrblocks::compressor::temporal::TemporalScheme - -impl core::cmp::Eq for vortex_btrblocks::compressor::temporal::TemporalScheme - -impl core::cmp::PartialEq for vortex_btrblocks::compressor::temporal::TemporalScheme - -pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::eq(&self, other: &vortex_btrblocks::compressor::temporal::TemporalScheme) -> bool - -impl core::fmt::Debug for vortex_btrblocks::compressor::temporal::TemporalScheme - -pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -impl core::marker::Copy for vortex_btrblocks::compressor::temporal::TemporalScheme - -impl core::marker::StructuralPartialEq for vortex_btrblocks::compressor::temporal::TemporalScheme - -impl vortex_compressor::scheme::Scheme for vortex_btrblocks::compressor::temporal::TemporalScheme - -pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::detects_constant(&self) -> bool - -pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut 
vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext, _excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult - -pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool - -pub fn vortex_btrblocks::compressor::temporal::TemporalScheme::scheme_name(&self) -> &'static str +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::scheme_name(&self) -> &'static str pub struct vortex_btrblocks::BtrBlocksCompressor(pub vortex_compressor::compressor::CascadingCompressor) @@ -895,5 +671,3 @@ pub const vortex_btrblocks::ALL_SCHEMES: &[&dyn vortex_compressor::scheme::Schem pub fn vortex_btrblocks::compress_patches(patches: &vortex_array::patches::Patches) -> vortex_error::VortexResult pub fn vortex_btrblocks::default_excluded() -> vortex_utils::aliases::hash_set::HashSet - -pub fn vortex_btrblocks::integer_dictionary_encode(stats: &vortex_compressor::stats::integer::IntegerStats) -> vortex_array::arrays::dict::array::DictArray diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 745e4755740..26b4c1ab727 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -10,6 +10,11 @@ use crate::CascadingCompressor; use crate::Scheme; use crate::SchemeExt; use crate::SchemeId; +use crate::schemes::decimal; +use crate::schemes::float; +use crate::schemes::integer; +use crate::schemes::string; +use crate::schemes::temporal; /// All available compression schemes. /// @@ -17,58 +22,58 @@ use crate::SchemeId; /// the final scheme list, so that tie-breaking is deterministic. pub const ALL_SCHEMES: &[&dyn Scheme] = &[ // Integer schemes. - &crate::schemes::integer::UncompressedScheme as &dyn Scheme, - &crate::schemes::integer::ConstantScheme, - // NOTE: For must precede BitPacking to avoid unnecessary patches. 
- &crate::schemes::integer::FORScheme, - &crate::schemes::integer::BitPackingScheme, - &crate::schemes::integer::ZigZagScheme, - &crate::schemes::integer::SparseScheme, - &crate::schemes::integer::DictScheme, - &crate::schemes::integer::RunEndScheme, - &crate::schemes::integer::SequenceScheme, - &crate::schemes::integer::RLE_INTEGER_SCHEME, + &integer::IntUncompressedScheme as &dyn Scheme, + &integer::IntConstantScheme, + // NOTE: FoR must precede BitPacking to avoid unnecessary patches. + &integer::FoRScheme, + &integer::BitPackingScheme, + &integer::ZigZagScheme, + &integer::SparseScheme, + &integer::IntDictScheme, + &integer::RunEndScheme, + &integer::SequenceScheme, + &integer::RLE_INTEGER_SCHEME, #[cfg(feature = "pco")] - &crate::schemes::integer::PcoScheme, + &integer::PcoScheme, // Float schemes. - &crate::schemes::float::UncompressedScheme, - &crate::schemes::float::ConstantScheme, - &crate::schemes::float::ALPScheme, - &crate::schemes::float::ALPRDScheme, - &crate::schemes::float::DictScheme, - &crate::schemes::float::NullDominated, - &crate::schemes::float::RLE_FLOAT_SCHEME, + &float::FloatUncompressedScheme, + &float::FloatConstantScheme, + &float::ALPScheme, + &float::ALPRDScheme, + &float::FloatDictScheme, + &float::NullDominatedSparseScheme, + &float::RLE_FLOAT_SCHEME, #[cfg(feature = "pco")] - &crate::schemes::float::PcoScheme, + &float::PcoScheme, // Decimal schemes. - &crate::schemes::decimal::DecimalScheme, + &decimal::DecimalScheme, // Temporal schemes. - &crate::schemes::temporal::TemporalScheme, + &temporal::TemporalScheme, // String schemes. 
- &crate::schemes::string::UncompressedScheme, - &crate::schemes::string::DictScheme, - &crate::schemes::string::FSSTScheme, - &crate::schemes::string::ConstantScheme, - &crate::schemes::string::NullDominated, + &string::StringUncompressedScheme, + &string::StringDictScheme, + &string::FSSTScheme, + &string::StringConstantScheme, + &string::NullDominatedSparseScheme, #[cfg(feature = "zstd")] - &crate::schemes::string::ZstdScheme, + &string::ZstdScheme, #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] - &crate::schemes::string::ZstdBuffersScheme, + &string::ZstdBuffersScheme, ]; /// Returns the set of scheme IDs excluded by default (behind feature gates or known-expensive). pub fn default_excluded() -> HashSet { - #[allow(unused_mut)] + #[allow(unused_mut, reason = "depends on enabled feature flags")] let mut excluded = HashSet::new(); #[cfg(feature = "pco")] { - excluded.insert(crate::schemes::integer::PcoScheme.id()); - excluded.insert(crate::schemes::float::PcoScheme.id()); + excluded.insert(integer::PcoScheme.id()); + excluded.insert(float::PcoScheme.id()); } #[cfg(feature = "zstd")] - excluded.insert(crate::schemes::string::ZstdScheme.id()); + excluded.insert(string::ZstdScheme.id()); #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] - excluded.insert(crate::schemes::string::ZstdBuffersScheme.id()); + excluded.insert(string::ZstdBuffersScheme.id()); excluded } @@ -81,20 +86,20 @@ pub fn default_excluded() -> HashSet { /// /// ```rust /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, Scheme, SchemeExt}; -/// use vortex_btrblocks::schemes::integer::DictScheme; +/// use vortex_btrblocks::schemes::integer::IntDictScheme; /// /// // Default compressor - all non-excluded schemes allowed. /// let compressor = BtrBlocksCompressorBuilder::default().build(); /// /// // Exclude specific schemes. 
/// let compressor = BtrBlocksCompressorBuilder::default() -/// .exclude([DictScheme.id()]) +/// .exclude([IntDictScheme.id()]) /// .build(); /// /// // Exclude then re-include. /// let compressor = BtrBlocksCompressorBuilder::default() -/// .exclude([DictScheme.id()]) -/// .include([DictScheme.id()]) +/// .exclude([IntDictScheme.id()]) +/// .include([IntDictScheme.id()]) /// .build(); /// ``` #[derive(Debug, Clone)] diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 2705d1c2527..4ba118defc9 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -20,14 +20,14 @@ use crate::CascadingCompressor; /// /// ```rust /// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme, SchemeExt}; -/// use vortex_btrblocks::schemes::integer::DictScheme; +/// use vortex_btrblocks::schemes::integer::IntDictScheme; /// /// // Default compressor - all schemes allowed. /// let compressor = BtrBlocksCompressor::default(); /// /// // Exclude specific schemes using the builder. /// let compressor = BtrBlocksCompressorBuilder::default() -/// .exclude([DictScheme.id()]) +/// .exclude([IntDictScheme.id()]) /// .build(); /// ``` #[derive(Clone)] diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 3283513f0b6..26dc56b0d8f 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -41,7 +41,7 @@ //! //! ```rust //! use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme, SchemeExt}; -//! use vortex_btrblocks::schemes::integer::DictScheme; +//! use vortex_btrblocks::schemes::integer::IntDictScheme; //! use vortex_array::DynArray; //! //! // Default compressor with all schemes enabled. @@ -49,7 +49,7 @@ //! //! // Configure with builder to exclude specific schemes. //! let compressor = BtrBlocksCompressorBuilder::default() -//! .exclude([DictScheme.id()]) +//! .exclude([IntDictScheme.id()]) //! 
.build(); //! ``` //! diff --git a/vortex-btrblocks/src/schemes/decimal.rs b/vortex-btrblocks/src/schemes/decimal.rs index 8ccfcc6697f..c4ad932b765 100644 --- a/vortex-btrblocks/src/schemes/decimal.rs +++ b/vortex-btrblocks/src/schemes/decimal.rs @@ -22,8 +22,8 @@ use crate::SchemeExt; /// Compression scheme for decimal arrays via byte-part decomposition. /// -/// Narrows the decimal to the smallest integer type, compresses the underlying primitive, -/// and wraps the result in a [`DecimalBytePartsArray`]. +/// Narrows the decimal to the smallest integer type, compresses the underlying primitive, and wraps +/// the result in a [`DecimalBytePartsArray`]. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct DecimalScheme; @@ -47,15 +47,14 @@ impl Scheme for DecimalScheme { _ctx: CompressorContext, ) -> VortexResult { // Decimal compression is almost always beneficial (narrowing + primitive compression). - // Return a moderate ratio to ensure this scheme is always selected. - Ok(2.0) + Ok(f64::MAX) } fn compress( &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, - _ctx: CompressorContext, + ctx: CompressorContext, ) -> VortexResult { // TODO(joe): add support splitting i128/256 buffers into chunks of primitive values // for compression. 2 for i128 and 4 for i256. 
@@ -70,8 +69,7 @@ impl Scheme for DecimalScheme { _ => return Ok(decimal.into_array()), }; - let ctx = CompressorContext::default().descend_with_scheme(self.id(), 0); - let compressed = compressor.compress_canonical(Canonical::Primitive(prim), ctx)?; + let compressed = compressor.compress_child(&prim.into_array(), &ctx, self.id(), 0)?; DecimalBytePartsArray::try_new(compressed, decimal.decimal_dtype()).map(|d| d.into_array()) } diff --git a/vortex-btrblocks/src/schemes/float.rs b/vortex-btrblocks/src/schemes/float.rs index 8279d0eb086..5355f0adde5 100644 --- a/vortex-btrblocks/src/schemes/float.rs +++ b/vortex-btrblocks/src/schemes/float.rs @@ -1,7 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -// Re-export builtin schemes from vortex-compressor. +//! Float compression schemes. + use vortex_alp::ALP; use vortex_alp::ALPArray; use vortex_alp::RDEncoder; @@ -12,13 +13,8 @@ use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; use vortex_array::dtype::PType; -pub use vortex_compressor::builtins::FloatConstantScheme as ConstantScheme; -pub use vortex_compressor::builtins::FloatDictScheme as DictScheme; -pub use vortex_compressor::builtins::FloatUncompressedScheme as UncompressedScheme; -pub use vortex_compressor::builtins::is_float_primitive; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; -pub use vortex_compressor::stats::FloatStats; use vortex_error::VortexResult; use vortex_error::vortex_panic; use vortex_sparse::Sparse; @@ -45,8 +41,10 @@ pub struct ALPScheme; pub struct ALPRDScheme; /// Sparse encoding for null-dominated float arrays. +/// +/// This is the same as the integer `SparseScheme`, but we only use this for null-dominated arrays. #[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct NullDominated; +pub struct NullDominatedSparseScheme; /// Pco (pcodec) compression for floats. 
#[cfg(feature = "pco")] @@ -57,6 +55,13 @@ pub struct PcoScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FloatRLEConfig; +// Re-export builtin schemes from vortex-compressor. +pub use vortex_compressor::builtins::FloatConstantScheme; +pub use vortex_compressor::builtins::FloatDictScheme; +pub use vortex_compressor::builtins::FloatUncompressedScheme; +pub use vortex_compressor::builtins::is_float_primitive; +pub use vortex_compressor::stats::FloatStats; + impl rle::RLEConfig for FloatRLEConfig { type Stats = FloatStats; @@ -115,6 +120,12 @@ impl Scheme for ALPScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { + // ALP encodes floats as integers. Without integer compression afterward, the + // encoded ints are the same size. + if ctx.finished_cascading() { + return Ok(0.0); + } + let stats = data.float_stats(); // We don't support ALP for f16. @@ -122,12 +133,6 @@ impl Scheme for ALPScheme { return Ok(0.0); } - if ctx.finished_cascading() { - // ALP does not compress on its own, we need to be able to cascade it with - // an integer compressor. - return Ok(0.0); - } - estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } @@ -144,13 +149,12 @@ impl Scheme for ALPScheme { let alp_ints = alp.encoded().to_primitive(); // Compress the ALP ints. + + let compressed_alp_ints = + compressor.compress_child(&alp_ints.into_array(), &ctx, self.id(), 0)?; + // Patches are not compressed. They should be infrequent, and if they are not then we want // to keep them linear for easy indexing. 
- let compressed_alp_ints = compressor.compress_canonical( - Canonical::Primitive(alp_ints), - ctx.descend_with_scheme(self.id(), 0), - )?; - let patches = alp.patches().map(compress_patches).transpose()?; Ok(ALPArray::new(compressed_alp_ints, alp.exponents(), patches).into_array()) @@ -207,7 +211,7 @@ impl Scheme for ALPRDScheme { } } -impl Scheme for NullDominated { +impl Scheme for NullDominatedSparseScheme { fn scheme_name(&self) -> &'static str { "vortex.float.sparse" } @@ -220,6 +224,8 @@ impl Scheme for NullDominated { 1 } + // TODO(connor): There seems to be stuff missing here... + /// The indices of a null-dominated sparse array should not be sparse-encoded again. fn descendant_exclusions(&self) -> Vec { vec![DescendantExclusion { excluded: IntSparseScheme.id(), @@ -231,13 +237,8 @@ impl Scheme for NullDominated { &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, - ctx: CompressorContext, + _ctx: CompressorContext, ) -> VortexResult { - // Only use `SparseScheme` if we can cascade. - if ctx.finished_cascading() { - return Ok(0.0); - } - let stats = data.float_stats(); if stats.value_count() == 0 { @@ -260,8 +261,6 @@ impl Scheme for NullDominated { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - assert!(!ctx.finished_cascading()); - let stats = data.float_stats(); // We pass None as we only run this pathway for NULL-dominated float arrays. 
@@ -269,9 +268,11 @@ impl Scheme for NullDominated { if let Some(sparse) = sparse_encoded.as_opt::() { let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = compressor.compress_canonical( - Canonical::Primitive(indices.to_primitive()), - ctx.descend_with_scheme(self.id(), 0), + let compressed_indices = compressor.compress_child( + &indices.to_primitive().into_array(), + &ctx, + self.id(), + 0, )?; SparseArray::try_new( @@ -328,14 +329,12 @@ mod tests { use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_buffer::buffer_mut; + use vortex_compressor::CascadingCompressor; use vortex_error::VortexResult; + use vortex_fastlanes::RLE; use super::RLE_FLOAT_SCHEME; - use crate::ArrayAndStats; use crate::BtrBlocksCompressor; - use crate::CompressorContext; - use crate::GenerateStatsOptions; - use crate::Scheme; #[test] fn test_empty() -> VortexResult<()> { @@ -376,11 +375,10 @@ mod tests { values.extend(iter::repeat_n(3.15f32, 150)); let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let array_ref = array.into_array(); - let mut data = ArrayAndStats::new(array_ref.clone(), GenerateStatsOptions::default()); - let compressed = - RLE_FLOAT_SCHEME.compress(&btr, &mut data, CompressorContext::default())?; + + let compressor = CascadingCompressor::new(vec![&RLE_FLOAT_SCHEME]); + let compressed = compressor.compress(&array.into_array())?; + assert!(compressed.is::()); let expected = Buffer::copy_from(&values).into_array(); assert_arrays_eq!(compressed.as_ref(), expected.as_ref()); diff --git a/vortex-btrblocks/src/schemes/integer.rs b/vortex-btrblocks/src/schemes/integer.rs index 574c8d56bbb..c45df25c45e 100644 --- a/vortex-btrblocks/src/schemes/integer.rs +++ b/vortex-btrblocks/src/schemes/integer.rs @@ -1,7 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -// Re-export builtin 
schemes from vortex-compressor. +//! Integer compression schemes. + use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; @@ -9,14 +10,11 @@ use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::scalar::Scalar; -pub use vortex_compressor::builtins::IntConstantScheme as ConstantScheme; -pub use vortex_compressor::builtins::IntDictScheme as DictScheme; -pub use vortex_compressor::builtins::IntUncompressedScheme as UncompressedScheme; -pub use vortex_compressor::builtins::is_integer_primitive; +use vortex_compressor::builtins::FloatDictScheme; +use vortex_compressor::builtins::StringDictScheme; use vortex_compressor::scheme::AncestorExclusion; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; -pub use vortex_compressor::stats::IntegerStats; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -47,7 +45,7 @@ use crate::schemes::rle::RLEStats; /// Frame of Reference encoding. #[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct FORScheme; +pub struct FoRScheme; /// ZigZag encoding for negative integers. #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -74,6 +72,13 @@ pub struct SequenceScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct PcoScheme; +// Re-export builtin schemes from vortex-compressor. +pub use vortex_compressor::builtins::IntConstantScheme; +pub use vortex_compressor::builtins::IntDictScheme; +pub use vortex_compressor::builtins::IntUncompressedScheme; +pub use vortex_compressor::builtins::is_integer_primitive; +pub use vortex_compressor::stats::IntegerStats; + /// Threshold for the average run length in an array before we consider run-end encoding. const RUN_END_THRESHOLD: u32 = 4; @@ -120,7 +125,7 @@ impl RLEStats for IntegerStats { /// RLE scheme for integer compression. 
pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new(); -impl Scheme for FORScheme { +impl Scheme for FoRScheme { fn scheme_name(&self) -> &'static str { "vortex.int.for" } @@ -135,7 +140,8 @@ impl Scheme for FORScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - // Only apply if we are not at the leaf. + // FoR only subtracts the min. Without further compression (e.g. BitPacking), + // the output is the same size. if ctx.finished_cascading() { return Ok(0.0); } @@ -223,10 +229,13 @@ impl Scheme for ZigZagScheme { 1 } + /// Container-style schemes (Dict, RunEnd, Sparse) restructure data and should only be + /// applied once in a cascade chain. ZigZag's output is a simple value transformation, + /// so further restructuring is wasteful. fn descendant_exclusions(&self) -> Vec { vec![ DescendantExclusion { - excluded: DictScheme.id(), + excluded: IntDictScheme.id(), children: ChildSelection::All, }, DescendantExclusion { @@ -246,7 +255,8 @@ impl Scheme for ZigZagScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - // ZigZag is only useful when we cascade it with another encoding. + // ZigZag only transforms negative values to positive. Without further compression, + // the output is the same size. if ctx.finished_cascading() { return Ok(0.0); } @@ -279,8 +289,7 @@ impl Scheme for ZigZagScheme { let zag = zigzag_encode(stats.source().clone())?; let encoded = zag.encoded().to_primitive(); - let child_ctx = ctx.descend_with_scheme(self.id(), 0); - let compressed = compressor.compress_canonical(Canonical::Primitive(encoded), child_ctx)?; + let compressed = compressor.compress_child(&encoded.into_array(), &ctx, self.id(), 0)?; tracing::debug!("zigzag output: {}", compressed.encoding_id()); @@ -360,9 +369,11 @@ impl Scheme for SparseScheme { 2 } + /// Sparse values and indices are already low-cardinality by construction, so dictionary + /// encoding doesn't add anything. 
fn descendant_exclusions(&self) -> Vec { vec![DescendantExclusion { - excluded: DictScheme.id(), + excluded: IntDictScheme.id(), children: ChildSelection::All, }] } @@ -371,13 +382,8 @@ impl Scheme for SparseScheme { &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, - ctx: CompressorContext, + _ctx: CompressorContext, ) -> VortexResult { - // Only use `SparseScheme` if we can cascade. - if ctx.finished_cascading() { - return Ok(0.0); - } - // We use `generate()` (not `generate_opts` with `count_distinct_values: true`) // because the cache is pre-populated by `choose_and_compress` with the merged // `stats_options` from all eligible schemes. Since this scheme declares @@ -423,8 +429,6 @@ impl Scheme for SparseScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - assert!(!ctx.finished_cascading()); - let stats = data.integer_stats(); let (top_pvalue, top_count) = stats @@ -456,17 +460,17 @@ impl Scheme for SparseScheme { )?; if let Some(sparse) = sparse_encoded.as_opt::() { - let values_ctx = ctx.clone().descend_with_scheme(self.id(), 0); - let compressed_values = compressor.compress_canonical( - Canonical::Primitive(sparse.patches().values().to_primitive()), - values_ctx, + let compressed_values = compressor.compress_child( + &sparse.patches().values().to_primitive().into_array(), + &ctx, + self.id(), + 0, )?; let indices = sparse.patches().indices().to_primitive().narrow()?; - let indices_ctx = ctx.descend_with_scheme(self.id(), 1); let compressed_indices = - compressor.compress_canonical(Canonical::Primitive(indices), indices_ctx)?; + compressor.compress_child(&indices.into_array(), &ctx, self.id(), 1)?; SparseArray::try_new( compressed_indices, @@ -494,24 +498,21 @@ impl Scheme for RunEndScheme { 2 } + /// Run-end values and ends are already deduplicated, so dictionary encoding doesn't add + /// anything. 
fn descendant_exclusions(&self) -> Vec { vec![DescendantExclusion { - excluded: DictScheme.id(), + excluded: IntDictScheme.id(), children: ChildSelection::All, }] } + // TODO(connor): There seems to be stuff missing here... fn ancestor_exclusions(&self) -> Vec { - use vortex_compressor::builtins::FloatDictScheme; - - vec![ - // Exclude from FloatDict values child (child 0). This replaces the old ALP - // conditional propagation of float RLE exclusion to integer RunEnd. - AncestorExclusion { - ancestor: FloatDictScheme.id(), - children: ChildSelection::One(0), - }, - ] + vec![AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(0), + }] } fn expected_compression_ratio( @@ -527,10 +528,6 @@ impl Scheme for RunEndScheme { return Ok(0.0); } - if ctx.finished_cascading() { - return Ok(0.0); - } - // Run compression on a sample, see how it performs. estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) } @@ -541,20 +538,16 @@ impl Scheme for RunEndScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - assert!(!ctx.finished_cascading()); - let stats = data.integer_stats(); // Run-end encode the ends. let (ends, values) = runend_encode(stats.source()); - let values_ctx = ctx.clone().descend_with_scheme(self.id(), 0); - let compressed_values = compressor - .compress_canonical(Canonical::Primitive(values.to_primitive()), values_ctx)?; + let compressed_values = + compressor.compress_child(&values.to_primitive().into_array(), &ctx, self.id(), 0)?; - let ends_ctx = ctx.descend_with_scheme(self.id(), 1); let compressed_ends = - compressor.compress_canonical(Canonical::Primitive(ends.to_primitive()), ends_ctx)?; + compressor.compress_child(&ends.to_primitive().into_array(), &ctx, self.id(), 1)?; // SAFETY: compression doesn't affect invariants. 
unsafe { @@ -578,22 +571,19 @@ impl Scheme for SequenceScheme { is_integer_primitive(canonical) } + /// Sequence encoding on dictionary codes just adds a layer of indirection without compressing + /// the data. Dict codes are compact integers that benefit from BitPacking or FoR, not from + /// sequence detection. fn ancestor_exclusions(&self) -> Vec { - use vortex_compressor::builtins::FloatDictScheme; - use vortex_compressor::builtins::StringDictScheme; - vec![ - // Exclude from IntDict codes. AncestorExclusion { - ancestor: DictScheme.id(), + ancestor: IntDictScheme.id(), children: ChildSelection::All, }, - // Exclude from FloatDict codes (child 1). AncestorExclusion { ancestor: FloatDictScheme.id(), children: ChildSelection::One(1), }, - // Exclude from StringDict codes (child 1). AncestorExclusion { ancestor: StringDictScheme.id(), children: ChildSelection::One(1), @@ -708,18 +698,14 @@ mod tests { use vortex_buffer::Buffer; use vortex_buffer::BufferMut; use vortex_buffer::buffer; + use vortex_compressor::CascadingCompressor; use vortex_error::VortexResult; + use vortex_fastlanes::RLE; use vortex_sequence::Sequence; use vortex_sparse::Sparse; use super::RLE_INTEGER_SCHEME; - use super::SequenceScheme; - use super::SparseScheme; - use crate::ArrayAndStats; use crate::BtrBlocksCompressor; - use crate::CompressorContext; - use crate::GenerateStatsOptions; - use crate::Scheme; #[test] fn test_empty() -> VortexResult<()> { @@ -758,29 +744,6 @@ mod tests { Ok(()) } - #[test] - fn sparse_with_nulls() -> VortexResult<()> { - let array = PrimitiveArray::new( - buffer![189u8, 189, 189, 0, 46], - Validity::from_iter(vec![true, true, true, true, false]), - ); - let btr = BtrBlocksCompressor::default(); - // SparseScheme needs distinct values. 
- let mut data = ArrayAndStats::new( - array.clone().into_array(), - GenerateStatsOptions { - count_distinct_values: true, - }, - ); - let compressed = SparseScheme.compress(&btr, &mut data, CompressorContext::default())?; - assert!(compressed.is::()); - let decoded = compressed.clone(); - let expected = - PrimitiveArray::new(buffer![189u8, 189, 189, 0, 0], array.validity()?).into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - #[test] fn sparse_mostly_nulls() -> VortexResult<()> { let array = PrimitiveArray::new( @@ -789,22 +752,15 @@ mod tests { false, false, false, false, false, false, false, false, false, false, true, ]), ); + let validity = array.validity()?; + let btr = BtrBlocksCompressor::default(); - // SparseScheme needs distinct values. - let mut data = ArrayAndStats::new( - array.clone().into_array(), - GenerateStatsOptions { - count_distinct_values: true, - }, - ); - let compressed = SparseScheme.compress(&btr, &mut data, CompressorContext::default())?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); + let decoded = compressed.clone(); - let expected = PrimitiveArray::new( - buffer![0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46], - array.validity()?, - ) - .into_array(); + let expected = + PrimitiveArray::new(buffer![0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46], validity).into_array(); assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); Ok(()) } @@ -813,11 +769,11 @@ mod tests { fn nullable_sequence() -> VortexResult<()> { let values = (0i32..20).step_by(7).collect_vec(); let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some)); + let btr = BtrBlocksCompressor::default(); - let array_ref = array.into_array(); - let mut data = ArrayAndStats::new(array_ref.clone(), GenerateStatsOptions::default()); - let compressed = SequenceScheme.compress(&btr, &mut data, CompressorContext::default())?; + let compressed = btr.compress(&array.into_array())?; assert!(compressed.is::()); + let 
decoded = compressed; let expected = PrimitiveArray::from_option_iter(values.into_iter().map(Some)).into_array(); assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); @@ -832,15 +788,12 @@ mod tests { values.extend(iter::repeat_n(987i32, 150)); let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let array_ref = array.into_array(); - let mut data = ArrayAndStats::new(array_ref.clone(), GenerateStatsOptions::default()); - let compressed = - RLE_INTEGER_SCHEME.compress(&btr, &mut data, CompressorContext::default())?; + let compressor = CascadingCompressor::new(vec![&RLE_INTEGER_SCHEME]); + let compressed = compressor.compress(&array.into_array())?; + assert!(compressed.is::()); - let decoded = compressed; let expected = Buffer::copy_from(&values).into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); + assert_arrays_eq!(compressed.as_ref(), expected.as_ref()); Ok(()) } diff --git a/vortex-btrblocks/src/schemes/mod.rs b/vortex-btrblocks/src/schemes/mod.rs index cea3bc0ec09..13f1bfecd25 100644 --- a/vortex-btrblocks/src/schemes/mod.rs +++ b/vortex-btrblocks/src/schemes/mod.rs @@ -3,15 +3,12 @@ //! Compression scheme implementations. -/// Decimal compression schemes. -pub mod decimal; -/// Float compression schemes. pub mod float; -/// Integer compression schemes. pub mod integer; -pub(crate) mod patches; -pub(crate) mod rle; -/// String compression schemes. pub mod string; -/// Temporal compression schemes. 
+ +pub mod decimal; pub mod temporal; + +pub(crate) mod patches; +pub(crate) mod rle; diff --git a/vortex-btrblocks/src/schemes/rle.rs b/vortex-btrblocks/src/schemes/rle.rs index 8a41a981cf5..0db6f680bb8 100644 --- a/vortex-btrblocks/src/schemes/rle.rs +++ b/vortex-btrblocks/src/schemes/rle.rs @@ -11,6 +11,8 @@ use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; +#[cfg(feature = "unstable_encodings")] +use vortex_compressor::scheme::SchemeId; use vortex_error::VortexResult; use vortex_fastlanes::RLEArray; @@ -20,7 +22,7 @@ use crate::CompressorContext; use crate::Scheme; use crate::SchemeExt; use crate::estimate_compression_ratio_with_sampling; -use crate::schemes::integer::DictScheme as IntDictScheme; +use crate::schemes::integer::IntDictScheme; /// Threshold for the average run length in an array before we consider run-length encoding. pub const RUN_LENGTH_THRESHOLD: u32 = 4; @@ -35,6 +37,8 @@ pub trait RLEStats { fn source(&self) -> &PrimitiveArray; } +// TODO(connor): This trait is super confusing, we should probably just remove it and hardcode the +// only 2 implementations (integer and float). /// Configuration trait for RLE schemes. /// /// Implement this trait to define the behavior of an RLE scheme for a specific @@ -92,6 +96,9 @@ impl Scheme for RLEScheme { 3 } + /// RLE indices (child 1) and offsets (child 2) are monotonically increasing, so dictionary + /// encoding is pointless. + /// Values (child 0) are not excluded since they may benefit from dict encoding. fn descendant_exclusions(&self) -> Vec { vec![DescendantExclusion { excluded: IntDictScheme.id(), @@ -106,10 +113,6 @@ impl Scheme for RLEScheme { ctx: CompressorContext, ) -> VortexResult { // RLE is only useful when we cascade it with another encoding. 
- if ctx.finished_cascading() { - return Ok(0.0); - } - let array = data.array().clone(); let stats = data.get_or_insert_with::(|| C::generate_stats(&array)); @@ -137,34 +140,40 @@ impl Scheme for RLEScheme { let stats = data.get_or_insert_with::(|| C::generate_stats(&array)); let rle_array = RLEArray::encode(RLEStats::source(stats))?; - if ctx.finished_cascading() { - return Ok(rle_array.into_array()); - } - - let compressed_values = C::compress_values( - compressor, - &rle_array.values().to_primitive(), - ctx.clone().descend_with_scheme(self.id(), 0), + let compressed_values = compressor.compress_child( + &rle_array.values().to_primitive().into_array(), + &ctx, + self.id(), + 0, )?; // Delta in an unstable encoding, once we deem it stable we can switch over to this always. #[cfg(feature = "unstable_encodings")] let compressed_indices = try_compress_delta( - &rle_array.indices().to_primitive().narrow()?, compressor, - ctx.descend(), - &[IntDictScheme.id()], + &rle_array.indices().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, )?; #[cfg(not(feature = "unstable_encodings"))] - let compressed_indices = compressor.compress_canonical( - Canonical::Primitive(rle_array.indices().to_primitive().narrow()?), - ctx.clone().descend_with_scheme(self.id(), 1), + let compressed_indices = compressor.compress_child( + &rle_array.indices().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, )?; - let compressed_offsets = compressor.compress_canonical( - Canonical::Primitive(rle_array.values_idx_offsets().to_primitive().narrow()?), - ctx.descend_with_scheme(self.id(), 2), + let compressed_offsets = compressor.compress_child( + &rle_array + .values_idx_offsets() + .to_primitive() + .narrow()? + .into_array(), + &ctx, + self.id(), + 2, )?; // SAFETY: Recursive compression doesn't affect the invariants. 
@@ -184,22 +193,19 @@ impl Scheme for RLEScheme { #[cfg(feature = "unstable_encodings")] fn try_compress_delta( - primitive_array: &PrimitiveArray, - compressor: &BtrBlocksCompressor, - ctx: CompressorContext, - excludes: &[SchemeId], + compressor: &CascadingCompressor, + child: &ArrayRef, + parent_ctx: &CompressorContext, + parent_id: SchemeId, + child_index: usize, ) -> VortexResult { - use vortex_array::VortexSessionExecute; - - let (bases, deltas) = vortex_fastlanes::delta_compress( - primitive_array, - &mut vortex_array::LEGACY_SESSION.create_execution_ctx(), - )?; + let (bases, deltas) = + vortex_fastlanes::delta_compress(&child.to_primitive(), &mut compressor.execution_ctx())?; let compressed_bases = - compressor.compress_canonical(Canonical::Primitive(bases), ctx, excludes)?; + compressor.compress_child(&bases.into_array(), parent_ctx, parent_id, child_index)?; let compressed_deltas = - compressor.compress_canonical(Canonical::Primitive(deltas), ctx, excludes)?; + compressor.compress_child(&deltas.into_array(), parent_ctx, parent_id, child_index)?; vortex_fastlanes::DeltaArray::try_from_delta_compress_parts(compressed_bases, compressed_deltas) .map(vortex_fastlanes::DeltaArray::into_array) diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index 7a43129b6a6..17102f01f5a 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -1,20 +1,16 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -// Re-export builtin schemes from vortex-compressor. +//! String compression schemes. 
+ use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::VarBinArray; use vortex_array::vtable::ValidityHelper; -pub use vortex_compressor::builtins::StringConstantScheme as ConstantScheme; -pub use vortex_compressor::builtins::StringDictScheme as DictScheme; -pub use vortex_compressor::builtins::StringUncompressedScheme as UncompressedScheme; -pub use vortex_compressor::builtins::is_utf8_string; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; -pub use vortex_compressor::stats::StringStats; use vortex_error::VortexResult; use vortex_fsst::FSSTArray; use vortex_fsst::fsst_compress; @@ -22,7 +18,7 @@ use vortex_fsst::fsst_train_compressor; use vortex_sparse::Sparse; use vortex_sparse::SparseArray; -use super::integer::DictScheme as IntDictScheme; +use super::integer::IntDictScheme; use super::integer::SparseScheme as IntSparseScheme; use crate::ArrayAndStats; use crate::CascadingCompressor; @@ -35,8 +31,10 @@ use crate::SchemeExt; pub struct FSSTScheme; /// Sparse encoding for null-dominated arrays. +/// +/// This is the same as the integer `SparseScheme`, but we only use this for null-dominated arrays. #[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct NullDominated; +pub struct NullDominatedSparseScheme; /// Zstd compression without dictionaries (nvCOMP compatible). #[cfg(feature = "zstd")] @@ -48,6 +46,13 @@ pub struct ZstdScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ZstdBuffersScheme; +// Re-export builtin schemes from vortex-compressor. 
+pub use vortex_compressor::builtins::StringConstantScheme; +pub use vortex_compressor::builtins::StringDictScheme; +pub use vortex_compressor::builtins::StringUncompressedScheme; +pub use vortex_compressor::builtins::is_utf8_string; +pub use vortex_compressor::stats::StringStats; + impl Scheme for FSSTScheme { fn scheme_name(&self) -> &'static str { "vortex.string.fsst" @@ -74,14 +79,22 @@ impl Scheme for FSSTScheme { fsst_compress(stats.source(), &compressor_fsst) }; - let compressed_original_lengths = compressor.compress_canonical( - Canonical::Primitive(fsst.uncompressed_lengths().to_primitive().narrow()?), - ctx.clone().descend_with_scheme(self.id(), 0), + let compressed_original_lengths = compressor.compress_child( + &fsst + .uncompressed_lengths() + .to_primitive() + .narrow()? + .into_array(), + &ctx, + self.id(), + 0, )?; - let compressed_codes_offsets = compressor.compress_canonical( - Canonical::Primitive(fsst.codes().offsets().to_primitive().narrow()?), - ctx.descend_with_scheme(self.id(), 1), + let compressed_codes_offsets = compressor.compress_child( + &fsst.codes().offsets().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, )?; let compressed_codes = VarBinArray::try_new( compressed_codes_offsets, @@ -102,7 +115,7 @@ impl Scheme for FSSTScheme { } } -impl Scheme for NullDominated { +impl Scheme for NullDominatedSparseScheme { fn scheme_name(&self) -> &'static str { "vortex.string.sparse" } @@ -115,6 +128,8 @@ impl Scheme for NullDominated { 1 } + // TODO(connor): There seems to be stuff missing here... + /// The indices of a null-dominated sparse array should not be sparse-encoded again. fn descendant_exclusions(&self) -> Vec { vec![ DescendantExclusion { @@ -132,13 +147,8 @@ impl Scheme for NullDominated { &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, - ctx: CompressorContext, + _ctx: CompressorContext, ) -> VortexResult { - // Only use `SparseScheme` if we can cascade. 
- if ctx.finished_cascading() { - return Ok(0.0); - } - let stats = data.string_stats(); if stats.value_count() == 0 { @@ -161,8 +171,6 @@ impl Scheme for NullDominated { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - assert!(!ctx.finished_cascading()); - let stats = data.string_stats(); // We pass None as we only run this pathway for NULL-dominated string arrays. @@ -171,10 +179,8 @@ impl Scheme for NullDominated { if let Some(sparse) = sparse_encoded.as_opt::() { // Compress the indices only (not the values for strings). let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = compressor.compress_canonical( - Canonical::Primitive(indices), - ctx.descend_with_scheme(self.id(), 0), - )?; + let compressed_indices = + compressor.compress_child(&indices.into_array(), &ctx, self.id(), 0)?; SparseArray::try_new( compressed_indices, diff --git a/vortex-btrblocks/src/schemes/temporal.rs b/vortex-btrblocks/src/schemes/temporal.rs index 7e501f146d0..2fb75eb2d0a 100644 --- a/vortex-btrblocks/src/schemes/temporal.rs +++ b/vortex-btrblocks/src/schemes/temporal.rs @@ -72,7 +72,7 @@ impl Scheme for TemporalScheme { &self, compressor: &CascadingCompressor, data: &mut ArrayAndStats, - _ctx: CompressorContext, + ctx: CompressorContext, ) -> VortexResult { let array = data.array().clone(); let ext_array = array.to_extension(); @@ -98,17 +98,23 @@ impl Scheme for TemporalScheme { subseconds, } = split_temporal(temporal_array)?; - let days = compressor.compress_canonical( - Canonical::Primitive(days.to_primitive().narrow()?), - CompressorContext::default().descend_with_scheme(self.id(), 0), + let days = compressor.compress_child( + &days.to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 0, )?; - let seconds = compressor.compress_canonical( - Canonical::Primitive(seconds.to_primitive().narrow()?), - CompressorContext::default().descend_with_scheme(self.id(), 1), + let seconds = compressor.compress_child( + 
&seconds.to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, )?; - let subseconds = compressor.compress_canonical( - Canonical::Primitive(subseconds.to_primitive().narrow()?), - CompressorContext::default().descend_with_scheme(self.id(), 2), + let subseconds = compressor.compress_child( + &subseconds.to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 2, )?; Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) diff --git a/vortex-compressor/public-api.lock b/vortex-compressor/public-api.lock index cfdc31c79f5..6835206b684 100644 --- a/vortex-compressor/public-api.lock +++ b/vortex-compressor/public-api.lock @@ -1,39 +1,471 @@ pub mod vortex_compressor -pub mod vortex_compressor::ctx +pub mod vortex_compressor::builtins -pub struct vortex_compressor::ctx::CompressorContext +pub struct vortex_compressor::builtins::FloatConstantScheme + +impl core::clone::Clone for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::clone(&self) -> vortex_compressor::builtins::FloatConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::FloatConstantScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::eq(&self, other: &vortex_compressor::builtins::FloatConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::FloatConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::FloatConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn 
vortex_compressor::builtins::FloatConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::FloatDictScheme + +impl core::clone::Clone for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::clone(&self) -> vortex_compressor::builtins::FloatDictScheme + +impl core::cmp::Eq for vortex_compressor::builtins::FloatDictScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::eq(&self, other: &vortex_compressor::builtins::FloatDictScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::FloatDictScheme + +impl 
core::marker::StructuralPartialEq for vortex_compressor::builtins::FloatDictScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::FloatUncompressedScheme + +impl core::clone::Clone for vortex_compressor::builtins::FloatUncompressedScheme + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::clone(&self) -> vortex_compressor::builtins::FloatUncompressedScheme + +impl core::cmp::Eq for vortex_compressor::builtins::FloatUncompressedScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::FloatUncompressedScheme + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::eq(&self, other: 
&vortex_compressor::builtins::FloatUncompressedScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::FloatUncompressedScheme + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::FloatUncompressedScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::FloatUncompressedScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatUncompressedScheme + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::IntConstantScheme + +impl core::clone::Clone for 
vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::clone(&self) -> vortex_compressor::builtins::IntConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::IntConstantScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::eq(&self, other: &vortex_compressor::builtins::IntConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::IntConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::IntConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::num_children(&self) -> usize + +pub fn 
vortex_compressor::builtins::IntConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::IntDictScheme + +impl core::clone::Clone for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::clone(&self) -> vortex_compressor::builtins::IntDictScheme + +impl core::cmp::Eq for vortex_compressor::builtins::IntDictScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::eq(&self, other: &vortex_compressor::builtins::IntDictScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::IntDictScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::IntDictScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn 
vortex_compressor::builtins::IntDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::IntUncompressedScheme + +impl core::clone::Clone for vortex_compressor::builtins::IntUncompressedScheme + +pub fn vortex_compressor::builtins::IntUncompressedScheme::clone(&self) -> vortex_compressor::builtins::IntUncompressedScheme + +impl core::cmp::Eq for vortex_compressor::builtins::IntUncompressedScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::IntUncompressedScheme + +pub fn vortex_compressor::builtins::IntUncompressedScheme::eq(&self, other: &vortex_compressor::builtins::IntUncompressedScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::IntUncompressedScheme + +pub fn vortex_compressor::builtins::IntUncompressedScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::IntUncompressedScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::IntUncompressedScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntUncompressedScheme + +pub fn vortex_compressor::builtins::IntUncompressedScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntUncompressedScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntUncompressedScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn 
vortex_compressor::builtins::IntUncompressedScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntUncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntUncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntUncompressedScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntUncompressedScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntUncompressedScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::StringConstantScheme + +impl core::clone::Clone for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::clone(&self) -> vortex_compressor::builtins::StringConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::StringConstantScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::eq(&self, other: &vortex_compressor::builtins::StringConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::StringConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::StringConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn 
vortex_compressor::builtins::StringConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::StringDictScheme + +impl core::clone::Clone for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::clone(&self) -> vortex_compressor::builtins::StringDictScheme + +impl core::cmp::Eq for vortex_compressor::builtins::StringDictScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::eq(&self, other: &vortex_compressor::builtins::StringDictScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::StringDictScheme + 
+impl core::marker::StructuralPartialEq for vortex_compressor::builtins::StringDictScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::StringUncompressedScheme -pub vortex_compressor::ctx::CompressorContext::allowed_cascading: usize +impl core::clone::Clone for vortex_compressor::builtins::StringUncompressedScheme -pub vortex_compressor::ctx::CompressorContext::is_sample: bool +pub fn vortex_compressor::builtins::StringUncompressedScheme::clone(&self) -> vortex_compressor::builtins::StringUncompressedScheme -pub vortex_compressor::ctx::CompressorContext::stats_options: vortex_compressor::stats::GenerateStatsOptions +impl core::cmp::Eq for 
vortex_compressor::builtins::StringUncompressedScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::StringUncompressedScheme + +pub fn vortex_compressor::builtins::StringUncompressedScheme::eq(&self, other: &vortex_compressor::builtins::StringUncompressedScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::StringUncompressedScheme + +pub fn vortex_compressor::builtins::StringUncompressedScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::StringUncompressedScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::StringUncompressedScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringUncompressedScheme + +pub fn vortex_compressor::builtins::StringUncompressedScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringUncompressedScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringUncompressedScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringUncompressedScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringUncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringUncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringUncompressedScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringUncompressedScheme::scheme_name(&self) -> &'static str + +pub fn 
vortex_compressor::builtins::StringUncompressedScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::builtins::float_dictionary_encode(stats: &vortex_compressor::stats::FloatStats) -> vortex_array::arrays::dict::array::DictArray + +pub fn vortex_compressor::builtins::integer_dictionary_encode(stats: &vortex_compressor::stats::IntegerStats) -> vortex_array::arrays::dict::array::DictArray + +pub fn vortex_compressor::builtins::is_float_primitive(canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::is_integer_primitive(canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::is_utf8_string(canonical: &vortex_array::canonical::Canonical) -> bool + +pub mod vortex_compressor::ctx + +pub struct vortex_compressor::ctx::CompressorContext impl vortex_compressor::ctx::CompressorContext +pub fn vortex_compressor::ctx::CompressorContext::as_leaf(self) -> Self + pub fn vortex_compressor::ctx::CompressorContext::as_sample(self) -> Self -pub fn vortex_compressor::ctx::CompressorContext::descend(self) -> Self +pub fn vortex_compressor::ctx::CompressorContext::cascade_history(&self) -> &[(vortex_compressor::scheme::SchemeId, usize)] -impl core::clone::Clone for vortex_compressor::ctx::CompressorContext +pub fn vortex_compressor::ctx::CompressorContext::finished_cascading(&self) -> bool -pub fn vortex_compressor::ctx::CompressorContext::clone(&self) -> vortex_compressor::ctx::CompressorContext +pub fn vortex_compressor::ctx::CompressorContext::is_sample(&self) -> bool + +pub fn vortex_compressor::ctx::CompressorContext::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions -impl core::default::Default for vortex_compressor::ctx::CompressorContext +pub fn vortex_compressor::ctx::CompressorContext::with_stats_options(self, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +impl core::clone::Clone for 
vortex_compressor::ctx::CompressorContext -pub fn vortex_compressor::ctx::CompressorContext::default() -> Self +pub fn vortex_compressor::ctx::CompressorContext::clone(&self) -> vortex_compressor::ctx::CompressorContext impl core::fmt::Debug for vortex_compressor::ctx::CompressorContext pub fn vortex_compressor::ctx::CompressorContext::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::marker::Copy for vortex_compressor::ctx::CompressorContext - pub const vortex_compressor::ctx::MAX_CASCADE: usize +pub mod vortex_compressor::root_list_children + +pub const vortex_compressor::root_list_children::ELEMENTS: usize + +pub const vortex_compressor::root_list_children::OFFSETS: usize + +pub const vortex_compressor::root_list_children::SIZES: usize + pub mod vortex_compressor::scheme +pub enum vortex_compressor::scheme::ChildSelection + +pub vortex_compressor::scheme::ChildSelection::All + +pub vortex_compressor::scheme::ChildSelection::Many(&'static [usize]) + +pub vortex_compressor::scheme::ChildSelection::One(usize) + +impl vortex_compressor::scheme::ChildSelection + +pub fn vortex_compressor::scheme::ChildSelection::contains(&self, child_index: usize) -> bool + +impl core::clone::Clone for vortex_compressor::scheme::ChildSelection + +pub fn vortex_compressor::scheme::ChildSelection::clone(&self) -> vortex_compressor::scheme::ChildSelection + +impl core::fmt::Debug for vortex_compressor::scheme::ChildSelection + +pub fn vortex_compressor::scheme::ChildSelection::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::scheme::ChildSelection + +pub struct vortex_compressor::scheme::AncestorExclusion + +pub vortex_compressor::scheme::AncestorExclusion::ancestor: vortex_compressor::scheme::SchemeId + +pub vortex_compressor::scheme::AncestorExclusion::children: vortex_compressor::scheme::ChildSelection + +impl core::clone::Clone for vortex_compressor::scheme::AncestorExclusion + +pub fn 
vortex_compressor::scheme::AncestorExclusion::clone(&self) -> vortex_compressor::scheme::AncestorExclusion + +impl core::fmt::Debug for vortex_compressor::scheme::AncestorExclusion + +pub fn vortex_compressor::scheme::AncestorExclusion::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::scheme::AncestorExclusion + +pub struct vortex_compressor::scheme::DescendantExclusion + +pub vortex_compressor::scheme::DescendantExclusion::children: vortex_compressor::scheme::ChildSelection + +pub vortex_compressor::scheme::DescendantExclusion::excluded: vortex_compressor::scheme::SchemeId + +impl core::clone::Clone for vortex_compressor::scheme::DescendantExclusion + +pub fn vortex_compressor::scheme::DescendantExclusion::clone(&self) -> vortex_compressor::scheme::DescendantExclusion + +impl core::fmt::Debug for vortex_compressor::scheme::DescendantExclusion + +pub fn vortex_compressor::scheme::DescendantExclusion::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::scheme::DescendantExclusion + pub struct vortex_compressor::scheme::SchemeId impl core::clone::Clone for vortex_compressor::scheme::SchemeId @@ -64,18 +496,204 @@ impl core::marker::StructuralPartialEq for vortex_compressor::scheme::SchemeId pub trait vortex_compressor::scheme::Scheme: core::fmt::Debug + core::marker::Send + core::marker::Sync -pub fn vortex_compressor::scheme::Scheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_compressor::scheme::Scheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::scheme::Scheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: 
vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::scheme::Scheme::descendant_exclusions(&self) -> alloc::vec::Vec pub fn vortex_compressor::scheme::Scheme::detects_constant(&self) -> bool -pub fn vortex_compressor::scheme::Scheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_compressor::scheme::Scheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult pub fn vortex_compressor::scheme::Scheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool +pub fn vortex_compressor::scheme::Scheme::num_children(&self) -> usize + pub fn vortex_compressor::scheme::Scheme::scheme_name(&self) -> &'static str pub fn vortex_compressor::scheme::Scheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, 
data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for 
vortex_compressor::builtins::FloatUncompressedScheme + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatUncompressedScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn 
vortex_compressor::builtins::IntConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntDictScheme::scheme_name(&self) -> &'static str + 
+pub fn vortex_compressor::builtins::IntDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntUncompressedScheme + +pub fn vortex_compressor::builtins::IntUncompressedScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntUncompressedScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntUncompressedScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntUncompressedScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntUncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntUncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntUncompressedScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntUncompressedScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntUncompressedScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + 
+pub fn vortex_compressor::builtins::StringConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn 
vortex_compressor::builtins::StringDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringUncompressedScheme + +pub fn vortex_compressor::builtins::StringUncompressedScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringUncompressedScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringUncompressedScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringUncompressedScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringUncompressedScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringUncompressedScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringUncompressedScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringUncompressedScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringUncompressedScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + pub trait vortex_compressor::scheme::SchemeExt: vortex_compressor::scheme::Scheme pub fn vortex_compressor::scheme::SchemeExt::id(&self) -> vortex_compressor::scheme::SchemeId @@ -84,7 +702,7 @@ impl vortex_compres pub fn T::id(&self) -> 
vortex_compressor::scheme::SchemeId -pub fn vortex_compressor::scheme::estimate_compression_ratio_with_sampling(scheme: &S, compressor: &vortex_compressor::CascadingCompressor, array: &vortex_array::array::ArrayRef, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_compressor::scheme::estimate_compression_ratio_with_sampling(scheme: &S, compressor: &vortex_compressor::CascadingCompressor, array: &vortex_array::array::ArrayRef, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult pub mod vortex_compressor::stats @@ -422,10 +1040,18 @@ impl vortex_compressor::CascadingCompressor pub fn vortex_compressor::CascadingCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult -pub fn vortex_compressor::CascadingCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_compressor::ctx::CompressorContext, excludes: &[vortex_compressor::scheme::SchemeId]) -> vortex_error::VortexResult +pub fn vortex_compressor::CascadingCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::CascadingCompressor::compress_child(&self, child: &vortex_array::array::ArrayRef, parent_ctx: &vortex_compressor::ctx::CompressorContext, parent_id: vortex_compressor::scheme::SchemeId, child_index: usize) -> vortex_error::VortexResult + +pub fn vortex_compressor::CascadingCompressor::execution_ctx(&self) -> parking_lot::mutex::MutexGuard<'_, vortex_array::executor::ExecutionCtx> pub fn vortex_compressor::CascadingCompressor::new(schemes: alloc::vec::Vec<&'static dyn vortex_compressor::scheme::Scheme>) -> Self impl core::clone::Clone for vortex_compressor::CascadingCompressor pub fn vortex_compressor::CascadingCompressor::clone(&self) -> vortex_compressor::CascadingCompressor + +impl 
core::fmt::Debug for vortex_compressor::CascadingCompressor + +pub fn vortex_compressor::CascadingCompressor::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result diff --git a/vortex-compressor/src/builtins/dict/mod.rs b/vortex-compressor/src/builtins/dict/mod.rs index 15c133daf7e..037218f3d0a 100644 --- a/vortex-compressor/src/builtins/dict/mod.rs +++ b/vortex-compressor/src/builtins/dict/mod.rs @@ -11,7 +11,6 @@ use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::DictArray; -use vortex_array::arrays::dict::DictArrayParts; use vortex_array::builders::dict::dict_encode; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -56,13 +55,8 @@ impl Scheme for IntDictScheme { &self, _compressor: &CascadingCompressor, data: &mut ArrayAndStats, - ctx: CompressorContext, + _ctx: CompressorContext, ) -> VortexResult { - // Dict should not be terminal. - if ctx.finished_cascading() { - return Ok(0.0); - } - let stats = data.integer_stats(); if stats.value_count() == 0 { @@ -103,23 +97,25 @@ impl Scheme for IntDictScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - assert!(!ctx.finished_cascading()); - let stats = data.integer_stats(); let dict = integer::dictionary_encode(stats); - // Cascade the codes child (child 0 for now — will become child 1 with values-first). - let codes_ctx = ctx.descend_with_scheme(self.id(), 0); - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(dict.codes().to_primitive().narrow()?), - codes_ctx, + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; + + // Codes = child 1. + let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, )?; // SAFETY: compressing codes does not change their values. 
unsafe { Ok( - DictArray::new_unchecked(compressed_codes, dict.values().clone()) + DictArray::new_unchecked(compressed_codes, compressed_values) .set_all_values_referenced(dict.has_all_values_referenced()) .into_array(), ) @@ -150,21 +146,17 @@ impl Scheme for FloatDictScheme { 2 } + // TODO(connor): There seems to be stuff missing here... fn descendant_exclusions(&self) -> Vec { vec![ - // Exclude IntDict from codes child. DescendantExclusion { excluded: IntDictScheme.id(), children: ChildSelection::One(1), }, - // Exclude IntDict from values child (replaces ALP conditional propagation). DescendantExclusion { excluded: IntDictScheme.id(), children: ChildSelection::One(0), }, - // Note: IntSequenceScheme and IntRunEndScheme exclusions are expressed as pull - // rules on those schemes in vortex-btrblocks, since they can't be referenced - // from vortex-compressor. ] } @@ -200,18 +192,17 @@ impl Scheme for FloatDictScheme { let dict = float::dictionary_encode(stats); let has_all_values_referenced = dict.has_all_values_referenced(); - let DictArrayParts { codes, values, .. } = dict.into_parts(); + // let DictArrayParts { codes, values, .. } = dict.into_parts(); // Values = child 0. - let compressed_values = compressor.compress_canonical( - Canonical::Primitive(values.to_primitive()), - ctx.clone().descend_with_scheme(self.id(), 0), - )?; + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; // Codes = child 1. - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(codes.to_primitive()), - ctx.descend_with_scheme(self.id(), 1), + let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, )?; // SAFETY: compressing codes or values does not alter the invariants. @@ -248,16 +239,12 @@ impl Scheme for StringDictScheme { 2 } + // TODO(connor): There seems to be stuff missing here... 
fn descendant_exclusions(&self) -> Vec { - vec![ - // Exclude IntDict from codes child. - DescendantExclusion { - excluded: IntDictScheme.id(), - children: ChildSelection::One(1), - }, - // Note: IntSequenceScheme exclusion is expressed as a pull rule on - // IntSequenceScheme in vortex-btrblocks. - ] + vec![DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }] } fn expected_compression_ratio( @@ -292,21 +279,15 @@ impl Scheme for StringDictScheme { let dict = dict_encode(&stats.source().clone().into_array())?; - // If we are not allowed to cascade, do not attempt codes or values compression. - if ctx.finished_cascading() { - return Ok(dict.into_array()); - } + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; // Codes = child 1. - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(dict.codes().to_primitive()), - ctx.clone().descend_with_scheme(self.id(), 1), - )?; - - // Values = child 0. - let compressed_values = compressor.compress_canonical( - Canonical::VarBinView(dict.values().to_varbinview()), - ctx.descend_with_scheme(self.id(), 0), + let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, )?; // SAFETY: compressing codes or values does not alter the invariants. diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs index 02d3a64c305..de2230f1d9c 100644 --- a/vortex-compressor/src/compressor.rs +++ b/vortex-compressor/src/compressor.rs @@ -110,6 +110,38 @@ impl CascadingCompressor { self.ctx.lock() } + /// Compresses a child array produced by a cascading scheme. + /// + /// If the cascade budget is exhausted, the canonical array is returned as-is. Otherwise, + /// the child context is created by descending and recording the parent scheme + child + /// index, and compression proceeds normally. 
+ /// + /// # Errors + /// + /// Returns an error if compression fails. + pub fn compress_child( + &self, + child: &ArrayRef, + parent_ctx: &CompressorContext, + parent_id: SchemeId, + child_index: usize, + ) -> VortexResult { + if parent_ctx.finished_cascading() { + return Ok(child.clone()); + } + + let canonical = child + .clone() + .execute::(&mut self.execution_ctx())? + .0; + let compact = canonical.compact()?; + + let child_ctx = parent_ctx + .clone() + .descend_with_scheme(parent_id, child_index); + self.compress_canonical(compact, child_ctx) + } + /// Compresses an array using cascading adaptive compression. /// /// First canonicalizes and compacts the array, then applies optimal compression schemes. @@ -126,7 +158,7 @@ impl CascadingCompressor { // Compact it, removing any wasted space before we attempt to compress it. let compact = canonical.compact()?; - self.compress_canonical(compact, CompressorContext::default()) + self.compress_canonical(compact, CompressorContext::new()) } /// Compresses a canonical array by dispatching to type-specific logic. @@ -281,8 +313,8 @@ impl CascadingCompressor { } /// Calls [`expected_compression_ratio`] on each candidate and returns the scheme with the - /// highest ratio, or `None` if no scheme exceeds 1.0. Ties are broken by registration - /// order (earlier in the list wins). + /// highest ratio, or `None` if no scheme exceeds 1.0. Ties are broken by registration order + /// (earlier in the list wins). /// /// [`expected_compression_ratio`]: Scheme::expected_compression_ratio fn choose_scheme( diff --git a/vortex-compressor/src/ctx.rs b/vortex-compressor/src/ctx.rs index 770d48358ff..465a7398350 100644 --- a/vortex-compressor/src/ctx.rs +++ b/vortex-compressor/src/ctx.rs @@ -8,6 +8,7 @@ use vortex_error::VortexExpect; use crate::scheme::SchemeId; use crate::stats::GenerateStatsOptions; +// TODO(connor): Why is this 3??? This doesn't seem smart or adaptive. /// Maximum cascade depth for compression. 
pub const MAX_CASCADE: usize = 3; @@ -32,8 +33,11 @@ pub struct CompressorContext { cascade_history: Vec<(SchemeId, usize)>, } -impl Default for CompressorContext { - fn default() -> Self { +impl CompressorContext { + /// Creates a new `CompressorContext`. + /// + /// This should **only** be created by the compressor. + pub(super) fn new() -> Self { Self { is_sample: false, allowed_cascading: MAX_CASCADE, @@ -43,6 +47,13 @@ impl Default for CompressorContext { } } +#[cfg(test)] +impl Default for CompressorContext { + fn default() -> Self { + Self::new() + } +} + impl CompressorContext { /// Whether this context is for sample compression (ratio estimation). pub fn is_sample(&self) -> bool { @@ -82,7 +93,7 @@ impl CompressorContext { /// /// The `child_index` identifies which child of the scheme is being compressed (e.g. for /// Dict: values=0, codes=1). - pub fn descend_with_scheme(mut self, id: SchemeId, child_index: usize) -> Self { + pub(crate) fn descend_with_scheme(mut self, id: SchemeId, child_index: usize) -> Self { self.allowed_cascading = self .allowed_cascading .checked_sub(1) diff --git a/vortex-compressor/src/scheme.rs b/vortex-compressor/src/scheme.rs index 14a3810a282..dbf21f942e2 100644 --- a/vortex-compressor/src/scheme.rs +++ b/vortex-compressor/src/scheme.rs @@ -26,8 +26,9 @@ use crate::stats::GenerateStatsOptions; /// auto-implemented for all [`Scheme`] types. There is no public constructor. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct SchemeId { - /// The scheme name. Only constructable within `vortex-compressor` — the only public way - /// to obtain a [`SchemeId`] is through [`SchemeExt::id()`]. + /// Only constructable within `vortex-compressor`. + /// + /// The only public way to obtain a [`SchemeId`] is through [`SchemeExt::id()`]. pub(super) name: &'static str, } @@ -37,8 +38,6 @@ impl fmt::Display for SchemeId { } } -// --- Exclusion rule types --- - /// Selects which children of a cascading scheme a rule applies to. 
#[derive(Debug, Clone, Copy)] pub enum ChildSelection { @@ -64,8 +63,8 @@ impl ChildSelection { /// Push rule: declared by a cascading scheme to exclude another scheme from the subtree /// rooted at the specified children. /// -/// Use this when the declaring scheme (the ancestor) knows about the excluded scheme. -/// For example, `ZigZag` excludes `Dict` from all its children. +/// Use this when the declaring scheme (the ancestor) knows about the excluded scheme. For example, +/// `ZigZag` excludes `Dict` from all its children. #[derive(Debug, Clone, Copy)] pub struct DescendantExclusion { /// The scheme to exclude from descendants. @@ -74,8 +73,8 @@ pub struct DescendantExclusion { pub children: ChildSelection, } -/// Pull rule: declared by a scheme to exclude itself when the specified ancestor is in -/// the cascade chain. +/// Pull rule: declared by a scheme to exclude itself when the specified ancestor is in the +/// cascade chain. /// /// Use this when the excluded scheme (the descendant) knows about the ancestor. For example, /// `Sequence` excludes itself when `IntDict` is an ancestor on its codes child. @@ -87,46 +86,46 @@ pub struct AncestorExclusion { pub children: ChildSelection, } -// --- Scheme trait --- - /// A single compression encoding that the [`CascadingCompressor`] can select from. /// /// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a /// given array, picks the one with the highest [`expected_compression_ratio`], and calls /// [`compress`] on the winner. /// -/// One of key features of this compressor is that schemes may "cascade": a scheme's [`compress`] -/// can call back into the compressor to compress child or transformed arrays, building up multiple -/// encoding layers (e.g. frame-of-reference and then bit-packing). 
+/// One of the key features of this compressor is that schemes may "cascade": a scheme's +/// [`compress`] can call back into the compressor via [`CascadingCompressor::compress_child`] to +/// compress child or transformed arrays, building up multiple encoding layers (e.g. +/// frame-of-reference and then bit-packing). /// /// # Identity /// /// Every scheme has a globally unique name returned by [`scheme_name`]. The [`SchemeExt::id`] -/// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] -/// used for equality, hashing, and exclusion rules. +/// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] used +/// for equality, hashing, and exclusion rules. /// /// # Cascading and children /// /// Schemes that produce child arrays for further compression declare [`num_children`] > 0. Each -/// child is identified by index, with values at index 0 by convention. When cascading, schemes -/// call [`CompressorContext::with_scheme`] to record which child they are compressing. +/// child is identified by index. Cascading schemes should use +/// [`CascadingCompressor::compress_child`] to compress each child array, which handles cascade +/// level / budget tracking and context management automatically. /// /// No scheme may appear twice in a cascade chain (enforced by the compressor). This keeps the -/// search space a DAG. +/// search space a tree. /// /// # Exclusion rules /// /// Schemes declare exclusion rules to prevent incompatible scheme combinations in the cascade /// chain: /// -/// - [`descendant_exclusions`] (push): "exclude scheme X from my child Y's subtree." Used when -/// the declaring scheme knows about the excluded scheme. +/// - [`descendant_exclusions`] (push): "exclude scheme X from my child Y's subtree." Used when the +/// declaring scheme knows about the excluded scheme. /// - [`ancestor_exclusions`] (pull): "exclude me if ancestor X's child Y is above me." 
Used when /// the declaring scheme knows about the ancestor. /// /// # Implementing a scheme /// -/// At a minimum, implementors must implement [`scheme_name`], [`matches`], and [`compress`]. +/// At a minimum, implementors must provide [`scheme_name`], [`matches`], and [`compress`]. /// /// The default [`expected_compression_ratio`] estimates the ratio by compressing a small sample. /// Implementors should only override this method when a cheaper heuristic is available (e.g. @@ -179,8 +178,8 @@ pub trait Scheme: Debug + Send + Sync { /// Ancestors that make this scheme ineligible (pull direction). /// - /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, - /// do not try me." + /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, do + /// not try me." fn ancestor_exclusions(&self) -> Vec { Vec::new() } @@ -201,8 +200,8 @@ pub trait Scheme: Debug + Send + Sync { /// Compress the array using this scheme. /// - /// Cascading schemes should call [`CompressorContext::with_scheme`] to record which child - /// they are compressing before delegating to the compressor. + /// Cascading schemes should use [`CascadingCompressor::compress_child`] to compress each child + /// array rather than calling [`CascadingCompressor::compress_canonical`] directly. /// /// # Errors /// @@ -232,7 +231,7 @@ impl Hash for dyn Scheme { /// Extension trait providing [`id`](SchemeExt::id) for all [`Scheme`] implementors. /// /// This trait is automatically implemented for every type that implements [`Scheme`]. Because the -/// blanket implementation covers all types, external crates cannot override the `id()` method. +/// blanket implementation covers all types, external crates cannot override `id()`. pub trait SchemeExt: Scheme { /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name). 
fn id(&self) -> SchemeId { @@ -246,8 +245,8 @@ impl SchemeExt for T {} /// Estimates compression ratio by compressing a ~1% sample of the data. /// -/// This function will create a new [`ArrayAndStats`] for the sample so that stats are generated -/// from the sample, not the full array. +/// Creates a new [`ArrayAndStats`] for the sample so that stats are generated from the sample, not +/// the full array. /// /// # Errors /// diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 534da6c659e..de360b672ce 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -201,8 +201,8 @@ impl WriteStrategyBuilder { integer::SparseScheme.id(), integer::RLE_INTEGER_SCHEME.id(), float::RLE_FLOAT_SCHEME.id(), - float::NullDominated.id(), - string::DictScheme.id(), + float::NullDominatedSparseScheme.id(), + string::StringDictScheme.id(), string::FSSTScheme.id(), ]); diff --git a/vortex-layout/src/layouts/compressed.rs b/vortex-layout/src/layouts/compressed.rs index 981779edeb3..603b2360e0d 100644 --- a/vortex-layout/src/layouts/compressed.rs +++ b/vortex-layout/src/layouts/compressed.rs @@ -12,7 +12,7 @@ use vortex_array::expr::stats::Stat; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; use vortex_btrblocks::SchemeExt; -use vortex_btrblocks::schemes::integer::DictScheme; +use vortex_btrblocks::schemes::integer::IntDictScheme; use vortex_error::VortexResult; use vortex_io::runtime::Handle; @@ -70,7 +70,7 @@ impl CompressingStrategy { pub fn new_btrblocks(child: S, exclude_int_dict_encoding: bool) -> Self { let compressor = if exclude_int_dict_encoding { BtrBlocksCompressorBuilder::default() - .exclude([DictScheme.id()]) + .exclude([IntDictScheme.id()]) .build() } else { BtrBlocksCompressor::default() From 3612521b31c619af42f646d8ed448d8ae243f9c5 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Thu, 19 Mar 2026 16:54:52 -0400 Subject: [PATCH 8/9] add more exclusion rules (beyond the old 
compressor) Signed-off-by: Connor Tsui --- vortex-btrblocks/public-api.lock | 4 + vortex-btrblocks/src/schemes/decimal.rs | 1 + vortex-btrblocks/src/schemes/float.rs | 11 +-- vortex-btrblocks/src/schemes/integer.rs | 68 +++++++++++++--- vortex-btrblocks/src/schemes/rle.rs | 31 +++++--- vortex-btrblocks/src/schemes/string.rs | 3 +- vortex-btrblocks/src/schemes/temporal.rs | 1 + vortex-compressor/public-api.lock | 10 --- vortex-compressor/src/builtins/dict/mod.rs | 20 ++++- vortex-compressor/src/compressor.rs | 93 +++++++++++++++++++++- vortex-compressor/src/lib.rs | 1 - vortex-compressor/src/scheme.rs | 3 - 12 files changed, 193 insertions(+), 53 deletions(-) diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index 92e9a1472a6..4c2fc1bf8c8 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -284,6 +284,8 @@ impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::F impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::FoRScheme +pub fn vortex_btrblocks::schemes::integer::FoRScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + pub fn vortex_btrblocks::schemes::integer::FoRScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult pub fn vortex_btrblocks::schemes::integer::FoRScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult @@ -468,6 +470,8 @@ impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::Z impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::ZigZagScheme +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::ancestor_exclusions(&self) 
-> alloc::vec::Vec + pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::descendant_exclusions(&self) -> alloc::vec::Vec diff --git a/vortex-btrblocks/src/schemes/decimal.rs b/vortex-btrblocks/src/schemes/decimal.rs index c4ad932b765..dcbf74c6f10 100644 --- a/vortex-btrblocks/src/schemes/decimal.rs +++ b/vortex-btrblocks/src/schemes/decimal.rs @@ -36,6 +36,7 @@ impl Scheme for DecimalScheme { matches!(canonical, Canonical::Decimal(_)) } + /// Children: primitive=0. fn num_children(&self) -> usize { 1 } diff --git a/vortex-btrblocks/src/schemes/float.rs b/vortex-btrblocks/src/schemes/float.rs index 5355f0adde5..59fb2058085 100644 --- a/vortex-btrblocks/src/schemes/float.rs +++ b/vortex-btrblocks/src/schemes/float.rs @@ -74,14 +74,6 @@ impl rle::RLEConfig for FloatRLEConfig { fn generate_stats(array: &ArrayRef) -> FloatStats { FloatStats::generate(&array.to_primitive()) } - - fn compress_values( - compressor: &CascadingCompressor, - values: &PrimitiveArray, - ctx: CompressorContext, - ) -> VortexResult { - compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx) - } } impl RLEStats for FloatStats { @@ -110,6 +102,7 @@ impl Scheme for ALPScheme { is_float_primitive(canonical) } + /// Children: encoded_ints=0. fn num_children(&self) -> usize { 1 } @@ -220,11 +213,11 @@ impl Scheme for NullDominatedSparseScheme { is_float_primitive(canonical) } + /// Children: indices=0. fn num_children(&self) -> usize { 1 } - // TODO(connor): There seems to be stuff missing here... /// The indices of a null-dominated sparse array should not be sparse-encoded again. 
fn descendant_exclusions(&self) -> Vec { vec![DescendantExclusion { diff --git a/vortex-btrblocks/src/schemes/integer.rs b/vortex-btrblocks/src/schemes/integer.rs index c45df25c45e..943bbaaec94 100644 --- a/vortex-btrblocks/src/schemes/integer.rs +++ b/vortex-btrblocks/src/schemes/integer.rs @@ -98,14 +98,6 @@ impl rle::RLEConfig for IntRLEConfig { fn generate_stats(array: &ArrayRef) -> IntegerStats { IntegerStats::generate(&array.to_primitive()) } - - fn compress_values( - compressor: &CascadingCompressor, - values: &PrimitiveArray, - ctx: CompressorContext, - ) -> VortexResult { - compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx) - } } impl RLEStats for IntegerStats { @@ -134,6 +126,24 @@ impl Scheme for FoRScheme { is_integer_primitive(canonical) } + /// Dict codes always start at 0, so FoR (which subtracts the min) is a no-op. + fn ancestor_exclusions(&self) -> Vec { + vec![ + AncestorExclusion { + ancestor: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(1), + }, + ] + } + fn expected_compression_ratio( &self, _compressor: &CascadingCompressor, @@ -225,6 +235,7 @@ impl Scheme for ZigZagScheme { is_integer_primitive(canonical) } + /// Children: encoded=0. fn num_children(&self) -> usize { 1 } @@ -249,6 +260,24 @@ impl Scheme for ZigZagScheme { ] } + /// Dict codes are unsigned integers (0..cardinality). ZigZag only helps negatives. 
+ fn ancestor_exclusions(&self) -> Vec { + vec![ + AncestorExclusion { + ancestor: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(1), + }, + ] + } + fn expected_compression_ratio( &self, compressor: &CascadingCompressor, @@ -365,6 +394,7 @@ impl Scheme for SparseScheme { } } + /// Children: values=0, indices=1. fn num_children(&self) -> usize { 2 } @@ -494,6 +524,7 @@ impl Scheme for RunEndScheme { is_integer_primitive(canonical) } + /// Children: values=0, ends=1. fn num_children(&self) -> usize { 2 } @@ -507,12 +538,23 @@ impl Scheme for RunEndScheme { }] } - // TODO(connor): There seems to be stuff missing here... + /// Dict values (child 0) are all unique by definition, so run-end encoding them is + /// pointless. Codes (child 1) can have runs and may benefit from RunEnd. fn ancestor_exclusions(&self) -> Vec { - vec![AncestorExclusion { - ancestor: FloatDictScheme.id(), - children: ChildSelection::One(0), - }] + vec![ + AncestorExclusion { + ancestor: IntDictScheme.id(), + children: ChildSelection::One(0), + }, + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(0), + }, + AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(0), + }, + ] } fn expected_compression_ratio( diff --git a/vortex-btrblocks/src/schemes/rle.rs b/vortex-btrblocks/src/schemes/rle.rs index 0db6f680bb8..da63bf58ca0 100644 --- a/vortex-btrblocks/src/schemes/rle.rs +++ b/vortex-btrblocks/src/schemes/rle.rs @@ -9,6 +9,9 @@ use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; +use vortex_compressor::builtins::FloatDictScheme; +use vortex_compressor::builtins::StringDictScheme; +use vortex_compressor::scheme::AncestorExclusion; use 
vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; #[cfg(feature = "unstable_encodings")] @@ -37,8 +40,6 @@ pub trait RLEStats { fn source(&self) -> &PrimitiveArray; } -// TODO(connor): This trait is super confusing, we should probably just remove it and hardcode the -// only 2 implementations (integer and float). /// Configuration trait for RLE schemes. /// /// Implement this trait to define the behavior of an RLE scheme for a specific @@ -55,13 +56,6 @@ pub trait RLEConfig: Debug + Send + Sync + 'static { /// Generates statistics for the given array. fn generate_stats(array: &ArrayRef) -> Self::Stats; - - /// Compress the values array after RLE encoding. - fn compress_values( - compressor: &CascadingCompressor, - values: &PrimitiveArray, - ctx: CompressorContext, - ) -> VortexResult; } /// RLE scheme that is generic over a configuration type. @@ -92,6 +86,7 @@ impl Scheme for RLEScheme { C::matches(canonical) } + /// Children: values=0, indices=1, offsets=2. fn num_children(&self) -> usize { 3 } @@ -106,6 +101,24 @@ impl Scheme for RLEScheme { }] } + /// Dict values (child 0) are all unique by definition, so RLE is pointless on them. + fn ancestor_exclusions(&self) -> Vec { + vec![ + AncestorExclusion { + ancestor: IntDictScheme.id(), + children: ChildSelection::One(0), + }, + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(0), + }, + AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(0), + }, + ] + } + fn expected_compression_ratio( &self, compressor: &CascadingCompressor, diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index 17102f01f5a..8a8b8393529 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -62,6 +62,7 @@ impl Scheme for FSSTScheme { is_utf8_string(canonical) } + /// Children: lengths=0, code_offsets=1. 
fn num_children(&self) -> usize { 2 } @@ -124,11 +125,11 @@ impl Scheme for NullDominatedSparseScheme { is_utf8_string(canonical) } + /// Children: indices=0. fn num_children(&self) -> usize { 1 } - // TODO(connor): There seems to be stuff missing here... /// The indices of a null-dominated sparse array should not be sparse-encoded again. fn descendant_exclusions(&self) -> Vec { vec![ diff --git a/vortex-btrblocks/src/schemes/temporal.rs b/vortex-btrblocks/src/schemes/temporal.rs index 2fb75eb2d0a..0282693ee69 100644 --- a/vortex-btrblocks/src/schemes/temporal.rs +++ b/vortex-btrblocks/src/schemes/temporal.rs @@ -53,6 +53,7 @@ impl Scheme for TemporalScheme { true } + /// Children: days=0, seconds=1, subseconds=2. fn num_children(&self) -> usize { 3 } diff --git a/vortex-compressor/public-api.lock b/vortex-compressor/public-api.lock index 6835206b684..49c40c3e2eb 100644 --- a/vortex-compressor/public-api.lock +++ b/vortex-compressor/public-api.lock @@ -402,14 +402,6 @@ pub fn vortex_compressor::ctx::CompressorContext::fmt(&self, f: &mut core::fmt:: pub const vortex_compressor::ctx::MAX_CASCADE: usize -pub mod vortex_compressor::root_list_children - -pub const vortex_compressor::root_list_children::ELEMENTS: usize - -pub const vortex_compressor::root_list_children::OFFSETS: usize - -pub const vortex_compressor::root_list_children::SIZES: usize - pub mod vortex_compressor::scheme pub enum vortex_compressor::scheme::ChildSelection @@ -1040,8 +1032,6 @@ impl vortex_compressor::CascadingCompressor pub fn vortex_compressor::CascadingCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult -pub fn vortex_compressor::CascadingCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult - pub fn vortex_compressor::CascadingCompressor::compress_child(&self, child: &vortex_array::array::ArrayRef, parent_ctx: 
&vortex_compressor::ctx::CompressorContext, parent_id: vortex_compressor::scheme::SchemeId, child_index: usize) -> vortex_error::VortexResult pub fn vortex_compressor::CascadingCompressor::execution_ctx(&self) -> parking_lot::mutex::MutexGuard<'_, vortex_array::executor::ExecutionCtx> diff --git a/vortex-compressor/src/builtins/dict/mod.rs b/vortex-compressor/src/builtins/dict/mod.rs index 037218f3d0a..c8d38dcf56c 100644 --- a/vortex-compressor/src/builtins/dict/mod.rs +++ b/vortex-compressor/src/builtins/dict/mod.rs @@ -47,8 +47,9 @@ impl Scheme for IntDictScheme { } } + /// Children: values=0, codes=1. fn num_children(&self) -> usize { - 1 + 2 } fn expected_compression_ratio( @@ -142,11 +143,19 @@ impl Scheme for FloatDictScheme { } } + /// Children: values=0, codes=1. fn num_children(&self) -> usize { 2 } - // TODO(connor): There seems to be stuff missing here... + /// Float dict codes (child 1) are compact unsigned integers that should not be + /// dict-encoded again. Float dict values (child 0) flow through ALP into integer-land, + /// where integer dict encoding is redundant since the values are already deduplicated at + /// the float level. + /// + /// Additional exclusions for codes (IntSequenceScheme, IntRunEndScheme, FoRScheme, + /// ZigZagScheme, SparseScheme, RLE) are expressed as pull rules on those schemes in + /// vortex-btrblocks. fn descendant_exclusions(&self) -> Vec { vec![ DescendantExclusion { @@ -235,11 +244,16 @@ impl Scheme for StringDictScheme { } } + /// Children: values=0, codes=1. fn num_children(&self) -> usize { 2 } - // TODO(connor): There seems to be stuff missing here... + /// String dict codes (child 1) are compact unsigned integers that should not be dict-encoded + /// again. + /// + /// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme, SparseScheme, + /// RunEndScheme, RLE, etc.) are expressed as pull rules on those schemes in `vortex-btrblocks`. 
fn descendant_exclusions(&self) -> Vec { vec![DescendantExclusion { excluded: IntDictScheme.id(), diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs index de2230f1d9c..a6ed57375c1 100644 --- a/vortex-compressor/src/compressor.rs +++ b/vortex-compressor/src/compressor.rs @@ -47,9 +47,7 @@ const ROOT_SCHEME_ID: SchemeId = SchemeId { }; /// Child indices for the compressor's list/listview compression. -pub mod root_list_children { - /// List elements child. - pub const ELEMENTS: usize = 0; +mod root_list_children { /// List/ListView offsets child. pub const OFFSETS: usize = 1; /// ListView sizes child. @@ -166,7 +164,7 @@ impl CascadingCompressor { /// # Errors /// /// Returns an error if compression of any sub-array fails. - pub fn compress_canonical( + fn compress_canonical( &self, array: Canonical, ctx: CompressorContext, @@ -326,12 +324,23 @@ impl CascadingCompressor { let mut best: Option<(&'static dyn Scheme, f64)> = None; for &scheme in schemes { + // Constant detection on a sample is a false positive: the sample being constant + // does not mean the full array is constant. + if ctx.is_sample() && scheme.detects_constant() { + continue; + } + let ratio = scheme.expected_compression_ratio(self, data, ctx.clone())?; tracing::debug!(scheme = %scheme.id(), ratio, "evaluated compression ratio"); if is_better_ratio(ratio, &best) { best = Some((scheme, ratio)); + + // Schemes that return f64::MAX (like Constant) cannot be beat, so stop early. 
+ if ratio == f64::MAX { + break; + } } } @@ -450,3 +459,79 @@ impl CascadingCompressor { fn is_better_ratio(ratio: f64, best: &Option<(&'static dyn Scheme, f64)>) -> bool { ratio.is_finite() && !ratio.is_subnormal() && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::builtins::FloatDictScheme; + use crate::builtins::IntDictScheme; + use crate::builtins::IntUncompressedScheme; + use crate::builtins::StringDictScheme; + use crate::ctx::CompressorContext; + use crate::scheme::SchemeExt; + + fn compressor() -> CascadingCompressor { + CascadingCompressor::new(vec![ + &IntUncompressedScheme, + &IntDictScheme, + &FloatDictScheme, + &StringDictScheme, + ]) + } + + #[test] + fn test_self_exclusion() { + let c = compressor(); + let ctx = CompressorContext::default().descend_with_scheme(IntDictScheme.id(), 0); + + // IntDictScheme is in the history, so it should be excluded. + assert!(c.is_excluded(&IntDictScheme, &ctx)); + // IntUncompressedScheme is not in the history. + assert!(!c.is_excluded(&IntUncompressedScheme, &ctx)); + } + + #[test] + fn test_root_exclusion_list_offsets() { + let c = compressor(); + let ctx = CompressorContext::default() + .descend_with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); + + // IntDict should be excluded for list offsets. + assert!(c.is_excluded(&IntDictScheme, &ctx)); + // IntUncompressed should not be excluded. + assert!(!c.is_excluded(&IntUncompressedScheme, &ctx)); + } + + #[test] + fn test_push_rule_float_dict_excludes_int_dict_from_codes() { + let c = compressor(); + // FloatDict cascading through codes (child 1). + let ctx = CompressorContext::default().descend_with_scheme(FloatDictScheme.id(), 1); + + // IntDict should be excluded from FloatDict's codes child. + assert!(c.is_excluded(&IntDictScheme, &ctx)); + // IntUncompressed should not be excluded. 
+ assert!(!c.is_excluded(&IntUncompressedScheme, &ctx)); + } + + #[test] + fn test_push_rule_float_dict_excludes_int_dict_from_values() { + let c = compressor(); + // FloatDict cascading through values (child 0). + let ctx = CompressorContext::default().descend_with_scheme(FloatDictScheme.id(), 0); + + // IntDict should also be excluded from FloatDict's values child (ALP propagation + // replacement). + assert!(c.is_excluded(&IntDictScheme, &ctx)); + } + + #[test] + fn test_no_exclusion_without_history() { + let c = compressor(); + let ctx = CompressorContext::default(); + + // No history means no exclusions. + assert!(!c.is_excluded(&IntDictScheme, &ctx)); + assert!(!c.is_excluded(&IntUncompressedScheme, &ctx)); + } +} diff --git a/vortex-compressor/src/lib.rs b/vortex-compressor/src/lib.rs index 9066d9ecd86..683bea4f8aa 100644 --- a/vortex-compressor/src/lib.rs +++ b/vortex-compressor/src/lib.rs @@ -25,4 +25,3 @@ mod sample; mod compressor; pub use compressor::CascadingCompressor; -pub use compressor::root_list_children; diff --git a/vortex-compressor/src/scheme.rs b/vortex-compressor/src/scheme.rs index dbf21f942e2..f0fa1557feb 100644 --- a/vortex-compressor/src/scheme.rs +++ b/vortex-compressor/src/scheme.rs @@ -200,9 +200,6 @@ pub trait Scheme: Debug + Send + Sync { /// Compress the array using this scheme. /// - /// Cascading schemes should use [`CascadingCompressor::compress_child`] to compress each child - /// array rather than calling [`CascadingCompressor::compress_canonical`] directly. - /// /// # Errors /// /// Returns an error if compression fails. 
From 0b30df9cc43775a0b806c2d82720a7062d1acdc8 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 20 Mar 2026 10:38:40 -0400 Subject: [PATCH 9/9] add more rules and clean up Signed-off-by: Connor Tsui --- vortex-btrblocks/src/builder.rs | 8 +- vortex-btrblocks/src/schemes/float.rs | 77 +++---------- vortex-btrblocks/src/schemes/integer.rs | 145 +++++++++++------------- vortex-btrblocks/src/schemes/rle.rs | 118 ++++++++++++++++--- 4 files changed, 188 insertions(+), 160 deletions(-) diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 26b4c1ab727..b5558fc926c 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -13,6 +13,7 @@ use crate::SchemeId; use crate::schemes::decimal; use crate::schemes::float; use crate::schemes::integer; +use crate::schemes::rle; use crate::schemes::string; use crate::schemes::temporal; @@ -26,13 +27,14 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ &integer::IntConstantScheme, // NOTE: FoR must precede BitPacking to avoid unnecessary patches. &integer::FoRScheme, - &integer::BitPackingScheme, + // NOTE: ZigZag should precede BitPacking because we don't want negative numbers. &integer::ZigZagScheme, + &integer::BitPackingScheme, &integer::SparseScheme, &integer::IntDictScheme, &integer::RunEndScheme, &integer::SequenceScheme, - &integer::RLE_INTEGER_SCHEME, + &rle::RLE_INTEGER_SCHEME, #[cfg(feature = "pco")] &integer::PcoScheme, // Float schemes. @@ -42,7 +44,7 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ &float::ALPRDScheme, &float::FloatDictScheme, &float::NullDominatedSparseScheme, - &float::RLE_FLOAT_SCHEME, + &rle::RLE_FLOAT_SCHEME, #[cfg(feature = "pco")] &float::PcoScheme, // Decimal schemes. diff --git a/vortex-btrblocks/src/schemes/float.rs b/vortex-btrblocks/src/schemes/float.rs index 59fb2058085..9cbec68ca94 100644 --- a/vortex-btrblocks/src/schemes/float.rs +++ b/vortex-btrblocks/src/schemes/float.rs @@ -3,7 +3,6 @@ //! Float compression schemes. 
-use vortex_alp::ALP; use vortex_alp::ALPArray; use vortex_alp::RDEncoder; use vortex_alp::alp_encode; @@ -11,7 +10,6 @@ use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; -use vortex_array::arrays::PrimitiveArray; use vortex_array::dtype::PType; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; @@ -28,15 +26,12 @@ use crate::Scheme; use crate::SchemeExt; use crate::compress_patches; use crate::estimate_compression_ratio_with_sampling; -use crate::schemes::rle; -use crate::schemes::rle::RLEScheme; -use crate::schemes::rle::RLEStats; /// ALP (Adaptive Lossless floating-Point) encoding. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ALPScheme; -/// ALPRD (ALP with Right Division) encoding variant. +/// ALPRD (ALP with Real Double) encoding variant. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ALPRDScheme; @@ -51,10 +46,6 @@ pub struct NullDominatedSparseScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct PcoScheme; -/// Configuration for float RLE compression. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct FloatRLEConfig; - // Re-export builtin schemes from vortex-compressor. 
pub use vortex_compressor::builtins::FloatConstantScheme; pub use vortex_compressor::builtins::FloatDictScheme; @@ -62,36 +53,7 @@ pub use vortex_compressor::builtins::FloatUncompressedScheme; pub use vortex_compressor::builtins::is_float_primitive; pub use vortex_compressor::stats::FloatStats; -impl rle::RLEConfig for FloatRLEConfig { - type Stats = FloatStats; - - const SCHEME_NAME: &'static str = "vortex.float.rle"; - - fn matches(canonical: &Canonical) -> bool { - is_float_primitive(canonical) - } - - fn generate_stats(array: &ArrayRef) -> FloatStats { - FloatStats::generate(&array.to_primitive()) - } -} - -impl RLEStats for FloatStats { - fn value_count(&self) -> u32 { - FloatStats::value_count(self) - } - - fn average_run_length(&self) -> u32 { - FloatStats::average_run_length(self) - } - - fn source(&self) -> &PrimitiveArray { - FloatStats::source(self) - } -} - -/// RLE scheme for float compression. -pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); +pub use crate::schemes::rle::RLE_FLOAT_SCHEME; impl Scheme for ALPScheme { fn scheme_name(&self) -> &'static str { @@ -113,16 +75,14 @@ impl Scheme for ALPScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - // ALP encodes floats as integers. Without integer compression afterward, the - // encoded ints are the same size. + // ALP encodes floats as integers. Without integer compression afterward, the encoded ints + // are the same size. if ctx.finished_cascading() { return Ok(0.0); } - let stats = data.float_stats(); - // We don't support ALP for f16. - if stats.source().ptype() == PType::F16 { + if data.float_stats().source().ptype() == PType::F16 { return Ok(0.0); } @@ -138,19 +98,16 @@ impl Scheme for ALPScheme { let stats = data.float_stats(); let alp_encoded = alp_encode(&stats.source().to_primitive(), None)?; - let alp = alp_encoded.as_::(); - let alp_ints = alp.encoded().to_primitive(); // Compress the ALP ints. 
- let compressed_alp_ints = - compressor.compress_child(&alp_ints.into_array(), &ctx, self.id(), 0)?; + compressor.compress_child(alp_encoded.encoded(), &ctx, self.id(), 0)?; // Patches are not compressed. They should be infrequent, and if they are not then we want // to keep them linear for easy indexing. - let patches = alp.patches().map(compress_patches).transpose()?; + let patches = alp_encoded.patches().map(compress_patches).transpose()?; - Ok(ALPArray::new(compressed_alp_ints, alp.exponents(), patches).into_array()) + Ok(ALPArray::new(compressed_alp_ints, alp_encoded.exponents(), patches).into_array()) } } @@ -169,9 +126,7 @@ impl Scheme for ALPRDScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - let stats = data.float_stats(); - - if stats.source().ptype() == PType::F16 { + if data.float_stats().source().ptype() == PType::F16 { return Ok(0.0); } @@ -235,11 +190,11 @@ impl Scheme for NullDominatedSparseScheme { let stats = data.float_stats(); if stats.value_count() == 0 { - // All nulls should use ConstantScheme. + // All nulls should use ConstantScheme instead of this. return Ok(0.0); } - // If the majority is null, will compress well. + // If the majority (90%) of values is null, this will compress well. 
if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { return Ok(stats.source().len() as f64 / stats.value_count() as f64); } @@ -261,12 +216,8 @@ impl Scheme for NullDominatedSparseScheme { if let Some(sparse) = sparse_encoded.as_opt::() { let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = compressor.compress_child( - &indices.to_primitive().into_array(), - &ctx, - self.id(), - 0, - )?; + let compressed_indices = + compressor.compress_child(&indices.into_array(), &ctx, self.id(), 0)?; SparseArray::try_new( compressed_indices, @@ -326,8 +277,8 @@ mod tests { use vortex_error::VortexResult; use vortex_fastlanes::RLE; - use super::RLE_FLOAT_SCHEME; use crate::BtrBlocksCompressor; + use crate::schemes::rle::RLE_FLOAT_SCHEME; #[test] fn test_empty() -> VortexResult<()> { diff --git a/vortex-btrblocks/src/schemes/integer.rs b/vortex-btrblocks/src/schemes/integer.rs index 943bbaaec94..5e37175afa6 100644 --- a/vortex-btrblocks/src/schemes/integer.rs +++ b/vortex-btrblocks/src/schemes/integer.rs @@ -8,7 +8,6 @@ use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::PrimitiveArray; use vortex_array::scalar::Scalar; use vortex_compressor::builtins::FloatDictScheme; use vortex_compressor::builtins::StringDictScheme; @@ -39,9 +38,6 @@ use crate::Scheme; use crate::SchemeExt; use crate::compress_patches; use crate::estimate_compression_ratio_with_sampling; -use crate::schemes::rle; -use crate::schemes::rle::RLEScheme; -use crate::schemes::rle::RLEStats; /// Frame of Reference encoding. 
#[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -79,44 +75,11 @@ pub use vortex_compressor::builtins::IntUncompressedScheme; pub use vortex_compressor::builtins::is_integer_primitive; pub use vortex_compressor::stats::IntegerStats; +pub use crate::schemes::rle::RLE_INTEGER_SCHEME; + /// Threshold for the average run length in an array before we consider run-end encoding. const RUN_END_THRESHOLD: u32 = 4; -/// Configuration for integer RLE compression. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct IntRLEConfig; - -impl rle::RLEConfig for IntRLEConfig { - type Stats = IntegerStats; - - const SCHEME_NAME: &'static str = "vortex.int.rle"; - - fn matches(canonical: &Canonical) -> bool { - is_integer_primitive(canonical) - } - - fn generate_stats(array: &ArrayRef) -> IntegerStats { - IntegerStats::generate(&array.to_primitive()) - } -} - -impl RLEStats for IntegerStats { - fn value_count(&self) -> u32 { - self.value_count() - } - - fn average_run_length(&self) -> u32 { - self.average_run_length() - } - - fn source(&self) -> &PrimitiveArray { - self.source() - } -} - -/// RLE scheme for integer compression. -pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new(); - impl Scheme for FoRScheme { fn scheme_name(&self) -> &'static str { "vortex.int.for" @@ -150,8 +113,8 @@ impl Scheme for FoRScheme { data: &mut ArrayAndStats, ctx: CompressorContext, ) -> VortexResult { - // FoR only subtracts the min. Without further compression (e.g. BitPacking), - // the output is the same size. + // FoR only subtracts the min. Without further compression (e.g. BitPacking), the output is + // the same size. if ctx.finished_cascading() { return Ok(0.0); } @@ -169,34 +132,35 @@ impl Scheme for FoRScheme { } // Difference between max and min. 
- let full_width: u32 = stats - .source() - .ptype() - .bit_width() - .try_into() - .vortex_expect("bit width must fit in u32"); - let for_bw = match stats.erased().max_minus_min().checked_ilog2() { + let for_bitwidth = match stats.erased().max_minus_min().checked_ilog2() { Some(l) => l + 1, - // If max-min == 0, we should use a different compression scheme as we don't want to - // bitpack down to 0 bits. + // If max-min == 0, then we should compress as a constant array. None => return Ok(0.0), }; - // If BitPacking could apply (non-negative values) and FOR doesn't reduce bit width - // compared to BitPacking, don't use FOR since it has overhead (storing reference). - // Only skip FOR when min >= 0, otherwise BitPacking can't apply directly. + // If BitPacking can be applied (only non-negative values) and FoR doesn't reduce bit width + // compared to BitPacking, don't use FoR since it has a small amount of overhead (storing + // the reference) for effectively no benefits. if let Some(max_log) = stats .erased() .max_ilog2() + // Only skip FoR when min >= 0, otherwise BitPacking can't be applied without ZigZag. .filter(|_| !stats.erased().min_is_negative()) { - let bitpack_bw = max_log + 1; - if for_bw >= bitpack_bw { + let bitpack_bitwidth = max_log + 1; + if for_bitwidth >= bitpack_bitwidth { return Ok(0.0); } } - Ok(full_width as f64 / for_bw as f64) + let full_width: u32 = stats + .source() + .ptype() + .bit_width() + .try_into() + .vortex_expect("bit width must fit in u32"); + + Ok(full_width as f64 / for_bitwidth as f64) } fn compress( @@ -217,11 +181,13 @@ impl Scheme for FoRScheme { let mut biased_data = ArrayAndStats::new(biased.into_array(), ctx.stats_options()); let compressed = BitPackingScheme.compress(compressor, &mut biased_data, leaf_ctx)?; + // TODO(connor): This should really be `new_unchecked`. 
let for_compressed = FoRArray::try_new(compressed, for_array.reference_scalar().clone())?; for_compressed .as_ref() .statistics() .inherit_from(for_array.as_ref().statistics()); + Ok(for_compressed.into_array()) } } @@ -240,9 +206,9 @@ impl Scheme for ZigZagScheme { 1 } - /// Container-style schemes (Dict, RunEnd, Sparse) restructure data and should only be - /// applied once in a cascade chain. ZigZag's output is a simple value transformation, - /// so further restructuring is wasteful. + /// ZigZag is a bijective value transform that preserves cardinality, run patterns, and value + /// dominance. If Dict, RunEnd, or Sparse lost on the original array, they will lose on ZigZag's + /// output too, so we skip evaluating them. fn descendant_exclusions(&self) -> Vec { vec![ DescendantExclusion { @@ -399,13 +365,27 @@ impl Scheme for SparseScheme { 2 } - /// Sparse values and indices are already low-cardinality by construction, so dictionary - /// encoding doesn't add anything. + /// Sparse indices (child 1) are monotonically increasing positions with all unique values. + /// Dict, RunEnd, RLE, and Sparse are all pointless on such data. 
fn descendant_exclusions(&self) -> Vec { - vec![DescendantExclusion { - excluded: IntDictScheme.id(), - children: ChildSelection::All, - }] + vec![ + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: RunEndScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: RLE_INTEGER_SCHEME.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: SparseScheme.id(), + children: ChildSelection::One(1), + }, + ] } fn expected_compression_ratio( @@ -414,11 +394,6 @@ impl Scheme for SparseScheme { data: &mut ArrayAndStats, _ctx: CompressorContext, ) -> VortexResult { - // We use `generate()` (not `generate_opts` with `count_distinct_values: true`) - // because the cache is pre-populated by `choose_and_compress` with the merged - // `stats_options` from all eligible schemes. Since this scheme declares - // `stats_options()` with `count_distinct_values: true`, the pre-populated stats - // will have distinct values computed. let stats = data.integer_stats(); if stats.value_count() == 0 { @@ -529,13 +504,27 @@ impl Scheme for RunEndScheme { 2 } - /// Run-end values and ends are already deduplicated, so dictionary encoding doesn't add - /// anything. + /// RunEnd ends (child 1) are monotonically increasing positions with all unique values. + /// Dict, RunEnd, RLE, and Sparse are all pointless on such data. 
fn descendant_exclusions(&self) -> Vec { - vec![DescendantExclusion { - excluded: IntDictScheme.id(), - children: ChildSelection::All, - }] + vec![ + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: RunEndScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: RLE_INTEGER_SCHEME.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: SparseScheme.id(), + children: ChildSelection::One(1), + }, + ] } /// Dict values (child 0) are all unique by definition, so run-end encoding them is @@ -620,7 +609,7 @@ impl Scheme for SequenceScheme { vec![ AncestorExclusion { ancestor: IntDictScheme.id(), - children: ChildSelection::All, + children: ChildSelection::One(1), }, AncestorExclusion { ancestor: FloatDictScheme.id(), @@ -746,8 +735,8 @@ mod tests { use vortex_sequence::Sequence; use vortex_sparse::Sparse; - use super::RLE_INTEGER_SCHEME; use crate::BtrBlocksCompressor; + use crate::schemes::rle::RLE_INTEGER_SCHEME; #[test] fn test_empty() -> VortexResult<()> { diff --git a/vortex-btrblocks/src/schemes/rle.rs b/vortex-btrblocks/src/schemes/rle.rs index da63bf58ca0..b301098eb4c 100644 --- a/vortex-btrblocks/src/schemes/rle.rs +++ b/vortex-btrblocks/src/schemes/rle.rs @@ -11,11 +11,15 @@ use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; use vortex_compressor::builtins::FloatDictScheme; use vortex_compressor::builtins::StringDictScheme; +use vortex_compressor::builtins::is_float_primitive; +use vortex_compressor::builtins::is_integer_primitive; use vortex_compressor::scheme::AncestorExclusion; use vortex_compressor::scheme::ChildSelection; use vortex_compressor::scheme::DescendantExclusion; #[cfg(feature = "unstable_encodings")] use vortex_compressor::scheme::SchemeId; +use vortex_compressor::stats::FloatStats; +use vortex_compressor::stats::IntegerStats; use vortex_error::VortexResult; use 
vortex_fastlanes::RLEArray; @@ -26,19 +30,24 @@ use crate::Scheme; use crate::SchemeExt; use crate::estimate_compression_ratio_with_sampling; use crate::schemes::integer::IntDictScheme; +use crate::schemes::integer::SparseScheme; /// Threshold for the average run length in an array before we consider run-length encoding. pub const RUN_LENGTH_THRESHOLD: u32 = 4; -/// Trait for accessing RLE-specific statistics. -pub trait RLEStats { - /// Returns the number of non-null values. - fn value_count(&self) -> u32; - /// Returns the average run length. - fn average_run_length(&self) -> u32; - /// Returns the underlying source array. - fn source(&self) -> &PrimitiveArray; -} +/// RLE scheme for integer compression. +pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new(); + +/// RLE scheme for float compression. +pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); + +/// Configuration for integer RLE compression. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntRLEConfig; + +/// Configuration for float RLE compression. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatRLEConfig; /// Configuration trait for RLE schemes. /// @@ -58,6 +67,72 @@ pub trait RLEConfig: Debug + Send + Sync + 'static { fn generate_stats(array: &ArrayRef) -> Self::Stats; } +impl RLEConfig for IntRLEConfig { + type Stats = IntegerStats; + + const SCHEME_NAME: &'static str = "vortex.int.rle"; + + fn matches(canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn generate_stats(array: &ArrayRef) -> IntegerStats { + IntegerStats::generate(&array.to_primitive()) + } +} + +impl RLEConfig for FloatRLEConfig { + type Stats = FloatStats; + + const SCHEME_NAME: &'static str = "vortex.float.rle"; + + fn matches(canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn generate_stats(array: &ArrayRef) -> FloatStats { + FloatStats::generate(&array.to_primitive()) + } +} + +/// Trait for accessing RLE-specific statistics. 
+pub trait RLEStats { + /// Returns the number of non-null values. + fn value_count(&self) -> u32; + /// Returns the average run length. + fn average_run_length(&self) -> u32; + /// Returns the underlying source array. + fn source(&self) -> &PrimitiveArray; +} + +impl RLEStats for IntegerStats { + fn value_count(&self) -> u32 { + self.value_count() + } + + fn average_run_length(&self) -> u32 { + self.average_run_length() + } + + fn source(&self) -> &PrimitiveArray { + self.source() + } +} + +impl RLEStats for FloatStats { + fn value_count(&self) -> u32 { + FloatStats::value_count(self) + } + + fn average_run_length(&self) -> u32 { + FloatStats::average_run_length(self) + } + + fn source(&self) -> &PrimitiveArray { + FloatStats::source(self) + } +} + /// RLE scheme that is generic over a configuration type. /// /// This is a ZST (zero-sized type) - all behavior is defined by the `RLEConfig` trait. @@ -91,14 +166,25 @@ impl Scheme for RLEScheme { 3 } - /// RLE indices (child 1) and offsets (child 2) are monotonically increasing, so dictionary - /// encoding is pointless. - /// Values (child 0) are not excluded since they may benefit from dict encoding. + /// RLE indices (child 1) and offsets (child 2) are monotonically increasing positions + /// with all unique values. Dict, RunEnd, and Sparse are all pointless on such data. + /// Self-exclusion already prevents RLE on RLE children. fn descendant_exclusions(&self) -> Vec { - vec![DescendantExclusion { - excluded: IntDictScheme.id(), - children: ChildSelection::Many(&[1, 2]), - }] + vec![ + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::Many(&[1, 2]), + }, + // TODO(connor): This is wrong for some reason? 
+ // DescendantExclusion { + // excluded: RunEndScheme.id(), + // children: ChildSelection::Many(&[1, 2]), + // }, + DescendantExclusion { + excluded: SparseScheme.id(), + children: ChildSelection::Many(&[1, 2]), + }, + ] } /// Dict values (child 0) are all unique by definition, so RLE is pointless on them.