From 41fcdeef65abff6f34917ae440f12521717aed0a Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 30 Jan 2026 17:36:05 +0000 Subject: [PATCH 01/10] perf[runend]: run end bool perf test Signed-off-by: Joe Isaacs --- encodings/runend/Cargo.toml | 4 + encodings/runend/benches/run_end_decode.rs | 110 +++++++ encodings/runend/src/compress.rs | 18 +- encodings/runend/src/decompress_bool.rs | 353 +++++++++++++++++++++ encodings/runend/src/lib.rs | 1 + 5 files changed, 469 insertions(+), 17 deletions(-) create mode 100644 encodings/runend/benches/run_end_decode.rs create mode 100644 encodings/runend/src/decompress_bool.rs diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml index c6823f16bd3..01a5b8d7a3e 100644 --- a/encodings/runend/Cargo.toml +++ b/encodings/runend/Cargo.toml @@ -48,3 +48,7 @@ harness = false [[bench]] name = "run_end_compress" harness = false + +[[bench]] +name = "run_end_decode" +harness = false diff --git a/encodings/runend/benches/run_end_decode.rs b/encodings/runend/benches/run_end_decode.rs new file mode 100644 index 00000000000..93aa5aa58f3 --- /dev/null +++ b/encodings/runend/benches/run_end_decode.rs @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] + +use divan::Bencher; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::compute::warm_up_vtables; +use vortex_array::validity::Validity; +use vortex_buffer::BitBuffer; +use vortex_buffer::BufferMut; +use vortex_runend::decompress_bool::runend_decode_bools; + +fn main() { + warm_up_vtables(); + divan::main(); +} + +/// Distribution types for bool benchmarks +#[derive(Clone, Copy)] +enum BoolDistribution { + /// Alternating true/false (50/50) + Alternating, + /// Mostly true (90% true runs) + MostlyTrue, + /// Mostly false (90% false runs) + MostlyFalse, + /// All true + AllTrue, + /// All false + 
AllFalse, +} + +/// Creates bool test data with configurable distribution +fn create_bool_test_data( + total_length: usize, + avg_run_length: usize, + distribution: BoolDistribution, +) -> (PrimitiveArray, BoolArray) { + let mut ends = BufferMut::::with_capacity(total_length / avg_run_length + 1); + let mut values = Vec::with_capacity(total_length / avg_run_length + 1); + + let mut pos = 0usize; + let mut run_index = 0usize; + + while pos < total_length { + let run_len = avg_run_length.min(total_length - pos); + pos += run_len; + ends.push(pos as u32); + + let val = match distribution { + BoolDistribution::Alternating => run_index % 2 == 0, + BoolDistribution::MostlyTrue => run_index % 10 != 0, // 90% true + BoolDistribution::MostlyFalse => run_index % 10 == 0, // 10% true (90% false) + BoolDistribution::AllTrue => true, + BoolDistribution::AllFalse => false, + }; + values.push(val); + run_index += 1; + } + + ( + PrimitiveArray::new(ends.freeze(), Validity::NonNullable), + BoolArray::from(BitBuffer::from(values)), + ) +} + +// Medium size: 10k elements with various run lengths +const BOOL_ARGS: &[(usize, usize)] = &[ + (10_000, 2), // Very short runs (5000 runs) + (10_000, 10), // Short runs (1000 runs) + (10_000, 100), // Medium runs (100 runs) + (10_000, 1000), // Long runs (10 runs) +]; + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool_alternating(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::Alternating); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool_mostly_true(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::MostlyTrue); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} + +#[divan::bench(args 
= BOOL_ARGS)] +fn decode_bool_mostly_false(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::MostlyFalse); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool_all_true(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::AllTrue); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool_all_false(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::AllFalse); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index ff3462d961c..b27afc1c5a6 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -188,23 +188,7 @@ pub fn runend_decode_primitive( })) } -pub fn runend_decode_bools( - ends: PrimitiveArray, - values: BoolArray, - offset: usize, - length: usize, -) -> VortexResult { - let validity_mask = values.validity_mask()?; - Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { - runend_decode_typed_bool( - trimmed_ends_iter(ends.as_slice::(), offset, length), - &values.to_bit_buffer(), - validity_mask, - values.dtype().nullability(), - length, - ) - })) -} +pub use crate::decompress_bool::runend_decode_bools; /// Decode a run-end encoded slice of values into a flat `Buffer` and `Validity`. 
/// diff --git a/encodings/runend/src/decompress_bool.rs b/encodings/runend/src/decompress_bool.rs new file mode 100644 index 00000000000..280b0d9f38f --- /dev/null +++ b/encodings/runend/src/decompress_bool.rs @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Optimized run-end decoding for boolean arrays. +//! +//! Uses an adaptive strategy that pre-fills the buffer with the majority value +//! (0s or 1s) and only fills the minority runs, minimizing work for skewed distributions. + +use itertools::Itertools; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::validity::Validity; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_dtype::Nullability; +use vortex_dtype::match_each_unsigned_integer_ptype; +use vortex_error::VortexResult; +use vortex_mask::Mask; + +use crate::iter::trimmed_ends_iter; + +/// Decodes run-end encoded boolean values into a flat `BoolArray`. +pub fn runend_decode_bools( + ends: PrimitiveArray, + values: BoolArray, + offset: usize, + length: usize, +) -> VortexResult { + let validity = values.validity_mask()?; + Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { + runend_decode_typed_bool( + trimmed_ends_iter(ends.as_slice::(), offset, length), + &values.to_bit_buffer(), + validity, + values.dtype().nullability(), + length, + ) + })) +} + +/// Fills bits in range [start, end) to true using byte-level operations. +/// Assumes the buffer is pre-initialized to all zeros. 
+#[inline(always)] +fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { + if start >= end { + return; + } + + let start_byte = start / 8; + let start_bit = start % 8; + let end_byte = end / 8; + let end_bit = end % 8; + + if start_byte == end_byte { + // All bits in same byte + // Use u16 to avoid overflow, then truncate (guaranteed to fit in u8 since max is 0xFF) + #[allow(clippy::cast_possible_truncation)] + let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; + slice[start_byte] |= mask << start_bit; + } else { + // First partial byte + if start_bit != 0 { + slice[start_byte] |= !((1u8 << start_bit) - 1); + } + + // Middle bytes (bulk memset to 0xFF) + let fill_start = if start_bit != 0 { + start_byte + 1 + } else { + start_byte + }; + if fill_start < end_byte { + slice[fill_start..end_byte].fill(0xFF); + } + + // Last partial byte + if end_bit != 0 { + slice[end_byte] |= (1u8 << end_bit) - 1; + } + } +} + +/// Clears bits in range [start, end) to false using byte-level operations. +/// Assumes the buffer is pre-initialized to all ones. 
+#[inline(always)] +fn fill_bits_false(slice: &mut [u8], start: usize, end: usize) { + if start >= end { + return; + } + + let start_byte = start / 8; + let start_bit = start % 8; + let end_byte = end / 8; + let end_bit = end % 8; + + if start_byte == end_byte { + // All bits in same byte - create mask with 0s in the range we want to clear + #[allow(clippy::cast_possible_truncation)] + let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; + slice[start_byte] &= !(mask << start_bit); + } else { + // First partial byte - clear high bits from start_bit + if start_bit != 0 { + slice[start_byte] &= (1u8 << start_bit) - 1; + } + + // Middle bytes (bulk memset to 0x00) + let fill_start = if start_bit != 0 { + start_byte + 1 + } else { + start_byte + }; + if fill_start < end_byte { + slice[fill_start..end_byte].fill(0x00); + } + + // Last partial byte - clear low bits up to end_bit + if end_bit != 0 { + slice[end_byte] &= !((1u8 << end_bit) - 1); + } + } +} + +/// Decodes run-end encoded boolean values using an adaptive strategy. +/// +/// The strategy counts true vs false runs and chooses the optimal approach: +/// - If more true runs: pre-fill with 1s, clear false runs +/// - If more false runs: pre-fill with 0s, fill true runs +/// +/// This minimizes work for skewed distributions (e.g., sparse validity masks). 
+pub fn runend_decode_typed_bool( + run_ends: impl Iterator, + values: &BitBuffer, + values_validity: Mask, + values_nullability: Nullability, + length: usize, +) -> BoolArray { + match values_validity { + Mask::AllTrue(_) => { + // Adaptive strategy: choose based on which value is more common + // If more runs have true values, pre-fill with 1s and clear false runs + // If more runs have false values, pre-fill with 0s and fill true runs + let true_count = values.true_count(); + let false_count = values.len() - true_count; + + if true_count > false_count { + // More true runs - pre-fill with 1s and clear false runs + let mut decoded = BitBufferMut::new_set(length); + let decoded_bytes = decoded.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq(values.iter()) { + // Only clear when value is false (true is already 1) + if end > current_pos && !value { + fill_bits_false(decoded_bytes, current_pos, end); + } + current_pos = end; + } + BoolArray::new(decoded.freeze(), values_nullability.into()) + } else { + // More or equal false runs - pre-fill with 0s and fill true runs + let mut decoded = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq(values.iter()) { + // Only fill when value is true (false is already 0) + if end > current_pos && value { + fill_bits_true(decoded_bytes, current_pos, end); + } + current_pos = end; + } + BoolArray::new(decoded.freeze(), values_nullability.into()) + } + } + Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), + Mask::Values(mask) => { + // For nullable values, adaptive strategy based on true count + // (counting only valid values as true) + let valid_true_count = values + .iter() + .zip(mask.bit_buffer().iter()) + .filter(|&(v, is_valid)| is_valid && v) + .count(); + let valid_false_count = values + .iter() + .zip(mask.bit_buffer().iter()) + .filter(|&(v, 
is_valid)| is_valid && !v) + .count(); + + if valid_true_count > valid_false_count { + // More true runs - pre-fill with 1s and clear false/null runs + let mut decoded = BitBufferMut::new_set(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq( + values + .iter() + .zip(mask.bit_buffer().iter()) + .map(|(v, is_valid)| is_valid.then_some(v)), + ) { + if end > current_pos { + match value { + None => { + // Null: clear decoded bits, validity stays false + fill_bits_false(decoded_bytes, current_pos, end); + } + Some(v) => { + // Valid: set validity bits to true + fill_bits_true(validity_bytes, current_pos, end); + // Clear decoded bits if value is false + if !v { + fill_bits_false(decoded_bytes, current_pos, end); + } + } + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) + } else { + // More or equal false runs - pre-fill with 0s and fill true runs + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq( + values + .iter() + .zip(mask.bit_buffer().iter()) + .map(|(v, is_valid)| is_valid.then_some(v)), + ) { + if end > current_pos { + match value { + None => { + // Validity stays false (already 0), decoded stays false + } + Some(v) => { + // Set validity bits to true + fill_bits_true(validity_bytes, current_pos, end); + // Set decoded bits if value is true + if v { + fill_bits_true(decoded_bytes, current_pos, end); + } + } + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) + } + } + } +} + +#[cfg(test)] +mod tests { + use 
vortex_array::arrays::BoolArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_buffer::BitBuffer; + use vortex_error::VortexResult; + + use super::runend_decode_bools; + + #[test] + fn decode_bools_alternating() -> VortexResult<()> { + // Alternating true/false: [T, T, F, F, F, T, T, T, T, T] + let ends = PrimitiveArray::from_iter([2u32, 5, 10]); + let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + true, true, false, false, false, true, true, true, true, true, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_mostly_true() -> VortexResult<()> { + // Mostly true: [T, T, T, T, T, F, T, T, T, T] - triggers true-heavy path + let ends = PrimitiveArray::from_iter([5u32, 6, 10]); + let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + true, true, true, true, true, false, true, true, true, true, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_mostly_false() -> VortexResult<()> { + // Mostly false: [F, F, F, F, F, T, F, F, F, F] - triggers false-heavy path + let ends = PrimitiveArray::from_iter([5u32, 6, 10]); + let values = BoolArray::from(BitBuffer::from(vec![false, true, false])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + false, false, false, false, false, true, false, false, false, false, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_all_true() -> VortexResult<()> { + // All true: single run + let ends = PrimitiveArray::from_iter([10u32]); + let values = BoolArray::from(BitBuffer::from(vec![true])); + let decoded = runend_decode_bools(ends, values, 0, 
10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + true, true, true, true, true, true, true, true, true, true, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_all_false() -> VortexResult<()> { + // All false: single run + let ends = PrimitiveArray::from_iter([10u32]); + let values = BoolArray::from(BitBuffer::from(vec![false])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + false, false, false, false, false, false, false, false, false, false, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_with_offset() -> VortexResult<()> { + // Test with offset: [T, T, F, F, F, T, T, T, T, T] -> slice [2..8] = [F, F, F, T, T, T] + let ends = PrimitiveArray::from_iter([2u32, 5, 10]); + let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); + let decoded = runend_decode_bools(ends, values, 2, 6)?; + + let expected = + BoolArray::from(BitBuffer::from(vec![false, false, false, true, true, true])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } +} diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs index bf289a23823..e83d77e287c 100644 --- a/encodings/runend/src/lib.rs +++ b/encodings/runend/src/lib.rs @@ -13,6 +13,7 @@ mod array; mod arrow; pub mod compress; mod compute; +pub mod decompress_bool; mod iter; mod kernel; mod ops; From 4294c4cc25cfe06ce43ff44885ad872e00db8fd3 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 2 Feb 2026 10:35:11 +0000 Subject: [PATCH 02/10] clean up Signed-off-by: Joe Isaacs --- encodings/runend/benches/run_end_decode.rs | 183 ++++++++++--- encodings/runend/src/array.rs | 2 +- encodings/runend/src/compute/compare.rs | 2 +- encodings/runend/src/decompress_bool.rs | 291 +++++++++++---------- 4 files changed, 295 insertions(+), 183 deletions(-) diff --git a/encodings/runend/benches/run_end_decode.rs b/encodings/runend/benches/run_end_decode.rs 
index 93aa5aa58f3..def6c41c7ad 100644 --- a/encodings/runend/benches/run_end_decode.rs +++ b/encodings/runend/benches/run_end_decode.rs @@ -3,6 +3,8 @@ #![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] +use std::fmt; + use divan::Bencher; use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; @@ -32,6 +34,35 @@ enum BoolDistribution { AllFalse, } +impl fmt::Display for BoolDistribution { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + BoolDistribution::Alternating => write!(f, "alternating"), + BoolDistribution::MostlyTrue => write!(f, "mostly_true"), + BoolDistribution::MostlyFalse => write!(f, "mostly_false"), + BoolDistribution::AllTrue => write!(f, "all_true"), + BoolDistribution::AllFalse => write!(f, "all_false"), + } + } +} + +#[derive(Clone, Copy)] +struct BoolBenchArgs { + total_length: usize, + avg_run_length: usize, + distribution: BoolDistribution, +} + +impl fmt::Display for BoolBenchArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}_{}_{}", + self.total_length, self.avg_run_length, self.distribution + ) + } +} + /// Creates bool test data with configurable distribution fn create_bool_test_data( total_length: usize, @@ -66,45 +97,121 @@ fn create_bool_test_data( ) } -// Medium size: 10k elements with various run lengths -const BOOL_ARGS: &[(usize, usize)] = &[ - (10_000, 2), // Very short runs (5000 runs) - (10_000, 10), // Short runs (1000 runs) - (10_000, 100), // Medium runs (100 runs) - (10_000, 1000), // Long runs (10 runs) +// Medium size: 10k elements with various run lengths and distributions +const BOOL_ARGS: &[BoolBenchArgs] = &[ + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::Alternating, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::Alternating, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: 
BoolDistribution::Alternating, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::Alternating, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::MostlyTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::MostlyTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::MostlyTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::MostlyFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::MostlyFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::MostlyFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::AllTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::AllTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::AllTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::AllTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::AllFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::AllFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::AllFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::AllFalse, + }, ]; 
#[divan::bench(args = BOOL_ARGS)] -fn decode_bool_alternating(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { - let (ends, values) = - create_bool_test_data(total_length, avg_run_length, BoolDistribution::Alternating); - bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); -} - -#[divan::bench(args = BOOL_ARGS)] -fn decode_bool_mostly_true(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { - let (ends, values) = - create_bool_test_data(total_length, avg_run_length, BoolDistribution::MostlyTrue); - bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); -} - -#[divan::bench(args = BOOL_ARGS)] -fn decode_bool_mostly_false(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { - let (ends, values) = - create_bool_test_data(total_length, avg_run_length, BoolDistribution::MostlyFalse); - bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); -} - -#[divan::bench(args = BOOL_ARGS)] -fn decode_bool_all_true(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { - let (ends, values) = - create_bool_test_data(total_length, avg_run_length, BoolDistribution::AllTrue); - bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); -} - -#[divan::bench(args = BOOL_ARGS)] -fn decode_bool_all_false(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { - let (ends, values) = - create_bool_test_data(total_length, avg_run_length, BoolDistribution::AllFalse); - bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +fn decode_bool(bencher: Bencher, args: BoolBenchArgs) { + let BoolBenchArgs { + total_length, + avg_run_length, + distribution, + } = args; + let (bools, indices) = create_bool_test_data(total_length, avg_run_length, distribution); + bencher + .with_inputs(|| (&bools, &indices)) + .bench_refs(|(ends, values)| { + runend_decode_bools(ends.clone(), 
values.clone(), 0, total_length) + }); } diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index f943b747cda..f0eac5d967e 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -39,10 +39,10 @@ use vortex_error::vortex_ensure; use vortex_error::vortex_panic; use vortex_session::VortexSession; -use crate::compress::runend_decode_bools; use crate::compress::runend_decode_primitive; use crate::compress::runend_decode_varbinview; use crate::compress::runend_encode; +use crate::decompress_bool::runend_decode_bools; use crate::kernel::PARENT_KERNELS; use crate::rules::RULES; diff --git a/encodings/runend/src/compute/compare.rs b/encodings/runend/src/compute/compare.rs index 0b02c474dc1..535d2c1d37a 100644 --- a/encodings/runend/src/compute/compare.rs +++ b/encodings/runend/src/compute/compare.rs @@ -16,7 +16,7 @@ use vortex_error::VortexResult; use crate::RunEndArray; use crate::RunEndVTable; -use crate::compress::runend_decode_bools; +use crate::decompress_bool::runend_decode_bools; impl CompareKernel for RunEndVTable { fn compare( diff --git a/encodings/runend/src/decompress_bool.rs b/encodings/runend/src/decompress_bool.rs index 280b0d9f38f..4df188d8b8f 100644 --- a/encodings/runend/src/decompress_bool.rs +++ b/encodings/runend/src/decompress_bool.rs @@ -38,6 +38,150 @@ pub fn runend_decode_bools( })) } +/// Decodes run-end encoded boolean values using an adaptive strategy. +/// +/// The strategy counts true vs false runs and chooses the optimal approach: +/// - If more true runs: pre-fill with 1s, clear false runs +/// - If more false runs: pre-fill with 0s, fill true runs +/// +/// This minimizes work for skewed distributions (e.g., sparse validity masks). 
+pub fn runend_decode_typed_bool( + run_ends: impl Iterator, + values: &BitBuffer, + values_validity: Mask, + values_nullability: Nullability, + length: usize, +) -> BoolArray { + match values_validity { + Mask::AllTrue(_) => decode_bool_non_nullable(run_ends, values, values_nullability, length), + Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), + Mask::Values(mask) => decode_bool_nullable(run_ends, values, mask.bit_buffer(), length), + } +} + +/// Decodes run-end encoded booleans when all values are valid (non-nullable). +fn decode_bool_non_nullable( + run_ends: impl Iterator, + values: &BitBuffer, + nullability: Nullability, + length: usize, +) -> BoolArray { + // Adaptive strategy: choose based on which value is more common + // If more runs have true values, pre-fill with 1s and clear false runs + // If more runs have false values, pre-fill with 0s and fill true runs + let true_count = values.true_count(); + let false_count = values.len() - true_count; + + if true_count > false_count { + // More true runs - pre-fill with 1s and clear false runs + let mut decoded = BitBufferMut::new_set(length); + let decoded_bytes = decoded.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq(values.iter()) { + // Only clear when value is false (true is already 1) + if end > current_pos && !value { + fill_bits_false(decoded_bytes, current_pos, end); + } + current_pos = end; + } + BoolArray::new(decoded.freeze(), nullability.into()) + } else { + // More or equal false runs - pre-fill with 0s and fill true runs + let mut decoded = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq(values.iter()) { + // Only fill when value is true (false is already 0) + if end > current_pos && value { + fill_bits_true(decoded_bytes, current_pos, end); + } + current_pos = end; + } + BoolArray::new(decoded.freeze(), 
nullability.into()) + } +} + +/// Decodes run-end encoded booleans when values may be null (nullable). +fn decode_bool_nullable( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let true_count = values.true_count(); + let false_count = values.len() - true_count; + + // Use true and false count as a proxy for valid true and false count. + if true_count > false_count { + // More true runs - pre-fill with 1s and clear false/null runs + let mut decoded = BitBufferMut::new_set(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq( + values + .iter() + .zip(validity_mask.iter()) + .map(|(v, is_valid)| is_valid.then_some(v)), + ) { + if end > current_pos { + match value { + None => { + // Null: clear decoded bits, validity stays false + fill_bits_false(decoded_bytes, current_pos, end); + } + Some(v) => { + // Valid: set validity bits to true + fill_bits_true(validity_bytes, current_pos, end); + // Clear decoded bits if value is false + if !v { + fill_bits_false(decoded_bytes, current_pos, end); + } + } + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) + } else { + // More or equal false runs - pre-fill with 0s and fill true runs + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq( + values + .iter() + .zip(validity_mask.iter()) + .map(|(v, is_valid)| is_valid.then_some(v)), + ) { + if end > current_pos { + match value { + None => { + // Validity stays false (already 0), decoded stays false + } + Some(v) => { + // Set validity 
bits to true + fill_bits_true(validity_bytes, current_pos, end); + // Set decoded bits if value is true + if v { + fill_bits_true(decoded_bytes, current_pos, end); + } + } + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) + } +} + /// Fills bits in range [start, end) to true using byte-level operations. /// Assumes the buffer is pre-initialized to all zeros. #[inline(always)] @@ -121,143 +265,6 @@ fn fill_bits_false(slice: &mut [u8], start: usize, end: usize) { } } -/// Decodes run-end encoded boolean values using an adaptive strategy. -/// -/// The strategy counts true vs false runs and chooses the optimal approach: -/// - If more true runs: pre-fill with 1s, clear false runs -/// - If more false runs: pre-fill with 0s, fill true runs -/// -/// This minimizes work for skewed distributions (e.g., sparse validity masks). -pub fn runend_decode_typed_bool( - run_ends: impl Iterator, - values: &BitBuffer, - values_validity: Mask, - values_nullability: Nullability, - length: usize, -) -> BoolArray { - match values_validity { - Mask::AllTrue(_) => { - // Adaptive strategy: choose based on which value is more common - // If more runs have true values, pre-fill with 1s and clear false runs - // If more runs have false values, pre-fill with 0s and fill true runs - let true_count = values.true_count(); - let false_count = values.len() - true_count; - - if true_count > false_count { - // More true runs - pre-fill with 1s and clear false runs - let mut decoded = BitBufferMut::new_set(length); - let decoded_bytes = decoded.as_mut_slice(); - let mut current_pos = 0usize; - - for (end, value) in run_ends.zip_eq(values.iter()) { - // Only clear when value is false (true is already 1) - if end > current_pos && !value { - fill_bits_false(decoded_bytes, current_pos, end); - } - current_pos = end; - } - BoolArray::new(decoded.freeze(), values_nullability.into()) - } else { - // More or equal false runs - pre-fill with 0s and 
fill true runs - let mut decoded = BitBufferMut::new_unset(length); - let decoded_bytes = decoded.as_mut_slice(); - let mut current_pos = 0usize; - - for (end, value) in run_ends.zip_eq(values.iter()) { - // Only fill when value is true (false is already 0) - if end > current_pos && value { - fill_bits_true(decoded_bytes, current_pos, end); - } - current_pos = end; - } - BoolArray::new(decoded.freeze(), values_nullability.into()) - } - } - Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), - Mask::Values(mask) => { - // For nullable values, adaptive strategy based on true count - // (counting only valid values as true) - let valid_true_count = values - .iter() - .zip(mask.bit_buffer().iter()) - .filter(|&(v, is_valid)| is_valid && v) - .count(); - let valid_false_count = values - .iter() - .zip(mask.bit_buffer().iter()) - .filter(|&(v, is_valid)| is_valid && !v) - .count(); - - if valid_true_count > valid_false_count { - // More true runs - pre-fill with 1s and clear false/null runs - let mut decoded = BitBufferMut::new_set(length); - let mut decoded_validity = BitBufferMut::new_unset(length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); - let mut current_pos = 0usize; - - for (end, value) in run_ends.zip_eq( - values - .iter() - .zip(mask.bit_buffer().iter()) - .map(|(v, is_valid)| is_valid.then_some(v)), - ) { - if end > current_pos { - match value { - None => { - // Null: clear decoded bits, validity stays false - fill_bits_false(decoded_bytes, current_pos, end); - } - Some(v) => { - // Valid: set validity bits to true - fill_bits_true(validity_bytes, current_pos, end); - // Clear decoded bits if value is false - if !v { - fill_bits_false(decoded_bytes, current_pos, end); - } - } - } - current_pos = end; - } - } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) - } else { - // More or equal false runs - pre-fill with 0s and fill true runs - 
let mut decoded = BitBufferMut::new_unset(length); - let mut decoded_validity = BitBufferMut::new_unset(length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); - let mut current_pos = 0usize; - - for (end, value) in run_ends.zip_eq( - values - .iter() - .zip(mask.bit_buffer().iter()) - .map(|(v, is_valid)| is_valid.then_some(v)), - ) { - if end > current_pos { - match value { - None => { - // Validity stays false (already 0), decoded stays false - } - Some(v) => { - // Set validity bits to true - fill_bits_true(validity_bytes, current_pos, end); - // Set decoded bits if value is true - if v { - fill_bits_true(decoded_bytes, current_pos, end); - } - } - } - current_pos = end; - } - } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) - } - } - } -} - #[cfg(test)] mod tests { use vortex_array::arrays::BoolArray; @@ -284,7 +291,7 @@ mod tests { #[test] fn decode_bools_mostly_true() -> VortexResult<()> { - // Mostly true: [T, T, T, T, T, F, T, T, T, T] - triggers true-heavy path + // Mostly true: [T, T, T, T, T, F, T, T, T, T] let ends = PrimitiveArray::from_iter([5u32, 6, 10]); let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); let decoded = runend_decode_bools(ends, values, 0, 10)?; @@ -298,7 +305,7 @@ mod tests { #[test] fn decode_bools_mostly_false() -> VortexResult<()> { - // Mostly false: [F, F, F, F, F, T, F, F, F, F] - triggers false-heavy path + // Mostly false: [F, F, F, F, F, T, F, F, F, F] let ends = PrimitiveArray::from_iter([5u32, 6, 10]); let values = BoolArray::from(BitBuffer::from(vec![false, true, false])); let decoded = runend_decode_bools(ends, values, 0, 10)?; @@ -311,8 +318,7 @@ mod tests { } #[test] - fn decode_bools_all_true() -> VortexResult<()> { - // All true: single run + fn decode_bools_all_true_single_run() -> VortexResult<()> { let ends = PrimitiveArray::from_iter([10u32]); let values = BoolArray::from(BitBuffer::from(vec![true])); let 
decoded = runend_decode_bools(ends, values, 0, 10)?; @@ -325,8 +331,7 @@ mod tests { } #[test] - fn decode_bools_all_false() -> VortexResult<()> { - // All false: single run + fn decode_bools_all_false_single_run() -> VortexResult<()> { let ends = PrimitiveArray::from_iter([10u32]); let values = BoolArray::from(BitBuffer::from(vec![false])); let decoded = runend_decode_bools(ends, values, 0, 10)?; From 887f6cc61631ec20a5a3f2573f4c65cdf9e6e9bc Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 2 Feb 2026 10:35:34 +0000 Subject: [PATCH 03/10] clean up Signed-off-by: Joe Isaacs --- encodings/runend/benches/run_end_decode.rs | 167 ++++++++++++++++++++- 1 file changed, 165 insertions(+), 2 deletions(-) diff --git a/encodings/runend/benches/run_end_decode.rs b/encodings/runend/benches/run_end_decode.rs index def6c41c7ad..06ceac4186d 100644 --- a/encodings/runend/benches/run_end_decode.rs +++ b/encodings/runend/benches/run_end_decode.rs @@ -208,9 +208,172 @@ fn decode_bool(bencher: Bencher, args: BoolBenchArgs) { avg_run_length, distribution, } = args; - let (bools, indices) = create_bool_test_data(total_length, avg_run_length, distribution); + let (ends, values) = create_bool_test_data(total_length, avg_run_length, distribution); bencher - .with_inputs(|| (&bools, &indices)) + .with_inputs(|| (ends.clone(), values.clone())) + .bench_refs(|(ends, values)| { + runend_decode_bools(ends.clone(), values.clone(), 0, total_length) + }); +} + +/// Validity distribution for nullable benchmarks +#[derive(Clone, Copy)] +enum ValidityDistribution { + /// 90% valid + MostlyValid, + /// 50% valid + HalfValid, + /// 10% valid + MostlyNull, +} + +impl fmt::Display for ValidityDistribution { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ValidityDistribution::MostlyValid => write!(f, "mostly_valid"), + ValidityDistribution::HalfValid => write!(f, "half_valid"), + ValidityDistribution::MostlyNull => write!(f, "mostly_null"), + } + } +} + +#[derive(Clone, 
Copy)] +struct NullableBoolBenchArgs { + total_length: usize, + avg_run_length: usize, + distribution: BoolDistribution, + validity: ValidityDistribution, +} + +impl fmt::Display for NullableBoolBenchArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}_{}_{}_{}", + self.total_length, self.avg_run_length, self.distribution, self.validity + ) + } +} + +/// Creates nullable bool test data with configurable distribution and validity +fn create_nullable_bool_test_data( + total_length: usize, + avg_run_length: usize, + distribution: BoolDistribution, + validity: ValidityDistribution, +) -> (PrimitiveArray, BoolArray) { + let mut ends = BufferMut::::with_capacity(total_length / avg_run_length + 1); + let mut values = Vec::with_capacity(total_length / avg_run_length + 1); + let mut validity_bits = Vec::with_capacity(total_length / avg_run_length + 1); + + let mut pos = 0usize; + let mut run_index = 0usize; + + while pos < total_length { + let run_len = avg_run_length.min(total_length - pos); + pos += run_len; + ends.push(pos as u32); + + let val = match distribution { + BoolDistribution::Alternating => run_index % 2 == 0, + BoolDistribution::MostlyTrue => run_index % 10 != 0, + BoolDistribution::MostlyFalse => run_index % 10 == 0, + BoolDistribution::AllTrue => true, + BoolDistribution::AllFalse => false, + }; + values.push(val); + + let is_valid = match validity { + ValidityDistribution::MostlyValid => run_index % 10 != 0, + ValidityDistribution::HalfValid => run_index % 2 == 0, + ValidityDistribution::MostlyNull => run_index % 10 == 0, + }; + validity_bits.push(is_valid); + + run_index += 1; + } + + ( + PrimitiveArray::new(ends.freeze(), Validity::NonNullable), + BoolArray::new( + BitBuffer::from(values), + Validity::from(BitBuffer::from(validity_bits)), + ), + ) +} + +const NULLABLE_BOOL_ARGS: &[NullableBoolBenchArgs] = &[ + // Alternating with different validity + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, 
+ distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::HalfValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyNull, + }, + // MostlyTrue with different validity + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyTrue, + validity: ValidityDistribution::MostlyValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyTrue, + validity: ValidityDistribution::HalfValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyTrue, + validity: ValidityDistribution::MostlyNull, + }, + // Different run lengths with MostlyValid + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyValid, + }, +]; + +#[divan::bench(args = NULLABLE_BOOL_ARGS)] +fn decode_bool_nullable(bencher: Bencher, args: NullableBoolBenchArgs) { + let NullableBoolBenchArgs { + total_length, + avg_run_length, + distribution, + validity, + } = args; + let (ends, values) = + create_nullable_bool_test_data(total_length, avg_run_length, distribution, validity); + bencher + .with_inputs(|| (ends.clone(), values.clone())) .bench_refs(|(ends, values)| { 
runend_decode_bools(ends.clone(), values.clone(), 0, total_length) }); From e57b338860fd7d4d516d1a83e25c66ccaa55e0f7 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 3 Feb 2026 13:46:26 +0000 Subject: [PATCH 04/10] wip Signed-off-by: Joe Isaacs --- encodings/runend/PERF_NOTES.md | 521 +++++++++++++++++++++ encodings/runend/benches/run_end_decode.rs | 55 +++ encodings/runend/src/decompress_bool.rs | 361 +++++++++++--- 3 files changed, 864 insertions(+), 73 deletions(-) create mode 100644 encodings/runend/PERF_NOTES.md diff --git a/encodings/runend/PERF_NOTES.md b/encodings/runend/PERF_NOTES.md new file mode 100644 index 00000000000..151fbb611dd --- /dev/null +++ b/encodings/runend/PERF_NOTES.md @@ -0,0 +1,521 @@ +# Run-End Boolean Decoding Performance Notes + +## Overview + +This document captures the state of performance optimization work on `decompress_bool.rs` for run-end encoded boolean arrays. + +## Problem Statement + +The original benchmark comparison showed the new implementation was slower for the 1000 run length case (only 10 runs): + +``` +10000_1000_alternating_mostly_valid: develop 401 ns, new 714 ns, 0.56x slower +``` + +## Root Cause Analysis + +### Benchmark Unfairness + +The baseline benchmark (`decode_bool_nullable_develop`) and new implementation (`decode_bool_nullable`) measure different things: + +**New implementation (what gets timed):** +```rust +bencher + .with_inputs(|| (ends.clone(), values.clone())) // Setup: just clone + .bench_refs(|(ends, values)| { + // TIMED: extraction + decode + runend_decode_bools(ends.clone(), values.clone(), 0, total_length) + }); +``` + +Inside `runend_decode_bools` (all timed): +1. `values.validity_mask()?` - extract validity mask +2. `values.to_bit_buffer()` - extract bit buffer +3. `match_each_unsigned_integer_ptype!` - generic type dispatch +4. `trimmed_ends_iter()` - iterator with 3 chained `.map()` operations +5. 
Actual decode loop + +**Baseline (what gets timed):** +```rust +bencher + .with_inputs(|| { + // NOT TIMED: all extraction done here + let ends_slice: Vec = ends.as_slice::().to_vec(); + let values_buf = values.to_bit_buffer(); + let validity_buf = values.validity_mask().unwrap(); + let validity_bits = match validity_buf { ... }; + (ends_slice, values_buf, validity_bits) + }) + .bench_refs(|(ends, values, validity)| { + // TIMED: only the decode loop with pre-extracted data + decode_bool_nullable_baseline(ends, values, validity, total_length) + }); +``` + +**Key insight:** The baseline excludes ~150ns of extraction overhead from timing. + +### Overhead Sources for Few Runs + +For 10 runs (1000 run length), the overhead dominates: + +1. **`trimmed_ends_iter`** - 3 chained `.map()` per element: + - `v - offset_e` (subtract offset) + - `min(v, length_e)` (clamp to length) + - `v.as_()` (convert to usize) + +2. **Array method calls:** + - `values.validity_mask()?` + - `values.to_bit_buffer()` + - `ends.as_slice::()` + +3. **Generic dispatch:** `match_each_unsigned_integer_ptype!` macro expansion + +## Optimizations Implemented + +### 1. Fast Path for Few Runs with No Offset + +Added `decode_few_runs_no_offset()` function that: +- Bypasses `trimmed_ends_iter` iterator chain +- Uses direct slice iteration: `for (i, &end) in ends.iter().enumerate()` +- Triggered when `offset == 0 && num_runs < PREFILL_RUN_THRESHOLD` (32) + +```rust +// In runend_decode_bools(): +if offset == 0 && num_runs < PREFILL_RUN_THRESHOLD { + return Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { + decode_few_runs_no_offset( + ends.as_slice::(), + &values_buf, + validity, + nullability, + length, + ) + })); +} +``` + +### 2. 
Optimized Nullable Fast Path with fill_bits + +For nullable decoding in the fast path, uses `fill_bits_true`/`fill_bits_false` instead of `append_n`: + +```rust +Mask::Values(mask) => { + let validity_buf = mask.bit_buffer(); + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut prev_end = 0usize; + for (i, &end) in ends.iter().enumerate() { + let end = end.as_().min(length); + if end > prev_end { + let is_valid = validity_buf.value(i); + if is_valid { + fill_bits_true(validity_bytes, prev_end, end); + if values.value(i) { + fill_bits_true(decoded_bytes, prev_end, end); + } + } + } + prev_end = end; + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} +``` + +## Current Benchmark Results + +### Nullable Cases + +| Benchmark | New | Baseline | Speedup | +|-----------|-----|----------|---------| +| 10000_2_alternating_mostly_valid | 12.2 µs | 42.6 µs | **3.5x** | +| 10000_10_alternating_mostly_valid | 3.6 µs | 13.1 µs | **3.6x** | +| 10000_10_alternating_mostly_null | 2.8 µs | 12.1 µs | **4.3x** | +| 10000_10_mostly_true_mostly_valid | 3.0 µs | 11.8 µs | **3.9x** | +| 10000_100_alternating_mostly_valid | 0.90 µs | 2.27 µs | **2.5x** | +| 10000_1000_alternating_mostly_valid | 0.48 µs | 0.32 µs | **0.67x** (1.5x slower) | + +### Non-Nullable Cases (1000 run length) + +| Benchmark | Time | +|-----------|------| +| 10000_1000_all_false | ~191-200 ns | +| 10000_1000_all_true | ~191-202 ns | +| 10000_1000_alternating | ~194-201 ns | +| 10000_1000_mostly_false | ~192-199 ns | +| 10000_1000_mostly_true | ~192-201 ns | + +Non-nullable fast path is very efficient. 
+ +## Progress + +- **Before optimizations:** 0.56x (1.8x slower) for 1000 run length nullable +- **After optimizations:** 0.67x (1.5x slower) for 1000 run length nullable +- **Remaining gap:** ~150ns extraction overhead + +## Remaining Work + +### Option 1: Fix the Benchmark (Recommended) + +Make the benchmark fair by including extraction in the baseline timing: + +```rust +#[divan::bench(args = NULLABLE_BOOL_ARGS)] +fn decode_bool_nullable_develop_fair(bencher: Bencher, args: NullableBoolBenchArgs) { + let (ends, values) = create_nullable_bool_test_data(...); + bencher + .with_inputs(|| (ends.clone(), values.clone())) + .bench_refs(|(ends, values)| { + // Now timing extraction too + let ends_slice: Vec = ends.as_slice::().to_vec(); + let values_buf = values.to_bit_buffer(); + let validity_buf = values.validity_mask().unwrap(); + let validity_bits = match validity_buf { + vortex_mask::Mask::Values(m) => m.bit_buffer().clone(), + _ => BitBuffer::new_set(values.len()), + }; + decode_bool_nullable_baseline(&ends_slice, &values_buf, &validity_bits, total_length) + }); +} +``` + +### Option 2: Lower-Level API + +Add a public function that takes pre-extracted data for users who want maximum performance and are willing to manage extraction themselves: + +```rust +pub fn runend_decode_bools_from_slices( + ends: &[E], + values: &BitBuffer, + validity: &BitBuffer, // or Option<&BitBuffer> + length: usize, +) -> BoolArray +``` + +### Option 3: Reduce Extraction Overhead + +Investigate ways to make `validity_mask()` and `to_bit_buffer()` cheaper: +- Caching +- Avoiding allocations +- Direct field access if possible + +## Files Changed + +- `encodings/runend/src/decompress_bool.rs`: + - Added `PREFILL_RUN_THRESHOLD` constant at module level + - Added `decode_few_runs_no_offset()` function + - Modified `runend_decode_bools()` to use fast path + - Added tests: `decode_bools_nullable`, `decode_bools_nullable_few_runs` + +## Tests + +All tests pass: +``` +running 8 tests +test 
decompress_bool::tests::decode_bools_all_false_single_run ... ok +test decompress_bool::tests::decode_bools_all_true_single_run ... ok +test decompress_bool::tests::decode_bools_alternating ... ok +test decompress_bool::tests::decode_bools_mostly_false ... ok +test decompress_bool::tests::decode_bools_mostly_true ... ok +test decompress_bool::tests::decode_bools_nullable ... ok +test decompress_bool::tests::decode_bools_nullable_few_runs ... ok +test decompress_bool::tests::decode_bools_with_offset ... ok +``` + +## Code Locations + +- Implementation: `encodings/runend/src/decompress_bool.rs` +- Benchmarks: `encodings/runend/benches/run_end_decode.rs` +- Iterator helper: `encodings/runend/src/iter.rs` (`trimmed_ends_iter`) + +## Investigation: fill_bits Performance (2025-02-02) + +### Hypothesis + +The `fill_bits_true`/`fill_bits_false` functions might be slow and could benefit from using u64 instead of u8 for the middle byte fill. + +### Benchmark Results + +Added benchmarks comparing byte-level (u8) vs word-level (u64) fill implementations: + +| Range (bits) | Offset | u8 `.fill()` | u64 manual | Winner | +|--------------|--------|--------------|------------|--------| +| 10 | 0 | ~2.1ns | ~2.6ns | **u8** | +| 10 | 3 | ~1.1ns | ~1.2ns | ~same | +| 100 | 0 | ~4.1ns | ~6.5ns | **u8** | +| 100 | 5 | ~3.9ns | ~8.5ns | **u8 (2x)** | +| 1000 | 0 | ~2.4ns | ~6.7ns | **u8 (3x)** | +| 1000 | 7 | ~3.0ns | ~11ns | **u8 (4x)** | +| 5000 | 0 | ~9.7ns | ~9.8ns | ~same | +| 5000 | 1 | ~10ns | ~13ns | **u8** | + +### Conclusion + +**The fill functions are NOT the bottleneck.** The `.fill()` method is already highly optimized by LLVM - it generates vectorized memset-like code internally. The manual u64 approach adds overhead from: +1. Alignment checking (`align_offset`) +2. Extra branches for prefix/suffix handling +3. Unsafe pointer casts + +The fill operations only take ~2-10ns, while the full decode takes ~200-700ns. The overhead comes from elsewhere. 
+ +### What IS the bottleneck? + +For the 1000 run length nullable case: +- Baseline (pre-extracted data): ~320ns +- New implementation (includes extraction): ~480ns +- Difference: ~160ns + +The overhead sources are: +1. **Extraction calls** (~150ns): + - `values.validity_mask()?` + - `values.to_bit_buffer()` + - `ends.as_slice::()` + +2. **Iterator chain** (for non-fast-path cases): + - `trimmed_ends_iter` with 3 chained `.map()` operations + +### Next Steps + +1. **Profile the extraction methods** - understand what makes `validity_mask()` and `to_bit_buffer()` expensive +2. **Consider caching** - if these methods are called frequently, cache results +3. **Accept the tradeoff** - the extraction overhead is necessary for a clean API; users who need maximum performance can use the lower-level functions directly + +## Optimization: validity_mask() Fast Path (2025-02-02) + +### Change + +Added a fast path in `validity_mask()` (in `vortex-array/src/compute/filter.rs`) to avoid the expensive `fill_null()` call when the validity array is already a non-nullable BoolArray. 
+ +### Extraction Benchmark Results (After) + +| Operation | Before | After | Improvement | +|-----------|--------|-------|-------------| +| `validity_mask()` | ~150-166ns | ~98-102ns | **~40% faster** | +| All combined | ~195-208ns | ~127-135ns | **~35% faster** | + +### Full Decode Benchmark Results (After) + +| Benchmark | New | Baseline | Speedup | +|-----------|-----|----------|---------| +| 10000_2_alternating_mostly_valid | 14.3 µs | 49.9 µs | **3.5x faster** | +| 10000_10_alternating_mostly_valid | 4.0 µs | 15.3 µs | **3.8x faster** | +| 10000_100_alternating_mostly_valid | 922 ns | 2.6 µs | **2.8x faster** | +| 10000_1000_alternating_mostly_valid | 446 ns | 376 ns | 1.2x slower | + +### Summary + +The new implementation is now: +- **2.8x-3.8x faster** for typical cases (many runs) +- **~1.2x slower** only for the edge case with very few runs (10 runs at 1000 run length) + +The remaining ~70ns gap in the 1000 run length case comes from: +1. Remaining extraction overhead (~50ns for validity_mask) +2. Iterator/function call overhead + +This is an acceptable tradeoff since: +1. The few-runs case is already very fast (~446ns) +2. The common case (many runs) is significantly faster +3. Further optimization would require invasive changes to the core API + +## Experiment: u64 Fill in decompress_bool.rs (2025-02-02) + +### Hypothesis + +Using u64 writes instead of byte-level `.fill()` for the middle portion of `fill_bits_true`/`fill_bits_false` might improve performance. + +### Implementation + +Modified `fill_bits_true`/`fill_bits_false` to use a `fill_bytes_u64` helper that: +1. Handles unaligned prefix bytes +2. Writes aligned u64s for the middle +3. Handles suffix bytes + +### Result + +**No improvement.** The u64 approach was about the same speed or slightly slower: +- Nullable 1000 run: ~458-498ns (vs ~374-446ns with byte fill) + +### Why + +1. **LLVM already optimizes `.fill()`** - It generates vectorized SIMD code for slice fills +2. 
**Overhead** - Alignment checking and branching add overhead that outweighs any benefit
+3. **Small runs** - For small byte ranges, the u64 approach has more overhead
+
+### Conclusion
+
+Keep the simple byte-level `.fill()` implementation. It's already optimal.
+
+## Ablation Study: Which Optimizations Matter? (2026-02-02)
+
+Tested three strategies:
+1. **Sequential** - append_n for each run (no prefill)
+2. **Prefill zeros** - prefill buffer with 0s, fill true runs
+3. **Adaptive** - choose prefill value based on majority
+
+### Results
+
+| Scenario | Sequential | Prefill 0s | Adaptive | Best |
+|----------|------------|------------|----------|------|
+| 10 runs, alternating | 120ns | 77ns | 125ns | prefill |
+| 10 runs, mostly_true | 121ns | 86ns | 106ns | prefill |
+| 32 runs, alternating | 752ns | 187ns | 294ns | prefill |
+| 32 runs, mostly_true | 492ns | 463ns | 159ns | **adaptive** |
+| 100 runs, alternating | 1.06µs | 323ns | 484ns | prefill |
+| 100 runs, mostly_true | 1.08µs | 948ns | 166ns | **adaptive** |
+| 1000 runs, alternating | 6.3µs | 1.5µs | 1.4µs | ~same |
+| 1000 runs, mostly_true | 5.8µs | 2.2µs | 828ns | **adaptive** |
+
+### Conclusions
+
+1. **Prefill vs Sequential**: Prefill is **always faster** for many runs
+   - 10 runs: 1.5x faster
+   - 100 runs: 3x faster
+   - 1000 runs: **4x faster**
+
+2. **Adaptive prefill**: Critical for **skewed distributions** (common in real data)
+   - Alternating (50/50): prefill_zeros is same or slightly better
+   - Mostly_true (90%): adaptive is **2-3x faster**
+
+Both optimizations are justified and should be kept.
+ +## Final Implementation Architecture + +### Entry Point: `runend_decode_bools` + +```rust +pub fn runend_decode_bools( + ends: PrimitiveArray, + values: BoolArray, + offset: usize, + length: usize, +) -> VortexResult +``` + +### Decision Tree + +``` +runend_decode_bools +├── Extract: validity_mask(), to_bit_buffer() +├── IF offset == 0 && num_runs < 32: +│ └── decode_few_runs_no_offset ← Fast path, no iterator +└── ELSE: + └── runend_decode_typed_bool ← Uses trimmed_ends_iter + ├── Mask::AllTrue → decode_bool_non_nullable + │ ├── IF num_runs < 32: sequential append_n + │ └── ELSE: adaptive prefill + │ ├── more true → prefill 1s, clear false runs + │ └── more false → prefill 0s, fill true runs + ├── Mask::AllFalse → return all-invalid array + └── Mask::Values → decode_bool_nullable + ├── IF num_runs < 32: sequential append + └── ELSE: 4 variants based on majority: + ├── (true, valid) → prefill decoded=1, validity=1 + ├── (true, null) → prefill decoded=1, validity=0 + ├── (false, valid) → prefill decoded=0, validity=1 + └── (false, null) → prefill decoded=0, validity=0 +``` + +### Key Difference: `decode_few_runs_no_offset` vs `runend_decode_typed_bool` + +| Aspect | `decode_few_runs_no_offset` | `runend_decode_typed_bool` | +|--------|----------------------------|---------------------------| +| Offset handling | Assumes `offset == 0` | Handles any offset | +| Iterator | Direct slice: `for (i, &end) in ends.iter()` | `trimmed_ends_iter` with 3 `.map()` chains | +| Overhead | Minimal | ~20-30ns iterator overhead | +| When used | `offset == 0 && num_runs < 32` | All other cases | + +### `trimmed_ends_iter` Details + +```rust +run_ends.iter() + .map(|v| v - offset_e) // subtract offset (redundant when offset=0) + .map(|v| min(v, length_e)) // clamp to length + .map(|v| v.as_()) // convert to usize +``` + +For 10 runs, these 3 chained closures add measurable overhead. For 1000 runs, it's amortized. 
+ +### Threshold: PREFILL_RUN_THRESHOLD = 32 + +Below 32 runs: +- Iterator overhead dominates +- Sequential `append_n` is competitive with prefill +- Use direct slice access, avoid iterator chain + +Above 32 runs: +- Prefill + fill_bits is 3-4x faster than sequential +- Adaptive selection matters for skewed data +- Iterator overhead is negligible + +## `fill_bits_true` / `fill_bits_false` Implementation + +```rust +fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { + // Handle same-byte case + if start_byte == end_byte { + let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; + slice[start_byte] |= mask << start_bit; + } else { + // First partial byte + if start_bit != 0 { + slice[start_byte] |= !((1u8 << start_bit) - 1); + } + // Middle bytes - LLVM optimizes to SIMD + slice[fill_start..end_byte].fill(0xFF); + // Last partial byte + if end_bit != 0 { + slice[end_byte] |= (1u8 << end_bit) - 1; + } + } +} +``` + +Key insight: `.fill()` is already vectorized by LLVM. Manual u64 approach adds overhead without benefit. + +## External Optimization: `validity_mask()` Fast Path + +In `vortex-array/src/compute/filter.rs`: + +```rust +// Added fast path for non-nullable canonical bool arrays +if !self.dtype().is_nullable() && self.is_canonical() { + return Ok(Mask::from_buffer(self.to_bool().to_bit_buffer())); +} +``` + +This avoids the expensive `fill_null()` call when the validity array is already a non-nullable BoolArray (common case). 
+ +## Final Performance Summary + +### vs Baseline (pre-extracted data) + +| Scenario | New Impl | Baseline | Result | +|----------|----------|----------|--------| +| Many runs (10-100) | 3-4 µs | 13-15 µs | **3-4x faster** | +| Medium runs (100) | 800-900 ns | 2.6 µs | **2.8x faster** | +| Few runs (10 @ 1000 len) | 380-450 ns | 320-376 ns | ~1.2x slower | + +### Absolute Performance (non-nullable, 10K elements) + +| Runs | Time | Throughput | +|------|------|------------| +| 10 | ~200 ns | 50M elements/sec | +| 100 | ~350 ns | 28M elements/sec | +| 1000 | ~1.4 µs | 7M elements/sec | + +## Files Modified + +1. **`encodings/runend/src/decompress_bool.rs`** + - Full implementation with all optimizations + - ~430 lines including tests + +2. **`encodings/runend/benches/run_end_decode.rs`** + - Added baseline comparison benchmark + - ~435 lines + +3. **`vortex-array/src/compute/filter.rs`** + - Added 4-line fast path for `validity_mask()` + +4. **`encodings/runend/PERF_NOTES.md`** + - This file - full documentation of investigation diff --git a/encodings/runend/benches/run_end_decode.rs b/encodings/runend/benches/run_end_decode.rs index 06ceac4186d..f08ebe9a733 100644 --- a/encodings/runend/benches/run_end_decode.rs +++ b/encodings/runend/benches/run_end_decode.rs @@ -11,6 +11,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::compute::warm_up_vtables; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; use vortex_buffer::BufferMut; use vortex_runend::decompress_bool::runend_decode_bools; @@ -378,3 +379,57 @@ fn decode_bool_nullable(bencher: Bencher, args: NullableBoolBenchArgs) { runend_decode_bools(ends.clone(), values.clone(), 0, total_length) }); } + +/// Baseline using develop branch's append_n approach +fn decode_bool_nullable_baseline( + ends: &[u32], + values: &BitBuffer, + validity_mask: &BitBuffer, + _length: usize, +) -> BoolArray { + let mut decoded = 
BitBufferMut::with_capacity(ends.last().copied().unwrap_or(0) as usize); + let mut decoded_validity = + BitBufferMut::with_capacity(ends.last().copied().unwrap_or(0) as usize); + + let mut prev_end = 0usize; + for ((&end, value), is_valid) in ends.iter().zip(values.iter()).zip(validity_mask.iter()) { + let end = end as usize; + if is_valid { + decoded_validity.append_n(true, end - prev_end); + decoded.append_n(value, end - prev_end); + } else { + decoded_validity.append_n(false, end - prev_end); + decoded.append_n(false, end - prev_end); + } + prev_end = end; + } + + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +#[divan::bench(args = NULLABLE_BOOL_ARGS)] +fn decode_bool_nullable_develop(bencher: Bencher, args: NullableBoolBenchArgs) { + let NullableBoolBenchArgs { + total_length, + avg_run_length, + distribution, + validity, + } = args; + let (ends, values) = + create_nullable_bool_test_data(total_length, avg_run_length, distribution, validity); + + bencher + .with_inputs(|| { + let ends_slice: Vec = ends.as_slice::().to_vec(); + let values_buf = values.to_bit_buffer(); + let validity_buf = values.validity_mask().unwrap(); + let validity_bits = match validity_buf { + vortex_mask::Mask::Values(m) => m.bit_buffer().clone(), + _ => BitBuffer::new_set(values.len()), + }; + (ends_slice, values_buf, validity_bits) + }) + .bench_refs(|(ends, values, validity)| { + decode_bool_nullable_baseline(ends, values, validity, total_length) + }); +} diff --git a/encodings/runend/src/decompress_bool.rs b/encodings/runend/src/decompress_bool.rs index 4df188d8b8f..9d872f9a6d8 100644 --- a/encodings/runend/src/decompress_bool.rs +++ b/encodings/runend/src/decompress_bool.rs @@ -19,6 +19,10 @@ use vortex_mask::Mask; use crate::iter::trimmed_ends_iter; +/// Threshold for number of runs below which we use sequential append instead of prefill. +/// With few runs, the overhead of prefilling the entire buffer dominates. 
+const PREFILL_RUN_THRESHOLD: usize = 32; + /// Decodes run-end encoded boolean values into a flat `BoolArray`. pub fn runend_decode_bools( ends: PrimitiveArray, @@ -27,12 +31,29 @@ pub fn runend_decode_bools( length: usize, ) -> VortexResult { let validity = values.validity_mask()?; + let values_buf = values.to_bit_buffer(); + let nullability = values.dtype().nullability(); + + // Fast path for few runs with no offset - avoids iterator overhead + let num_runs = values_buf.len(); + if offset == 0 && num_runs < PREFILL_RUN_THRESHOLD { + return Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { + decode_few_runs_no_offset( + ends.as_slice::(), + &values_buf, + validity, + nullability, + length, + ) + })); + } + Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { runend_decode_typed_bool( trimmed_ends_iter(ends.as_slice::(), offset, length), - &values.to_bit_buffer(), + &values_buf, validity, - values.dtype().nullability(), + nullability, length, ) })) @@ -59,6 +80,55 @@ pub fn runend_decode_typed_bool( } } +/// Fast path for few runs with no offset. Uses direct slice access to minimize overhead. +/// This avoids the `trimmed_ends_iter` iterator chain which adds significant overhead +/// for small numbers of runs. 
+#[inline(always)] +fn decode_few_runs_no_offset( + ends: &[E], + values: &BitBuffer, + validity: Mask, + nullability: Nullability, + length: usize, +) -> BoolArray { + match validity { + Mask::AllTrue(_) => { + let mut decoded = BitBufferMut::with_capacity(length); + let mut prev_end = 0usize; + for (i, &end) in ends.iter().enumerate() { + let end = end.as_().min(length); + decoded.append_n(values.value(i), end - prev_end); + prev_end = end; + } + BoolArray::new(decoded.freeze(), nullability.into()) + } + Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), + Mask::Values(mask) => { + let validity_buf = mask.bit_buffer(); + // Use prefill + fill_bits for better performance with larger runs + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut prev_end = 0usize; + for (i, &end) in ends.iter().enumerate() { + let end = end.as_().min(length); + if end > prev_end { + let is_valid = validity_buf.value(i); + if is_valid { + fill_bits_true(validity_bytes, prev_end, end); + if values.value(i) { + fill_bits_true(decoded_bytes, prev_end, end); + } + } + } + prev_end = end; + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) + } + } +} + /// Decodes run-end encoded booleans when all values are valid (non-nullable). 
fn decode_bool_non_nullable( run_ends: impl Iterator, @@ -66,11 +136,20 @@ fn decode_bool_non_nullable( nullability: Nullability, length: usize, ) -> BoolArray { + let num_runs = values.len(); + + // For few runs, sequential append is faster than prefill + modify + if num_runs < PREFILL_RUN_THRESHOLD { + let mut decoded = BitBufferMut::with_capacity(length); + for (end, value) in run_ends.zip(values.iter()) { + decoded.append_n(value, end - decoded.len()); + } + return BoolArray::new(decoded.freeze(), nullability.into()); + } + // Adaptive strategy: choose based on which value is more common - // If more runs have true values, pre-fill with 1s and clear false runs - // If more runs have false values, pre-fill with 0s and fill true runs let true_count = values.true_count(); - let false_count = values.len() - true_count; + let false_count = num_runs - true_count; if true_count > false_count { // More true runs - pre-fill with 1s and clear false runs @@ -110,79 +189,168 @@ fn decode_bool_nullable( validity_mask: &BitBuffer, length: usize, ) -> BoolArray { + let num_runs = values.len(); + + // For few runs, sequential append is faster than prefill + modify + if num_runs < PREFILL_RUN_THRESHOLD { + return decode_nullable_sequential(run_ends, values, validity_mask, length); + } + let true_count = values.true_count(); - let false_count = values.len() - true_count; + let false_count = num_runs - true_count; + let valid_count = validity_mask.true_count(); + let null_count = num_runs - valid_count; + + let prefill_true = true_count > false_count; + let prefill_valid = valid_count > null_count; + + match (prefill_true, prefill_valid) { + (true, true) => decode_nullable_true_valid(run_ends, values, validity_mask, length), + (true, false) => decode_nullable_true_null(run_ends, values, validity_mask, length), + (false, true) => decode_nullable_false_valid(run_ends, values, validity_mask, length), + (false, false) => decode_nullable_false_null(run_ends, values, validity_mask, 
length), + } +} - // Use true and false count as a proxy for valid true and false count. - if true_count > false_count { - // More true runs - pre-fill with 1s and clear false/null runs - let mut decoded = BitBufferMut::new_set(length); - let mut decoded_validity = BitBufferMut::new_unset(length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); - let mut current_pos = 0usize; +/// Sequential decode for few runs - avoids prefill overhead. +#[inline(always)] +fn decode_nullable_sequential( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::with_capacity(length); + let mut decoded_validity = BitBufferMut::with_capacity(length); + + for (end, (value, is_valid)) in run_ends.zip(values.iter().zip(validity_mask.iter())) { + let run_len = end - decoded.len(); + if is_valid { + decoded_validity.append_n(true, run_len); + decoded.append_n(value, run_len); + } else { + decoded_validity.append_n(false, run_len); + decoded.append_n(false, run_len); + } + } - for (end, value) in run_ends.zip_eq( - values - .iter() - .zip(validity_mask.iter()) - .map(|(v, is_valid)| is_valid.then_some(v)), - ) { - if end > current_pos { - match value { - None => { - // Null: clear decoded bits, validity stays false - fill_bits_false(decoded_bytes, current_pos, end); - } - Some(v) => { - // Valid: set validity bits to true - fill_bits_true(validity_bytes, current_pos, end); - // Clear decoded bits if value is false - if !v { - fill_bits_false(decoded_bytes, current_pos, end); - } - } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +/// Prefill decoded=1s, validity=1s. Clear for false values and nulls. 
+#[inline(always)] +fn decode_nullable_true_valid( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::new_set(length); + let mut decoded_validity = BitBufferMut::new_set(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { + if end > current_pos { + if !is_valid { + fill_bits_false(validity_bytes, current_pos, end); + fill_bits_false(decoded_bytes, current_pos, end); + } else if !value { + fill_bits_false(decoded_bytes, current_pos, end); + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +/// Prefill decoded=1s, validity=0s. Set validity for valid, clear decoded for false/null. +#[inline(always)] +fn decode_nullable_true_null( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::new_set(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { + if end > current_pos { + if is_valid { + fill_bits_true(validity_bytes, current_pos, end); + if !value { + fill_bits_false(decoded_bytes, current_pos, end); } - current_pos = end; + } else { + fill_bits_false(decoded_bytes, current_pos, end); } + current_pos = end; } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) - } else { - // More or equal false runs - pre-fill with 0s and fill true runs - let mut decoded = BitBufferMut::new_unset(length); - let mut decoded_validity = BitBufferMut::new_unset(length); - let decoded_bytes 
= decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); - let mut current_pos = 0usize; + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} - for (end, value) in run_ends.zip_eq( - values - .iter() - .zip(validity_mask.iter()) - .map(|(v, is_valid)| is_valid.then_some(v)), - ) { - if end > current_pos { - match value { - None => { - // Validity stays false (already 0), decoded stays false - } - Some(v) => { - // Set validity bits to true - fill_bits_true(validity_bytes, current_pos, end); - // Set decoded bits if value is true - if v { - fill_bits_true(decoded_bytes, current_pos, end); - } - } +/// Prefill decoded=0s, validity=1s. Clear validity for nulls, set decoded for true. +#[inline(always)] +fn decode_nullable_false_valid( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_set(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { + if end > current_pos { + if !is_valid { + fill_bits_false(validity_bytes, current_pos, end); + } else if value { + fill_bits_true(decoded_bytes, current_pos, end); + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +/// Prefill decoded=0s, validity=0s. Set validity and decoded for valid true values. 
+#[inline(always)] +fn decode_nullable_false_null( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { + if end > current_pos { + if is_valid { + fill_bits_true(validity_bytes, current_pos, end); + if value { + fill_bits_true(decoded_bytes, current_pos, end); } - current_pos = end; } + current_pos = end; } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) } -/// Fills bits in range [start, end) to true using byte-level operations. +/// Fills bits in range [start, end) to true. /// Assumes the buffer is pre-initialized to all zeros. #[inline(always)] fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { @@ -196,8 +364,6 @@ fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { let end_bit = end % 8; if start_byte == end_byte { - // All bits in same byte - // Use u16 to avoid overflow, then truncate (guaranteed to fit in u8 since max is 0xFF) #[allow(clippy::cast_possible_truncation)] let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; slice[start_byte] |= mask << start_bit; @@ -207,7 +373,7 @@ fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { slice[start_byte] |= !((1u8 << start_bit) - 1); } - // Middle bytes (bulk memset to 0xFF) + // Middle bytes let fill_start = if start_bit != 0 { start_byte + 1 } else { @@ -224,7 +390,7 @@ fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { } } -/// Clears bits in range [start, end) to false using byte-level operations. +/// Clears bits in range [start, end) to false. 
/// Assumes the buffer is pre-initialized to all ones. #[inline(always)] fn fill_bits_false(slice: &mut [u8], start: usize, end: usize) { @@ -238,17 +404,16 @@ fn fill_bits_false(slice: &mut [u8], start: usize, end: usize) { let end_bit = end % 8; if start_byte == end_byte { - // All bits in same byte - create mask with 0s in the range we want to clear #[allow(clippy::cast_possible_truncation)] let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; slice[start_byte] &= !(mask << start_bit); } else { - // First partial byte - clear high bits from start_bit + // First partial byte if start_bit != 0 { slice[start_byte] &= (1u8 << start_bit) - 1; } - // Middle bytes (bulk memset to 0x00) + // Middle bytes let fill_start = if start_bit != 0 { start_byte + 1 } else { @@ -258,7 +423,7 @@ fn fill_bits_false(slice: &mut [u8], start: usize, end: usize) { slice[fill_start..end_byte].fill(0x00); } - // Last partial byte - clear low bits up to end_bit + // Last partial byte if end_bit != 0 { slice[end_byte] &= !((1u8 << end_bit) - 1); } @@ -355,4 +520,54 @@ mod tests { assert_arrays_eq!(decoded, expected); Ok(()) } + + #[test] + fn decode_bools_nullable() -> VortexResult<()> { + use vortex_array::validity::Validity; + + // 3 runs: T (valid), F (null), T (valid) -> [T, T, null, null, null, T, T, T, T, T] + let ends = PrimitiveArray::from_iter([2u32, 5, 10]); + let values = BoolArray::new( + BitBuffer::from(vec![true, false, true]), + Validity::from(BitBuffer::from(vec![true, false, true])), + ); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + // Expected: values=[T, T, F, F, F, T, T, T, T, T], validity=[1, 1, 0, 0, 0, 1, 1, 1, 1, 1] + let expected = BoolArray::new( + BitBuffer::from(vec![ + true, true, false, false, false, true, true, true, true, true, + ]), + Validity::from(BitBuffer::from(vec![ + true, true, false, false, false, true, true, true, true, true, + ])), + ); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn 
decode_bools_nullable_few_runs() -> VortexResult<()> { + use vortex_array::validity::Validity; + + // Test few runs (uses fast path): 5 runs of length 2000 each + let ends = PrimitiveArray::from_iter([2000u32, 4000, 6000, 8000, 10000]); + let values = BoolArray::new( + BitBuffer::from(vec![true, false, true, false, true]), + Validity::from(BitBuffer::from(vec![true, false, true, false, true])), + ); + let decoded = runend_decode_bools(ends, values, 0, 10000)?; + + // Check length and a few values + assert_eq!(decoded.len(), 10000); + // First run: valid true + assert!(decoded.validity_mask()?.value(0)); + assert!(decoded.to_bit_buffer().value(0)); + // Second run: null (validity false) + assert!(!decoded.validity_mask()?.value(2000)); + // Third run: valid true + assert!(decoded.validity_mask()?.value(4000)); + assert!(decoded.to_bit_buffer().value(4000)); + Ok(()) + } } From 3222cdc4aa3a38657d2743fb6716b0e252589ec1 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 10 Mar 2026 12:18:01 +0000 Subject: [PATCH 05/10] fixup Signed-off-by: Joe Isaacs --- encodings/runend/src/decompress_bool.rs | 194 ++++++------------------ 1 file changed, 45 insertions(+), 149 deletions(-) diff --git a/encodings/runend/src/decompress_bool.rs b/encodings/runend/src/decompress_bool.rs index 9d872f9a6d8..a834026d251 100644 --- a/encodings/runend/src/decompress_bool.rs +++ b/encodings/runend/src/decompress_bool.rs @@ -9,11 +9,11 @@ use itertools::Itertools; use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::Nullability; +use vortex_array::match_each_unsigned_integer_ptype; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; use vortex_buffer::BitBufferMut; -use vortex_dtype::Nullability; -use vortex_dtype::match_each_unsigned_integer_ptype; use vortex_error::VortexResult; use vortex_mask::Mask; @@ -84,7 +84,7 @@ pub fn runend_decode_typed_bool( /// This avoids the `trimmed_ends_iter` iterator chain which adds 
significant overhead /// for small numbers of runs. #[inline(always)] -fn decode_few_runs_no_offset( +fn decode_few_runs_no_offset( ends: &[E], values: &BitBuffer, validity: Mask, @@ -147,39 +147,19 @@ fn decode_bool_non_nullable( return BoolArray::new(decoded.freeze(), nullability.into()); } - // Adaptive strategy: choose based on which value is more common - let true_count = values.true_count(); - let false_count = num_runs - true_count; - - if true_count > false_count { - // More true runs - pre-fill with 1s and clear false runs - let mut decoded = BitBufferMut::new_set(length); - let decoded_bytes = decoded.as_mut_slice(); - let mut current_pos = 0usize; + // Adaptive strategy: prefill with majority value, only flip minority runs + let prefill = values.true_count() > num_runs - values.true_count(); + let mut decoded = prefill_buffer(prefill, length); + let decoded_bytes = decoded.as_mut_slice(); + let mut current_pos = 0usize; - for (end, value) in run_ends.zip_eq(values.iter()) { - // Only clear when value is false (true is already 1) - if end > current_pos && !value { - fill_bits_false(decoded_bytes, current_pos, end); - } - current_pos = end; - } - BoolArray::new(decoded.freeze(), nullability.into()) - } else { - // More or equal false runs - pre-fill with 0s and fill true runs - let mut decoded = BitBufferMut::new_unset(length); - let decoded_bytes = decoded.as_mut_slice(); - let mut current_pos = 0usize; - - for (end, value) in run_ends.zip_eq(values.iter()) { - // Only fill when value is true (false is already 0) - if end > current_pos && value { - fill_bits_true(decoded_bytes, current_pos, end); - } - current_pos = end; + for (end, value) in run_ends.zip_eq(values.iter()) { + if end > current_pos && value != prefill { + flip_bits(decoded_bytes, current_pos, end, value); } - BoolArray::new(decoded.freeze(), nullability.into()) + current_pos = end; } + BoolArray::new(decoded.freeze(), nullability.into()) } /// Decodes run-end encoded booleans when values 
may be null (nullable). @@ -196,20 +176,30 @@ fn decode_bool_nullable( return decode_nullable_sequential(run_ends, values, validity_mask, length); } - let true_count = values.true_count(); - let false_count = num_runs - true_count; - let valid_count = validity_mask.true_count(); - let null_count = num_runs - valid_count; + // Adaptive strategy: prefill each buffer with its majority value + let prefill_decoded = values.true_count() > num_runs - values.true_count(); + let prefill_valid = validity_mask.true_count() > num_runs - validity_mask.true_count(); - let prefill_true = true_count > false_count; - let prefill_valid = valid_count > null_count; + let mut decoded = prefill_buffer(prefill_decoded, length); + let mut decoded_validity = prefill_buffer(prefill_valid, length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; - match (prefill_true, prefill_valid) { - (true, true) => decode_nullable_true_valid(run_ends, values, validity_mask, length), - (true, false) => decode_nullable_true_null(run_ends, values, validity_mask, length), - (false, true) => decode_nullable_false_valid(run_ends, values, validity_mask, length), - (false, false) => decode_nullable_false_null(run_ends, values, validity_mask, length), + for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { + if end > current_pos { + if is_valid != prefill_valid { + flip_bits(validity_bytes, current_pos, end, is_valid); + } + // Decoded bit should be the actual value when valid, false when null. + let want_decoded = is_valid && value; + if want_decoded != prefill_decoded { + flip_bits(decoded_bytes, current_pos, end, want_decoded); + } + current_pos = end; + } } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) } /// Sequential decode for few runs - avoids prefill overhead. 
@@ -237,117 +227,23 @@ fn decode_nullable_sequential( BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) } -/// Prefill decoded=1s, validity=1s. Clear for false values and nulls. -#[inline(always)] -fn decode_nullable_true_valid( - run_ends: impl Iterator, - values: &BitBuffer, - validity_mask: &BitBuffer, - length: usize, -) -> BoolArray { - let mut decoded = BitBufferMut::new_set(length); - let mut decoded_validity = BitBufferMut::new_set(length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); - let mut current_pos = 0usize; - - for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { - if end > current_pos { - if !is_valid { - fill_bits_false(validity_bytes, current_pos, end); - fill_bits_false(decoded_bytes, current_pos, end); - } else if !value { - fill_bits_false(decoded_bytes, current_pos, end); - } - current_pos = end; - } - } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) -} - -/// Prefill decoded=1s, validity=0s. Set validity for valid, clear decoded for false/null. 
#[inline(always)] -fn decode_nullable_true_null( - run_ends: impl Iterator, - values: &BitBuffer, - validity_mask: &BitBuffer, - length: usize, -) -> BoolArray { - let mut decoded = BitBufferMut::new_set(length); - let mut decoded_validity = BitBufferMut::new_unset(length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); - let mut current_pos = 0usize; - - for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { - if end > current_pos { - if is_valid { - fill_bits_true(validity_bytes, current_pos, end); - if !value { - fill_bits_false(decoded_bytes, current_pos, end); - } - } else { - fill_bits_false(decoded_bytes, current_pos, end); - } - current_pos = end; - } - } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) -} - -/// Prefill decoded=0s, validity=1s. Clear validity for nulls, set decoded for true. -#[inline(always)] -fn decode_nullable_false_valid( - run_ends: impl Iterator, - values: &BitBuffer, - validity_mask: &BitBuffer, - length: usize, -) -> BoolArray { - let mut decoded = BitBufferMut::new_unset(length); - let mut decoded_validity = BitBufferMut::new_set(length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); - let mut current_pos = 0usize; - - for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { - if end > current_pos { - if !is_valid { - fill_bits_false(validity_bytes, current_pos, end); - } else if value { - fill_bits_true(decoded_bytes, current_pos, end); - } - current_pos = end; - } +fn prefill_buffer(set: bool, length: usize) -> BitBufferMut { + if set { + BitBufferMut::new_set(length) + } else { + BitBufferMut::new_unset(length) } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) } -/// Prefill decoded=0s, validity=0s. Set validity and decoded for valid true values. 
+/// Sets or clears all bits in `[start, end)` to match `value`. #[inline(always)] -fn decode_nullable_false_null( - run_ends: impl Iterator, - values: &BitBuffer, - validity_mask: &BitBuffer, - length: usize, -) -> BoolArray { - let mut decoded = BitBufferMut::new_unset(length); - let mut decoded_validity = BitBufferMut::new_unset(length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); - let mut current_pos = 0usize; - - for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { - if end > current_pos { - if is_valid { - fill_bits_true(validity_bytes, current_pos, end); - if value { - fill_bits_true(decoded_bytes, current_pos, end); - } - } - current_pos = end; - } +fn flip_bits(slice: &mut [u8], start: usize, end: usize, value: bool) { + if value { + fill_bits_true(slice, start, end); + } else { + fill_bits_false(slice, start, end); } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) } /// Fills bits in range [start, end) to true. 
From dabda034ca631220ff0569bc019b3f0501a57c28 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 10 Mar 2026 14:58:31 +0000 Subject: [PATCH 06/10] refactor: consolidate bit-filling logic into BitBufferMut::fill_range and fill_bits - Add `fill_bits` free function to vortex-buffer for efficient bit-range filling on raw `&mut [u8]` slices - Add `BitBufferMut::fill_range` method that delegates to `fill_bits` - Refactor `append_n` to delegate to `fill_range`, eliminating duplication - Replace private `fill_bits_true`/`fill_bits_false`/`flip_bits` helpers in decompress_bool.rs with `BitBufferMut::fill_range` - Simplify few-runs nullable path to use `append_n` directly - Replace `prefill_buffer` helper with `BitBufferMut::full` - Remove unused re-export of `runend_decode_bools` from compress.rs - Remove baseline benchmark code (decode_bool_nullable_develop) - Delete PERF_NOTES.md - Fix clippy is_multiple_of warnings in benchmarks Signed-off-by: Joseph Isaacs Co-Authored-By: Claude Opus 4.6 --- encodings/runend/PERF_NOTES.md | 521 --------------------- encodings/runend/benches/run_end_decode.rs | 73 +-- encodings/runend/public-api.lock | 2 - encodings/runend/src/array.rs | 2 +- encodings/runend/src/compress.rs | 2 - encodings/runend/src/compute/compare.rs | 6 +- encodings/runend/src/decompress_bool.rs | 176 ++----- vortex-buffer/src/bit/buf_mut.rs | 74 +-- vortex-buffer/src/bit/mod.rs | 62 +++ 9 files changed, 139 insertions(+), 779 deletions(-) delete mode 100644 encodings/runend/PERF_NOTES.md diff --git a/encodings/runend/PERF_NOTES.md b/encodings/runend/PERF_NOTES.md deleted file mode 100644 index 151fbb611dd..00000000000 --- a/encodings/runend/PERF_NOTES.md +++ /dev/null @@ -1,521 +0,0 @@ -# Run-End Boolean Decoding Performance Notes - -## Overview - -This document captures the state of performance optimization work on `decompress_bool.rs` for run-end encoded boolean arrays. 
- -## Problem Statement - -The original benchmark comparison showed the new implementation was slower for the 1000 run length case (only 10 runs): - -``` -10000_1000_alternating_mostly_valid: develop 401 ns, new 714 ns, 0.56x slower -``` - -## Root Cause Analysis - -### Benchmark Unfairness - -The baseline benchmark (`decode_bool_nullable_develop`) and new implementation (`decode_bool_nullable`) measure different things: - -**New implementation (what gets timed):** -```rust -bencher - .with_inputs(|| (ends.clone(), values.clone())) // Setup: just clone - .bench_refs(|(ends, values)| { - // TIMED: extraction + decode - runend_decode_bools(ends.clone(), values.clone(), 0, total_length) - }); -``` - -Inside `runend_decode_bools` (all timed): -1. `values.validity_mask()?` - extract validity mask -2. `values.to_bit_buffer()` - extract bit buffer -3. `match_each_unsigned_integer_ptype!` - generic type dispatch -4. `trimmed_ends_iter()` - iterator with 3 chained `.map()` operations -5. Actual decode loop - -**Baseline (what gets timed):** -```rust -bencher - .with_inputs(|| { - // NOT TIMED: all extraction done here - let ends_slice: Vec = ends.as_slice::().to_vec(); - let values_buf = values.to_bit_buffer(); - let validity_buf = values.validity_mask().unwrap(); - let validity_bits = match validity_buf { ... }; - (ends_slice, values_buf, validity_bits) - }) - .bench_refs(|(ends, values, validity)| { - // TIMED: only the decode loop with pre-extracted data - decode_bool_nullable_baseline(ends, values, validity, total_length) - }); -``` - -**Key insight:** The baseline excludes ~150ns of extraction overhead from timing. - -### Overhead Sources for Few Runs - -For 10 runs (1000 run length), the overhead dominates: - -1. **`trimmed_ends_iter`** - 3 chained `.map()` per element: - - `v - offset_e` (subtract offset) - - `min(v, length_e)` (clamp to length) - - `v.as_()` (convert to usize) - -2. 
**Array method calls:** - - `values.validity_mask()?` - - `values.to_bit_buffer()` - - `ends.as_slice::()` - -3. **Generic dispatch:** `match_each_unsigned_integer_ptype!` macro expansion - -## Optimizations Implemented - -### 1. Fast Path for Few Runs with No Offset - -Added `decode_few_runs_no_offset()` function that: -- Bypasses `trimmed_ends_iter` iterator chain -- Uses direct slice iteration: `for (i, &end) in ends.iter().enumerate()` -- Triggered when `offset == 0 && num_runs < PREFILL_RUN_THRESHOLD` (32) - -```rust -// In runend_decode_bools(): -if offset == 0 && num_runs < PREFILL_RUN_THRESHOLD { - return Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { - decode_few_runs_no_offset( - ends.as_slice::(), - &values_buf, - validity, - nullability, - length, - ) - })); -} -``` - -### 2. Optimized Nullable Fast Path with fill_bits - -For nullable decoding in the fast path, uses `fill_bits_true`/`fill_bits_false` instead of `append_n`: - -```rust -Mask::Values(mask) => { - let validity_buf = mask.bit_buffer(); - let mut decoded = BitBufferMut::new_unset(length); - let mut decoded_validity = BitBufferMut::new_unset(length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); - let mut prev_end = 0usize; - for (i, &end) in ends.iter().enumerate() { - let end = end.as_().min(length); - if end > prev_end { - let is_valid = validity_buf.value(i); - if is_valid { - fill_bits_true(validity_bytes, prev_end, end); - if values.value(i) { - fill_bits_true(decoded_bytes, prev_end, end); - } - } - } - prev_end = end; - } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) -} -``` - -## Current Benchmark Results - -### Nullable Cases - -| Benchmark | New | Baseline | Speedup | -|-----------|-----|----------|---------| -| 10000_2_alternating_mostly_valid | 12.2 µs | 42.6 µs | **3.5x** | -| 10000_10_alternating_mostly_valid | 3.6 µs | 13.1 µs | **3.6x** | -| 10000_10_alternating_mostly_null | 2.8 
µs | 12.1 µs | **4.3x** | -| 10000_10_mostly_true_mostly_valid | 3.0 µs | 11.8 µs | **3.9x** | -| 10000_100_alternating_mostly_valid | 0.90 µs | 2.27 µs | **2.5x** | -| 10000_1000_alternating_mostly_valid | 0.48 µs | 0.32 µs | **0.67x** (1.5x slower) | - -### Non-Nullable Cases (1000 run length) - -| Benchmark | Time | -|-----------|------| -| 10000_1000_all_false | ~191-200 ns | -| 10000_1000_all_true | ~191-202 ns | -| 10000_1000_alternating | ~194-201 ns | -| 10000_1000_mostly_false | ~192-199 ns | -| 10000_1000_mostly_true | ~192-201 ns | - -Non-nullable fast path is very efficient. - -## Progress - -- **Before optimizations:** 0.56x (1.8x slower) for 1000 run length nullable -- **After optimizations:** 0.67x (1.5x slower) for 1000 run length nullable -- **Remaining gap:** ~150ns extraction overhead - -## Remaining Work - -### Option 1: Fix the Benchmark (Recommended) - -Make the benchmark fair by including extraction in the baseline timing: - -```rust -#[divan::bench(args = NULLABLE_BOOL_ARGS)] -fn decode_bool_nullable_develop_fair(bencher: Bencher, args: NullableBoolBenchArgs) { - let (ends, values) = create_nullable_bool_test_data(...); - bencher - .with_inputs(|| (ends.clone(), values.clone())) - .bench_refs(|(ends, values)| { - // Now timing extraction too - let ends_slice: Vec = ends.as_slice::().to_vec(); - let values_buf = values.to_bit_buffer(); - let validity_buf = values.validity_mask().unwrap(); - let validity_bits = match validity_buf { - vortex_mask::Mask::Values(m) => m.bit_buffer().clone(), - _ => BitBuffer::new_set(values.len()), - }; - decode_bool_nullable_baseline(&ends_slice, &values_buf, &validity_bits, total_length) - }); -} -``` - -### Option 2: Lower-Level API - -Add a public function that takes pre-extracted data for users who want maximum performance and are willing to manage extraction themselves: - -```rust -pub fn runend_decode_bools_from_slices( - ends: &[E], - values: &BitBuffer, - validity: &BitBuffer, // or Option<&BitBuffer> - 
length: usize, -) -> BoolArray -``` - -### Option 3: Reduce Extraction Overhead - -Investigate ways to make `validity_mask()` and `to_bit_buffer()` cheaper: -- Caching -- Avoiding allocations -- Direct field access if possible - -## Files Changed - -- `encodings/runend/src/decompress_bool.rs`: - - Added `PREFILL_RUN_THRESHOLD` constant at module level - - Added `decode_few_runs_no_offset()` function - - Modified `runend_decode_bools()` to use fast path - - Added tests: `decode_bools_nullable`, `decode_bools_nullable_few_runs` - -## Tests - -All tests pass: -``` -running 8 tests -test decompress_bool::tests::decode_bools_all_false_single_run ... ok -test decompress_bool::tests::decode_bools_all_true_single_run ... ok -test decompress_bool::tests::decode_bools_alternating ... ok -test decompress_bool::tests::decode_bools_mostly_false ... ok -test decompress_bool::tests::decode_bools_mostly_true ... ok -test decompress_bool::tests::decode_bools_nullable ... ok -test decompress_bool::tests::decode_bools_nullable_few_runs ... ok -test decompress_bool::tests::decode_bools_with_offset ... ok -``` - -## Code Locations - -- Implementation: `encodings/runend/src/decompress_bool.rs` -- Benchmarks: `encodings/runend/benches/run_end_decode.rs` -- Iterator helper: `encodings/runend/src/iter.rs` (`trimmed_ends_iter`) - -## Investigation: fill_bits Performance (2025-02-02) - -### Hypothesis - -The `fill_bits_true`/`fill_bits_false` functions might be slow and could benefit from using u64 instead of u8 for the middle byte fill. 
- -### Benchmark Results - -Added benchmarks comparing byte-level (u8) vs word-level (u64) fill implementations: - -| Range (bits) | Offset | u8 `.fill()` | u64 manual | Winner | -|--------------|--------|--------------|------------|--------| -| 10 | 0 | ~2.1ns | ~2.6ns | **u8** | -| 10 | 3 | ~1.1ns | ~1.2ns | ~same | -| 100 | 0 | ~4.1ns | ~6.5ns | **u8** | -| 100 | 5 | ~3.9ns | ~8.5ns | **u8 (2x)** | -| 1000 | 0 | ~2.4ns | ~6.7ns | **u8 (3x)** | -| 1000 | 7 | ~3.0ns | ~11ns | **u8 (4x)** | -| 5000 | 0 | ~9.7ns | ~9.8ns | ~same | -| 5000 | 1 | ~10ns | ~13ns | **u8** | - -### Conclusion - -**The fill functions are NOT the bottleneck.** The `.fill()` method is already highly optimized by LLVM - it generates vectorized memset-like code internally. The manual u64 approach adds overhead from: -1. Alignment checking (`align_offset`) -2. Extra branches for prefix/suffix handling -3. Unsafe pointer casts - -The fill operations only take ~2-10ns, while the full decode takes ~200-700ns. The overhead comes from elsewhere. - -### What IS the bottleneck? - -For the 1000 run length nullable case: -- Baseline (pre-extracted data): ~320ns -- New implementation (includes extraction): ~480ns -- Difference: ~160ns - -The overhead sources are: -1. **Extraction calls** (~150ns): - - `values.validity_mask()?` - - `values.to_bit_buffer()` - - `ends.as_slice::()` - -2. **Iterator chain** (for non-fast-path cases): - - `trimmed_ends_iter` with 3 chained `.map()` operations - -### Next Steps - -1. **Profile the extraction methods** - understand what makes `validity_mask()` and `to_bit_buffer()` expensive -2. **Consider caching** - if these methods are called frequently, cache results -3. 
**Accept the tradeoff** - the extraction overhead is necessary for a clean API; users who need maximum performance can use the lower-level functions directly - -## Optimization: validity_mask() Fast Path (2025-02-02) - -### Change - -Added a fast path in `validity_mask()` (in `vortex-array/src/compute/filter.rs`) to avoid the expensive `fill_null()` call when the validity array is already a non-nullable BoolArray. - -### Extraction Benchmark Results (After) - -| Operation | Before | After | Improvement | -|-----------|--------|-------|-------------| -| `validity_mask()` | ~150-166ns | ~98-102ns | **~40% faster** | -| All combined | ~195-208ns | ~127-135ns | **~35% faster** | - -### Full Decode Benchmark Results (After) - -| Benchmark | New | Baseline | Speedup | -|-----------|-----|----------|---------| -| 10000_2_alternating_mostly_valid | 14.3 µs | 49.9 µs | **3.5x faster** | -| 10000_10_alternating_mostly_valid | 4.0 µs | 15.3 µs | **3.8x faster** | -| 10000_100_alternating_mostly_valid | 922 ns | 2.6 µs | **2.8x faster** | -| 10000_1000_alternating_mostly_valid | 446 ns | 376 ns | 1.2x slower | - -### Summary - -The new implementation is now: -- **2.8x-3.8x faster** for typical cases (many runs) -- **~1.2x slower** only for the edge case with very few runs (10 runs at 1000 run length) - -The remaining ~70ns gap in the 1000 run length case comes from: -1. Remaining extraction overhead (~50ns for validity_mask) -2. Iterator/function call overhead - -This is an acceptable tradeoff since: -1. The few-runs case is already very fast (~446ns) -2. The common case (many runs) is significantly faster -3. Further optimization would require invasive changes to the core API - -## Experiment: u64 Fill in decompress_bool.rs (2025-02-02) - -### Hypothesis - -Using u64 writes instead of byte-level `.fill()` for the middle portion of `fill_bits_true`/`fill_bits_false` might improve performance. 
- -### Implementation - -Modified `fill_bits_true`/`fill_bits_false` to use a `fill_bytes_u64` helper that: -1. Handles unaligned prefix bytes -2. Writes aligned u64s for the middle -3. Handles suffix bytes - -### Result - -**No improvement.** The u64 approach was about the same speed or slightly slower: -- Nullable 1000 run: ~458-498ns (vs ~374-446ns with byte fill) - -### Why - -1. **LLVM already optimizes `.fill()`** - It generates vectorized SIMD code for slice fills -2. **Overhead** - Alignment checking and branching add overhead that outweighs any benefit -3. **Small runs** - For small byte ranges, the u64 approach has more overhead - -### Conclusion - -Keep the simple byte-level `.fill()` implementation. It's already optimal. - -## Ablation Study: Which Optimizations Matter? (2025-02-02) - -Tested three strategies: -1. **Sequential** - append_n for each run (no prefill) -2. **Prefill zeros** - prefill buffer with 0s, fill true runs -3. **Adaptive** - choose prefill value based on majority - -### Results - -| Scenario | Sequential | Prefill 0s | Adaptive | Best | -|----------|------------|------------|----------|------| -| 10 runs, alternating | 120ns | 77ns | 125ns | prefill | -| 10 runs, mostly_true | 121ns | 86ns | 106ns | prefill | -| 32 runs, alternating | 752ns | 187ns | 294ns | prefill | -| 32 runs, mostly_true | 492ns | 463ns | 159ns | **adaptive** | -| 100 runs, alternating | 1.06µs | 323ns | 484ns | prefill | -| 100 runs, mostly_true | 1.08µs | 948ns | 166ns | **adaptive** | -| 1000 runs, alternating | 6.3µs | 1.5µs | 1.4µs | ~same | -| 1000 runs, mostly_true | 5.8µs | 2.2µs | 828ns | **adaptive** | - -### Conclusions - -1. **Prefill vs Sequential**: Prefill is **always faster** for many runs - - 10 runs: 1.5x faster - - 100 runs: 3x faster - - 1000 runs: **4x faster** - -2. 
**Adaptive prefill**: Critical for **skewed distributions** (common in real data) - - Alternating (50/50): prefill_zeros is same or slightly better - - Mostly_true (90%): adaptive is **2-3x faster** - -Both optimizations are justified and should be kept. - -## Final Implementation Architecture - -### Entry Point: `runend_decode_bools` - -```rust -pub fn runend_decode_bools( - ends: PrimitiveArray, - values: BoolArray, - offset: usize, - length: usize, -) -> VortexResult -``` - -### Decision Tree - -``` -runend_decode_bools -├── Extract: validity_mask(), to_bit_buffer() -├── IF offset == 0 && num_runs < 32: -│ └── decode_few_runs_no_offset ← Fast path, no iterator -└── ELSE: - └── runend_decode_typed_bool ← Uses trimmed_ends_iter - ├── Mask::AllTrue → decode_bool_non_nullable - │ ├── IF num_runs < 32: sequential append_n - │ └── ELSE: adaptive prefill - │ ├── more true → prefill 1s, clear false runs - │ └── more false → prefill 0s, fill true runs - ├── Mask::AllFalse → return all-invalid array - └── Mask::Values → decode_bool_nullable - ├── IF num_runs < 32: sequential append - └── ELSE: 4 variants based on majority: - ├── (true, valid) → prefill decoded=1, validity=1 - ├── (true, null) → prefill decoded=1, validity=0 - ├── (false, valid) → prefill decoded=0, validity=1 - └── (false, null) → prefill decoded=0, validity=0 -``` - -### Key Difference: `decode_few_runs_no_offset` vs `runend_decode_typed_bool` - -| Aspect | `decode_few_runs_no_offset` | `runend_decode_typed_bool` | -|--------|----------------------------|---------------------------| -| Offset handling | Assumes `offset == 0` | Handles any offset | -| Iterator | Direct slice: `for (i, &end) in ends.iter()` | `trimmed_ends_iter` with 3 `.map()` chains | -| Overhead | Minimal | ~20-30ns iterator overhead | -| When used | `offset == 0 && num_runs < 32` | All other cases | - -### `trimmed_ends_iter` Details - -```rust -run_ends.iter() - .map(|v| v - offset_e) // subtract offset (redundant when offset=0) - 
.map(|v| min(v, length_e)) // clamp to length - .map(|v| v.as_()) // convert to usize -``` - -For 10 runs, these 3 chained closures add measurable overhead. For 1000 runs, it's amortized. - -### Threshold: PREFILL_RUN_THRESHOLD = 32 - -Below 32 runs: -- Iterator overhead dominates -- Sequential `append_n` is competitive with prefill -- Use direct slice access, avoid iterator chain - -Above 32 runs: -- Prefill + fill_bits is 3-4x faster than sequential -- Adaptive selection matters for skewed data -- Iterator overhead is negligible - -## `fill_bits_true` / `fill_bits_false` Implementation - -```rust -fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { - // Handle same-byte case - if start_byte == end_byte { - let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; - slice[start_byte] |= mask << start_bit; - } else { - // First partial byte - if start_bit != 0 { - slice[start_byte] |= !((1u8 << start_bit) - 1); - } - // Middle bytes - LLVM optimizes to SIMD - slice[fill_start..end_byte].fill(0xFF); - // Last partial byte - if end_bit != 0 { - slice[end_byte] |= (1u8 << end_bit) - 1; - } - } -} -``` - -Key insight: `.fill()` is already vectorized by LLVM. Manual u64 approach adds overhead without benefit. - -## External Optimization: `validity_mask()` Fast Path - -In `vortex-array/src/compute/filter.rs`: - -```rust -// Added fast path for non-nullable canonical bool arrays -if !self.dtype().is_nullable() && self.is_canonical() { - return Ok(Mask::from_buffer(self.to_bool().to_bit_buffer())); -} -``` - -This avoids the expensive `fill_null()` call when the validity array is already a non-nullable BoolArray (common case). 
- -## Final Performance Summary - -### vs Baseline (pre-extracted data) - -| Scenario | New Impl | Baseline | Result | -|----------|----------|----------|--------| -| Many runs (10-100) | 3-4 µs | 13-15 µs | **3-4x faster** | -| Medium runs (100) | 800-900 ns | 2.6 µs | **2.8x faster** | -| Few runs (10 @ 1000 len) | 380-450 ns | 320-376 ns | ~1.2x slower | - -### Absolute Performance (non-nullable, 10K elements) - -| Runs | Time | Throughput | -|------|------|------------| -| 10 | ~200 ns | 50M elements/sec | -| 100 | ~350 ns | 28M elements/sec | -| 1000 | ~1.4 µs | 7M elements/sec | - -## Files Modified - -1. **`encodings/runend/src/decompress_bool.rs`** - - Full implementation with all optimizations - - ~430 lines including tests - -2. **`encodings/runend/benches/run_end_decode.rs`** - - Added baseline comparison benchmark - - ~435 lines - -3. **`vortex-array/src/compute/filter.rs`** - - Added 4-line fast path for `validity_mask()` - -4. **`encodings/runend/PERF_NOTES.md`** - - This file - full documentation of investigation diff --git a/encodings/runend/benches/run_end_decode.rs b/encodings/runend/benches/run_end_decode.rs index f08ebe9a733..9f64beabff5 100644 --- a/encodings/runend/benches/run_end_decode.rs +++ b/encodings/runend/benches/run_end_decode.rs @@ -11,7 +11,6 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::compute::warm_up_vtables; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; -use vortex_buffer::BitBufferMut; use vortex_buffer::BufferMut; use vortex_runend::decompress_bool::runend_decode_bools; @@ -82,9 +81,9 @@ fn create_bool_test_data( ends.push(pos as u32); let val = match distribution { - BoolDistribution::Alternating => run_index % 2 == 0, - BoolDistribution::MostlyTrue => run_index % 10 != 0, // 90% true - BoolDistribution::MostlyFalse => run_index % 10 == 0, // 10% true (90% false) + BoolDistribution::Alternating => run_index.is_multiple_of(2), + BoolDistribution::MostlyTrue => 
!run_index.is_multiple_of(10), // 90% true + BoolDistribution::MostlyFalse => run_index.is_multiple_of(10), // 10% true (90% false) BoolDistribution::AllTrue => true, BoolDistribution::AllFalse => false, }; @@ -276,18 +275,18 @@ fn create_nullable_bool_test_data( ends.push(pos as u32); let val = match distribution { - BoolDistribution::Alternating => run_index % 2 == 0, - BoolDistribution::MostlyTrue => run_index % 10 != 0, - BoolDistribution::MostlyFalse => run_index % 10 == 0, + BoolDistribution::Alternating => run_index.is_multiple_of(2), + BoolDistribution::MostlyTrue => !run_index.is_multiple_of(10), + BoolDistribution::MostlyFalse => run_index.is_multiple_of(10), BoolDistribution::AllTrue => true, BoolDistribution::AllFalse => false, }; values.push(val); let is_valid = match validity { - ValidityDistribution::MostlyValid => run_index % 10 != 0, - ValidityDistribution::HalfValid => run_index % 2 == 0, - ValidityDistribution::MostlyNull => run_index % 10 == 0, + ValidityDistribution::MostlyValid => !run_index.is_multiple_of(10), + ValidityDistribution::HalfValid => run_index.is_multiple_of(2), + ValidityDistribution::MostlyNull => run_index.is_multiple_of(10), }; validity_bits.push(is_valid); @@ -379,57 +378,3 @@ fn decode_bool_nullable(bencher: Bencher, args: NullableBoolBenchArgs) { runend_decode_bools(ends.clone(), values.clone(), 0, total_length) }); } - -/// Baseline using develop branch's append_n approach -fn decode_bool_nullable_baseline( - ends: &[u32], - values: &BitBuffer, - validity_mask: &BitBuffer, - _length: usize, -) -> BoolArray { - let mut decoded = BitBufferMut::with_capacity(ends.last().copied().unwrap_or(0) as usize); - let mut decoded_validity = - BitBufferMut::with_capacity(ends.last().copied().unwrap_or(0) as usize); - - let mut prev_end = 0usize; - for ((&end, value), is_valid) in ends.iter().zip(values.iter()).zip(validity_mask.iter()) { - let end = end as usize; - if is_valid { - decoded_validity.append_n(true, end - prev_end); - 
decoded.append_n(value, end - prev_end); - } else { - decoded_validity.append_n(false, end - prev_end); - decoded.append_n(false, end - prev_end); - } - prev_end = end; - } - - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) -} - -#[divan::bench(args = NULLABLE_BOOL_ARGS)] -fn decode_bool_nullable_develop(bencher: Bencher, args: NullableBoolBenchArgs) { - let NullableBoolBenchArgs { - total_length, - avg_run_length, - distribution, - validity, - } = args; - let (ends, values) = - create_nullable_bool_test_data(total_length, avg_run_length, distribution, validity); - - bencher - .with_inputs(|| { - let ends_slice: Vec = ends.as_slice::().to_vec(); - let values_buf = values.to_bit_buffer(); - let validity_buf = values.validity_mask().unwrap(); - let validity_bits = match validity_buf { - vortex_mask::Mask::Values(m) => m.bit_buffer().clone(), - _ => BitBuffer::new_set(values.len()), - }; - (ends_slice, values_buf, validity_bits) - }) - .bench_refs(|(ends, values, validity)| { - decode_bool_nullable_baseline(ends, values, validity, total_length) - }); -} diff --git a/encodings/runend/public-api.lock b/encodings/runend/public-api.lock index 4f9516a32fc..1d2547bc034 100644 --- a/encodings/runend/public-api.lock +++ b/encodings/runend/public-api.lock @@ -2,8 +2,6 @@ pub mod vortex_runend pub mod vortex_runend::compress -pub fn vortex_runend::compress::runend_decode_bools(ends: vortex_array::arrays::primitive::array::PrimitiveArray, values: vortex_array::arrays::bool::array::BoolArray, offset: usize, length: usize) -> vortex_error::VortexResult - pub fn vortex_runend::compress::runend_decode_primitive(ends: vortex_array::arrays::primitive::array::PrimitiveArray, values: vortex_array::arrays::primitive::array::PrimitiveArray, offset: usize, length: usize) -> vortex_error::VortexResult pub fn vortex_runend::compress::runend_decode_typed_bool(run_ends: impl core::iter::traits::iterator::Iterator, values: &vortex_buffer::bit::buf::BitBuffer, 
values_validity: vortex_mask::Mask, values_nullability: vortex_array::dtype::nullability::Nullability, length: usize) -> vortex_array::arrays::bool::array::BoolArray diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index f0eac5d967e..2201cb57107 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -486,7 +486,7 @@ pub(super) fn run_end_canonicalize( Ok(match array.dtype() { DType::Bool(_) => { let bools = array.values().clone().execute_as("values", ctx)?; - runend_decode_bools(pends, bools, array.offset(), array.len())?.into_array() + runend_decode_bools(pends, bools, array.offset(), array.len())? } DType::Primitive(..) => { let pvalues = array.values().clone().execute_as("values", ctx)?; diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index b27afc1c5a6..5e232fa0fa6 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -188,8 +188,6 @@ pub fn runend_decode_primitive( })) } -pub use crate::decompress_bool::runend_decode_bools; - /// Decode a run-end encoded slice of values into a flat `Buffer` and `Validity`. /// /// This is the core decode loop shared by primitive and varbinview run-end decoding. 
diff --git a/encodings/runend/src/compute/compare.rs b/encodings/runend/src/compute/compare.rs index 535d2c1d37a..6cc4ae55b4d 100644 --- a/encodings/runend/src/compute/compare.rs +++ b/encodings/runend/src/compute/compare.rs @@ -31,13 +31,13 @@ impl CompareKernel for RunEndVTable { ConstantArray::new(const_scalar, lhs.values().len()).into_array(), Operator::from(operator), )?; - let decoded = runend_decode_bools( + return runend_decode_bools( lhs.ends().clone().execute::(ctx)?, values.execute::(ctx)?, lhs.offset(), lhs.len(), - )?; - return Ok(Some(decoded.into_array())); + ) + .map(Some); } // Otherwise, fall back diff --git a/encodings/runend/src/decompress_bool.rs b/encodings/runend/src/decompress_bool.rs index a834026d251..79ca1b497c8 100644 --- a/encodings/runend/src/decompress_bool.rs +++ b/encodings/runend/src/decompress_bool.rs @@ -7,10 +7,15 @@ //! (0s or 1s) and only fills the minority runs, minimizing work for skewed distributions. use itertools::Itertools; +use vortex_array::ArrayRef; +use vortex_array::IntoArray; use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::match_each_unsigned_integer_ptype; +use vortex_array::scalar::Scalar; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; use vortex_buffer::BitBufferMut; @@ -29,7 +34,7 @@ pub fn runend_decode_bools( values: BoolArray, offset: usize, length: usize, -) -> VortexResult { +) -> VortexResult { let validity = values.validity_mask()?; let values_buf = values.to_bit_buffer(); let nullability = values.dtype().nullability(); @@ -72,11 +77,18 @@ pub fn runend_decode_typed_bool( values_validity: Mask, values_nullability: Nullability, length: usize, -) -> BoolArray { +) -> ArrayRef { match values_validity { - Mask::AllTrue(_) => decode_bool_non_nullable(run_ends, values, values_nullability, length), - Mask::AllFalse(_) => 
BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), - Mask::Values(mask) => decode_bool_nullable(run_ends, values, mask.bit_buffer(), length), + Mask::AllTrue(_) => { + decode_bool_non_nullable(run_ends, values, values_nullability, length).into_array() + } + Mask::AllFalse(_) => { + ConstantArray::new(Scalar::null(DType::Bool(Nullability::Nullable)), length) + .into_array() + } + Mask::Values(mask) => { + decode_bool_nullable(run_ends, values, mask.bit_buffer(), length).into_array() + } } } @@ -90,7 +102,7 @@ fn decode_few_runs_no_offset( validity: Mask, nullability: Nullability, length: usize, -) -> BoolArray { +) -> ArrayRef { match validity { Mask::AllTrue(_) => { let mut decoded = BitBufferMut::with_capacity(length); @@ -100,31 +112,31 @@ fn decode_few_runs_no_offset( decoded.append_n(values.value(i), end - prev_end); prev_end = end; } - BoolArray::new(decoded.freeze(), nullability.into()) + BoolArray::new(decoded.freeze(), nullability.into()).into_array() + } + Mask::AllFalse(_) => { + ConstantArray::new(Scalar::null(DType::Bool(Nullability::Nullable)), length) + .into_array() } - Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), Mask::Values(mask) => { let validity_buf = mask.bit_buffer(); - // Use prefill + fill_bits for better performance with larger runs - let mut decoded = BitBufferMut::new_unset(length); - let mut decoded_validity = BitBufferMut::new_unset(length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); + let mut decoded = BitBufferMut::with_capacity(length); + let mut decoded_validity = BitBufferMut::with_capacity(length); let mut prev_end = 0usize; for (i, &end) in ends.iter().enumerate() { let end = end.as_().min(length); - if end > prev_end { - let is_valid = validity_buf.value(i); - if is_valid { - fill_bits_true(validity_bytes, prev_end, end); - if values.value(i) { - fill_bits_true(decoded_bytes, prev_end, end); - } - } + let 
run_len = end - prev_end; + let is_valid = validity_buf.value(i); + if is_valid { + decoded_validity.append_n(true, run_len); + decoded.append_n(values.value(i), run_len); + } else { + decoded_validity.append_n(false, run_len); + decoded.append_n(false, run_len); } prev_end = end; } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())).into_array() } } } @@ -149,13 +161,12 @@ fn decode_bool_non_nullable( // Adaptive strategy: prefill with majority value, only flip minority runs let prefill = values.true_count() > num_runs - values.true_count(); - let mut decoded = prefill_buffer(prefill, length); - let decoded_bytes = decoded.as_mut_slice(); + let mut decoded = BitBufferMut::full(prefill, length); let mut current_pos = 0usize; for (end, value) in run_ends.zip_eq(values.iter()) { if end > current_pos && value != prefill { - flip_bits(decoded_bytes, current_pos, end, value); + decoded.fill_range(current_pos, end, value); } current_pos = end; } @@ -180,21 +191,19 @@ fn decode_bool_nullable( let prefill_decoded = values.true_count() > num_runs - values.true_count(); let prefill_valid = validity_mask.true_count() > num_runs - validity_mask.true_count(); - let mut decoded = prefill_buffer(prefill_decoded, length); - let mut decoded_validity = prefill_buffer(prefill_valid, length); - let decoded_bytes = decoded.as_mut_slice(); - let validity_bytes = decoded_validity.as_mut_slice(); + let mut decoded = BitBufferMut::full(prefill_decoded, length); + let mut decoded_validity = BitBufferMut::full(prefill_valid, length); let mut current_pos = 0usize; for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { if end > current_pos { if is_valid != prefill_valid { - flip_bits(validity_bytes, current_pos, end, is_valid); + decoded_validity.fill_range(current_pos, end, is_valid); } // Decoded bit should be the actual value when valid, false when 
null. let want_decoded = is_valid && value; if want_decoded != prefill_decoded { - flip_bits(decoded_bytes, current_pos, end, want_decoded); + decoded.fill_range(current_pos, end, want_decoded); } current_pos = end; } @@ -227,110 +236,13 @@ fn decode_nullable_sequential( BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) } -#[inline(always)] -fn prefill_buffer(set: bool, length: usize) -> BitBufferMut { - if set { - BitBufferMut::new_set(length) - } else { - BitBufferMut::new_unset(length) - } -} - -/// Sets or clears all bits in `[start, end)` to match `value`. -#[inline(always)] -fn flip_bits(slice: &mut [u8], start: usize, end: usize, value: bool) { - if value { - fill_bits_true(slice, start, end); - } else { - fill_bits_false(slice, start, end); - } -} - -/// Fills bits in range [start, end) to true. -/// Assumes the buffer is pre-initialized to all zeros. -#[inline(always)] -fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { - if start >= end { - return; - } - - let start_byte = start / 8; - let start_bit = start % 8; - let end_byte = end / 8; - let end_bit = end % 8; - - if start_byte == end_byte { - #[allow(clippy::cast_possible_truncation)] - let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; - slice[start_byte] |= mask << start_bit; - } else { - // First partial byte - if start_bit != 0 { - slice[start_byte] |= !((1u8 << start_bit) - 1); - } - - // Middle bytes - let fill_start = if start_bit != 0 { - start_byte + 1 - } else { - start_byte - }; - if fill_start < end_byte { - slice[fill_start..end_byte].fill(0xFF); - } - - // Last partial byte - if end_bit != 0 { - slice[end_byte] |= (1u8 << end_bit) - 1; - } - } -} - -/// Clears bits in range [start, end) to false. -/// Assumes the buffer is pre-initialized to all ones. 
-#[inline(always)] -fn fill_bits_false(slice: &mut [u8], start: usize, end: usize) { - if start >= end { - return; - } - - let start_byte = start / 8; - let start_bit = start % 8; - let end_byte = end / 8; - let end_bit = end % 8; - - if start_byte == end_byte { - #[allow(clippy::cast_possible_truncation)] - let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; - slice[start_byte] &= !(mask << start_bit); - } else { - // First partial byte - if start_bit != 0 { - slice[start_byte] &= (1u8 << start_bit) - 1; - } - - // Middle bytes - let fill_start = if start_bit != 0 { - start_byte + 1 - } else { - start_byte - }; - if fill_start < end_byte { - slice[fill_start..end_byte].fill(0x00); - } - - // Last partial byte - if end_bit != 0 { - slice[end_byte] &= !((1u8 << end_bit) - 1); - } - } -} - #[cfg(test)] mod tests { + use vortex_array::ToCanonical; use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::assert_arrays_eq; + use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; use vortex_error::VortexResult; @@ -444,15 +356,13 @@ mod tests { #[test] fn decode_bools_nullable_few_runs() -> VortexResult<()> { - use vortex_array::validity::Validity; - // Test few runs (uses fast path): 5 runs of length 2000 each let ends = PrimitiveArray::from_iter([2000u32, 4000, 6000, 8000, 10000]); let values = BoolArray::new( BitBuffer::from(vec![true, false, true, false, true]), Validity::from(BitBuffer::from(vec![true, false, true, false, true])), ); - let decoded = runend_decode_bools(ends, values, 0, 10000)?; + let decoded = runend_decode_bools(ends, values, 0, 10000)?.to_bool(); // Check length and a few values assert_eq!(decoded.len(), 10000); diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs index 5d16c783af9..a34e16a670a 100644 --- a/vortex-buffer/src/bit/buf_mut.rs +++ b/vortex-buffer/src/bit/buf_mut.rs @@ -398,13 +398,13 @@ impl BitBufferMut { /// the length will be incremented by `n`. 
/// /// Panics if the buffer does not have `n` slots left. + #[inline] pub fn append_n(&mut self, value: bool, n: usize) { if n == 0 { return; } - let start_bit_pos = self.offset + self.len; - let end_bit_pos = start_bit_pos + n; + let end_bit_pos = self.offset + self.len + n; let required_bytes = end_bit_pos.div_ceil(8); // Ensure buffer has enough bytes @@ -412,58 +412,26 @@ impl BitBufferMut { self.buffer.push_n(0x00, required_bytes - self.buffer.len()); } - let fill_byte = if value { 0xFF } else { 0x00 }; - - // Calculate byte positions - let start_byte = start_bit_pos / 8; - let start_bit = start_bit_pos % 8; - let end_byte = end_bit_pos / 8; - let end_bit = end_bit_pos % 8; - - let slice = self.buffer.as_mut_slice(); - - if start_byte == end_byte { - // All bits are in the same byte - let mask = ((1u8 << (end_bit - start_bit)) - 1) << start_bit; - if value { - slice[start_byte] |= mask; - } else { - slice[start_byte] &= !mask; - } - } else { - // Fill the first partial byte - if start_bit != 0 { - let mask = !((1u8 << start_bit) - 1); - if value { - slice[start_byte] |= mask; - } else { - slice[start_byte] &= !mask; - } - } - - // Fill the complete middle bytes - let fill_start = if start_bit != 0 { - start_byte + 1 - } else { - start_byte - }; - let fill_end = end_byte; - if fill_start < fill_end { - slice[fill_start..fill_end].fill(fill_byte); - } - - // Fill the last partial byte - if end_bit != 0 { - let mask = (1u8 << end_bit) - 1; - if value { - slice[end_byte] |= mask; - } else { - slice[end_byte] &= !mask; - } - } - } - + let start = self.len; self.len += n; + self.fill_range(start, self.len, value); + } + + /// Sets all bits in the range `[start, end)` to `value`. + /// + /// This operates on an arbitrary range within the existing length of the buffer. + /// Panics if `end > self.len` or `start > end`. 
+ #[inline(always)] + pub fn fill_range(&mut self, start: usize, end: usize, value: bool) { + assert!(end <= self.len, "end {end} exceeds len {}", self.len); + assert!(start <= end, "start {start} exceeds end {end}"); + + crate::bit::fill_bits( + self.buffer.as_mut_slice(), + self.offset + start, + self.offset + end, + value, + ); } /// Append a [`BitBuffer`] to this [`BitBufferMut`] diff --git a/vortex-buffer/src/bit/mod.rs b/vortex-buffer/src/bit/mod.rs index 5ca932c0187..ca786c77afe 100644 --- a/vortex-buffer/src/bit/mod.rs +++ b/vortex-buffer/src/bit/mod.rs @@ -62,3 +62,65 @@ pub unsafe fn set_bit_unchecked(buf: *mut u8, index: usize) { pub unsafe fn unset_bit_unchecked(buf: *mut u8, index: usize) { unsafe { *buf.add(index / 8) &= !(1 << (index % 8)) }; } + +/// Sets all bits in the bit-range `[start_bit, end_bit)` of `slice` to `value`. +/// +/// This operates directly on a byte slice where bits are stored in little-endian order. +/// The caller must ensure that the slice is large enough to hold bits up to `end_bit`. +/// +/// # Panics +/// +/// Panics if `start_bit > end_bit` or if the slice is too small. 
+#[inline(always)] +pub fn fill_bits(slice: &mut [u8], start_bit: usize, end_bit: usize, value: bool) { + if start_bit >= end_bit { + return; + } + + let fill_byte: u8 = if value { 0xFF } else { 0x00 }; + + let start_byte = start_bit / 8; + let start_rem = start_bit % 8; + let end_byte = end_bit / 8; + let end_rem = end_bit % 8; + + if start_byte == end_byte { + // All bits are in the same byte + let mask = ((1u8 << (end_rem - start_rem)) - 1) << start_rem; + if value { + slice[start_byte] |= mask; + } else { + slice[start_byte] &= !mask; + } + } else { + // First partial byte + if start_rem != 0 { + let mask = !((1u8 << start_rem) - 1); + if value { + slice[start_byte] |= mask; + } else { + slice[start_byte] &= !mask; + } + } + + // Middle bytes + let fill_start = if start_rem != 0 { + start_byte + 1 + } else { + start_byte + }; + if fill_start < end_byte { + slice[fill_start..end_byte].fill(fill_byte); + } + + // Last partial byte + if end_rem != 0 { + let mask = (1u8 << end_rem) - 1; + if value { + slice[end_byte] |= mask; + } else { + slice[end_byte] &= !mask; + } + } + } +} From 46e2001327194e77d2622d1bc9acb337123bb5e0 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 10 Mar 2026 18:39:46 +0000 Subject: [PATCH 07/10] fixup Signed-off-by: Joe Isaacs --- encodings/runend/src/compress.rs | 41 -------------------------------- 1 file changed, 41 deletions(-) diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index 5e232fa0fa6..0a841c28194 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -267,47 +267,6 @@ pub fn runend_decode_typed_primitive( PrimitiveArray::new(decoded, validity) } -pub fn runend_decode_typed_bool( - run_ends: impl Iterator, - values: &BitBuffer, - values_validity: Mask, - values_nullability: Nullability, - length: usize, -) -> BoolArray { - match values_validity { - Mask::AllTrue(_) => { - let mut decoded = BitBufferMut::with_capacity(length); - for (end, value) in 
run_ends.zip_eq(values.iter()) { - decoded.append_n(value, end - decoded.len()); - } - BoolArray::new(decoded.freeze(), values_nullability.into()) - } - Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), - Mask::Values(mask) => { - let mut decoded = BitBufferMut::with_capacity(length); - let mut decoded_validity = BitBufferMut::with_capacity(length); - for (end, value) in run_ends.zip_eq( - values - .iter() - .zip(mask.bit_buffer().iter()) - .map(|(v, is_valid)| is_valid.then_some(v)), - ) { - match value { - None => { - decoded_validity.append_n(false, end - decoded.len()); - decoded.append_n(false, end - decoded.len()); - } - Some(value) => { - decoded_validity.append_n(true, end - decoded.len()); - decoded.append_n(value, end - decoded.len()); - } - } - } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) - } - } -} - /// Decode a run-end encoded VarBinView array by expanding views directly. pub fn runend_decode_varbinview( ends: PrimitiveArray, From 80634d4d8323fa70f97775a35d58d3d957ad8de1 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 10 Mar 2026 19:01:06 +0000 Subject: [PATCH 08/10] fixup Signed-off-by: Joe Isaacs --- encodings/runend/src/decompress_bool.rs | 8 +-- vortex-buffer/src/bit/buf_mut.rs | 69 ++++++++++++++++++++++++- vortex-buffer/src/bit/mod.rs | 62 ---------------------- 3 files changed, 73 insertions(+), 66 deletions(-) diff --git a/encodings/runend/src/decompress_bool.rs b/encodings/runend/src/decompress_bool.rs index 79ca1b497c8..407745d9154 100644 --- a/encodings/runend/src/decompress_bool.rs +++ b/encodings/runend/src/decompress_bool.rs @@ -166,7 +166,8 @@ fn decode_bool_non_nullable( for (end, value) in run_ends.zip_eq(values.iter()) { if end > current_pos && value != prefill { - decoded.fill_range(current_pos, end, value); + // SAFETY: current_pos < end <= length == decoded.len() + unsafe { decoded.fill_range_unchecked(current_pos, end, value) }; } current_pos = end; } 
@@ -197,13 +198,14 @@ fn decode_bool_nullable( for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { if end > current_pos { + // SAFETY: current_pos < end <= length == decoded.len() == decoded_validity.len() if is_valid != prefill_valid { - decoded_validity.fill_range(current_pos, end, is_valid); + unsafe { decoded_validity.fill_range_unchecked(current_pos, end, is_valid) }; } // Decoded bit should be the actual value when valid, false when null. let want_decoded = is_valid && value; if want_decoded != prefill_decoded { - decoded.fill_range(current_pos, end, want_decoded); + unsafe { decoded.fill_range_unchecked(current_pos, end, want_decoded) }; } current_pos = end; } diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs index a34e16a670a..231c866cbdc 100644 --- a/vortex-buffer/src/bit/buf_mut.rs +++ b/vortex-buffer/src/bit/buf_mut.rs @@ -16,6 +16,61 @@ use crate::bit::set_bit_unchecked; use crate::bit::unset_bit_unchecked; use crate::buffer_mut; +/// Sets all bits in the bit-range `[start_bit, end_bit)` of `slice` to `value`. 
+#[inline(always)] +fn fill_bits(slice: &mut [u8], start_bit: usize, end_bit: usize, value: bool) { + if start_bit >= end_bit { + return; + } + + let fill_byte: u8 = if value { 0xFF } else { 0x00 }; + + let start_byte = start_bit / 8; + let start_rem = start_bit % 8; + let end_byte = end_bit / 8; + let end_rem = end_bit % 8; + + if start_byte == end_byte { + // All bits are in the same byte + let mask = ((1u8 << (end_rem - start_rem)) - 1) << start_rem; + if value { + slice[start_byte] |= mask; + } else { + slice[start_byte] &= !mask; + } + } else { + // First partial byte + if start_rem != 0 { + let mask = !((1u8 << start_rem) - 1); + if value { + slice[start_byte] |= mask; + } else { + slice[start_byte] &= !mask; + } + } + + // Middle bytes + let fill_start = if start_rem != 0 { + start_byte + 1 + } else { + start_byte + }; + if fill_start < end_byte { + slice[fill_start..end_byte].fill(fill_byte); + } + + // Last partial byte + if end_rem != 0 { + let mask = (1u8 << end_rem) - 1; + if value { + slice[end_byte] |= mask; + } else { + slice[end_byte] &= !mask; + } + } + } +} + /// A mutable bitset buffer that allows random access to individual bits for set and get. /// /// @@ -426,7 +481,19 @@ impl BitBufferMut { assert!(end <= self.len, "end {end} exceeds len {}", self.len); assert!(start <= end, "start {start} exceeds end {end}"); - crate::bit::fill_bits( + // SAFETY: assertions above guarantee start <= end <= self.len, + // so offset + end fits within the buffer. + unsafe { self.fill_range_unchecked(start, end, value) } + } + + /// Sets all bits in the range `[start, end)` to `value` without bounds checking. + /// + /// # Safety + /// + /// The caller must ensure that `start <= end <= self.len`. 
+ #[inline(always)] + pub unsafe fn fill_range_unchecked(&mut self, start: usize, end: usize, value: bool) { + fill_bits( self.buffer.as_mut_slice(), self.offset + start, self.offset + end, diff --git a/vortex-buffer/src/bit/mod.rs b/vortex-buffer/src/bit/mod.rs index ca786c77afe..5ca932c0187 100644 --- a/vortex-buffer/src/bit/mod.rs +++ b/vortex-buffer/src/bit/mod.rs @@ -62,65 +62,3 @@ pub unsafe fn set_bit_unchecked(buf: *mut u8, index: usize) { pub unsafe fn unset_bit_unchecked(buf: *mut u8, index: usize) { unsafe { *buf.add(index / 8) &= !(1 << (index % 8)) }; } - -/// Sets all bits in the bit-range `[start_bit, end_bit)` of `slice` to `value`. -/// -/// This operates directly on a byte slice where bits are stored in little-endian order. -/// The caller must ensure that the slice is large enough to hold bits up to `end_bit`. -/// -/// # Panics -/// -/// Panics if `start_bit > end_bit` or if the slice is too small. -#[inline(always)] -pub fn fill_bits(slice: &mut [u8], start_bit: usize, end_bit: usize, value: bool) { - if start_bit >= end_bit { - return; - } - - let fill_byte: u8 = if value { 0xFF } else { 0x00 }; - - let start_byte = start_bit / 8; - let start_rem = start_bit % 8; - let end_byte = end_bit / 8; - let end_rem = end_bit % 8; - - if start_byte == end_byte { - // All bits are in the same byte - let mask = ((1u8 << (end_rem - start_rem)) - 1) << start_rem; - if value { - slice[start_byte] |= mask; - } else { - slice[start_byte] &= !mask; - } - } else { - // First partial byte - if start_rem != 0 { - let mask = !((1u8 << start_rem) - 1); - if value { - slice[start_byte] |= mask; - } else { - slice[start_byte] &= !mask; - } - } - - // Middle bytes - let fill_start = if start_rem != 0 { - start_byte + 1 - } else { - start_byte - }; - if fill_start < end_byte { - slice[fill_start..end_byte].fill(fill_byte); - } - - // Last partial byte - if end_rem != 0 { - let mask = (1u8 << end_rem) - 1; - if value { - slice[end_byte] |= mask; - } else { - 
slice[end_byte] &= !mask; - } - } - } -} From aedfd34ef0e88ecc5e20c12310451dd3f11e52fd Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 10 Mar 2026 19:03:04 +0000 Subject: [PATCH 09/10] fixup Signed-off-by: Joe Isaacs --- vortex-buffer/src/bit/buf_mut.rs | 1 + vortex-buffer/src/bit/ops.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs index 231c866cbdc..da3c1bf3d5c 100644 --- a/vortex-buffer/src/bit/buf_mut.rs +++ b/vortex-buffer/src/bit/buf_mut.rs @@ -647,6 +647,7 @@ impl Default for BitBufferMut { impl Not for BitBufferMut { type Output = BitBufferMut; + #[inline] fn not(mut self) -> Self::Output { ops::bitwise_unary_op_mut(&mut self, |b| !b); self diff --git a/vortex-buffer/src/bit/ops.rs b/vortex-buffer/src/bit/ops.rs index d44224cc1d3..37ceeabf652 100644 --- a/vortex-buffer/src/bit/ops.rs +++ b/vortex-buffer/src/bit/ops.rs @@ -6,6 +6,7 @@ use crate::BitBufferMut; use crate::Buffer; use crate::trusted_len::TrustedLenExt; +#[inline] pub(super) fn bitwise_unary_op u64>(buffer: &BitBuffer, op: F) -> BitBuffer { let iter = buffer.chunks().iter_padded().map(op); let iter = unsafe { iter.trusted_len() }; @@ -15,6 +16,7 @@ pub(super) fn bitwise_unary_op u64>(buffer: &BitBuffer, op: F) BitBuffer::new(result, buffer.len()) } +#[inline] pub(super) fn bitwise_unary_op_mut u64>(buffer: &mut BitBufferMut, mut op: F) { let slice_mut = buffer.as_mut_slice(); From 9385cf034687485d9bfe5c7aa71ab205ff91d7f2 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 10 Mar 2026 19:13:20 +0000 Subject: [PATCH 10/10] fixup Signed-off-by: Joe Isaacs --- encodings/runend/public-api.lock | 8 ++++++-- vortex-buffer/public-api.lock | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/encodings/runend/public-api.lock b/encodings/runend/public-api.lock index 1d2547bc034..433f06037fe 100644 --- a/encodings/runend/public-api.lock +++ b/encodings/runend/public-api.lock @@ -4,14 +4,18 @@ pub mod 
vortex_runend::compress pub fn vortex_runend::compress::runend_decode_primitive(ends: vortex_array::arrays::primitive::array::PrimitiveArray, values: vortex_array::arrays::primitive::array::PrimitiveArray, offset: usize, length: usize) -> vortex_error::VortexResult -pub fn vortex_runend::compress::runend_decode_typed_bool(run_ends: impl core::iter::traits::iterator::Iterator, values: &vortex_buffer::bit::buf::BitBuffer, values_validity: vortex_mask::Mask, values_nullability: vortex_array::dtype::nullability::Nullability, length: usize) -> vortex_array::arrays::bool::array::BoolArray - pub fn vortex_runend::compress::runend_decode_typed_primitive(run_ends: impl core::iter::traits::iterator::Iterator, values: &[T], values_validity: vortex_mask::Mask, values_nullability: vortex_array::dtype::nullability::Nullability, length: usize) -> vortex_array::arrays::primitive::array::PrimitiveArray pub fn vortex_runend::compress::runend_decode_varbinview(ends: vortex_array::arrays::primitive::array::PrimitiveArray, values: vortex_array::arrays::varbinview::array::VarBinViewArray, offset: usize, length: usize) -> vortex_error::VortexResult pub fn vortex_runend::compress::runend_encode(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> (vortex_array::arrays::primitive::array::PrimitiveArray, vortex_array::array::ArrayRef) +pub mod vortex_runend::decompress_bool + +pub fn vortex_runend::decompress_bool::runend_decode_bools(ends: vortex_array::arrays::primitive::array::PrimitiveArray, values: vortex_array::arrays::bool::array::BoolArray, offset: usize, length: usize) -> vortex_error::VortexResult + +pub fn vortex_runend::decompress_bool::runend_decode_typed_bool(run_ends: impl core::iter::traits::iterator::Iterator, values: &vortex_buffer::bit::buf::BitBuffer, values_validity: vortex_mask::Mask, values_nullability: vortex_array::dtype::nullability::Nullability, length: usize) -> vortex_array::array::ArrayRef + pub struct vortex_runend::RunEndArray impl 
vortex_runend::RunEndArray diff --git a/vortex-buffer/public-api.lock b/vortex-buffer/public-api.lock index b695c9cf14d..71040a2f150 100644 --- a/vortex-buffer/public-api.lock +++ b/vortex-buffer/public-api.lock @@ -448,6 +448,10 @@ pub fn vortex_buffer::BitBufferMut::empty() -> Self pub fn vortex_buffer::BitBufferMut::false_count(&self) -> usize +pub fn vortex_buffer::BitBufferMut::fill_range(&mut self, start: usize, end: usize, value: bool) + +pub unsafe fn vortex_buffer::BitBufferMut::fill_range_unchecked(&mut self, start: usize, end: usize, value: bool) + pub fn vortex_buffer::BitBufferMut::freeze(self) -> vortex_buffer::BitBuffer pub fn vortex_buffer::BitBufferMut::from_buffer(buffer: vortex_buffer::ByteBufferMut, offset: usize, len: usize) -> Self