diff --git a/encodings/fsst/benches/fsst_compress.rs b/encodings/fsst/benches/fsst_compress.rs
index 8b5942a91bd..e95f69e554e 100644
--- a/encodings/fsst/benches/fsst_compress.rs
+++ b/encodings/fsst/benches/fsst_compress.rs
@@ -45,12 +45,16 @@ const BENCH_ARGS: &[(usize, usize, u8)] = &[
     (1_000, 16, 8),
     (1_000, 64, 4),
     (1_000, 64, 8),
+    (1_000, 256, 4),
+    (1_000, 256, 8),
     (10_000, 4, 4),
     (10_000, 4, 8),
     (10_000, 16, 4),
     (10_000, 16, 8),
     (10_000, 64, 4),
     (10_000, 64, 8),
+    (10_000, 256, 4),
+    (10_000, 256, 8),
 ];
 
 #[divan::bench(args = BENCH_ARGS)]
diff --git a/encodings/fsst/src/canonical.rs b/encodings/fsst/src/canonical.rs
index b5474d1d923..93092532855 100644
--- a/encodings/fsst/src/canonical.rs
+++ b/encodings/fsst/src/canonical.rs
@@ -57,29 +57,24 @@ pub(crate) fn fsst_decode_views(
         .clone()
         .execute::(ctx)?;
 
-    #[allow(clippy::cast_possible_truncation)]
-    let total_size: usize = match_each_integer_ptype!(uncompressed_lens_array.ptype(), |P| {
-        uncompressed_lens_array
-            .as_slice::<P>()
-            .iter()
-            .map(|x| *x as usize)
-            .sum()
-    });
-
-    // Bulk-decompress the entire array.
-    let decompressor = fsst_array.decompressor();
-    let mut uncompressed_bytes = ByteBufferMut::with_capacity(total_size + 7);
-    let len =
-        decompressor.decompress_into(bytes.as_slice(), uncompressed_bytes.spare_capacity_mut());
-    unsafe { uncompressed_bytes.set_len(len) };
-
-    // Directly create the binary views.
+    // Single dispatch: sum lengths, decompress, and build views in one match arm.
     match_each_integer_ptype!(uncompressed_lens_array.ptype(), |P| {
+        let lens = uncompressed_lens_array.as_slice::<P>();
+        #[allow(clippy::cast_possible_truncation)]
+        let total_size: usize = lens.iter().map(|x| *x as usize).sum();
+
+        // Bulk-decompress the entire array.
+        let decompressor = fsst_array.decompressor();
+        let mut uncompressed_bytes = ByteBufferMut::with_capacity(total_size + 7);
+        let len =
+            decompressor.decompress_into(bytes.as_slice(), uncompressed_bytes.spare_capacity_mut());
+        unsafe { uncompressed_bytes.set_len(len) };
+
         Ok(build_views(
             start_buf_index,
             MAX_BUFFER_LEN,
             uncompressed_bytes,
-            uncompressed_lens_array.as_slice::<P>(),
+            lens,
         ))
     })
 }
diff --git a/vortex-array/src/arrays/varbinview/build_views.rs b/vortex-array/src/arrays/varbinview/build_views.rs
index 87f5c84706b..a04aa6d65aa 100644
--- a/vortex-array/src/arrays/varbinview/build_views.rs
+++ b/vortex-array/src/arrays/varbinview/build_views.rs
@@ -24,7 +24,12 @@ pub fn offsets_to_lengths<P>(offsets: &[P]) -> Buffer<P> {
 /// Maximum number of buffer bytes that can be referenced by a single `BinaryView`
 pub const MAX_BUFFER_LEN: usize = i32::MAX as usize;
 
-/// Split a large buffer of input `bytes` holding string data
+/// Build `BinaryView`s from a contiguous byte buffer and per-element lengths.
+///
+/// When total data exceeds `max_buffer_len` (2 GiB), buffers are split to ensure
+/// offsets fit in `u32`. When data fits in a single buffer, per-element split checks
+/// are skipped entirely.
+#[allow(clippy::cast_possible_truncation)]
 pub fn build_views<P: AsPrimitive<usize>>(
     start_buf_index: u32,
     max_buffer_len: usize,
@@ -33,35 +38,64 @@ pub fn build_views<P: AsPrimitive<usize>>(
 ) -> (Vec<ByteBuffer>, Buffer<BinaryView>) {
     let mut views = BufferMut::<BinaryView>::with_capacity(lens.len());
 
-    let mut buffers = Vec::new();
-    let mut buf_index = start_buf_index;
-
-    let mut offset = 0;
-    for &len in lens {
-        let len = len.as_();
-        assert!(len <= max_buffer_len, "values cannot exceed max_buffer_len");
-
-        if (offset + len) > max_buffer_len {
-            // Roll the buffer every 2GiB, to avoid overflowing VarBinView offset field
-            let rest = bytes.split_off(offset);
+    if bytes.len() <= max_buffer_len {
+        // Fast path: all data fits in a single buffer. No split checks needed per element
+        // and offsets are guaranteed to fit in u32 (max_buffer_len <= i32::MAX < u32::MAX).
+        let bytes_ptr = bytes.as_slice().as_ptr();
+
+        // Write views directly via pointer to avoid per-element push_unchecked overhead
+        // (spare-capacity lookup + length bookkeeping on each iteration).
+        let views_dst = views.spare_capacity_mut().as_mut_ptr() as *mut BinaryView;
+
+        let mut offset: usize = 0;
+        for (i, &len) in lens.iter().enumerate() {
+            let len: usize = len.as_();
+            // SAFETY: the sum of all lengths equals bytes.len() and we process them
+            // sequentially, so offset + len <= bytes.len() at every iteration.
+            let value = unsafe { std::slice::from_raw_parts(bytes_ptr.add(offset), len) };
+            let view = BinaryView::make_view(value, start_buf_index, offset as u32);
+            // SAFETY: we reserved capacity for lens.len() views and i < lens.len().
+            unsafe { views_dst.add(i).write(view) };
+            offset += len;
+        }
+        // SAFETY: we wrote exactly lens.len() views above.
+        unsafe { views.set_len(lens.len()) };
+
+        let buffers = if bytes.is_empty() {
+            Vec::new()
+        } else {
+            vec![bytes.freeze()]
+        };
+
+        (buffers, views.freeze())
+    } else {
+        // Slow path: may need to split across multiple 2 GiB buffers.
+        let mut buffers = Vec::new();
+        let mut buf_index = start_buf_index;
+        let mut offset = 0;
+
+        for &len in lens {
+            let len = len.as_();
+            assert!(len <= max_buffer_len, "values cannot exceed max_buffer_len");
+
+            if (offset + len) > max_buffer_len {
+                let rest = bytes.split_off(offset);
+                buffers.push(bytes.freeze());
+                buf_index += 1;
+                offset = 0;
+                bytes = rest;
+            }
+            let view = BinaryView::make_view(&bytes[offset..][..len], buf_index, offset.as_());
+            unsafe { views.push_unchecked(view) };
+            offset += len;
+        }
+        if !bytes.is_empty() {
             buffers.push(bytes.freeze());
-            buf_index += 1;
-            offset = 0;
-
-            bytes = rest;
         }
-        let view = BinaryView::make_view(&bytes[offset..][..len], buf_index, offset.as_());
-        // SAFETY: we reserved the right capacity beforehand
-        unsafe { views.push_unchecked(view) };
-        offset += len;
-    }
-    if !bytes.is_empty() {
-        buffers.push(bytes.freeze());
+        (buffers, views.freeze())
     }
-
-    (buffers, views.freeze())
 }
 
 #[cfg(test)]
diff --git a/vortex-array/src/arrays/varbinview/view.rs b/vortex-array/src/arrays/varbinview/view.rs
index 068fbde6d4e..3f0e1d979f6 100644
--- a/vortex-array/src/arrays/varbinview/view.rs
+++ b/vortex-array/src/arrays/varbinview/view.rs
@@ -10,7 +10,6 @@ use std::ops::Range;
 
 use static_assertions::assert_eq_align;
 use static_assertions::assert_eq_size;
-use vortex_error::VortexExpect;
 
 /// A view over a variable-length binary value.
 ///
@@ -46,18 +45,6 @@ pub struct Inlined {
 }
 
 impl Inlined {
-    /// Creates a new inlined representation from the provided value of constant size.
-    #[inline]
-    fn new<const N: usize>(value: &[u8]) -> Self {
-        debug_assert_eq!(value.len(), N);
-        let mut inlined = Self {
-            size: N.try_into().vortex_expect("inlined size must fit in u32"),
-            data: [0u8; BinaryView::MAX_INLINED_SIZE],
-        };
-        inlined.data[..N].copy_from_slice(&value[..N]);
-        inlined
-    }
-
     /// Returns the full inlined value.
     #[inline]
     pub fn value(&self) -> &[u8] {
@@ -102,66 +89,34 @@ impl BinaryView {
     /// Maximum size of an inlined binary value.
     pub const MAX_INLINED_SIZE: usize = 12;
 
-    /// Create a view from a value, block and offset
+    /// Create a view from a value, block and offset.
     ///
     /// Depending on the length of the provided value either a new inlined
     /// or a reference view will be constructed.
-    ///
-    /// Adapted from arrow-rs
-    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
-    #[inline(never)]
+    #[inline]
+    #[allow(clippy::cast_possible_truncation)]
     pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
-        match value.len() {
-            0 => Self {
-                inlined: Inlined::new::<0>(value),
-            },
-            1 => Self {
-                inlined: Inlined::new::<1>(value),
-            },
-            2 => Self {
-                inlined: Inlined::new::<2>(value),
-            },
-            3 => Self {
-                inlined: Inlined::new::<3>(value),
-            },
-            4 => Self {
-                inlined: Inlined::new::<4>(value),
-            },
-            5 => Self {
-                inlined: Inlined::new::<5>(value),
-            },
-            6 => Self {
-                inlined: Inlined::new::<6>(value),
-            },
-            7 => Self {
-                inlined: Inlined::new::<7>(value),
-            },
-            8 => Self {
-                inlined: Inlined::new::<8>(value),
-            },
-            9 => Self {
-                inlined: Inlined::new::<9>(value),
-            },
-            10 => Self {
-                inlined: Inlined::new::<10>(value),
-            },
-            11 => Self {
-                inlined: Inlined::new::<11>(value),
-            },
-            12 => Self {
-                inlined: Inlined::new::<12>(value),
-            },
-            _ => Self {
+        let len = value.len();
+        if len <= Self::MAX_INLINED_SIZE {
+            // Inlined: zero-initialize, write size, then copy value bytes.
+            let mut view = Self {
+                le_bytes: [0u8; 16],
+            };
+            unsafe {
+                view.inlined.size = len as u32;
+                std::ptr::copy_nonoverlapping(value.as_ptr(), view.inlined.data.as_mut_ptr(), len);
+            }
+            view
+        } else {
+            Self {
                 _ref: Ref {
-                    size: u32::try_from(value.len()).vortex_expect("value length must fit in u32"),
-                    prefix: value[0..4]
-                        .try_into()
-                        .ok()
-                        .vortex_expect("prefix must be exactly 4 bytes"),
+                    size: len as u32,
+                    // SAFETY: len >= 13, so reading 4 bytes from the start is always valid.
+                    prefix: unsafe { (value.as_ptr() as *const [u8; 4]).read_unaligned() },
                     buffer_index: block,
                     offset,
                 },
-            },
+            }
         }
     }