From f3b7ac6e989ee0764848b5aacb853fffe19be10a Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 19 Jun 2026 11:09:40 +0100 Subject: [PATCH 1/3] Reduce vortex-fastlanes debug symbols size without config Signed-off-by: Adam Gutglick --- Cargo.toml | 4 -- .../src/bitpacking/compute/compare.rs | 55 +++++++++++++------ vortex-array/src/dtype/ptype.rs | 6 ++ 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8cf6e741a8b..c25c634eae5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -419,7 +419,3 @@ incremental = false debug = false debug-assertions = true incremental = false - -# This improved build times significantly for default common cases that we use locally -[profile.dev.package.vortex-fastlanes] -debug = false diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index d5c50751bae..241fa0056b9 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -80,24 +80,43 @@ where + FastLanesComparable::Physical>, ::Physical: BitPacking + NativePType + BitPackingCompare, { - match operator { - CompareOperator::Eq => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_eq(b), ctx) - } - CompareOperator::NotEq => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| !a.is_eq(b), ctx) - } - CompareOperator::Lt => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_lt(b), ctx) - } - CompareOperator::Lte => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_le(b), ctx) - } - CompareOperator::Gt => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_gt(b), ctx) - } - CompareOperator::Gte => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_ge(b), ctx) + #[cfg(debug_assertions)] + { + // FastLanes expands `unchecked_unpack_cmp` by width, value type, and comparator type. 
+ // In dev builds, using one function-pointer comparator type avoids multiplying that + // generated code and debug info by every comparison operator. + let cmp: fn(T, T) -> bool = match operator { + CompareOperator::Eq => NativePType::is_eq, + CompareOperator::NotEq => NativePType::is_ne, + CompareOperator::Lt => NativePType::is_lt, + CompareOperator::Lte => NativePType::is_le, + CompareOperator::Gt => NativePType::is_gt, + CompareOperator::Gte => NativePType::is_ge, + }; + stream_compare_fused:: bool>(lhs, rhs, nullability, cmp, ctx) + } + + #[cfg(not(debug_assertions))] + { + match operator { + CompareOperator::Eq => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_eq(b), ctx) + } + CompareOperator::NotEq => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| !a.is_eq(b), ctx) + } + CompareOperator::Lt => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_lt(b), ctx) + } + CompareOperator::Lte => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_le(b), ctx) + } + CompareOperator::Gt => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_gt(b), ctx) + } + CompareOperator::Gte => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_ge(b), ctx) + } } } } diff --git a/vortex-array/src/dtype/ptype.rs b/vortex-array/src/dtype/ptype.rs index ab443618d66..5a86364fe80 100644 --- a/vortex-array/src/dtype/ptype.rs +++ b/vortex-array/src/dtype/ptype.rs @@ -152,6 +152,12 @@ pub trait NativePType: /// Whether another instance of this type (`other`) is bitwise equal to `self` fn is_eq(self, other: Self) -> bool; + + /// Whether another instance of this type (`other`) is bitwise not equal to `self` + fn is_ne(self, other: Self) -> bool { + !self.is_eq!(other) + } + /// Downcast the provided object to a type-specific instance. fn downcast(visitor: V) -> V::Output; From d833f4672e8b1fbd3f5abcf45673eab381ebef18 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 19 Jun 2026 11:39:42 +0100 Subject: [PATCH 2/3] . 
Signed-off-by: Adam Gutglick --- .../src/bitpacking/compute/compare.rs | 24 +++++++++---------- vortex-array/src/dtype/ptype.rs | 3 +-- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index 241fa0056b9..d69e098c594 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -86,12 +86,12 @@ where // In dev builds, using one function-pointer comparator type avoids multiplying that // generated code and debug info by every comparison operator. let cmp: fn(T, T) -> bool = match operator { - CompareOperator::Eq => NativePType::is_eq, - CompareOperator::NotEq => NativePType::is_ne, - CompareOperator::Lt => NativePType::is_lt, - CompareOperator::Lte => NativePType::is_le, - CompareOperator::Gt => NativePType::is_gt, - CompareOperator::Gte => NativePType::is_ge, + CompareOperator::Eq => T::is_eq, + CompareOperator::NotEq => T::is_ne, + CompareOperator::Lt => T::is_lt, + CompareOperator::Lte => T::is_le, + CompareOperator::Gt => T::is_gt, + CompareOperator::Gte => T::is_ge, }; stream_compare_fused:: bool>(lhs, rhs, nullability, cmp, ctx) } @@ -100,22 +100,22 @@ where { match operator { CompareOperator::Eq => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_eq(b), ctx) + stream_compare_fused::(lhs, rhs, nullability, T::is_eq, ctx) } CompareOperator::NotEq => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| !a.is_eq(b), ctx) + stream_compare_fused::(lhs, rhs, nullability, T::is_ne, ctx) } CompareOperator::Lt => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_lt(b), ctx) + stream_compare_fused::(lhs, rhs, nullability, T::is_lt, ctx) } CompareOperator::Lte => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_le(b), ctx) + stream_compare_fused::(lhs, rhs, nullability, T::is_le, ctx) } CompareOperator::Gt => { - stream_compare_fused::(lhs, 
rhs, nullability, |a, b| a.is_gt(b), ctx) + stream_compare_fused::(lhs, rhs, nullability, T::is_gt, ctx) } CompareOperator::Gte => { - stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_ge(b), ctx) + stream_compare_fused::(lhs, rhs, nullability, T::is_ge, ctx) } } } diff --git a/vortex-array/src/dtype/ptype.rs b/vortex-array/src/dtype/ptype.rs index 5a86364fe80..e2565147ac3 100644 --- a/vortex-array/src/dtype/ptype.rs +++ b/vortex-array/src/dtype/ptype.rs @@ -152,10 +152,9 @@ pub trait NativePType: /// Whether another instance of this type (`other`) is bitwise equal to `self` fn is_eq(self, other: Self) -> bool; - /// Whether another instance of this type (`other`) is bitwise not equal to `self` fn is_ne(self, other: Self) -> bool { - !self.is_eq!(other) + !self.is_eq(other) } /// Downcast the provided object to a type-specific instance. From ff7d3561e7f3e2b39ab553f7393de02e8c8aaf1b Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 19 Jun 2026 12:33:31 +0100 Subject: [PATCH 3/3] fastlanes experiment Signed-off-by: Adam Gutglick --- .../src/bitpacking/compute/compare.rs | 44 +------ .../src/bitpacking/compute/compare_fused.rs | 114 +++++++++++++++--- 2 files changed, 98 insertions(+), 60 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index d69e098c594..db0113f9755 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -65,8 +65,8 @@ impl CompareKernel for BitPacked { /// Compare every value against the constant via the fused FastLanes `unpack_cmp` kernel. /// -/// `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive between -/// kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`. +/// `NativePType::is_eq` / `is_lt` etc. provide total comparison matching the kernel's dispatch +/// shape. 
fn compare_constant_typed( lhs: ArrayView<'_, BitPacked>, rhs: T, @@ -80,45 +80,7 @@ where + FastLanesComparable::Physical>, ::Physical: BitPacking + NativePType + BitPackingCompare, { - #[cfg(debug_assertions)] - { - // FastLanes expands `unchecked_unpack_cmp` by width, value type, and comparator type. - // In dev builds, using one function-pointer comparator type avoids multiplying that - // generated code and debug info by every comparison operator. - let cmp: fn(T, T) -> bool = match operator { - CompareOperator::Eq => T::is_eq, - CompareOperator::NotEq => T::is_ne, - CompareOperator::Lt => T::is_lt, - CompareOperator::Lte => T::is_le, - CompareOperator::Gt => T::is_gt, - CompareOperator::Gte => T::is_ge, - }; - stream_compare_fused:: bool>(lhs, rhs, nullability, cmp, ctx) - } - - #[cfg(not(debug_assertions))] - { - match operator { - CompareOperator::Eq => { - stream_compare_fused::(lhs, rhs, nullability, T::is_eq, ctx) - } - CompareOperator::NotEq => { - stream_compare_fused::(lhs, rhs, nullability, T::is_ne, ctx) - } - CompareOperator::Lt => { - stream_compare_fused::(lhs, rhs, nullability, T::is_lt, ctx) - } - CompareOperator::Lte => { - stream_compare_fused::(lhs, rhs, nullability, T::is_le, ctx) - } - CompareOperator::Gt => { - stream_compare_fused::(lhs, rhs, nullability, T::is_gt, ctx) - } - CompareOperator::Gte => { - stream_compare_fused::(lhs, rhs, nullability, T::is_ge, ctx) - } - } - } + stream_compare_fused::(lhs, rhs, operator, nullability, ctx) } #[cfg(test)] diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs index baf3b0ede96..456f645ba49 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs @@ -41,6 +41,7 @@ use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PhysicalPType; use vortex_array::match_each_unsigned_integer_ptype; 
+use vortex_array::scalar_fn::fns::operators::CompareOperator; use vortex_buffer::BitBufferMut; use vortex_buffer::BufferMut; use vortex_error::VortexExpect; @@ -59,15 +60,14 @@ const WORDS_PER_CHUNK: usize = CHUNK_SIZE / U64_BITS; /// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes /// `unpack_cmp` kernel, producing a [`BoolArray`]. /// -/// `cmp(value, rhs)` defines the predicate; it must be the total-order comparison matching the -/// requested operator (e.g. `|a, b| a.is_lt(b)`). +/// `operator` defines the total-order comparison to apply against `rhs`. /// /// [`BitPackedArray`]: crate::BitPackedArray -pub(super) fn stream_compare_fused( +pub(super) fn stream_compare_fused( array: ArrayView<'_, BitPacked>, rhs: T, + operator: CompareOperator, nullability: Nullability, - cmp: F, ctx: &mut ExecutionCtx, ) -> VortexResult where @@ -75,7 +75,6 @@ where + BitPackedIter + FastLanesComparable::Physical>, ::Physical: BitPacking + NativePType + BitPackingCompare, - F: Fn(T, T) -> bool + Copy, { let len = array.len(); let bit_width = array.bit_width() as usize; @@ -84,7 +83,12 @@ where // A degenerate width has no packed payload for the fused kernel to consume; defer to the scalar // streaming predicate, which handles every layout (including the empty array). if len == 0 || bit_width == 0 { - return stream_predicate::(array, nullability, move |v| cmp(v, rhs), ctx); + return stream_predicate::( + array, + nullability, + move |v| compare_value(v, rhs, operator), + ctx, + ); } // Over-allocate to whole 1024-bit blocks in padded coordinates so every block - including the @@ -101,18 +105,7 @@ where let out = words[range.start / U64_BITS..] .first_chunk_mut::() .vortex_expect("over-allocated buffer holds a full block per chunk"); - // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::()` packed - // elements and `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. 
The - // kernel assigns every word in `transposed`, so its previous contents are irrelevant. - unsafe { - <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( - bit_width, - packed_chunk, - &mut transposed, - cmp, - rhs, - ); - } + unpack_cmp_operator::(operator, bit_width, packed_chunk, &mut transposed, rhs); untranspose_bits::<::Physical>(&transposed, out); }); } @@ -133,7 +126,7 @@ where for (&global, &value) in indices.iter().zip(values) { let global: usize = global.as_(); let idx = global - p_off; - bits.set_to(idx, cmp(value, rhs)) + bits.set_to(idx, compare_value(value, rhs, operator)) } }); } @@ -141,3 +134,86 @@ where let validity = array.validity()?.union_nullability(nullability); Ok(BoolArray::new(bits.freeze(), validity).into_array()) } + +fn unpack_cmp_operator( + operator: CompareOperator, + bit_width: usize, + packed_chunk: &[::Physical], + transposed: &mut [u64; WORDS_PER_CHUNK], + rhs: T, +) where + T: NativePType + + BitPackedIter + + FastLanesComparable::Physical>, + ::Physical: BitPacking + NativePType + BitPackingCompare, +{ + let (operator, invert) = canonical_operator(operator); + + // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::()` packed + // elements and `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. The + // kernel assigns every word in `transposed`, so its previous contents are irrelevant. 
+ unsafe { + match operator { + CompareOperator::Eq => { + <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( + bit_width, + packed_chunk, + transposed, + T::is_eq, + rhs, + ); + } + CompareOperator::Lt => { + <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( + bit_width, + packed_chunk, + transposed, + T::is_lt, + rhs, + ); + } + CompareOperator::Gt => { + <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( + bit_width, + packed_chunk, + transposed, + T::is_gt, + rhs, + ); + } + CompareOperator::NotEq | CompareOperator::Lte | CompareOperator::Gte => { + unreachable!("canonical_operator only returns Eq, Lt, or Gt") + } + } + } + + if invert { + for word in transposed { + *word = !*word; + } + } +} + +#[inline] +fn canonical_operator(operator: CompareOperator) -> (CompareOperator, bool) { + match operator { + CompareOperator::Eq => (CompareOperator::Eq, false), + CompareOperator::NotEq => (CompareOperator::Eq, true), + CompareOperator::Lt => (CompareOperator::Lt, false), + CompareOperator::Lte => (CompareOperator::Gt, true), + CompareOperator::Gt => (CompareOperator::Gt, false), + CompareOperator::Gte => (CompareOperator::Lt, true), + } +} + +#[inline] +fn compare_value(value: T, rhs: T, operator: CompareOperator) -> bool { + match operator { + CompareOperator::Eq => value.is_eq(rhs), + CompareOperator::NotEq => value.is_ne(rhs), + CompareOperator::Lt => value.is_lt(rhs), + CompareOperator::Lte => value.is_le(rhs), + CompareOperator::Gt => value.is_gt(rhs), + CompareOperator::Gte => value.is_ge(rhs), + } +}