From d15418f8f6111e3ac15924b2c2e4772536cb9067 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 10:42:07 -0400 Subject: [PATCH] Improve FSST LIKE contains handling Signed-off-by: Nicholas Gates --- encodings/fsst/src/compute/like.rs | 24 ++++++++++++- encodings/fsst/src/dfa/mod.rs | 4 +-- encodings/fsst/src/kernel.rs | 43 ++++++++++++++++++++++++ vortex-array/src/arrays/shared/vtable.rs | 8 +++++ 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/encodings/fsst/src/compute/like.rs b/encodings/fsst/src/compute/like.rs index c922fba088e..1d621294b45 100644 --- a/encodings/fsst/src/compute/like.rs +++ b/encodings/fsst/src/compute/like.rs @@ -3,21 +3,28 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; +use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; use vortex_array::arrays::varbin::VarBinArrayExt; use vortex_array::match_each_integer_ptype; +use vortex_array::scalar_fn::fns::like::Like; use vortex_array::scalar_fn::fns::like::LikeKernel; use vortex_array::scalar_fn::fns::like::LikeOptions; use vortex_error::VortexResult; use crate::FSST; use crate::FSSTArrayExt; +use crate::canonical::canonicalize_fsst; use crate::dfa::FsstMatcher; +use crate::dfa::LikeKind; use crate::dfa::dfa_scan_to_bitbuf; +const DECODE_CONTAINS_MAX_NEEDLE_LEN: usize = 16; + impl LikeKernel for FSST { fn like( array: ArrayView<'_, Self>, @@ -47,9 +54,24 @@ impl LikeKernel for FSST { return Ok(None); }; + let like_kind = LikeKind::parse(pattern_bytes); + if let Some(LikeKind::Contains(needle)) = like_kind + && !needle.is_empty() + && needle.len() <= DECODE_CONTAINS_MAX_NEEDLE_LEN + { + // For short substring patterns, bulk FSST decode plus Arrow's memmem-backed LIKE is + // faster than walking the compressed stream through the byte-at-a-time DFA. + let decoded = canonicalize_fsst(array, ctx)?; + let result = Like + .try_new_array(array.len(), options, [decoded, pattern.clone()])? + .into_array() + .execute::(ctx)? + .into_bool(); + return Ok(Some(result.into_array())); + } + let symbols = array.symbols(); let symbol_lengths = array.symbol_lengths(); - let Some(matcher) = FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)? else { diff --git a/encodings/fsst/src/dfa/mod.rs b/encodings/fsst/src/dfa/mod.rs index 5f67f92997e..2fcafb19cfc 100644 --- a/encodings/fsst/src/dfa/mod.rs +++ b/encodings/fsst/src/dfa/mod.rs @@ -211,7 +211,7 @@ impl FsstMatcher { } /// The subset of LIKE patterns we can handle without decompression. -enum LikeKind<'a> { +pub(crate) enum LikeKind<'a> { /// `prefix%` Prefix(Cow<'a, [u8]>), /// `%needle%` @@ -219,7 +219,7 @@ enum LikeKind<'a> { } impl<'a> LikeKind<'a> { - fn parse(pattern: &'a [u8]) -> Option { + pub(crate) fn parse(pattern: &'a [u8]) -> Option { Self::parse_prefix(pattern).or_else(|| Self::parse_contains(pattern)) } diff --git a/encodings/fsst/src/kernel.rs b/encodings/fsst/src/kernel.rs index 30f25006195..95d25080af7 100644 --- a/encodings/fsst/src/kernel.rs +++ b/encodings/fsst/src/kernel.rs @@ -38,14 +38,20 @@ mod tests { use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::ConstantArray; use vortex_array::arrays::FilterArray; use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::SharedArray; + use vortex_array::arrays::VarBinArray; use vortex_array::arrays::varbin::builder::VarBinBuilder; use vortex_array::assert_arrays_eq; + use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::expr::byte_length; use vortex_array::expr::root; + use vortex_array::scalar_fn::fns::operators::Operator; use vortex_error::VortexResult; use vortex_mask::Mask; use vortex_session::VortexSession; @@ -230,4 +236,41 @@ mod tests { assert_arrays_eq!(result, expected, &mut ctx); Ok(()) } + + #[test] + fn test_shared_fsst_parent_kernels() -> VortexResult<()> { + let session = vortex_array::array_session(); + crate::initialize(&session); + let mut ctx = session.create_execution_ctx(); + + let varbin = VarBinArray::from_iter( + ["hello", "", "world!!"].map(Some), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let compressor = fsst_train_compressor(&varbin, &mut ctx)?; + let fsst = fsst_compress(&varbin, &compressor, &mut ctx)?.into_array(); + let shared = SharedArray::new(fsst).into_array(); + + let lengths = shared.clone().apply(&byte_length(root()))?; + assert_arrays_eq!( + lengths, + PrimitiveArray::from_iter(vec![5u64, 0, 7]), + &mut ctx + ); + + let not_empty = shared + .binary( + ConstantArray::new("", shared.len()).into_array(), + Operator::NotEq, + )? + .execute::(&mut ctx)?; + assert_arrays_eq!( + not_empty, + BoolArray::from_iter([true, false, true]), + &mut ctx + ); + + Ok(()) + } } diff --git a/vortex-array/src/arrays/shared/vtable.rs b/vortex-array/src/arrays/shared/vtable.rs index 3c3a09216d2..fc03255d785 100644 --- a/vortex-array/src/arrays/shared/vtable.rs +++ b/vortex-array/src/arrays/shared/vtable.rs @@ -113,6 +113,14 @@ impl VTable for Shared { .get_or_compute(|source| source.clone().execute::(ctx)) .map(ExecutionResult::done) } + + fn reduce_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult> { + array.current_array_ref().reduce_parent(parent, child_idx) + } } impl OperationsVTable for Shared { fn scalar_at(