Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion encodings/fsst/src/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,28 @@

use vortex_array::ArrayRef;
use vortex_array::ArrayView;
use vortex_array::Canonical;
use vortex_array::ExecutionCtx;
use vortex_array::IntoArray;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt;
use vortex_array::arrays::varbin::VarBinArrayExt;
use vortex_array::match_each_integer_ptype;
use vortex_array::scalar_fn::fns::like::Like;
use vortex_array::scalar_fn::fns::like::LikeKernel;
use vortex_array::scalar_fn::fns::like::LikeOptions;
use vortex_error::VortexResult;

use crate::FSST;
use crate::FSSTArrayExt;
use crate::canonical::canonicalize_fsst;
use crate::dfa::FsstMatcher;
use crate::dfa::LikeKind;
use crate::dfa::dfa_scan_to_bitbuf;

const DECODE_CONTAINS_MAX_NEEDLE_LEN: usize = 16;

impl LikeKernel for FSST {
fn like(
array: ArrayView<'_, Self>,
Expand Down Expand Up @@ -47,9 +54,24 @@ impl LikeKernel for FSST {
return Ok(None);
};

let like_kind = LikeKind::parse(pattern_bytes);
if let Some(LikeKind::Contains(needle)) = like_kind
&& !needle.is_empty()
&& needle.len() <= DECODE_CONTAINS_MAX_NEEDLE_LEN
{
// For short substring patterns, bulk FSST decode plus Arrow's memmem-backed LIKE is
// faster than walking the compressed stream through the byte-at-a-time DFA.
let decoded = canonicalize_fsst(array, ctx)?;
let result = Like
.try_new_array(array.len(), options, [decoded, pattern.clone()])?
.into_array()
.execute::<Canonical>(ctx)?
.into_bool();
return Ok(Some(result.into_array()));
}

let symbols = array.symbols();
let symbol_lengths = array.symbol_lengths();

let Some(matcher) =
FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
else {
Expand Down
4 changes: 2 additions & 2 deletions encodings/fsst/src/dfa/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,15 +211,15 @@ impl FsstMatcher {
}

/// The subset of LIKE patterns we can handle without decompression.
enum LikeKind<'a> {
pub(crate) enum LikeKind<'a> {
/// `prefix%`
Prefix(Cow<'a, [u8]>),
/// `%needle%`
Contains(Cow<'a, [u8]>),
}

impl<'a> LikeKind<'a> {
fn parse(pattern: &'a [u8]) -> Option<Self> {
pub(crate) fn parse(pattern: &'a [u8]) -> Option<Self> {
Self::parse_prefix(pattern).or_else(|| Self::parse_contains(pattern))
}

Expand Down
43 changes: 43 additions & 0 deletions encodings/fsst/src/kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,20 @@ mod tests {
use vortex_array::Canonical;
use vortex_array::IntoArray;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::ConstantArray;
use vortex_array::arrays::FilterArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::SharedArray;
use vortex_array::arrays::VarBinArray;
use vortex_array::arrays::varbin::builder::VarBinBuilder;
use vortex_array::assert_arrays_eq;
use vortex_array::builtins::ArrayBuiltins;
use vortex_array::dtype::DType;
use vortex_array::dtype::Nullability;
use vortex_array::expr::byte_length;
use vortex_array::expr::root;
use vortex_array::scalar_fn::fns::operators::Operator;
use vortex_error::VortexResult;
use vortex_mask::Mask;
use vortex_session::VortexSession;
Expand Down Expand Up @@ -230,4 +236,41 @@ mod tests {
assert_arrays_eq!(result, expected, &mut ctx);
Ok(())
}

#[test]
fn test_shared_fsst_parent_kernels() -> VortexResult<()> {
let session = vortex_array::array_session();
crate::initialize(&session);
let mut ctx = session.create_execution_ctx();

let varbin = VarBinArray::from_iter(
["hello", "", "world!!"].map(Some),
DType::Utf8(Nullability::NonNullable),
)
.into_array();
let compressor = fsst_train_compressor(&varbin, &mut ctx)?;
let fsst = fsst_compress(&varbin, &compressor, &mut ctx)?.into_array();
let shared = SharedArray::new(fsst).into_array();

let lengths = shared.clone().apply(&byte_length(root()))?;
assert_arrays_eq!(
lengths,
PrimitiveArray::from_iter(vec![5u64, 0, 7]),
&mut ctx
);

let not_empty = shared
.binary(
ConstantArray::new("", shared.len()).into_array(),
Operator::NotEq,
)?
.execute::<BoolArray>(&mut ctx)?;
assert_arrays_eq!(
not_empty,
BoolArray::from_iter([true, false, true]),
&mut ctx
);

Ok(())
}
}
8 changes: 8 additions & 0 deletions vortex-array/src/arrays/shared/vtable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,14 @@ impl VTable for Shared {
.get_or_compute(|source| source.clone().execute::<Canonical>(ctx))
.map(ExecutionResult::done)
}

fn reduce_parent(
array: ArrayView<'_, Self>,
parent: &ArrayRef,
child_idx: usize,
) -> VortexResult<Option<ArrayRef>> {
array.current_array_ref().reduce_parent(parent, child_idx)
}
}
impl OperationsVTable<Shared> for Shared {
fn scalar_at(
Expand Down
Loading