Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions vortex-array/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2268,6 +2268,12 @@ pub type vortex_array::arrays::decimal::DecimalArray = vortex_array::Array<vorte

pub mod vortex_array::arrays::dict

pub mod vortex_array::arrays::dict::cardinality

pub fn vortex_array::arrays::dict::cardinality::estimate_code_cardinality<I: vortex_array::dtype::IntegerPType>(&vortex_array::arrays::PrimitiveArray, &vortex_mask::Mask) -> core::option::Option<usize>

pub fn vortex_array::arrays::dict::cardinality::has_repeated_code_sample<I: vortex_array::dtype::IntegerPType>(&vortex_array::arrays::PrimitiveArray, &vortex_mask::Mask) -> bool

pub mod vortex_array::arrays::dict::vtable

pub struct vortex_array::arrays::dict::vtable::Dict
Expand Down
91 changes: 55 additions & 36 deletions vortex-array/src/arrays/dict/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use vortex_error::VortexResult;
use vortex_error::vortex_bail;
use vortex_error::vortex_ensure;
use vortex_mask::AllOr;
use vortex_mask::Mask;

use crate::ArrayRef;
use crate::ArraySlots;
Expand All @@ -23,6 +24,7 @@ use crate::array::ArrayParts;
use crate::array::TypedArrayRef;
use crate::array_slots;
use crate::arrays::Dict;
use crate::arrays::PrimitiveArray;
use crate::dtype::DType;
use crate::dtype::PType;
use crate::match_each_integer_ptype;
Expand Down Expand Up @@ -150,46 +152,63 @@ pub trait DictArrayExt: TypedArrayRef<Dict> + DictArraySlotsExt {
.execute_mask(codes.len(), &mut LEGACY_SESSION.create_execution_ctx())?;
#[expect(deprecated)]
let codes_primitive = self.codes().to_primitive();
let values_len = self.values().len();

let init_value = !referenced;
let referenced_value = referenced;

let mut values_vec = vec![init_value; values_len];
match codes_validity.bit_buffer() {
AllOr::All => {
match_each_integer_ptype!(codes_primitive.ptype(), |P| {
#[allow(
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
)]
for &idx in codes_primitive.as_slice::<P>() {
values_vec[idx as usize] = referenced_value;
}
});
}
AllOr::None => {}
AllOr::Some(mask) => {
match_each_integer_ptype!(codes_primitive.ptype(), |P| {
let codes = codes_primitive.as_slice::<P>();

#[allow(
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
)]
mask.set_indices().for_each(|idx| {
values_vec[codes[idx] as usize] = referenced_value;
});
compute_referenced_values_mask_from_codes(
&codes_primitive,
self.values().len(),
&codes_validity,
referenced,
)
}
}
impl<T: TypedArrayRef<Dict>> DictArrayExt for T {}

/// Build an exact bitmap over dictionary values referenced by valid codes.
///
/// The sampling-based sparse dictionary estimator only decides whether an exact pass is likely to
/// be worthwhile. This helper is the exact pass: aggregate kernels use it to ignore unreferenced
/// values, and sparse dictionary canonicalization uses it to compact values before remapping codes.
pub(crate) fn compute_referenced_values_mask_from_codes(
    codes_primitive: &PrimitiveArray,
    values_len: usize,
    codes_validity: &Mask,
    referenced: bool,
) -> VortexResult<BitBuffer> {
    // Every bit starts at the complement of `referenced`; each valid code then flips the
    // bit of the value it points at to `referenced`.
    let init_value = !referenced;
    let referenced_value = referenced;

    let mut values_vec = vec![init_value; values_len];
    match codes_validity.bit_buffer() {
        AllOr::All => {
            // All codes are valid: a straight scan over the full typed code slice.
            match_each_integer_ptype!(codes_primitive.ptype(), |P| {
                #[allow(
                    clippy::cast_possible_truncation,
                    clippy::cast_sign_loss,
                    reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
                )]
                for &idx in codes_primitive.as_slice::<P>() {
                    values_vec[idx as usize] = referenced_value;
                }
            });
        }
        // No valid codes: nothing is referenced, so the init bits already describe the result.
        AllOr::None => {}
        AllOr::Some(mask) => {
            // Mixed validity: only visit the positions the validity mask marks as set.
            match_each_integer_ptype!(codes_primitive.ptype(), |P| {
                let codes = codes_primitive.as_slice::<P>();

                #[allow(
                    clippy::cast_possible_truncation,
                    clippy::cast_sign_loss,
                    reason = "codes are non-negative indices; a negative signed code would wrap to a large usize and panic on the bounds-checked array index"
                )]
                mask.set_indices().for_each(|idx| {
                    values_vec[codes[idx] as usize] = referenced_value;
                });
            });
        }
    }

    Ok(BitBuffer::from(values_vec))
}

Ok(BitBuffer::from(values_vec))
}
impl<T: TypedArrayRef<Dict>> DictArrayExt for T {}

/// Concrete parts of a [`DictArray`](super::DictArray) after iterative execution.
pub struct DictParts {
Expand Down
135 changes: 135 additions & 0 deletions vortex-array/src/arrays/dict/cardinality.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Sampling-based cardinality estimation for dictionary codes.
//!
//! This module is used only as a cheap gate before the exact sparse-dictionary remap pass and as a
//! routing hint for downstream exporters. The estimate may be conservative or noisy, but
//! correctness does not depend on it: callers must still collect the exact unique code set and
//! re-check the sparse threshold before compacting.

use vortex_mask::Mask;

use crate::arrays::PrimitiveArray;
use crate::dtype::IntegerPType;

const SAMPLE_SIZE: usize = 128;
const REPEATED_CODE_PROBE_SIZE: usize = 16;

/// Return whether a small deterministic probe observes a repeated non-null code.
///
/// Sparse canonicalization always has a cheap worst-case gate before it samples. This probe is the
/// next, cheaper filter for cases that are not sparse by row count alone: dense dictionaries should
/// not pay the full estimator cost unless the code stream first shows evidence of repeated codes.
/// A `true` result only means "run the estimator"; it is not enough to compact by itself.
pub fn has_repeated_code_sample<I: IntegerPType>(
    codes: &PrimitiveArray,
    validity_mask: &Mask,
) -> bool {
    let sample_count = codes.len().min(REPEATED_CODE_PROBE_SIZE);
    // Hoisted out of the loop: the typed view of the codes is loop-invariant.
    let code_slice = codes.as_slice::<I>();
    let mut observed_codes = Vec::<usize>::with_capacity(sample_count);

    for sample_idx in 0..sample_count {
        let idx = sample_index(sample_idx, codes.len(), sample_count);
        // Null codes carry no repetition evidence; skip them.
        if !validity_mask.value(idx) {
            continue;
        }

        let code: usize = code_slice[idx].as_();
        // Linear scan is fine: the probe holds at most REPEATED_CODE_PROBE_SIZE entries.
        if observed_codes.contains(&code) {
            return true;
        }
        observed_codes.push(code);
    }

    false
}

/// Estimate the number of distinct non-null dictionary codes.
///
/// The estimator samples deterministic bucket midpoints so repeated executions make the same
/// compaction decision for the same input. Returning `None` means no valid sampled codes were seen.
/// A returned value should only be used to decide whether an exact pass is worth attempting.
pub fn estimate_code_cardinality<I: IntegerPType>(
    codes: &PrimitiveArray,
    validity_mask: &Mask,
) -> Option<usize> {
    let sample_count = codes.len().min(SAMPLE_SIZE);
    // Hoisted out of the loop: the typed view of the codes is loop-invariant.
    let code_slice = codes.as_slice::<I>();
    // `(code, observed_count)` pairs; at most `sample_count` distinct codes can be observed,
    // so reserve once up front instead of growing incrementally.
    let mut observed_codes = Vec::<(usize, usize)>::with_capacity(sample_count);

    // Sample deterministic bucket midpoints instead of using randomness. The estimate only gates
    // whether to run the exact pass; correctness never depends on the sample.
    for sample_idx in 0..sample_count {
        let idx = sample_index(sample_idx, codes.len(), sample_count);
        // Null codes are excluded from the distinct-code estimate.
        if !validity_mask.value(idx) {
            continue;
        }

        let code: usize = code_slice[idx].as_();
        // Linear scan is acceptable: SAMPLE_SIZE bounds the vector at 128 entries.
        if let Some((_, count)) = observed_codes
            .iter_mut()
            .find(|(observed, _)| *observed == code)
        {
            *count += 1;
        } else {
            observed_codes.push((code, 1));
        }
    }

    estimate_cardinality_from_observations(&observed_codes)
}

/// Estimate total cardinality from `(code, observed_count)` sample observations.
///
/// The correction is Chao1-style: singleton-heavy samples imply more unseen codes, while repeated
/// observations imply the code stream is likely low-cardinality.
///
/// Returns `None` when there are no observations at all.
fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option<usize> {
    if observed_codes.is_empty() {
        return None;
    }

    let unique_count = observed_codes.len();
    // Count codes seen exactly once (singletons) and exactly twice (doubletons) in one pass
    // instead of two separate filter/count traversals.
    let (singleton_count, doubleton_count) =
        observed_codes
            .iter()
            .fold((0usize, 0usize), |(s, d), &(_, count)| match count {
                1 => (s + 1, d),
                2 => (s, d + 1),
                _ => (s, d),
            });

    // Chao1-style lower-bias estimate for unseen codes. Repeated samples keep the estimate small
    // for low-cardinality code streams; many singleton samples make dense streams look expensive.
    let unseen_estimate = if doubleton_count == 0 {
        // Bias-corrected form f1*(f1-1)/2, used when no doubletons were observed.
        singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2
    } else {
        // Classic f1^2 / (2*f2), rounded up; `usize::div_ceil` replaces a hand-rolled helper.
        singleton_count
            .saturating_mul(singleton_count)
            .div_ceil(2 * doubleton_count)
    };

    Some(unique_count.saturating_add(unseen_estimate))
}

/// Return the midpoint index for one deterministic sampling bucket.
///
/// Splitting the full code range into buckets avoids clustering all samples near the start while
/// avoiding RNG state in a hot execution path.
fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
    debug_assert!(len > 0);
    debug_assert!(sample_count > 0);

    // Widen to u128 so `sample_idx * len` cannot overflow for any usize inputs.
    let (i, n, k) = (sample_idx as u128, len as u128, sample_count as u128);
    let bucket = (i * n / k, (i + 1) * n / k);
    let midpoint = (bucket.0 + bucket.1) / 2;

    // Defensive clamp keeps the index inside `0..len`.
    midpoint.min(n - 1) as usize
}

/// Ceiling division for `usize` operands.
///
/// Thin wrapper kept for call-site readability; delegates to the standard library's
/// `usize::div_ceil`, which this helper previously reimplemented by hand.
///
/// # Panics
/// Panics when `denominator` is zero (division by zero).
fn div_ceil(numerator: usize, denominator: usize) -> usize {
    debug_assert!(denominator > 0);
    numerator.div_ceil(denominator)
}
68 changes: 61 additions & 7 deletions vortex-array/src/arrays/dict/compute/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,36 @@ use crate::arrays::Constant;
use crate::arrays::ConstantArray;
use crate::arrays::Dict;
use crate::arrays::DictArray;
use crate::arrays::dict::DictArrayExt;
use crate::arrays::dict::DictArraySlotsExt;
use crate::arrays::slice::SliceReduce;
use crate::scalar::Scalar;

impl SliceReduce for Dict {
fn slice(array: ArrayView<'_, Self>, range: Range<usize>) -> VortexResult<Option<ArrayRef>> {
let is_full_slice = range.len() == array.len();
let sliced_code = array.codes().slice(range)?;
// TODO(joe): if the range is size 1 replace with a constant array
if let Some(code) = sliced_code.as_opt::<Constant>() {
let code = code.scalar().as_primitive().as_::<usize>();
return if let Some(code) = code {
let values = array.values().slice(code..code + 1)?;
Ok(Some(
DictArray::new(
// SAFETY: the only dictionary value is referenced by every non-null code when the
// slice is non-empty. An empty code stream cannot reference a non-empty values
// array.
let sliced = unsafe {
DictArray::new_unchecked(
ConstantArray::new(0u8, sliced_code.len()).into_array(),
values,
)
.into_array(),
))
};
if sliced_code.is_empty() {
Ok(Some(sliced.into_array()))
} else {
Ok(Some(unsafe {
sliced.set_all_values_referenced(true).into_array()
}))
}
} else {
Ok(Some(
ConstantArray::new(Scalar::null(array.dtype().clone()), sliced_code.len())
Expand All @@ -39,9 +50,18 @@ impl SliceReduce for Dict {
};
}
// SAFETY: slicing the codes preserves invariants.
Ok(Some(
unsafe { DictArray::new_unchecked(sliced_code, array.values().clone()) }.into_array(),
))
let sliced = unsafe { DictArray::new_unchecked(sliced_code, array.values().clone()) };
if is_full_slice {
// A full-length slice preserves the exact code stream, so the referenced-values
// metadata remains sound. Partial slices may drop the only reference to a value.
Ok(Some(unsafe {
sliced
.set_all_values_referenced(array.has_all_values_referenced())
.into_array()
}))
} else {
Ok(Some(sliced.into_array()))
}
}
}

Expand All @@ -51,8 +71,10 @@ mod tests {
use vortex_error::VortexResult;

use crate::IntoArray;
use crate::arrays::Dict;
use crate::arrays::DictArray;
use crate::arrays::PrimitiveArray;
use crate::arrays::dict::DictArrayExt;
use crate::arrays::dict::compute::slice::ConstantArray;
use crate::assert_arrays_eq;
use crate::dtype::DType;
Expand Down Expand Up @@ -84,4 +106,36 @@ mod tests {
assert_arrays_eq!(sliced, expected);
Ok(())
}

#[test]
fn full_slice_preserves_all_values_referenced_metadata() -> VortexResult<()> {
    let codes = buffer![0u8, 1].into_array();
    let values = buffer![10i32, 20].into_array();
    // SAFETY: codes 0 and 1 both index the two-element values array, and together they
    // reference every value, so flagging the metadata is sound.
    let dict =
        unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) };

    // A slice spanning the whole array keeps the exact same code stream.
    let full = dict.slice(0..2)?;

    assert!(full.as_::<Dict>().has_all_values_referenced());
    Ok(())
}

#[test]
fn partial_slice_drops_all_values_referenced_metadata() -> VortexResult<()> {
    let codes = buffer![0u8, 1].into_array();
    let values = buffer![10i32, 20].into_array();
    // SAFETY: codes 0 and 1 both index the two-element values array, and together they
    // reference every value, so flagging the metadata is sound for the unsliced array.
    let dict =
        unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) };

    // Slicing away the second code may drop the only reference to a value, so the
    // metadata must not survive a partial slice.
    let partial = dict.slice(0..1)?;

    assert!(!partial.as_::<Dict>().has_all_values_referenced());
    Ok(())
}
}
1 change: 1 addition & 0 deletions vortex-array/src/arrays/dict/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub use arbitrary::ArbitraryDictArray;
mod array;
pub use array::*;

pub mod cardinality;
pub(crate) mod compute;
mod execute;

Expand Down
Loading
Loading