From bf4b790e080ff9e4b14270a7719e424c9bec156d Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 10:54:22 -0400 Subject: [PATCH 1/3] Validate row-index selections Signed-off-by: "Nicholas Gates" --- vortex-bench/src/random_access/take.rs | 2 +- vortex-cxx/src/lib.rs | 5 +- vortex-cxx/src/read.rs | 5 +- vortex-datafusion/src/persistent/opener.rs | 8 +- vortex-duckdb/src/convert/table_filter.rs | 16 +- vortex-ffi/src/scan.rs | 4 +- vortex-file/src/tests.rs | 9 +- vortex-jni/src/scan.rs | 4 +- vortex-layout/src/scan/scan_builder.rs | 6 +- vortex-python/src/dataset.rs | 2 +- vortex-python/src/file.rs | 2 +- vortex-scan/README.md | 2 +- vortex-scan/src/selection.rs | 234 +++++++++++++++++---- 13 files changed, 240 insertions(+), 59 deletions(-) diff --git a/vortex-bench/src/random_access/take.rs b/vortex-bench/src/random_access/take.rs index e86e5d576d6..42d6ebb4924 100644 --- a/vortex-bench/src/random_access/take.rs +++ b/vortex-bench/src/random_access/take.rs @@ -76,7 +76,7 @@ impl RandomAccessor for VortexRandomAccessor { let array = self .file .scan()? - .with_row_indices(indices_buf) + .with_row_indices(indices_buf)? .into_array_stream()? .read_all() .await?; diff --git a/vortex-cxx/src/lib.rs b/vortex-cxx/src/lib.rs index b6d002513f6..a7535e66027 100644 --- a/vortex-cxx/src/lib.rs +++ b/vortex-cxx/src/lib.rs @@ -95,7 +95,10 @@ mod ffi { fn with_projection(self: &mut VortexScanBuilder, projection: Box); fn with_projection_ref(self: &mut VortexScanBuilder, projection: &Expr); fn with_row_range(self: &mut VortexScanBuilder, row_range_start: u64, row_range_end: u64); - fn with_include_by_index(self: &mut VortexScanBuilder, include_by_index: &[u64]); + fn with_include_by_index( + self: &mut VortexScanBuilder, + include_by_index: &[u64], + ) -> Result<()>; fn with_limit(self: &mut VortexScanBuilder, limit: usize); unsafe fn with_output_schema( self: &mut VortexScanBuilder, diff --git a/vortex-cxx/src/read.rs b/vortex-cxx/src/read.rs index 4ce229b7a2b..be071958d0b 100644 --- a/vortex-cxx/src/read.rs +++ b/vortex-cxx/src/read.rs @@ -91,9 +91,10 @@ impl VortexScanBuilder { }); } - pub(crate) fn with_include_by_index(&mut self, include_by_index: &[u64]) { - let selection = Selection::IncludeByIndex(Buffer::copy_from(include_by_index)); + pub(crate) fn with_include_by_index(&mut self, include_by_index: &[u64]) -> Result<()> { + let selection = Selection::include_by_index(Buffer::copy_from(include_by_index))?; take_mut::take(&mut self.inner, |inner| inner.with_selection(selection)); + Ok(()) } pub(crate) fn with_limit(&mut self, limit: usize) { diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 2923b6c313c..4bfcf45f442 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -1207,9 +1207,9 @@ mod tests { let mut file = PartitionedFile::new(file_path.to_string(), data_size); file.extensions .insert( - VortexAccessPlan::default().with_selection(Selection::IncludeByIndex( + VortexAccessPlan::default().with_selection(Selection::include_by_index( Buffer::from_iter(vec![1, 3, 5, 7]), - )), + )?), ); let opener = make_test_opener( @@ -1251,9 +1251,9 @@ mod tests { let mut file = PartitionedFile::new(file_path.to_string(), data_size); file.extensions .insert( - VortexAccessPlan::default().with_selection(Selection::ExcludeByIndex( + VortexAccessPlan::default().with_selection(Selection::exclude_by_index( Buffer::from_iter(vec![0, 2, 4, 6, 8]), - )), + )?), ); let opener = make_test_opener( diff --git a/vortex-duckdb/src/convert/table_filter.rs b/vortex-duckdb/src/convert/table_filter.rs index fa7ee26e744..2c8be999e04 100644 --- a/vortex-duckdb/src/convert/table_filter.rs +++ b/vortex-duckdb/src/convert/table_filter.rs @@ -166,6 +166,12 @@ fn intersect_sorted(left: &[u64], right: &[u64]) -> Vec { result } +fn normalize_indices(mut indices: Vec) -> Vec { + indices.sort_unstable(); + indices.dedup(); + indices +} + /// For constant comparison on IN filters over file_index or file_row_number /// virtual column, create a selection and a range covering the same range as /// expressions do. @@ -178,7 +184,10 @@ pub fn try_from_virtual_column_filter( .iter() .map(nonnegative_number_from_value) .collect::>>()?; - Ok((Selection::IncludeByIndex(Buffer::from_iter(indices)), None)) + Ok(( + Selection::include_by_index(Buffer::from_iter(normalize_indices(indices)))?, + None, + )) } TableFilterClass::ConstantComparison(const_) => { let n = nonnegative_number_from_value(const_.value)?; @@ -206,7 +215,7 @@ pub fn try_from_virtual_column_filter( let (sel, range) = try_from_virtual_column_filter(child)?; if let Selection::IncludeByIndex(buf) = sel { indices = Some(match indices { - None => buf.iter().copied().collect(), + None => buf.as_slice().to_vec(), Some(existing) => intersect_sorted(&existing, buf.as_ref()), }); } @@ -217,7 +226,8 @@ pub fn try_from_virtual_column_filter( } let range = (start < end).then_some(start..end); let sel = indices - .map(|v| Selection::IncludeByIndex(Buffer::from_iter(v))) + .map(|v| Selection::include_by_index(Buffer::from_iter(v))) + .transpose()? .unwrap_or(Selection::All); Ok((sel, range)) } diff --git a/vortex-ffi/src/scan.rs b/vortex-ffi/src/scan.rs index 619f82e3f3e..86badf9ff67 100644 --- a/vortex-ffi/src/scan.rs +++ b/vortex-ffi/src/scan.rs @@ -177,13 +177,13 @@ fn scan_request(opts: *const vx_scan_options) -> VortexResult { vortex_ensure!(!selection.idx.is_null()); let buf = unsafe { slice::from_raw_parts(selection.idx, selection.idx_len) }; let buf = Buffer::copy_from(buf); - Selection::IncludeByIndex(buf) + Selection::include_by_index(buf)? } vx_scan_selection_include::VX_SELECTION_EXCLUDE_RANGE => { vortex_ensure!(!selection.idx.is_null()); let buf = unsafe { slice::from_raw_parts(selection.idx, selection.idx_len) }; let buf = Buffer::copy_from(buf); - Selection::ExcludeByIndex(buf) + Selection::exclude_by_index(buf)? } }; diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index 295fef36039..8e60368d55d 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -741,6 +741,7 @@ async fn test_with_indices_simple() { .scan() .unwrap() .with_row_indices(Buffer::::empty()) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -758,6 +759,7 @@ async fn test_with_indices_simple() { .scan() .unwrap() .with_row_indices(Buffer::from_iter(kept_indices)) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -783,6 +785,7 @@ async fn test_with_indices_simple() { .scan() .unwrap() .with_row_indices((0u64..500).collect::>()) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -828,6 +831,7 @@ async fn test_with_indices_on_two_columns() { .scan() .unwrap() .with_row_indices(Buffer::from_iter(kept_indices)) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -885,6 +889,7 @@ async fn test_with_indices_and_with_row_filter_simple() { .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) .with_row_indices(Buffer::empty()) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -903,6 +908,7 @@ async fn test_with_indices_and_with_row_filter_simple() { .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) .with_row_indices(Buffer::from_iter(kept_indices)) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -931,6 +937,7 @@ async fn test_with_indices_and_with_row_filter_simple() { .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) .with_row_indices((0..500).collect::>()) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1233,7 +1240,7 @@ async fn file_take() -> VortexResult<()> { let vxf = chunked_file().await?; let result = vxf .scan()? - .with_row_indices(buffer![0, 1, 8]) + .with_row_indices(buffer![0, 1, 8])? .into_array_stream()? .read_all() .await?; diff --git a/vortex-jni/src/scan.rs b/vortex-jni/src/scan.rs index 606ec8cd040..0a59a3ee1e8 100644 --- a/vortex-jni/src/scan.rs +++ b/vortex-jni/src/scan.rs @@ -96,8 +96,8 @@ fn build_scan_request( let selection = match selection_include { 0 => Selection::All, - 1 => Selection::IncludeByIndex(Buffer::copy_from(selection_idx)), - 2 => Selection::ExcludeByIndex(Buffer::copy_from(selection_idx)), + 1 => Selection::include_by_index(Buffer::copy_from(selection_idx))?, + 2 => Selection::exclude_by_index(Buffer::copy_from(selection_idx))?, 3 => Selection::IncludeRoaring(deserialize_roaring_selection(selection_roaring_bitmap)?), 4 => Selection::ExcludeRoaring(deserialize_roaring_selection(selection_roaring_bitmap)?), other => vortex_bail!("unknown selection include code: {other}"), diff --git a/vortex-layout/src/scan/scan_builder.rs b/vortex-layout/src/scan/scan_builder.rs index 11fd5c7b882..224a9e8783c 100644 --- a/vortex-layout/src/scan/scan_builder.rs +++ b/vortex-layout/src/scan/scan_builder.rs @@ -173,9 +173,9 @@ impl ScanBuilder { } /// Select rows by absolute indices relative to the scan input. - pub fn with_row_indices(mut self, row_indices: Buffer) -> Self { - self.selection = Selection::IncludeByIndex(row_indices); - self + pub fn with_row_indices(mut self, row_indices: Buffer) -> VortexResult { + self.selection = Selection::include_by_index(row_indices)?; + Ok(self) } /// Set the root row offset used by row-index expressions. diff --git a/vortex-python/src/dataset.rs b/vortex-python/src/dataset.rs index acb8285e5a0..da2d8f018ad 100644 --- a/vortex-python/src/dataset.rs +++ b/vortex-python/src/dataset.rs @@ -66,7 +66,7 @@ pub fn read_array_from_reader( if let Some(indices) = indices { let primitive = indices.execute::(ctx)?; let indices = primitive.into_buffer(); - scan = scan.with_row_indices(indices); + scan = scan.with_row_indices(indices)?; } if let Some((l, r)) = row_range { diff --git a/vortex-python/src/file.rs b/vortex-python/src/file.rs index f6bd1ed9cda..65327b1e967 100644 --- a/vortex-python/src/file.rs +++ b/vortex-python/src/file.rs @@ -220,7 +220,7 @@ fn scan_builder( if let Some(indices) = indices { let casted = indices.cast(DType::Primitive(PType::U64, NonNullable))?; let indices = casted.execute::(ctx)?.into_buffer::(); - builder = builder.with_row_indices(indices); + builder = builder.with_row_indices(indices)?; } if let Some(batch_size) = batch_size { diff --git a/vortex-scan/README.md b/vortex-scan/README.md index 31621370f50..55ed6fc2bec 100644 --- a/vortex-scan/README.md +++ b/vortex-scan/README.md @@ -90,7 +90,7 @@ use vortex_scan::Selection; // Select specific rows by index let scan = ScanBuilder::new(layout_reader) -.with_selection(Selection::IncludeByIndex(indices.into())) +.with_selection(Selection::include_by_index(indices.into())?) .build() ?; // Or use row ranges diff --git a/vortex-scan/src/selection.rs b/vortex-scan/src/selection.rs index 79abafc3d4d..d17af0b950d 100644 --- a/vortex-scan/src/selection.rs +++ b/vortex-scan/src/selection.rs @@ -7,11 +7,99 @@ use std::ops::Not; use std::ops::Range; use vortex_buffer::Buffer; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; use vortex_error::vortex_panic; use vortex_mask::Mask; use crate::row_mask::RowMask; +/// A validated selection of rows to include by absolute row index. +#[derive(Clone, Debug)] +pub struct IncludeByIndex { + indices: Buffer, +} + +impl IncludeByIndex { + /// Create a new include-by-index selection. + pub fn try_new(indices: Buffer) -> VortexResult { + validate_indices(&indices)?; + Ok(Self { indices }) + } + + /// Return the selected row indices. + pub fn as_slice(&self) -> &[u64] { + self.indices.as_slice() + } + + /// Return true if the selection contains no row indices. + pub fn is_empty(&self) -> bool { + self.indices.is_empty() + } + + /// Return the number of selected row indices. + pub fn len(&self) -> usize { + self.indices.len() + } +} + +impl std::ops::Deref for IncludeByIndex { + type Target = [u64]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl AsRef<[u64]> for IncludeByIndex { + fn as_ref(&self) -> &[u64] { + self.as_slice() + } +} + +/// A validated selection of rows to exclude by absolute row index. +#[derive(Clone, Debug)] +pub struct ExcludeByIndex { + indices: Buffer, +} + +impl ExcludeByIndex { + /// Create a new exclude-by-index selection. + pub fn try_new(indices: Buffer) -> VortexResult { + validate_indices(&indices)?; + Ok(Self { indices }) + } + + /// Return the excluded row indices. + pub fn as_slice(&self) -> &[u64] { + self.indices.as_slice() + } + + /// Return true if the selection contains no row indices. + pub fn is_empty(&self) -> bool { + self.indices.is_empty() + } + + /// Return the number of excluded row indices. + pub fn len(&self) -> usize { + self.indices.len() + } +} + +impl std::ops::Deref for ExcludeByIndex { + type Target = [u64]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl AsRef<[u64]> for ExcludeByIndex { + fn as_ref(&self) -> &[u64] { + self.as_slice() + } +} + /// A selection identifies a set of rows to include in the scan (in addition to applying any /// filter predicates). #[derive(Default, Clone, Debug)] @@ -19,10 +107,10 @@ pub enum Selection { /// No selection, all rows are included. #[default] All, - /// A selection of sorted rows to include by index. - IncludeByIndex(Buffer), - /// A selection of sorted rows to exclude by index. - ExcludeByIndex(Buffer), + /// A selection of sorted, unique rows to include by index. + IncludeByIndex(IncludeByIndex), + /// A selection of sorted, unique rows to exclude by index. + ExcludeByIndex(ExcludeByIndex), /// A selection of rows to include using a [`roaring::RoaringTreemap`]. IncludeRoaring(roaring::RoaringTreemap), /// A selection of rows to exclude using a [`roaring::RoaringTreemap`]. @@ -30,6 +118,16 @@ pub enum Selection { } impl Selection { + /// Create a selection of rows to include by absolute row index. + pub fn include_by_index(indices: Buffer) -> VortexResult { + Ok(Self::IncludeByIndex(IncludeByIndex::try_new(indices)?)) + } + + /// Create a selection of rows to exclude by absolute row index. + pub fn exclude_by_index(indices: Buffer) -> VortexResult { + Ok(Self::ExcludeByIndex(ExcludeByIndex::try_new(indices)?)) + } + /// Return the row count for this selection. pub fn row_count(&self, total_rows: u64) -> u64 { match self { @@ -62,12 +160,12 @@ impl Selection { match self { Selection::All => RowMask::new(range.start, Mask::new_true(range_len)), Selection::IncludeByIndex(include) => { - let mask = indices_range(range, include) + let indices = include.as_slice(); + let mask = indices_range(range, indices) .map(|idx_range| { Mask::from_indices( range_len, - include - .slice(idx_range) + indices[idx_range] .iter() .map(|idx| { idx.checked_sub(range.start).unwrap_or_else(|| { @@ -89,10 +187,26 @@ impl Selection { RowMask::new(range.start, mask) } Selection::ExcludeByIndex(exclude) => { - let mask = Selection::IncludeByIndex(exclude.clone()) - .row_mask(range) - .mask() - .clone(); + let indices = exclude.as_slice(); + let mask = indices_range(range, indices) + .map(|idx_range| { + Mask::from_indices( + range_len, + indices[idx_range] + .iter() + .map(|idx| { + idx.checked_sub(range.start).unwrap_or_else(|| { + vortex_panic!( + "index underflow, range: {:?}, idx: {:?}", + range, + idx + ) + }) + }) + .filter_map(|idx| usize::try_from(idx).ok()), + ) + }) + .unwrap_or_else(|| Mask::new_false(range_len)); RowMask::new(range.start, mask.not()) } Selection::IncludeRoaring(roaring) => { @@ -156,6 +270,24 @@ impl Selection { } } +fn validate_indices(indices: &[u64]) -> VortexResult<()> { + // Row-mask extraction uses binary search over these indices, and row_count treats + // them as set membership. Unsorted or duplicate input can otherwise silently + // mis-select rows or over-report the selected row count. + for (idx, window) in indices.windows(2).enumerate() { + if window[0] >= window[1] { + vortex_bail!( + "row index selection must be strictly increasing at positions {} and {}: {} >= {}", + idx, + idx + 1, + window[0], + window[1] + ); + } + } + Ok(()) +} + /// Find the positional range within row_indices that covers all rows in the given range. fn indices_range(range: &Range, row_indices: &[u64]) -> Option> { if row_indices.first().is_some_and(|&first| first >= range.end) @@ -177,9 +309,39 @@ fn indices_range(range: &Range, row_indices: &[u64]) -> Option mod tests { use vortex_buffer::Buffer; + use super::Selection; + + fn include(indices: impl IntoIterator) -> Selection { + Selection::include_by_index(Buffer::from_iter(indices)) + .expect("test indices should be strictly increasing") + } + + fn exclude(indices: impl IntoIterator) -> Selection { + Selection::exclude_by_index(Buffer::from_iter(indices)) + .expect("test indices should be strictly increasing") + } + + #[test] + fn include_by_index_rejects_unsorted_indices() { + let err = Selection::include_by_index(Buffer::from_iter([3, 1])).unwrap_err(); + assert!(err.to_string().contains("strictly increasing")); + } + + #[test] + fn include_by_index_rejects_duplicate_indices() { + let err = Selection::include_by_index(Buffer::from_iter([1, 1])).unwrap_err(); + assert!(err.to_string().contains("strictly increasing")); + } + + #[test] + fn exclude_by_index_rejects_unsorted_indices() { + let err = Selection::exclude_by_index(Buffer::from_iter([3, 1])).unwrap_err(); + assert!(err.to_string().contains("strictly increasing")); + } + #[test] fn test_row_mask_all() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); + let selection = include([1, 3, 5, 7]); let range = 1..8; let row_mask = selection.row_mask(&range); @@ -188,7 +350,7 @@ mod tests { #[test] fn test_row_mask_slice() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); + let selection = include([1, 3, 5, 7]); let range = 3..6; let row_mask = selection.row_mask(&range); @@ -197,7 +359,7 @@ mod tests { #[test] fn test_row_mask_exclusive() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); + let selection = include([1, 3, 5, 7]); let range = 3..5; let row_mask = selection.row_mask(&range); @@ -206,7 +368,7 @@ mod tests { #[test] fn test_row_mask_all_false() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); + let selection = include([1, 3, 5, 7]); let range = 8..10; let row_mask = selection.row_mask(&range); @@ -215,7 +377,7 @@ mod tests { #[test] fn test_row_mask_all_true() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 4, 5, 6])); + let selection = include([1, 3, 4, 5, 6]); let range = 3..7; let row_mask = selection.row_mask(&range); @@ -224,7 +386,7 @@ mod tests { #[test] fn test_row_mask_zero() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![0])); + let selection = include([0]); let range = 0..5; let row_mask = selection.row_mask(&range); @@ -244,7 +406,7 @@ mod tests { roaring.insert(5); roaring.insert(7); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 1..8; let row_mask = selection.row_mask(&range); @@ -259,7 +421,7 @@ mod tests { roaring.insert(5); roaring.insert(7); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 3..6; let row_mask = selection.row_mask(&range); @@ -274,7 +436,7 @@ mod tests { roaring.insert(5); roaring.insert(7); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 8..10; let row_mask = selection.row_mask(&range); @@ -289,7 +451,7 @@ mod tests { roaring.insert(i); } - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 1000..2000; let row_mask = selection.row_mask(&range); @@ -304,7 +466,7 @@ mod tests { roaring.insert(3); roaring.insert(5); - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 0..7; let row_mask = selection.row_mask(&range); @@ -320,7 +482,7 @@ mod tests { roaring.insert(i); } - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 10..20; let row_mask = selection.row_mask(&range); @@ -333,7 +495,7 @@ mod tests { roaring.insert(100); roaring.insert(101); - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 0..10; let row_mask = selection.row_mask(&range); @@ -349,7 +511,7 @@ mod tests { roaring.insert(7); roaring.insert(15); // Outside range - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 5..10; let row_mask = selection.row_mask(&range); @@ -360,7 +522,7 @@ mod tests { #[test] fn test_roaring_include_empty() { let roaring = RoaringTreemap::new(); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 0..100; let row_mask = selection.row_mask(&range); @@ -370,7 +532,7 @@ mod tests { #[test] fn test_roaring_exclude_empty() { let roaring = RoaringTreemap::new(); - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 0..100; let row_mask = selection.row_mask(&range); @@ -383,7 +545,7 @@ mod tests { roaring.insert(0); roaring.insert(99); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 0..100; let row_mask = selection.row_mask(&range); @@ -397,7 +559,7 @@ mod tests { roaring.insert_range(10..20); roaring.insert_range(30..40); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 15..35; let row_mask = selection.row_mask(&range); @@ -413,7 +575,7 @@ mod tests { roaring.insert(u64::MAX - 1); roaring.insert(u64::MAX); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = u64::MAX - 10..u64::MAX; let row_mask = selection.row_mask(&range); @@ -426,7 +588,7 @@ mod tests { let mut roaring = RoaringTreemap::new(); roaring.insert(u64::MAX - 1); - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = u64::MAX - 10..u64::MAX; let row_mask = selection.row_mask(&range); @@ -439,14 +601,13 @@ mod tests { // Test that RoaringTreemap and Buffer produce same results let indices = vec![1, 3, 5, 7, 9]; - let buffer_selection = - super::super::Selection::IncludeByIndex(Buffer::from_iter(indices.clone())); + let buffer_selection = include(indices.clone()); let mut roaring = RoaringTreemap::new(); for idx in &indices { roaring.insert(*idx); } - let roaring_selection = super::super::Selection::IncludeRoaring(roaring); + let roaring_selection = Selection::IncludeRoaring(roaring); let range = 0..12; let buffer_mask = buffer_selection.row_mask(&range); @@ -463,14 +624,13 @@ mod tests { // Test that ExcludeRoaring and ExcludeByIndex produce same results let indices = vec![2, 4, 6, 8]; - let buffer_selection = - super::super::Selection::ExcludeByIndex(Buffer::from_iter(indices.clone())); + let buffer_selection = exclude(indices.clone()); let mut roaring = RoaringTreemap::new(); for idx in &indices { roaring.insert(*idx); } - let roaring_selection = super::super::Selection::ExcludeRoaring(roaring); + let roaring_selection = Selection::ExcludeRoaring(roaring); let range = 0..10; let buffer_mask = buffer_selection.row_mask(&range); From e0b6d96eb6f12973a432be532ab24541efb388b6 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 11:04:01 -0400 Subject: [PATCH 2/3] Do not normalize row-index selections Signed-off-by: "Nicholas Gates" --- vortex-duckdb/src/convert/table_filter.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vortex-duckdb/src/convert/table_filter.rs b/vortex-duckdb/src/convert/table_filter.rs index 2c8be999e04..b7c25a2df65 100644 --- a/vortex-duckdb/src/convert/table_filter.rs +++ b/vortex-duckdb/src/convert/table_filter.rs @@ -166,12 +166,6 @@ fn intersect_sorted(left: &[u64], right: &[u64]) -> Vec { result } -fn normalize_indices(mut indices: Vec) -> Vec { - indices.sort_unstable(); - indices.dedup(); - indices -} - /// For constant comparison on IN filters over file_index or file_row_number /// virtual column, create a selection and a range covering the same range as /// expressions do. @@ -185,7 +179,7 @@ pub fn try_from_virtual_column_filter( .map(nonnegative_number_from_value) .collect::>>()?; Ok(( - Selection::include_by_index(Buffer::from_iter(normalize_indices(indices)))?, + Selection::include_by_index(Buffer::from_iter(indices))?, None, )) } From a2e44d28a67f1f88e6a58076382bbaae26442efa Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 11:15:20 -0400 Subject: [PATCH 3/3] Use StrictSortedBuffer for row selections Signed-off-by: "Nicholas Gates" --- vortex-bench/src/random_access/take.rs | 3 +- vortex-cxx/src/read.rs | 5 +- vortex-datafusion/src/persistent/opener.rs | 15 ++- vortex-duckdb/src/convert/table_filter.rs | 7 +- vortex-ffi/src/scan.rs | 5 +- vortex-file/src/tests.rs | 28 ++--- vortex-jni/src/scan.rs | 9 +- vortex-layout/src/scan/scan_builder.rs | 10 +- vortex-python/src/dataset.rs | 3 +- vortex-python/src/file.rs | 3 +- vortex-scan/README.md | 4 +- vortex-scan/src/selection.rs | 138 ++++++++------------- 12 files changed, 106 insertions(+), 124 deletions(-) diff --git a/vortex-bench/src/random_access/take.rs b/vortex-bench/src/random_access/take.rs index 42d6ebb4924..5ba33ea2e11 100644 --- a/vortex-bench/src/random_access/take.rs +++ b/vortex-bench/src/random_access/take.rs @@ -25,6 +25,7 @@ use vortex::array::stream::ArrayStreamExt; use vortex::buffer::Buffer; use vortex::file::OpenOptionsSessionExt; use vortex::file::VortexFile; +use vortex::scan::selection::StrictSortedBuffer; use vortex::utils::aliases::hash_map::HashMap; use crate::Format; @@ -76,7 +77,7 @@ impl RandomAccessor for VortexRandomAccessor { let array = self .file .scan()? - .with_row_indices(indices_buf)? + .with_row_indices(StrictSortedBuffer::try_new(indices_buf)?) .into_array_stream()? .read_all() .await?; diff --git a/vortex-cxx/src/read.rs b/vortex-cxx/src/read.rs index be071958d0b..681949d99e1 100644 --- a/vortex-cxx/src/read.rs +++ b/vortex-cxx/src/read.rs @@ -24,6 +24,7 @@ use vortex::io::runtime::BlockingRuntime; use vortex::layout::scan::arrow::RecordBatchIteratorAdapter; use vortex::layout::scan::scan_builder::ScanBuilder; use vortex::scan::selection::Selection; +use vortex::scan::selection::StrictSortedBuffer; use crate::RUNTIME; use crate::SESSION; @@ -92,7 +93,9 @@ impl VortexScanBuilder { } pub(crate) fn with_include_by_index(&mut self, include_by_index: &[u64]) -> Result<()> { - let selection = Selection::include_by_index(Buffer::copy_from(include_by_index))?; + let selection = Selection::IncludeByIndex(StrictSortedBuffer::try_new(Buffer::copy_from( + include_by_index, + ))?); take_mut::take(&mut self.inner, |inner| inner.with_selection(selection)); Ok(()) } diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 4bfcf45f442..d077b60a91e 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -603,6 +603,7 @@ mod tests { use vortex::io::object_store::ObjectStoreWrite; use vortex::metrics::DefaultMetricsRegistry; use vortex::scan::selection::Selection; + use vortex::scan::selection::StrictSortedBuffer; use vortex::session::VortexSession; use super::*; @@ -1193,8 +1194,6 @@ mod tests { // Test that Selection::IncludeByIndex filters to specific row indices. async fn test_selection_include_by_index() -> anyhow::Result<()> { use datafusion::arrow::util::pretty::pretty_format_batches_with_options; - use vortex::buffer::Buffer; - use vortex::scan::selection::Selection; let object_store = Arc::new(InMemory::new()) as Arc; let file_path = "/path/file.vortex"; @@ -1207,9 +1206,9 @@ mod tests { let mut file = PartitionedFile::new(file_path.to_string(), data_size); file.extensions .insert( - VortexAccessPlan::default().with_selection(Selection::include_by_index( - Buffer::from_iter(vec![1, 3, 5, 7]), - )?), + VortexAccessPlan::default().with_selection(Selection::IncludeByIndex( + StrictSortedBuffer::try_new(Buffer::from_iter(vec![1, 3, 5, 7]))?, + )), ); let opener = make_test_opener( @@ -1251,9 +1250,9 @@ mod tests { let mut file = PartitionedFile::new(file_path.to_string(), data_size); file.extensions .insert( - VortexAccessPlan::default().with_selection(Selection::exclude_by_index( - Buffer::from_iter(vec![0, 2, 4, 6, 8]), - )?), + VortexAccessPlan::default().with_selection(Selection::ExcludeByIndex( + StrictSortedBuffer::try_new(Buffer::from_iter(vec![0, 2, 4, 6, 8]))?, + )), ); let opener = make_test_opener( diff --git a/vortex-duckdb/src/convert/table_filter.rs b/vortex-duckdb/src/convert/table_filter.rs index b7c25a2df65..85ce9868d96 100644 --- a/vortex-duckdb/src/convert/table_filter.rs +++ b/vortex-duckdb/src/convert/table_filter.rs @@ -25,6 +25,7 @@ use vortex::scalar_fn::ScalarFnVTableExt; use vortex::scalar_fn::fns::binary::Binary; use vortex::scalar_fn::fns::operators::CompareOperator; use vortex::scan::selection::Selection; +use vortex::scan::selection::StrictSortedBuffer; use super::expr::try_from_bound_expression_with_col_sub; use crate::cpp::DUCKDB_VX_EXPR_TYPE; @@ -179,7 +180,7 @@ pub fn try_from_virtual_column_filter( .map(nonnegative_number_from_value) .collect::>>()?; Ok(( - Selection::include_by_index(Buffer::from_iter(indices))?, + Selection::IncludeByIndex(StrictSortedBuffer::try_new(Buffer::from_iter(indices))?), None, )) } @@ -220,7 +221,9 @@ pub fn try_from_virtual_column_filter( } let range = (start < end).then_some(start..end); let sel = indices - .map(|v| Selection::include_by_index(Buffer::from_iter(v))) + .map(|v| { + StrictSortedBuffer::try_new(Buffer::from_iter(v)).map(Selection::IncludeByIndex) + }) .transpose()? .unwrap_or(Selection::All); Ok((sel, range)) diff --git a/vortex-ffi/src/scan.rs b/vortex-ffi/src/scan.rs index 86badf9ff67..9e80f63c1d9 100644 --- a/vortex-ffi/src/scan.rs +++ b/vortex-ffi/src/scan.rs @@ -33,6 +33,7 @@ use vortex::scan::Partition; use vortex::scan::PartitionStream; use vortex::scan::ScanRequest; use vortex::scan::selection::Selection; +use vortex::scan::selection::StrictSortedBuffer; use crate::RUNTIME; use crate::array::vx_array; @@ -177,13 +178,13 @@ fn scan_request(opts: *const vx_scan_options) -> VortexResult { vortex_ensure!(!selection.idx.is_null()); let buf = unsafe { slice::from_raw_parts(selection.idx, selection.idx_len) }; let buf = Buffer::copy_from(buf); - Selection::include_by_index(buf)? + Selection::IncludeByIndex(StrictSortedBuffer::try_new(buf)?) } vx_scan_selection_include::VX_SELECTION_EXCLUDE_RANGE => { vortex_ensure!(!selection.idx.is_null()); let buf = unsafe { slice::from_raw_parts(selection.idx, selection.idx_len) }; let buf = Buffer::copy_from(buf); - Selection::exclude_by_index(buf)? + Selection::ExcludeByIndex(StrictSortedBuffer::try_new(buf)?) } }; diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index 8e60368d55d..a36e49595f2 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -68,6 +68,7 @@ use vortex_layout::layouts::zoned::LegacyStats; use vortex_layout::layouts::zoned::Zoned; use vortex_layout::scan::scan_builder::ScanBuilder; use vortex_layout::session::LayoutSession; +use vortex_scan::selection::StrictSortedBuffer; use vortex_session::VortexSession; use crate::OpenOptionsSessionExt; @@ -86,6 +87,10 @@ static SESSION: LazyLock = LazyLock::new(|| { session }); +fn strict_sorted(indices: Buffer) -> StrictSortedBuffer { + StrictSortedBuffer::try_new(indices).expect("test indices should be strictly increasing") +} + #[tokio::test] async fn test_eof_values() { // this test exists as a reminder to think about whether we should increment the version @@ -740,8 +745,7 @@ async fn test_with_indices_simple() { let actual_kept_array = file .scan() .unwrap() - .with_row_indices(Buffer::::empty()) - .unwrap() + .with_row_indices(strict_sorted(Buffer::::empty())) .into_array_stream() .unwrap() .read_all() @@ -758,8 +762,7 @@ async fn test_with_indices_simple() { let actual_kept_array = file .scan() .unwrap() - .with_row_indices(Buffer::from_iter(kept_indices)) - .unwrap() + .with_row_indices(strict_sorted(Buffer::from_iter(kept_indices))) .into_array_stream() .unwrap() .read_all() @@ -784,8 +787,7 @@ async fn test_with_indices_simple() { let actual_array = file .scan() .unwrap() - .with_row_indices((0u64..500).collect::>()) - .unwrap() + .with_row_indices(strict_sorted((0u64..500).collect::>())) .into_array_stream() .unwrap() .read_all() @@ -830,8 +832,7 @@ async fn test_with_indices_on_two_columns() { let array = file .scan() .unwrap() - .with_row_indices(Buffer::from_iter(kept_indices)) - .unwrap() + .with_row_indices(strict_sorted(Buffer::from_iter(kept_indices))) .into_array_stream() .unwrap() .read_all() @@ -888,8 +889,7 @@ async fn test_with_indices_and_with_row_filter_simple() { .scan() .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) - .with_row_indices(Buffer::empty()) - .unwrap() + .with_row_indices(strict_sorted(Buffer::empty())) .into_array_stream() .unwrap() .read_all() @@ -907,8 +907,7 @@ async fn test_with_indices_and_with_row_filter_simple() { .scan() .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) - .with_row_indices(Buffer::from_iter(kept_indices)) - .unwrap() + .with_row_indices(strict_sorted(Buffer::from_iter(kept_indices))) .into_array_stream() .unwrap() .read_all() @@ -936,8 +935,7 @@ async fn test_with_indices_and_with_row_filter_simple() { .scan() .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) - .with_row_indices((0..500).collect::>()) - .unwrap() + .with_row_indices(strict_sorted((0..500).collect::>())) .into_array_stream() .unwrap() .read_all() @@ -1240,7 +1238,7 @@ async fn file_take() -> VortexResult<()> { let vxf = chunked_file().await?; let result = vxf .scan()? - .with_row_indices(buffer![0, 1, 8])? + .with_row_indices(StrictSortedBuffer::try_new(buffer![0, 1, 8])?) .into_array_stream()? .read_all() .await?; diff --git a/vortex-jni/src/scan.rs b/vortex-jni/src/scan.rs index 0a59a3ee1e8..ab0fdb6d28d 100644 --- a/vortex-jni/src/scan.rs +++ b/vortex-jni/src/scan.rs @@ -46,6 +46,7 @@ use vortex::scan::PartitionRef; use vortex::scan::PartitionStream; use vortex::scan::ScanRequest; use vortex::scan::selection::Selection; +use vortex::scan::selection::StrictSortedBuffer; use crate::RUNTIME; use crate::data_source::NativeDataSource; @@ -96,8 +97,12 @@ fn build_scan_request( let selection = match selection_include { 0 => Selection::All, - 1 => Selection::include_by_index(Buffer::copy_from(selection_idx))?, - 2 => Selection::exclude_by_index(Buffer::copy_from(selection_idx))?, + 1 => Selection::IncludeByIndex(StrictSortedBuffer::try_new(Buffer::copy_from( + selection_idx, + ))?), + 2 => Selection::ExcludeByIndex(StrictSortedBuffer::try_new(Buffer::copy_from( + selection_idx, + ))?), 3 => Selection::IncludeRoaring(deserialize_roaring_selection(selection_roaring_bitmap)?), 4 => Selection::ExcludeRoaring(deserialize_roaring_selection(selection_roaring_bitmap)?), other => vortex_bail!("unknown selection include code: {other}"), diff --git a/vortex-layout/src/scan/scan_builder.rs b/vortex-layout/src/scan/scan_builder.rs index 224a9e8783c..c5aea847e97 100644 --- a/vortex-layout/src/scan/scan_builder.rs +++ b/vortex-layout/src/scan/scan_builder.rs @@ -24,7 +24,6 @@ use vortex_array::iter::ArrayIteratorAdapter; use vortex_array::stats::StatsSet; use vortex_array::stream::ArrayStream; use vortex_array::stream::ArrayStreamAdapter; -use vortex_buffer::Buffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -34,6 +33,7 @@ use vortex_io::runtime::Task; use vortex_io::session::RuntimeSessionExt; use vortex_metrics::MetricsRegistry; use vortex_scan::selection::Selection; +use vortex_scan::selection::StrictSortedBuffer; use vortex_session::VortexSession; use vortex_utils::parallelism::get_available_parallelism; @@ -172,10 +172,10 @@ impl ScanBuilder { self } - /// Select rows by absolute indices relative to the scan input. - pub fn with_row_indices(mut self, row_indices: Buffer) -> VortexResult { - self.selection = Selection::include_by_index(row_indices)?; - Ok(self) + /// Select rows by strictly sorted absolute indices relative to the scan input. + pub fn with_row_indices(mut self, row_indices: StrictSortedBuffer) -> Self { + self.selection = Selection::IncludeByIndex(row_indices); + self } /// Set the root row offset used by row-index expressions. diff --git a/vortex-python/src/dataset.rs b/vortex-python/src/dataset.rs index da2d8f018ad..0ece0925388 100644 --- a/vortex-python/src/dataset.rs +++ b/vortex-python/src/dataset.rs @@ -25,6 +25,7 @@ use vortex::file::OpenOptionsSessionExt; use vortex::file::VortexFile; use vortex::io::runtime::BlockingRuntime; use vortex::layout::scan::split_by::SplitBy; +use vortex::scan::selection::StrictSortedBuffer; use crate::RUNTIME; use crate::arrays::PyArrayRef; @@ -66,7 +67,7 @@ pub fn read_array_from_reader( if let Some(indices) = indices { let primitive = indices.execute::(ctx)?; let indices = primitive.into_buffer(); - scan = scan.with_row_indices(indices)?; + scan = scan.with_row_indices(StrictSortedBuffer::try_new(indices)?); } if let Some((l, r)) = row_range { diff --git a/vortex-python/src/file.rs b/vortex-python/src/file.rs index 65327b1e967..6f4897689f7 100644 --- a/vortex-python/src/file.rs +++ b/vortex-python/src/file.rs @@ -27,6 +27,7 @@ use vortex::io::runtime::BlockingRuntime; use vortex::layout::scan::scan_builder::ScanBuilder; use vortex::layout::scan::split_by::SplitBy; use vortex::layout::segments::MokaSegmentCache; +use vortex::scan::selection::StrictSortedBuffer; use crate::RUNTIME; use crate::arrays::PyArrayRef; @@ -220,7 +221,7 @@ fn scan_builder( if let Some(indices) = indices { let casted = indices.cast(DType::Primitive(PType::U64, NonNullable))?; let indices = casted.execute::(ctx)?.into_buffer::(); - builder = builder.with_row_indices(indices)?; + builder = builder.with_row_indices(StrictSortedBuffer::try_new(indices)?); } if let Some(batch_size) = batch_size { diff --git a/vortex-scan/README.md b/vortex-scan/README.md index 55ed6fc2bec..20be73fb9c4 100644 --- a/vortex-scan/README.md +++ b/vortex-scan/README.md @@ -86,11 +86,11 @@ let record_batch: RecordBatch = batch ?; ### Row Selection ```rust -use vortex_scan::Selection; +use vortex_scan::selection::{Selection, StrictSortedBuffer}; // Select specific rows by index let scan = ScanBuilder::new(layout_reader) -.with_selection(Selection::include_by_index(indices.into())?) +.with_selection(Selection::IncludeByIndex(StrictSortedBuffer::try_new(indices.into())?)) .build() ?; // Or use row ranges diff --git a/vortex-scan/src/selection.rs b/vortex-scan/src/selection.rs index d17af0b950d..8adea2ba05b 100644 --- a/vortex-scan/src/selection.rs +++ b/vortex-scan/src/selection.rs @@ -7,6 +7,7 @@ use std::ops::Not; use std::ops::Range; use vortex_buffer::Buffer; +use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_panic; @@ -14,88 +15,74 @@ use vortex_mask::Mask; use crate::row_mask::RowMask; -/// A validated selection of rows to include by absolute row index. +/// A buffer whose values are known to be strictly sorted in ascending order. #[derive(Clone, Debug)] -pub struct IncludeByIndex { - indices: Buffer, +pub struct StrictSortedBuffer { + buffer: Buffer, } -impl IncludeByIndex { - /// Create a new include-by-index selection. - pub fn try_new(indices: Buffer) -> VortexResult { - validate_indices(&indices)?; - Ok(Self { indices }) +impl StrictSortedBuffer { + /// Return the sorted values. + pub fn as_slice(&self) -> &[T] { + self.buffer.as_slice() } - /// Return the selected row indices. - pub fn as_slice(&self) -> &[u64] { - self.indices.as_slice() + /// Return the sorted buffer. + pub fn into_inner(self) -> Buffer { + self.buffer } - /// Return true if the selection contains no row indices. + /// Return true if the buffer contains no values. pub fn is_empty(&self) -> bool { - self.indices.is_empty() + self.buffer.is_empty() } - /// Return the number of selected row indices. + /// Return the number of values in the buffer. pub fn len(&self) -> usize { - self.indices.len() + self.buffer.len() } } -impl std::ops::Deref for IncludeByIndex { - type Target = [u64]; - - fn deref(&self) -> &Self::Target { - self.as_slice() +impl StrictSortedBuffer { + /// Create a new buffer, failing if the values are not strictly increasing. + pub fn try_new(buffer: Buffer) -> VortexResult { + validate_strictly_sorted(buffer.as_slice())?; + Ok(Self { buffer }) } } -impl AsRef<[u64]> for IncludeByIndex { - fn as_ref(&self) -> &[u64] { - self.as_slice() +impl Default for StrictSortedBuffer { + fn default() -> Self { + Self { + buffer: Buffer::default(), + } } } -/// A validated selection of rows to exclude by absolute row index. -#[derive(Clone, Debug)] -pub struct ExcludeByIndex { - indices: Buffer, -} +impl TryFrom> for StrictSortedBuffer { + type Error = VortexError; -impl ExcludeByIndex { - /// Create a new exclude-by-index selection. - pub fn try_new(indices: Buffer) -> VortexResult { - validate_indices(&indices)?; - Ok(Self { indices }) - } - - /// Return the excluded row indices. - pub fn as_slice(&self) -> &[u64] { - self.indices.as_slice() + fn try_from(value: Buffer) -> Result { + Self::try_new(value) } +} - /// Return true if the selection contains no row indices. - pub fn is_empty(&self) -> bool { - self.indices.is_empty() - } - - /// Return the number of excluded row indices. - pub fn len(&self) -> usize { - self.indices.len() +impl From> for Buffer { + fn from(value: StrictSortedBuffer) -> Self { + value.into_inner() } } -impl std::ops::Deref for ExcludeByIndex { - type Target = [u64]; +impl std::ops::Deref for StrictSortedBuffer { + type Target = [T]; fn deref(&self) -> &Self::Target { self.as_slice() } } -impl AsRef<[u64]> for ExcludeByIndex { - fn as_ref(&self) -> &[u64] { +impl AsRef<[T]> for StrictSortedBuffer { + fn as_ref(&self) -> &[T] { self.as_slice() } } @@ -108,9 +95,9 @@ pub enum Selection { #[default] All, /// A selection of sorted, unique rows to include by index. - IncludeByIndex(IncludeByIndex), + IncludeByIndex(StrictSortedBuffer), /// A selection of sorted, unique rows to exclude by index. - ExcludeByIndex(ExcludeByIndex), + ExcludeByIndex(StrictSortedBuffer), /// A selection of rows to include using a [`roaring::RoaringTreemap`]. IncludeRoaring(roaring::RoaringTreemap), /// A selection of rows to exclude using a [`roaring::RoaringTreemap`]. @@ -118,16 +105,6 @@ pub enum Selection { } impl Selection { - /// Create a selection of rows to include by absolute row index. - pub fn include_by_index(indices: Buffer) -> VortexResult { - Ok(Self::IncludeByIndex(IncludeByIndex::try_new(indices)?)) - } - - /// Create a selection of rows to exclude by absolute row index. - pub fn exclude_by_index(indices: Buffer) -> VortexResult { - Ok(Self::ExcludeByIndex(ExcludeByIndex::try_new(indices)?)) - } - /// Return the row count for this selection. pub fn row_count(&self, total_rows: u64) -> u64 { match self { @@ -270,18 +247,13 @@ impl Selection { } } -fn validate_indices(indices: &[u64]) -> VortexResult<()> { - // Row-mask extraction uses binary search over these indices, and row_count treats - // them as set membership. Unsorted or duplicate input can otherwise silently - // mis-select rows or over-report the selected row count. - for (idx, window) in indices.windows(2).enumerate() { +fn validate_strictly_sorted(values: &[T]) -> VortexResult<()> { + for (idx, window) in values.windows(2).enumerate() { if window[0] >= window[1] { vortex_bail!( - "row index selection must be strictly increasing at positions {} and {}: {} >= {}", + "buffer values must be strictly increasing at positions {} and {}", idx, - idx + 1, - window[0], - window[1] + idx + 1 ); } } @@ -310,32 +282,30 @@ mod tests { use vortex_buffer::Buffer; use super::Selection; + use super::StrictSortedBuffer; - fn include(indices: impl IntoIterator) -> Selection { - Selection::include_by_index(Buffer::from_iter(indices)) + fn strict_sorted(indices: impl IntoIterator) -> StrictSortedBuffer { + StrictSortedBuffer::try_new(Buffer::from_iter(indices)) .expect("test indices should be strictly increasing") } - fn exclude(indices: impl IntoIterator) -> Selection { - Selection::exclude_by_index(Buffer::from_iter(indices)) - .expect("test indices should be strictly increasing") + fn include(indices: impl IntoIterator) -> Selection { + Selection::IncludeByIndex(strict_sorted(indices)) } - #[test] - fn include_by_index_rejects_unsorted_indices() { - let err = Selection::include_by_index(Buffer::from_iter([3, 1])).unwrap_err(); - assert!(err.to_string().contains("strictly increasing")); + fn exclude(indices: impl IntoIterator) -> Selection { + Selection::ExcludeByIndex(strict_sorted(indices)) } #[test] - fn include_by_index_rejects_duplicate_indices() { - let err = Selection::include_by_index(Buffer::from_iter([1, 1])).unwrap_err(); + fn strict_sorted_buffer_rejects_unsorted_values() { + let err = StrictSortedBuffer::try_new(Buffer::from_iter([3, 1])).unwrap_err(); assert!(err.to_string().contains("strictly increasing")); } #[test] - fn exclude_by_index_rejects_unsorted_indices() { - let err = Selection::exclude_by_index(Buffer::from_iter([3, 1])).unwrap_err(); + fn strict_sorted_buffer_rejects_duplicate_values() { + let err = StrictSortedBuffer::try_new(Buffer::from_iter([1, 1])).unwrap_err(); assert!(err.to_string().contains("strictly increasing")); }