Skip to content

Commit 0a86a90

Browse files
committed
Compact StringView buffer during sparse take to avoid holding the original buffers alive when selectivity is high
1 parent 7dbe58a commit 0a86a90

1 file changed

Lines changed: 307 additions & 1 deletion

File tree

arrow-select/src/take.rs

Lines changed: 307 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use arrow_buffer::{
2828
ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer,
2929
bit_util,
3030
};
31-
use arrow_data::ArrayDataBuilder;
31+
use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
3232
use arrow_schema::{ArrowError, DataType, FieldRef, UnionMode};
3333

3434
use num_traits::{One, Zero};
@@ -596,10 +596,18 @@ fn take_bytes<T: ByteArrayType, IndexType: ArrowPrimitiveType>(
596596
}
597597

598598
/// `take` implementation for byte view arrays
599+
///
600+
/// Automatically compacts string data when the take is sparse (selecting less
601+
/// than half the rows), which avoids holding the original large buffers alive.
599602
fn take_byte_view<T: ByteViewType, IndexType: ArrowPrimitiveType>(
600603
array: &GenericByteViewArray<T>,
601604
indices: &PrimitiveArray<IndexType>,
602605
) -> Result<GenericByteViewArray<T>, ArrowError> {
606+
// If selecting less than half the rows, compact the string data to avoid
607+
// keeping the original (potentially large) buffers alive via Arc.
608+
if indices.len() < array.len() / 2 {
609+
return take_byte_view_compact(array, indices);
610+
}
603611
let new_views = take_native(array.views(), indices);
604612
let new_nulls = take_nulls(array.nulls(), indices);
605613
// Safety: array.views was valid, and take_native copies only valid values, and verifies bounds
@@ -608,6 +616,156 @@ fn take_byte_view<T: ByteViewType, IndexType: ArrowPrimitiveType>(
608616
})
609617
}
610618

619+
/// `take` implementation for byte view arrays that compacts string data into
620+
/// new buffers rather than sharing the original buffers.
621+
///
622+
/// This fuses the gather (take) with string compaction in a single pass,
623+
/// producing an output array whose buffers contain only the referenced data.
624+
/// This is beneficial when `take` selects a small fraction of the source array,
625+
/// as it avoids keeping the original large buffers alive.
626+
///
627+
/// The output uses multiple buffers if a single buffer would exceed `u32::MAX`
628+
/// bytes, ensuring `ByteView::offset` never overflows.
629+
#[inline(never)]
630+
fn take_byte_view_compact<T: ByteViewType, IndexType: ArrowPrimitiveType>(
631+
array: &GenericByteViewArray<T>,
632+
indices: &PrimitiveArray<IndexType>,
633+
) -> Result<GenericByteViewArray<T>, ArrowError> {
634+
let src_views = array.views();
635+
let src_buffers = array.data_buffers();
636+
637+
// Phase 1: Calculate total non-inlined string bytes to pre-allocate.
638+
// This avoids reallocations during the copy phase. We only read the u128
639+
// view descriptors here, not the actual string data.
640+
let mut total_bytes: usize = 0;
641+
let indices_null_count = indices.null_count();
642+
if indices_null_count == 0 {
643+
for idx in indices.values() {
644+
let raw_view = src_views[idx.as_usize()];
645+
let len = raw_view as u32;
646+
if len > MAX_INLINE_VIEW_LEN {
647+
total_bytes += len as usize;
648+
}
649+
}
650+
} else {
651+
let index_nulls = indices.nulls().unwrap();
652+
for (i, idx) in indices.values().iter().enumerate() {
653+
// SAFETY: i < indices.len()
654+
if unsafe { index_nulls.inner().value_unchecked(i) } {
655+
let raw_view = src_views[idx.as_usize()];
656+
let len = raw_view as u32;
657+
if len > MAX_INLINE_VIEW_LEN {
658+
total_bytes += len as usize;
659+
}
660+
}
661+
}
662+
}
663+
664+
// Phase 2: Build output views and compact string data.
665+
// We cap each buffer at u32::MAX bytes to avoid offset overflow.
666+
let mut new_views: Vec<u128> = Vec::with_capacity(indices.len());
667+
let mut completed_buffers: Vec<Buffer> = Vec::new();
668+
// Start with exact pre-allocation when it fits in a single buffer,
669+
// otherwise start with u32::MAX and we'll spill as needed.
670+
let initial_cap = total_bytes.min(u32::MAX as usize);
671+
let mut current_buffer: Vec<u8> = Vec::with_capacity(initial_cap);
672+
673+
if indices_null_count == 0 {
674+
for idx in indices.values() {
675+
let raw_view = src_views[idx.as_usize()];
676+
let len = raw_view as u32;
677+
if len > MAX_INLINE_VIEW_LEN {
678+
let mut view = ByteView::from(raw_view);
679+
let str_len = view.length as usize;
680+
681+
// Check if adding this string would overflow u32 offset
682+
if current_buffer.len() + str_len > u32::MAX as usize {
683+
let full_buf = std::mem::replace(
684+
&mut current_buffer,
685+
Vec::with_capacity(
686+
(total_bytes - completed_buffers.iter().map(|b| b.len()).sum::<usize>())
687+
.min(u32::MAX as usize),
688+
),
689+
);
690+
completed_buffers.push(Buffer::from_vec(full_buf));
691+
}
692+
693+
// SAFETY: source views are validly constructed
694+
let src = unsafe {
695+
src_buffers
696+
.get_unchecked(view.buffer_index as usize)
697+
.get_unchecked(view.offset as usize..view.offset as usize + str_len)
698+
};
699+
700+
view.buffer_index = completed_buffers.len() as u32;
701+
view.offset = current_buffer.len() as u32;
702+
current_buffer.extend_from_slice(src);
703+
704+
new_views.push(view.as_u128());
705+
} else {
706+
new_views.push(raw_view);
707+
}
708+
}
709+
} else {
710+
let index_nulls = indices.nulls().unwrap();
711+
for (i, idx) in indices.values().iter().enumerate() {
712+
// SAFETY: i < indices.len()
713+
if unsafe { !index_nulls.inner().value_unchecked(i) } {
714+
new_views.push(0u128);
715+
continue;
716+
}
717+
let raw_view = src_views[idx.as_usize()];
718+
let len = raw_view as u32;
719+
if len > MAX_INLINE_VIEW_LEN {
720+
let mut view = ByteView::from(raw_view);
721+
let str_len = view.length as usize;
722+
723+
// Check if adding this string would overflow u32 offset
724+
if current_buffer.len() + str_len > u32::MAX as usize {
725+
let full_buf = std::mem::replace(
726+
&mut current_buffer,
727+
Vec::with_capacity(
728+
(total_bytes - completed_buffers.iter().map(|b| b.len()).sum::<usize>())
729+
.min(u32::MAX as usize),
730+
),
731+
);
732+
completed_buffers.push(Buffer::from_vec(full_buf));
733+
}
734+
735+
// SAFETY: source views are validly constructed
736+
let src = unsafe {
737+
src_buffers
738+
.get_unchecked(view.buffer_index as usize)
739+
.get_unchecked(view.offset as usize..view.offset as usize + str_len)
740+
};
741+
742+
view.buffer_index = completed_buffers.len() as u32;
743+
view.offset = current_buffer.len() as u32;
744+
current_buffer.extend_from_slice(src);
745+
746+
new_views.push(view.as_u128());
747+
} else {
748+
new_views.push(raw_view);
749+
}
750+
}
751+
}
752+
753+
// Finalize the last buffer
754+
completed_buffers.push(Buffer::from_vec(current_buffer));
755+
756+
let new_nulls = take_nulls(array.nulls(), indices);
757+
758+
// SAFETY: views are constructed from valid source views with updated
759+
// buffer_index and offset pointing into the newly built compact buffers.
760+
Ok(unsafe {
761+
GenericByteViewArray::new_unchecked(
762+
ScalarBuffer::from(new_views),
763+
completed_buffers,
764+
new_nulls,
765+
)
766+
})
767+
}
768+
611769
/// `take` implementation for list arrays
612770
///
613771
/// Calculates the index and indexed offset for the inner array,
@@ -1710,6 +1868,154 @@ mod tests {
17101868
_test_byte_view::<BinaryViewType>()
17111869
}
17121870

1871+
/// Helper to test the compact path of take_byte_view.
1872+
/// Creates an array with many rows and takes a small subset to trigger compaction.
1873+
fn _test_byte_view_compact<T>()
1874+
where
1875+
T: ByteViewType,
1876+
str: AsRef<T::Native>,
1877+
T::Native: PartialEq,
1878+
{
1879+
// Build a source array with 20 rows (mix of inlined and non-inlined strings)
1880+
let mut builder = GenericByteViewBuilder::<T>::new();
1881+
let values = [
1882+
"hello", // 5 bytes, inlined
1883+
"world", // 5 bytes, inlined
1884+
"this is a long string over 12b", // 30 bytes, non-inlined
1885+
"short", // 5 bytes, inlined
1886+
"another large payload string!!", // 30 bytes, non-inlined
1887+
"x",
1888+
"y",
1889+
"z",
1890+
"a medium str!!", // 14 bytes, non-inlined
1891+
"b",
1892+
"c",
1893+
"d",
1894+
"e",
1895+
"f",
1896+
"g",
1897+
"h",
1898+
"i",
1899+
"j",
1900+
"k",
1901+
"l",
1902+
];
1903+
for v in &values {
1904+
builder.append_value(v);
1905+
}
1906+
let array = builder.finish();
1907+
assert_eq!(array.len(), 20);
1908+
1909+
// Take 3 rows out of 20 → 3 < 20/2 = 10, so compact path is taken
1910+
let indices = UInt32Array::from(vec![0, 2, 4]);
1911+
let result = take(&array, &indices, None).unwrap();
1912+
1913+
assert_eq!(result.len(), 3);
1914+
let result = result.as_any().downcast_ref::<GenericByteViewArray<T>>().unwrap();
1915+
1916+
let expected = {
1917+
let mut b = GenericByteViewBuilder::<T>::new();
1918+
b.append_value(values[0]);
1919+
b.append_value(values[2]);
1920+
b.append_value(values[4]);
1921+
b.finish()
1922+
};
1923+
assert_eq!(result, &expected);
1924+
1925+
// Verify compaction: result should NOT share the original buffers.
1926+
// The original buffers hold all 20 rows' data. The compact result should
1927+
// only have the bytes for the 2 non-inlined strings we selected.
1928+
let original_buf_size: usize = array.data_buffers().iter().map(|b| b.len()).sum();
1929+
let result_buf_size: usize = result.data_buffers().iter().map(|b| b.len()).sum();
1930+
assert!(
1931+
result_buf_size < original_buf_size,
1932+
"compact result buffers ({result_buf_size}) should be smaller than original ({original_buf_size})"
1933+
);
1934+
}
1935+
1936+
/// Exercises the compact take path for `StringViewArray`.
#[test]
fn test_take_string_view_compact() {
    _test_byte_view_compact::<StringViewType>()
}
1940+
1941+
/// Exercises the compact take path for `BinaryViewArray`.
#[test]
fn test_take_binary_view_compact() {
    _test_byte_view_compact::<BinaryViewType>()
}
1945+
1946+
/// Test compact path with nullable indices
#[test]
fn test_take_byte_view_compact_with_nulls() {
    // Every fifth row is a long (non-inlined) string; the rest are inlined.
    let mut builder = GenericByteViewBuilder::<StringViewType>::new();
    for i in 0..20 {
        match i % 5 {
            0 => builder.append_value(format!("long string number {i} over twelve bytes")),
            _ => builder.append_value(format!("s{i}")),
        }
    }
    let array = builder.finish();

    // Take with some null indices; 4 < 20/2 so compact triggers
    let indices = UInt32Array::from(vec![Some(0), None, Some(5), Some(1)]);
    let result = take(&array, &indices, None).unwrap();

    assert_eq!(result.len(), 4);
    let sv = result.as_string_view();
    assert_eq!(sv.value(0), "long string number 0 over twelve bytes");
    assert!(result.is_null(1));
    assert_eq!(sv.value(2), "long string number 5 over twelve bytes");
    assert_eq!(sv.value(3), "s1");
}
1970+
1971+
/// Test compact path with nullable source array
#[test]
fn test_take_byte_view_compact_with_null_source() {
    // Row 3 is null; every fifth row is long (non-inlined); the rest inlined.
    let mut builder = GenericByteViewBuilder::<StringViewType>::new();
    for i in 0..20 {
        match (i == 3, i % 5 == 0) {
            (true, _) => builder.append_null(),
            (false, true) => {
                builder.append_value(format!("long string number {i} over twelve bytes"))
            }
            (false, false) => builder.append_value(format!("s{i}")),
        }
    }
    let array = builder.finish();

    // indices.len() = 4 < 20/2 = 10 → compact
    let indices = UInt32Array::from(vec![0, 3, 5, 1]);
    let result = take(&array, &indices, None).unwrap();

    assert_eq!(result.len(), 4);
    let sv = result.as_string_view();
    assert_eq!(sv.value(0), "long string number 0 over twelve bytes");
    assert!(result.is_null(1)); // source was null at index 3
    assert_eq!(sv.value(2), "long string number 5 over twelve bytes");
    assert_eq!(sv.value(3), "s1");
}
1997+
1998+
/// Test that the non-compact (shared buffer) path still works when
/// selecting more than half the rows.
#[test]
fn test_take_byte_view_shared_path() {
    let mut builder = GenericByteViewBuilder::<StringViewType>::new();
    (0..10).for_each(|i| {
        builder.append_value(format!("long string number {i} over twelve bytes"));
    });
    let array = builder.finish();

    // Take 8 out of 10 → 8 >= 10/2 = 5, so shared (non-compact) path
    let indices = UInt32Array::from((0..8u32).collect::<Vec<_>>());
    let result = take(&array, &indices, None).unwrap();

    assert_eq!(result.len(), 8);
    let sv = result.as_string_view();
    for i in 0..8 {
        assert_eq!(sv.value(i), format!("long string number {i} over twelve bytes"));
    }
}
2018+
17132019
macro_rules! test_take_list {
17142020
($offset_type:ty, $list_data_type:ident, $list_array_type:ident) => {{
17152021
// Construct a value array, [[0,0,0], [-1,-2,-1], [], [2,3]]

0 commit comments

Comments
 (0)