-
Notifications
You must be signed in to change notification settings - Fork 149
Expand file tree
/
Copy pathbuild_views.rs
More file actions
144 lines (124 loc) · 5.07 KB
/
build_views.rs
File metadata and controls
144 lines (124 loc) · 5.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
use itertools::Itertools;
use num_traits::AsPrimitive;
use vortex_buffer::Buffer;
use vortex_buffer::BufferMut;
use vortex_buffer::ByteBuffer;
use vortex_buffer::ByteBufferMut;
pub use crate::arrays::varbinview::BinaryView;
use crate::dtype::NativePType;
/// Convert an offsets buffer to a buffer of element lengths.
#[inline]
pub fn offsets_to_lengths<P: NativePType>(offsets: &[P]) -> Buffer<P> {
offsets
.iter()
.tuple_windows::<(_, _)>()
.map(|(&start, &end)| end - start)
.collect()
}
/// Maximum number of buffer bytes that can be referenced by a single `BinaryView`
pub const MAX_BUFFER_LEN: usize = i32::MAX as usize;
/// Build `BinaryView`s from a contiguous byte buffer and per-element lengths.
///
/// When total data exceeds `max_buffer_len` (2 GiB), buffers are split to ensure
/// offsets fit in `u32`. When data fits in a single buffer, per-element split checks
/// are skipped entirely.
#[allow(clippy::cast_possible_truncation)]
pub fn build_views<P: NativePType + AsPrimitive<usize>>(
start_buf_index: u32,
max_buffer_len: usize,
mut bytes: ByteBufferMut,
lens: &[P],
) -> (Vec<ByteBuffer>, Buffer<BinaryView>) {
let mut views = BufferMut::<BinaryView>::with_capacity(lens.len());
if bytes.len() <= max_buffer_len {
// Fast path: all data fits in a single buffer. No split checks needed per element
// and offsets are guaranteed to fit in u32 (max_buffer_len <= i32::MAX < u32::MAX).
let bytes_ptr = bytes.as_slice().as_ptr();
// Write views directly via pointer to avoid per-element push_unchecked overhead
// (spare-capacity lookup + length bookkeeping on each iteration).
let views_dst = views.spare_capacity_mut().as_mut_ptr() as *mut BinaryView;
let mut offset: usize = 0;
for (i, &len) in lens.iter().enumerate() {
let len: usize = len.as_();
// SAFETY: the sum of all lengths equals bytes.len() and we process them
// sequentially, so offset + len <= bytes.len() at every iteration.
let value = unsafe { std::slice::from_raw_parts(bytes_ptr.add(offset), len) };
let view = BinaryView::make_view(value, start_buf_index, offset as u32);
// SAFETY: we reserved capacity for lens.len() views and i < lens.len().
unsafe { views_dst.add(i).write(view) };
offset += len;
}
// SAFETY: we wrote exactly lens.len() views above.
unsafe { views.set_len(lens.len()) };
let buffers = if bytes.is_empty() {
Vec::new()
} else {
vec![bytes.freeze()]
};
(buffers, views.freeze())
} else {
// Slow path: may need to split across multiple 2 GiB buffers.
let mut buffers = Vec::new();
let mut buf_index = start_buf_index;
let mut offset = 0;
for &len in lens {
let len = len.as_();
assert!(len <= max_buffer_len, "values cannot exceed max_buffer_len");
if (offset + len) > max_buffer_len {
let rest = bytes.split_off(offset);
buffers.push(bytes.freeze());
buf_index += 1;
offset = 0;
bytes = rest;
}
let view = BinaryView::make_view(&bytes[offset..][..len], buf_index, offset.as_());
unsafe { views.push_unchecked(view) };
offset += len;
}
if !bytes.is_empty() {
buffers.push(bytes.freeze());
}
(buffers, views.freeze())
}
}
#[cfg(test)]
mod tests {
use vortex_buffer::ByteBuffer;
use vortex_buffer::ByteBufferMut;
use crate::arrays::varbinview::BinaryView;
use crate::arrays::varbinview::build_views::build_views;
#[test]
fn test_to_canonical_large() {
// We are testing generating views for raw data that should look like
//
// aaaaaaaaaaaaa ("a"*13)
// bbbbbbbbbbbbb ("b"*13)
// ccccccccccccc ("c"*13)
// ddddddddddddd ("d"*13)
//
// In real code, this would all fit in one buffer, but to unit test the splitting logic
// we split buffers at length 26, which should result in two buffers for the output array.
let raw_data =
ByteBufferMut::copy_from("aaaaaaaaaaaaabbbbbbbbbbbbbcccccccccccccddddddddddddd");
let lens = vec![13u8; 4];
let (buffers, views) = build_views(0, 26, raw_data, &lens);
assert_eq!(
buffers,
vec![
ByteBuffer::copy_from("aaaaaaaaaaaaabbbbbbbbbbbbb"),
ByteBuffer::copy_from("cccccccccccccddddddddddddd"),
]
);
assert_eq!(
views.as_slice(),
&[
BinaryView::make_view(b"aaaaaaaaaaaaa", 0, 0),
BinaryView::make_view(b"bbbbbbbbbbbbb", 0, 13),
BinaryView::make_view(b"ccccccccccccc", 1, 0),
BinaryView::make_view(b"ddddddddddddd", 1, 13),
]
)
}
}