Skip to content

Commit d48ce1e

Browse files
committed
inplace
Signed-off-by: Robert Kruszewski <github@robertk.io>
1 parent a4991dc commit d48ce1e

2 files changed

Lines changed: 45 additions & 37 deletions

File tree

encodings/fastlanes/src/bit_transpose/validity.rs

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use vortex_array::IntoArray;
1010
use vortex_array::arrays::BoolArray;
1111
use vortex_array::validity::Validity;
1212
use vortex_buffer::BitBuffer;
13+
use vortex_buffer::ByteBuffer;
1314
use vortex_buffer::ByteBufferMut;
1415
use vortex_error::VortexExpect;
1516
use vortex_error::VortexResult;
@@ -38,7 +39,25 @@ pub fn transpose_validity(validity: &Validity, ctx: &mut ExecutionCtx) -> Vortex
3839

3940
#[inline]
4041
pub fn transpose_bitbuffer(bits: BitBuffer) -> BitBuffer {
41-
fastlanes_layout_apply(bits, transpose_bits)
42+
let (offset, len, bytes) = bits.into_inner();
43+
44+
if bytes.len().is_multiple_of(128) {
45+
match bytes.try_into_mut() {
46+
Ok(mut bytes_mut) => {
47+
// We can ignore the trailing remainder: the length was checked above to be a multiple of 128, and any spare capacity beyond it is only an allocator artifact
48+
let (chunks, _) = bytes_mut.as_chunks_mut::<128>();
49+
let mut tmp = [0u8; 128];
50+
for chunk in chunks {
51+
transpose_bits(chunk, &mut tmp);
52+
chunk.copy_from_slice(&tmp);
53+
}
54+
BitBuffer::new_with_offset(bytes_mut.freeze().into_byte_buffer(), len, offset)
55+
}
56+
Err(bytes) => bits_op_with_copy(bytes, len, offset, transpose_bits),
57+
}
58+
} else {
59+
bits_op_with_copy(bytes, len, offset, transpose_bits)
60+
}
4261
}
4362

4463
pub fn untranspose_validity(validity: &Validity, ctx: &mut ExecutionCtx) -> VortexResult<Validity> {
@@ -62,12 +81,31 @@ pub fn untranspose_validity(validity: &Validity, ctx: &mut ExecutionCtx) -> Vort
6281

6382
#[inline]
6483
pub fn untranspose_bitbuffer(bits: BitBuffer) -> BitBuffer {
65-
fastlanes_layout_apply(bits, untranspose_bits)
66-
}
67-
68-
fn fastlanes_layout_apply<F: Fn(&[u8; 128], &mut [u8; 128])>(bits: BitBuffer, op: F) -> BitBuffer {
84+
assert!(
85+
bits.inner().len().is_multiple_of(128),
86+
"Transpose BitBuffer must be 128-byte aligned"
87+
);
6988
let (offset, len, bytes) = bits.into_inner();
89+
match bytes.try_into_mut() {
90+
Ok(mut bytes_mut) => {
91+
let (chunks, _) = bytes_mut.as_chunks_mut::<128>();
92+
let mut tmp = [0u8; 128];
93+
for chunk in chunks {
94+
untranspose_bits(chunk, &mut tmp);
95+
chunk.copy_from_slice(&tmp);
96+
}
97+
BitBuffer::new_with_offset(bytes_mut.freeze().into_byte_buffer(), len, offset)
98+
}
99+
Err(bytes) => bits_op_with_copy(bytes, len, offset, untranspose_bits),
100+
}
101+
}
70102

103+
fn bits_op_with_copy<F: Fn(&[u8; 128], &mut [u8; 128])>(
104+
bytes: ByteBuffer,
105+
len: usize,
106+
offset: usize,
107+
op: F,
108+
) -> BitBuffer {
71109
let output_len = bytes.len().next_multiple_of(128);
72110
let mut output = ByteBufferMut::with_capacity(output_len);
73111
let (input_chunks, input_trailer) = bytes.as_chunks::<128>();

encodings/fastlanes/src/delta/array/delta_compress.rs

Lines changed: 2 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use vortex_buffer::Buffer;
2121
use vortex_buffer::BufferMut;
2222
use vortex_error::VortexResult;
2323

24-
use crate::bit_transpose::transpose_bitbuffer;
24+
use crate::bit_transpose::{transpose_bitbuffer, transpose_validity};
2525

2626
pub fn delta_compress(
2727
array: &PrimitiveArray,
@@ -30,7 +30,7 @@ pub fn delta_compress(
3030
let (bases, deltas) = match_each_unsigned_integer_ptype!(array.ptype(), |T| {
3131
const LANES: usize = T::LANES;
3232
let (bases, deltas) = compress_primitive::<T, LANES>(array.as_slice::<T>());
33-
let validity = transpose_and_pad_validity(array.validity(), deltas.len(), ctx)?;
33+
let validity = transpose_validity(array.validity(), ctx)?;
3434
(
3535
PrimitiveArray::new(bases, array.dtype().nullability().into()),
3636
PrimitiveArray::new(deltas, validity),
@@ -40,36 +40,6 @@ pub fn delta_compress(
4040
Ok((bases, deltas))
4141
}
4242

43-
/// Transpose and pad validity to match the padded deltas length.
44-
///
45-
/// For [`Validity::Array`], the validity bits are transposed into FastLanes order and then
46-
/// extended to `padded_len`. The underlying byte buffer from transposition is already
47-
/// padded to 128-byte alignment (1024 bits), which exactly matches our 1024-element chunks.
48-
fn transpose_and_pad_validity(
49-
validity: &Validity,
50-
padded_len: usize,
51-
ctx: &mut ExecutionCtx,
52-
) -> VortexResult<Validity> {
53-
match validity {
54-
Validity::Array(mask) => {
55-
let bools = mask
56-
.clone()
57-
.execute::<Canonical>(ctx)?
58-
.into_bool()
59-
.into_bit_buffer();
60-
let transposed = transpose_bitbuffer(bools);
61-
let (offset, _len, bytes) = transposed.into_inner();
62-
let padded = BitBuffer::new_with_offset(bytes, padded_len, offset);
63-
Ok(Validity::Array(
64-
BoolArray::new(padded, Validity::NonNullable).into_array(),
65-
))
66-
}
67-
v @ Validity::AllValid | v @ Validity::AllInvalid | v @ Validity::NonNullable => {
68-
Ok(v.clone())
69-
}
70-
}
71-
}
72-
7343
fn compress_primitive<T: NativePType + Delta + Transpose, const LANES: usize>(
7444
array: &[T],
7545
) -> (Buffer<T>, Buffer<T>) {

0 commit comments

Comments
 (0)