diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock index 36b4a4abe43..205c9639069 100644 --- a/encodings/fastlanes/public-api.lock +++ b/encodings/fastlanes/public-api.lock @@ -196,9 +196,9 @@ impl vortex_array::arrays::filter::kernel::FilterKernel for vortex_fastlanes::Bi pub fn vortex_fastlanes::BitPackedVTable::filter(array: &vortex_fastlanes::BitPackedArray, mask: &vortex_mask::Mask, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> -impl vortex_array::arrays::slice::SliceKernel for vortex_fastlanes::BitPackedVTable +impl vortex_array::arrays::slice::SliceReduce for vortex_fastlanes::BitPackedVTable -pub fn vortex_fastlanes::BitPackedVTable::slice(array: &vortex_fastlanes::BitPackedArray, range: core::ops::range::Range, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> +pub fn vortex_fastlanes::BitPackedVTable::slice(array: &vortex_fastlanes::BitPackedArray, range: core::ops::range::Range) -> vortex_error::VortexResult> impl vortex_array::compute::is_constant::IsConstantKernel for vortex_fastlanes::BitPackedVTable diff --git a/encodings/fastlanes/src/bitpacking/compute/slice.rs b/encodings/fastlanes/src/bitpacking/compute/slice.rs index 55cd12b3975..2e49a5af8cd 100644 --- a/encodings/fastlanes/src/bitpacking/compute/slice.rs +++ b/encodings/fastlanes/src/bitpacking/compute/slice.rs @@ -5,20 +5,15 @@ use std::cmp::max; use std::ops::Range; use vortex_array::ArrayRef; -use vortex_array::ExecutionCtx; use vortex_array::IntoArray; -use vortex_array::arrays::SliceKernel; +use vortex_array::arrays::SliceReduce; use vortex_error::VortexResult; use crate::BitPackedArray; use crate::BitPackedVTable; -impl SliceKernel for BitPackedVTable { - fn slice( - array: &BitPackedArray, - range: Range, - _ctx: &mut ExecutionCtx, - ) -> VortexResult> { +impl SliceReduce for BitPackedVTable { + fn slice(array: &BitPackedArray, range: Range) -> VortexResult> { let offset_start = range.start + array.offset() as usize; let offset_stop = range.end + array.offset() as usize; let offset = offset_start % 1024; @@ -51,43 +46,44 @@ impl SliceKernel for BitPackedVTable { #[cfg(test)] mod tests { - use std::sync::LazyLock; - use vortex_array::Array; - use vortex_array::IntoArray; - use vortex_array::VortexSessionExecute; - use vortex_array::arrays::SliceArray; - use vortex_array::session::ArraySession; - use vortex_array::vtable::VTable; + use vortex_array::arrays::SliceReduce; + use vortex_array::arrays::SliceVTable; use vortex_error::VortexResult; - use vortex_session::VortexSession; use crate::BitPackedVTable; use crate::bitpack_compress::bitpack_encode; - static SESSION: LazyLock = - LazyLock::new(|| VortexSession::empty().with::()); + #[test] + fn test_slice_returns_bitpacked() -> VortexResult<()> { + let values = vortex_array::arrays::PrimitiveArray::from_iter(0u32..2048); + let bitpacked = bitpack_encode(&values, 11, None)?; + + let result = + BitPackedVTable::slice(&bitpacked, 500..1500)?.expect("expected slice to succeed"); + + assert!(result.is::()); + let result_bp = result.as_::(); + assert_eq!(result_bp.offset(), 500); + assert_eq!(result.len(), 1000); + + Ok(()) + } #[test] - fn test_execute_parent_returns_bitpacked_slice() -> VortexResult<()> { + fn test_slice_via_array_trait() -> VortexResult<()> { let values = vortex_array::arrays::PrimitiveArray::from_iter(0u32..2048); let bitpacked = bitpack_encode(&values, 11, None)?; - let slice_array = SliceArray::new(bitpacked.clone().into_array(), 500..1500); - - let mut ctx = SESSION.create_execution_ctx(); - let reduced = ::execute_parent( - &bitpacked, - &slice_array.into_array(), - 0, - &mut ctx, - )? - .expect("expected slice kernel to execute"); - - assert!(reduced.is::()); - let reduced_bp = reduced.as_::(); - assert_eq!(reduced_bp.offset(), 500); - assert_eq!(reduced.len(), 1000); + let sliced = bitpacked.as_ref().slice(500..1500)?; + + // After optimize, the SliceArray should have been reduced away. + assert!( + !sliced.is::(), + "expected SliceReduce to eliminate the SliceArray wrapper" + ); + assert!(sliced.is::()); + assert_eq!(sliced.len(), 1000); Ok(()) } diff --git a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs index ca9b98c09db..0685ac4e536 100644 --- a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs +++ b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs @@ -2,7 +2,6 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use vortex_array::arrays::FilterExecuteAdaptor; -use vortex_array::arrays::SliceExecuteAdaptor; use vortex_array::arrays::TakeExecuteAdaptor; use vortex_array::kernel::ParentKernelSet; @@ -10,6 +9,5 @@ use crate::BitPackedVTable; pub(crate) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ ParentKernelSet::lift(&FilterExecuteAdaptor(BitPackedVTable)), - ParentKernelSet::lift(&SliceExecuteAdaptor(BitPackedVTable)), ParentKernelSet::lift(&TakeExecuteAdaptor(BitPackedVTable)), ]); diff --git a/encodings/fastlanes/src/bitpacking/vtable/operations.rs b/encodings/fastlanes/src/bitpacking/vtable/operations.rs index 2dcf62ceac3..5e6e2000567 100644 --- a/encodings/fastlanes/src/bitpacking/vtable/operations.rs +++ b/encodings/fastlanes/src/bitpacking/vtable/operations.rs @@ -26,13 +26,10 @@ impl OperationsVTable for BitPackedVTable { #[cfg(test)] mod test { use std::ops::Range; - use std::sync::LazyLock; use vortex_array::Array; use vortex_array::IntoArray; - use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; - use vortex_array::arrays::SliceArray; use vortex_array::assert_arrays_eq; use vortex_array::assert_nth_scalar; use vortex_array::buffer::BufferHandle; @@ -41,9 +38,7 @@ mod test { use vortex_array::dtype::PType; use vortex_array::patches::Patches; use vortex_array::scalar::Scalar; - use vortex_array::session::ArraySession; use vortex_array::validity::Validity; - use vortex_array::vtable::VTable; use vortex_buffer::Alignment; use vortex_buffer::Buffer; use vortex_buffer::ByteBuffer; @@ -52,20 +47,8 @@ mod test { use crate::BitPackedArray; use crate::BitPackedVTable; - static SESSION: LazyLock = - LazyLock::new(|| vortex_session::VortexSession::empty().with::()); - fn slice_via_kernel(array: &BitPackedArray, range: Range) -> BitPackedArray { - let slice_array = SliceArray::new(array.clone().into_array(), range); - let mut ctx = SESSION.create_execution_ctx(); - let sliced = ::execute_parent( - array, - &slice_array.into_array(), - 0, - &mut ctx, - ) - .expect("execute_parent failed") - .expect("expected slice kernel to execute"); + let sliced = array.as_ref().slice(range).expect("slice failed"); sliced.as_::().clone() } diff --git a/encodings/fastlanes/src/bitpacking/vtable/rules.rs b/encodings/fastlanes/src/bitpacking/vtable/rules.rs index 35c4a388f6a..53bff931272 100644 --- a/encodings/fastlanes/src/bitpacking/vtable/rules.rs +++ b/encodings/fastlanes/src/bitpacking/vtable/rules.rs @@ -1,10 +1,13 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use vortex_array::arrays::SliceReduceAdaptor; use vortex_array::optimizer::rules::ParentRuleSet; use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor; use crate::BitPackedVTable; -pub(crate) const RULES: ParentRuleSet = - ParentRuleSet::new(&[ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable))]); +pub(crate) const RULES: ParentRuleSet = ParentRuleSet::new(&[ + ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable)), + ParentRuleSet::lift(&SliceReduceAdaptor(BitPackedVTable)), +]); diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs index 7aac1762804..2cea7cc579f 100644 --- a/vortex-cuda/gpu-scan-cli/src/main.rs +++ b/vortex-cuda/gpu-scan-cli/src/main.rs @@ -94,7 +94,7 @@ async fn main() -> VortexResult<()> { // Create a full scan that executes on the GPU let cuda_stream = - VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?; + VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).stream()?; let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream); let gpu_file = session diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu index 9168a8580ab..14aa55b66ad 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.cu +++ b/vortex-cuda/kernels/src/dynamic_dispatch.cu @@ -56,8 +56,10 @@ __device__ inline void dynamic_source_op(const T *__restrict input, constexpr uint32_t FL_CHUNK_SIZE = 1024; constexpr uint32_t LANES_PER_FL_BLOCK = FL_CHUNK_SIZE / T_BITS; const uint32_t bit_width = source_op.params.bitunpack.bit_width; + const uint32_t element_offset = source_op.params.bitunpack.element_offset; const uint32_t packed_words_per_fl_block = LANES_PER_FL_BLOCK * bit_width; - const uint64_t first_fl_block = chunk_start / FL_CHUNK_SIZE; + // Shift chunk_start by the sub-block element offset. + const uint64_t first_fl_block = (chunk_start + element_offset) / FL_CHUNK_SIZE; // FL blocks must divide evenly. Otherwise, the last unpack would overflow smem. static_assert((ELEMENTS_PER_BLOCK % FL_CHUNK_SIZE) == 0); diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.h b/vortex-cuda/kernels/src/dynamic_dispatch.h index f8fbeaf6c13..c208be7459b 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.h +++ b/vortex-cuda/kernels/src/dynamic_dispatch.h @@ -44,9 +44,12 @@ union SourceParams { /// Unpack bit-packed data using FastLanes layout. struct BitunpackParams { uint8_t bit_width; + uint8_t _padding[3]; + uint32_t element_offset; // Element offset within FL block (0..1023) } bitunpack; /// Copy elements verbatim from global memory to shared memory. + /// The input pointer is pre-adjusted on the host to account for slicing. struct LoadParams { uint8_t _padding; } load; diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs index 17bcd44f5d4..c8d2841f10a 100644 --- a/vortex-cuda/src/device_buffer.rs +++ b/vortex-cuda/src/device_buffer.rs @@ -81,8 +81,6 @@ mod private { } } -// Get it back out as a View of u8 - impl CudaDeviceBuffer { /// Creates a new CUDA device buffer from a [`CudaSlice`]. /// @@ -101,6 +99,16 @@ impl CudaDeviceBuffer { } } + /// Returns the byte offset within the allocated buffer. + pub fn offset(&self) -> usize { + self.offset + } + + /// Returns the adjusted device pointer accounting for the offset. + pub fn offset_ptr(&self) -> sys::CUdeviceptr { + self.device_ptr + self.offset as u64 + } + /// Returns a [`CudaView`] to the CUDA device buffer. pub fn as_view(&self) -> CudaView<'_, T> { // Return a new &[T] @@ -159,7 +167,7 @@ impl CudaBufferExt for BufferHandle { .as_any() .downcast_ref::() .ok_or_else(|| vortex_err!("expected CudaDeviceBuffer"))? - .device_ptr; + .offset_ptr(); Ok(ptr) } @@ -281,7 +289,7 @@ impl DeviceBuffer for CudaDeviceBuffer { /// Slices the CUDA device buffer to a subrange. /// - /// **IMPORTANT**: this is a byte range, not elements range, due to the DeviceBuffer interface. + /// This is a byte range, not elements range, due to the DeviceBuffer interface. fn slice(&self, range: Range) -> Arc { assert!( range.end <= self.len, diff --git a/vortex-cuda/src/dynamic_dispatch/mod.rs b/vortex-cuda/src/dynamic_dispatch/mod.rs index 133fb5de677..5832601933d 100644 --- a/vortex-cuda/src/dynamic_dispatch/mod.rs +++ b/vortex-cuda/src/dynamic_dispatch/mod.rs @@ -32,16 +32,27 @@ unsafe impl cudarc::driver::DeviceRepr for Stage {} impl SourceOp { /// Unpack bit-packed data using FastLanes layout. - pub fn bitunpack(bit_width: u8) -> Self { + /// + /// The device pointer already accounts for buffer slicing, so no + /// offset parameter is needed. `element_offset` (0..1023) is the + /// remaining position within the first FastLanes block. + pub fn bitunpack(bit_width: u8, element_offset: u16) -> Self { Self { op_code: SourceOp_SourceOpCode_BITUNPACK, params: SourceParams { - bitunpack: SourceParams_BitunpackParams { bit_width }, + bitunpack: SourceParams_BitunpackParams { + bit_width, + _padding: [0; 3], + element_offset: u32::from(element_offset), + }, }, } } /// Copy elements verbatim from global memory to shared memory. + /// + /// The device pointer already accounts for buffer slicing, so no + /// offset parameter is needed. pub fn load() -> Self { Self { op_code: SourceOp_SourceOpCode_LOAD, @@ -262,7 +273,7 @@ mod tests { let plan = DynamicDispatchPlan::new([Stage::output( input_ptr, 0, - SourceOp::bitunpack(bit_width), + SourceOp::bitunpack(bit_width, 0), &scalar_ops, )]); assert_eq!(plan.stages[0].num_scalar_ops, 4); @@ -282,13 +293,13 @@ mod tests { 0xAAAA, 0, 256, - SourceOp::bitunpack(4), + SourceOp::bitunpack(4, 0), &[ScalarOp::frame_of_ref(10)], ), Stage::output( 0xBBBB, 256, - SourceOp::bitunpack(6), + SourceOp::bitunpack(6, 0), &[ScalarOp::frame_of_ref(42), ScalarOp::dict(0)], ), ]); @@ -687,4 +698,113 @@ mod tests { Ok(()) } + + #[test] + fn test_sliced_primitive() -> VortexResult<()> { + let len = 5000; + let data: Vec = (0..len).map(|i| (i * 7) % 1000).collect(); + + let prim = PrimitiveArray::new(Buffer::from(data.clone()), NonNullable); + + let slice_start = 500; + let slice_end = 3500; + let sliced = prim.as_ref().slice(slice_start..slice_end)?; + + let expected: Vec = data[slice_start..slice_end].to_vec(); + + let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let (plan, _bufs) = build_plan(&sliced, &cuda_ctx)?; + + let actual = run_dynamic_dispatch_plan(&cuda_ctx, expected.len(), &plan)?; + assert_eq!(actual, expected); + + Ok(()) + } + + #[test] + fn test_sliced_for_bitpacked() -> VortexResult<()> { + let reference = 100u32; + let bit_width = 10u8; + let max_val = (1u32 << bit_width) - 1; + let len = 5000; + + let encoded_data: Vec = (0..len).map(|i| (i as u32) % max_val).collect(); + let prim = PrimitiveArray::new(Buffer::from(encoded_data.clone()), NonNullable); + let bp = BitPackedArray::encode(prim.as_ref(), bit_width)?; + let for_arr = FoRArray::try_new(bp.into_array(), Scalar::from(reference))?; + + let all_decoded: Vec = encoded_data.iter().map(|&v| v + reference).collect(); + + let slice_start = 1024; + let slice_end = 3500; + let sliced = for_arr.as_ref().slice(slice_start..slice_end)?; + let expected: Vec = all_decoded[slice_start..slice_end].to_vec(); + + let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let (plan, _bufs) = build_plan(&sliced, &cuda_ctx)?; + + let actual = run_dynamic_dispatch_plan(&cuda_ctx, expected.len(), &plan)?; + assert_eq!(actual, expected); + + Ok(()) + } + + #[test] + fn test_sliced_zigzag_bitpacked() -> VortexResult<()> { + let bit_width = 10u8; + let max_val = (1u32 << bit_width) - 1; + let len = 5000; + + let raw: Vec = (0..len).map(|i| (i as u32) % max_val).collect(); + let all_decoded: Vec = raw + .iter() + .map(|&v| (v >> 1) ^ (0u32.wrapping_sub(v & 1))) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(raw), NonNullable); + let bp = BitPackedArray::encode(prim.as_ref(), bit_width)?; + let zz = ZigZagArray::try_new(bp.into_array())?; + + let slice_start = 1024; + let slice_end = 3500; + let sliced = zz.as_ref().slice(slice_start..slice_end)?; + let expected: Vec = all_decoded[slice_start..slice_end].to_vec(); + + let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let (plan, _bufs) = build_plan(&sliced, &cuda_ctx)?; + + let actual = run_dynamic_dispatch_plan(&cuda_ctx, expected.len(), &plan)?; + assert_eq!(actual, expected); + + Ok(()) + } + + #[test] + fn test_sliced_dict_with_primitive_codes() -> VortexResult<()> { + let dict_values: Vec = vec![100, 200, 300, 400, 500]; + let dict_size = dict_values.len(); + let len = 5000; + let codes: Vec = (0..len).map(|i| (i % dict_size) as u32).collect(); + + let codes_prim = PrimitiveArray::new(Buffer::from(codes.clone()), NonNullable); + let values_prim = PrimitiveArray::new(Buffer::from(dict_values.clone()), NonNullable); + let dict = DictArray::try_new(codes_prim.into_array(), values_prim.into_array())?; + + let slice_start = 1000; + let slice_end = 3000; + let sliced = dict.as_ref().slice(slice_start..slice_end)?; + + let expected: Vec = codes[slice_start..slice_end] + .iter() + .map(|&c| dict_values[c as usize]) + .collect(); + + let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let (plan, _bufs) = build_plan(&sliced, &cuda_ctx)?; + + let actual = run_dynamic_dispatch_plan(&cuda_ctx, expected.len(), &plan)?; + assert_eq!(actual, expected); + + Ok(()) + } } diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs index f63e0edc569..e686c8106fa 100644 --- a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs +++ b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs @@ -19,6 +19,7 @@ use vortex::encodings::alp::ALPFloat; use vortex::encodings::alp::ALPVTable; use vortex::encodings::fastlanes::BitPackedArrayParts; use vortex::encodings::fastlanes::BitPackedVTable; +use vortex::encodings::fastlanes::FoRArray; use vortex::encodings::fastlanes::FoRVTable; use vortex::encodings::runend::RunEndArrayParts; use vortex::encodings::runend::RunEndVTable; @@ -84,8 +85,6 @@ struct Pipeline { /// receive a value regardless of whether the input was null. Only arrays with /// `NonNullable` or `AllValid` validity produce correct results. /// -/// **Slicing**: Not supported. -/// /// **Patches**: `BitPackedArray` with patches and `ALPArray` with patches are /// not supported and will return an error. /// @@ -161,6 +160,9 @@ impl PlanBuilderState<'_> { } /// Canonical primitive array → LOAD source op. + /// + /// The device pointer already accounts for buffer slicing, so no + /// offset parameter is needed. fn walk_primitive(&mut self, array: ArrayRef) -> VortexResult { let prim = array.to_canonical()?.into_primitive(); let PrimitiveArrayParts { buffer, .. } = prim.into_parts(); @@ -170,11 +172,17 @@ impl PlanBuilderState<'_> { Ok(Pipeline { source: SourceOp::load(), scalar_ops: vec![], - input_ptr: ptr, + input_ptr: ptr as u64, }) } /// BitPackedArray → BITUNPACK source op. + /// + /// The device pointer already accounts for buffer slicing, so no + /// offset parameter is needed. The sub-block element offset (0..1023) + /// from `BitPackedArrayParts` is the exception: It is passed as a kernel + /// parameter since it cannot be expressed as pointer arithmetic on + /// bit-packed data. fn walk_bitpacked(&mut self, array: ArrayRef) -> VortexResult { let bp = array .try_into::() @@ -187,11 +195,6 @@ impl PlanBuilderState<'_> { .. } = bp.into_parts(); - if offset != 0 { - vortex_bail!( - "Dynamic dispatch does not support sliced BitPackedArray (offset={offset})" - ); - } if patches.is_some() { vortex_bail!("Dynamic dispatch does not support BitPackedArray with patches"); } @@ -200,9 +203,9 @@ impl PlanBuilderState<'_> { let ptr = device_buf.cuda_device_ptr()?; self.device_buffers.push(device_buf); Ok(Pipeline { - source: SourceOp::bitunpack(bit_width), + source: SourceOp::bitunpack(bit_width, offset), scalar_ops: vec![], - input_ptr: ptr, + input_ptr: ptr as u64, }) } @@ -313,7 +316,7 @@ impl PlanBuilderState<'_> { } /// Extract a FoR reference scalar as u64 bits. -fn extract_for_reference(for_arr: &vortex::encodings::fastlanes::FoRArray) -> VortexResult { +fn extract_for_reference(for_arr: &FoRArray) -> VortexResult { if let Ok(v) = u32::try_from(for_arr.reference_scalar()) { Ok(v as u64) } else if let Ok(v) = i32::try_from(for_arr.reference_scalar()) { diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index 2f29fd149d0..fc66571ab45 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -157,13 +157,10 @@ where mod tests { use futures::executor::block_on; use rstest::rstest; - use vortex::array::ExecutionCtx; use vortex::array::IntoArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::assert_arrays_eq; - use vortex::array::session::ArraySession; use vortex::array::validity::Validity::NonNullable; - use vortex::array::vtable::VTable; use vortex::buffer::Buffer; use vortex::error::VortexExpect; use vortex::session::VortexSession; @@ -464,15 +461,7 @@ mod tests { let bitpacked_array = BitPackedArray::encode(primitive_array.as_ref(), bit_width) .vortex_expect("operation should succeed in test"); - let slice_ref = bitpacked_array.clone().into_array().slice(67..3969)?; - let mut exec_ctx = ExecutionCtx::new(VortexSession::empty().with::()); - let sliced_array = ::execute_parent( - &bitpacked_array, - &slice_ref, - 0, - &mut exec_ctx, - )? - .expect("expected slice kernel to execute"); + let sliced_array = bitpacked_array.as_ref().slice(67..3969)?; let cpu_result = sliced_array.to_canonical()?; let gpu_result = block_on(async { BitPackedExecutor