diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock
index 36b4a4abe43..205c9639069 100644
--- a/encodings/fastlanes/public-api.lock
+++ b/encodings/fastlanes/public-api.lock
@@ -196,9 +196,9 @@ impl vortex_array::arrays::filter::kernel::FilterKernel for vortex_fastlanes::Bi
 
 pub fn vortex_fastlanes::BitPackedVTable::filter(array: &vortex_fastlanes::BitPackedArray, mask: &vortex_mask::Mask, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
 
-impl vortex_array::arrays::slice::SliceKernel for vortex_fastlanes::BitPackedVTable
+impl vortex_array::arrays::slice::SliceReduce for vortex_fastlanes::BitPackedVTable
 
-pub fn vortex_fastlanes::BitPackedVTable::slice(array: &vortex_fastlanes::BitPackedArray, range: core::ops::range::Range<usize>, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
+pub fn vortex_fastlanes::BitPackedVTable::slice(array: &vortex_fastlanes::BitPackedArray, range: core::ops::range::Range<usize>) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
 
 impl vortex_array::compute::is_constant::IsConstantKernel for vortex_fastlanes::BitPackedVTable
 
diff --git a/encodings/fastlanes/src/bitpacking/compute/slice.rs b/encodings/fastlanes/src/bitpacking/compute/slice.rs
index 55cd12b3975..2e49a5af8cd 100644
--- a/encodings/fastlanes/src/bitpacking/compute/slice.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/slice.rs
@@ -5,20 +5,15 @@ use std::cmp::max;
 use std::ops::Range;
 
 use vortex_array::ArrayRef;
-use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
-use vortex_array::arrays::SliceKernel;
+use vortex_array::arrays::SliceReduce;
 use vortex_error::VortexResult;
 
 use crate::BitPackedArray;
 use crate::BitPackedVTable;
 
-impl SliceKernel for BitPackedVTable {
-    fn slice(
-        array: &BitPackedArray,
-        range: Range<usize>,
-        _ctx: &mut ExecutionCtx,
-    ) -> VortexResult<Option<ArrayRef>> {
+impl SliceReduce for BitPackedVTable {
+    fn slice(array: &BitPackedArray, range: Range<usize>) -> VortexResult<Option<ArrayRef>> {
         let offset_start = range.start + array.offset() as usize;
         let offset_stop = range.end + array.offset() as usize;
         let offset = offset_start % 1024;
@@ -51,43 +46,44 @@ impl SliceKernel for BitPackedVTable {
 
 #[cfg(test)]
 mod tests {
-    use std::sync::LazyLock;
-
     use vortex_array::Array;
-    use vortex_array::IntoArray;
-    use vortex_array::VortexSessionExecute;
-    use vortex_array::arrays::SliceArray;
-    use vortex_array::session::ArraySession;
-    use vortex_array::vtable::VTable;
+    use vortex_array::arrays::SliceReduce;
+    use vortex_array::arrays::SliceVTable;
     use vortex_error::VortexResult;
-    use vortex_session::VortexSession;
 
     use crate::BitPackedVTable;
     use crate::bitpack_compress::bitpack_encode;
 
-    static SESSION: LazyLock<VortexSession> =
-        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+    #[test]
+    fn test_slice_returns_bitpacked() -> VortexResult<()> {
+        let values = vortex_array::arrays::PrimitiveArray::from_iter(0u32..2048);
+        let bitpacked = bitpack_encode(&values, 11, None)?;
+
+        let result =
+            BitPackedVTable::slice(&bitpacked, 500..1500)?.expect("expected slice to succeed");
+
+        assert!(result.is::<BitPackedVTable>());
+        let result_bp = result.as_::<BitPackedVTable>();
+        assert_eq!(result_bp.offset(), 500);
+        assert_eq!(result.len(), 1000);
+
+        Ok(())
+    }
 
     #[test]
-    fn test_execute_parent_returns_bitpacked_slice() -> VortexResult<()> {
+    fn test_slice_via_array_trait() -> VortexResult<()> {
         let values = vortex_array::arrays::PrimitiveArray::from_iter(0u32..2048);
         let bitpacked = bitpack_encode(&values, 11, None)?;
 
-        let slice_array = SliceArray::new(bitpacked.clone().into_array(), 500..1500);
-
-        let mut ctx = SESSION.create_execution_ctx();
-        let reduced = <BitPackedVTable as VTable>::execute_parent(
-            &bitpacked,
-            &slice_array.into_array(),
-            0,
-            &mut ctx,
-        )?
-        .expect("expected slice kernel to execute");
-
-        assert!(reduced.is::<BitPackedVTable>());
-        let reduced_bp = reduced.as_::<BitPackedVTable>();
-        assert_eq!(reduced_bp.offset(), 500);
-        assert_eq!(reduced.len(), 1000);
+        let sliced = bitpacked.as_ref().slice(500..1500)?;
+
+        // After optimize, the SliceArray should have been reduced away.
+        assert!(
+            !sliced.is::<SliceVTable>(),
+            "expected SliceReduce to eliminate the SliceArray wrapper"
+        );
+        assert!(sliced.is::<BitPackedVTable>());
+        assert_eq!(sliced.len(), 1000);
 
         Ok(())
     }
diff --git a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs
index ca9b98c09db..0685ac4e536 100644
--- a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs
+++ b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs
@@ -2,7 +2,6 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use vortex_array::arrays::FilterExecuteAdaptor;
-use vortex_array::arrays::SliceExecuteAdaptor;
 use vortex_array::arrays::TakeExecuteAdaptor;
 use vortex_array::kernel::ParentKernelSet;
 
@@ -10,6 +9,5 @@ use crate::BitPackedVTable;
 
 pub(crate) const PARENT_KERNELS: ParentKernelSet<BitPackedVTable> = ParentKernelSet::new(&[
     ParentKernelSet::lift(&FilterExecuteAdaptor(BitPackedVTable)),
-    ParentKernelSet::lift(&SliceExecuteAdaptor(BitPackedVTable)),
     ParentKernelSet::lift(&TakeExecuteAdaptor(BitPackedVTable)),
 ]);
diff --git a/encodings/fastlanes/src/bitpacking/vtable/operations.rs b/encodings/fastlanes/src/bitpacking/vtable/operations.rs
index 2dcf62ceac3..5e6e2000567 100644
--- a/encodings/fastlanes/src/bitpacking/vtable/operations.rs
+++ b/encodings/fastlanes/src/bitpacking/vtable/operations.rs
@@ -26,13 +26,10 @@ impl OperationsVTable<BitPackedVTable> for BitPackedVTable {
 #[cfg(test)]
 mod test {
     use std::ops::Range;
-    use std::sync::LazyLock;
 
     use vortex_array::Array;
     use vortex_array::IntoArray;
-    use vortex_array::VortexSessionExecute;
     use vortex_array::arrays::PrimitiveArray;
-    use vortex_array::arrays::SliceArray;
     use vortex_array::assert_arrays_eq;
     use vortex_array::assert_nth_scalar;
     use vortex_array::buffer::BufferHandle;
@@ -41,9 +38,7 @@ mod test {
     use vortex_array::dtype::PType;
     use vortex_array::patches::Patches;
     use vortex_array::scalar::Scalar;
-    use vortex_array::session::ArraySession;
     use vortex_array::validity::Validity;
-    use vortex_array::vtable::VTable;
     use vortex_buffer::Alignment;
     use vortex_buffer::Buffer;
     use vortex_buffer::ByteBuffer;
@@ -52,20 +47,8 @@ mod test {
     use crate::BitPackedArray;
     use crate::BitPackedVTable;
 
-    static SESSION: LazyLock<vortex_session::VortexSession> =
-        LazyLock::new(|| vortex_session::VortexSession::empty().with::<ArraySession>());
-
     fn slice_via_kernel(array: &BitPackedArray, range: Range<usize>) -> BitPackedArray {
-        let slice_array = SliceArray::new(array.clone().into_array(), range);
-        let mut ctx = SESSION.create_execution_ctx();
-        let sliced = <BitPackedVTable as VTable>::execute_parent(
-            array,
-            &slice_array.into_array(),
-            0,
-            &mut ctx,
-        )
-        .expect("execute_parent failed")
-        .expect("expected slice kernel to execute");
+        let sliced = array.as_ref().slice(range).expect("slice failed");
         sliced.as_::<BitPackedVTable>().clone()
     }
 
diff --git a/encodings/fastlanes/src/bitpacking/vtable/rules.rs b/encodings/fastlanes/src/bitpacking/vtable/rules.rs
index 35c4a388f6a..53bff931272 100644
--- a/encodings/fastlanes/src/bitpacking/vtable/rules.rs
+++ b/encodings/fastlanes/src/bitpacking/vtable/rules.rs
@@ -1,10 +1,13 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use vortex_array::arrays::SliceReduceAdaptor;
 use vortex_array::optimizer::rules::ParentRuleSet;
 use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor;
 
 use crate::BitPackedVTable;
 
-pub(crate) const RULES: ParentRuleSet<BitPackedVTable> =
-    ParentRuleSet::new(&[ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable))]);
+pub(crate) const RULES: ParentRuleSet<BitPackedVTable> = ParentRuleSet::new(&[
+    ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable)),
+    ParentRuleSet::lift(&SliceReduceAdaptor(BitPackedVTable)),
+]);
diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs
index 7aac1762804..2cea7cc579f 100644
--- a/vortex-cuda/gpu-scan-cli/src/main.rs
+++ b/vortex-cuda/gpu-scan-cli/src/main.rs
@@ -94,7 +94,7 @@ async fn main() -> VortexResult<()> {
 
     // Create a full scan that executes on the GPU
     let cuda_stream =
-        VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?;
+        VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).stream()?;
     let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream);
 
     let gpu_file = session
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu
index 9168a8580ab..14aa55b66ad 100644
--- a/vortex-cuda/kernels/src/dynamic_dispatch.cu
+++ b/vortex-cuda/kernels/src/dynamic_dispatch.cu
@@ -56,8 +56,10 @@ __device__ inline void dynamic_source_op(const T *__restrict input,
         constexpr uint32_t FL_CHUNK_SIZE = 1024;
         constexpr uint32_t LANES_PER_FL_BLOCK = FL_CHUNK_SIZE / T_BITS;
         const uint32_t bit_width = source_op.params.bitunpack.bit_width;
+        const uint32_t element_offset = source_op.params.bitunpack.element_offset;
         const uint32_t packed_words_per_fl_block = LANES_PER_FL_BLOCK * bit_width;
-        const uint64_t first_fl_block = chunk_start / FL_CHUNK_SIZE;
+        // Shift chunk_start by the sub-block element offset.
+        const uint64_t first_fl_block = (chunk_start + element_offset) / FL_CHUNK_SIZE;
 
         // FL blocks must divide evenly. Otherwise, the last unpack would overflow smem.
         static_assert((ELEMENTS_PER_BLOCK % FL_CHUNK_SIZE) == 0);
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.h b/vortex-cuda/kernels/src/dynamic_dispatch.h
index f8fbeaf6c13..c208be7459b 100644
--- a/vortex-cuda/kernels/src/dynamic_dispatch.h
+++ b/vortex-cuda/kernels/src/dynamic_dispatch.h
@@ -44,9 +44,12 @@ union SourceParams {
     /// Unpack bit-packed data using FastLanes layout.
     struct BitunpackParams {
         uint8_t bit_width;
+        uint8_t _padding[3];
+        uint32_t element_offset; // Element offset within FL block (0..1023)
     } bitunpack;
 
     /// Copy elements verbatim from global memory to shared memory.
+    /// The input pointer is pre-adjusted on the host to account for slicing.
     struct LoadParams {
         uint8_t _padding;
     } load;
diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs
index 17bcd44f5d4..c8d2841f10a 100644
--- a/vortex-cuda/src/device_buffer.rs
+++ b/vortex-cuda/src/device_buffer.rs
@@ -81,8 +81,6 @@ mod private {
     }
 }
 
-// Get it back out as a View of u8
-
 impl CudaDeviceBuffer {
     /// Creates a new CUDA device buffer from a [`CudaSlice<T>`].
     ///
@@ -101,6 +99,16 @@ impl CudaDeviceBuffer {
         }
     }
 
+    /// Returns the byte offset within the allocated buffer.
+    pub fn offset(&self) -> usize {
+        self.offset
+    }
+
+    /// Returns the adjusted device pointer accounting for the offset.
+    pub fn offset_ptr(&self) -> sys::CUdeviceptr {
+        self.device_ptr + self.offset as u64
+    }
+
     /// Returns a [`CudaView`] to the CUDA device buffer.
     pub fn as_view<T: DeviceRepr + 'static>(&self) -> CudaView<'_, T> {
         // Return a new &[T]
@@ -159,7 +167,7 @@ impl CudaBufferExt for BufferHandle {
             .as_any()
             .downcast_ref::<CudaDeviceBuffer>()
             .ok_or_else(|| vortex_err!("expected CudaDeviceBuffer"))?
-            .device_ptr;
+            .offset_ptr();
 
         Ok(ptr)
     }
@@ -281,7 +289,7 @@ impl DeviceBuffer for CudaDeviceBuffer {
 
     /// Slices the CUDA device buffer to a subrange.
     ///
-    /// **IMPORTANT**: this is a byte range, not elements range, due to the DeviceBuffer interface.
+    /// This is a byte range, not elements range, due to the DeviceBuffer interface.
     fn slice(&self, range: Range<usize>) -> Arc<dyn DeviceBuffer> {
         assert!(
             range.end <= self.len,
diff --git a/vortex-cuda/src/dynamic_dispatch/mod.rs b/vortex-cuda/src/dynamic_dispatch/mod.rs
index 133fb5de677..5832601933d 100644
--- a/vortex-cuda/src/dynamic_dispatch/mod.rs
+++ b/vortex-cuda/src/dynamic_dispatch/mod.rs
@@ -32,16 +32,27 @@ unsafe impl cudarc::driver::DeviceRepr for Stage {}
 
 impl SourceOp {
     /// Unpack bit-packed data using FastLanes layout.
-    pub fn bitunpack(bit_width: u8) -> Self {
+    ///
+    /// The device pointer already accounts for buffer slicing, so no
+    /// offset parameter is needed. `element_offset` (0..1023) is the
+    /// remaining position within the first FastLanes block.
+    pub fn bitunpack(bit_width: u8, element_offset: u16) -> Self {
         Self {
             op_code: SourceOp_SourceOpCode_BITUNPACK,
             params: SourceParams {
-                bitunpack: SourceParams_BitunpackParams { bit_width },
+                bitunpack: SourceParams_BitunpackParams {
+                    bit_width,
+                    _padding: [0; 3],
+                    element_offset: u32::from(element_offset),
+                },
             },
         }
     }
 
     /// Copy elements verbatim from global memory to shared memory.
+    ///
+    /// The device pointer already accounts for buffer slicing, so no
+    /// offset parameter is needed.
     pub fn load() -> Self {
         Self {
             op_code: SourceOp_SourceOpCode_LOAD,
@@ -262,7 +273,7 @@ mod tests {
         let plan = DynamicDispatchPlan::new([Stage::output(
             input_ptr,
             0,
-            SourceOp::bitunpack(bit_width),
+            SourceOp::bitunpack(bit_width, 0),
             &scalar_ops,
         )]);
         assert_eq!(plan.stages[0].num_scalar_ops, 4);
@@ -282,13 +293,13 @@ mod tests {
                 0xAAAA,
                 0,
                 256,
-                SourceOp::bitunpack(4),
+                SourceOp::bitunpack(4, 0),
                 &[ScalarOp::frame_of_ref(10)],
             ),
             Stage::output(
                 0xBBBB,
                 256,
-                SourceOp::bitunpack(6),
+                SourceOp::bitunpack(6, 0),
                 &[ScalarOp::frame_of_ref(42), ScalarOp::dict(0)],
             ),
         ]);
@@ -687,4 +698,113 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_sliced_primitive() -> VortexResult<()> {
+        let len = 5000;
+        let data: Vec<u32> = (0..len).map(|i| (i * 7) % 1000).collect();
+
+        let prim = PrimitiveArray::new(Buffer::from(data.clone()), NonNullable);
+
+        let slice_start = 500;
+        let slice_end = 3500;
+        let sliced = prim.as_ref().slice(slice_start..slice_end)?;
+
+        let expected: Vec<u32> = data[slice_start..slice_end].to_vec();
+
+        let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
+        let (plan, _bufs) = build_plan(&sliced, &cuda_ctx)?;
+
+        let actual = run_dynamic_dispatch_plan(&cuda_ctx, expected.len(), &plan)?;
+        assert_eq!(actual, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_sliced_for_bitpacked() -> VortexResult<()> {
+        let reference = 100u32;
+        let bit_width = 10u8;
+        let max_val = (1u32 << bit_width) - 1;
+        let len = 5000;
+
+        let encoded_data: Vec<u32> = (0..len).map(|i| (i as u32) % max_val).collect();
+        let prim = PrimitiveArray::new(Buffer::from(encoded_data.clone()), NonNullable);
+        let bp = BitPackedArray::encode(prim.as_ref(), bit_width)?;
+        let for_arr = FoRArray::try_new(bp.into_array(), Scalar::from(reference))?;
+
+        let all_decoded: Vec<u32> = encoded_data.iter().map(|&v| v + reference).collect();
+
+        let slice_start = 1024;
+        let slice_end = 3500;
+        let sliced = for_arr.as_ref().slice(slice_start..slice_end)?;
+        let expected: Vec<u32> = all_decoded[slice_start..slice_end].to_vec();
+
+        let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
+        let (plan, _bufs) = build_plan(&sliced, &cuda_ctx)?;
+
+        let actual = run_dynamic_dispatch_plan(&cuda_ctx, expected.len(), &plan)?;
+        assert_eq!(actual, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_sliced_zigzag_bitpacked() -> VortexResult<()> {
+        let bit_width = 10u8;
+        let max_val = (1u32 << bit_width) - 1;
+        let len = 5000;
+
+        let raw: Vec<u32> = (0..len).map(|i| (i as u32) % max_val).collect();
+        let all_decoded: Vec<u32> = raw
+            .iter()
+            .map(|&v| (v >> 1) ^ (0u32.wrapping_sub(v & 1)))
+            .collect();
+
+        let prim = PrimitiveArray::new(Buffer::from(raw), NonNullable);
+        let bp = BitPackedArray::encode(prim.as_ref(), bit_width)?;
+        let zz = ZigZagArray::try_new(bp.into_array())?;
+
+        let slice_start = 1024;
+        let slice_end = 3500;
+        let sliced = zz.as_ref().slice(slice_start..slice_end)?;
+        let expected: Vec<u32> = all_decoded[slice_start..slice_end].to_vec();
+
+        let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
+        let (plan, _bufs) = build_plan(&sliced, &cuda_ctx)?;
+
+        let actual = run_dynamic_dispatch_plan(&cuda_ctx, expected.len(), &plan)?;
+        assert_eq!(actual, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_sliced_dict_with_primitive_codes() -> VortexResult<()> {
+        let dict_values: Vec<u32> = vec![100, 200, 300, 400, 500];
+        let dict_size = dict_values.len();
+        let len = 5000;
+        let codes: Vec<u32> = (0..len).map(|i| (i % dict_size) as u32).collect();
+
+        let codes_prim = PrimitiveArray::new(Buffer::from(codes.clone()), NonNullable);
+        let values_prim = PrimitiveArray::new(Buffer::from(dict_values.clone()), NonNullable);
+        let dict = DictArray::try_new(codes_prim.into_array(), values_prim.into_array())?;
+
+        let slice_start = 1000;
+        let slice_end = 3000;
+        let sliced = dict.as_ref().slice(slice_start..slice_end)?;
+
+        let expected: Vec<u32> = codes[slice_start..slice_end]
+            .iter()
+            .map(|&c| dict_values[c as usize])
+            .collect();
+
+        let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
+        let (plan, _bufs) = build_plan(&sliced, &cuda_ctx)?;
+
+        let actual = run_dynamic_dispatch_plan(&cuda_ctx, expected.len(), &plan)?;
+        assert_eq!(actual, expected);
+
+        Ok(())
+    }
 }
diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs
index f63e0edc569..e686c8106fa 100644
--- a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs
+++ b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs
@@ -19,6 +19,7 @@ use vortex::encodings::alp::ALPFloat;
 use vortex::encodings::alp::ALPVTable;
 use vortex::encodings::fastlanes::BitPackedArrayParts;
 use vortex::encodings::fastlanes::BitPackedVTable;
+use vortex::encodings::fastlanes::FoRArray;
 use vortex::encodings::fastlanes::FoRVTable;
 use vortex::encodings::runend::RunEndArrayParts;
 use vortex::encodings::runend::RunEndVTable;
@@ -84,8 +85,6 @@ struct Pipeline {
 /// receive a value regardless of whether the input was null. Only arrays with
 /// `NonNullable` or `AllValid` validity produce correct results.
 ///
-/// **Slicing**: Not supported.
-///
 /// **Patches**: `BitPackedArray` with patches and `ALPArray` with patches are
 /// not supported and will return an error.
 ///
@@ -161,6 +160,9 @@ impl PlanBuilderState<'_> {
     }
 
     /// Canonical primitive array → LOAD source op.
+    ///
+    /// The device pointer already accounts for buffer slicing, so no
+    /// offset parameter is needed.
     fn walk_primitive(&mut self, array: ArrayRef) -> VortexResult<Pipeline> {
         let prim = array.to_canonical()?.into_primitive();
         let PrimitiveArrayParts { buffer, .. } = prim.into_parts();
@@ -170,11 +172,17 @@ impl PlanBuilderState<'_> {
         Ok(Pipeline {
             source: SourceOp::load(),
             scalar_ops: vec![],
-            input_ptr: ptr,
+            input_ptr: ptr as u64,
         })
     }
 
     /// BitPackedArray → BITUNPACK source op.
+    ///
+    /// The device pointer already accounts for buffer slicing, so no
+    /// offset parameter is needed. The sub-block element offset (0..1023)
+    /// from `BitPackedArrayParts` is the exception: It is passed as a kernel
+    /// parameter since it cannot be expressed as pointer arithmetic on
+    /// bit-packed data.
     fn walk_bitpacked(&mut self, array: ArrayRef) -> VortexResult<Pipeline> {
         let bp = array
             .try_into::<BitPackedVTable>()
@@ -187,11 +195,6 @@ impl PlanBuilderState<'_> {
             ..
         } = bp.into_parts();
 
-        if offset != 0 {
-            vortex_bail!(
-                "Dynamic dispatch does not support sliced BitPackedArray (offset={offset})"
-            );
-        }
         if patches.is_some() {
             vortex_bail!("Dynamic dispatch does not support BitPackedArray with patches");
         }
@@ -200,9 +203,9 @@ impl PlanBuilderState<'_> {
         let ptr = device_buf.cuda_device_ptr()?;
         self.device_buffers.push(device_buf);
         Ok(Pipeline {
-            source: SourceOp::bitunpack(bit_width),
+            source: SourceOp::bitunpack(bit_width, offset),
             scalar_ops: vec![],
-            input_ptr: ptr,
+            input_ptr: ptr as u64,
         })
     }
 
@@ -313,7 +316,7 @@ impl PlanBuilderState<'_> {
 }
 
 /// Extract a FoR reference scalar as u64 bits.
-fn extract_for_reference(for_arr: &vortex::encodings::fastlanes::FoRArray) -> VortexResult<u64> {
+fn extract_for_reference(for_arr: &FoRArray) -> VortexResult<u64> {
     if let Ok(v) = u32::try_from(for_arr.reference_scalar()) {
         Ok(v as u64)
     } else if let Ok(v) = i32::try_from(for_arr.reference_scalar()) {
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs
index 2f29fd149d0..fc66571ab45 100644
--- a/vortex-cuda/src/kernel/encodings/bitpacked.rs
+++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -157,13 +157,10 @@ where
 mod tests {
     use futures::executor::block_on;
     use rstest::rstest;
-    use vortex::array::ExecutionCtx;
     use vortex::array::IntoArray;
     use vortex::array::arrays::PrimitiveArray;
     use vortex::array::assert_arrays_eq;
-    use vortex::array::session::ArraySession;
     use vortex::array::validity::Validity::NonNullable;
-    use vortex::array::vtable::VTable;
     use vortex::buffer::Buffer;
     use vortex::error::VortexExpect;
     use vortex::session::VortexSession;
@@ -464,15 +461,7 @@ mod tests {
 
         let bitpacked_array = BitPackedArray::encode(primitive_array.as_ref(), bit_width)
             .vortex_expect("operation should succeed in test");
-        let slice_ref = bitpacked_array.clone().into_array().slice(67..3969)?;
-        let mut exec_ctx = ExecutionCtx::new(VortexSession::empty().with::<ArraySession>());
-        let sliced_array = <BitPackedVTable as VTable>::execute_parent(
-            &bitpacked_array,
-            &slice_ref,
-            0,
-            &mut exec_ctx,
-        )?
-        .expect("expected slice kernel to execute");
+        let sliced_array = bitpacked_array.as_ref().slice(67..3969)?;
         let cpu_result = sliced_array.to_canonical()?;
         let gpu_result = block_on(async {
             BitPackedExecutor