CUDA execute Patched

a10y · a10y · commit e16d1deb8db6 · 2026-04-09T15:19:23.000-04:00
Signed-off-by: Andrew Duffy <andrew@a10y.dev>
diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock
@@ -3368,6 +3368,10 @@ pub struct vortex_array::arrays::patched::Patched
 
 impl vortex_array::arrays::patched::Patched
 
+pub const vortex_array::arrays::patched::Patched::ID: vortex_array::ArrayId
+
+impl vortex_array::arrays::patched::Patched
+
 pub fn vortex_array::arrays::patched::Patched::from_array_and_patches(inner: vortex_array::ArrayRef, patches: &vortex_array::patches::Patches, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::Array<vortex_array::arrays::patched::Patched>>
 
 impl core::clone::Clone for vortex_array::arrays::patched::Patched
@@ -6218,6 +6222,10 @@ pub struct vortex_array::arrays::Patched
 
 impl vortex_array::arrays::patched::Patched
 
+pub const vortex_array::arrays::patched::Patched::ID: vortex_array::ArrayId
+
+impl vortex_array::arrays::patched::Patched
+
 pub fn vortex_array::arrays::patched::Patched::from_array_and_patches(inner: vortex_array::ArrayRef, patches: &vortex_array::patches::Patches, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::Array<vortex_array::arrays::patched::Patched>>
 
 impl core::clone::Clone for vortex_array::arrays::patched::Patched
diff --git a/vortex-array/src/arrays/patched/vtable/mod.rs b/vortex-array/src/arrays/patched/vtable/mod.rs
@@ -57,6 +57,11 @@ pub type PatchedArray = Array<Patched>;
 #[derive(Clone, Debug)]
 pub struct Patched;
 
+impl Patched {
+    /// The array ID (`vortex.patched`) of the Patched encoding; `VTable::id` returns this value.
+    pub const ID: ArrayId = ArrayId::new_ref("vortex.patched");
+}
+
 impl ValidityChild<Patched> for Patched {
     fn validity_child(array: ArrayView<'_, Patched>) -> ArrayRef {
         array.inner().clone()
@@ -99,7 +104,7 @@ impl VTable for Patched {
     type ValidityVTable = ValidityVTableFromChild;
 
     fn id(&self) -> ArrayId {
-        ArrayId::new_ref("vortex.patched")
+        Self::ID
     }
 
     fn validate(
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -3,12 +3,6 @@
 
 use std::fmt::Debug;
 
-use crate::CudaBufferExt;
-use crate::CudaDeviceBuffer;
-use crate::executor::CudaExecutionCtx;
-use crate::executor::{CudaArrayExt, CudaExecute};
-use crate::kernel::patches::gpu::GPUPatches;
-use crate::kernel::patches::types::{DevicePatches, transpose_patches};
 use async_trait::async_trait;
 use cudarc::driver::CudaFunction;
 use cudarc::driver::DeviceRepr;
@@ -24,15 +18,23 @@ use vortex::array::match_each_integer_ptype;
 use vortex::dtype::NativePType;
 use vortex::encodings::fastlanes::BitPacked;
 use vortex::encodings::fastlanes::BitPackedArray;
+use vortex::encodings::fastlanes::BitPackedArrayExt;
 use vortex::encodings::fastlanes::BitPackedDataParts;
 use vortex::encodings::fastlanes::unpack_iter::BitPacked as BitPackedUnpack;
 use vortex::error::VortexResult;
 use vortex::error::vortex_ensure;
 use vortex::error::vortex_err;
-use vortex_array::arrays::PatchedArray;
-use vortex_array::arrays::patched::PatchedArraySlotsExt;
 use vortex_array::patches::Patches;
 
+use crate::CudaBufferExt;
+use crate::CudaDeviceBuffer;
+use crate::executor::CudaArrayExt;
+use crate::executor::CudaExecute;
+use crate::executor::CudaExecutionCtx;
+use crate::kernel::patches::gpu::GPUPatches;
+use crate::kernel::patches::types::DevicePatches;
+use crate::kernel::patches::types::transpose_patches;
+
 /// CUDA decoder for bit-packed arrays.
 #[derive(Debug)]
 pub(crate) struct BitPackedExecutor;
@@ -54,8 +56,13 @@ impl CudaExecute for BitPackedExecutor {
         let array =
             Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected BitPackedArray"))?;
 
+        let patch_kind = match array.patches() {
+            Some(patches) => PatchKind::Interior(patches),
+            None => PatchKind::None,
+        };
+
         match_each_integer_ptype!(array.ptype(array.dtype()), |A| {
-            decode_bitpacked::<A>(array, A::default(), ctx).await
+            decode_bitpacked::<A>(array, A::default(), patch_kind, ctx).await
         })
     }
 }
@@ -110,7 +117,7 @@ pub(crate) enum PatchKind {
 
 impl PatchKind {
     pub(crate) async fn execute(
-        mut self,
+        self,
         ctx: &mut CudaExecutionCtx,
     ) -> VortexResult<Option<DevicePatches>> {
         match self {
@@ -160,6 +167,7 @@ impl PatchKind {
 pub(crate) async fn decode_bitpacked<A>(
     array: BitPackedArray,
     reference: A,
+    patch_kind: PatchKind,
     ctx: &mut CudaExecutionCtx,
 ) -> VortexResult<Canonical>
 where
@@ -171,7 +179,7 @@ where
         bit_width,
         len,
         packed,
-        patches,
+        patches: _,
         validity,
     } = BitPacked::into_parts(array);
 
@@ -192,12 +200,8 @@ where
     let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, ctx)?;
     let config = bitpacked_cuda_launch_config(output_width, len)?;
 
-    // We hold this here to keep the device buffers alive.
-    let device_patches = if let Some(patches) = patches {
-        Some(transpose_patches(&patches, ctx).await?)
-    } else {
-        None
-    };
+    // Execute the patch kind to get device patches; the binding keeps the device buffers alive.
+    let device_patches = patch_kind.execute(ctx).await?;
 
     let patches_arg = if let Some(p) = &device_patches {
         GPUPatches {
diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs
@@ -18,6 +18,7 @@ use vortex::array::match_each_integer_ptype;
 use vortex::array::match_each_native_simd_ptype;
 use vortex::dtype::NativePType;
 use vortex::encodings::fastlanes::BitPacked;
+use vortex::encodings::fastlanes::BitPackedArrayExt;
 use vortex::encodings::fastlanes::FoR;
 use vortex::encodings::fastlanes::FoRArray;
 use vortex::encodings::fastlanes::FoRArrayExt;
@@ -30,6 +31,7 @@ use crate::CudaBufferExt;
 use crate::executor::CudaArrayExt;
 use crate::executor::CudaExecute;
 use crate::executor::CudaExecutionCtx;
+use crate::kernel::encodings::bitpacked::PatchKind;
 use crate::kernel::encodings::bitpacked::decode_bitpacked;
 
 /// CUDA decoder for frame-of-reference.
@@ -54,9 +56,13 @@ impl CudaExecute for FoRExecutor {
 
         // Fuse FOR + BP => FFOR
         if let Some(bitpacked) = array.encoded().as_opt::<BitPacked>() {
+            let patch_kind = match bitpacked.patches() {
+                Some(patches) => PatchKind::Interior(patches),
+                None => PatchKind::None,
+            };
             match_each_integer_ptype!(bitpacked.ptype(bitpacked.dtype()), |P| {
                 let reference: P = array.reference_scalar().try_into()?;
-                return decode_bitpacked(bitpacked.into_owned(), reference, ctx).await;
+                return decode_bitpacked(bitpacked.into_owned(), reference, patch_kind, ctx).await;
             })
         }
 
@@ -65,9 +71,13 @@ impl CudaExecute for FoRExecutor {
             && let Some(bitpacked) = slice_array.child().as_opt::<BitPacked>()
         {
             let slice_range = slice_array.slice_range().clone();
+            let patch_kind = match bitpacked.patches() {
+                Some(patches) => PatchKind::Interior(patches),
+                None => PatchKind::None,
+            };
             let unpacked = match_each_integer_ptype!(bitpacked.ptype(bitpacked.dtype()), |P| {
                 let reference: P = array.reference_scalar().try_into()?;
-                decode_bitpacked(bitpacked.into_owned(), reference, ctx).await?
+                decode_bitpacked(bitpacked.into_owned(), reference, patch_kind, ctx).await?
             });
 
             return unpacked
diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs
@@ -6,6 +6,7 @@ mod bitpacked;
 mod date_time_parts;
 mod decimal_byte_parts;
 mod for_;
+mod patched;
 mod runend;
 mod sequence;
 mod zigzag;
@@ -18,6 +19,7 @@ pub(crate) use bitpacked::BitPackedExecutor;
 pub(crate) use date_time_parts::DateTimePartsExecutor;
 pub(crate) use decimal_byte_parts::DecimalBytePartsExecutor;
 pub(crate) use for_::FoRExecutor;
+pub(crate) use patched::PatchedExecutor;
 pub(crate) use runend::RunEndExecutor;
 pub(crate) use sequence::SequenceExecutor;
 pub(crate) use zigzag::ZigZagExecutor;
diff --git a/vortex-cuda/src/kernel/encodings/patched.rs b/vortex-cuda/src/kernel/encodings/patched.rs
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt::Debug;
+
+use async_trait::async_trait;
+use tracing::instrument;
+use vortex::array::ArrayRef;
+use vortex::array::Canonical;
+use vortex::array::match_each_integer_ptype;
+use vortex::encodings::fastlanes::BitPacked;
+use vortex::encodings::fastlanes::BitPackedArrayExt;
+use vortex::error::VortexResult;
+use vortex::error::vortex_err;
+use vortex_array::arrays::PatchedArray;
+use vortex_array::arrays::patched::Patched;
+use vortex_array::arrays::patched::PatchedArraySlotsExt;
+
+use crate::executor::CudaArrayExt;
+use crate::executor::CudaExecute;
+use crate::executor::CudaExecutionCtx;
+use crate::kernel::encodings::bitpacked::PatchKind;
+use crate::kernel::encodings::bitpacked::decode_bitpacked;
+
+/// CUDA executor for `Patched` arrays.
+///
+/// A BitPacked inner child is decoded by `decode_bitpacked`, with the wrapper's patches
+/// passed along as `PatchKind::Patched`; any other inner encoding currently yields an error.
+#[derive(Debug)]
+pub(crate) struct PatchedExecutor;
+
+impl PatchedExecutor {
+    fn try_specialize(array: ArrayRef) -> Option<PatchedArray> {
+        array.try_downcast::<Patched>().ok() // a failed downcast becomes `None`
+    }
+}
+
+#[async_trait]
+impl CudaExecute for PatchedExecutor {
+    #[instrument(level = "trace", skip_all, fields(executor = ?self))]
+    async fn execute(
+        &self,
+        array: ArrayRef,
+        ctx: &mut CudaExecutionCtx,
+    ) -> VortexResult<Canonical> {
+        let array =
+            Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected PatchedArray"))?;
+
+        // A BitPacked inner child is unpacked and patched by one `decode_bitpacked` call.
+        if let Some(bitpacked) = array.inner().as_opt::<BitPacked>() {
+            // Interior patches on the BitPacked child are rejected: this path only forwards
+            // the patches stored on the Patched wrapper (built into `patch_kind` below).
+            if bitpacked.patches().is_some() {
+                return Err(vortex_err!(
+                    "Patched(BitPacked) should not have interior patches in BitPacked child"
+                ));
+            }
+
+            // Hand the wrapper's lane offsets, patch indices and patch values to the decoder.
+            let patch_kind = PatchKind::Patched {
+                lane_offsets: array.lane_offsets().clone(),
+                patch_indices: array.patch_indices().clone(),
+                patch_values: array.patch_values().clone(),
+            };
+
+            match_each_integer_ptype!(bitpacked.ptype(bitpacked.dtype()), |P| {
+                return decode_bitpacked::<P>(
+                    bitpacked.into_owned(),
+                    P::default(),
+                    patch_kind,
+                    ctx,
+                )
+                .await;
+            })
+        }
+
+        // NOTE(review): no patches are applied below; this path always ends in the `Err`.
+        let inner_canonical = array.inner().clone().execute_cuda(ctx).await?;
+        let inner_primitive = inner_canonical.into_primitive();
+
+        // NOTE(review): these results only feed the error's `{:?}` arguments below.
+        let lane_offsets = array.lane_offsets().clone().execute_cuda(ctx).await?;
+        let patch_indices = array.patch_indices().clone().execute_cuda(ctx).await?;
+        let patch_values = array.patch_values().clone().execute_cuda(ctx).await?;
+
+        // Non-BitPacked inner encodings are not supported here yet, so report an error.
+        // NOTE(review): whether the caller then retries on the CPU is not visible here.
+        Err(vortex_err!(
+            "Patched array with non-BitPacked inner type not yet supported on GPU, \
+             inner encoding: {:?}, inner: {:?}, lane_offsets: {:?}, patch_indices: {:?}, patch_values: {:?}",
+            array.inner().encoding_id(),
+            inner_primitive,
+            lane_offsets,
+            patch_indices,
+            patch_values
+        ))
+    }
+}
diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs
@@ -39,6 +39,7 @@ use kernel::DictExecutor;
 use kernel::FilterExecutor;
 use kernel::FoRExecutor;
 pub use kernel::LaunchStrategy;
+use kernel::PatchedExecutor;
 use kernel::RunEndExecutor;
 use kernel::SharedExecutor;
 pub use kernel::TracingLaunchStrategy;
@@ -62,6 +63,7 @@ pub use stream_pool::VortexCudaStreamPool;
 use vortex::array::arrays::Constant;
 use vortex::array::arrays::Dict;
 use vortex::array::arrays::Filter;
+use vortex::array::arrays::Patched;
 use vortex::array::arrays::Shared;
 use vortex::array::arrays::Slice;
 use vortex::encodings::alp::ALP;
@@ -99,6 +101,7 @@ pub fn initialize_cuda(session: &CudaSession) {
     session.register_kernel(DateTimeParts::ID, &DateTimePartsExecutor);
     session.register_kernel(DecimalByteParts::ID, &DecimalBytePartsExecutor);
     session.register_kernel(Dict::ID, &DictExecutor);
+    session.register_kernel(Patched::ID, &PatchedExecutor);
     session.register_kernel(Shared::ID, &SharedExecutor);
     session.register_kernel(FoR::ID, &FoRExecutor);
     session.register_kernel(RunEnd::ID, &RunEndExecutor);