Skip to content

Commit 892ee60

Browse files
committed
Align TensorMap API surface with review feedback and enforce context safety.
Expose only TensorMapDescriptor in cuda.core, add StridedMemoryView.as_tensor_map(), remove redundant tensor-map fallback packing, and track/check descriptor context/device compatibility before replacement and kernel launch argument packing. Made-with: Cursor
1 parent 1a6b416 commit 892ee60

9 files changed

Lines changed: 107 additions & 23 deletions

File tree

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ __pycache__/
1414
!*_impl.cpp
1515
!cuda_bindings/cuda/bindings/_lib/param_packer.cpp
1616
!cuda_bindings/cuda/bindings/_bindings/loader.cpp
17-
!cuda_core/cuda/core/_cpp/*.cpp
1817
cache_driver
1918
cache_runtime
2019
cache_nvrtc

cuda_core/cuda/core/__init__.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,4 @@
6868
Stream,
6969
StreamOptions,
7070
)
71-
from cuda.core._tensor_map import (
72-
TensorMapDataType,
73-
TensorMapDescriptor,
74-
TensorMapIm2ColWideMode,
75-
TensorMapInterleave,
76-
TensorMapL2Promotion,
77-
TensorMapOOBFill,
78-
TensorMapSwizzle,
79-
)
71+
from cuda.core._tensor_map import TensorMapDescriptor

cuda_core/cuda/core/_kernel_arg_handler.pyx

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ cdef inline int prepare_tensor_map_arg(
135135
vector.vector[void*]& data_addresses,
136136
TensorMapDescriptor arg,
137137
const size_t idx) except -1:
138+
arg._check_context_compat()
138139
# Allocate a temporary buffer for the 128-byte CUtensorMap struct.
139140
# We copy rather than pointing directly at arg._tensor_map for lifetime
140141
# safety: ParamHolder owns and frees its argument buffers independently.
@@ -350,9 +351,6 @@ cdef class ParamHolder:
350351
elif isinstance(arg, driver.CUgraphConditionalHandle):
351352
prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i)
352353
continue
353-
elif isinstance(arg, tensor_map_descriptor_type):
354-
prepare_tensor_map_arg(self.data, self.data_addresses, <TensorMapDescriptor>arg, i)
355-
continue
356354
# TODO: support ctypes/numpy struct
357355
raise TypeError("the argument is of unsupported type: " + str(type(arg)))
358356

cuda_core/cuda/core/_memoryview.pyx

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,39 @@ cdef class StridedMemoryView:
316316
view_buffer_strided(view, self.get_buffer(), layout, dtype, self.readonly)
317317
return view
318318

319+
def as_tensor_map(
320+
self,
321+
box_dim,
322+
*,
323+
element_strides=None,
324+
data_type=None,
325+
interleave=None,
326+
swizzle=None,
327+
l2_promotion=None,
328+
oob_fill=None,
329+
):
330+
"""Create a tiled :obj:`TensorMapDescriptor` from this view.
331+
332+
This is a convenience wrapper around
333+
:meth:`cuda.core._tensor_map.TensorMapDescriptor.from_tiled`.
334+
"""
335+
from cuda.core._tensor_map import TensorMapDescriptor
336+
337+
kwargs = {}
338+
if element_strides is not None:
339+
kwargs["element_strides"] = element_strides
340+
if data_type is not None:
341+
kwargs["data_type"] = data_type
342+
if interleave is not None:
343+
kwargs["interleave"] = interleave
344+
if swizzle is not None:
345+
kwargs["swizzle"] = swizzle
346+
if l2_promotion is not None:
347+
kwargs["l2_promotion"] = l2_promotion
348+
if oob_fill is not None:
349+
kwargs["oob_fill"] = oob_fill
350+
return TensorMapDescriptor.from_tiled(self, box_dim, **kwargs)
351+
319352
def copy_from(
320353
self, other : StridedMemoryView, stream : Stream,
321354
allocator = None,

cuda_core/cuda/core/_tensor_map.pxd

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,16 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
from cuda.bindings cimport cydriver
6+
from libc.stdint cimport intptr_t
67

78

89
cdef class TensorMapDescriptor:
910
cdef cydriver.CUtensorMap _tensor_map
11+
cdef int _device_id
12+
cdef intptr_t _context
1013
cdef object _source_ref
1114
cdef object _view_ref
1215
cdef object _repr_info
1316

17+
cdef int _check_context_compat(self) except -1
1418
cdef void* _get_data_ptr(self)

cuda_core/cuda/core/_tensor_map.pyx

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,22 @@ def _get_validated_view(tensor):
263263
return view
264264

265265

266+
cdef inline intptr_t _get_current_context_ptr() except? 0:
267+
cdef cydriver.CUcontext ctx
268+
with nogil:
269+
HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
270+
if ctx == NULL:
271+
raise RuntimeError("TensorMapDescriptor requires an active CUDA context")
272+
return <intptr_t>ctx
273+
274+
275+
cdef inline int _get_current_device_id() except -1:
276+
cdef cydriver.CUdevice dev
277+
with nogil:
278+
HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
279+
return <int>dev
280+
281+
266282
def _compute_byte_strides(shape, strides, elem_size):
267283
"""Compute byte strides from element strides or C-contiguous fallback.
268284
@@ -313,6 +329,28 @@ cdef class TensorMapDescriptor:
313329
cdef void* _get_data_ptr(self):
314330
return <void*>&self._tensor_map
315331

332+
cdef int _check_context_compat(self) except -1:
333+
cdef cydriver.CUcontext current_ctx
334+
cdef cydriver.CUdevice current_dev
335+
cdef int current_dev_id
336+
if self._context == 0 and self._device_id < 0:
337+
return 0
338+
with nogil:
339+
HANDLE_RETURN(cydriver.cuCtxGetCurrent(&current_ctx))
340+
if current_ctx == NULL:
341+
raise RuntimeError("TensorMapDescriptor requires an active CUDA context")
342+
if self._context != 0 and <intptr_t>current_ctx != self._context:
343+
raise RuntimeError(
344+
"TensorMapDescriptor was created in a different CUDA context")
345+
with nogil:
346+
HANDLE_RETURN(cydriver.cuCtxGetDevice(&current_dev))
347+
current_dev_id = <int>current_dev
348+
if self._device_id >= 0 and current_dev_id != self._device_id:
349+
raise RuntimeError(
350+
f"TensorMapDescriptor belongs to device {self._device_id}, "
351+
f"but current device is {current_dev_id}")
352+
return 0
353+
316354
@classmethod
317355
def from_tiled(cls, tensor, box_dim, *,
318356
element_strides=None,
@@ -366,6 +404,8 @@ cdef class TensorMapDescriptor:
366404
# deleter can free the backing allocation when released.
367405
desc._source_ref = tensor
368406
desc._view_ref = view
407+
desc._context = _get_current_context_ptr()
408+
desc._device_id = _get_current_device_id()
369409

370410
tma_dt = _resolve_data_type(view, data_type)
371411
cdef int c_data_type_int = int(tma_dt)
@@ -593,6 +633,8 @@ cdef class TensorMapDescriptor:
593633
view = _get_validated_view(tensor)
594634
desc._source_ref = tensor
595635
desc._view_ref = view
636+
desc._context = _get_current_context_ptr()
637+
desc._device_id = _get_current_device_id()
596638

597639
tma_dt = _resolve_data_type(view, data_type)
598640
cdef int c_data_type_int = int(tma_dt)
@@ -750,6 +792,8 @@ cdef class TensorMapDescriptor:
750792
view = _get_validated_view(tensor)
751793
desc._source_ref = tensor
752794
desc._view_ref = view
795+
desc._context = _get_current_context_ptr()
796+
desc._device_id = _get_current_device_id()
753797

754798
tma_dt = _resolve_data_type(view, data_type)
755799
cdef int c_data_type_int = int(tma_dt)
@@ -839,7 +883,11 @@ cdef class TensorMapDescriptor:
839883
or a :obj:`~cuda.core.StridedMemoryView`. Must refer to
840884
device-accessible memory with a 16-byte-aligned pointer.
841885
"""
886+
self._check_context_compat()
842887
view = _get_validated_view(tensor)
888+
if view.device_id != self._device_id:
889+
raise ValueError(
890+
f"replace_address expects tensor on device {self._device_id}, got {view.device_id}")
843891

844892
cdef intptr_t global_address = view.ptr
845893

cuda_core/examples/tma_replace_address.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
LaunchConfig,
3737
Program,
3838
ProgramOptions,
39-
TensorMapDescriptor,
39+
StridedMemoryView,
4040
launch,
4141
)
4242

@@ -159,7 +159,7 @@
159159
output = cp.zeros(N, dtype=cp.float32)
160160
dev.sync() # cupy uses its own stream
161161

162-
tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,))
162+
tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,))
163163

164164
n_tiles = N // TILE_SIZE
165165
config = LaunchConfig(grid=n_tiles, block=TILE_SIZE)

cuda_core/examples/tma_tensor_map.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
LaunchConfig,
3232
Program,
3333
ProgramOptions,
34-
TensorMapDescriptor,
34+
StridedMemoryView,
3535
launch,
3636
)
3737

@@ -48,8 +48,6 @@
4848
sys.exit(0)
4949
dev.set_current()
5050

51-
arch_str = "".join(f"{i}" for i in arch)
52-
5351
# ---------------------------------------------------------------------------
5452
# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then
5553
# copies the tile to an output buffer so we can verify correctness.
@@ -141,7 +139,7 @@
141139
prog = Program(
142140
code,
143141
code_type="c++",
144-
options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"),
142+
options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"),
145143
)
146144
mod = prog.compile("cubin")
147145
ker = mod.get_kernel("tma_copy")
@@ -155,11 +153,10 @@
155153
dev.sync() # cupy uses its own stream
156154

157155
# ---------------------------------------------------------------------------
158-
# 2) Create a TMA tiled descriptor
159-
# from_tiled() accepts any DLPack / __cuda_array_interface__ object.
156+
# 2) Create a TMA tiled descriptor from a StridedMemoryView.
160157
# The dtype (float32) is inferred automatically from the CuPy array.
161158
# ---------------------------------------------------------------------------
162-
tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,))
159+
tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,))
163160

164161
# ---------------------------------------------------------------------------
165162
# 3) Launch the kernel

cuda_core/tests/test_tensor_map.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66

77
from cuda.core import (
88
Device,
9-
TensorMapDataType,
9+
StridedMemoryView,
1010
TensorMapDescriptor,
11+
)
12+
from cuda.core._tensor_map import (
13+
TensorMapDataType,
1114
TensorMapIm2ColWideMode,
1215
TensorMapInterleave,
1316
TensorMapL2Promotion,
@@ -107,6 +110,16 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma):
107110
)
108111
assert desc is not None
109112

113+
def test_strided_memory_view_as_tensor_map(self, dev, skip_if_no_tma):
114+
buf = dev.allocate(64 * 64 * 4)
115+
tensor = _DeviceArray(buf, (64, 64))
116+
view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
117+
desc = view.as_tensor_map(
118+
box_dim=(32, 32),
119+
data_type=TensorMapDataType.FLOAT32,
120+
)
121+
assert desc is not None
122+
110123
def test_from_tiled_3d(self, dev, skip_if_no_tma):
111124
buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32
112125
tensor = _DeviceArray(buf, (16, 16, 16))

0 commit comments

Comments (0)