Skip to content

Commit eef1c7a

Browse files
committed
Handle TensorMap device validation by DLPack type
Reject CUDA device-local tensors from a different GPU while still allowing CUDA host and managed memory. Add regression tests for descriptor creation, replace_address, and the shared validation helper.
1 parent 5a0e141 commit eef1c7a

2 files changed

Lines changed: 145 additions & 5 deletions

File tree

cuda_core/cuda/core/_tensor_map.pyx

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,19 @@ def _get_validated_view(tensor):
263263
return view
264264

265265

266+
def _require_view_device(view, expected_device_id, operation):
267+
"""Ensure device-local tensors match the current CUDA device.
268+
269+
DLPack reports host/managed CUDA memory as ``kDLCUDAHost`` /
270+
``kDLCUDAManaged`` with ``device_id=0`` regardless of the current device,
271+
so only true ``kDLCUDA`` tensors are rejected by device-id mismatch.
272+
"""
273+
device_type, device_id = view.__dlpack_device__()
274+
if device_type == _kDLCUDA and device_id != expected_device_id:
275+
raise ValueError(
276+
f"{operation} expects tensor on device {expected_device_id}, got {device_id}")
277+
278+
266279
cdef inline intptr_t _get_current_context_ptr() except? 0:
267280
cdef cydriver.CUcontext ctx
268281
with nogil:
@@ -406,6 +419,7 @@ cdef class TensorMapDescriptor:
406419
desc._view_ref = view
407420
desc._context = _get_current_context_ptr()
408421
desc._device_id = _get_current_device_id()
422+
_require_view_device(view, desc._device_id, "TensorMapDescriptor.from_tiled")
409423

410424
tma_dt = _resolve_data_type(view, data_type)
411425
cdef int c_data_type_int = int(tma_dt)
@@ -447,6 +461,8 @@ cdef class TensorMapDescriptor:
447461
cdef int i_cccl
448462
cdef int device_type
449463
cdef int c_device_id
464+
cdef int dl_device_type
465+
cdef int dl_device_id
450466
cdef int c_cccl_interleave_int
451467
cdef int c_cccl_swizzle_int
452468
cdef int c_cccl_l2_promotion_int
@@ -471,8 +487,9 @@ cdef class TensorMapDescriptor:
471487
if elem_strides_provided:
472488
c_elem_strides_ptr = &c_elem_strides[0]
473489

474-
device_type = <int>_kDLCUDA
475-
c_device_id = <int>view.device_id
490+
dl_device_type, dl_device_id = view.__dlpack_device__()
491+
device_type = dl_device_type
492+
c_device_id = dl_device_id
476493
c_cccl_interleave_int = int(interleave)
477494
c_cccl_swizzle_int = int(swizzle)
478495
c_cccl_l2_promotion_int = int(l2_promotion)
@@ -635,6 +652,7 @@ cdef class TensorMapDescriptor:
635652
desc._view_ref = view
636653
desc._context = _get_current_context_ptr()
637654
desc._device_id = _get_current_device_id()
655+
_require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col")
638656

639657
tma_dt = _resolve_data_type(view, data_type)
640658
cdef int c_data_type_int = int(tma_dt)
@@ -794,6 +812,7 @@ cdef class TensorMapDescriptor:
794812
desc._view_ref = view
795813
desc._context = _get_current_context_ptr()
796814
desc._device_id = _get_current_device_id()
815+
_require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col_wide")
797816

798817
tma_dt = _resolve_data_type(view, data_type)
799818
cdef int c_data_type_int = int(tma_dt)
@@ -885,9 +904,7 @@ cdef class TensorMapDescriptor:
885904
"""
886905
self._check_context_compat()
887906
view = _get_validated_view(tensor)
888-
if view.device_id != self._device_id:
889-
raise ValueError(
890-
f"replace_address expects tensor on device {self._device_id}, got {view.device_id}")
907+
_require_view_device(view, self._device_id, "replace_address")
891908

892909
cdef intptr_t global_address = view.ptr
893910

cuda_core/tests/test_tensor_map.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,23 @@
44
import numpy as np
55
import pytest
66

7+
from conftest import create_managed_memory_resource_or_skip, skip_if_managed_memory_unsupported
78
from cuda.core import (
89
Device,
10+
ManagedMemoryResourceOptions,
911
StridedMemoryView,
1012
TensorMapDescriptor,
13+
system,
1114
)
15+
from cuda.core._dlpack import DLDeviceType
1216
from cuda.core._tensor_map import (
1317
TensorMapDataType,
1418
TensorMapIm2ColWideMode,
1519
TensorMapInterleave,
1620
TensorMapL2Promotion,
1721
TensorMapOOBFill,
1822
TensorMapSwizzle,
23+
_require_view_device,
1924
)
2025

2126

@@ -48,6 +53,15 @@ def __init__(self, buf, shape, dtype=np.float32):
4853
}
4954

5055

56+
class _MockTensorMapView:
57+
def __init__(self, device_type, device_id):
58+
self._device_type = device_type
59+
self._device_id = device_id
60+
61+
def __dlpack_device__(self):
62+
return (self._device_type, self._device_id)
63+
64+
5165
class TestTensorMapEnums:
5266
"""Test that enum wrappers expose the expected values."""
5367

@@ -323,6 +337,115 @@ def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma):
323337
with pytest.raises(ValueError, match="device-accessible"):
324338
desc.replace_address(host_arr)
325339

340+
def test_replace_address_rejects_tensor_from_other_device(self, dev, skip_if_no_tma):
341+
if system.get_num_devices() < 2:
342+
pytest.skip("requires multi-GPU")
343+
344+
dev0 = dev
345+
dev1 = Device(1)
346+
347+
dev0.set_current()
348+
buf0 = dev0.allocate(1024 * 4)
349+
desc = TensorMapDescriptor.from_tiled(
350+
buf0,
351+
box_dim=(64,),
352+
data_type=TensorMapDataType.FLOAT32,
353+
)
354+
355+
dev1.set_current()
356+
buf1 = dev1.allocate(1024 * 4)
357+
dev0.set_current()
358+
359+
with pytest.raises(ValueError, match=r"replace_address expects tensor on device 0, got 1"):
360+
desc.replace_address(buf1)
361+
362+
def test_replace_address_accepts_managed_buffer_on_nonzero_device(self, init_cuda):
363+
if system.get_num_devices() < 2:
364+
pytest.skip("requires multi-GPU")
365+
366+
dev1 = Device(1)
367+
if not dev1.properties.tensor_map_access_supported:
368+
pytest.skip("Device does not support TMA (requires compute capability 9.0+)")
369+
skip_if_managed_memory_unsupported(dev1)
370+
371+
dev1.set_current()
372+
desc = TensorMapDescriptor.from_tiled(
373+
dev1.allocate(1024 * 4),
374+
box_dim=(64,),
375+
data_type=TensorMapDataType.FLOAT32,
376+
)
377+
378+
mr = create_managed_memory_resource_or_skip(
379+
ManagedMemoryResourceOptions(preferred_location=dev1.device_id)
380+
)
381+
managed_buf = mr.allocate(1024 * 4)
382+
383+
desc.replace_address(managed_buf)
384+
385+
386+
class TestTensorMapMultiDeviceValidation:
387+
"""Test multi-device validation for descriptor creation."""
388+
389+
def test_from_tiled_rejects_tensor_from_other_device(self, init_cuda):
390+
if system.get_num_devices() < 2:
391+
pytest.skip("requires multi-GPU")
392+
393+
dev0 = Device(0)
394+
dev1 = Device(1)
395+
396+
dev1.set_current()
397+
buf1 = dev1.allocate(1024 * 4)
398+
dev0.set_current()
399+
400+
with pytest.raises(
401+
ValueError,
402+
match=r"TensorMapDescriptor\.from_tiled expects tensor on device 0, got 1",
403+
):
404+
TensorMapDescriptor.from_tiled(
405+
buf1,
406+
box_dim=(64,),
407+
data_type=TensorMapDataType.FLOAT32,
408+
)
409+
410+
def test_from_tiled_accepts_managed_buffer_on_nonzero_device(self, init_cuda):
411+
if system.get_num_devices() < 2:
412+
pytest.skip("requires multi-GPU")
413+
414+
dev1 = Device(1)
415+
if not dev1.properties.tensor_map_access_supported:
416+
pytest.skip("Device does not support TMA (requires compute capability 9.0+)")
417+
skip_if_managed_memory_unsupported(dev1)
418+
419+
dev1.set_current()
420+
mr = create_managed_memory_resource_or_skip(
421+
ManagedMemoryResourceOptions(preferred_location=dev1.device_id)
422+
)
423+
managed_buf = mr.allocate(1024 * 4)
424+
425+
desc = TensorMapDescriptor.from_tiled(
426+
managed_buf,
427+
box_dim=(64,),
428+
data_type=TensorMapDataType.FLOAT32,
429+
)
430+
assert desc is not None
431+
432+
433+
class TestTensorMapDeviceValidation:
434+
"""Test device validation behavior for tensor-map-compatible views."""
435+
436+
def test_require_view_device_accepts_same_cuda_device(self):
437+
_require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDA, 1), 1, "op")
438+
439+
def test_require_view_device_rejects_different_cuda_device(self):
440+
with pytest.raises(ValueError, match=r"op expects tensor on device 0, got 1"):
441+
_require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDA, 1), 0, "op")
442+
443+
def test_require_view_device_allows_cuda_host_memory(self):
444+
_require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDAHost, 0), 1, "op")
445+
446+
def test_require_view_device_allows_cuda_managed_memory(self):
447+
_require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDAManaged, 0), 1, "op")
448+
326449

327450
class TestTensorMapIm2col:
328451
"""Test im2col TMA descriptor creation."""

0 commit comments

Comments (0)