|
4 | 4 | import numpy as np |
5 | 5 | import pytest |
6 | 6 |
|
| 7 | +from conftest import create_managed_memory_resource_or_skip, skip_if_managed_memory_unsupported |
7 | 8 | from cuda.core import ( |
8 | 9 | Device, |
| 10 | + ManagedMemoryResourceOptions, |
9 | 11 | StridedMemoryView, |
10 | 12 | TensorMapDescriptor, |
| 13 | + system, |
11 | 14 | ) |
| 15 | +from cuda.core._dlpack import DLDeviceType |
12 | 16 | from cuda.core._tensor_map import ( |
13 | 17 | TensorMapDataType, |
14 | 18 | TensorMapIm2ColWideMode, |
15 | 19 | TensorMapInterleave, |
16 | 20 | TensorMapL2Promotion, |
17 | 21 | TensorMapOOBFill, |
18 | 22 | TensorMapSwizzle, |
| 23 | + _require_view_device, |
19 | 24 | ) |
20 | 25 |
|
21 | 26 |
|
@@ -48,6 +53,15 @@ def __init__(self, buf, shape, dtype=np.float32): |
48 | 53 | } |
49 | 54 |
|
50 | 55 |
|
| 56 | +class _MockTensorMapView: |
| 57 | + def __init__(self, device_type, device_id): |
| 58 | + self._device_type = device_type |
| 59 | + self._device_id = device_id |
| 60 | + |
| 61 | + def __dlpack_device__(self): |
| 62 | + return (self._device_type, self._device_id) |
| 63 | + |
| 64 | + |
51 | 65 | class TestTensorMapEnums: |
52 | 66 | """Test that enum wrappers expose the expected values.""" |
53 | 67 |
|
@@ -323,6 +337,115 @@ def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): |
323 | 337 | with pytest.raises(ValueError, match="device-accessible"): |
324 | 338 | desc.replace_address(host_arr) |
325 | 339 |
|
| 340 | + def test_replace_address_rejects_tensor_from_other_device(self, dev, skip_if_no_tma): |
| 341 | + if system.get_num_devices() < 2: |
| 342 | + pytest.skip("requires multi-GPU") |
| 343 | + |
| 344 | + dev0 = dev |
| 345 | + dev1 = Device(1) |
| 346 | + |
| 347 | + dev0.set_current() |
| 348 | + buf0 = dev0.allocate(1024 * 4) |
| 349 | + desc = TensorMapDescriptor.from_tiled( |
| 350 | + buf0, |
| 351 | + box_dim=(64,), |
| 352 | + data_type=TensorMapDataType.FLOAT32, |
| 353 | + ) |
| 354 | + |
| 355 | + dev1.set_current() |
| 356 | + buf1 = dev1.allocate(1024 * 4) |
| 357 | + dev0.set_current() |
| 358 | + |
| 359 | + with pytest.raises(ValueError, match=r"replace_address expects tensor on device 0, got 1"): |
| 360 | + desc.replace_address(buf1) |
| 361 | + |
| 362 | + def test_replace_address_accepts_managed_buffer_on_nonzero_device(self, init_cuda): |
| 363 | + if system.get_num_devices() < 2: |
| 364 | + pytest.skip("requires multi-GPU") |
| 365 | + |
| 366 | + dev1 = Device(1) |
| 367 | + if not dev1.properties.tensor_map_access_supported: |
| 368 | + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") |
| 369 | + skip_if_managed_memory_unsupported(dev1) |
| 370 | + |
| 371 | + dev1.set_current() |
| 372 | + desc = TensorMapDescriptor.from_tiled( |
| 373 | + dev1.allocate(1024 * 4), |
| 374 | + box_dim=(64,), |
| 375 | + data_type=TensorMapDataType.FLOAT32, |
| 376 | + ) |
| 377 | + |
| 378 | + mr = create_managed_memory_resource_or_skip( |
| 379 | + ManagedMemoryResourceOptions(preferred_location=dev1.device_id) |
| 380 | + ) |
| 381 | + managed_buf = mr.allocate(1024 * 4) |
| 382 | + |
| 383 | + desc.replace_address(managed_buf) |
| 384 | + |
| 385 | + |
class TestTensorMapMultiDeviceValidation:
    """Test multi-device validation for descriptor creation."""

    def test_from_tiled_rejects_tensor_from_other_device(self, init_cuda):
        """from_tiled must refuse a device buffer that lives on another GPU."""
        if system.get_num_devices() < 2:
            pytest.skip("requires multi-GPU")

        first, second = Device(0), Device(1)

        # Allocate on device 1, then make device 0 current so the mismatch is
        # between the tensor's device and the current device.
        second.set_current()
        foreign_buf = second.allocate(1024 * 4)
        first.set_current()

        expected = r"TensorMapDescriptor\.from_tiled expects tensor on device 0, got 1"
        with pytest.raises(ValueError, match=expected):
            TensorMapDescriptor.from_tiled(
                foreign_buf,
                box_dim=(64,),
                data_type=TensorMapDataType.FLOAT32,
            )

    def test_from_tiled_accepts_managed_buffer_on_nonzero_device(self, init_cuda):
        """from_tiled must accept a managed (unified) buffer even when the
        current device is non-zero, since managed memory is device-agnostic."""
        if system.get_num_devices() < 2:
            pytest.skip("requires multi-GPU")

        device = Device(1)
        if not device.properties.tensor_map_access_supported:
            pytest.skip("Device does not support TMA (requires compute capability 9.0+)")
        skip_if_managed_memory_unsupported(device)

        device.set_current()
        resource = create_managed_memory_resource_or_skip(
            ManagedMemoryResourceOptions(preferred_location=device.device_id)
        )

        descriptor = TensorMapDescriptor.from_tiled(
            resource.allocate(1024 * 4),
            box_dim=(64,),
            data_type=TensorMapDataType.FLOAT32,
        )
        assert descriptor is not None
| 432 | + |
class TestTensorMapDeviceValidation:
    """Test device validation behavior for tensor-map-compatible views."""

    def test_require_view_device_accepts_same_cuda_device(self):
        # Matching device ids on plain CUDA memory: no exception expected.
        view = _MockTensorMapView(DLDeviceType.kDLCUDA, 1)
        _require_view_device(view, 1, "op")

    def test_require_view_device_rejects_different_cuda_device(self):
        # Mismatched device ids on plain CUDA memory must raise ValueError.
        view = _MockTensorMapView(DLDeviceType.kDLCUDA, 1)
        with pytest.raises(ValueError, match=r"op expects tensor on device 0, got 1"):
            _require_view_device(view, 0, "op")

    def test_require_view_device_allows_cuda_host_memory(self):
        # Pinned host memory is device-agnostic; the id comparison is skipped.
        _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDAHost, 0), 1, "op")

    def test_require_view_device_allows_cuda_managed_memory(self):
        # Managed memory is likewise exempt from the device-id comparison.
        _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDAManaged, 0), 1, "op")
| 448 | + |
326 | 449 |
|
327 | 450 | class TestTensorMapIm2col: |
328 | 451 | """Test im2col TMA descriptor creation.""" |
|
0 commit comments