Skip to content

Commit 5985ee1

Browse files
authored
Allow StridedMemoryView to be constructed from dlpacks type code (#1623)
* Add ml_dtypes.bfloat16 dlpack code parsing * add pytorch tensor tests * skip if cupy major version is less than 14 * add documentation * move ml_dtypes dependency to test-cuxx * trim doc for brevity and fix display error * skip test when cupy is not present; install ml-dtypes to ft test environment --------- Co-authored-by: Michael Wang <isVoid@users.noreply.github.com>
1 parent 9f4c750 commit 5985ee1

4 files changed

Lines changed: 130 additions & 9 deletions

File tree

cuda_core/cuda/core/_memoryview.pyx

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

@@ -27,6 +27,12 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
2727

2828
from cuda.core._memory import Buffer
2929

30+
31+
try:
32+
from ml_dtypes import bfloat16
33+
except ImportError:
34+
bfloat16 = None
35+
3036
# TODO(leofang): support NumPy structured dtypes
3137

3238

@@ -332,6 +338,11 @@ cdef class StridedMemoryView:
332338
def dtype(self) -> numpy.dtype | None:
333339
"""
334340
Data type of the tensor.
341+
342+
Supports standard NumPy dtypes as well as narrow data types (e.g., ``bfloat16``)
343+
when the optional `ml_dtypes <https://github.com/jax-ml/ml_dtypes>`_ package is
344+
installed. If ``ml_dtypes`` is not available and such a tensor is encountered,
345+
a :obj:`NotImplementedError` will be raised.
335346
"""
336347
return self.get_dtype()
337348

@@ -555,8 +566,13 @@ cdef object dtype_dlpack_to_numpy(DLDataType* dtype):
555566
else:
556567
raise TypeError(f'{bits}-bit bool is not supported')
557568
elif dtype.code == kDLBfloat:
558-
# TODO(leofang): use ml_dtype.bfloat16?
559-
raise NotImplementedError('bfloat is not supported yet')
569+
if bfloat16 is not None:
570+
np_dtype = numpy.dtype("bfloat16")
571+
else:
572+
raise NotImplementedError(
573+
'Support for bfloat16 within cuda-core requires `ml_dtypes` '
574+
'to be installed.'
575+
)
560576
else:
561577
raise TypeError('Unsupported dtype. dtype code: {}'.format(dtype.code))
562578

cuda_core/docs/source/interoperability.rst

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
.. SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
.. SPDX-License-Identifier: Apache-2.0
33
44
.. currentmodule:: cuda.core
@@ -79,6 +79,16 @@ array libraries.
7979
The :attr:`~utils.StridedMemoryView.is_device_accessible` attribute can be used to check
8080
whether or not the underlying buffer can be accessed on GPU.
8181

82+
The :class:`~utils.StridedMemoryView` class supports narrow data types (e.g., ``bfloat16``) when the optional
83+
`ml_dtypes <https://github.com/jax-ml/ml_dtypes>`_ package is installed. This enables interoperability with libraries that use
84+
narrow dtype tensors, such as PyTorch with ``torch.bfloat16`` or CuPy with ``"bfloat16"`` dtype.
85+
If ``ml_dtypes`` is not available and such a tensor is encountered, a
86+
:obj:`NotImplementedError` will be raised.
87+
88+
Currently supported narrow data types:
89+
90+
* ``bfloat16``
91+
8292
.. rubric:: Footnotes
8393

8494
.. [1] https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html

cuda_core/pyproject.toml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,13 @@ cu13 = ["cuda-bindings[all]==13.*"]
5656

5757
[dependency-groups]
5858
test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-randomly", "pytest-repeat"]
59-
test-cu12 = ["cuda-core[test]", "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy
60-
test-cu13 = ["cuda-core[test]", "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy
59+
ml-dtypes = ["ml-dtypes>=0.5.4,<0.6.0"]
60+
test-cu12 = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy
61+
test-cu13 = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy
6162
# free threaded build, cupy doesn't support free-threaded builds yet, so avoid installing it for now
6263
# TODO: cupy should support free threaded builds
63-
test-cu12-ft = ["cuda-core[test]", "cuda-toolkit[cudart]==12.*"]
64-
test-cu13-ft = ["cuda-core[test]", "cuda-toolkit[cudart]==13.*"]
64+
test-cu12-ft = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cuda-toolkit[cudart]==12.*"]
65+
test-cu13-ft = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cuda-toolkit[cudart]==13.*"]
6566

6667
[project.urls]
6768
homepage = "https://nvidia.github.io/cuda-python/"

cuda_core/tests/test_utils.py

Lines changed: 95 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
#
33
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
44

55
import math
66

7+
# TODO: replace optional imports with pytest.importorskip
78
try:
89
import cupy as cp
910
except ImportError:
@@ -12,7 +13,12 @@
1213
from numba import cuda as numba_cuda
1314
except ImportError:
1415
numba_cuda = None
16+
try:
17+
import torch
18+
except ImportError:
19+
torch = None
1520
import cuda.core
21+
import ml_dtypes
1622
import numpy as np
1723
import pytest
1824
from cuda.core import Device
@@ -21,6 +27,12 @@
2127
from pytest import param
2228

2329

30+
def _get_cupy_version_major() -> int | None:
31+
if cp is None:
32+
return None
33+
return int(cp.__version__.split(".")[0])
34+
35+
2436
def test_cast_to_3_tuple_success():
2537
c3t = cuda.core._utils.cuda_utils.cast_to_3_tuple
2638
assert c3t("", ()) == (1, 1, 1)
@@ -524,3 +536,85 @@ def test_from_array_interface_unsupported_strides(init_cuda):
524536
with pytest.raises(ValueError, match="strides must be divisible by itemsize"):
525537
# TODO: ideally this would raise on construction
526538
smv.strides # noqa: B018
539+
540+
541+
@pytest.mark.parametrize(
542+
"slices",
543+
[
544+
param((slice(None), slice(None)), id="contiguous"),
545+
param((slice(None, None, 2), slice(1, None, 2)), id="strided"),
546+
],
547+
)
548+
@pytest.mark.skipif(cp is None, reason="CuPy is not installed")
549+
@pytest.mark.skipif(cp is not None and _get_cupy_version_major() < 14, reason="CuPy version is less than 14.0.0")
550+
def test_ml_dtypes_bfloat16_dlpack(init_cuda, slices):
551+
a = cp.array([1, 2, 3, 4, 5, 6], dtype=ml_dtypes.bfloat16).reshape(2, 3)[slices]
552+
smv = StridedMemoryView.from_dlpack(a, stream_ptr=0)
553+
554+
assert smv.size == a.size
555+
assert smv.dtype == np.dtype("bfloat16")
556+
assert smv.dtype == np.dtype(ml_dtypes.bfloat16)
557+
assert smv.shape == a.shape
558+
assert smv.ptr == a.data.ptr
559+
assert smv.device_id == init_cuda.device_id
560+
assert smv.is_device_accessible is True
561+
assert smv.exporting_obj is a
562+
assert smv.readonly is a.__cuda_array_interface__["data"][1]
563+
564+
strides_in_counts = convert_strides_to_counts(a.strides, a.dtype.itemsize)
565+
if a.flags["C_CONTIGUOUS"]:
566+
assert smv.strides in (None, strides_in_counts)
567+
else:
568+
assert smv.strides == strides_in_counts
569+
570+
571+
@pytest.mark.parametrize(
572+
"slices",
573+
[
574+
param((slice(None), slice(None)), id="contiguous"),
575+
param((slice(None, None, 2), slice(1, None, 2)), id="strided"),
576+
],
577+
)
578+
@pytest.mark.skipif(torch is None, reason="PyTorch is not installed")
579+
def test_ml_dtypes_bfloat16_torch_dlpack(init_cuda, slices):
580+
a = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.bfloat16, device="cuda").reshape(2, 3)[slices]
581+
smv = StridedMemoryView.from_dlpack(a, stream_ptr=0)
582+
583+
assert smv.size == a.numel()
584+
assert smv.dtype == np.dtype("bfloat16")
585+
assert smv.dtype == np.dtype(ml_dtypes.bfloat16)
586+
assert smv.shape == tuple(a.shape)
587+
assert smv.ptr == a.data_ptr()
588+
assert smv.device_id == init_cuda.device_id
589+
assert smv.is_device_accessible is True
590+
assert smv.exporting_obj is a
591+
592+
# PyTorch stride() returns strides in elements, convert to bytes first
593+
strides_in_bytes = tuple(s * a.element_size() for s in a.stride())
594+
strides_in_counts = convert_strides_to_counts(strides_in_bytes, a.element_size())
595+
if a.is_contiguous():
596+
assert smv.strides in (None, strides_in_counts)
597+
else:
598+
assert smv.strides == strides_in_counts
599+
600+
601+
@pytest.fixture
602+
def no_ml_dtypes(monkeypatch):
603+
monkeypatch.setattr("cuda.core._memoryview.bfloat16", None)
604+
yield
605+
606+
607+
@pytest.mark.parametrize(
608+
"api",
609+
[
610+
param(StridedMemoryView.from_dlpack, id="from_dlpack"),
611+
param(StridedMemoryView.from_any_interface, id="from_any_interface"),
612+
],
613+
)
614+
@pytest.mark.skipif(cp is None, reason="CuPy is not installed")
615+
@pytest.mark.skipif(cp is not None and _get_cupy_version_major() < 14, reason="CuPy version is less than 14.0.0")
616+
def test_ml_dtypes_bfloat16_dlpack_requires_ml_dtypes(init_cuda, no_ml_dtypes, api):
617+
a = cp.array([1, 2, 3], dtype="bfloat16")
618+
smv = api(a, stream_ptr=0)
619+
with pytest.raises(NotImplementedError, match=r"requires `ml_dtypes`"):
620+
smv.dtype # noqa: B018

0 commit comments

Comments
 (0)