unified?

brandonros · brandonros · commit 0673be2c4cf6 · 2026-04-14T23:52:04.000-04:00
diff --git a/crates/cust/build.rs b/crates/cust/build.rs
@@ -40,4 +40,12 @@ fn main() {
         println!("cargo::rustc-cfg=cuGraphGetEdges_v2");
         println!("cargo::rustc-cfg=cuCtxCreate_v4");
     }
+
+    // In CUDA 13.2 the `id` field in `CUmemLocation_st` was placed inside an anonymous union.
+    // Bindgen renders this as `__bindgen_anon_1: CUmemLocation_st__bindgen_ty_1` instead of a
+    // direct `id` field. This cfg gates the struct initialization syntax accordingly.
+    println!("cargo::rustc-check-cfg=cfg(cuMemLocation_anon_id)");
+    if driver_version >= 13020 {
+        println!("cargo::rustc-cfg=cuMemLocation_anon_id");
+    }
 }
diff --git a/crates/cust/src/memory/unified.rs b/crates/cust/src/memory/unified.rs
@@ -19,29 +19,6 @@ use crate::memory::UnifiedPointer;
 use crate::memory::malloc::{cuda_free_unified, cuda_malloc_unified};
 use crate::prelude::Stream;
 
-#[cfg(any(cuMemPrefetchAsync_v2, cuMemAdvise_v2))]
-unsafe fn cu_mem_location(
-    type_: driver_sys::CUmemLocationType,
-    id: std::os::raw::c_int,
-) -> driver_sys::CUmemLocation {
-    let mut location = std::mem::MaybeUninit::<driver_sys::CUmemLocation>::zeroed();
-    let location_ptr = location.as_mut_ptr();
-
-    // Support both older bindgen output (`{ type_, id }`) and the newer
-    // anonymous-union layout emitted from CUDA 13.2 headers.
-    unsafe {
-        (*location_ptr).type_ = type_;
-        std::ptr::write(
-            (location_ptr.cast::<u8>())
-                .add(std::mem::size_of::<driver_sys::CUmemLocationType>())
-                .cast::<std::os::raw::c_int>(),
-            id,
-        );
-
-        location.assume_init()
-    }
-}
-
 /// A pointer type for heap-allocation in CUDA unified memory.
 ///
 /// See the [`module-level documentation`](../memory/index.html) for more information on unified
@@ -663,13 +640,20 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
         let mem_size = std::mem::size_of_val(slice);
 
         unsafe {
+            let id = -1; // -1 is CU_DEVICE_CPU; v2 targets the host via the location type
             driver_sys::cuMemPrefetchAsync(
                 slice.as_ptr() as driver_sys::CUdeviceptr,
                 mem_size,
                 #[cfg(cuMemPrefetchAsync_v2)]
-                cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
+                driver_sys::CUmemLocation {
+                    type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST,
+                    #[cfg(cuMemLocation_anon_id)]
+                    __bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
+                    #[cfg(not(cuMemLocation_anon_id))]
+                    id,
+                },
                 #[cfg(not(cuMemPrefetchAsync_v2))]
-                -1, // -1 is CU_DEVICE_CPU
+                id,
                 #[cfg(cuMemPrefetchAsync_v2)]
                 0, // flags for future use, must be 0 as of CUDA 13.0
                 stream.as_inner(),
@@ -710,7 +694,13 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
                 slice.as_ptr() as driver_sys::CUdeviceptr,
                 mem_size,
                 #[cfg(cuMemPrefetchAsync_v2)]
-                cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE, id),
+                driver_sys::CUmemLocation {
+                    type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
+                    #[cfg(cuMemLocation_anon_id)]
+                    __bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
+                    #[cfg(not(cuMemLocation_anon_id))]
+                    id,
+                },
                 #[cfg(not(cuMemPrefetchAsync_v2))]
                 id,
                 #[cfg(cuMemPrefetchAsync_v2)]
@@ -743,14 +733,21 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
         };
 
         unsafe {
+            let id = 0; // feeds the legacy device argument and the v2 location id alike
             driver_sys::cuMemAdvise(
                 slice.as_ptr() as driver_sys::CUdeviceptr,
                 mem_size,
                 advice,
                 #[cfg(cuMemAdvise_v2)]
-                cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
+                driver_sys::CUmemLocation {
+                    type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
+                    #[cfg(cuMemLocation_anon_id)]
+                    __bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
+                    #[cfg(not(cuMemLocation_anon_id))]
+                    id,
+                },
                 #[cfg(not(cuMemAdvise_v2))]
-                0,
+                id,
             )
             .to_result()?;
         }
@@ -787,11 +784,18 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
                 mem_size,
                 driver_sys::CUmem_advise::CU_MEM_ADVISE_SET_PREFERRED_LOCATION,
                 #[cfg(cuMemAdvise_v2)]
-                match preferred_location {
-                    Some(_) => {
-                        cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE, id)
-                    }
-                    None => cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
+                driver_sys::CUmemLocation {
+                    // `None` means "prefer the host": the v2 API encodes that in the
+                    // location type, so only `Some(_)` may use the DEVICE type.
+                    type_: if preferred_location.is_some() {
+                        driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE
+                    } else {
+                        driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST
+                    },
+                    #[cfg(cuMemLocation_anon_id)]
+                    __bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
+                    #[cfg(not(cuMemLocation_anon_id))]
+                    id,
                 },
                 #[cfg(not(cuMemAdvise_v2))]
                 id,
@@ -807,14 +811,21 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
         let mem_size = std::mem::size_of_val(slice);
 
         unsafe {
+            let id = 0;
             driver_sys::cuMemAdvise(
                 slice.as_ptr() as driver_sys::CUdeviceptr,
                 mem_size,
                 driver_sys::CUmem_advise::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
                 #[cfg(cuMemAdvise_v2)]
-                cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
+                driver_sys::CUmemLocation {
+                    type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
+                    #[cfg(cuMemLocation_anon_id)]
+                    __bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
+                    #[cfg(not(cuMemLocation_anon_id))]
+                    id,
+                },
                 #[cfg(not(cuMemAdvise_v2))]
-                0,
+                id,
             )
             .to_result()?;
         }
diff --git a/examples/vecadd/src/main.rs b/examples/vecadd/src/main.rs
@@ -7,6 +7,22 @@ const NUMBERS_LEN: usize = 100_000;
 
 static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
 
+macro_rules! step {
+    ($label:expr, $expr:expr) => {{ // `$label` names the step in the log; `$expr` must yield a `Result`
+        eprintln!("[vecadd] {} ...", $label); // announce the step before `$expr` is evaluated
+        match $expr {
+            Ok(v) => {
+                eprintln!("[vecadd] {} ok", $label);
+                v // the macro invocation evaluates to the unwrapped success value
+            }
+            Err(e) => {
+                eprintln!("[vecadd] {} FAILED: {:?}", $label, e);
+                return Err(e.into()); // returns from the *calling* function, not just this block
+            }
+        }
+    }};
+}
+
 fn main() -> Result<(), Box<dyn Error>> {
     // generate our random vectors.
     let mut wyrand = WyRand::new();
@@ -15,59 +31,80 @@ fn main() -> Result<(), Box<dyn Error>> {
     let mut rhs = vec![0.0f32; NUMBERS_LEN];
     wyrand.fill(&mut rhs);
 
-    // initialize CUDA, this will pick the first available device and will
-    // make a CUDA context from it.
-    // We don't need the context for anything but it must be kept alive.
-    let _ctx = cust::quick_init()?;
-
-    // Make the CUDA module, modules just house the GPU code for the kernels we created.
-    // they can be made from PTX code, cubins, or fatbins.
-    let module = Module::from_ptx(PTX, &[])?;
-
-    // make a CUDA stream to issue calls to. You can think of this as an OS thread but for dispatching
-    // GPU calls.
-    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
-
-    // allocate the GPU memory needed to house our numbers and copy them over.
-    let lhs_gpu = lhs.as_slice().as_dbuf()?;
-    let rhs_gpu = rhs.as_slice().as_dbuf()?;
+    let _ctx = step!("cust::quick_init", cust::quick_init());
+
+    let (driver_major, driver_minor) = step!(
+        "CudaApiVersion::get",
+        cust::CudaApiVersion::get().map(|v| (v.major(), v.minor()))
+    );
+    eprintln!("[vecadd] CUDA driver API version: {driver_major}.{driver_minor}");
+
+    let device = step!("Device::get_device(0)", cust::device::Device::get_device(0));
+    let cc_major = step!(
+        "Device::get_attribute(ComputeCapabilityMajor)",
+        device.get_attribute(cust::device::DeviceAttribute::ComputeCapabilityMajor)
+    );
+    let cc_minor = step!(
+        "Device::get_attribute(ComputeCapabilityMinor)",
+        device.get_attribute(cust::device::DeviceAttribute::ComputeCapabilityMinor)
+    );
+    let name = step!("Device::name", device.name());
+    eprintln!("[vecadd] GPU: {name} (compute {cc_major}.{cc_minor})");
+
+    eprintln!("[vecadd] PTX size: {} bytes", PTX.len());
+    eprintln!(
+        "[vecadd] PTX header: {}",
+        PTX.lines().take(10).collect::<Vec<_>>().join(" | ")
+    );
+
+    let module = step!("Module::from_ptx", Module::from_ptx(PTX, &[]));
+
+    let stream = step!(
+        "Stream::new",
+        Stream::new(StreamFlags::NON_BLOCKING, None)
+    );
+
+    let lhs_gpu = step!("DeviceBuffer::from lhs", lhs.as_slice().as_dbuf());
+    let rhs_gpu = step!("DeviceBuffer::from rhs", rhs.as_slice().as_dbuf());
 
-    // allocate our output buffer. You could also use DeviceBuffer::uninitialized() to avoid the
-    // cost of the copy, but you need to be careful not to read from the buffer.
     let mut out = vec![0.0f32; NUMBERS_LEN];
-    let out_buf = out.as_slice().as_dbuf()?;
+    let out_buf = step!("DeviceBuffer::from out", out.as_slice().as_dbuf());
 
-    // retrieve the `vecadd` kernel from the module so we can calculate the right launch config.
-    let vecadd = module.get_function("vecadd")?;
+    let vecadd = step!(
+        "Module::get_function(\"vecadd\")",
+        module.get_function("vecadd")
+    );
 
-    // use the CUDA occupancy API to find an optimal launch configuration for the grid and block size.
-    // This will try to maximize how much of the GPU is used by finding the best launch configuration for the
-    // current CUDA device/architecture.
-    let (_, block_size) = vecadd.suggested_launch_configuration(0, 0.into())?;
+    let (_, block_size) = step!(
+        "suggested_launch_configuration",
+        vecadd.suggested_launch_configuration(0, 0.into())
+    );
 
     let grid_size = (NUMBERS_LEN as u32).div_ceil(block_size);
 
     println!("using {grid_size} blocks and {block_size} threads per block");
 
-    // Actually launch the GPU kernel. This will queue up the launch on the stream, it will
-    // not block the thread until the kernel is finished.
+    eprintln!("[vecadd] launching kernel ...");
     unsafe {
         launch!(
-            // slices are passed as two parameters, the pointer and the length.
             vecadd<<<grid_size, block_size, 0, stream>>>(
                 lhs_gpu.as_device_ptr(),
                 lhs_gpu.len(),
                 rhs_gpu.as_device_ptr(),
                 rhs_gpu.len(),
                 out_buf.as_device_ptr(),
             )
-        )?;
+        )
+        .map_err(|e| {
+            eprintln!("[vecadd] launch FAILED: {e:?}");
+            e
+        })?;
     }
+    eprintln!("[vecadd] launch queued ok");
 
-    stream.synchronize()?;
+    step!("stream.synchronize", stream.synchronize());
 
-    // copy back the data from the GPU.
-    out_buf.copy_to(&mut out)?;
+    step!("copy_to", out_buf.copy_to(&mut out));
 
     println!("{} + {} = {}", lhs[0], rhs[0], out[0]);
 
diff --git a/llvm-19 b/llvm-19
@@ -0,0 +1 @@
+Subproject commit 2123f5cd336f2bed449e8d8d6612c4224553f2ba

Original file line number	Diff line number	Diff line change
`@@ -40,4 +40,12 @@ fn main() {`
`40`	`40`	`println!("cargo::rustc-cfg=cuGraphGetEdges_v2");`
`41`	`41`	`println!("cargo::rustc-cfg=cuCtxCreate_v4");`
`42`	`42`	`}`
	`43`	`+`
	`44`	+ // In CUDA 13.2 the `id` field in `CUmemLocation_st` was placed inside an anonymous union.
	`45`	+ // Bindgen renders this as `__bindgen_anon_1: CUmemLocation_st__bindgen_ty_1` instead of a
	`46`	+ // direct `id` field. This cfg gates the struct initialization syntax accordingly.
	`47`	`+ println!("cargo::rustc-check-cfg=cfg(cuMemLocation_anon_id)");`
	`48`	`+ if driver_version >= 13020 {`
	`49`	`+ println!("cargo::rustc-cfg=cuMemLocation_anon_id");`
	`50`	`+ }`
`43`	`51`	`}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Subproject commit 2123f5cd336f2bed449e8d8d6612c4224553f2ba`