Skip to content

Commit 0673be2

Browse files
committed
unified?
1 parent cb3c232 commit 0673be2

4 files changed

Lines changed: 118 additions & 67 deletions

File tree

crates/cust/build.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,12 @@ fn main() {
4040
println!("cargo::rustc-cfg=cuGraphGetEdges_v2");
4141
println!("cargo::rustc-cfg=cuCtxCreate_v4");
4242
}
43+
44+
// In CUDA 13.2 the `id` field in `CUmemLocation_st` was placed inside an anonymous union.
45+
// Bindgen renders this as `__bindgen_anon_1: CUmemLocation_st__bindgen_ty_1` instead of a
46+
// direct `id` field. This cfg gates the struct initialization syntax accordingly.
47+
println!("cargo::rustc-check-cfg=cfg(cuMemLocation_anon_id)");
48+
if driver_version >= 13020 {
49+
println!("cargo::rustc-cfg=cuMemLocation_anon_id");
50+
}
4351
}

crates/cust/src/memory/unified.rs

Lines changed: 40 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,29 +19,6 @@ use crate::memory::UnifiedPointer;
1919
use crate::memory::malloc::{cuda_free_unified, cuda_malloc_unified};
2020
use crate::prelude::Stream;
2121

22-
#[cfg(any(cuMemPrefetchAsync_v2, cuMemAdvise_v2))]
23-
unsafe fn cu_mem_location(
24-
type_: driver_sys::CUmemLocationType,
25-
id: std::os::raw::c_int,
26-
) -> driver_sys::CUmemLocation {
27-
let mut location = std::mem::MaybeUninit::<driver_sys::CUmemLocation>::zeroed();
28-
let location_ptr = location.as_mut_ptr();
29-
30-
// Support both older bindgen output (`{ type_, id }`) and the newer
31-
// anonymous-union layout emitted from CUDA 13.2 headers.
32-
unsafe {
33-
(*location_ptr).type_ = type_;
34-
std::ptr::write(
35-
(location_ptr.cast::<u8>())
36-
.add(std::mem::size_of::<driver_sys::CUmemLocationType>())
37-
.cast::<std::os::raw::c_int>(),
38-
id,
39-
);
40-
41-
location.assume_init()
42-
}
43-
}
44-
4522
/// A pointer type for heap-allocation in CUDA unified memory.
4623
///
4724
/// See the [`module-level documentation`](../memory/index.html) for more information on unified
@@ -663,13 +640,20 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
663640
let mem_size = std::mem::size_of_val(slice);
664641

665642
unsafe {
643+
let id = -1; // -1 is CU_DEVICE_CPU
666644
driver_sys::cuMemPrefetchAsync(
667645
slice.as_ptr() as driver_sys::CUdeviceptr,
668646
mem_size,
669647
#[cfg(cuMemPrefetchAsync_v2)]
670-
cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
648+
driver_sys::CUmemLocation {
649+
type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
650+
#[cfg(cuMemLocation_anon_id)]
651+
__bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
652+
#[cfg(not(cuMemLocation_anon_id))]
653+
id,
654+
},
671655
#[cfg(not(cuMemPrefetchAsync_v2))]
672-
-1, // -1 is CU_DEVICE_CPU
656+
id,
673657
#[cfg(cuMemPrefetchAsync_v2)]
674658
0, // flags for future use, must be 0 as of CUDA 13.0
675659
stream.as_inner(),
@@ -710,7 +694,13 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
710694
slice.as_ptr() as driver_sys::CUdeviceptr,
711695
mem_size,
712696
#[cfg(cuMemPrefetchAsync_v2)]
713-
cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE, id),
697+
driver_sys::CUmemLocation {
698+
type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
699+
#[cfg(cuMemLocation_anon_id)]
700+
__bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
701+
#[cfg(not(cuMemLocation_anon_id))]
702+
id,
703+
},
714704
#[cfg(not(cuMemPrefetchAsync_v2))]
715705
id,
716706
#[cfg(cuMemPrefetchAsync_v2)]
@@ -743,14 +733,21 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
743733
};
744734

745735
unsafe {
736+
let id = 0;
746737
driver_sys::cuMemAdvise(
747738
slice.as_ptr() as driver_sys::CUdeviceptr,
748739
mem_size,
749740
advice,
750741
#[cfg(cuMemAdvise_v2)]
751-
cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
742+
driver_sys::CUmemLocation {
743+
type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
744+
#[cfg(cuMemLocation_anon_id)]
745+
__bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
746+
#[cfg(not(cuMemLocation_anon_id))]
747+
id,
748+
},
752749
#[cfg(not(cuMemAdvise_v2))]
753-
0,
750+
id,
754751
)
755752
.to_result()?;
756753
}
@@ -787,11 +784,12 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
787784
mem_size,
788785
driver_sys::CUmem_advise::CU_MEM_ADVISE_SET_PREFERRED_LOCATION,
789786
#[cfg(cuMemAdvise_v2)]
790-
match preferred_location {
791-
Some(_) => {
792-
cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE, id)
793-
}
794-
None => cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
787+
driver_sys::CUmemLocation {
788+
type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
789+
#[cfg(cuMemLocation_anon_id)]
790+
__bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
791+
#[cfg(not(cuMemLocation_anon_id))]
792+
id,
795793
},
796794
#[cfg(not(cuMemAdvise_v2))]
797795
id,
@@ -807,14 +805,21 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
807805
let mem_size = std::mem::size_of_val(slice);
808806

809807
unsafe {
808+
let id = 0;
810809
driver_sys::cuMemAdvise(
811810
slice.as_ptr() as driver_sys::CUdeviceptr,
812811
mem_size,
813812
driver_sys::CUmem_advise::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
814813
#[cfg(cuMemAdvise_v2)]
815-
cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
814+
driver_sys::CUmemLocation {
815+
type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
816+
#[cfg(cuMemLocation_anon_id)]
817+
__bindgen_anon_1: driver_sys::CUmemLocation_st__bindgen_ty_1 { id },
818+
#[cfg(not(cuMemLocation_anon_id))]
819+
id,
820+
},
816821
#[cfg(not(cuMemAdvise_v2))]
817-
0,
822+
id,
818823
)
819824
.to_result()?;
820825
}

examples/vecadd/src/main.rs

Lines changed: 69 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,22 @@ const NUMBERS_LEN: usize = 100_000;
77

88
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
99

10+
macro_rules! step {
11+
($label:expr, $expr:expr) => {{
12+
eprintln!("[vecadd] {} ...", $label);
13+
match $expr {
14+
Ok(v) => {
15+
eprintln!("[vecadd] {} ok", $label);
16+
v
17+
}
18+
Err(e) => {
19+
eprintln!("[vecadd] {} FAILED: {:?}", $label, e);
20+
return Err(e.into());
21+
}
22+
}
23+
}};
24+
}
25+
1026
fn main() -> Result<(), Box<dyn Error>> {
1127
// generate our random vectors.
1228
let mut wyrand = WyRand::new();
@@ -15,59 +31,80 @@ fn main() -> Result<(), Box<dyn Error>> {
1531
let mut rhs = vec![0.0f32; NUMBERS_LEN];
1632
wyrand.fill(&mut rhs);
1733

18-
// initialize CUDA, this will pick the first available device and will
19-
// make a CUDA context from it.
20-
// We don't need the context for anything but it must be kept alive.
21-
let _ctx = cust::quick_init()?;
22-
23-
// Make the CUDA module, modules just house the GPU code for the kernels we created.
24-
// they can be made from PTX code, cubins, or fatbins.
25-
let module = Module::from_ptx(PTX, &[])?;
26-
27-
// make a CUDA stream to issue calls to. You can think of this as an OS thread but for dispatching
28-
// GPU calls.
29-
let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
30-
31-
// allocate the GPU memory needed to house our numbers and copy them over.
32-
let lhs_gpu = lhs.as_slice().as_dbuf()?;
33-
let rhs_gpu = rhs.as_slice().as_dbuf()?;
34+
let _ctx = step!("cust::quick_init", cust::quick_init());
35+
36+
let (driver_major, driver_minor) = step!(
37+
"CudaApiVersion::get",
38+
cust::CudaApiVersion::get().map(|v| (v.major(), v.minor()))
39+
);
40+
eprintln!("[vecadd] CUDA driver API version: {driver_major}.{driver_minor}");
41+
42+
let device = step!("Device::get_device(0)", cust::device::Device::get_device(0));
43+
let cc_major = step!(
44+
"Device::get_attribute(ComputeCapabilityMajor)",
45+
device.get_attribute(cust::device::DeviceAttribute::ComputeCapabilityMajor)
46+
);
47+
let cc_minor = step!(
48+
"Device::get_attribute(ComputeCapabilityMinor)",
49+
device.get_attribute(cust::device::DeviceAttribute::ComputeCapabilityMinor)
50+
);
51+
let name = step!("Device::name", device.name());
52+
eprintln!("[vecadd] GPU: {name} (compute {cc_major}.{cc_minor})");
53+
54+
eprintln!("[vecadd] PTX size: {} bytes", PTX.len());
55+
eprintln!(
56+
"[vecadd] PTX header: {}",
57+
PTX.lines().take(10).collect::<Vec<_>>().join(" | ")
58+
);
59+
60+
let module = step!("Module::from_ptx", Module::from_ptx(PTX, &[]));
61+
62+
let stream = step!(
63+
"Stream::new",
64+
Stream::new(StreamFlags::NON_BLOCKING, None)
65+
);
66+
67+
let lhs_gpu = step!("DeviceBuffer::from lhs", lhs.as_slice().as_dbuf());
68+
let rhs_gpu = step!("DeviceBuffer::from rhs", rhs.as_slice().as_dbuf());
3469

35-
// allocate our output buffer. You could also use DeviceBuffer::uninitialized() to avoid the
36-
// cost of the copy, but you need to be careful not to read from the buffer.
3770
let mut out = vec![0.0f32; NUMBERS_LEN];
38-
let out_buf = out.as_slice().as_dbuf()?;
71+
let out_buf = step!("DeviceBuffer::from out", out.as_slice().as_dbuf());
3972

40-
// retrieve the `vecadd` kernel from the module so we can calculate the right launch config.
41-
let vecadd = module.get_function("vecadd")?;
73+
let vecadd = step!(
74+
"Module::get_function(\"vecadd\")",
75+
module.get_function("vecadd")
76+
);
4277

43-
// use the CUDA occupancy API to find an optimal launch configuration for the grid and block size.
44-
// This will try to maximize how much of the GPU is used by finding the best launch configuration for the
45-
// current CUDA device/architecture.
46-
let (_, block_size) = vecadd.suggested_launch_configuration(0, 0.into())?;
78+
let (_, block_size) = step!(
79+
"suggested_launch_configuration",
80+
vecadd.suggested_launch_configuration(0, 0.into())
81+
);
4782

4883
let grid_size = (NUMBERS_LEN as u32).div_ceil(block_size);
4984

5085
println!("using {grid_size} blocks and {block_size} threads per block");
5186

52-
// Actually launch the GPU kernel. This will queue up the launch on the stream, it will
53-
// not block the thread until the kernel is finished.
87+
eprintln!("[vecadd] launching kernel ...");
5488
unsafe {
5589
launch!(
56-
// slices are passed as two parameters, the pointer and the length.
5790
vecadd<<<grid_size, block_size, 0, stream>>>(
5891
lhs_gpu.as_device_ptr(),
5992
lhs_gpu.len(),
6093
rhs_gpu.as_device_ptr(),
6194
rhs_gpu.len(),
6295
out_buf.as_device_ptr(),
6396
)
64-
)?;
97+
)
98+
.map_err(|e| {
99+
eprintln!("[vecadd] launch FAILED: {e:?}");
100+
e
101+
})?;
65102
}
103+
eprintln!("[vecadd] launch queued ok");
66104

67-
stream.synchronize()?;
105+
step!("stream.synchronize", stream.synchronize());
68106

69-
// copy back the data from the GPU.
70-
out_buf.copy_to(&mut out)?;
107+
step!("copy_to", out_buf.copy_to(&mut out));
71108

72109
println!("{} + {} = {}", lhs[0], rhs[0], out[0]);
73110

llvm-19

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Subproject commit 2123f5cd336f2bed449e8d8d6612c4224553f2ba

0 commit comments

Comments
 (0)