Rust-GPU
diff --git a/.gitignore b/.gitignore
Lines changed: 4 additions & 0 deletions
diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs
Lines changed: 17 additions & 1 deletion
diff --git a/crates/cust/src/memory/unified.rs b/crates/cust/src/memory/unified.rs
Lines changed: 35 additions & 25 deletions
diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs
Lines changed: 30 additions & 1 deletion
diff --git a/crates/rustc_codegen_nvvm/Cargo.toml b/crates/rustc_codegen_nvvm/Cargo.toml
Lines changed: 4 additions & 0 deletions
@@ -2,3 +2,7 @@ book
 /target
 **/.vscode
 .devcontainer
+.codex
+rustc-ice-*.txt
+.nix-driver-libs
+.claude
@@ -196,6 +196,21 @@ pub struct CudaBuilder {
     pub final_module_path: Option<PathBuf>,
 }
 
+/// Picks the architecture a freshly constructed `CudaBuilder` starts with.
+///
+/// The presence of the `LLVM_CONFIG_19` environment variable (the same signal the
+/// `rustc_codegen_nvvm` build script uses) is taken to mean the backend is being built
+/// with LLVM 19 support. In that case the default is `Compute100`, the lowest Blackwell
+/// compute capability: pre-Blackwell archs use the legacy LLVM 7 NVVM dialect, so pairing
+/// them with an LLVM 19 backend is never the right choice. Otherwise the regular
+/// `NvvmArch::default()` is returned.
+///
+/// This only selects the starting value; callers can still override it via
+/// [`CudaBuilder::arch`].
+fn default_arch() -> NvvmArch {
+    match env::var_os("LLVM_CONFIG_19") {
+        // Only the presence of the variable matters; its value is never inspected.
+        Some(_) => NvvmArch::Compute100,
+        None => NvvmArch::default(),
+    }
+}
+
 impl CudaBuilder {
     pub fn new(path_to_crate_root: impl AsRef<Path>) -> Self {
         Self {
@@ -204,7 +219,7 @@ impl CudaBuilder {
             ptx_file_copy_path: None,
             generate_line_info: true,
             nvvm_opts: true,
-            arch: NvvmArch::default(),
+            arch: default_arch(),
             ftz: false,
             fast_sqrt: false,
             fast_div: false,
@@ -355,6 +370,7 @@ impl CudaBuilder {
     /// ptx file. If [`ptx_file_copy_path`](Self::ptx_file_copy_path) is set, this returns the copied path.
     pub fn build(self) -> Result<PathBuf, CudaBuilderError> {
         println!("cargo:rerun-if-changed={}", self.path_to_crate.display());
+        println!("cargo:rerun-if-env-changed=LLVM_CONFIG_19");
         let path = invoke_rustc(&self)?;
         if let Some(copy_path) = self.ptx_file_copy_path {
             std::fs::copy(path, &copy_path).map_err(CudaBuilderError::FailedToCopyPtxFile)?;
 
@@ -19,6 +19,29 @@ use crate::memory::UnifiedPointer;
 use crate::memory::malloc::{cuda_free_unified, cuda_malloc_unified};
 use crate::prelude::Stream;
 
+/// Builds a `CUmemLocation` carrying the given location type and id.
+///
+/// The struct is zero-initialised and then filled in without ever naming an `id` field,
+/// so the same code supports both older bindgen output (`{ type_, id }`) and the newer
+/// anonymous-union layout emitted from CUDA 13.2 headers. The id is stored as a `c_int`
+/// in the bytes immediately following `type_`.
+///
+/// # Safety
+///
+/// The all-zero bit pattern must be valid for every part of `CUmemLocation` that is not
+/// overwritten here, and the id is assumed to sit directly after `type_` in whichever
+/// layout the bindings use (NOTE(review): confirm against the generated bindings). The
+/// size and alignment needed for the raw write are checked at compile time.
+#[cfg(any(cuMemPrefetchAsync_v2, cuMemAdvise_v2))]
+unsafe fn cu_mem_location(
+    type_: driver_sys::CUmemLocationType,
+    id: std::os::raw::c_int,
+) -> driver_sys::CUmemLocation {
+    // Byte offset at which the id is written: right after the `type_` value.
+    const ID_OFFSET: usize = std::mem::size_of::<driver_sys::CUmemLocationType>();
+
+    // Fail the build, rather than write out of bounds or misaligned, if the bindings ever
+    // change shape so that the raw write below no longer fits.
+    const _: () = assert!(
+        ID_OFFSET + std::mem::size_of::<std::os::raw::c_int>()
+            <= std::mem::size_of::<driver_sys::CUmemLocation>(),
+        "CUmemLocation is too small to hold an id after its type"
+    );
+    const _: () = assert!(
+        ID_OFFSET % std::mem::align_of::<std::os::raw::c_int>() == 0
+            && std::mem::align_of::<driver_sys::CUmemLocation>()
+                >= std::mem::align_of::<std::os::raw::c_int>(),
+        "id slot in CUmemLocation is not aligned for c_int"
+    );
+
+    let mut location = std::mem::MaybeUninit::<driver_sys::CUmemLocation>::zeroed();
+    let location_ptr = location.as_mut_ptr();
+
+    // SAFETY: `location_ptr` points at `location`, which is live, zero-filled and properly
+    // aligned for `CUmemLocation`. The `type_` write goes through a raw field pointer, and
+    // the id write stays in bounds and aligned per the assertions above. After both
+    // writes every byte of the struct is initialised (zero or an explicit value).
+    unsafe {
+        std::ptr::addr_of_mut!((*location_ptr).type_).write(type_);
+        location_ptr
+            .cast::<u8>()
+            .add(ID_OFFSET)
+            .cast::<std::os::raw::c_int>()
+            .write(id);
+
+        location.assume_init()
+    }
+}
+
 /// A pointer type for heap-allocation in CUDA unified memory.
 ///
 /// See the [`module-level documentation`](../memory/index.html) for more information on unified
@@ -640,17 +663,13 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
         let mem_size = std::mem::size_of_val(slice);
 
         unsafe {
-            let id = -1; // -1 is CU_DEVICE_CPU
             driver_sys::cuMemPrefetchAsync(
                 slice.as_ptr() as driver_sys::CUdeviceptr,
                 mem_size,
                 #[cfg(cuMemPrefetchAsync_v2)]
-                driver_sys::CUmemLocation {
-                    type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
-                    id,
-                },
+                cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
                 #[cfg(not(cuMemPrefetchAsync_v2))]
-                id,
+                -1, // -1 is CU_DEVICE_CPU
                 #[cfg(cuMemPrefetchAsync_v2)]
                 0, // flags for future use, must be 0 as of CUDA 13.0
                 stream.as_inner(),
@@ -691,10 +710,7 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
                 slice.as_ptr() as driver_sys::CUdeviceptr,
                 mem_size,
                 #[cfg(cuMemPrefetchAsync_v2)]
-                driver_sys::CUmemLocation {
-                    type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
-                    id,
-                },
+                cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE, id),
                 #[cfg(not(cuMemPrefetchAsync_v2))]
                 id,
                 #[cfg(cuMemPrefetchAsync_v2)]
@@ -727,18 +743,14 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
         };
 
         unsafe {
-            let id = 0;
             driver_sys::cuMemAdvise(
                 slice.as_ptr() as driver_sys::CUdeviceptr,
                 mem_size,
                 advice,
                 #[cfg(cuMemAdvise_v2)]
-                driver_sys::CUmemLocation {
-                    type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
-                    id,
-                },
+                cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
                 #[cfg(not(cuMemAdvise_v2))]
-                id,
+                0,
             )
             .to_result()?;
         }
@@ -775,9 +787,11 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
                 mem_size,
                 driver_sys::CUmem_advise::CU_MEM_ADVISE_SET_PREFERRED_LOCATION,
                 #[cfg(cuMemAdvise_v2)]
-                driver_sys::CUmemLocation {
-                    type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
-                    id,
+                match preferred_location {
+                    Some(_) => {
+                        cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE, id)
+                    }
+                    None => cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
                 },
                 #[cfg(not(cuMemAdvise_v2))]
                 id,
@@ -793,18 +807,14 @@ pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
         let mem_size = std::mem::size_of_val(slice);
 
         unsafe {
-            let id = 0;
             driver_sys::cuMemAdvise(
                 slice.as_ptr() as driver_sys::CUdeviceptr,
                 mem_size,
                 driver_sys::CUmem_advise::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
                 #[cfg(cuMemAdvise_v2)]
-                driver_sys::CUmemLocation {
-                    type_: driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
-                    id,
-                },
+                cu_mem_location(driver_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST, 0),
                 #[cfg(not(cuMemAdvise_v2))]
-                id,
+                0,
             )
             .to_result()?;
         }
 
@@ -325,6 +325,10 @@ pub enum NvvmArch {
     Compute89,
     Compute90,
     Compute90a,
+    /// First Blackwell arch and the cutoff for NVVM's modern IR dialect — everything at
+    /// or above this capability uses the LLVM 19-flavored bitcode accepted by CUDA 12.9+
+    /// `libnvvm`. See [`NvvmArch::uses_modern_ir_dialect`]. This is also the default arch
+    /// `cuda_builder` picks when the backend is built with `LLVM_CONFIG_19` set.
     Compute100,
     Compute100f,
     Compute100a,
@@ -448,6 +452,14 @@ impl NvvmArch {
         self.capability_value() % 10
     }
 
+    /// Reports whether this target is compiled with NVVM's modern IR dialect instead of
+    /// the legacy LLVM 7 dialect.
+    ///
+    /// The cutoff is capability 100 (`compute_100`): CUDA 13.2 documents the modern
+    /// dialect as Blackwell-and-later only, and Blackwell begins there.
+    pub fn uses_modern_ir_dialect(&self) -> bool {
+        let capability = self.capability_value();
+        capability >= 100
+    }
+
     /// Get the target feature string (e.g., "compute_50" for `Compute50`, "compute_90a" for
     /// `Compute90a`).
     pub fn target_feature(&self) -> &'static str {
@@ -739,7 +751,24 @@ impl NvvmProgram {
     /// Verify the program without actually compiling it. In the case of invalid IR, you can find
     /// more detailed error info by calling [`compiler_log`](Self::compiler_log).
     pub fn verify(&self) -> Result<(), NvvmError> {
-        unsafe { nvvm_sys::nvvmVerifyProgram(self.raw, 0, null_mut()).to_result() }
+        self.verify_with_options(&[])
+    }
+
+    /// Verifies the program like [`verify`](Self::verify), but hands `options` to the
+    /// verifier.
+    ///
+    /// Meant to be called with the same `NvvmOption`s that will later be passed to
+    /// [`compile`](Self::compile). The user-selected `-arch=compute_XXX` matters most for
+    /// CUDA 12.9+ / LLVM 19 bitcode: without it the verifier can fall back to the legacy
+    /// LLVM 7 parser and reject modern-dialect bitcode that would otherwise compile fine.
+    ///
+    /// As with `verify`, more detailed error info for invalid IR is available through
+    /// [`compiler_log`](Self::compiler_log).
+    pub fn verify_with_options(&self, options: &[NvvmOption]) -> Result<(), NvvmError> {
+        // Render each option as a NUL-terminated string. These buffers must outlive the
+        // FFI call because `arg_ptrs` points into them.
+        let c_options: Vec<String> = options.iter().map(|opt| format!("{opt}\0")).collect();
+        let mut arg_ptrs: Vec<_> = c_options.iter().map(|s| s.as_ptr().cast()).collect();
+        let count = c_options.len() as i32;
+
+        // SAFETY: `arg_ptrs` holds `count` pointers to NUL-terminated buffers owned by
+        // `c_options`, both of which live until the end of this function. Assumes
+        // `self.raw` is a live program handle — TODO confirm against the constructor.
+        unsafe {
+            nvvm_sys::nvvmVerifyProgram(self.raw, count, arg_ptrs.as_mut_ptr()).to_result()
+        }
     }
 }
 
 
@@ -14,6 +14,10 @@ readme = "../../README.md"
 [lib]
 crate-type = ["dylib"]
 
+[features]
+default = []
+llvm19 = []
+
 [dependencies]
 nvvm = { version = "0.1", path = "../nvvm" }
 rustc-demangle = "0.1.24"