@@ -382,7 +382,16 @@ class ET_EXPERIMENTAL CudaBackend final
382382 return (DelegateHandle*)handle; // Return the handle post-processing
383383 }
384384
385- // Once per execution
385+ // Execute the AOTI-compiled CUDA kernel for one inference step.
386+ //
387+ // Currently supports both CPU and CUDA memory for IO tensors:
388+ // - Inputs: detected via cudaPointerGetAttributes; CUDA data is wrapped
389+ // in place (no copy), while CPU data is copied to GPU via from_etensor().
390+ // - Outputs: either copied into the ETensor's backing memory (CPU or CUDA),
391+ // or the ETensor is rewired to point at GPU memory (skip-copy mode).
392+ //
393+ // TODO: Once the device tensor pipeline is fully adopted, all IO tensors
394+ // will reside in CUDA memory. Remove the CPU fallback paths.
386395 Error execute(
387396 BackendExecutionContext& context,
388397 DelegateHandle* handle_,
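The detection step that the new execute() comment describes reduces to a single CUDA runtime query. A minimal sketch, assuming nothing beyond the cudaPointerGetAttributes call already used in this diff; the helper name is illustrative and not part of the backend:

    #include <cuda_runtime.h>

    // Illustrative helper (not in the PR): true when `p` points into CUDA
    // device memory, in which case the backend can wrap the buffer in place
    // instead of copying host data to the GPU.
    static bool is_device_pointer(const void* p) {
      if (p == nullptr) {
        return false;
      }
      cudaPointerAttributes attr{};
      cudaError_t err = cudaPointerGetAttributes(&attr, p);
      return err == cudaSuccess && attr.type == cudaMemoryTypeDevice;
    }

The same predicate drives both branches of the input loop below: device pointers go through slim::from_blob with no copy, host pointers through from_etensor with an H2D copy.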
@@ -405,14 +414,17 @@ class ET_EXPERIMENTAL CudaBackend final
405414 n_outputs,
406415 args.size())
407416
408- // Verify device info on all memory-planned, ET-driven IO tensors.
409- // All input and output tensors should have device_type = CUDA, which
410- // is set during serialization by PropagateDevicePass based on the
411- // target_device compile spec from CudaPartitioner.
417+ // Verify device metadata on all IO tensors.
418+ // All tensors should have device_type = CUDA, set during serialization
419+ // by PropagateDevicePass based on the target_device compile spec from
420+ // CudaPartitioner.
412421 //
413- // Note: At this stage, the tensor memory is still on CPU. The device_type
414- // is metadata indicating where the tensor *should* reside. The backend
415- // is responsible for copying data to the actual CUDA device.
422+ // Note: device_type is metadata — the actual memory location may be
423+ // either CPU (legacy path with H2D copy ops) or CUDA (when device
424+ // memory planning is enabled via enable_non_cpu_memory_planning,
425+ // which allocates delegate IO in CUDA memory). The backend detects
426+ // the actual location via cudaPointerGetAttributes and handles both
427+ // cases.
416428 for (size_t i = 0; i < n_inputs + n_outputs; i++) {
417429 auto* tensor = &(args[i]->toTensor());
418430 auto device_type = tensor->unsafeGetTensorImpl()->device_type();
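The note above distinguishes the serialized device_type (where the tensor should live) from where its storage actually is. The standalone snippet below, a sketch rather than backend code, shows how cudaPointerGetAttributes reports that actual location for a plain host buffer versus a cudaMalloc'd one:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      float host_buf[4] = {0};
      void* dev_buf = nullptr;
      cudaMalloc(&dev_buf, sizeof(host_buf));

      cudaPointerAttributes attr{};
      cudaError_t err = cudaPointerGetAttributes(&attr, host_buf);
      // Unregistered host memory: recent toolkits return cudaSuccess with
      // cudaMemoryTypeUnregistered; older ones return an error instead.
      std::printf("host ptr: err=%d type=%d\n", int(err), int(attr.type));

      err = cudaPointerGetAttributes(&attr, dev_buf);
      // cudaMemoryTypeDevice: the case the backend wraps in place.
      std::printf("device ptr: err=%d type=%d\n", int(err), int(attr.type));

      cudaFree(dev_buf);
      return 0;
    }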
@@ -425,34 +437,37 @@ class ET_EXPERIMENTAL CudaBackend final
425437 static_cast<int>(device_type));
426438 }
427439
428- // NOTE: ExecuTorch tensors may be on CPU or GPU due to the skip-copy
429- // optimization. We need to create GPU copies for CUDA kernel execution
430- // using SlimTensor .
440+ // Convert ExecuTorch tensors to SlimTensors for AOTI kernel execution.
441+ // Input data may be in CPU or CUDA memory — the backend detects and
442+ // handles both cases automatically (see memory model comment above).
431443 std::vector<SlimTensor*> gpu_inputs(n_inputs);
432444 std::vector<SlimTensor*> gpu_outputs(n_outputs);
433445
434446 // Process input tensors: convert ETensor (CPU or CUDA) to SlimTensor (GPU)
435447 for (size_t i = 0; i < n_inputs; i++) {
436- auto * cpu_tensor = &(args[i]->toTensor ());
448+ auto* input_tensor = &(args[i]->toTensor());
437449
438- // Check if input data is already on GPU (skip-copy optimization for
439- // inputs) This can happen when the caller has pre-staged data on GPU
450+ // Detect if input data is already in CUDA memory. This occurs when:
451+ // - Device memory planning is enabled (enable_non_cpu_memory_planning),
452+ // which allocates delegate IO in CUDA memory
453+ // - The input is a skip-copy output from a previous method execution
454+ // When detected, the data is wrapped directly — no H2D copy needed.
440455 cudaPointerAttributes attributes{};
441- const void * data_ptr = cpu_tensor ->const_data_ptr ();
456+ const void* data_ptr = input_tensor->const_data_ptr();
442457 if (data_ptr != nullptr) {
443458 cudaError_t err = cudaPointerGetAttributes(&attributes, data_ptr);
444459 if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) {
445460 // Data is already on GPU - wrap it directly without copy
446- auto sizes = cpu_tensor ->sizes ();
447- auto strides = cpu_tensor ->strides ();
461+ auto sizes = input_tensor->sizes();
462+ auto strides = input_tensor->strides();
448463 std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
449464 std::vector<int64_t> strides_vec(strides.begin(), strides.end());
450465
451466 gpu_inputs[i] = new SlimTensor(slim::from_blob(
452467 const_cast<void*>(data_ptr),
453468 slim::makeArrayRef(sizes_vec),
454469 slim::makeArrayRef(strides_vec),
455- static_cast <slim::c10::ScalarType>(cpu_tensor ->scalar_type ()),
470+ static_cast<slim::c10::ScalarType>(input_tensor->scalar_type()),
456471 DEFAULT_CUDA_DEVICE,
457472 0 // storage_offset
458473 ));
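Taken together, the two input branches above amount to the following helper. This is a sketch rather than backend code: SlimTensor, slim::from_blob, slim::makeArrayRef, from_etensor, CPU_DEVICE and DEFAULT_CUDA_DEVICE are the internal APIs already visible in this diff, the ETensor type is left as a template parameter, and the function name is illustrative:

    // Sketch: stage one ETensor input for the AOTI kernel, wrapping CUDA
    // data in place and copying CPU data to the GPU (signatures as used in
    // the surrounding diff).
    template <typename ETensorT>
    SlimTensor* stage_input_on_gpu(ETensorT& t) {
      const void* data_ptr = t.const_data_ptr();
      cudaPointerAttributes attributes{};
      if (data_ptr != nullptr &&
          cudaPointerGetAttributes(&attributes, data_ptr) == cudaSuccess &&
          attributes.type == cudaMemoryTypeDevice) {
        // Already CUDA memory: wrap without a copy.
        std::vector<int64_t> sizes_vec(t.sizes().begin(), t.sizes().end());
        std::vector<int64_t> strides_vec(t.strides().begin(), t.strides().end());
        return new SlimTensor(slim::from_blob(
            const_cast<void*>(data_ptr),
            slim::makeArrayRef(sizes_vec),
            slim::makeArrayRef(strides_vec),
            static_cast<slim::c10::ScalarType>(t.scalar_type()),
            DEFAULT_CUDA_DEVICE,
            0 /* storage_offset */));
      }
      // Host memory (legacy path): copy to the default CUDA device.
      return new SlimTensor(from_etensor(t, CPU_DEVICE, DEFAULT_CUDA_DEVICE));
    }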
@@ -461,19 +476,22 @@ class ET_EXPERIMENTAL CudaBackend final
461476 }
462477 }
463478
464- // Data is on CPU - use from_etensor to copy to GPU
479+ // Data is in CPU memory (legacy path) — copy to GPU via from_etensor.
480+ // TODO: Remove this path once all callers use the device tensor pipeline.
465481 gpu_inputs[i] = new SlimTensor(
466- from_etensor (*cpu_tensor , CPU_DEVICE, DEFAULT_CUDA_DEVICE));
482+ from_etensor(*input_tensor, CPU_DEVICE, DEFAULT_CUDA_DEVICE));
467483 }
468484
469- // Process output tensors: create GPU SlimTensors for kernel output.
470- // Save pre-run handles to detect orphans after run().
485+ // Allocate GPU SlimTensors for kernel outputs. These are always
486+ // freshly allocated on GPU regardless of the input memory mode.
487+ // Save pre-run handles to detect orphans after run() (the AOTI
488+ // runtime may replace output handles with its own allocations).
471489 std::vector<SlimTensor*> pre_run_outputs(n_outputs, nullptr);
472490 for (size_t i = 0; i < n_outputs; i++) {
473- auto * cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
474- auto sizes = cpu_output_tensor ->sizes ();
475- auto strides = cpu_output_tensor ->strides ();
476- auto scalar_type = cpu_output_tensor ->scalar_type ();
491+ auto* output_tensor = &(args[i + n_inputs]->toTensor());
492+ auto sizes = output_tensor->sizes();
493+ auto strides = output_tensor->strides();
494+ auto scalar_type = output_tensor->scalar_type();
477495
478496 std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
479497 std::vector<int64_t> strides_vec(strides.begin(), strides.end());
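The pre_run_outputs bookkeeping above is only half of the story; the post-run comparison happens later in execute() and is not part of this section. As a purely illustrative sketch of what the comment refers to, one plausible shape of that check is:

    // Hypothetical post-run cleanup (not shown in this diff): if the AOTI
    // runtime replaced an output handle with its own allocation, the tensor
    // created before run() is now an orphan and must be released.
    for (size_t i = 0; i < n_outputs; i++) {
      if (pre_run_outputs[i] != nullptr && pre_run_outputs[i] != gpu_outputs[i]) {
        delete pre_run_outputs[i];
        pre_run_outputs[i] = nullptr;
      }
    }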
@@ -536,13 +554,18 @@ class ET_EXPERIMENTAL CudaBackend final
536554
537555 const bool copy_outputs = !should_skip_copy_for_method(handle->method_name);
538556
557+ // Output disposition: copy to ETensor backing memory or keep on GPU.
558+ // When copy_outputs is true (default), results are copied to the
559+ // ETensor's memory (which may be CPU or CUDA planned memory).
560+ // When false (skip-copy optimization), the ETensor is rewired to
561+ // point at the GPU SlimTensor's memory directly.
539562 if (copy_outputs) {
540563 for (size_t i = 0; i < n_outputs; i++) {
541- auto * cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
564+ auto* output_tensor = &(args[i + n_inputs]->toTensor());
542565 ET_CHECK_OK_OR_RETURN_ERROR(
543566 copy_slimtensor_to_etensor_async(
544- gpu_outputs[i], cpu_output_tensor , cuda_stream),
545- " Failed to copy GPU output %zu back to CPU ETensor" ,
567+ gpu_outputs[i], output_tensor, cuda_stream),
568+ "Failed to copy GPU output %zu back to ETensor",
546569 i);
547570 delete gpu_outputs[i];
548571 gpu_outputs[i] = nullptr;
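One practical consequence of the skip-copy branch described above: when copy_outputs is false, the output ETensor ends up pointing at CUDA device memory, so a host-side consumer has to fetch the data itself. A minimal caller-side sketch, assuming only the CUDA runtime API (the function name is illustrative):

    #include <cstddef>
    #include <cuda_runtime.h>

    // Illustrative: copy a skip-copy output (device memory) back to host.
    // With copy_outputs == true this is unnecessary, since the backend has
    // already written the result into the ETensor's own backing memory.
    void fetch_skip_copy_output(const void* device_out, void* host_dst, size_t nbytes) {
      cudaMemcpy(host_dst, device_out, nbytes, cudaMemcpyDeviceToHost);
    }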