[WebGPU] Add native CMake build and runtime integration

digantdesai · digantdesai · commit d7376e53dec3 · 2026-04-09T23:33:44.000-05:00
Wire wgpu-native into the CMake build and integrate
WebGPUDevice into the compute graph for native
Metal/Vulkan execution.
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
@@ -29,6 +29,7 @@ set(WEBGPU_SRCS
     runtime/WebGPUBackend.cpp
     runtime/WebGPUGraph.cpp
     runtime/WebGPUDelegateHeader.cpp
+    runtime/WebGPUDevice.cpp
     runtime/ops/OperatorRegistry.cpp
     runtime/ops/add/BinaryOp.cpp
 )
@@ -42,6 +43,37 @@ target_include_directories(
 
 target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core)
 
+# Native build: link webgpu_backend against a prebuilt wgpu-native static
+# archive. WGPU_NATIVE_DIR is a cache path; -DWGPU_NATIVE_DIR=<dir> overrides it.
+set(WGPU_NATIVE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/wgpu-native"
+    CACHE PATH "Path to wgpu-native installation")
+# Single definition of the archive path, shared by the check and the import.
+set(WGPU_NATIVE_ARCHIVE "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a")
+
+# Stop at configure time, with setup instructions, if the archive is missing.
+if(NOT EXISTS "${WGPU_NATIVE_ARCHIVE}")
+  message(FATAL_ERROR
+    "wgpu-native not found at ${WGPU_NATIVE_DIR}. "
+    "Run: bash backends/webgpu/scripts/setup-wgpu-native.sh")
+endif()
+
+add_library(wgpu_native STATIC IMPORTED)
+set_target_properties(wgpu_native
+  PROPERTIES IMPORTED_LOCATION "${WGPU_NATIVE_ARCHIVE}")
+target_include_directories(webgpu_backend
+  PUBLIC $<BUILD_INTERFACE:${WGPU_NATIVE_DIR}/include>)
+target_link_libraries(webgpu_backend PRIVATE wgpu_native)
+
+# System libraries linked alongside the archive on each platform.
+if(APPLE)
+  target_link_libraries(webgpu_backend PRIVATE
+    "-framework Metal" "-framework QuartzCore"
+    "-framework CoreGraphics" "-framework Foundation")
+else()
+  # CMAKE_DL_LIBS is CMake's portable name for the dlopen/dlclose library.
+  target_link_libraries(webgpu_backend PRIVATE ${CMAKE_DL_LIBS} m pthread)
+endif()
+
 target_compile_options(webgpu_backend PRIVATE -fexceptions)
 
 # Link with --whole-archive for static registration of backend + ops
@@ -54,3 +86,39 @@ install(
   EXPORT ExecuTorchTargets
   DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
+
+# Native test target: builds test/test_webgpu_native.cpp into an executable
+# when EXECUTORCH_BUILD_WEBGPU_TEST is enabled.
+if(EXECUTORCH_BUILD_WEBGPU_TEST)
+  add_executable(webgpu_native_test test/test_webgpu_native.cpp)
+
+  target_include_directories(webgpu_native_test PRIVATE
+    $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..> "${WGPU_NATIVE_DIR}/include"
+  )
+
+  target_link_libraries(webgpu_native_test PRIVATE
+    webgpu_backend
+    wgpu_native
+    executorch_core
+    extension_module_static
+    extension_data_loader
+    extension_tensor
+    portable_kernels
+    portable_ops_lib
+  )
+
+  # Same system libraries as webgpu_backend, Foundation included.
+  if(APPLE)
+    target_link_libraries(webgpu_native_test PRIVATE
+      "-framework Metal" "-framework QuartzCore"
+      "-framework CoreGraphics" "-framework Foundation")
+  else()
+    target_link_libraries(webgpu_native_test PRIVATE ${CMAKE_DL_LIBS} m pthread)
+  endif()
+
+  target_compile_options(webgpu_native_test PRIVATE -fexceptions)
+  # CXX_STANDARD alone may decay to an older standard; REQUIRED makes a
+  # compiler without C++17 a configure-time error instead.
+  set_target_properties(webgpu_native_test
+    PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON)
+endif()
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -11,6 +11,9 @@
 
 #include <executorch/backends/vulkan/serialization/schema_generated.h>
 
+#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
+#include <webgpu/wgpu.h>
+
 #include <cstring>
 #include <stdexcept>
 
@@ -69,6 +72,13 @@ WebGPUGraph::~WebGPUGraph() {
 void WebGPUGraph::build(
     const void* flatbuffer_data,
     const uint8_t* constant_data) {
+  if (!device_) {
+    auto* ctx = get_default_webgpu_context();
+    if (ctx) {
+      device_ = ctx->device;
+      instance_ = ctx->instance;
+    }
+  }
   if (!device_) {
     throw std::runtime_error(
         "WebGPU device not available. "
@@ -289,6 +299,9 @@ void WebGPUGraph::copy_outputs(
         outputs[i].second,
         cb_info);
 
+    // Poll until the map callback fires.
+    wgpuDevicePoll(device_, true, nullptr);
+
     if (cb_data.status == WGPUMapAsyncStatus_Success) {
       const void* mapped =
           wgpuBufferGetConstMappedRange(output_staging_buffers_[i], 0, outputs[i].second);
@@ -300,6 +313,22 @@ void WebGPUGraph::copy_outputs(
   }
 }
 
+// Summarizes the graph's tracked byte sizes and counts into one snapshot.
+WebGPUMemoryStats WebGPUGraph::memory_stats() const {
+  WebGPUMemoryStats snapshot;
+  snapshot.num_dispatches = static_cast<int>(dispatches_.size());
+  snapshot.uniform_buffer_bytes = uniform_buffer_bytes_;
+  for (size_t idx = 0; idx != value_types_.size(); ++idx) {
+    if (value_types_[idx] != ValueType::Tensor) continue;
+    if (!(tensors_[idx].nbytes > 0)) continue;
+    snapshot.tensor_buffer_bytes += tensors_[idx].nbytes;
+    snapshot.num_tensors += 1;
+  }
+  for (const auto out_id : output_ids_)
+    snapshot.staging_buffer_bytes += tensors_[out_id].nbytes;
+  return snapshot;
+}
+
 } // namespace webgpu
 } // namespace backends
 } // namespace executorch
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
@@ -30,6 +30,18 @@ struct WebGPUDispatch {
   uint32_t workgroup_count_x = 1;
 };
 
+// Byte totals and object counts reported by WebGPUGraph::memory_stats().
+struct WebGPUMemoryStats {
+  size_t tensor_buffer_bytes = 0;   // nbytes summed over counted tensors
+  size_t staging_buffer_bytes = 0;  // nbytes summed over output tensors
+  size_t uniform_buffer_bytes = 0;  // total from add_uniform_buffer_bytes()
+  int num_tensors = 0;              // tensors included in tensor_buffer_bytes
+  int num_dispatches = 0;           // number of recorded dispatches
+  size_t total_bytes() const {
+    return uniform_buffer_bytes + staging_buffer_bytes + tensor_buffer_bytes;
+  }
+};
+
 class WebGPUGraph {
  public:
   WebGPUGraph();
@@ -83,6 +95,19 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
+  // Accumulates `bytes` into the uniform total shown by memory_stats().
+  void add_uniform_buffer_bytes(size_t bytes) {
+    uniform_buffer_bytes_ += bytes;
+  }
+
+  // Handle setters: each stores the given handle in the matching member and
+  // does nothing else. build() only falls back to the default WebGPU context
+  // when no device has been set.
+  void set_instance(WGPUInstance instance) { instance_ = instance; }
+  void set_device(WGPUDevice device) { device_ = device; }
+
+  WebGPUMemoryStats memory_stats() const;
+
   int num_values() const {
     return static_cast<int>(value_types_.size());
   }
@@ -94,6 +119,7 @@ class WebGPUGraph {
   }
 
  private:
+  WGPUInstance instance_ = nullptr;
   WGPUDevice device_ = nullptr;
   WGPUQueue queue_ = nullptr;
 
@@ -112,6 +138,8 @@ class WebGPUGraph {
   std::vector<WGPUBuffer> output_staging_buffers_;
 
   std::vector<WebGPUDispatch> dispatches_;
+
+  size_t uniform_buffer_bytes_ = 0;
 };
 
 } // namespace webgpu
diff --git a/backends/webgpu/runtime/ops/add/BinaryOp.cpp b/backends/webgpu/runtime/ops/add/BinaryOp.cpp
@@ -64,6 +64,8 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
   std::memcpy(mapped, &params, sizeof(AddParams));
   wgpuBufferUnmap(uniform_buffer);
 
+  graph.add_uniform_buffer_bytes(sizeof(AddParams));
+
   // Create shader module from built-in WGSL source
   WGPUShaderSourceWGSL wgsl_desc = {};
   wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;