Skip to content

Commit 0c155b0

Browse files
committed
[WebGPU] Add export test using Vulkan partitioner
Export tests verify fp32 torch.add models produce a .pte with VulkanBackend delegate: 2D/3D/4D shapes, broadcasting, self-add, scalar add, and chained adds. Includes TODO with architecture notes and next steps.
1 parent 2a8a3cd commit 0c155b0

4 files changed

Lines changed: 166 additions & 0 deletions

File tree

backends/webgpu/TODO.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
# WebGPU Backend — TODO

## Current State (Prototype)
- Single op: `aten.add.Tensor` (fp32, buffer storage)
- No Python AOT code — directly consumes Vulkan delegate (.pte exported via VulkanPartitioner)
- Reuses Vulkan FlatBuffer format (VH00 header + VK00 payload)
- Registers as `"VulkanBackend"` at runtime — mutually exclusive with Vulkan backend at link time
- Built-in WGSL shaders (not embedded in .pte)

## Architecture
```
VulkanPartitioner (Python) → VkGraphBuilder → VK00 FlatBuffer → .pte
  → WebGPU Runtime: registers as "VulkanBackend", parses VH00/VK00
  → WebGPUGraph::build → GPU buffers/pipelines/bind groups
  → WebGPUGraph::execute → encode + submit compute passes
```

Adding a new op requires only C++ runtime work:
1. WGSL shader + header
2. C++ op implementation (read args from VkGraph, create pipeline, record dispatch)
3. Register in CMakeLists.txt
4. Test with VulkanPartitioner export

## Performance: Command Encoding Overhead
WebGPU `GPUCommandBuffer` is single-use (no equivalent to Vulkan's cached command lists).
Per-dispatch API call cost adds up for large graphs.

**Primary mitigation: mega-kernel fusion.** Generate fused WGSL shaders for chains of
element-wise ops (add→relu→mul→clamp) at compile time. Embed via the existing
`shaders: [VkBytes]` field in schema.fbs.

## Next Steps
1. **More ops**: sub, mul, relu, linear (matmul), softmax, layer_norm
2. **fp16 support**: Feature-detect `shader-f16`, fallback to fp32
3. **Buffer pooling**: Reuse GPU buffers to avoid OOM at scale
4. **Pipeline caching**: Cache compiled pipelines across runs
5. **Profiling**: Wire WebGPU timestamp queries into ETDump/EventTracer
6. **LLM support**: KV cache management, Flash Attention in WGSL, quantized ops (int4/int8)
7. **Browser/JS runtime**: Emscripten build, JS harness, browser test page

backends/webgpu/test/conftest.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# Workaround for PyTorch 2.11 bug where LeafSpec dataclass fields
8+
# (type, _context, _children) are not initialized by the C++ constructor,
9+
# causing AttributeError in run_decompositions and copy.deepcopy.
10+
import dataclasses
11+
12+
from torch.utils._pytree import LeafSpec
13+
14+
15+
def _leafspec_getattr(self, name): # type: ignore[no-untyped-def]
16+
for f in dataclasses.fields(type(self)):
17+
if f.name == name:
18+
if f.default is not dataclasses.MISSING:
19+
return f.default
20+
elif f.default_factory is not dataclasses.MISSING:
21+
val = f.default_factory()
22+
object.__setattr__(self, name, val)
23+
return val
24+
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
25+
26+
27+
# Install the shim only on affected PyTorch builds: a healthy LeafSpec
# instance already exposes "type", and then the class is left untouched.
if not hasattr(LeafSpec(), "type"):
    setattr(LeafSpec, "__getattr__", _leafspec_getattr)

backends/webgpu/test/ops/__init__.py

Whitespace-only changes.
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import unittest
8+
9+
import torch
10+
from executorch.backends.vulkan import VulkanPartitioner
11+
from executorch.exir import to_edge_transform_and_lower
12+
13+
14+
class AddModule(torch.nn.Module):
    """Element-wise addition of two tensors (aten.add.Tensor)."""

    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        return torch.add(a, b)
17+
18+
19+
class AddSelfModule(torch.nn.Module):
    """Adds a tensor to itself — both add operands alias the same input."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.add(x, x)
22+
23+
24+
class AddScalarModule(torch.nn.Module):
    """Adds the constant 3.0 to every element of the input tensor."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shifted = x + 3.0
        return shifted
27+
28+
29+
class AddChainedModule(torch.nn.Module):
    """Three chained adds; the result equals 2*x + 2*y."""

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        first = x + y
        second = first + x
        return second + y
35+
36+
37+
class TestAdd(unittest.TestCase):
    """fp32 torch.add export tests — uses VulkanPartitioner since the WebGPU
    runtime directly consumes the Vulkan delegate (VK00 FlatBuffer)."""

    def _export_and_check(self, model, example_inputs) -> None:
        # Export the eager model, lower it through the Vulkan partitioner,
        # and serialize the result to an ExecuTorch program.
        exported = torch.export.export(model, example_inputs)
        program = to_edge_transform_and_lower(
            exported, partitioner=[VulkanPartitioner()]
        ).to_executorch()

        # At least one execution plan must carry a VulkanBackend delegate.
        has_vulkan_delegate = any(
            delegate.id == "VulkanBackend"
            for plan in program.executorch_program.execution_plan
            for delegate in plan.delegates
        )
        self.assertTrue(has_vulkan_delegate, "Expected VulkanBackend delegate in .pte")
        # Sanity check: the serialized .pte is non-trivially sized.
        self.assertGreater(len(program.buffer), 100)

    def test_add_2d(self) -> None:
        inputs = (torch.randn(4, 4), torch.randn(4, 4))
        self._export_and_check(AddModule(), inputs)

    def test_add_3d(self) -> None:
        inputs = (torch.randn(2, 3, 4), torch.randn(2, 3, 4))
        self._export_and_check(AddModule(), inputs)

    def test_add_4d(self) -> None:
        inputs = (torch.randn(1, 2, 3, 4), torch.randn(1, 2, 3, 4))
        self._export_and_check(AddModule(), inputs)

    def test_add_broadcast_last_dim(self) -> None:
        inputs = (torch.randn(4, 4), torch.randn(4, 1))
        self._export_and_check(AddModule(), inputs)

    def test_add_broadcast_first_dim(self) -> None:
        inputs = (torch.randn(4, 4), torch.randn(1, 4))
        self._export_and_check(AddModule(), inputs)

    def test_add_self(self) -> None:
        self._export_and_check(AddSelfModule(), (torch.randn(4, 4),))

    def test_add_scalar(self) -> None:
        self._export_and_check(AddScalarModule(), (torch.randn(4, 4),))

    def test_add_chained(self) -> None:
        inputs = (torch.randn(4, 4), torch.randn(4, 4))
        self._export_and_check(AddChainedModule(), inputs)
83+
84+
85+
def export_add_model(output_path: str) -> None:
    """Export a simple add model to .pte for native runtime testing."""
    inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))
    exported = torch.export.export(AddModule(), inputs)
    # Lower through the Vulkan partitioner; the WebGPU runtime consumes the
    # resulting VulkanBackend delegate payload directly.
    program = to_edge_transform_and_lower(
        exported, partitioner=[VulkanPartitioner()]
    ).to_executorch()
    with open(output_path, "wb") as out:
        out.write(program.buffer)
    print(f"Exported {output_path}")
96+
97+
98+
# Allow running this file directly (python <file>) in addition to pytest.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)