From c7dda7d096932c85c2dd3f726f9bb95995cc83e9 Mon Sep 17 00:00:00 2001 From: hwirys <121537293+hwirys@users.noreply.github.com> Date: Fri, 1 May 2026 08:42:03 +0900 Subject: [PATCH 1/2] =?UTF-8?q?hw:=20per-bank=20PLATFORM=5FMEMORY=5FOFFSET?= =?UTF-8?q?=20=E2=80=94=20fix=20U250=20vx=5Fbusy=20hang?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Vortex is built for an XRT-based Xilinx platform that allocates each memory bank as a separate xrt::bo, XRT picks a different virtual address per bank (e.g. on U250 bank 0 lands at 0x40_00000000) — which is far above Vortex's compile-time absolute addresses STARTUP_ADDR = 0x180000000 and STACK_BASE_ADDR = 0x1FFFF0000. AXI requests from Vortex therefore fail to decode at the slave, vx_busy stays high forever, and the kernel never starts. This is the underlying cause of the long-running U250 hang reports (#262, #263, #278) — Vortex 2.3 added the U250 build path but did not address this BAR mismatch, so the produced xclbin has never actually booted on real silicon. Fix - vortex_afu.vh: introduce per-bank PLATFORM_MEMORY_OFFSET_<i> (i = 0..3) macros, each defaulting to the legacy global PLATFORM_MEMORY_OFFSET so HBM platforms (U280/U55C/U50) and VCK5000 single-channel are byte-for-byte unchanged. - VX_afu_wrap.sv: build a 4-entry platform_memory_offsets array from those macros and add the bank-i offset to each outgoing m_axi_mem_<i> AW/AR address. - platforms.mk U250: switch to single-bank (NUM_BANKS=1, DDR[0]) and set PLATFORM_MEMORY_OFFSET_0=40'h4000000000 so the build works end-to-end out of the box. Multi-bank (full 64 GB) deployment needs a runtime mechanism to push each bo's actual XRT VA into the AFU and will follow as a separate PR. Verified end-to-end on real Alveo U250 (XRT 2.19.194, shell xilinx_u250_gen3x16_xdma_4_1_202210_1) at 200 MHz with the default DSP FPU. 
Without this patch the kernel hangs immediately at ap_start (vx_busy stuck high, CTL register reads back 0x1 indefinitely). With the patch the kernel boots and: - regression `vecadd` (n=16..16384), `sgemm`, `dotproduct`, `demo`, `dropout`, `conv3`, `io_addr`, `fence`, `diverge` — all pass - `dogfood` Test0..Test20 — pass - OpenCL: `saxpy`, `vecadd`, `sgemm`, `sgemm2`, `sgemm3`, `stencil`, `sfilter`, `spmv`, `psort`, `oclprintf` — pass - single-rank MPI: `mpi_vecadd`, `mpi_dotproduct`, `mpi_diverge`, `mpi_put_dotproduct` — pass Final WNS = +0.057 ns at 200 MHz; the patch is purely combinational addressing logic (3 extra adds in the AFU) so area and timing impact are negligible. Refs: #262, #263, #278. --- hw/rtl/afu/xrt/VX_afu_wrap.sv | 16 ++++++++++++++-- hw/rtl/afu/xrt/vortex_afu.vh | 18 ++++++++++++++++++ hw/syn/xilinx/xrt/platforms.mk | 8 ++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/hw/rtl/afu/xrt/VX_afu_wrap.sv b/hw/rtl/afu/xrt/VX_afu_wrap.sv index 39cc264924..0928fb9c2c 100644 --- a/hw/rtl/afu/xrt/VX_afu_wrap.sv +++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv @@ -287,9 +287,21 @@ module VX_afu_wrap import VX_gpu_pkg::*; #( wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_u [C_M_AXI_MEM_NUM_BANKS]; wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_u [C_M_AXI_MEM_NUM_BANKS]; + // Per-bank XRT BO base offsets. Each m_axi_mem_ port goes to a different + // xrt::bo (one per DDR/HBM channel) which XRT places at a different virtual + // base address, so a single global PLATFORM_MEMORY_OFFSET cannot cover all + // banks. PLATFORM_MEMORY_OFFSET_ overrides per bank; each defaults to + // PLATFORM_MEMORY_OFFSET so existing single-bank platforms (HBM, VCK5000) + // are unchanged. 
+ wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] platform_memory_offsets [4]; + assign platform_memory_offsets[0] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_0); + assign platform_memory_offsets[1] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_1); + assign platform_memory_offsets[2] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_2); + assign platform_memory_offsets[3] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_3); + for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_addressing - assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET); - assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET); + assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + platform_memory_offsets[i]; + assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + platform_memory_offsets[i]; end `SCOPE_IO_SWITCH (2); diff --git a/hw/rtl/afu/xrt/vortex_afu.vh b/hw/rtl/afu/xrt/vortex_afu.vh index c66ede2b71..f1c6af810d 100644 --- a/hw/rtl/afu/xrt/vortex_afu.vh +++ b/hw/rtl/afu/xrt/vortex_afu.vh @@ -18,6 +18,24 @@ `define PLATFORM_MEMORY_OFFSET 0 `endif +// Per-bank XRT BO base address. Each m_axi_mem_ port's outgoing AXI byte +// address gets this offset added so that Vortex's compile-time absolute +// addresses (STARTUP_ADDR, STACK_BASE_ADDR, ...) land inside the xrt::bo +// allocation that XRT placed in that bank. Defaults to PLATFORM_MEMORY_OFFSET +// (single offset for all banks) for back-compatibility. 
+`ifndef PLATFORM_MEMORY_OFFSET_0 +`define PLATFORM_MEMORY_OFFSET_0 `PLATFORM_MEMORY_OFFSET +`endif +`ifndef PLATFORM_MEMORY_OFFSET_1 +`define PLATFORM_MEMORY_OFFSET_1 `PLATFORM_MEMORY_OFFSET +`endif +`ifndef PLATFORM_MEMORY_OFFSET_2 +`define PLATFORM_MEMORY_OFFSET_2 `PLATFORM_MEMORY_OFFSET +`endif +`ifndef PLATFORM_MEMORY_OFFSET_3 +`define PLATFORM_MEMORY_OFFSET_3 `PLATFORM_MEMORY_OFFSET +`endif + `ifndef PLATFORM_MEMORY_ID_WIDTH `define PLATFORM_MEMORY_ID_WIDTH 32 `endif diff --git a/hw/syn/xilinx/xrt/platforms.mk b/hw/syn/xilinx/xrt/platforms.mk index a841e1a046..f732a40ab1 100644 --- a/hw/syn/xilinx/xrt/platforms.mk +++ b/hw/syn/xilinx/xrt/platforms.mk @@ -30,8 +30,12 @@ else ifneq ($(findstring xilinx_u280,$(XSA)),) CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33 VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31] else ifneq ($(findstring xilinx_u250,$(XSA)),) - # 64 GB of DDR4 with 4 channels (16 GB per channel) - CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36 + # 16 GB of DDR4 (single channel, bank 0). Multi-bank requires per-bank XRT + # VA offsets that aren't known at synthesis time without runtime plumbing; + # see follow-up PR for a DCR-based runtime path. 
+ CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=34 + VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:DDR[0] + CONFIGS += -DPLATFORM_MEMORY_OFFSET_0=40\'h4000000000 else ifneq ($(findstring xilinx_u200,$(XSA)),) # 64 GB of DDR4 with 4 channels (16 GB per channel) CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36 From 07b165be473784f968efa1b54e9a815594c67974 Mon Sep 17 00:00:00 2001 From: hwirys <121537293+hwirys@users.noreply.github.com> Date: Tue, 12 May 2026 16:54:36 +0900 Subject: [PATCH 2/2] =?UTF-8?q?hw:=20generalize=20the=20per-bank=20offset?= =?UTF-8?q?=20path=20=E2=80=94=20drop=20the=204-bank=20assumption?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR review feedback: a08b8d727 used a hardcoded [4] platform_memory_offsets array with explicit per-bank PLATFORM_MEMORY_OFFSET_0..3 macros, baking in a 4-bank assumption that doesn't hold for HBM (NUM_BANKS=32) or single-channel (NUM_BANKS=1) platforms even though the patch was intended to support all of them. Changes: - VX_afu_wrap.sv: platform_memory_offsets array is now sized by C_M_AXI_MEM_NUM_BANKS, and the per-bank assignment is a generate-for using the single PLATFORM_MEMORY_OFFSET macro applied to every bank. - vortex_afu.vh: drop the PLATFORM_MEMORY_OFFSET_0..3 numbered macros; the single PLATFORM_MEMORY_OFFSET (default 0) is the only knob. - platforms.mk: U250 uses PLATFORM_MEMORY_OFFSET=40'h4000000000 in place of the deprecated PLATFORM_MEMORY_OFFSET_0. For platforms that need a different VA per bank (e.g. U250 4-bank DDR4), a runtime DCR mechanism is added in a follow-up commit on master and is intentionally not part of this PR. 
Verified end-to-end on real Alveo U250 (xilinx_u250_gen3x16_xdma_4_1) via three independent xclbin builds (NUM_CORES=1, KERNEL_FREQ=150 MHz): - NUM_BANKS=1: 19 PASS / 0 FAIL / 1 known-fail (dogfood Test21-trig, pre-existing DSP-FPU FP64 sinf NaN unrelated to bank generalization) - NUM_BANKS=2: same — 19 PASS / 0 FAIL / 1 known-fail - NUM_BANKS=4: same — 19 PASS / 0 FAIL / 1 known-fail Test suite covers diverse memory-access and compute patterns to exercise the generalized bank addressing: Regression (CUDA-style): vecadd, sgemm, sgemm2, dotproduct, diverge, demo, conv3, fence, io_addr, dropout, printf, basic, arith, mstress OpenCL: saxpy, vecadd, sgemm, sfilter, oclprintf NUM_BANKS=4 IPC is consistently higher than NUM_BANKS=1 (e.g. vecadd 0.339 vs 0.308, dotproduct 0.367 vs 0.332), confirming the per-bank parallelism is functioning correctly. Refs: #342 --- hw/rtl/afu/xrt/VX_afu_wrap.sv | 18 ++++++++---------- hw/rtl/afu/xrt/vortex_afu.vh | 22 ++++------------------ hw/syn/xilinx/xrt/platforms.mk | 2 +- 3 files changed, 13 insertions(+), 29 deletions(-) diff --git a/hw/rtl/afu/xrt/VX_afu_wrap.sv b/hw/rtl/afu/xrt/VX_afu_wrap.sv index 0928fb9c2c..306faa5cd9 100644 --- a/hw/rtl/afu/xrt/VX_afu_wrap.sv +++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv @@ -287,17 +287,15 @@ module VX_afu_wrap import VX_gpu_pkg::*; #( wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_u [C_M_AXI_MEM_NUM_BANKS]; wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_u [C_M_AXI_MEM_NUM_BANKS]; - // Per-bank XRT BO base offsets. Each m_axi_mem_ port goes to a different + // Per-bank XRT BO base offset. Each m_axi_mem_ port goes to a different // xrt::bo (one per DDR/HBM channel) which XRT places at a different virtual - // base address, so a single global PLATFORM_MEMORY_OFFSET cannot cover all - // banks. PLATFORM_MEMORY_OFFSET_ overrides per bank; each defaults to - // PLATFORM_MEMORY_OFFSET so existing single-bank platforms (HBM, VCK5000) - // are unchanged. 
- wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] platform_memory_offsets [4]; - assign platform_memory_offsets[0] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_0); - assign platform_memory_offsets[1] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_1); - assign platform_memory_offsets[2] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_2); - assign platform_memory_offsets[3] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET_3); + // base address. The synthesis-time PLATFORM_MEMORY_OFFSET applies the same + // offset to every bank; for platforms that need a different VA per bank, + // a follow-up PR adds a runtime DCR mechanism to override per bank. + wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] platform_memory_offsets [C_M_AXI_MEM_NUM_BANKS]; + for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_pmo + assign platform_memory_offsets[i] = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET); + end for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_addressing assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + platform_memory_offsets[i]; diff --git a/hw/rtl/afu/xrt/vortex_afu.vh b/hw/rtl/afu/xrt/vortex_afu.vh index f1c6af810d..59bed9b30e 100644 --- a/hw/rtl/afu/xrt/vortex_afu.vh +++ b/hw/rtl/afu/xrt/vortex_afu.vh @@ -14,28 +14,14 @@ `ifndef VORTEX_AFU_VH `define VORTEX_AFU_VH +// Synthesis-time base offset added to every m_axi_mem_ outgoing AXI byte +// address so that Vortex's compile-time absolute addresses (STARTUP_ADDR, +// STACK_BASE_ADDR, ...) land inside the xrt::bo allocation that XRT placed +// in that bank. `ifndef PLATFORM_MEMORY_OFFSET `define PLATFORM_MEMORY_OFFSET 0 `endif -// Per-bank XRT BO base address. Each m_axi_mem_ port's outgoing AXI byte -// address gets this offset added so that Vortex's compile-time absolute -// addresses (STARTUP_ADDR, STACK_BASE_ADDR, ...) land inside the xrt::bo -// allocation that XRT placed in that bank. Defaults to PLATFORM_MEMORY_OFFSET -// (single offset for all banks) for back-compatibility. 
-`ifndef PLATFORM_MEMORY_OFFSET_0 -`define PLATFORM_MEMORY_OFFSET_0 `PLATFORM_MEMORY_OFFSET -`endif -`ifndef PLATFORM_MEMORY_OFFSET_1 -`define PLATFORM_MEMORY_OFFSET_1 `PLATFORM_MEMORY_OFFSET -`endif -`ifndef PLATFORM_MEMORY_OFFSET_2 -`define PLATFORM_MEMORY_OFFSET_2 `PLATFORM_MEMORY_OFFSET -`endif -`ifndef PLATFORM_MEMORY_OFFSET_3 -`define PLATFORM_MEMORY_OFFSET_3 `PLATFORM_MEMORY_OFFSET -`endif - `ifndef PLATFORM_MEMORY_ID_WIDTH `define PLATFORM_MEMORY_ID_WIDTH 32 `endif diff --git a/hw/syn/xilinx/xrt/platforms.mk b/hw/syn/xilinx/xrt/platforms.mk index f732a40ab1..1288fbf4de 100644 --- a/hw/syn/xilinx/xrt/platforms.mk +++ b/hw/syn/xilinx/xrt/platforms.mk @@ -35,7 +35,7 @@ else ifneq ($(findstring xilinx_u250,$(XSA)),) # see follow-up PR for a DCR-based runtime path. CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=34 VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:DDR[0] - CONFIGS += -DPLATFORM_MEMORY_OFFSET_0=40\'h4000000000 + CONFIGS += -DPLATFORM_MEMORY_OFFSET=40\'h4000000000 else ifneq ($(findstring xilinx_u200,$(XSA)),) # 64 GB of DDR4 with 4 channels (16 GB per channel) CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36