From 4326d024770e956e9b6f5298532eff29c6ca206c Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Thu, 9 Apr 2026 15:25:46 +0200
Subject: [PATCH 1/3] Add a w/a to custom_inclusive_scan_over_group

---
 .../libtensor/include/utils/sycl_utils.hpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index f78193e614..f8a2226c21 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -268,9 +268,20 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
     const bool in_range = (lane_id < n_aggregates);
     const bool in_bounds = in_range && (lane_id > 0 || large_wg);
 
-    T __scan_val = (in_bounds)
-                       ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
-                       : identity;
+    // There is a bug where IGC incorrectly optimizes the code below:
+    // T __scan_val = (in_bounds)
+    //                    ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
+    //                    : identity;
+    // That causes `__scan_val` to not be initialized with the `identity` value:
+    // wgs = 256, max_sgSize = 16 => n_aggregates = 16
+    // wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
+    // The w/a adds a SYCL atomic fence, since the explicit memory fence
+    // prevents reordering/elimination, though it adds slight overhead.
+    T __scan_val = identity;
+    sycl::atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::work_item);
+    if (in_bounds) {
+        __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];
+    }
     for (std::uint32_t step = 1; step < sgSize; step *= 2) {
         const bool advanced_lane = (lane_id >= step);
         const std::uint32_t src_lane_id =

From b5f77f47f2866b04c8a78d2246a29bc5b1ed76b6 Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Thu, 9 Apr 2026 15:30:58 +0200
Subject: [PATCH 2/3] Apply pre-commit formatting rule

---
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index f8a2226c21..81cee7b105 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -278,7 +278,8 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
     // The w/a adds a SYCL atomic fence, since the explicit memory fence
     // prevents reordering/elimination, though it adds slight overhead.
     T __scan_val = identity;
-    sycl::atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::work_item);
+    sycl::atomic_fence(sycl::memory_order::seq_cst,
+                       sycl::memory_scope::work_item);
     if (in_bounds) {
         __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];
     }

From b495d8e03a8e243554e119f97961cb8995d95a78 Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Thu, 9 Apr 2026 17:51:20 +0200
Subject: [PATCH 3/3] Use relaxed memory order

---
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 81cee7b105..23b184be98 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -278,7 +278,7 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
     // The w/a adds a SYCL atomic fence, since the explicit memory fence
     // prevents reordering/elimination, though it adds slight overhead.
     T __scan_val = identity;
-    sycl::atomic_fence(sycl::memory_order::seq_cst,
+    sycl::atomic_fence(sycl::memory_order::relaxed,
                        sycl::memory_scope::work_item);
     if (in_bounds) {
         __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];