From 4326d024770e956e9b6f5298532eff29c6ca206c Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Thu, 9 Apr 2026 15:25:46 +0200
Subject: [PATCH 1/3] Add a w/a to custom_inclusive_scan_over_group

---
 .../libtensor/include/utils/sycl_utils.hpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index f78193e614..f8a2226c21 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -268,9 +268,20 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
     const bool in_range = (lane_id < n_aggregates);
     const bool in_bounds = in_range && (lane_id > 0 || large_wg);
 
-    T __scan_val = (in_bounds)
-                       ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
-                       : identity;
+    // There is a bug where IGC incorrectly optimizes the code below:
+    // T __scan_val = (in_bounds)
+    //                    ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
+    //                    : identity;
+    // That causes `__scan_val` to not be initialized with the `identity` value:
+    // wgs = 256, max_sgSize = 16 => n_aggregates = 16
+    // wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
+    // The w/a adds a SYCL atomic fence, since the explicit memory fence
+    // prevents reordering/elimination, though it adds slight overhead.
+    T __scan_val = identity;
+    sycl::atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::work_item);
+    if (in_bounds) {
+        __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];
+    }
     for (std::uint32_t step = 1; step < sgSize; step *= 2) {
         const bool advanced_lane = (lane_id >= step);
         const std::uint32_t src_lane_id =

From b5f77f47f2866b04c8a78d2246a29bc5b1ed76b6 Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Thu, 9 Apr 2026 15:30:58 +0200
Subject: [PATCH 2/3] Apply pre-commit formatting rule

---
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index f8a2226c21..81cee7b105 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -278,7 +278,8 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
     // The w/a adds a SYCL atomic fence, since the explicit memory fence
     // prevents reordering/elimination, though it adds slight overhead.
     T __scan_val = identity;
-    sycl::atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::work_item);
+    sycl::atomic_fence(sycl::memory_order::seq_cst,
+                       sycl::memory_scope::work_item);
     if (in_bounds) {
         __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];
     }

From b495d8e03a8e243554e119f97961cb8995d95a78 Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Thu, 9 Apr 2026 17:51:20 +0200
Subject: [PATCH 3/3] Use relaxed memory order

---
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 81cee7b105..23b184be98 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -278,7 +278,7 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
     // The w/a adds a SYCL atomic fence, since the explicit memory fence
     // prevents reordering/elimination, though it adds slight overhead.
     T __scan_val = identity;
-    sycl::atomic_fence(sycl::memory_order::seq_cst,
+    sycl::atomic_fence(sycl::memory_order::relaxed,
                        sycl::memory_scope::work_item);
     if (in_bounds) {
         __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];