File tree Expand file tree Collapse file tree
dpctl/tensor/libtensor/include/utils Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -268,9 +268,20 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
268268 const bool in_range = (lane_id < n_aggregates);
269269 const bool in_bounds = in_range && (lane_id > 0 || large_wg);
270270
271- T __scan_val = (in_bounds)
272- ? local_mem_acc[(offset + lane_id) * max_sgSize - 1 ]
273- : identity;
271+ // Here is a bug where IGC incorrectly optimized the below code:
272+ // T __scan_val = (in_bounds)
273+ // ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
274+ // : identity;
275+ // That causes `__scan_val` is not initialized with `identity` value:
276+ // wgs = 256, max_sgSize = 16 => n_aggregates = 16
277+ // wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
278+ // The w/s adds SYCL atomic fence, since the explicit memory fence
279+ // prevents reordering/elimination, while it will add slight overhead.
280+ T __scan_val = identity;
281+ sycl::atomic_fence (sycl::memory_order::seq_cst, sycl::memory_scope::work_item);
282+ if (in_bounds) {
283+ __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1 ];
284+ }
274285 for (std::uint32_t step = 1 ; step < sgSize; step *= 2 ) {
275286 const bool advanced_lane = (lane_id >= step);
276287 const std::uint32_t src_lane_id =
You can’t perform that action at this time.
0 commit comments