Skip to content

Commit 4326d02

Browse files
committed
Add a w/a to custom_inclusive_scan_over_group
1 parent e8af8c9 commit 4326d02

1 file changed

Lines changed: 14 additions & 3 deletions

File tree

dpctl/tensor/libtensor/include/utils/sycl_utils.hpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,20 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
268268
const bool in_range = (lane_id < n_aggregates);
269269
const bool in_bounds = in_range && (lane_id > 0 || large_wg);
270270

271-
T __scan_val = (in_bounds)
272-
? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
273-
: identity;
271+
// Here is a bug where IGC incorrectly optimized the below code:
272+
// T __scan_val = (in_bounds)
273+
// ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
274+
// : identity;
275+
// That causes `__scan_val` is not initialized with `identity` value:
276+
// wgs = 256, max_sgSize = 16 => n_aggregates = 16
277+
// wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
278+
// The w/s adds SYCL atomic fence, since the explicit memory fence
279+
// prevents reordering/elimination, while it will add slight overhead.
280+
T __scan_val = identity;
281+
sycl::atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::work_item);
282+
if (in_bounds) {
283+
__scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];
284+
}
274285
for (std::uint32_t step = 1; step < sgSize; step *= 2) {
275286
const bool advanced_lane = (lane_id >= step);
276287
const std::uint32_t src_lane_id =

0 commit comments

Comments
 (0)