Skip to content

Commit 1e70e3b

Browse files
committed
Fix accumulate correctness by reducing block size to 64
Work around a Blelloch parallel prefix sum correctness issue on Intel GPUs at block sizes >= 128 by defaulting to a block size of 64.
1 parent 383ef67 commit 1e70e3b

1 file changed

Lines changed: 14 additions & 6 deletions

File tree

src/accumulate.jl

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,20 @@ import oneAPI
2 2
import oneAPI: oneArray, oneAPIBackend
3 3
import AcceleratedKernels as AK
4 4

5+
# Use a smaller block size on Intel GPUs to work around a scan correctness issue
6+
# with the Blelloch parallel prefix sum at larger block sizes (>=128).
7+
const _ACCUMULATE_BLOCK_SIZE = 64
8+
5 9
# Accumulate operations using AcceleratedKernels
6-
Base.accumulate!(op, B::oneArray, A::oneArray; init = zero(eltype(A)), kwargs...) =
7-
AK.accumulate!(op, B, A, oneAPIBackend(); init, kwargs...)
10+
Base.accumulate!(op, B::oneArray, A::oneArray; init = zero(eltype(A)),
11+
block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
12+
AK.accumulate!(op, B, A, oneAPIBackend(); init, block_size, kwargs...)
8 13

9-
Base.accumulate(op, A::oneArray; init = zero(eltype(A)), kwargs...) =
10-
AK.accumulate(op, A, oneAPIBackend(); init, kwargs...)
14+
Base.accumulate(op, A::oneArray; init = zero(eltype(A)),
15+
block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
16+
AK.accumulate(op, A, oneAPIBackend(); init, block_size, kwargs...)
11 17

12-
Base.cumsum(src::oneArray; kwargs...) = AK.cumsum(src, oneAPIBackend(); kwargs...)
13-
Base.cumprod(src::oneArray; kwargs...) = AK.cumprod(src, oneAPIBackend(); kwargs...)
18+
Base.cumsum(src::oneArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
19+
AK.cumsum(src, oneAPIBackend(); block_size, kwargs...)
20+
Base.cumprod(src::oneArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
21+
AK.cumprod(src, oneAPIBackend(); block_size, kwargs...)

0 commit comments

Comments
 (0)