@@ -2,12 +2,20 @@ import oneAPI
22import oneAPI: oneArray, oneAPIBackend
33import AcceleratedKernels as AK
44
5+ # Use a smaller block size on Intel GPUs to work around a scan correctness issue
6+ # with the Blelloch parallel prefix sum at larger block sizes (>=128).
# This constant is the default value of the `block_size` keyword in the
# accumulate!/accumulate/cumsum/cumprod methods added in this hunk. Because it
# is only a keyword default, a caller can still pass `block_size` explicitly.
# NOTE(review): an explicit `block_size >= 128` would bypass the workaround
# described above — confirm whether that should be guarded against.
7+ const _ACCUMULATE_BLOCK_SIZE = 64
8+
59# Accumulate operations using AcceleratedKernels
# accumulate! for oneArray: forwards to AK.accumulate! with oneAPIBackend().
# Removed (-) version: passed only `init` (default zero(eltype(A))) and the
# remaining keyword arguments through.
# Added (+) version: also takes a `block_size` keyword, defaulting to
# _ACCUMULATE_BLOCK_SIZE, and forwards it explicitly alongside `init`.
6- Base. accumulate! (op, B:: oneArray , A:: oneArray ; init = zero (eltype (A)), kwargs... ) =
7- AK. accumulate! (op, B, A, oneAPIBackend (); init, kwargs... )
10+ Base. accumulate! (op, B:: oneArray , A:: oneArray ; init = zero (eltype (A)),
11+ block_size = _ACCUMULATE_BLOCK_SIZE, kwargs... ) =
12+ AK. accumulate! (op, B, A, oneAPIBackend (); init, block_size, kwargs... )
813
# accumulate for oneArray: forwards to AK.accumulate with oneAPIBackend().
# Removed (-) version: passed only `init` (default zero(eltype(A))) and the
# remaining keyword arguments through.
# Added (+) version: also takes a `block_size` keyword, defaulting to
# _ACCUMULATE_BLOCK_SIZE, and forwards it explicitly alongside `init`.
9- Base. accumulate (op, A:: oneArray ; init = zero (eltype (A)), kwargs... ) =
10- AK. accumulate (op, A, oneAPIBackend (); init, kwargs... )
14+ Base. accumulate (op, A:: oneArray ; init = zero (eltype (A)),
15+ block_size = _ACCUMULATE_BLOCK_SIZE, kwargs... ) =
16+ AK. accumulate (op, A, oneAPIBackend (); init, block_size, kwargs... )
1117
# cumsum / cumprod for oneArray: each forwards to the AK function of the same
# name with oneAPIBackend().
# Removed (-) versions: passed all keyword arguments through unchanged.
# Added (+) versions: take a `block_size` keyword, defaulting to
# _ACCUMULATE_BLOCK_SIZE, and forward it explicitly before the remaining
# keyword arguments.
12- Base. cumsum (src:: oneArray ; kwargs... ) = AK. cumsum (src, oneAPIBackend (); kwargs... )
13- Base. cumprod (src:: oneArray ; kwargs... ) = AK. cumprod (src, oneAPIBackend (); kwargs... )
18+ Base. cumsum (src:: oneArray ; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs... ) =
19+ AK. cumsum (src, oneAPIBackend (); block_size, kwargs... )
20+ Base. cumprod (src:: oneArray ; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs... ) =
21+ AK. cumprod (src, oneAPIBackend (); block_size, kwargs... )
0 commit comments