|
6 | 6 |
|
7 | 7 | # Widen sub-word types to avoid shared memory corruption on Intel GPUs. |
8 | 8 | # Writing 1/2-byte values to local memory can clobber adjacent bytes. |
9 | | -@inline _widen_type(::Type{T}) where T = sizeof(T) < 4 ? Int32 : T |
| 9 | +# Widening only applies to Bool and 8/16-bit integer types, where `%` conversion is valid; every other type is returned unchanged.
| 10 | +@inline _widen_type(::Type{Bool}) = Int32 |
| 11 | +@inline _widen_type(::Type{Int8}) = Int32 |
| 12 | +@inline _widen_type(::Type{UInt8}) = Int32 |
| 13 | +@inline _widen_type(::Type{Int16}) = Int32 |
| 14 | +@inline _widen_type(::Type{UInt16}) = Int32 |
| 15 | +@inline _widen_type(::Type{T}) where T = T |
| 16 | + |
| 17 | +# Dispatch-based conversions: a value that already has the target type is returned as-is, so `%` is never generated for types that are not widened (e.g. non-integer types)
| 18 | +@inline _to_wide(val, ::Type{W}) where W = val % W |
| 19 | +@inline _to_wide(val::T, ::Type{T}) where T = val |
| 20 | +@inline _from_wide(val, ::Type{T}) where T = val % T |
| 21 | +@inline _from_wide(val::T, ::Type{T}) where T = val |
10 | 22 |
|
11 | 23 | # Reduce a value across a group, using local memory for communication |
12 | 24 | @inline function reduce_group(op, val::T, neutral, ::Val{maxitems}) where {T, maxitems} |
|
16 | 28 | # use a wider type for shared memory to avoid sub-word corruption |
17 | 29 | W = _widen_type(T) |
18 | 30 | shared = oneLocalArray(W, (maxitems,)) |
19 | | - @inbounds shared[item] = val % W |
| 31 | + @inbounds shared[item] = _to_wide(val, W) |
20 | 32 |
|
21 | 33 | # perform a reduction |
22 | 34 | d = 1 |
|
25 | 37 | index = 2 * d * (item-1) + 1 |
26 | 38 | @inbounds if index <= items |
27 | 39 | other_val = if index + d <= items |
28 | | - shared[index+d] % T |
| 40 | + _from_wide(shared[index+d], T) |
29 | 41 | else |
30 | 42 | neutral |
31 | 43 | end |
32 | | - shared[index] = op(shared[index] % T, other_val) % W |
| 44 | + shared[index] = _to_wide(op(_from_wide(shared[index], T), other_val), W) |
33 | 45 | end |
34 | 46 | d *= 2 |
35 | 47 | end |
36 | 48 |
|
37 | 49 | # load the final value on the first item |
38 | 50 | if item == 1 |
39 | | - val = @inbounds shared[item] % T |
| 51 | + val = @inbounds _from_wide(shared[item], T) |
40 | 52 | end |
41 | 53 |
|
42 | 54 | return val |
|
0 commit comments