|
4 | 4 | # - serial version for lower latency |
5 | 5 | # - group-stride loop to delay need for second kernel launch |
6 | 6 |
|
| 7 | +# Widen sub-word types to avoid shared memory corruption on Intel GPUs. |
| 8 | +# Writing 1/2-byte values to local memory can clobber adjacent bytes. |
| 9 | +@inline _widen_type(::Type{T}) where T = sizeof(T) < 4 ? Int32 : T |
| 10 | + |
7 | 11 | # Reduce a value across a group, using local memory for communication |
8 | 12 | @inline function reduce_group(op, val::T, neutral, ::Val{maxitems}) where {T, maxitems} |
9 | 13 | items = get_local_size() |
10 | 14 | item = get_local_id() |
11 | 15 |
|
12 | | - # local mem for a complete reduction |
13 | | - shared = oneLocalArray(T, (maxitems,)) |
14 | | - @inbounds shared[item] = val |
| 16 | + # use a wider type for shared memory to avoid sub-word corruption |
| 17 | + W = _widen_type(T) |
| 18 | + shared = oneLocalArray(W, (maxitems,)) |
| 19 | + @inbounds shared[item] = val % W |
15 | 20 |
|
16 | 21 | # perform a reduction |
17 | 22 | d = 1 |
|
20 | 25 | index = 2 * d * (item-1) + 1 |
21 | 26 | @inbounds if index <= items |
22 | 27 | other_val = if index + d <= items |
23 | | - shared[index+d] |
| 28 | + shared[index+d] % T |
24 | 29 | else |
25 | 30 | neutral |
26 | 31 | end |
27 | | - shared[index] = op(shared[index], other_val) |
| 32 | + shared[index] = op(shared[index] % T, other_val) % W |
28 | 33 | end |
29 | 34 | d *= 2 |
30 | 35 | end |
31 | 36 |
|
32 | 37 | # load the final value on the first item |
33 | 38 | if item == 1 |
34 | | - val = @inbounds shared[item] |
| 39 | + val = @inbounds shared[item] % T |
35 | 40 | end |
36 | 41 |
|
37 | 42 | return val |
@@ -135,8 +140,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T}, |
135 | 140 | # that's why each items also loops across their inputs, processing multiple values |
136 | 141 | # so that we can span the entire reduction dimension using a single item group. |
137 | 142 |
|
138 | | - # group size is restricted by local memory |
139 | | - max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(T) |
| 143 | + # group size is restricted by local memory (use widened type for sub-word types) |
| 144 | + max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(_widen_type(T)) |
140 | 145 | max_items = min(compute_properties(device()).maxTotalGroupSize, |
141 | 146 | compute_items(max_lmem_elements ÷ 2)) |
142 | 147 | # TODO: dynamic local memory to avoid two compilations |
|
0 commit comments