Skip to content

Commit b2d6102

Browse files
committed
Fix mapreduce shared memory corruption for sub-word types
Widen sub-word element types (any type with sizeof(T) < 4, e.g. Int8/Int16/Float16) to Int32 before writing them to local memory, to avoid clobbering adjacent bytes. Adjust the local memory size calculation to use the widened element size accordingly.
1 parent 1e70e3b commit b2d6102

1 file changed

Lines changed: 13 additions & 8 deletions

File tree

src/mapreduce.jl

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,19 @@
44
# - serial version for lower latency
55
# - group-stride loop to delay need for second kernel launch
66

7+
# Widen sub-word types to avoid shared memory corruption on Intel GPUs.
8+
# Writing 1-byte or 2-byte values to local memory can clobber adjacent bytes.
9+
# Select the element type used for the local-memory scratch buffer:
# types that are at least 4 bytes wide are kept as-is, anything narrower
# is replaced by Int32.
# NOTE(review): the size test does not look at the kind of type, so
# non-integer types under 4 bytes (e.g. Float16) also map to Int32 —
# confirm that the conversion used at the call sites supports them.
@inline function _widen_type(::Type{T}) where {T}
    return sizeof(T) >= 4 ? T : Int32
end
10+
711
# Reduce a value across a group, using local memory for communication
812
@inline function reduce_group(op, val::T, neutral, ::Val{maxitems}) where {T, maxitems}
913
items = get_local_size()
1014
item = get_local_id()
1115

12-
# local mem for a complete reduction
13-
shared = oneLocalArray(T, (maxitems,))
14-
@inbounds shared[item] = val
16+
# use a wider type for shared memory to avoid sub-word corruption
17+
W = _widen_type(T)
18+
shared = oneLocalArray(W, (maxitems,))
19+
@inbounds shared[item] = val % W
1520

1621
# perform a reduction
1722
d = 1
@@ -20,18 +25,18 @@
2025
index = 2 * d * (item-1) + 1
2126
@inbounds if index <= items
2227
other_val = if index + d <= items
23-
shared[index+d]
28+
shared[index+d] % T
2429
else
2530
neutral
2631
end
27-
shared[index] = op(shared[index], other_val)
32+
shared[index] = op(shared[index] % T, other_val) % W
2833
end
2934
d *= 2
3035
end
3136

3237
# load the final value on the first item
3338
if item == 1
34-
val = @inbounds shared[item]
39+
val = @inbounds shared[item] % T
3540
end
3641

3742
return val
@@ -135,8 +140,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T},
135140
# that's why each items also loops across their inputs, processing multiple values
136141
# so that we can span the entire reduction dimension using a single item group.
137142

138-
# group size is restricted by local memory
139-
max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(T)
143+
# group size is restricted by local memory (use widened type for sub-word types)
144+
max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(_widen_type(T))
140145
max_items = min(compute_properties(device()).maxTotalGroupSize,
141146
compute_items(max_lmem_elements ÷ 2))
142147
# TODO: dynamic local memory to avoid two compilations

0 commit comments

Comments
 (0)