Skip to content

Commit da1cc5b

Browse files
committed
Strided boolean reduction loop tweak
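- Replaced the per-iteration bounds check in the strided reduction loop with a precomputed upper bound on the loop index (`arg_reduce_gid_max`), which removes a conditional from the loop body to reduce branching and, hopefully, improve vectorization of the loop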
1 parent a5aee5b commit da1cc5b

1 file changed

Lines changed: 19 additions & 18 deletions

File tree

dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp

Lines changed: 19 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -230,12 +230,13 @@ struct ContigBooleanReduction
230230
const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
231231
const size_t reduction_id = it.get_global_id(0) / red_gws_;
232232
const size_t reduction_batch_id = get_reduction_batch_id(it);
233-
size_t wg_size = it.get_local_range(0);
233+
const size_t wg_size = it.get_local_range(0);
234234

235-
size_t base = reduction_id * reduction_max_gid_;
236-
size_t start = base + reduction_batch_id * wg_size * reductions_per_wi;
237-
size_t end = std::min((start + (reductions_per_wi * wg_size)),
238-
base + reduction_max_gid_);
235+
const size_t base = reduction_id * reduction_max_gid_;
236+
const size_t start =
237+
base + reduction_batch_id * wg_size * reductions_per_wi;
238+
const size_t end = std::min((start + (reductions_per_wi * wg_size)),
239+
base + reduction_max_gid_);
239240
// reduction and atomic operations are performed
240241
// in group_op_
241242
group_op_(it, out_, reduction_id, inp_ + start, inp_ + end);
@@ -447,21 +448,21 @@ struct StridedBooleanReduction
447448
outT local_red_val(identity_);
448449
size_t arg_reduce_gid0 =
449450
reduction_lid + reduction_batch_id * wg_size * reductions_per_wi;
450-
for (size_t m = 0; m < reductions_per_wi; ++m) {
451-
size_t arg_reduce_gid = arg_reduce_gid0 + m * wg_size;
452-
453-
if (arg_reduce_gid < reduction_max_gid_) {
454-
py::ssize_t inp_reduction_offset = static_cast<py::ssize_t>(
455-
inp_reduced_dims_indexer_(arg_reduce_gid));
456-
py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
451+
size_t arg_reduce_gid_max = std::min(
452+
reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg_size);
453+
for (size_t arg_reduce_gid = arg_reduce_gid0;
454+
arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg_size)
455+
{
456+
py::ssize_t inp_reduction_offset = static_cast<py::ssize_t>(
457+
inp_reduced_dims_indexer_(arg_reduce_gid));
458+
py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
457459

458-
// must convert to boolean first to handle nans
459-
using dpctl::tensor::type_utils::convert_impl;
460-
bool val = convert_impl<bool, argT>(inp_[inp_offset]);
461-
ReductionOp op = reduction_op_;
460+
// must convert to boolean first to handle nans
461+
using dpctl::tensor::type_utils::convert_impl;
462+
bool val = convert_impl<bool, argT>(inp_[inp_offset]);
463+
ReductionOp op = reduction_op_;
462464

463-
local_red_val = op(local_red_val, static_cast<outT>(val));
464-
}
465+
local_red_val = op(local_red_val, static_cast<outT>(val));
465466
}
466467
// reduction and atomic operations are performed
467468
// in group_op_

0 commit comments

Comments
 (0)