@@ -230,12 +230,13 @@ struct ContigBooleanReduction
230230 const size_t red_gws_ = it.get_global_range (0 ) / iter_gws_;
231231 const size_t reduction_id = it.get_global_id (0 ) / red_gws_;
232232 const size_t reduction_batch_id = get_reduction_batch_id (it);
233- size_t wg_size = it.get_local_range (0 );
233+ const size_t wg_size = it.get_local_range (0 );
234234
235- size_t base = reduction_id * reduction_max_gid_;
236- size_t start = base + reduction_batch_id * wg_size * reductions_per_wi;
237- size_t end = std::min ((start + (reductions_per_wi * wg_size)),
238- base + reduction_max_gid_);
235+ const size_t base = reduction_id * reduction_max_gid_;
236+ const size_t start =
237+ base + reduction_batch_id * wg_size * reductions_per_wi;
238+ const size_t end = std::min ((start + (reductions_per_wi * wg_size)),
239+ base + reduction_max_gid_);
239240 // reduction and atomic operations are performed
240241 // in group_op_
241242 group_op_ (it, out_, reduction_id, inp_ + start, inp_ + end);
@@ -447,21 +448,21 @@ struct StridedBooleanReduction
447448 outT local_red_val (identity_);
448449 size_t arg_reduce_gid0 =
449450 reduction_lid + reduction_batch_id * wg_size * reductions_per_wi;
450- for (size_t m = 0 ; m < reductions_per_wi; ++m) {
451- size_t arg_reduce_gid = arg_reduce_gid0 + m * wg_size;
452-
453- if (arg_reduce_gid < reduction_max_gid_) {
454- py::ssize_t inp_reduction_offset = static_cast <py::ssize_t >(
455- inp_reduced_dims_indexer_ (arg_reduce_gid));
456- py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
451+ size_t arg_reduce_gid_max = std::min (
452+ reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg_size);
453+ for (size_t arg_reduce_gid = arg_reduce_gid0;
454+ arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg_size)
455+ {
456+ py::ssize_t inp_reduction_offset = static_cast <py::ssize_t >(
457+ inp_reduced_dims_indexer_ (arg_reduce_gid));
458+ py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
457459
458- // must convert to boolean first to handle nans
459- using dpctl::tensor::type_utils::convert_impl;
460- bool val = convert_impl<bool , argT>(inp_[inp_offset]);
461- ReductionOp op = reduction_op_;
460+ // must convert to boolean first to handle nans
461+ using dpctl::tensor::type_utils::convert_impl;
462+ bool val = convert_impl<bool , argT>(inp_[inp_offset]);
463+ ReductionOp op = reduction_op_;
462464
463- local_red_val = op (local_red_val, static_cast <outT>(val));
464- }
465+ local_red_val = op (local_red_val, static_cast <outT>(val));
465466 }
466467 // reduction and atomic operations are performed
467468 // in group_op_
0 commit comments