Strided boolean reduction loop tweak

ndgrigorian · ndgrigorian · commit da1cc5bbb532 · 2023-07-29T22:05:09.000-07:00
- Adjusted to reduce branching and hopefully improve vectorization of the loop by removing a conditional
diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
@@ -230,12 +230,13 @@ struct ContigBooleanReduction
         const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
         const size_t reduction_id = it.get_global_id(0) / red_gws_;
         const size_t reduction_batch_id = get_reduction_batch_id(it);
-        size_t wg_size = it.get_local_range(0);
+        const size_t wg_size = it.get_local_range(0);
 
-        size_t base = reduction_id * reduction_max_gid_;
-        size_t start = base + reduction_batch_id * wg_size * reductions_per_wi;
-        size_t end = std::min((start + (reductions_per_wi * wg_size)),
-                              base + reduction_max_gid_);
+        const size_t base = reduction_id * reduction_max_gid_;
+        const size_t start =
+            base + reduction_batch_id * wg_size * reductions_per_wi;
+        const size_t end = std::min((start + (reductions_per_wi * wg_size)),
+                                    base + reduction_max_gid_);
         // reduction and atomic operations are performed
         // in group_op_
         group_op_(it, out_, reduction_id, inp_ + start, inp_ + end);
@@ -447,21 +448,21 @@ struct StridedBooleanReduction
         outT local_red_val(identity_);
         size_t arg_reduce_gid0 =
             reduction_lid + reduction_batch_id * wg_size * reductions_per_wi;
-        for (size_t m = 0; m < reductions_per_wi; ++m) {
-            size_t arg_reduce_gid = arg_reduce_gid0 + m * wg_size;
-
-            if (arg_reduce_gid < reduction_max_gid_) {
-                py::ssize_t inp_reduction_offset = static_cast<py::ssize_t>(
-                    inp_reduced_dims_indexer_(arg_reduce_gid));
-                py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
+        size_t arg_reduce_gid_max = std::min(
+            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg_size);
+        for (size_t arg_reduce_gid = arg_reduce_gid0;
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg_size)
+        {
+            py::ssize_t inp_reduction_offset = static_cast<py::ssize_t>(
+                inp_reduced_dims_indexer_(arg_reduce_gid));
+            py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
 
-                // must convert to boolean first to handle nans
-                using dpctl::tensor::type_utils::convert_impl;
-                bool val = convert_impl<bool, argT>(inp_[inp_offset]);
-                ReductionOp op = reduction_op_;
+            // must convert to boolean first to handle nans
+            using dpctl::tensor::type_utils::convert_impl;
+            bool val = convert_impl<bool, argT>(inp_[inp_offset]);
+            ReductionOp op = reduction_op_;
 
-                local_red_val = op(local_red_val, static_cast<outT>(val));
-            }
+            local_red_val = op(local_red_val, static_cast<outT>(val));
         }
         // reduction and atomic operations are performed
         // in group_op_