Skip to content

Commit 9b164e2

Browse files
committed
The great Thrust index type fix, part 6: fix the extrema algos.
1 parent 0d17b82 commit 9b164e2

6 files changed

Lines changed: 139 additions & 40 deletions

File tree

testing/max_element.cu

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,20 @@ void TestMaxElementDispatchImplicit()
105105
}
106106
DECLARE_UNITTEST(TestMaxElementDispatchImplicit);
107107

108+
// Regression check: max_element must handle ranges whose length exceeds
// what a 32-bit index can address. The range [1, 2^magnitude] has its
// maximum (2^magnitude) at the very end, so a truncated index would
// return the wrong element.
void TestMaxElementWithBigIndexesHelper(int magnitude)
{
    thrust::counting_iterator<long long> first(1);
    thrust::counting_iterator<long long> last = first + (1ll << magnitude);
    ASSERT_EQUAL(thrust::distance(first, last), 1ll << magnitude);

    ASSERT_EQUAL(*thrust::max_element(thrust::device, first, last), (1ll << magnitude));
}

// Exercise sizes straddling the 32-bit boundary (2^30 .. 2^33 elements).
void TestMaxElementWithBigIndexes()
{
    TestMaxElementWithBigIndexesHelper(30);
    TestMaxElementWithBigIndexesHelper(31);
    TestMaxElementWithBigIndexesHelper(32);
    TestMaxElementWithBigIndexesHelper(33);
}
DECLARE_UNITTEST(TestMaxElementWithBigIndexes);

testing/min_element.cu

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,22 @@ void TestMinElementDispatchImplicit()
103103
}
104104
DECLARE_UNITTEST(TestMinElementDispatchImplicit);
105105

106+
// Regression check: min_element must handle ranges longer than a 32-bit
// index can address. Using thrust::greater as the comparator, the
// "minimum" of [1, 2^magnitude] is the largest value, 2^magnitude,
// located at the end of the range — so index truncation would misfire.
void TestMinElementWithBigIndexesHelper(int magnitude)
{
    thrust::counting_iterator<long long> first(1);
    thrust::counting_iterator<long long> last = first + (1ll << magnitude);
    ASSERT_EQUAL(thrust::distance(first, last), 1ll << magnitude);

    ASSERT_EQUAL(
        *thrust::min_element(thrust::device, first, last, thrust::greater<long long>()),
        (1ll << magnitude));
}

// Exercise sizes straddling the 32-bit boundary (2^30 .. 2^33 elements).
void TestMinElementWithBigIndexes()
{
    TestMinElementWithBigIndexesHelper(30);
    TestMinElementWithBigIndexesHelper(31);
    TestMinElementWithBigIndexesHelper(32);
    TestMinElementWithBigIndexesHelper(33);
}
DECLARE_UNITTEST(TestMinElementWithBigIndexes);

testing/minmax_element.cu

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,29 @@ void TestMinMaxElementDispatchImplicit()
110110
}
111111
DECLARE_UNITTEST(TestMinMaxElementDispatchImplicit);
112112

113+
// Regression check: minmax_element over ranges longer than a 32-bit
// index can address. For [1, 2^magnitude], the minimum (1) sits at the
// front and the maximum (2^magnitude) at the back; with thrust::greater
// the pair is swapped. Either extremum would be wrong under index
// truncation.
void TestMinMaxElementWithBigIndexesHelper(int magnitude)
{
    typedef thrust::counting_iterator<long long> Iter;
    Iter first(1);
    Iter last = first + (1ll << magnitude);
    ASSERT_EQUAL(thrust::distance(first, last), 1ll << magnitude);

    // Default comparator: first = min, second = max.
    thrust::pair<Iter, Iter> extrema =
        thrust::minmax_element(thrust::device, first, last);
    ASSERT_EQUAL(*extrema.first, 1);
    ASSERT_EQUAL(*extrema.second, (1ll << magnitude));

    // Reversed comparator: the roles of first/second flip.
    extrema = thrust::minmax_element(thrust::device, first, last,
                                     thrust::greater<long long>());
    ASSERT_EQUAL(*extrema.second, 1);
    ASSERT_EQUAL(*extrema.first, (1ll << magnitude));
}

// Exercise sizes straddling the 32-bit boundary (2^30 .. 2^33 elements).
void TestMinMaxElementWithBigIndexes()
{
    TestMinMaxElementWithBigIndexesHelper(30);
    TestMinMaxElementWithBigIndexesHelper(31);
    TestMinMaxElementWithBigIndexesHelper(32);
    TestMinMaxElementWithBigIndexesHelper(33);
}
DECLARE_UNITTEST(TestMinMaxElementWithBigIndexes);

thrust/system/cuda/detail/extrema.h

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,8 @@ namespace __extrema {
161161
using core::get_agent_plan;
162162
using core::cuda_optional;
163163

164+
typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
165+
164166
if (num_items == 0)
165167
return cudaErrorNotSupported;
166168

@@ -195,16 +197,14 @@ namespace __extrema {
195197
cuda_optional<int> sm_count = core::get_sm_count();
196198
CUDA_CUB_RET_IF_FAIL(sm_count.status());
197199

198-
typedef __reduce::GridSizeType GridSizeType;
199-
200200
// reduction will not use more cta counts than requested
201201
cuda_optional<int> max_blocks_per_sm =
202202
reduce_agent::
203203
template get_max_blocks_per_sm<InputIt,
204204
OutputIt,
205205
Size,
206-
cub::GridEvenShare<GridSizeType>,
207-
cub::GridQueue<GridSizeType>,
206+
cub::GridEvenShare<Size>,
207+
cub::GridQueue<UnsignedSize>,
208208
ReductionOp>(reduce_plan);
209209
CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
210210

@@ -215,8 +215,8 @@ namespace __extrema {
215215
int sm_oversubscription = 5;
216216
int max_blocks = reduce_device_occupancy * sm_oversubscription;
217217

218-
cub::GridEvenShare<GridSizeType> even_share;
219-
even_share.DispatchInit(static_cast<int>(num_items), max_blocks,
218+
cub::GridEvenShare<Size> even_share;
219+
even_share.DispatchInit(num_items, max_blocks,
220220
reduce_plan.items_per_tile);
221221

222222
// we will launch at most "max_blocks" blocks in a grid
@@ -230,7 +230,7 @@ namespace __extrema {
230230
size_t allocation_sizes[3] =
231231
{
232232
max_blocks * sizeof(T), // bytes needed for privatized block reductions
233-
cub::GridQueue<GridSizeType>::AllocationSize(), // bytes needed for grid queue descriptor
233+
cub::GridQueue<UnsignedSize>::AllocationSize(), // bytes needed for grid queue descriptor
234234
vshmem_size // size of virtualized shared memory storage
235235
};
236236
status = cub::AliasTemporaries(d_temp_storage,
@@ -244,7 +244,7 @@ namespace __extrema {
244244
}
245245

246246
T *d_block_reductions = (T*) allocations[0];
247-
cub::GridQueue<GridSizeType> queue(allocations[1]);
247+
cub::GridQueue<UnsignedSize> queue(allocations[1]);
248248
char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
249249

250250

@@ -321,14 +321,10 @@ namespace __extrema {
321321
bool debug_sync = THRUST_DEBUG_SYNC_FLAG;
322322

323323
cudaError_t status;
324-
status = doit_step<T>(NULL,
325-
temp_storage_bytes,
326-
first,
327-
num_items,
328-
binary_op,
329-
reinterpret_cast<T*>(NULL),
330-
stream,
331-
debug_sync);
324+
THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
325+
(NULL, temp_storage_bytes, first, num_items_fixed,
326+
binary_op, reinterpret_cast<T*>(NULL), stream,
327+
debug_sync));
332328
cuda_cub::throw_on_error(status, "extrema failed on 1st step");
333329

334330
size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes};
@@ -354,14 +350,10 @@ namespace __extrema {
354350

355351
T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
356352

357-
status = doit_step<T>(allocations[1],
358-
temp_storage_bytes,
359-
first,
360-
num_items,
361-
binary_op,
362-
d_result,
363-
stream,
364-
debug_sync);
353+
THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
354+
(allocations[1], temp_storage_bytes, first,
355+
num_items_fixed, binary_op, d_result, stream,
356+
debug_sync));
365357
cuda_cub::throw_on_error(status, "extrema failed on 2nd step");
366358

367359
status = cuda_cub::synchronize(policy);
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* Copyright 2019 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once

THRUST_BEGIN_NS
namespace cuda_cub {
namespace detail {

// Maps a signed size/index type to an unsigned counterpart that is safe
// to use with CUDA device atomics. Only the index types Thrust actually
// dispatches on are specialized; any other instantiation is a
// compile-time error by design.
template <typename Size>
struct make_unsigned_special;

template <>
struct make_unsigned_special<int>
{
  typedef unsigned int type;
};

// `long` deliberately maps to `unsigned long long` rather than
// `unsigned long`: CUDA's atomicAdd has no overload for unsigned long,
// for some godforsaken reason.
template <>
struct make_unsigned_special<long>
{
  typedef unsigned long long type;
};

template <>
struct make_unsigned_special<long long>
{
  typedef unsigned long long type;
};

} // namespace detail
} // namespace cuda_cub
THRUST_END_NS
41+

thrust/system/cuda/detail/reduce.h

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include <thrust/system/cuda/detail/par_to_seq.h>
4040
#include <thrust/system/cuda/detail/get_value.h>
4141
#include <thrust/system/cuda/detail/dispatch.h>
42+
#include <thrust/system/cuda/detail/make_unsigned_special.h>
4243
#include <thrust/functional.h>
4344
#include <thrust/system/cuda/detail/core/agent_launcher.h>
4445
#include <thrust/detail/minmax.h>
@@ -64,9 +65,6 @@ namespace cuda_cub {
6465

6566
namespace __reduce {
6667

67-
// XXX should GridSizeType also be able to accommodate 64 bit integers
68-
typedef int GridSizeType;
69-
7068
template<bool>
7169
struct is_true : thrust::detail::false_type {};
7270
template<>
@@ -149,6 +147,8 @@ namespace __reduce {
149147
class ReductionOp>
150148
struct ReduceAgent
151149
{
150+
typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
151+
152152
template<class Arch>
153153
struct PtxPlan : Tuning<Arch,T>::type
154154
{
@@ -457,8 +457,8 @@ namespace __reduce {
457457
//
458458
THRUST_DEVICE_FUNCTION T
459459
consume_tiles(Size /*num_items*/,
460-
cub::GridEvenShare<GridSizeType> &even_share,
461-
cub::GridQueue<GridSizeType> & /*queue*/,
460+
cub::GridEvenShare<Size> &even_share,
461+
cub::GridQueue<UnsignedSize> & /*queue*/,
462462
thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_RAKE> /*is_rake*/)
463463
{
464464
typedef is_true<ATTEMPT_VECTORIZATION> attempt_vec;
@@ -488,7 +488,7 @@ namespace __reduce {
488488
template <class CAN_VECTORIZE>
489489
THRUST_DEVICE_FUNCTION T
490490
consume_tiles_impl(Size num_items,
491-
cub::GridQueue<GridSizeType> queue,
491+
cub::GridQueue<UnsignedSize> queue,
492492
CAN_VECTORIZE can_vectorize)
493493
{
494494
using core::sync_threadblock;
@@ -575,8 +575,8 @@ namespace __reduce {
575575
THRUST_DEVICE_FUNCTION T
576576
consume_tiles(
577577
Size num_items,
578-
cub::GridEvenShare<GridSizeType> &/*even_share*/,
579-
cub::GridQueue<GridSizeType> & queue,
578+
cub::GridEvenShare<Size> &/*even_share*/,
579+
cub::GridQueue<UnsignedSize> & queue,
580580
thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_DYNAMIC>)
581581
{
582582
typedef is_true<ATTEMPT_VECTORIZATION> attempt_vec;
@@ -643,8 +643,8 @@ namespace __reduce {
643643
THRUST_AGENT_ENTRY(InputIt input_it,
644644
OutputIt output_it,
645645
Size num_items,
646-
cub::GridEvenShare<GridSizeType> even_share,
647-
cub::GridQueue<GridSizeType> queue,
646+
cub::GridEvenShare<Size> even_share,
647+
cub::GridQueue<UnsignedSize> queue,
648648
ReductionOp reduction_op,
649649
char * shmem)
650650
{
@@ -664,6 +664,8 @@ namespace __reduce {
664664
template<class Size>
665665
struct DrainAgent
666666
{
667+
typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
668+
667669
template <class Arch>
668670
struct PtxPlan : PtxPolicy<1> {};
669671
typedef core::specialize_plan<PtxPlan> ptx_plan;
@@ -672,7 +674,7 @@ namespace __reduce {
672674
// Agent entry point
673675
//---------------------------------------------------------------------
674676

675-
THRUST_AGENT_ENTRY(cub::GridQueue<GridSizeType> grid_queue,
677+
THRUST_AGENT_ENTRY(cub::GridQueue<UnsignedSize> grid_queue,
676678
Size num_items,
677679
char * /*shmem*/)
678680
{
@@ -702,6 +704,8 @@ namespace __reduce {
702704
using core::get_agent_plan;
703705
using core::cuda_optional;
704706

707+
typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
708+
705709
if (num_items == 0)
706710
return cudaErrorNotSupported;
707711

@@ -742,8 +746,8 @@ namespace __reduce {
742746
template get_max_blocks_per_sm<InputIt,
743747
OutputIt,
744748
Size,
745-
cub::GridEvenShare<GridSizeType>,
746-
cub::GridQueue<GridSizeType>,
749+
cub::GridEvenShare<Size>,
750+
cub::GridQueue<UnsignedSize>,
747751
ReductionOp>(reduce_plan);
748752
CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
749753

@@ -754,7 +758,7 @@ namespace __reduce {
754758
int sm_oversubscription = 5;
755759
int max_blocks = reduce_device_occupancy * sm_oversubscription;
756760

757-
cub::GridEvenShare<GridSizeType> even_share;
761+
cub::GridEvenShare<Size> even_share;
758762
even_share.DispatchInit(static_cast<int>(num_items), max_blocks,
759763
reduce_plan.items_per_tile);
760764

@@ -769,7 +773,7 @@ namespace __reduce {
769773
size_t allocation_sizes[3] =
770774
{
771775
max_blocks * sizeof(T), // bytes needed for privatized block reductions
772-
cub::GridQueue<GridSizeType>::AllocationSize(), // bytes needed for grid queue descriptor
776+
cub::GridQueue<UnsignedSize>::AllocationSize(), // bytes needed for grid queue descriptor
773777
vshmem_size // size of virtualized shared memory storage
774778
};
775779
status = cub::AliasTemporaries(d_temp_storage,
@@ -783,7 +787,7 @@ namespace __reduce {
783787
}
784788

785789
T *d_block_reductions = (T*) allocations[0];
786-
cub::GridQueue<GridSizeType> queue(allocations[1]);
790+
cub::GridQueue<UnsignedSize> queue(allocations[1]);
787791
char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
788792

789793

0 commit comments

Comments
 (0)