Skip to content

Commit 9b164e2

Browse files
committed
The great Thrust index type fix, part 6: fix the extrema algos.
1 parent 0d17b82 commit 9b164e2

6 files changed

Lines changed: 139 additions & 40 deletions

File tree

testing/max_element.cu

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,20 @@ void TestMaxElementDispatchImplicit()
105105
}
106106
DECLARE_UNITTEST(TestMaxElementDispatchImplicit);
107107

108+
// Regression check: max_element must handle ranges whose length exceeds
// what a 32-bit index can address. The range [1, 2^magnitude] has its
// maximum (2^magnitude) at the very end, so a truncated index would
// return the wrong element.
void TestMaxElementWithBigIndexesHelper(int magnitude)
{
    thrust::counting_iterator<long long> first(1);
    thrust::counting_iterator<long long> last = first + (1ll << magnitude);
    ASSERT_EQUAL(thrust::distance(first, last), 1ll << magnitude);

    ASSERT_EQUAL(*thrust::max_element(thrust::device, first, last), (1ll << magnitude));
}

// Exercise sizes straddling the 32-bit boundary (2^30 .. 2^33 elements).
void TestMaxElementWithBigIndexes()
{
    TestMaxElementWithBigIndexesHelper(30);
    TestMaxElementWithBigIndexesHelper(31);
    TestMaxElementWithBigIndexesHelper(32);
    TestMaxElementWithBigIndexesHelper(33);
}
DECLARE_UNITTEST(TestMaxElementWithBigIndexes);

testing/min_element.cu

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,22 @@ void TestMinElementDispatchImplicit()
103103
}
104104
DECLARE_UNITTEST(TestMinElementDispatchImplicit);
105105

106+
// Regression check: min_element must handle ranges longer than a 32-bit
// index can address. Using thrust::greater as the comparator, the
// "minimum" of [1, 2^magnitude] is the largest value, 2^magnitude,
// located at the end of the range — so index truncation would misfire.
void TestMinElementWithBigIndexesHelper(int magnitude)
{
    thrust::counting_iterator<long long> first(1);
    thrust::counting_iterator<long long> last = first + (1ll << magnitude);
    ASSERT_EQUAL(thrust::distance(first, last), 1ll << magnitude);

    ASSERT_EQUAL(
        *thrust::min_element(thrust::device, first, last, thrust::greater<long long>()),
        (1ll << magnitude));
}

// Exercise sizes straddling the 32-bit boundary (2^30 .. 2^33 elements).
void TestMinElementWithBigIndexes()
{
    TestMinElementWithBigIndexesHelper(30);
    TestMinElementWithBigIndexesHelper(31);
    TestMinElementWithBigIndexesHelper(32);
    TestMinElementWithBigIndexesHelper(33);
}
DECLARE_UNITTEST(TestMinElementWithBigIndexes);

testing/minmax_element.cu

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,29 @@ void TestMinMaxElementDispatchImplicit()
110110
}
111111
DECLARE_UNITTEST(TestMinMaxElementDispatchImplicit);
112112

113+
// Regression check: minmax_element over ranges longer than a 32-bit
// index can address. For [1, 2^magnitude], the minimum (1) sits at the
// front and the maximum (2^magnitude) at the back; with thrust::greater
// the pair is swapped. Either extremum would be wrong under index
// truncation.
void TestMinMaxElementWithBigIndexesHelper(int magnitude)
{
    typedef thrust::counting_iterator<long long> Iter;
    Iter first(1);
    Iter last = first + (1ll << magnitude);
    ASSERT_EQUAL(thrust::distance(first, last), 1ll << magnitude);

    // Default comparator: first = min, second = max.
    thrust::pair<Iter, Iter> extrema =
        thrust::minmax_element(thrust::device, first, last);
    ASSERT_EQUAL(*extrema.first, 1);
    ASSERT_EQUAL(*extrema.second, (1ll << magnitude));

    // Reversed comparator: the roles of first/second flip.
    extrema = thrust::minmax_element(thrust::device, first, last,
                                     thrust::greater<long long>());
    ASSERT_EQUAL(*extrema.second, 1);
    ASSERT_EQUAL(*extrema.first, (1ll << magnitude));
}

// Exercise sizes straddling the 32-bit boundary (2^30 .. 2^33 elements).
void TestMinMaxElementWithBigIndexes()
{
    TestMinMaxElementWithBigIndexesHelper(30);
    TestMinMaxElementWithBigIndexesHelper(31);
    TestMinMaxElementWithBigIndexesHelper(32);
    TestMinMaxElementWithBigIndexesHelper(33);
}
DECLARE_UNITTEST(TestMinMaxElementWithBigIndexes);

thrust/system/cuda/detail/extrema.h

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,8 @@ namespace __extrema {
161161
using core::get_agent_plan;
162162
using core::cuda_optional;
163163

164+
typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
165+
164166
if (num_items == 0)
165167
return cudaErrorNotSupported;
166168

@@ -195,16 +197,14 @@ namespace __extrema {
195197
cuda_optional<int> sm_count = core::get_sm_count();
196198
CUDA_CUB_RET_IF_FAIL(sm_count.status());
197199

198-
typedef __reduce::GridSizeType GridSizeType;
199-
200200
// reduction will not use more cta counts than requested
201201
cuda_optional<int> max_blocks_per_sm =
202202
reduce_agent::
203203
template get_max_blocks_per_sm<InputIt,
204204
OutputIt,
205205
Size,
206-
cub::GridEvenShare<GridSizeType>,
207-
cub::GridQueue<GridSizeType>,
206+
cub::GridEvenShare<Size>,
207+
cub::GridQueue<UnsignedSize>,
208208
ReductionOp>(reduce_plan);
209209
CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
210210

@@ -215,8 +215,8 @@ namespace __extrema {
215215
int sm_oversubscription = 5;
216216
int max_blocks = reduce_device_occupancy * sm_oversubscription;
217217

218-
cub::GridEvenShare<GridSizeType> even_share;
219-
even_share.DispatchInit(static_cast<int>(num_items), max_blocks,
218+
cub::GridEvenShare<Size> even_share;
219+
even_share.DispatchInit(num_items, max_blocks,
220220
reduce_plan.items_per_tile);
221221

222222
// we will launch at most "max_blocks" blocks in a grid
@@ -230,7 +230,7 @@ namespace __extrema {
230230
size_t allocation_sizes[3] =
231231
{
232232
max_blocks * sizeof(T), // bytes needed for privatized block reductions
233-
cub::GridQueue<GridSizeType>::AllocationSize(), // bytes needed for grid queue descriptor
233+
cub::GridQueue<UnsignedSize>::AllocationSize(), // bytes needed for grid queue descriptor
234234
vshmem_size // size of virtualized shared memory storage
235235
};
236236
status = cub::AliasTemporaries(d_temp_storage,
@@ -244,7 +244,7 @@ namespace __extrema {
244244
}
245245

246246
T *d_block_reductions = (T*) allocations[0];
247-
cub::GridQueue<GridSizeType> queue(allocations[1]);
247+
cub::GridQueue<UnsignedSize> queue(allocations[1]);
248248
char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
249249

250250

@@ -321,14 +321,10 @@ namespace __extrema {
321321
bool debug_sync = THRUST_DEBUG_SYNC_FLAG;
322322

323323
cudaError_t status;
324-
status = doit_step<T>(NULL,
325-
temp_storage_bytes,
326-
first,
327-
num_items,
328-
binary_op,
329-
reinterpret_cast<T*>(NULL),
330-
stream,
331-
debug_sync);
324+
THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
325+
(NULL, temp_storage_bytes, first, num_items_fixed,
326+
binary_op, reinterpret_cast<T*>(NULL), stream,
327+
debug_sync));
332328
cuda_cub::throw_on_error(status, "extrema failed on 1st step");
333329

334330
size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes};
@@ -354,14 +350,10 @@ namespace __extrema {
354350

355351
T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
356352

357-
status = doit_step<T>(allocations[1],
358-
temp_storage_bytes,
359-
first,
360-
num_items,
361-
binary_op,
362-
d_result,
363-
stream,
364-
debug_sync);
353+
THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
354+
(allocations[1], temp_storage_bytes, first,
355+
num_items_fixed, binary_op, d_result, stream,
356+
debug_sync));
365357
cuda_cub::throw_on_error(status, "extrema failed on 2nd step");
366358

367359
status = cuda_cub::synchronize(policy);
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* Copyright 2019 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once

THRUST_BEGIN_NS
namespace cuda_cub {
namespace detail {

// Maps a signed size/index type to an unsigned counterpart that is safe
// to use with CUDA device atomics. Only the index types Thrust actually
// dispatches on are specialized; any other instantiation is a
// compile-time error by design.
template <typename Size>
struct make_unsigned_special;

template <>
struct make_unsigned_special<int>
{
  typedef unsigned int type;
};

// `long` deliberately maps to `unsigned long long` rather than
// `unsigned long`: CUDA's atomicAdd has no overload for unsigned long,
// for some godforsaken reason.
template <>
struct make_unsigned_special<long>
{
  typedef unsigned long long type;
};

template <>
struct make_unsigned_special<long long>
{
  typedef unsigned long long type;
};

} // namespace detail
} // namespace cuda_cub
THRUST_END_NS
41+

thrust/system/cuda/detail/reduce.h

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include <thrust/system/cuda/detail/par_to_seq.h>
4040
#include <thrust/system/cuda/detail/get_value.h>
4141
#include <thrust/system/cuda/detail/dispatch.h>
42+
#include <thrust/system/cuda/detail/make_unsigned_special.h>
4243
#include <thrust/functional.h>
4344
#include <thrust/system/cuda/detail/core/agent_launcher.h>
4445
#include <thrust/detail/minmax.h>
@@ -64,9 +65,6 @@ namespace cuda_cub {
6465

6566
namespace __reduce {
6667

67-
// XXX should GridSizeType also be able to accommodate 64 bit integers
68-
typedef int GridSizeType;
69-
7068
template<bool>
7169
struct is_true : thrust::detail::false_type {};
7270
template<>
@@ -149,6 +147,8 @@ namespace __reduce {
149147
class ReductionOp>
150148
struct ReduceAgent
151149
{
150+
typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
151+
152152
template<class Arch>
153153
struct PtxPlan : Tuning<Arch,T>::type
154154
{
@@ -457,8 +457,8 @@ namespace __reduce {
457457
//
458458
THRUST_DEVICE_FUNCTION T
459459
consume_tiles(Size /*num_items*/,
460-
cub::GridEvenShare<GridSizeType> &even_share,
461-
cub::GridQueue<GridSizeType> & /*queue*/,
460+
cub::GridEvenShare<Size> &even_share,
461+
cub::GridQueue<UnsignedSize> & /*queue*/,
462462
thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_RAKE> /*is_rake*/)
463463
{
464464
typedef is_true<ATTEMPT_VECTORIZATION> attempt_vec;
@@ -488,7 +488,7 @@ namespace __reduce {
488488
template <class CAN_VECTORIZE>
489489
THRUST_DEVICE_FUNCTION T
490490
consume_tiles_impl(Size num_items,
491-
cub::GridQueue<GridSizeType> queue,
491+
cub::GridQueue<UnsignedSize> queue,
492492
CAN_VECTORIZE can_vectorize)
493493
{
494494
using core::sync_threadblock;
@@ -575,8 +575,8 @@ namespace __reduce {
575575
THRUST_DEVICE_FUNCTION T
576576
consume_tiles(
577577
Size num_items,
578-
cub::GridEvenShare<GridSizeType> &/*even_share*/,
579-
cub::GridQueue<GridSizeType> & queue,
578+
cub::GridEvenShare<Size> &/*even_share*/,
579+
cub::GridQueue<UnsignedSize> & queue,
580580
thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_DYNAMIC>)
581581
{
582582
typedef is_true<ATTEMPT_VECTORIZATION> attempt_vec;
@@ -643,8 +643,8 @@ namespace __reduce {
643643
THRUST_AGENT_ENTRY(InputIt input_it,
644644
OutputIt output_it,
645645
Size num_items,
646-
cub::GridEvenShare<GridSizeType> even_share,
647-
cub::GridQueue<GridSizeType> queue,
646+
cub::GridEvenShare<Size> even_share,
647+
cub::GridQueue<UnsignedSize> queue,
648648
ReductionOp reduction_op,
649649
char * shmem)
650650
{
@@ -664,6 +664,8 @@ namespace __reduce {
664664
template<class Size>
665665
struct DrainAgent
666666
{
667+
typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
668+
667669
template <class Arch>
668670
struct PtxPlan : PtxPolicy<1> {};
669671
typedef core::specialize_plan<PtxPlan> ptx_plan;
@@ -672,7 +674,7 @@ namespace __reduce {
672674
// Agent entry point
673675
//---------------------------------------------------------------------
674676

675-
THRUST_AGENT_ENTRY(cub::GridQueue<GridSizeType> grid_queue,
677+
THRUST_AGENT_ENTRY(cub::GridQueue<UnsignedSize> grid_queue,
676678
Size num_items,
677679
char * /*shmem*/)
678680
{
@@ -702,6 +704,8 @@ namespace __reduce {
702704
using core::get_agent_plan;
703705
using core::cuda_optional;
704706

707+
typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
708+
705709
if (num_items == 0)
706710
return cudaErrorNotSupported;
707711

@@ -742,8 +746,8 @@ namespace __reduce {
742746
template get_max_blocks_per_sm<InputIt,
743747
OutputIt,
744748
Size,
745-
cub::GridEvenShare<GridSizeType>,
746-
cub::GridQueue<GridSizeType>,
749+
cub::GridEvenShare<Size>,
750+
cub::GridQueue<UnsignedSize>,
747751
ReductionOp>(reduce_plan);
748752
CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
749753

@@ -754,7 +758,7 @@ namespace __reduce {
754758
int sm_oversubscription = 5;
755759
int max_blocks = reduce_device_occupancy * sm_oversubscription;
756760

757-
cub::GridEvenShare<GridSizeType> even_share;
761+
cub::GridEvenShare<Size> even_share;
758762
even_share.DispatchInit(static_cast<int>(num_items), max_blocks,
759763
reduce_plan.items_per_tile);
760764

@@ -769,7 +773,7 @@ namespace __reduce {
769773
size_t allocation_sizes[3] =
770774
{
771775
max_blocks * sizeof(T), // bytes needed for privatized block reductions
772-
cub::GridQueue<GridSizeType>::AllocationSize(), // bytes needed for grid queue descriptor
776+
cub::GridQueue<UnsignedSize>::AllocationSize(), // bytes needed for grid queue descriptor
773777
vshmem_size // size of virtualized shared memory storage
774778
};
775779
status = cub::AliasTemporaries(d_temp_storage,
@@ -783,7 +787,7 @@ namespace __reduce {
783787
}
784788

785789
T *d_block_reductions = (T*) allocations[0];
786-
cub::GridQueue<GridSizeType> queue(allocations[1]);
790+
cub::GridQueue<UnsignedSize> queue(allocations[1]);
787791
char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
788792

789793

0 commit comments

Comments
 (0)