Skip to content

Commit 500c4e0

Browse files
committed
The great Thrust index type fix, part 1: adjacent_difference, reduce.
1 parent 42e4491 commit 500c4e0

4 files changed

Lines changed: 150 additions & 51 deletions

File tree

testing/adjacent_difference.cu

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#include <thrust/adjacent_difference.h>
33
#include <thrust/iterator/discard_iterator.h>
44
#include <thrust/iterator/retag.h>
5+
#include <thrust/device_malloc.h>
6+
#include <thrust/device_free.h>
57

68
template <class Vector>
79
void TestAdjacentDifferenceSimple(void)
@@ -13,21 +15,21 @@ void TestAdjacentDifferenceSimple(void)
1315
input[0] = 1; input[1] = 4; input[2] = 6;
1416

1517
typename Vector::iterator result;
16-
18+
1719
result = thrust::adjacent_difference(input.begin(), input.end(), output.begin());
1820

1921
ASSERT_EQUAL(result - output.begin(), 3);
2022
ASSERT_EQUAL(output[0], T(1));
2123
ASSERT_EQUAL(output[1], T(3));
2224
ASSERT_EQUAL(output[2], T(2));
23-
25+
2426
result = thrust::adjacent_difference(input.begin(), input.end(), output.begin(), thrust::plus<T>());
25-
27+
2628
ASSERT_EQUAL(result - output.begin(), 3);
2729
ASSERT_EQUAL(output[0], T( 1));
2830
ASSERT_EQUAL(output[1], T( 5));
2931
ASSERT_EQUAL(output[2], T(10));
30-
32+
3133
// test in-place operation, result and first are permitted to be the same
3234
result = thrust::adjacent_difference(input.begin(), input.end(), input.begin());
3335

@@ -57,14 +59,14 @@ void TestAdjacentDifference(const size_t n)
5759
ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
5860
ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
5961
ASSERT_EQUAL(h_output, d_output);
60-
62+
6163
h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
6264
d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
6365

6466
ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
6567
ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
6668
ASSERT_EQUAL(h_output, d_output);
67-
69+
6870
// in-place operation
6971
h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus<T>());
7072
d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_input.begin(), thrust::plus<T>());
@@ -90,7 +92,7 @@ void TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes(const size_t n)
9092

9193
h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
9294
d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
93-
95+
9496
// in-place operation with different iterator types
9597
h_result = thrust::adjacent_difference(h_input.cbegin(), h_input.cend(), h_input.begin(), thrust::plus<T>());
9698
d_result = thrust::adjacent_difference(d_input.cbegin(), d_input.cend(), d_input.begin(), thrust::plus<T>());
@@ -160,3 +162,51 @@ void TestAdjacentDifferenceDispatchImplicit()
160162
}
161163
DECLARE_UNITTEST(TestAdjacentDifferenceDispatchImplicit);
162164

165+
struct detect_wrong_difference
166+
{
167+
bool * flag;
168+
169+
__host__ __device__ detect_wrong_difference operator++() const { return *this; }
170+
__host__ __device__ detect_wrong_difference operator*() const { return *this; }
171+
template<typename Difference>
172+
__host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; }
173+
template<typename Index>
174+
__host__ __device__ detect_wrong_difference operator[](Index) const { return *this; }
175+
176+
__device__
177+
void operator=(long long difference) const
178+
{
179+
if (difference != 1)
180+
{
181+
*flag = false;
182+
}
183+
}
184+
};
185+
186+
void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
187+
{
188+
thrust::counting_iterator<long long> begin(1);
189+
thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
190+
ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
191+
192+
thrust::device_ptr<bool> all_differences_correct = thrust::device_malloc<bool>(1);
193+
*all_differences_correct = true;
194+
195+
detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) };
196+
197+
thrust::adjacent_difference(thrust::device, begin, end, out);
198+
199+
bool all_differences_correct_h = *all_differences_correct;
200+
thrust::device_free(all_differences_correct);
201+
202+
ASSERT_EQUAL(all_differences_correct_h, true);
203+
}
204+
205+
void TestAdjacentDifferenceWithBigIndexes()
206+
{
207+
TestAdjacentDifferenceWithBigIndexesHelper(30);
208+
TestAdjacentDifferenceWithBigIndexesHelper(31);
209+
TestAdjacentDifferenceWithBigIndexesHelper(32);
210+
TestAdjacentDifferenceWithBigIndexesHelper(33);
211+
}
212+
DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes);

thrust/system/cuda/detail/adjacent_difference.h

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <cub/block/block_adjacent_difference.cuh>
3838
#include <thrust/system/cuda/detail/core/agent_launcher.h>
3939
#include <thrust/system/cuda/detail/par_to_seq.h>
40+
#include <thrust/system/cuda/detail/dispatch.h>
4041
#include <thrust/functional.h>
4142
#include <thrust/distance.h>
4243
#include <thrust/detail/mpl/math.h>
@@ -257,8 +258,8 @@ namespace __adjacent_difference {
257258

258259
template <bool IS_LAST_TILE>
259260
void THRUST_DEVICE_FUNCTION
260-
consume_tile(Size num_remaining,
261-
Size tile_idx,
261+
consume_tile(int num_remaining,
262+
int tile_idx,
262263
Size tile_base)
263264
{
264265
if (tile_idx == 0)
@@ -279,7 +280,7 @@ namespace __adjacent_difference {
279280
consume_range(Size num_items)
280281
{
281282
int tile_idx = blockIdx.x;
282-
Size tile_base = tile_idx * ITEMS_PER_TILE;
283+
Size tile_base = static_cast<Size>(tile_idx) * ITEMS_PER_TILE;
283284
Size num_remaining = num_items - tile_base;
284285

285286
if (num_remaining > ITEMS_PER_TILE) // not a last tile
@@ -349,7 +350,7 @@ namespace __adjacent_difference {
349350
char * /*shmem*/)
350351
{
351352
int tile_idx = blockIdx.x * blockDim.x + threadIdx.x;
352-
int tile_base = tile_idx * items_per_tile;
353+
Size tile_base = static_cast<Size>(tile_idx) * items_per_tile;
353354
if (tile_base > 0 && tile_idx < num_tiles)
354355
result[tile_idx] = first[tile_base - 1];
355356
}
@@ -391,8 +392,8 @@ namespace __adjacent_difference {
391392
AgentPlan init_plan = init_agent::get_plan();
392393

393394

394-
size_t tile_size = difference_plan.items_per_tile;
395-
size_t num_tiles = (num_items + tile_size - 1) / tile_size;
395+
Size tile_size = difference_plan.items_per_tile;
396+
Size num_tiles = (num_items + tile_size - 1) / tile_size;
396397

397398
size_t tmp1 = num_tiles * sizeof(input_type);
398399
size_t vshmem_size = core::vshmem_size(difference_plan.shared_memory_size,
@@ -448,29 +449,19 @@ namespace __adjacent_difference {
448449
bool debug_sync = THRUST_DEBUG_SYNC_FLAG;
449450

450451
cudaError_t status;
451-
status = doit_step(NULL,
452-
storage_size,
453-
first,
454-
result,
455-
binary_op,
456-
num_items,
457-
stream,
458-
debug_sync);
452+
THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items,
453+
(NULL, storage_size, first, result, binary_op,
454+
num_items_fixed, stream, debug_sync));
459455
cuda_cub::throw_on_error(status, "adjacent_difference failed on 1st step");
460456

461457
// Allocate temporary storage.
462458
thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
463459
tmp(policy, storage_size);
464460
void *ptr = static_cast<void*>(tmp.data().get());
465461

466-
status = doit_step(ptr,
467-
storage_size,
468-
first,
469-
result,
470-
binary_op,
471-
num_items,
472-
stream,
473-
debug_sync);
462+
THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items,
463+
(ptr, storage_size, first, result, binary_op,
464+
num_items_fixed, stream, debug_sync));
474465
cuda_cub::throw_on_error(status, "adjacent_difference failed on 2nd step");
475466

476467
status = cuda_cub::synchronize(policy);
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Copyright 2018 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <thrust/detail/preprocessor.h>
20+
21+
/**
22+
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
23+
* implementation. This version assumes that callables for both branches consist
24+
* of the same tokens, and is intended to be used with Thrust-style dispatch
25+
* interfaces, that always deduce the size type from the arguments.
26+
*/
27+
#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \
28+
if (count <= std::numeric_limits<thrust::detail::int32_t>::max()) { \
29+
thrust::detail::int32_t THRUST_PP_CAT2(count, _fixed) = count; \
30+
status = call arguments; \
31+
} \
32+
else { \
33+
thrust::detail::int64_t THRUST_PP_CAT2(count, _fixed) = count; \
34+
status = call arguments; \
35+
}
36+
37+
/**
38+
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
39+
* implementation. This version allows using different token sequences for callables
40+
* in both branches, and is intended to be used with CUB-style dispatch interfaces,
41+
* where the "simple" interface always forces the size to be `int` (making it harder
42+
* for us to use), but the complex interface that we end up using doesn't actually
43+
* provide a way to fully deduce the type from just the call, making the size type
44+
* appear in the token sequence of the callable.
45+
*
46+
* See reduce_n_impl to see an example of how this is meant to be used.
47+
*/
48+
#define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \
49+
if (count <= std::numeric_limits<thrust::detail::int32_t>::max()) { \
50+
thrust::detail::int32_t THRUST_PP_CAT2(count, _fixed) = count; \
51+
status = call_32 arguments; \
52+
} \
53+
else { \
54+
thrust::detail::int64_t THRUST_PP_CAT2(count, _fixed) = count; \
55+
status = call_64 arguments; \
56+
}
57+

thrust/system/cuda/detail/reduce.h

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include <cub/device/device_reduce.cuh>
3939
#include <thrust/system/cuda/detail/par_to_seq.h>
4040
#include <thrust/system/cuda/detail/get_value.h>
41+
#include <thrust/system/cuda/detail/dispatch.h>
4142
#include <thrust/functional.h>
4243
#include <thrust/system/cuda/detail/core/agent_launcher.h>
4344
#include <thrust/detail/minmax.h>
@@ -930,21 +931,22 @@ T reduce_n_impl(execution_policy<Derived>& policy,
930931
BinaryOp binary_op)
931932
{
932933
cudaStream_t stream = cuda_cub::stream(policy);
934+
cudaError_t status;
933935

934936
// Determine temporary device storage requirements.
935937

936938
size_t tmp_size = 0;
937-
cuda_cub::throw_on_error(
938-
cub::DeviceReduce::Reduce(NULL,
939-
tmp_size,
940-
first,
941-
reinterpret_cast<T*>(NULL),
942-
num_items,
943-
binary_op,
944-
init,
945-
stream,
946-
THRUST_DEBUG_SYNC_FLAG),
947-
"after reduction step 1");
939+
940+
THRUST_INDEX_TYPE_DISPATCH2(status,
941+
cub::DeviceReduce::Reduce,
942+
(cub::DispatchReduce<
943+
InputIt, T*, Size, BinaryOp
944+
>::Dispatch),
945+
num_items,
946+
(NULL, tmp_size, first, reinterpret_cast<T*>(NULL),
947+
num_items_fixed, binary_op, init, stream,
948+
THRUST_DEBUG_SYNC_FLAG));
949+
cuda_cub::throw_on_error(status, "after reduction step 1");
948950

949951
// Allocate temporary storage.
950952

@@ -963,17 +965,16 @@ T reduce_n_impl(execution_policy<Derived>& policy,
963965
// make this guarantee.
964966
T* ret_ptr = thrust::detail::aligned_reinterpret_cast<T*>(tmp.data().get());
965967
void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(T)).get());
966-
cuda_cub::throw_on_error(
967-
cub::DeviceReduce::Reduce(tmp_ptr,
968-
tmp_size,
969-
first,
970-
ret_ptr,
971-
num_items,
972-
binary_op,
973-
init,
974-
stream,
975-
THRUST_DEBUG_SYNC_FLAG),
976-
"after reduction step 2");
968+
THRUST_INDEX_TYPE_DISPATCH2(status,
969+
cub::DeviceReduce::Reduce,
970+
(cub::DispatchReduce<
971+
InputIt, T*, Size, BinaryOp
972+
>::Dispatch),
973+
num_items,
974+
(tmp_ptr, tmp_size, first, ret_ptr,
975+
num_items_fixed, binary_op, init, stream,
976+
THRUST_DEBUG_SYNC_FLAG));
977+
cuda_cub::throw_on_error(status, "after reduction step 2");
977978

978979
// Synchronize the stream and get the value.
979980

0 commit comments

Comments
 (0)