Skip to content

Commit d4b7985

Browse files
committed
Stop specifying the minimum blocks __launch_bounds__ parameter because it
messes up register allocation and increases register pressure, and we don't actually know at compile time how many blocks we will use (aside from single tile kernels). Bug 2826490 Reviewed-by: Michał 'Griwes' Dominiak <griwes@griwes.info>
1 parent 43d4f10 commit d4b7985

15 files changed

Lines changed: 107 additions & 142 deletions

thrust/system/cuda/detail/adjacent_difference.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,16 +63,14 @@ namespace __adjacent_difference {
6363
int _ITEMS_PER_THREAD = 1,
6464
cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT,
6565
cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_DEFAULT,
66-
cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT,
67-
int _MIN_BLOCKS = 1>
66+
cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT>
6867
struct PtxPolicy
6968
{
7069
enum
7170
{
7271
BLOCK_THREADS = _BLOCK_THREADS,
7372
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
74-
ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD,
75-
MIN_BLOCKS = _MIN_BLOCKS
73+
ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD
7674
};
7775

7876
static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;

thrust/system/cuda/detail/binary_search.h

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ namespace __binary_search {
9292
typedef typename iterator_traits<NeedlesIt>::value_type T;
9393

9494
template <class It, class CompareOp>
95-
THRUST_DEVICE_FUNCTION bool
95+
THRUST_DEVICE_FUNCTION bool
9696
operator()(It begin, It end, T const& value, CompareOp comp)
9797
{
9898
HaystackIt iter = system::detail::generic::scalar::lower_bound(begin,
@@ -110,7 +110,7 @@ namespace __binary_search {
110110
class KeysIt2,
111111
class Size,
112112
class BinaryPred>
113-
THRUST_DEVICE_FUNCTION Size
113+
THRUST_DEVICE_FUNCTION Size
114114
merge_path(KeysIt1 keys1,
115115
KeysIt2 keys2,
116116
Size keys1_count,
@@ -143,7 +143,7 @@ namespace __binary_search {
143143
}
144144

145145
template <class It, class T2, class CompareOp, int ITEMS_PER_THREAD>
146-
THRUST_DEVICE_FUNCTION void
146+
THRUST_DEVICE_FUNCTION void
147147
serial_merge(It keys_shared,
148148
int keys1_beg,
149149
int keys2_beg,
@@ -155,7 +155,7 @@ namespace __binary_search {
155155
{
156156
int keys1_end = keys1_beg + keys1_count;
157157
int keys2_end = keys2_beg + keys2_count;
158-
158+
159159
typedef typename iterator_value<It>::type key_type;
160160

161161
key_type key1 = keys_shared[keys1_beg];
@@ -185,7 +185,6 @@ namespace __binary_search {
185185

186186
template <int _BLOCK_THREADS,
187187
int _ITEMS_PER_THREAD = 1,
188-
int _MIN_BLOCKS = 1,
189188
cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT,
190189
cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG,
191190
cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT>
@@ -195,19 +194,18 @@ namespace __binary_search {
195194
{
196195
BLOCK_THREADS = _BLOCK_THREADS,
197196
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
198-
MIN_BLOCKS = _MIN_BLOCKS,
199-
ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD,
197+
ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD
200198
};
201199

202200
static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
203201
static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
204202
static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
205203
}; // PtxPolicy
206-
204+
207205
template <class Arch, class T>
208206
struct Tuning;
209207

210-
template<class T>
208+
template<class T>
211209
struct Tuning<sm30,T>
212210
{
213211
enum
@@ -218,13 +216,12 @@ namespace __binary_search {
218216

219217
typedef PtxPolicy<128,
220218
ITEMS_PER_THREAD,
221-
1,
222219
cub::BLOCK_LOAD_WARP_TRANSPOSE,
223220
cub::LOAD_LDG,
224221
cub::BLOCK_STORE_TRANSPOSE>
225222
type;
226223
};
227-
224+
228225
template<class T>
229226
struct Tuning<sm52,T>
230227
{
@@ -238,13 +235,12 @@ namespace __binary_search {
238235

239236
typedef PtxPolicy<128,
240237
ITEMS_PER_THREAD,
241-
1,
242238
cub::BLOCK_LOAD_WARP_TRANSPOSE,
243239
cub::LOAD_LDG,
244240
cub::BLOCK_STORE_WARP_TRANSPOSE>
245241
type;
246242
};
247-
243+
248244
template <class NeedlesIt,
249245
class HaystackIt,
250246
class Size,
@@ -429,7 +425,7 @@ namespace __binary_search {
429425
needle_type needles_loc[ITEMS_PER_THREAD];
430426
BlockLoadNeedles(storage.load_needles)
431427
.Load(needles_load_it + tile_base, needles_loc, num_remaining);
432-
428+
433429
#ifdef BS_SIMPLE
434430

435431
result_type results_loc[ITEMS_PER_THREAD];
@@ -499,7 +495,7 @@ namespace __binary_search {
499495
needles_loc[ITEM],
500496
compare_op);
501497
}
502-
498+
503499
sync_threadblock();
504500

505501
result_type results_loc[ITEMS_PER_THREAD];
@@ -627,7 +623,7 @@ namespace __binary_search {
627623
result,
628624
compare_op,
629625
search_op);
630-
626+
631627
CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
632628

633629
return status;
@@ -692,7 +688,7 @@ namespace __binary_search {
692688
stream,
693689
debug_sync);
694690
cuda_cub::throw_on_error(status, "binary_search: failed on 2nt call");
695-
691+
696692
status = cuda_cub::synchronize(policy);
697693
cuda_cub::throw_on_error(status, "binary_search: failed to synchronize");
698694

thrust/system/cuda/detail/copy_if.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ namespace __copy_if {
6969

7070
template <int _BLOCK_THREADS,
7171
int _ITEMS_PER_THREAD = 1,
72-
int _MIN_BLOCKS = 1,
7372
cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT,
7473
cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG,
7574
cub::BlockScanAlgorithm _SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS>
@@ -79,7 +78,6 @@ namespace __copy_if {
7978
{
8079
BLOCK_THREADS = _BLOCK_THREADS,
8180
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
82-
MIN_BLOCKS = _MIN_BLOCKS,
8381
ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD,
8482
};
8583
static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
@@ -103,7 +101,6 @@ namespace __copy_if {
103101

104102
typedef PtxPolicy<128,
105103
ITEMS_PER_THREAD,
106-
1,
107104
cub::BLOCK_LOAD_WARP_TRANSPOSE,
108105
cub::LOAD_LDG,
109106
cub::BLOCK_SCAN_WARP_SCANS>
@@ -124,7 +121,6 @@ namespace __copy_if {
124121

125122
typedef PtxPolicy<128,
126123
ITEMS_PER_THREAD,
127-
1,
128124
cub::BLOCK_LOAD_WARP_TRANSPOSE,
129125
cub::LOAD_LDG,
130126
cub::BLOCK_SCAN_WARP_SCANS>
@@ -144,7 +140,6 @@ namespace __copy_if {
144140

145141
typedef PtxPolicy<128,
146142
ITEMS_PER_THREAD,
147-
1,
148143
cub::BLOCK_LOAD_WARP_TRANSPOSE,
149144
cub::LOAD_DEFAULT,
150145
cub::BLOCK_SCAN_WARP_SCANS>

0 commit comments

Comments
 (0)