Skip to content

Commit b36f2c9

Browse files
committed
Remove unused functions from the CUDA backend which call slow CUDA attribute
query APIs.
1 parent de845f9 commit b36f2c9

1 file changed

Lines changed: 16 additions & 117 deletions

File tree

  • thrust/system/cuda/detail/core

thrust/system/cuda/detail/core/util.h

Lines changed: 16 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
#include <thrust/system/cuda/detail/cub/block/block_store.cuh>
3737
#include <thrust/system/cuda/detail/cub/block/block_scan.cuh>
3838

39-
4039
THRUST_BEGIN_NS
4140

4241
namespace cuda_cub {
@@ -56,13 +55,13 @@ namespace core {
5655

5756
// Typelist - a container of types, supports up to 10 types
5857
// --------------------------------------------------------------------------
59-
58+
6059
class _;
6160
template <class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _>
6261
struct typelist;
6362

6463
// -------------------------------------
65-
64+
6665
// supported SM arch
6766
// ---------------------
6867
struct sm30 { enum { ver = 300, warpSize = 32 }; };
@@ -94,7 +93,7 @@ namespace core {
9493

9594
// metafunction to match next viable PtxPlan specialization
9695
// --------------------------------------------------------------------------
97-
96+
9897
__THRUST_DEFINE_HAS_NESTED_TYPE(has_tuning_t, tuning)
9998
__THRUST_DEFINE_HAS_NESTED_TYPE(has_type_t, type)
10099

@@ -121,7 +120,7 @@ namespace core {
121120
template <class, class> class Tuning,
122121
class _0>
123122
struct has_sm_tuning_impl<SM, Tuning<lowest_supported_sm_arch, _0> > : has_type_t<Tuning<SM, _0> > {};
124-
123+
125124
// specializing for Tuning which needs 2 args
126125
template <class SM,
127126
template <class, class,class> class Tuning,
@@ -131,9 +130,9 @@ namespace core {
131130
template <template <class> class P, class SM>
132131
struct has_sm_tuning : has_sm_tuning_impl<SM, typename P<lowest_supported_sm_arch>::tuning > {};
133132

134-
// once first match is found in sm_list, all remaining sm are possible
133+
// once first match is found in sm_list, all remaining sm are possible
135134
// candidate for tuning, so pick the first available
136-
// if the plan P has SM-level tuning then pick it,
135+
// if the plan P has SM-level tuning then pick it,
137136
// otherwise move on to the next sm in the sm_list
138137
template <template <class> class P, class SM, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
139138
struct specialize_plan_impl_match<P, typelist<SM, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
@@ -146,14 +145,14 @@ namespace core {
146145
struct specialize_plan_msvc10_war
147146
{
148147
// if Plan has tuning type, this means it has SM-specific tuning
149-
// so loop through sm_list to find match,
148+
// so loop through sm_list to find match,
150149
// otherwise just specialize on provided SM
151150
typedef thrust::detail::conditional<has_tuning_t<Plan<lowest_supported_sm_arch> >::value,
152151
specialize_plan_impl_loop<Plan, SM, sm_list>,
153152
Plan<SM> >
154153
type;
155154
};
156-
155+
157156
template <template <class> class Plan, class SM = THRUST_TUNING_ARCH>
158157
struct specialize_plan : specialize_plan_msvc10_war<Plan,SM>::type::type {};
159158

@@ -433,67 +432,12 @@ namespace core {
433432
/////////////////////////
434433
/////////////////////////
435434

436-
inline cudaError_t CUB_RUNTIME_FUNCTION
437-
get_occ_device_properties(cudaOccDeviceProp &occ_prop, int dev_id)
438-
{
439-
cudaError_t status = cudaSuccess;
440-
#ifdef __CUDA_ARCH__
441-
{
442-
cudaOccDeviceProp &o = occ_prop;
443-
//
444-
status = cudaDeviceGetAttribute(&o.computeMajor,
445-
cudaDevAttrComputeCapabilityMajor,
446-
dev_id);
447-
status = cudaDeviceGetAttribute(&o.computeMinor,
448-
cudaDevAttrComputeCapabilityMinor,
449-
dev_id);
450-
status = cudaDeviceGetAttribute(&o.maxThreadsPerBlock,
451-
cudaDevAttrMaxThreadsPerBlock,
452-
dev_id);
453-
status = cudaDeviceGetAttribute(&o.maxThreadsPerMultiprocessor,
454-
cudaDevAttrMaxThreadsPerMultiProcessor,
455-
dev_id);
456-
status = cudaDeviceGetAttribute(&o.regsPerBlock,
457-
cudaDevAttrMaxRegistersPerBlock,
458-
dev_id);
459-
status = cudaDeviceGetAttribute(&o.regsPerMultiprocessor,
460-
cudaDevAttrMaxRegistersPerMultiprocessor,
461-
dev_id);
462-
status = cudaDeviceGetAttribute(&o.warpSize,
463-
cudaDevAttrWarpSize,
464-
dev_id);
465-
466-
int i32value;
467-
status = cudaDeviceGetAttribute(&i32value,
468-
cudaDevAttrMaxSharedMemoryPerBlock,
469-
dev_id);
470-
o.sharedMemPerBlock = static_cast<size_t>(i32value);
471-
472-
status = cudaDeviceGetAttribute(&i32value,
473-
cudaDevAttrMaxSharedMemoryPerMultiprocessor,
474-
dev_id);
475-
o.sharedMemPerMultiprocessor = static_cast<size_t>(i32value);
476-
477-
status = cudaDeviceGetAttribute(&o.numSms,
478-
cudaDevAttrMultiProcessorCount,
479-
dev_id);
480-
}
481-
#else
482-
{
483-
cudaDeviceProp props;
484-
status = cudaGetDeviceProperties(&props, dev_id);
485-
occ_prop = cudaOccDeviceProp(props);
486-
}
487-
#endif
488-
return status;
489-
}
490-
491-
int CUB_RUNTIME_FUNCTION
492-
inline get_sm_count()
435+
THRUST_RUNTIME_FUNCTION
436+
int get_sm_count()
493437
{
494438
int dev_id;
495439
cuda_cub::throw_on_error(cudaGetDevice(&dev_id),
496-
"get_sm_count:"
440+
"get_sm_count :"
497441
"failed to cudaGetDevice");
498442

499443
cudaError_t status;
@@ -536,7 +480,7 @@ namespace core {
536480
else
537481
return 0;
538482
}
539-
483+
540484
size_t CUB_RUNTIME_FUNCTION
541485
inline vshmem_size(size_t shmem_per_block, size_t num_blocks)
542486
{
@@ -547,51 +491,6 @@ namespace core {
547491
return 0;
548492
}
549493

550-
template <class Kernel>
551-
int CUB_RUNTIME_FUNCTION
552-
get_max_block_size(Kernel k)
553-
{
554-
int devId;
555-
cuda_cub::throw_on_error(cudaGetDevice(&devId),
556-
"get_max_block_size :"
557-
"failed to cudaGetDevice");
558-
559-
cudaOccDeviceProp occ_prop;
560-
cuda_cub::throw_on_error(get_occ_device_properties(occ_prop, devId),
561-
"get_max_block_size: "
562-
"failed to cudaGetDeviceProperties");
563-
564-
565-
cudaFuncAttributes attribs;
566-
cuda_cub::throw_on_error(cudaFuncGetAttributes(&attribs, reinterpret_cast<void *>(k)),
567-
"get_max_block_size: "
568-
"failed to cudaFuncGetAttributes");
569-
cudaOccFuncAttributes occ_attrib(attribs);
570-
571-
572-
cudaFuncCache cacheConfig;
573-
cuda_cub::throw_on_error(cudaDeviceGetCacheConfig(&cacheConfig),
574-
"get_max_block_size: "
575-
"failed to cudaDeviceGetCacheConfig");
576-
577-
cudaOccDeviceState occ_state;
578-
occ_state.cacheConfig = (cudaOccCacheConfig)cacheConfig;
579-
int block_size = 0;
580-
int min_grid_size = 0;
581-
cudaOccError occ_status = cudaOccMaxPotentialOccupancyBlockSize(&min_grid_size,
582-
&block_size,
583-
&occ_prop,
584-
&occ_attrib,
585-
&occ_state,
586-
0);
587-
if (CUDA_OCC_SUCCESS != occ_status || block_size <= 0)
588-
cuda_cub::throw_on_error(cudaErrorInvalidConfiguration,
589-
"get_max_block_size: "
590-
"failed to cudaOccMaxPotentialOccupancyBlockSize");
591-
592-
return block_size;
593-
}
594-
595494
// LoadIterator
596495
// ------------
597496
// if trivial iterator is passed, wrap loads into LDG
@@ -616,7 +515,7 @@ namespace core {
616515
{
617516
return raw_pointer_cast(&*it);
618517
}
619-
518+
620519
template <class PtxPlan, class It>
621520
typename LoadIterator<PtxPlan, It>::type __device__ __forceinline__
622521
make_load_iterator_impl(It it, thrust::detail::false_type /* is_trivial */)
@@ -657,7 +556,7 @@ namespace core {
657556

658557
type;
659558
};
660-
559+
661560
// BlockStore
662561
// -----------
663562
// a helper metaprogram that returns type of a block loader
@@ -749,7 +648,7 @@ namespace core {
749648

750649
__host__ __device__ __forceinline__ operator T&() { return get(); }
751650
};
752-
651+
753652
// uninitialized_array
754653
// --------------
755654
// allocates uninitialized data on stack
@@ -837,6 +736,6 @@ using core::sm60;
837736
using core::sm52;
838737
using core::sm35;
839738
using core::sm30;
840-
} // namespace cuda_
739+
} // namespace cuda_
841740

842741
THRUST_END_NS

0 commit comments

Comments
 (0)