3636#include < thrust/system/cuda/detail/cub/block/block_store.cuh>
3737#include < thrust/system/cuda/detail/cub/block/block_scan.cuh>
3838
39-
4039THRUST_BEGIN_NS
4140
4241namespace cuda_cub {
@@ -56,13 +55,13 @@ namespace core {
5655
5756 // Typelist - a container of types, supports up to 10 types
5857 // --------------------------------------------------------------------------
59-
58+
6059 class _ ;
6160 template <class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _>
6261 struct typelist ;
6362
6463 // -------------------------------------
65-
64+
6665 // supported SM arch
6766 // ---------------------
6867 struct sm30 { enum { ver = 300 , warpSize = 32 }; };
@@ -94,7 +93,7 @@ namespace core {
9493
9594 // metafunction to match next viable PtxPlan specialization
9695 // --------------------------------------------------------------------------
97-
96+
9897 __THRUST_DEFINE_HAS_NESTED_TYPE (has_tuning_t , tuning)
9998 __THRUST_DEFINE_HAS_NESTED_TYPE (has_type_t , type)
10099
@@ -121,7 +120,7 @@ namespace core {
121120 template <class , class > class Tuning ,
122121 class _0 >
123122 struct has_sm_tuning_impl <SM, Tuning<lowest_supported_sm_arch, _0> > : has_type_t <Tuning<SM, _0> > {};
124-
123+
125124 // specializing for Tunig which needs 2 args
126125 template <class SM ,
127126 template <class , class ,class > class Tuning ,
@@ -131,9 +130,9 @@ namespace core {
131130 template <template <class > class P , class SM >
132131 struct has_sm_tuning : has_sm_tuning_impl<SM, typename P<lowest_supported_sm_arch>::tuning > {};
133132
134- // once first match is found in sm_list, all remaining sm are possible
133+ // once first match is found in sm_list, all remaining sm are possible
135134 // candidate for tuning, so pick the first available
136- // if the plan P has SM-level tuning then pick it,
135+ // if the plan P has SM-level tuning then pick it,
137136 // otherwise move on to the next sm in the sm_list
138137 template <template <class > class P , class SM , class _1 , class _2 , class _3 , class _4 , class _5 , class _6 , class _7 , class _8 , class _9 >
139138 struct specialize_plan_impl_match <P, typelist<SM, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
@@ -146,14 +145,14 @@ namespace core {
146145 struct specialize_plan_msvc10_war
147146 {
148147 // if Plan has tuning type, this means it has SM-specific tuning
149- // so loop through sm_list to find match,
148+ // so loop through sm_list to find match,
150149 // otherwise just specialize on provided SM
151150 typedef thrust::detail::conditional<has_tuning_t <Plan<lowest_supported_sm_arch> >::value,
152151 specialize_plan_impl_loop<Plan, SM, sm_list>,
153152 Plan<SM> >
154153 type;
155154 };
156-
155+
157156 template <template <class > class Plan , class SM = THRUST_TUNING_ARCH>
158157 struct specialize_plan : specialize_plan_msvc10_war<Plan,SM>::type::type {};
159158
@@ -433,67 +432,12 @@ namespace core {
433432 // ///////////////////////
434433 // ///////////////////////
435434
// Populate a cudaOccDeviceProp for device `dev_id`, for use with the CUDA
// occupancy-calculator API.
//
// Device-side (__CUDA_ARCH__) builds must query each property individually
// through cudaDeviceGetAttribute; host-side builds convert a full
// cudaDeviceProp in one call.
//
// Returns cudaSuccess, or the FIRST error encountered.  (The previous
// version reassigned `status` on every query, so an early failure was
// silently masked by any later successful call, and the host path wrote
// into `occ_prop` even after cudaGetDeviceProperties failed.)
inline cudaError_t CUB_RUNTIME_FUNCTION
get_occ_device_properties(cudaOccDeviceProp &occ_prop, int dev_id)
{
  cudaError_t status = cudaSuccess;
#ifdef __CUDA_ARCH__
  {
    cudaOccDeviceProp &o = occ_prop;

    // Query one int attribute; propagate the first failure immediately.
    #define THRUST_OCC_GET_ATTR_(dst_, attr_)                        \
      do {                                                           \
        status = cudaDeviceGetAttribute((dst_), (attr_), dev_id);    \
        if (cudaSuccess != status)                                   \
          return status;                                             \
      } while (0)

    THRUST_OCC_GET_ATTR_(&o.computeMajor, cudaDevAttrComputeCapabilityMajor);
    THRUST_OCC_GET_ATTR_(&o.computeMinor, cudaDevAttrComputeCapabilityMinor);
    THRUST_OCC_GET_ATTR_(&o.maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock);
    THRUST_OCC_GET_ATTR_(&o.maxThreadsPerMultiprocessor,
                         cudaDevAttrMaxThreadsPerMultiProcessor);
    THRUST_OCC_GET_ATTR_(&o.regsPerBlock, cudaDevAttrMaxRegistersPerBlock);
    THRUST_OCC_GET_ATTR_(&o.regsPerMultiprocessor,
                         cudaDevAttrMaxRegistersPerMultiprocessor);
    THRUST_OCC_GET_ATTR_(&o.warpSize, cudaDevAttrWarpSize);
    THRUST_OCC_GET_ATTR_(&o.numSms, cudaDevAttrMultiProcessorCount);

    // Shared-memory limits are reported as int but stored as size_t.
    int i32value;
    THRUST_OCC_GET_ATTR_(&i32value, cudaDevAttrMaxSharedMemoryPerBlock);
    o.sharedMemPerBlock = static_cast<size_t>(i32value);

    THRUST_OCC_GET_ATTR_(&i32value, cudaDevAttrMaxSharedMemoryPerMultiprocessor);
    o.sharedMemPerMultiprocessor = static_cast<size_t>(i32value);

    #undef THRUST_OCC_GET_ATTR_
  }
#else
  {
    // Host path: one heavyweight query, then convert.  Only touch the
    // output parameter on success.
    cudaDeviceProp props;
    status = cudaGetDeviceProperties(&props, dev_id);
    if (cudaSuccess == status)
      occ_prop = cudaOccDeviceProp(props);
  }
#endif
  return status;
}
490-
491- int CUB_RUNTIME_FUNCTION
492- inline get_sm_count ()
435+ THRUST_RUNTIME_FUNCTION
436+ int get_sm_count ()
493437 {
494438 int dev_id;
495439 cuda_cub::throw_on_error (cudaGetDevice (&dev_id),
496- " get_sm_count:"
440+ " get_sm_count :"
497441 " failed to cudaGetDevice" );
498442
499443 cudaError_t status;
@@ -536,7 +480,7 @@ namespace core {
536480 else
537481 return 0 ;
538482 }
539-
483+
540484 size_t CUB_RUNTIME_FUNCTION
541485 inline vshmem_size (size_t shmem_per_block, size_t num_blocks)
542486 {
@@ -547,51 +491,6 @@ namespace core {
547491 return 0 ;
548492 }
549493
// Return the occupancy-maximizing thread-block size for kernel `k` on the
// current device, as computed by cudaOccMaxPotentialOccupancyBlockSize
// assuming zero dynamic shared memory.  Throws via cuda_cub::throw_on_error
// on any failure; never returns a non-positive size.
//
// Fixes vs. previous version: diagnostic messages are consistently
// formatted, and the properties-query failure names the helper that
// actually failed (get_occ_device_properties, not cudaGetDeviceProperties).
template <class Kernel>
int CUB_RUNTIME_FUNCTION
get_max_block_size(Kernel k)
{
  int dev_id;
  cuda_cub::throw_on_error(cudaGetDevice(&dev_id),
                           "get_max_block_size: "
                           "failed to cudaGetDevice");

  // Device properties in the form the occupancy calculator expects.
  cudaOccDeviceProp occ_prop;
  cuda_cub::throw_on_error(get_occ_device_properties(occ_prop, dev_id),
                           "get_max_block_size: "
                           "failed to get_occ_device_properties");

  // Register / static shared-memory footprint of the kernel itself.
  cudaFuncAttributes attribs;
  cuda_cub::throw_on_error(cudaFuncGetAttributes(&attribs,
                                                 reinterpret_cast<void *>(k)),
                           "get_max_block_size: "
                           "failed to cudaFuncGetAttributes");
  cudaOccFuncAttributes occ_attrib(attribs);

  // The current L1/shared carve-out affects how many blocks fit per SM.
  cudaFuncCache cache_config;
  cuda_cub::throw_on_error(cudaDeviceGetCacheConfig(&cache_config),
                           "get_max_block_size: "
                           "failed to cudaDeviceGetCacheConfig");

  cudaOccDeviceState occ_state;
  occ_state.cacheConfig = (cudaOccCacheConfig)cache_config;

  int block_size    = 0;
  int min_grid_size = 0;
  cudaOccError occ_status =
      cudaOccMaxPotentialOccupancyBlockSize(&min_grid_size,
                                            &block_size,
                                            &occ_prop,
                                            &occ_attrib,
                                            &occ_state,
                                            0 /* dynamic smem bytes */);

  // The occupancy library has its own error type; translate any failure
  // (or a nonsensical zero/negative block size) into a CUDA error.
  if (CUDA_OCC_SUCCESS != occ_status || block_size <= 0)
    cuda_cub::throw_on_error(cudaErrorInvalidConfiguration,
                             "get_max_block_size: "
                             "failed to cudaOccMaxPotentialOccupancyBlockSize");

  return block_size;
}
594-
595494 // LoadIterator
596495 // ------------
597496 // if trivial iterator is passed, wrap loads into LDG
@@ -616,7 +515,7 @@ namespace core {
616515 {
617516 return raw_pointer_cast (&*it);
618517 }
619-
518+
620519 template <class PtxPlan , class It >
621520 typename LoadIterator<PtxPlan, It>::type __device__ __forceinline__
622521 make_load_iterator_impl (It it, thrust::detail::false_type /* is_trivial */ )
@@ -657,7 +556,7 @@ namespace core {
657556
658557 type;
659558 };
660-
559+
661560 // BlockStore
662561 // -----------
663562 // a helper metaprogram that returns type of a block loader
@@ -749,7 +648,7 @@ namespace core {
749648
750649 __host__ __device__ __forceinline__ operator T&() { return get (); }
751650 };
752-
651+
753652 // uninitialized_array
754653 // --------------
755654 // allocates uninitialized data on stack
@@ -837,6 +736,6 @@ using core::sm60;
837736using core::sm52;
838737using core::sm35;
839738using core::sm30;
840- } // namespace cuda_
739+ } // namespace cuda_
841740
842741THRUST_END_NS
0 commit comments