@@ -713,7 +713,7 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned(
713713// didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu.
714714template <int threads_per_block,
715715 int UnrollFactor,
716- int StoreVec ,
716+ int StoreVecSize ,
717717 typename Offset,
718718 typename Predicate,
719719 typename F,
@@ -897,18 +897,20 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
897897 // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
898898 out += offset;
899899
900- using output_t = it_value_t <RandomAccessIteratorOut>;
901- constexpr int out_size = int {size_of<output_t >};
902- constexpr int vec_size = (out_size > 0 && out_size <= 16 ) ? 16 / out_size : 1 ;
903- constexpr int store_vec = (StoreVec > 0 ) ? (::cuda::std::min) (StoreVec, vec_size) : vec_size;
900+ using output_t = it_value_t <RandomAccessIteratorOut>;
901+ constexpr int out_size = int {size_of<output_t >};
902+ constexpr int vec_size = (out_size > 0 && out_size <= 16 ) ? 16 / out_size : 1 ;
903+ static_assert (StoreVecSize == 0 || ::cuda::is_power_of_two (StoreVecSize),
904+ " store_vec_size must be 0 (auto) or a power of two" );
905+ constexpr int store_vec_size = (StoreVecSize > 0 ) ? (::cuda::std::min) (StoreVecSize, vec_size) : vec_size;
904906 // compile time eligibility for the vectorized store (STG.128):
905907 // 1. there are no predicates
906908 // 2. memory layout is contiguous
907909 // 3. semantically we can raw copy
908910 // 4. size is power-of-2 and <= 16 bytes
909911 // #TODO(nan): STG.256 (128 should have enough BIF already, but should check perf on blackwell)
910912 constexpr bool vectorize_eligible =
911- store_vec > 1 && :: cuda::is_power_of_two (store_vec) && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
913+ store_vec_size > 1 && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
912914 && THRUST_NS_QUALIFIER ::is_contiguous_iterator_v<RandomAccessIteratorOut>
913915 && THRUST_NS_QUALIFIER ::is_trivially_relocatable_v<output_t > && ::cuda::is_power_of_two (out_size)
914916 && (... && ::cuda::is_power_of_two (int {sizeof (InTs)}));
@@ -917,14 +919,11 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
917919 {
918920 if (can_vectorize)
919921 {
920- // store_vec (S) output elements per STG.128/64/.../8, defaulting to vec_size (= 16 / sizeof(output), today's
921- // 16-byte store). Shrinking S narrows the store but also reduces the number of fully-unrolled lambda calls per
922- // store, which bounds register pressure for heavy functors (whose stores aren't the bottleneck anyway). res[] is
923- // indexed only by the fully-unrolled k, i.e. compile-time, so it stays in registers and never spills to local
924- // memory regardless of S.
925- using store_t = decltype (load_store_type<store_vec * out_size>());
922+ // store_vec_size: element count for vectorized store. default = 16 / sizeof(output). must be pow2
923+ // Shrinking store_vec_size narrows the store but also reduces register pressure
924+ using store_t = decltype (load_store_type<store_vec_size * out_size>());
926925 auto * out_vec = reinterpret_cast <store_t *>(out);
927- const int num_groups = valid_items / store_vec ;
926+ const int num_groups = valid_items / store_vec_size ;
928927 for (int g = threadIdx .x ; g < num_groups; g += threads_per_block)
929928 {
930929 char * smem = smem_base;
@@ -935,16 +934,15 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
935934 // alignof(T) will always be powers of 2 per C++ standard
936935 const T* base = reinterpret_cast <const T*>(smem + aligned_ptr.head_padding );
937936 smem += tile_padding + int {sizeof (T)} * tile_size;
938- // Gather this input's vec_size elements for output-vector v into a register array. we take the maximal
939- // alignment out of alignof(T) and 16 bytes. If input is narrower, we will waste a few (0-16) registers
937+ // Gather this input's store_vec_size elements for output-vector v into a register array.
938+ // we take the maximal alignment out of alignof(T) and 16 bytes. This is because compiler assume
939+ // natural alignment on bigger types (e.g. 32 bytes). If input is narrower, we will waste a few (0-16)
940+ // registers
940941 constexpr ::cuda::std::size_t chunk_align = (::cuda::std::max) (alignof (T), alignof (int4 ));
941- ::cuda::__uninitialized_array<T, store_vec, chunk_align> elems;
942- constexpr int chunk_bytes = int {sizeof (T)} * store_vec;
943- // if same width or narrowing (e.g. int32 -> int8), we split it up into multiple 16 byte reads
944- // CAREFUL: the byte width sizeof(T) * vec_size can exceed 16 when the input is wider than the output.
945- // However, since input both input type size and output size is pow2, when the input is wider, it has to be
946- // pow2 times wider. Therefore, chunk_bytes = input size * vec_size is always divisible by 16
947- // (recall 16 = output size * vec_size) , i.e. we can read it as multiple int4 loads
942+ ::cuda::__uninitialized_array<T, store_vec_size, chunk_align> elems;
943+ constexpr int chunk_bytes = int {sizeof (T)} * store_vec_size;
944+ // since store_vec_size is pow2, sizeof(T) is pow2, chunk_bytes must be pow2
945+ // if chunk_bytes is a multiple of 16, we do vectorised load from smem into reg
948946 if constexpr (chunk_bytes % int {sizeof (int4 )} == 0 )
949947 {
950948 constexpr int n = chunk_bytes / int {sizeof (int4 )};
@@ -955,9 +953,8 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
955953 reinterpret_cast <int4 *>(elems.data ())[i] = s[i];
956954 }
957955 }
958- // if widening (e.g. int8 -> int32), just load it in one go. recall chunk_bytes = input size * vec_size, and
959- // vec_size = 16 / output size. Since output size is pow2, vec_size is pow2. Hence chunk_bytes is always pow2.
960- // this ensures load_store_type<chunk_bytes> will never fail.
956+ // if chunk_bytes is not a multiple of 16, since it is pow2, chunk_bytes < 16.
957+ // this ensures load_store_type<chunk_bytes> never fail
961958 else
962959 {
963960 using sub_t = decltype (load_store_type<chunk_bytes>());
@@ -967,10 +964,11 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
967964 };
968965 auto chunks = ::cuda::std::tuple{load_chunk (aligned_ptrs)...};
969966
970- // must fully unroll to take full advantage of ILP. otherwise perf regress by half
971- ::cuda::__uninitialized_array<output_t , store_vec, sizeof (output_t ) * store_vec> res;
967+ // must fully unroll to make sure register index is static
968+ // (otherwise it will be on local memory & perf regress by half)
969+ ::cuda::__uninitialized_array<output_t , store_vec_size, sizeof (output_t ) * store_vec_size> res;
972970 _CCCL_PRAGMA_UNROLL_FULL ()
973- for (int k = 0 ; k < store_vec ; ++k)
971+ for (int k = 0 ; k < store_vec_size ; ++k)
974972 {
975973 res[k] = ::cuda::std::apply (
976974 [&](auto &... c) {
@@ -981,9 +979,9 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
981979 out_vec[g] = *reinterpret_cast <const store_t *>(res.data ());
982980 }
983981
984- // scalar tail: the up to (store_vec - 1) trailing elements not covered by a whole store group. can_vectorize
982+ // we can scalar store tail when element count is not a multiple of store_vec_size
985983 // implies an always_true predicate, so we store unconditionally.
986- for (int idx = num_groups * store_vec + threadIdx .x ; idx < valid_items; idx += threads_per_block)
984+ for (int idx = num_groups * store_vec_size + threadIdx .x ; idx < valid_items; idx += threads_per_block)
987985 {
988986 char * smem = smem_base;
989987 auto fetch_operand = [&](auto aligned_ptr) {
@@ -1178,7 +1176,7 @@ __launch_bounds__(get_threads_per_block<PolicySelector>) _CCCL_KERNEL_ATTRIBUTES
11781176 NV_PROVIDES_SM_90 ,
11791177 (transform_kernel_ublkcp<policy.async_copy .threads_per_block ,
11801178 policy.async_copy .unroll_factor ,
1181- policy.async_copy .store_vec >(
1179+ policy.async_copy .store_vec_size >(
11821180 num_items,
11831181 num_elem_per_thread,
11841182 can_vectorize,
0 commit comments