cub: rename store_vec to store_vec_size; static_assert power-of-two store width

nanan-nvidia · nanan-nvidia · commit 54db169b20d5 · 2026-06-23T11:55:59.000-07:00
diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh
@@ -713,7 +713,7 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned(
 // didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu.
 template <int threads_per_block,
           int UnrollFactor,
-          int StoreVec,
+          int StoreVecSize,
           typename Offset,
           typename Predicate,
           typename F,
@@ -897,18 +897,20 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
   // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
   out += offset;
 
-  using output_t          = it_value_t<RandomAccessIteratorOut>;
-  constexpr int out_size  = int{size_of<output_t>};
-  constexpr int vec_size  = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
-  constexpr int store_vec = (StoreVec > 0) ? (::cuda::std::min) (StoreVec, vec_size) : vec_size;
+  using output_t         = it_value_t<RandomAccessIteratorOut>;
+  constexpr int out_size = int{size_of<output_t>};
+  constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
+  static_assert(StoreVecSize == 0 || ::cuda::is_power_of_two(StoreVecSize),
+                "store_vec_size must be 0 (auto) or a power of two");
+  constexpr int store_vec_size = (StoreVecSize > 0) ? (::cuda::std::min) (StoreVecSize, vec_size) : vec_size;
   // compile time eligibility for the vectorized store (STG.128):
   // 1. there are no predicates
   // 2. memory layout is contiguous
   // 3. semantically we can raw copy
   // 4. size is power-of-2 and <= 16 bytes
   // #TODO(nan): STG.256 (128 should have enough BIF already, but should check perf on blackwell)
   constexpr bool vectorize_eligible =
-    store_vec > 1 && ::cuda::is_power_of_two(store_vec) && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
+    store_vec_size > 1 && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
     && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
     && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && ::cuda::is_power_of_two(out_size)
     && (... && ::cuda::is_power_of_two(int{sizeof(InTs)}));
@@ -917,14 +919,11 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
   {
     if (can_vectorize)
     {
-      // store_vec (S) output elements per STG.128/64/.../8, defaulting to vec_size (= 16 / sizeof(output), today's
-      // 16-byte store). Shrinking S narrows the store but also reduces the number of fully-unrolled lambda calls per
-      // store, which bounds register pressure for heavy functors (whose stores aren't the bottleneck anyway). res[] is
-      // indexed only by the fully-unrolled k, i.e. compile-time, so it stays in registers and never spills to local
-      // memory regardless of S.
-      using store_t        = decltype(load_store_type<store_vec * out_size>());
+      // store_vec_size: element count per vectorized store. default = 16 / sizeof(output). must be pow2.
+      // Shrinking store_vec_size narrows the store but also reduces register pressure.
+      using store_t        = decltype(load_store_type<store_vec_size * out_size>());
       auto* out_vec        = reinterpret_cast<store_t*>(out);
-      const int num_groups = valid_items / store_vec;
+      const int num_groups = valid_items / store_vec_size;
       for (int g = threadIdx.x; g < num_groups; g += threads_per_block)
       {
         char* smem      = smem_base;
@@ -935,16 +934,15 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
           // alignof(T) will always be powers of 2 per C++ standard
           const T* base = reinterpret_cast<const T*>(smem + aligned_ptr.head_padding);
           smem += tile_padding + int{sizeof(T)} * tile_size;
-          // Gather this input's vec_size elements for output-vector v into a register array. we take the maximal
-          // alignment out of alignof(T) and 16 bytes. If input is narrower, we will waste a few (0-16) registers
+          // Gather this input's store_vec_size elements for output-vector v into a register array.
+          // we take the maximal alignment out of alignof(T) and 16 bytes. This is because the compiler assumes
+          // natural alignment for bigger types (e.g. 32 bytes). If the input is narrower, we waste a few (0-16)
+          // registers
           constexpr ::cuda::std::size_t chunk_align = (::cuda::std::max) (alignof(T), alignof(int4));
-          ::cuda::__uninitialized_array<T, store_vec, chunk_align> elems;
-          constexpr int chunk_bytes = int{sizeof(T)} * store_vec;
-          // if same width or narrowing (e.g. int32 -> int8), we split it up into multiple 16 byte reads
-          // CAREFUL: the byte width sizeof(T) * vec_size can exceed 16 when the input is wider than the output.
-          // However, since input both input type size and output size is pow2, when the input is wider, it has to be
-          // pow2 times wider. Therefore, chunk_bytes = input size * vec_size is always divisible by 16
-          // (recall 16 = output size * vec_size) , i.e. we can read it as multiple int4 loads
+          ::cuda::__uninitialized_array<T, store_vec_size, chunk_align> elems;
+          constexpr int chunk_bytes = int{sizeof(T)} * store_vec_size;
+          // since store_vec_size and sizeof(T) are both pow2, chunk_bytes must be pow2.
+          // if chunk_bytes is a multiple of 16, we do a vectorised load from smem into registers
           if constexpr (chunk_bytes % int{sizeof(int4)} == 0)
           {
             constexpr int n = chunk_bytes / int{sizeof(int4)};
@@ -955,9 +953,8 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
               reinterpret_cast<int4*>(elems.data())[i] = s[i];
             }
           }
-          // if widening (e.g. int8 -> int32), just load it in one go. recall chunk_bytes = input size * vec_size, and
-          // vec_size = 16 / output size. Since output size is pow2, vec_size is pow2. Hence chunk_bytes is always pow2.
-          // this ensures load_store_type<chunk_bytes> will never fail.
+          // if chunk_bytes is not a multiple of 16 then, since it is pow2, chunk_bytes < 16.
+          // this ensures load_store_type<chunk_bytes> never fails
           else
           {
             using sub_t                             = decltype(load_store_type<chunk_bytes>());
@@ -967,10 +964,11 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
         };
         auto chunks = ::cuda::std::tuple{load_chunk(aligned_ptrs)...};
 
-        // must fully unroll to take full advantage of ILP. otherwise perf regress by half
-        ::cuda::__uninitialized_array<output_t, store_vec, sizeof(output_t) * store_vec> res;
+        // must fully unroll so that the register index into res is static
+        // (otherwise res ends up in local memory and perf regresses by half)
+        ::cuda::__uninitialized_array<output_t, store_vec_size, sizeof(output_t) * store_vec_size> res;
         _CCCL_PRAGMA_UNROLL_FULL()
-        for (int k = 0; k < store_vec; ++k)
+        for (int k = 0; k < store_vec_size; ++k)
         {
           res[k] = ::cuda::std::apply(
             [&](auto&... c) {
@@ -981,9 +979,9 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
         out_vec[g] = *reinterpret_cast<const store_t*>(res.data());
       }
 
-      // scalar tail: the up to (store_vec - 1) trailing elements not covered by a whole store group. can_vectorize
+      // scalar tail: elements left over when valid_items is not a multiple of store_vec_size. can_vectorize
       // implies an always_true predicate, so we store unconditionally.
-      for (int idx = num_groups * store_vec + threadIdx.x; idx < valid_items; idx += threads_per_block)
+      for (int idx = num_groups * store_vec_size + threadIdx.x; idx < valid_items; idx += threads_per_block)
       {
         char* smem         = smem_base;
         auto fetch_operand = [&](auto aligned_ptr) {
@@ -1178,7 +1176,7 @@ __launch_bounds__(get_threads_per_block<PolicySelector>) _CCCL_KERNEL_ATTRIBUTES
       NV_PROVIDES_SM_90,
       (transform_kernel_ublkcp<policy.async_copy.threads_per_block,
                                policy.async_copy.unroll_factor,
-                               policy.async_copy.store_vec>(
+                               policy.async_copy.store_vec_size>(
          num_items,
          num_elem_per_thread,
          can_vectorize,
diff --git a/cub/cub/device/dispatch/tuning/tuning_transform.cuh b/cub/cub/device/dispatch/tuning/tuning_transform.cuh
@@ -145,17 +145,17 @@ struct TransformAsyncCopyPolicy
   // Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark)
   int unroll_factor = 1; //!< The unroll factor for the transformation loop in the kernel. The value 0 retains the
                          //!< compiler's default unrolling (specifying no unroll pragma), 1 means no unrolling.
-  // Vectorized store width for the ublkcp kernel. 0 means "auto": store_vec = 16 / sizeof(output) (a 16-byte STG.128).
-  // Setting it smaller narrows the store but also reduces the number of fully-unrolled lambda calls per store, which
-  // bounds register pressure for heavy functors (their stores aren't the bottleneck anyway).
-  int store_vec = 0; //!< Output elements per vectorized store (S). 0 = auto (16 / sizeof(output)).
+  // Vectorized store width for the ublkcp kernel. 0 means "auto": store_vec_size = 16 / sizeof(output) (a 16-byte
+  // STG.128). Setting it smaller narrows the store but also reduces the number of fully-unrolled lambda calls per
+  // store, which bounds register pressure for heavy functors (their stores aren't the bottleneck anyway).
+  int store_vec_size = 0; //!< Output elements per vectorized store (S). 0 = auto (16 / sizeof(output)).
 
   [[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
   operator==(const TransformAsyncCopyPolicy& lhs, const TransformAsyncCopyPolicy& rhs) noexcept
   {
     return lhs.threads_per_block == rhs.threads_per_block && lhs.min_items_per_thread == rhs.min_items_per_thread
         && lhs.max_items_per_thread == rhs.max_items_per_thread && lhs.unroll_factor == rhs.unroll_factor
-        && lhs.store_vec == rhs.store_vec;
+        && lhs.store_vec_size == rhs.store_vec_size;
   }
 
   [[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
@@ -171,7 +171,7 @@ struct TransformAsyncCopyPolicy
         << "TransformAsyncCopyPolicy { .threads_per_block = " << policy.threads_per_block
         << ", .min_items_per_thread = " << policy.min_items_per_thread
         << ", .max_items_per_thread = " << policy.max_items_per_thread << ", .unroll_factor = " << policy.unroll_factor
-        << ", .store_vec = " << policy.store_vec << " }";
+        << ", .store_vec_size = " << policy.store_vec_size << " }";
   }
 #endif // _CCCL_HOSTED()
 };
diff --git a/cub/test/catch2_test_device_transform_vectorized.cu b/cub/test/catch2_test_device_transform_vectorized.cu
@@ -74,20 +74,20 @@ C2H_TEST("DeviceTransform::Transform vectorized store widening from uint8",
   REQUIRE(reference_h == result);
 }
 
-struct ublkcp_store_vec_3_selector
+struct ublkcp_store_vec_size_2_selector
 {
   _CCCL_HOST_DEVICE_API constexpr auto operator()(::cuda::compute_capability cc) const -> cub::TransformPolicy
   {
     auto async              = cub::TransformAsyncCopyPolicy{};
     async.threads_per_block = 256;
-    async.store_vec         = 3;
+    async.store_vec_size    = 2;
     const auto algorithm =
       (cc < ::cuda::compute_capability{9, 0}) ? cub::TransformAlgorithm::prefetch : cub::TransformAlgorithm::ublkcp;
     return {64 * 1024, algorithm, cub::TransformPrefetchPolicy{256}, {}, async};
   }
 };
 
-C2H_TEST("DeviceTransform::Transform non-power-of-two store_vec falls back to scalar", "[device][transform]")
+C2H_TEST("DeviceTransform::Transform tunable narrower store_vec_size", "[device][transform]")
 {
   using in_t                         = std::uint32_t;
   using out_t                        = std::uint8_t;
@@ -98,7 +98,7 @@ C2H_TEST("DeviceTransform::Transform non-power-of-two store_vec falls back to sc
   c2h::gen(C2H_SEED(1), in);
 
   c2h::device_vector<out_t> result(num_items, thrust::no_init);
-  auto env = cuda::execution::tune(ublkcp_store_vec_3_selector{});
+  auto env = cuda::execution::tune(ublkcp_store_vec_size_2_selector{});
   REQUIRE(cudaSuccess
           == cub::DeviceTransform::Transform(
             cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{}, env));

Original file line number	Diff line number	Diff line change
`@@ -74,20 +74,20 @@ C2H_TEST("DeviceTransform::Transform vectorized store widening from uint8",`
`74`	`74`	`REQUIRE(reference_h == result);`
`75`	`75`	`}`
`76`	`76`
`77`		`-struct ublkcp_store_vec_3_selector`
	`77`	`+struct ublkcp_store_vec_size_2_selector`
`78`	`78`	`{`
`79`	`79`	`_CCCL_HOST_DEVICE_API constexpr auto operator()(::cuda::compute_capability cc) const -> cub::TransformPolicy`
`80`	`80`	`{`
`81`	`81`	`auto async = cub::TransformAsyncCopyPolicy{};`
`82`	`82`	`async.threads_per_block = 256;`
`83`		`- async.store_vec = 3;`
	`83`	`+ async.store_vec_size = 2;`
`84`	`84`	`const auto algorithm =`
`85`	`85`	`(cc < ::cuda::compute_capability{9, 0}) ? cub::TransformAlgorithm::prefetch : cub::TransformAlgorithm::ublkcp;`
`86`	`86`	`return {64 * 1024, algorithm, cub::TransformPrefetchPolicy{256}, {}, async};`
`87`	`87`	`}`
`88`	`88`	`};`
`89`	`89`
`90`		`-C2H_TEST("DeviceTransform::Transform non-power-of-two store_vec falls back to scalar", "[device][transform]")`
	`90`	`+C2H_TEST("DeviceTransform::Transform tunable narrower store_vec_size", "[device][transform]")`
`91`	`91`	`{`
`92`	`92`	`using in_t = std::uint32_t;`
`93`	`93`	`using out_t = std::uint8_t;`
`@@ -98,7 +98,7 @@ C2H_TEST("DeviceTransform::Transform non-power-of-two store_vec falls back to sc`
`98`	`98`	`c2h::gen(C2H_SEED(1), in);`
`99`	`99`
`100`	`100`	`c2h::device_vector<out_t> result(num_items, thrust::no_init);`
`101`		`- auto env = cuda::execution::tune(ublkcp_store_vec_3_selector{});`
	`101`	`+ auto env = cuda::execution::tune(ublkcp_store_vec_size_2_selector{});`
`102`	`102`	`REQUIRE(cudaSuccess`
`103`	`103`	`== cub::DeviceTransform::Transform(`
`104`	`104`	`cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{}, env));`