Skip to content

Commit 54db169

Browse files
committed
cub: rename store_vec to store_vec_size; static_assert power-of-two store width
1 parent 71cefee commit 54db169

3 files changed

Lines changed: 39 additions & 41 deletions

File tree

cub/cub/device/dispatch/kernels/kernel_transform.cuh

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,7 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned(
713713
// didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu.
714714
template <int threads_per_block,
715715
int UnrollFactor,
716-
int StoreVec,
716+
int StoreVecSize,
717717
typename Offset,
718718
typename Predicate,
719719
typename F,
@@ -897,18 +897,20 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
897897
// move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
898898
out += offset;
899899

900-
using output_t = it_value_t<RandomAccessIteratorOut>;
901-
constexpr int out_size = int{size_of<output_t>};
902-
constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
903-
constexpr int store_vec = (StoreVec > 0) ? (::cuda::std::min) (StoreVec, vec_size) : vec_size;
900+
using output_t = it_value_t<RandomAccessIteratorOut>;
901+
constexpr int out_size = int{size_of<output_t>};
902+
constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
903+
static_assert(StoreVecSize == 0 || ::cuda::is_power_of_two(StoreVecSize),
904+
"store_vec_size must be 0 (auto) or a power of two");
905+
constexpr int store_vec_size = (StoreVecSize > 0) ? (::cuda::std::min) (StoreVecSize, vec_size) : vec_size;
904906
// compile time eligibility for the vectorized store (STG.128):
905907
// 1. there are no predicates
906908
// 2. memory layout is contiguous
907909
// 3. semantically we can raw copy
908910
// 4. size is power-of-2 and <= 16 bytes
909911
// #TODO(nan): STG.256 (128 should have enough BIF already, but should check perf on blackwell)
910912
constexpr bool vectorize_eligible =
911-
store_vec > 1 && ::cuda::is_power_of_two(store_vec) && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
913+
store_vec_size > 1 && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
912914
&& THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
913915
&& THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && ::cuda::is_power_of_two(out_size)
914916
&& (... && ::cuda::is_power_of_two(int{sizeof(InTs)}));
@@ -917,14 +919,11 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
917919
{
918920
if (can_vectorize)
919921
{
920-
// store_vec (S) output elements per STG.128/64/.../8, defaulting to vec_size (= 16 / sizeof(output), today's
921-
// 16-byte store). Shrinking S narrows the store but also reduces the number of fully-unrolled lambda calls per
922-
// store, which bounds register pressure for heavy functors (whose stores aren't the bottleneck anyway). res[] is
923-
// indexed only by the fully-unrolled k, i.e. compile-time, so it stays in registers and never spills to local
924-
// memory regardless of S.
925-
using store_t = decltype(load_store_type<store_vec * out_size>());
922+
// store_vec_size: element count for vectorized store. default = 16 / sizeof(output). must be pow2
923+
// Shrinking store_vec_size narrows the store but also reduces register pressure
924+
using store_t = decltype(load_store_type<store_vec_size * out_size>());
926925
auto* out_vec = reinterpret_cast<store_t*>(out);
927-
const int num_groups = valid_items / store_vec;
926+
const int num_groups = valid_items / store_vec_size;
928927
for (int g = threadIdx.x; g < num_groups; g += threads_per_block)
929928
{
930929
char* smem = smem_base;
@@ -935,16 +934,15 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
935934
// alignof(T) will always be powers of 2 per C++ standard
936935
const T* base = reinterpret_cast<const T*>(smem + aligned_ptr.head_padding);
937936
smem += tile_padding + int{sizeof(T)} * tile_size;
938-
// Gather this input's vec_size elements for output-vector v into a register array. we take the maximal
939-
// alignment out of alignof(T) and 16 bytes. If input is narrower, we will waste a few (0-16) registers
937+
// Gather this input's store_vec_size elements for output-vector v into a register array.
938+
// we take the maximal alignment out of alignof(T) and 16 bytes. This is because compiler assume
939+
// natural alignment on bigger types (e.g. 32 bytes). If input is narrower, we will waste a few (0-16)
940+
// registers
940941
constexpr ::cuda::std::size_t chunk_align = (::cuda::std::max) (alignof(T), alignof(int4));
941-
::cuda::__uninitialized_array<T, store_vec, chunk_align> elems;
942-
constexpr int chunk_bytes = int{sizeof(T)} * store_vec;
943-
// if same width or narrowing (e.g. int32 -> int8), we split it up into multiple 16 byte reads
944-
// CAREFUL: the byte width sizeof(T) * vec_size can exceed 16 when the input is wider than the output.
945-
// However, since input both input type size and output size is pow2, when the input is wider, it has to be
946-
// pow2 times wider. Therefore, chunk_bytes = input size * vec_size is always divisible by 16
947-
// (recall 16 = output size * vec_size) , i.e. we can read it as multiple int4 loads
942+
::cuda::__uninitialized_array<T, store_vec_size, chunk_align> elems;
943+
constexpr int chunk_bytes = int{sizeof(T)} * store_vec_size;
944+
// since store_vec_size is pow2, sizeof(T) is pow2, chunk_bytes must be pow2
945+
// if chunk_bytes is a multiple of 16, we do vectorised load from smem into reg
948946
if constexpr (chunk_bytes % int{sizeof(int4)} == 0)
949947
{
950948
constexpr int n = chunk_bytes / int{sizeof(int4)};
@@ -955,9 +953,8 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
955953
reinterpret_cast<int4*>(elems.data())[i] = s[i];
956954
}
957955
}
958-
// if widening (e.g. int8 -> int32), just load it in one go. recall chunk_bytes = input size * vec_size, and
959-
// vec_size = 16 / output size. Since output size is pow2, vec_size is pow2. Hence chunk_bytes is always pow2.
960-
// this ensures load_store_type<chunk_bytes> will never fail.
956+
// if chunk_bytes is not a multiple of 16, since it is pow2, chunk_bytes < 16.
957+
// this ensures load_store_type<chunk_bytes> never fail
961958
else
962959
{
963960
using sub_t = decltype(load_store_type<chunk_bytes>());
@@ -967,10 +964,11 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
967964
};
968965
auto chunks = ::cuda::std::tuple{load_chunk(aligned_ptrs)...};
969966

970-
// must fully unroll to take full advantage of ILP. otherwise perf regress by half
971-
::cuda::__uninitialized_array<output_t, store_vec, sizeof(output_t) * store_vec> res;
967+
// must fully unroll to make sure register index is static
968+
// (otherwise it will be on local memory & perf regress by half)
969+
::cuda::__uninitialized_array<output_t, store_vec_size, sizeof(output_t) * store_vec_size> res;
972970
_CCCL_PRAGMA_UNROLL_FULL()
973-
for (int k = 0; k < store_vec; ++k)
971+
for (int k = 0; k < store_vec_size; ++k)
974972
{
975973
res[k] = ::cuda::std::apply(
976974
[&](auto&... c) {
@@ -981,9 +979,9 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
981979
out_vec[g] = *reinterpret_cast<const store_t*>(res.data());
982980
}
983981

984-
// scalar tail: the up to (store_vec - 1) trailing elements not covered by a whole store group. can_vectorize
982+
// we can scalar store tail when element count is not a multiple of store_vec_size
985983
// implies an always_true predicate, so we store unconditionally.
986-
for (int idx = num_groups * store_vec + threadIdx.x; idx < valid_items; idx += threads_per_block)
984+
for (int idx = num_groups * store_vec_size + threadIdx.x; idx < valid_items; idx += threads_per_block)
987985
{
988986
char* smem = smem_base;
989987
auto fetch_operand = [&](auto aligned_ptr) {
@@ -1178,7 +1176,7 @@ __launch_bounds__(get_threads_per_block<PolicySelector>) _CCCL_KERNEL_ATTRIBUTES
11781176
NV_PROVIDES_SM_90,
11791177
(transform_kernel_ublkcp<policy.async_copy.threads_per_block,
11801178
policy.async_copy.unroll_factor,
1181-
policy.async_copy.store_vec>(
1179+
policy.async_copy.store_vec_size>(
11821180
num_items,
11831181
num_elem_per_thread,
11841182
can_vectorize,

cub/cub/device/dispatch/tuning/tuning_transform.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -145,17 +145,17 @@ struct TransformAsyncCopyPolicy
145145
// Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark)
146146
int unroll_factor = 1; //!< The unroll factor for the transformation loop in the kernel. The value 0 retains the
147147
//!< compiler's default unrolling (specifying no unroll pragma), 1 means no unrolling.
148-
// Vectorized store width for the ublkcp kernel. 0 means "auto": store_vec = 16 / sizeof(output) (a 16-byte STG.128).
149-
// Setting it smaller narrows the store but also reduces the number of fully-unrolled lambda calls per store, which
150-
// bounds register pressure for heavy functors (their stores aren't the bottleneck anyway).
151-
int store_vec = 0; //!< Output elements per vectorized store (S). 0 = auto (16 / sizeof(output)).
148+
// Vectorized store width for the ublkcp kernel. 0 means "auto": store_vec_size = 16 / sizeof(output) (a 16-byte
149+
// STG.128). Setting it smaller narrows the store but also reduces the number of fully-unrolled lambda calls per
150+
// store, which bounds register pressure for heavy functors (their stores aren't the bottleneck anyway).
151+
int store_vec_size = 0; //!< Output elements per vectorized store (S). 0 = auto (16 / sizeof(output)).
152152

153153
[[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
154154
operator==(const TransformAsyncCopyPolicy& lhs, const TransformAsyncCopyPolicy& rhs) noexcept
155155
{
156156
return lhs.threads_per_block == rhs.threads_per_block && lhs.min_items_per_thread == rhs.min_items_per_thread
157157
&& lhs.max_items_per_thread == rhs.max_items_per_thread && lhs.unroll_factor == rhs.unroll_factor
158-
&& lhs.store_vec == rhs.store_vec;
158+
&& lhs.store_vec_size == rhs.store_vec_size;
159159
}
160160

161161
[[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
@@ -171,7 +171,7 @@ struct TransformAsyncCopyPolicy
171171
<< "TransformAsyncCopyPolicy { .threads_per_block = " << policy.threads_per_block
172172
<< ", .min_items_per_thread = " << policy.min_items_per_thread
173173
<< ", .max_items_per_thread = " << policy.max_items_per_thread << ", .unroll_factor = " << policy.unroll_factor
174-
<< ", .store_vec = " << policy.store_vec << " }";
174+
<< ", .store_vec_size = " << policy.store_vec_size << " }";
175175
}
176176
#endif // _CCCL_HOSTED()
177177
};

cub/test/catch2_test_device_transform_vectorized.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,20 +74,20 @@ C2H_TEST("DeviceTransform::Transform vectorized store widening from uint8",
7474
REQUIRE(reference_h == result);
7575
}
7676

77-
struct ublkcp_store_vec_3_selector
77+
struct ublkcp_store_vec_size_2_selector
7878
{
7979
_CCCL_HOST_DEVICE_API constexpr auto operator()(::cuda::compute_capability cc) const -> cub::TransformPolicy
8080
{
8181
auto async = cub::TransformAsyncCopyPolicy{};
8282
async.threads_per_block = 256;
83-
async.store_vec = 3;
83+
async.store_vec_size = 2;
8484
const auto algorithm =
8585
(cc < ::cuda::compute_capability{9, 0}) ? cub::TransformAlgorithm::prefetch : cub::TransformAlgorithm::ublkcp;
8686
return {64 * 1024, algorithm, cub::TransformPrefetchPolicy{256}, {}, async};
8787
}
8888
};
8989

90-
C2H_TEST("DeviceTransform::Transform non-power-of-two store_vec falls back to scalar", "[device][transform]")
90+
C2H_TEST("DeviceTransform::Transform tunable narrower store_vec_size", "[device][transform]")
9191
{
9292
using in_t = std::uint32_t;
9393
using out_t = std::uint8_t;
@@ -98,7 +98,7 @@ C2H_TEST("DeviceTransform::Transform non-power-of-two store_vec falls back to sc
9898
c2h::gen(C2H_SEED(1), in);
9999

100100
c2h::device_vector<out_t> result(num_items, thrust::no_init);
101-
auto env = cuda::execution::tune(ublkcp_store_vec_3_selector{});
101+
auto env = cuda::execution::tune(ublkcp_store_vec_size_2_selector{});
102102
REQUIRE(cudaSuccess
103103
== cub::DeviceTransform::Transform(
104104
cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{}, env));

0 commit comments

Comments
 (0)