Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
14 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion cub/cub/device/dispatch/dispatch_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <thrust/type_traits/unwrap_contiguous_iterator.h>

#include <cuda/__cmath/ceil_div.h>
#include <cuda/__cmath/pow2.h>
#include <cuda/__memory/is_aligned.h>
#include <cuda/std/__algorithm/clamp.h>
#include <cuda/std/__algorithm/max.h>
Expand Down Expand Up @@ -313,11 +314,28 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_async_algorithm(
}

auto [launcher, kernel, items_per_thread] = *ret;

// Runtime check whether this launch can take the ublkcp kernel's vectorized (STG.128) store path. The output value
// type must pack into a 16-byte vector and all pointers must be suitably aligned. The kernel additionally gates on
// compile-time eligibility (contiguous, trivially relocatable, power-of-two element sizes, no predicate).
bool can_vectorize = false;
if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>)
{
using output_t = it_value_t<RandomAccessIteratorOut>;
constexpr int out_size = int{size_of<output_t>};
constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
if constexpr (vec_size > 1 && ::cuda::is_power_of_two(out_size)
&& (... && ::cuda::is_power_of_two(int{sizeof(it_value_t<RandomAccessIteratorsIn>)})))
{
can_vectorize = kernel_source.CanVectorize(vec_size, out, ::cuda::std::get<Is>(in)...);
}
}
Comment thread
bernhardmgruber marked this conversation as resolved.

return launcher.doit(
kernel,
num_items,
items_per_thread,
false,
can_vectorize,
pred,
op,
THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out),
Expand Down
113 changes: 112 additions & 1 deletion cub/cub/device/dispatch/kernels/kernel_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,7 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned(
// didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu.
template <int threads_per_block,
int UnrollFactor,
int StoreVec,
typename Offset,
typename Predicate,
typename F,
Expand All @@ -721,6 +722,7 @@ template <int threads_per_block,
_CCCL_DEVICE void transform_kernel_ublkcp(
Offset num_items,
int num_elem_per_thread,
[[maybe_unused]] bool can_vectorize,
Predicate pred,
F f,
RandomAccessIteratorOut out,
Expand Down Expand Up @@ -895,6 +897,112 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
// move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
out += offset;

using output_t = it_value_t<RandomAccessIteratorOut>;
constexpr int out_size = int{size_of<output_t>};
constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
Comment thread
bernhardmgruber marked this conversation as resolved.
Outdated
constexpr int store_vec = (StoreVec > 0) ? (::cuda::std::min) (StoreVec, vec_size) : vec_size;
// compile time eligibility for the vectorized store (STG.128):
// 1. there are no predicates
// 2. memory layout is contiguous
// 3. semantically we can raw copy
// 4. size is power-of-2 and <= 16 bytes
// #TODO(nan): STG.256 (128 should have enough BIF already, but should check perf on blackwell)
constexpr bool vectorize_eligible =
store_vec > 1 && ::cuda::is_power_of_two(store_vec) && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
Comment thread
nanan-nvidia marked this conversation as resolved.
Outdated
&& THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
&& THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && ::cuda::is_power_of_two(out_size)
&& (... && ::cuda::is_power_of_two(int{sizeof(InTs)}));

if constexpr (vectorize_eligible)
{
if (can_vectorize)
{
// store_vec (S) output elements per STG.128/64/.../8, defaulting to vec_size (= 16 / sizeof(output), today's
// 16-byte store). Shrinking S narrows the store but also reduces the number of fully-unrolled lambda calls per
// store, which bounds register pressure for heavy functors (whose stores aren't the bottleneck anyway). res[] is
// indexed only by the fully-unrolled k, i.e. compile-time, so it stays in registers and never spills to local
// memory regardless of S.
using store_t = decltype(load_store_type<store_vec * out_size>());
auto* out_vec = reinterpret_cast<store_t*>(out);
const int num_groups = valid_items / store_vec;
for (int g = threadIdx.x; g < num_groups; g += threads_per_block)
{
char* smem = smem_base;
auto load_chunk = [&](auto aligned_ptr) {
using T = typename decltype(aligned_ptr)::value_type;
// on blackwell, head_padding should always be zero

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: Please add an assertion for that.

// on hopper, bulk_copy_alignment is 128 bytes, head_padding could be 112 bytes for example
// alignof(T) will always be powers of 2 per C++ standard
const T* base = reinterpret_cast<const T*>(smem + aligned_ptr.head_padding);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: since head padding is zero on blackwell, please only add it below SM100 then (NV_IF_TARGET)

smem += tile_padding + int{sizeof(T)} * tile_size;
// Gather this input's vec_size elements for output-vector v into a register array. we take the maximal
// alignment out of alignof(T) and 16 bytes. If input is narrower, we will waste a few (0-16) registers
constexpr ::cuda::std::size_t chunk_align = (::cuda::std::max) (alignof(T), alignof(int4));
Comment thread
nanan-nvidia marked this conversation as resolved.
::cuda::__uninitialized_array<T, store_vec, chunk_align> elems;
constexpr int chunk_bytes = int{sizeof(T)} * store_vec;
// if same width or narrowing (e.g. int32 -> int8), we split it up into multiple 16 byte reads
// CAREFUL: the byte width sizeof(T) * vec_size can exceed 16 when the input is wider than the output.
// However, since input both input type size and output size is pow2, when the input is wider, it has to be
// pow2 times wider. Therefore, chunk_bytes = input size * vec_size is always divisible by 16
// (recall 16 = output size * vec_size) , i.e. we can read it as multiple int4 loads
if constexpr (chunk_bytes % int{sizeof(int4)} == 0)
{
constexpr int n = chunk_bytes / int{sizeof(int4)};
const int4* s = reinterpret_cast<const int4*>(base) + g * n;
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < n; ++i)
{
reinterpret_cast<int4*>(elems.data())[i] = s[i];
}
}
// if widening (e.g. int8 -> int32), just load it in one go. recall chunk_bytes = input size * vec_size, and
// vec_size = 16 / output size. Since output size is pow2, vec_size is pow2. Hence chunk_bytes is always pow2.
// this ensures load_store_type<chunk_bytes> will never fail.
else
{
using sub_t = decltype(load_store_type<chunk_bytes>());
*reinterpret_cast<sub_t*>(elems.data()) = reinterpret_cast<const sub_t*>(base)[g];
}
return elems;
};
auto chunks = ::cuda::std::tuple{load_chunk(aligned_ptrs)...};

// must fully unroll to take full advantage of ILP. otherwise perf regress by half
Comment thread
bernhardmgruber marked this conversation as resolved.
Outdated
::cuda::__uninitialized_array<output_t, store_vec, sizeof(output_t) * store_vec> res;
_CCCL_PRAGMA_UNROLL_FULL()
for (int k = 0; k < store_vec; ++k)
{
res[k] = ::cuda::std::apply(
[&](auto&... c) {
return f(c[k]...);
},
chunks);
}
out_vec[g] = *reinterpret_cast<const store_t*>(res.data());
}

// scalar tail: the up to (store_vec - 1) trailing elements not covered by a whole store group. can_vectorize
// implies an always_true predicate, so we store unconditionally.
for (int idx = num_groups * store_vec + threadIdx.x; idx < valid_items; idx += threads_per_block)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Have you tried scaling the items per thread in the host-side dispatch as multiples of the vector store width, so we never need a tail logic?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you elaborate what you meant by scaling?

The main motive for adding a tail logic was so that, if the output pointer is aligned to 16 bytes, but the amount of data we store are not aligned to 16 bytes (i.e. 10,000,001 int8 elements), we can still get the performance benefit of doing the first 10,000,000 vectorized stores.

{
char* smem = smem_base;
auto fetch_operand = [&](auto aligned_ptr) {
using T = typename decltype(aligned_ptr)::value_type;
const int head_padding = alignof(T) < bulk_copy_alignment ? aligned_ptr.head_padding : 0;
const char* src = smem + head_padding;
smem += tile_padding + int{sizeof(T)} * tile_size;
return reinterpret_cast<const T*>(src)[idx];
};
out[idx] = ::cuda::std::apply(
[&](auto... values) {
return f(values...);
},
::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...});
}
return;
}
Comment on lines +920 to +1001

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: This is a nontrivial amount of code, I believe we should extract it into a function.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure about this one: this code does not seem to be that reusable across different kernels

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am fine with the code like this, but I agree it's a big unwieldy

}

auto process_tile = [&](auto full_tile) {
unrolled_for<UnrollFactor>(num_elem_per_thread, [&](int j) {
// TODO(bgruber): fbusato suggests to hoist threadIdx.x out of the loop below
Expand Down Expand Up @@ -1068,9 +1176,12 @@ __launch_bounds__(get_threads_per_block<PolicySelector>) _CCCL_KERNEL_ATTRIBUTES
{
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(transform_kernel_ublkcp</*policy*/ policy.async_copy.threads_per_block, policy.async_copy.unroll_factor>(
(transform_kernel_ublkcp<policy.async_copy.threads_per_block,
policy.async_copy.unroll_factor,
policy.async_copy.store_vec>(
num_items,
num_elem_per_thread,
can_vectorize,
::cuda::std::move(pred),
::cuda::std::move(f),
::cuda::std::move(out),
Expand Down
15 changes: 11 additions & 4 deletions cub/cub/device/dispatch/tuning/tuning_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,17 @@ struct TransformAsyncCopyPolicy
// Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark)
int unroll_factor = 1; //!< The unroll factor for the transformation loop in the kernel. The value 0 retains the
//!< compiler's default unrolling (specifying no unroll pragma), 1 means no unrolling.
// Vectorized store width for the ublkcp kernel. 0 means "auto": store_vec = 16 / sizeof(output) (a 16-byte STG.128).
// Setting it smaller narrows the store but also reduces the number of fully-unrolled lambda calls per store, which
// bounds register pressure for heavy functors (their stores aren't the bottleneck anyway).
int store_vec = 0; //!< Output elements per vectorized store (S). 0 = auto (16 / sizeof(output)).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important question: Why do we need a new tuning parameter, and cannot use unroll_factor?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with the analysis that in the vectorized path, unroll_factor happens to be the same value as store_vec_size (I'm taking @miscco 's suggestion on renaming store_vec).

I think they should still be different parameters in the policy, because

  1. unroll_factor already has its own meaning in the regular scalar store. And even just semantically, unrolling something does not mean we are going to store it vectorized. Also, it is set to 1 by default, which is the best default documented for the scalar path. For the vectorized path, the best default would not be 1.
  2. store_vec_size has its own semantic meaning in the sense that:
    a. It means the width at which we store, and in that path we fully unroll the loop as a consequence of a certain store_vec_size.
    b. now we can express auto-vectorization, vectorization to a custom number, or disable vectorization all at once.


[[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
operator==(const TransformAsyncCopyPolicy& lhs, const TransformAsyncCopyPolicy& rhs) noexcept
{
return lhs.threads_per_block == rhs.threads_per_block && lhs.min_items_per_thread == rhs.min_items_per_thread
&& lhs.max_items_per_thread == rhs.max_items_per_thread && lhs.unroll_factor == rhs.unroll_factor;
&& lhs.max_items_per_thread == rhs.max_items_per_thread && lhs.unroll_factor == rhs.unroll_factor
&& lhs.store_vec == rhs.store_vec;
}

[[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
Expand All @@ -162,9 +167,11 @@ struct TransformAsyncCopyPolicy
#if _CCCL_HOSTED()
friend ::std::ostream& operator<<(::std::ostream& os, const TransformAsyncCopyPolicy& policy)
{
return os << "TransformAsyncCopyPolicy { .threads_per_block = " << policy.threads_per_block
<< ", .min_items_per_thread = " << policy.min_items_per_thread << ", .max_items_per_thread = "
<< policy.max_items_per_thread << ", .unroll_factor = " << policy.unroll_factor << " }";
return os
<< "TransformAsyncCopyPolicy { .threads_per_block = " << policy.threads_per_block
<< ", .min_items_per_thread = " << policy.min_items_per_thread
<< ", .max_items_per_thread = " << policy.max_items_per_thread << ", .unroll_factor = " << policy.unroll_factor
<< ", .store_vec = " << policy.store_vec << " }";
}
#endif // _CCCL_HOSTED()
};
Expand Down
111 changes: 111 additions & 0 deletions cub/test/catch2_test_device_transform_vectorized.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "insert_nested_NVTX_range_guard.h"

#include <cub/device/device_transform.cuh>

#include <algorithm>

#include "catch2_test_launch_helper.h"
#include <c2h/catch2_test_helper.h>
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// %PARAM% TEST_LAUNCH lid 0:1:2

DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many);

// Generic counts, deliberately including non-multiples of the 16-byte vectorized store width, to exercise the scalar
// tail of the ublkcp vectorized store path on aligned (c2h::device_vector) buffers.
#define GENERIC_COUNTS 0, 1, 15, 16, 17, 127, 129, 4095, 4097, 100'000

template <typename Out>
struct cast_to
{
template <typename T>
__host__ __device__ Out operator()(T v) const
{
return static_cast<Out>(v);
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
};

// Narrowing widths (e.g. uint32 -> uint8) drive the multi-int4-load gather; widening (uint8 -> uint32) drives the
// sub-16-byte load. Same-width is already covered by catch2_test_device_transform.cu's BabelStream add.
C2H_TEST("DeviceTransform::Transform vectorized store narrowing to uint8",
"[device][transform]",
c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
{
using in_t = c2h::get<0, TestType>;
using out_t = std::uint8_t;
using offset_t = cuda::std::int64_t;
const offset_t num_items = GENERATE(GENERIC_COUNTS);
CAPTURE(c2h::type_name<in_t>(), num_items);

c2h::device_vector<in_t> in(num_items, thrust::no_init);
c2h::gen(C2H_SEED(1), in);

c2h::device_vector<out_t> result(num_items, thrust::no_init);
transform_many(cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{});

c2h::host_vector<in_t> in_h = in;
c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
REQUIRE(reference_h == result);
}

C2H_TEST("DeviceTransform::Transform vectorized store widening from uint8",
"[device][transform]",
c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
{
using in_t = std::uint8_t;
using out_t = c2h::get<0, TestType>;
using offset_t = cuda::std::int64_t;
const offset_t num_items = GENERATE(GENERIC_COUNTS);
CAPTURE(c2h::type_name<out_t>(), num_items);

c2h::device_vector<in_t> in(num_items, thrust::no_init);
c2h::gen(C2H_SEED(1), in);

c2h::device_vector<out_t> result(num_items, thrust::no_init);
transform_many(cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{});

c2h::host_vector<in_t> in_h = in;
c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
REQUIRE(reference_h == result);
}

struct ublkcp_store_vec_3_selector
{
_CCCL_HOST_DEVICE_API constexpr auto operator()(::cuda::compute_capability cc) const -> cub::TransformPolicy
Comment thread
nanan-nvidia marked this conversation as resolved.
{
auto async = cub::TransformAsyncCopyPolicy{};
async.threads_per_block = 256;
async.store_vec = 3;
Comment thread
nanan-nvidia marked this conversation as resolved.
Outdated
const auto algorithm =
(cc < ::cuda::compute_capability{9, 0}) ? cub::TransformAlgorithm::prefetch : cub::TransformAlgorithm::ublkcp;
return {64 * 1024, algorithm, cub::TransformPrefetchPolicy{256}, {}, async};
}
};

C2H_TEST("DeviceTransform::Transform non-power-of-two store_vec falls back to scalar", "[device][transform]")
{
using in_t = std::uint32_t;
using out_t = std::uint8_t;
const cuda::std::int64_t num_items = GENERATE(GENERIC_COUNTS);
CAPTURE(num_items);

c2h::device_vector<in_t> in(num_items, thrust::no_init);
c2h::gen(C2H_SEED(1), in);

c2h::device_vector<out_t> result(num_items, thrust::no_init);
auto env = cuda::execution::tune(ublkcp_store_vec_3_selector{});
REQUIRE(cudaSuccess
== cub::DeviceTransform::Transform(
cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{}, env));
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

c2h::host_vector<in_t> in_h = in;
c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
REQUIRE(reference_h == result);
}
Loading