Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
14 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions c/parallel/src/transform.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ get_kernel_name(std::string_view input_iterator_t, std::string_view output_itera
check(cccl_type_name_from_nvrtc<OffsetT>(&offset_t));

return std::format(
"cub::detail::transform::transform_kernel<{0}, {1}, cuda::always_true, {2}, {3}, {4}>",
"cub::detail::transform::transform_kernel<{0}, {1}, cuda::always_true, {2}, {3}, 1, {4}>",
chained_policy_t, // 0
offset_t, // 1
transform_op_t, // 2
Expand All @@ -92,7 +92,7 @@ std::string get_kernel_name(std::string_view input1_iterator_t,
check(cccl_type_name_from_nvrtc<OffsetT>(&offset_t));

return std::format(
"cub::detail::transform::transform_kernel<{0}, {1}, cuda::always_true, {2}, {3}, {4}, "
"cub::detail::transform::transform_kernel<{0}, {1}, cuda::always_true, {2}, {3}, 1, {4}, "
"{5}>",
chained_policy_t, // 0
offset_t, // 1
Expand Down
23 changes: 20 additions & 3 deletions cub/cub/device/device_transform.cuh

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: Once we approve the general mechanics, we should update the per-function documentation.

Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cuda/__functional/always_true_false.h>
#include <cuda/__functional/call_or.h>
#include <cuda/__iterator/zip_iterator.h>
#include <cuda/__memory/aligned_size.h>
#include <cuda/__stream/get_stream.h>
#include <cuda/std/__execution/env.h>
#include <cuda/std/tuple>
Expand Down Expand Up @@ -92,7 +93,20 @@ struct DeviceTransform
// https://github.com/NVIDIA/cccl/issues/8805 for data. We use choose_signed_offset to just check if it can hold the
// value passed by the user, but otherwise ignore the chosen signed offset type.
using offset_t = ::cuda::std::int64_t;
if (const cudaError_t error = detail::choose_signed_offset<NumItemsT>::is_exceeding_offset_type(num_items))
// num_items may be an aligned_size_t<N> (a caller promise that in/out are N-byte aligned and the count is a
// multiple of N). Unwrap it to the underlying integer.
const auto num_items_count = [&]() {
if constexpr (::cuda::std::is_integral_v<NumItemsT>)
{
return num_items;
}
else
{
return static_cast<::cuda::std::size_t>(num_items);
}
}();
using count_t = ::cuda::std::remove_const_t<decltype(num_items_count)>;
if (const cudaError_t error = detail::choose_signed_offset<count_t>::is_exceeding_offset_type(num_items_count))
{
return error;
}
Expand All @@ -114,10 +128,13 @@ struct DeviceTransform
static_assert(detail::transform::transform_policy_selector<policy_selector>);
#endif // _CCCL_HAS_CONCEPTS()

return detail::transform::dispatch<StableAddress>(
// An aligned_size_t<N> num_items lets the caller promise N-byte-aligned in/out pointers and a count that is a
// multiple of N. This is a compile-time guarantee so the ublkcp kernel can take the vector-store path.
constexpr int out_align = static_cast<int>(::cuda::__get_size_align_v<NumItemsT>);
return detail::transform::dispatch<StableAddress, out_align>(
::cuda::std::move(inputs),
::cuda::std::move(output),
static_cast<offset_t>(num_items),
static_cast<offset_t>(num_items_count),
::cuda::std::move(predicate),
::cuda::std::move(transform_op),
stream,
Expand Down
14 changes: 10 additions & 4 deletions cub/cub/device/dispatch/dispatch_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,24 @@ template <typename PolicySelector,
typename RandomAccessIteratorsIn,
typename RandomAccessIteratorOut,
typename Predicate,
typename TransformOp>
typename TransformOp,
int OutputAlign>
struct TransformKernelSource;

template <typename PolicySelector,
typename Offset,
typename... RandomAccessIteratorsIn,
typename RandomAccessIteratorOut,
typename Predicate,
typename TransformOp>
typename TransformOp,
int OutputAlign>
struct TransformKernelSource<PolicySelector,
Offset,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut,
Predicate,
TransformOp>
TransformOp,
OutputAlign>
{
// PolicySelector must be stateless, so we can pass the type to the kernel
static_assert(::cuda::std::is_empty_v<PolicySelector>);
Expand All @@ -103,6 +106,7 @@ struct TransformKernelSource<PolicySelector,
Predicate,
TransformOp,
THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<RandomAccessIteratorOut>,
OutputAlign,
THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<RandomAccessIteratorsIn>...>);

template <class ActionT>
Expand Down Expand Up @@ -541,6 +545,7 @@ struct invoke_for_cc<::cuda::std::tuple<RandomAccessIteratorsIn...>,
};

template <requires_stable_address StableAddress,
int OutputAlign = 1,

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: OutputAlign can benefit from knowing the output type, so should be alignof(it_value_t<RandomAccessIteratorOut>), plus handling for value types of void where it should be 1.

typename... RandomAccessIteratorsIn,
typename RandomAccessIteratorOut,
typename Offset,
Expand All @@ -552,7 +557,8 @@ template <requires_stable_address StableAddress,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut,
Predicate,
TransformOp>,
TransformOp,
OutputAlign>,
typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
#if _CCCL_HAS_CONCEPTS()
requires transform_policy_selector<PolicySelector>
Expand Down
138 changes: 108 additions & 30 deletions cub/cub/device/dispatch/kernels/kernel_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,7 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned(
// didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu.
template <int threads_per_block,
int UnrollFactor,
int OutputAlign,

This comment was marked as outdated.

typename Offset,
typename Predicate,
typename F,
Expand Down Expand Up @@ -895,41 +896,117 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
// move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
out += offset;

auto process_tile = [&](auto full_tile) {
unrolled_for<UnrollFactor>(num_elem_per_thread, [&](int j) {
// TODO(bgruber): fbusato suggests to hoist threadIdx.x out of the loop below
const int idx = j * threads_per_block + threadIdx.x;
if (full_tile || idx < valid_items)
using output_t = it_value_t<RandomAccessIteratorOut>;
constexpr int out_size = int{size_of<output_t>};
constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
Comment thread
nanan-nvidia marked this conversation as resolved.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Why can't we vectorize types larger than 16 bytes?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nanan-nvidia I believe this has not been answered. Why can't we vectorize types larger than 16 bytes? Like, if I have a struct { int data[8]; }. That could be vectorized to two 128bit loads.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: Please add a comment with the rational why we pick 16 here. That this to avoid arch dependent max vec size and no perf gain on sm100.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion:

Suggested change
constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
constexpr int possible_vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;

// When the caller guarantees aligned_size_t<N> num_items, i.e. the output pointer is N-byte aligned and the element
// count is a multiple of N, if 1. there are no predicates, 2. memory layout is contiguous, 3. semantically we can

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Critical:

the element count is a multiple of N

This is only true for elements with power of two size. Think of aligned_size_t<16>{n} and int3*, which is valid for every n that is a multiple of 4.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In vectorize_store we require both input element size and output element size to be pow2, so this should not be a bug.

From the definition,

explicit constexpr aligned_size_t(size_t __s) : value(__s) {
  _CCCL_ASSERT(value % align == 0,
               "aligned_size_t must be constructed with a size that is a multiple of the alignment");
}

It seems aligned_size_t<16>(n) just mean n % 16 == 0? Does it actually mean (sizeof(T) * n) % 16 == 0 semantically?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aligned_size_t refers to a size in bytes, not in elements, at least according to what we currently document.

// raw copy, 4. size is power-of-2 and <= 16 bytes we can do vectorized store (STG.128)
Comment thread
nanan-nvidia marked this conversation as resolved.
Outdated
constexpr bool vectorize_store =
Comment thread
nanan-nvidia marked this conversation as resolved.
Outdated
OutputAlign >= 16 && vec_size > 1 && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
&& THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
&& THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && ::cuda::is_power_of_two(out_size)
&& (true && ... && ::cuda::is_power_of_two(int{sizeof(InTs)}));
Comment thread
nanan-nvidia marked this conversation as resolved.
Outdated

if constexpr (vectorize_store)
{
using store_t = decltype(load_store_type<16>());
auto* out_vec = reinterpret_cast<store_t*>(out);
const int num_vectors = valid_items / vec_size;
for (int v = threadIdx.x; v < num_vectors; v += threads_per_block)
{
char* smem = smem_base;
auto load_chunk = [&](auto aligned_ptr) {
using T = typename decltype(aligned_ptr)::value_type;
// on blackwell, head_padding should always be zero
// on hopper, bulk_copy_alignment is 128 bytes, head_padding could be 112 bytes for example
// alignof(T) will always be powers of 2 per C++ standard
const T* base = reinterpret_cast<const T*>(smem + aligned_ptr.head_padding);
smem += tile_padding + int{sizeof(T)} * tile_size;
// Gather this input's vec_size elements for output-vector v into a register array. we take the maximal
// alignment out of alignof(T) and 16 bytes. If input is narrower, we will waste a few (0-16) registers
constexpr ::cuda::std::size_t chunk_align = (::cuda::std::max) (alignof(T), alignof(int4));
::cuda::__uninitialized_array<T, vec_size, chunk_align> elems;
constexpr int chunk_bytes = int{sizeof(T)} * vec_size;
// if same width or narrowing (e.g. int32 -> int8), we split it up into multiple 16 byte reads
// CAREFUL: the byte width sizeof(T) * vec_size can exceed 16 when the input is wider than the output.
// However, since input both input type size and output size is pow2, when the input is wider, it has to be
// pow2 times wider. Therefore, chunk_bytes = input size * vec_size is always divisible by 16
// (recall 16 = output size * vec_size) , i.e. we can read it as multiple int4 loads
if constexpr (chunk_bytes % int{sizeof(int4)} == 0)
{
constexpr int n = chunk_bytes / int{sizeof(int4)};
const int4* s = reinterpret_cast<const int4*>(base) + v * n;
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < n; ++i)
{
reinterpret_cast<int4*>(elems.data())[i] = s[i];
}
}
// if widening (e.g. int8 -> int32), just load it in one go. recall chunk_bytes = input size * vec_size, and
// vec_size = 16 / output size. Since output size is pow2, vec_size is pow2. Hence chunk_bytes is always pow2.
// this ensures load_store_type<chunk_bytes> will never fail.
else
{
using sub_t = decltype(load_store_type<chunk_bytes>());
*reinterpret_cast<sub_t*>(elems.data()) = reinterpret_cast<const sub_t*>(base)[v];
}
return elems;
};
auto chunks = ::cuda::std::tuple{load_chunk(aligned_ptrs)...};

alignas(sizeof(output_t) * vec_size) output_t res[vec_size];
// must fully unroll to take full advantage of ILP. otherwise perf regress by half
_CCCL_PRAGMA_UNROLL_FULL()
for (int k = 0; k < vec_size; ++k)
{
char* smem = smem_base;
auto fetch_operand = [&](auto aligned_ptr) {
using T = typename decltype(aligned_ptr)::value_type;
const int head_padding = alignof(T) < bulk_copy_alignment ? aligned_ptr.head_padding : 0;
const char* src = smem + head_padding;
smem += tile_padding + int{sizeof(T)} * tile_size;
return reinterpret_cast<const T*>(src)[idx];
};

// need to expand into a tuple for guaranteed order of evaluation
::cuda::std::apply(
[&](auto... values) {
if (pred(values...))
{
out[idx] = f(values...);
}
res[k] = ::cuda::std::apply(
[&](auto&... c) {
return f(c[k]...);
},
::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...});
chunks);
}
});
};
// explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
if (tile_size == valid_items)
{
process_tile(::cuda::std::true_type{});
out_vec[v] = *reinterpret_cast<const store_t*>(res);
}
Comment on lines +920 to +1001

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: This is a nontrivial amount of code, I believe we should extract it into a function.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure about this one: this code does not seem to be that reusable across different kernels

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am fine with the code like this, but I agree it's a big unwieldy

}
else
{
process_tile(::cuda::std::false_type{});
auto process_tile = [&](auto full_tile) {
unrolled_for<UnrollFactor>(num_elem_per_thread, [&](int j) {
// TODO(bgruber): fbusato suggests to hoist threadIdx.x out of the loop below
const int idx = j * threads_per_block + threadIdx.x;
if (full_tile || idx < valid_items)
{
char* smem = smem_base;
auto fetch_operand = [&](auto aligned_ptr) {
using T = typename decltype(aligned_ptr)::value_type;
const int head_padding = alignof(T) < bulk_copy_alignment ? aligned_ptr.head_padding : 0;
const char* src = smem + head_padding;
smem += tile_padding + int{sizeof(T)} * tile_size;
return reinterpret_cast<const T*>(src)[idx];
};

// need to expand into a tuple for guaranteed order of evaluation
::cuda::std::apply(
[&](auto... values) {
if (pred(values...))
{
out[idx] = f(values...);
}
},
::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...});
}
});
};
// explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
if (tile_size == valid_items)
{
process_tile(::cuda::std::true_type{});
}
else
{
process_tile(::cuda::std::false_type{});
}
}
}

Expand Down Expand Up @@ -1006,6 +1083,7 @@ template <typename PolicySelector,
typename Predicate,
typename F,
typename RandomAccessIteratorOut,
int OutputAlign,
typename... RandomAccessIteratorsIn>
#if _CCCL_HAS_CONCEPTS()
requires transform_policy_selector<PolicySelector>
Expand Down Expand Up @@ -1068,7 +1146,7 @@ __launch_bounds__(get_threads_per_block<PolicySelector>) _CCCL_KERNEL_ATTRIBUTES
{
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(transform_kernel_ublkcp</*policy*/ policy.async_copy.threads_per_block, policy.async_copy.unroll_factor>(
(transform_kernel_ublkcp<policy.async_copy.threads_per_block, policy.async_copy.unroll_factor, OutputAlign>(
num_items,
num_elem_per_thread,
::cuda::std::move(pred),
Expand Down
103 changes: 103 additions & 0 deletions cub/test/catch2_test_device_transform_aligned.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "insert_nested_NVTX_range_guard.h"

#include <cub/device/device_transform.cuh>

#include <cuda/__memory/aligned_size.h>

#include <algorithm>

#include "catch2_test_launch_helper.h"
#include <c2h/catch2_test_helper.h>
#include <c2h/test_util_vec.h>

// %PARAM% TEST_LAUNCH lid 0:1:2

DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many);

#define ALIGNED_ITEM_COUNTS 0, 16, 128, 4096, 4112, 65536, 99'984

template <typename Out>
struct cast_to
{
template <typename T>
__host__ __device__ Out operator()(T v) const
{
return static_cast<Out>(v);
}
};

C2H_TEST("DeviceTransform::Transform aligned_size_t<16> same-width",
"[device][transform]",
c2h::type_list<std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t, float, double, uchar3>)
Comment thread
nanan-nvidia marked this conversation as resolved.
Outdated
{
using type = c2h::get<0, TestType>;
using offset_t = cuda::std::int64_t;
const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS);
CAPTURE(c2h::type_name<type>(), num_items);

c2h::device_vector<type> a(num_items, thrust::no_init);
c2h::device_vector<type> b(num_items, thrust::no_init);
c2h::gen(C2H_SEED(1), a);
c2h::gen(C2H_SEED(1), b);

c2h::device_vector<type> result(num_items, thrust::no_init);
transform_many(cuda::std::make_tuple(a.begin(), b.begin()),
result.begin(),
cuda::aligned_size_t<16>(num_items),
cuda::std::plus<type>{});

c2h::host_vector<type> a_h = a;
c2h::host_vector<type> b_h = b;
c2h::host_vector<type> reference_h(num_items, thrust::no_init);
std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), cuda::std::plus<type>{});
REQUIRE(reference_h == result);
}

C2H_TEST("DeviceTransform::Transform aligned_size_t<16> narrowing to uint8",
"[device][transform]",
c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
{
using in_t = c2h::get<0, TestType>;
using out_t = std::uint8_t;
using offset_t = cuda::std::int64_t;
const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS);
CAPTURE(c2h::type_name<in_t>(), num_items);

c2h::device_vector<in_t> in(num_items, thrust::no_init);
c2h::gen(C2H_SEED(1), in);

c2h::device_vector<out_t> result(num_items, thrust::no_init);
transform_many(
cuda::std::make_tuple(in.begin()), result.begin(), cuda::aligned_size_t<16>(num_items), cast_to<out_t>{});

c2h::host_vector<in_t> in_h = in;
c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
REQUIRE(reference_h == result);
}

C2H_TEST("DeviceTransform::Transform aligned_size_t<16> widening from uint8",
"[device][transform]",
c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
{
using in_t = std::uint8_t;
using out_t = c2h::get<0, TestType>;
using offset_t = cuda::std::int64_t;
const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS);
CAPTURE(c2h::type_name<out_t>(), num_items);

c2h::device_vector<in_t> in(num_items, thrust::no_init);
c2h::gen(C2H_SEED(1), in);

c2h::device_vector<out_t> result(num_items, thrust::no_init);
transform_many(
cuda::std::make_tuple(in.begin()), result.begin(), cuda::aligned_size_t<16>(num_items), cast_to<out_t>{});

c2h::host_vector<in_t> in_h = in;
c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
REQUIRE(reference_h == result);
}
Loading