-
Notifications
You must be signed in to change notification settings - Fork 417
Vectorize output store in ublkcp DeviceTransform kernel #9481
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. Weβll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
da1fc05
c6559fb
c2a253d
0db60f4
3a5fb1e
505da80
6b64983
5c4cb46
afbddc8
c9a53d7
897a0a4
aff5ab5
71cefee
54db169
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -77,21 +77,24 @@ template <typename PolicySelector, | |
| typename RandomAccessIteratorsIn, | ||
| typename RandomAccessIteratorOut, | ||
| typename Predicate, | ||
| typename TransformOp> | ||
| typename TransformOp, | ||
| int OutputAlign> | ||
| struct TransformKernelSource; | ||
|
|
||
| template <typename PolicySelector, | ||
| typename Offset, | ||
| typename... RandomAccessIteratorsIn, | ||
| typename RandomAccessIteratorOut, | ||
| typename Predicate, | ||
| typename TransformOp> | ||
| typename TransformOp, | ||
| int OutputAlign> | ||
| struct TransformKernelSource<PolicySelector, | ||
| Offset, | ||
| ::cuda::std::tuple<RandomAccessIteratorsIn...>, | ||
| RandomAccessIteratorOut, | ||
| Predicate, | ||
| TransformOp> | ||
| TransformOp, | ||
| OutputAlign> | ||
| { | ||
| // PolicySelector must be stateless, so we can pass the type to the kernel | ||
| static_assert(::cuda::std::is_empty_v<PolicySelector>); | ||
|
|
@@ -103,6 +106,7 @@ struct TransformKernelSource<PolicySelector, | |
| Predicate, | ||
| TransformOp, | ||
| THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<RandomAccessIteratorOut>, | ||
| OutputAlign, | ||
| THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<RandomAccessIteratorsIn>...>); | ||
|
|
||
| template <class ActionT> | ||
|
|
@@ -541,6 +545,7 @@ struct invoke_for_cc<::cuda::std::tuple<RandomAccessIteratorsIn...>, | |
| }; | ||
|
|
||
| template <requires_stable_address StableAddress, | ||
| int OutputAlign = 1, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: |
||
| typename... RandomAccessIteratorsIn, | ||
| typename RandomAccessIteratorOut, | ||
| typename Offset, | ||
|
|
@@ -552,7 +557,8 @@ template <requires_stable_address StableAddress, | |
| ::cuda::std::tuple<RandomAccessIteratorsIn...>, | ||
| RandomAccessIteratorOut, | ||
| Predicate, | ||
| TransformOp>, | ||
| TransformOp, | ||
| OutputAlign>, | ||
| typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> | ||
| #if _CCCL_HAS_CONCEPTS() | ||
| requires transform_policy_selector<PolicySelector> | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -713,6 +713,7 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned( | |||||
| // didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu. | ||||||
| template <int threads_per_block, | ||||||
| int UnrollFactor, | ||||||
| int OutputAlign, | ||||||
This comment was marked as outdated.
Sorry, something went wrong. |
||||||
| typename Offset, | ||||||
| typename Predicate, | ||||||
| typename F, | ||||||
|
|
@@ -895,41 +896,117 @@ _CCCL_DEVICE void transform_kernel_ublkcp( | |||||
| // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below | ||||||
| out += offset; | ||||||
|
|
||||||
| auto process_tile = [&](auto full_tile) { | ||||||
| unrolled_for<UnrollFactor>(num_elem_per_thread, [&](int j) { | ||||||
| // TODO(bgruber): fbusato suggests to hoist threadIdx.x out of the loop below | ||||||
| const int idx = j * threads_per_block + threadIdx.x; | ||||||
| if (full_tile || idx < valid_items) | ||||||
| using output_t = it_value_t<RandomAccessIteratorOut>; | ||||||
| constexpr int out_size = int{size_of<output_t>}; | ||||||
| constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1; | ||||||
|
nanan-nvidia marked this conversation as resolved.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Q: Why can't we vectorize types larger than 16 bytes?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @nanan-nvidia I believe this has not been answered. Why can't we vectorize types larger than 16 bytes? Like, if I have a
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: Please add a comment with the rational why we pick
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion:
Suggested change
|
||||||
| // When the caller guarantees aligned_size_t<N> num_items, i.e. the output pointer is N-byte aligned and the element | ||||||
| // count is a multiple of N, if 1. there are no predicates, 2. memory layout is contiguous, 3. semantically we can | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Critical:
This is only true for elements with power of two size. Think of
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In From the definition, It seems
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
| // raw copy, 4. size is power-of-2 and <= 16 bytes we can do vectorized store (STG.128) | ||||||
|
nanan-nvidia marked this conversation as resolved.
Outdated
|
||||||
| constexpr bool vectorize_store = | ||||||
|
nanan-nvidia marked this conversation as resolved.
Outdated
|
||||||
| OutputAlign >= 16 && vec_size > 1 && ::cuda::std::is_same_v<Predicate, ::cuda::always_true> | ||||||
| && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut> | ||||||
| && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && ::cuda::is_power_of_two(out_size) | ||||||
| && (true && ... && ::cuda::is_power_of_two(int{sizeof(InTs)})); | ||||||
|
nanan-nvidia marked this conversation as resolved.
Outdated
|
||||||
|
|
||||||
| if constexpr (vectorize_store) | ||||||
| { | ||||||
| using store_t = decltype(load_store_type<16>()); | ||||||
| auto* out_vec = reinterpret_cast<store_t*>(out); | ||||||
| const int num_vectors = valid_items / vec_size; | ||||||
| for (int v = threadIdx.x; v < num_vectors; v += threads_per_block) | ||||||
| { | ||||||
| char* smem = smem_base; | ||||||
| auto load_chunk = [&](auto aligned_ptr) { | ||||||
| using T = typename decltype(aligned_ptr)::value_type; | ||||||
| // on blackwell, head_padding should always be zero | ||||||
| // on hopper, bulk_copy_alignment is 128 bytes, head_padding could be 112 bytes for example | ||||||
| // alignof(T) will always be powers of 2 per C++ standard | ||||||
| const T* base = reinterpret_cast<const T*>(smem + aligned_ptr.head_padding); | ||||||
| smem += tile_padding + int{sizeof(T)} * tile_size; | ||||||
| // Gather this input's vec_size elements for output-vector v into a register array. we take the maximal | ||||||
| // alignment out of alignof(T) and 16 bytes. If input is narrower, we will waste a few (0-16) registers | ||||||
| constexpr ::cuda::std::size_t chunk_align = (::cuda::std::max) (alignof(T), alignof(int4)); | ||||||
| ::cuda::__uninitialized_array<T, vec_size, chunk_align> elems; | ||||||
| constexpr int chunk_bytes = int{sizeof(T)} * vec_size; | ||||||
| // if same width or narrowing (e.g. int32 -> int8), we split it up into multiple 16 byte reads | ||||||
| // CAREFUL: the byte width sizeof(T) * vec_size can exceed 16 when the input is wider than the output. | ||||||
| // However, since input both input type size and output size is pow2, when the input is wider, it has to be | ||||||
| // pow2 times wider. Therefore, chunk_bytes = input size * vec_size is always divisible by 16 | ||||||
| // (recall 16 = output size * vec_size) , i.e. we can read it as multiple int4 loads | ||||||
| if constexpr (chunk_bytes % int{sizeof(int4)} == 0) | ||||||
| { | ||||||
| constexpr int n = chunk_bytes / int{sizeof(int4)}; | ||||||
| const int4* s = reinterpret_cast<const int4*>(base) + v * n; | ||||||
| _CCCL_PRAGMA_UNROLL_FULL() | ||||||
| for (int i = 0; i < n; ++i) | ||||||
| { | ||||||
| reinterpret_cast<int4*>(elems.data())[i] = s[i]; | ||||||
| } | ||||||
| } | ||||||
| // if widening (e.g. int8 -> int32), just load it in one go. recall chunk_bytes = input size * vec_size, and | ||||||
| // vec_size = 16 / output size. Since output size is pow2, vec_size is pow2. Hence chunk_bytes is always pow2. | ||||||
| // this ensures load_store_type<chunk_bytes> will never fail. | ||||||
| else | ||||||
| { | ||||||
| using sub_t = decltype(load_store_type<chunk_bytes>()); | ||||||
| *reinterpret_cast<sub_t*>(elems.data()) = reinterpret_cast<const sub_t*>(base)[v]; | ||||||
| } | ||||||
| return elems; | ||||||
| }; | ||||||
| auto chunks = ::cuda::std::tuple{load_chunk(aligned_ptrs)...}; | ||||||
|
|
||||||
| alignas(sizeof(output_t) * vec_size) output_t res[vec_size]; | ||||||
| // must fully unroll to take full advantage of ILP. otherwise perf regress by half | ||||||
| _CCCL_PRAGMA_UNROLL_FULL() | ||||||
| for (int k = 0; k < vec_size; ++k) | ||||||
| { | ||||||
| char* smem = smem_base; | ||||||
| auto fetch_operand = [&](auto aligned_ptr) { | ||||||
| using T = typename decltype(aligned_ptr)::value_type; | ||||||
| const int head_padding = alignof(T) < bulk_copy_alignment ? aligned_ptr.head_padding : 0; | ||||||
| const char* src = smem + head_padding; | ||||||
| smem += tile_padding + int{sizeof(T)} * tile_size; | ||||||
| return reinterpret_cast<const T*>(src)[idx]; | ||||||
| }; | ||||||
|
|
||||||
| // need to expand into a tuple for guaranteed order of evaluation | ||||||
| ::cuda::std::apply( | ||||||
| [&](auto... values) { | ||||||
| if (pred(values...)) | ||||||
| { | ||||||
| out[idx] = f(values...); | ||||||
| } | ||||||
| res[k] = ::cuda::std::apply( | ||||||
| [&](auto&... c) { | ||||||
| return f(c[k]...); | ||||||
| }, | ||||||
| ::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...}); | ||||||
| chunks); | ||||||
| } | ||||||
| }); | ||||||
| }; | ||||||
| // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice | ||||||
| if (tile_size == valid_items) | ||||||
| { | ||||||
| process_tile(::cuda::std::true_type{}); | ||||||
| out_vec[v] = *reinterpret_cast<const store_t*>(res); | ||||||
| } | ||||||
|
Comment on lines
+920
to
+1001
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: This is a nontrivial amount of code, I believe we should extract it into a function.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure about this one: this code does not seem to be that reusable across different kernels
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am fine with the code like this, but I agree it's a big unwieldy |
||||||
| } | ||||||
| else | ||||||
| { | ||||||
| process_tile(::cuda::std::false_type{}); | ||||||
| auto process_tile = [&](auto full_tile) { | ||||||
| unrolled_for<UnrollFactor>(num_elem_per_thread, [&](int j) { | ||||||
| // TODO(bgruber): fbusato suggests to hoist threadIdx.x out of the loop below | ||||||
| const int idx = j * threads_per_block + threadIdx.x; | ||||||
| if (full_tile || idx < valid_items) | ||||||
| { | ||||||
| char* smem = smem_base; | ||||||
| auto fetch_operand = [&](auto aligned_ptr) { | ||||||
| using T = typename decltype(aligned_ptr)::value_type; | ||||||
| const int head_padding = alignof(T) < bulk_copy_alignment ? aligned_ptr.head_padding : 0; | ||||||
| const char* src = smem + head_padding; | ||||||
| smem += tile_padding + int{sizeof(T)} * tile_size; | ||||||
| return reinterpret_cast<const T*>(src)[idx]; | ||||||
| }; | ||||||
|
|
||||||
| // need to expand into a tuple for guaranteed order of evaluation | ||||||
| ::cuda::std::apply( | ||||||
| [&](auto... values) { | ||||||
| if (pred(values...)) | ||||||
| { | ||||||
| out[idx] = f(values...); | ||||||
| } | ||||||
| }, | ||||||
| ::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...}); | ||||||
| } | ||||||
| }); | ||||||
| }; | ||||||
| // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice | ||||||
| if (tile_size == valid_items) | ||||||
| { | ||||||
| process_tile(::cuda::std::true_type{}); | ||||||
| } | ||||||
| else | ||||||
| { | ||||||
| process_tile(::cuda::std::false_type{}); | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -1006,6 +1083,7 @@ template <typename PolicySelector, | |||||
| typename Predicate, | ||||||
| typename F, | ||||||
| typename RandomAccessIteratorOut, | ||||||
| int OutputAlign, | ||||||
| typename... RandomAccessIteratorsIn> | ||||||
| #if _CCCL_HAS_CONCEPTS() | ||||||
| requires transform_policy_selector<PolicySelector> | ||||||
|
|
@@ -1068,7 +1146,7 @@ __launch_bounds__(get_threads_per_block<PolicySelector>) _CCCL_KERNEL_ATTRIBUTES | |||||
| { | ||||||
| NV_IF_TARGET( | ||||||
| NV_PROVIDES_SM_90, | ||||||
| (transform_kernel_ublkcp</*policy*/ policy.async_copy.threads_per_block, policy.async_copy.unroll_factor>( | ||||||
| (transform_kernel_ublkcp<policy.async_copy.threads_per_block, policy.async_copy.unroll_factor, OutputAlign>( | ||||||
| num_items, | ||||||
| num_elem_per_thread, | ||||||
| ::cuda::std::move(pred), | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
|
||
| #include "insert_nested_NVTX_range_guard.h" | ||
|
|
||
| #include <cub/device/device_transform.cuh> | ||
|
|
||
| #include <cuda/__memory/aligned_size.h> | ||
|
|
||
| #include <algorithm> | ||
|
|
||
| #include "catch2_test_launch_helper.h" | ||
| #include <c2h/catch2_test_helper.h> | ||
| #include <c2h/test_util_vec.h> | ||
|
|
||
| // %PARAM% TEST_LAUNCH lid 0:1:2 | ||
|
|
||
| DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); | ||
|
|
||
| #define ALIGNED_ITEM_COUNTS 0, 16, 128, 4096, 4112, 65536, 99'984 | ||
|
|
||
| template <typename Out> | ||
| struct cast_to | ||
| { | ||
| template <typename T> | ||
| __host__ __device__ Out operator()(T v) const | ||
| { | ||
| return static_cast<Out>(v); | ||
| } | ||
| }; | ||
|
|
||
| C2H_TEST("DeviceTransform::Transform aligned_size_t<16> same-width", | ||
| "[device][transform]", | ||
| c2h::type_list<std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t, float, double, uchar3>) | ||
|
nanan-nvidia marked this conversation as resolved.
Outdated
|
||
| { | ||
| using type = c2h::get<0, TestType>; | ||
| using offset_t = cuda::std::int64_t; | ||
| const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS); | ||
| CAPTURE(c2h::type_name<type>(), num_items); | ||
|
|
||
| c2h::device_vector<type> a(num_items, thrust::no_init); | ||
| c2h::device_vector<type> b(num_items, thrust::no_init); | ||
| c2h::gen(C2H_SEED(1), a); | ||
| c2h::gen(C2H_SEED(1), b); | ||
|
|
||
| c2h::device_vector<type> result(num_items, thrust::no_init); | ||
| transform_many(cuda::std::make_tuple(a.begin(), b.begin()), | ||
| result.begin(), | ||
| cuda::aligned_size_t<16>(num_items), | ||
| cuda::std::plus<type>{}); | ||
|
|
||
| c2h::host_vector<type> a_h = a; | ||
| c2h::host_vector<type> b_h = b; | ||
| c2h::host_vector<type> reference_h(num_items, thrust::no_init); | ||
| std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), cuda::std::plus<type>{}); | ||
| REQUIRE(reference_h == result); | ||
| } | ||
|
|
||
| C2H_TEST("DeviceTransform::Transform aligned_size_t<16> narrowing to uint8", | ||
| "[device][transform]", | ||
| c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>) | ||
| { | ||
| using in_t = c2h::get<0, TestType>; | ||
| using out_t = std::uint8_t; | ||
| using offset_t = cuda::std::int64_t; | ||
| const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS); | ||
| CAPTURE(c2h::type_name<in_t>(), num_items); | ||
|
|
||
| c2h::device_vector<in_t> in(num_items, thrust::no_init); | ||
| c2h::gen(C2H_SEED(1), in); | ||
|
|
||
| c2h::device_vector<out_t> result(num_items, thrust::no_init); | ||
| transform_many( | ||
| cuda::std::make_tuple(in.begin()), result.begin(), cuda::aligned_size_t<16>(num_items), cast_to<out_t>{}); | ||
|
|
||
| c2h::host_vector<in_t> in_h = in; | ||
| c2h::host_vector<out_t> reference_h(num_items, thrust::no_init); | ||
| std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{}); | ||
| REQUIRE(reference_h == result); | ||
| } | ||
|
|
||
| C2H_TEST("DeviceTransform::Transform aligned_size_t<16> widening from uint8", | ||
| "[device][transform]", | ||
| c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>) | ||
| { | ||
| using in_t = std::uint8_t; | ||
| using out_t = c2h::get<0, TestType>; | ||
| using offset_t = cuda::std::int64_t; | ||
| const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS); | ||
| CAPTURE(c2h::type_name<out_t>(), num_items); | ||
|
|
||
| c2h::device_vector<in_t> in(num_items, thrust::no_init); | ||
| c2h::gen(C2H_SEED(1), in); | ||
|
|
||
| c2h::device_vector<out_t> result(num_items, thrust::no_init); | ||
| transform_many( | ||
| cuda::std::make_tuple(in.begin()), result.begin(), cuda::aligned_size_t<16>(num_items), cast_to<out_t>{}); | ||
|
|
||
| c2h::host_vector<in_t> in_h = in; | ||
| c2h::host_vector<out_t> reference_h(num_items, thrust::no_init); | ||
| std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{}); | ||
| REQUIRE(reference_h == result); | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Important: Once we approve the general mechanics, we should update the per-function documentation.