-
Notifications
You must be signed in to change notification settings - Fork 417
Vectorize output store in ublkcp DeviceTransform kernel #9481
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. Weβll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 13 commits
da1fc05
c6559fb
c2a253d
0db60f4
3a5fb1e
505da80
6b64983
5c4cb46
afbddc8
c9a53d7
897a0a4
aff5ab5
71cefee
54db169
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -713,6 +713,7 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned( | |
| // didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu. | ||
| template <int threads_per_block, | ||
| int UnrollFactor, | ||
| int StoreVec, | ||
| typename Offset, | ||
| typename Predicate, | ||
| typename F, | ||
|
|
@@ -721,6 +722,7 @@ template <int threads_per_block, | |
| _CCCL_DEVICE void transform_kernel_ublkcp( | ||
| Offset num_items, | ||
| int num_elem_per_thread, | ||
| [[maybe_unused]] bool can_vectorize, | ||
| Predicate pred, | ||
| F f, | ||
| RandomAccessIteratorOut out, | ||
|
|
@@ -895,6 +897,112 @@ _CCCL_DEVICE void transform_kernel_ublkcp( | |
| // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below | ||
| out += offset; | ||
|
|
||
| using output_t = it_value_t<RandomAccessIteratorOut>; | ||
| constexpr int out_size = int{size_of<output_t>}; | ||
| constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1; | ||
|
bernhardmgruber marked this conversation as resolved.
Outdated
|
||
| constexpr int store_vec = (StoreVec > 0) ? (::cuda::std::min) (StoreVec, vec_size) : vec_size; | ||
| // compile time eligibility for the vectorized store (STG.128): | ||
| // 1. there are no predicates | ||
| // 2. memory layout is contiguous | ||
| // 3. semantically we can raw copy | ||
| // 4. size is power-of-2 and <= 16 bytes | ||
| // #TODO(nan): STG.256 (128 should have enough BIF already, but should check perf on blackwell) | ||
| constexpr bool vectorize_eligible = | ||
| store_vec > 1 && ::cuda::is_power_of_two(store_vec) && ::cuda::std::is_same_v<Predicate, ::cuda::always_true> | ||
|
nanan-nvidia marked this conversation as resolved.
Outdated
|
||
| && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut> | ||
| && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && ::cuda::is_power_of_two(out_size) | ||
| && (... && ::cuda::is_power_of_two(int{sizeof(InTs)})); | ||
|
|
||
| if constexpr (vectorize_eligible) | ||
| { | ||
| if (can_vectorize) | ||
| { | ||
| // store_vec (S) output elements per STG.128/64/.../8, defaulting to vec_size (= 16 / sizeof(output), today's | ||
| // 16-byte store). Shrinking S narrows the store but also reduces the number of fully-unrolled lambda calls per | ||
| // store, which bounds register pressure for heavy functors (whose stores aren't the bottleneck anyway). res[] is | ||
| // indexed only by the fully-unrolled k, i.e. compile-time, so it stays in registers and never spills to local | ||
| // memory regardless of S. | ||
| using store_t = decltype(load_store_type<store_vec * out_size>()); | ||
| auto* out_vec = reinterpret_cast<store_t*>(out); | ||
| const int num_groups = valid_items / store_vec; | ||
| for (int g = threadIdx.x; g < num_groups; g += threads_per_block) | ||
| { | ||
| char* smem = smem_base; | ||
| auto load_chunk = [&](auto aligned_ptr) { | ||
| using T = typename decltype(aligned_ptr)::value_type; | ||
| // on blackwell, head_padding should always be zero | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: Please add an assertion for that. |
||
| // on hopper, bulk_copy_alignment is 128 bytes, head_padding could be 112 bytes for example | ||
| // alignof(T) will always be powers of 2 per C++ standard | ||
| const T* base = reinterpret_cast<const T*>(smem + aligned_ptr.head_padding); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: since head padding is zero on blackwell, please only add it below SM100 then ( |
||
| smem += tile_padding + int{sizeof(T)} * tile_size; | ||
| // Gather this input's vec_size elements for output-vector v into a register array. we take the maximal | ||
| // alignment out of alignof(T) and 16 bytes. If input is narrower, we will waste a few (0-16) registers | ||
| constexpr ::cuda::std::size_t chunk_align = (::cuda::std::max) (alignof(T), alignof(int4)); | ||
|
nanan-nvidia marked this conversation as resolved.
|
||
| ::cuda::__uninitialized_array<T, store_vec, chunk_align> elems; | ||
| constexpr int chunk_bytes = int{sizeof(T)} * store_vec; | ||
| // if same width or narrowing (e.g. int32 -> int8), we split it up into multiple 16 byte reads | ||
| // CAREFUL: the byte width sizeof(T) * vec_size can exceed 16 when the input is wider than the output. | ||
| // However, since input both input type size and output size is pow2, when the input is wider, it has to be | ||
| // pow2 times wider. Therefore, chunk_bytes = input size * vec_size is always divisible by 16 | ||
| // (recall 16 = output size * vec_size) , i.e. we can read it as multiple int4 loads | ||
| if constexpr (chunk_bytes % int{sizeof(int4)} == 0) | ||
| { | ||
| constexpr int n = chunk_bytes / int{sizeof(int4)}; | ||
| const int4* s = reinterpret_cast<const int4*>(base) + g * n; | ||
| _CCCL_PRAGMA_UNROLL_FULL() | ||
| for (int i = 0; i < n; ++i) | ||
| { | ||
| reinterpret_cast<int4*>(elems.data())[i] = s[i]; | ||
| } | ||
| } | ||
| // if widening (e.g. int8 -> int32), just load it in one go. recall chunk_bytes = input size * vec_size, and | ||
| // vec_size = 16 / output size. Since output size is pow2, vec_size is pow2. Hence chunk_bytes is always pow2. | ||
| // this ensures load_store_type<chunk_bytes> will never fail. | ||
| else | ||
| { | ||
| using sub_t = decltype(load_store_type<chunk_bytes>()); | ||
| *reinterpret_cast<sub_t*>(elems.data()) = reinterpret_cast<const sub_t*>(base)[g]; | ||
| } | ||
| return elems; | ||
| }; | ||
| auto chunks = ::cuda::std::tuple{load_chunk(aligned_ptrs)...}; | ||
|
|
||
| // must fully unroll to take full advantage of ILP. otherwise perf regress by half | ||
|
bernhardmgruber marked this conversation as resolved.
Outdated
|
||
| ::cuda::__uninitialized_array<output_t, store_vec, sizeof(output_t) * store_vec> res; | ||
| _CCCL_PRAGMA_UNROLL_FULL() | ||
| for (int k = 0; k < store_vec; ++k) | ||
| { | ||
| res[k] = ::cuda::std::apply( | ||
| [&](auto&... c) { | ||
| return f(c[k]...); | ||
| }, | ||
| chunks); | ||
| } | ||
| out_vec[g] = *reinterpret_cast<const store_t*>(res.data()); | ||
| } | ||
|
|
||
| // scalar tail: the up to (store_vec - 1) trailing elements not covered by a whole store group. can_vectorize | ||
| // implies an always_true predicate, so we store unconditionally. | ||
| for (int idx = num_groups * store_vec + threadIdx.x; idx < valid_items; idx += threads_per_block) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Q: Have you tried scaling the items per thread in the host-side dispatch as multiples of the vector store width, so we never need a tail logic?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you elaborate what you meant by scaling? The main motive for adding a tail logic was so that, if the output pointer is aligned to 16 bytes, but the amount of data we store are not aligned to 16 bytes (i.e. 10,000,001 |
||
| { | ||
| char* smem = smem_base; | ||
| auto fetch_operand = [&](auto aligned_ptr) { | ||
| using T = typename decltype(aligned_ptr)::value_type; | ||
| const int head_padding = alignof(T) < bulk_copy_alignment ? aligned_ptr.head_padding : 0; | ||
| const char* src = smem + head_padding; | ||
| smem += tile_padding + int{sizeof(T)} * tile_size; | ||
| return reinterpret_cast<const T*>(src)[idx]; | ||
| }; | ||
| out[idx] = ::cuda::std::apply( | ||
| [&](auto... values) { | ||
| return f(values...); | ||
| }, | ||
| ::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...}); | ||
| } | ||
| return; | ||
| } | ||
|
Comment on lines
+920
to
+1001
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: This is a nontrivial amount of code, I believe we should extract it into a function.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure about this one: this code does not seem to be that reusable across different kernels
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am fine with the code like this, but I agree it's a big unwieldy |
||
| } | ||
|
|
||
| auto process_tile = [&](auto full_tile) { | ||
| unrolled_for<UnrollFactor>(num_elem_per_thread, [&](int j) { | ||
| // TODO(bgruber): fbusato suggests to hoist threadIdx.x out of the loop below | ||
|
|
@@ -1068,9 +1176,12 @@ __launch_bounds__(get_threads_per_block<PolicySelector>) _CCCL_KERNEL_ATTRIBUTES | |
| { | ||
| NV_IF_TARGET( | ||
| NV_PROVIDES_SM_90, | ||
| (transform_kernel_ublkcp</*policy*/ policy.async_copy.threads_per_block, policy.async_copy.unroll_factor>( | ||
| (transform_kernel_ublkcp<policy.async_copy.threads_per_block, | ||
| policy.async_copy.unroll_factor, | ||
| policy.async_copy.store_vec>( | ||
| num_items, | ||
| num_elem_per_thread, | ||
| can_vectorize, | ||
| ::cuda::std::move(pred), | ||
| ::cuda::std::move(f), | ||
| ::cuda::std::move(out), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -145,12 +145,17 @@ struct TransformAsyncCopyPolicy | |
| // Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark) | ||
| int unroll_factor = 1; //!< The unroll factor for the transformation loop in the kernel. The value 0 retains the | ||
| //!< compiler's default unrolling (specifying no unroll pragma), 1 means no unrolling. | ||
| // Vectorized store width for the ublkcp kernel. 0 means "auto": store_vec = 16 / sizeof(output) (a 16-byte STG.128). | ||
| // Setting it smaller narrows the store but also reduces the number of fully-unrolled lambda calls per store, which | ||
| // bounds register pressure for heavy functors (their stores aren't the bottleneck anyway). | ||
| int store_vec = 0; //!< Output elements per vectorized store (S). 0 = auto (16 / sizeof(output)). | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important question: Why do we need a new tuning parameter, and cannot use
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree with the analysis that in the vectorized path, I think they should still be different parameters in the policy, because
|
||
|
|
||
| [[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool | ||
| operator==(const TransformAsyncCopyPolicy& lhs, const TransformAsyncCopyPolicy& rhs) noexcept | ||
| { | ||
| return lhs.threads_per_block == rhs.threads_per_block && lhs.min_items_per_thread == rhs.min_items_per_thread | ||
| && lhs.max_items_per_thread == rhs.max_items_per_thread && lhs.unroll_factor == rhs.unroll_factor; | ||
| && lhs.max_items_per_thread == rhs.max_items_per_thread && lhs.unroll_factor == rhs.unroll_factor | ||
| && lhs.store_vec == rhs.store_vec; | ||
| } | ||
|
|
||
| [[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool | ||
|
|
@@ -162,9 +167,11 @@ struct TransformAsyncCopyPolicy | |
| #if _CCCL_HOSTED() | ||
| friend ::std::ostream& operator<<(::std::ostream& os, const TransformAsyncCopyPolicy& policy) | ||
| { | ||
| return os << "TransformAsyncCopyPolicy { .threads_per_block = " << policy.threads_per_block | ||
| << ", .min_items_per_thread = " << policy.min_items_per_thread << ", .max_items_per_thread = " | ||
| << policy.max_items_per_thread << ", .unroll_factor = " << policy.unroll_factor << " }"; | ||
| return os | ||
| << "TransformAsyncCopyPolicy { .threads_per_block = " << policy.threads_per_block | ||
| << ", .min_items_per_thread = " << policy.min_items_per_thread | ||
| << ", .max_items_per_thread = " << policy.max_items_per_thread << ", .unroll_factor = " << policy.unroll_factor | ||
| << ", .store_vec = " << policy.store_vec << " }"; | ||
| } | ||
| #endif // _CCCL_HOSTED() | ||
| }; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,111 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
|
||
| #include "insert_nested_NVTX_range_guard.h" | ||
|
|
||
| #include <cub/device/device_transform.cuh> | ||
|
|
||
| #include <algorithm> | ||
|
|
||
| #include "catch2_test_launch_helper.h" | ||
| #include <c2h/catch2_test_helper.h> | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
| // %PARAM% TEST_LAUNCH lid 0:1:2 | ||
|
|
||
| DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); | ||
|
|
||
| // Generic counts, deliberately including non-multiples of the 16-byte vectorized store width, to exercise the scalar | ||
| // tail of the ublkcp vectorized store path on aligned (c2h::device_vector) buffers. | ||
| #define GENERIC_COUNTS 0, 1, 15, 16, 17, 127, 129, 4095, 4097, 100'000 | ||
|
|
||
| template <typename Out> | ||
| struct cast_to | ||
| { | ||
| template <typename T> | ||
| __host__ __device__ Out operator()(T v) const | ||
| { | ||
| return static_cast<Out>(v); | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| } | ||
| }; | ||
|
|
||
| // Narrowing widths (e.g. uint32 -> uint8) drive the multi-int4-load gather; widening (uint8 -> uint32) drives the | ||
| // sub-16-byte load. Same-width is already covered by catch2_test_device_transform.cu's BabelStream add. | ||
| C2H_TEST("DeviceTransform::Transform vectorized store narrowing to uint8", | ||
| "[device][transform]", | ||
| c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>) | ||
| { | ||
| using in_t = c2h::get<0, TestType>; | ||
| using out_t = std::uint8_t; | ||
| using offset_t = cuda::std::int64_t; | ||
| const offset_t num_items = GENERATE(GENERIC_COUNTS); | ||
| CAPTURE(c2h::type_name<in_t>(), num_items); | ||
|
|
||
| c2h::device_vector<in_t> in(num_items, thrust::no_init); | ||
| c2h::gen(C2H_SEED(1), in); | ||
|
|
||
| c2h::device_vector<out_t> result(num_items, thrust::no_init); | ||
| transform_many(cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{}); | ||
|
|
||
| c2h::host_vector<in_t> in_h = in; | ||
| c2h::host_vector<out_t> reference_h(num_items, thrust::no_init); | ||
| std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{}); | ||
| REQUIRE(reference_h == result); | ||
| } | ||
|
|
||
| C2H_TEST("DeviceTransform::Transform vectorized store widening from uint8", | ||
| "[device][transform]", | ||
| c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>) | ||
| { | ||
| using in_t = std::uint8_t; | ||
| using out_t = c2h::get<0, TestType>; | ||
| using offset_t = cuda::std::int64_t; | ||
| const offset_t num_items = GENERATE(GENERIC_COUNTS); | ||
| CAPTURE(c2h::type_name<out_t>(), num_items); | ||
|
|
||
| c2h::device_vector<in_t> in(num_items, thrust::no_init); | ||
| c2h::gen(C2H_SEED(1), in); | ||
|
|
||
| c2h::device_vector<out_t> result(num_items, thrust::no_init); | ||
| transform_many(cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{}); | ||
|
|
||
| c2h::host_vector<in_t> in_h = in; | ||
| c2h::host_vector<out_t> reference_h(num_items, thrust::no_init); | ||
| std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{}); | ||
| REQUIRE(reference_h == result); | ||
| } | ||
|
|
||
| struct ublkcp_store_vec_3_selector | ||
| { | ||
| _CCCL_HOST_DEVICE_API constexpr auto operator()(::cuda::compute_capability cc) const -> cub::TransformPolicy | ||
|
nanan-nvidia marked this conversation as resolved.
|
||
| { | ||
| auto async = cub::TransformAsyncCopyPolicy{}; | ||
| async.threads_per_block = 256; | ||
| async.store_vec = 3; | ||
|
nanan-nvidia marked this conversation as resolved.
Outdated
|
||
| const auto algorithm = | ||
| (cc < ::cuda::compute_capability{9, 0}) ? cub::TransformAlgorithm::prefetch : cub::TransformAlgorithm::ublkcp; | ||
| return {64 * 1024, algorithm, cub::TransformPrefetchPolicy{256}, {}, async}; | ||
| } | ||
| }; | ||
|
|
||
| C2H_TEST("DeviceTransform::Transform non-power-of-two store_vec falls back to scalar", "[device][transform]") | ||
| { | ||
| using in_t = std::uint32_t; | ||
| using out_t = std::uint8_t; | ||
| const cuda::std::int64_t num_items = GENERATE(GENERIC_COUNTS); | ||
| CAPTURE(num_items); | ||
|
|
||
| c2h::device_vector<in_t> in(num_items, thrust::no_init); | ||
| c2h::gen(C2H_SEED(1), in); | ||
|
|
||
| c2h::device_vector<out_t> result(num_items, thrust::no_init); | ||
| auto env = cuda::execution::tune(ublkcp_store_vec_3_selector{}); | ||
| REQUIRE(cudaSuccess | ||
| == cub::DeviceTransform::Transform( | ||
| cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{}, env)); | ||
| REQUIRE(cudaSuccess == cudaDeviceSynchronize()); | ||
|
|
||
| c2h::host_vector<in_t> in_h = in; | ||
| c2h::host_vector<out_t> reference_h(num_items, thrust::no_init); | ||
| std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{}); | ||
| REQUIRE(reference_h == result); | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.