NVIDIA · nanan-nvidia · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
@@ -28,6 +28,7 @@
 #include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/__cmath/pow2.h>
 #include <cuda/__memory/is_aligned.h>
 #include <cuda/std/__algorithm/clamp.h>
 #include <cuda/std/__algorithm/max.h>
@@ -313,11 +314,28 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_async_algorithm(
   }
 
   auto [launcher, kernel, items_per_thread] = *ret;
+
+  // Runtime check whether this launch can take the ublkcp kernel's vectorized (STG.128) store path. The output value
+  // type must pack into a 16-byte vector and all pointers must be suitably aligned. The kernel additionally gates on
+  // compile-time eligibility (contiguous, trivially relocatable, power-of-two element sizes, no predicate).
+  bool can_vectorize = false;
+  if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>)
+  {
+    using output_t         = it_value_t<RandomAccessIteratorOut>;
+    constexpr int out_size = int{size_of<output_t>};
+    constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
+    if constexpr (vec_size > 1 && ::cuda::is_power_of_two(out_size)
+                  && (... && ::cuda::is_power_of_two(int{sizeof(it_value_t<RandomAccessIteratorsIn>)})))
+    {
+      can_vectorize = kernel_source.CanVectorize(vec_size, out, ::cuda::std::get<Is>(in)...);
+    }
+  }
+
   return launcher.doit(
     kernel,
     num_items,
     items_per_thread,
-    false,
+    can_vectorize,
     pred,
     op,
     THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out),

@@ -713,6 +713,7 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned(
 // didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu.
 template <int threads_per_block,
           int UnrollFactor,
+          int StoreVec,
           typename Offset,
           typename Predicate,
           typename F,
@@ -721,6 +722,7 @@ template <int threads_per_block,
 _CCCL_DEVICE void transform_kernel_ublkcp(
   Offset num_items,
   int num_elem_per_thread,
+  [[maybe_unused]] bool can_vectorize,
   Predicate pred,
   F f,
   RandomAccessIteratorOut out,
@@ -895,6 +897,112 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
   // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
   out += offset;
 
+  using output_t          = it_value_t<RandomAccessIteratorOut>;
+  constexpr int out_size  = int{size_of<output_t>};
+  constexpr int vec_size  = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
+  constexpr int store_vec = (StoreVec > 0) ? (::cuda::std::min) (StoreVec, vec_size) : vec_size;
+  // compile time eligibility for the vectorized store (STG.128):
+  // 1. there are no predicates
+  // 2. memory layout is contiguous
+  // 3. semantically we can raw copy
+  // 4. size is power-of-2 and <= 16 bytes
+  // #TODO(nan): STG.256 (128 should have enough BIF already, but should check perf on blackwell)
+  constexpr bool vectorize_eligible =
+    store_vec > 1 && ::cuda::is_power_of_two(store_vec) && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
+    && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
+    && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && ::cuda::is_power_of_two(out_size)
+    && (... && ::cuda::is_power_of_two(int{sizeof(InTs)}));
+
+  if constexpr (vectorize_eligible)
+  {
+    if (can_vectorize)
+    {
+      // store_vec (S) output elements per STG.128/64/.../8, defaulting to vec_size (= 16 / sizeof(output), today's
+      // 16-byte store). Shrinking S narrows the store but also reduces the number of fully-unrolled lambda calls per
+      // store, which bounds register pressure for heavy functors (whose stores aren't the bottleneck anyway). res[] is
+      // indexed only by the fully-unrolled k, i.e. compile-time, so it stays in registers and never spills to local
+      // memory regardless of S.
+      using store_t        = decltype(load_store_type<store_vec * out_size>());
+      auto* out_vec        = reinterpret_cast<store_t*>(out);
+      const int num_groups = valid_items / store_vec;
+      for (int g = threadIdx.x; g < num_groups; g += threads_per_block)
+      {
+        char* smem      = smem_base;
+        auto load_chunk = [&](auto aligned_ptr) {
+          using T = typename decltype(aligned_ptr)::value_type;
+          // on blackwell, head_padding should always be zero
+          // on hopper, bulk_copy_alignment is 128 bytes, head_padding could be 112 bytes for example
+          // alignof(T) will always be powers of 2 per C++ standard
+          const T* base = reinterpret_cast<const T*>(smem + aligned_ptr.head_padding);
+          smem += tile_padding + int{sizeof(T)} * tile_size;
+          // Gather this input's vec_size elements for output-vector v into a register array. we take the maximal
+          // alignment out of alignof(T) and 16 bytes. If input is narrower, we will waste a few (0-16) registers
+          constexpr ::cuda::std::size_t chunk_align = (::cuda::std::max) (alignof(T), alignof(int4));
+          ::cuda::__uninitialized_array<T, store_vec, chunk_align> elems;
+          constexpr int chunk_bytes = int{sizeof(T)} * store_vec;
+          // if same width or narrowing (e.g. int32 -> int8), we split it up into multiple 16 byte reads
+          // CAREFUL: the byte width sizeof(T) * vec_size can exceed 16 when the input is wider than the output.
+          // However, since input both input type size and output size is pow2, when the input is wider, it has to be
+          // pow2 times wider. Therefore, chunk_bytes = input size * vec_size is always divisible by 16
+          // (recall 16 = output size * vec_size) , i.e. we can read it as multiple int4 loads
+          if constexpr (chunk_bytes % int{sizeof(int4)} == 0)
+          {
+            constexpr int n = chunk_bytes / int{sizeof(int4)};
+            const int4* s   = reinterpret_cast<const int4*>(base) + g * n;
+            _CCCL_PRAGMA_UNROLL_FULL()
+            for (int i = 0; i < n; ++i)
+            {
+              reinterpret_cast<int4*>(elems.data())[i] = s[i];
+            }
+          }
+          // if widening (e.g. int8 -> int32), just load it in one go. recall chunk_bytes = input size * vec_size, and
+          // vec_size = 16 / output size. Since output size is pow2, vec_size is pow2. Hence chunk_bytes is always pow2.
+          // this ensures load_store_type<chunk_bytes> will never fail.
+          else
+          {
+            using sub_t                             = decltype(load_store_type<chunk_bytes>());
+            *reinterpret_cast<sub_t*>(elems.data()) = reinterpret_cast<const sub_t*>(base)[g];
+          }
+          return elems;
+        };
+        auto chunks = ::cuda::std::tuple{load_chunk(aligned_ptrs)...};
+
+        // must fully unroll to take full advantage of ILP. otherwise perf regress by half
+        ::cuda::__uninitialized_array<output_t, store_vec, sizeof(output_t) * store_vec> res;
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int k = 0; k < store_vec; ++k)
+        {
+          res[k] = ::cuda::std::apply(
+            [&](auto&... c) {
+              return f(c[k]...);
+            },
+            chunks);
+        }
+        out_vec[g] = *reinterpret_cast<const store_t*>(res.data());
+      }
+
+      // scalar tail: the up to (store_vec - 1) trailing elements not covered by a whole store group. can_vectorize
+      // implies an always_true predicate, so we store unconditionally.
+      for (int idx = num_groups * store_vec + threadIdx.x; idx < valid_items; idx += threads_per_block)
+      {
+        char* smem         = smem_base;
+        auto fetch_operand = [&](auto aligned_ptr) {
+          using T                = typename decltype(aligned_ptr)::value_type;
+          const int head_padding = alignof(T) < bulk_copy_alignment ? aligned_ptr.head_padding : 0;
+          const char* src        = smem + head_padding;
+          smem += tile_padding + int{sizeof(T)} * tile_size;
+          return reinterpret_cast<const T*>(src)[idx];
+        };
+        out[idx] = ::cuda::std::apply(
+          [&](auto... values) {
+            return f(values...);
+          },
+          ::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...});
+      }
+      return;
+    }
+  }
+
   auto process_tile = [&](auto full_tile) {
     unrolled_for<UnrollFactor>(num_elem_per_thread, [&](int j) {
       // TODO(bgruber): fbusato suggests to hoist threadIdx.x out of the loop below
@@ -1068,9 +1176,12 @@ __launch_bounds__(get_threads_per_block<PolicySelector>) _CCCL_KERNEL_ATTRIBUTES
   {
     NV_IF_TARGET(
       NV_PROVIDES_SM_90,
-      (transform_kernel_ublkcp</*policy*/ policy.async_copy.threads_per_block, policy.async_copy.unroll_factor>(
+      (transform_kernel_ublkcp<policy.async_copy.threads_per_block,
+                               policy.async_copy.unroll_factor,
+                               policy.async_copy.store_vec>(
          num_items,
          num_elem_per_thread,
+         can_vectorize,
          ::cuda::std::move(pred),
          ::cuda::std::move(f),
          ::cuda::std::move(out),

@@ -145,12 +145,17 @@ struct TransformAsyncCopyPolicy
   // Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark)
   int unroll_factor = 1; //!< The unroll factor for the transformation loop in the kernel. The value 0 retains the
                          //!< compiler's default unrolling (specifying no unroll pragma), 1 means no unrolling.
+  // Vectorized store width for the ublkcp kernel. 0 means "auto": store_vec = 16 / sizeof(output) (a 16-byte STG.128).
+  // Setting it smaller narrows the store but also reduces the number of fully-unrolled lambda calls per store, which
+  // bounds register pressure for heavy functors (their stores aren't the bottleneck anyway).
+  int store_vec = 0; //!< Output elements per vectorized store (S). 0 = auto (16 / sizeof(output)).
 
   [[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
   operator==(const TransformAsyncCopyPolicy& lhs, const TransformAsyncCopyPolicy& rhs) noexcept
   {
     return lhs.threads_per_block == rhs.threads_per_block && lhs.min_items_per_thread == rhs.min_items_per_thread
-        && lhs.max_items_per_thread == rhs.max_items_per_thread && lhs.unroll_factor == rhs.unroll_factor;
+        && lhs.max_items_per_thread == rhs.max_items_per_thread && lhs.unroll_factor == rhs.unroll_factor
+        && lhs.store_vec == rhs.store_vec;
   }
 
   [[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
@@ -162,9 +167,11 @@ struct TransformAsyncCopyPolicy
 #if _CCCL_HOSTED()
   friend ::std::ostream& operator<<(::std::ostream& os, const TransformAsyncCopyPolicy& policy)
   {
-    return os << "TransformAsyncCopyPolicy { .threads_per_block = " << policy.threads_per_block
-              << ", .min_items_per_thread = " << policy.min_items_per_thread << ", .max_items_per_thread = "
-              << policy.max_items_per_thread << ", .unroll_factor = " << policy.unroll_factor << " }";
+    return os
+        << "TransformAsyncCopyPolicy { .threads_per_block = " << policy.threads_per_block
+        << ", .min_items_per_thread = " << policy.min_items_per_thread
+        << ", .max_items_per_thread = " << policy.max_items_per_thread << ", .unroll_factor = " << policy.unroll_factor
+        << ", .store_vec = " << policy.store_vec << " }";
   }
 #endif // _CCCL_HOSTED()
 };

@@ -0,0 +1,111 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_transform.cuh>
+
+#include <algorithm>
+
+#include "catch2_test_launch_helper.h"
+#include <c2h/catch2_test_helper.h>
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many);
+
+// Generic counts, deliberately including non-multiples of the 16-byte vectorized store width, to exercise the scalar
+// tail of the ublkcp vectorized store path on aligned (c2h::device_vector) buffers.
+#define GENERIC_COUNTS 0, 1, 15, 16, 17, 127, 129, 4095, 4097, 100'000
+
+template <typename Out>
+struct cast_to
+{
+  template <typename T>
+  __host__ __device__ Out operator()(T v) const
+  {
+    return static_cast<Out>(v);
+  }
+};
+
+// Narrowing widths (e.g. uint32 -> uint8) drive the multi-int4-load gather; widening (uint8 -> uint32) drives the
+// sub-16-byte load. Same-width is already covered by catch2_test_device_transform.cu's BabelStream add.
+C2H_TEST("DeviceTransform::Transform vectorized store narrowing to uint8",
+         "[device][transform]",
+         c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
+{
+  using in_t               = c2h::get<0, TestType>;
+  using out_t              = std::uint8_t;
+  using offset_t           = cuda::std::int64_t;
+  const offset_t num_items = GENERATE(GENERIC_COUNTS);
+  CAPTURE(c2h::type_name<in_t>(), num_items);
+
+  c2h::device_vector<in_t> in(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), in);
+
+  c2h::device_vector<out_t> result(num_items, thrust::no_init);
+  transform_many(cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{});
+
+  c2h::host_vector<in_t> in_h = in;
+  c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
+  std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
+  REQUIRE(reference_h == result);
+}
+
+C2H_TEST("DeviceTransform::Transform vectorized store widening from uint8",
+         "[device][transform]",
+         c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
+{
+  using in_t               = std::uint8_t;
+  using out_t              = c2h::get<0, TestType>;
+  using offset_t           = cuda::std::int64_t;
+  const offset_t num_items = GENERATE(GENERIC_COUNTS);
+  CAPTURE(c2h::type_name<out_t>(), num_items);
+
+  c2h::device_vector<in_t> in(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), in);
+
+  c2h::device_vector<out_t> result(num_items, thrust::no_init);
+  transform_many(cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{});
+
+  c2h::host_vector<in_t> in_h = in;
+  c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
+  std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
+  REQUIRE(reference_h == result);
+}
+
+struct ublkcp_store_vec_3_selector
+{
+  _CCCL_HOST_DEVICE_API constexpr auto operator()(::cuda::compute_capability cc) const -> cub::TransformPolicy
+  {
+    auto async              = cub::TransformAsyncCopyPolicy{};
+    async.threads_per_block = 256;
+    async.store_vec         = 3;
+    const auto algorithm =
+      (cc < ::cuda::compute_capability{9, 0}) ? cub::TransformAlgorithm::prefetch : cub::TransformAlgorithm::ublkcp;
+    return {64 * 1024, algorithm, cub::TransformPrefetchPolicy{256}, {}, async};
+  }
+};
+
+C2H_TEST("DeviceTransform::Transform non-power-of-two store_vec falls back to scalar", "[device][transform]")
+{
+  using in_t                         = std::uint32_t;
+  using out_t                        = std::uint8_t;
+  const cuda::std::int64_t num_items = GENERATE(GENERIC_COUNTS);
+  CAPTURE(num_items);
+
+  c2h::device_vector<in_t> in(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), in);
+
+  c2h::device_vector<out_t> result(num_items, thrust::no_init);
+  auto env = cuda::execution::tune(ublkcp_store_vec_3_selector{});
+  REQUIRE(cudaSuccess
+          == cub::DeviceTransform::Transform(
+            cuda::std::make_tuple(in.begin()), result.begin(), num_items, cast_to<out_t>{}, env));
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+
+  c2h::host_vector<in_t> in_h = in;
+  c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
+  std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
+  REQUIRE(reference_h == result);
+}