NVIDIA · nanan-nvidia · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/c/parallel/src/transform.cu b/c/parallel/src/transform.cu
@@ -72,7 +72,7 @@ get_kernel_name(std::string_view input_iterator_t, std::string_view output_itera
   check(cccl_type_name_from_nvrtc<OffsetT>(&offset_t));
 
   return std::format(
-    "cub::detail::transform::transform_kernel<{0}, {1}, cuda::always_true, {2}, {3}, {4}>",
+    "cub::detail::transform::transform_kernel<{0}, {1}, cuda::always_true, {2}, {3}, 1, {4}>",
     chained_policy_t, // 0
     offset_t, // 1
     transform_op_t, // 2
@@ -92,7 +92,7 @@ std::string get_kernel_name(std::string_view input1_iterator_t,
   check(cccl_type_name_from_nvrtc<OffsetT>(&offset_t));
 
   return std::format(
-    "cub::detail::transform::transform_kernel<{0}, {1}, cuda::always_true, {2}, {3}, {4}, "
+    "cub::detail::transform::transform_kernel<{0}, {1}, cuda::always_true, {2}, {3}, 1, {4}, "
     "{5}>",
     chained_policy_t, // 0
     offset_t, // 1

diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh
@@ -22,6 +22,7 @@
 #include <cuda/__functional/always_true_false.h>
 #include <cuda/__functional/call_or.h>
 #include <cuda/__iterator/zip_iterator.h>
+#include <cuda/__memory/aligned_size.h>
 #include <cuda/__stream/get_stream.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/tuple>
@@ -92,7 +93,20 @@ struct DeviceTransform
     // https://github.com/NVIDIA/cccl/issues/8805 for data. We use choose_signed_offset to just check if it can hold the
     // value passed by the user, but otherwise ignore the chosen signed offset type.
     using offset_t = ::cuda::std::int64_t;
-    if (const cudaError_t error = detail::choose_signed_offset<NumItemsT>::is_exceeding_offset_type(num_items))
+    // num_items may be an aligned_size_t<N> (a caller promise that in/out are N-byte aligned and the count is a
+    // multiple of N). Unwrap it to the underlying integer.
+    const auto num_items_count = [&]() {
+      if constexpr (::cuda::std::is_integral_v<NumItemsT>)
+      {
+        return num_items;
+      }
+      else
+      {
+        return static_cast<::cuda::std::size_t>(num_items);
+      }
+    }();
+    using count_t = ::cuda::std::remove_const_t<decltype(num_items_count)>;
+    if (const cudaError_t error = detail::choose_signed_offset<count_t>::is_exceeding_offset_type(num_items_count))
     {
       return error;
     }
@@ -114,10 +128,13 @@ struct DeviceTransform
     static_assert(detail::transform::transform_policy_selector<policy_selector>);
 #endif // _CCCL_HAS_CONCEPTS()
 
-    return detail::transform::dispatch<StableAddress>(
+    // An aligned_size_t<N> num_items lets the caller promise N-byte-aligned in/out pointers and a count that is a
+    // multiple of N. This is a compile-time guarantee so the ublkcp kernel can take the vector-store path.
+    constexpr int out_align = static_cast<int>(::cuda::__get_size_align_v<NumItemsT>);
+    return detail::transform::dispatch<StableAddress, out_align>(
       ::cuda::std::move(inputs),
       ::cuda::std::move(output),
-      static_cast<offset_t>(num_items),
+      static_cast<offset_t>(num_items_count),
       ::cuda::std::move(predicate),
       ::cuda::std::move(transform_op),
       stream,

@@ -77,21 +77,24 @@ template <typename PolicySelector,
           typename RandomAccessIteratorsIn,
           typename RandomAccessIteratorOut,
           typename Predicate,
-          typename TransformOp>
+          typename TransformOp,
+          int OutputAlign>
 struct TransformKernelSource;
 
 template <typename PolicySelector,
           typename Offset,
           typename... RandomAccessIteratorsIn,
           typename RandomAccessIteratorOut,
           typename Predicate,
-          typename TransformOp>
+          typename TransformOp,
+          int OutputAlign>
 struct TransformKernelSource<PolicySelector,
                              Offset,
                              ::cuda::std::tuple<RandomAccessIteratorsIn...>,
                              RandomAccessIteratorOut,
                              Predicate,
-                             TransformOp>
+                             TransformOp,
+                             OutputAlign>
 {
   // PolicySelector must be stateless, so we can pass the type to the kernel
   static_assert(::cuda::std::is_empty_v<PolicySelector>);
@@ -103,6 +106,7 @@ struct TransformKernelSource<PolicySelector,
                      Predicate,
                      TransformOp,
                      THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<RandomAccessIteratorOut>,
+                     OutputAlign,
                      THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<RandomAccessIteratorsIn>...>);
 
   template <class ActionT>
@@ -541,6 +545,7 @@ struct invoke_for_cc<::cuda::std::tuple<RandomAccessIteratorsIn...>,
 };
 
 template <requires_stable_address StableAddress,
+          int OutputAlign = 1,
           typename... RandomAccessIteratorsIn,
           typename RandomAccessIteratorOut,
           typename Offset,
@@ -552,7 +557,8 @@ template <requires_stable_address StableAddress,
                                                                  ::cuda::std::tuple<RandomAccessIteratorsIn...>,
                                                                  RandomAccessIteratorOut,
                                                                  Predicate,
-                                                                 TransformOp>,
+                                                                 TransformOp,
+                                                                 OutputAlign>,
           typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
 #if _CCCL_HAS_CONCEPTS()
   requires transform_policy_selector<PolicySelector>

@@ -713,6 +713,7 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned(
 // didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu.
 template <int threads_per_block,
           int UnrollFactor,
+          int OutputAlign,
           typename Offset,
           typename Predicate,
           typename F,
@@ -895,41 +896,117 @@ _CCCL_DEVICE void transform_kernel_ublkcp(
   // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
   out += offset;
 
-  auto process_tile = [&](auto full_tile) {
-    unrolled_for<UnrollFactor>(num_elem_per_thread, [&](int j) {
-      // TODO(bgruber): fbusato suggests to hoist threadIdx.x out of the loop below
-      const int idx = j * threads_per_block + threadIdx.x;
-      if (full_tile || idx < valid_items)
+  using output_t         = it_value_t<RandomAccessIteratorOut>;
+  constexpr int out_size = int{size_of<output_t>};
+  constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
-  constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
+  constexpr int possible_vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
-  constexpr int vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
+  constexpr int possible_vec_size = (out_size > 0 && out_size <= 16) ? 16 / out_size : 1;
+  // When the caller guarantees aligned_size_t<N> num_items, i.e. the output pointer is N-byte aligned and the element
+  // count is a multiple of N, if 1. there are no predicates, 2. memory layout is contiguous, 3. semantically we can
+  // raw copy, 4. size is power-of-2 and <= 16 bytes  we can do vectorized store (STG.128)
+  constexpr bool vectorize_store =
+    OutputAlign >= 16 && vec_size > 1 && ::cuda::std::is_same_v<Predicate, ::cuda::always_true>
+    && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
+    && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && ::cuda::is_power_of_two(out_size)
+    && (true && ... && ::cuda::is_power_of_two(int{sizeof(InTs)}));
+
+  if constexpr (vectorize_store)
+  {
+    using store_t         = decltype(load_store_type<16>());
+    auto* out_vec         = reinterpret_cast<store_t*>(out);
+    const int num_vectors = valid_items / vec_size;
+    for (int v = threadIdx.x; v < num_vectors; v += threads_per_block)
+    {
+      char* smem      = smem_base;
+      auto load_chunk = [&](auto aligned_ptr) {
+        using T = typename decltype(aligned_ptr)::value_type;
+        // on blackwell, head_padding should always be zero
+        // on hopper, bulk_copy_alignment is 128 bytes, head_padding could be 112 bytes for example
+        // alignof(T) will always be powers of 2 per C++ standard
+        const T* base = reinterpret_cast<const T*>(smem + aligned_ptr.head_padding);
+        smem += tile_padding + int{sizeof(T)} * tile_size;
+        // Gather this input's vec_size elements for output-vector v into a register array. we take the maximal
+        // alignment out of alignof(T) and 16 bytes. If input is narrower, we will waste a few (0-16) registers
+        constexpr ::cuda::std::size_t chunk_align = (::cuda::std::max) (alignof(T), alignof(int4));
+        ::cuda::__uninitialized_array<T, vec_size, chunk_align> elems;
+        constexpr int chunk_bytes = int{sizeof(T)} * vec_size;
+        // if same width or narrowing (e.g. int32 -> int8), we split it up into multiple 16 byte reads
+        // CAREFUL: the byte width sizeof(T) * vec_size can exceed 16 when the input is wider than the output.
+        // However, since input both input type size and output size is pow2, when the input is wider, it has to be
+        // pow2 times wider. Therefore, chunk_bytes = input size * vec_size is always divisible by 16
+        // (recall 16 = output size * vec_size) , i.e. we can read it as multiple int4 loads
+        if constexpr (chunk_bytes % int{sizeof(int4)} == 0)
+        {
+          constexpr int n = chunk_bytes / int{sizeof(int4)};
+          const int4* s   = reinterpret_cast<const int4*>(base) + v * n;
+          _CCCL_PRAGMA_UNROLL_FULL()
+          for (int i = 0; i < n; ++i)
+          {
+            reinterpret_cast<int4*>(elems.data())[i] = s[i];
+          }
+        }
+        // if widening (e.g. int8 -> int32), just load it in one go. recall chunk_bytes = input size * vec_size, and
+        // vec_size = 16 / output size. Since output size is pow2, vec_size is pow2. Hence chunk_bytes is always pow2.
+        // this ensures load_store_type<chunk_bytes> will never fail.
+        else
+        {
+          using sub_t                             = decltype(load_store_type<chunk_bytes>());
+          *reinterpret_cast<sub_t*>(elems.data()) = reinterpret_cast<const sub_t*>(base)[v];
+        }
+        return elems;
+      };
+      auto chunks = ::cuda::std::tuple{load_chunk(aligned_ptrs)...};
+
+      alignas(sizeof(output_t) * vec_size) output_t res[vec_size];
+      // must fully unroll to take full advantage of ILP. otherwise perf regress by half
+      _CCCL_PRAGMA_UNROLL_FULL()
+      for (int k = 0; k < vec_size; ++k)
       {
-        char* smem         = smem_base;
-        auto fetch_operand = [&](auto aligned_ptr) {
-          using T                = typename decltype(aligned_ptr)::value_type;
-          const int head_padding = alignof(T) < bulk_copy_alignment ? aligned_ptr.head_padding : 0;
-          const char* src        = smem + head_padding;
-          smem += tile_padding + int{sizeof(T)} * tile_size;
-          return reinterpret_cast<const T*>(src)[idx];
-        };
-
-        // need to expand into a tuple for guaranteed order of evaluation
-        ::cuda::std::apply(
-          [&](auto... values) {
-            if (pred(values...))
-            {
-              out[idx] = f(values...);
-            }
+        res[k] = ::cuda::std::apply(
+          [&](auto&... c) {
+            return f(c[k]...);
           },
-          ::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...});
+          chunks);
       }
-    });
-  };
-  // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
-  if (tile_size == valid_items)
-  {
-    process_tile(::cuda::std::true_type{});
+      out_vec[v] = *reinterpret_cast<const store_t*>(res);
+    }
   }
   else
   {
-    process_tile(::cuda::std::false_type{});
+    auto process_tile = [&](auto full_tile) {
+      unrolled_for<UnrollFactor>(num_elem_per_thread, [&](int j) {
+        // TODO(bgruber): fbusato suggests to hoist threadIdx.x out of the loop below
+        const int idx = j * threads_per_block + threadIdx.x;
+        if (full_tile || idx < valid_items)
+        {
+          char* smem         = smem_base;
+          auto fetch_operand = [&](auto aligned_ptr) {
+            using T                = typename decltype(aligned_ptr)::value_type;
+            const int head_padding = alignof(T) < bulk_copy_alignment ? aligned_ptr.head_padding : 0;
+            const char* src        = smem + head_padding;
+            smem += tile_padding + int{sizeof(T)} * tile_size;
+            return reinterpret_cast<const T*>(src)[idx];
+          };
+
+          // need to expand into a tuple for guaranteed order of evaluation
+          ::cuda::std::apply(
+            [&](auto... values) {
+              if (pred(values...))
+              {
+                out[idx] = f(values...);
+              }
+            },
+            ::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...});
+        }
+      });
+    };
+    // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
+    if (tile_size == valid_items)
+    {
+      process_tile(::cuda::std::true_type{});
+    }
+    else
+    {
+      process_tile(::cuda::std::false_type{});
+    }
   }
 }
 
@@ -1006,6 +1083,7 @@ template <typename PolicySelector,
           typename Predicate,
           typename F,
           typename RandomAccessIteratorOut,
+          int OutputAlign,
           typename... RandomAccessIteratorsIn>
 #if _CCCL_HAS_CONCEPTS()
   requires transform_policy_selector<PolicySelector>
@@ -1068,7 +1146,7 @@ __launch_bounds__(get_threads_per_block<PolicySelector>) _CCCL_KERNEL_ATTRIBUTES
   {
     NV_IF_TARGET(
       NV_PROVIDES_SM_90,
-      (transform_kernel_ublkcp</*policy*/ policy.async_copy.threads_per_block, policy.async_copy.unroll_factor>(
+      (transform_kernel_ublkcp<policy.async_copy.threads_per_block, policy.async_copy.unroll_factor, OutputAlign>(
          num_items,
          num_elem_per_thread,
          ::cuda::std::move(pred),

diff --git a/cub/test/catch2_test_device_transform_aligned.cu b/cub/test/catch2_test_device_transform_aligned.cu
@@ -0,0 +1,103 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_transform.cuh>
+
+#include <cuda/__memory/aligned_size.h>
+
+#include <algorithm>
+
+#include "catch2_test_launch_helper.h"
+#include <c2h/catch2_test_helper.h>
+#include <c2h/test_util_vec.h>
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many);
+
+#define ALIGNED_ITEM_COUNTS 0, 16, 128, 4096, 4112, 65536, 99'984
+
+template <typename Out>
+struct cast_to
+{
+  template <typename T>
+  __host__ __device__ Out operator()(T v) const
+  {
+    return static_cast<Out>(v);
+  }
+};
+
+C2H_TEST("DeviceTransform::Transform aligned_size_t<16> same-width",
+         "[device][transform]",
+         c2h::type_list<std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t, float, double, uchar3>)
+{
+  using type               = c2h::get<0, TestType>;
+  using offset_t           = cuda::std::int64_t;
+  const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS);
+  CAPTURE(c2h::type_name<type>(), num_items);
+
+  c2h::device_vector<type> a(num_items, thrust::no_init);
+  c2h::device_vector<type> b(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), a);
+  c2h::gen(C2H_SEED(1), b);
+
+  c2h::device_vector<type> result(num_items, thrust::no_init);
+  transform_many(cuda::std::make_tuple(a.begin(), b.begin()),
+                 result.begin(),
+                 cuda::aligned_size_t<16>(num_items),
+                 cuda::std::plus<type>{});
+
+  c2h::host_vector<type> a_h = a;
+  c2h::host_vector<type> b_h = b;
+  c2h::host_vector<type> reference_h(num_items, thrust::no_init);
+  std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), cuda::std::plus<type>{});
+  REQUIRE(reference_h == result);
+}
+
+C2H_TEST("DeviceTransform::Transform aligned_size_t<16> narrowing to uint8",
+         "[device][transform]",
+         c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
+{
+  using in_t               = c2h::get<0, TestType>;
+  using out_t              = std::uint8_t;
+  using offset_t           = cuda::std::int64_t;
+  const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS);
+  CAPTURE(c2h::type_name<in_t>(), num_items);
+
+  c2h::device_vector<in_t> in(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), in);
+
+  c2h::device_vector<out_t> result(num_items, thrust::no_init);
+  transform_many(
+    cuda::std::make_tuple(in.begin()), result.begin(), cuda::aligned_size_t<16>(num_items), cast_to<out_t>{});
+
+  c2h::host_vector<in_t> in_h = in;
+  c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
+  std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
+  REQUIRE(reference_h == result);
+}
+
+C2H_TEST("DeviceTransform::Transform aligned_size_t<16> widening from uint8",
+         "[device][transform]",
+         c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
+{
+  using in_t               = std::uint8_t;
+  using out_t              = c2h::get<0, TestType>;
+  using offset_t           = cuda::std::int64_t;
+  const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS);
+  CAPTURE(c2h::type_name<out_t>(), num_items);
+
+  c2h::device_vector<in_t> in(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), in);
+
+  c2h::device_vector<out_t> result(num_items, thrust::no_init);
+  transform_many(
+    cuda::std::make_tuple(in.begin()), result.begin(), cuda::aligned_size_t<16>(num_items), cast_to<out_t>{});
+
+  c2h::host_vector<in_t> in_h = in;
+  c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
+  std::transform(in_h.begin(), in_h.end(), reference_h.begin(), cast_to<out_t>{});
+  REQUIRE(reference_h == result);
+}