[CUB] Refactor DevicePartition::Flagged to always take an environment

miscco · miscco · commit 45c5b58e69fb · 2026-06-23T10:46:01.000+02:00
We want to be able to pass tunings to the APIs that take user-provided memory.

Make sure we can pass any environment or stream type to them.
diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh
@@ -180,6 +180,9 @@ struct DevicePartition
   //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
+  //! @tparam EnvT
+  //!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+  //!
   //! @param[in] d_temp_storage
   //!   @devicestorage
   //!
@@ -202,15 +205,14 @@ struct DevicePartition
   //! @param[in] num_items
   //!   Total number of items to select from
   //!
-  //! @param[in] stream
-  //!   @rst
-  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  //!   @endrst
+  //! @param[in] env
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   template <typename InputIteratorT,
             typename FlagIterator,
             typename OutputIteratorT,
             typename NumSelectedIteratorT,
-            typename NumItemsT>
+            typename NumItemsT,
+            typename EnvT = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
@@ -219,31 +221,35 @@ struct DevicePartition
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
     NumItemsT num_items,
-    cudaStream_t stream = nullptr)
+    const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::Flagged");
-    using ChooseOffsetT = detail::choose_signed_offset<NumItemsT>;
-    using OffsetT       = typename ChooseOffsetT::type; // Signed integer type for global offsets
-    using SelectOp      = NullType; // Selection op (not used)
-    using EqualityOp    = NullType; // Equality operator (not used)
+    using choose_offset_t         = detail::choose_signed_offset<NumItemsT>;
+    using offset_t                = typename choose_offset_t::type;
+    using default_policy_selector = detail::select::
+      policy_selector_from_types<InputIteratorT, FlagIterator, OutputIteratorT, offset_t, SelectImpl::Partition>;
 
     // Check if the number of items exceeds the range covered by the selected signed offset type
-    if (const cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items))
+    if (const auto error = choose_offset_t::is_exceeding_offset_type(num_items))
     {
       return error;
     }
 
-    return detail::select::dispatch<SelectImpl::Partition>(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_flags,
-      d_out,
-      d_num_selected_out,
-      SelectOp{},
-      EqualityOp{},
-      static_cast<OffsetT>(num_items),
-      stream);
+    return detail::dispatch_with_env_and_tuning<default_policy_selector>(
+      d_temp_storage, temp_storage_bytes, env, [&](auto policy_selector, void* storage, size_t& bytes, auto stream) {
+        return detail::select::dispatch<SelectImpl::Partition>(
+          storage,
+          bytes,
+          d_in,
+          d_flags,
+          d_out,
+          d_num_selected_out,
+          NullType{},
+          NullType{},
+          static_cast<offset_t>(num_items),
+          stream,
+          policy_selector);
+      });
   }
 
   //! @rst
@@ -329,7 +335,7 @@ struct DevicePartition
     OutputIteratorT d_out,
     NumSelectedIteratorT d_num_selected_out,
     NumItemsT num_items,
-    EnvT env = {})
+    const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DevicePartition::Flagged");
 
diff --git a/cub/test/catch2_test_device_partition_flagged.cu b/cub/test/catch2_test_device_partition_flagged.cu
@@ -10,7 +10,9 @@
 #include <thrust/reverse.h>
 
 #include <cuda/cmath>
+#include <cuda/devices>
 #include <cuda/iterator>
+#include <cuda/std/execution>
 #include <cuda/std/iterator>
 
 #include <algorithm>
@@ -202,6 +204,107 @@ C2H_TEST("DevicePartition::Flagged is stable", "[device][partition_flagged]")
   REQUIRE(reference == out);
 }
 
+#if TEST_LAUNCH == 0
+C2H_TEST("DevicePartition::Flagged works with user provided memory and environment",
+         "[device][partition_flagged]",
+         all_types)
+{
+  using type = typename c2h::get<0, TestType>;
+
+  const int num_items = GENERATE_COPY(take(2, random(1, 1000000)));
+  c2h::device_vector<type> in(num_items, thrust::default_init);
+  c2h::device_vector<type> out(num_items, thrust::default_init);
+  c2h::gen(C2H_SEED(2), in);
+
+  c2h::device_vector<int> flags(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), flags, 0, 1);
+
+  const int num_selected = static_cast<int>(thrust::count(c2h::device_policy, flags.begin(), flags.end(), 1));
+  const c2h::host_vector<type> reference = get_reference(in, flags);
+
+  // Needs to be device accessible
+  c2h::device_vector<int> num_selected_out(1, 0);
+  int* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
+
+  size_t expected_allocation_size = 0;
+  auto error                      = cub::DevicePartition::Flagged(
+    static_cast<void*>(nullptr),
+    expected_allocation_size,
+    in.begin(),
+    flags.begin(),
+    out.begin(),
+    d_num_selected_out,
+    num_items);
+  REQUIRE(error == cudaSuccess);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+
+  auto d_temp        = c2h::device_vector<uint8_t>(expected_allocation_size, thrust::no_init);
+  void* temp_storage = thrust::raw_pointer_cast(d_temp.data());
+
+  auto test_partition_flagged = [&](const auto& env) {
+    size_t num_bytes = 0;
+    error            = cub::DevicePartition::Flagged(
+      static_cast<void*>(nullptr), num_bytes, in.begin(), flags.begin(), out.begin(), d_num_selected_out, num_items, env);
+    REQUIRE(error == cudaSuccess);
+    REQUIRE(cudaSuccess == cudaPeekAtLastError());
+    REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+    REQUIRE(expected_allocation_size == num_bytes);
+
+    error = cub::DevicePartition::Flagged(
+      temp_storage, num_bytes, in.begin(), flags.begin(), out.begin(), d_num_selected_out, num_items, env);
+    REQUIRE(error == cudaSuccess);
+    REQUIRE(cudaSuccess == cudaPeekAtLastError());
+    REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+
+    REQUIRE(num_selected == num_selected_out[0]);
+    REQUIRE(reference == out);
+  };
+
+  int current_device;
+  error = cudaGetDevice(&current_device);
+  REQUIRE(error == cudaSuccess);
+
+  SECTION("DevicePartition::Flagged works with cudaStream_t")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    test_partition_flagged(stream.get());
+  }
+
+  SECTION("DevicePartition::Flagged works with cuda::stream")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    test_partition_flagged(stream);
+  }
+
+  SECTION("DevicePartition::Flagged works with cuda::stream_ref")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    cuda::stream_ref stream_ref{stream};
+    test_partition_flagged(stream_ref);
+  }
+
+  SECTION("DevicePartition::Flagged works with cuda::std::execution::env")
+  {
+    cuda::std::execution::env env{};
+    test_partition_flagged(env);
+  }
+
+  SECTION("DevicePartition::Flagged works with cuda::execution::gpu")
+  {
+    const auto policy = cuda::execution::gpu;
+    test_partition_flagged(policy);
+  }
+
+  SECTION("DevicePartition::Flagged works with cuda::execution::gpu with stream")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
+    test_partition_flagged(policy);
+  }
+}
+#endif // TEST_LAUNCH == 0
+
 C2H_TEST("DevicePartition::Flagged works with iterators", "[device][partition_flagged]", all_types)
 {
   using type = typename c2h::get<0, TestType>;
diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/rotate.h b/libcudacxx/include/cuda/std/__pstl/cuda/rotate.h
@@ -105,7 +105,7 @@ struct __pstl_dispatch<__pstl_algorithm::__rotate, __execution_backend::__cuda>
       __output_wrapper,
       static_cast<_OffsetType*>(nullptr),
       __count,
-      nullptr);
+      __policy);
 
     {
       // Allocate memory for result
@@ -131,9 +131,9 @@ struct __pstl_dispatch<__pstl_algorithm::__rotate, __execution_backend::__cuda>
         __storage.template __get_raw_ptr<1>(),
         ::cuda::transform_iterator{::cuda::counting_iterator<size_t>{0}, __rotate_fn{__count1}},
         ::cuda::std::move(__output_wrapper),
-        __storage.template __get_ptr<0>(),
+        __storage.template __get_raw_ptr<0>(),
         __count,
-        __stream.get());
+        __policy);
     }
 
     __stream.sync();
diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/rotate_copy.h b/libcudacxx/include/cuda/std/__pstl/cuda/rotate_copy.h
@@ -106,7 +106,7 @@ struct __pstl_dispatch<__pstl_algorithm::__rotate_copy, __execution_backend::__c
       __output_wrapper,
       static_cast<_OffsetType*>(nullptr),
       __count,
-      nullptr);
+      __policy);
 
     {
       // Allocate memory for result
@@ -123,7 +123,7 @@ struct __pstl_dispatch<__pstl_algorithm::__rotate_copy, __execution_backend::__c
         ::cuda::std::move(__output_wrapper),
         __storage.template __get_ptr<0>(),
         __count,
-        __stream.get());
+        __policy);
     }
 
     __stream.sync();