NVIDIA · miscco · Jun 25, 2026
@@ -454,6 +454,9 @@ struct DevicePartition
   //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
+  //! @tparam EnvT
+  //!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+  //!
   //! @param[in] d_temp_storage
   //!   @devicestorage
   //!
@@ -475,15 +478,14 @@ struct DevicePartition
   //! @param[in] select_op
   //!   Unary selection operator
   //!
-  //! @param[in] stream
-  //!   @rst
-  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  //!   @endrst
+  //! @param[in] env
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename NumSelectedIteratorT,
             typename SelectOp,
-            typename NumItemsT>
+            typename NumItemsT,
+            typename EnvT = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
   If(void* d_temp_storage,
      size_t& temp_storage_bytes,
@@ -492,31 +494,36 @@ struct DevicePartition
      NumSelectedIteratorT d_num_selected_out,
      NumItemsT num_items,
      SelectOp select_op,
-     cudaStream_t stream = nullptr)
+     const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If");
-    using ChooseOffsetT = detail::choose_signed_offset<NumItemsT>;
-    using OffsetT       = typename ChooseOffsetT::type; // Signed integer type for global offsets
-    using FlagIterator  = NullType*; // FlagT iterator type (not used)
-    using EqualityOp    = NullType; // Equality operator (not used)
+
+    using choose_offset_t         = detail::choose_signed_offset<NumItemsT>;
+    using offset_t                = typename choose_offset_t::type;
+    using default_policy_selector = detail::select::
+      policy_selector_from_types<InputIteratorT, NullType*, OutputIteratorT, offset_t, SelectImpl::Partition>;
 
     // Check if the number of items exceeds the range covered by the selected signed offset type
-    if (const cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items))
+    if (const auto error = choose_offset_t::is_exceeding_offset_type(num_items))
     {
       return error;
     }
 
-    return detail::select::dispatch<SelectImpl::Partition>(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      FlagIterator{nullptr},
-      d_out,
-      d_num_selected_out,
-      select_op,
-      EqualityOp{},
-      static_cast<OffsetT>(num_items),
-      stream);
+    return detail::dispatch_with_env_and_tuning<default_policy_selector>(
+      d_temp_storage, temp_storage_bytes, env, [&](auto policy_selector, void* storage, size_t& bytes, auto stream) {
+        return detail::select::dispatch<SelectImpl::Partition>(
+          storage,
+          bytes,
+          d_in,
+          static_cast<NullType*>(nullptr),
+          d_out,
+          d_num_selected_out,
+          select_op,
+          NullType{},
+          static_cast<offset_t>(num_items),
+          stream,
+          policy_selector);
+      });
   }
 
   //! @rst
@@ -599,7 +606,7 @@ struct DevicePartition
      NumSelectedIteratorT d_num_selected_out,
      NumItemsT num_items,
      SelectOp select_op,
-     EnvT env = {})
+     const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DevicePartition::If");
 
@@ -780,6 +787,9 @@ struct DevicePartition
   //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
+  //! @tparam EnvT
+  //!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+  //!
   //! @param[in] d_temp_storage
   //!   @devicestorage
   //!
@@ -814,18 +824,17 @@ struct DevicePartition
   //! @param[in] select_second_part_op
   //!   Unary selection operator to select `d_second_part_out`
   //!
-  //! @param[in] stream
-  //!   @rst
-  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  //!   @endrst
+  //! @param[in] env
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   template <typename InputIteratorT,
             typename FirstOutputIteratorT,
             typename SecondOutputIteratorT,
             typename UnselectedOutputIteratorT,
             typename NumSelectedIteratorT,
             typename SelectFirstPartOp,
             typename SelectSecondPartOp,
-            typename NumItemsT>
+            typename NumItemsT,
+            typename EnvT = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
   If(void* d_temp_storage,
      size_t& temp_storage_bytes,
@@ -837,7 +846,7 @@ struct DevicePartition
      NumItemsT num_items,
      SelectFirstPartOp select_first_part_op,
      SelectSecondPartOp select_second_part_op,
-     cudaStream_t stream = nullptr)
+     const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If");
     using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
@@ -849,18 +858,26 @@ struct DevicePartition
     }
 
     using offset_t = typename choose_offset_t::type;
-    return detail::three_way_partition::dispatch(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_first_part_out,
-      d_second_part_out,
-      d_unselected_out,
-      d_num_selected_out,
-      select_first_part_op,
-      select_second_part_op,
-      static_cast<offset_t>(num_items),
-      stream);
+    using default_policy_selector =
+      detail::three_way_partition::policy_selector_from_types<detail::it_value_t<InputIteratorT>,
+                                                              detail::three_way_partition::per_partition_offset_t>;
+
+    return detail::dispatch_with_env_and_tuning<default_policy_selector>(
+      d_temp_storage, temp_storage_bytes, env, [&](auto policy_selector, void* storage, size_t& bytes, auto stream) {
+        return detail::three_way_partition::dispatch(
+          storage,
+          bytes,
+          d_in,
+          d_first_part_out,
+          d_second_part_out,
+          d_unselected_out,
+          d_num_selected_out,
+          select_first_part_op,
+          select_second_part_op,
+          static_cast<offset_t>(num_items),
+          stream,
+          policy_selector);
+      });
   }
 
   //! @rst
@@ -984,7 +1001,7 @@ struct DevicePartition
      NumItemsT num_items,
      SelectFirstPartOp select_first_part_op,
      SelectSecondPartOp select_second_part_op,
-     EnvT env = {})
+     const EnvT& env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DevicePartition::If");
 

@@ -10,8 +10,10 @@
 #include <thrust/reverse.h>
 
 #include <cuda/cmath>
+#include <cuda/devices>
 #include <cuda/functional>
 #include <cuda/iterator>
+#include <cuda/std/execution>
 #include <cuda/std/iterator>
 
 #include <algorithm>
@@ -170,6 +172,109 @@ C2H_TEST("DevicePartition::If is stable", "[device][partition_if]")
   REQUIRE(reference == out);
 }
 
+#if TEST_LAUNCH == 0
+C2H_TEST("DevicePartition::If works with user provided memory and environment", "[device][partition_if]", all_types)
+{
+  using type = typename c2h::get<0, TestType>;
+
+  const int num_items = GENERATE_COPY(take(2, random(1, 1000000)));
+  c2h::device_vector<type> in(num_items, thrust::default_init);
+  c2h::device_vector<type> out(num_items, thrust::default_init);
+  c2h::gen(C2H_SEED(2), in);
+
+  // just pick one of the input elements as boundary
+  less_than_t<type> le{in[num_items / 2]};
+
+  // Needs to be device accessible
+  c2h::device_vector<int> num_selected_out(1, 0);
+  int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
+
+  // Ensure that we create the same output as std
+  c2h::host_vector<type> reference = in;
+  // The main difference between stable_partition and DevicePartition::If is that the false partition is in reverse
+  // order
+  const auto boundary = std::stable_partition(reference.begin(), reference.end(), le);
+  std::reverse(boundary, reference.end());
+
+  size_t expected_allocation_size = 0;
+  auto error                      = cub::DevicePartition::If(
+    static_cast<void*>(nullptr),
+    expected_allocation_size,
+    in.begin(),
+    out.begin(),
+    d_first_num_selected_out,
+    num_items,
+    le);
+  REQUIRE(error == cudaSuccess);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+
+  auto d_temp        = c2h::device_vector<uint8_t>(expected_allocation_size, thrust::no_init);
+  void* temp_storage = thrust::raw_pointer_cast(d_temp.data());
+
+  auto test_partition_if = [&](const auto& env) {
+    size_t num_bytes = 0;
+    error            = cub::DevicePartition::If(
+      static_cast<void*>(nullptr), num_bytes, in.begin(), out.begin(), d_first_num_selected_out, num_items, le, env);
+    REQUIRE(error == cudaSuccess);
+    REQUIRE(cudaSuccess == cudaPeekAtLastError());
+    REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+    REQUIRE(expected_allocation_size == num_bytes);
+
+    error = cub::DevicePartition::If(
+      temp_storage, num_bytes, in.begin(), out.begin(), d_first_num_selected_out, num_items, le, env);
+    REQUIRE(error == cudaSuccess);
+    REQUIRE(cudaSuccess == cudaPeekAtLastError());
+    REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+
+    REQUIRE(num_selected_out[0] == cuda::std::distance(reference.begin(), boundary));
+    REQUIRE(reference == out);
+  };
+
+  int current_device;
+  error = cudaGetDevice(&current_device);
+  REQUIRE(error == cudaSuccess);
+
+  SECTION("DevicePartition::If works with cudaStream_t")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    test_partition_if(stream.get());
+  }
+
+  SECTION("DevicePartition::If works with cuda::stream")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    test_partition_if(stream);
+  }
+
+  SECTION("DevicePartition::If works with cuda::stream_ref")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    cuda::stream_ref stream_ref{stream};
+    test_partition_if(stream_ref);
+  }
+
+  SECTION("DevicePartition::If works with cuda::std::execution::env")
+  {
+    cuda::std::execution::env env{};
+    test_partition_if(env);
+  }
+
+  SECTION("DevicePartition::If works with cuda::execution::gpu")
+  {
+    const auto policy = cuda::execution::gpu;
+    test_partition_if(policy);
+  }
+
+  SECTION("DevicePartition::If works with cuda::execution::gpu with stream")
+  {
+    cuda::stream stream{cuda::devices[current_device]};
+    const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
+    test_partition_if(policy);
+  }
+}
+#endif // TEST_LAUNCH == 0
+
 C2H_TEST("DevicePartition::If works with iterators", "[device][partition_if]", all_types)
 {
   using type = typename c2h::get<0, TestType>;

@@ -87,7 +87,7 @@ struct __pstl_dispatch<__pstl_algorithm::__partition, __execution_backend::__cud
       static_cast<_OffsetType*>(nullptr),
       __count,
       __pred,
-      nullptr);
+      __policy);
 
     {
       __temporary_storage<_OffsetType, value_type> __storage{__policy, __num_bytes, 1, __count};
@@ -114,7 +114,7 @@ struct __pstl_dispatch<__pstl_algorithm::__partition, __execution_backend::__cud
         __storage.template __get_ptr<0>(),
         __count,
         ::cuda::std::move(__pred),
-        __stream.get());
+        __policy);
 
       // Copy the result back from storage
       _CCCL_TRY_CUDA_API(

@@ -93,7 +93,7 @@ struct __pstl_dispatch<__pstl_algorithm::__partition_copy, __execution_backend::
       static_cast<_OffsetType*>(nullptr),
       __count,
       __pred,
-      nullptr);
+      __policy);
 
     {
       __temporary_storage<_OffsetType> __storage{__policy, __num_bytes, 1};
@@ -109,7 +109,7 @@ struct __pstl_dispatch<__pstl_algorithm::__partition_copy, __execution_backend::
         __storage.template __get_ptr<0>(),
         __count,
         ::cuda::std::move(__pred),
-        __stream.get());
+        __policy);
 
       // Copy the result back from storage
       _CCCL_TRY_CUDA_API(

@@ -88,7 +88,7 @@ struct __pstl_dispatch<__pstl_algorithm::__stable_partition, __execution_backend
       static_cast<_OffsetType*>(nullptr),
       __count,
       __pred,
-      __stream.get());
+      __policy);
 
     {
       __temporary_storage<_OffsetType, value_type> __storage{__policy, __num_bytes, 1, __count};
@@ -102,7 +102,7 @@ struct __pstl_dispatch<__pstl_algorithm::__stable_partition, __execution_backend
         __count,
         ::cuda::always_true{},
         identity{},
-        __stream.get());
+        __policy);
 
       // Run the kernel, the standard requires that the input and output range do not overlap
       _CCCL_TRY_CUDA_API(
@@ -115,7 +115,7 @@ struct __pstl_dispatch<__pstl_algorithm::__stable_partition, __execution_backend
         __storage.template __get_ptr<0>(),
         __count,
         ::cuda::std::move(__pred),
-        __stream.get());
+        __policy);
 
       // Copy the result back from storage
       _CCCL_TRY_CUDA_API(