Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 59 additions & 42 deletions cub/cub/device/device_partition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,9 @@ struct DevicePartition
//! @tparam NumItemsT
//! **[inferred]** Type of num_items
//!
//! @tparam EnvT
//! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand All @@ -475,15 +478,14 @@ struct DevicePartition
//! @param[in] select_op
//! Unary selection operator
//!
//! @param[in] stream
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! @endrst
//! @param[in] env
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumSelectedIteratorT,
typename SelectOp,
typename NumItemsT>
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
If(void* d_temp_storage,
size_t& temp_storage_bytes,
Expand All @@ -492,31 +494,36 @@ struct DevicePartition
NumSelectedIteratorT d_num_selected_out,
NumItemsT num_items,
SelectOp select_op,
cudaStream_t stream = nullptr)
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If");
using ChooseOffsetT = detail::choose_signed_offset<NumItemsT>;
using OffsetT = typename ChooseOffsetT::type; // Signed integer type for global offsets
using FlagIterator = NullType*; // FlagT iterator type (not used)
using EqualityOp = NullType; // Equality operator (not used)

using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
using offset_t = typename choose_offset_t::type;
using default_policy_selector = detail::select::
policy_selector_from_types<InputIteratorT, NullType*, OutputIteratorT, offset_t, SelectImpl::Partition>;

// Check if the number of items exceeds the range covered by the selected signed offset type
if (const cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items))
if (const auto error = choose_offset_t::is_exceeding_offset_type(num_items))
{
return error;
}

return detail::select::dispatch<SelectImpl::Partition>(
d_temp_storage,
temp_storage_bytes,
d_in,
FlagIterator{nullptr},
d_out,
d_num_selected_out,
select_op,
EqualityOp{},
static_cast<OffsetT>(num_items),
stream);
return detail::dispatch_with_env_and_tuning<default_policy_selector>(
d_temp_storage, temp_storage_bytes, env, [&](auto policy_selector, void* storage, size_t& bytes, auto stream) {
return detail::select::dispatch<SelectImpl::Partition>(
storage,
bytes,
d_in,
static_cast<NullType*>(nullptr),
d_out,
d_num_selected_out,
select_op,
NullType{},
static_cast<offset_t>(num_items),
stream,
policy_selector);
});
}

//! @rst
Expand Down Expand Up @@ -599,7 +606,7 @@ struct DevicePartition
NumSelectedIteratorT d_num_selected_out,
NumItemsT num_items,
SelectOp select_op,
EnvT env = {})
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DevicePartition::If");

Expand Down Expand Up @@ -780,6 +787,9 @@ struct DevicePartition
//! @tparam NumItemsT
//! **[inferred]** Type of num_items
//!
//! @tparam EnvT
//! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand Down Expand Up @@ -814,18 +824,17 @@ struct DevicePartition
//! @param[in] select_second_part_op
//! Unary selection operator to select `d_second_part_out`
//!
//! @param[in] stream
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! @endrst
//! @param[in] env
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
template <typename InputIteratorT,
typename FirstOutputIteratorT,
typename SecondOutputIteratorT,
typename UnselectedOutputIteratorT,
typename NumSelectedIteratorT,
typename SelectFirstPartOp,
typename SelectSecondPartOp,
typename NumItemsT>
typename NumItemsT,
typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
If(void* d_temp_storage,
size_t& temp_storage_bytes,
Expand All @@ -837,7 +846,7 @@ struct DevicePartition
NumItemsT num_items,
SelectFirstPartOp select_first_part_op,
SelectSecondPartOp select_second_part_op,
cudaStream_t stream = nullptr)
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If");
using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
Expand All @@ -849,18 +858,26 @@ struct DevicePartition
}

using offset_t = typename choose_offset_t::type;
return detail::three_way_partition::dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_first_part_out,
d_second_part_out,
d_unselected_out,
d_num_selected_out,
select_first_part_op,
select_second_part_op,
static_cast<offset_t>(num_items),
stream);
using default_policy_selector =
detail::three_way_partition::policy_selector_from_types<detail::it_value_t<InputIteratorT>,
detail::three_way_partition::per_partition_offset_t>;

return detail::dispatch_with_env_and_tuning<default_policy_selector>(
d_temp_storage, temp_storage_bytes, env, [&](auto policy_selector, void* storage, size_t& bytes, auto stream) {
return detail::three_way_partition::dispatch(
storage,
bytes,
d_in,
d_first_part_out,
d_second_part_out,
d_unselected_out,
d_num_selected_out,
select_first_part_op,
select_second_part_op,
static_cast<offset_t>(num_items),
stream,
policy_selector);
});
}

//! @rst
Expand Down Expand Up @@ -984,7 +1001,7 @@ struct DevicePartition
NumItemsT num_items,
SelectFirstPartOp select_first_part_op,
SelectSecondPartOp select_second_part_op,
EnvT env = {})
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DevicePartition::If");

Expand Down
105 changes: 105 additions & 0 deletions cub/test/catch2_test_device_partition_if.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@
#include <thrust/reverse.h>

#include <cuda/cmath>
#include <cuda/devices>
#include <cuda/functional>
#include <cuda/iterator>
#include <cuda/std/execution>
#include <cuda/std/iterator>

#include <algorithm>
Expand Down Expand Up @@ -170,6 +172,109 @@ C2H_TEST("DevicePartition::If is stable", "[device][partition_if]")
REQUIRE(reference == out);
}

#if TEST_LAUNCH == 0
C2H_TEST("DevicePartition::If works with user provided memory and environment", "[device][partition_if]", all_types)
{
using type = typename c2h::get<0, TestType>;

const int num_items = GENERATE_COPY(take(2, random(1, 1000000)));
c2h::device_vector<type> in(num_items, thrust::default_init);
c2h::device_vector<type> out(num_items, thrust::default_init);
c2h::gen(C2H_SEED(2), in);

// just pick one of the input elements as boundary
less_than_t<type> le{in[num_items / 2]};

// Needs to be device accessible
c2h::device_vector<int> num_selected_out(1, 0);
int* d_first_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());

// Ensure that we create the same output as std
c2h::host_vector<type> reference = in;
// The main difference between stable_partition and DevicePartition::If is that the false partition is in reverse
// order
const auto boundary = std::stable_partition(reference.begin(), reference.end(), le);
std::reverse(boundary, reference.end());

size_t expected_allocation_size = 0;
auto error = cub::DevicePartition::If(
static_cast<void*>(nullptr),
expected_allocation_size,
in.begin(),
out.begin(),
d_first_num_selected_out,
num_items,
le);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

auto d_temp = c2h::device_vector<uint8_t>(expected_allocation_size, thrust::no_init);
void* temp_storage = thrust::raw_pointer_cast(d_temp.data());

auto test_partition_if = [&](const auto& env) {
size_t num_bytes = 0;
error = cub::DevicePartition::If(
static_cast<void*>(nullptr), num_bytes, in.begin(), out.begin(), d_first_num_selected_out, num_items, le, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
REQUIRE(expected_allocation_size == num_bytes);

error = cub::DevicePartition::If(
temp_storage, num_bytes, in.begin(), out.begin(), d_first_num_selected_out, num_items, le, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

REQUIRE(num_selected_out[0] == cuda::std::distance(reference.begin(), boundary));
REQUIRE(reference == out);
};

int current_device;
error = cudaGetDevice(&current_device);
REQUIRE(error == cudaSuccess);

SECTION("DevicePartition::If works with cudaStream_t")
{
cuda::stream stream{cuda::devices[current_device]};
test_partition_if(stream.get());
}

SECTION("DevicePartition::If works with cuda::stream")
{
cuda::stream stream{cuda::devices[current_device]};
test_partition_if(stream);
}

SECTION("DevicePartition::If works with cuda::stream_ref")
{
cuda::stream stream{cuda::devices[current_device]};
cuda::stream_ref stream_ref{stream};
test_partition_if(stream_ref);
}

SECTION("DevicePartition::If works with cuda::std::execution::env")
{
cuda::std::execution::env env{};
test_partition_if(env);
}

SECTION("DevicePartition::If works with cuda::execution::gpu")
{
const auto policy = cuda::execution::gpu;
test_partition_if(policy);
}

SECTION("DevicePartition::If works with cuda::execution::gpu with stream")
{
cuda::stream stream{cuda::devices[current_device]};
const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
test_partition_if(policy);
}
}
#endif // TEST_LAUNCH == 0

C2H_TEST("DevicePartition::If works with iterators", "[device][partition_if]", all_types)
{
using type = typename c2h::get<0, TestType>;
Expand Down
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/std/__pstl/cuda/partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ struct __pstl_dispatch<__pstl_algorithm::__partition, __execution_backend::__cud
static_cast<_OffsetType*>(nullptr),
__count,
__pred,
nullptr);
__policy);

{
__temporary_storage<_OffsetType, value_type> __storage{__policy, __num_bytes, 1, __count};
Expand All @@ -114,7 +114,7 @@ struct __pstl_dispatch<__pstl_algorithm::__partition, __execution_backend::__cud
__storage.template __get_ptr<0>(),
__count,
::cuda::std::move(__pred),
__stream.get());
__policy);

// Copy the result back from storage
_CCCL_TRY_CUDA_API(
Expand Down
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/std/__pstl/cuda/partition_copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ struct __pstl_dispatch<__pstl_algorithm::__partition_copy, __execution_backend::
static_cast<_OffsetType*>(nullptr),
__count,
__pred,
nullptr);
__policy);

{
__temporary_storage<_OffsetType> __storage{__policy, __num_bytes, 1};
Expand All @@ -109,7 +109,7 @@ struct __pstl_dispatch<__pstl_algorithm::__partition_copy, __execution_backend::
__storage.template __get_ptr<0>(),
__count,
::cuda::std::move(__pred),
__stream.get());
__policy);

// Copy the result back from storage
_CCCL_TRY_CUDA_API(
Expand Down
6 changes: 3 additions & 3 deletions libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ struct __pstl_dispatch<__pstl_algorithm::__stable_partition, __execution_backend
static_cast<_OffsetType*>(nullptr),
__count,
__pred,
__stream.get());
__policy);

{
__temporary_storage<_OffsetType, value_type> __storage{__policy, __num_bytes, 1, __count};
Expand All @@ -102,7 +102,7 @@ struct __pstl_dispatch<__pstl_algorithm::__stable_partition, __execution_backend
__count,
::cuda::always_true{},
identity{},
__stream.get());
__policy);

// Run the kernel, the standard requires that the input and output range do not overlap
_CCCL_TRY_CUDA_API(
Expand All @@ -115,7 +115,7 @@ struct __pstl_dispatch<__pstl_algorithm::__stable_partition, __execution_backend
__storage.template __get_ptr<0>(),
__count,
::cuda::std::move(__pred),
__stream.get());
__policy);

// Copy the result back from storage
_CCCL_TRY_CUDA_API(
Expand Down
Loading