@@ -190,6 +190,108 @@ TEST_CASE("DeviceHistogram::HistogramRange works with default environment", "[hi
190190 REQUIRE (d_histogram == expected);
191191}
192192
193+ TEST_CASE (" DeviceHistogram::HistogramRange works with user provided memory and environment" , " [histogram][device]" )
194+ {
195+ auto d_samples = c2h::device_vector<float >{2 .2f , 6 .1f , 7 .5f , 2 .9f , 3 .5f , 0 .3f , 2 .9f , 2 .1f };
196+ int num_samples = static_cast <int >(d_samples.size ());
197+ auto d_levels = c2h::device_vector<float >{0 .0f , 2 .0f , 4 .0f , 6 .0f , 8 .0f };
198+ int num_levels = static_cast <int >(d_levels.size ());
199+ auto d_histogram = c2h::device_vector<int >(num_levels - 1 , 0 );
200+
201+ c2h::device_vector<int > expected{1 , 5 , 0 , 2 };
202+
203+ size_t expected_bytes_allocated{};
204+ auto error = cub::DeviceHistogram::HistogramRange (
205+ nullptr ,
206+ expected_bytes_allocated,
207+ thrust::raw_pointer_cast (d_samples.data ()),
208+ thrust::raw_pointer_cast (d_histogram.data ()),
209+ num_levels,
210+ thrust::raw_pointer_cast (d_levels.data ()),
211+ num_samples);
212+ REQUIRE (error == cudaSuccess);
213+ REQUIRE (cudaSuccess == cudaPeekAtLastError ());
214+ REQUIRE (cudaSuccess == cudaDeviceSynchronize ());
215+
216+ auto d_temp = c2h::device_vector<uint8_t >(expected_bytes_allocated, thrust::no_init);
217+ void * temp_storage = thrust::raw_pointer_cast (d_temp.data ());
218+
219+ auto test_histogram_range = [&](const auto & env) {
220+ size_t num_bytes = 0 ;
221+ error = cub::DeviceHistogram::HistogramRange (
222+ nullptr ,
223+ num_bytes,
224+ thrust::raw_pointer_cast (d_samples.data ()),
225+ thrust::raw_pointer_cast (d_histogram.data ()),
226+ num_levels,
227+ thrust::raw_pointer_cast (d_levels.data ()),
228+ num_samples,
229+ env);
230+ REQUIRE (error == cudaSuccess);
231+ REQUIRE (cudaSuccess == cudaPeekAtLastError ());
232+ REQUIRE (cudaSuccess == cudaDeviceSynchronize ());
233+ REQUIRE (expected_bytes_allocated == num_bytes);
234+
235+ error = cub::DeviceHistogram::HistogramRange (
236+ temp_storage,
237+ num_bytes,
238+ thrust::raw_pointer_cast (d_samples.data ()),
239+ thrust::raw_pointer_cast (d_histogram.data ()),
240+ num_levels,
241+ thrust::raw_pointer_cast (d_levels.data ()),
242+ num_samples,
243+ env);
244+ REQUIRE (error == cudaSuccess);
245+ REQUIRE (cudaSuccess == cudaPeekAtLastError ());
246+ REQUIRE (cudaSuccess == cudaDeviceSynchronize ());
247+
248+ // Verify result
249+ REQUIRE (d_histogram == expected);
250+ };
251+
252+ int current_device;
253+ error = cudaGetDevice (¤t_device);
254+ REQUIRE (error == cudaSuccess);
255+
256+ SECTION (" DeviceHistogram::HistogramRange works with cudaStream_t" )
257+ {
258+ cuda::stream stream{cuda::devices[current_device]};
259+ test_histogram_range (stream.get ());
260+ }
261+
262+ SECTION (" DeviceHistogram::HistogramRange works with cuda::stream" )
263+ {
264+ cuda::stream stream{cuda::devices[current_device]};
265+ test_histogram_range (stream);
266+ }
267+
268+ SECTION (" DeviceHistogram::HistogramRange works with cuda::stream_ref" )
269+ {
270+ cuda::stream stream{cuda::devices[current_device]};
271+ cuda::stream_ref stream_ref{stream};
272+ test_histogram_range (stream_ref);
273+ }
274+
275+ SECTION (" DeviceHistogram::HistogramRange works with cuda::std::execution::env" )
276+ {
277+ cuda::std::execution::env env{};
278+ test_histogram_range (env);
279+ }
280+
281+ SECTION (" DeviceHistogram::HistogramRange works with cuda::execution::gpu" )
282+ {
283+ const auto policy = cuda::execution::gpu;
284+ test_histogram_range (policy);
285+ }
286+
287+ SECTION (" DeviceHistogram::HistogramRange works with cuda::execution::gpu with stream" )
288+ {
289+ cuda::stream stream{cuda::devices[current_device]};
290+ const auto policy = cuda::execution::gpu.with (cuda::get_stream, stream);
291+ test_histogram_range (policy);
292+ }
293+ }
294+
193295TEST_CASE (" DeviceHistogram::MultiHistogramEven works with default environment" , " [histogram][device]" )
194296{
195297 [[maybe_unused]] constexpr int NUM_CHANNELS = 4 ;
0 commit comments