diff --git a/cub/benchmarks/bench/reduce/arg_extrema.cu b/cub/benchmarks/bench/reduce/arg_extrema.cu index 04a6748bd42..9d9019bfdb9 100644 --- a/cub/benchmarks/bench/reduce/arg_extrema.cu +++ b/cub/benchmarks/bench/reduce/arg_extrema.cu @@ -45,26 +45,26 @@ void arg_reduce(nvbench::state& state, nvbench::type_list) // Iterator providing the values being reduced using values_it_t = T*; - // Type used for the final result - using output_tuple_t = cub::KeyValuePair; - - auto const init = ::cuda::std::is_same_v + auto const init = ::cuda::std::is_same_v ? ::cuda::std::numeric_limits::max() : ::cuda::std::numeric_limits::lowest(); // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector in = generate(elements); - thrust::device_vector out(1); + thrust::device_vector out_index(1); + thrust::device_vector out_extremum(1); - values_it_t d_in = thrust::raw_pointer_cast(in.data()); - output_tuple_t* d_out = thrust::raw_pointer_cast(out.data()); - auto const num_items = static_cast(elements); + values_it_t d_in = thrust::raw_pointer_cast(in.data()); + global_offset_t* d_out_index = thrust::raw_pointer_cast(out_index.data()); + T* d_out_extremum = thrust::raw_pointer_cast(out_extremum.data()); + auto const num_items = static_cast(elements); // Enable throughput calculations and add "Size" column to results. state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); - state.add_global_memory_writes(1); + state.add_global_memory_writes(1); + state.add_global_memory_writes(1); // Allocate temporary storage std::size_t temp_size; @@ -72,10 +72,10 @@ void arg_reduce(nvbench::state& state, nvbench::type_list) nullptr, temp_size, d_in, - d_out, + d_out_index, + d_out_extremum, num_items, OpT{}, - init, 0 /* stream */ #if !TUNE_BASE , @@ -91,10 +91,10 @@ void arg_reduce(nvbench::state& state, nvbench::type_list) temp_storage, temp_size, d_in, - d_out, + d_out_index, + d_out_extremum, num_items, OpT{}, - init, launch.get_stream() #if !TUNE_BASE , @@ -104,7 +104,7 @@ void arg_reduce(nvbench::state& state, nvbench::type_list) }); } -using op_types = nvbench::type_list; +using op_types = nvbench::type_list; NVBENCH_BENCH_TYPES(arg_reduce, NVBENCH_TYPE_AXES(fundamental_types, op_types)) .set_name("base") diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index b2dd125cfff..af3be0dacf5 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -43,9 +43,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -56,23 +58,6 @@ namespace detail template inline constexpr bool is_non_deterministic_v = ::cuda::std::is_same_v; - -namespace reduce -{ -template -struct unzip_and_write_arg_extremum_op -{ - ExtremumOutIteratorT result_out_it; - IndexOutIteratorT index_out_it; - - template - _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(IndexT, KeyValuePairT reduced_result) - { - *result_out_it = reduced_result.value; - *index_out_it = reduced_result.key; - } -}; -} // namespace reduce } // namespace detail //! @rst @@ -948,18 +933,46 @@ public: }); } +private: + template + CUB_RUNTIME_FUNCTION static cudaError_t __arg_min( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + ExtremumOutIteratorT d_min_out, + IndexOutIteratorT d_index_out, + ::cuda::std::int64_t num_items, + CompareOpT compare_op, + cudaStream_t stream) + { + using PerPartitionOffsetT = int; // used by the kernel to index within one partition + using GlobalOffsetT = ::cuda::std::int64_t; // in the range [d_in, d_in + num_items) + + return detail::reduce::dispatch_streaming_arg_reduce( + d_temp_storage, + temp_storage_bytes, + d_in, + d_min_out, + d_index_out, + static_cast(num_items), + detail::arg_less{compare_op}, + stream); + } + +public: //! @rst - //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item. + //! Finds the first device-wide minimum based on a given comparison operator and also returns the index of that item. //! - //! .. versionadded:: 2.2.0 - //! First appears in CUDA Toolkit 12.3. + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. //! //! - The minimum is written to ``d_min_out`` //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type //! ``cuda::std::int64_t``. - //! - For zero-length inputs, ``cuda::std::numeric_limits::max()}`` is written to ``d_min_out`` and the index - //! ``1`` is written to ``d_index_out``. - //! - Does not support ``<`` operators that are non-commutative. + //! - For zero-length inputs, the index ``1`` is written to ``d_index_out`` and, if ``compare_op`` is + //! ``cuda::std::less`` and ``cuda::std::numeric_limits::is_specialized is ``true``, + //! ``cuda::std::numeric_limits::max()`` is written to ``d_min_out``, otherwise ``T{}``. + //! - Does not support comparison operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! However, results for pseudo-associative reduction may be inconsistent @@ -982,25 +995,33 @@ public: //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 7 - //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_in; // e.g., [8, 6, -7, 5, 3, 1, -9] //! int *d_min_out; // memory for the minimum value //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value //! ... //! + //! // Define the comparison operator + //! struct abs_less_t { + //! template + //! __host__ __device__ bool operator()(const T& a, const T& b) const { + //! return cuda::std::abs(a) < cuda::std::abs(b); + //! } + //! }; + //! //! // Determine temporary device storage requirements //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out, - //! num_items); + //! num_items, abs_less_t{}); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run argmin-reduction //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out, - //! num_items); + //! num_items, abs_less_t{}); //! - //! // d_min_out <-- 0 + //! // d_min_out <-- 1 //! // d_index_out <-- 5 //! //! @endrst @@ -1031,6 +1052,9 @@ public: //! @param[out] d_index_out //! Iterator to which the index of the returned value is written //! + //! @param[in] compare_op + //! Comparison operator returning ``true`` if the first argument is less than the second + //! //! @param[in] num_items //! Total number of input items (i.e., length of ``d_in``) //! @@ -1038,7 +1062,14 @@ public: //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template < + typename InputIteratorT, + typename ExtremumOutIteratorT, + typename IndexOutIteratorT, + typename CompareOpT, + // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of + // ExtremumOutIteratorT, which is wrong IMO + ::cuda::std::enable_if_t<::cuda::std::indirectly_comparable, int> = 0> CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1046,41 +1077,112 @@ public: ExtremumOutIteratorT d_min_out, IndexOutIteratorT d_index_out, ::cuda::std::int64_t num_items, + CompareOpT compare_op, cudaStream_t stream = 0) { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin"); + return __arg_min(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out, num_items, compare_op, stream); + } - // The input type - using InputValueT = cub::detail::it_value_t; + //! @rst + //! .. versionadded:: 2.2.0 + //! First appears in CUDA Toolkit 12.3. + //! @overload + //! @note Uses ``cuda::std::less`` as comparison operator + //! @endrst + template + CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + ExtremumOutIteratorT d_min_out, + IndexOutIteratorT d_index_out, + ::cuda::std::int64_t num_items, + cudaStream_t stream = 0) + { + return ArgMin( + d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out, num_items, ::cuda::std::less{}, stream); + } - // Offset type used within the kernel and to index within one partition - using PerPartitionOffsetT = int; + template + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t __arg_min_env( + InputIteratorT d_in, + ExtremumOutIteratorT d_min_out, + IndexOutIteratorT d_index_out, + ::cuda::std::int64_t num_items, + CompareOpT compare_op, + EnvT env = {}) + { + static_assert(!::cuda::std::execution::__queryable_with, + "Determinism should be used inside requires to have an effect."); + using requirements_t = ::cuda::std::execution:: + __query_result_or_t>; + using requested_determinism_t = + ::cuda::std::execution::__query_result_or_t; - // Offset type used to index within the total input in the range [d_in, d_in + num_items) - using GlobalOffsetT = ::cuda::std::int64_t; + // Static assert to reject gpu_to_gpu determinism since it's not properly implemented + static_assert(!::cuda::std::is_same_v, + "gpu_to_gpu determinism is not supported"); - // The value type used for the extremum - using OutputExtremumT = detail::non_void_value_t; + // Query relevant properties from the environment + auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env); + auto mr = ::cuda::__call_or(::cuda::mr::get_memory_resource, detail::device_memory_resource{}, env); - // Reduction operation - using ReduceOpT = cub::ArgMin; + using PerPartitionOffsetT = int; // used by the kernel to index within one partition + using GlobalOffsetT = ::cuda::std::int64_t; // in the range [d_in, d_in + num_items) - // Initial value - OutputExtremumT initial_value{::cuda::std::numeric_limits::max()}; + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + + // Query the required temporary storage size + if (const auto error = detail::reduce::dispatch_streaming_arg_reduce( + d_temp_storage, + temp_storage_bytes, + d_in, + d_min_out, + d_index_out, + static_cast(num_items), + detail::arg_less{compare_op}, + stream.get())) + { + return error; + } - // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators - auto out_it = ::cuda::make_tabulate_output_iterator( - detail::reduce::unzip_and_write_arg_extremum_op{d_min_out, d_index_out}); + // TODO(gevtushenko): use uninitialized buffer when it's available + if (const auto error = + CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr))) + { + return error; + } - return detail::reduce::dispatch_streaming_arg_reduce( + // Run the algorithm + const auto error = detail::reduce::dispatch_streaming_arg_reduce( d_temp_storage, temp_storage_bytes, d_in, - out_it, + d_min_out, + d_index_out, static_cast(num_items), - ReduceOpT{}, - initial_value, - stream); + detail::arg_less{compare_op}, + stream.get()); + + // Try to deallocate regardless of the error to avoid memory leaks + const auto deallocate_error = + CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr)); + + if (error != cudaSuccess) + { + // Reduction error takes precedence over deallocation error since it happens first + return error; + } + + return deallocate_error; } //! @rst @@ -1092,8 +1194,9 @@ public: //! - The minimum is written to ``d_min_out`` //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type //! ``cuda::std::int64_t``. - //! - For zero-length inputs, ``cuda::std::numeric_limits::max()}`` is written to ``d_min_out`` and the index - //! ``1`` is written to ``d_index_out``. + //! - For zero-length inputs, the index ``1`` is written to ``d_index_out`` and, if ``compare_op`` is + //! ``cuda::std::less`` and ``cuda::std::numeric_limits::is_specialized is ``true``, + //! ``cuda::std::numeric_limits::max()`` is written to ``d_min_out``, otherwise ``T{}``. //! - Does not support ``<`` operators that are non-commutative. //! - Provides determinism based on the environment's determinism requirements. //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)`` @@ -1135,6 +1238,9 @@ public: //! @param[out] d_index_out //! Iterator to which the index of the returned value is written //! + //! @param[in] compare_op + //! Comparison operator returning ``true`` if the first argument is less than the second + //! //! @param[in] num_items //! Total number of input items (i.e., length of ``d_in``) //! @@ -1142,10 +1248,36 @@ public: //! @rst //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst + template < + typename InputIteratorT, + typename ExtremumOutIteratorT, + typename IndexOutIteratorT, + typename CompareOpT, + typename EnvT = ::cuda::std::execution::env<>, + // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of + // ExtremumOutIteratorT, which is wrong IMO + ::cuda::std::enable_if_t<::cuda::std::indirectly_comparable, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( + InputIteratorT d_in, + ExtremumOutIteratorT d_min_out, + IndexOutIteratorT d_index_out, + ::cuda::std::int64_t num_items, + CompareOpT compare_op, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMin"); + return __arg_min_env(d_in, d_min_out, d_index_out, num_items, compare_op, env); + } + + //! @overload + //! @note Uses ``cuda::std::less`` as comparison operator template > + typename EnvT = ::cuda::std::execution::env<>, + // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of + // ExtremumOutIteratorT, which is wrong IMO + ::cuda::std::enable_if_t, int> = 0> [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(InputIteratorT d_in, ExtremumOutIteratorT d_min_out, @@ -1153,86 +1285,7 @@ public: ::cuda::std::int64_t num_items, EnvT env = {}) { - _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMin"); - - static_assert(!::cuda::std::execution::__queryable_with, - "Determinism should be used inside requires to have an effect."); - using requirements_t = ::cuda::std::execution:: - __query_result_or_t>; - using requested_determinism_t = - ::cuda::std::execution::__query_result_or_t; - - // Static assert to reject gpu_to_gpu determinism since it's not properly implemented - static_assert(!::cuda::std::is_same_v, - "gpu_to_gpu determinism is not supported"); - - // Query relevant properties from the environment - auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env); - auto mr = ::cuda::__call_or(::cuda::mr::get_memory_resource, detail::device_memory_resource{}, env); - - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - - // Reduction operation - using ReduceOpT = cub::ArgMin; - using InputValueT = cub::detail::it_value_t; - using PerPartitionOffsetT = int; - using GlobalOffsetT = ::cuda::std::int64_t; - - using OutputExtremumT = detail::non_void_value_t; - - // Initial value - OutputExtremumT initial_value{::cuda::std::numeric_limits::max()}; - - // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators - auto out_it = ::cuda::make_tabulate_output_iterator( - detail::reduce::unzip_and_write_arg_extremum_op{d_min_out, d_index_out}); - - // Query the required temporary storage size - if (const auto error = detail::reduce::dispatch_streaming_arg_reduce( - d_temp_storage, - temp_storage_bytes, - d_in, - out_it, - static_cast(num_items), - ReduceOpT{}, - initial_value, - stream.get())) - { - return error; - } - - // TODO(gevtushenko): use uninitialized buffer when it's available - if (const auto error = - CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr))) - { - return error; - } - - // Run the algorithm - const auto error = detail::reduce::dispatch_streaming_arg_reduce( - d_temp_storage, - temp_storage_bytes, - d_in, - out_it, - static_cast(num_items), - ReduceOpT{}, - initial_value, - stream.get()); - - // Try to deallocate regardless of the error to avoid memory leaks - const auto deallocate_error = - CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr)); - - if (error != cudaSuccess) - { - // Reduction error takes precedence over deallocation error since it happens first - return error; - } - - return deallocate_error; + return ArgMin(d_in, d_min_out, d_index_out, num_items, ::cuda::std::less{}, env); } //! @rst @@ -1354,7 +1407,14 @@ public: InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::max())}; return detail::reduce::dispatch( - d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, OffsetT{num_items}, cub::ArgMin(), initial_value, stream); + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + OffsetT{num_items}, + detail::arg_min{}, + initial_value, + stream); } //! @rst @@ -1567,17 +1627,17 @@ public: } //! @rst - //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that - //! item. + //! Finds the first device-wide maximum based on a given comparison operator and also returns the index of that item. //! - //! .. versionadded:: 2.2.0 - //! First appears in CUDA Toolkit 12.3. + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. //! //! - The maximum is written to ``d_max_out`` //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type //! ``cuda::std::int64_t``. - //! - For zero-length inputs, ``cuda::std::numeric_limits::max()}`` is written to ``d_max_out`` and the index - //! ``1`` is written to ``d_index_out``. + //! - For zero-length inputs, the index ``1`` is written to ``d_index_out`` and, if ``compare_op`` is + //! ``cuda::std::less`` and ``cuda::std::numeric_limits::is_specialized is ``true``, + //! ``cuda::std::numeric_limits::lowest()`` is written to ``d_min_out``, otherwise ``T{}``. //! - Does not support ``>`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. @@ -1601,25 +1661,33 @@ public: //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 7 - //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_in; // e.g., [8, 6, -7, 5, 3, 1, -9] //! int *d_max_out; // memory for the maximum value //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value //! ... //! + //! // Define the comparison operator + //! struct abs_less_t { + //! template + //! __host__ __device__ bool operator()(const T& a, const T& b) const { + //! return cuda::std::abs(a) < cuda::std::abs(b); + //! } + //! }; + //! //! // Determine temporary device storage requirements //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceReduce::ArgMax( - //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items); + //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items, abs_less_t{}); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run argmax-reduction //! cub::DeviceReduce::ArgMax( - //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items); + //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items, abs_less_t{}); //! - //! // d_max_out <-- 9 + //! // d_max_out <-- -9 //! // d_index_out <-- 6 //! //! @endrst @@ -1649,6 +1717,9 @@ public: //! @param[out] d_index_out //! Iterator to which the index of the returned value is written //! + //! @param[in] compare_op + //! Comparison operator returning ``true`` if the first argument is less than the second + //! //! @param[in] num_items //! Total number of input items (i.e., length of ``d_in``) //! @@ -1656,7 +1727,14 @@ public: //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template < + typename InputIteratorT, + typename ExtremumOutIteratorT, + typename IndexOutIteratorT, + typename CompareOpT, + // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of + // ExtremumOutIteratorT, which is wrong IMO + ::cuda::std::enable_if_t<::cuda::std::indirectly_comparable, int> = 0> CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1664,43 +1742,41 @@ public: ExtremumOutIteratorT d_max_out, IndexOutIteratorT d_index_out, ::cuda::std::int64_t num_items, + CompareOpT compare_op, cudaStream_t stream = 0) { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax"); - - // The input type - using InputValueT = cub::detail::it_value_t; - - // Offset type used within the kernel and to index within one partition - using PerPartitionOffsetT = int; - - // Offset type used to index within the total input in the range [d_in, d_in + num_items) - using GlobalOffsetT = ::cuda::std::int64_t; - - // The value type used for the extremum - using OutputExtremumT = detail::non_void_value_t; - - // Reduction operation - using ReduceOpT = cub::ArgMax; - - // Initial value - OutputExtremumT initial_value{::cuda::std::numeric_limits::lowest()}; - - // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators - auto out_it = ::cuda::make_tabulate_output_iterator( - detail::reduce::unzip_and_write_arg_extremum_op{d_max_out, d_index_out}); - - return detail::reduce::dispatch_streaming_arg_reduce( + return __arg_min( d_temp_storage, temp_storage_bytes, d_in, - out_it, - static_cast(num_items), - ReduceOpT{}, - initial_value, + d_max_out, + d_index_out, + num_items, + detail::arg_less{detail::swap_args{compare_op}}, stream); } + //! @rst + //! .. versionadded:: 2.2.0 + //! First appears in CUDA Toolkit 12.3. + //! @overload + //! @note Uses ``cuda::std::less`` as comparison operator + //! @endrst + template + CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + ExtremumOutIteratorT d_max_out, + IndexOutIteratorT d_index_out, + ::cuda::std::int64_t num_items, + cudaStream_t stream = 0) + { + return ArgMax( + d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items, ::cuda::std::less{}, stream); + } + //! @rst //! Finds the first device-wide maximum using the greater-than (``>``) //! operator, also returning the index of that item @@ -1824,21 +1900,29 @@ public: InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::lowest())}; return detail::reduce::dispatch( - d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, OffsetT{num_items}, cub::ArgMax(), initial_value, stream); + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + OffsetT{num_items}, + detail::arg_max{}, + initial_value, + stream); } //! @rst //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that //! item. //! - //! .. versionadded:: 2.2.0 - //! First appears in CUDA Toolkit 12.3. + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. //! //! - The maximum is written to ``d_max_out`` //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type //! ``cuda::std::int64_t``. - //! - For zero-length inputs, ``cuda::std::numeric_limits::lowest()}`` is written to ``d_max_out`` and the index - //! ``1`` is written to ``d_index_out``. + //! - For zero-length inputs, the index ``1`` is written to ``d_index_out`` and, if ``compare_op`` is + //! ``cuda::std::less`` and ``cuda::std::numeric_limits::is_specialized is ``true``, + //! ``cuda::std::numeric_limits::lowest()`` is written to ``d_min_out``, otherwise ``T{}``. //! - Does not support ``>`` operators that are non-commutative. //! - Provides determinism based on the environment's determinism requirements. //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)`` @@ -1880,6 +1964,9 @@ public: //! @param[out] d_index_out //! Iterator to which the index of the returned value is written //! + //! @param[in] compare_op + //! Comparison operator returning ``true`` if the first argument is less than the second + //! //! @param[in] num_items //! Total number of input items (i.e., length of ``d_in``) //! @@ -1887,6 +1974,30 @@ public: //! @rst //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst + template < + typename InputIteratorT, + typename ExtremumOutIteratorT, + typename IndexOutIteratorT, + typename CompareOpT, + typename EnvT = ::cuda::std::execution::env<>, + // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of + // ExtremumOutIteratorT, which is wrong IMO + ::cuda::std::enable_if_t<::cuda::std::indirectly_comparable, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( + InputIteratorT d_in, + ExtremumOutIteratorT d_max_out, + IndexOutIteratorT d_index_out, + ::cuda::std::int64_t num_items, + CompareOpT compare_op, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMax"); + return __arg_min_env( + d_in, d_index_out, num_items, detail::arg_less{detail::swap_args{compare_op}}, ::cuda::std::execution::env{}); + } + + //! @overload + //! @note Uses ``cuda::std::less`` as comparison operator template , - "Determinism should be used inside requires to have an effect."); - using requirements_t = ::cuda::std::execution:: - __query_result_or_t>; - using requested_determinism_t = - ::cuda::std::execution::__query_result_or_t; - - // Static assert to reject gpu_to_gpu determinism since it's not properly implemented - static_assert(!::cuda::std::is_same_v, - "gpu_to_gpu determinism is not supported"); - - // Query relevant properties from the environment - auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env); - auto mr = ::cuda::__call_or(::cuda::mr::get_memory_resource, detail::device_memory_resource{}, env); - - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - - // Reduction operation - using ReduceOpT = cub::ArgMax; - using InputValueT = cub::detail::it_value_t; - using PerPartitionOffsetT = int; - using GlobalOffsetT = ::cuda::std::int64_t; - - using OutputExtremumT = detail::non_void_value_t; - - // Initial value - OutputExtremumT initial_value{::cuda::std::numeric_limits::lowest()}; - - // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators - auto out_it = ::cuda::make_tabulate_output_iterator( - detail::reduce::unzip_and_write_arg_extremum_op{d_max_out, d_index_out}); - - // Query the required temporary storage size - if (const auto error = detail::reduce::dispatch_streaming_arg_reduce( - d_temp_storage, - temp_storage_bytes, - d_in, - out_it, - static_cast(num_items), - ReduceOpT{}, - initial_value, - stream.get())) - { - return error; - } - - // TODO(gevtushenko): use uninitialized buffer when it's available - if (const auto error = - CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr))) - { - return error; - } - - // Run the algorithm - const auto error = detail::reduce::dispatch_streaming_arg_reduce( - d_temp_storage, - temp_storage_bytes, - d_in, - out_it, - static_cast(num_items), - ReduceOpT{}, - initial_value, - stream.get()); - - // Try to deallocate regardless of the error to avoid memory leaks - const auto deallocate_error = - CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr)); - - if (error != cudaSuccess) - { - // Reduction error takes precedence over deallocation error since it happens first - return error; - } - - return deallocate_error; + return __arg_min_env(d_in, d_max_out, d_index_out, num_items, detail::arg_max{}, env); } //! @rst diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index bebe1f9bcb4..9c87aed6546 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -112,6 +112,20 @@ struct local_to_global_op } }; +template +struct unzip_and_write_arg_extremum_op +{ + ExtremumOutIteratorT result_out_it; + IndexOutIteratorT index_out_it; + + template + _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(IndexT, KeyValuePairT reduced_result) + { + *result_out_it = reduced_result.value; + *index_out_it = reduced_result.key; + } +}; + /****************************************************************************** * Single-problem streaming reduction dispatch *****************************************************************************/ @@ -143,15 +157,16 @@ struct local_to_global_op // // @tparam PolicySelector // Selects the tuning policy -template < - typename PerPartitionOffsetT, - typename InputIteratorT, - typename OutputIteratorT, - typename GlobalOffsetT, - typename ReductionOpT, - typename InitT, - typename PolicySelector = - reduce::policy_selector_from_types, PerPartitionOffsetT, ReductionOpT>> +template >>, + PerPartitionOffsetT, + ReductionOpT>> # if _CCCL_HAS_CONCEPTS() requires reduce_policy_selector # endif // _CCCL_HAS_CONCEPTS() @@ -159,31 +174,32 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_arg_reduce void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, - OutputIteratorT d_result_out, + ExtremumOutIteratorT d_min_out, + IndexOutIteratorT d_index_out, GlobalOffsetT num_items, ReductionOpT reduce_op, - InitT init, cudaStream_t stream, PolicySelector policy_selector = {}) { + using input_value_t = it_value_t; + using output_extremum_t = non_void_value_t; + + // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators + auto d_result_out = ::cuda::make_tabulate_output_iterator( + detail::reduce::unzip_and_write_arg_extremum_op{d_min_out, d_index_out}); + // Wrapped input iterator to produce index-value tuples, i.e., -tuples // We make sure to offset the user-provided input iterator by the current partition's offset - using arg_index_input_iterator_t = ArgIndexInputIterator; - - // The type used for the aggregate that the user wants to find the extremum for - using output_aggregate_t = InitT; + using arg_index_input_iterator_t = ArgIndexInputIterator; // The output tuple type (i.e., extremum plus index tuples) - using per_partition_accum_t = KeyValuePair; - using global_accum_t = KeyValuePair; + using per_partition_accum_t = KeyValuePair; + using global_accum_t = KeyValuePair; // Unary promotion operator type that is used to transform a per-partition result to a global result // operator()(per_partition_accum_t) -> global_accum_t using local_to_global_op_t = local_to_global_op; - // Empty problem initialization type - using empty_problem_init_t = empty_problem_init_t; - // The current partition's input iterator is an ArgIndex iterator that generates indices relative to the beginning // of the current partition, i.e., [0, partition_size) along with an OffsetIterator that offsets the user-provided // input iterator by the current partition's offset @@ -209,18 +225,33 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_arg_reduce // Reduction operator type that enables accumulating per-partition results to a global reduction result using accumulating_transform_output_op_t = - accumulating_transform_output_op; - + accumulating_transform_output_op; auto accumulating_out_op = accumulating_transform_output_op_t{ true, is_single_partition, nullptr, nullptr, d_result_out, local_to_global_op, reduce_op}; - empty_problem_init_t initial_value{{PerPartitionOffsetT{1}, init}}; + // Initial value for empty problems, according to documented contract + const auto empty_problem_extremum = static_cast([] { + if constexpr (::cuda::std::is_same_v + && ::cuda::std::numeric_limits::is_specialized) + { + return ::cuda::std::numeric_limits::max(); + } + else if constexpr (::cuda::std::is_same_v + && ::cuda::std::numeric_limits::is_specialized) + { + return ::cuda::std::numeric_limits::lowest(); + } + else + { + return input_value_t{}; + } + }()); + auto initial_value = empty_problem_init_t{{PerPartitionOffsetT{1}, empty_problem_extremum}}; void* allocations[2] = {nullptr, nullptr}; size_t allocation_sizes[2] = {0, 2 * sizeof(global_accum_t)}; // Query temporary storage requirements for per-partition reduction - reduce::dispatch( nullptr, allocation_sizes[0], @@ -233,8 +264,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_arg_reduce ::cuda::std::identity{}, policy_selector); - // Alias the temporary allocations from the single storage blob (or compute the necessary size - // of the blob) + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) if (const auto error = CubDebug(alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { return error; diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 6b214e106be..f31dbf9ea48 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -106,36 +106,29 @@ struct ArgMin namespace detail { -/// @brief Arg max functor (keeps the value and offset of the first occurrence -/// of the larger item) -struct arg_max +// Less-than comparator for an index/value pair that compares values first, and indices when the values are equal +template > +struct arg_less : ValueLessThen { - /// Boolean max operator, preferring the item having the smaller offset in - /// case of ties template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ::cuda::std::pair operator()(const ::cuda::std::pair& a, const ::cuda::std::pair& b) const { - if ((b.second > a.second) || ((a.second == b.second) && (b.first < a.first))) + const auto& less = static_cast(*this); + if (less(b.second, a.second) || (!less(a.second, b.second) && b.first < a.first)) { return b; } return a; } -}; -/// @brief Arg min functor (keeps the value and offset of the first occurrence -/// of the smallest item) -struct arg_min -{ - /// Boolean min operator, preferring the item having the smaller offset in - /// case of ties template - _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ::cuda::std::pair - operator()(const ::cuda::std::pair& a, const ::cuda::std::pair& b) const + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair + operator()(const KeyValuePair& a, const KeyValuePair& b) const { - if ((b.second < a.second) || ((a.second == b.second) && (b.first < a.first))) + const auto& less = static_cast(*this); + if (less(b.value, a.value) || (!less(a.value, b.value) && b.key < a.key)) { return b; } @@ -144,6 +137,29 @@ struct arg_min } }; +template +arg_less(ValueLessThen) -> arg_less; + +/// @brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) +using arg_min = arg_less<::cuda::std::less<>>; + +//! @brief Binary functor swapping the arguments to ``operator()`` before forwarding to an inner functor +template +struct swap_args : Predicate +{ + template + _CCCL_API _CCCL_FORCEINLINE decltype(auto) operator()(T&& t, U&& u) const + { + return Predicate::operator()(::cuda::std::forward(u), ::cuda::std::forward(t)); + } +}; + +template +swap_args(Predicate) -> swap_args; + +/// @brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) +using arg_max = arg_less>>; + template struct ScanBySegmentOp { diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index d0904fc77f3..fca7ca9a511 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -4,6 +4,10 @@ #include +#include +#include +#include + #include #include "catch2_test_device_reduce.cuh" @@ -76,6 +80,16 @@ enum class gen_data_t : int GEN_TYPE_CONST }; +struct abs_less_t +{ + template + _CCCL_API auto operator()(const T& a, const T& b) const -> bool + { + // need to use `uabs` to avoid integer overflow in case of abs(INT_MIN) + return cuda::uabs(a) < cuda::uabs(b); + } +}; + C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list) { using params = params_t; @@ -112,6 +126,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f } auto d_in_it = thrust::raw_pointer_cast(in_items.data()); + CAPTURE(c2h::type_name(), c2h::type_name(), num_items); + #if TEST_TYPES != 4 SECTION("reduce") { @@ -265,5 +281,53 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f REQUIRE(expected_result[0] == gpu_value); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key); } + +# if TEST_TYPES < 2 + SECTION("argmin-abs_less_t") + { + abs_less_t compare_op; + + // Prepare verification data + c2h::host_vector host_items(in_items); + auto expected_result = cuda::std::min_element(host_items.cbegin(), host_items.cend(), compare_op); + + // Run test + using result_t = cuda::std::pair>; + c2h::device_vector out_result(num_segments); + auto d_result_ptr = thrust::raw_pointer_cast(out_result.data()); + auto d_index_out = &d_result_ptr->first; + auto d_extremum_out = &d_result_ptr->second; + device_arg_min(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items, compare_op); + + // Verify result + result_t gpu_result = out_result[0]; + output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + REQUIRE(expected_result[0] == gpu_extremum); + REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first); + } + + SECTION("argmax-abs_less_t") + { + abs_less_t compare_op; + + // Prepare verification data + c2h::host_vector host_items(in_items); + auto expected_result = cuda::std::max_element(host_items.cbegin(), host_items.cend(), compare_op); + + // Run test + using result_t = cuda::std::pair>; + c2h::device_vector out_result(num_segments); + auto d_result_ptr = thrust::raw_pointer_cast(out_result.data()); + auto d_index_out = &d_result_ptr->first; + auto d_extremum_out = &d_result_ptr->second; + device_arg_max(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items, compare_op); + + // Verify result + result_t gpu_result = out_result[0]; + output_t gpu_extremum = static_cast(gpu_result.second); // Explicitly rewrap the gpu value + REQUIRE(expected_result[0] == gpu_extremum); + REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first); + } +# endif #endif }