diff --git a/cub/benchmarks/bench/reduce/arg_extrema.cu b/cub/benchmarks/bench/reduce/arg_extrema.cu
index 04a6748bd42..9d9019bfdb9 100644
--- a/cub/benchmarks/bench/reduce/arg_extrema.cu
+++ b/cub/benchmarks/bench/reduce/arg_extrema.cu
@@ -45,26 +45,26 @@ void arg_reduce(nvbench::state& state, nvbench::type_list<T, OpT>)
   // Iterator providing the values being reduced
   using values_it_t = T*;
 
-  // Type used for the final result
-  using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;
-
-  auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
+  auto const init = ::cuda::std::is_same_v<OpT, cub::detail::arg_min>
                     ? ::cuda::std::numeric_limits<T>::max()
                     : ::cuda::std::numeric_limits<T>::lowest();
 
   // Retrieve axis parameters
   const auto elements         = static_cast<std::size_t>(state.get_int64("Elements{io}"));
   thrust::device_vector<T> in = generate(elements);
-  thrust::device_vector<output_tuple_t> out(1);
+  thrust::device_vector<global_offset_t> out_index(1);
+  thrust::device_vector<T> out_extremum(1);
 
-  values_it_t d_in      = thrust::raw_pointer_cast(in.data());
-  output_tuple_t* d_out = thrust::raw_pointer_cast(out.data());
-  auto const num_items  = static_cast<global_offset_t>(elements);
+  values_it_t d_in             = thrust::raw_pointer_cast(in.data());
+  global_offset_t* d_out_index = thrust::raw_pointer_cast(out_index.data());
+  T* d_out_extremum            = thrust::raw_pointer_cast(out_extremum.data());
+  auto const num_items         = static_cast<global_offset_t>(elements);
 
   // Enable throughput calculations and add "Size" column to results.
   state.add_element_count(elements);
   state.add_global_memory_reads<T>(elements, "Size");
-  state.add_global_memory_writes<output_tuple_t>(1);
+  state.add_global_memory_writes<global_offset_t>(1);
+  state.add_global_memory_writes<T>(1);
 
   // Allocate temporary storage
   std::size_t temp_size;
@@ -72,10 +72,10 @@ void arg_reduce(nvbench::state& state, nvbench::type_list<T, OpT>)
     nullptr,
     temp_size,
     d_in,
-    d_out,
+    d_out_index,
+    d_out_extremum,
     num_items,
     OpT{},
-    init,
     0 /* stream */
 #if !TUNE_BASE
     ,
@@ -91,10 +91,10 @@ void arg_reduce(nvbench::state& state, nvbench::type_list<T, OpT>)
       temp_storage,
       temp_size,
       d_in,
-      d_out,
+      d_out_index,
+      d_out_extremum,
       num_items,
       OpT{},
-      init,
       launch.get_stream()
 #if !TUNE_BASE
         ,
@@ -104,7 +104,7 @@ void arg_reduce(nvbench::state& state, nvbench::type_list<T, OpT>)
   });
 }
 
-using op_types = nvbench::type_list<cub::ArgMin, cub::ArgMax>;
+using op_types = nvbench::type_list<cub::detail::arg_min, cub::detail::arg_max>;
 
 NVBENCH_BENCH_TYPES(arg_reduce, NVBENCH_TYPE_AXES(fundamental_types, op_types))
   .set_name("base")
diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index b2dd125cfff..af3be0dacf5 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -43,9 +43,11 @@
 #include <cuda/std/__functional/identity.h>
 #include <cuda/std/__functional/invoke.h>
 #include <cuda/std/__functional/operations.h>
+#include <cuda/std/__iterator/indirectly_comparable.h>
 #include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_integral.h>
 #include <cuda/std/__type_traits/is_same.h>
+#include <cuda/std/__utility/forward.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
 
@@ -56,23 +58,6 @@ namespace detail
 template <typename DeterminismT>
 inline constexpr bool is_non_deterministic_v =
   ::cuda::std::is_same_v<DeterminismT, ::cuda::execution::determinism::not_guaranteed_t>;
-
-namespace reduce
-{
-template <typename ExtremumOutIteratorT, typename IndexOutIteratorT>
-struct unzip_and_write_arg_extremum_op
-{
-  ExtremumOutIteratorT result_out_it;
-  IndexOutIteratorT index_out_it;
-
-  template <typename IndexT, typename KeyValuePairT>
-  _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(IndexT, KeyValuePairT reduced_result)
-  {
-    *result_out_it = reduced_result.value;
-    *index_out_it  = reduced_result.key;
-  }
-};
-} // namespace reduce
 } // namespace detail
 
 //! @rst
@@ -948,18 +933,46 @@ public:
     });
   }
 
+private:
+  template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT, typename CompareOpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t __arg_min(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    ExtremumOutIteratorT d_min_out,
+    IndexOutIteratorT d_index_out,
+    ::cuda::std::int64_t num_items,
+    CompareOpT compare_op,
+    cudaStream_t stream)
+  {
+    using PerPartitionOffsetT = int; // used by the kernel to index within one partition
+    using GlobalOffsetT       = ::cuda::std::int64_t; // in the range [d_in, d_in + num_items)
+
+    return detail::reduce::dispatch_streaming_arg_reduce<PerPartitionOffsetT>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_min_out,
+      d_index_out,
+      static_cast<GlobalOffsetT>(num_items),
+      detail::arg_less{compare_op},
+      stream);
+  }
+
+public:
   //! @rst
-  //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
+  //! Finds the first device-wide minimum based on a given comparison operator and also returns the index of that item.
   //!
-  //! .. versionadded:: 2.2.0
-  //!    First appears in CUDA Toolkit 12.3.
+  //! .. versionadded:: 3.4.0
+  //!    First appears in CUDA Toolkit 13.4.
   //!
   //! - The minimum is written to ``d_min_out``
   //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
   //!   ``cuda::std::int64_t``.
-  //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()}`` is written to ``d_min_out``  and the index
-  //!   ``1`` is written to ``d_index_out``.
-  //! - Does not support ``<`` operators that are non-commutative.
+  //! - For zero-length inputs, the index ``1`` is written to ``d_index_out`` and, if ``compare_op`` is
+  //!   ``cuda::std::less`` and ``cuda::std::numeric_limits<T>::is_specialized is ``true``,
+  //!   ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out``, otherwise ``T{}``.
+  //! - Does not support comparison operators that are non-commutative.
   //! - Provides "run-to-run" determinism for pseudo-associative reduction
   //!   (e.g., addition of floating point types) on the same GPU device.
   //!   However, results for pseudo-associative reduction may be inconsistent
@@ -982,25 +995,33 @@ public:
   //!    // Declare, allocate, and initialize device-accessible pointers
   //!    // for input and output
   //!    int                num_items;    // e.g., 7
-  //!    int                *d_in;        // e.g., [8, 6, 7, 5, 3, 0, 9]
+  //!    int                *d_in;        // e.g., [8, 6, -7, 5, 3, 1, -9]
   //!    int                *d_min_out;   // memory for the minimum value
   //!    cuda::std::int64_t *d_index_out; // memory for the index of the returned value
   //!    ...
   //!
+  //!    // Define the comparison operator
+  //!    struct abs_less_t {
+  //!      template <typename T>
+  //!      __host__ __device__ bool operator()(const T& a, const T& b) const {
+  //!        return cuda::std::abs(a) < cuda::std::abs(b);
+  //!      }
+  //!    };
+  //!
   //!    // Determine temporary device storage requirements
   //!    void     *d_temp_storage = nullptr;
   //!    size_t   temp_storage_bytes = 0;
   //!    cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
-  //!    num_items);
+  //!    num_items, abs_less_t{});
   //!
   //!    // Allocate temporary storage
   //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
   //!
   //!    // Run argmin-reduction
   //!    cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
-  //!    num_items);
+  //!    num_items, abs_less_t{});
   //!
-  //!    // d_min_out   <-- 0
+  //!    // d_min_out   <-- 1
   //!    // d_index_out <-- 5
   //!
   //! @endrst
@@ -1031,6 +1052,9 @@ public:
   //! @param[out] d_index_out
   //!   Iterator to which the index of the returned value is written
   //!
+  //! @param[in] compare_op
+  //!   Comparison operator returning ``true`` if the first argument is less than the second
+  //!
   //! @param[in] num_items
   //!   Total number of input items (i.e., length of ``d_in``)
   //!
@@ -1038,7 +1062,14 @@ public:
   //!   @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!   @endrst
-  template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
+  template <
+    typename InputIteratorT,
+    typename ExtremumOutIteratorT,
+    typename IndexOutIteratorT,
+    typename CompareOpT,
+    // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of
+    // ExtremumOutIteratorT, which is wrong IMO
+    ::cuda::std::enable_if_t<::cuda::std::indirectly_comparable<InputIteratorT, InputIteratorT, CompareOpT>, int> = 0>
   CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
@@ -1046,41 +1077,112 @@ public:
     ExtremumOutIteratorT d_min_out,
     IndexOutIteratorT d_index_out,
     ::cuda::std::int64_t num_items,
+    CompareOpT compare_op,
     cudaStream_t stream = 0)
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");
+    return __arg_min(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out, num_items, compare_op, stream);
+  }
 
-    // The input type
-    using InputValueT = cub::detail::it_value_t<InputIteratorT>;
+  //! @rst
+  //! .. versionadded:: 2.2.0
+  //!    First appears in CUDA Toolkit 12.3.
+  //! @overload
+  //! @note Uses ``cuda::std::less`` as comparison operator
+  //! @endrst
+  template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    ExtremumOutIteratorT d_min_out,
+    IndexOutIteratorT d_index_out,
+    ::cuda::std::int64_t num_items,
+    cudaStream_t stream = 0)
+  {
+    return ArgMin(
+      d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out, num_items, ::cuda::std::less{}, stream);
+  }
 
-    // Offset type used within the kernel and to index within one partition
-    using PerPartitionOffsetT = int;
+  template <typename InputIteratorT,
+            typename ExtremumOutIteratorT,
+            typename IndexOutIteratorT,
+            typename CompareOpT,
+            typename EnvT>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t __arg_min_env(
+    InputIteratorT d_in,
+    ExtremumOutIteratorT d_min_out,
+    IndexOutIteratorT d_index_out,
+    ::cuda::std::int64_t num_items,
+    CompareOpT compare_op,
+    EnvT env = {})
+  {
+    static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
+                  "Determinism should be used inside requires to have an effect.");
+    using requirements_t = ::cuda::std::execution::
+      __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
+    using requested_determinism_t =
+      ::cuda::std::execution::__query_result_or_t<requirements_t, //
+                                                  ::cuda::execution::determinism::__get_determinism_t,
+                                                  ::cuda::execution::determinism::run_to_run_t>;
 
-    // Offset type used to index within the total input in the range [d_in, d_in + num_items)
-    using GlobalOffsetT = ::cuda::std::int64_t;
+    // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
+    static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
+                  "gpu_to_gpu determinism is not supported");
 
-    // The value type used for the extremum
-    using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
+    // Query relevant properties from the environment
+    auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env);
+    auto mr     = ::cuda::__call_or(::cuda::mr::get_memory_resource, detail::device_memory_resource{}, env);
 
-    // Reduction operation
-    using ReduceOpT = cub::ArgMin;
+    using PerPartitionOffsetT = int; // used by the kernel to index within one partition
+    using GlobalOffsetT       = ::cuda::std::int64_t; // in the range [d_in, d_in + num_items)
 
-    // Initial value
-    OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
+    void* d_temp_storage      = nullptr;
+    size_t temp_storage_bytes = 0;
+
+    // Query the required temporary storage size
+    if (const auto error = detail::reduce::dispatch_streaming_arg_reduce<PerPartitionOffsetT>(
+          d_temp_storage,
+          temp_storage_bytes,
+          d_in,
+          d_min_out,
+          d_index_out,
+          static_cast<GlobalOffsetT>(num_items),
+          detail::arg_less{compare_op},
+          stream.get()))
+    {
+      return error;
+    }
 
-    // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = ::cuda::make_tabulate_output_iterator(
-      detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
+    // TODO(gevtushenko): use uninitialized buffer when it's available
+    if (const auto error =
+          CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr)))
+    {
+      return error;
+    }
 
-    return detail::reduce::dispatch_streaming_arg_reduce<PerPartitionOffsetT>(
+    // Run the algorithm
+    const auto error = detail::reduce::dispatch_streaming_arg_reduce<PerPartitionOffsetT>(
       d_temp_storage,
       temp_storage_bytes,
       d_in,
-      out_it,
+      d_min_out,
+      d_index_out,
       static_cast<GlobalOffsetT>(num_items),
-      ReduceOpT{},
-      initial_value,
-      stream);
+      detail::arg_less{compare_op},
+      stream.get());
+
+    // Try to deallocate regardless of the error to avoid memory leaks
+    const auto deallocate_error =
+      CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
+
+    if (error != cudaSuccess)
+    {
+      // Reduction error takes precedence over deallocation error since it happens first
+      return error;
+    }
+
+    return deallocate_error;
   }
 
   //! @rst
@@ -1092,8 +1194,9 @@ public:
   //! - The minimum is written to ``d_min_out``
   //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
   //!   ``cuda::std::int64_t``.
-  //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()}`` is written to ``d_min_out``  and the index
-  //!   ``1`` is written to ``d_index_out``.
+  //! - For zero-length inputs, the index ``1`` is written to ``d_index_out`` and, if ``compare_op`` is
+  //!   ``cuda::std::less`` and ``cuda::std::numeric_limits<T>::is_specialized is ``true``,
+  //!   ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out``, otherwise ``T{}``.
   //! - Does not support ``<`` operators that are non-commutative.
   //! - Provides determinism based on the environment's determinism requirements.
   //!   To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
@@ -1135,6 +1238,9 @@ public:
   //! @param[out] d_index_out
   //!   Iterator to which the index of the returned value is written
   //!
+  //! @param[in] compare_op
+  //!   Comparison operator returning ``true`` if the first argument is less than the second
+  //!
   //! @param[in] num_items
   //!   Total number of input items (i.e., length of ``d_in``)
   //!
@@ -1142,10 +1248,36 @@ public:
   //!   @rst
   //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   //!   @endrst
+  template <
+    typename InputIteratorT,
+    typename ExtremumOutIteratorT,
+    typename IndexOutIteratorT,
+    typename CompareOpT,
+    typename EnvT = ::cuda::std::execution::env<>,
+    // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of
+    // ExtremumOutIteratorT, which is wrong IMO
+    ::cuda::std::enable_if_t<::cuda::std::indirectly_comparable<InputIteratorT, InputIteratorT, CompareOpT>, int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(
+    InputIteratorT d_in,
+    ExtremumOutIteratorT d_min_out,
+    IndexOutIteratorT d_index_out,
+    ::cuda::std::int64_t num_items,
+    CompareOpT compare_op,
+    EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMin");
+    return __arg_min_env(d_in, d_min_out, d_index_out, num_items, compare_op, env);
+  }
+
+  //! @overload
+  //! @note Uses ``cuda::std::less`` as comparison operator
   template <typename InputIteratorT,
             typename ExtremumOutIteratorT,
             typename IndexOutIteratorT,
-            typename EnvT = ::cuda::std::execution::env<>>
+            typename EnvT = ::cuda::std::execution::env<>,
+            // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of
+            // ExtremumOutIteratorT, which is wrong IMO
+            ::cuda::std::enable_if_t<!::cuda::std::indirectly_comparable<InputIteratorT, InputIteratorT, EnvT>, int> = 0>
   [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
   ArgMin(InputIteratorT d_in,
          ExtremumOutIteratorT d_min_out,
@@ -1153,86 +1285,7 @@ public:
          ::cuda::std::int64_t num_items,
          EnvT env = {})
   {
-    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMin");
-
-    static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
-                  "Determinism should be used inside requires to have an effect.");
-    using requirements_t = ::cuda::std::execution::
-      __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
-    using requested_determinism_t =
-      ::cuda::std::execution::__query_result_or_t<requirements_t, //
-                                                  ::cuda::execution::determinism::__get_determinism_t,
-                                                  ::cuda::execution::determinism::run_to_run_t>;
-
-    // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
-    static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
-                  "gpu_to_gpu determinism is not supported");
-
-    // Query relevant properties from the environment
-    auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env);
-    auto mr     = ::cuda::__call_or(::cuda::mr::get_memory_resource, detail::device_memory_resource{}, env);
-
-    void* d_temp_storage      = nullptr;
-    size_t temp_storage_bytes = 0;
-
-    // Reduction operation
-    using ReduceOpT           = cub::ArgMin;
-    using InputValueT         = cub::detail::it_value_t<InputIteratorT>;
-    using PerPartitionOffsetT = int;
-    using GlobalOffsetT       = ::cuda::std::int64_t;
-
-    using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
-
-    // Initial value
-    OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
-
-    // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = ::cuda::make_tabulate_output_iterator(
-      detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
-
-    // Query the required temporary storage size
-    if (const auto error = detail::reduce::dispatch_streaming_arg_reduce<PerPartitionOffsetT>(
-          d_temp_storage,
-          temp_storage_bytes,
-          d_in,
-          out_it,
-          static_cast<GlobalOffsetT>(num_items),
-          ReduceOpT{},
-          initial_value,
-          stream.get()))
-    {
-      return error;
-    }
-
-    // TODO(gevtushenko): use uninitialized buffer when it's available
-    if (const auto error =
-          CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr)))
-    {
-      return error;
-    }
-
-    // Run the algorithm
-    const auto error = detail::reduce::dispatch_streaming_arg_reduce<PerPartitionOffsetT>(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      out_it,
-      static_cast<GlobalOffsetT>(num_items),
-      ReduceOpT{},
-      initial_value,
-      stream.get());
-
-    // Try to deallocate regardless of the error to avoid memory leaks
-    const auto deallocate_error =
-      CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
-
-    if (error != cudaSuccess)
-    {
-      // Reduction error takes precedence over deallocation error since it happens first
-      return error;
-    }
-
-    return deallocate_error;
+    return ArgMin(d_in, d_min_out, d_index_out, num_items, ::cuda::std::less{}, env);
   }
 
   //! @rst
@@ -1354,7 +1407,14 @@ public:
     InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
 
     return detail::reduce::dispatch<AccumT>(
-      d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, OffsetT{num_items}, cub::ArgMin(), initial_value, stream);
+      d_temp_storage,
+      temp_storage_bytes,
+      d_indexed_in,
+      d_out,
+      OffsetT{num_items},
+      detail::arg_min{},
+      initial_value,
+      stream);
   }
 
   //! @rst
@@ -1567,17 +1627,17 @@ public:
   }
 
   //! @rst
-  //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
-  //! item.
+  //! Finds the first device-wide maximum based on a given comparison operator and also returns the index of that item.
   //!
-  //! .. versionadded:: 2.2.0
-  //!    First appears in CUDA Toolkit 12.3.
+  //! .. versionadded:: 3.4.0
+  //!    First appears in CUDA Toolkit 13.4.
   //!
   //! - The maximum is written to ``d_max_out``
   //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
   //!   ``cuda::std::int64_t``.
-  //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()}`` is written to ``d_max_out``  and the index
-  //!   ``1`` is written to ``d_index_out``.
+  //! - For zero-length inputs, the index ``1`` is written to ``d_index_out`` and, if ``compare_op`` is
+  //!   ``cuda::std::less`` and ``cuda::std::numeric_limits<T>::is_specialized is ``true``,
+  //!   ``cuda::std::numeric_limits<T>::lowest()`` is written to ``d_min_out``, otherwise ``T{}``.
   //! - Does not support ``>`` operators that are non-commutative.
   //! - Provides "run-to-run" determinism for pseudo-associative reduction
   //!   (e.g., addition of floating point types) on the same GPU device.
@@ -1601,25 +1661,33 @@ public:
   //!    // Declare, allocate, and initialize device-accessible pointers
   //!    // for input and output
   //!    int                num_items;    // e.g., 7
-  //!    int                *d_in;        // e.g., [8, 6, 7, 5, 3, 0, 9]
+  //!    int                *d_in;        // e.g., [8, 6, -7, 5, 3, 1, -9]
   //!    int                *d_max_out;   // memory for the maximum value
   //!    cuda::std::int64_t *d_index_out; // memory for the index of the returned value
   //!    ...
   //!
+  //!    // Define the comparison operator
+  //!    struct abs_less_t {
+  //!      template <typename T>
+  //!      __host__ __device__ bool operator()(const T& a, const T& b) const {
+  //!        return cuda::std::abs(a) < cuda::std::abs(b);
+  //!      }
+  //!    };
+  //!
   //!    // Determine temporary device storage requirements
   //!    void     *d_temp_storage = nullptr;
   //!    size_t   temp_storage_bytes = 0;
   //!    cub::DeviceReduce::ArgMax(
-  //!      d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
+  //!      d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items, abs_less_t{});
   //!
   //!    // Allocate temporary storage
   //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
   //!
   //!    // Run argmax-reduction
   //!    cub::DeviceReduce::ArgMax(
-  //!      d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
+  //!      d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items, abs_less_t{});
   //!
-  //!    // d_max_out   <-- 9
+  //!    // d_max_out   <-- -9
   //!    // d_index_out <-- 6
   //!
   //! @endrst
@@ -1649,6 +1717,9 @@ public:
   //! @param[out] d_index_out
   //!   Iterator to which the index of the returned value is written
   //!
+  //! @param[in] compare_op
+  //!   Comparison operator returning ``true`` if the first argument is less than the second
+  //!
   //! @param[in] num_items
   //!   Total number of input items (i.e., length of ``d_in``)
   //!
@@ -1656,7 +1727,14 @@ public:
   //!   @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!   @endrst
-  template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
+  template <
+    typename InputIteratorT,
+    typename ExtremumOutIteratorT,
+    typename IndexOutIteratorT,
+    typename CompareOpT,
+    // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of
+    // ExtremumOutIteratorT, which is wrong IMO
+    ::cuda::std::enable_if_t<::cuda::std::indirectly_comparable<InputIteratorT, InputIteratorT, CompareOpT>, int> = 0>
   CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
@@ -1664,43 +1742,41 @@ public:
     ExtremumOutIteratorT d_max_out,
     IndexOutIteratorT d_index_out,
     ::cuda::std::int64_t num_items,
+    CompareOpT compare_op,
     cudaStream_t stream = 0)
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");
-
-    // The input type
-    using InputValueT = cub::detail::it_value_t<InputIteratorT>;
-
-    // Offset type used within the kernel and to index within one partition
-    using PerPartitionOffsetT = int;
-
-    // Offset type used to index within the total input in the range [d_in, d_in + num_items)
-    using GlobalOffsetT = ::cuda::std::int64_t;
-
-    // The value type used for the extremum
-    using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
-
-    // Reduction operation
-    using ReduceOpT = cub::ArgMax;
-
-    // Initial value
-    OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
-
-    // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = ::cuda::make_tabulate_output_iterator(
-      detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
-
-    return detail::reduce::dispatch_streaming_arg_reduce<PerPartitionOffsetT>(
+    return __arg_min(
       d_temp_storage,
       temp_storage_bytes,
       d_in,
-      out_it,
-      static_cast<GlobalOffsetT>(num_items),
-      ReduceOpT{},
-      initial_value,
+      d_max_out,
+      d_index_out,
+      num_items,
+      detail::arg_less{detail::swap_args{compare_op}},
       stream);
   }
 
+  //! @rst
+  //! .. versionadded:: 2.2.0
+  //!    First appears in CUDA Toolkit 12.3.
+  //! @overload
+  //! @note Uses ``cuda::std::less`` as comparison operator
+  //! @endrst
+  template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    ExtremumOutIteratorT d_max_out,
+    IndexOutIteratorT d_index_out,
+    ::cuda::std::int64_t num_items,
+    cudaStream_t stream = 0)
+  {
+    return ArgMax(
+      d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items, ::cuda::std::less{}, stream);
+  }
+
   //! @rst
   //! Finds the first device-wide maximum using the greater-than (``>``)
   //! operator, also returning the index of that item
@@ -1824,21 +1900,29 @@ public:
     InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
 
     return detail::reduce::dispatch<AccumT>(
-      d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, OffsetT{num_items}, cub::ArgMax(), initial_value, stream);
+      d_temp_storage,
+      temp_storage_bytes,
+      d_indexed_in,
+      d_out,
+      OffsetT{num_items},
+      detail::arg_max{},
+      initial_value,
+      stream);
   }
 
   //! @rst
   //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
   //! item.
   //!
-  //! .. versionadded:: 2.2.0
-  //!    First appears in CUDA Toolkit 12.3.
+  //! .. versionadded:: 3.4.0
+  //!    First appears in CUDA Toolkit 13.4.
   //!
   //! - The maximum is written to ``d_max_out``
   //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
   //!   ``cuda::std::int64_t``.
-  //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::lowest()}`` is written to ``d_max_out``  and the index
-  //!   ``1`` is written to ``d_index_out``.
+  //! - For zero-length inputs, the index ``1`` is written to ``d_index_out`` and, if ``compare_op`` is
+  //!   ``cuda::std::less`` and ``cuda::std::numeric_limits<T>::is_specialized is ``true``,
+  //!   ``cuda::std::numeric_limits<T>::lowest()`` is written to ``d_min_out``, otherwise ``T{}``.
   //! - Does not support ``>`` operators that are non-commutative.
   //! - Provides determinism based on the environment's determinism requirements.
   //!   To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
@@ -1880,6 +1964,9 @@ public:
   //! @param[out] d_index_out
   //!   Iterator to which the index of the returned value is written
   //!
+  //! @param[in] compare_op
+  //!   Comparison operator returning ``true`` if the first argument is less than the second
+  //!
   //! @param[in] num_items
   //!   Total number of input items (i.e., length of ``d_in``)
   //!
@@ -1887,6 +1974,30 @@ public:
   //!   @rst
   //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
   //!   @endrst
+  template <
+    typename InputIteratorT,
+    typename ExtremumOutIteratorT,
+    typename IndexOutIteratorT,
+    typename CompareOpT,
+    typename EnvT = ::cuda::std::execution::env<>,
+    // TODO(bgruber): this constraint is not accurate, since the implementation will compare the value types of
+    // ExtremumOutIteratorT, which is wrong IMO
+    ::cuda::std::enable_if_t<::cuda::std::indirectly_comparable<InputIteratorT, InputIteratorT, CompareOpT>, int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
+    InputIteratorT d_in,
+    ExtremumOutIteratorT d_max_out,
+    IndexOutIteratorT d_index_out,
+    ::cuda::std::int64_t num_items,
+    CompareOpT compare_op,
+    EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMax");
+    return __arg_min_env(
+      d_in, d_index_out, num_items, detail::arg_less{detail::swap_args{compare_op}}, ::cuda::std::execution::env{});
+  }
+
+  //! @overload
+  //! @note Uses ``cuda::std::less`` as comparison operator
   template <typename InputIteratorT,
             typename ExtremumOutIteratorT,
             typename IndexOutIteratorT,
@@ -1899,85 +2010,7 @@ public:
          EnvT env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMax");
-
-    static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
-                  "Determinism should be used inside requires to have an effect.");
-    using requirements_t = ::cuda::std::execution::
-      __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
-    using requested_determinism_t =
-      ::cuda::std::execution::__query_result_or_t<requirements_t, //
-                                                  ::cuda::execution::determinism::__get_determinism_t,
-                                                  ::cuda::execution::determinism::run_to_run_t>;
-
-    // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
-    static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
-                  "gpu_to_gpu determinism is not supported");
-
-    // Query relevant properties from the environment
-    auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env);
-    auto mr     = ::cuda::__call_or(::cuda::mr::get_memory_resource, detail::device_memory_resource{}, env);
-
-    void* d_temp_storage      = nullptr;
-    size_t temp_storage_bytes = 0;
-
-    // Reduction operation
-    using ReduceOpT           = cub::ArgMax;
-    using InputValueT         = cub::detail::it_value_t<InputIteratorT>;
-    using PerPartitionOffsetT = int;
-    using GlobalOffsetT       = ::cuda::std::int64_t;
-
-    using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
-
-    // Initial value
-    OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
-
-    // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-    auto out_it = ::cuda::make_tabulate_output_iterator(
-      detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
-
-    // Query the required temporary storage size
-    if (const auto error = detail::reduce::dispatch_streaming_arg_reduce<PerPartitionOffsetT>(
-          d_temp_storage,
-          temp_storage_bytes,
-          d_in,
-          out_it,
-          static_cast<GlobalOffsetT>(num_items),
-          ReduceOpT{},
-          initial_value,
-          stream.get()))
-    {
-      return error;
-    }
-
-    // TODO(gevtushenko): use uninitialized buffer when it's available
-    if (const auto error =
-          CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr)))
-    {
-      return error;
-    }
-
-    // Run the algorithm
-    const auto error = detail::reduce::dispatch_streaming_arg_reduce<PerPartitionOffsetT>(
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      out_it,
-      static_cast<GlobalOffsetT>(num_items),
-      ReduceOpT{},
-      initial_value,
-      stream.get());
-
-    // Try to deallocate regardless of the error to avoid memory leaks
-    const auto deallocate_error =
-      CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
-
-    if (error != cudaSuccess)
-    {
-      // Reduction error takes precedence over deallocation error since it happens first
-      return error;
-    }
-
-    return deallocate_error;
+    return __arg_min_env(d_in, d_max_out, d_index_out, num_items, detail::arg_max{}, env);
   }
 
   //! @rst
diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh
index bebe1f9bcb4..9c87aed6546 100644
--- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh
@@ -112,6 +112,20 @@ struct local_to_global_op
   }
 };
 
+template <typename ExtremumOutIteratorT, typename IndexOutIteratorT>
+struct unzip_and_write_arg_extremum_op
+{
+  ExtremumOutIteratorT result_out_it;
+  IndexOutIteratorT index_out_it;
+
+  template <typename IndexT, typename KeyValuePairT>
+  _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(IndexT, KeyValuePairT reduced_result)
+  {
+    *result_out_it = reduced_result.value;
+    *index_out_it  = reduced_result.key;
+  }
+};
+
 /******************************************************************************
  * Single-problem streaming reduction dispatch
  *****************************************************************************/
@@ -143,15 +157,16 @@ struct local_to_global_op
 //
 // @tparam PolicySelector
 //   Selects the tuning policy
-template <
-  typename PerPartitionOffsetT,
-  typename InputIteratorT,
-  typename OutputIteratorT,
-  typename GlobalOffsetT,
-  typename ReductionOpT,
-  typename InitT,
-  typename PolicySelector =
-    reduce::policy_selector_from_types<KeyValuePair<PerPartitionOffsetT, InitT>, PerPartitionOffsetT, ReductionOpT>>
+template <typename PerPartitionOffsetT,
+          typename InputIteratorT,
+          typename ExtremumOutIteratorT,
+          typename IndexOutIteratorT,
+          typename GlobalOffsetT,
+          typename ReductionOpT,
+          typename PolicySelector = policy_selector_from_types<
+            KeyValuePair<PerPartitionOffsetT, non_void_value_t<ExtremumOutIteratorT, it_value_t<InputIteratorT>>>,
+            PerPartitionOffsetT,
+            ReductionOpT>>
 #  if _CCCL_HAS_CONCEPTS()
   requires reduce_policy_selector<PolicySelector>
 #  endif // _CCCL_HAS_CONCEPTS()
@@ -159,31 +174,32 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_arg_reduce
   void* d_temp_storage,
   size_t& temp_storage_bytes,
   InputIteratorT d_in,
-  OutputIteratorT d_result_out,
+  ExtremumOutIteratorT d_min_out,
+  IndexOutIteratorT d_index_out,
   GlobalOffsetT num_items,
   ReductionOpT reduce_op,
-  InitT init,
   cudaStream_t stream,
   PolicySelector policy_selector = {})
 {
+  using input_value_t     = it_value_t<InputIteratorT>;
+  using output_extremum_t = non_void_value_t<ExtremumOutIteratorT, input_value_t>;
+
+  // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
+  auto d_result_out = ::cuda::make_tabulate_output_iterator(
+    detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
+
   // Wrapped input iterator to produce index-value tuples, i.e., <PerPartitionOffsetT, InputT>-tuples
   // We make sure to offset the user-provided input iterator by the current partition's offset
-  using arg_index_input_iterator_t = ArgIndexInputIterator<InputIteratorT, PerPartitionOffsetT, InitT>;
-
-  // The type used for the aggregate that the user wants to find the extremum for
-  using output_aggregate_t = InitT;
+  using arg_index_input_iterator_t = ArgIndexInputIterator<InputIteratorT, PerPartitionOffsetT, output_extremum_t>;
 
   // The output tuple type (i.e., extremum plus index tuples)
-  using per_partition_accum_t = KeyValuePair<PerPartitionOffsetT, output_aggregate_t>;
-  using global_accum_t        = KeyValuePair<GlobalOffsetT, output_aggregate_t>;
+  using per_partition_accum_t = KeyValuePair<PerPartitionOffsetT, output_extremum_t>;
+  using global_accum_t        = KeyValuePair<GlobalOffsetT, output_extremum_t>;
 
   // Unary promotion operator type that is used to transform a per-partition result to a global result
   // operator()(per_partition_accum_t) -> global_accum_t
   using local_to_global_op_t = local_to_global_op<GlobalOffsetT>;
 
-  // Empty problem initialization type
-  using empty_problem_init_t = empty_problem_init_t<per_partition_accum_t>;
-
   // The current partition's input iterator is an ArgIndex iterator that generates indices relative to the beginning
   // of the current partition, i.e., [0, partition_size) along with an OffsetIterator that offsets the user-provided
   // input iterator by the current partition's offset
@@ -209,18 +225,33 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_arg_reduce
 
   // Reduction operator type that enables accumulating per-partition results to a global reduction result
   using accumulating_transform_output_op_t =
-    accumulating_transform_output_op<global_accum_t, local_to_global_op_t, ReductionOpT, OutputIteratorT>;
-
+    accumulating_transform_output_op<global_accum_t, local_to_global_op_t, ReductionOpT, decltype(d_result_out)>;
   auto accumulating_out_op = accumulating_transform_output_op_t{
     true, is_single_partition, nullptr, nullptr, d_result_out, local_to_global_op, reduce_op};
 
-  empty_problem_init_t initial_value{{PerPartitionOffsetT{1}, init}};
+  // Initial value for empty problems, according to documented contract
+  const auto empty_problem_extremum = static_cast<output_extremum_t>([] {
+    if constexpr (::cuda::std::is_same_v<ReductionOpT, arg_min>
+                  && ::cuda::std::numeric_limits<input_value_t>::is_specialized)
+    {
+      return ::cuda::std::numeric_limits<input_value_t>::max();
+    }
+    else if constexpr (::cuda::std::is_same_v<ReductionOpT, arg_max>
+                       && ::cuda::std::numeric_limits<input_value_t>::is_specialized)
+    {
+      return ::cuda::std::numeric_limits<input_value_t>::lowest();
+    }
+    else
+    {
+      return input_value_t{};
+    }
+  }());
+  auto initial_value = empty_problem_init_t<per_partition_accum_t>{{PerPartitionOffsetT{1}, empty_problem_extremum}};
 
   void* allocations[2]       = {nullptr, nullptr};
   size_t allocation_sizes[2] = {0, 2 * sizeof(global_accum_t)};
 
   // Query temporary storage requirements for per-partition reduction
-
   reduce::dispatch<per_partition_accum_t>(
     nullptr,
     allocation_sizes[0],
@@ -233,8 +264,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_streaming_arg_reduce
     ::cuda::std::identity{},
     policy_selector);
 
-  // Alias the temporary allocations from the single storage blob (or compute the necessary size
-  // of the blob)
+  // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
   if (const auto error = CubDebug(alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
   {
     return error;
diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index 6b214e106be..f31dbf9ea48 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -106,36 +106,29 @@ struct ArgMin
 
 namespace detail
 {
-/// @brief Arg max functor (keeps the value and offset of the first occurrence
-///        of the larger item)
-struct arg_max
+// Less-than comparator for an index/value pair that compares values first, and indices when the values are equal
+template <typename ValueLessThen = ::cuda::std::less<>>
+struct arg_less : ValueLessThen
 {
-  /// Boolean max operator, preferring the item having the smaller offset in
-  /// case of ties
   template <typename T, typename OffsetT>
   _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ::cuda::std::pair<OffsetT, T>
   operator()(const ::cuda::std::pair<OffsetT, T>& a, const ::cuda::std::pair<OffsetT, T>& b) const
   {
-    if ((b.second > a.second) || ((a.second == b.second) && (b.first < a.first)))
+    const auto& less = static_cast<const ValueLessThen&>(*this);
+    if (less(b.second, a.second) || (!less(a.second, b.second) && b.first < a.first))
     {
       return b;
     }
 
     return a;
   }
-};
 
-/// @brief Arg min functor (keeps the value and offset of the first occurrence
-///        of the smallest item)
-struct arg_min
-{
-  /// Boolean min operator, preferring the item having the smaller offset in
-  /// case of ties
   template <typename T, typename OffsetT>
-  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ::cuda::std::pair<OffsetT, T>
-  operator()(const ::cuda::std::pair<OffsetT, T>& a, const ::cuda::std::pair<OffsetT, T>& b) const
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair<OffsetT, T>
+  operator()(const KeyValuePair<OffsetT, T>& a, const KeyValuePair<OffsetT, T>& b) const
   {
-    if ((b.second < a.second) || ((a.second == b.second) && (b.first < a.first)))
+    const auto& less = static_cast<const ValueLessThen&>(*this);
+    if (less(b.value, a.value) || (!less(a.value, b.value) && b.key < a.key))
     {
       return b;
     }
@@ -144,6 +137,29 @@ struct arg_min
   }
 };
 
+template <typename ValueLessThen>
+arg_less(ValueLessThen) -> arg_less<ValueLessThen>;
+
+/// @brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
+using arg_min = arg_less<::cuda::std::less<>>;
+
+//! @brief Binary functor swapping the arguments to ``operator()`` before forwarding to an inner functor
+template <typename Predicate>
+struct swap_args : Predicate
+{
+  template <typename T, typename U>
+  _CCCL_API _CCCL_FORCEINLINE decltype(auto) operator()(T&& t, U&& u) const
+  {
+    return Predicate::operator()(::cuda::std::forward<U>(u), ::cuda::std::forward<T>(t));
+  }
+};
+
+template <typename Predicate>
+swap_args(Predicate) -> swap_args<Predicate>;
+
+/// @brief Arg max functor (keeps the value and offset of the first occurrence of the larger item)
+using arg_max = arg_less<swap_args<::cuda::std::less<>>>;
+
 template <typename ScanOpT>
 struct ScanBySegmentOp
 {
diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu
index d0904fc77f3..fca7ca9a511 100644
--- a/cub/test/catch2_test_device_reduce.cu
+++ b/cub/test/catch2_test_device_reduce.cu
@@ -4,6 +4,10 @@
 
 #include <cub/device/device_reduce.cuh>
 
+#include <cuda/__cmath/uabs.h>
+#include <cuda/std/__algorithm/max_element.h>
+#include <cuda/std/__algorithm/min_element.h>
+
 #include <cstdint>
 
 #include "catch2_test_device_reduce.cuh"
@@ -76,6 +80,16 @@ enum class gen_data_t : int
   GEN_TYPE_CONST
 };
 
+struct abs_less_t
+{
+  template <typename T>
+  _CCCL_API auto operator()(const T& a, const T& b) const -> bool
+  {
+    // need to use `uabs` to avoid integer overflow in case of abs(INT_MIN)
+    return cuda::uabs(a) < cuda::uabs(b);
+  }
+};
+
 C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", full_type_list)
 {
   using params   = params_t<TestType>;
@@ -112,6 +126,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
   }
   auto d_in_it = thrust::raw_pointer_cast(in_items.data());
 
+  CAPTURE(c2h::type_name<item_t>(), c2h::type_name<output_t>(), num_items);
+
 #if TEST_TYPES != 4
   SECTION("reduce")
   {
@@ -265,5 +281,53 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
     REQUIRE(expected_result[0] == gpu_value);
     REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key);
   }
+
+#  if TEST_TYPES < 2
+  SECTION("argmin-abs_less_t")
+  {
+    abs_less_t compare_op;
+
+    // Prepare verification data
+    c2h::host_vector<item_t> host_items(in_items);
+    auto expected_result = cuda::std::min_element(host_items.cbegin(), host_items.cend(), compare_op);
+
+    // Run test
+    using result_t = cuda::std::pair<cuda::std::int32_t, unwrap_value_t<output_t>>;
+    c2h::device_vector<result_t> out_result(num_segments);
+    auto d_result_ptr   = thrust::raw_pointer_cast(out_result.data());
+    auto d_index_out    = &d_result_ptr->first;
+    auto d_extremum_out = &d_result_ptr->second;
+    device_arg_min(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items, compare_op);
+
+    // Verify result
+    result_t gpu_result   = out_result[0];
+    output_t gpu_extremum = static_cast<output_t>(gpu_result.second); // Explicitly rewrap the gpu value
+    REQUIRE(expected_result[0] == gpu_extremum);
+    REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first);
+  }
+
+  SECTION("argmax-abs_less_t")
+  {
+    abs_less_t compare_op;
+
+    // Prepare verification data
+    c2h::host_vector<item_t> host_items(in_items);
+    auto expected_result = cuda::std::max_element(host_items.cbegin(), host_items.cend(), compare_op);
+
+    // Run test
+    using result_t = cuda::std::pair<cuda::std::int32_t, unwrap_value_t<output_t>>;
+    c2h::device_vector<result_t> out_result(num_segments);
+    auto d_result_ptr   = thrust::raw_pointer_cast(out_result.data());
+    auto d_index_out    = &d_result_ptr->first;
+    auto d_extremum_out = &d_result_ptr->second;
+    device_arg_max(unwrap_it(d_in_it), d_extremum_out, d_index_out, num_items, compare_op);
+
+    // Verify result
+    result_t gpu_result   = out_result[0];
+    output_t gpu_extremum = static_cast<output_t>(gpu_result.second); // Explicitly rewrap the gpu value
+    REQUIRE(expected_result[0] == gpu_extremum);
+    REQUIRE((expected_result - host_items.cbegin()) == gpu_result.first);
+  }
+#  endif
 #endif
 }