diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h index 95d513d1723..2c663744fb6 100644 --- a/libcudacxx/include/cuda/std/__internal/namespaces.h +++ b/libcudacxx/include/cuda/std/__internal/namespaces.h @@ -77,6 +77,10 @@ #define _CCCL_BEGIN_NAMESPACE_CUDA_DRIVER _CCCL_BEGIN_NAMESPACE(cuda::__driver) #define _CCCL_END_NAMESPACE_CUDA_DRIVER _CCCL_END_NAMESPACE(cuda::__driver) +// Namespaces related to +#define _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD _CCCL_BEGIN_NAMESPACE(cuda::std::simd) +#define _CCCL_END_NAMESPACE_CUDA_STD_SIMD _CCCL_END_NAMESPACE(cuda::std::simd) + // Namespaces related to #define _CCCL_BEGIN_NAMESPACE_CUDA_STD_RANGES _CCCL_BEGIN_NAMESPACE(cuda::std::ranges) #define _CCCL_END_NAMESPACE_CUDA_STD_RANGES _CCCL_END_NAMESPACE(cuda::std::ranges) diff --git a/libcudacxx/include/cuda/std/__simd/abi.h b/libcudacxx/include/cuda/std/__simd/abi.h new file mode 100644 index 00000000000..3eee5f14692 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/abi.h @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_ABI_H +#define _CUDA_STD___SIMD_ABI_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +using __simd_size_type = ptrdiff_t; + +// [simd.expos.abi], simd ABI tags +template <__simd_size_type _Np> +struct __fixed_size; // internal ABI tag + +template <__simd_size_type _Np> +using fixed_size = __fixed_size<_Np>; // implementation-defined ABI + +template +using native = fixed_size<1>; // implementation-defined ABI + +template +using __deduce_abi_t = fixed_size<_Np>; // exposition-only + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_ABI_H diff --git a/libcudacxx/include/cuda/std/__simd/basic_mask.h b/libcudacxx/include/cuda/std/__simd/basic_mask.h new file mode 100644 index 00000000000..070fc619967 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/basic_mask.h @@ -0,0 +1,349 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_BASIC_MASK_H +#define _CUDA_STD___SIMD_BASIC_MASK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// [simd.mask.class], class template basic_mask +template +class basic_mask : public __mask_operations<_Bytes, _Abi> +{ + static_assert(__is_abi_tag_v<_Abi>, "basic_mask requires a valid ABI tag"); + + template + friend class basic_vec; + + using _Impl = __mask_operations<_Bytes, _Abi>; + using _Storage = typename _Impl::_MaskStorage; + + _Storage __s_; + + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + + _CCCL_API constexpr basic_mask(_Storage __v, __storage_tag_t) noexcept + : __s_{__v} + {} + +public: + using value_type = bool; + using abi_type = _Abi; + + // TODO(fbusato): add simd-iterator + // using iterator = simd-iterator; + // using const_iterator = simd-iterator; + + // constexpr iterator begin() noexcept { return {*this, 0}; } + // constexpr const_iterator begin() const noexcept { return {*this, 0}; } + // constexpr const_iterator cbegin() const noexcept { return {*this, 0}; } + // constexpr default_sentinel_t end() const noexcept { return {}; } + // constexpr default_sentinel_t cend() const noexcept { return {}; } + + static constexpr integral_constant<__simd_size_type, __simd_size_v<__integer_from<_Bytes>, _Abi>> size{}; + + static constexpr auto __usize = size_t{size}; + static constexpr auto __size = __simd_size_type{size}; + + _CCCL_HIDE_FROM_ABI constexpr basic_mask() noexcept = default; + + // [simd.mask.ctor], basic_mask constructors + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(same_as<_Up, value_type>) + _CCCL_API constexpr explicit basic_mask(_Up __v) noexcept + : __s_{_Impl::__broadcast(__v)} + {} + + _CCCL_TEMPLATE(size_t _UBytes, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<__integer_from<_UBytes>, _UAbi> == __size)) + _CCCL_API constexpr explicit basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __size; ++__i) + { + __s_.__set(__i, __x[__i]); + } + } + + _CCCL_TEMPLATE(typename _Generator) + _CCCL_REQUIRES(__can_generate_v) + _CCCL_API constexpr explicit basic_mask(_Generator&& __g) + : __s_{_Impl::__generate(__g)} + {} + + _CCCL_TEMPLATE(typename _Tp) + _CCCL_REQUIRES(same_as<_Tp, bitset<__usize>>) + _CCCL_API constexpr basic_mask(const _Tp& __b) noexcept + : __s_{_Impl::__broadcast(false)} + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __size; ++__i) + { + __s_.__set(__i, static_cast(__b[__i])); + } + } + + _CCCL_TEMPLATE(typename _Tp) + _CCCL_REQUIRES((__cccl_is_unsigned_integer_v<_Tp>) ) + _CCCL_API constexpr explicit basic_mask(_Tp __val) noexcept + : __s_{_Impl::__broadcast(false)} + { + constexpr auto __num_bits = __simd_size_type{__num_bits_v<_Tp>}; + constexpr auto __m = __size < __num_bits ? __size : __num_bits; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __m; ++__i) + { + __s_.__set(__i, static_cast((__val >> __i) & _Tp{1})); + } + } + + // [simd.mask.subscr], basic_mask subscript operators + + [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const + { + _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __size), "Index is out of bounds"); + return static_cast(__s_.__get(__i)); + } + + // TODO(fbusato): subscript with integral indices, requires permute() + // template + // constexpr resize_t operator[](const I& indices) const; + + // [simd.mask.unary], basic_mask unary operators + + [[nodiscard]] _CCCL_API constexpr basic_mask operator!() const noexcept + { + return {_Impl::__bitwise_not(__s_), __storage_tag}; + } + + template + static constexpr bool __has_integer_from_v = + (_ByteSize == 1 || _ByteSize == 2 || _ByteSize == 4 || _ByteSize == 8 +#if _CCCL_HAS_INT128() + || _ByteSize == 16 +#endif // _CCCL_HAS_INT128() + ); + + _CCCL_TEMPLATE(size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator+() const noexcept + { + return static_cast, _Abi>>(*this); + } + + _CCCL_TEMPLATE(size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) + _CCCL_API void operator+() const noexcept = delete; + + _CCCL_TEMPLATE(size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator-() const noexcept + { + return -static_cast, _Abi>>(*this); + } + + _CCCL_TEMPLATE(size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) + _CCCL_API void operator-() const noexcept = delete; + + _CCCL_TEMPLATE(size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator~() const noexcept + { + return ~static_cast, _Abi>>(*this); + } + + _CCCL_TEMPLATE(size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) + _CCCL_API void operator~() const noexcept = delete; + + // [simd.mask.conv], basic_mask conversions + + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES((sizeof(_Up) != _Bytes && __simd_size_v<_Up, _Ap> == __size)) + _CCCL_API constexpr explicit operator basic_vec<_Up, _Ap>() const noexcept + { + basic_vec<_Up, _Ap> __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __size; ++__i) + { + __result.__s_.__set(__i, static_cast<_Up>((*this)[__i])); + } + return __result; + } + + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES((sizeof(_Up) == _Bytes && __simd_size_v<_Up, _Ap> == __size)) + _CCCL_API constexpr operator basic_vec<_Up, _Ap>() const noexcept + { + basic_vec<_Up, _Ap> __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __size; ++__i) + { + __result.__s_.__set(__i, static_cast<_Up>((*this)[__i])); + } + return __result; + } + + [[nodiscard]] _CCCL_API constexpr bitset<__usize> to_bitset() const noexcept + { + bitset<__usize> __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __size; ++__i) + { + __result.set(__i, (*this)[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API constexpr unsigned long long to_ullong() const + { + constexpr __simd_size_type __nbits = __num_bits_v; + if constexpr (__size > __nbits) + { + for (auto __i = __nbits; __i < __size; ++__i) + { + _CCCL_ASSERT(!(*this)[__i], "Bit above unsigned long long width is set"); + } + } + return to_bitset().to_ullong(); + } + + // [simd.mask.binary], basic_mask binary operators + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator&&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__logic_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator||(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__logic_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator|(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator^(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + // [simd.mask.cassign], basic_mask compound assignment + + _CCCL_API friend constexpr basic_mask& operator&=(basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs = __lhs & __rhs; + } + + _CCCL_API friend constexpr basic_mask& operator|=(basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs = __lhs | __rhs; + } + + _CCCL_API friend constexpr basic_mask& operator^=(basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs = __lhs ^ __rhs; + } + + // [simd.mask.comparison], basic_mask comparisons (element-wise) + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator==(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return !(__lhs ^ __rhs); + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator!=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs ^ __rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator>=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs || !__rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator<=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return !__lhs || __rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator>(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs && !__rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator<(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return !__lhs && __rhs; + } + + // TODO(fbusato): [simd.mask.cond], basic_mask exposition only conditional operators + // friend constexpr basic_mask __simd_select_impl( + // const basic_mask&, const basic_mask&, const basic_mask&) noexcept; + // friend constexpr basic_mask __simd_select_impl( + // const basic_mask&, same_as auto, same_as auto) noexcept; + // template + // friend constexpr vec __simd_select_impl( + // const basic_mask&, const T0&, const T1&) noexcept; +}; +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_BASIC_MASK_H diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h new file mode 100644 index 00000000000..f819749e4e7 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h @@ -0,0 +1,526 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_BASIC_VEC_H +#define _CUDA_STD___SIMD_BASIC_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// [simd.class], class template basic_vec +template +class basic_vec : public __simd_operations<_Tp, _Abi> +{ +public: + using value_type = _Tp; + using mask_type = basic_mask; + +private: + static_assert(__is_vectorizable_v<_Tp>, "basic_vec requires a vectorizable type"); + static_assert(__is_abi_tag_v<_Abi>, "basic_vec requires a valid ABI tag"); + + template + friend class basic_mask; + + using _Impl = __simd_operations<_Tp, _Abi>; + using _Storage = typename _Impl::_SimdStorage; + + _Storage __s_; + + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + + _CCCL_API constexpr basic_vec(_Storage __s, __storage_tag_t) noexcept + : __s_{__s} + {} + + // Friend comparison operators (e.g. operator==) cannot access basic_mask's private constructor directly (friendship + // is not transitive). This function is required to access the private constructor of basic_mask. + _CCCL_API static constexpr mask_type __make_mask(typename mask_type::_Storage __s) noexcept + { + return mask_type{__s, mask_type::__storage_tag}; + } + + // operator[] is const only. We need this function to set values + _CCCL_API constexpr void __set(__simd_size_type __i, value_type __v) noexcept + { + __s_.__set(__i, __v); + } + +public: + using abi_type = _Abi; + + // TODO(fbusato): add simd-iterator + // using iterator = simd-iterator; + // using const_iterator = simd-iterator; + + // constexpr iterator begin() noexcept { return {*this, 0}; } + // constexpr const_iterator begin() const noexcept { return {*this, 0}; } + // constexpr const_iterator cbegin() const noexcept { return {*this, 0}; } + // constexpr default_sentinel_t end() const noexcept { return {}; } + // constexpr default_sentinel_t cend() const noexcept { return {}; } + + static constexpr integral_constant<__simd_size_type, __simd_size_v> size{}; + + static constexpr auto __usize = size_t{size}; + static constexpr auto __size = __simd_size_type{size}; + + _CCCL_HIDE_FROM_ABI basic_vec() noexcept = default; + + // [simd.ctor], basic_vec constructors + + // [simd.ctor] value broadcast constructor (explicit overload) + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES((__explicitly_convertible_to) _CCCL_AND(!__is_value_ctor_implicit<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(_Up&& __v) noexcept + : __s_{_Impl::__broadcast(static_cast(__v))} + {} + + // [simd.ctor] value broadcast constructor (implicit overload) + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES((__explicitly_convertible_to) _CCCL_AND(__is_value_ctor_implicit<_Up, value_type>)) + _CCCL_API constexpr basic_vec(_Up&& __v) noexcept + : __s_{_Impl::__broadcast(static_cast(__v))} + {} + + // [simd.ctor] converting constructor from basic_vec (explicit overload) + _CCCL_TEMPLATE(typename _Up, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == __size) _CCCL_AND(__explicitly_convertible_to) + _CCCL_AND(__is_vec_ctor_explicit<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __size; ++__i) + { + __s_.__set(__i, static_cast(__v[__i])); + } + } + + // [simd.ctor] converting constructor from basic_vec (implicit overload) + _CCCL_TEMPLATE(typename _Up, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == __size) _CCCL_AND(__explicitly_convertible_to) + _CCCL_AND(!__is_vec_ctor_explicit<_Up, value_type>)) + _CCCL_API constexpr basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __size; ++__i) + { + __s_.__set(__i, static_cast(__v[__i])); + } + } + + // [simd.ctor] generator constructor + _CCCL_TEMPLATE(typename _Generator) + _CCCL_REQUIRES(__can_generate_v) + _CCCL_API constexpr explicit basic_vec(_Generator&& __g) + : __s_{_Impl::__generate(__g)} + {} + + // [simd.ctor] range constructor + + template + static constexpr bool __is_compatible_range = __is_compatible_range_v; + + // [simd.ctor] range constructor + _CCCL_TEMPLATE(typename _Range, typename... _Flags) + _CCCL_REQUIRES(__is_compatible_range<_Range>) + _CCCL_API constexpr basic_vec(_Range&& __range, flags<_Flags...> = {}) + { + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v, value_type>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + const auto __data = ranges::data(__range); + __assert_load_store_alignment, _Flags...>(__data); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __size; ++__i) + { + __s_.__set(__i, static_cast(__data[__i])); + } + } + + // [simd.ctor] masked range constructor + _CCCL_TEMPLATE(typename _Range, typename... _Flags) + _CCCL_REQUIRES(__is_compatible_range<_Range>) + _CCCL_API constexpr basic_vec(_Range&& __range, const mask_type& __mask, flags<_Flags...> = {}) + { + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v, value_type>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + const auto __data = ranges::data(__range); + __assert_load_store_alignment, _Flags...>(__data); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __size; ++__i) + { + __s_.__set(__i, __mask[__i] ? static_cast(__data[__i]) : value_type()); + } + } + + // TODO(fbusato): add complex constructor + // constexpr basic_vec(const real-type& __reals, const real-type& __imags = {}) noexcept; + + // [simd.subscr], basic_vec subscript operators + + [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const + { + _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __size), "Index is out of bounds"); + return __s_.__get(__i); + } + + // TODO(fbusato): subscript with integral indices, requires permute() + // template + // constexpr resize_t<_Idx::size(), basic_vec> operator[](const _Idx& __indices) const; + + // TODO(fbusato): [simd.complex.access], basic_vec complex accessors + // constexpr real-type real() const noexcept; + // constexpr real-type imag() const noexcept; + // constexpr void real(const real-type& __v) noexcept; + // constexpr void imag(const real-type& __v) noexcept; + + // [simd.unary], basic_vec unary operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_pre_increment<_Up>) + _CCCL_API constexpr basic_vec& operator++() noexcept + { + _Impl::__increment(__s_); + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_post_increment<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator++(int) noexcept + { + const basic_vec __r = *this; + _Impl::__increment(__s_); + return __r; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_pre_decrement<_Up>) + _CCCL_API constexpr basic_vec& operator--() noexcept + { + _Impl::__decrement(__s_); + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_post_decrement<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator--(int) noexcept + { + const basic_vec __r = *this; + _Impl::__decrement(__s_); + return __r; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_negate<_Up>) + [[nodiscard]] _CCCL_API constexpr mask_type operator!() const noexcept + { + return mask_type{_Impl::__negate(__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_not<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator~() const noexcept + { + return basic_vec{_Impl::__bitwise_not(__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_unary_plus<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator+() const noexcept + { + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_unary_minus<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator-() const noexcept + { + return basic_vec{_Impl::__unary_minus(__s_), __storage_tag}; + } + + // [simd.binary], basic_vec binary operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_plus<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator+(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_minus<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator-(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_multiplies<_Up>) + [[nodiscard]] + _CCCL_API friend constexpr basic_vec operator*(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_divides<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator/(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_modulo<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator%(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_and<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator&(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_or<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator|(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_xor<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator^(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator>>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left_size<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs << basic_vec{__n}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right_size<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator>>(const basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs >> basic_vec{__n}; + } + + // [simd.cassign], basic_vec compound assignment + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_plus<_Up>) + _CCCL_API friend constexpr basic_vec& operator+=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs + __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_minus<_Up>) + _CCCL_API friend constexpr basic_vec& operator-=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs - __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_multiplies<_Up>) + _CCCL_API friend constexpr basic_vec& operator*=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs * __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_divides<_Up>) + _CCCL_API friend constexpr basic_vec& operator/=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs / __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_modulo<_Up>) + _CCCL_API friend constexpr basic_vec& operator%=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs % __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_and<_Up>) + _CCCL_API friend constexpr basic_vec& operator&=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs & __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_or<_Up>) + _CCCL_API friend constexpr basic_vec& operator|=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs | __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_xor<_Up>) + _CCCL_API friend constexpr basic_vec& operator^=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs ^ __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left<_Up>) + _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs << __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right<_Up>) + _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs >> __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left_size<_Up>) + _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs = __lhs << __n; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right_size<_Up>) + _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs = __lhs >> __n; + } + + // [simd.comparison], basic_vec compare operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_equal_to<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator==(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __make_mask(_Impl::__equal_to(__lhs.__s_, __rhs.__s_)); + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_not_equal_to<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator!=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __make_mask(_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_)); + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_greater_equal<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator>=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __make_mask(_Impl::__greater_equal(__lhs.__s_, __rhs.__s_)); + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_less_equal<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator<=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __make_mask(_Impl::__less_equal(__lhs.__s_, __rhs.__s_)); + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_greater<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __make_mask(_Impl::__greater(__lhs.__s_, __rhs.__s_)); + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_less<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __make_mask(_Impl::__less(__lhs.__s_, __rhs.__s_)); + } + + // TODO(fbusato): [simd.cond], basic_vec exposition-only conditional operators + // friend constexpr basic_vec __simd_select_impl( + // const mask_type&, const basic_vec&, const basic_vec&) noexcept; +}; + +// GCC fails when the deduction guides are marked __host__ __device__, while it is requires for other cases, e.g. clang +#if !_CCCL_COMPILER(GCC) +# define _CCCL_API_DEDUCTION_GUIDE _CCCL_API +#else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC) vvv +# define _CCCL_API_DEDUCTION_GUIDE +#endif // !_CCCL_COMPILER(GCC) + +// [simd.ctor] deduction guide from contiguous sized range +// Deduces vec, static_cast(ranges::size(r))> +// * it is not possible to use the alias "vec" for the deduction guide +// * "vec" is defined as basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>> +// * where _Np is __simd_size_v<_Tp, __static_range_size_v<_Range>> +_CCCL_TEMPLATE(typename _Range, typename... _Ts) +_CCCL_REQUIRES( + ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND __has_static_size<_Range>) +_CCCL_API_DEDUCTION_GUIDE basic_vec(_Range&&, _Ts...) + -> basic_vec, + __deduce_abi_t, __static_range_size_v<_Range>>>; + +// [simd.ctor] deduction guide from basic_mask +// basic_vec<__integer_from, Abi> is equivalent to decltype(+k): +// * k has type basic_mask<_Bytes, _Abi> +// * +k calls basic_mask::operator+() +// * the return type is basic_vec<__integer_from<_Bp>, _Abi> +// The deduced type is equivalent to decltype(+k), i.e. basic_vec<__integer_from, Abi> +_CCCL_TEMPLATE(size_t _Bytes, typename _Abi) +_CCCL_REQUIRES(__has_unary_plus>) +_CCCL_API_DEDUCTION_GUIDE basic_vec(basic_mask<_Bytes, _Abi>) -> basic_vec<__integer_from<_Bytes>, _Abi>; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_BASIC_VEC_H diff --git a/libcudacxx/include/cuda/std/__simd/concepts.h b/libcudacxx/include/cuda/std/__simd/concepts.h new file mode 100644 index 00000000000..0b4885ef21c --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/concepts.h @@ -0,0 +1,256 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_CONCEPTS_H +#define _CUDA_STD___SIMD_CONCEPTS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// [simd.expos], explicitly-convertible-to concept + +template +_CCCL_CONCEPT __explicitly_convertible_to = _CCCL_REQUIRES_EXPR((_To, _From))((static_cast<_To>(declval<_From>()))); + +// [simd.expos], constexpr-wrapper-like concept + +template +_CCCL_CONCEPT __constexpr_wrapper_like = _CCCL_REQUIRES_EXPR((_Tp))( + requires(convertible_to<_Tp, decltype(_Tp::value)>), + requires(equality_comparable_with<_Tp, decltype(_Tp::value)>), + requires(bool_constant<(_Tp() == _Tp::value)>::value), + requires(bool_constant<(static_cast(_Tp()) == _Tp::value)>::value)); + +// Covers all integral types including character types (char16_t, char32_t, wchar_t, char8_t), +// which are excluded by __cccl_is_integer_v +template +constexpr bool __is_integral__value_preserving_v = + is_integral_v<_From> && is_integral_v<_To> && numeric_limits<_From>::digits <= numeric_limits<_To>::digits + && (!is_signed_v<_From> || is_signed_v<_To>); + +// The conversion from an arithmetic type U to a vectorizable type T is value-preserving if all possible +// values of U can be represented with type T. +template +constexpr bool __is_value_preserving_v = + __is_integral__value_preserving_v<_From, _To> + || (::cuda::is_floating_point_v<_From> && ::cuda::is_floating_point_v<_To> + && __fp_is_implicit_conversion_v<_From, _To>) + || (is_integral_v<_From> && ::cuda::is_floating_point_v<_To> + && numeric_limits<_From>::digits <= numeric_limits<_To>::digits); + +template +constexpr bool __is_constexpr_wrapper_value_preserving_v = false; + +// The standard requires checking whether the specific compile-time value From::value is representable by _ValueType, +// not whether the entire source type is value-preserving. +template +constexpr bool __is_constexpr_wrapper_value_preserving_v<_From, _ValueType, void_t> = + is_arithmetic_v> + && (static_cast>(static_cast<_ValueType>(_From::value)) == _From::value); + +// [simd.ctor] implicit value constructor +// - From is not an arithmetic type and does not satisfy constexpr-wrapper-like, +// - From is an arithmetic type and the conversion from From to value_type is value-preserving +// - From satisfies constexpr-wrapper-like, remove_cvref_t is an arithmetic type, and +// From​::​value is representable by value_type. +template > +_CCCL_CONCEPT __is_value_ctor_implicit = + convertible_to<_Up, _ValueType> + && ((!is_arithmetic_v<_From> && !__constexpr_wrapper_like<_From>) + || (is_arithmetic_v<_From> && __is_value_preserving_v<_From, _ValueType>) + || (__constexpr_wrapper_like<_From> && __is_constexpr_wrapper_value_preserving_v<_From, _ValueType>) ); + +// [conv.rank], integer conversion rank for [simd.ctor] p7 + +template +inline constexpr int __integer_conversion_rank = 0; + +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 2; +template <> +inline constexpr int __integer_conversion_rank = 2; +template <> +inline constexpr int __integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 5; +template <> +inline constexpr int __integer_conversion_rank = 5; +#if _CCCL_HAS_INT128() +template <> +inline constexpr int __integer_conversion_rank<__int128_t> = 6; +template <> +inline constexpr int __integer_conversion_rank<__uint128_t> = 6; +#endif // _CCCL_HAS_INT128() + +// [conv.rank], floating-point conversion rank for [simd.ctor] p7 + +template +inline constexpr int __fp_conversion_rank = 0; + +#if _CCCL_HAS_NVFP16() +template <> +inline constexpr int __fp_conversion_rank<__half> = 1; +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank<__nv_bfloat16> = 1; +#endif // _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank = 2; +template <> +inline constexpr int __fp_conversion_rank = 3; +#if _CCCL_HAS_LONG_DOUBLE() +template <> +inline constexpr int __fp_conversion_rank = 4; +#endif // _CCCL_HAS_LONG_DOUBLE() +#if _CCCL_HAS_FLOAT128() +template <> +inline constexpr int __fp_conversion_rank<__float128> = 5; +#endif // _CCCL_HAS_FLOAT128() + +// [simd.ctor] p7: explicit(see below) for basic_vec(const basic_vec&) +// explicit evaluates to true if either: +// - conversion from U to value_type is not value-preserving, or +// - both U and value_type are integral and integer_conversion_rank(U) > rank(value_type), or +// - both U and value_type are floating-point and fp_conversion_rank(U) > rank(value_type) +template +constexpr bool __is_vec_ctor_explicit = + !__is_value_preserving_v<_Up, _ValueType> + || (is_integral_v<_Up> && is_integral_v<_ValueType> + && __integer_conversion_rank<_Up> > __integer_conversion_rank<_ValueType>) + || (::cuda::is_floating_point_v<_Up> && ::cuda::is_floating_point_v<_ValueType> + && __fp_conversion_rank<_Up> > __fp_conversion_rank<_ValueType>); + +// [simd.unary], operator constraints + +template +_CCCL_CONCEPT __has_pre_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((++__t)); + +template +_CCCL_CONCEPT __has_post_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t++)); + +template +_CCCL_CONCEPT __has_pre_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((--__t)); + +template +_CCCL_CONCEPT __has_post_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t--)); + +template +_CCCL_CONCEPT __has_negate = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((!__t)); + +template +_CCCL_CONCEPT __has_bitwise_not = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((~__t)); + +template +_CCCL_CONCEPT __has_unary_plus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((+__t)); + +template +_CCCL_CONCEPT __has_unary_minus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((-__t)); + +// [simd.binary], binary operator constraints + +template +_CCCL_CONCEPT __has_binary_plus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a + __b)); + +template +_CCCL_CONCEPT __has_binary_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a - __b)); + +template +_CCCL_CONCEPT __has_multiplies = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a * __b)); + +template +_CCCL_CONCEPT __has_divides = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a / __b)); + +template +_CCCL_CONCEPT __has_modulo = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a % __b)); + +template +_CCCL_CONCEPT __has_bitwise_and = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a & __b)); + +template +_CCCL_CONCEPT __has_bitwise_or = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a | __b)); + +template +_CCCL_CONCEPT __has_bitwise_xor = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a ^ __b)); + +template +_CCCL_CONCEPT __has_shift_left = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a << __b)); + +template +_CCCL_CONCEPT __has_shift_right = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >> __b)); + +template +_CCCL_CONCEPT __has_shift_left_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << __simd_size_type{})); + +template +_CCCL_CONCEPT __has_shift_right_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __simd_size_type{})); + +// [simd.comparison], comparison operator constraints + +template +_CCCL_CONCEPT __has_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a == __b)); + +template +_CCCL_CONCEPT __has_not_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a != __b)); + +template +_CCCL_CONCEPT __has_greater_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >= __b)); + +template +_CCCL_CONCEPT __has_less_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a <= __b)); + +template +_CCCL_CONCEPT __has_greater = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a > __b)); + +template +_CCCL_CONCEPT __has_less = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a < __b)); + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_CONCEPTS_H diff --git a/libcudacxx/include/cuda/std/__simd/declaration.h b/libcudacxx/include/cuda/std/__simd/declaration.h new file mode 100644 index 00000000000..49fa867b860 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/declaration.h @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_DECLARATION_H +#define _CUDA_STD___SIMD_DECLARATION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +template > +class basic_vec; + +template >> +class basic_mask; + +template >> +using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>; + +template >> +using mask = basic_mask>; + +// specializations + +template +struct __simd_storage; + +template +struct __simd_operations; + +template +struct __mask_storage; + +template +struct __mask_operations; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_DECLARATION_H diff --git a/libcudacxx/include/cuda/std/__simd/exposition.h b/libcudacxx/include/cuda/std/__simd/exposition.h new file mode 100644 index 00000000000..3edd8f61aa5 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/exposition.h @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_EXPOSITION_H +#define _CUDA_STD___SIMD_EXPOSITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// [simd.expos], exposition-only helpers + +template +using __integer_from = __make_nbit_int_t<_Bytes * 8, true>; + +// all standard integer types, character types, and the types float and double ([basic.fundamental]); +// std​::​float16_t, std​::​float32_t, and std​::​float64_t if defined ([basic.extended.fp]); and +// TODO(fbusato) complex where T is a vectorizable floating-point type. +template +constexpr bool __is_vectorizable_v = + (is_integral_v<_Tp> || ::cuda::is_floating_point_v<_Tp>) + && !is_same_v<_Tp, bool> && !is_const_v<_Tp> && !is_volatile_v<_Tp>; + +template +constexpr __simd_size_type __simd_size_v = 0; + +template +constexpr __simd_size_type __simd_size_v<_Tp, fixed_size<_Np>> = _Np; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_EXPOSITION_H diff --git a/libcudacxx/include/cuda/std/__simd/flag.h b/libcudacxx/include/cuda/std/__simd/flag.h new file mode 100644 index 00000000000..e2bf4b39ac2 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/flag.h @@ -0,0 +1,111 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_FLAG_H +#define _CUDA_STD___SIMD_FLAG_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// [simd.expos], exposition-only flag types + +struct __convert_flag +{}; + +struct __aligned_flag +{}; + +template +struct __overaligned_flag +{ + static_assert(::cuda::__is_valid_alignment(_Np), "Overaligned flag requires a power-of-2 alignment"); +}; + +template +constexpr bool __is_flag_type_v = false; + +template <> +constexpr bool __is_flag_type_v<__convert_flag> = true; + +template <> +constexpr bool __is_flag_type_v<__aligned_flag> = true; + +template +constexpr bool __is_flag_type_v<__overaligned_flag<_Np>> = true; + +template +constexpr size_t __overaligned_value_v = 0; + +template +constexpr size_t __overaligned_value_v<__overaligned_flag<_Np>> = _Np; + +// [simd.flags.overview], class template flags + +template +struct flags +{ + static_assert((true && ... && __is_flag_type_v<_Flags>), + "Every flag type must be one of convert_flag, aligned_flag, or overaligned_flag"); + static_assert((0 + ... + static_cast(__overaligned_value_v<_Flags> != 0)) <= 1, + "At most one overaligned_flag is allowed"); + // we cannot use __is_valid_alignment because 0 has a different meaning + static_assert((true && ... + && (__overaligned_value_v<_Flags> == 0 || ::cuda::is_power_of_two(__overaligned_value_v<_Flags>))), + "Overaligned flag requires a power-of-2 alignment"); + + // [simd.flags.oper], flags operators + template + [[nodiscard]] _CCCL_API friend _CCCL_CONSTEVAL flags<_Flags..., _Other...> operator|(flags, flags<_Other...>) + { + return {}; + } +}; + +// [simd.flags], flag constants + +inline constexpr flags<> flag_default{}; +inline constexpr flags<__convert_flag> flag_convert{}; +inline constexpr flags<__aligned_flag> flag_aligned{}; + +template +constexpr flags<__overaligned_flag<_Np>> flag_overaligned{}; + +template +constexpr bool __has_convert_flag_v = (false || ... || is_same_v<_Flags, __convert_flag>); + +template +constexpr bool __has_aligned_flag_v = (false || ... || is_same_v<_Flags, __aligned_flag>); + +template +constexpr bool __has_overaligned_flag_v = (false || ... || (__overaligned_value_v<_Flags> != 0)); + +template +constexpr size_t __overaligned_alignment_v = (size_t{0} | ... | __overaligned_value_v<_Flags>); + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_FLAG_H diff --git a/libcudacxx/include/cuda/std/__simd/load.h b/libcudacxx/include/cuda/std/__simd/load.h new file mode 100644 index 00000000000..a416dd546c2 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/load.h @@ -0,0 +1,311 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_LOAD_H +#define _CUDA_STD___SIMD_LOAD_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// [simd.loadstore] helper: resolves default V template parameter for load functions +// When _Vp = void (default), resolves to basic_vec<_Up>; otherwise uses the explicit _Vp +template +struct __load_vec_type +{ + using type = _Vp; +}; + +template +struct __load_vec_type +{ + using type = basic_vec<_Up>; +}; + +template +using __load_vec_t = typename __load_vec_type<_Vp, _Up>::type; + +// TODO(fbusato): Important: use ::cuda::is_trivially_copyable_v +template +constexpr bool __are_vectoriazable_v = + is_same_v, remove_cv_t<_TpOut>> // _TpIn == _TpOut + && is_trivially_copyable_v<_TpIn> // byte-copy is fine (memcpy) + && (alignof(_TpIn) >= sizeof(_TpIn)); // e.g. char3 doesn't work: alignof(char3) == 1, sizeof(char3) == 3 + +// [simd.loadstore] helper: core partial load from pointer + count + mask +template +[[nodiscard]] _CCCL_API constexpr _Result +__partial_load_from_ptr(const _Up* __ptr, __simd_size_type __count, const typename _Result::mask_type& __mask) +{ + using _Tp = typename _Result::value_type; + static_assert(same_as, _Result>, "V must not be a reference or cv-qualified type"); + static_assert(__is_vectorizable_v<_Tp> && __is_abi_tag_v, + "V must be an enabled specialization of basic_vec"); + static_assert(__is_vectorizable_v<_Up>, "range_value_t must be a vectorizable type"); + static_assert(__explicitly_convertible_to<_Tp, _Up>, + "range_value_t must satisfy explicitly-convertible-to"); + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Up, _Tp>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + ::cuda::std::simd::__assert_load_store_alignment<_Result, _Up, _Flags...>(__ptr); + _Result __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Result::size; ++__i) + { + if (__mask[__i] && __i < __count) + { + __result.__set(__i, static_cast<_Tp>(__ptr[__i])); + } + } + return __result; +} + +template +[[nodiscard]] _CCCL_API constexpr _Result +__full_load_from_ptr(const _Up* __ptr, const typename _Result::mask_type& __mask) +{ + using _Tp = typename _Result::value_type; + static_assert(same_as, _Result>, "V must not be a reference or cv-qualified type"); + static_assert(__is_vectorizable_v<_Tp> && __is_abi_tag_v, + "V must be an enabled specialization of basic_vec"); + static_assert(__is_vectorizable_v<_Up>, "range_value_t must be a vectorizable type"); + static_assert(__explicitly_convertible_to<_Tp, _Up>, + "range_value_t must satisfy explicitly-convertible-to"); + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Up, _Tp>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + ::cuda::std::simd::__assert_load_store_alignment<_Result, _Up, _Flags...>(__ptr); + if constexpr (::cuda::is_power_of_two(_Result::size) && __are_vectoriazable_v<_Tp, _Up> + && (__has_aligned_flag_v<_Flags...> || __has_overaligned_flag_v<_Flags...>) ) + { + _CCCL_IF_CONSTEVAL + { + return ::cuda::std::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>(__ptr, _Result::size, __mask); + } + else + { + constexpr auto __alignment = alignof(_Tp) * _Result::size; + constexpr auto __ptr_alignment = ::cuda::std::max(__alignment, __overaligned_value_v<_Flags...>); + struct alignas(__alignment) __aligned_t + { + char __data[sizeof(_Result)]; + }; + // nvcc performance bug: memcpy from pointer could not be vectorized + const auto __aligned_ptr = ::cuda::ptr_rebind<__aligned_t>(__ptr); + const auto __data = *::cuda::std::assume_aligned<__ptr_alignment>(__aligned_ptr); + _Result __result; + ::cuda::std::memcpy(&__result, &__data, sizeof(_Result)); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Result::size; ++__i) + { + if (!__mask[__i]) + { + __result.__set(__i, _Tp{}); + } + } + return __result; + } + } + else + { + return ::cuda::std::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>(__ptr, _Result::size, __mask); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] partial_load + +// partial_load: range, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ranges::range_value_t<_Range>> partial_load( + _Range&& __r, + const typename __load_vec_t<_Vp, ranges::range_value_t<_Range>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using __result_t = __load_vec_t<_Vp, ranges::range_value_t<_Range>>; + using __input_t = ranges::range_value_t<_Range>; + return ::cuda::std::simd::__partial_load_from_ptr<__result_t, __input_t, _Flags...>( + ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask); +} + +// partial_load: range, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ranges::range_value_t<_Range>> +partial_load(_Range&& __r, flags<_Flags...> __f = {}) +{ + using __result_t = __load_vec_t<_Vp, ranges::range_value_t<_Range>>; + return ::cuda::std::simd::partial_load<_Vp>( + ::cuda::std::forward<_Range>(__r), typename __result_t::mask_type(true), __f); +} + +// partial_load: iterator + count, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> partial_load( + _Ip __first, + iter_difference_t<_Ip> __n, + const typename __load_vec_t<_Vp, iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>; + using __input_t = iter_value_t<_Ip>; + return ::cuda::std::simd::__partial_load_from_ptr<__result_t, __input_t, _Flags...>( + ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask); +} + +// partial_load: iterator + count, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> +partial_load(_Ip __first, iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>; + return ::cuda::std::simd::partial_load<_Vp>(__first, __n, typename __result_t::mask_type(true), __f); +} + +// partial_load: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> partial_load( + _Ip __first, _Sp __last, const typename __load_vec_t<_Vp, iter_value_t<_Ip>>::mask_type& __mask, flags<_Flags...> = {}) +{ + using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>; + using __input_t = iter_value_t<_Ip>; + return ::cuda::std::simd::__partial_load_from_ptr<__result_t, __input_t, _Flags...>( + ::cuda::std::to_address(__first), static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)), __mask); +} + +// partial_load: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> +partial_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>; + return ::cuda::std::simd::partial_load<_Vp>(__first, __last, typename __result_t::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] unchecked_load + +// unchecked_load: range, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ranges::range_value_t<_Range>> unchecked_load( + _Range&& __r, + const typename __load_vec_t<_Vp, ranges::range_value_t<_Range>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using __result_t = __load_vec_t<_Vp, ranges::range_value_t<_Range>>; + if constexpr (__has_static_size<_Range>) + { + static_assert(__static_range_size_v<_Range> >= __result_t::size(), + "unchecked_load requires ranges::size(r) >= V::size()"); + } + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), __result_t::size()), + "unchecked_load requires ranges::size(r) >= V::size()"); + return ::cuda::std::simd::partial_load<_Vp>(::cuda::std::forward<_Range>(__r), __mask, __f); +} + +// unchecked_load: range, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ranges::range_value_t<_Range>> +unchecked_load(_Range&& __r, flags<_Flags...> __f = {}) +{ + using __result_t = __load_vec_t<_Vp, ranges::range_value_t<_Range>>; + return ::cuda::std::simd::unchecked_load<_Vp>( + ::cuda::std::forward<_Range>(__r), typename __result_t::mask_type(true), __f); +} + +// unchecked_load: iterator + count, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> unchecked_load( + _Ip __first, + iter_difference_t<_Ip> __n, + const typename __load_vec_t<_Vp, iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>; + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, __result_t::size()), "unchecked_load requires n >= V::size()"); + return ::cuda::std::simd::partial_load<_Vp>(__first, __n, __mask, __f); +} + +// unchecked_load: iterator + count, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> +unchecked_load(_Ip __first, iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>; + return ::cuda::std::simd::unchecked_load<_Vp>(__first, __n, typename __result_t::mask_type(true), __f); +} + +// unchecked_load: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> unchecked_load( + _Ip __first, + _Sp __last, + const typename __load_vec_t<_Vp, iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>; + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), __result_t::size()), + "unchecked_load requires distance(first, last) >= V::size()"); + return ::cuda::std::simd::partial_load<_Vp>(__first, __last, __mask, __f); +} + +// unchecked_load: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> +unchecked_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>; + return ::cuda::std::simd::unchecked_load<_Vp>(__first, __last, typename __result_t::mask_type(true), __f); +} + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_LOAD_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h new file mode 100644 index 00000000000..55c69c27863 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h @@ -0,0 +1,232 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// Bool-per-element mask storage for fixed_size ABI +template +struct __mask_storage<_Bytes, __fixed_size<_Np>> +{ + static constexpr size_t __element_bytes = _Bytes; + + bool __data[_Np]{}; // initialization required for constexpr constructor + + [[nodiscard]] _CCCL_API constexpr bool __get(__simd_size_type __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + return __data[__idx]; + } + + _CCCL_API constexpr void __set(__simd_size_type __idx, bool __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + __data[__idx] = __v; + } +}; + +// Mask operations for fixed_size ABI with bool-per-element storage +template +struct __mask_operations<_Bytes, __fixed_size<_Np>> +{ + using _MaskStorage = __mask_storage<_Bytes, __fixed_size<_Np>>; + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __v; + } + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __generate_init(_Generator&& __g, integer_sequence<__simd_size_type, _Is...>) + { + _MaskStorage __result{}; + ((__result.__data[_Is] = static_cast(__g(integral_constant<__simd_size_type, _Is>()))), ...); + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) + { + return __generate_init(__g, make_integer_sequence<__simd_size_type, _Np>()); + } + + // Logical operators (for operator&& and operator||) + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __logic_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __logic_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i]; + } + return __result; + } + + // Bitwise operators (for operator&, operator|, operator^) + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] != __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = !__s.__data[__i]; + } + return __result; + } + + // Reductions + + [[nodiscard]] _CCCL_API static constexpr bool __all(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + if (!__s.__data[__i]) + { + return false; + } + } + return true; + } + + [[nodiscard]] _CCCL_API static constexpr bool __any(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + if (__s.__data[__i]) + { + return true; + } + } + return false; + } + + [[nodiscard]] _CCCL_API static constexpr __simd_size_type __count(const _MaskStorage& __s) noexcept + { + __simd_size_type __count = 0; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __count += static_cast<__simd_size_type>(__s.__data[__i]); + } + return __count; + } + + [[nodiscard]] _CCCL_API static constexpr __simd_size_type __min_index(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + if (__s.__data[__i]) + { + return __i; + } + } + _CCCL_UNREACHABLE(); + } + + [[nodiscard]] _CCCL_API static constexpr __simd_size_type __max_index(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = _Np - 1; __i >= 0; --__i) + { + if (__s.__data[__i]) + { + return __i; + } + } + _CCCL_UNREACHABLE(); + } +}; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h new file mode 100644 index 00000000000..a803c2bc510 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -0,0 +1,351 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_VEC_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +template <__simd_size_type _Np> +struct __fixed_size +{ + static_assert(_Np > 0, "_Np must be greater than 0"); + + static constexpr __simd_size_type __simd_size = _Np; +}; + +// Element-per-slot simd storage for fixed_size ABI +template +struct __simd_storage<_Tp, __fixed_size<_Np>> +{ + using value_type = _Tp; + + _Tp __data[_Np]{}; // initialization required for constexpr constructor + + [[nodiscard]] _CCCL_API constexpr _Tp __get(__simd_size_type __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + return __data[__idx]; + } + + _CCCL_API constexpr void __set(__simd_size_type __idx, _Tp __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + __data[__idx] = __v; + } +}; + +// Simd operations for fixed_size ABI +template +struct __simd_operations<_Tp, __fixed_size<_Np>> +{ + using _SimdStorage = __simd_storage<_Tp, __fixed_size<_Np>>; + using _MaskStorage = __mask_storage>; + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __v; + } + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __generate_init(_Generator&& __g, integer_sequence<__simd_size_type, _Is...>) + { + return _SimdStorage{{__g(integral_constant<__simd_size_type, _Is>())...}}; + } + + template + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g) + { + return __generate_init(__g, make_integer_sequence<__simd_size_type, _Np>()); + } + + // Unary operations + + _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + ++__s.__data[__i]; + } + } + + _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + --__s.__data[__i]; + } + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __negate(const _SimdStorage& __s) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = !__s.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_not(const _SimdStorage& __s) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = ~__s.__data[__i]; + } + return __result; + } + + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = -__s.__data[__i]; + } + return __result; + } + _CCCL_DIAG_POP + + // Binary arithmetic operations + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] - __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] * __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __divides(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] / __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __modulo(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] % __rhs.__data[__i]); + } + return __result; + } + + // Comparison operations + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] == __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __not_equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] != __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __less(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] < __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __less_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] <= __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __greater(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] > __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __greater_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] >= __rhs.__data[__i]); + } + return __result; + } + + // Bitwise and shift operations + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __bitwise_and(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] & __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __bitwise_or(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] | __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __bitwise_xor(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] ^ __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __shift_left(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] << __rhs.__data[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __shift_right(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] >> __rhs.__data[__i]); + } + return __result; + } +}; +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_VEC_H diff --git a/libcudacxx/include/cuda/std/__simd/store.h b/libcudacxx/include/cuda/std/__simd/store.h new file mode 100644 index 00000000000..ca4f3d615de --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/store.h @@ -0,0 +1,240 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_STORE_H +#define _CUDA_STD___SIMD_STORE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// [simd.loadstore] helper: core partial store to pointer + count + mask +template +_CCCL_API constexpr void __partial_store_to_ptr( + const basic_vec<_Tp, _Abi>& __v, + _Up* __ptr, + __simd_size_type __count, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask) +{ + static_assert(__is_vectorizable_v<_Up>, "range_value_t must be a vectorizable type"); + static_assert(__explicitly_convertible_to<_Up, _Tp>, + "value_type must satisfy explicitly-convertible-to>"); + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Tp, _Up>, + "Conversion from value_type to range_value_t is not value-preserving; use flag_convert"); + ::cuda::std::simd::__assert_load_store_alignment, _Up, _Flags...>(__ptr); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < basic_vec<_Tp, _Abi>::size; ++__i) + { + if (__mask[__i] && __i < __count) + { + __ptr[__i] = static_cast<_Up>(__v[__i]); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] partial_store + +// partial_store: range, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND + __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Range&& __r, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert(indirectly_writable, ranges::range_value_t<_Range>>, + "ranges::iterator_t must model indirectly_writable>"); + using _Up = ranges::range_value_t<_Range>; + ::cuda::std::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask); +} + +// partial_store: range, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND + __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void partial_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {}) +{ + ::cuda::std::simd::partial_store( + __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// partial_store: iterator + count, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + iter_difference_t<_Ip> __n, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert(indirectly_writable<_Ip, iter_value_t<_Ip>>, "I must model indirectly_writable>"); + using _Up = iter_value_t<_Ip>; + ::cuda::std::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask); +} + +// partial_store: iterator + count, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void +partial_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + ::cuda::std::simd::partial_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// partial_store: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + _Sp __last, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert(indirectly_writable<_Ip, iter_value_t<_Ip>>, "I must model indirectly_writable>"); + using _Up = iter_value_t<_Ip>; + ::cuda::std::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, + ::cuda::std::to_address(__first), + static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)), + __mask); +} + +// partial_store: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void +partial_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + ::cuda::std::simd::partial_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] unchecked_store + +// unchecked_store: range, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND + __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Range&& __r, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + if constexpr (__has_static_size<_Range>) + { + static_assert(__static_range_size_v<_Range> >= basic_vec<_Tp, _Abi>::size(), + "unchecked_store requires ranges::size(r) >= V::size()"); + } + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), __v.size), + "unchecked_store requires ranges::size(r) >= V::size()"); + ::cuda::std::simd::partial_store(__v, ::cuda::std::forward<_Range>(__r), __mask, __f); +} + +// unchecked_store: range, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND + __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {}) +{ + ::cuda::std::simd::unchecked_store( + __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// unchecked_store: iterator + count, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + iter_difference_t<_Ip> __n, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, __v.size), "unchecked_store requires n >= V::size()"); + ::cuda::std::simd::partial_store(__v, __first, __n, __mask, __f); +} + +// unchecked_store: iterator + count, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void +unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + ::cuda::std::simd::unchecked_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// unchecked_store: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + _Sp __last, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), __v.size), + "unchecked_store requires distance(first, last) >= V::size()"); + ::cuda::std::simd::partial_store(__v, __first, __last, __mask, __f); +} + +// unchecked_store: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to, _Tp>) +_CCCL_API constexpr void +unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + ::cuda::std::simd::unchecked_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_STORE_H diff --git a/libcudacxx/include/cuda/std/__simd/type_traits.h b/libcudacxx/include/cuda/std/__simd/type_traits.h new file mode 100644 index 00000000000..d169461ce9c --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/type_traits.h @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_TYPE_TRAITS_H +#define _CUDA_STD___SIMD_TYPE_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// [simd.traits], alignment +template +struct alignment; + +template +struct alignment, _Up> + : integral_constant * alignof(_Up)) + ? __simd_size_v<_Tp, _Abi> * alignof(_Up) + : alignof(_Up)> +{ + static_assert(__is_vectorizable_v<_Up>, "U must be a vectorizable type"); +}; + +template +constexpr size_t alignment_v = alignment<_Tp, _Up>::value; + +// [simd.traits], rebind +template +struct rebind; + +template +struct rebind<_Tp, basic_vec<_Up, _Abi>> +{ + static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type"); + using type = basic_vec<_Tp, __deduce_abi_t<_Tp, __simd_size_v<_Up, _Abi>>>; +}; + +template +struct rebind<_Tp, basic_mask<_Bytes, _Abi>> +{ + static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type"); + using __integer_t = __integer_from; + using __integer_bytes_t = __integer_from<_Bytes>; + + using type = basic_mask>>; +}; + +template +using rebind_t = typename rebind<_Tp, _Vp>::type; + +// [simd.traits], resize +template <__simd_size_type _Np, typename _Vp> +struct resize; + +template <__simd_size_type _Np, typename _Tp, typename _Abi> +struct resize<_Np, basic_vec<_Tp, _Abi>> +{ + using type = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>; +}; + +template <__simd_size_type _Np, size_t _Bytes, typename _Abi> +struct resize<_Np, basic_mask<_Bytes, _Abi>> +{ + using type = basic_mask<_Bytes, __deduce_abi_t<__integer_from<_Bytes>, _Np>>; +}; + +template <__simd_size_type _Np, typename _Vp> +using resize_t = typename resize<_Np, _Vp>::type; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_TYPE_TRAITS_H diff --git a/libcudacxx/include/cuda/std/__simd/utility.h b/libcudacxx/include/cuda/std/__simd/utility.h new file mode 100644 index 00000000000..9b9c9f708c3 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/utility.h @@ -0,0 +1,153 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_UTILITY_H +#define _CUDA_STD___SIMD_UTILITY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +template +constexpr bool __is_abi_tag_v = false; + +template <__simd_size_type _Np> +constexpr bool __is_abi_tag_v<__fixed_size<_Np>> = true; + +//---------------------------------------------------------------------------------------------------------------------- +// __can_generate_v + +template +constexpr bool __is_well_formed = false; + +template +constexpr bool __is_well_formed<_Tp, + _Generator, + _Idx, + void_t()(integral_constant<__simd_size_type, _Idx>()))>> = + is_convertible_v()(integral_constant<__simd_size_type, _Idx>())), _Tp>; + +template +[[nodiscard]] +_CCCL_API constexpr bool __can_generate(integer_sequence<__simd_size_type, _Indices...>) noexcept +{ + return (true && ... && __is_well_formed<_Tp, _Generator, _Indices>); +} + +template +constexpr bool __can_generate_v = __can_generate<_Tp, _Generator>(make_integer_sequence<__simd_size_type, _Size>()); + +//---------------------------------------------------------------------------------------------------------------------- +// __is_compatible_range_v + +template +constexpr bool __has_tuple_size_v = false; + +template +constexpr bool __has_tuple_size_v<_Range, void_t>::value)>> = true; + +template +constexpr bool __has_static_extent_v = false; + +template +constexpr bool __has_static_extent_v<_Range, void_t::extent)>> = + remove_cvref_t<_Range>::extent != dynamic_extent; + +// Proxy for ranges::size(r) is a constant expression. +template +_CCCL_CONCEPT __has_static_size = __has_tuple_size_v<_Range> || __has_static_extent_v<_Range>; + +template +[[nodiscard]] _CCCL_API constexpr __simd_size_type __get_static_range_size() noexcept +{ + using __range_t = remove_cvref_t<_Range>; + if constexpr (__has_tuple_size_v<_Range>) + { + return __simd_size_type{tuple_size_v<__range_t>}; + } + else + { + return __simd_size_type{__range_t::extent}; + } +} + +template +constexpr __simd_size_type __static_range_size_v = __get_static_range_size<_Range>(); + +// This trait is defined at namespace scope (not as a static member of basic_vec) because GCC 13 rejects partial +// specialization of static member variable templates. The static-size detection intentionally avoids directly using +// tuple_size_v in the guard because that causes a hard error (instead of SFINAE) on NVCC with +// clang-19/clang-14/nvc++ when T is an incomplete specialization of tuple_size. +template +constexpr bool __is_compatible_range_guard_v = + __has_static_size<_Range> && ranges::contiguous_range<_Range> && ranges::sized_range<_Range>; + +template > +constexpr bool __is_compatible_range_v = false; + +template +constexpr bool __is_compatible_range_v<_Tp, _Size, _Range, true> = + (__static_range_size_v<_Range> == _Size) // + && __is_vectorizable_v> // + && __explicitly_convertible_to<_Tp, ranges::range_value_t<_Range>>; + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.flags] alignment assertion for load/store pointers + +template +_CCCL_API constexpr void __assert_load_store_alignment([[maybe_unused]] const _Up* __data) noexcept +{ + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (__has_aligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, alignment_v<_Vec, _Up>), + "flag_aligned requires data to be aligned to alignment_v>"); + } + else if constexpr (__has_overaligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, __overaligned_alignment_v<_Flags...>), + "flag_overaligned requires data to be aligned to N"); + } + } +} + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_UTILITY_H diff --git a/libcudacxx/include/cuda/std/__simd_ b/libcudacxx/include/cuda/std/__simd_ new file mode 100644 index 00000000000..02b28686752 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd_ @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD_SIMD +#define _CUDA_STD_SIMD + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#endif // _CUDA_STD_SIMD diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/flags.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/flags.pass.cpp new file mode 100644 index 00000000000..5b2ff1dc3bf --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/flags.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.flags], load and store flags +// +// template struct flags; +// +// inline constexpr flags<> flag_default{}; +// inline constexpr flags flag_convert{}; +// inline constexpr flags flag_aligned{}; +// template constexpr flags> flag_overaligned{}; +// +// [simd.flags.oper], flags operators +// template friend constexpr flags operator|(flags, flags); + +#include +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +template +__host__ __device__ constexpr bool is_same_flags(const T&, const R&) +{ + return cuda::std::is_same_v, cuda::std::remove_cvref_t>; +} + +__host__ __device__ void test() +{ + static_assert(is_same_flags(simd::flag_default, simd::flags<>{})); + // default | X == X + static_assert(is_same_flags(simd::flag_default | simd::flag_convert, simd::flag_convert)); + static_assert(is_same_flags(simd::flag_default | simd::flag_aligned, simd::flag_aligned)); + static_assert(is_same_flags(simd::flag_default | simd::flag_overaligned<32>, simd::flag_overaligned<32>)); + + // X | default == X + static_assert(is_same_flags(simd::flag_convert | simd::flag_default, simd::flag_convert)); + static_assert(is_same_flags(simd::flag_aligned | simd::flag_default, simd::flag_aligned)); + static_assert(is_same_flags(simd::flag_overaligned<32> | simd::flag_default, simd::flag_overaligned<32>)); + + // two distinct flags + static_assert(is_same_flags(simd::flag_convert | simd::flag_aligned, simd::flag_convert | simd::flag_aligned)); + static_assert(is_same_flags(simd::flag_aligned | simd::flag_convert, simd::flag_aligned | simd::flag_convert)); + static_assert( + is_same_flags(simd::flag_convert | simd::flag_overaligned<32>, simd::flag_convert | simd::flag_overaligned<32>)); + static_assert( + is_same_flags(simd::flag_overaligned<32> | simd::flag_convert, simd::flag_overaligned<32> | simd::flag_convert)); +} + +int main(int, char**) +{ + test(); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/multiple_overaligned.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/multiple_overaligned.fail.cpp new file mode 100644 index 00000000000..ab5e00a0384 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/multiple_overaligned.fail.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// flags allows at most one overaligned_flag. + +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +int main(int, char**) +{ + [[maybe_unused]] auto bad = simd::flag_overaligned<8> | simd::flag_overaligned<16>; // expected-error + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_flag_type.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_flag_type.fail.cpp new file mode 100644 index 00000000000..e6db1323f17 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_flag_type.fail.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// flags requires every type in the pack to be one of +// convert_flag, aligned_flag, or overaligned_flag. + +#include + +#include "test_macros.h" + +int main(int, char**) +{ + [[maybe_unused]] cuda::std::simd::flags bad{}; // expected-error + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_power_of_two_overaligned.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_power_of_two_overaligned.fail.cpp new file mode 100644 index 00000000000..43514208373 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_power_of_two_overaligned.fail.cpp @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// overaligned_flag requires N to be a power of two. + +#include + +#include "test_macros.h" + +int main(int, char**) +{ + [[maybe_unused]] auto bad = cuda::std::simd::flag_overaligned<3>; // expected-error + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/binary.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/binary.pass.cpp new file mode 100644 index 00000000000..b321e44c520 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/binary.pass.cpp @@ -0,0 +1,138 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.mask.binary], basic_mask binary operators +// +// friend constexpr basic_mask operator&&(const basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask operator||(const basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask operator&(const basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask operator|(const basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask operator^(const basic_mask&, const basic_mask&) noexcept; + +#include "../simd_test_utils.h" + +template +__host__ __device__ constexpr void test_all_patterns() +{ + using Mask = simd::basic_mask>; + Mask all_true(true); + Mask all_false(false); + + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(all_true && all_false)); + static_assert(noexcept(all_true || all_false)); + static_assert(noexcept(all_true & all_false)); + static_assert(noexcept(all_true | all_false)); + static_assert(noexcept(all_true ^ all_false)); + // logical AND + { + Mask and_result_tt = all_true && all_true; + Mask and_result_ft = all_false && all_true; + Mask and_result_tf = all_true && all_false; + Mask and_result_ff = all_false && all_false; + for (int i = 0; i < N; ++i) + { + assert(and_result_tt[i] == true); + assert(and_result_ft[i] == false); + assert(and_result_tf[i] == false); + assert(and_result_ff[i] == false); + } + } + // logical OR + { + Mask or_result_tt = all_true || all_true; + Mask or_result_ft = all_false || all_true; + Mask or_result_tf = all_true || all_false; + Mask or_result_ff = all_false || all_false; + for (int i = 0; i < N; ++i) + { + assert(or_result_tt[i] == true); + assert(or_result_ft[i] == true); + assert(or_result_tf[i] == true); + assert(or_result_ff[i] == false); + } + } + // bitwise AND + { + Mask bit_and_result_tt = all_true & all_true; + Mask bit_and_result_ft = all_false & all_true; + Mask bit_and_result_tf = all_true & all_false; + Mask bit_and_result_ff = all_false & all_false; + for (int i = 0; i < N; ++i) + { + assert(bit_and_result_tt[i] == true); + assert(bit_and_result_ft[i] == false); + assert(bit_and_result_tf[i] == false); + assert(bit_and_result_ff[i] == false); + } + } + // bitwise OR + { + Mask bit_or_result_tt = all_true | all_true; + Mask bit_or_result_ft = all_false | all_true; + Mask bit_or_result_tf = all_true | all_false; + Mask bit_or_result_ff = all_false | all_false; + for (int i = 0; i < N; ++i) + { + assert(bit_or_result_tt[i] == true); + assert(bit_or_result_ft[i] == true); + assert(bit_or_result_tf[i] == true); + assert(bit_or_result_ff[i] == false); + } + } + // bitwise XOR + { + Mask bit_xor_result_tt = all_true ^ all_true; + Mask bit_xor_result_ft = all_false ^ all_true; + Mask bit_xor_result_tf = all_true ^ all_false; + Mask bit_xor_result_ff = all_false ^ all_false; + for (int i = 0; i < N; ++i) + { + assert(bit_xor_result_tt[i] == false); + assert(bit_xor_result_ft[i] == true); + assert(bit_xor_result_tf[i] == true); + assert(bit_xor_result_ff[i] == false); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_bytes() +{ + test_all_patterns(); + test_all_patterns(); +} + +__host__ __device__ constexpr bool test() +{ + test_bytes<1>(); + test_bytes<2>(); + test_bytes<4>(); + test_bytes<8>(); +#if _CCCL_HAS_INT128() + test_bytes<16>(); +#endif + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/comparison.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/comparison.pass.cpp new file mode 100644 index 00000000000..fd9d6ff21b4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/comparison.pass.cpp @@ -0,0 +1,159 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.mask.comparison], basic_mask comparisons (element-wise) +// +// friend constexpr basic_mask operator==(const basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask operator!=(const basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask operator>=(const basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask operator<=(const basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask operator>(const basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask operator<(const basic_mask&, const basic_mask&) noexcept; + +#include "../simd_test_utils.h" + +//---------------------------------------------------------------------------------------------------------------------- +// identical masks + +template +__host__ __device__ constexpr void test_all_patterns() +{ + using Mask = simd::basic_mask>; + Mask all_true(true); + Mask all_false(false); + + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v= all_false), Mask>); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v all_false), Mask>); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(all_true == all_false)); + static_assert(noexcept(all_true != all_false)); + static_assert(noexcept(all_true >= all_false)); + static_assert(noexcept(all_true <= all_false)); + static_assert(noexcept(all_true > all_false)); + static_assert(noexcept(all_true < all_false)); + + // operator== + { + Mask eq_result_tt = all_true == all_true; + Mask eq_result_ft = all_false == all_true; + Mask eq_result_tf = all_true == all_false; + Mask eq_result_ff = all_false == all_false; + for (int i = 0; i < N; ++i) + { + assert(eq_result_tt[i] == true); + assert(eq_result_ft[i] == false); + assert(eq_result_tf[i] == false); + assert(eq_result_ff[i] == true); + } + } + // operator!= + { + Mask ne_result_tt = all_true != all_true; + Mask ne_result_ft = all_false != all_true; + Mask ne_result_tf = all_true != all_false; + Mask ne_result_ff = all_false != all_false; + for (int i = 0; i < N; ++i) + { + assert(ne_result_tt[i] == false); + assert(ne_result_ft[i] == true); + assert(ne_result_tf[i] == true); + assert(ne_result_ff[i] == false); + } + } + // operator>= + { + Mask ge_result_tt = all_true >= all_true; + Mask ge_result_ft = all_false >= all_true; + Mask ge_result_tf = all_true >= all_false; + Mask ge_result_ff = all_false >= all_false; + for (int i = 0; i < N; ++i) + { + assert(ge_result_tt[i] == true); + assert(ge_result_ft[i] == false); + assert(ge_result_tf[i] == true); + assert(ge_result_ff[i] == true); + } + } + // operator<= + { + Mask le_result_tt = all_true <= all_true; + Mask le_result_ft = all_false <= all_true; + Mask le_result_tf = all_true <= all_false; + Mask le_result_ff = all_false <= all_false; + for (int i = 0; i < N; ++i) + { + assert(le_result_tt[i] == true); + assert(le_result_ft[i] == true); + assert(le_result_tf[i] == false); + assert(le_result_ff[i] == true); + } + } + // operator> + { + Mask gt_result_tt = all_true > all_true; + Mask gt_result_ft = all_false > all_true; + Mask gt_result_tf = all_true > all_false; + Mask gt_result_ff = all_false > all_false; + for (int i = 0; i < N; ++i) + { + assert(gt_result_tt[i] == false); + assert(gt_result_ft[i] == false); + assert(gt_result_tf[i] == true); + assert(gt_result_ff[i] == false); + } + } + // operator< + { + Mask lt_result_tt = all_true < all_true; + Mask lt_result_ft = all_false < all_true; + Mask lt_result_tf = all_true < all_false; + Mask lt_result_ff = all_false < all_false; + for (int i = 0; i < N; ++i) + { + assert(lt_result_tt[i] == false); + assert(lt_result_ft[i] == true); + assert(lt_result_tf[i] == false); + assert(lt_result_ff[i] == false); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_bytes() +{ + test_all_patterns(); + test_all_patterns(); +} + +__host__ __device__ constexpr bool test() +{ + test_bytes<1>(); + test_bytes<2>(); + test_bytes<4>(); + test_bytes<8>(); +#if _CCCL_HAS_INT128() + test_bytes<16>(); +#endif + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/compound_assign.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/compound_assign.pass.cpp new file mode 100644 index 00000000000..de635f4ac0d --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/compound_assign.pass.cpp @@ -0,0 +1,107 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.mask.cassign], basic_mask compound assignment +// +// friend constexpr basic_mask& operator&=(basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask& operator|=(basic_mask&, const basic_mask&) noexcept; +// friend constexpr basic_mask& operator^=(basic_mask&, const basic_mask&) noexcept; + +#include "../simd_test_utils.h" + +template +__host__ __device__ constexpr void test_and() +{ + using Mask = simd::basic_mask>; + Mask a(is_even{}); + Mask b(is_first_half{}); + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a &= b)); + + a &= b; + assert(a[0] == true); + assert(a[1] == false); + assert(a[2] == false); + assert(a[3] == false); +} + +template +__host__ __device__ constexpr void test_or() +{ + using Mask = simd::basic_mask>; + Mask a(is_even{}); + Mask b(is_first_half{}); + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a |= b)); + + a |= b; + assert(a[0] == true); + assert(a[1] == true); + assert(a[2] == true); + assert(a[3] == false); +} + +template +__host__ __device__ constexpr void test_xor() +{ + using Mask = simd::basic_mask>; + Mask a(is_even{}); + Mask b(is_first_half{}); + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a ^= b)); + + a ^= b; + assert(a[0] == false); + assert(a[1] == true); + assert(a[2] == true); + assert(a[3] == false); +} + +//---------------------------------------------------------------------------------------------------------------------- + +__host__ __device__ constexpr bool test() +{ + test_and<1>(); + test_and<2>(); + test_and<4>(); + test_and<8>(); +#if _CCCL_HAS_INT128() + test_and<16>(); +#endif + + test_or<1>(); + test_or<2>(); + test_or<4>(); + test_or<8>(); +#if _CCCL_HAS_INT128() + test_or<16>(); +#endif + + test_xor<1>(); + test_xor<2>(); + test_xor<4>(); + test_xor<8>(); +#if _CCCL_HAS_INT128() + test_xor<16>(); +#endif + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/conversion.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/conversion.pass.cpp new file mode 100644 index 00000000000..b158e21f6c0 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/conversion.pass.cpp @@ -0,0 +1,194 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.mask.conv], basic_mask conversions +// +// explicit operator basic_vec() const noexcept; // sizeof(U) != Bytes +// operator basic_vec() const noexcept; // sizeof(U) == Bytes (implicit) +// constexpr bitset to_bitset() const noexcept; +// constexpr unsigned long long to_ullong() const; + +#include "../simd_test_utils.h" + +//---------------------------------------------------------------------------------------------------------------------- +// implicit conversion to basic_vec (sizeof(U) == Bytes) + +template +__host__ __device__ constexpr void test_implicit_conv() +{ + using Mask = simd::basic_mask>; + using Vec = simd::basic_vec>; + Mask mask(is_even{}); + + static_assert(cuda::std::is_convertible_v); + static_assert(noexcept(static_cast(mask))); + + Vec vec = mask; + for (int i = 0; i < N; ++i) + { + assert(vec[i] == static_cast(i % 2 == 0)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// explicit conversion to basic_vec (sizeof(U) != Bytes) + +template +__host__ __device__ constexpr void test_explicit_conv() +{ + static_assert(sizeof(U) != Bytes); + using Mask = simd::basic_mask>; + using Vec = simd::basic_vec>; + Mask mask(is_even{}); + + static_assert(!cuda::std::is_convertible_v); + static_assert(cuda::std::is_same_v(mask)), Vec>); + static_assert(noexcept(static_cast(mask))); + + Vec vec = static_cast(mask); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == static_cast(i % 2 == 0)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// to_bitset + +template +__host__ __device__ constexpr void test_to_bitset() +{ + using Mask = simd::basic_mask>; + Mask mask(true); + + static_assert(cuda::std::is_same_v>); + static_assert(noexcept(mask.to_bitset())); + static_assert(is_const_member_function_v); + unused(mask); + + Mask all_false(false); + auto bitset_false = all_false.to_bitset(); + assert(bitset_false.none()); + + Mask all_true(true); + auto bitset_true = all_true.to_bitset(); + assert(bitset_true.all()); + + Mask mixed(is_even{}); + auto bitset = mixed.to_bitset(); + for (int i = 0; i < N; ++i) + { + assert(bitset[i] == (i % 2 == 0)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// to_ullong + +template +__host__ __device__ constexpr void test_to_ullong() +{ + using Mask = simd::basic_mask>; + Mask mask(true); + + static_assert(cuda::std::is_same_v); + static_assert(!noexcept(mask.to_ullong())); + static_assert(is_const_member_function_v); + unused(mask); + + Mask all_false(false); + assert(all_false.to_ullong() == 0ULL); + + if constexpr (N <= 64) + { + Mask all_true(true); + constexpr unsigned long long expected = (N == 64) ? ~0ULL : (~0ULL >> (64 - N)); + assert(all_true.to_ullong() == expected); + + Mask mixed(is_even{}); + unsigned long long expected_mixed = 0ULL; + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { + expected_mixed |= (1ULL << i); + } + } + assert(mixed.to_ullong() == expected_mixed); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// SFINAE constraints + +template +__host__ __device__ constexpr void test_sfinae_negative() +{ + using Mask = simd::basic_mask>; + using Integer = integer_from_t; + + // mismatched element count: conversion is fully rejected + using WrongSizeVec = simd::basic_vec>; + static_assert(!cuda::std::is_constructible_v); + static_assert(!cuda::std::is_convertible_v); +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_size() +{ + test_to_bitset(); + test_to_ullong(); + test_sfinae_negative(); +} + +template +__host__ __device__ constexpr void test_bytes() +{ + test_size(); + test_size(); +} + +__host__ __device__ constexpr bool test() +{ + test_bytes<1>(); + test_bytes<2>(); + test_bytes<4>(); + test_bytes<8>(); +#if _CCCL_HAS_INT128() + test_bytes<16>(); +#endif + + test_implicit_conv(); + test_implicit_conv(); + test_implicit_conv(); + test_implicit_conv(); + + test_explicit_conv<4, short, 4>(); + test_explicit_conv<2, int, 4>(); + test_explicit_conv<8, int, 4>(); + test_explicit_conv<4, long long, 4>(); + test_explicit_conv<1, short, 4>(); + test_explicit_conv<2, long long, 8>(); + + test_to_ullong<1, 64>(); + test_to_ullong<1, 65>(); + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp new file mode 100644 index 00000000000..def96e06e22 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp @@ -0,0 +1,271 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.mask.ctor], basic_mask constructors +// +// constexpr explicit basic_mask(value_type) noexcept; // broadcast +// constexpr explicit basic_mask(const basic_mask&) noexcept; // converting +// constexpr explicit basic_mask(Generator&&); // generator +// constexpr basic_mask(const bitset&) noexcept; // bitset +// constexpr explicit basic_mask(unsigned-integer) noexcept; // unsigned integer + +#include +#include + +#include "../simd_test_utils.h" + +//---------------------------------------------------------------------------------------------------------------------- +// member types and size + +template +__host__ __device__ constexpr void test_member_types() +{ + using Mask = simd::basic_mask>; + + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v>); + static_assert(Mask::size() == N); + static_assert(cuda::std::is_trivially_copyable_v); +} + +//---------------------------------------------------------------------------------------------------------------------- +// default construction: value-initializes all elements to false + +template +__host__ __device__ constexpr void test_default_ctor() +{ + using Mask = simd::basic_mask>; + Mask mask{}; + for (int i = 0; i < N; ++i) + { + assert(mask[i] == false); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// copy construction and copy assignment + +template +__host__ __device__ constexpr void test_copy() +{ + using Mask = simd::basic_mask>; + Mask original(is_even{}); + + Mask copied(original); + for (int i = 0; i < N; ++i) + { + assert(copied[i] == original[i]); + } + + Mask assigned(false); + assigned = original; + for (int i = 0; i < N; ++i) + { + assert(assigned[i] == original[i]); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// broadcast constructor + +template +__host__ __device__ constexpr void test_broadcast() +{ + using Mask = simd::basic_mask>; + static_assert(noexcept(Mask(true))); + + Mask all_true(true); + Mask all_false(false); + for (int i = 0; i < N; ++i) + { + assert(all_true[i] == true); + assert(all_false[i] == false); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// converting constructor + +template +__host__ __device__ constexpr void test_converting() +{ + using Src = simd::basic_mask>; + using Dst = simd::basic_mask>; + Src src(is_even{}); + static_assert(noexcept(Dst(src))); + + Dst dst(src); + for (int i = 0; i < N; ++i) + { + assert(dst[i] == src[i]); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// generator constructor + +template +__host__ __device__ constexpr void test_generator() +{ + using Mask = simd::basic_mask>; +#if _CCCL_COMPILER(GCC, !=, 7) + static_assert(!noexcept(Mask(is_even{}))); +#endif + + Mask mask(is_even{}); + for (int i = 0; i < N; ++i) + { + assert(mask[i] == (i % 2 == 0)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// bitset constructor + +template +__host__ __device__ constexpr void test_bitset() +{ + using Mask = simd::basic_mask>; + cuda::std::bitset bitset; + static_assert(noexcept(Mask(bitset))); + + for (int i = 0; i < N; ++i) + { + bitset.set(i, (i % 2 == 0)); + } + Mask mask(bitset); + for (int i = 0; i < N; ++i) + { + assert(mask[i] == (i % 2 == 0)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// unsigned integer constructor + +template +__host__ __device__ constexpr void test_unsigned_int() +{ + using Mask = simd::basic_mask>; + static_assert(noexcept(Mask(U{0}))); + + Mask mask(U{0}); + for (int i = 0; i < N; ++i) + { + assert(mask[i] == false); + } + + constexpr int num_bits = cuda::std::__num_bits_v; + constexpr int mask_bits = cuda::std::min(N, num_bits); + Mask all_one(static_cast(~U{0})); + for (int i = 0; i < mask_bits; ++i) + { + assert(all_one[i] == true); + } + + if constexpr (N >= 4) + { + Mask m_pat(U{0b101}); + assert(m_pat[0] == true); + assert(m_pat[1] == false); + assert(m_pat[2] == true); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// SFINAE and explicit constraints + +template +__host__ __device__ constexpr void test_sfinae() +{ + using Mask = simd::basic_mask>; + + // broadcast: only accepts bool + static_assert(!cuda::std::is_constructible_v); + // broadcast: must be explicit + static_assert(!cuda::std::is_convertible_v); + + // converting: requires matching element count + using MaskDifferentSize = simd::basic_mask>; + static_assert(!cuda::std::is_constructible_v); + // converting: must be explicit + using MaskOtherBytes = simd::basic_mask<(Bytes == 1 ? 2 : 1), simd::fixed_size>; + static_assert(!cuda::std::is_convertible_v); + + // generator: rejects non-callable types + static_assert(!cuda::std::is_constructible_v); + // generator: must be explicit + static_assert(!cuda::std::is_convertible_v); + + // bitset: requires matching size + static_assert(!cuda::std::is_constructible_v>); + // bitset: is implicit + static_assert(cuda::std::is_convertible_v&, Mask>); + + // unsigned integer: must be explicit + static_assert(!cuda::std::is_convertible_v); +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_size() +{ + test_member_types(); + test_default_ctor(); + test_copy(); + test_broadcast(); + test_generator(); + test_bitset(); + test_unsigned_int(); + test_unsigned_int(); + test_unsigned_int(); + test_unsigned_int(); + test_sfinae(); +} + +template +__host__ __device__ constexpr void test_bytes() +{ + test_size(); + test_size(); +} + +__host__ __device__ constexpr bool test() +{ + test_bytes<1>(); + test_bytes<2>(); + test_bytes<4>(); + test_bytes<8>(); +#if _CCCL_HAS_INT128() + test_bytes<16>(); +#endif + + // test_converting N1: Destination type size, N2: Source type size, N3: Mask number of elements + test_converting<4, 2, 4>(); // 4 -> 2, 4 elements + test_converting<2, 4, 4>(); // 2 -> 4, 4 elements + test_converting<1, 8, 4>(); // 1 -> 8, 4 elements + test_converting<8, 1, 4>(); // 8 -> 1, 4 elements + test_converting<4, 4, 4>(); // 4 -> 4, 4 elements + + test_converting<1, 2, 1>(); // 1 -> 2, 1 element + test_converting<1, 2, 2>(); // 1 -> 2, 2 elements + test_converting<1, 2, 8>(); // 1 -> 2, 8 elements + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/instantiation.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/instantiation.fail.cpp new file mode 100644 index 00000000000..b3c25eaa78d --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/instantiation.fail.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// basic_mask requires Bytes to map to a valid integer type (1, 2, 4, 8, or 16 with __int128). + +#include + +#include "test_macros.h" + +int main(int, char**) +{ + using Mask = cuda::std::simd::basic_mask<3, cuda::std::simd::fixed_size<4>>; + Mask mask(true); // expected-error + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/subscript.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/subscript.pass.cpp new file mode 100644 index 00000000000..22f176d530f --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/subscript.pass.cpp @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.mask.subscr], basic_mask subscript operators +// +// constexpr value_type operator[](simd-size-type) const noexcept; + +#include "../simd_test_utils.h" + +//---------------------------------------------------------------------------------------------------------------------- +// subscript read-back + +template +__host__ __device__ constexpr void test_subscript() +{ + using Mask = simd::basic_mask>; + Mask mask(true); + + static_assert(cuda::std::is_same_v); + static_assert(!noexcept(mask[0])); + static_assert(is_const_member_function_v); + unused(mask); + + Mask all_true(true); + Mask all_false(false); + Mask alternating(is_even{}); + for (int i = 0; i < N; ++i) + { + assert(all_true[i] == true); + assert(all_false[i] == false); + assert(alternating[i] == (i % 2 == 0)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_bytes() +{ + test_subscript(); + test_subscript(); +} + +__host__ __device__ constexpr bool test() +{ + test_bytes<1>(); + test_bytes<2>(); + test_bytes<4>(); + test_bytes<8>(); +#if _CCCL_HAS_INT128() + test_bytes<16>(); +#endif + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/unary.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/unary.pass.cpp new file mode 100644 index 00000000000..6d0c6d737d5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/unary.pass.cpp @@ -0,0 +1,162 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.mask.unary], basic_mask unary operators +// +// constexpr basic_mask operator!() const noexcept; +// constexpr basic_vec, Abi> operator+() const noexcept; +// constexpr basic_vec, Abi> operator-() const noexcept; +// constexpr basic_vec, Abi> operator~() const noexcept; + +#include "../simd_test_utils.h" + +//---------------------------------------------------------------------------------------------------------------------- +// operator! + +template +__host__ __device__ constexpr void test_logical_not() +{ + using Mask = simd::basic_mask>; + Mask mask(true); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(!mask)); + static_assert(is_const_member_function_v); + unused(mask); + + Mask all_true(true); + Mask all_false(false); + Mask mixed(is_even{}); + for (int i = 0; i < N; ++i) + { + assert((!all_true)[i] == false); + assert((!all_false)[i] == true); + assert((!mixed)[i] == (i % 2 != 0)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator+ + +template +__host__ __device__ constexpr void test_unary_plus() +{ + using Mask = simd::basic_mask>; + using Integer = integer_from_t; + using Vec = simd::basic_vec>; + Mask mask(is_even{}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(+mask)); + + auto vec = +mask; + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { + assert(vec[i] == 1); + } + else + { + assert(vec[i] == 0); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator- + +template +__host__ __device__ constexpr void test_unary_minus() +{ + using Mask = simd::basic_mask>; + using Integer = integer_from_t; + using Vec = simd::basic_vec>; + Mask mask(is_even{}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(-mask)); + + Vec vec = -mask; + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { + assert(vec[i] == static_cast(-Integer{1})); + } + else + { + assert(vec[i] == Integer{0}); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator~ + +template +__host__ __device__ constexpr void test_bitwise_not() +{ + using Mask = simd::basic_mask>; + using Integer = integer_from_t; + using Vec = simd::basic_vec>; + Mask mask(is_even{}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(~mask)); + + Vec vec = ~mask; + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { + assert(vec[i] == static_cast(~Integer{1})); + } + else + { + assert(vec[i] == static_cast(~Integer{0})); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_size() +{ + test_logical_not(); + test_unary_plus(); + test_unary_minus(); + test_bitwise_not(); +} + +template +__host__ __device__ constexpr void test_bytes() +{ + test_size(); + test_size(); +} + +__host__ __device__ constexpr bool test() +{ + test_bytes<1>(); + test_bytes<2>(); + test_bytes<4>(); + test_bytes<8>(); +#if _CCCL_HAS_INT128() + test_bytes<16>(); +#endif + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/aliases.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/aliases.pass.cpp new file mode 100644 index 00000000000..2e9bf8fb576 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/aliases.pass.cpp @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// template using vec = ...; +// template using mask = ...; + +#include "../simd_test_utils.h" + +//---------------------------------------------------------------------------------------------------------------------- +// vec resolves to basic_vec> + +template +__host__ __device__ constexpr void test_vec_alias() +{ + using Alias = simd::vec; + using Direct = simd::basic_vec>; + static_assert(cuda::std::is_same_v); +} + +//---------------------------------------------------------------------------------------------------------------------- +// mask resolves to basic_mask> + +template +__host__ __device__ constexpr void test_mask_alias() +{ + using Alias = simd::mask; + using Direct = simd::basic_mask>; + static_assert(cuda::std::is_same_v); +} + +//---------------------------------------------------------------------------------------------------------------------- +// default N for vec and mask uses native ABI size + +template +__host__ __device__ constexpr void test_default_size() +{ + using DefaultVec = simd::vec; + using NativeVec = simd::basic_vec>; + static_assert(cuda::std::is_same_v); + + using DefaultMask = simd::mask; + using NativeMask = simd::basic_mask>; + static_assert(cuda::std::is_same_v); +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_type() +{ + test_vec_alias(); + test_mask_alias(); + test_default_size(); +} + +DEFINE_BASIC_VEC_TEST() +DEFINE_BASIC_VEC_TEST_RUNTIME() + +int main(int, char**) +{ + static_assert(test()); + assert(test_runtime()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp new file mode 100644 index 00000000000..c179620f379 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// template +// struct alignment; +// +// template +// constexpr size_t alignment_v = alignment::value; + +#include +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +template +__host__ __device__ void test_default_u() +{ + using V = simd::basic_vec>; + static_assert(simd::alignment::value == ExpectedAlign); + static_assert(simd::alignment_v == ExpectedAlign); +} + +template +__host__ __device__ void test_explicit_u() +{ + using V = simd::basic_vec>; + static_assert(simd::alignment::value == ExpectedAlign); + static_assert(simd::alignment_v == ExpectedAlign); +} + +template +__host__ __device__ void test_type() +{ + test_default_u(); + test_default_u(); + test_default_u(); + test_default_u(); + test_default_u(); +} + +__host__ __device__ void test() +{ + // default U = value_type + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); +#if _CCCL_HAS_INT128() + test_type<__int128_t>(); +#endif // _CCCL_HAS_INT128() +#if _LIBCUDACXX_HAS_NVFP16() + test_type<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16() +#if _LIBCUDACXX_HAS_NVBF16() + test_type<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16() + + // explicit U different from value_type + test_explicit_u(); + test_explicit_u(); + test_explicit_u(); + test_explicit_u(); + test_explicit_u(); +} + +int main(int, char**) +{ + test(); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/rebind.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/rebind.pass.cpp new file mode 100644 index 00000000000..d683bf9d8c2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/rebind.pass.cpp @@ -0,0 +1,106 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// template struct rebind; +// template using rebind_t = typename rebind::type; + +#include +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +//---------------------------------------------------------------------------------------------------------------------- +// rebind with basic_vec + +template +__host__ __device__ void test_rebind_vec() +{ + using OldVec = simd::basic_vec>; + using Result = simd::rebind_t; + static_assert(cuda::std::is_same_v); + static_assert(Result::size() == N); +} + +template +__host__ __device__ void test_rebind_vec_sizes() +{ + test_rebind_vec(); + test_rebind_vec(); + test_rebind_vec(); + test_rebind_vec(); +} + +//---------------------------------------------------------------------------------------------------------------------- +// rebind with basic_mask + +template +__host__ __device__ void test_rebind_mask() +{ + using OldMask = simd::basic_mask>; + using Result = simd::rebind_t; + static_assert(cuda::std::is_same_v>>); + static_assert(Result::size() == N); +} + +template +__host__ __device__ void test_rebind_mask_sizes() +{ + test_rebind_mask(); + test_rebind_mask(); + test_rebind_mask(); + test_rebind_mask(); +} + +//---------------------------------------------------------------------------------------------------------------------- +// rebind_t matches rebind::type + +template +__host__ __device__ void test_rebind_t_alias() +{ + static_assert(cuda::std::is_same_v, typename simd::rebind::type>); +} + +__host__ __device__ void test() +{ + // rebind basic_vec + test_rebind_vec_sizes(); + test_rebind_vec_sizes(); + test_rebind_vec_sizes(); + test_rebind_vec_sizes(); + + // rebind basic_vec: different sizes + test_rebind_vec_sizes(); + test_rebind_vec_sizes(); + test_rebind_vec_sizes(); + test_rebind_vec_sizes(); + test_rebind_vec_sizes(); + + // rebind basic_mask + test_rebind_mask_sizes(); + test_rebind_mask_sizes(); + test_rebind_mask_sizes(); + test_rebind_mask_sizes(); + test_rebind_mask_sizes(); + + // rebind_t alias matches rebind::type + test_rebind_t_alias>(); + test_rebind_t_alias>(); + test_rebind_t_alias>(); +} + +int main(int, char**) +{ + test(); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/resize.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/resize.pass.cpp new file mode 100644 index 00000000000..b2dfb0f5130 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/resize.pass.cpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// template struct resize; +// template using resize_t = typename resize::type; + +#include +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +//---------------------------------------------------------------------------------------------------------------------- +// resize with basic_vec + +template +__host__ __device__ void test_resize_vec() +{ + using OldVec = simd::basic_vec>; + using Result = simd::resize_t; + using Expected = simd::basic_vec>; + static_assert(cuda::std::is_same_v); + static_assert(Result::size() == NewN); + static_assert(cuda::std::is_same_v); +} + +template +__host__ __device__ void test_resize_vec_all() +{ + test_resize_vec(); + test_resize_vec(); + test_resize_vec(); +} + +//---------------------------------------------------------------------------------------------------------------------- +// resize with basic_mask + +template +__host__ __device__ void test_resize_mask() +{ + using OldMask = simd::basic_mask>; + using Result = simd::resize_t; + using Expected = simd::basic_mask>; + static_assert(Result::size() == NewN); + static_assert(cuda::std::is_same_v); +} + +template +__host__ __device__ void test_resize_mask_all() +{ + test_resize_mask(); + test_resize_mask(); + test_resize_mask(); +} + +//---------------------------------------------------------------------------------------------------------------------- +// resize_t matches resize::type + +template +__host__ __device__ void test_resize_t_alias() +{ + static_assert(cuda::std::is_same_v, typename simd::resize::type>); +} + +__host__ __device__ void test() +{ + // resize basic_vec + test_resize_vec_all(); + test_resize_vec_all(); + test_resize_vec_all(); + test_resize_vec_all(); + test_resize_vec_all(); + test_resize_vec_all(); + + // resize basic_mask + test_resize_mask_all(); + test_resize_mask_all(); + test_resize_mask_all(); + + // resize_t alias matches resize::type + test_resize_t_alias<8, simd::vec>(); + test_resize_t_alias<2, simd::vec>(); +} + +int main(int, char**) +{ + test(); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/vectorizable.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/vectorizable.pass.cpp new file mode 100644 index 00000000000..6ce4ae99210 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/vectorizable.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.expos] __is_vectorizable_v: true for all standard integer types, +// character types, and float/double; false for bool, const/volatile types. + +#include +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +// positive cases: integer types +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); + +// positive cases: character types +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +#if defined(__cccl_lib_char8_t) +static_assert(simd::__is_vectorizable_v); +#endif + +#if _CCCL_HAS_INT128() +static_assert(simd::__is_vectorizable_v<__int128_t>); +#endif + +// floating-point types +static_assert(simd::__is_vectorizable_v); +static_assert(simd::__is_vectorizable_v); +#if _LIBCUDACXX_HAS_NVFP16() +static_assert(simd::__is_vectorizable_v<__half>); +#endif +#if _LIBCUDACXX_HAS_NVBF16() +static_assert(simd::__is_vectorizable_v<__nv_bfloat16>); +#endif + +// negative cases +static_assert(!simd::__is_vectorizable_v); +static_assert(!simd::__is_vectorizable_v); +static_assert(!simd::__is_vectorizable_v); +static_assert(!simd::__is_vectorizable_v); +static_assert(!simd::__is_vectorizable_v); + +struct user_type +{}; +static_assert(!simd::__is_vectorizable_v); +static_assert(!simd::__is_vectorizable_v); +static_assert(!simd::__is_vectorizable_v); + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/binary.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/binary.pass.cpp new file mode 100644 index 00000000000..afb29578985 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/binary.pass.cpp @@ -0,0 +1,144 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.binary], basic_vec binary operators +// +// friend constexpr basic_vec operator+(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator-(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator*(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator/(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator%(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator&(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator|(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator^(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator<<(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator>>(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec operator<<(const basic_vec&, simd-size-type) noexcept; +// friend constexpr basic_vec operator>>(const basic_vec&, simd-size-type) noexcept; + +#include "../simd_test_utils.h" + +template +__host__ __device__ constexpr void test_arithmetic() +{ + using Vec = simd::basic_vec>; + Vec a(T{6}); + Vec b(T{3}); + + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a + b)); + static_assert(noexcept(a - b)); + static_assert(noexcept(a * b)); + static_assert(noexcept(a / b)); + + Vec sum = a + b; + Vec diff = a - b; + Vec prod = a * b; + Vec quot = a / b; + for (int i = 0; i < N; ++i) + { + assert(sum[i] == static_cast(T{6} + T{3})); + assert(diff[i] == static_cast(T{6} - T{3})); + assert(prod[i] == static_cast(T{6} * T{3})); + assert(quot[i] == static_cast(T{6} / T{3})); + } +} + +template +__host__ __device__ constexpr void test_integral_ops() +{ + using Vec = simd::basic_vec>; + Vec a(T{7}); + Vec b(T{3}); + + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a % b)); + static_assert(noexcept(a & b)); + static_assert(noexcept(a | b)); + static_assert(noexcept(a ^ b)); + + Vec mod = a % b; + Vec bit_and = a & b; + Vec bit_or = a | b; + Vec bit_xor = a ^ b; + for (int i = 0; i < N; ++i) + { + assert(mod[i] == static_cast(T{7} % T{3})); + assert(bit_and[i] == static_cast(T{7} & T{3})); + assert(bit_or[i] == static_cast(T{7} | T{3})); + assert(bit_xor[i] == static_cast(T{7} ^ T{3})); + } +} + +template +__host__ __device__ constexpr void test_shifts() +{ + using Vec = simd::basic_vec>; + Vec a(T{4}); + Vec shift_amount(T{1}); + + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v> shift_amount), Vec>); + static_assert(noexcept(a << shift_amount)); + static_assert(noexcept(a >> shift_amount)); + + Vec shl = a << shift_amount; + Vec shr = a >> shift_amount; + for (int i = 0; i < N; ++i) + { + assert(shl[i] == static_cast(T{4} << T{1})); + assert(shr[i] == static_cast(T{4} >> T{1})); + } + + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v> 1), Vec>); + static_assert(noexcept(a << 1)); + static_assert(noexcept(a >> 1)); + + Vec shl_n = a << 1; + Vec shr_n = a >> 1; + for (int i = 0; i < N; ++i) + { + assert(shl_n[i] == static_cast(T{4} << T{1})); + assert(shr_n[i] == static_cast(T{4} >> T{1})); + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_type() +{ + test_arithmetic(); + if constexpr (cuda::std::is_integral_v) + { + test_integral_ops(); + test_shifts(); + } +} + +DEFINE_BASIC_VEC_TEST() +DEFINE_BASIC_VEC_TEST_RUNTIME() + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + assert(test_runtime()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/comparison.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/comparison.pass.cpp new file mode 100644 index 00000000000..4b795836347 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/comparison.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.comparison], basic_vec compare operators +// +// friend constexpr mask_type operator==(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr mask_type operator!=(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr mask_type operator>=(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr mask_type operator<=(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr mask_type operator>(const basic_vec&, const basic_vec&) noexcept; +// friend constexpr mask_type operator<(const basic_vec&, const basic_vec&) noexcept; + +#include "../simd_test_utils.h" + +template +__host__ __device__ constexpr void test_type() +{ + using Vec = simd::basic_vec>; + using Mask = typename Vec::mask_type; + Vec a = make_iota_vec(); + Vec b(T{3}); + + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v= b), Mask>); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v b), Mask>); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a == b)); + static_assert(noexcept(a != b)); + static_assert(noexcept(a >= b)); + static_assert(noexcept(a <= b)); + static_assert(noexcept(a > b)); + static_assert(noexcept(a < b)); + + Mask eq = a == b; + Mask ne = a != b; + Mask ge = a >= b; + Mask le = a <= b; + Mask gt = a > b; + Mask lt = a < b; + for (int i = 0; i < N; ++i) + { + T val = static_cast(i); + assert(eq[i] == (val == T{3})); + assert(ne[i] == (val != T{3})); + assert(ge[i] == (val >= T{3})); + assert(le[i] == (val <= T{3})); + assert(gt[i] == (val > T{3})); + assert(lt[i] == (val < T{3})); + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +DEFINE_BASIC_VEC_TEST() +DEFINE_BASIC_VEC_TEST_RUNTIME() + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + assert(test_runtime()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/compound_assign.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/compound_assign.pass.cpp new file mode 100644 index 00000000000..74441f92617 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/compound_assign.pass.cpp @@ -0,0 +1,208 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.cassign], basic_vec compound assignment +// +// friend constexpr basic_vec& operator+=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator-=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator*=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator/=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator%=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator&=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator|=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator^=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator<<=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator>>=(basic_vec&, const basic_vec&) noexcept; +// friend constexpr basic_vec& operator<<=(basic_vec&, simd-size-type) noexcept; +// friend constexpr basic_vec& operator>>=(basic_vec&, simd-size-type) noexcept; + +#include "../simd_test_utils.h" + +template +__host__ __device__ constexpr void test_arithmetic_assign() +{ + using Vec = simd::basic_vec>; + Vec b(T{3}); + + // operator+= + { + Vec a(T{6}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a += b)); + a += b; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{6} + T{3})); + } + } + // operator-= + { + Vec a(T{6}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a -= b)); + a -= b; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{6} - T{3})); + } + } + // operator*= + { + Vec a(T{6}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a *= b)); + a *= b; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{6} * T{3})); + } + } + // operator/= + { + Vec a(T{6}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a /= b)); + a /= b; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{6} / T{3})); + } + } +} + +template +__host__ __device__ constexpr void test_integral_assign() +{ + using Vec = simd::basic_vec>; + Vec b(T{3}); + + // operator%= + { + Vec a(T{7}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a %= b)); + a %= b; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{7} % T{3})); + } + } + // operator&= + { + Vec a(T{7}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a &= b)); + a &= b; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{7} & T{3})); + } + } + // operator|= + { + Vec a(T{7}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a |= b)); + a |= b; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{7} | T{3})); + } + } + // operator^= + { + Vec a(T{7}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a ^= b)); + a ^= b; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{7} ^ T{3})); + } + } +} + +template +__host__ __device__ constexpr void test_shift_assign() +{ + using Vec = simd::basic_vec>; + Vec shift(T{1}); + + // operator<<= + { + Vec a(T{4}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a <<= shift)); + a <<= shift; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{4} << T{1})); + } + } + // operator>>= + { + Vec a(T{4}); + static_assert(cuda::std::is_same_v>= shift), Vec&>); + static_assert(noexcept(a >>= shift)); + a >>= shift; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{4} >> T{1})); + } + } + // operator<<= + { + Vec a(T{4}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(a <<= 1)); + a <<= 1; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{4} << T{1})); + } + } + // operator>>= + { + Vec a(T{4}); + static_assert(cuda::std::is_same_v>= 1), Vec&>); + static_assert(noexcept(a >>= 1)); + a >>= 1; + for (int i = 0; i < N; ++i) + { + assert(a[i] == static_cast(T{4} >> T{1})); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_type() +{ + test_arithmetic_assign(); + if constexpr (cuda::std::is_integral_v) + { + test_integral_assign(); + test_shift_assign(); + } +} + +DEFINE_BASIC_VEC_TEST() +DEFINE_BASIC_VEC_TEST_RUNTIME() + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + assert(test_runtime()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/ctor.pass.cpp new file mode 100644 index 00000000000..5007f577750 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/ctor.pass.cpp @@ -0,0 +1,448 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.ctor], basic_vec constructors +// +// constexpr explicit basic_vec(Up&&) noexcept; // broadcast (explicit) +// constexpr basic_vec(Up&&) noexcept; // broadcast (implicit) +// constexpr explicit basic_vec(const basic_vec&) noexcept; // converting (explicit) +// constexpr basic_vec(const basic_vec&) noexcept; // converting (implicit) +// constexpr explicit basic_vec(Generator&&); // generator +// constexpr basic_vec(Range&&, flags<> = {}); // range +// constexpr basic_vec(Range&&, const mask_type&, flags<> = {}); // masked range + +#include + +#include "../simd_test_utils.h" + +//---------------------------------------------------------------------------------------------------------------------- +// member types and size + +template +__host__ __device__ constexpr void test_member_types() +{ + using Vec = simd::basic_vec>; + + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v>); + static_assert(Vec::size() == N); + static_assert(cuda::std::is_trivially_copyable_v); +} + +//---------------------------------------------------------------------------------------------------------------------- +// default construction: value-initialize all elements + +template +__host__ __device__ constexpr void test_default_ctor() +{ + using Vec = simd::basic_vec>; + Vec vec{}; + for (int i = 0; i < N; ++i) + { + assert(vec[i] == T{}); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// copy construction and copy assignment + +template +__host__ __device__ constexpr void test_copy() +{ + using Vec = simd::basic_vec>; + Vec original(T{42}); + + Vec copied(original); + for (int i = 0; i < N; ++i) + { + assert(copied[i] == T{42}); + } + + Vec assigned{}; + assigned = original; + for (int i = 0; i < N; ++i) + { + assert(assigned[i] == T{42}); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// broadcast constructor + +template +__host__ __device__ constexpr void test_broadcast() +{ + using Vec = simd::basic_vec>; + static_assert(noexcept(Vec(cuda::std::declval()))); // declval() is needed for __half and __nv_bfloat16 + + Vec vec(T{42}); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == T{42}); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// generator constructor + +template +__host__ __device__ constexpr void test_generator() +{ + using Vec = simd::basic_vec>; + + const Vec vec(iota_generator{}); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == static_cast(i + 1)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// converting constructor + +template +__host__ __device__ constexpr void test_converting() +{ + using Src = simd::basic_vec>; + using Dst = simd::basic_vec>; + Src src(U{3}); + static_assert(noexcept(Dst(src))); + + Dst dst(src); + for (int i = 0; i < N; ++i) + { + assert(dst[i] == static_cast(U{3})); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// range constructor + +template +__host__ __device__ constexpr void test_range() +{ + using Vec = simd::basic_vec>; + cuda::std::array arr{}; + for (int i = 0; i < N; ++i) + { + arr[i] = static_cast(i + 1); + } + + static_assert(!noexcept(Vec(arr))); + static_assert(!noexcept(Vec(arr, simd::flag_default))); + + Vec vec(arr); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == static_cast(i + 1)); + } + + Vec vec2(arr, simd::flag_default); + for (int i = 0; i < N; ++i) + { + assert(vec2[i] == static_cast(i + 1)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// range constructor with fixed-extent span + +template +__host__ __device__ constexpr void test_range_span() +{ + using Vec = simd::basic_vec>; + cuda::std::array arr{}; + for (int i = 0; i < N; ++i) + { + arr[i] = static_cast(i + 1); + } + + const cuda::std::span values(arr); + const Vec vec(values); + const Vec vec2(values, simd::flag_default); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == static_cast(i + 1)); + assert(vec2[i] == static_cast(i + 1)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// range constructor with alignment flags + +template +__host__ __device__ constexpr void test_range_alignment_flags() +{ + using Vec = simd::basic_vec>; + alignas(64) cuda::std::array arr{}; + for (int i = 0; i < N; ++i) + { + arr[i] = static_cast(i + 1); + } + + const Vec aligned_vec(arr, simd::flag_aligned); + const Vec overaligned_vec(arr, simd::flag_overaligned<32>); + for (int i = 0; i < N; ++i) + { + assert(aligned_vec[i] == static_cast(i + 1)); + assert(overaligned_vec[i] == static_cast(i + 1)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// masked range constructor + +template +__host__ __device__ constexpr void test_masked_range() +{ + using Vec = simd::basic_vec>; + using Mask = typename Vec::mask_type; + cuda::std::array arr{}; + for (int i = 0; i < N; ++i) + { + arr[i] = static_cast(i + 1); + } + + Mask even_mask(is_even{}); + static_assert(!noexcept(Vec(arr, even_mask))); + static_assert(!noexcept(Vec(arr, even_mask, simd::flag_default))); + + Vec vec(arr, even_mask); + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { + assert(vec[i] == static_cast(i + 1)); + } + else + { + assert(vec[i] == T{0}); + } + } + + Vec vec2(arr, even_mask, simd::flag_default); + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { + assert(vec2[i] == static_cast(i + 1)); + } + else + { + assert(vec2[i] == T{0}); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// range constructor with flag_convert +// constructs a basic_vec from an array with simd::flag_convert, where U is wider than T (not value-preserving) + +template +__host__ __device__ constexpr void test_range_convert_lossy() +{ + using Vec = simd::basic_vec>; + cuda::std::array arr{}; + for (int i = 0; i < N; ++i) + { + arr[i] = static_cast(i + 1); + } + + static_assert(!noexcept(Vec(arr, simd::flag_convert))); + + Vec vec(arr, simd::flag_convert); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == static_cast(static_cast(i + 1))); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// masked range constructor with flag_convert +// constructs a basic_vec from an array with simd::flag_convert, where U is wider than T (not value-preserving) + +template +__host__ __device__ constexpr void test_masked_range_convert_lossy() +{ + using Vec = simd::basic_vec>; + using Mask = typename Vec::mask_type; + cuda::std::array arr{}; + for (int i = 0; i < N; ++i) + { + arr[i] = static_cast(i + 1); + } + + Mask even_mask(is_even{}); + static_assert(!noexcept(Vec(arr, even_mask, simd::flag_convert))); + + Vec vec(arr, even_mask, simd::flag_convert); + for (int i = 0; i < N; ++i) + { + if (i % 2 == 0) + { + assert(vec[i] == static_cast(static_cast(i + 1))); + } + else + { + assert(vec[i] == T{0}); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// broadcast constructor with constexpr-wrapper-like types +// [simd.ctor] p4.3: implicit when From::value is representable by value_type + +template +__host__ __device__ constexpr void test_broadcast_constexpr_wrapper() +{ + using Vec = simd::basic_vec>; + + // integral_constant where V fits in T: implicit + static_assert(cuda::std::is_convertible_v, Vec>); + + // integral_constant from a wider type, but the specific value fits: implicit + if constexpr (sizeof(T) < sizeof(int64_t) && cuda::std::is_integral_v) + { + using IC = cuda::std::integral_constant; + static_assert(cuda::std::is_constructible_v); + static_assert(cuda::std::is_convertible_v); + Vec vec = IC{}; + for (int i = 0; i < N; ++i) + { + assert(vec[i] == static_cast(5)); + } + } + + // integral_constant from a wider type with a value that does NOT fit: explicit + if constexpr (cuda::std::is_same_v) + { + using IC = cuda::std::integral_constant; + static_assert(cuda::std::is_constructible_v); + static_assert(!cuda::std::is_convertible_v); + } + + // unsigned value in a signed target that fits: implicit + if constexpr (cuda::std::is_same_v) + { + using IC = cuda::std::integral_constant; + static_assert(cuda::std::is_convertible_v); + } + + // negative value in an unsigned target: explicit + if constexpr (cuda::std::is_same_v) + { + using IC_neg = cuda::std::integral_constant; + static_assert(cuda::std::is_constructible_v); + static_assert(!cuda::std::is_convertible_v); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// broadcast constructor explicit/implicit boundary for arithmetic types +// [simd.ctor] p4: implicit iff convertible_to and value-preserving + +template +__host__ __device__ constexpr void test_broadcast_explicit_implicit() +{ + using Vec = simd::basic_vec>; + + // (1) same type is implicit + static_assert(cuda::std::is_convertible_v); + + // (2) value-preserving and wider type is implicit + if constexpr (cuda::std::is_same_v) + { + static_assert(cuda::std::is_convertible_v); + } + else if constexpr (cuda::std::is_same_v) + { + static_assert(cuda::std::is_convertible_v); + } + + // (3) narrow conversion is explicit + else if constexpr (cuda::std::is_same_v) + { + static_assert(cuda::std::is_constructible_v); + static_assert(!cuda::std::is_convertible_v); + } + else if constexpr (cuda::std::is_same_v) + { + static_assert(cuda::std::is_constructible_v); + static_assert(!cuda::std::is_convertible_v); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// SFINAE constraints + +template +__host__ __device__ constexpr void test_sfinae() +{ + using Vec = simd::basic_vec>; + + static_assert(cuda::std::is_constructible_v); + + using VecDifferentSize = simd::basic_vec>; + static_assert(!cuda::std::is_constructible_v); + + static_assert(!cuda::std::is_constructible_v); +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_type() +{ + test_member_types(); + test_default_ctor(); + test_copy(); + test_broadcast(); + test_broadcast_explicit_implicit(); + test_generator(); + test_range(); + test_range_span(); + test_range_alignment_flags(); + test_masked_range(); + if constexpr (cuda::std::is_integral_v) + { + test_broadcast_constexpr_wrapper(); + } + test_sfinae(); + if constexpr (sizeof(T) >= 2 && cuda::std::is_integral_v) + { + using Smaller = cuda::std::conditional_t, int8_t, uint8_t>; + test_converting(); + } + if constexpr (sizeof(T) < 8 && cuda::std::is_integral_v) + { + using Wider = cuda::std::conditional_t, int64_t, uint64_t>; + test_range_convert_lossy(); + test_masked_range_convert_lossy(); + } + if constexpr (cuda::std::is_same_v) + { + test_range_convert_lossy(); + test_masked_range_convert_lossy(); + } +} + +DEFINE_BASIC_VEC_TEST() +DEFINE_BASIC_VEC_TEST_RUNTIME() + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + assert(test_runtime()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/deduction.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/deduction.pass.cpp new file mode 100644 index 00000000000..f9eefa2e91f --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/deduction.pass.cpp @@ -0,0 +1,123 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// (CTAD bug) Deduction guides fail with: +// * gcc: all nvcc+gcc configs (EDG frontend bug) +// * nvrtc (EDG frontend bug) +// * msvc: all nvcc+msvc configs (MSVC CTAD limitation) +// * nvcc+clang C++17 (EDG frontend bug) +// +// while they work with: +// +// * nvcc+clang C++20: PASS +// * clang-cuda: PASS +// * nvc++: PASS + +// error: error: no instance of constructor "cuda::std::simd::basic_vec" matches the argument list + +// UNSUPPORTED: gcc +// UNSUPPORTED: nvrtc +// UNSUPPORTED: msvc +// UNSUPPORTED: nvcc && clang && c++17 + +// + +// [simd.ctor] deduction guides +// +// basic_vec(Range&&, Ts...) -> basic_vec, deduce-abi-t<...>>; +// basic_vec(basic_mask) -> basic_vec, Abi>; + +#include + +#include "../simd_test_utils.h" + +//---------------------------------------------------------------------------------------------------------------------- +// deduction from range + +template +__host__ __device__ constexpr void test_range_deduction() +{ + cuda::std::array arr{}; + for (int i = 0; i < N; ++i) + { + arr[i] = static_cast(i); + } + simd::basic_vec vec(arr); + static_assert(cuda::std::is_same_v); + static_assert(decltype(vec)::size() == N); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == static_cast(i)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// deduction from fixed-extent span + +template +__host__ __device__ constexpr void test_span_deduction() +{ + cuda::std::array arr{}; + for (int i = 0; i < N; ++i) + { + arr[i] = static_cast(i); + } + + const cuda::std::span values(arr); + simd::basic_vec vec(values); + static_assert(cuda::std::is_same_v); + static_assert(decltype(vec)::size() == N); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == static_cast(i)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// deduction from basic_mask + +template +__host__ __device__ constexpr void test_mask_deduction() +{ + using Mask = simd::basic_mask>; + Mask mask(true); + simd::basic_vec vec(mask); + static_assert(decltype(vec)::size() == N); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == 1); + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +__host__ __device__ constexpr bool test_deduction() +{ + test_range_deduction(); + test_range_deduction(); + test_range_deduction(); + test_range_deduction(); + test_span_deduction(); + test_span_deduction(); + test_span_deduction(); + test_span_deduction(); + + test_mask_deduction<1, 1>(); + test_mask_deduction<1, 4>(); + test_mask_deduction<4, 4>(); + return true; +} + +int main(int, char**) +{ + assert(test_deduction()); + static_assert(test_deduction()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/instantiation.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/instantiation.fail.cpp new file mode 100644 index 00000000000..183fc7079fd --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/instantiation.fail.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// basic_vec requires T to be a vectorizable type. +// bool is explicitly excluded by the standard. + +#include + +#include "test_macros.h" + +int main(int, char**) +{ + using Vec = cuda::std::simd::basic_vec>; + Vec vec(true); // expected-error + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/subscript.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/subscript.pass.cpp new file mode 100644 index 00000000000..34d7544a6c7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/subscript.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.subscr], basic_vec subscript operators +// +// constexpr value_type operator[](simd-size-type) const; + +#include "../simd_test_utils.h" + +//---------------------------------------------------------------------------------------------------------------------- +// subscript read-back + +template +__host__ __device__ constexpr void test_type() +{ + using Vec = simd::basic_vec>; + Vec vec(T{7}); + + static_assert(cuda::std::is_same_v); + static_assert(!noexcept(vec[0])); + static_assert(is_const_member_function_v); + unused(vec); + + Vec iota = make_iota_vec(); + for (int i = 0; i < N; ++i) + { + assert(iota[i] == static_cast(i)); + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +DEFINE_BASIC_VEC_TEST() +DEFINE_BASIC_VEC_TEST_RUNTIME() + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + assert(test_runtime()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/unary.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/unary.pass.cpp new file mode 100644 index 00000000000..ae1989d0cdf --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/unary.pass.cpp @@ -0,0 +1,204 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// [simd.unary], basic_vec unary operators +// +// constexpr basic_vec& operator++() noexcept; +// constexpr basic_vec operator++(int) noexcept; +// constexpr basic_vec& operator--() noexcept; +// constexpr basic_vec operator--(int) noexcept; +// constexpr mask_type operator!() const noexcept; +// constexpr basic_vec operator~() const noexcept; +// constexpr basic_vec operator+() const noexcept; +// constexpr basic_vec operator-() const noexcept; + +#include "../simd_test_utils.h" + +TEST_DIAG_SUPPRESS_MSVC(4146) // unary minus operator applied to unsigned type, result still unsigned + +//---------------------------------------------------------------------------------------------------------------------- +// operator++ (pre) + +template +__host__ __device__ constexpr void test_pre_increment() +{ + using Vec = simd::basic_vec>; + Vec vec(T{5}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(++vec)); + + Vec& ref = ++vec; + assert(&ref == &vec); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == T{6}); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator++ (post) + +template +__host__ __device__ constexpr void test_post_increment() +{ + using Vec = simd::basic_vec>; + Vec vec(T{5}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(vec++)); + + Vec old = vec++; + for (int i = 0; i < N; ++i) + { + assert(old[i] == T{5}); + assert(vec[i] == T{6}); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator-- (pre) + +template +__host__ __device__ constexpr void test_pre_decrement() +{ + using Vec = simd::basic_vec>; + Vec vec(T{5}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(--vec)); + + Vec& ref = --vec; + assert(&ref == &vec); + for (int i = 0; i < N; ++i) + { + assert(vec[i] == T{4}); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator-- (post) + +template +__host__ __device__ constexpr void test_post_decrement() +{ + using Vec = simd::basic_vec>; + Vec vec(T{5}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(vec--)); + + Vec old = vec--; + for (int i = 0; i < N; ++i) + { + assert(old[i] == T{5}); + assert(vec[i] == T{4}); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator! + +template +__host__ __device__ constexpr void test_logical_not() +{ + using Vec = simd::basic_vec>; + using Mask = typename Vec::mask_type; + Vec vec = make_iota_vec(); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(!vec)); + + Mask result = !vec; + for (int i = 0; i < N; ++i) + { + assert(result[i] == (static_cast(i) == T{0})); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator~ + +template +__host__ __device__ constexpr void test_bitwise_not() +{ + using Vec = simd::basic_vec>; + Vec vec(T{0}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(~vec)); + + Vec result = ~vec; + for (int i = 0; i < N; ++i) + { + assert(result[i] == static_cast(~T{0})); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator+ + +template +__host__ __device__ constexpr void test_unary_plus() +{ + using Vec = simd::basic_vec>; + Vec vec(T{42}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(+vec)); + + Vec result = +vec; + for (int i = 0; i < N; ++i) + { + assert(result[i] == T{42}); + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// operator- + +template +__host__ __device__ constexpr void test_unary_minus() +{ + using Vec = simd::basic_vec>; + Vec vec(T{3}); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(-vec)); + + Vec result = -vec; + for (int i = 0; i < N; ++i) + { + assert(result[i] == static_cast(-T{3})); + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +template +__host__ __device__ constexpr void test_type() +{ + test_pre_increment(); + test_post_increment(); + test_pre_decrement(); + test_post_decrement(); + test_logical_not(); + test_unary_plus(); + test_unary_minus(); + if constexpr (cuda::std::is_integral_v) + { + test_bitwise_not(); + } +} + +DEFINE_BASIC_VEC_TEST() +DEFINE_BASIC_VEC_TEST_RUNTIME() + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + assert(test_runtime()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd_test_utils.h b/libcudacxx/test/libcudacxx/std/numerics/simd/simd_test_utils.h new file mode 100644 index 00000000000..32af8847961 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd_test_utils.h @@ -0,0 +1,171 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef SIMD_TEST_UTILS_H +#define SIMD_TEST_UTILS_H + +#include +#include +#include +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +//---------------------------------------------------------------------------------------------------------------------- +// common utilities + +struct wrong_generator +{}; + +template +struct is_const_member_function : cuda::std::false_type +{}; + +template +struct is_const_member_function : cuda::std::true_type +{}; + +template +struct is_const_member_function : cuda::std::true_type +{}; + +template +constexpr bool is_const_member_function_v = is_const_member_function::value; + +//---------------------------------------------------------------------------------------------------------------------- +// mask utilities + +struct is_even +{ + template + __host__ __device__ constexpr bool operator()(I i) const noexcept + { + return i % 2 == 0; + } +}; + +struct is_first_half +{ + template + __host__ __device__ constexpr bool operator()(I i) const noexcept + { + return i < 2; + } +}; + +template +using integer_from_t = cuda::std::__make_nbit_int_t; + +//---------------------------------------------------------------------------------------------------------------------- +// vec utilities + +template +struct iota_generator +{ + template + __host__ __device__ constexpr T operator()(I i) const noexcept + { + return static_cast(i + 1); + } +}; + +template +__host__ __device__ constexpr simd::basic_vec> make_iota_vec() +{ + cuda::std::array arr{}; + for (int i = 0; i < N; ++i) + { + arr[i] = static_cast(i); + } + return simd::basic_vec>(arr); +} + +// Each vec test file must define test_type() and then define test() using this macro. +// clang-format off +#if defined(__cccl_lib_char8_t) +# define _SIMD_TEST_CHAR8_T() \ + test_type(); \ + test_type(); +#else +# define _SIMD_TEST_CHAR8_T() +#endif + +#if _CCCL_HAS_INT128() +# define _SIMD_TEST_INT128() \ + test_type<__int128_t, 1>(); \ + test_type<__int128_t, 4>(); +#else +# define _SIMD_TEST_INT128() +#endif + +#if _LIBCUDACXX_HAS_NVFP16() +# define _SIMD_TEST_FP16() \ + test_type<__half, 1>(); \ + test_type<__half, 4>(); +#else +# define _SIMD_TEST_FP16() +#endif + +#if _LIBCUDACXX_HAS_NVBF16() +# define _SIMD_TEST_BF16() \ + test_type<__nv_bfloat16, 1>(); \ + test_type<__nv_bfloat16, 4>(); +#else +# define _SIMD_TEST_BF16() +#endif + +// __half and __nv_bfloat16 constructors are not constexpr (CUDA toolkit limitation), +// so they are tested only at runtime via test_runtime(). +#define DEFINE_BASIC_VEC_TEST_RUNTIME() \ + __host__ __device__ bool test_runtime() \ + { \ + _SIMD_TEST_FP16() \ + _SIMD_TEST_BF16() \ + return true; \ + } + +#define DEFINE_BASIC_VEC_TEST() \ + __host__ __device__ constexpr bool test() \ + { \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + _SIMD_TEST_CHAR8_T() \ + test_type(); \ + test_type(); \ + test_type(); \ + test_type(); \ + _SIMD_TEST_INT128() \ + return true; \ + } +// clang-format on + +#endif // SIMD_TEST_UTILS_H