diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h
index 95d513d1723..2c663744fb6 100644
--- a/libcudacxx/include/cuda/std/__internal/namespaces.h
+++ b/libcudacxx/include/cuda/std/__internal/namespaces.h
@@ -77,6 +77,10 @@
 #define _CCCL_BEGIN_NAMESPACE_CUDA_DRIVER              _CCCL_BEGIN_NAMESPACE(cuda::__driver)
 #define _CCCL_END_NAMESPACE_CUDA_DRIVER                _CCCL_END_NAMESPACE(cuda::__driver)
 
+// Namespaces related to <simd>
+#define _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD _CCCL_BEGIN_NAMESPACE(cuda::std::simd)
+#define _CCCL_END_NAMESPACE_CUDA_STD_SIMD   _CCCL_END_NAMESPACE(cuda::std::simd)
+
 // Namespaces related to <ranges>
 #define _CCCL_BEGIN_NAMESPACE_CUDA_STD_RANGES _CCCL_BEGIN_NAMESPACE(cuda::std::ranges)
 #define _CCCL_END_NAMESPACE_CUDA_STD_RANGES   _CCCL_END_NAMESPACE(cuda::std::ranges)
diff --git a/libcudacxx/include/cuda/std/__simd/abi.h b/libcudacxx/include/cuda/std/__simd/abi.h
new file mode 100644
index 00000000000..3eee5f14692
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/abi.h
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_ABI_H
+#define _CUDA_STD___SIMD_ABI_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__cstddef/types.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+using __simd_size_type = ptrdiff_t;
+
+// [simd.expos.abi], simd ABI tags
+template <__simd_size_type _Np>
+struct __fixed_size; // internal ABI tag
+
+template <__simd_size_type _Np>
+using fixed_size = __fixed_size<_Np>; // implementation-defined ABI
+
+template <typename>
+using native = fixed_size<1>; // implementation-defined ABI
+
+template <typename, __simd_size_type _Np>
+using __deduce_abi_t = fixed_size<_Np>; // exposition-only
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_ABI_H
diff --git a/libcudacxx/include/cuda/std/__simd/basic_mask.h b/libcudacxx/include/cuda/std/__simd/basic_mask.h
new file mode 100644
index 00000000000..070fc619967
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/basic_mask.h
@@ -0,0 +1,349 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_BASIC_MASK_H
+#define _CUDA_STD___SIMD_BASIC_MASK_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__utility/in_range.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__concepts/same_as.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__simd/declaration.h>
+#include <cuda/std/__simd/specializations/fixed_size_mask.h>
+#include <cuda/std/__simd/utility.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__type_traits/is_unsigned_integer.h>
+#include <cuda/std/__type_traits/num_bits.h>
+#include <cuda/std/array>
+#include <cuda/std/bitset>
+#include <cuda/std/climits>
+#include <cuda/std/cstdint>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// [simd.mask.class], class template basic_mask
+template <size_t _Bytes, typename _Abi>
+class basic_mask : public __mask_operations<_Bytes, _Abi>
+{
+  static_assert(__is_abi_tag_v<_Abi>, "basic_mask requires a valid ABI tag");
+
+  template <typename, typename>
+  friend class basic_vec;
+
+  using _Impl    = __mask_operations<_Bytes, _Abi>;
+  using _Storage = typename _Impl::_MaskStorage;
+
+  _Storage __s_;
+
+  struct __storage_tag_t
+  {};
+  static constexpr __storage_tag_t __storage_tag{};
+
+  _CCCL_API constexpr basic_mask(_Storage __v, __storage_tag_t) noexcept
+      : __s_{__v}
+  {}
+
+public:
+  using value_type = bool;
+  using abi_type   = _Abi;
+
+  // TODO(fbusato): add simd-iterator
+  // using iterator       = simd-iterator<basic_mask>;
+  // using const_iterator = simd-iterator<const basic_mask>;
+
+  // constexpr iterator begin() noexcept { return {*this, 0}; }
+  // constexpr const_iterator begin() const noexcept { return {*this, 0}; }
+  // constexpr const_iterator cbegin() const noexcept { return {*this, 0}; }
+  // constexpr default_sentinel_t end() const noexcept { return {}; }
+  // constexpr default_sentinel_t cend() const noexcept { return {}; }
+
+  static constexpr integral_constant<__simd_size_type, __simd_size_v<__integer_from<_Bytes>, _Abi>> size{};
+
+  static constexpr auto __usize = size_t{size};
+  static constexpr auto __size  = __simd_size_type{size};
+
+  _CCCL_HIDE_FROM_ABI constexpr basic_mask() noexcept = default;
+
+  // [simd.mask.ctor], basic_mask constructors
+
+  _CCCL_TEMPLATE(typename _Up)
+  _CCCL_REQUIRES(same_as<_Up, value_type>)
+  _CCCL_API constexpr explicit basic_mask(_Up __v) noexcept
+      : __s_{_Impl::__broadcast(__v)}
+  {}
+
+  _CCCL_TEMPLATE(size_t _UBytes, typename _UAbi)
+  _CCCL_REQUIRES((__simd_size_v<__integer_from<_UBytes>, _UAbi> == __size))
+  _CCCL_API constexpr explicit basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __size; ++__i)
+    {
+      __s_.__set(__i, __x[__i]);
+    }
+  }
+
+  _CCCL_TEMPLATE(typename _Generator)
+  _CCCL_REQUIRES(__can_generate_v<bool, _Generator, __size>)
+  _CCCL_API constexpr explicit basic_mask(_Generator&& __g)
+      : __s_{_Impl::__generate(__g)}
+  {}
+
+  _CCCL_TEMPLATE(typename _Tp)
+  _CCCL_REQUIRES(same_as<_Tp, bitset<__usize>>)
+  _CCCL_API constexpr basic_mask(const _Tp& __b) noexcept
+      : __s_{_Impl::__broadcast(false)}
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __size; ++__i)
+    {
+      __s_.__set(__i, static_cast<bool>(__b[__i]));
+    }
+  }
+
+  _CCCL_TEMPLATE(typename _Tp)
+  _CCCL_REQUIRES((__cccl_is_unsigned_integer_v<_Tp>) )
+  _CCCL_API constexpr explicit basic_mask(_Tp __val) noexcept
+      : __s_{_Impl::__broadcast(false)}
+  {
+    constexpr auto __num_bits = __simd_size_type{__num_bits_v<_Tp>};
+    constexpr auto __m        = __size < __num_bits ? __size : __num_bits;
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __m; ++__i)
+    {
+      __s_.__set(__i, static_cast<bool>((__val >> __i) & _Tp{1}));
+    }
+  }
+
+  // [simd.mask.subscr], basic_mask subscript operators
+
+  [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const
+  {
+    _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __size), "Index is out of bounds");
+    return static_cast<bool>(__s_.__get(__i));
+  }
+
+  // TODO(fbusato): subscript with integral indices, requires permute()
+  // template<simd-integral I>
+  // constexpr resize_t<I::size(), basic_mask> operator[](const I& indices) const;
+
+  // [simd.mask.unary], basic_mask unary operators
+
+  [[nodiscard]] _CCCL_API constexpr basic_mask operator!() const noexcept
+  {
+    return {_Impl::__bitwise_not(__s_), __storage_tag};
+  }
+
+  template <size_t _ByteSize>
+  static constexpr bool __has_integer_from_v =
+    (_ByteSize == 1 || _ByteSize == 2 || _ByteSize == 4 || _ByteSize == 8
+#if _CCCL_HAS_INT128()
+     || _ByteSize == 16
+#endif // _CCCL_HAS_INT128()
+    );
+
+  _CCCL_TEMPLATE(size_t _Bp = _Bytes)
+  _CCCL_REQUIRES(__has_integer_from_v<_Bp>)
+  [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator+() const noexcept
+  {
+    return static_cast<basic_vec<__integer_from<_Bp>, _Abi>>(*this);
+  }
+
+  _CCCL_TEMPLATE(size_t _Bp = _Bytes)
+  _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) )
+  _CCCL_API void operator+() const noexcept = delete;
+
+  _CCCL_TEMPLATE(size_t _Bp = _Bytes)
+  _CCCL_REQUIRES(__has_integer_from_v<_Bp>)
+  [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator-() const noexcept
+  {
+    return -static_cast<basic_vec<__integer_from<_Bp>, _Abi>>(*this);
+  }
+
+  _CCCL_TEMPLATE(size_t _Bp = _Bytes)
+  _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) )
+  _CCCL_API void operator-() const noexcept = delete;
+
+  _CCCL_TEMPLATE(size_t _Bp = _Bytes)
+  _CCCL_REQUIRES(__has_integer_from_v<_Bp>)
+  [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator~() const noexcept
+  {
+    return ~static_cast<basic_vec<__integer_from<_Bp>, _Abi>>(*this);
+  }
+
+  _CCCL_TEMPLATE(size_t _Bp = _Bytes)
+  _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) )
+  _CCCL_API void operator~() const noexcept = delete;
+
+  // [simd.mask.conv], basic_mask conversions
+
+  _CCCL_TEMPLATE(typename _Up, typename _Ap)
+  _CCCL_REQUIRES((sizeof(_Up) != _Bytes && __simd_size_v<_Up, _Ap> == __size))
+  _CCCL_API constexpr explicit operator basic_vec<_Up, _Ap>() const noexcept
+  {
+    basic_vec<_Up, _Ap> __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __size; ++__i)
+    {
+      __result.__s_.__set(__i, static_cast<_Up>((*this)[__i]));
+    }
+    return __result;
+  }
+
+  _CCCL_TEMPLATE(typename _Up, typename _Ap)
+  _CCCL_REQUIRES((sizeof(_Up) == _Bytes && __simd_size_v<_Up, _Ap> == __size))
+  _CCCL_API constexpr operator basic_vec<_Up, _Ap>() const noexcept
+  {
+    basic_vec<_Up, _Ap> __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __size; ++__i)
+    {
+      __result.__s_.__set(__i, static_cast<_Up>((*this)[__i]));
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API constexpr bitset<__usize> to_bitset() const noexcept
+  {
+    bitset<__usize> __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __size; ++__i)
+    {
+      __result.set(__i, (*this)[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API constexpr unsigned long long to_ullong() const
+  {
+    constexpr __simd_size_type __nbits = __num_bits_v<unsigned long long>;
+    if constexpr (__size > __nbits)
+    {
+      for (auto __i = __nbits; __i < __size; ++__i)
+      {
+        _CCCL_ASSERT(!(*this)[__i], "Bit above unsigned long long width is set");
+      }
+    }
+    return to_bitset().to_ullong();
+  }
+
+  // [simd.mask.binary], basic_mask binary operators
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator&&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return {_Impl::__logic_and(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator||(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return {_Impl::__logic_or(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator|(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator^(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  // [simd.mask.cassign], basic_mask compound assignment
+
+  _CCCL_API friend constexpr basic_mask& operator&=(basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return __lhs = __lhs & __rhs;
+  }
+
+  _CCCL_API friend constexpr basic_mask& operator|=(basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return __lhs = __lhs | __rhs;
+  }
+
+  _CCCL_API friend constexpr basic_mask& operator^=(basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return __lhs = __lhs ^ __rhs;
+  }
+
+  // [simd.mask.comparison], basic_mask comparisons (element-wise)
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator==(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return !(__lhs ^ __rhs);
+  }
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator!=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return __lhs ^ __rhs;
+  }
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator>=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return __lhs || !__rhs;
+  }
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator<=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return !__lhs || __rhs;
+  }
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator>(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return __lhs && !__rhs;
+  }
+
+  [[nodiscard]] _CCCL_API friend constexpr basic_mask
+  operator<(const basic_mask& __lhs, const basic_mask& __rhs) noexcept
+  {
+    return !__lhs && __rhs;
+  }
+
+  // TODO(fbusato): [simd.mask.cond], basic_mask exposition only conditional operators
+  // friend constexpr basic_mask __simd_select_impl(
+  //   const basic_mask&, const basic_mask&, const basic_mask&) noexcept;
+  // friend constexpr basic_mask __simd_select_impl(
+  //   const basic_mask&, same_as<bool> auto, same_as<bool> auto) noexcept;
+  // template<class T0, class T1>
+  //   friend constexpr vec<see below, size()> __simd_select_impl(
+  //     const basic_mask&, const T0&, const T1&) noexcept;
+};
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_BASIC_MASK_H
diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h
new file mode 100644
index 00000000000..f819749e4e7
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h
@@ -0,0 +1,526 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_BASIC_VEC_H
+#define _CUDA_STD___SIMD_BASIC_VEC_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__utility/in_range.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__ranges/concepts.h>
+#include <cuda/std/__ranges/data.h>
+#include <cuda/std/__simd/basic_mask.h>
+#include <cuda/std/__simd/concepts.h>
+#include <cuda/std/__simd/declaration.h>
+#include <cuda/std/__simd/flag.h>
+#include <cuda/std/__simd/utility.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// [simd.class], class template basic_vec
+template <typename _Tp, typename _Abi>
+class basic_vec : public __simd_operations<_Tp, _Abi>
+{
+public:
+  using value_type = _Tp;
+  using mask_type  = basic_mask<sizeof(value_type), _Abi>;
+
+private:
+  static_assert(__is_vectorizable_v<_Tp>, "basic_vec requires a vectorizable type");
+  static_assert(__is_abi_tag_v<_Abi>, "basic_vec requires a valid ABI tag");
+
+  template <size_t, typename>
+  friend class basic_mask;
+
+  using _Impl    = __simd_operations<_Tp, _Abi>;
+  using _Storage = typename _Impl::_SimdStorage;
+
+  _Storage __s_;
+
+  struct __storage_tag_t
+  {};
+  static constexpr __storage_tag_t __storage_tag{};
+
+  _CCCL_API constexpr basic_vec(_Storage __s, __storage_tag_t) noexcept
+      : __s_{__s}
+  {}
+
+  // Friend comparison operators (e.g. operator==) cannot access basic_mask's private constructor directly (friendship
+  // is not transitive). This function is required to access the private constructor of basic_mask.
+  _CCCL_API static constexpr mask_type __make_mask(typename mask_type::_Storage __s) noexcept
+  {
+    return mask_type{__s, mask_type::__storage_tag};
+  }
+
+  // operator[] is const only. We need this function to set values
+  _CCCL_API constexpr void __set(__simd_size_type __i, value_type __v) noexcept
+  {
+    __s_.__set(__i, __v);
+  }
+
+public:
+  using abi_type = _Abi;
+
+  // TODO(fbusato): add simd-iterator
+  // using iterator       = simd-iterator<basic_vec>;
+  // using const_iterator = simd-iterator<const basic_vec>;
+
+  // constexpr iterator begin() noexcept { return {*this, 0}; }
+  // constexpr const_iterator begin() const noexcept { return {*this, 0}; }
+  // constexpr const_iterator cbegin() const noexcept { return {*this, 0}; }
+  // constexpr default_sentinel_t end() const noexcept { return {}; }
+  // constexpr default_sentinel_t cend() const noexcept { return {}; }
+
+  static constexpr integral_constant<__simd_size_type, __simd_size_v<value_type, abi_type>> size{};
+
+  static constexpr auto __usize = size_t{size};
+  static constexpr auto __size  = __simd_size_type{size};
+
+  _CCCL_HIDE_FROM_ABI basic_vec() noexcept = default;
+
+  // [simd.ctor], basic_vec constructors
+
+  // [simd.ctor] value broadcast constructor (explicit overload)
+  _CCCL_TEMPLATE(typename _Up)
+  _CCCL_REQUIRES((__explicitly_convertible_to<value_type, _Up>) _CCCL_AND(!__is_value_ctor_implicit<_Up, value_type>))
+  _CCCL_API constexpr explicit basic_vec(_Up&& __v) noexcept
+      : __s_{_Impl::__broadcast(static_cast<value_type>(__v))}
+  {}
+
+  // [simd.ctor] value broadcast constructor (implicit overload)
+  _CCCL_TEMPLATE(typename _Up)
+  _CCCL_REQUIRES((__explicitly_convertible_to<value_type, _Up>) _CCCL_AND(__is_value_ctor_implicit<_Up, value_type>))
+  _CCCL_API constexpr basic_vec(_Up&& __v) noexcept
+      : __s_{_Impl::__broadcast(static_cast<value_type>(__v))}
+  {}
+
+  // [simd.ctor] converting constructor from basic_vec<U, UAbi> (explicit overload)
+  _CCCL_TEMPLATE(typename _Up, typename _UAbi)
+  _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == __size) _CCCL_AND(__explicitly_convertible_to<value_type, _Up>)
+                   _CCCL_AND(__is_vec_ctor_explicit<_Up, value_type>))
+  _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __size; ++__i)
+    {
+      __s_.__set(__i, static_cast<value_type>(__v[__i]));
+    }
+  }
+
+  // [simd.ctor] converting constructor from basic_vec<U, UAbi> (implicit overload)
+  _CCCL_TEMPLATE(typename _Up, typename _UAbi)
+  _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == __size) _CCCL_AND(__explicitly_convertible_to<value_type, _Up>)
+                   _CCCL_AND(!__is_vec_ctor_explicit<_Up, value_type>))
+  _CCCL_API constexpr basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __size; ++__i)
+    {
+      __s_.__set(__i, static_cast<value_type>(__v[__i]));
+    }
+  }
+
+  // [simd.ctor] generator constructor
+  _CCCL_TEMPLATE(typename _Generator)
+  _CCCL_REQUIRES(__can_generate_v<value_type, _Generator, __size>)
+  _CCCL_API constexpr explicit basic_vec(_Generator&& __g)
+      : __s_{_Impl::__generate(__g)}
+  {}
+
+  // [simd.ctor] range constructor
+
+  template <typename _Range>
+  static constexpr bool __is_compatible_range = __is_compatible_range_v<value_type, __size, _Range>;
+
+  // [simd.ctor] range constructor
+  _CCCL_TEMPLATE(typename _Range, typename... _Flags)
+  _CCCL_REQUIRES(__is_compatible_range<_Range>)
+  _CCCL_API constexpr basic_vec(_Range&& __range, flags<_Flags...> = {})
+  {
+    static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<ranges::range_value_t<_Range>, value_type>,
+                  "Conversion from range_value_t<R> to value_type is not value-preserving; use flag_convert");
+    const auto __data = ranges::data(__range);
+    __assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __size; ++__i)
+    {
+      __s_.__set(__i, static_cast<value_type>(__data[__i]));
+    }
+  }
+
+  // [simd.ctor] masked range constructor
+  _CCCL_TEMPLATE(typename _Range, typename... _Flags)
+  _CCCL_REQUIRES(__is_compatible_range<_Range>)
+  _CCCL_API constexpr basic_vec(_Range&& __range, const mask_type& __mask, flags<_Flags...> = {})
+  {
+    static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<ranges::range_value_t<_Range>, value_type>,
+                  "Conversion from range_value_t<R> to value_type is not value-preserving; use flag_convert");
+    const auto __data = ranges::data(__range);
+    __assert_load_store_alignment<basic_vec, ranges::range_value_t<_Range>, _Flags...>(__data);
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < __size; ++__i)
+    {
+      __s_.__set(__i, __mask[__i] ? static_cast<value_type>(__data[__i]) : value_type());
+    }
+  }
+
+  // TODO(fbusato): add complex constructor
+  // constexpr basic_vec(const real-type& __reals, const real-type& __imags = {}) noexcept;
+
+  // [simd.subscr], basic_vec subscript operators
+
+  [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const
+  {
+    _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __size), "Index is out of bounds");
+    return __s_.__get(__i);
+  }
+
+  // TODO(fbusato): subscript with integral indices, requires permute()
+  // template<simd-integral _Idx>
+  //   constexpr resize_t<_Idx::size(), basic_vec> operator[](const _Idx& __indices) const;
+
+  // TODO(fbusato): [simd.complex.access], basic_vec complex accessors
+  // constexpr real-type real() const noexcept;
+  // constexpr real-type imag() const noexcept;
+  // constexpr void real(const real-type& __v) noexcept;
+  // constexpr void imag(const real-type& __v) noexcept;
+
+  // [simd.unary], basic_vec unary operators
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_pre_increment<_Up>)
+  _CCCL_API constexpr basic_vec& operator++() noexcept
+  {
+    _Impl::__increment(__s_);
+    return *this;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_post_increment<_Up>)
+  [[nodiscard]] _CCCL_API constexpr basic_vec operator++(int) noexcept
+  {
+    const basic_vec __r = *this;
+    _Impl::__increment(__s_);
+    return __r;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_pre_decrement<_Up>)
+  _CCCL_API constexpr basic_vec& operator--() noexcept
+  {
+    _Impl::__decrement(__s_);
+    return *this;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_post_decrement<_Up>)
+  [[nodiscard]] _CCCL_API constexpr basic_vec operator--(int) noexcept
+  {
+    const basic_vec __r = *this;
+    _Impl::__decrement(__s_);
+    return __r;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_negate<_Up>)
+  [[nodiscard]] _CCCL_API constexpr mask_type operator!() const noexcept
+  {
+    return mask_type{_Impl::__negate(__s_), mask_type::__storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_bitwise_not<_Up>)
+  [[nodiscard]] _CCCL_API constexpr basic_vec operator~() const noexcept
+  {
+    return basic_vec{_Impl::__bitwise_not(__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_unary_plus<_Up>)
+  [[nodiscard]] _CCCL_API constexpr basic_vec operator+() const noexcept
+  {
+    return *this;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_unary_minus<_Up>)
+  [[nodiscard]] _CCCL_API constexpr basic_vec operator-() const noexcept
+  {
+    return basic_vec{_Impl::__unary_minus(__s_), __storage_tag};
+  }
+
+  // [simd.binary], basic_vec binary operators
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_binary_plus<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator+(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_binary_minus<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator-(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_multiplies<_Up>)
+  [[nodiscard]]
+  _CCCL_API friend constexpr basic_vec operator*(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_divides<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator/(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_modulo<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator%(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_bitwise_and<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator&(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_bitwise_or<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator|(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_bitwise_xor<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator^(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_shift_left<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_shift_right<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator>>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return basic_vec{_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_shift_left_size<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, __simd_size_type __n) noexcept
+  {
+    return __lhs << basic_vec{__n};
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_shift_right_size<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr basic_vec operator>>(const basic_vec& __lhs, __simd_size_type __n) noexcept
+  {
+    return __lhs >> basic_vec{__n};
+  }
+
+  // [simd.cassign], basic_vec compound assignment
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_binary_plus<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator+=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs + __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_binary_minus<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator-=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs - __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_multiplies<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator*=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs * __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_divides<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator/=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs / __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_modulo<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator%=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs % __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_bitwise_and<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator&=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs & __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_bitwise_or<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator|=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs | __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_bitwise_xor<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator^=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs ^ __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_shift_left<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs << __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_shift_right<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __lhs = __lhs >> __rhs;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_shift_left_size<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, __simd_size_type __n) noexcept
+  {
+    return __lhs = __lhs << __n;
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_shift_right_size<_Up>)
+  _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, __simd_size_type __n) noexcept
+  {
+    return __lhs = __lhs >> __n;
+  }
+
+  // [simd.comparison], basic_vec compare operators
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_equal_to<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr mask_type operator==(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __make_mask(_Impl::__equal_to(__lhs.__s_, __rhs.__s_));
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_not_equal_to<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr mask_type operator!=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __make_mask(_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_));
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_greater_equal<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr mask_type operator>=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __make_mask(_Impl::__greater_equal(__lhs.__s_, __rhs.__s_));
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_less_equal<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr mask_type operator<=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __make_mask(_Impl::__less_equal(__lhs.__s_, __rhs.__s_));
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_greater<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr mask_type operator>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __make_mask(_Impl::__greater(__lhs.__s_, __rhs.__s_));
+  }
+
+  _CCCL_TEMPLATE(typename _Up = _Tp)
+  _CCCL_REQUIRES(__has_less<_Up>)
+  [[nodiscard]] _CCCL_API friend constexpr mask_type operator<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept
+  {
+    return __make_mask(_Impl::__less(__lhs.__s_, __rhs.__s_));
+  }
+
+  // TODO(fbusato): [simd.cond], basic_vec exposition-only conditional operators
+  // friend constexpr basic_vec __simd_select_impl(
+  //   const mask_type&, const basic_vec&, const basic_vec&) noexcept;
+};
+
+// GCC fails when the deduction guides are marked __host__ __device__, while it is requires for other cases, e.g. clang
+#if !_CCCL_COMPILER(GCC)
+#  define _CCCL_API_DEDUCTION_GUIDE _CCCL_API
+#else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC) vvv
+#  define _CCCL_API_DEDUCTION_GUIDE
+#endif // !_CCCL_COMPILER(GCC)
+
+// [simd.ctor] deduction guide from contiguous sized range
+// Deduces vec<range_value_t<R>, static_cast<simd-size-type>(ranges::size(r))>
+//    * it is not possible to use the alias "vec" for the deduction guide
+//    * "vec" is defined as basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>
+//    * where _Np is __simd_size_v<_Tp, __static_range_size_v<_Range>>
+_CCCL_TEMPLATE(typename _Range, typename... _Ts)
+_CCCL_REQUIRES(
+  ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND __has_static_size<_Range>)
+_CCCL_API_DEDUCTION_GUIDE basic_vec(_Range&&, _Ts...)
+  -> basic_vec<ranges::range_value_t<_Range>,
+               __deduce_abi_t<ranges::range_value_t<_Range>, __static_range_size_v<_Range>>>;
+
+// [simd.ctor] deduction guide from basic_mask
+// basic_vec<__integer_from<Bytes>, Abi> is equivalent to decltype(+k):
+//   * k has type basic_mask<_Bytes, _Abi>
+//   * +k calls basic_mask::operator+()
+//   * the return type is basic_vec<__integer_from<_Bp>, _Abi>
+// The deduced type is equivalent to decltype(+k), i.e. basic_vec<__integer_from<Bytes>, Abi>
+_CCCL_TEMPLATE(size_t _Bytes, typename _Abi)
+_CCCL_REQUIRES(__has_unary_plus<basic_mask<_Bytes, _Abi>>)
+_CCCL_API_DEDUCTION_GUIDE basic_vec(basic_mask<_Bytes, _Abi>) -> basic_vec<__integer_from<_Bytes>, _Abi>;
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_BASIC_VEC_H
diff --git a/libcudacxx/include/cuda/std/__simd/concepts.h b/libcudacxx/include/cuda/std/__simd/concepts.h
new file mode 100644
index 00000000000..0b4885ef21c
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/concepts.h
@@ -0,0 +1,256 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_CONCEPTS_H
+#define _CUDA_STD___SIMD_CONCEPTS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__type_traits/is_floating_point.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__concepts/convertible_to.h>
+#include <cuda/std/__concepts/equality_comparable.h>
+#include <cuda/std/__floating_point/conversion_rank_order.h>
+#include <cuda/std/__limits/numeric_limits.h>
+#include <cuda/std/__simd/abi.h>
+#include <cuda/std/__type_traits/is_arithmetic.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_signed.h>
+#include <cuda/std/__type_traits/remove_cvref.h>
+#include <cuda/std/__type_traits/void_t.h>
+#include <cuda/std/__utility/declval.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// [simd.expos], explicitly-convertible-to concept
+
+template <typename _To, typename _From>
+_CCCL_CONCEPT __explicitly_convertible_to = _CCCL_REQUIRES_EXPR((_To, _From))((static_cast<_To>(declval<_From>())));
+
+// [simd.expos], constexpr-wrapper-like concept
+
+template <typename _Tp>
+_CCCL_CONCEPT __constexpr_wrapper_like = _CCCL_REQUIRES_EXPR((_Tp))(
+  requires(convertible_to<_Tp, decltype(_Tp::value)>),
+  requires(equality_comparable_with<_Tp, decltype(_Tp::value)>),
+  requires(bool_constant<(_Tp() == _Tp::value)>::value),
+  requires(bool_constant<(static_cast<decltype(_Tp::value)>(_Tp()) == _Tp::value)>::value));
+
+// Covers all integral types including character types (char16_t, char32_t, wchar_t, char8_t),
+// which are excluded by __cccl_is_integer_v
+template <typename _From, typename _To>
+constexpr bool __is_integral__value_preserving_v =
+  is_integral_v<_From> && is_integral_v<_To> && numeric_limits<_From>::digits <= numeric_limits<_To>::digits
+  && (!is_signed_v<_From> || is_signed_v<_To>);
+
+// The conversion from an arithmetic type U to a vectorizable type T is value-preserving if all possible
+// values of U can be represented with type T.
+template <typename _From, typename _To>
+constexpr bool __is_value_preserving_v =
+  __is_integral__value_preserving_v<_From, _To>
+  || (::cuda::is_floating_point_v<_From> && ::cuda::is_floating_point_v<_To>
+      && __fp_is_implicit_conversion_v<_From, _To>)
+  || (is_integral_v<_From> && ::cuda::is_floating_point_v<_To>
+      && numeric_limits<_From>::digits <= numeric_limits<_To>::digits);
+
+template <typename _From, typename _ValueType, typename = void>
+constexpr bool __is_constexpr_wrapper_value_preserving_v = false;
+
+// The standard requires checking whether the specific compile-time value From::value is representable by _ValueType,
+// not whether the entire source type is value-preserving.
+template <typename _From, typename _ValueType>
+constexpr bool __is_constexpr_wrapper_value_preserving_v<_From, _ValueType, void_t<decltype(_From::value)>> =
+  is_arithmetic_v<remove_cvref_t<decltype(_From::value)>>
+  && (static_cast<remove_cvref_t<decltype(_From::value)>>(static_cast<_ValueType>(_From::value)) == _From::value);
+
+// [simd.ctor] implicit value constructor
+// - From is not an arithmetic type and does not satisfy constexpr-wrapper-like,
+// - From is an arithmetic type and the conversion from From to value_type is value-preserving
+// - From satisfies constexpr-wrapper-like, remove_cvref_t<decltype(From​::​value)> is an arithmetic type, and
+//   From​::​value is representable by value_type.
+template <typename _Up, typename _ValueType, typename _From = remove_cvref_t<_Up>>
+_CCCL_CONCEPT __is_value_ctor_implicit =
+  convertible_to<_Up, _ValueType>
+  && ((!is_arithmetic_v<_From> && !__constexpr_wrapper_like<_From>)
+      || (is_arithmetic_v<_From> && __is_value_preserving_v<_From, _ValueType>)
+      || (__constexpr_wrapper_like<_From> && __is_constexpr_wrapper_value_preserving_v<_From, _ValueType>) );
+
+// [conv.rank], integer conversion rank for [simd.ctor] p7
+
+template <typename _Tp>
+inline constexpr int __integer_conversion_rank = 0;
+
+template <>
+inline constexpr int __integer_conversion_rank<signed char> = 1;
+template <>
+inline constexpr int __integer_conversion_rank<unsigned char> = 1;
+template <>
+inline constexpr int __integer_conversion_rank<char> = 1;
+template <>
+inline constexpr int __integer_conversion_rank<short> = 2;
+template <>
+inline constexpr int __integer_conversion_rank<unsigned short> = 2;
+template <>
+inline constexpr int __integer_conversion_rank<int> = 3;
+template <>
+inline constexpr int __integer_conversion_rank<unsigned int> = 3;
+template <>
+inline constexpr int __integer_conversion_rank<long> = 4;
+template <>
+inline constexpr int __integer_conversion_rank<unsigned long> = 4;
+template <>
+inline constexpr int __integer_conversion_rank<long long> = 5;
+template <>
+inline constexpr int __integer_conversion_rank<unsigned long long> = 5;
+#if _CCCL_HAS_INT128()
+template <>
+inline constexpr int __integer_conversion_rank<__int128_t> = 6;
+template <>
+inline constexpr int __integer_conversion_rank<__uint128_t> = 6;
+#endif // _CCCL_HAS_INT128()
+
+// [conv.rank], floating-point conversion rank for [simd.ctor] p7
+
+template <typename _Tp>
+inline constexpr int __fp_conversion_rank = 0;
+
+#if _CCCL_HAS_NVFP16()
+template <>
+inline constexpr int __fp_conversion_rank<__half> = 1;
+#endif // _CCCL_HAS_NVFP16()
+#if _CCCL_HAS_NVBF16()
+template <>
+inline constexpr int __fp_conversion_rank<__nv_bfloat16> = 1;
+#endif // _CCCL_HAS_NVBF16()
+template <>
+inline constexpr int __fp_conversion_rank<float> = 2;
+template <>
+inline constexpr int __fp_conversion_rank<double> = 3;
+#if _CCCL_HAS_LONG_DOUBLE()
+template <>
+inline constexpr int __fp_conversion_rank<long double> = 4;
+#endif // _CCCL_HAS_LONG_DOUBLE()
+#if _CCCL_HAS_FLOAT128()
+template <>
+inline constexpr int __fp_conversion_rank<__float128> = 5;
+#endif // _CCCL_HAS_FLOAT128()
+
+// [simd.ctor] p7: explicit(see below) for basic_vec(const basic_vec<U, UAbi>&)
+// explicit evaluates to true if either:
+//   - conversion from U to value_type is not value-preserving, or
+//   - both U and value_type are integral and integer_conversion_rank(U) > rank(value_type), or
+//   - both U and value_type are floating-point and fp_conversion_rank(U) > rank(value_type)
+template <typename _Up, typename _ValueType>
+constexpr bool __is_vec_ctor_explicit =
+  !__is_value_preserving_v<_Up, _ValueType>
+  || (is_integral_v<_Up> && is_integral_v<_ValueType>
+      && __integer_conversion_rank<_Up> > __integer_conversion_rank<_ValueType>)
+  || (::cuda::is_floating_point_v<_Up> && ::cuda::is_floating_point_v<_ValueType>
+      && __fp_conversion_rank<_Up> > __fp_conversion_rank<_ValueType>);
+
+// [simd.unary], operator constraints
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_pre_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((++__t));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_post_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t++));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_pre_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((--__t));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_post_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t--));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_negate = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((!__t));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_bitwise_not = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((~__t));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_unary_plus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((+__t));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_unary_minus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((-__t));
+
+// [simd.binary], binary operator constraints
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_binary_plus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a + __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_binary_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a - __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_multiplies = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a * __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_divides = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a / __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_modulo = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a % __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_bitwise_and = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a & __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_bitwise_or = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a | __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_bitwise_xor = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a ^ __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_shift_left = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a << __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_shift_right = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >> __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_shift_left_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << __simd_size_type{}));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_shift_right_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __simd_size_type{}));
+
+// [simd.comparison], comparison operator constraints
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a == __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_not_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a != __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_greater_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >= __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_less_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a <= __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_greater = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a > __b));
+
+template <typename _Tp>
+_CCCL_CONCEPT __has_less = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a < __b));
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_CONCEPTS_H
diff --git a/libcudacxx/include/cuda/std/__simd/declaration.h b/libcudacxx/include/cuda/std/__simd/declaration.h
new file mode 100644
index 00000000000..49fa867b860
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/declaration.h
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_DECLARATION_H
+#define _CUDA_STD___SIMD_DECLARATION_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__simd/abi.h>
+#include <cuda/std/__simd/exposition.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+template <typename _Tp, typename _Abi = native<_Tp>>
+class basic_vec;
+
+template <size_t _Bytes, typename _Abi = native<__integer_from<_Bytes>>>
+class basic_mask;
+
+template <typename _Tp, __simd_size_type _Np = __simd_size_v<_Tp, native<_Tp>>>
+using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;
+
+template <typename _Tp, __simd_size_type _Np = __simd_size_v<_Tp, native<_Tp>>>
+using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;
+
+// specializations
+
+template <typename _Tp, typename _Abi>
+struct __simd_storage;
+
+template <typename _Tp, typename _Abi>
+struct __simd_operations;
+
+template <size_t _Bytes, typename _Abi>
+struct __mask_storage;
+
+template <size_t _Bytes, typename _Abi>
+struct __mask_operations;
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_DECLARATION_H
diff --git a/libcudacxx/include/cuda/std/__simd/exposition.h b/libcudacxx/include/cuda/std/__simd/exposition.h
new file mode 100644
index 00000000000..3edd8f61aa5
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/exposition.h
@@ -0,0 +1,60 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_EXPOSITION_H
+#define _CUDA_STD___SIMD_EXPOSITION_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__type_traits/is_floating_point.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__simd/abi.h>
+#include <cuda/std/__type_traits/is_const.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_same.h>
+#include <cuda/std/__type_traits/is_volatile.h>
+#include <cuda/std/__type_traits/make_nbit_int.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// [simd.expos], exposition-only helpers
+
+template <size_t _Bytes>
+using __integer_from = __make_nbit_int_t<_Bytes * 8, true>;
+
+// all standard integer types, character types, and the types float and double ([basic.fundamental]);
+// std​::​float16_t, std​::​float32_t, and std​::​float64_t if defined ([basic.extended.fp]); and
+// TODO(fbusato) complex<T> where T is a vectorizable floating-point type.
+template <typename _Tp>
+constexpr bool __is_vectorizable_v =
+  (is_integral_v<_Tp> || ::cuda::is_floating_point_v<_Tp>)
+  && !is_same_v<_Tp, bool> && !is_const_v<_Tp> && !is_volatile_v<_Tp>;
+
+template <typename _Tp, typename _Abi>
+constexpr __simd_size_type __simd_size_v = 0;
+
+template <typename _Tp, __simd_size_type _Np>
+constexpr __simd_size_type __simd_size_v<_Tp, fixed_size<_Np>> = _Np;
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_EXPOSITION_H
diff --git a/libcudacxx/include/cuda/std/__simd/flag.h b/libcudacxx/include/cuda/std/__simd/flag.h
new file mode 100644
index 00000000000..e2bf4b39ac2
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/flag.h
@@ -0,0 +1,111 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_FLAG_H
+#define _CUDA_STD___SIMD_FLAG_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__cmath/pow2.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__type_traits/is_same.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// [simd.expos], exposition-only flag types
+
+struct __convert_flag
+{};
+
+struct __aligned_flag
+{};
+
+template <size_t _Np>
+struct __overaligned_flag
+{
+  static_assert(::cuda::__is_valid_alignment(_Np), "Overaligned flag requires a power-of-2 alignment");
+};
+
+template <typename _Tp>
+constexpr bool __is_flag_type_v = false;
+
+template <>
+constexpr bool __is_flag_type_v<__convert_flag> = true;
+
+template <>
+constexpr bool __is_flag_type_v<__aligned_flag> = true;
+
+template <size_t _Np>
+constexpr bool __is_flag_type_v<__overaligned_flag<_Np>> = true;
+
+template <typename _Flag>
+constexpr size_t __overaligned_value_v = 0;
+
+template <size_t _Np>
+constexpr size_t __overaligned_value_v<__overaligned_flag<_Np>> = _Np;
+
+// [simd.flags.overview], class template flags
+
+template <typename... _Flags>
+struct flags
+{
+  static_assert((true && ... && __is_flag_type_v<_Flags>),
+                "Every flag type must be one of convert_flag, aligned_flag, or overaligned_flag<N>");
+  static_assert((0 + ... + static_cast<int>(__overaligned_value_v<_Flags> != 0)) <= 1,
+                "At most one overaligned_flag is allowed");
+  // we cannot use __is_valid_alignment because 0 has a different meaning
+  static_assert((true && ...
+                 && (__overaligned_value_v<_Flags> == 0 || ::cuda::is_power_of_two(__overaligned_value_v<_Flags>))),
+                "Overaligned flag requires a power-of-2 alignment");
+
+  // [simd.flags.oper], flags operators
+  template <typename... _Other>
+  [[nodiscard]] _CCCL_API friend _CCCL_CONSTEVAL flags<_Flags..., _Other...> operator|(flags, flags<_Other...>)
+  {
+    return {};
+  }
+};
+
+// [simd.flags], flag constants
+
+inline constexpr flags<> flag_default{};
+inline constexpr flags<__convert_flag> flag_convert{};
+inline constexpr flags<__aligned_flag> flag_aligned{};
+
+template <size_t _Np>
+constexpr flags<__overaligned_flag<_Np>> flag_overaligned{};
+
+template <typename... _Flags>
+constexpr bool __has_convert_flag_v = (false || ... || is_same_v<_Flags, __convert_flag>);
+
+template <typename... _Flags>
+constexpr bool __has_aligned_flag_v = (false || ... || is_same_v<_Flags, __aligned_flag>);
+
+template <typename... _Flags>
+constexpr bool __has_overaligned_flag_v = (false || ... || (__overaligned_value_v<_Flags> != 0));
+
+template <typename... _Flags>
+constexpr size_t __overaligned_alignment_v = (size_t{0} | ... | __overaligned_value_v<_Flags>);
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_FLAG_H
diff --git a/libcudacxx/include/cuda/std/__simd/load.h b/libcudacxx/include/cuda/std/__simd/load.h
new file mode 100644
index 00000000000..a416dd546c2
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/load.h
@@ -0,0 +1,311 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_LOAD_H
+#define _CUDA_STD___SIMD_LOAD_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__memory/ptr_rebind.h>
+#include <cuda/std/__algorithm/max.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__concepts/same_as.h>
+#include <cuda/std/__iterator/concepts.h>
+#include <cuda/std/__iterator/distance.h>
+#include <cuda/std/__iterator/incrementable_traits.h>
+#include <cuda/std/__iterator/readable_traits.h>
+#include <cuda/std/__memory/assume_aligned.h>
+#include <cuda/std/__memory/pointer_traits.h>
+#include <cuda/std/__ranges/concepts.h>
+#include <cuda/std/__ranges/data.h>
+#include <cuda/std/__ranges/size.h>
+#include <cuda/std/__simd/basic_vec.h>
+#include <cuda/std/__simd/concepts.h>
+#include <cuda/std/__simd/flag.h>
+#include <cuda/std/__simd/utility.h>
+#include <cuda/std/__utility/cmp.h>
+#include <cuda/std/__utility/forward.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// [simd.loadstore] helper: resolves default V template parameter for load functions
+// When _Vp = void (default), resolves to basic_vec<_Up>; otherwise uses the explicit _Vp
+template <typename _Vp, typename _Up>
+struct __load_vec_type
+{
+  using type = _Vp;
+};
+
+template <typename _Up>
+struct __load_vec_type<void, _Up>
+{
+  using type = basic_vec<_Up>;
+};
+
+template <typename _Vp, typename _Up>
+using __load_vec_t = typename __load_vec_type<_Vp, _Up>::type;
+
+// TODO(fbusato): Important: use ::cuda::is_trivially_copyable_v
+template <typename _TpIn, typename _TpOut>
+constexpr bool __are_vectoriazable_v =
+  is_same_v<remove_cv_t<_TpIn>, remove_cv_t<_TpOut>> // _TpIn == _TpOut
+  && is_trivially_copyable_v<_TpIn> // byte-copy is fine (memcpy)
+  && (alignof(_TpIn) >= sizeof(_TpIn)); // e.g. char3 doesn't work: alignof(char3) == 1, sizeof(char3) == 3
+
+// [simd.loadstore] helper: core partial load from pointer + count + mask
+template <typename _Result, typename _Up, typename... _Flags>
+[[nodiscard]] _CCCL_API constexpr _Result
+__partial_load_from_ptr(const _Up* __ptr, __simd_size_type __count, const typename _Result::mask_type& __mask)
+{
+  using _Tp = typename _Result::value_type;
+  static_assert(same_as<remove_cvref_t<_Result>, _Result>, "V must not be a reference or cv-qualified type");
+  static_assert(__is_vectorizable_v<_Tp> && __is_abi_tag_v<typename _Result::abi_type>,
+                "V must be an enabled specialization of basic_vec");
+  static_assert(__is_vectorizable_v<_Up>, "range_value_t<R> must be a vectorizable type");
+  static_assert(__explicitly_convertible_to<_Tp, _Up>,
+                "range_value_t<R> must satisfy explicitly-convertible-to<value_type>");
+  static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Up, _Tp>,
+                "Conversion from range_value_t<R> to value_type is not value-preserving; use flag_convert");
+  ::cuda::std::simd::__assert_load_store_alignment<_Result, _Up, _Flags...>(__ptr);
+  _Result __result{};
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (__simd_size_type __i = 0; __i < _Result::size; ++__i)
+  {
+    if (__mask[__i] && __i < __count)
+    {
+      __result.__set(__i, static_cast<_Tp>(__ptr[__i]));
+    }
+  }
+  return __result;
+}
+
+template <typename _Result, typename _Up, typename... _Flags>
+[[nodiscard]] _CCCL_API constexpr _Result
+__full_load_from_ptr(const _Up* __ptr, const typename _Result::mask_type& __mask)
+{
+  using _Tp = typename _Result::value_type;
+  static_assert(same_as<remove_cvref_t<_Result>, _Result>, "V must not be a reference or cv-qualified type");
+  static_assert(__is_vectorizable_v<_Tp> && __is_abi_tag_v<typename _Result::abi_type>,
+                "V must be an enabled specialization of basic_vec");
+  static_assert(__is_vectorizable_v<_Up>, "range_value_t<R> must be a vectorizable type");
+  static_assert(__explicitly_convertible_to<_Tp, _Up>,
+                "range_value_t<R> must satisfy explicitly-convertible-to<value_type>");
+  static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Up, _Tp>,
+                "Conversion from range_value_t<R> to value_type is not value-preserving; use flag_convert");
+  ::cuda::std::simd::__assert_load_store_alignment<_Result, _Up, _Flags...>(__ptr);
+  if constexpr (::cuda::is_power_of_two(_Result::size) && __are_vectoriazable_v<_Tp, _Up>
+                && (__has_aligned_flag_v<_Flags...> || __has_overaligned_flag_v<_Flags...>) )
+  {
+    _CCCL_IF_CONSTEVAL
+    {
+      return ::cuda::std::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>(__ptr, _Result::size, __mask);
+    }
+    else
+    {
+      constexpr auto __alignment     = alignof(_Tp) * _Result::size;
+      constexpr auto __ptr_alignment = ::cuda::std::max(__alignment, __overaligned_value_v<_Flags...>);
+      struct alignas(__alignment) __aligned_t
+      {
+        char __data[sizeof(_Result)];
+      };
+      // nvcc performance bug: memcpy from pointer could not be vectorized
+      const auto __aligned_ptr = ::cuda::ptr_rebind<__aligned_t>(__ptr);
+      const auto __data        = *::cuda::std::assume_aligned<__ptr_alignment>(__aligned_ptr);
+      _Result __result;
+      ::cuda::std::memcpy(&__result, &__data, sizeof(_Result));
+      _CCCL_PRAGMA_UNROLL_FULL()
+      for (__simd_size_type __i = 0; __i < _Result::size; ++__i)
+      {
+        if (!__mask[__i])
+        {
+          __result.__set(__i, _Tp{});
+        }
+      }
+      return __result;
+    }
+  }
+  else
+  {
+    return ::cuda::std::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>(__ptr, _Result::size, __mask);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// [simd.loadstore] partial_load
+
+// partial_load: range, masked
+_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags)
+_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ranges::range_value_t<_Range>> partial_load(
+  _Range&& __r,
+  const typename __load_vec_t<_Vp, ranges::range_value_t<_Range>>::mask_type& __mask,
+  flags<_Flags...> = {})
+{
+  using __result_t = __load_vec_t<_Vp, ranges::range_value_t<_Range>>;
+  using __input_t  = ranges::range_value_t<_Range>;
+  return ::cuda::std::simd::__partial_load_from_ptr<__result_t, __input_t, _Flags...>(
+    ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask);
+}
+
+// partial_load: range, no mask
+_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags)
+_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ranges::range_value_t<_Range>>
+partial_load(_Range&& __r, flags<_Flags...> __f = {})
+{
+  using __result_t = __load_vec_t<_Vp, ranges::range_value_t<_Range>>;
+  return ::cuda::std::simd::partial_load<_Vp>(
+    ::cuda::std::forward<_Range>(__r), typename __result_t::mask_type(true), __f);
+}
+
+// partial_load: iterator + count, masked
+_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> partial_load(
+  _Ip __first,
+  iter_difference_t<_Ip> __n,
+  const typename __load_vec_t<_Vp, iter_value_t<_Ip>>::mask_type& __mask,
+  flags<_Flags...> = {})
+{
+  using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>;
+  using __input_t  = iter_value_t<_Ip>;
+  return ::cuda::std::simd::__partial_load_from_ptr<__result_t, __input_t, _Flags...>(
+    ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask);
+}
+
+// partial_load: iterator + count, no mask
+_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>>
+partial_load(_Ip __first, iter_difference_t<_Ip> __n, flags<_Flags...> __f = {})
+{
+  using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>;
+  return ::cuda::std::simd::partial_load<_Vp>(__first, __n, typename __result_t::mask_type(true), __f);
+}
+
+// partial_load: iterator + sentinel, masked
+_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> partial_load(
+  _Ip __first, _Sp __last, const typename __load_vec_t<_Vp, iter_value_t<_Ip>>::mask_type& __mask, flags<_Flags...> = {})
+{
+  using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>;
+  using __input_t  = iter_value_t<_Ip>;
+  return ::cuda::std::simd::__partial_load_from_ptr<__result_t, __input_t, _Flags...>(
+    ::cuda::std::to_address(__first), static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)), __mask);
+}
+
+// partial_load: iterator + sentinel, no mask
+_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>>
+partial_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {})
+{
+  using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>;
+  return ::cuda::std::simd::partial_load<_Vp>(__first, __last, typename __result_t::mask_type(true), __f);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// [simd.loadstore] unchecked_load
+
+// unchecked_load: range, masked
+_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags)
+_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ranges::range_value_t<_Range>> unchecked_load(
+  _Range&& __r,
+  const typename __load_vec_t<_Vp, ranges::range_value_t<_Range>>::mask_type& __mask,
+  flags<_Flags...> __f = {})
+{
+  using __result_t = __load_vec_t<_Vp, ranges::range_value_t<_Range>>;
+  if constexpr (__has_static_size<_Range>)
+  {
+    static_assert(__static_range_size_v<_Range> >= __result_t::size(),
+                  "unchecked_load requires ranges::size(r) >= V::size()");
+  }
+  _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), __result_t::size()),
+               "unchecked_load requires ranges::size(r) >= V::size()");
+  return ::cuda::std::simd::partial_load<_Vp>(::cuda::std::forward<_Range>(__r), __mask, __f);
+}
+
+// unchecked_load: range, no mask
+_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags)
+_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ranges::range_value_t<_Range>>
+unchecked_load(_Range&& __r, flags<_Flags...> __f = {})
+{
+  using __result_t = __load_vec_t<_Vp, ranges::range_value_t<_Range>>;
+  return ::cuda::std::simd::unchecked_load<_Vp>(
+    ::cuda::std::forward<_Range>(__r), typename __result_t::mask_type(true), __f);
+}
+
+// unchecked_load: iterator + count, masked
+_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> unchecked_load(
+  _Ip __first,
+  iter_difference_t<_Ip> __n,
+  const typename __load_vec_t<_Vp, iter_value_t<_Ip>>::mask_type& __mask,
+  flags<_Flags...> __f = {})
+{
+  using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>;
+  _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, __result_t::size()), "unchecked_load requires n >= V::size()");
+  return ::cuda::std::simd::partial_load<_Vp>(__first, __n, __mask, __f);
+}
+
+// unchecked_load: iterator + count, no mask
+_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>>
+unchecked_load(_Ip __first, iter_difference_t<_Ip> __n, flags<_Flags...> __f = {})
+{
+  using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>;
+  return ::cuda::std::simd::unchecked_load<_Vp>(__first, __n, typename __result_t::mask_type(true), __f);
+}
+
+// unchecked_load: iterator + sentinel, masked
+_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>> unchecked_load(
+  _Ip __first,
+  _Sp __last,
+  const typename __load_vec_t<_Vp, iter_value_t<_Ip>>::mask_type& __mask,
+  flags<_Flags...> __f = {})
+{
+  using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>;
+  _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), __result_t::size()),
+               "unchecked_load requires distance(first, last) >= V::size()");
+  return ::cuda::std::simd::partial_load<_Vp>(__first, __last, __mask, __f);
+}
+
+// unchecked_load: iterator + sentinel, no mask
+_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip>)
+[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, iter_value_t<_Ip>>
+unchecked_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {})
+{
+  using __result_t = __load_vec_t<_Vp, iter_value_t<_Ip>>;
+  return ::cuda::std::simd::unchecked_load<_Vp>(__first, __last, typename __result_t::mask_type(true), __f);
+}
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_LOAD_H
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h
new file mode 100644
index 00000000000..55c69c27863
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h
@@ -0,0 +1,232 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H
+#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__utility/in_range.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__simd/declaration.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__utility/integer_sequence.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// Bool-per-element mask storage for fixed_size ABI
+template <size_t _Bytes, __simd_size_type _Np>
+struct __mask_storage<_Bytes, __fixed_size<_Np>>
+{
+  static constexpr size_t __element_bytes = _Bytes;
+
+  bool __data[_Np]{}; // initialization required for constexpr constructor
+
+  [[nodiscard]] _CCCL_API constexpr bool __get(__simd_size_type __idx) const noexcept
+  {
+    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds");
+    return __data[__idx];
+  }
+
+  _CCCL_API constexpr void __set(__simd_size_type __idx, bool __v) noexcept
+  {
+    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds");
+    __data[__idx] = __v;
+  }
+};
+
+// Mask operations for fixed_size ABI with bool-per-element storage
+template <size_t _Bytes, __simd_size_type _Np>
+struct __mask_operations<_Bytes, __fixed_size<_Np>>
+{
+  using _MaskStorage = __mask_storage<_Bytes, __fixed_size<_Np>>;
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __v;
+    }
+    return __result;
+  }
+
+  template <typename _Generator, __simd_size_type... _Is>
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __generate_init(_Generator&& __g, integer_sequence<__simd_size_type, _Is...>)
+  {
+    _MaskStorage __result{};
+    ((__result.__data[_Is] = static_cast<bool>(__g(integral_constant<__simd_size_type, _Is>()))), ...);
+    return __result;
+  }
+
+  template <typename _Generator>
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g)
+  {
+    return __generate_init(__g, make_integer_sequence<__simd_size_type, _Np>());
+  }
+
+  // Logical operators (for operator&& and operator||)
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __logic_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __logic_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  // Bitwise operators (for operator&, operator|, operator^)
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] != __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = !__s.__data[__i];
+    }
+    return __result;
+  }
+
+  // Reductions
+
+  [[nodiscard]] _CCCL_API static constexpr bool __all(const _MaskStorage& __s) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      if (!__s.__data[__i])
+      {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr bool __any(const _MaskStorage& __s) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      if (__s.__data[__i])
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr __simd_size_type __count(const _MaskStorage& __s) noexcept
+  {
+    __simd_size_type __count = 0;
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __count += static_cast<__simd_size_type>(__s.__data[__i]);
+    }
+    return __count;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr __simd_size_type __min_index(const _MaskStorage& __s) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      if (__s.__data[__i])
+      {
+        return __i;
+      }
+    }
+    _CCCL_UNREACHABLE();
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr __simd_size_type __max_index(const _MaskStorage& __s) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = _Np - 1; __i >= 0; --__i)
+    {
+      if (__s.__data[__i])
+      {
+        return __i;
+      }
+    }
+    _CCCL_UNREACHABLE();
+  }
+};
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h
new file mode 100644
index 00000000000..a803c2bc510
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h
@@ -0,0 +1,351 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_VEC_H
+#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_VEC_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__utility/in_range.h>
+#include <cuda/std/__simd/declaration.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__utility/integer_sequence.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+template <__simd_size_type _Np>
+struct __fixed_size
+{
+  static_assert(_Np > 0, "_Np must be greater than 0");
+
+  static constexpr __simd_size_type __simd_size = _Np;
+};
+
+// Element-per-slot simd storage for fixed_size ABI
+template <typename _Tp, __simd_size_type _Np>
+struct __simd_storage<_Tp, __fixed_size<_Np>>
+{
+  using value_type = _Tp;
+
+  _Tp __data[_Np]{}; // initialization required for constexpr constructor
+
+  [[nodiscard]] _CCCL_API constexpr _Tp __get(__simd_size_type __idx) const noexcept
+  {
+    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds");
+    return __data[__idx];
+  }
+
+  _CCCL_API constexpr void __set(__simd_size_type __idx, _Tp __v) noexcept
+  {
+    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds");
+    __data[__idx] = __v;
+  }
+};
+
+// Simd operations for fixed_size ABI
+template <typename _Tp, __simd_size_type _Np>
+struct __simd_operations<_Tp, __fixed_size<_Np>>
+{
+  using _SimdStorage = __simd_storage<_Tp, __fixed_size<_Np>>;
+  using _MaskStorage = __mask_storage<sizeof(_Tp), __fixed_size<_Np>>;
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __v;
+    }
+    return __result;
+  }
+
+  template <typename _Generator, __simd_size_type... _Is>
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __generate_init(_Generator&& __g, integer_sequence<__simd_size_type, _Is...>)
+  {
+    return _SimdStorage{{__g(integral_constant<__simd_size_type, _Is>())...}};
+  }
+
+  template <typename _Generator>
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g)
+  {
+    return __generate_init(__g, make_integer_sequence<__simd_size_type, _Np>());
+  }
+
+  // Unary operations
+
+  _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      ++__s.__data[__i];
+    }
+  }
+
+  _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      --__s.__data[__i];
+    }
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage __negate(const _SimdStorage& __s) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = !__s.__data[__i];
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_not(const _SimdStorage& __s) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = ~__s.__data[__i];
+    }
+    return __result;
+  }
+
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = -__s.__data[__i];
+    }
+    return __result;
+  }
+  _CCCL_DIAG_POP
+
+  // Binary arithmetic operations
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] - __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] * __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __divides(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] / __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __modulo(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] % __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  // Comparison operations
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] == __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __not_equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] != __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __less(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] < __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __less_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] <= __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __greater(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] > __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __greater_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] >= __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  // Bitwise and shift operations
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __bitwise_and(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] & __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __bitwise_or(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] | __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __bitwise_xor(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] ^ __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __shift_left(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] << __rhs.__data[__i]);
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __shift_right(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+    _SimdStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = (__lhs.__data[__i] >> __rhs.__data[__i]);
+    }
+    return __result;
+  }
+};
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_VEC_H
diff --git a/libcudacxx/include/cuda/std/__simd/store.h b/libcudacxx/include/cuda/std/__simd/store.h
new file mode 100644
index 00000000000..ca4f3d615de
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/store.h
@@ -0,0 +1,240 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_STORE_H
+#define _CUDA_STD___SIMD_STORE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__iterator/concepts.h>
+#include <cuda/std/__iterator/distance.h>
+#include <cuda/std/__iterator/incrementable_traits.h>
+#include <cuda/std/__iterator/readable_traits.h>
+#include <cuda/std/__memory/pointer_traits.h>
+#include <cuda/std/__ranges/access.h>
+#include <cuda/std/__ranges/concepts.h>
+#include <cuda/std/__ranges/data.h>
+#include <cuda/std/__ranges/size.h>
+#include <cuda/std/__simd/basic_vec.h>
+#include <cuda/std/__simd/concepts.h>
+#include <cuda/std/__simd/flag.h>
+#include <cuda/std/__simd/utility.h>
+#include <cuda/std/__utility/cmp.h>
+#include <cuda/std/__utility/forward.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// [simd.loadstore] helper: core partial store to pointer + count + mask
+template <typename _Tp, typename _Abi, typename _Up, typename... _Flags>
+_CCCL_API constexpr void __partial_store_to_ptr(
+  const basic_vec<_Tp, _Abi>& __v,
+  _Up* __ptr,
+  __simd_size_type __count,
+  const typename basic_vec<_Tp, _Abi>::mask_type& __mask)
+{
+  static_assert(__is_vectorizable_v<_Up>, "range_value_t<R> must be a vectorizable type");
+  static_assert(__explicitly_convertible_to<_Up, _Tp>,
+                "value_type must satisfy explicitly-convertible-to<range_value_t<R>>");
+  static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Tp, _Up>,
+                "Conversion from value_type to range_value_t<R> is not value-preserving; use flag_convert");
+  ::cuda::std::simd::__assert_load_store_alignment<basic_vec<_Tp, _Abi>, _Up, _Flags...>(__ptr);
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (__simd_size_type __i = 0; __i < basic_vec<_Tp, _Abi>::size; ++__i)
+  {
+    if (__mask[__i] && __i < __count)
+    {
+      __ptr[__i] = static_cast<_Up>(__v[__i]);
+    }
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// [simd.loadstore] partial_store
+
+// partial_store: range, masked
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags)
+_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND
+                 __explicitly_convertible_to<ranges::range_value_t<_Range>, _Tp>)
+_CCCL_API constexpr void partial_store(
+  const basic_vec<_Tp, _Abi>& __v,
+  _Range&& __r,
+  const typename basic_vec<_Tp, _Abi>::mask_type& __mask,
+  flags<_Flags...> = {})
+{
+  static_assert(indirectly_writable<ranges::iterator_t<_Range>, ranges::range_value_t<_Range>>,
+                "ranges::iterator_t<R> must model indirectly_writable<ranges::range_value_t<R>>");
+  using _Up = ranges::range_value_t<_Range>;
+  ::cuda::std::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>(
+    __v, ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask);
+}
+
+// partial_store: range, no mask
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags)
+_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND
+                 __explicitly_convertible_to<ranges::range_value_t<_Range>, _Tp>)
+_CCCL_API constexpr void partial_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {})
+{
+  ::cuda::std::simd::partial_store(
+    __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f);
+}
+
+// partial_store: iterator + count, masked
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<iter_value_t<_Ip>, _Tp>)
+_CCCL_API constexpr void partial_store(
+  const basic_vec<_Tp, _Abi>& __v,
+  _Ip __first,
+  iter_difference_t<_Ip> __n,
+  const typename basic_vec<_Tp, _Abi>::mask_type& __mask,
+  flags<_Flags...> = {})
+{
+  static_assert(indirectly_writable<_Ip, iter_value_t<_Ip>>, "I must model indirectly_writable<iter_value_t<I>>");
+  using _Up = iter_value_t<_Ip>;
+  ::cuda::std::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>(
+    __v, ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask);
+}
+
+// partial_store: iterator + count, no mask
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<iter_value_t<_Ip>, _Tp>)
+_CCCL_API constexpr void
+partial_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, iter_difference_t<_Ip> __n, flags<_Flags...> __f = {})
+{
+  ::cuda::std::simd::partial_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f);
+}
+
+// partial_store: iterator + sentinel, masked
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip> _CCCL_AND
+                 __explicitly_convertible_to<iter_value_t<_Ip>, _Tp>)
+_CCCL_API constexpr void partial_store(
+  const basic_vec<_Tp, _Abi>& __v,
+  _Ip __first,
+  _Sp __last,
+  const typename basic_vec<_Tp, _Abi>::mask_type& __mask,
+  flags<_Flags...> = {})
+{
+  static_assert(indirectly_writable<_Ip, iter_value_t<_Ip>>, "I must model indirectly_writable<iter_value_t<I>>");
+  using _Up = iter_value_t<_Ip>;
+  ::cuda::std::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>(
+    __v,
+    ::cuda::std::to_address(__first),
+    static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)),
+    __mask);
+}
+
+// partial_store: iterator + sentinel, no mask
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip> _CCCL_AND
+                 __explicitly_convertible_to<iter_value_t<_Ip>, _Tp>)
+_CCCL_API constexpr void
+partial_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {})
+{
+  ::cuda::std::simd::partial_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// [simd.loadstore] unchecked_store
+
+// unchecked_store: range, masked
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags)
+_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND
+                 __explicitly_convertible_to<ranges::range_value_t<_Range>, _Tp>)
+_CCCL_API constexpr void unchecked_store(
+  const basic_vec<_Tp, _Abi>& __v,
+  _Range&& __r,
+  const typename basic_vec<_Tp, _Abi>::mask_type& __mask,
+  flags<_Flags...> __f = {})
+{
+  if constexpr (__has_static_size<_Range>)
+  {
+    static_assert(__static_range_size_v<_Range> >= basic_vec<_Tp, _Abi>::size(),
+                  "unchecked_store requires ranges::size(r) >= V::size()");
+  }
+  _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), __v.size),
+               "unchecked_store requires ranges::size(r) >= V::size()");
+  ::cuda::std::simd::partial_store(__v, ::cuda::std::forward<_Range>(__r), __mask, __f);
+}
+
+// unchecked_store: range, no mask
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags)
+_CCCL_REQUIRES(ranges::contiguous_range<_Range> _CCCL_AND ranges::sized_range<_Range> _CCCL_AND
+                 __explicitly_convertible_to<ranges::range_value_t<_Range>, _Tp>)
+_CCCL_API constexpr void unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {})
+{
+  ::cuda::std::simd::unchecked_store(
+    __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f);
+}
+
+// unchecked_store: iterator + count, masked
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<iter_value_t<_Ip>, _Tp>)
+_CCCL_API constexpr void unchecked_store(
+  const basic_vec<_Tp, _Abi>& __v,
+  _Ip __first,
+  iter_difference_t<_Ip> __n,
+  const typename basic_vec<_Tp, _Abi>::mask_type& __mask,
+  flags<_Flags...> __f = {})
+{
+  _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, __v.size), "unchecked_store requires n >= V::size()");
+  ::cuda::std::simd::partial_store(__v, __first, __n, __mask, __f);
+}
+
+// unchecked_store: iterator + count, no mask
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<iter_value_t<_Ip>, _Tp>)
+_CCCL_API constexpr void
+unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, iter_difference_t<_Ip> __n, flags<_Flags...> __f = {})
+{
+  ::cuda::std::simd::unchecked_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f);
+}
+
+// unchecked_store: iterator + sentinel, masked
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip> _CCCL_AND
+                 __explicitly_convertible_to<iter_value_t<_Ip>, _Tp>)
+_CCCL_API constexpr void unchecked_store(
+  const basic_vec<_Tp, _Abi>& __v,
+  _Ip __first,
+  _Sp __last,
+  const typename basic_vec<_Tp, _Abi>::mask_type& __mask,
+  flags<_Flags...> __f = {})
+{
+  _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), __v.size),
+               "unchecked_store requires distance(first, last) >= V::size()");
+  ::cuda::std::simd::partial_store(__v, __first, __last, __mask, __f);
+}
+
+// unchecked_store: iterator + sentinel, no mask
+_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags)
+_CCCL_REQUIRES(contiguous_iterator<_Ip> _CCCL_AND sized_sentinel_for<_Sp, _Ip> _CCCL_AND
+                 __explicitly_convertible_to<iter_value_t<_Ip>, _Tp>)
+_CCCL_API constexpr void
+unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {})
+{
+  ::cuda::std::simd::unchecked_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f);
+}
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_STORE_H
diff --git a/libcudacxx/include/cuda/std/__simd/type_traits.h b/libcudacxx/include/cuda/std/__simd/type_traits.h
new file mode 100644
index 00000000000..d169461ce9c
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/type_traits.h
@@ -0,0 +1,99 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_TYPE_TRAITS_H
+#define _CUDA_STD___SIMD_TYPE_TRAITS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__memory/is_valid_alignment.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__simd/abi.h>
+#include <cuda/std/__simd/declaration.h>
+#include <cuda/std/__simd/exposition.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// [simd.traits], alignment
+template <typename _Tp, typename _Up = typename _Tp::value_type>
+struct alignment;
+
+template <typename _Tp, typename _Abi, typename _Up>
+struct alignment<basic_vec<_Tp, _Abi>, _Up>
+    : integral_constant<size_t,
+                        ::cuda::__is_valid_alignment(__simd_size_v<_Tp, _Abi> * alignof(_Up))
+                          ? __simd_size_v<_Tp, _Abi> * alignof(_Up)
+                          : alignof(_Up)>
+{
+  static_assert(__is_vectorizable_v<_Up>, "U must be a vectorizable type");
+};
+
+template <typename _Tp, typename _Up = typename _Tp::value_type>
+constexpr size_t alignment_v = alignment<_Tp, _Up>::value;
+
+// [simd.traits], rebind
+template <typename _Tp, typename _Vp>
+struct rebind;
+
+template <typename _Tp, typename _Up, typename _Abi>
+struct rebind<_Tp, basic_vec<_Up, _Abi>>
+{
+  static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type");
+  using type = basic_vec<_Tp, __deduce_abi_t<_Tp, __simd_size_v<_Up, _Abi>>>;
+};
+
+template <typename _Tp, size_t _Bytes, typename _Abi>
+struct rebind<_Tp, basic_mask<_Bytes, _Abi>>
+{
+  static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type");
+  using __integer_t       = __integer_from<sizeof(_Tp)>;
+  using __integer_bytes_t = __integer_from<_Bytes>;
+
+  using type = basic_mask<sizeof(_Tp), __deduce_abi_t<__integer_t, __simd_size_v<__integer_bytes_t, _Abi>>>;
+};
+
+template <typename _Tp, typename _Vp>
+using rebind_t = typename rebind<_Tp, _Vp>::type;
+
+// [simd.traits], resize
+template <__simd_size_type _Np, typename _Vp>
+struct resize;
+
+template <__simd_size_type _Np, typename _Tp, typename _Abi>
+struct resize<_Np, basic_vec<_Tp, _Abi>>
+{
+  using type = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;
+};
+
+template <__simd_size_type _Np, size_t _Bytes, typename _Abi>
+struct resize<_Np, basic_mask<_Bytes, _Abi>>
+{
+  using type = basic_mask<_Bytes, __deduce_abi_t<__integer_from<_Bytes>, _Np>>;
+};
+
+template <__simd_size_type _Np, typename _Vp>
+using resize_t = typename resize<_Np, _Vp>::type;
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_TYPE_TRAITS_H
diff --git a/libcudacxx/include/cuda/std/__simd/utility.h b/libcudacxx/include/cuda/std/__simd/utility.h
new file mode 100644
index 00000000000..9b9c9f708c3
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/utility.h
@@ -0,0 +1,153 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_UTILITY_H
+#define _CUDA_STD___SIMD_UTILITY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__memory/is_aligned.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__fwd/span.h>
+#include <cuda/std/__ranges/concepts.h>
+#include <cuda/std/__simd/abi.h>
+#include <cuda/std/__simd/concepts.h>
+#include <cuda/std/__simd/flag.h>
+#include <cuda/std/__simd/specializations/fixed_size_vec.h>
+#include <cuda/std/__simd/type_traits.h>
+#include <cuda/std/__tuple_dir/tuple_size.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__type_traits/is_convertible.h>
+#include <cuda/std/__type_traits/remove_cvref.h>
+#include <cuda/std/__type_traits/void_t.h>
+#include <cuda/std/__utility/declval.h>
+#include <cuda/std/__utility/integer_sequence.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+template <typename _Tp>
+constexpr bool __is_abi_tag_v = false;
+
+template <__simd_size_type _Np>
+constexpr bool __is_abi_tag_v<__fixed_size<_Np>> = true;
+
+//----------------------------------------------------------------------------------------------------------------------
+// __can_generate_v
+
+template <typename _Tp, typename _Generator, __simd_size_type _Idx, typename = void>
+constexpr bool __is_well_formed = false;
+
+template <typename _Tp, typename _Generator, __simd_size_type _Idx>
+constexpr bool __is_well_formed<_Tp,
+                                _Generator,
+                                _Idx,
+                                void_t<decltype(declval<_Generator>()(integral_constant<__simd_size_type, _Idx>()))>> =
+  is_convertible_v<decltype(declval<_Generator>()(integral_constant<__simd_size_type, _Idx>())), _Tp>;
+
+template <typename _Tp, typename _Generator, __simd_size_type... _Indices>
+[[nodiscard]]
+_CCCL_API constexpr bool __can_generate(integer_sequence<__simd_size_type, _Indices...>) noexcept
+{
+  return (true && ... && __is_well_formed<_Tp, _Generator, _Indices>);
+}
+
+template <typename _Tp, typename _Generator, __simd_size_type _Size>
+constexpr bool __can_generate_v = __can_generate<_Tp, _Generator>(make_integer_sequence<__simd_size_type, _Size>());
+
+//----------------------------------------------------------------------------------------------------------------------
+// __is_compatible_range_v
+
+template <typename _Range, typename = void>
+constexpr bool __has_tuple_size_v = false;
+
+template <typename _Range>
+constexpr bool __has_tuple_size_v<_Range, void_t<decltype(tuple_size<remove_cvref_t<_Range>>::value)>> = true;
+
+template <typename _Range, typename = void>
+constexpr bool __has_static_extent_v = false;
+
+template <typename _Range>
+constexpr bool __has_static_extent_v<_Range, void_t<decltype(remove_cvref_t<_Range>::extent)>> =
+  remove_cvref_t<_Range>::extent != dynamic_extent;
+
+// Proxy for ranges::size(r) is a constant expression.
+template <typename _Range>
+_CCCL_CONCEPT __has_static_size = __has_tuple_size_v<_Range> || __has_static_extent_v<_Range>;
+
+template <typename _Range>
+[[nodiscard]] _CCCL_API constexpr __simd_size_type __get_static_range_size() noexcept
+{
+  using __range_t = remove_cvref_t<_Range>;
+  if constexpr (__has_tuple_size_v<_Range>)
+  {
+    return __simd_size_type{tuple_size_v<__range_t>};
+  }
+  else
+  {
+    return __simd_size_type{__range_t::extent};
+  }
+}
+
+template <typename _Range>
+constexpr __simd_size_type __static_range_size_v = __get_static_range_size<_Range>();
+
+// This trait is defined at namespace scope (not as a static member of basic_vec) because GCC 13 rejects partial
+// specialization of static member variable templates. The static-size detection intentionally avoids directly using
+// tuple_size_v<T> in the guard because that causes a hard error (instead of SFINAE) on NVCC with
+// clang-19/clang-14/nvc++ when T is an incomplete specialization of tuple_size.
+template <typename _Range>
+constexpr bool __is_compatible_range_guard_v =
+  __has_static_size<_Range> && ranges::contiguous_range<_Range> && ranges::sized_range<_Range>;
+
+template <typename _Tp, __simd_size_type _Size, typename _Range, bool = __is_compatible_range_guard_v<_Range>>
+constexpr bool __is_compatible_range_v = false;
+
+template <typename _Tp, __simd_size_type _Size, typename _Range>
+constexpr bool __is_compatible_range_v<_Tp, _Size, _Range, true> =
+  (__static_range_size_v<_Range> == _Size) //
+  && __is_vectorizable_v<ranges::range_value_t<_Range>> //
+  && __explicitly_convertible_to<_Tp, ranges::range_value_t<_Range>>;
+
+//----------------------------------------------------------------------------------------------------------------------
+// [simd.flags] alignment assertion for load/store pointers
+
+template <typename _Vec, typename _Up, typename... _Flags>
+_CCCL_API constexpr void __assert_load_store_alignment([[maybe_unused]] const _Up* __data) noexcept
+{
+  _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+  {
+    if constexpr (__has_aligned_flag_v<_Flags...>)
+    {
+      _CCCL_ASSERT(::cuda::is_aligned(__data, alignment_v<_Vec, _Up>),
+                   "flag_aligned requires data to be aligned to alignment_v<V, range_value_t<R>>");
+    }
+    else if constexpr (__has_overaligned_flag_v<_Flags...>)
+    {
+      _CCCL_ASSERT(::cuda::is_aligned(__data, __overaligned_alignment_v<_Flags...>),
+                   "flag_overaligned<N> requires data to be aligned to N");
+    }
+  }
+}
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_UTILITY_H
diff --git a/libcudacxx/include/cuda/std/__simd_ b/libcudacxx/include/cuda/std/__simd_
new file mode 100644
index 00000000000..02b28686752
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd_
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD_SIMD
+#define _CUDA_STD_SIMD
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__simd/abi.h>
+#include <cuda/std/__simd/basic_mask.h>
+#include <cuda/std/__simd/basic_vec.h>
+#include <cuda/std/__simd/declaration.h>
+#include <cuda/std/__simd/flag.h>
+#include <cuda/std/__simd/type_traits.h>
+
+#endif // _CUDA_STD_SIMD
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/flags.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/flags.pass.cpp
new file mode 100644
index 00000000000..5b2ff1dc3bf
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/flags.pass.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.flags], load and store flags
+//
+// template<class... Flags> struct flags;
+//
+// inline constexpr flags<> flag_default{};
+// inline constexpr flags<convert-flag> flag_convert{};
+// inline constexpr flags<aligned-flag> flag_aligned{};
+// template<size_t N> constexpr flags<overaligned-flag<N>> flag_overaligned{};
+//
+// [simd.flags.oper], flags operators
+// template<class... Other> friend constexpr flags<Flags..., Other...> operator|(flags, flags<Other...>);
+
+#include <cuda/std/__simd_>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+template <typename T, typename R>
+__host__ __device__ constexpr bool is_same_flags(const T&, const R&)
+{
+  return cuda::std::is_same_v<cuda::std::remove_cvref_t<T>, cuda::std::remove_cvref_t<R>>;
+}
+
+__host__ __device__ void test()
+{
+  static_assert(is_same_flags(simd::flag_default, simd::flags<>{}));
+  // default | X == X
+  static_assert(is_same_flags(simd::flag_default | simd::flag_convert, simd::flag_convert));
+  static_assert(is_same_flags(simd::flag_default | simd::flag_aligned, simd::flag_aligned));
+  static_assert(is_same_flags(simd::flag_default | simd::flag_overaligned<32>, simd::flag_overaligned<32>));
+
+  // X | default == X
+  static_assert(is_same_flags(simd::flag_convert | simd::flag_default, simd::flag_convert));
+  static_assert(is_same_flags(simd::flag_aligned | simd::flag_default, simd::flag_aligned));
+  static_assert(is_same_flags(simd::flag_overaligned<32> | simd::flag_default, simd::flag_overaligned<32>));
+
+  // two distinct flags
+  static_assert(is_same_flags(simd::flag_convert | simd::flag_aligned, simd::flag_convert | simd::flag_aligned));
+  static_assert(is_same_flags(simd::flag_aligned | simd::flag_convert, simd::flag_aligned | simd::flag_convert));
+  static_assert(
+    is_same_flags(simd::flag_convert | simd::flag_overaligned<32>, simd::flag_convert | simd::flag_overaligned<32>));
+  static_assert(
+    is_same_flags(simd::flag_overaligned<32> | simd::flag_convert, simd::flag_overaligned<32> | simd::flag_convert));
+}
+
+int main(int, char**)
+{
+  test();
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/multiple_overaligned.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/multiple_overaligned.fail.cpp
new file mode 100644
index 00000000000..ab5e00a0384
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/multiple_overaligned.fail.cpp
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// flags<Flags...> allows at most one overaligned_flag<N>.
+
+#include <cuda/std/__simd_>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+int main(int, char**)
+{
+  [[maybe_unused]] auto bad = simd::flag_overaligned<8> | simd::flag_overaligned<16>; // expected-error
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_flag_type.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_flag_type.fail.cpp
new file mode 100644
index 00000000000..e6db1323f17
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_flag_type.fail.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// flags<Flags...> requires every type in the pack to be one of
+// convert_flag, aligned_flag, or overaligned_flag<N>.
+
+#include <cuda/std/__simd_>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  [[maybe_unused]] cuda::std::simd::flags<int> bad{}; // expected-error
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_power_of_two_overaligned.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_power_of_two_overaligned.fail.cpp
new file mode 100644
index 00000000000..43514208373
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.flags/non_power_of_two_overaligned.fail.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// overaligned_flag<N> requires N to be a power of two.
+
+#include <cuda/std/__simd_>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  [[maybe_unused]] auto bad = cuda::std::simd::flag_overaligned<3>; // expected-error
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/binary.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/binary.pass.cpp
new file mode 100644
index 00000000000..b321e44c520
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/binary.pass.cpp
@@ -0,0 +1,138 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.mask.binary], basic_mask binary operators
+//
+// friend constexpr basic_mask operator&&(const basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask operator||(const basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask operator&(const basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask operator|(const basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask operator^(const basic_mask&, const basic_mask&) noexcept;
+
+#include "../simd_test_utils.h"
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_all_patterns()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Mask all_true(true);
+  Mask all_false(false);
+
+  static_assert(cuda::std::is_same_v<decltype(all_true && all_false), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(all_true || all_false), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(all_true & all_false), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(all_true | all_false), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(all_true ^ all_false), Mask>);
+  static_assert(noexcept(all_true && all_false));
+  static_assert(noexcept(all_true || all_false));
+  static_assert(noexcept(all_true & all_false));
+  static_assert(noexcept(all_true | all_false));
+  static_assert(noexcept(all_true ^ all_false));
+  // logical AND
+  {
+    Mask and_result_tt = all_true && all_true;
+    Mask and_result_ft = all_false && all_true;
+    Mask and_result_tf = all_true && all_false;
+    Mask and_result_ff = all_false && all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(and_result_tt[i] == true);
+      assert(and_result_ft[i] == false);
+      assert(and_result_tf[i] == false);
+      assert(and_result_ff[i] == false);
+    }
+  }
+  // logical OR
+  {
+    Mask or_result_tt = all_true || all_true;
+    Mask or_result_ft = all_false || all_true;
+    Mask or_result_tf = all_true || all_false;
+    Mask or_result_ff = all_false || all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(or_result_tt[i] == true);
+      assert(or_result_ft[i] == true);
+      assert(or_result_tf[i] == true);
+      assert(or_result_ff[i] == false);
+    }
+  }
+  // bitwise AND
+  {
+    Mask bit_and_result_tt = all_true & all_true;
+    Mask bit_and_result_ft = all_false & all_true;
+    Mask bit_and_result_tf = all_true & all_false;
+    Mask bit_and_result_ff = all_false & all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(bit_and_result_tt[i] == true);
+      assert(bit_and_result_ft[i] == false);
+      assert(bit_and_result_tf[i] == false);
+      assert(bit_and_result_ff[i] == false);
+    }
+  }
+  // bitwise OR
+  {
+    Mask bit_or_result_tt = all_true | all_true;
+    Mask bit_or_result_ft = all_false | all_true;
+    Mask bit_or_result_tf = all_true | all_false;
+    Mask bit_or_result_ff = all_false | all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(bit_or_result_tt[i] == true);
+      assert(bit_or_result_ft[i] == true);
+      assert(bit_or_result_tf[i] == true);
+      assert(bit_or_result_ff[i] == false);
+    }
+  }
+  // bitwise XOR
+  {
+    Mask bit_xor_result_tt = all_true ^ all_true;
+    Mask bit_xor_result_ft = all_false ^ all_true;
+    Mask bit_xor_result_tf = all_true ^ all_false;
+    Mask bit_xor_result_ff = all_false ^ all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(bit_xor_result_tt[i] == false);
+      assert(bit_xor_result_ft[i] == true);
+      assert(bit_xor_result_tf[i] == true);
+      assert(bit_xor_result_ff[i] == false);
+    }
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <int Bytes>
+__host__ __device__ constexpr void test_bytes()
+{
+  test_all_patterns<Bytes, 1>();
+  test_all_patterns<Bytes, 4>();
+}
+
+__host__ __device__ constexpr bool test()
+{
+  test_bytes<1>();
+  test_bytes<2>();
+  test_bytes<4>();
+  test_bytes<8>();
+#if _CCCL_HAS_INT128()
+  test_bytes<16>();
+#endif
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/comparison.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/comparison.pass.cpp
new file mode 100644
index 00000000000..fd9d6ff21b4
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/comparison.pass.cpp
@@ -0,0 +1,159 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.mask.comparison], basic_mask comparisons (element-wise)
+//
+// friend constexpr basic_mask operator==(const basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask operator!=(const basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask operator>=(const basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask operator<=(const basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask operator>(const basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask operator<(const basic_mask&, const basic_mask&) noexcept;
+
+#include "../simd_test_utils.h"
+
+//----------------------------------------------------------------------------------------------------------------------
+// identical masks
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_all_patterns()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Mask all_true(true);
+  Mask all_false(false);
+
+  static_assert(cuda::std::is_same_v<decltype(all_true == all_true), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(all_true != all_false), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(all_true >= all_false), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(all_true <= all_false), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(all_true > all_false), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(all_true < all_false), Mask>);
+  static_assert(noexcept(all_true == all_false));
+  static_assert(noexcept(all_true != all_false));
+  static_assert(noexcept(all_true >= all_false));
+  static_assert(noexcept(all_true <= all_false));
+  static_assert(noexcept(all_true > all_false));
+  static_assert(noexcept(all_true < all_false));
+
+  // operator==
+  {
+    Mask eq_result_tt = all_true == all_true;
+    Mask eq_result_ft = all_false == all_true;
+    Mask eq_result_tf = all_true == all_false;
+    Mask eq_result_ff = all_false == all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(eq_result_tt[i] == true);
+      assert(eq_result_ft[i] == false);
+      assert(eq_result_tf[i] == false);
+      assert(eq_result_ff[i] == true);
+    }
+  }
+  // operator!=
+  {
+    Mask ne_result_tt = all_true != all_true;
+    Mask ne_result_ft = all_false != all_true;
+    Mask ne_result_tf = all_true != all_false;
+    Mask ne_result_ff = all_false != all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(ne_result_tt[i] == false);
+      assert(ne_result_ft[i] == true);
+      assert(ne_result_tf[i] == true);
+      assert(ne_result_ff[i] == false);
+    }
+  }
+  // operator>=
+  {
+    Mask ge_result_tt = all_true >= all_true;
+    Mask ge_result_ft = all_false >= all_true;
+    Mask ge_result_tf = all_true >= all_false;
+    Mask ge_result_ff = all_false >= all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(ge_result_tt[i] == true);
+      assert(ge_result_ft[i] == false);
+      assert(ge_result_tf[i] == true);
+      assert(ge_result_ff[i] == true);
+    }
+  }
+  // operator<=
+  {
+    Mask le_result_tt = all_true <= all_true;
+    Mask le_result_ft = all_false <= all_true;
+    Mask le_result_tf = all_true <= all_false;
+    Mask le_result_ff = all_false <= all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(le_result_tt[i] == true);
+      assert(le_result_ft[i] == true);
+      assert(le_result_tf[i] == false);
+      assert(le_result_ff[i] == true);
+    }
+  }
+  // operator>
+  {
+    Mask gt_result_tt = all_true > all_true;
+    Mask gt_result_ft = all_false > all_true;
+    Mask gt_result_tf = all_true > all_false;
+    Mask gt_result_ff = all_false > all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(gt_result_tt[i] == false);
+      assert(gt_result_ft[i] == false);
+      assert(gt_result_tf[i] == true);
+      assert(gt_result_ff[i] == false);
+    }
+  }
+  // operator<
+  {
+    Mask lt_result_tt = all_true < all_true;
+    Mask lt_result_ft = all_false < all_true;
+    Mask lt_result_tf = all_true < all_false;
+    Mask lt_result_ff = all_false < all_false;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(lt_result_tt[i] == false);
+      assert(lt_result_ft[i] == true);
+      assert(lt_result_tf[i] == false);
+      assert(lt_result_ff[i] == false);
+    }
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <int Bytes>
+__host__ __device__ constexpr void test_bytes()
+{
+  test_all_patterns<Bytes, 1>();
+  test_all_patterns<Bytes, 4>();
+}
+
+__host__ __device__ constexpr bool test()
+{
+  test_bytes<1>();
+  test_bytes<2>();
+  test_bytes<4>();
+  test_bytes<8>();
+#if _CCCL_HAS_INT128()
+  test_bytes<16>();
+#endif
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/compound_assign.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/compound_assign.pass.cpp
new file mode 100644
index 00000000000..de635f4ac0d
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/compound_assign.pass.cpp
@@ -0,0 +1,107 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.mask.cassign], basic_mask compound assignment
+//
+// friend constexpr basic_mask& operator&=(basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask& operator|=(basic_mask&, const basic_mask&) noexcept;
+// friend constexpr basic_mask& operator^=(basic_mask&, const basic_mask&) noexcept;
+
+#include "../simd_test_utils.h"
+
+template <int Bytes>
+__host__ __device__ constexpr void test_and()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<4>>;
+  Mask a(is_even{});
+  Mask b(is_first_half{});
+
+  static_assert(cuda::std::is_same_v<decltype(a &= b), Mask&>);
+  static_assert(noexcept(a &= b));
+
+  a &= b;
+  assert(a[0] == true);
+  assert(a[1] == false);
+  assert(a[2] == false);
+  assert(a[3] == false);
+}
+
+template <int Bytes>
+__host__ __device__ constexpr void test_or()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<4>>;
+  Mask a(is_even{});
+  Mask b(is_first_half{});
+
+  static_assert(cuda::std::is_same_v<decltype(a |= b), Mask&>);
+  static_assert(noexcept(a |= b));
+
+  a |= b;
+  assert(a[0] == true);
+  assert(a[1] == true);
+  assert(a[2] == true);
+  assert(a[3] == false);
+}
+
+template <int Bytes>
+__host__ __device__ constexpr void test_xor()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<4>>;
+  Mask a(is_even{});
+  Mask b(is_first_half{});
+
+  static_assert(cuda::std::is_same_v<decltype(a ^= b), Mask&>);
+  static_assert(noexcept(a ^= b));
+
+  a ^= b;
+  assert(a[0] == false);
+  assert(a[1] == true);
+  assert(a[2] == true);
+  assert(a[3] == false);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+__host__ __device__ constexpr bool test()
+{
+  test_and<1>();
+  test_and<2>();
+  test_and<4>();
+  test_and<8>();
+#if _CCCL_HAS_INT128()
+  test_and<16>();
+#endif
+
+  test_or<1>();
+  test_or<2>();
+  test_or<4>();
+  test_or<8>();
+#if _CCCL_HAS_INT128()
+  test_or<16>();
+#endif
+
+  test_xor<1>();
+  test_xor<2>();
+  test_xor<4>();
+  test_xor<8>();
+#if _CCCL_HAS_INT128()
+  test_xor<16>();
+#endif
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/conversion.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/conversion.pass.cpp
new file mode 100644
index 00000000000..b158e21f6c0
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/conversion.pass.cpp
@@ -0,0 +1,194 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.mask.conv], basic_mask conversions
+//
+// explicit operator basic_vec<U, A>() const noexcept;   // sizeof(U) != Bytes
+// operator basic_vec<U, A>() const noexcept;            // sizeof(U) == Bytes (implicit)
+// constexpr bitset<size()> to_bitset() const noexcept;
+// constexpr unsigned long long to_ullong() const;
+
+#include "../simd_test_utils.h"
+
+//----------------------------------------------------------------------------------------------------------------------
+// implicit conversion to basic_vec (sizeof(U) == Bytes)
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_implicit_conv()
+{
+  using Mask = simd::basic_mask<sizeof(T), simd::fixed_size<N>>;
+  using Vec  = simd::basic_vec<T, simd::fixed_size<N>>;
+  Mask mask(is_even{});
+
+  static_assert(cuda::std::is_convertible_v<const Mask&, Vec>);
+  static_assert(noexcept(static_cast<Vec>(mask)));
+
+  Vec vec = mask;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == static_cast<T>(i % 2 == 0));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// explicit conversion to basic_vec (sizeof(U) != Bytes)
+
+template <int Bytes, typename U, int N>
+__host__ __device__ constexpr void test_explicit_conv()
+{
+  static_assert(sizeof(U) != Bytes);
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  using Vec  = simd::basic_vec<U, simd::fixed_size<N>>;
+  Mask mask(is_even{});
+
+  static_assert(!cuda::std::is_convertible_v<const Mask&, Vec>);
+  static_assert(cuda::std::is_same_v<decltype(static_cast<Vec>(mask)), Vec>);
+  static_assert(noexcept(static_cast<Vec>(mask)));
+
+  Vec vec = static_cast<Vec>(mask);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == static_cast<U>(i % 2 == 0));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// to_bitset
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_to_bitset()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Mask mask(true);
+
+  static_assert(cuda::std::is_same_v<decltype(mask.to_bitset()), cuda::std::bitset<N>>);
+  static_assert(noexcept(mask.to_bitset()));
+  static_assert(is_const_member_function_v<decltype(&Mask::to_bitset)>);
+  unused(mask);
+
+  Mask all_false(false);
+  auto bitset_false = all_false.to_bitset();
+  assert(bitset_false.none());
+
+  Mask all_true(true);
+  auto bitset_true = all_true.to_bitset();
+  assert(bitset_true.all());
+
+  Mask mixed(is_even{});
+  auto bitset = mixed.to_bitset();
+  for (int i = 0; i < N; ++i)
+  {
+    assert(bitset[i] == (i % 2 == 0));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// to_ullong
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_to_ullong()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Mask mask(true);
+
+  static_assert(cuda::std::is_same_v<decltype(mask.to_ullong()), unsigned long long>);
+  static_assert(!noexcept(mask.to_ullong()));
+  static_assert(is_const_member_function_v<decltype(&Mask::to_ullong)>);
+  unused(mask);
+
+  Mask all_false(false);
+  assert(all_false.to_ullong() == 0ULL);
+
+  if constexpr (N <= 64)
+  {
+    Mask all_true(true);
+    constexpr unsigned long long expected = (N == 64) ? ~0ULL : (~0ULL >> (64 - N));
+    assert(all_true.to_ullong() == expected);
+
+    Mask mixed(is_even{});
+    unsigned long long expected_mixed = 0ULL;
+    for (int i = 0; i < N; ++i)
+    {
+      if (i % 2 == 0)
+      {
+        expected_mixed |= (1ULL << i);
+      }
+    }
+    assert(mixed.to_ullong() == expected_mixed);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// SFINAE constraints
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_sfinae_negative()
+{
+  using Mask    = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  using Integer = integer_from_t<Bytes>;
+
+  // mismatched element count: conversion is fully rejected
+  using WrongSizeVec = simd::basic_vec<Integer, simd::fixed_size<N + 1>>;
+  static_assert(!cuda::std::is_constructible_v<WrongSizeVec, const Mask&>);
+  static_assert(!cuda::std::is_convertible_v<const Mask&, WrongSizeVec>);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_size()
+{
+  test_to_bitset<Bytes, N>();
+  test_to_ullong<Bytes, N>();
+  test_sfinae_negative<Bytes, N>();
+}
+
+template <int Bytes>
+__host__ __device__ constexpr void test_bytes()
+{
+  test_size<Bytes, 1>();
+  test_size<Bytes, 4>();
+}
+
+__host__ __device__ constexpr bool test()
+{
+  test_bytes<1>();
+  test_bytes<2>();
+  test_bytes<4>();
+  test_bytes<8>();
+#if _CCCL_HAS_INT128()
+  test_bytes<16>();
+#endif
+
+  test_implicit_conv<int, 1>();
+  test_implicit_conv<int, 8>();
+  test_implicit_conv<short, 4>();
+  test_implicit_conv<long long, 4>();
+
+  test_explicit_conv<4, short, 4>();
+  test_explicit_conv<2, int, 4>();
+  test_explicit_conv<8, int, 4>();
+  test_explicit_conv<4, long long, 4>();
+  test_explicit_conv<1, short, 4>();
+  test_explicit_conv<2, long long, 8>();
+
+  test_to_ullong<1, 64>();
+  test_to_ullong<1, 65>();
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp
new file mode 100644
index 00000000000..def96e06e22
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/ctor.pass.cpp
@@ -0,0 +1,271 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.mask.ctor], basic_mask constructors
+//
+// constexpr explicit basic_mask(value_type) noexcept;                          // broadcast
+// constexpr explicit basic_mask(const basic_mask<UBytes, UAbi>&) noexcept;     // converting
+// constexpr explicit basic_mask(Generator&&);                                  // generator
+// constexpr basic_mask(const bitset<size()>&) noexcept;                        // bitset
+// constexpr explicit basic_mask(unsigned-integer) noexcept;                    // unsigned integer
+
+#include <cuda/std/bitset>
+#include <cuda/std/type_traits>
+
+#include "../simd_test_utils.h"
+
+//----------------------------------------------------------------------------------------------------------------------
+// member types and size
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_member_types()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+
+  static_assert(cuda::std::is_same_v<typename Mask::value_type, bool>);
+  static_assert(cuda::std::is_same_v<typename Mask::abi_type, simd::fixed_size<N>>);
+  static_assert(Mask::size() == N);
+  static_assert(cuda::std::is_trivially_copyable_v<Mask>);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// default construction: value-initializes all elements to false
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_default_ctor()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Mask mask{};
+  for (int i = 0; i < N; ++i)
+  {
+    assert(mask[i] == false);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// copy construction and copy assignment
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_copy()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Mask original(is_even{});
+
+  Mask copied(original);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(copied[i] == original[i]);
+  }
+
+  Mask assigned(false);
+  assigned = original;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(assigned[i] == original[i]);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// broadcast constructor
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_broadcast()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  static_assert(noexcept(Mask(true)));
+
+  Mask all_true(true);
+  Mask all_false(false);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(all_true[i] == true);
+    assert(all_false[i] == false);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// converting constructor
+
+template <int Bytes, int UBytes, int N>
+__host__ __device__ constexpr void test_converting()
+{
+  using Src = simd::basic_mask<UBytes, simd::fixed_size<N>>;
+  using Dst = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Src src(is_even{});
+  static_assert(noexcept(Dst(src)));
+
+  Dst dst(src);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(dst[i] == src[i]);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// generator constructor
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_generator()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+#if _CCCL_COMPILER(GCC, !=, 7)
+  static_assert(!noexcept(Mask(is_even{})));
+#endif
+
+  Mask mask(is_even{});
+  for (int i = 0; i < N; ++i)
+  {
+    assert(mask[i] == (i % 2 == 0));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// bitset constructor
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_bitset()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  cuda::std::bitset<N> bitset;
+  static_assert(noexcept(Mask(bitset)));
+
+  for (int i = 0; i < N; ++i)
+  {
+    bitset.set(i, (i % 2 == 0));
+  }
+  Mask mask(bitset);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(mask[i] == (i % 2 == 0));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// unsigned integer constructor
+
+template <int Bytes, int N, typename U>
+__host__ __device__ constexpr void test_unsigned_int()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  static_assert(noexcept(Mask(U{0})));
+
+  Mask mask(U{0});
+  for (int i = 0; i < N; ++i)
+  {
+    assert(mask[i] == false);
+  }
+
+  constexpr int num_bits  = cuda::std::__num_bits_v<U>;
+  constexpr int mask_bits = cuda::std::min(N, num_bits);
+  Mask all_one(static_cast<U>(~U{0}));
+  for (int i = 0; i < mask_bits; ++i)
+  {
+    assert(all_one[i] == true);
+  }
+
+  if constexpr (N >= 4)
+  {
+    Mask m_pat(U{0b101});
+    assert(m_pat[0] == true);
+    assert(m_pat[1] == false);
+    assert(m_pat[2] == true);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// SFINAE and explicit constraints
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_sfinae()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+
+  // broadcast: only accepts bool
+  static_assert(!cuda::std::is_constructible_v<Mask, int>);
+  // broadcast: must be explicit
+  static_assert(!cuda::std::is_convertible_v<bool, Mask>);
+
+  // converting: requires matching element count
+  using MaskDifferentSize = simd::basic_mask<Bytes, simd::fixed_size<N + 1>>;
+  static_assert(!cuda::std::is_constructible_v<Mask, const MaskDifferentSize&>);
+  // converting: must be explicit
+  using MaskOtherBytes = simd::basic_mask<(Bytes == 1 ? 2 : 1), simd::fixed_size<N>>;
+  static_assert(!cuda::std::is_convertible_v<const MaskOtherBytes&, Mask>);
+
+  // generator: rejects non-callable types
+  static_assert(!cuda::std::is_constructible_v<Mask, wrong_generator>);
+  // generator: must be explicit
+  static_assert(!cuda::std::is_convertible_v<is_even, Mask>);
+
+  // bitset: requires matching size
+  static_assert(!cuda::std::is_constructible_v<Mask, cuda::std::bitset<N + 1>>);
+  // bitset: is implicit
+  static_assert(cuda::std::is_convertible_v<const cuda::std::bitset<N>&, Mask>);
+
+  // unsigned integer: must be explicit
+  static_assert(!cuda::std::is_convertible_v<uint32_t, Mask>);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_size()
+{
+  test_member_types<Bytes, N>();
+  test_default_ctor<Bytes, N>();
+  test_copy<Bytes, N>();
+  test_broadcast<Bytes, N>();
+  test_generator<Bytes, N>();
+  test_bitset<Bytes, N>();
+  test_unsigned_int<Bytes, N, uint8_t>();
+  test_unsigned_int<Bytes, N, uint16_t>();
+  test_unsigned_int<Bytes, N, uint32_t>();
+  test_unsigned_int<Bytes, N, uint64_t>();
+  test_sfinae<Bytes, N>();
+}
+
+template <int Bytes>
+__host__ __device__ constexpr void test_bytes()
+{
+  test_size<Bytes, 1>();
+  test_size<Bytes, 4>();
+}
+
+__host__ __device__ constexpr bool test()
+{
+  test_bytes<1>();
+  test_bytes<2>();
+  test_bytes<4>();
+  test_bytes<8>();
+#if _CCCL_HAS_INT128()
+  test_bytes<16>();
+#endif
+
+  // test_converting  N1: Destination type size, N2: Source type size, N3: Mask number of elements
+  test_converting<4, 2, 4>(); // 4 -> 2, 4 elements
+  test_converting<2, 4, 4>(); // 2 -> 4, 4 elements
+  test_converting<1, 8, 4>(); // 1 -> 8, 4 elements
+  test_converting<8, 1, 4>(); // 8 -> 1, 4 elements
+  test_converting<4, 4, 4>(); // 4 -> 4, 4 elements
+
+  test_converting<1, 2, 1>(); // 1 -> 2, 1 element
+  test_converting<1, 2, 2>(); // 1 -> 2, 2 elements
+  test_converting<1, 2, 8>(); // 1 -> 2, 8 elements
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/instantiation.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/instantiation.fail.cpp
new file mode 100644
index 00000000000..b3c25eaa78d
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/instantiation.fail.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// basic_mask<Bytes, Abi> requires Bytes to map to a valid integer type (1, 2, 4, 8, or 16 with __int128).
+
+#include <cuda/std/__simd_>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  using Mask = cuda::std::simd::basic_mask<3, cuda::std::simd::fixed_size<4>>;
+  Mask mask(true); // expected-error
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/subscript.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/subscript.pass.cpp
new file mode 100644
index 00000000000..22f176d530f
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/subscript.pass.cpp
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.mask.subscr], basic_mask subscript operators
+//
+// constexpr value_type operator[](simd-size-type) const noexcept;
+
+#include "../simd_test_utils.h"
+
+//----------------------------------------------------------------------------------------------------------------------
+// subscript read-back
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_subscript()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Mask mask(true);
+
+  static_assert(cuda::std::is_same_v<decltype(mask[0]), typename Mask::value_type>);
+  static_assert(!noexcept(mask[0]));
+  static_assert(is_const_member_function_v<decltype(&Mask::operator[])>);
+  unused(mask);
+
+  Mask all_true(true);
+  Mask all_false(false);
+  Mask alternating(is_even{});
+  for (int i = 0; i < N; ++i)
+  {
+    assert(all_true[i] == true);
+    assert(all_false[i] == false);
+    assert(alternating[i] == (i % 2 == 0));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <int Bytes>
+__host__ __device__ constexpr void test_bytes()
+{
+  test_subscript<Bytes, 1>();
+  test_subscript<Bytes, 4>();
+}
+
+__host__ __device__ constexpr bool test()
+{
+  test_bytes<1>();
+  test_bytes<2>();
+  test_bytes<4>();
+  test_bytes<8>();
+#if _CCCL_HAS_INT128()
+  test_bytes<16>();
+#endif
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/unary.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/unary.pass.cpp
new file mode 100644
index 00000000000..6d0c6d737d5
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.mask.class/unary.pass.cpp
@@ -0,0 +1,162 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.mask.unary], basic_mask unary operators
+//
+// constexpr basic_mask operator!() const noexcept;
+// constexpr basic_vec<integer-from<Bytes>, Abi> operator+() const noexcept;
+// constexpr basic_vec<integer-from<Bytes>, Abi> operator-() const noexcept;
+// constexpr basic_vec<integer-from<Bytes>, Abi> operator~() const noexcept;
+
+#include "../simd_test_utils.h"
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator!
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_logical_not()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Mask mask(true);
+  static_assert(cuda::std::is_same_v<decltype(!mask), Mask>);
+  static_assert(noexcept(!mask));
+  static_assert(is_const_member_function_v<decltype(&Mask::operator!)>);
+  unused(mask);
+
+  Mask all_true(true);
+  Mask all_false(false);
+  Mask mixed(is_even{});
+  for (int i = 0; i < N; ++i)
+  {
+    assert((!all_true)[i] == false);
+    assert((!all_false)[i] == true);
+    assert((!mixed)[i] == (i % 2 != 0));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator+
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_unary_plus()
+{
+  using Mask    = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  using Integer = integer_from_t<Bytes>;
+  using Vec     = simd::basic_vec<Integer, simd::fixed_size<N>>;
+  Mask mask(is_even{});
+  static_assert(cuda::std::is_same_v<decltype(+mask), Vec>);
+  static_assert(noexcept(+mask));
+
+  auto vec = +mask;
+  for (int i = 0; i < N; ++i)
+  {
+    if (i % 2 == 0)
+    {
+      assert(vec[i] == 1);
+    }
+    else
+    {
+      assert(vec[i] == 0);
+    }
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator-
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_unary_minus()
+{
+  using Mask    = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  using Integer = integer_from_t<Bytes>;
+  using Vec     = simd::basic_vec<Integer, simd::fixed_size<N>>;
+  Mask mask(is_even{});
+  static_assert(cuda::std::is_same_v<decltype(-mask), Vec>);
+  static_assert(noexcept(-mask));
+
+  Vec vec = -mask;
+  for (int i = 0; i < N; ++i)
+  {
+    if (i % 2 == 0)
+    {
+      assert(vec[i] == static_cast<Integer>(-Integer{1}));
+    }
+    else
+    {
+      assert(vec[i] == Integer{0});
+    }
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator~
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_bitwise_not()
+{
+  using Mask    = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  using Integer = integer_from_t<Bytes>;
+  using Vec     = simd::basic_vec<Integer, simd::fixed_size<N>>;
+  Mask mask(is_even{});
+  static_assert(cuda::std::is_same_v<decltype(~mask), Vec>);
+  static_assert(noexcept(~mask));
+
+  Vec vec = ~mask;
+  for (int i = 0; i < N; ++i)
+  {
+    if (i % 2 == 0)
+    {
+      assert(vec[i] == static_cast<Integer>(~Integer{1}));
+    }
+    else
+    {
+      assert(vec[i] == static_cast<Integer>(~Integer{0}));
+    }
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_size()
+{
+  test_logical_not<Bytes, N>();
+  test_unary_plus<Bytes, N>();
+  test_unary_minus<Bytes, N>();
+  test_bitwise_not<Bytes, N>();
+}
+
+template <int Bytes>
+__host__ __device__ constexpr void test_bytes()
+{
+  test_size<Bytes, 1>();
+  test_size<Bytes, 4>();
+}
+
+__host__ __device__ constexpr bool test()
+{
+  test_bytes<1>();
+  test_bytes<2>();
+  test_bytes<4>();
+  test_bytes<8>();
+#if _CCCL_HAS_INT128()
+  test_bytes<16>();
+#endif
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/aliases.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/aliases.pass.cpp
new file mode 100644
index 00000000000..2e9bf8fb576
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/aliases.pass.cpp
@@ -0,0 +1,73 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// template<class T, simd-size-type N> using vec  = ...;
+// template<class T, simd-size-type N> using mask = ...;
+
+#include "../simd_test_utils.h"
+
+//----------------------------------------------------------------------------------------------------------------------
+// vec<T, N> resolves to basic_vec<T, deduce-abi-t<T, N>>
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_vec_alias()
+{
+  using Alias  = simd::vec<T, N>;
+  using Direct = simd::basic_vec<T, simd::fixed_size<N>>;
+  static_assert(cuda::std::is_same_v<Alias, Direct>);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// mask<T, N> resolves to basic_mask<sizeof(T), deduce-abi-t<T, N>>
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_mask_alias()
+{
+  using Alias  = simd::mask<T, N>;
+  using Direct = simd::basic_mask<sizeof(T), simd::fixed_size<N>>;
+  static_assert(cuda::std::is_same_v<Alias, Direct>);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// default N for vec and mask uses native ABI size
+
+template <typename T>
+__host__ __device__ constexpr void test_default_size()
+{
+  using DefaultVec = simd::vec<T>;
+  using NativeVec  = simd::basic_vec<T, simd::native<T>>;
+  static_assert(cuda::std::is_same_v<DefaultVec, NativeVec>);
+
+  using DefaultMask = simd::mask<T>;
+  using NativeMask  = simd::basic_mask<sizeof(T), simd::native<T>>;
+  static_assert(cuda::std::is_same_v<DefaultMask, NativeMask>);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_type()
+{
+  test_vec_alias<T, N>();
+  test_mask_alias<T, N>();
+  test_default_size<T>();
+}
+
+DEFINE_BASIC_VEC_TEST()
+DEFINE_BASIC_VEC_TEST_RUNTIME()
+
+int main(int, char**)
+{
+  static_assert(test());
+  assert(test_runtime());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp
new file mode 100644
index 00000000000..c179620f379
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp
@@ -0,0 +1,83 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// template<class T, class U = typename T::value_type>
+// struct alignment;
+//
+// template<class T, class U = typename T::value_type>
+// constexpr size_t alignment_v = alignment<T, U>::value;
+
+#include <cuda/std/__simd_>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+template <typename T, int N, size_t ExpectedAlign = alignof(T) * N>
+__host__ __device__ void test_default_u()
+{
+  using V = simd::basic_vec<T, simd::fixed_size<N>>;
+  static_assert(simd::alignment<V>::value == ExpectedAlign);
+  static_assert(simd::alignment_v<V> == ExpectedAlign);
+}
+
+template <typename T, int N, typename U, size_t ExpectedAlign = alignof(U) * N>
+__host__ __device__ void test_explicit_u()
+{
+  using V = simd::basic_vec<T, simd::fixed_size<N>>;
+  static_assert(simd::alignment<V, U>::value == ExpectedAlign);
+  static_assert(simd::alignment_v<V, U> == ExpectedAlign);
+}
+
+template <typename T>
+__host__ __device__ void test_type()
+{
+  test_default_u<T, 1>();
+  test_default_u<T, 3, alignof(T)>();
+  test_default_u<T, 2>();
+  test_default_u<T, 4>();
+  test_default_u<T, 8>();
+}
+
+__host__ __device__ void test()
+{
+  // default U = value_type
+  test_type<char>();
+  test_type<short>();
+  test_type<int>();
+  test_type<long long>();
+  test_type<float>();
+  test_type<double>();
+#if _CCCL_HAS_INT128()
+  test_type<__int128_t>();
+#endif // _CCCL_HAS_INT128()
+#if _LIBCUDACXX_HAS_NVFP16()
+  test_type<__half>();
+#endif // _LIBCUDACXX_HAS_NVFP16()
+#if _LIBCUDACXX_HAS_NVBF16()
+  test_type<__nv_bfloat16>();
+#endif // _LIBCUDACXX_HAS_NVBF16()
+
+  // explicit U different from value_type
+  test_explicit_u<int, 1, float>();
+  test_explicit_u<int, 3, float, alignof(float)>();
+  test_explicit_u<int, 4, char>();
+  test_explicit_u<float, 2, double>();
+  test_explicit_u<double, 4, int>();
+}
+
+int main(int, char**)
+{
+  test();
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/rebind.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/rebind.pass.cpp
new file mode 100644
index 00000000000..d683bf9d8c2
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/rebind.pass.cpp
@@ -0,0 +1,106 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// template<class T, class V> struct rebind;
+// template<class T, class V> using rebind_t = typename rebind<T, V>::type;
+
+#include <cuda/std/__simd_>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+//----------------------------------------------------------------------------------------------------------------------
+// rebind with basic_vec
+
+template <typename NewT, typename OldT, int N>
+__host__ __device__ void test_rebind_vec()
+{
+  using OldVec = simd::basic_vec<OldT, simd::fixed_size<N>>;
+  using Result = simd::rebind_t<NewT, OldVec>;
+  static_assert(cuda::std::is_same_v<typename Result::value_type, NewT>);
+  static_assert(Result::size() == N);
+}
+
+template <typename NewT, typename OldT>
+__host__ __device__ void test_rebind_vec_sizes()
+{
+  test_rebind_vec<NewT, OldT, 1>();
+  test_rebind_vec<NewT, OldT, 2>();
+  test_rebind_vec<NewT, OldT, 4>();
+  test_rebind_vec<NewT, OldT, 8>();
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// rebind with basic_mask
+
+template <typename NewT, typename OldT, int N>
+__host__ __device__ void test_rebind_mask()
+{
+  using OldMask = simd::basic_mask<sizeof(OldT), simd::fixed_size<N>>;
+  using Result  = simd::rebind_t<NewT, OldMask>;
+  static_assert(cuda::std::is_same_v<Result, simd::basic_mask<sizeof(NewT), simd::fixed_size<N>>>);
+  static_assert(Result::size() == N);
+}
+
+template <typename NewT, typename OldT>
+__host__ __device__ void test_rebind_mask_sizes()
+{
+  test_rebind_mask<NewT, OldT, 1>();
+  test_rebind_mask<NewT, OldT, 2>();
+  test_rebind_mask<NewT, OldT, 4>();
+  test_rebind_mask<NewT, OldT, 8>();
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// rebind_t matches rebind::type
+
+template <typename NewT, typename V>
+__host__ __device__ void test_rebind_t_alias()
+{
+  static_assert(cuda::std::is_same_v<simd::rebind_t<NewT, V>, typename simd::rebind<NewT, V>::type>);
+}
+
+__host__ __device__ void test()
+{
+  // rebind basic_vec
+  test_rebind_vec_sizes<int, int>();
+  test_rebind_vec_sizes<float, float>();
+  test_rebind_vec_sizes<float, int>();
+  test_rebind_vec_sizes<int, float>();
+
+  // rebind basic_vec: different sizes
+  test_rebind_vec_sizes<char, int>();
+  test_rebind_vec_sizes<int, char>();
+  test_rebind_vec_sizes<short, long long>();
+  test_rebind_vec_sizes<double, int>();
+  test_rebind_vec_sizes<int, double>();
+
+  // rebind basic_mask
+  test_rebind_mask_sizes<int, int>();
+  test_rebind_mask_sizes<float, float>();
+  test_rebind_mask_sizes<char, int>();
+  test_rebind_mask_sizes<double, int>();
+  test_rebind_mask_sizes<short, long long>();
+
+  // rebind_t alias matches rebind::type
+  test_rebind_t_alias<float, simd::vec<int, 4>>();
+  test_rebind_t_alias<int, simd::vec<double, 2>>();
+  test_rebind_t_alias<double, simd::mask<int, 4>>();
+}
+
+int main(int, char**)
+{
+  test();
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/resize.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/resize.pass.cpp
new file mode 100644
index 00000000000..b2dfb0f5130
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/resize.pass.cpp
@@ -0,0 +1,99 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// template<simd-size-type N, class V> struct resize;
+// template<simd-size-type N, class V> using resize_t = typename resize<N, V>::type;
+
+#include <cuda/std/__simd_>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+//----------------------------------------------------------------------------------------------------------------------
+// resize with basic_vec
+
+template <typename T, int OldN, int NewN>
+__host__ __device__ void test_resize_vec()
+{
+  using OldVec   = simd::basic_vec<T, simd::fixed_size<OldN>>;
+  using Result   = simd::resize_t<NewN, OldVec>;
+  using Expected = simd::basic_vec<T, simd::fixed_size<NewN>>;
+  static_assert(cuda::std::is_same_v<typename Result::value_type, T>);
+  static_assert(Result::size() == NewN);
+  static_assert(cuda::std::is_same_v<Result, Expected>);
+}
+
+template <typename T>
+__host__ __device__ void test_resize_vec_all()
+{
+  test_resize_vec<T, 4, 4>();
+  test_resize_vec<T, 4, 2>();
+  test_resize_vec<T, 2, 8>();
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// resize with basic_mask
+
+template <typename T, int OldN, int NewN>
+__host__ __device__ void test_resize_mask()
+{
+  using OldMask  = simd::basic_mask<sizeof(T), simd::fixed_size<OldN>>;
+  using Result   = simd::resize_t<NewN, OldMask>;
+  using Expected = simd::basic_mask<sizeof(T), simd::fixed_size<NewN>>;
+  static_assert(Result::size() == NewN);
+  static_assert(cuda::std::is_same_v<Result, Expected>);
+}
+
+template <typename T>
+__host__ __device__ void test_resize_mask_all()
+{
+  test_resize_mask<T, 4, 4>();
+  test_resize_mask<T, 4, 2>();
+  test_resize_mask<T, 2, 8>();
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// resize_t matches resize::type
+
+template <int N, typename V>
+__host__ __device__ void test_resize_t_alias()
+{
+  static_assert(cuda::std::is_same_v<simd::resize_t<N, V>, typename simd::resize<N, V>::type>);
+}
+
+__host__ __device__ void test()
+{
+  // resize basic_vec
+  test_resize_vec_all<char>();
+  test_resize_vec_all<short>();
+  test_resize_vec_all<int>();
+  test_resize_vec_all<long long>();
+  test_resize_vec_all<float>();
+  test_resize_vec_all<double>();
+
+  // resize basic_mask
+  test_resize_mask_all<char>();
+  test_resize_mask_all<int>();
+  test_resize_mask_all<double>();
+
+  // resize_t alias matches resize::type
+  test_resize_t_alias<8, simd::vec<int, 4>>();
+  test_resize_t_alias<2, simd::vec<float, 8>>();
+}
+
+int main(int, char**)
+{
+  test();
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/vectorizable.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/vectorizable.pass.cpp
new file mode 100644
index 00000000000..6ce4ae99210
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/vectorizable.pass.cpp
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.expos] __is_vectorizable_v: true for all standard integer types,
+// character types, and float/double; false for bool, const/volatile types.
+
+#include <cuda/std/__simd_>
+#include <cuda/std/cstdint>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+// positive cases: integer types
+static_assert(simd::__is_vectorizable_v<int8_t>);
+static_assert(simd::__is_vectorizable_v<int16_t>);
+static_assert(simd::__is_vectorizable_v<int32_t>);
+static_assert(simd::__is_vectorizable_v<int64_t>);
+static_assert(simd::__is_vectorizable_v<uint8_t>);
+static_assert(simd::__is_vectorizable_v<uint16_t>);
+static_assert(simd::__is_vectorizable_v<uint32_t>);
+static_assert(simd::__is_vectorizable_v<uint64_t>);
+
+// positive cases: character types
+static_assert(simd::__is_vectorizable_v<char>);
+static_assert(simd::__is_vectorizable_v<char16_t>);
+static_assert(simd::__is_vectorizable_v<char32_t>);
+static_assert(simd::__is_vectorizable_v<wchar_t>);
+#if defined(__cccl_lib_char8_t)
+static_assert(simd::__is_vectorizable_v<char8_t>);
+#endif
+
+#if _CCCL_HAS_INT128()
+static_assert(simd::__is_vectorizable_v<__int128_t>);
+#endif
+
+// floating-point types
+static_assert(simd::__is_vectorizable_v<float>);
+static_assert(simd::__is_vectorizable_v<double>);
+#if _LIBCUDACXX_HAS_NVFP16()
+static_assert(simd::__is_vectorizable_v<__half>);
+#endif
+#if _LIBCUDACXX_HAS_NVBF16()
+static_assert(simd::__is_vectorizable_v<__nv_bfloat16>);
+#endif
+
+// negative cases
+static_assert(!simd::__is_vectorizable_v<bool>);
+static_assert(!simd::__is_vectorizable_v<const int>);
+static_assert(!simd::__is_vectorizable_v<volatile int>);
+static_assert(!simd::__is_vectorizable_v<const volatile int>);
+static_assert(!simd::__is_vectorizable_v<void>);
+
+struct user_type
+{};
+static_assert(!simd::__is_vectorizable_v<user_type>);
+static_assert(!simd::__is_vectorizable_v<int*>);
+static_assert(!simd::__is_vectorizable_v<int&>);
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/binary.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/binary.pass.cpp
new file mode 100644
index 00000000000..afb29578985
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/binary.pass.cpp
@@ -0,0 +1,144 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.binary], basic_vec binary operators
+//
+// friend constexpr basic_vec operator+(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator-(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator*(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator/(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator%(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator&(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator|(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator^(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator<<(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator>>(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec operator<<(const basic_vec&, simd-size-type) noexcept;
+// friend constexpr basic_vec operator>>(const basic_vec&, simd-size-type) noexcept;
+
+#include "../simd_test_utils.h"
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_arithmetic()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec a(T{6});
+  Vec b(T{3});
+
+  static_assert(cuda::std::is_same_v<decltype(a + b), Vec>);
+  static_assert(cuda::std::is_same_v<decltype(a - b), Vec>);
+  static_assert(cuda::std::is_same_v<decltype(a * b), Vec>);
+  static_assert(cuda::std::is_same_v<decltype(a / b), Vec>);
+  static_assert(noexcept(a + b));
+  static_assert(noexcept(a - b));
+  static_assert(noexcept(a * b));
+  static_assert(noexcept(a / b));
+
+  Vec sum  = a + b;
+  Vec diff = a - b;
+  Vec prod = a * b;
+  Vec quot = a / b;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(sum[i] == static_cast<T>(T{6} + T{3}));
+    assert(diff[i] == static_cast<T>(T{6} - T{3}));
+    assert(prod[i] == static_cast<T>(T{6} * T{3}));
+    assert(quot[i] == static_cast<T>(T{6} / T{3}));
+  }
+}
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_integral_ops()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec a(T{7});
+  Vec b(T{3});
+
+  static_assert(cuda::std::is_same_v<decltype(a % b), Vec>);
+  static_assert(cuda::std::is_same_v<decltype(a & b), Vec>);
+  static_assert(cuda::std::is_same_v<decltype(a | b), Vec>);
+  static_assert(cuda::std::is_same_v<decltype(a ^ b), Vec>);
+  static_assert(noexcept(a % b));
+  static_assert(noexcept(a & b));
+  static_assert(noexcept(a | b));
+  static_assert(noexcept(a ^ b));
+
+  Vec mod     = a % b;
+  Vec bit_and = a & b;
+  Vec bit_or  = a | b;
+  Vec bit_xor = a ^ b;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(mod[i] == static_cast<T>(T{7} % T{3}));
+    assert(bit_and[i] == static_cast<T>(T{7} & T{3}));
+    assert(bit_or[i] == static_cast<T>(T{7} | T{3}));
+    assert(bit_xor[i] == static_cast<T>(T{7} ^ T{3}));
+  }
+}
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_shifts()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec a(T{4});
+  Vec shift_amount(T{1});
+
+  static_assert(cuda::std::is_same_v<decltype(a << shift_amount), Vec>);
+  static_assert(cuda::std::is_same_v<decltype(a >> shift_amount), Vec>);
+  static_assert(noexcept(a << shift_amount));
+  static_assert(noexcept(a >> shift_amount));
+
+  Vec shl = a << shift_amount;
+  Vec shr = a >> shift_amount;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(shl[i] == static_cast<T>(T{4} << T{1}));
+    assert(shr[i] == static_cast<T>(T{4} >> T{1}));
+  }
+
+  static_assert(cuda::std::is_same_v<decltype(a << 1), Vec>);
+  static_assert(cuda::std::is_same_v<decltype(a >> 1), Vec>);
+  static_assert(noexcept(a << 1));
+  static_assert(noexcept(a >> 1));
+
+  Vec shl_n = a << 1;
+  Vec shr_n = a >> 1;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(shl_n[i] == static_cast<T>(T{4} << T{1}));
+    assert(shr_n[i] == static_cast<T>(T{4} >> T{1}));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_type()
+{
+  test_arithmetic<T, N>();
+  if constexpr (cuda::std::is_integral_v<T>)
+  {
+    test_integral_ops<T, N>();
+    test_shifts<T, N>();
+  }
+}
+
+DEFINE_BASIC_VEC_TEST()
+DEFINE_BASIC_VEC_TEST_RUNTIME()
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  assert(test_runtime());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/comparison.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/comparison.pass.cpp
new file mode 100644
index 00000000000..4b795836347
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/comparison.pass.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.comparison], basic_vec compare operators
+//
+// friend constexpr mask_type operator==(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr mask_type operator!=(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr mask_type operator>=(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr mask_type operator<=(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr mask_type operator>(const basic_vec&, const basic_vec&) noexcept;
+// friend constexpr mask_type operator<(const basic_vec&, const basic_vec&) noexcept;
+
+#include "../simd_test_utils.h"
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_type()
+{
+  using Vec  = simd::basic_vec<T, simd::fixed_size<N>>;
+  using Mask = typename Vec::mask_type;
+  Vec a      = make_iota_vec<T, N>();
+  Vec b(T{3});
+
+  static_assert(cuda::std::is_same_v<decltype(a == b), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(a != b), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(a >= b), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(a <= b), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(a > b), Mask>);
+  static_assert(cuda::std::is_same_v<decltype(a < b), Mask>);
+  static_assert(noexcept(a == b));
+  static_assert(noexcept(a != b));
+  static_assert(noexcept(a >= b));
+  static_assert(noexcept(a <= b));
+  static_assert(noexcept(a > b));
+  static_assert(noexcept(a < b));
+
+  Mask eq = a == b;
+  Mask ne = a != b;
+  Mask ge = a >= b;
+  Mask le = a <= b;
+  Mask gt = a > b;
+  Mask lt = a < b;
+  for (int i = 0; i < N; ++i)
+  {
+    T val = static_cast<T>(i);
+    assert(eq[i] == (val == T{3}));
+    assert(ne[i] == (val != T{3}));
+    assert(ge[i] == (val >= T{3}));
+    assert(le[i] == (val <= T{3}));
+    assert(gt[i] == (val > T{3}));
+    assert(lt[i] == (val < T{3}));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+DEFINE_BASIC_VEC_TEST()
+DEFINE_BASIC_VEC_TEST_RUNTIME()
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  assert(test_runtime());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/compound_assign.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/compound_assign.pass.cpp
new file mode 100644
index 00000000000..74441f92617
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/compound_assign.pass.cpp
@@ -0,0 +1,208 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.cassign], basic_vec compound assignment
+//
+// friend constexpr basic_vec& operator+=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator-=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator*=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator/=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator%=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator&=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator|=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator^=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator<<=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator>>=(basic_vec&, const basic_vec&) noexcept;
+// friend constexpr basic_vec& operator<<=(basic_vec&, simd-size-type) noexcept;
+// friend constexpr basic_vec& operator>>=(basic_vec&, simd-size-type) noexcept;
+
+#include "../simd_test_utils.h"
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_arithmetic_assign()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec b(T{3});
+
+  // operator+=
+  {
+    Vec a(T{6});
+    static_assert(cuda::std::is_same_v<decltype(a += b), Vec&>);
+    static_assert(noexcept(a += b));
+    a += b;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{6} + T{3}));
+    }
+  }
+  // operator-=
+  {
+    Vec a(T{6});
+    static_assert(cuda::std::is_same_v<decltype(a -= b), Vec&>);
+    static_assert(noexcept(a -= b));
+    a -= b;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{6} - T{3}));
+    }
+  }
+  // operator*=
+  {
+    Vec a(T{6});
+    static_assert(cuda::std::is_same_v<decltype(a *= b), Vec&>);
+    static_assert(noexcept(a *= b));
+    a *= b;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{6} * T{3}));
+    }
+  }
+  // operator/=
+  {
+    Vec a(T{6});
+    static_assert(cuda::std::is_same_v<decltype(a /= b), Vec&>);
+    static_assert(noexcept(a /= b));
+    a /= b;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{6} / T{3}));
+    }
+  }
+}
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_integral_assign()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec b(T{3});
+
+  // operator%=
+  {
+    Vec a(T{7});
+    static_assert(cuda::std::is_same_v<decltype(a %= b), Vec&>);
+    static_assert(noexcept(a %= b));
+    a %= b;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{7} % T{3}));
+    }
+  }
+  // operator&=
+  {
+    Vec a(T{7});
+    static_assert(cuda::std::is_same_v<decltype(a &= b), Vec&>);
+    static_assert(noexcept(a &= b));
+    a &= b;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{7} & T{3}));
+    }
+  }
+  // operator|=
+  {
+    Vec a(T{7});
+    static_assert(cuda::std::is_same_v<decltype(a |= b), Vec&>);
+    static_assert(noexcept(a |= b));
+    a |= b;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{7} | T{3}));
+    }
+  }
+  // operator^=
+  {
+    Vec a(T{7});
+    static_assert(cuda::std::is_same_v<decltype(a ^= b), Vec&>);
+    static_assert(noexcept(a ^= b));
+    a ^= b;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{7} ^ T{3}));
+    }
+  }
+}
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_shift_assign()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec shift(T{1});
+
+  // operator<<=
+  {
+    Vec a(T{4});
+    static_assert(cuda::std::is_same_v<decltype(a <<= shift), Vec&>);
+    static_assert(noexcept(a <<= shift));
+    a <<= shift;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{4} << T{1}));
+    }
+  }
+  // operator>>=
+  {
+    Vec a(T{4});
+    static_assert(cuda::std::is_same_v<decltype(a >>= shift), Vec&>);
+    static_assert(noexcept(a >>= shift));
+    a >>= shift;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{4} >> T{1}));
+    }
+  }
+  // operator<<=
+  {
+    Vec a(T{4});
+    static_assert(cuda::std::is_same_v<decltype(a <<= 1), Vec&>);
+    static_assert(noexcept(a <<= 1));
+    a <<= 1;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{4} << T{1}));
+    }
+  }
+  // operator>>=
+  {
+    Vec a(T{4});
+    static_assert(cuda::std::is_same_v<decltype(a >>= 1), Vec&>);
+    static_assert(noexcept(a >>= 1));
+    a >>= 1;
+    for (int i = 0; i < N; ++i)
+    {
+      assert(a[i] == static_cast<T>(T{4} >> T{1}));
+    }
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_type()
+{
+  test_arithmetic_assign<T, N>();
+  if constexpr (cuda::std::is_integral_v<T>)
+  {
+    test_integral_assign<T, N>();
+    test_shift_assign<T, N>();
+  }
+}
+
+DEFINE_BASIC_VEC_TEST()
+DEFINE_BASIC_VEC_TEST_RUNTIME()
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  assert(test_runtime());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/ctor.pass.cpp
new file mode 100644
index 00000000000..5007f577750
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/ctor.pass.cpp
@@ -0,0 +1,448 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.ctor], basic_vec constructors
+//
+// constexpr explicit basic_vec(Up&&) noexcept;                                     // broadcast (explicit)
+// constexpr basic_vec(Up&&) noexcept;                                              // broadcast (implicit)
+// constexpr explicit basic_vec(const basic_vec<U,UAbi>&) noexcept;                 // converting (explicit)
+// constexpr basic_vec(const basic_vec<U,UAbi>&) noexcept;                          // converting (implicit)
+// constexpr explicit basic_vec(Generator&&);                                       // generator
+// constexpr basic_vec(Range&&, flags<> = {});                                      // range
+// constexpr basic_vec(Range&&, const mask_type&, flags<> = {});                    // masked range
+
+#include <cuda/std/span>
+
+#include "../simd_test_utils.h"
+
+//----------------------------------------------------------------------------------------------------------------------
+// member types and size
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_member_types()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+
+  static_assert(cuda::std::is_same_v<typename Vec::value_type, T>);
+  static_assert(cuda::std::is_same_v<typename Vec::abi_type, simd::fixed_size<N>>);
+  static_assert(Vec::size() == N);
+  static_assert(cuda::std::is_trivially_copyable_v<Vec>);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// default construction: value-initialize all elements
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_default_ctor()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec vec{};
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == T{});
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// copy construction and copy assignment
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_copy()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec original(T{42});
+
+  Vec copied(original);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(copied[i] == T{42});
+  }
+
+  Vec assigned{};
+  assigned = original;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(assigned[i] == T{42});
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// broadcast constructor
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_broadcast()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  static_assert(noexcept(Vec(cuda::std::declval<T>()))); // declval<T>() is needed for __half and __nv_bfloat16
+
+  Vec vec(T{42});
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == T{42});
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// generator constructor
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_generator()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+
+  const Vec vec(iota_generator<T>{});
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == static_cast<T>(i + 1));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// converting constructor
+
+template <typename T, typename U, int N>
+__host__ __device__ constexpr void test_converting()
+{
+  using Src = simd::basic_vec<U, simd::fixed_size<N>>;
+  using Dst = simd::basic_vec<T, simd::fixed_size<N>>;
+  Src src(U{3});
+  static_assert(noexcept(Dst(src)));
+
+  Dst dst(src);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(dst[i] == static_cast<T>(U{3}));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// range constructor
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_range()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  cuda::std::array<T, N> arr{};
+  for (int i = 0; i < N; ++i)
+  {
+    arr[i] = static_cast<T>(i + 1);
+  }
+
+  static_assert(!noexcept(Vec(arr)));
+  static_assert(!noexcept(Vec(arr, simd::flag_default)));
+
+  Vec vec(arr);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == static_cast<T>(i + 1));
+  }
+
+  Vec vec2(arr, simd::flag_default);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec2[i] == static_cast<T>(i + 1));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// range constructor with fixed-extent span
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_range_span()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  cuda::std::array<T, N> arr{};
+  for (int i = 0; i < N; ++i)
+  {
+    arr[i] = static_cast<T>(i + 1);
+  }
+
+  const cuda::std::span<T, N> values(arr);
+  const Vec vec(values);
+  const Vec vec2(values, simd::flag_default);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == static_cast<T>(i + 1));
+    assert(vec2[i] == static_cast<T>(i + 1));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// range constructor with alignment flags
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_range_alignment_flags()
+{
+  using Vec                      = simd::basic_vec<T, simd::fixed_size<N>>;
+  alignas(64) cuda::std::array<T, N> arr{};
+  for (int i = 0; i < N; ++i)
+  {
+    arr[i] = static_cast<T>(i + 1);
+  }
+
+  const Vec aligned_vec(arr, simd::flag_aligned);
+  const Vec overaligned_vec(arr, simd::flag_overaligned<32>);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(aligned_vec[i] == static_cast<T>(i + 1));
+    assert(overaligned_vec[i] == static_cast<T>(i + 1));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// masked range constructor
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_masked_range()
+{
+  using Vec  = simd::basic_vec<T, simd::fixed_size<N>>;
+  using Mask = typename Vec::mask_type;
+  cuda::std::array<T, N> arr{};
+  for (int i = 0; i < N; ++i)
+  {
+    arr[i] = static_cast<T>(i + 1);
+  }
+
+  Mask even_mask(is_even{});
+  static_assert(!noexcept(Vec(arr, even_mask)));
+  static_assert(!noexcept(Vec(arr, even_mask, simd::flag_default)));
+
+  Vec vec(arr, even_mask);
+  for (int i = 0; i < N; ++i)
+  {
+    if (i % 2 == 0)
+    {
+      assert(vec[i] == static_cast<T>(i + 1));
+    }
+    else
+    {
+      assert(vec[i] == T{0});
+    }
+  }
+
+  Vec vec2(arr, even_mask, simd::flag_default);
+  for (int i = 0; i < N; ++i)
+  {
+    if (i % 2 == 0)
+    {
+      assert(vec2[i] == static_cast<T>(i + 1));
+    }
+    else
+    {
+      assert(vec2[i] == T{0});
+    }
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// range constructor with flag_convert
+// constructs a basic_vec<T> from an array<U> with simd::flag_convert, where U is wider than T (not value-preserving)
+
+template <typename T, typename U, int N>
+__host__ __device__ constexpr void test_range_convert_lossy()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  cuda::std::array<U, N> arr{};
+  for (int i = 0; i < N; ++i)
+  {
+    arr[i] = static_cast<U>(i + 1);
+  }
+
+  static_assert(!noexcept(Vec(arr, simd::flag_convert)));
+
+  Vec vec(arr, simd::flag_convert);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == static_cast<T>(static_cast<U>(i + 1)));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// masked range constructor with flag_convert
+// constructs a basic_vec<T> from an array<U> with simd::flag_convert, where U is wider than T (not value-preserving)
+
+template <typename T, typename U, int N>
+__host__ __device__ constexpr void test_masked_range_convert_lossy()
+{
+  using Vec  = simd::basic_vec<T, simd::fixed_size<N>>;
+  using Mask = typename Vec::mask_type;
+  cuda::std::array<U, N> arr{};
+  for (int i = 0; i < N; ++i)
+  {
+    arr[i] = static_cast<U>(i + 1);
+  }
+
+  Mask even_mask(is_even{});
+  static_assert(!noexcept(Vec(arr, even_mask, simd::flag_convert)));
+
+  Vec vec(arr, even_mask, simd::flag_convert);
+  for (int i = 0; i < N; ++i)
+  {
+    if (i % 2 == 0)
+    {
+      assert(vec[i] == static_cast<T>(static_cast<U>(i + 1)));
+    }
+    else
+    {
+      assert(vec[i] == T{0});
+    }
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// broadcast constructor with constexpr-wrapper-like types
+// [simd.ctor] p4.3: implicit when From::value is representable by value_type
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_broadcast_constexpr_wrapper()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+
+  // integral_constant<T, V> where V fits in T: implicit
+  static_assert(cuda::std::is_convertible_v<cuda::std::integral_constant<T, T{1}>, Vec>);
+
+  // integral_constant from a wider type, but the specific value fits: implicit
+  if constexpr (sizeof(T) < sizeof(int64_t) && cuda::std::is_integral_v<T>)
+  {
+    using IC = cuda::std::integral_constant<int64_t, 5>;
+    static_assert(cuda::std::is_constructible_v<Vec, IC>);
+    static_assert(cuda::std::is_convertible_v<IC, Vec>);
+    Vec vec = IC{};
+    for (int i = 0; i < N; ++i)
+    {
+      assert(vec[i] == static_cast<T>(5));
+    }
+  }
+
+  // integral_constant from a wider type with a value that does NOT fit: explicit
+  if constexpr (cuda::std::is_same_v<T, int8_t>)
+  {
+    using IC = cuda::std::integral_constant<int64_t, 200>;
+    static_assert(cuda::std::is_constructible_v<Vec, IC>);
+    static_assert(!cuda::std::is_convertible_v<IC, Vec>);
+  }
+
+  // unsigned value in a signed target that fits: implicit
+  if constexpr (cuda::std::is_same_v<T, int>)
+  {
+    using IC = cuda::std::integral_constant<uint16_t, 100>;
+    static_assert(cuda::std::is_convertible_v<IC, Vec>);
+  }
+
+  // negative value in an unsigned target: explicit
+  if constexpr (cuda::std::is_same_v<T, uint32_t>)
+  {
+    using IC_neg = cuda::std::integral_constant<int64_t, -1>;
+    static_assert(cuda::std::is_constructible_v<Vec, IC_neg>);
+    static_assert(!cuda::std::is_convertible_v<IC_neg, Vec>);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// broadcast constructor explicit/implicit boundary for arithmetic types
+// [simd.ctor] p4: implicit iff convertible_to and value-preserving
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_broadcast_explicit_implicit()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+
+  // (1) same type is implicit
+  static_assert(cuda::std::is_convertible_v<T, Vec>);
+
+  // (2) value-preserving and wider type is implicit
+  if constexpr (cuda::std::is_same_v<T, int>)
+  {
+    static_assert(cuda::std::is_convertible_v<int16_t, Vec>);
+  }
+  else if constexpr (cuda::std::is_same_v<T, double>)
+  {
+    static_assert(cuda::std::is_convertible_v<float, Vec>);
+  }
+
+  // (3) narrow conversion is explicit
+  else if constexpr (cuda::std::is_same_v<T, int16_t>)
+  {
+    static_assert(cuda::std::is_constructible_v<Vec, int>);
+    static_assert(!cuda::std::is_convertible_v<int, Vec>);
+  }
+  else if constexpr (cuda::std::is_same_v<T, float>)
+  {
+    static_assert(cuda::std::is_constructible_v<Vec, double>);
+    static_assert(!cuda::std::is_convertible_v<double, Vec>);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// SFINAE constraints
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_sfinae()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+
+  static_assert(cuda::std::is_constructible_v<Vec, T>);
+
+  using VecDifferentSize = simd::basic_vec<T, simd::fixed_size<N + 1>>;
+  static_assert(!cuda::std::is_constructible_v<Vec, const VecDifferentSize&>);
+
+  static_assert(!cuda::std::is_constructible_v<Vec, wrong_generator>);
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_type()
+{
+  test_member_types<T, N>();
+  test_default_ctor<T, N>();
+  test_copy<T, N>();
+  test_broadcast<T, N>();
+  test_broadcast_explicit_implicit<T, N>();
+  test_generator<T, N>();
+  test_range<T, N>();
+  test_range_span<T, N>();
+  test_range_alignment_flags<T, N>();
+  test_masked_range<T, N>();
+  if constexpr (cuda::std::is_integral_v<T>)
+  {
+    test_broadcast_constexpr_wrapper<T, N>();
+  }
+  test_sfinae<T, N>();
+  if constexpr (sizeof(T) >= 2 && cuda::std::is_integral_v<T>)
+  {
+    using Smaller = cuda::std::conditional_t<cuda::std::is_signed_v<T>, int8_t, uint8_t>;
+    test_converting<T, Smaller, N>();
+  }
+  if constexpr (sizeof(T) < 8 && cuda::std::is_integral_v<T>)
+  {
+    using Wider = cuda::std::conditional_t<cuda::std::is_signed_v<T>, int64_t, uint64_t>;
+    test_range_convert_lossy<T, Wider, N>();
+    test_masked_range_convert_lossy<T, Wider, N>();
+  }
+  if constexpr (cuda::std::is_same_v<T, float>)
+  {
+    test_range_convert_lossy<T, double, N>();
+    test_masked_range_convert_lossy<T, double, N>();
+  }
+}
+
+DEFINE_BASIC_VEC_TEST()
+DEFINE_BASIC_VEC_TEST_RUNTIME()
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  assert(test_runtime());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/deduction.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/deduction.pass.cpp
new file mode 100644
index 00000000000..f9eefa2e91f
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/deduction.pass.cpp
@@ -0,0 +1,123 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// (CTAD bug) Deduction guides fail with:
+//   * gcc: all nvcc+gcc configs (EDG frontend bug)
+//   * nvrtc (EDG frontend bug)
+//   * msvc: all nvcc+msvc configs (MSVC CTAD limitation)
+//   * nvcc+clang C++17 (EDG frontend bug)
+//
+// while they work with:
+//
+//   * nvcc+clang C++20: PASS
+//   * clang-cuda: PASS
+//   * nvc++: PASS
+
+// error: error: no instance of constructor "cuda::std::simd::basic_vec" matches the argument list
+
+// UNSUPPORTED: gcc
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: msvc
+// UNSUPPORTED: nvcc && clang && c++17
+
+// <cuda/std/__simd_>
+
+// [simd.ctor] deduction guides
+//
+// basic_vec(Range&&, Ts...) -> basic_vec<range_value_t<Range>, deduce-abi-t<...>>;
+// basic_vec(basic_mask<Bytes, Abi>) -> basic_vec<integer-from<Bytes>, Abi>;
+
+#include <cuda/std/span>
+
+#include "../simd_test_utils.h"
+
+//----------------------------------------------------------------------------------------------------------------------
+// deduction from range
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_range_deduction()
+{
+  cuda::std::array<T, N> arr{};
+  for (int i = 0; i < N; ++i)
+  {
+    arr[i] = static_cast<T>(i);
+  }
+  simd::basic_vec vec(arr);
+  static_assert(cuda::std::is_same_v<typename decltype(vec)::value_type, T>);
+  static_assert(decltype(vec)::size() == N);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == static_cast<T>(i));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// deduction from fixed-extent span
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_span_deduction()
+{
+  cuda::std::array<T, N> arr{};
+  for (int i = 0; i < N; ++i)
+  {
+    arr[i] = static_cast<T>(i);
+  }
+
+  const cuda::std::span<T, N> values(arr);
+  simd::basic_vec vec(values);
+  static_assert(cuda::std::is_same_v<typename decltype(vec)::value_type, T>);
+  static_assert(decltype(vec)::size() == N);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == static_cast<T>(i));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// deduction from basic_mask
+
+template <int Bytes, int N>
+__host__ __device__ constexpr void test_mask_deduction()
+{
+  using Mask = simd::basic_mask<Bytes, simd::fixed_size<N>>;
+  Mask mask(true);
+  simd::basic_vec vec(mask);
+  static_assert(decltype(vec)::size() == N);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == 1);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+__host__ __device__ constexpr bool test_deduction()
+{
+  test_range_deduction<int, 1>();
+  test_range_deduction<int, 4>();
+  test_range_deduction<short, 4>();
+  test_range_deduction<long long, 4>();
+  test_span_deduction<int, 1>();
+  test_span_deduction<int, 4>();
+  test_span_deduction<short, 4>();
+  test_span_deduction<long long, 4>();
+
+  test_mask_deduction<1, 1>();
+  test_mask_deduction<1, 4>();
+  test_mask_deduction<4, 4>();
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test_deduction());
+  static_assert(test_deduction());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/instantiation.fail.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/instantiation.fail.cpp
new file mode 100644
index 00000000000..183fc7079fd
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/instantiation.fail.cpp
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// basic_vec<T, Abi> requires T to be a vectorizable type.
+// bool is explicitly excluded by the standard.
+
+#include <cuda/std/__simd_>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  using Vec = cuda::std::simd::basic_vec<bool, cuda::std::simd::fixed_size<4>>;
+  Vec vec(true); // expected-error
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/subscript.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/subscript.pass.cpp
new file mode 100644
index 00000000000..34d7544a6c7
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/subscript.pass.cpp
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.subscr], basic_vec subscript operators
+//
+// constexpr value_type operator[](simd-size-type) const;
+
+#include "../simd_test_utils.h"
+
+//----------------------------------------------------------------------------------------------------------------------
+// subscript read-back
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_type()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec vec(T{7});
+
+  static_assert(cuda::std::is_same_v<decltype(vec[0]), typename Vec::value_type>);
+  static_assert(!noexcept(vec[0]));
+  static_assert(is_const_member_function_v<decltype(&Vec::operator[])>);
+  unused(vec);
+
+  Vec iota = make_iota_vec<T, N>();
+  for (int i = 0; i < N; ++i)
+  {
+    assert(iota[i] == static_cast<T>(i));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+DEFINE_BASIC_VEC_TEST()
+DEFINE_BASIC_VEC_TEST_RUNTIME()
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  assert(test_runtime());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/unary.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/unary.pass.cpp
new file mode 100644
index 00000000000..ae1989d0cdf
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.vec.class/unary.pass.cpp
@@ -0,0 +1,204 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/std/__simd_>
+
+// [simd.unary], basic_vec unary operators
+//
+// constexpr basic_vec& operator++() noexcept;
+// constexpr basic_vec  operator++(int) noexcept;
+// constexpr basic_vec& operator--() noexcept;
+// constexpr basic_vec  operator--(int) noexcept;
+// constexpr mask_type  operator!() const noexcept;
+// constexpr basic_vec  operator~() const noexcept;
+// constexpr basic_vec  operator+() const noexcept;
+// constexpr basic_vec  operator-() const noexcept;
+
+#include "../simd_test_utils.h"
+
+TEST_DIAG_SUPPRESS_MSVC(4146) // unary minus operator applied to unsigned type, result still unsigned
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator++ (pre)
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_pre_increment()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec vec(T{5});
+  static_assert(cuda::std::is_same_v<decltype(++vec), Vec&>);
+  static_assert(noexcept(++vec));
+
+  Vec& ref = ++vec;
+  assert(&ref == &vec);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == T{6});
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator++ (post)
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_post_increment()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec vec(T{5});
+  static_assert(cuda::std::is_same_v<decltype(vec++), Vec>);
+  static_assert(noexcept(vec++));
+
+  Vec old = vec++;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(old[i] == T{5});
+    assert(vec[i] == T{6});
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator-- (pre)
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_pre_decrement()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec vec(T{5});
+  static_assert(cuda::std::is_same_v<decltype(--vec), Vec&>);
+  static_assert(noexcept(--vec));
+
+  Vec& ref = --vec;
+  assert(&ref == &vec);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(vec[i] == T{4});
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator-- (post)
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_post_decrement()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec vec(T{5});
+  static_assert(cuda::std::is_same_v<decltype(vec--), Vec>);
+  static_assert(noexcept(vec--));
+
+  Vec old = vec--;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(old[i] == T{5});
+    assert(vec[i] == T{4});
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator!
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_logical_not()
+{
+  using Vec  = simd::basic_vec<T, simd::fixed_size<N>>;
+  using Mask = typename Vec::mask_type;
+  Vec vec    = make_iota_vec<T, N>();
+  static_assert(cuda::std::is_same_v<decltype(!vec), Mask>);
+  static_assert(noexcept(!vec));
+
+  Mask result = !vec;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(result[i] == (static_cast<T>(i) == T{0}));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator~
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_bitwise_not()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec vec(T{0});
+  static_assert(cuda::std::is_same_v<decltype(~vec), Vec>);
+  static_assert(noexcept(~vec));
+
+  Vec result = ~vec;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(result[i] == static_cast<T>(~T{0}));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator+
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_unary_plus()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec vec(T{42});
+  static_assert(cuda::std::is_same_v<decltype(+vec), Vec>);
+  static_assert(noexcept(+vec));
+
+  Vec result = +vec;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(result[i] == T{42});
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// operator-
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_unary_minus()
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec vec(T{3});
+  static_assert(cuda::std::is_same_v<decltype(-vec), Vec>);
+  static_assert(noexcept(-vec));
+
+  Vec result = -vec;
+  for (int i = 0; i < N; ++i)
+  {
+    assert(result[i] == static_cast<T>(-T{3}));
+  }
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template <typename T, int N>
+__host__ __device__ constexpr void test_type()
+{
+  test_pre_increment<T, N>();
+  test_post_increment<T, N>();
+  test_pre_decrement<T, N>();
+  test_post_decrement<T, N>();
+  test_logical_not<T, N>();
+  test_unary_plus<T, N>();
+  test_unary_minus<T, N>();
+  if constexpr (cuda::std::is_integral_v<T>)
+  {
+    test_bitwise_not<T, N>();
+  }
+}
+
+DEFINE_BASIC_VEC_TEST()
+DEFINE_BASIC_VEC_TEST_RUNTIME()
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  assert(test_runtime());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd_test_utils.h b/libcudacxx/test/libcudacxx/std/numerics/simd/simd_test_utils.h
new file mode 100644
index 00000000000..32af8847961
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd_test_utils.h
@@ -0,0 +1,171 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SIMD_TEST_UTILS_H
+#define SIMD_TEST_UTILS_H
+
+#include <cuda/std/__simd_>
+#include <cuda/std/array>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+//----------------------------------------------------------------------------------------------------------------------
+// common utilities
+
+struct wrong_generator
+{};
+
+template <typename>
+struct is_const_member_function : cuda::std::false_type
+{};
+
+template <typename R, typename C, typename... Args>
+struct is_const_member_function<R (C::*)(Args...) const> : cuda::std::true_type
+{};
+
+template <typename R, typename C, typename... Args>
+struct is_const_member_function<R (C::*)(Args...) const noexcept> : cuda::std::true_type
+{};
+
+template <typename T>
+constexpr bool is_const_member_function_v = is_const_member_function<T>::value;
+
+//----------------------------------------------------------------------------------------------------------------------
+// mask utilities
+
+struct is_even
+{
+  template <typename I>
+  __host__ __device__ constexpr bool operator()(I i) const noexcept
+  {
+    return i % 2 == 0;
+  }
+};
+
+struct is_first_half
+{
+  template <typename I>
+  __host__ __device__ constexpr bool operator()(I i) const noexcept
+  {
+    return i < 2;
+  }
+};
+
+template <int Bytes>
+using integer_from_t = cuda::std::__make_nbit_int_t<Bytes * 8, true>;
+
+//----------------------------------------------------------------------------------------------------------------------
+// vec utilities
+
+template <typename T>
+struct iota_generator
+{
+  template <typename I>
+  __host__ __device__ constexpr T operator()(I i) const noexcept
+  {
+    return static_cast<T>(i + 1);
+  }
+};
+
+template <typename T, int N>
+__host__ __device__ constexpr simd::basic_vec<T, simd::fixed_size<N>> make_iota_vec()
+{
+  cuda::std::array<T, N> arr{};
+  for (int i = 0; i < N; ++i)
+  {
+    arr[i] = static_cast<T>(i);
+  }
+  return simd::basic_vec<T, simd::fixed_size<N>>(arr);
+}
+
+// Each vec test file must define test_type<T, N>() and then define test() using this macro.
+// clang-format off
+#if defined(__cccl_lib_char8_t)
+#  define _SIMD_TEST_CHAR8_T()                                    \
+    test_type<char8_t, 1>();                                      \
+    test_type<char8_t, 4>();
+#else
+#  define _SIMD_TEST_CHAR8_T()
+#endif
+
+#if _CCCL_HAS_INT128()
+#  define _SIMD_TEST_INT128()                                     \
+    test_type<__int128_t, 1>();                                   \
+    test_type<__int128_t, 4>();
+#else
+#  define _SIMD_TEST_INT128()
+#endif
+
+#if _LIBCUDACXX_HAS_NVFP16()
+#  define _SIMD_TEST_FP16()                                       \
+    test_type<__half, 1>();                                       \
+    test_type<__half, 4>();
+#else
+#  define _SIMD_TEST_FP16()
+#endif
+
+#if _LIBCUDACXX_HAS_NVBF16()
+#  define _SIMD_TEST_BF16()                                       \
+    test_type<__nv_bfloat16, 1>();                                \
+    test_type<__nv_bfloat16, 4>();
+#else
+#  define _SIMD_TEST_BF16()
+#endif
+
+// __half and __nv_bfloat16 constructors are not constexpr (CUDA toolkit limitation),
+// so they are tested only at runtime via test_runtime().
+#define DEFINE_BASIC_VEC_TEST_RUNTIME()                           \
+  __host__ __device__ bool test_runtime()                         \
+  {                                                               \
+    _SIMD_TEST_FP16()                                             \
+    _SIMD_TEST_BF16()                                             \
+    return true;                                                  \
+  }
+
+#define DEFINE_BASIC_VEC_TEST()                                   \
+  __host__ __device__ constexpr bool test()                       \
+  {                                                               \
+    test_type<int8_t, 1>();                                       \
+    test_type<int8_t, 4>();                                       \
+    test_type<int16_t, 1>();                                      \
+    test_type<int16_t, 4>();                                      \
+    test_type<int32_t, 1>();                                      \
+    test_type<int32_t, 4>();                                      \
+    test_type<int64_t, 1>();                                      \
+    test_type<int64_t, 4>();                                      \
+    test_type<uint8_t, 1>();                                      \
+    test_type<uint8_t, 4>();                                      \
+    test_type<uint16_t, 1>();                                     \
+    test_type<uint16_t, 4>();                                     \
+    test_type<uint32_t, 1>();                                     \
+    test_type<uint32_t, 4>();                                     \
+    test_type<uint64_t, 1>();                                     \
+    test_type<uint64_t, 4>();                                     \
+    test_type<char16_t, 1>();                                     \
+    test_type<char16_t, 4>();                                     \
+    test_type<char32_t, 1>();                                     \
+    test_type<char32_t, 4>();                                     \
+    test_type<wchar_t, 1>();                                      \
+    test_type<wchar_t, 4>();                                      \
+    _SIMD_TEST_CHAR8_T()                                          \
+    test_type<float, 1>();                                        \
+    test_type<float, 4>();                                        \
+    test_type<double, 1>();                                       \
+    test_type<double, 4>();                                       \
+    _SIMD_TEST_INT128()                                           \
+    return true;                                                  \
+  }
+// clang-format on
+
+#endif // SIMD_TEST_UTILS_H