From d240cefc2e55533ab368387eda420716b2c96184 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 5 May 2026 14:06:57 -0400 Subject: [PATCH 1/2] feat: dynamic batch_bool masks + avx_128 / avx2_128 mask overloads

Adds runtime batch_bool mask overloads of xsimd::load_masked and xsimd::store_masked across AVX, AVX2, AVX-512, SSE, SVE, RVV, and NEON; architectures without a hardware predicated path go through the generic common-path fallback, which stays scalar so that only active lanes are read or written. SVE compile-time masked load/store forwarded through the runtime path so the per-lane predicate is correct on SVE wider than 128 bits.

Adds arch-specific runtime-mask overloads of load_masked / store_masked for the avx_128 and avx2_128 arches so they use the hardware predicated load/store path on x86.

Squashed from: b57a7667 feat: add runtime batch_bool mask overloads for load_masked/store_masked d5f21c70 feat: add runtime batch_bool mask overloads for avx_128 / avx2_128

--- docs/source/api/data_transfer.rst | 12 +- .../xsimd/arch/common/xsimd_common_memory.hpp | 36 +++++ include/xsimd/arch/xsimd_avx.hpp | 33 +++++ include/xsimd/arch/xsimd_avx2.hpp | 46 +++++-- include/xsimd/arch/xsimd_avx2_128.hpp | 71 ++++++++-- include/xsimd/arch/xsimd_avx_128.hpp | 31 +++++ include/xsimd/arch/xsimd_common_fwd.hpp | 4 + include/xsimd/arch/xsimd_rvv.hpp | 24 ++++ include/xsimd/arch/xsimd_sve.hpp | 43 +++++- include/xsimd/types/xsimd_api.hpp | 64 +++++++++ include/xsimd/types/xsimd_batch.hpp | 28 +++- include/xsimd/types/xsimd_utils.hpp | 10 ++ test/test_load_store.cpp | 123 ++++++++++++------ 13 files changed, 452 insertions(+), 73 deletions(-)

diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index 815f56293..b389a48ad 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -12,7 +12,7 @@ Data Transfers From memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`load` | load values from memory (optionally masked) | +| :cpp:func:`load` | load values from memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_aligned` | load values from aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -32,7 +32,7 @@ From a scalar: To memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`store` | store values to memory (optionally masked) | +| :cpp:func:`store` | store values to memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_aligned` | store values to aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -84,3 +84,11 @@ The following empty types are used for tag dispatching: .. doxygenstruct:: xsimd::unaligned_mode :project: xsimd + +.. rubric:: Footnotes + +.. [#m] Masked ``load`` / ``store`` come in two flavours. The + :cpp:class:`batch_bool_constant` overload encodes the mask in the type and + is resolved at compile time. The runtime :cpp:class:`batch_bool` overload + accepts a mask computed at runtime. Prefer the compile-time mask whenever + the selection is known at compile time. 
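To make the two flavours concrete, here is a minimal usage sketch. The generator struct first_two and the template-argument order of make_batch_bool_constant are illustrative assumptions; the runtime flavour relies only on batch_bool::from_mask, which the test changes below also use.

#include <xsimd/xsimd.hpp>
#include <cstddef>
#include <cstdint>

// Hypothetical compile-time mask generator: lanes 0 and 1 active.
struct first_two
{
    static constexpr bool get(std::size_t i, std::size_t) noexcept { return i < 2; }
};

void demo(const float* src, float* dst, uint64_t runtime_bits)
{
    using batch = xsimd::batch<float>;

    // Compile-time flavour: the mask is encoded in the type.
    constexpr auto cmask = xsimd::make_batch_bool_constant<float, first_two>();
    batch a = batch::load(src, cmask, xsimd::unaligned_mode {});

    // Runtime flavour: bit i of the integer activates lane i.
    auto rmask = xsimd::batch_bool<float>::from_mask(runtime_bits);
    batch b = batch::load(src, rmask, xsimd::unaligned_mode {});

    (a + b).store(dst, rmask, xsimd::unaligned_mode {});
}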
diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index c8038334a..0639f5168 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "../../types/xsimd_batch_constant.hpp" #include "./xsimd_common_details.hpp" @@ -374,6 +375,23 @@ namespace xsimd return batch::load(buffer.data(), aligned_mode {}); } + template + XSIMD_INLINE batch + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + // Per-lane validity contract: only active lanes of ``mem`` are + // required to be addressable. An unconditional whole-vector load + // would touch inactive lanes and trip ASan/Valgrind on partial + // buffers, so stay scalar. Arches with hardware predicated loads + // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single + // intrinsic that suppresses inactive-lane reads in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array buffer; + for (std::size_t i = 0; i < size; ++i) + buffer[i] = mask.get(i) ? mem[i] : T(0); + return batch::load_aligned(buffer.data()); + } + template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant, alignment, requires_arch) noexcept @@ -388,6 +406,24 @@ namespace xsimd } } + template + XSIMD_INLINE void + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + // Per-lane validity contract (matches native masked-store APIs): + // only active lanes of ``mem`` are touched. A load+select+store + // RMW would both read and write inactive bytes, breaking that + // contract — stay scalar. Arches with hardware predicated stores + // override this with a single intrinsic that suppresses inactive + // lanes in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array src_buf; + src.store_aligned(src_buf.data()); + for (std::size_t i = 0; i < size; ++i) + if (mask.get(i)) + mem[i] = src_buf[i]; + } + template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 305041f11..429637784 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1015,6 +1015,23 @@ namespace xsimd } } + // Runtime-mask load for float/double on AVX. Both aligned_mode and + // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_ps(mem, _mm256_castps_si256(mask)); + } + + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask)); + } + // store_masked namespace detail { @@ -1031,6 +1048,22 @@ namespace xsimd } } + // Runtime-mask store for float/double on AVX. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. 
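As a concrete picture of that guarantee, a standalone sketch against the raw intrinsic (not library code; assumes an AVX-capable target). Only the sign bit of each 32-bit mask lane is inspected, and masked-off lanes are neither read nor written, so the destination needs to be valid only for the active lanes.

#include <immintrin.h>

// Store only lanes 0..2 of an 8-float vector; dst needs to be valid for just
// three floats, lanes 3..7 are left untouched and cannot fault.
void store_first_three(float* dst, __m256 v)
{
    const __m256i mask = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
    _mm256_maskstore_ps(dst, mask, v);
}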
+ template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src); + } + + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src); + } + template XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index ebffd910b..f2666018b 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -118,10 +118,15 @@ namespace xsimd } } - // load_masked - // AVX2 low-level helpers (operate on raw SIMD registers) + // load_masked. AVX2 mask{load,store}_epi{32,64} take int*/long long*; + // these helpers exist so the dispatch picks the right intrinsic by + // pointer width. Static_asserts pin sizeof(int)/(long long) to the + // values the intrinsics expect — the dispatcher relies on it. namespace detail { + static_assert(sizeof(int) == 4, "AVX2 maskload/maskstore expects sizeof(int) == 4"); + static_assert(sizeof(long long) == 8, "AVX2 maskload/maskstore expects sizeof(long long) == 8"); + XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept { return _mm256_maskload_epi32(mem, mask); @@ -129,7 +134,7 @@ namespace xsimd XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept { - return _mm256_maskload_epi64(reinterpret_cast(mem), mask); + return _mm256_maskload_epi64(mem, mask); } XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept @@ -138,14 +143,12 @@ namespace xsimd } } - // single templated implementation for integer masked loads (32/64-bit) template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) >= 4), batch> load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2"); using int_t = std::conditional_t; - // Use the raw register-level maskload helpers for the remaining cases. return detail::maskload(reinterpret_cast(mem), mask.as_batch()); } @@ -175,19 +178,31 @@ namespace xsimd return bitwise_cast(r); } + // Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers + // fall back to the scalar common path: AVX2 has no native maskload for + // those widths, and a load-then-blend would break fault-suppression at + // page boundaries (the main reason callers ask for a masked load). + // Both aligned_mode and unaligned_mode route to the same intrinsic — + // masked-off lanes do not fault regardless of alignment. 
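For example, loading a five-element tail of an int32 buffer then goes through a single maskload. A sketch, assuming xsimd::avx2 is available on the target:

#include <xsimd/xsimd.hpp>
#include <cstdint>

// Lanes 0..4 active; lanes 5..7 are never dereferenced and read back as zero.
xsimd::batch<int32_t, xsimd::avx2> load_tail5(const int32_t* p)
{
    auto m = xsimd::batch_bool<int32_t, xsimd::avx2>::from_mask(0x1Fu);
    return xsimd::batch<int32_t, xsimd::avx2>::load(p, m, xsimd::unaligned_mode {});
}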
+ template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + return detail::maskload(reinterpret_cast(mem), __m256i(mask)); + } + // store_masked namespace detail { - template XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept { _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src); } - template - XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept + XSIMD_INLINE void maskstore(long long* mem, __m256i mask, __m256i src) noexcept { - _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); + _mm256_maskstore_epi64(mem, mask, src); } } @@ -195,15 +210,14 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; + using int_t = std::conditional_t; - // confined to lower 128-bit half → forward to SSE XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { constexpr auto mlo = ::xsimd::detail::lower_half(mask); const auto lo = detail::lower_half(src); store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); } - // confined to upper 128-bit half → forward to SSE else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); @@ -212,7 +226,7 @@ namespace xsimd } else { - detail::maskstore(mem, mask.as_batch(), src); + detail::maskstore(reinterpret_cast(mem), mask.as_batch(), src); } } @@ -230,6 +244,14 @@ namespace xsimd store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); } + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + detail::maskstore(reinterpret_cast(mem), __m256i(mask), __m256i(src)); + } + // load_stream template ::value, void>> XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 7e8b0d05a..1f2de84b3 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -89,48 +89,91 @@ namespace xsimd } } - // load_masked + namespace detail + { + XSIMD_INLINE __m128i maskload_128(int32_t const* mem, __m128i mask) noexcept + { + return _mm_maskload_epi32(mem, mask); + } + XSIMD_INLINE __m128i maskload_128(long long const* mem, __m128i mask) noexcept + { + return _mm_maskload_epi64(mem, mask); + } + XSIMD_INLINE void maskstore_128(int32_t* mem, __m128i mask, __m128i src) noexcept + { + _mm_maskstore_epi32(mem, mask, src); + } + XSIMD_INLINE void maskstore_128(long long* mem, __m128i mask, __m128i src) noexcept + { + _mm_maskstore_epi64(mem, mask, src); + } + } + + // load_masked / store_masked. AVX2-128 has _mm_maskload/store for + // 32/64-bit integers only; 8/16-bit fall back to the common scalar + // path. Aligned and unaligned go through the same intrinsic — masked- + // off lanes do not fault regardless of alignment. 
template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi32(mem, mask.as_batch()); + return detail::maskload_128(mem, mask.as_batch()); } template XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi32((int32_t*)mem, mask.as_batch()); + return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); } template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi64(mem, mask.as_batch()); + return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); } template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi64((int64_t*)mem, mask.as_batch()); + return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); } - // store_masked template XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi32(mem, mask.as_batch(), src); + detail::maskstore_128(mem, mask.as_batch(), src); } template XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi32((int32_t*)mem, mask.as_batch(), src); + detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); } template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi64(mem, mask.as_batch(), src); + detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); } template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); + } + + // Runtime-mask path: route through the same helpers as the constant- + // mask overloads. Templated on T so the dispatcher does not need to + // see four near-identical functions; the runtime-mask overloads of + // the common arch take ``batch_bool``, not the four signed/ + // unsigned int variants, so there is no ambiguity here. 
+ template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + return detail::maskload_128(reinterpret_cast(mem), __m128i(mask)); + } + + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src); + using int_t = std::conditional_t; + detail::maskstore_128(reinterpret_cast(mem), __m128i(mask), __m128i(src)); } // gather diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index af17568e1..98642dac6 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -115,6 +115,22 @@ namespace xsimd return _mm_maskload_pd(mem, mask.as_batch()); } + // Runtime-mask load for float/double on AVX-128. Both aligned_mode and + // unaligned_mode map to _mm_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_ps(mem, _mm_castps_si128(mask)); + } + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_pd(mem, _mm_castpd_si128(mask)); + } + // store_masked template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept @@ -128,6 +144,21 @@ namespace xsimd return _mm_maskstore_pd(mem, mask.as_batch(), src); } + // Runtime-mask store for float/double on AVX-128. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. 
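The other half of the contract is visible from user code: destination elements under inactive lanes keep whatever they held before the store. A small sketch with the default architecture (values purely illustrative):

#include <xsimd/xsimd.hpp>
#include <cstdint>

void merge_store_demo()
{
    double out[xsimd::batch<double>::size];
    for (double& x : out)
        x = -1.0;
    auto v = xsimd::batch<double>(1.0);                   // broadcast 1.0
    auto m = xsimd::batch_bool<double>::from_mask(0x1u);  // lane 0 only
    v.store(out, m, xsimd::unaligned_mode {});
    // out[0] == 1.0; every other element still holds -1.0.
}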
+ template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_ps(mem, _mm_castps_si128(mask), src); + } + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_pd(mem, _mm_castpd_si128(mask), src); + } + // swizzle (dynamic mask) template ::value && sizeof(T) == sizeof(ITy)>> XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index e78864f6e..1474aeeb8 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -79,8 +79,12 @@ namespace xsimd XSIMD_INLINE batch load(T const* mem, unaligned_mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; + template + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; + template + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 3ae649fdb..df64d3c1b 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -409,6 +409,11 @@ namespace xsimd { XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) + // Masked load (mask-undisturbed with zero passthrough): inactive lanes read as 0, + // no memory access is performed for inactive lanes (page-fault safe). + XSIMD_RVV_OVERLOAD(rvvle_mu, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM _mu), , vec(bvec, vec, T const*)) + // Masked store: inactive lanes are not written. + XSIMD_RVV_OVERLOAD(rvvse_m, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM _m), , void(bvec, T*, vec)) } template = 0> @@ -423,6 +428,16 @@ namespace xsimd return load_aligned(src, convert(), rvv {}); } + // load_masked (runtime mask): native vle*.v vd, (rs1), v0.t with zero-init + // passthrough so inactive lanes read as 0, matching xsimd's contract. + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + const auto zero = detail_rvv::rvvmv_splat(proj_t {}); + return detail_rvv::rvvle_mu(mask, zero, reinterpret_cast(mem)); + } + // load_complex namespace detail_rvv { @@ -500,6 +515,15 @@ namespace xsimd store_aligned(dst, src, rvv {}); } + // store_masked (runtime mask): native vse*.v vd, (rs1), v0.t — inactive lanes + // are not written (page-fault safe). 
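That page-fault-safe behaviour is what makes the usual remainder pattern safe without over-allocating buffers. An architecture-agnostic sketch using the new runtime-mask overloads (helper name and use of the default arch are illustrative):

#include <xsimd/xsimd.hpp>
#include <cstddef>
#include <cstdint>

// Scale n elements; the final partial iteration touches only the first
// (n - i) elements of src and dst.
template <class T>
void scale(T* dst, const T* src, std::size_t n, T factor)
{
    using b = xsimd::batch<T>;
    constexpr std::size_t N = b::size;
    std::size_t i = 0;
    for (; i + N <= n; i += N)
        (b::load_unaligned(src + i) * factor).store_unaligned(dst + i);
    if (i < n)
    {
        // n - i is strictly smaller than N here, so the shift is well defined.
        auto m = xsimd::batch_bool<T>::from_mask((uint64_t(1) << (n - i)) - 1);
        auto v = b::load(src + i, m, xsimd::unaligned_mode {});
        (v * factor).store(dst + i, m, xsimd::unaligned_mode {});
    }
}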
+ template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + detail_rvv::rvvse_m(mask, reinterpret_cast(mem), src); + } + /****************** * scatter/gather * ******************/ diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index c15404dd9..e5729e65e 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -101,11 +101,28 @@ namespace xsimd return load_aligned(src, convert(), sve {}); } - // load_masked - template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept + // load_masked (compile-time mask): build a runtime predicate from + // the constant mask and reuse the runtime-mask path. ``pmask`` only + // constructs a 128-bit chunk predicate (svdupq_b{8,16,32,64}), which + // is replication-based and does not correctly express a per-lane + // mask on SVE wider than 128 bits — going through ``as_batch_bool`` + // gives the right predicate for every vector width. ``int32``/ + // ``int64``/``uint32``/``uint64`` are excluded so the common-arch + // dispatchers that reinterpret to ``float``/``double`` win partial + // ordering (otherwise we'd be ambiguous with ``requires_arch``). + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode m, requires_arch) noexcept { - return svld1(detail_sve::pmask(), reinterpret_cast const*>(mem)); + return load_masked(mem, mask.as_batch_bool(), convert {}, m, sve {}); + } + + // load_masked (runtime mask) + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return svld1(mask, reinterpret_cast const*>(mem)); } // load_complex @@ -141,6 +158,24 @@ namespace xsimd store_aligned(dst, src, sve {}); } + // store_masked (compile-time mask): forward to the runtime-mask + // path for the same reason as load_masked above; same exclusion of + // 32/64-bit integers to defer to the common dispatchers. + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode m, requires_arch) noexcept + { + store_masked(mem, src, mask.as_batch_bool(), m, sve {}); + } + + // store_masked (runtime mask) + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + svst1(mask, reinterpret_cast*>(mem), src); + } + // store_complex template = 0> XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 6a7206116..341339abe 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -1550,6 +1550,27 @@ namespace xsimd return batch::load(ptr, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr using a runtime mask. Elements + * corresponding to \c false in the mask are not accessed in memory and are + * zero-initialized in the resulting batch. No type conversion is performed: + * \c ptr must point to \c T. Prefer the \c batch_bool_constant overload + * whenever the mask is known at compile time. 
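 * For instance, given a suitably aligned \c ptr, the following illustrative
 * snippet reads only the first three lanes and leaves the remaining lanes zero:
 * \code
 * auto m = xsimd::batch_bool<float>::from_mask(0x7u);
 * auto v = xsimd::load(ptr, m, xsimd::aligned_mode {});
 * \endcode
 *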
+ * @param ptr the memory buffer to read + * @param mask runtime selection mask for the elements to load + * @return a new batch instance + */ + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -1570,6 +1591,16 @@ namespace xsimd return batch::load(ptr, mask, unaligned_mode {}); } + /// \overload + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2663,6 +2694,28 @@ namespace xsimd val.store(mem, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Copy selected elements of batch \c val to the buffer \c mem using a + * runtime mask. Elements corresponding to \c false in the mask are not + * written and leave the contents of \c mem untouched. No type conversion + * is performed: \c mem must point to \c T. Prefer the \c + * batch_bool_constant overload whenever the mask is known at compile time. + * @param mem the memory buffer to write to + * @param val the batch to copy from + * @param mask runtime selection mask for the elements to store + */ + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2683,6 +2736,17 @@ namespace xsimd val.store(mem, mask, unaligned_mode {}); } + /// \overload + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index b584a2d81..063186150 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -144,9 +144,12 @@ namespace xsimd template XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_INLINE void store(U* mem, batch_bool_constant mask, Mode) const noexcept; + /** \brief Runtime-mask store; see xsimd::store(T*, batch const&, batch_bool, Mode). */ + template + XSIMD_INLINE void store(T* mem, batch_bool mask, Mode = {}) const noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept; @@ -156,9 +159,12 @@ namespace xsimd XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; + /** \brief Runtime-mask load; see xsimd::load(T const*, batch_bool, Mode). 
*/ + template + XSIMD_NO_DISCARD static XSIMD_INLINE batch load(T const* mem, batch_bool mask, Mode = {}) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; @@ -722,6 +728,14 @@ namespace xsimd } } + template + template + XSIMD_INLINE batch batch::load(T const* mem, batch_bool mask, Mode mode) noexcept + { + detail::static_check_supported_config(); + return kernel::load_masked(mem, mask, kernel::convert {}, mode, A {}); + } + template template XSIMD_INLINE void batch::store(U* mem, @@ -745,6 +759,16 @@ namespace xsimd } } + template + template + XSIMD_INLINE void batch::store(T* mem, + batch_bool mask, + Mode mode) const noexcept + { + detail::static_check_supported_config(); + kernel::store_masked(mem, *this, mask, mode, A {}); + } + template template XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept diff --git a/include/xsimd/types/xsimd_utils.hpp b/include/xsimd/types/xsimd_utils.hpp index 6af62c1a0..940ee084f 100644 --- a/include/xsimd/types/xsimd_utils.hpp +++ b/include/xsimd/types/xsimd_utils.hpp @@ -457,6 +457,16 @@ namespace xsimd template using complex_batch_type_t = typename complex_batch_type::type; + + namespace details + { + // Returns a bitmask with the lowest \c size bits set. Used by masked + // load/store fast paths to detect "all lanes active". + inline constexpr uint64_t full_mask(std::size_t size) noexcept + { + return size >= 64 ? ~uint64_t(0) : ((uint64_t(1) << size) - 1); + } + } } #endif diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index a5266eeb3..48e5ac0d5 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -66,22 +66,6 @@ struct load_store_test static constexpr bool get(std::size_t index, std::size_t size) noexcept { return index >= (size / 2); } }; - struct mask_first_n - { - static constexpr bool get(std::size_t index, std::size_t size) noexcept - { - return index < (size > 2 ? size / 3 : std::size_t(1)); - } - }; - - struct mask_last_n - { - static constexpr bool get(std::size_t index, std::size_t size) noexcept - { - return index >= size - (size > 2 ? 
size / 3 : std::size_t(1)); - } - }; - struct mask_even { static constexpr bool get(std::size_t index, std::size_t) noexcept { return (index % 2) == 0; } @@ -105,6 +89,41 @@ struct load_store_test static constexpr bool get(std::size_t, std::size_t) noexcept { return true; } }; + template + static batch_bool_type make_runtime_mask() noexcept + { + uint64_t bits = 0; + for (std::size_t i = 0; i < size; ++i) + { + if (Generator::get(i, size)) + { + bits |= uint64_t(1) << i; + } + } + return batch_bool_type::from_mask(bits); + } + + struct compile_time_mask + { + static constexpr const char* tag = ""; + template + static auto make() noexcept + -> decltype(xsimd::make_batch_bool_constant()) + { + return xsimd::make_batch_bool_constant(); + } + }; + + struct runtime_mask + { + static constexpr const char* tag = " runtime"; + template + static batch_bool_type make() noexcept + { + return make_runtime_mask(); + } + }; + int8_vector_type i8_vec; uint8_vector_type ui8_vec; int16_vector_type i16_vec; @@ -367,16 +386,22 @@ struct load_store_test template void run_mask_tests(const V& v, const std::string& name, batch_type& b, const array_type& expected, std::true_type) { - run_load_mask_pattern(v, name, b, expected, " masked none"); - run_load_mask_pattern(v, name, b, expected, " masked first element"); - run_load_mask_pattern(v, name, b, expected, " masked first half"); - run_load_mask_pattern(v, name, b, expected, " masked last half"); - run_load_mask_pattern(v, name, b, expected, " masked first N"); - run_load_mask_pattern(v, name, b, expected, " masked last N"); - run_load_mask_pattern(v, name, b, expected, " masked even elements"); - run_load_mask_pattern(v, name, b, expected, " masked odd elements"); - run_load_mask_pattern(v, name, b, expected, " masked pseudo random"); - run_load_mask_pattern(v, name, b, expected, " masked all elements"); + run_load_mask_patterns(v, name, b, expected); + run_load_mask_patterns(v, name, b, expected); + } + + template + void run_load_mask_patterns(const V& v, const std::string& name, batch_type& b, const array_type& expected) + { + const std::string p = std::string(MaskKind::tag) + " masked"; + run_load_mask_pattern(v, name, b, expected, p + " none"); + run_load_mask_pattern(v, name, b, expected, p + " first element"); + run_load_mask_pattern(v, name, b, expected, p + " first half"); + run_load_mask_pattern(v, name, b, expected, p + " last half"); + run_load_mask_pattern(v, name, b, expected, p + " even elements"); + run_load_mask_pattern(v, name, b, expected, p + " odd elements"); + run_load_mask_pattern(v, name, b, expected, p + " pseudo random"); + run_load_mask_pattern(v, name, b, expected, p + " all elements"); } template @@ -384,10 +409,10 @@ struct load_store_test { } - template + template void run_load_mask_pattern(const V& v, const std::string& name, batch_type& b, const array_type& expected, const std::string& label) { - constexpr auto mask = xsimd::make_batch_bool_constant(); + const auto mask = MaskKind::template make(); array_type expected_masked { 0 }; for (std::size_t i = 0; i < size; ++i) @@ -404,10 +429,10 @@ struct load_store_test CHECK_BATCH_EQ(b, expected_masked); } - template + template void run_store_mask_pattern(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, const std::string& label) { - auto mask = xsimd::make_batch_bool_constant(); + const auto mask = MaskKind::template make(); for (std::size_t i = 0; i < size; ++i) { expected_masked[i] = Generator::get(i, size) ? 
v[i] : value_type(); @@ -425,15 +450,21 @@ struct load_store_test template void run_store_mask_tests(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, std::true_type) { - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first element"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first half"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked last half"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first N"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked last N"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked even elements"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked odd elements"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked pseudo random"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked all elements"); + run_store_mask_patterns(v, name, b, res, expected_masked); + run_store_mask_patterns(v, name, b, res, expected_masked); + } + + template + void run_store_mask_patterns(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked) + { + const std::string p = std::string(MaskKind::tag) + " masked"; + run_store_mask_pattern(v, name, b, res, expected_masked, p + " first element"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " first half"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " last half"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " even elements"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " odd elements"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " pseudo random"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " all elements"); } template @@ -453,6 +484,7 @@ struct load_store_test V sentinel_expected(size, sentinel); auto zero_mask = xsimd::make_batch_bool_constant(); + auto runtime_zero_mask = make_runtime_mask(); std::fill(res.begin(), res.end(), sentinel); b.store(res.data(), zero_mask, xsimd::aligned_mode()); INFO(name, " masked none aligned store"); @@ -470,6 +502,19 @@ struct load_store_test CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) { return v == sentinel; })); + std::fill(res.begin(), res.end(), sentinel); + xsimd::store(res.data(), b, runtime_zero_mask, xsimd::aligned_mode()); + INFO(name, " runtime masked none aligned store"); + CHECK_VECTOR_EQ(res, sentinel_expected); + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store(scratch_ptr, b, runtime_zero_mask, xsimd::unaligned_mode()); + INFO(name, " runtime masked none unaligned store"); + std::copy(scratch_ptr, scratch_ptr + scratch_slice.size(), scratch_slice.begin()); + CHECK_VECTOR_EQ(scratch_slice, sentinel_expected); + CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) + { return v == sentinel; })); + run_store_mask_tests(v, name, b, res, expected_masked, std::true_type {}); } From 0325fb4c0ba3693d38b6a8dd4f3b86b2894de193 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 5 May 2026 14:42:57 -0400 Subject: [PATCH 2/2] refactor: trim masked load/store comments and consolidate AVX2-128 helpers Shorten verbose comments around masked load/store paths, drop the sizeof(int)/sizeof(long long) static_asserts (intrinsic boundaries now reinterpret_cast at the call site), and collapse the four maskload_128/maskstore_128 detail overloads into two XSIMD_IF_CONSTEXPR- dispatched templates. 
Public surface unchanged. --- .../xsimd/arch/common/xsimd_common_memory.hpp | 16 ++--- include/xsimd/arch/xsimd_avx.hpp | 7 +- include/xsimd/arch/xsimd_avx2.hpp | 34 ++++------ include/xsimd/arch/xsimd_avx2_128.hpp | 66 +++++++++---------- include/xsimd/arch/xsimd_avx_128.hpp | 7 +- 5 files changed, 53 insertions(+), 77 deletions(-) diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 0639f5168..9d582943c 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -379,12 +379,8 @@ namespace xsimd XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - // Per-lane validity contract: only active lanes of ``mem`` are - // required to be addressable. An unconditional whole-vector load - // would touch inactive lanes and trip ASan/Valgrind on partial - // buffers, so stay scalar. Arches with hardware predicated loads - // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single - // intrinsic that suppresses inactive-lane reads in hardware. + // Scalar fallback: only active lanes are touched. Arches with + // hardware predicated loads override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array buffer; for (std::size_t i = 0; i < size; ++i) @@ -410,12 +406,8 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - // Per-lane validity contract (matches native masked-store APIs): - // only active lanes of ``mem`` are touched. A load+select+store - // RMW would both read and write inactive bytes, breaking that - // contract — stay scalar. Arches with hardware predicated stores - // override this with a single intrinsic that suppresses inactive - // lanes in hardware. + // Scalar fallback: only active lanes are touched. Arches with + // hardware predicated stores override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array src_buf; src.store_aligned(src_buf.data()); diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 429637784..a477e30b3 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1015,9 +1015,7 @@ namespace xsimd } } - // Runtime-mask load for float/double on AVX. Both aligned_mode and - // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault - // on masked-off lanes, so partial loads across page boundaries are safe. + // Runtime-mask load (float/double). template XSIMD_INLINE batch load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept @@ -1048,8 +1046,7 @@ namespace xsimd } } - // Runtime-mask store for float/double on AVX. Same fault-suppression - // semantics as the masked loads above; alignment mode is irrelevant. + // Runtime-mask store (float/double). template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index f2666018b..9f3415cf2 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -118,23 +118,18 @@ namespace xsimd } } - // load_masked. AVX2 mask{load,store}_epi{32,64} take int*/long long*; - // these helpers exist so the dispatch picks the right intrinsic by - // pointer width. 
Static_asserts pin sizeof(int)/(long long) to the - // values the intrinsics expect — the dispatcher relies on it. + // load_masked: AVX2 has _mm256_maskload_epi{32,64} for 32/64-bit + // integers; 8/16-bit fall back to the common scalar path. namespace detail { - static_assert(sizeof(int) == 4, "AVX2 maskload/maskstore expects sizeof(int) == 4"); - static_assert(sizeof(long long) == 8, "AVX2 maskload/maskstore expects sizeof(long long) == 8"); - XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept { - return _mm256_maskload_epi32(mem, mask); + return _mm256_maskload_epi32(reinterpret_cast(mem), mask); } - XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept + XSIMD_INLINE __m256i maskload(const int64_t* mem, __m256i mask) noexcept { - return _mm256_maskload_epi64(mem, mask); + return _mm256_maskload_epi64(reinterpret_cast(mem), mask); } XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept @@ -148,7 +143,7 @@ namespace xsimd load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2"); - using int_t = std::conditional_t; + using int_t = std::conditional_t; return detail::maskload(reinterpret_cast(mem), mask.as_batch()); } @@ -178,17 +173,12 @@ namespace xsimd return bitwise_cast(r); } - // Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers - // fall back to the scalar common path: AVX2 has no native maskload for - // those widths, and a load-then-blend would break fault-suppression at - // page boundaries (the main reason callers ask for a masked load). - // Both aligned_mode and unaligned_mode route to the same intrinsic — - // masked-off lanes do not fault regardless of alignment. + // Runtime-mask load (32/64-bit integers); 8/16-bit fall back to common. 
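The fall-back is transparent to callers: the same load expression also compiles for 8/16-bit element types, it simply routes through the scalar common path. A sketch, assuming an AVX2 target:

#include <xsimd/xsimd.hpp>
#include <cstdint>

// First 16 of the 32 uint8_t lanes active. There is no AVX2 maskload at this
// width, so the generic per-lane fallback services the call with the same
// "inactive lanes are not read" contract.
xsimd::batch<uint8_t, xsimd::avx2> load_low_half(const uint8_t* p)
{
    auto m = xsimd::batch_bool<uint8_t, xsimd::avx2>::from_mask(0xFFFFu);
    return xsimd::batch<uint8_t, xsimd::avx2>::load(p, m, xsimd::unaligned_mode {});
}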
template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; + using int_t = std::conditional_t; return detail::maskload(reinterpret_cast(mem), __m256i(mask)); } @@ -200,9 +190,9 @@ namespace xsimd _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src); } - XSIMD_INLINE void maskstore(long long* mem, __m256i mask, __m256i src) noexcept + XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept { - _mm256_maskstore_epi64(mem, mask, src); + _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); } } @@ -210,7 +200,7 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; - using int_t = std::conditional_t; + using int_t = std::conditional_t; XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { @@ -248,7 +238,7 @@ namespace xsimd XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; + using int_t = std::conditional_t; detail::maskstore(reinterpret_cast(mem), __m256i(mask), __m256i(src)); } diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 1f2de84b3..d4c29b211 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -89,91 +89,91 @@ namespace xsimd } } + // load_masked / store_masked: AVX2-128 has _mm_maskload/store for + // 32/64-bit integers; 8/16-bit fall back to the common scalar path. namespace detail { - XSIMD_INLINE __m128i maskload_128(int32_t const* mem, __m128i mask) noexcept + template + XSIMD_INLINE __m128i maskload_avx2_128(T const* mem, __m128i mask) noexcept { - return _mm_maskload_epi32(mem, mask); - } - XSIMD_INLINE __m128i maskload_128(long long const* mem, __m128i mask) noexcept - { - return _mm_maskload_epi64(mem, mask); - } - XSIMD_INLINE void maskstore_128(int32_t* mem, __m128i mask, __m128i src) noexcept - { - _mm_maskstore_epi32(mem, mask, src); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_maskload_epi32(reinterpret_cast(mem), mask); + } + else + { + return _mm_maskload_epi64(reinterpret_cast(mem), mask); + } } - XSIMD_INLINE void maskstore_128(long long* mem, __m128i mask, __m128i src) noexcept + + template + XSIMD_INLINE void maskstore_avx2_128(T* mem, __m128i mask, __m128i src) noexcept { - _mm_maskstore_epi64(mem, mask, src); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + _mm_maskstore_epi32(reinterpret_cast(mem), mask, src); + } + else + { + _mm_maskstore_epi64(reinterpret_cast(mem), mask, src); + } } } - // load_masked / store_masked. AVX2-128 has _mm_maskload/store for - // 32/64-bit integers only; 8/16-bit fall back to the common scalar - // path. Aligned and unaligned go through the same intrinsic — masked- - // off lanes do not fault regardless of alignment. 
template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return detail::maskload_128(mem, mask.as_batch()); + return detail::maskload_avx2_128(mem, mask.as_batch()); } template XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); + return detail::maskload_avx2_128(mem, mask.as_batch()); } template XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); + return detail::maskload_avx2_128(mem, mask.as_batch()); } template XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); + return detail::maskload_avx2_128(mem, mask.as_batch()); } template XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - detail::maskstore_128(mem, mask.as_batch(), src); + detail::maskstore_avx2_128(mem, mask.as_batch(), src); } template XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); + detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src)); } template XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); + detail::maskstore_avx2_128(mem, mask.as_batch(), src); } template XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); + detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src)); } - // Runtime-mask path: route through the same helpers as the constant- - // mask overloads. Templated on T so the dispatcher does not need to - // see four near-identical functions; the runtime-mask overloads of - // the common arch take ``batch_bool``, not the four signed/ - // unsigned int variants, so there is no ambiguity here. template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; - return detail::maskload_128(reinterpret_cast(mem), __m128i(mask)); + return detail::maskload_avx2_128(mem, __m128i(mask)); } template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; - detail::maskstore_128(reinterpret_cast(mem), __m128i(mask), __m128i(src)); + detail::maskstore_avx2_128(mem, __m128i(mask), __m128i(src)); } // gather diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index 98642dac6..e282010f2 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -115,9 +115,7 @@ namespace xsimd return _mm_maskload_pd(mem, mask.as_batch()); } - // Runtime-mask load for float/double on AVX-128. 
Both aligned_mode and - // unaligned_mode map to _mm_maskload_* — the intrinsic does not fault - // on masked-off lanes, so partial loads across page boundaries are safe. + // Runtime-mask load (float/double). template XSIMD_INLINE batch load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept @@ -144,8 +142,7 @@ namespace xsimd return _mm_maskstore_pd(mem, mask.as_batch(), src); } - // Runtime-mask store for float/double on AVX-128. Same fault-suppression - // semantics as the masked loads above; alignment mode is irrelevant. + // Runtime-mask store (float/double). template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept