@@ -11319,16 +11319,106 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_2(wchar_t* const _Dest
1131911319}
1132011320
1132111321} // extern "C"
11322+ #endif // ^^^ !defined(_M_ARM64) ^^^
1132211323
1132311324#ifndef _M_ARM64
1132411325namespace {
1132511326 namespace _Bitset_from_string {
11326- #ifdef _M_ARM64EC
11327- using _Traits_1_avx = void;
11328- using _Traits_1_sse = void;
11329- using _Traits_2_avx = void;
11330- using _Traits_2_sse = void;
11331- #else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv
11327+ #if defined(_M_ARM64) || defined(_M_ARM64EC)
11328+ struct _Traits_1_neon {
11329+ using _Guard = char;
11330+ using _Vec = uint8x16_t;
11331+
11332+ static _Vec _Load(const void* const _Src) noexcept {
11333+ return vld1q_u8(static_cast<const uint8_t*>(_Src));
11334+ }
11335+
11336+ static void _Store(void* const _Dest, const _Vec _Val) noexcept {
11337+ return vst1q_u8(static_cast<uint8_t*>(_Dest), _Val);
11338+ }
11339+
11340+ static _Vec _Set(const uint8_t _Val) noexcept {
11341+ return vdupq_n_u8(_Val);
11342+ }
11343+
11344+ static _Vec _Cmp(const _Vec _Val1, const _Vec _Val2) noexcept {
11345+ return vceqq_u8(_Val1, _Val2);
11346+ }
11347+
11348+ static bool _Check(const _Vec _Val, const _Vec _Ex1, const _Vec _Dx0) noexcept {
11349+ const auto _Ex0 = _Cmp(_Val, _Dx0);
11350+ const auto _Ex01 = vorrq_u8(_Ex0, _Ex1);
11351+ const auto _Msk = vgetq_lane_u64(vreinterpretq_u64_u8(vpminq_u8(_Ex01, _Ex01)), 0);
11352+ return _Msk == 0xFFFF'FFFF'FFFF'FFFF;
11353+ }
11354+
11355+ static uint64_t _Movemask(const _Vec _Val) noexcept {
11356+ uint64_t _Val0 = vgetq_lane_u64(vreinterpretq_u64_u8(_Val), 0);
11357+ uint64_t _Val1 = vgetq_lane_u64(vreinterpretq_u64_u8(_Val), 1);
11358+
11359+ _Val0 &= 0x8080808080808080ull;
11360+ _Val1 &= 0x8080808080808080ull;
11361+ _Val0 *= 0x02040810204081ull;
11362+ _Val1 *= 0x02040810204081ull;
11363+
11364+ return (_Val0 >> 56) | ((_Val1 >> 56) << 8);
11365+ }
11366+
11367+ static uint16_t _To_bits(const _Vec _Ex1) noexcept {
11368+ // We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
11369+ static constexpr uint8_t _Idx_arr[16] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
11370+ const auto _Idx = vld1q_u8(_Idx_arr);
11371+
11372+ const auto _Ex2 = vqtbl1q_u8(_Ex1, _Idx);
11373+ return static_cast<uint16_t>(_Movemask(_Ex2));
11374+ }
11375+ };
11376+
11377+ struct _Traits_2_neon {
11378+ using _Guard = char;
11379+ using _Vec = uint16x8_t;
11380+
11381+ static _Vec _Load(const void* const _Src) noexcept {
11382+ return vld1q_u16(static_cast<const uint16_t*>(_Src));
11383+ }
11384+
11385+ static void _Store(void* const _Dest, const _Vec _Val) noexcept {
11386+ return vst1q_u16(static_cast<uint16_t*>(_Dest), _Val);
11387+ }
11388+
11389+ static _Vec _Set(const uint16_t _Val) noexcept {
11390+ return vdupq_n_u16(_Val);
11391+ }
11392+
11393+ static _Vec _Cmp(const _Vec _Val1, const _Vec _Val2) noexcept {
11394+ return vceqq_u16(_Val1, _Val2);
11395+ }
11396+
11397+ static bool _Check(const _Vec _Val, const _Vec _Ex1, const _Vec _Dx0) noexcept {
11398+ const auto _Ex0 = _Cmp(_Val, _Dx0);
11399+ const auto _Ex01 = vorrq_u16(_Ex0, _Ex1);
11400+ const auto _Msk = vgetq_lane_u64(vreinterpretq_u64_u16(vpminq_u16(_Ex01, _Ex01)), 0);
11401+ return _Msk == 0xFFFF'FFFF'FFFF'FFFF;
11402+ }
11403+
11404+ static uint64_t _Movemask(const _Traits_1_neon::_Vec _Val) noexcept {
11405+ uint64_t _Val0 = vgetq_lane_u64(vreinterpretq_u64_u8(_Val), 0);
11406+ _Val0 &= 0x8080808080808080ull;
11407+ _Val0 *= 0x02040810204081ull;
11408+ return _Val0 >> 56;
11409+ }
11410+
11411+ static uint8_t _To_bits(const _Vec _Ex1) noexcept {
11412+ // We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
11413+ static constexpr uint8_t _Idx_arr[16] = {
11414+ 14, 12, 10, 8, 6, 4, 2, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
11415+ const auto _Idx = vld1q_u8(_Idx_arr);
11416+
11417+ const auto _Ex2 = vqtbl1q_u8(vreinterpretq_u8_u16(_Ex1), _Idx);
11418+ return static_cast<uint8_t>(_Movemask(_Ex2));
11419+ }
11420+ };
11421+ #else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1133211422 struct _Traits_avx {
1133311423 using _Guard = _Zeroupper_on_exit;
1133411424 using _Vec = __m256i;
@@ -11432,6 +11522,7 @@ namespace {
1143211522 return _mm_cmpeq_epi16(_Val, _Dx1);
1143311523 }
1143411524 };
11525+ #endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1143511526
1143611527 template <class _Traits, class _Elem, class _OutFn>
1143711528 bool _Loop(const _Elem* const _Src, const _Elem* _Src_end, const typename _Traits::_Vec _Dx0,
@@ -11500,7 +11591,6 @@ namespace {
1150011591
1150111592 return true;
1150211593 }
11503- #endif // ^^^ !defined(_M_ARM64EC) ^^^
1150411594
1150511595 template <class _Elem>
1150611596 bool _Fallback(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, const size_t _Size_bits,
@@ -11533,20 +11623,19 @@ namespace {
1153311623 return true;
1153411624 }
1153511625
11626+ #if !defined(_M_ARM64) && !defined(_M_ARM64EC)
1153611627 template <class _Avx, class _Sse, class _Elem>
1153711628 bool _Dispatch(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars,
1153811629 _Elem _Elem0, _Elem _Elem1) noexcept {
11539- #ifndef _M_ARM64EC
1154011630 if (_Use_avx2() && _Size_bits >= 256) {
1154111631 return _Impl<_Avx>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
1154211632 } else if (_Use_sse42()) {
1154311633 return _Impl<_Sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11544- } else
11545- #endif // ^^^ !defined(_M_ARM64EC) ^^^
11546- {
11634+ } else {
1154711635 return _Fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
1154811636 }
1154911637 }
11638+ #endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1155011639 } // namespace _Bitset_from_string
1155111640} // unnamed namespace
1155211641
@@ -11557,16 +11646,23 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* const _Dest,
1155711646 const char _Elem1) noexcept {
1155811647 using namespace _Bitset_from_string;
1155911648
11649+ #if defined(_M_ARM64) || defined(_M_ARM64EC)
11650+ return _Impl<_Traits_1_neon>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11651+ #else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1156011652 return _Dispatch<_Traits_1_avx, _Traits_1_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11653+ #endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1156111654}
1156211655
1156311656__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* const _Dest, const wchar_t* const _Src,
1156411657 const size_t _Size_bytes, const size_t _Size_bits, const size_t _Size_chars, const wchar_t _Elem0,
1156511658 const wchar_t _Elem1) noexcept {
1156611659 using namespace _Bitset_from_string;
1156711660
11661+ #if defined(_M_ARM64) || defined(_M_ARM64EC)
11662+ return _Impl<_Traits_2_neon>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11663+ #else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1156811664 return _Dispatch<_Traits_2_avx, _Traits_2_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11665+ #endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1156911666}
1157011667
1157111668} // extern "C"
11572- #endif // ^^^ !defined(_M_ARM64) ^^^
0 commit comments