@@ -11069,15 +11069,70 @@ __declspec(noalias) bool __stdcall __std_includes_less_8u(
1106911069
1107011070} // extern "C"
1107111071
11072- #ifndef _M_ARM64
1107311072namespace {
1107411073 namespace _Bitset_to_string {
11075- #ifdef _M_ARM64EC
11076- using _Traits_1_avx = void;
11077- using _Traits_1_sse = void;
11078- using _Traits_2_avx = void;
11079- using _Traits_2_sse = void;
11080- #else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv
11074+ #if defined(_M_ARM64) || defined(_M_ARM64EC)
11075+ struct _Traits_1_neon {
11076+ using _Value_type = uint16_t;
11077+ using _Vec_t = uint8x16_t;
11078+ static void _Exit_vectorized() noexcept {}
11079+
11080+ static void _Out(void* const _Dest, const _Vec_t _Elems) noexcept {
11081+ vst1q_u8(static_cast<uint8_t*>(_Dest), _Elems);
11082+ }
11083+
11084+ static _Vec_t _Set(const uint8_t _Val) noexcept {
11085+ return vdupq_n_u8(_Val);
11086+ }
11087+
11088+ static _Vec_t _Load_constant() noexcept {
11089+ // We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
11090+ static constexpr uint32_t _Idx_arr[4] = {0x01010101, 0x01010101, 0x00000000, 0x00000000};
11091+ const auto _Idx = vld1q_u8(reinterpret_cast<const uint8_t*>(_Idx_arr));
11092+ return _Idx;
11093+ }
11094+
11095+ static _Vec_t __forceinline _Step(
11096+ const uint16_t _Val, const _Vec_t _Px0, const _Vec_t _Px1, const _Vec_t _Idx) noexcept {
11097+ const auto _Vx0 = vdupq_n_u16(_Val);
11098+ const auto _Vx1 = vqtbl1q_u8(vreinterpretq_u8_u16(_Vx0), _Idx);
11099+ const auto _Msk = vandq_u8(_Vx1, vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080)));
11100+ const auto _Ex0 = vceqq_u8(_Msk, vdupq_n_u8(0));
11101+ const auto _Ex1 = vbslq_u8(_Ex0, _Px0, _Px1);
11102+ return _Ex1;
11103+ }
11104+ };
11105+
11106+ struct _Traits_2_neon {
11107+ using _Value_type = uint8_t;
11108+ using _Vec_t = uint16x8_t;
11109+ static void _Exit_vectorized() noexcept {}
11110+
11111+ static void _Out(void* const _Dest, const _Vec_t _Elems) noexcept {
11112+ vst1q_u16(static_cast<uint16_t*>(_Dest), _Elems);
11113+ }
11114+
11115+ static _Vec_t _Set(const uint16_t _Val) noexcept {
11116+ return vdupq_n_u16(_Val);
11117+ }
11118+
11119+ static _Vec_t _Load_constant() noexcept {
11120+ // We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
11121+ static constexpr uint64_t _Wx_arr[2] = {0x0010002000400080, 0x0001000200040008};
11122+ const auto _Wx = vld1q_u64(_Wx_arr);
11123+ return vreinterpretq_u16_u64(_Wx);
11124+ }
11125+
11126+ static _Vec_t __forceinline _Step(
11127+ const uint8_t _Val, const _Vec_t _Px0, const _Vec_t _Px1, const _Vec_t _Wx) noexcept {
11128+ const auto _Vx0 = vdupq_n_u16(static_cast<uint16_t>(_Val));
11129+ const auto _Msk = vandq_u16(_Vx0, _Wx);
11130+ const auto _Ex0 = vceqq_u16(_Msk, vdupq_n_u16(0));
11131+ const auto _Ex1 = vbslq_u16(_Ex0, _Px0, _Px1);
11132+ return _Ex1;
11133+ }
11134+ };
11135+ #else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1108111136 struct _Traits_avx {
1108211137 static void _Out(void* const _Dest, const __m256i _Elems) noexcept {
1108311138 _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Elems);
@@ -11086,6 +11141,10 @@ namespace {
1108611141 static void _Exit_vectorized() noexcept {
1108711142 _mm256_zeroupper();
1108811143 }
11144+
11145+ static int _Load_constant() noexcept {
11146+ return 0;
11147+ }
1108911148 };
1109011149
1109111150 struct _Traits_sse {
@@ -11094,6 +11153,10 @@ namespace {
1109411153 }
1109511154
1109611155 static void _Exit_vectorized() noexcept {}
11156+
11157+ static int _Load_constant() noexcept {
11158+ return 0;
11159+ }
1109711160 };
1109811161
1109911162 struct _Traits_1_avx : _Traits_avx {
@@ -11103,7 +11166,8 @@ namespace {
1110311166 return _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Val));
1110411167 }
1110511168
11106- static __m256i __forceinline _Step(const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
11169+ static __m256i __forceinline _Step(
11170+ const uint32_t _Val, const __m256i _Px0, const __m256i _Px1, int) noexcept {
1110711171 const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
1110811172 const __m128i _Vx1 =
1110911173 _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x01010101, 0x02020202, 0x03030303));
@@ -11123,7 +11187,8 @@ namespace {
1112311187 return _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128());
1112411188 }
1112511189
11126- static __m128i __forceinline _Step(const uint16_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
11190+ static __m128i __forceinline _Step(
11191+ const uint16_t _Val, const __m128i _Px0, const __m128i _Px1, int) noexcept {
1112711192 const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
1112811193 const __m128i _Vx1 =
1112911194 _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
@@ -11141,7 +11206,8 @@ namespace {
1114111206 return _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Val));
1114211207 }
1114311208
11144- static __m256i __forceinline _Step(const uint16_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
11209+ static __m256i __forceinline _Step(
11210+ const uint16_t _Val, const __m256i _Px0, const __m256i _Px1, int) noexcept {
1114511211 const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
1114611212 const __m128i _Vx1 =
1114711213 _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
@@ -11162,14 +11228,16 @@ namespace {
1116211228 return _mm_set1_epi16(_Val);
1116311229 }
1116411230
11165- static __m128i __forceinline _Step(const uint8_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
11231+ static __m128i __forceinline _Step(
11232+ const uint8_t _Val, const __m128i _Px0, const __m128i _Px1, int) noexcept {
1116611233 const __m128i _Vx = _mm_set1_epi16(_Val);
1116711234 const __m128i _Msk = _mm_and_si128(_Vx, _mm_set_epi64x(0x0001000200040008, 0x0010002000400080));
1116811235 const __m128i _Ex0 = _mm_cmpeq_epi16(_Msk, _mm_setzero_si128());
1116911236 const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
1117011237 return _Ex1;
1117111238 }
1117211239 };
11240+ #endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1117311241
1117411242 template <class _Traits, class _Elem>
1117511243 void __stdcall _Impl(
@@ -11178,14 +11246,18 @@ namespace {
1117811246
1117911247 const auto _Px0 = _Traits::_Set(_Elem0);
1118011248 const auto _Px1 = _Traits::_Set(_Elem1);
11249+
11250+ // TRANSITION: DevCom-11056873
11251+ const auto _Constant = _Traits::_Load_constant();
11252+
1118111253 if (_Size_bits >= _Step_size_bits) {
1118211254 _Elem* _Pos = _Dest + _Size_bits;
1118311255 _Size_bits &= _Step_size_bits - 1;
1118411256 _Elem* const _Stop_at = _Dest + _Size_bits;
1118511257 do {
1118611258 typename _Traits::_Value_type _Val;
1118711259 memcpy(&_Val, _Src, sizeof(_Val));
11188- const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
11260+ const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1, _Constant );
1118911261 _Pos -= _Step_size_bits;
1119011262 _Traits::_Out(_Pos, _Elems);
1119111263 _Advance_bytes(_Src, sizeof(_Val));
@@ -11195,7 +11267,7 @@ namespace {
1119511267 if (_Size_bits > 0) {
1119611268 typename _Traits::_Value_type _Val;
1119711269 memcpy(&_Val, _Src, sizeof(_Val));
11198- const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
11270+ const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1, _Constant );
1119911271 _Elem _Tmp[_Step_size_bits];
1120011272 _Traits::_Out(_Tmp, _Elems);
1120111273 const _Elem* const _Tmpd = _Tmp + (_Step_size_bits - _Size_bits);
@@ -11204,25 +11276,23 @@ namespace {
1120411276
1120511277 _Traits::_Exit_vectorized(); // TRANSITION, DevCom-10331414
1120611278 }
11207- #endif // ^^^ !defined(_M_ARM64EC) ^^^
1120811279
11280+ #if !defined(_M_ARM64) && !defined(_M_ARM64EC)
1120911281 template <class _Avx_traits, class _Sse_traits, class _Elem>
1121011282 void __stdcall _Dispatch(_Elem* const _Dest, const void* const _Src, const size_t _Size_bits,
1121111283 const _Elem _Elem0, const _Elem _Elem1) noexcept {
11212- #ifndef _M_ARM64EC
1121311284 if (_Use_avx2() && _Size_bits >= 256) {
1121411285 _Impl<_Avx_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
1121511286 } else if (_Use_sse42()) {
1121611287 _Impl<_Sse_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11217- } else
11218- #endif // ^^^ !defined(_M_ARM64EC) ^^^
11219- {
11288+ } else {
1122011289 const auto _Arr = reinterpret_cast<const uint8_t*>(_Src);
1122111290 for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
1122211291 _Dest[_Size_bits - 1 - _Ix] = ((_Arr[_Ix >> 3] >> (_Ix & 7)) & 1) != 0 ? _Elem1 : _Elem0;
1122311292 }
1122411293 }
1122511294 }
11295+ #endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1122611296 } // namespace _Bitset_to_string
1122711297} // unnamed namespace
1122811298
@@ -11231,17 +11301,26 @@ extern "C" {
1123111301__declspec(noalias) void __stdcall __std_bitset_to_string_1(
1123211302 char* const _Dest, const void* const _Src, const size_t _Size_bits, const char _Elem0, const char _Elem1) noexcept {
1123311303 using namespace _Bitset_to_string;
11304+ #if defined(_M_ARM64) || defined(_M_ARM64EC)
11305+ _Impl<_Traits_1_neon>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11306+ #else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1123411307 _Dispatch<_Traits_1_avx, _Traits_1_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11308+ #endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1123511309}
1123611310
1123711311__declspec(noalias) void __stdcall __std_bitset_to_string_2(wchar_t* const _Dest, const void* const _Src,
1123811312 const size_t _Size_bits, const wchar_t _Elem0, const wchar_t _Elem1) noexcept {
1123911313 using namespace _Bitset_to_string;
11314+ #if defined(_M_ARM64) || defined(_M_ARM64EC)
11315+ _Impl<_Traits_2_neon>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11316+ #else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1124011317 _Dispatch<_Traits_2_avx, _Traits_2_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11318+ #endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1124111319}
1124211320
1124311321} // extern "C"
1124411322
11323+ #ifndef _M_ARM64
1124511324namespace {
1124611325 namespace _Bitset_from_string {
1124711326#ifdef _M_ARM64EC
0 commit comments