@@ -460,56 +460,38 @@ const fn is_ascii(s: &[u8]) -> bool {
460460 )
461461}
462462
463- /// Chunk size for vectorized ASCII checking (two 16-byte SSE registers ).
463+ /// Bytes tested per iteration of the SSE2 ASCII check (four unaligned 16-byte loads); inputs shorter than this skip the SSE2 path.
464464#[ cfg( all( target_arch = "x86_64" , target_feature = "sse2" ) ) ]
465- const CHUNK_SIZE : usize = 32 ;
465+ const SSE2_CHUNK_SIZE : usize = 64 ;
466466
467- /// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
468- /// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
469- ///
470- /// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
471467#[ cfg( all( target_arch = "x86_64" , target_feature = "sse2" ) ) ]
468+ #[ inline]
472469fn is_ascii_sse2 ( bytes : & [ u8 ] ) -> bool { // Returns `true` iff no byte of `bytes` has its high bit set (every byte < 128).
473470 use crate :: arch:: x86_64:: { __m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128} ;
474471
475- let mut i = 0 ;
476-
477- while i + CHUNK_SIZE <= bytes. len ( ) {
478- // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
479- let ptr = unsafe { bytes. as_ptr ( ) . add ( i) } ;
480-
481- // Load two 16-byte chunks and combine them.
482- // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
483- // `_mm_loadu_si128` allows unaligned loads.
484- let chunk1 = unsafe { _mm_loadu_si128 ( ptr as * const __m128i ) } ;
485- // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
486- let chunk2 = unsafe { _mm_loadu_si128 ( ptr. add ( 16 ) as * const __m128i ) } ;
487-
488- // OR them together - if any byte has the high bit set, the result will too.
489- // SAFETY: SSE2 is guaranteed by the cfg predicate.
490- let combined = unsafe { _mm_or_si128 ( chunk1, chunk2) } ;
491-
492- // Create a mask from the MSBs of each byte.
493- // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
494- // SAFETY: SSE2 is guaranteed by the cfg predicate.
495- let mask = unsafe { _mm_movemask_epi8 ( combined) } ;
496-
472+ let ( chunks, rest) = bytes. as_chunks :: < SSE2_CHUNK_SIZE > ( ) ;
473+
474+ for chunk in chunks {
475+ let ptr = chunk. as_ptr ( ) ;
476+ // SAFETY: `chunk` is exactly SSE2_CHUNK_SIZE (64) bytes, so the unaligned 16-byte loads at offsets 0, 16, 32 and 48 stay in bounds; SSE2 is guaranteed by the cfg predicate on this function.
477+ let mask = unsafe {
478+ let a1 = _mm_loadu_si128 ( ptr as * const __m128i ) ;
479+ let a2 = _mm_loadu_si128 ( ptr. add ( 16 ) as * const __m128i ) ;
480+ let b1 = _mm_loadu_si128 ( ptr. add ( 32 ) as * const __m128i ) ;
481+ let b2 = _mm_loadu_si128 ( ptr. add ( 48 ) as * const __m128i ) ;
482+ // OR the four loaded vectors together - if any byte has its high bit set, `combined` will too.
483+ let combined = _mm_or_si128 ( _mm_or_si128 ( a1, a2) , _mm_or_si128 ( b1, b2) ) ;
484+ // Create a mask from the MSBs of each byte.
485+ // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
486+ _mm_movemask_epi8 ( combined)
487+ } ;
497488 if mask != 0 {
498489 return false ;
499490 }
500-
501- i += CHUNK_SIZE ;
502- }
503-
504- // Handle remaining bytes with simple loop
505- while i < bytes. len ( ) {
506- if !bytes[ i] . is_ascii ( ) {
507- return false ;
508- }
509- i += 1 ;
510491 }
511492
512- true
493+ // Check the trailing bytes (fewer than SSE2_CHUNK_SIZE) with a plain per-byte test.
494+ rest. iter ( ) . all ( |b| b. is_ascii ( ) )
513495}
514496
515497/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
@@ -529,7 +511,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
529511 is_ascii_simple( bytes)
530512 } else {
531513 // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
532- if bytes. len( ) < CHUNK_SIZE {
514+ if bytes. len( ) < SSE2_CHUNK_SIZE {
533515 let chunks = bytes. chunks_exact( USIZE_SIZE ) ;
534516 let remainder = chunks. remainder( ) ;
535517 for chunk in chunks {
0 commit comments