Skip to content

Commit b614256

Browse files
committed
Auto merge of #151664 - jhpratt:rollup-scDdbzg, r=jhpratt
Rollup of 4 pull requests

Successful merges:

- #150353 (refactor rustc-hash integration)
- #151611 (Improve is_ascii performance on x86_64 with explicit SSE2 intrinsics)
- #150705 (Add missing mut to pin.rs docs)
- #151639 (Fix broken WASIp1 reference link)
2 parents 873d468 + efe56a5 commit b614256

7 files changed

Lines changed: 34 additions & 58 deletions

File tree

compiler/rustc_data_structures/src/fx.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
use std::hash::BuildHasherDefault;
2-
31
pub use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet, FxHasher};
42

53
pub type StdEntry<'a, K, V> = std::collections::hash_map::Entry<'a, K, V>;
64

7-
pub type FxIndexMap<K, V> = indexmap::IndexMap<K, V, BuildHasherDefault<FxHasher>>;
8-
pub type FxIndexSet<V> = indexmap::IndexSet<V, BuildHasherDefault<FxHasher>>;
5+
pub type FxIndexMap<K, V> = indexmap::IndexMap<K, V, FxBuildHasher>;
6+
pub type FxIndexSet<V> = indexmap::IndexSet<V, FxBuildHasher>;
97
pub type IndexEntry<'a, K, V> = indexmap::map::Entry<'a, K, V>;
108
pub type IndexOccupiedEntry<'a, K, V> = indexmap::map::OccupiedEntry<'a, K, V>;
119

compiler/rustc_data_structures/src/unord.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ use std::hash::Hash;
88
use std::iter::{Product, Sum};
99
use std::ops::Index;
1010

11-
use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet};
1211
use rustc_macros::{Decodable_NoContext, Encodable_NoContext};
1312

1413
use crate::fingerprint::Fingerprint;
14+
use crate::fx::{FxBuildHasher, FxHashMap, FxHashSet};
1515
use crate::stable_hasher::{HashStable, StableCompare, StableHasher, ToStableHashKey};
1616

1717
/// `UnordItems` is the order-less version of `Iterator`. It only contains methods

compiler/rustc_type_ir/src/data_structures/mod.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
use std::hash::BuildHasherDefault;
2-
31
pub use ena::unify::{NoError, UnifyKey, UnifyValue};
4-
use rustc_hash::FxHasher;
2+
use rustc_hash::FxBuildHasher;
53
pub use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet};
64

7-
pub type IndexMap<K, V> = indexmap::IndexMap<K, V, BuildHasherDefault<FxHasher>>;
8-
pub type IndexSet<V> = indexmap::IndexSet<V, BuildHasherDefault<FxHasher>>;
5+
pub type IndexMap<K, V> = indexmap::IndexMap<K, V, FxBuildHasher>;
6+
pub type IndexSet<V> = indexmap::IndexSet<V, FxBuildHasher>;
97

108
mod delayed_map;
119

library/core/src/pin.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -831,15 +831,13 @@
831831
//! <code>fn get_pin_mut(self: [Pin]<[`&mut Self`]>) -> [Pin]<[`&mut T`]></code>.
832832
//! Then we could do the following:
833833
//! ```compile_fail
834-
//! # use std::cell::RefCell;
835-
//! # use std::pin::Pin;
836-
//! fn exploit_ref_cell<T>(rc: Pin<&mut RefCell<T>>) {
834+
//! fn exploit_ref_cell<T>(mut rc: Pin<&mut RefCell<T>>) {
837835
//! // Here we get pinned access to the `T`.
838836
//! let _: Pin<&mut T> = rc.as_mut().get_pin_mut();
839837
//!
840838
//! // And here we have `&mut T` to the same data.
841839
//! let shared: &RefCell<T> = rc.into_ref().get_ref();
842-
//! let borrow = shared.borrow_mut();
840+
//! let mut borrow = shared.borrow_mut();
843841
//! let content = &mut *borrow;
844842
//! }
845843
//! ```

library/core/src/slice/ascii.rs

Lines changed: 22 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -460,56 +460,38 @@ const fn is_ascii(s: &[u8]) -> bool {
460460
)
461461
}
462462

463-
/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
463+
/// Chunk size for SSE2 vectorized ASCII checking (4x 16-byte loads).
464464
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
465-
const CHUNK_SIZE: usize = 32;
465+
const SSE2_CHUNK_SIZE: usize = 64;
466466

467-
/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
468-
/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
469-
///
470-
/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
471467
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
468+
#[inline]
472469
fn is_ascii_sse2(bytes: &[u8]) -> bool {
473470
use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
474471

475-
let mut i = 0;
476-
477-
while i + CHUNK_SIZE <= bytes.len() {
478-
// SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
479-
let ptr = unsafe { bytes.as_ptr().add(i) };
480-
481-
// Load two 16-byte chunks and combine them.
482-
// SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
483-
// `_mm_loadu_si128` allows unaligned loads.
484-
let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
485-
// SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
486-
let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
487-
488-
// OR them together - if any byte has the high bit set, the result will too.
489-
// SAFETY: SSE2 is guaranteed by the cfg predicate.
490-
let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
491-
492-
// Create a mask from the MSBs of each byte.
493-
// If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
494-
// SAFETY: SSE2 is guaranteed by the cfg predicate.
495-
let mask = unsafe { _mm_movemask_epi8(combined) };
496-
472+
let (chunks, rest) = bytes.as_chunks::<SSE2_CHUNK_SIZE>();
473+
474+
for chunk in chunks {
475+
let ptr = chunk.as_ptr();
476+
// SAFETY: chunk is 64 bytes. SSE2 is baseline on x86_64.
477+
let mask = unsafe {
478+
let a1 = _mm_loadu_si128(ptr as *const __m128i);
479+
let a2 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
480+
let b1 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
481+
let b2 = _mm_loadu_si128(ptr.add(48) as *const __m128i);
482+
// OR all chunks - if any byte has high bit set, combined will too.
483+
let combined = _mm_or_si128(_mm_or_si128(a1, a2), _mm_or_si128(b1, b2));
484+
// Create a mask from the MSBs of each byte.
485+
// If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
486+
_mm_movemask_epi8(combined)
487+
};
497488
if mask != 0 {
498489
return false;
499490
}
500-
501-
i += CHUNK_SIZE;
502-
}
503-
504-
// Handle remaining bytes with simple loop
505-
while i < bytes.len() {
506-
if !bytes[i].is_ascii() {
507-
return false;
508-
}
509-
i += 1;
510491
}
511492

512-
true
493+
// Handle remaining bytes
494+
rest.iter().all(|b| b.is_ascii())
513495
}
514496

515497
/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
@@ -529,7 +511,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
529511
is_ascii_simple(bytes)
530512
} else {
531513
// For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
532-
if bytes.len() < CHUNK_SIZE {
514+
if bytes.len() < SSE2_CHUNK_SIZE {
533515
let chunks = bytes.chunks_exact(USIZE_SIZE);
534516
let remainder = chunks.remainder();
535517
for chunk in chunks {

src/doc/rustc/src/platform-support/wasm32-wasip1.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ focused on the Component Model-based definition of WASI. At this point the
2020
`wasm32-wasip1` Rust target is intended for historical compatibility with
2121
[WASIp1] set of syscalls.
2222

23-
[WASIp1]: https://github.com/WebAssembly/WASI/tree/main/legacy/preview1
23+
[WASIp1]: https://github.com/WebAssembly/WASI/tree/wasi-0.1/preview1
2424
[Component Model]: https://github.com/webassembly/component-model
2525

2626
Today the `wasm32-wasip1` target will generate core WebAssembly modules

tests/assembly-llvm/slice-is-ascii.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@
1313
/// Verify `is_ascii` generates efficient code on different architectures:
1414
///
1515
/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
16-
/// The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
17-
/// See: https://github.com/llvm/llvm-project/issues/176906
16+
/// Good version uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
1817
///
1918
/// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
20-
/// This architecture still relies on LLVM auto-vectorization.
2119
2220
// X86_64-LABEL: test_is_ascii
2321
// X86_64-NOT: kshiftrd
2422
// X86_64-NOT: kshiftrq
23+
// X86_64: {{vpor|por}}
24+
// X86_64: {{vpmovmskb|pmovmskb}}
2525

2626
// LA64-LABEL: test_is_ascii
2727
// LA64: vmskltz.b

0 commit comments

Comments
 (0)