Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions crates/uffs-mft/src/index/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,19 @@ impl MftIndex {
stats.total_name_bytes += u64::from(record.first_name.name.length());
}

// WI-4.1: surface NTFS-name decode loss. The shared decoder
// (`io::parser::unified::decode_name_u16`) tallies every U+FFFD
// substitution into a process-global counter; snapshot it into the
// stats and warn once when non-zero so the loss is measured, not
// silent. (Eliminating the loss entirely is the WI-4.4 RFC.)
stats.lossy_name_count = crate::io::parser::unified::lossy_name_count();
if stats.lossy_name_count > 0 {
tracing::warn!(
lossy_name_count = stats.lossy_name_count,
"filenames contained characters not representable in UTF-8 and were stored with U+FFFD"
);
}

self.stats = stats;
}

Expand Down
7 changes: 7 additions & 0 deletions crates/uffs-mft/src/index/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ pub struct MftStats {
pub size_bucket_counts: [u32; 8],
/// Total bytes per size bucket.
pub size_bucket_bytes: [u64; 8],
/// Number of U+FFFD substitutions made while decoding NTFS names from
/// UTF-16 (Category 4, WI-4.1). `0` means every name decoded losslessly;
/// `> 0` means that many code units were not representable in UTF-8 and
/// were stored as the replacement character — surfaced via a `warn!` at
/// index-build time so the loss is visible, not silent.
pub lossy_name_count: u64,
}

impl MftStats {
Expand All @@ -113,6 +119,7 @@ impl MftStats {
reparse_bytes: 0,
size_bucket_counts: [0; 8],
size_bucket_bytes: [0; 8],
lossy_name_count: 0,
}
}

Expand Down
5 changes: 4 additions & 1 deletion crates/uffs-mft/src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ mod chunking;
mod extent_map;
mod fixup;
mod merger;
mod parser;
// `pub(crate)` so the instrumented name decoder (`parser::unified::
// decode_name_u16`, WI-4.1) is reachable from the sibling `parse/` and
// `usn/` modules that decode NTFS names — one decoder, one lossy tally.
pub(crate) mod parser;
// readers module available on all platforms (contains ChaosMftReader for
// offline MFT)
pub mod readers;
Expand Down
4 changes: 2 additions & 2 deletions crates/uffs-mft/src/io/parser/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ pub fn parse_record_to_fragment(
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
let name = String::from_utf16_lossy(&name_u16);
let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
let parent_frs = file_reference_to_frs(fn_attr.parent_directory);
let namespace = fn_attr.file_name_namespace;

Expand Down Expand Up @@ -241,7 +241,7 @@ pub fn parse_record_to_fragment(
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
let stream_name = String::from_utf16_lossy(&name_u16);
let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
// ALL named $DATA streams create regular
// stream entries. Internal ones are filtered from
// output by is_internal_windows_stream in the output layer.
Expand Down
4 changes: 2 additions & 2 deletions crates/uffs-mft/src/io/parser/fragment_extension.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ pub(super) fn parse_extension_to_fragment(
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
let name = String::from_utf16_lossy(&name_u16);
let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
let parent_frs = fn_attr.parent_directory & 0x0000_FFFF_FFFF_FFFF;
names.push((name, parent_frs));
}
Expand Down Expand Up @@ -169,7 +169,7 @@ pub(super) fn parse_extension_to_fragment(
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
let stream_name = String::from_utf16_lossy(&name_u16);
let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
// ALL named $DATA streams create regular
// stream entries. Internal ones are filtered from
// output by is_internal_windows_stream in the output layer.
Expand Down
6 changes: 3 additions & 3 deletions crates/uffs-mft/src/io/parser/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
let name = String::from_utf16_lossy(&name_u16);
let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
let parent_frs = file_reference_to_frs(fn_attr.parent_directory);
let namespace = fn_attr.file_name_namespace;

Expand Down Expand Up @@ -350,7 +350,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
let stream_name = String::from_utf16_lossy(&name_u16);
let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;

// $BadClus:$Bad (FRS 8) uses InitializedSize
// instead of DataSize/AllocatedSize to avoid counting the
Expand Down Expand Up @@ -449,7 +449,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
String::from_utf16_lossy(&name_u16)
crate::io::parser::unified::decode_name_u16(&name_u16).0
};
(is_i30, name)
} else {
Expand Down
6 changes: 3 additions & 3 deletions crates/uffs-mft/src/io/parser/index_extension.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ pub(super) fn parse_extension_to_index(
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
let name = String::from_utf16_lossy(&name_u16);
let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
let parent_frs = fn_attr.parent_directory & 0x0000_FFFF_FFFF_FFFF;
names.push((name, parent_frs));
}
Expand Down Expand Up @@ -252,7 +252,7 @@ pub(super) fn parse_extension_to_index(
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
let stream_name = String::from_utf16_lossy(&name_u16);
let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
// ALL named $DATA streams create regular
// stream entries. Internal ones are filtered from
// output by is_internal_windows_stream in the output layer.
Expand Down Expand Up @@ -304,7 +304,7 @@ pub(super) fn parse_extension_to_index(
.chunks_exact(2)
.map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
.collect();
String::from_utf16_lossy(&name_u16)
crate::io::parser::unified::decode_name_u16(&name_u16).0
};
(is_i30, name)
} else {
Expand Down
109 changes: 106 additions & 3 deletions crates/uffs-mft/src/io/parser/unified.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@ use crate::ntfs::{
};

/// Decode a UTF-16LE byte slice into `out`, replacing unpaired surrogates
/// with U+FFFD. Returns the number of bytes written to `out`.
/// with U+FFFD. Returns the number of U+FFFD replacements emitted
/// (`0` = lossless).
///
/// This avoids the per-call `SmallVec` + `String` allocation that
/// `String::from_utf16_lossy` requires.
/// `String::from_utf16_lossy` requires, and — unlike `from_utf16_lossy` —
/// surfaces the substitution count so name loss at the NTFS boundary is
/// measured, not silent (Category 4, WI-4.1).
#[inline]
fn decode_utf16le_into(bytes: &[u8], out: &mut String) {
fn decode_utf16le_into(bytes: &[u8], out: &mut String) -> u32 {
out.clear();
let mut replacements: u32 = 0;
let mut i = 0_usize;
while let Some(pair) = bytes
.get(i..i + 2)
Expand All @@ -52,17 +56,21 @@ fn decode_utf16le_into(bytes: &[u8], out: &mut String) {
out.push(ch);
} else {
out.push(char::REPLACEMENT_CHARACTER);
replacements = replacements.saturating_add(1);
}
} else {
out.push(char::REPLACEMENT_CHARACTER);
replacements = replacements.saturating_add(1);
}
} else {
out.push(char::REPLACEMENT_CHARACTER);
replacements = replacements.saturating_add(1);
}
}
// Low surrogate without preceding high
0xDC00..=0xDFFF => {
out.push(char::REPLACEMENT_CHARACTER);
replacements = replacements.saturating_add(1);
}
_ => {
// All non-surrogate u16 values are valid Unicode scalar values.
Expand All @@ -73,6 +81,50 @@ fn decode_utf16le_into(bytes: &[u8], out: &mut String) {
}
}
}
replacements
}

/// Turn a slice of UTF-16 code units into an owned `String`, reporting how
/// many U+FFFD substitutions were needed.
///
/// Returns `(decoded, replacement_count)`; a count of `0` means the name
/// decoded without loss. Prefer this over `String::from_utf16_lossy` at NTFS
/// name boundaries so that loss is measured instead of silent (Category 4,
/// WI-4.1).
///
/// The code units are serialised back to little-endian bytes and handed to
/// `decode_utf16le_into`, so surrogate handling lives in exactly one place.
/// Any non-zero replacement count is also added to the process-global
/// `LOSSY_NAME_COUNT` tally.
#[inline]
pub(crate) fn decode_name_u16(units: &[u16]) -> (String, u32) {
    use core::sync::atomic::Ordering;

    // Two bytes per code unit; reserve up front so the extend never regrows.
    let mut le_bytes: Vec<u8> = Vec::with_capacity(units.len().saturating_mul(2));
    le_bytes.extend(units.iter().flat_map(|unit| unit.to_le_bytes()));

    let mut decoded = String::new();
    let replacements = decode_utf16le_into(&le_bytes, &mut decoded);

    // Only touch the shared atomic when something was actually lost.
    if replacements != 0 {
        LOSSY_NAME_COUNT.fetch_add(u64::from(replacements), Ordering::Relaxed);
    }

    (decoded, replacements)
}

/// Process-global tally of U+FFFD substitutions emitted by
/// [`decode_name_u16`] across all NTFS-name decodes (Category 4, WI-4.1).
///
/// The unit is *substitutions*, not names: [`decode_name_u16`] adds the
/// per-call replacement count, so a single name containing several unpaired
/// surrogates contributes more than one to this tally.
///
/// The parser call sites are spread across nine modules and do not thread a
/// stats accumulator through their (hot-path) signatures, so the count is
/// gathered here with a single relaxed atomic — cheap, lock-free, and read
/// at index-build time into [`crate::index::stats::MftStats::lossy_name_count`]
/// for the U+FFFD warning. `Relaxed` is sufficient: it is a monotonic
/// diagnostic counter, not a synchronisation point.
///
/// NOTE(review): no reset is visible alongside this definition, so a snapshot
/// presumably reflects every decode since process start rather than a single
/// index build — confirm before treating the value as per-volume.
pub(crate) static LOSSY_NAME_COUNT: core::sync::atomic::AtomicU64 =
core::sync::atomic::AtomicU64::new(0);

/// Read the process-global lossy-name tally as it stands right now.
///
/// This is a relaxed load of `LOSSY_NAME_COUNT`; the value is a diagnostic
/// snapshot and carries no ordering guarantees relative to other memory.
#[inline]
pub(crate) fn lossy_name_count() -> u64 {
    use core::sync::atomic::Ordering;

    LOSSY_NAME_COUNT.load(Ordering::Relaxed)
}

/// Process a single MFT record (base OR extension) in one pass.
Expand Down Expand Up @@ -473,3 +525,54 @@ fn rd_u64(buf: &[u8], off: usize) -> u64 {
.and_then(|sl| <[u8; 8]>::try_from(sl).ok())
.map_or(0, u64::from_le_bytes)
}

#[cfg(test)]
mod tests {
    use super::{decode_name_u16, lossy_name_count};

    /// Well-formed UTF-16 (BMP characters plus a valid surrogate pair) must
    /// decode with zero replacements.
    #[test]
    fn decode_name_u16_lossless_bmp_and_astral() {
        // "Aé😀" — BMP + an astral char (valid surrogate pair). No loss.
        // 'A'=0x0041, 'é'=0x00E9, '😀'=U+1F600 → D83D DE00.
        let units = [0x0041_u16, 0x00E9, 0xD83D, 0xDE00];
        let (name, count) = decode_name_u16(&units);
        assert_eq!(count, 0, "well-formed UTF-16 must decode losslessly");
        assert_eq!(name, "Aé😀");
        assert!(!name.contains(char::REPLACEMENT_CHARACTER));
    }

    /// A lone high surrogate is replaced, counted in the return value, and
    /// reflected in the process-global tally.
    #[test]
    fn decode_name_u16_unpaired_surrogate_is_counted_and_replaced() {
        // A lone high surrogate (0xD800) with no following low surrogate —
        // legal on NTFS, illegal in UTF-8. Must NOT panic; must substitute
        // exactly one U+FFFD and report the count.
        let units = [
            0x0066_u16, // 'f'
            0xD800,     // unpaired high
            0x006F,     // 'o'
        ];
        let before = lossy_name_count();
        let (name, count) = decode_name_u16(&units);
        let after = lossy_name_count();
        assert_eq!(count, 1, "one unpaired surrogate → one replacement");
        assert!(
            name.contains(char::REPLACEMENT_CHARACTER),
            "decoded name must contain U+FFFD"
        );
        // The tally is process-global and the test harness runs tests on
        // parallel threads, so other tests (e.g. the lone-low-surrogate case
        // below) may bump it between the two reads. Assert the monotonic
        // lower bound rather than exact equality to keep this test
        // deterministic while still proving the index-build warn/stat sees
        // the loss (WI-4.1).
        assert!(
            after >= before.saturating_add(u64::from(count)),
            "global lossy tally must increase by at least the replacement count"
        );
    }

    /// A lone low surrogate with no preceding high surrogate is replaced by
    /// a single U+FFFD and counted once.
    #[test]
    fn decode_name_u16_lone_low_surrogate_is_counted() {
        // A lone LOW surrogate (0xDC00) with no preceding high surrogate.
        let units = [0xDC00_u16];
        let (name, count) = decode_name_u16(&units);
        assert_eq!(count, 1);
        assert_eq!(name, "\u{FFFD}");
    }
}
10 changes: 5 additions & 5 deletions crates/uffs-mft/src/parse/direct_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
let name = String::from_utf16_lossy(&name_u16);
let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
let parent_frs = file_reference_to_frs(fn_attr.parent_directory);
let namespace = fn_attr.file_name_namespace;

Expand Down Expand Up @@ -304,7 +304,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
let stream_name = String::from_utf16_lossy(&name_u16);
let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
// ALL named $DATA streams create regular stream entries.
// Internal ones are filtered from
// output by is_internal_windows_stream in the output layer.
Expand Down Expand Up @@ -376,7 +376,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
String::from_utf16_lossy(&name_u16)
crate::io::parser::unified::decode_name_u16(&name_u16).0
};
(is_i30, name)
} else {
Expand Down Expand Up @@ -500,7 +500,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
String::from_utf16_lossy(&name_u16)
crate::io::parser::unified::decode_name_u16(&name_u16).0
} else {
String::new()
}
Expand Down Expand Up @@ -585,7 +585,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
String::from_utf16_lossy(&name_u16)
crate::io::parser::unified::decode_name_u16(&name_u16).0
} else {
String::new()
}
Expand Down
10 changes: 5 additions & 5 deletions crates/uffs-mft/src/parse/direct_index_extension.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ pub(super) fn parse_extension_to_index(
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
let name = String::from_utf16_lossy(&name_u16);
let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
let parent_frs = fn_attr.parent_directory & 0x0000_FFFF_FFFF_FFFF;
names.push((name, parent_frs));
}
Expand Down Expand Up @@ -233,7 +233,7 @@ pub(super) fn parse_extension_to_index(
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
let stream_name = String::from_utf16_lossy(&name_u16);
let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
// ALL named $DATA streams create regular
// stream entries. Internal ones are filtered from
// output by is_internal_windows_stream in the output layer.
Expand Down Expand Up @@ -282,7 +282,7 @@ pub(super) fn parse_extension_to_index(
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
String::from_utf16_lossy(&name_u16)
crate::io::parser::unified::decode_name_u16(&name_u16).0
};
(is_i30, name)
} else {
Expand Down Expand Up @@ -403,7 +403,7 @@ pub(super) fn parse_extension_to_index(
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
String::from_utf16_lossy(&name_u16)
crate::io::parser::unified::decode_name_u16(&name_u16).0
} else {
String::new()
}
Expand Down Expand Up @@ -488,7 +488,7 @@ pub(super) fn parse_extension_to_index(
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
String::from_utf16_lossy(&name_u16)
crate::io::parser::unified::decode_name_u16(&name_u16).0
} else {
String::new()
}
Expand Down
4 changes: 4 additions & 0 deletions crates/uffs-mft/src/platform/system.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,10 @@ fn is_ntfs_volume(drive_letter: DriveLetter) -> bool {
return false;
}

// AUDIT-OK(bytes): decodes the Windows filesystem TYPE label (e.g. "NTFS")
// for an `== "NTFS"` check — not an NTFS filename. A lossy decode could
// only fail the equality (fail-safe: treat as not-NTFS), never corrupt a
// stored name, so the instrumented name decoder does not apply here.
let fs_name_raw = String::from_utf16_lossy(&fs_name_buffer);
let fs_name = fs_name_raw.trim_end_matches('\0');

Expand Down
Loading
Loading