skyllc-ai · githubrobbi · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -209,6 +209,19 @@ impl MftIndex {
             stats.total_name_bytes += u64::from(record.first_name.name.length());
         }
 
+        // WI-4.1: surface NTFS-name decode loss. The shared decoder
+        // (`io::parser::unified::decode_name_u16`) tallies every U+FFFD
+        // substitution into a process-global counter; snapshot it into the
+        // stats and warn once when non-zero so the loss is measured, not
+        // silent. (Eliminating the loss entirely is the WI-4.4 RFC.)
+        stats.lossy_name_count = crate::io::parser::unified::lossy_name_count();
+        if stats.lossy_name_count > 0 {
+            tracing::warn!(
+                lossy_name_count = stats.lossy_name_count,
+                "filenames contained characters not representable in UTF-8 and were stored with U+FFFD"
+            );
+        }
+
         self.stats = stats;
     }
 

@@ -87,6 +87,12 @@ pub struct MftStats {
     pub size_bucket_counts: [u32; 8],
     /// Total bytes per size bucket.
     pub size_bucket_bytes: [u64; 8],
+    /// Number of U+FFFD substitutions made while decoding NTFS names from
+    /// UTF-16 (Category 4, WI-4.1). `0` means every name decoded losslessly;
+    /// `> 0` means that many code units were not representable in UTF-8 and
+    /// were stored as the replacement character — surfaced via a `warn!` at
+    /// index-build time so the loss is visible, not silent.
+    pub lossy_name_count: u64,
 }
 
 impl MftStats {
@@ -113,6 +119,7 @@ impl MftStats {
             reparse_bytes: 0,
             size_bucket_counts: [0; 8],
             size_bucket_bytes: [0; 8],
+            lossy_name_count: 0,
         }
     }
 

@@ -25,7 +25,10 @@ mod chunking;
 mod extent_map;
 mod fixup;
 mod merger;
-mod parser;
+// `pub(crate)` so the instrumented name decoder (`parser::unified::
+// decode_name_u16`, WI-4.1) is reachable from the sibling `parse/` and
+// `usn/` modules that decode NTFS names — one decoder, one lossy tally.
+pub(crate) mod parser;
 // readers module available on all platforms (contains ChaosMftReader for
 // offline MFT)
 pub mod readers;

@@ -141,7 +141,7 @@ pub fn parse_record_to_fragment(
                             .chunks_exact(2)
                             .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                             .collect();
-                        let name = String::from_utf16_lossy(&name_u16);
+                        let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                         let parent_frs = file_reference_to_frs(fn_attr.parent_directory);
                         let namespace = fn_attr.file_name_namespace;
 
@@ -241,7 +241,7 @@ pub fn parse_record_to_fragment(
                             .chunks_exact(2)
                             .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                             .collect();
-                        let stream_name = String::from_utf16_lossy(&name_u16);
+                        let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                         // ALL named $DATA streams create regular
                         // stream entries.  Internal ones are filtered from
                         // output by is_internal_windows_stream in the output layer.

@@ -95,7 +95,7 @@ pub(super) fn parse_extension_to_fragment(
                                 .chunks_exact(2)
                                 .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                                 .collect();
-                            let name = String::from_utf16_lossy(&name_u16);
+                            let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                             let parent_frs = fn_attr.parent_directory & 0x0000_FFFF_FFFF_FFFF;
                             names.push((name, parent_frs));
                         }
@@ -169,7 +169,7 @@ pub(super) fn parse_extension_to_fragment(
                             .chunks_exact(2)
                             .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                             .collect();
-                        let stream_name = String::from_utf16_lossy(&name_u16);
+                        let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                         // ALL named $DATA streams create regular
                         // stream entries.  Internal ones are filtered from
                         // output by is_internal_windows_stream in the output layer.

@@ -201,7 +201,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
                                 .chunks_exact(2)
                                 .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                                 .collect();
-                            let name = String::from_utf16_lossy(&name_u16);
+                            let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                             let parent_frs = file_reference_to_frs(fn_attr.parent_directory);
                             let namespace = fn_attr.file_name_namespace;
 
@@ -350,7 +350,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
                             .chunks_exact(2)
                             .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                             .collect();
-                        let stream_name = String::from_utf16_lossy(&name_u16);
+                        let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
 
                         // $BadClus:$Bad (FRS 8) uses InitializedSize
                         // instead of DataSize/AllocatedSize to avoid counting the
@@ -449,7 +449,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
                                 .chunks_exact(2)
                                 .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                                 .collect();
-                            String::from_utf16_lossy(&name_u16)
+                            crate::io::parser::unified::decode_name_u16(&name_u16).0
                         };
                         (is_i30, name)
                     } else {

@@ -130,7 +130,7 @@ pub(super) fn parse_extension_to_index(
                                     .chunks_exact(2)
                                     .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                                     .collect();
-                                let name = String::from_utf16_lossy(&name_u16);
+                                let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                                 let parent_frs = fn_attr.parent_directory & 0x0000_FFFF_FFFF_FFFF;
                                 names.push((name, parent_frs));
                             }
@@ -252,7 +252,7 @@ pub(super) fn parse_extension_to_index(
                             .chunks_exact(2)
                             .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                             .collect();
-                        let stream_name = String::from_utf16_lossy(&name_u16);
+                        let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                         // ALL named $DATA streams create regular
                         // stream entries.  Internal ones are filtered from
                         // output by is_internal_windows_stream in the output layer.
@@ -304,7 +304,7 @@ pub(super) fn parse_extension_to_index(
                                 .chunks_exact(2)
                                 .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                                 .collect();
-                            String::from_utf16_lossy(&name_u16)
+                            crate::io::parser::unified::decode_name_u16(&name_u16).0
                         };
                         (is_i30, name)
                     } else {

@@ -21,13 +21,17 @@ use crate::ntfs::{
 };
 
 /// Decode a UTF-16LE byte slice into `out`, replacing unpaired surrogates
-/// with U+FFFD.  Returns the number of bytes written to `out`.
+/// with U+FFFD.  Returns the number of U+FFFD replacements emitted
+/// (`0` = lossless).
 ///
 /// This avoids the per-call `SmallVec` + `String` allocation that
-/// `String::from_utf16_lossy` requires.
+/// `String::from_utf16_lossy` requires, and — unlike `from_utf16_lossy` —
+/// surfaces the substitution count so name loss at the NTFS boundary is
+/// measured, not silent (Category 4, WI-4.1).
 #[inline]
-fn decode_utf16le_into(bytes: &[u8], out: &mut String) {
+fn decode_utf16le_into(bytes: &[u8], out: &mut String) -> u32 {
     out.clear();
+    let mut replacements: u32 = 0;
     let mut i = 0_usize;
     while let Some(pair) = bytes
         .get(i..i + 2)
@@ -52,17 +56,21 @@ fn decode_utf16le_into(bytes: &[u8], out: &mut String) {
                             out.push(ch);
                         } else {
                             out.push(char::REPLACEMENT_CHARACTER);
+                            replacements = replacements.saturating_add(1);
                         }
                     } else {
                         out.push(char::REPLACEMENT_CHARACTER);
+                        replacements = replacements.saturating_add(1);
                     }
                 } else {
                     out.push(char::REPLACEMENT_CHARACTER);
+                    replacements = replacements.saturating_add(1);
                 }
             }
             // Low surrogate without preceding high
             0xDC00..=0xDFFF => {
                 out.push(char::REPLACEMENT_CHARACTER);
+                replacements = replacements.saturating_add(1);
             }
             _ => {
                 // All non-surrogate u16 values are valid Unicode scalar values.
@@ -73,6 +81,50 @@ fn decode_utf16le_into(bytes: &[u8], out: &mut String) {
             }
         }
     }
+    replacements
+}
+
+/// Decode a `&[u16]` UTF-16 name into a fresh `String`, returning
+/// `(String, replacement_count)`.  Use this instead of
+/// `String::from_utf16_lossy` at NTFS name boundaries so loss is counted,
+/// not silent (Category 4, WI-4.1).
+///
+/// Most NTFS-name call sites already hold a `Vec<u16>` / `SmallVec<[u16; N]>`
+/// (the attribute decoder collects code units before stringifying), so this
+/// `&[u16]` entry point avoids re-deriving a byte slice. There is exactly
+/// ONE surrogate-handling implementation: this re-encodes to LE bytes and
+/// routes through `decode_utf16le_into`.
+#[inline]
+pub(crate) fn decode_name_u16(units: &[u16]) -> (String, u32) {
+    let mut bytes = Vec::with_capacity(units.len().saturating_mul(2));
+    for unit in units {
+        bytes.extend_from_slice(&unit.to_le_bytes());
+    }
+    let mut out = String::new();
+    let count = decode_utf16le_into(&bytes, &mut out);
+    if count > 0 {
+        LOSSY_NAME_COUNT.fetch_add(u64::from(count), core::sync::atomic::Ordering::Relaxed);
+    }
+    (out, count)
+}
+
+/// Process-global tally of U+FFFD substitutions emitted by
+/// [`decode_name_u16`] across all NTFS-name decodes (Category 4, WI-4.1).
+///
+/// The parser call sites are spread across nine modules and do not thread a
+/// stats accumulator through their (hot-path) signatures, so the count is
+/// gathered here with a single relaxed atomic — cheap, lock-free, and read
+/// at index-build time into [`crate::index::stats::MftStats::lossy_name_count`]
+/// for the "N filenames were stored with U+FFFD" warning. `Relaxed` is
+/// sufficient: it is a monotonic diagnostic counter, not a synchronisation
+/// point.
+pub(crate) static LOSSY_NAME_COUNT: core::sync::atomic::AtomicU64 =
+    core::sync::atomic::AtomicU64::new(0);
+
+/// Snapshot the current global lossy-name tally.
+#[inline]
+pub(crate) fn lossy_name_count() -> u64 {
+    LOSSY_NAME_COUNT.load(core::sync::atomic::Ordering::Relaxed)
 }
 
 /// Process a single MFT record (base OR extension) in one pass.
@@ -473,3 +525,54 @@ fn rd_u64(buf: &[u8], off: usize) -> u64 {
         .and_then(|sl| <[u8; 8]>::try_from(sl).ok())
         .map_or(0, u64::from_le_bytes)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::{decode_name_u16, lossy_name_count};
+
+    #[test]
+    fn decode_name_u16_lossless_bmp_and_astral() {
+        // "Aé😀" — BMP + an astral char (valid surrogate pair). No loss.
+        // 'A'=0x0041, 'é'=0x00E9, '😀'=U+1F600 → D83D DE00.
+        let units = [0x0041_u16, 0x00E9, 0xD83D, 0xDE00];
+        let (name, count) = decode_name_u16(&units);
+        assert_eq!(count, 0, "well-formed UTF-16 must decode losslessly");
+        assert_eq!(name, "Aé😀");
+        assert!(!name.contains(char::REPLACEMENT_CHARACTER));
+    }
+
+    #[test]
+    fn decode_name_u16_unpaired_surrogate_is_counted_and_replaced() {
+        // A lone high surrogate (0xD800) with no following low surrogate —
+        // legal on NTFS, illegal in UTF-8. Must NOT panic; must substitute
+        // exactly one U+FFFD and report the count.
+        let units = [
+            0x0066_u16, // 'f'
+            0xD800,     // unpaired high
+            0x006F,     // 'o'
+        ];
+        let before = lossy_name_count();
+        let (name, count) = decode_name_u16(&units);
+        assert_eq!(count, 1, "one unpaired surrogate → one replacement");
+        assert!(
+            name.contains(char::REPLACEMENT_CHARACTER),
+            "decoded name must contain U+FFFD"
+        );
+        // The process-global tally increased by the replacement count, so the
+        // index-build warn/stat sees the loss (WI-4.1).
+        assert_eq!(
+            lossy_name_count(),
+            before + u64::from(count),
+            "global lossy tally must increase by the replacement count"
+        );
+    }
+
+    #[test]
+    fn decode_name_u16_lone_low_surrogate_is_counted() {
+        // A lone LOW surrogate (0xDC00) with no preceding high surrogate.
+        let units = [0xDC00_u16];
+        let (name, count) = decode_name_u16(&units);
+        assert_eq!(count, 1);
+        assert_eq!(name, "\u{FFFD}");
+    }
+}
@@ -196,7 +196,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
                                 .chunks_exact(2)
                                 .map(|c| u16::from_le_bytes([c[0], c[1]]))
                                 .collect();
-                            let name = String::from_utf16_lossy(&name_u16);
+                            let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                             let parent_frs = file_reference_to_frs(fn_attr.parent_directory);
                             let namespace = fn_attr.file_name_namespace;
 
@@ -304,7 +304,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
                             .chunks_exact(2)
                             .map(|c| u16::from_le_bytes([c[0], c[1]]))
                             .collect();
-                        let stream_name = String::from_utf16_lossy(&name_u16);
+                        let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                         // ALL named $DATA streams create regular stream entries.
                         // Internal ones are filtered from
                         // output by is_internal_windows_stream in the output layer.
@@ -376,7 +376,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
                                 .chunks_exact(2)
                                 .map(|c| u16::from_le_bytes([c[0], c[1]]))
                                 .collect();
-                            String::from_utf16_lossy(&name_u16)
+                            crate::io::parser::unified::decode_name_u16(&name_u16).0
                         };
                         (is_i30, name)
                     } else {
@@ -500,7 +500,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
                                 .chunks_exact(2)
                                 .map(|c| u16::from_le_bytes([c[0], c[1]]))
                                 .collect();
-                            String::from_utf16_lossy(&name_u16)
+                            crate::io::parser::unified::decode_name_u16(&name_u16).0
                         } else {
                             String::new()
                         }
@@ -585,7 +585,7 @@ pub fn parse_record_to_index(data: &[u8], frs: u64, index: &mut crate::index::Mf
                                 .chunks_exact(2)
                                 .map(|c| u16::from_le_bytes([c[0], c[1]]))
                                 .collect();
-                            String::from_utf16_lossy(&name_u16)
+                            crate::io::parser::unified::decode_name_u16(&name_u16).0
                         } else {
                             String::new()
                         }

@@ -154,7 +154,7 @@ pub(super) fn parse_extension_to_index(
                                     .chunks_exact(2)
                                     .map(|c| u16::from_le_bytes([c[0], c[1]]))
                                     .collect();
-                                let name = String::from_utf16_lossy(&name_u16);
+                                let name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                                 let parent_frs = fn_attr.parent_directory & 0x0000_FFFF_FFFF_FFFF;
                                 names.push((name, parent_frs));
                             }
@@ -233,7 +233,7 @@ pub(super) fn parse_extension_to_index(
                             .chunks_exact(2)
                             .map(|c| u16::from_le_bytes([c[0], c[1]]))
                             .collect();
-                        let stream_name = String::from_utf16_lossy(&name_u16);
+                        let stream_name = crate::io::parser::unified::decode_name_u16(&name_u16).0;
                         // ALL named $DATA streams create regular
                         // stream entries.  Internal ones are filtered from
                         // output by is_internal_windows_stream in the output layer.
@@ -282,7 +282,7 @@ pub(super) fn parse_extension_to_index(
                                 .chunks_exact(2)
                                 .map(|c| u16::from_le_bytes([c[0], c[1]]))
                                 .collect();
-                            String::from_utf16_lossy(&name_u16)
+                            crate::io::parser::unified::decode_name_u16(&name_u16).0
                         };
                         (is_i30, name)
                     } else {
@@ -403,7 +403,7 @@ pub(super) fn parse_extension_to_index(
                                 .chunks_exact(2)
                                 .map(|c| u16::from_le_bytes([c[0], c[1]]))
                                 .collect();
-                            String::from_utf16_lossy(&name_u16)
+                            crate::io::parser::unified::decode_name_u16(&name_u16).0
                         } else {
                             String::new()
                         }
@@ -488,7 +488,7 @@ pub(super) fn parse_extension_to_index(
                                 .chunks_exact(2)
                                 .map(|c| u16::from_le_bytes([c[0], c[1]]))
                                 .collect();
-                            String::from_utf16_lossy(&name_u16)
+                            crate::io::parser::unified::decode_name_u16(&name_u16).0
                         } else {
                             String::new()
                         }

@@ -251,6 +251,10 @@ fn is_ntfs_volume(drive_letter: DriveLetter) -> bool {
         return false;
     }
 
+    // AUDIT-OK(bytes): decodes the Windows filesystem TYPE label (e.g. "NTFS")
+    // for an `== "NTFS"` check — not an NTFS filename. A lossy decode could
+    // only fail the equality (fail-safe: treat as not-NTFS), never corrupt a
+    // stored name, so the instrumented name decoder does not apply here.
     let fs_name_raw = String::from_utf16_lossy(&fs_name_buffer);
     let fs_name = fs_name_raw.trim_end_matches('\0');