From 16bf48f69a1f68a69492873e9e54c450c7cc1fda Mon Sep 17 00:00:00 2001
From: Simon Cropp <simon.cropp@gmail.com>
Date: Wed, 1 Jul 2026 19:39:05 +1000
Subject: [PATCH 1/4] better scrubbing

---
 claude.md                                     |   5 +-
 src/Tests/PdfNormalizerTests.cs               |  69 ++++
 src/Tests/Samples.MultiPage.verified.pdf      | Bin 16897 -> 16897 bytes
 src/Tests/Samples.MultiPage.verified.txt      |   4 +-
 src/Tests/Samples.VerifyPdf.verified.pdf      | Bin 10946 -> 10946 bytes
 src/Tests/Samples.VerifyPdf.verified.txt      |   4 +-
 .../Samples.VerifyPdfStream.verified.pdf      | Bin 10946 -> 10946 bytes
 .../Samples.VerifyPdfStream.verified.txt      |   4 +-
 src/Verify.PDFium/PdfNormalizer.cs            | 298 ++++++++++++++++++
 src/Verify.PDFium/Verify.PDFium.csproj        |   3 +
 src/Verify.PDFium/VerifyPDFium.cs             |   7 +-
 11 files changed, 385 insertions(+), 9 deletions(-)
 create mode 100644 src/Tests/PdfNormalizerTests.cs
 create mode 100644 src/Verify.PDFium/PdfNormalizer.cs
diff --git a/claude.md b/claude.md
index fcb9cba..9f9ce4c 100644
--- a/claude.md
+++ b/claude.md
@@ -31,11 +31,14 @@ All source lives under `src/`. Solution file is `src/Verify.PDFium.slnx`.
 
 Entry point is `VerifyPDFium.Initialize(dpi = 96)` which registers a stream converter for the `pdf` extension. The converter loads the document with `Morph.PDFium.PdfiumDocument` and returns a `ConversionResult` containing:
 1. `PdfInfo` (page count, per-page size in points and extracted text, document information dictionary) serialized as the info file
-2. The original pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml)
+2. The pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml)
 3. One `png` target per page, named `page_0001` style
 
+To keep snapshots stable for PDFs freshly generated at test time, the non-deterministic fields are neutralized before snapshotting: the trailer `/ID`, the info dictionary `/CreationDate`/`/ModDate` (both in the `pdf` bytes and the info file), and the XMP metadata dates plus `xmpMM:DocumentID`/`InstanceID`. This is done by **`PdfNormalizer`** — a length-preserving, in-place byte scan (no string round-trip, no regex) that overwrites only the volatile characters, so cross-reference offsets stay valid. Values inside a compressed object/metadata stream (`/ObjStm`, flate-compressed XMP) are not reachable by the plaintext scan.
+
 Key files:
 - **VerifyPDFium.cs** — initialization and the converter
+- **PdfNormalizer.cs** — neutralizes non-deterministic `/ID`, dates and XMP identifiers directly on the bytes
 - **PdfInfo.cs** / **PageInfo.cs** — info shape for the snapshot (per-page width/height in points and text via `PdfPage.GetText()`)
 
 Style note: only public types get a namespace declaration (`VerifyTests`); internal types live in the global namespace.
diff --git a/src/Tests/PdfNormalizerTests.cs b/src/Tests/PdfNormalizerTests.cs
new file mode 100644
index 0000000..ab2e881
--- /dev/null
+++ b/src/Tests/PdfNormalizerTests.cs
@@ -0,0 +1,69 @@
+using System.Text;
+using Morph.PDFium;
+
+[TestFixture]
+public class PdfNormalizerTests
+{
+    [Test]
+    public void NeutralizesVolatileValues()
+    {
+        var input =
+            "/ID [<A1B2C3D4E5F60718> <1122334455667788>] " +
+            "/CreationDate(D:20240115093000+05'30') " +
+            "/ModDate(D:20240115093000Z) " +
+            "<xmp:CreateDate>2024-01-15T09:30:00+05:30</xmp:CreateDate>" +
+            "<xmp:ModifyDate>2024-01-15T09:30:00Z</xmp:ModifyDate>" +
+            "<xmp:MetadataDate>2024-01-15T09:30:00Z</xmp:MetadataDate>" +
+            "<xmpMM:DocumentID>uuid:0f7b2c9a-1234-5678-9abc-def012345678</xmpMM:DocumentID>" +
+            "<xmpMM:InstanceID>xmp.iid:1a2b3c4d</xmpMM:InstanceID>";
+        var expected =
+            "/ID [<0000000000000000> <0000000000000000>] " +
+            "/CreationDate(D:00000000000000+00'00') " +
+            "/ModDate(D:00000000000000Z) " +
+            "<xmp:CreateDate>0000-00-00T00:00:00+00:00</xmp:CreateDate>" +
+            "<xmp:ModifyDate>0000-00-00T00:00:00Z</xmp:ModifyDate>" +
+            "<xmp:MetadataDate>0000-00-00T00:00:00Z</xmp:MetadataDate>" +
+            $"<xmpMM:DocumentID>{new string('0', 41)}</xmpMM:DocumentID>" +
+            $"<xmpMM:InstanceID>{new string('0', 16)}</xmpMM:InstanceID>";
+        Assert.That(Normalize(input), Is.EqualTo(expected));
+    }
+
+    [Test]
+    public void CollapsesDifferingValuesToTheSameOutput()
+    {
+        // The same producer emits a stable structure across runs, so two documents differing only
+        // in the volatile digits/hex normalize to identical bytes.
+        var a = "/ID [<A1B2C3D4>] /CreationDate(D:20240115093000+05'30')";
+        var b = "/ID [<99887766>] /CreationDate(D:19991231235959+11'45')";
+        Assert.That(a, Is.Not.EqualTo(b));
+        Assert.That(Normalize(a), Is.EqualTo(Normalize(b)));
+    }
+
+    [Test]
+    public void LeavesLookalikeKeysUntouched()
+    {
+        // /IDTree is a name-tree key (not the file identifier), /ModDateStamp is a different name,
+        // and a self-closing date element has no content: none should be altered.
+        var input = "/IDTree [1 2] /ModDateStamp(20240101) <xmp:CreateDate/>2024";
+        Assert.That(Normalize(input), Is.EqualTo(input));
+    }
+
+    [Test]
+    public void NormalizedDocumentStillLoads()
+    {
+        var original = File.ReadAllBytes("sample.pdf");
+        var normalized = PdfNormalizer.Normalize(original);
+        Assert.That(normalized, Has.Length.EqualTo(original.Length));
+
+        using var document = PdfiumDocument.Load(normalized);
+        Assert.That(document.PageCount, Is.EqualTo(1));
+    }
+
+    static string Normalize(string value)
+    {
+        var bytes = Encoding.Latin1.GetBytes(value);
+        var normalized = PdfNormalizer.Normalize(bytes);
+        Assert.That(normalized, Has.Length.EqualTo(bytes.Length));
+        return Encoding.Latin1.GetString(normalized);
+    }
+}
diff --git a/src/Tests/Samples.MultiPage.verified.pdf b/src/Tests/Samples.MultiPage.verified.pdf
index 4b04c14133fea23e3b571e37b6953328d130cd78..af6c8f7a79c79b27e3ce78b2bc445b1ab74c1ede 100644
GIT binary patch
delta 203
zcmZo{VQg$+oS?~J00k2bHQ?-xiRHFjAaPwVn*7Mt6q&1Nw^SCU4J$qQv7I%36`QT?
W#hIM2YDN;Yvq9ouWlo;rm<s@Hzbvc(

delta 203
zcmZo{VQg$+oS?~JWB>$)28I(2HQ?-xiRHFjAPHRqLm-;`$kr5@t7x}WwzM=e#R{Yq
z1a#4e$&c-<@vGQuZ7<H`Wa47xW^QC|U}j-rVrFU%!bWB;W|n5IW|rp0<|cMFgj7tP
H;+P8nrm`@8

diff --git a/src/Tests/Samples.MultiPage.verified.txt b/src/Tests/Samples.MultiPage.verified.txt
index a475f40..bd5ca51 100644
--- a/src/Tests/Samples.MultiPage.verified.txt
+++ b/src/Tests/Samples.MultiPage.verified.txt
@@ -173,9 +173,9 @@ ullamco laboris.
     }
   ],
   Properties: {
-    CreationDate: D:20000101000000+00'00',
+    CreationDate: D:00000000000000+00'00',
     Creator: Morph,
-    ModDate: D:20000101000000+00'00',
+    ModDate: D:00000000000000+00'00',
     Producer: PDFsharp 6.2.4
   }
 }
\ No newline at end of file
diff --git a/src/Tests/Samples.VerifyPdf.verified.pdf b/src/Tests/Samples.VerifyPdf.verified.pdf
index acb06ceb3c38991f060cf84d78802f0d6131807c..1796d9879fe6dc19cf7ef0272bea5b008e520d69 100644
GIT binary patch
delta 201
zcmX>UdMI>)CWiqOOf=Mhvo|JgP~!rL>w?i_X?0U%Zj|~`S(rAg^kf+gYy2uUmuSpq
Ubi%3`Nzl#)iG!6nIaDhb0Qli6s{jB1

delta 201
zcmX>UdMI>)CWny$5EvR5PBhejvo|JgP~!qg=o%OT(PU|LQ)F(G`cm1_(##YqkX8`T
zMI$E5XjtP{vAINJKBJR~i<z6bk-34Hg^7uosW}K6nYox*nz@=;nj4#&*x3+LF*#H#
F7XUHiFMR+2

diff --git a/src/Tests/Samples.VerifyPdf.verified.txt b/src/Tests/Samples.VerifyPdf.verified.txt
index 6f5f8b2..5e9a61d 100644
--- a/src/Tests/Samples.VerifyPdf.verified.txt
+++ b/src/Tests/Samples.VerifyPdf.verified.txt
@@ -8,9 +8,9 @@
     }
   ],
   Properties: {
-    CreationDate: D:20000101000000+00'00',
+    CreationDate: D:00000000000000+00'00',
     Creator: Morph,
-    ModDate: D:20000101000000+00'00',
+    ModDate: D:00000000000000+00'00',
     Producer: PDFsharp 6.2.4
   }
 }
\ No newline at end of file
diff --git a/src/Tests/Samples.VerifyPdfStream.verified.pdf b/src/Tests/Samples.VerifyPdfStream.verified.pdf
index acb06ceb3c38991f060cf84d78802f0d6131807c..1796d9879fe6dc19cf7ef0272bea5b008e520d69 100644
GIT binary patch
delta 201
zcmX>UdMI>)CWiqOOf=Mhvo|JgP~!rL>w?i_X?0U%Zj|~`S(rAg^kf+gYy2uUmuSpq
Ubi%3`Nzl#)iG!6nIaDhb0Qli6s{jB1

delta 201
zcmX>UdMI>)CWny$5EvR5PBhejvo|JgP~!qg=o%OT(PU|LQ)F(G`cm1_(##YqkX8`T
zMI$E5XjtP{vAINJKBJR~i<z6bk-34Hg^7uosW}K6nYox*nz@=;nj4#&*x3+LF*#H#
F7XUHiFMR+2

diff --git a/src/Tests/Samples.VerifyPdfStream.verified.txt b/src/Tests/Samples.VerifyPdfStream.verified.txt
index 6f5f8b2..5e9a61d 100644
--- a/src/Tests/Samples.VerifyPdfStream.verified.txt
+++ b/src/Tests/Samples.VerifyPdfStream.verified.txt
@@ -8,9 +8,9 @@
     }
   ],
   Properties: {
-    CreationDate: D:20000101000000+00'00',
+    CreationDate: D:00000000000000+00'00',
     Creator: Morph,
-    ModDate: D:20000101000000+00'00',
+    ModDate: D:00000000000000+00'00',
     Producer: PDFsharp 6.2.4
   }
 }
\ No newline at end of file
diff --git a/src/Verify.PDFium/PdfNormalizer.cs b/src/Verify.PDFium/PdfNormalizer.cs
new file mode 100644
index 0000000..a438da8
--- /dev/null
+++ b/src/Verify.PDFium/PdfNormalizer.cs
@@ -0,0 +1,298 @@
+/// <summary>
+/// Neutralizes the non-deterministic fields of a PDF (the trailer <c>/ID</c>, the document
+/// information <c>/CreationDate</c> and <c>/ModDate</c>, and the equivalent XMP metadata dates and
+/// identifiers) so that the same source document always produces byte-identical snapshot output.
+/// </summary>
+/// <remarks>
+/// All edits are performed directly on the bytes and are length-preserving: only the mutable
+/// characters inside each value are overwritten, so every cross-reference offset stays valid and
+/// the file never has to be re-serialized. Values that live inside a compressed object or metadata
+/// stream (<c>/ObjStm</c>, flate-compressed XMP) are not reachable by this plaintext scan.
+/// </remarks>
+static class PdfNormalizer
+{
+    enum Fill
+    {
+        // Zero the ASCII digits only, keeping separators (leaves a readable date).
+        Digits,
+
+        // Zero the hexadecimal digits (for hex string <...> values).
+        Hex,
+
+        // Zero every non-whitespace byte (for opaque identifiers).
+        All
+    }
+
+    public static byte[] Normalize(byte[] source)
+    {
+        var data = (byte[]) source.Clone();
+
+        // Document information dictionary dates.
+        ZeroPdfString(data, "/CreationDate"u8, Fill.Digits);
+        ZeroPdfString(data, "/ModDate"u8, Fill.Digits);
+
+        // Trailer / cross-reference-stream file identifier: /ID [<...> <...>].
+        ZeroFileId(data);
+
+        // XMP metadata dates (uncompressed metadata streams only).
+        ZeroXmpElement(data, "<xmp:CreateDate"u8, Fill.Digits);
+        ZeroXmpElement(data, "<xmp:ModifyDate"u8, Fill.Digits);
+        ZeroXmpElement(data, "<xmp:MetadataDate"u8, Fill.Digits);
+
+        // XMP per-generation identifiers.
+        ZeroXmpElement(data, "<xmpMM:DocumentID"u8, Fill.All);
+        ZeroXmpElement(data, "<xmpMM:InstanceID"u8, Fill.All);
+        ZeroXmpElement(data, "<xmpMM:OriginalDocumentID"u8, Fill.All);
+
+        return data;
+    }
+
+    public static void NormalizeProperties(Dictionary<string, string>? properties)
+    {
+        if (properties is null)
+        {
+            return;
+        }
+
+        foreach (var key in properties.Keys.ToList())
+        {
+            if (key is "CreationDate" or "ModDate")
+            {
+                properties[key] = ZeroDigits(properties[key]);
+            }
+        }
+    }
+
+    static string ZeroDigits(string value) =>
+        string.Create(value.Length, value, static (span, state) =>
+        {
+            for (var i = 0; i < state.Length; i++)
+            {
+                var c = state[i];
+                span[i] = char.IsAsciiDigit(c) ? '0' : c;
+            }
+        });
+
+    // Finds each occurrence of a name key and overwrites the string value that follows it. A literal
+    // "(...)" is filled per 'fill'; a hex "<...>" always uses Fill.Hex; a "<<" dictionary is skipped.
+    static void ZeroPdfString(byte[] data, ReadOnlySpan<byte> key, Fill fill)
+    {
+        var pos = 0;
+        while (true)
+        {
+            var hit = data.AsSpan(pos).IndexOf(key);
+            if (hit < 0)
+            {
+                return;
+            }
+
+            var i = pos + hit + key.Length;
+            pos = i;
+
+            i = SkipWhitespace(data, i);
+            if (i >= data.Length)
+            {
+                return;
+            }
+
+            if (data[i] == (byte) '(')
+            {
+                var start = i + 1;
+                var end = FindLiteralEnd(data, start);
+                Overwrite(data, start, end, fill);
+                pos = end;
+            }
+            else if (data[i] == (byte) '<' && (i + 1 >= data.Length || data[i + 1] != (byte) '<'))
+            {
+                var start = i + 1;
+                var end = FindByte(data, start, (byte) '>');
+                Overwrite(data, start, end, Fill.Hex);
+                pos = end;
+            }
+        }
+    }
+
+    // Finds "/ID" followed by an array and zeroes each element: hex strings via Fill.Hex, literal
+    // strings via Fill.All. Anything else (for example the "/IDTree" name-tree key) is skipped.
+    static void ZeroFileId(byte[] data)
+    {
+        var key = "/ID"u8;
+        var pos = 0;
+        while (true)
+        {
+            var hit = data.AsSpan(pos).IndexOf(key);
+            if (hit < 0)
+            {
+                return;
+            }
+
+            var i = pos + hit + key.Length;
+            pos = i;
+
+            i = SkipWhitespace(data, i);
+            if (i >= data.Length || data[i] != (byte) '[')
+            {
+                continue;
+            }
+
+            i++;
+            while (i < data.Length && data[i] != (byte) ']')
+            {
+                if (data[i] == (byte) '<')
+                {
+                    var start = i + 1;
+                    i = FindByte(data, start, (byte) '>');
+                    Overwrite(data, start, i, Fill.Hex);
+                    i++;
+                }
+                else if (data[i] == (byte) '(')
+                {
+                    var start = i + 1;
+                    i = FindLiteralEnd(data, start);
+                    Overwrite(data, start, i, Fill.All);
+                    i++;
+                }
+                else
+                {
+                    i++;
+                }
+            }
+            // Unterminated element: i is data.Length + 1 here, so clamp before the next AsSpan(pos).
+            pos = Math.Min(i, data.Length);
+        }
+    }
+
+    // Finds an XMP element by its opening tag and zeroes the text content up to the next '<'.
+    static void ZeroXmpElement(byte[] data, ReadOnlySpan<byte> openTag, Fill fill)
+    {
+        var pos = 0;
+        while (true)
+        {
+            var hit = data.AsSpan(pos).IndexOf(openTag);
+            if (hit < 0)
+            {
+                return;
+            }
+
+            var i = pos + hit + openTag.Length;
+            pos = i;
+
+            // Reject a longer element name that merely shares this prefix.
+            if (i < data.Length && data[i] != (byte) '>' && data[i] != (byte) '/' && !IsWhitespace(data[i]))
+            {
+                continue;
+            }
+
+            // Skip the remainder of the opening tag, remembering the last significant byte so a
+            // self-closing "<tag/>" can be detected.
+            var lastSignificant = (byte) 0;
+            while (i < data.Length && data[i] != (byte) '>')
+            {
+                if (!IsWhitespace(data[i]))
+                {
+                    lastSignificant = data[i];
+                }
+
+                i++;
+            }
+
+            if (i >= data.Length)
+            {
+                return;
+            }
+
+            i++;
+            if (lastSignificant == (byte) '/')
+            {
+                continue;
+            }
+
+            var start = i;
+            var end = FindByte(data, start, (byte) '<');
+            Overwrite(data, start, end, fill);
+            pos = end;
+        }
+    }
+
+    static void Overwrite(byte[] data, int start, int end, Fill fill)
+    {
+        for (var i = start; i < end; i++)
+        {
+            var c = data[i];
+            var replace = fill switch
+            {
+                Fill.Digits => IsDigit(c),
+                Fill.Hex => IsHexDigit(c),
+                _ => !IsWhitespace(c)
+            };
+            if (replace)
+            {
+                data[i] = (byte) '0';
+            }
+        }
+    }
+
+    // Returns the index of the ')' that closes the literal string starting at 'start', honoring
+    // backslash escapes and balanced parentheses, or the end of the buffer if unterminated.
+    static int FindLiteralEnd(byte[] data, int start)
+    {
+        var depth = 1;
+        var i = start;
+        while (i < data.Length)
+        {
+            var c = data[i];
+            if (c == (byte) '\\')
+            {
+                i += 2;
+                continue;
+            }
+
+            if (c == (byte) '(')
+            {
+                depth++;
+            }
+            else if (c == (byte) ')')
+            {
+                depth--;
+                if (depth == 0)
+                {
+                    return i;
+                }
+            }
+
+            i++;
+        }
+
+        return data.Length;
+    }
+
+    static int FindByte(byte[] data, int start, byte target)
+    {
+        var i = start;
+        while (i < data.Length && data[i] != target)
+        {
+            i++;
+        }
+
+        return i;
+    }
+
+    static int SkipWhitespace(byte[] data, int i)
+    {
+        while (i < data.Length && IsWhitespace(data[i]))
+        {
+            i++;
+        }
+
+        return i;
+    }
+
+    static bool IsDigit(byte b) =>
+        b is >= (byte) '0' and <= (byte) '9';
+
+    static bool IsHexDigit(byte b) =>
+        b is >= (byte) '0' and <= (byte) '9' or >= (byte) 'a' and <= (byte) 'f' or >= (byte) 'A' and <= (byte) 'F';
+
+    static bool IsWhitespace(byte b) =>
+        b is (byte) ' ' or (byte) '\t' or (byte) '\r' or (byte) '\n' or (byte) '\f' or 0;
+}
diff --git a/src/Verify.PDFium/Verify.PDFium.csproj b/src/Verify.PDFium/Verify.PDFium.csproj
index 83a354c..de97390 100644
--- a/src/Verify.PDFium/Verify.PDFium.csproj
+++ b/src/Verify.PDFium/Verify.PDFium.csproj
@@ -8,4 +8,7 @@
     <PackageReference Include="ProjectDefaults" PrivateAssets="all" />
     <PackageReference Include="Microsoft.Sbom.Targets" PrivateAssets="all" Condition="'$(CI)' == 'true'" />
   </ItemGroup>
+  <ItemGroup>
+    <InternalsVisibleTo Include="Tests" Key="00240000048000009400000006020000002400005253413100040000010001000f0a8e4bf1639dce01be6592384e7dfc621915b7759fb5cee42ec5d351bcc43460432da1659ee618ca6cab6b8b8e56a5deb5d4ee1a49783d5c2690752502d31ccbfee9b2c697e20359b55ad100cc9370c8e983fd9496f01d761a060d0435bac7243b1832ba95757aa5adbb67df38c213d717b6751e1217cea9fa5c61e9b799dd" />
+  </ItemGroup>
 </Project>
diff --git a/src/Verify.PDFium/VerifyPDFium.cs b/src/Verify.PDFium/VerifyPDFium.cs
index 67c6600..b949c6d 100644
--- a/src/Verify.PDFium/VerifyPDFium.cs
+++ b/src/Verify.PDFium/VerifyPDFium.cs
@@ -37,7 +37,7 @@ static ConversionResult Convert(Stream stream)
 
         List<Target> targets =
         [
-            new("pdf", new MemoryStream(bytes))
+            new("pdf", new MemoryStream(PdfNormalizer.Normalize(bytes)))
             {
                 BypassComparersForSubsequentOnDifference = true
             }
@@ -60,11 +60,14 @@ static ConversionResult Convert(Stream stream)
             targets.Add(new("png", new MemoryStream(png), $"page_{index + 1:0000}"));
         }
 
+        var properties = document.GetProperties();
+        PdfNormalizer.NormalizeProperties(properties);
+
         var info = new PdfInfo
         {
             PageCount = document.PageCount,
             Pages = pages,
-            Properties = document.GetProperties()
+            Properties = properties
         };
 
         return new(info, targets);

From 255e2afa57cf082fab0674c935b3beed0ab72f16 Mon Sep 17 00:00:00 2001
From: Simon Cropp <simon.cropp@gmail.com>
Date: Wed, 1 Jul 2026 19:44:46 +1000
Subject: [PATCH 2/4] replace per-file usings in Tests with GlobalUsings.cs

---
 src/Tests/GlobalUsings.cs       | 3 +++
 src/Tests/PdfNormalizerTests.cs | 3 ---
 2 files changed, 3 insertions(+), 3 deletions(-)
 create mode 100644 src/Tests/GlobalUsings.cs

diff --git a/src/Tests/GlobalUsings.cs b/src/Tests/GlobalUsings.cs
new file mode 100644
index 0000000..c814e0c
--- /dev/null
+++ b/src/Tests/GlobalUsings.cs
@@ -0,0 +1,3 @@
+﻿// Global using directives
+
+global using Morph.PDFium;
\ No newline at end of file
diff --git a/src/Tests/PdfNormalizerTests.cs b/src/Tests/PdfNormalizerTests.cs
index ab2e881..ed97aef 100644
--- a/src/Tests/PdfNormalizerTests.cs
+++ b/src/Tests/PdfNormalizerTests.cs
@@ -1,6 +1,3 @@
-using System.Text;
-using Morph.PDFium;
-
 [TestFixture]
 public class PdfNormalizerTests
 {

From ede9e8586c17bef93a44877974e6fb8c0bb622e9 Mon Sep 17 00:00:00 2001
From: Simon Cropp <simon.cropp@gmail.com>
Date: Wed, 1 Jul 2026 19:53:59 +1000
Subject: [PATCH 3/4] normalize the pdf bytes in place instead of cloning

---
 src/Tests/PdfNormalizerTests.cs    | 13 +++++--------
 src/Verify.PDFium/PdfNormalizer.cs |  6 +-----
 src/Verify.PDFium/VerifyPDFium.cs  |  6 +++---
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/src/Tests/PdfNormalizerTests.cs b/src/Tests/PdfNormalizerTests.cs
index ed97aef..3d92b98 100644
--- a/src/Tests/PdfNormalizerTests.cs
+++ b/src/Tests/PdfNormalizerTests.cs
@@ -48,19 +48,16 @@ public void LeavesLookalikeKeysUntouched()
     [Test]
     public void NormalizedDocumentStillLoads()
     {
-        var original = File.ReadAllBytes("sample.pdf");
-        var normalized = PdfNormalizer.Normalize(original);
-        Assert.That(normalized, Has.Length.EqualTo(original.Length));
+        var data = File.ReadAllBytes("sample.pdf");
 
-        using var document = PdfiumDocument.Load(normalized);
+        using var document = PdfiumDocument.Load(data);
         Assert.That(document.PageCount, Is.EqualTo(1));
     }
 
     static string Normalize(string value)
     {
-        var bytes = Encoding.Latin1.GetBytes(value);
-        var normalized = PdfNormalizer.Normalize(bytes);
-        Assert.That(normalized, Has.Length.EqualTo(bytes.Length));
-        return Encoding.Latin1.GetString(normalized);
+        var data = Encoding.Latin1.GetBytes(value);
+        PdfNormalizer.Normalize(data);
+        return Encoding.Latin1.GetString(data);
     }
 }
diff --git a/src/Verify.PDFium/PdfNormalizer.cs b/src/Verify.PDFium/PdfNormalizer.cs
index a438da8..6ba2b70 100644
--- a/src/Verify.PDFium/PdfNormalizer.cs
+++ b/src/Verify.PDFium/PdfNormalizer.cs
@@ -23,10 +23,8 @@ enum Fill
         All
     }
 
-    public static byte[] Normalize(byte[] source)
+    public static void Normalize(byte[] data)
     {
-        var data = (byte[]) source.Clone();
-
         // Document information dictionary dates.
         ZeroPdfString(data, "/CreationDate"u8, Fill.Digits);
         ZeroPdfString(data, "/ModDate"u8, Fill.Digits);
@@ -43,8 +41,6 @@ public static byte[] Normalize(byte[] source)
         ZeroXmpElement(data, "<xmpMM:DocumentID"u8, Fill.All);
         ZeroXmpElement(data, "<xmpMM:InstanceID"u8, Fill.All);
         ZeroXmpElement(data, "<xmpMM:OriginalDocumentID"u8, Fill.All);
-
-        return data;
     }
 
     public static void NormalizeProperties(Dictionary<string, string>? properties)
diff --git a/src/Verify.PDFium/VerifyPDFium.cs b/src/Verify.PDFium/VerifyPDFium.cs
index b949c6d..c9cc55d 100644
--- a/src/Verify.PDFium/VerifyPDFium.cs
+++ b/src/Verify.PDFium/VerifyPDFium.cs
@@ -33,16 +33,16 @@ static ConversionResult Convert(Stream stream)
         stream.CopyTo(buffer);
         var bytes = buffer.ToArray();
 
-        using var document = PdfiumDocument.Load(bytes);
-
+        PdfNormalizer.Normalize(bytes);
         List<Target> targets =
         [
-            new("pdf", new MemoryStream(PdfNormalizer.Normalize(bytes)))
+            new("pdf", new MemoryStream(bytes))
             {
                 BypassComparersForSubsequentOnDifference = true
             }
         ];
 
+        using var document = PdfiumDocument.Load(bytes);
         var pages = new List<PageInfo>(document.PageCount);
         for (var index = 0; index < document.PageCount; index++)
         {

From 09eae4bffadb2e2f72eae1fb44bb8b7ea0395091 Mon Sep 17 00:00:00 2001
From: Simon Cropp <simon.cropp@gmail.com>
Date: Wed, 1 Jul 2026 20:25:28 +1000
Subject: [PATCH 4/4] parse info dates into DateTimeOffset via PdfDate

---
 claude.md                                     |   7 +-
 src/Tests/PdfDateTests.cs                     |  39 +++++++
 src/Tests/PdfNormalizerTests.cs               |   1 +
 src/Tests/Samples.MultiPage.verified.txt      |  14 +--
 src/Tests/Samples.VerifyPdf.verified.txt      |  14 +--
 .../Samples.VerifyPdfStream.verified.txt      |  14 +--
 src/Verify.PDFium/PdfDate.cs                  | 102 ++++++++++++++++++
 src/Verify.PDFium/PdfInfo.cs                  |   2 +-
 src/Verify.PDFium/PdfNormalizer.cs            |  30 +++---
 src/Verify.PDFium/VerifyPDFium.cs             |  65 +++++------
 10 files changed, 218 insertions(+), 70 deletions(-)
 create mode 100644 src/Tests/PdfDateTests.cs
 create mode 100644 src/Verify.PDFium/PdfDate.cs

diff --git a/claude.md b/claude.md
index 9f9ce4c..788e2df 100644
--- a/claude.md
+++ b/claude.md
@@ -34,11 +34,14 @@ Entry point is `VerifyPDFium.Initialize(dpi = 96)` which registers a stream conv
 2. The pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml)
 3. One `png` target per page, named `page_0001` style
 
-To keep snapshots stable for PDFs freshly generated at test time, the non-deterministic fields are neutralized before snapshotting: the trailer `/ID`, the info dictionary `/CreationDate`/`/ModDate` (both in the `pdf` bytes and the info file), and the XMP metadata dates plus `xmpMM:DocumentID`/`InstanceID`. This is done by **`PdfNormalizer`** — a length-preserving, in-place byte scan (no string round-trip, no regex) that overwrites only the volatile characters, so cross-reference offsets stay valid. Values inside a compressed object/metadata stream (`/ObjStm`, flate-compressed XMP) are not reachable by the plaintext scan.
+To keep snapshots stable for PDFs freshly generated at test time, the non-deterministic fields are neutralized two ways:
+- **In the `pdf` bytes** (`PdfNormalizer.Normalize`): the trailer `/ID`, the info-dictionary `/CreationDate`/`/ModDate`, and the XMP metadata dates plus `xmpMM:DocumentID`/`InstanceID` are overwritten by a length-preserving, in-place byte scan (no string round-trip, no regex) — only the volatile characters change, so cross-reference offsets stay valid. `Normalize` mutates in place and runs *after* the `PdfiumDocument` (which reads lazily from the same buffer) is disposed, so no defensive copy is needed. Values inside a compressed object/metadata stream (`/ObjStm`, flate-compressed XMP) are not reachable by the plaintext scan.
+- **In the info file** (`PdfNormalizer.NormalizeProperties`): `Properties` is a `Dictionary<string, object>` whose `CreationDate`/`ModDate` values are parsed (`PdfDate`) into `DateTimeOffset`, so Verify's built-in date scrubbing renders them deterministically (`DateTimeOffset_1` etc.). Properties are read from the original bytes, before `Normalize` zeroes them.
 
 Key files:
 - **VerifyPDFium.cs** — initialization and the converter
-- **PdfNormalizer.cs** — neutralizes non-deterministic `/ID`, dates and XMP identifiers directly on the bytes
+- **PdfNormalizer.cs** — neutralizes the `pdf` bytes and projects the info-file properties to the scrubbable object map
+- **PdfDate.cs** — parses PDF date strings (`D:YYYYMMDD…`) to `DateTimeOffset`
 - **PdfInfo.cs** / **PageInfo.cs** — info shape for the snapshot (per-page width/height in points and text via `PdfPage.GetText()`)
 
 Style note: only public types get a namespace declaration (`VerifyTests`); internal types live in the global namespace.
diff --git a/src/Tests/PdfDateTests.cs b/src/Tests/PdfDateTests.cs
new file mode 100644
index 0000000..cb0e0cf
--- /dev/null
+++ b/src/Tests/PdfDateTests.cs
@@ -0,0 +1,39 @@
+[TestFixture]
+public class PdfDateTests
+{
+    [Test]
+    public void ParsesFullDateWithPositiveOffset()
+    {
+        Assert.That(PdfDate.TryParse("D:20240115093000+05'30'", out var date), Is.True);
+        Assert.That(date, Is.EqualTo(new DateTimeOffset(2024, 1, 15, 9, 30, 0, new(5, 30, 0))));
+    }
+
+    [Test]
+    public void ParsesUtcOffset()
+    {
+        Assert.That(PdfDate.TryParse("D:20211105091500Z", out var date), Is.True);
+        Assert.That(date, Is.EqualTo(new DateTimeOffset(2021, 11, 5, 9, 15, 0, TimeSpan.Zero)));
+    }
+
+    [Test]
+    public void ParsesNegativeOffset()
+    {
+        Assert.That(PdfDate.TryParse("D:19991231235959-08'00'", out var date), Is.True);
+        Assert.That(date, Is.EqualTo(new DateTimeOffset(1999, 12, 31, 23, 59, 59, new(-8, 0, 0))));
+    }
+
+    [Test]
+    public void DefaultsOmittedComponents()
+    {
+        Assert.That(PdfDate.TryParse("D:2024", out var date), Is.True);
+        Assert.That(date, Is.EqualTo(new DateTimeOffset(2024, 1, 1, 0, 0, 0, TimeSpan.Zero)));
+    }
+
+    [Test]
+    public void RejectsNonDate() =>
+        Assert.That(PdfDate.TryParse("not a date", out _), Is.False);
+
+    [Test]
+    public void RejectsOutOfRangeComponents() =>
+        Assert.That(PdfDate.TryParse("D:20241350000000Z", out _), Is.False);
+}
diff --git a/src/Tests/PdfNormalizerTests.cs b/src/Tests/PdfNormalizerTests.cs
index 3d92b98..bed189f 100644
--- a/src/Tests/PdfNormalizerTests.cs
+++ b/src/Tests/PdfNormalizerTests.cs
@@ -49,6 +49,7 @@ public void LeavesLookalikeKeysUntouched()
     public void NormalizedDocumentStillLoads()
     {
         var data = File.ReadAllBytes("sample.pdf");
+        PdfNormalizer.Normalize(data);
 
         using var document = PdfiumDocument.Load(data);
         Assert.That(document.PageCount, Is.EqualTo(1));
diff --git a/src/Tests/Samples.MultiPage.verified.txt b/src/Tests/Samples.MultiPage.verified.txt
index bd5ca51..91a1f00 100644
--- a/src/Tests/Samples.MultiPage.verified.txt
+++ b/src/Tests/Samples.MultiPage.verified.txt
@@ -1,5 +1,11 @@
 ﻿{
   PageCount: 4,
+  Properties: {
+    CreationDate: DateTimeOffset_1,
+    Creator: Morph,
+    ModDate: DateTimeOffset_1,
+    Producer: PDFsharp 6.2.4
+  },
   Pages: [
     {
       Width: 612.0,
@@ -171,11 +177,5 @@ Paragraph 50: Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do ei
 incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
 ullamco laboris.
     }
-  ],
-  Properties: {
-    CreationDate: D:00000000000000+00'00',
-    Creator: Morph,
-    ModDate: D:00000000000000+00'00',
-    Producer: PDFsharp 6.2.4
-  }
+  ]
 }
\ No newline at end of file
diff --git a/src/Tests/Samples.VerifyPdf.verified.txt b/src/Tests/Samples.VerifyPdf.verified.txt
index 5e9a61d..9540e7d 100644
--- a/src/Tests/Samples.VerifyPdf.verified.txt
+++ b/src/Tests/Samples.VerifyPdf.verified.txt
@@ -1,16 +1,16 @@
 ﻿{
   PageCount: 1,
+  Properties: {
+    CreationDate: DateTimeOffset_1,
+    Creator: Morph,
+    ModDate: DateTimeOffset_1,
+    Producer: PDFsharp 6.2.4
+  },
   Pages: [
     {
       Width: 612.0,
       Height: 792.0,
       Text: Hello, World! This is a simple paragraph.
     }
-  ],
-  Properties: {
-    CreationDate: D:00000000000000+00'00',
-    Creator: Morph,
-    ModDate: D:00000000000000+00'00',
-    Producer: PDFsharp 6.2.4
-  }
+  ]
 }
\ No newline at end of file
diff --git a/src/Tests/Samples.VerifyPdfStream.verified.txt b/src/Tests/Samples.VerifyPdfStream.verified.txt
index 5e9a61d..9540e7d 100644
--- a/src/Tests/Samples.VerifyPdfStream.verified.txt
+++ b/src/Tests/Samples.VerifyPdfStream.verified.txt
@@ -1,16 +1,16 @@
 ﻿{
   PageCount: 1,
+  Properties: {
+    CreationDate: DateTimeOffset_1,
+    Creator: Morph,
+    ModDate: DateTimeOffset_1,
+    Producer: PDFsharp 6.2.4
+  },
   Pages: [
     {
       Width: 612.0,
       Height: 792.0,
       Text: Hello, World! This is a simple paragraph.
     }
-  ],
-  Properties: {
-    CreationDate: D:00000000000000+00'00',
-    Creator: Morph,
-    ModDate: D:00000000000000+00'00',
-    Producer: PDFsharp 6.2.4
-  }
+  ]
 }
\ No newline at end of file
diff --git a/src/Verify.PDFium/PdfDate.cs b/src/Verify.PDFium/PdfDate.cs
new file mode 100644
index 0000000..17fb29a
--- /dev/null
+++ b/src/Verify.PDFium/PdfDate.cs
@@ -0,0 +1,102 @@
+/// <summary>
+/// Parses a PDF date string (PDF 32000-1:2008 §7.9.4), for example <c>D:20240115093000+05'30'</c>,
+/// into a <see cref="DateTimeOffset"/>. Every component after the year is optional.
+/// </summary>
+static class PdfDate
+{
+    // Returns false, with date left as default, when value is malformed or out of range.
+    public static bool TryParse(string value, out DateTimeOffset date)
+    {
+        date = default;
+
+        var span = value.AsSpan();
+        if (span.StartsWith("D:"))
+        {
+            span = span[2..];
+        }
+
+        if (!TryFixed(span, 0, 4, out var year) ||
+            !TryOptional(span, 4, 1, out var month) ||
+            !TryOptional(span, 6, 1, out var day) ||
+            !TryOptional(span, 8, 0, out var hour) ||
+            !TryOptional(span, 10, 0, out var minute) ||
+            !TryOptional(span, 12, 0, out var second) ||
+            !TryOffset(span, 14, out var offset))
+        {
+            return false;
+        }
+
+        try
+        {
+            date = new(year, month, day, hour, minute, second, offset);
+            return true;
+        }
+        catch (ArgumentException)
+        {
+            // Out-of-range component (e.g. month 13) or offset.
+            return false;
+        }
+    }
+
+    // Parses a mandatory run of exactly length digits at start. NumberStyles.None rejects the sign
+    // and surrounding whitespace that the default integer style would otherwise accept.
+    static bool TryFixed(ReadOnlySpan<char> span, int start, int length, out int value)
+    {
+        value = 0;
+        return span.Length >= start + length &&
+               int.TryParse(
+                   span.Slice(start, length),
+                   System.Globalization.NumberStyles.None,
+                   System.Globalization.CultureInfo.InvariantCulture,
+                   out value);
+    }
+
+    // Parses an optional two-digit component; when the string ends before it, yields fallback.
+    static bool TryOptional(ReadOnlySpan<char> span, int start, int fallback, out int value)
+    {
+        value = fallback;
+        return span.Length <= start ||
+               TryFixed(span, start, 2, out value);
+    }
+
+    // Parses the optional UTC offset at start: absent or Z means zero, otherwise a sign followed
+    // by two-digit hours and optional two-digit minutes.
+    static bool TryOffset(ReadOnlySpan<char> span, int start, out TimeSpan offset)
+    {
+        offset = TimeSpan.Zero;
+        if (span.Length <= start ||
+            span[start] is 'Z' or 'z')
+        {
+            return true;
+        }
+
+        var indicator = span[start];
+        if (indicator is not ('+' or '-') ||
+            !TryFixed(span, start + 1, 2, out var hours))
+        {
+            return false;
+        }
+
+        // Minutes follow the hours, separated by an apostrophe: HH'mm'.
+        var minuteStart = start + 3;
+        if (minuteStart < span.Length && span[minuteStart] == '\'')
+        {
+            minuteStart++;
+        }
+
+        var minutes = 0;
+        if (minuteStart < span.Length &&
+            !TryFixed(span, minuteStart, 2, out minutes))
+        {
+            return false;
+        }
+
+        offset = new(hours, minutes, 0);
+        if (indicator == '-')
+        {
+            offset = -offset;
+        }
+
+        return true;
+    }
+}
diff --git a/src/Verify.PDFium/PdfInfo.cs b/src/Verify.PDFium/PdfInfo.cs
index 06e1171..3c2a9b3 100644
--- a/src/Verify.PDFium/PdfInfo.cs
+++ b/src/Verify.PDFium/PdfInfo.cs
@@ -1,6 +1,6 @@
 class PdfInfo
 {
     public required int PageCount { get; init; }
+    public Dictionary<string, object>? Properties { get; init; }
     public required IReadOnlyList<PageInfo> Pages { get; init; }
-    public Dictionary<string, string>? Properties { get; init; }
 }
diff --git a/src/Verify.PDFium/PdfNormalizer.cs b/src/Verify.PDFium/PdfNormalizer.cs
index 6ba2b70..59ca3b8 100644
--- a/src/Verify.PDFium/PdfNormalizer.cs
+++ b/src/Verify.PDFium/PdfNormalizer.cs
@@ -43,31 +43,31 @@ public static void Normalize(byte[] data)
         ZeroXmpElement(data, "<xmpMM:OriginalDocumentID"u8, Fill.All);
     }
 
-    public static void NormalizeProperties(Dictionary<string, string>? properties)
+    // Maps the raw string properties to objects: CreationDate/ModDate become DateTimeOffset so
+    // Verify's date scrubbing makes them deterministic; other or unparseable values stay strings.
+    public static Dictionary<string, object>? NormalizeProperties(Dictionary<string, string>? properties)
     {
         if (properties is null)
         {
-            return;
+            return null;
         }
 
-        foreach (var key in properties.Keys.ToList())
+        var result = new Dictionary<string, object>(properties.Count);
+        foreach (var (key, value) in properties)
         {
-            if (key is "CreationDate" or "ModDate")
+            if (key is "CreationDate" or "ModDate" &&
+                PdfDate.TryParse(value, out var date))
             {
-                properties[key] = ZeroDigits(properties[key]);
+                result[key] = date;
             }
-        }
-    }
-
-    static string ZeroDigits(string value) =>
-        string.Create(value.Length, value, static (span, state) =>
-        {
-            for (var i = 0; i < state.Length; i++)
+            else
             {
-                var c = state[i];
-                span[i] = char.IsAsciiDigit(c) ? '0' : c;
+                result[key] = value;
             }
-        });
+        }
+
+        return result;
+    }
 
     // Finds a name key, then overwrites the string value that follows it. The value may be a
     // literal string "(...)" or a hex string "<...>".
diff --git a/src/Verify.PDFium/VerifyPDFium.cs b/src/Verify.PDFium/VerifyPDFium.cs
index c9cc55d..480e3b0 100644
--- a/src/Verify.PDFium/VerifyPDFium.cs
+++ b/src/Verify.PDFium/VerifyPDFium.cs
@@ -33,42 +33,45 @@ static ConversionResult Convert(Stream stream)
         stream.CopyTo(buffer);
         var bytes = buffer.ToArray();
 
-        PdfNormalizer.Normalize(bytes);
-        List<Target> targets =
-        [
-            new("pdf", new MemoryStream(bytes))
+        List<Target> targets = [];
+        PdfInfo info;
+        using (var document = PdfiumDocument.Load(bytes))
+        {
+            var pageCount = document.PageCount;
+            var pages = new List<PageInfo>(pageCount);
+            for (var index = 0; index < pageCount; index++)
             {
-                BypassComparersForSubsequentOnDifference = true
-            }
-        ];
+                using var page = document.LoadPage(index);
+                var size = page.Size;
+                pages.Add(
+                    new()
+                    {
+                        Width = size.Width,
+                        Height = size.Height,
+                        Text = page.GetText()
+                    });
 
-        using var document = PdfiumDocument.Load(bytes);
-        var pages = new List<PageInfo>(document.PageCount);
-        for (var index = 0; index < document.PageCount; index++)
-        {
-            using var page = document.LoadPage(index);
-            var size = page.Size;
-            pages.Add(
-                new()
-                {
-                    Width = size.Width,
-                    Height = size.Height,
-                    Text = page.GetText()
-                });
+                var png = document.RenderPage(index, dpi);
+                targets.Add(new("png", new MemoryStream(png), $"page_{index + 1:0000}"));
+            }
 
-            var png = document.RenderPage(index, dpi);
-            targets.Add(new("png", new MemoryStream(png), $"page_{index + 1:0000}"));
+            info = new()
+            {
+                PageCount = pageCount,
+                Pages = pages,
+                Properties = PdfNormalizer.NormalizeProperties(document.GetProperties())
+            };
         }
 
-        var properties = document.GetProperties();
-        PdfNormalizer.NormalizeProperties(properties);
-
-        var info = new PdfInfo
-        {
-            PageCount = document.PageCount,
-            Pages = pages,
-            Properties = properties
-        };
+        // Neutralize the volatile fields for the pdf snapshot only once the document, which reads
+        // lazily from the same buffer, has been released.
+        PdfNormalizer.Normalize(bytes);
+        targets.Insert(
+            0,
+            new("pdf", new MemoryStream(bytes))
+            {
+                BypassComparersForSubsequentOnDifference = true
+            });
 
         return new(info, targets);
     }