From 16bf48f69a1f68a69492873e9e54c450c7cc1fda Mon Sep 17 00:00:00 2001 From: Simon Cropp Date: Wed, 1 Jul 2026 19:39:05 +1000 Subject: [PATCH 1/4] better scrubbing --- claude.md | 5 +- src/Tests/PdfNormalizerTests.cs | 69 ++++ src/Tests/Samples.MultiPage.verified.pdf | Bin 16897 -> 16897 bytes src/Tests/Samples.MultiPage.verified.txt | 4 +- src/Tests/Samples.VerifyPdf.verified.pdf | Bin 10946 -> 10946 bytes src/Tests/Samples.VerifyPdf.verified.txt | 4 +- .../Samples.VerifyPdfStream.verified.pdf | Bin 10946 -> 10946 bytes .../Samples.VerifyPdfStream.verified.txt | 4 +- src/Verify.PDFium/PdfNormalizer.cs | 298 ++++++++++++++++++ src/Verify.PDFium/Verify.PDFium.csproj | 3 + src/Verify.PDFium/VerifyPDFium.cs | 7 +- 11 files changed, 385 insertions(+), 9 deletions(-) create mode 100644 src/Tests/PdfNormalizerTests.cs create mode 100644 src/Verify.PDFium/PdfNormalizer.cs diff --git a/claude.md b/claude.md index fcb9cba..9f9ce4c 100644 --- a/claude.md +++ b/claude.md @@ -31,11 +31,14 @@ All source lives under `src/`. Solution file is `src/Verify.PDFium.slnx`. Entry point is `VerifyPDFium.Initialize(dpi = 96)` which registers a stream converter for the `pdf` extension. The converter loads the document with `Morph.PDFium.PdfiumDocument` and returns a `ConversionResult` containing: 1. `PdfInfo` (page count, per-page size in points and extracted text, document information dictionary) serialized as the info file -2. The original pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml) +2. The pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml) 3. One `png` target per page, named `page_0001` style +To keep snapshots stable for PDFs freshly generated at test time, the non-deterministic fields are neutralized before snapshotting: the trailer `/ID`, the info dictionary `/CreationDate`/`/ModDate` (both in the `pdf` bytes and the info file), and the XMP metadata dates plus `xmpMM:DocumentID`/`InstanceID`. This is done by **`PdfNormalizer`** — a length-preserving, in-place byte scan (no string round-trip, no regex) that overwrites only the volatile characters, so cross-reference offsets stay valid. Values inside a compressed object/metadata stream (`/ObjStm`, flate-compressed XMP) are not reachable by the plaintext scan. + Key files: - **VerifyPDFium.cs** — initialization and the converter +- **PdfNormalizer.cs** — neutralizes non-deterministic `/ID`, dates and XMP identifiers directly on the bytes - **PdfInfo.cs** / **PageInfo.cs** — info shape for the snapshot (per-page width/height in points and text via `PdfPage.GetText()`) Style note: only public types get a namespace declaration (`VerifyTests`); internal types live in the global namespace. diff --git a/src/Tests/PdfNormalizerTests.cs b/src/Tests/PdfNormalizerTests.cs new file mode 100644 index 0000000..ab2e881 --- /dev/null +++ b/src/Tests/PdfNormalizerTests.cs @@ -0,0 +1,69 @@ +using System.Text; +using Morph.PDFium; + +[TestFixture] +public class PdfNormalizerTests +{ + [Test] + public void NeutralizesVolatileValues() + { + var input = + "/ID [ <1122334455667788>] " + + "/CreationDate(D:20240115093000+05'30') " + + "/ModDate(D:20240115093000Z) " + + "2024-01-15T09:30:00+05:30" + + "2024-01-15T09:30:00Z" + + "2024-01-15T09:30:00Z" + + "uuid:0f7b2c9a-1234-5678-9abc-def012345678" + + "xmp.iid:1a2b3c4d"; + var expected = + "/ID [<0000000000000000> <0000000000000000>] " + + "/CreationDate(D:00000000000000+00'00') " + + "/ModDate(D:00000000000000Z) " + + "0000-00-00T00:00:00+00:00" + + "0000-00-00T00:00:00Z" + + "0000-00-00T00:00:00Z" + + $"{new string('0', 41)}" + + $"{new string('0', 16)}"; + Assert.That(Normalize(input), Is.EqualTo(expected)); + } + + [Test] + public void CollapsesDifferingValuesToTheSameOutput() + { + // The same producer emits a stable structure across runs, so two documents differing only + // in the volatile digits/hex normalize to identical bytes. + var a = "/ID [] /CreationDate(D:20240115093000+05'30')"; + var b = "/ID [<99887766>] /CreationDate(D:19991231235959+11'45')"; + Assert.That(a, Is.Not.EqualTo(b)); + Assert.That(Normalize(a), Is.EqualTo(Normalize(b))); + } + + [Test] + public void LeavesLookalikeKeysUntouched() + { + // /IDTree is a name-tree key (not the file identifier), /ModDateStamp is a different name, + // and a self-closing date element has no content: none should be altered. + var input = "/IDTree [1 2] /ModDateStamp(20240101) 2024"; + Assert.That(Normalize(input), Is.EqualTo(input)); + } + + [Test] + public void NormalizedDocumentStillLoads() + { + var original = File.ReadAllBytes("sample.pdf"); + var normalized = PdfNormalizer.Normalize(original); + Assert.That(normalized, Has.Length.EqualTo(original.Length)); + + using var document = PdfiumDocument.Load(normalized); + Assert.That(document.PageCount, Is.EqualTo(1)); + } + + static string Normalize(string value) + { + var bytes = Encoding.Latin1.GetBytes(value); + var normalized = PdfNormalizer.Normalize(bytes); + Assert.That(normalized, Has.Length.EqualTo(bytes.Length)); + return Encoding.Latin1.GetString(normalized); + } +} diff --git a/src/Tests/Samples.MultiPage.verified.pdf b/src/Tests/Samples.MultiPage.verified.pdf index 4b04c14133fea23e3b571e37b6953328d130cd78..af6c8f7a79c79b27e3ce78b2bc445b1ab74c1ede 100644 GIT binary patch delta 203 zcmZo{VQg$+oS?~J00k2bHQ?-xiRHFjAaPwVn*7Mt6q&1Nw^SCU4J$qQv7I%36`QT? W#hIM2YDN;Yvq9ouWlo;rm$)28I(2HQ?-xiRHFjAPHRqLm-;`$kr5@t7x}WwzM=e#R{Yq z1a#4e$&c-<@vGQuZ7UdMI>)CWiqOOf=Mhvo|JgP~!rL>w?i_X?0U%Zj|~`S(rAg^kf+gYy2uUmuSpq Ubi%3`Nzl#)iG!6nIaDhb0Qli6s{jB1 delta 201 zcmX>UdMI>)CWny$5EvR5PBhejvo|JgP~!qg=o%OT(PU|LQ)F(G`cm1_(##YqkX8`T zMI$E5XjtP{vAINJKBJR~iUdMI>)CWiqOOf=Mhvo|JgP~!rL>w?i_X?0U%Zj|~`S(rAg^kf+gYy2uUmuSpq Ubi%3`Nzl#)iG!6nIaDhb0Qli6s{jB1 delta 201 zcmX>UdMI>)CWny$5EvR5PBhejvo|JgP~!qg=o%OT(PU|LQ)F(G`cm1_(##YqkX8`T zMI$E5XjtP{vAINJKBJR~i +/// Neutralizes the non-deterministic fields of a PDF (the trailer /ID, the document +/// information /CreationDate and /ModDate, and the equivalent XMP metadata dates and +/// identifiers) so that the same source document always produces byte-identical snapshot output. +/// +/// +/// All edits are performed directly on the bytes and are length-preserving: only the mutable +/// characters inside each value are overwritten, so every cross-reference offset stays valid and +/// the file never has to be re-serialized. Values that live inside a compressed object or metadata +/// stream (/ObjStm, flate-compressed XMP) are not reachable by this plaintext scan. +/// +static class PdfNormalizer +{ + enum Fill + { + // Zero the ASCII digits only, keeping separators (leaves a readable date). + Digits, + + // Zero the hexadecimal digits (for hex string <...> values). + Hex, + + // Zero every non-whitespace byte (for opaque identifiers). + All + } + + public static byte[] Normalize(byte[] source) + { + var data = (byte[]) source.Clone(); + + // Document information dictionary dates. + ZeroPdfString(data, "/CreationDate"u8, Fill.Digits); + ZeroPdfString(data, "/ModDate"u8, Fill.Digits); + + // Trailer / cross-reference-stream file identifier: /ID [<...> <...>]. + ZeroFileId(data); + + // XMP metadata dates (uncompressed metadata streams only). + ZeroXmpElement(data, "? properties) + { + if (properties is null) + { + return; + } + + foreach (var key in properties.Keys.ToList()) + { + if (key is "CreationDate" or "ModDate") + { + properties[key] = ZeroDigits(properties[key]); + } + } + } + + static string ZeroDigits(string value) => + string.Create(value.Length, value, static (span, state) => + { + for (var i = 0; i < state.Length; i++) + { + var c = state[i]; + span[i] = char.IsAsciiDigit(c) ? '0' : c; + } + }); + + // Finds a name key, then overwrites the string value that follows it. The value may be a + // literal string "(...)" or a hex string "<...>". + static void ZeroPdfString(byte[] data, ReadOnlySpan key, Fill fill) + { + var pos = 0; + while (true) + { + var hit = data.AsSpan(pos).IndexOf(key); + if (hit < 0) + { + return; + } + + var i = pos + hit + key.Length; + pos = i; + + i = SkipWhitespace(data, i); + if (i >= data.Length) + { + return; + } + + if (data[i] == (byte) '(') + { + var start = i + 1; + var end = FindLiteralEnd(data, start); + Overwrite(data, start, end, fill); + pos = end; + } + else if (data[i] == (byte) '<' && (i + 1 >= data.Length || data[i + 1] != (byte) '<')) + { + var start = i + 1; + var end = FindByte(data, start, (byte) '>'); + Overwrite(data, start, end, Fill.Hex); + pos = end; + } + } + } + + // Finds "/ID" followed by an array and zeroes each string element. Anything not shaped like the + // identifier array (for example the "/IDTree" name-tree key) is skipped. + static void ZeroFileId(byte[] data) + { + var key = "/ID"u8; + var pos = 0; + while (true) + { + var hit = data.AsSpan(pos).IndexOf(key); + if (hit < 0) + { + return; + } + + var i = pos + hit + key.Length; + pos = i; + + i = SkipWhitespace(data, i); + if (i >= data.Length || data[i] != (byte) '[') + { + continue; + } + + i++; + while (i < data.Length && data[i] != (byte) ']') + { + if (data[i] == (byte) '<') + { + var start = i + 1; + i = FindByte(data, start, (byte) '>'); + Overwrite(data, start, i, Fill.Hex); + i++; + } + else if (data[i] == (byte) '(') + { + var start = i + 1; + i = FindLiteralEnd(data, start); + Overwrite(data, start, i, Fill.All); + i++; + } + else + { + i++; + } + } + + pos = i; + } + } + + // Finds an XMP element by its opening tag and zeroes the text content up to the next '<'. + static void ZeroXmpElement(byte[] data, ReadOnlySpan openTag, Fill fill) + { + var pos = 0; + while (true) + { + var hit = data.AsSpan(pos).IndexOf(openTag); + if (hit < 0) + { + return; + } + + var i = pos + hit + openTag.Length; + pos = i; + + // Reject a longer element name that merely shares this prefix. + if (i < data.Length && data[i] != (byte) '>' && data[i] != (byte) '/' && !IsWhitespace(data[i])) + { + continue; + } + + // Skip the remainder of the opening tag, remembering the last significant byte so a + // self-closing "" can be detected. + var lastSignificant = (byte) 0; + while (i < data.Length && data[i] != (byte) '>') + { + if (!IsWhitespace(data[i])) + { + lastSignificant = data[i]; + } + + i++; + } + + if (i >= data.Length) + { + return; + } + + i++; + if (lastSignificant == (byte) '/') + { + continue; + } + + var start = i; + var end = FindByte(data, start, (byte) '<'); + Overwrite(data, start, end, fill); + pos = end; + } + } + + static void Overwrite(byte[] data, int start, int end, Fill fill) + { + for (var i = start; i < end; i++) + { + var c = data[i]; + var replace = fill switch + { + Fill.Digits => IsDigit(c), + Fill.Hex => IsHexDigit(c), + _ => !IsWhitespace(c) + }; + if (replace) + { + data[i] = (byte) '0'; + } + } + } + + // Returns the index of the ')' that closes the literal string starting at 'start', honoring + // backslash escapes and balanced parentheses, or the end of the buffer if unterminated. + static int FindLiteralEnd(byte[] data, int start) + { + var depth = 1; + var i = start; + while (i < data.Length) + { + var c = data[i]; + if (c == (byte) '\\') + { + i += 2; + continue; + } + + if (c == (byte) '(') + { + depth++; + } + else if (c == (byte) ')') + { + depth--; + if (depth == 0) + { + return i; + } + } + + i++; + } + + return data.Length; + } + + static int FindByte(byte[] data, int start, byte target) + { + var i = start; + while (i < data.Length && data[i] != target) + { + i++; + } + + return i; + } + + static int SkipWhitespace(byte[] data, int i) + { + while (i < data.Length && IsWhitespace(data[i])) + { + i++; + } + + return i; + } + + static bool IsDigit(byte b) => + b is >= (byte) '0' and <= (byte) '9'; + + static bool IsHexDigit(byte b) => + b is >= (byte) '0' and <= (byte) '9' or >= (byte) 'a' and <= (byte) 'f' or >= (byte) 'A' and <= (byte) 'F'; + + static bool IsWhitespace(byte b) => + b is (byte) ' ' or (byte) '\t' or (byte) '\r' or (byte) '\n' or (byte) '\f' or 0; +} diff --git a/src/Verify.PDFium/Verify.PDFium.csproj b/src/Verify.PDFium/Verify.PDFium.csproj index 83a354c..de97390 100644 --- a/src/Verify.PDFium/Verify.PDFium.csproj +++ b/src/Verify.PDFium/Verify.PDFium.csproj @@ -8,4 +8,7 @@ + + + diff --git a/src/Verify.PDFium/VerifyPDFium.cs b/src/Verify.PDFium/VerifyPDFium.cs index 67c6600..b949c6d 100644 --- a/src/Verify.PDFium/VerifyPDFium.cs +++ b/src/Verify.PDFium/VerifyPDFium.cs @@ -37,7 +37,7 @@ static ConversionResult Convert(Stream stream) List targets = [ - new("pdf", new MemoryStream(bytes)) + new("pdf", new MemoryStream(PdfNormalizer.Normalize(bytes))) { BypassComparersForSubsequentOnDifference = true } @@ -60,11 +60,14 @@ static ConversionResult Convert(Stream stream) targets.Add(new("png", new MemoryStream(png), $"page_{index + 1:0000}")); } + var properties = document.GetProperties(); + PdfNormalizer.NormalizeProperties(properties); + var info = new PdfInfo { PageCount = document.PageCount, Pages = pages, - Properties = document.GetProperties() + Properties = properties }; return new(info, targets); From 255e2afa57cf082fab0674c935b3beed0ab72f16 Mon Sep 17 00:00:00 2001 From: Simon Cropp Date: Wed, 1 Jul 2026 19:44:46 +1000 Subject: [PATCH 2/4] . --- src/Tests/GlobalUsings.cs | 3 +++ src/Tests/PdfNormalizerTests.cs | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 src/Tests/GlobalUsings.cs diff --git a/src/Tests/GlobalUsings.cs b/src/Tests/GlobalUsings.cs new file mode 100644 index 0000000..c814e0c --- /dev/null +++ b/src/Tests/GlobalUsings.cs @@ -0,0 +1,3 @@ +// Global using directives + +global using Morph.PDFium; \ No newline at end of file diff --git a/src/Tests/PdfNormalizerTests.cs b/src/Tests/PdfNormalizerTests.cs index ab2e881..ed97aef 100644 --- a/src/Tests/PdfNormalizerTests.cs +++ b/src/Tests/PdfNormalizerTests.cs @@ -1,6 +1,3 @@ -using System.Text; -using Morph.PDFium; - [TestFixture] public class PdfNormalizerTests { From ede9e8586c17bef93a44877974e6fb8c0bb622e9 Mon Sep 17 00:00:00 2001 From: Simon Cropp Date: Wed, 1 Jul 2026 19:53:59 +1000 Subject: [PATCH 3/4] . --- src/Tests/PdfNormalizerTests.cs | 13 +++++-------- src/Verify.PDFium/PdfNormalizer.cs | 6 +----- src/Verify.PDFium/VerifyPDFium.cs | 6 +++--- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/Tests/PdfNormalizerTests.cs b/src/Tests/PdfNormalizerTests.cs index ed97aef..3d92b98 100644 --- a/src/Tests/PdfNormalizerTests.cs +++ b/src/Tests/PdfNormalizerTests.cs @@ -48,19 +48,16 @@ public void LeavesLookalikeKeysUntouched() [Test] public void NormalizedDocumentStillLoads() { - var original = File.ReadAllBytes("sample.pdf"); - var normalized = PdfNormalizer.Normalize(original); - Assert.That(normalized, Has.Length.EqualTo(original.Length)); + var data = File.ReadAllBytes("sample.pdf"); - using var document = PdfiumDocument.Load(normalized); + using var document = PdfiumDocument.Load(data); Assert.That(document.PageCount, Is.EqualTo(1)); } static string Normalize(string value) { - var bytes = Encoding.Latin1.GetBytes(value); - var normalized = PdfNormalizer.Normalize(bytes); - Assert.That(normalized, Has.Length.EqualTo(bytes.Length)); - return Encoding.Latin1.GetString(normalized); + var data = Encoding.Latin1.GetBytes(value); + PdfNormalizer.Normalize(data); + return Encoding.Latin1.GetString(data); } } diff --git a/src/Verify.PDFium/PdfNormalizer.cs b/src/Verify.PDFium/PdfNormalizer.cs index a438da8..6ba2b70 100644 --- a/src/Verify.PDFium/PdfNormalizer.cs +++ b/src/Verify.PDFium/PdfNormalizer.cs @@ -23,10 +23,8 @@ enum Fill All } - public static byte[] Normalize(byte[] source) + public static void Normalize(byte[] data) { - var data = (byte[]) source.Clone(); - // Document information dictionary dates. ZeroPdfString(data, "/CreationDate"u8, Fill.Digits); ZeroPdfString(data, "/ModDate"u8, Fill.Digits); @@ -43,8 +41,6 @@ public static byte[] Normalize(byte[] source) ZeroXmpElement(data, "? properties) diff --git a/src/Verify.PDFium/VerifyPDFium.cs b/src/Verify.PDFium/VerifyPDFium.cs index b949c6d..c9cc55d 100644 --- a/src/Verify.PDFium/VerifyPDFium.cs +++ b/src/Verify.PDFium/VerifyPDFium.cs @@ -33,16 +33,16 @@ static ConversionResult Convert(Stream stream) stream.CopyTo(buffer); var bytes = buffer.ToArray(); - using var document = PdfiumDocument.Load(bytes); - + PdfNormalizer.Normalize(bytes); List targets = [ - new("pdf", new MemoryStream(PdfNormalizer.Normalize(bytes))) + new("pdf", new MemoryStream(bytes)) { BypassComparersForSubsequentOnDifference = true } ]; + using var document = PdfiumDocument.Load(bytes); var pages = new List(document.PageCount); for (var index = 0; index < document.PageCount; index++) { From 09eae4bffadb2e2f72eae1fb44bb8b7ea0395091 Mon Sep 17 00:00:00 2001 From: Simon Cropp Date: Wed, 1 Jul 2026 20:25:28 +1000 Subject: [PATCH 4/4] . --- claude.md | 7 +- src/Tests/PdfDateTests.cs | 39 +++++++ src/Tests/PdfNormalizerTests.cs | 1 + src/Tests/Samples.MultiPage.verified.txt | 14 +-- src/Tests/Samples.VerifyPdf.verified.txt | 14 +-- .../Samples.VerifyPdfStream.verified.txt | 14 +-- src/Verify.PDFium/PdfDate.cs | 102 ++++++++++++++++++ src/Verify.PDFium/PdfInfo.cs | 2 +- src/Verify.PDFium/PdfNormalizer.cs | 30 +++--- src/Verify.PDFium/VerifyPDFium.cs | 65 +++++------ 10 files changed, 218 insertions(+), 70 deletions(-) create mode 100644 src/Tests/PdfDateTests.cs create mode 100644 src/Verify.PDFium/PdfDate.cs diff --git a/claude.md b/claude.md index 9f9ce4c..788e2df 100644 --- a/claude.md +++ b/claude.md @@ -34,11 +34,14 @@ Entry point is `VerifyPDFium.Initialize(dpi = 96)` which registers a stream conv 2. The pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml) 3. One `png` target per page, named `page_0001` style -To keep snapshots stable for PDFs freshly generated at test time, the non-deterministic fields are neutralized before snapshotting: the trailer `/ID`, the info dictionary `/CreationDate`/`/ModDate` (both in the `pdf` bytes and the info file), and the XMP metadata dates plus `xmpMM:DocumentID`/`InstanceID`. This is done by **`PdfNormalizer`** — a length-preserving, in-place byte scan (no string round-trip, no regex) that overwrites only the volatile characters, so cross-reference offsets stay valid. Values inside a compressed object/metadata stream (`/ObjStm`, flate-compressed XMP) are not reachable by the plaintext scan. +To keep snapshots stable for PDFs freshly generated at test time, the non-deterministic fields are neutralized two ways: +- **In the `pdf` bytes** (`PdfNormalizer.Normalize`): the trailer `/ID`, the info-dictionary `/CreationDate`/`/ModDate`, and the XMP metadata dates plus `xmpMM:DocumentID`/`InstanceID` are overwritten by a length-preserving, in-place byte scan (no string round-trip, no regex) — only the volatile characters change, so cross-reference offsets stay valid. `Normalize` mutates in place and runs *after* the `PdfiumDocument` (which reads lazily from the same buffer) is disposed, so no defensive copy is needed. Values inside a compressed object/metadata stream (`/ObjStm`, flate-compressed XMP) are not reachable by the plaintext scan. +- **In the info file** (`PdfNormalizer.NormalizeProperties`): `Properties` is a `Dictionary` whose `CreationDate`/`ModDate` values are parsed (`PdfDate`) into `DateTimeOffset`, so Verify's built-in date scrubbing renders them deterministically (`DateTimeOffset_1` etc.). Properties are read from the original bytes, before `Normalize` zeroes them. Key files: - **VerifyPDFium.cs** — initialization and the converter -- **PdfNormalizer.cs** — neutralizes non-deterministic `/ID`, dates and XMP identifiers directly on the bytes +- **PdfNormalizer.cs** — neutralizes the `pdf` bytes and projects the info-file properties to the scrubbable object map +- **PdfDate.cs** — parses PDF date strings (`D:YYYYMMDD…`) to `DateTimeOffset` - **PdfInfo.cs** / **PageInfo.cs** — info shape for the snapshot (per-page width/height in points and text via `PdfPage.GetText()`) Style note: only public types get a namespace declaration (`VerifyTests`); internal types live in the global namespace. diff --git a/src/Tests/PdfDateTests.cs b/src/Tests/PdfDateTests.cs new file mode 100644 index 0000000..cb0e0cf --- /dev/null +++ b/src/Tests/PdfDateTests.cs @@ -0,0 +1,39 @@ +[TestFixture] +public class PdfDateTests +{ + [Test] + public void ParsesFullDateWithPositiveOffset() + { + Assert.That(PdfDate.TryParse("D:20240115093000+05'30'", out var date), Is.True); + Assert.That(date, Is.EqualTo(new DateTimeOffset(2024, 1, 15, 9, 30, 0, new(5, 30, 0)))); + } + + [Test] + public void ParsesUtcOffset() + { + Assert.That(PdfDate.TryParse("D:20211105091500Z", out var date), Is.True); + Assert.That(date, Is.EqualTo(new DateTimeOffset(2021, 11, 5, 9, 15, 0, TimeSpan.Zero))); + } + + [Test] + public void ParsesNegativeOffset() + { + Assert.That(PdfDate.TryParse("D:19991231235959-08'00'", out var date), Is.True); + Assert.That(date, Is.EqualTo(new DateTimeOffset(1999, 12, 31, 23, 59, 59, new(-8, 0, 0)))); + } + + [Test] + public void DefaultsOmittedComponents() + { + Assert.That(PdfDate.TryParse("D:2024", out var date), Is.True); + Assert.That(date, Is.EqualTo(new DateTimeOffset(2024, 1, 1, 0, 0, 0, TimeSpan.Zero))); + } + + [Test] + public void RejectsNonDate() => + Assert.That(PdfDate.TryParse("not a date", out _), Is.False); + + [Test] + public void RejectsOutOfRangeComponents() => + Assert.That(PdfDate.TryParse("D:20241350000000Z", out _), Is.False); +} diff --git a/src/Tests/PdfNormalizerTests.cs b/src/Tests/PdfNormalizerTests.cs index 3d92b98..bed189f 100644 --- a/src/Tests/PdfNormalizerTests.cs +++ b/src/Tests/PdfNormalizerTests.cs @@ -49,6 +49,7 @@ public void LeavesLookalikeKeysUntouched() public void NormalizedDocumentStillLoads() { var data = File.ReadAllBytes("sample.pdf"); + PdfNormalizer.Normalize(data); using var document = PdfiumDocument.Load(data); Assert.That(document.PageCount, Is.EqualTo(1)); diff --git a/src/Tests/Samples.MultiPage.verified.txt b/src/Tests/Samples.MultiPage.verified.txt index bd5ca51..91a1f00 100644 --- a/src/Tests/Samples.MultiPage.verified.txt +++ b/src/Tests/Samples.MultiPage.verified.txt @@ -1,5 +1,11 @@ { PageCount: 4, + Properties: { + CreationDate: DateTimeOffset_1, + Creator: Morph, + ModDate: DateTimeOffset_1, + Producer: PDFsharp 6.2.4 + }, Pages: [ { Width: 612.0, @@ -171,11 +177,5 @@ Paragraph 50: Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do ei incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris. } - ], - Properties: { - CreationDate: D:00000000000000+00'00', - Creator: Morph, - ModDate: D:00000000000000+00'00', - Producer: PDFsharp 6.2.4 - } + ] } \ No newline at end of file diff --git a/src/Tests/Samples.VerifyPdf.verified.txt b/src/Tests/Samples.VerifyPdf.verified.txt index 5e9a61d..9540e7d 100644 --- a/src/Tests/Samples.VerifyPdf.verified.txt +++ b/src/Tests/Samples.VerifyPdf.verified.txt @@ -1,16 +1,16 @@ { PageCount: 1, + Properties: { + CreationDate: DateTimeOffset_1, + Creator: Morph, + ModDate: DateTimeOffset_1, + Producer: PDFsharp 6.2.4 + }, Pages: [ { Width: 612.0, Height: 792.0, Text: Hello, World! This is a simple paragraph. } - ], - Properties: { - CreationDate: D:00000000000000+00'00', - Creator: Morph, - ModDate: D:00000000000000+00'00', - Producer: PDFsharp 6.2.4 - } + ] } \ No newline at end of file diff --git a/src/Tests/Samples.VerifyPdfStream.verified.txt b/src/Tests/Samples.VerifyPdfStream.verified.txt index 5e9a61d..9540e7d 100644 --- a/src/Tests/Samples.VerifyPdfStream.verified.txt +++ b/src/Tests/Samples.VerifyPdfStream.verified.txt @@ -1,16 +1,16 @@ { PageCount: 1, + Properties: { + CreationDate: DateTimeOffset_1, + Creator: Morph, + ModDate: DateTimeOffset_1, + Producer: PDFsharp 6.2.4 + }, Pages: [ { Width: 612.0, Height: 792.0, Text: Hello, World! This is a simple paragraph. } - ], - Properties: { - CreationDate: D:00000000000000+00'00', - Creator: Morph, - ModDate: D:00000000000000+00'00', - Producer: PDFsharp 6.2.4 - } + ] } \ No newline at end of file diff --git a/src/Verify.PDFium/PdfDate.cs b/src/Verify.PDFium/PdfDate.cs new file mode 100644 index 0000000..17fb29a --- /dev/null +++ b/src/Verify.PDFium/PdfDate.cs @@ -0,0 +1,102 @@ +/// +/// Parses a PDF date string (PDF 32000-1:2008 §7.9.4), for example D:20240115093000+05'30', +/// into a . Every component after the year is optional. +/// +static class PdfDate +{ + public static bool TryParse(string value, out DateTimeOffset date) + { + date = default; + + var span = value.AsSpan(); + if (span.StartsWith("D:")) + { + span = span[2..]; + } + + if (!TryFixed(span, 0, 4, out var year) || + !TryOptional(span, 4, 1, out var month) || + !TryOptional(span, 6, 1, out var day) || + !TryOptional(span, 8, 0, out var hour) || + !TryOptional(span, 10, 0, out var minute) || + !TryOptional(span, 12, 0, out var second) || + !TryOffset(span, 14, out var offset)) + { + return false; + } + + try + { + date = new(year, month, day, hour, minute, second, offset); + return true; + } + catch (ArgumentException) + { + // Out-of-range component (e.g. month 13) or offset. + return false; + } + } + + // Parses a mandatory run of exactly length digits at start. + static bool TryFixed(ReadOnlySpan span, int start, int length, out int value) + { + value = 0; + return span.Length >= start + length && + int.TryParse(span.Slice(start, length), out value); + } + + // Parses an optional two-digit component; when the string ends before it, yields fallback. + static bool TryOptional(ReadOnlySpan span, int start, int fallback, out int value) + { + value = fallback; + return span.Length <= start || + TryFixed(span, start, 2, out value); + } + + static bool TryOffset(ReadOnlySpan span, int start, out TimeSpan offset) + { + offset = TimeSpan.Zero; + if (span.Length <= start) + { + return true; + } + + var indicator = span[start]; + if (indicator is 'Z' or 'z') + { + return true; + } + + if (indicator != '+' && indicator != '-') + { + return false; + } + + if (!TryFixed(span, start + 1, 2, out var hours)) + { + return false; + } + + // Minutes follow the hours, separated by an apostrophe: HH'mm'. + var minuteStart = start + 3; + if (minuteStart < span.Length && span[minuteStart] == '\'') + { + minuteStart++; + } + + var minutes = 0; + if (minuteStart < span.Length && + !TryFixed(span, minuteStart, 2, out minutes)) + { + return false; + } + + offset = new(hours, minutes, 0); + if (indicator == '-') + { + offset = -offset; + } + + return true; + } +} diff --git a/src/Verify.PDFium/PdfInfo.cs b/src/Verify.PDFium/PdfInfo.cs index 06e1171..3c2a9b3 100644 --- a/src/Verify.PDFium/PdfInfo.cs +++ b/src/Verify.PDFium/PdfInfo.cs @@ -1,6 +1,6 @@ class PdfInfo { public required int PageCount { get; init; } + public Dictionary? Properties { get; init; } public required IReadOnlyList Pages { get; init; } - public Dictionary? Properties { get; init; } } diff --git a/src/Verify.PDFium/PdfNormalizer.cs b/src/Verify.PDFium/PdfNormalizer.cs index 6ba2b70..59ca3b8 100644 --- a/src/Verify.PDFium/PdfNormalizer.cs +++ b/src/Verify.PDFium/PdfNormalizer.cs @@ -43,31 +43,31 @@ public static void Normalize(byte[] data) ZeroXmpElement(data, "? properties) + // Projects the raw string properties to an object map, parsing the dates to DateTimeOffset so + // Verify's built-in date scrubbing makes them deterministic. Non-dates stay as strings. + public static Dictionary? NormalizeProperties(Dictionary? properties) { if (properties is null) { - return; + return null; } - foreach (var key in properties.Keys.ToList()) + var result = new Dictionary(properties.Count); + foreach (var (key, value) in properties) { - if (key is "CreationDate" or "ModDate") + if (key is "CreationDate" or "ModDate" && + PdfDate.TryParse(value, out var date)) { - properties[key] = ZeroDigits(properties[key]); + result[key] = date; } - } - } - - static string ZeroDigits(string value) => - string.Create(value.Length, value, static (span, state) => - { - for (var i = 0; i < state.Length; i++) + else { - var c = state[i]; - span[i] = char.IsAsciiDigit(c) ? '0' : c; + result[key] = value; } - }); + } + + return result; + } // Finds a name key, then overwrites the string value that follows it. The value may be a // literal string "(...)" or a hex string "<...>". diff --git a/src/Verify.PDFium/VerifyPDFium.cs b/src/Verify.PDFium/VerifyPDFium.cs index c9cc55d..480e3b0 100644 --- a/src/Verify.PDFium/VerifyPDFium.cs +++ b/src/Verify.PDFium/VerifyPDFium.cs @@ -33,42 +33,45 @@ static ConversionResult Convert(Stream stream) stream.CopyTo(buffer); var bytes = buffer.ToArray(); - PdfNormalizer.Normalize(bytes); - List targets = - [ - new("pdf", new MemoryStream(bytes)) + List targets = []; + PdfInfo info; + using (var document = PdfiumDocument.Load(bytes)) + { + var pageCount = document.PageCount; + var pages = new List(pageCount); + for (var index = 0; index < pageCount; index++) { - BypassComparersForSubsequentOnDifference = true - } - ]; + using var page = document.LoadPage(index); + var size = page.Size; + pages.Add( + new() + { + Width = size.Width, + Height = size.Height, + Text = page.GetText() + }); - using var document = PdfiumDocument.Load(bytes); - var pages = new List(document.PageCount); - for (var index = 0; index < document.PageCount; index++) - { - using var page = document.LoadPage(index); - var size = page.Size; - pages.Add( - new() - { - Width = size.Width, - Height = size.Height, - Text = page.GetText() - }); + var png = document.RenderPage(index, dpi); + targets.Add(new("png", new MemoryStream(png), $"page_{index + 1:0000}")); + } - var png = document.RenderPage(index, dpi); - targets.Add(new("png", new MemoryStream(png), $"page_{index + 1:0000}")); + info = new() + { + PageCount = pageCount, + Pages = pages, + Properties = PdfNormalizer.NormalizeProperties(document.GetProperties()) + }; } - var properties = document.GetProperties(); - PdfNormalizer.NormalizeProperties(properties); - - var info = new PdfInfo - { - PageCount = document.PageCount, - Pages = pages, - Properties = properties - }; + // Neutralize the volatile fields for the pdf snapshot only once the document, which reads + // lazily from the same buffer, has been released. + PdfNormalizer.Normalize(bytes); + targets.Insert( + 0, + new("pdf", new MemoryStream(bytes)) + { + BypassComparersForSubsequentOnDifference = true + }); return new(info, targets); }