diff --git a/claude.md b/claude.md index fcb9cba..788e2df 100644 --- a/claude.md +++ b/claude.md @@ -31,11 +31,17 @@ All source lives under `src/`. Solution file is `src/Verify.PDFium.slnx`. Entry point is `VerifyPDFium.Initialize(dpi = 96)` which registers a stream converter for the `pdf` extension. The converter loads the document with `Morph.PDFium.PdfiumDocument` and returns a `ConversionResult` containing: 1. `PdfInfo` (page count, per-page size in points and extracted text, document information dictionary) serialized as the info file -2. The original pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml) +2. The pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml) 3. One `png` target per page, named `page_0001` style +To keep snapshots stable for PDFs freshly generated at test time, the non-deterministic fields are neutralized two ways: +- **In the `pdf` bytes** (`PdfNormalizer.Normalize`): the trailer `/ID`, the info-dictionary `/CreationDate`/`/ModDate`, and the XMP metadata dates plus `xmpMM:DocumentID`/`InstanceID` are overwritten by a length-preserving, in-place byte scan (no string round-trip, no regex) — only the volatile characters change, so cross-reference offsets stay valid. `Normalize` mutates in place and runs *after* the `PdfiumDocument` (which reads lazily from the same buffer) is disposed, so no defensive copy is needed. Values inside a compressed object/metadata stream (`/ObjStm`, flate-compressed XMP) are not reachable by the plaintext scan. +- **In the info file** (`PdfNormalizer.NormalizeProperties`): `Properties` is a `Dictionary` whose `CreationDate`/`ModDate` values are parsed (`PdfDate`) into `DateTimeOffset`, so Verify's built-in date scrubbing renders them deterministically (`DateTimeOffset_1` etc.). Properties are read from the original bytes, before `Normalize` zeroes them. 
+ Key files: - **VerifyPDFium.cs** — initialization and the converter +- **PdfNormalizer.cs** — neutralizes the `pdf` bytes and projects the info-file properties to the scrubbable object map +- **PdfDate.cs** — parses PDF date strings (`D:YYYYMMDD…`) to `DateTimeOffset` - **PdfInfo.cs** / **PageInfo.cs** — info shape for the snapshot (per-page width/height in points and text via `PdfPage.GetText()`) Style note: only public types get a namespace declaration (`VerifyTests`); internal types live in the global namespace. diff --git a/src/Tests/GlobalUsings.cs b/src/Tests/GlobalUsings.cs new file mode 100644 index 0000000..c814e0c --- /dev/null +++ b/src/Tests/GlobalUsings.cs @@ -0,0 +1,3 @@ +// Global using directives + +global using Morph.PDFium; \ No newline at end of file diff --git a/src/Tests/PdfDateTests.cs b/src/Tests/PdfDateTests.cs new file mode 100644 index 0000000..cb0e0cf --- /dev/null +++ b/src/Tests/PdfDateTests.cs @@ -0,0 +1,39 @@ +[TestFixture] +public class PdfDateTests +{ + [Test] + public void ParsesFullDateWithPositiveOffset() + { + Assert.That(PdfDate.TryParse("D:20240115093000+05'30'", out var date), Is.True); + Assert.That(date, Is.EqualTo(new DateTimeOffset(2024, 1, 15, 9, 30, 0, new(5, 30, 0)))); + } + + [Test] + public void ParsesUtcOffset() + { + Assert.That(PdfDate.TryParse("D:20211105091500Z", out var date), Is.True); + Assert.That(date, Is.EqualTo(new DateTimeOffset(2021, 11, 5, 9, 15, 0, TimeSpan.Zero))); + } + + [Test] + public void ParsesNegativeOffset() + { + Assert.That(PdfDate.TryParse("D:19991231235959-08'00'", out var date), Is.True); + Assert.That(date, Is.EqualTo(new DateTimeOffset(1999, 12, 31, 23, 59, 59, new(-8, 0, 0)))); + } + + [Test] + public void DefaultsOmittedComponents() + { + Assert.That(PdfDate.TryParse("D:2024", out var date), Is.True); + Assert.That(date, Is.EqualTo(new DateTimeOffset(2024, 1, 1, 0, 0, 0, TimeSpan.Zero))); + } + + [Test] + public void RejectsNonDate() => + Assert.That(PdfDate.TryParse("not a date", 
out _), Is.False); + + [Test] + public void RejectsOutOfRangeComponents() => + Assert.That(PdfDate.TryParse("D:20241350000000Z", out _), Is.False); +} diff --git a/src/Tests/PdfNormalizerTests.cs b/src/Tests/PdfNormalizerTests.cs new file mode 100644 index 0000000..bed189f --- /dev/null +++ b/src/Tests/PdfNormalizerTests.cs @@ -0,0 +1,64 @@ +[TestFixture] +public class PdfNormalizerTests +{ + [Test] + public void NeutralizesVolatileValues() + { + var input = + "/ID [ <1122334455667788>] " + + "/CreationDate(D:20240115093000+05'30') " + + "/ModDate(D:20240115093000Z) " + + "2024-01-15T09:30:00+05:30" + + "2024-01-15T09:30:00Z" + + "2024-01-15T09:30:00Z" + + "uuid:0f7b2c9a-1234-5678-9abc-def012345678" + + "xmp.iid:1a2b3c4d"; + var expected = + "/ID [<0000000000000000> <0000000000000000>] " + + "/CreationDate(D:00000000000000+00'00') " + + "/ModDate(D:00000000000000Z) " + + "0000-00-00T00:00:00+00:00" + + "0000-00-00T00:00:00Z" + + "0000-00-00T00:00:00Z" + + $"{new string('0', 41)}" + + $"{new string('0', 16)}"; + Assert.That(Normalize(input), Is.EqualTo(expected)); + } + + [Test] + public void CollapsesDifferingValuesToTheSameOutput() + { + // The same producer emits a stable structure across runs, so two documents differing only + // in the volatile digits/hex normalize to identical bytes. + var a = "/ID [] /CreationDate(D:20240115093000+05'30')"; + var b = "/ID [<99887766>] /CreationDate(D:19991231235959+11'45')"; + Assert.That(a, Is.Not.EqualTo(b)); + Assert.That(Normalize(a), Is.EqualTo(Normalize(b))); + } + + [Test] + public void LeavesLookalikeKeysUntouched() + { + // /IDTree is a name-tree key (not the file identifier), /ModDateStamp is a different name, + // and a self-closing date element has no content: none should be altered. 
+ var input = "/IDTree [1 2] /ModDateStamp(20240101) 2024"; + Assert.That(Normalize(input), Is.EqualTo(input)); + } + + [Test] + public void NormalizedDocumentStillLoads() + { + var data = File.ReadAllBytes("sample.pdf"); + PdfNormalizer.Normalize(data); + + using var document = PdfiumDocument.Load(data); + Assert.That(document.PageCount, Is.EqualTo(1)); + } + + static string Normalize(string value) + { + var data = Encoding.Latin1.GetBytes(value); + PdfNormalizer.Normalize(data); + return Encoding.Latin1.GetString(data); + } +} diff --git a/src/Tests/Samples.MultiPage.verified.pdf b/src/Tests/Samples.MultiPage.verified.pdf index 4b04c14..af6c8f7 100644 Binary files a/src/Tests/Samples.MultiPage.verified.pdf and b/src/Tests/Samples.MultiPage.verified.pdf differ diff --git a/src/Tests/Samples.MultiPage.verified.txt b/src/Tests/Samples.MultiPage.verified.txt index a475f40..91a1f00 100644 --- a/src/Tests/Samples.MultiPage.verified.txt +++ b/src/Tests/Samples.MultiPage.verified.txt @@ -1,5 +1,11 @@ { PageCount: 4, + Properties: { + CreationDate: DateTimeOffset_1, + Creator: Morph, + ModDate: DateTimeOffset_1, + Producer: PDFsharp 6.2.4 + }, Pages: [ { Width: 612.0, @@ -171,11 +177,5 @@ Paragraph 50: Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do ei incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris. 
} - ], - Properties: { - CreationDate: D:20000101000000+00'00', - Creator: Morph, - ModDate: D:20000101000000+00'00', - Producer: PDFsharp 6.2.4 - } + ] } \ No newline at end of file diff --git a/src/Tests/Samples.VerifyPdf.verified.pdf b/src/Tests/Samples.VerifyPdf.verified.pdf index acb06ce..1796d98 100644 Binary files a/src/Tests/Samples.VerifyPdf.verified.pdf and b/src/Tests/Samples.VerifyPdf.verified.pdf differ diff --git a/src/Tests/Samples.VerifyPdf.verified.txt b/src/Tests/Samples.VerifyPdf.verified.txt index 6f5f8b2..9540e7d 100644 --- a/src/Tests/Samples.VerifyPdf.verified.txt +++ b/src/Tests/Samples.VerifyPdf.verified.txt @@ -1,16 +1,16 @@ { PageCount: 1, + Properties: { + CreationDate: DateTimeOffset_1, + Creator: Morph, + ModDate: DateTimeOffset_1, + Producer: PDFsharp 6.2.4 + }, Pages: [ { Width: 612.0, Height: 792.0, Text: Hello, World! This is a simple paragraph. } - ], - Properties: { - CreationDate: D:20000101000000+00'00', - Creator: Morph, - ModDate: D:20000101000000+00'00', - Producer: PDFsharp 6.2.4 - } + ] } \ No newline at end of file diff --git a/src/Tests/Samples.VerifyPdfStream.verified.pdf b/src/Tests/Samples.VerifyPdfStream.verified.pdf index acb06ce..1796d98 100644 Binary files a/src/Tests/Samples.VerifyPdfStream.verified.pdf and b/src/Tests/Samples.VerifyPdfStream.verified.pdf differ diff --git a/src/Tests/Samples.VerifyPdfStream.verified.txt b/src/Tests/Samples.VerifyPdfStream.verified.txt index 6f5f8b2..9540e7d 100644 --- a/src/Tests/Samples.VerifyPdfStream.verified.txt +++ b/src/Tests/Samples.VerifyPdfStream.verified.txt @@ -1,16 +1,16 @@ { PageCount: 1, + Properties: { + CreationDate: DateTimeOffset_1, + Creator: Morph, + ModDate: DateTimeOffset_1, + Producer: PDFsharp 6.2.4 + }, Pages: [ { Width: 612.0, Height: 792.0, Text: Hello, World! This is a simple paragraph. 
} - ], - Properties: { - CreationDate: D:20000101000000+00'00', - Creator: Morph, - ModDate: D:20000101000000+00'00', - Producer: PDFsharp 6.2.4 - } + ] } \ No newline at end of file diff --git a/src/Verify.PDFium/PdfDate.cs b/src/Verify.PDFium/PdfDate.cs new file mode 100644 index 0000000..17fb29a --- /dev/null +++ b/src/Verify.PDFium/PdfDate.cs @@ -0,0 +1,102 @@ +/// +/// Parses a PDF date string (PDF 32000-1:2008 §7.9.4), for example D:20240115093000+05'30', +/// into a . Every component after the year is optional. +/// +static class PdfDate +{ + public static bool TryParse(string value, out DateTimeOffset date) + { + date = default; + + var span = value.AsSpan(); + if (span.StartsWith("D:")) + { + span = span[2..]; + } + + if (!TryFixed(span, 0, 4, out var year) || + !TryOptional(span, 4, 1, out var month) || + !TryOptional(span, 6, 1, out var day) || + !TryOptional(span, 8, 0, out var hour) || + !TryOptional(span, 10, 0, out var minute) || + !TryOptional(span, 12, 0, out var second) || + !TryOffset(span, 14, out var offset)) + { + return false; + } + + try + { + date = new(year, month, day, hour, minute, second, offset); + return true; + } + catch (ArgumentException) + { + // Out-of-range component (e.g. month 13) or offset. + return false; + } + } + + // Parses a mandatory run of exactly length digits at start. + static bool TryFixed(ReadOnlySpan span, int start, int length, out int value) + { + value = 0; + return span.Length >= start + length && + int.TryParse(span.Slice(start, length), out value); + } + + // Parses an optional two-digit component; when the string ends before it, yields fallback. 
+ static bool TryOptional(ReadOnlySpan span, int start, int fallback, out int value) + { + value = fallback; + return span.Length <= start || + TryFixed(span, start, 2, out value); + } + + static bool TryOffset(ReadOnlySpan span, int start, out TimeSpan offset) + { + offset = TimeSpan.Zero; + if (span.Length <= start) + { + return true; + } + + var indicator = span[start]; + if (indicator is 'Z' or 'z') + { + return true; + } + + if (indicator != '+' && indicator != '-') + { + return false; + } + + if (!TryFixed(span, start + 1, 2, out var hours)) + { + return false; + } + + // Minutes follow the hours, separated by an apostrophe: HH'mm'. + var minuteStart = start + 3; + if (minuteStart < span.Length && span[minuteStart] == '\'') + { + minuteStart++; + } + + var minutes = 0; + if (minuteStart < span.Length && + !TryFixed(span, minuteStart, 2, out minutes)) + { + return false; + } + + offset = new(hours, minutes, 0); + if (indicator == '-') + { + offset = -offset; + } + + return true; + } +} diff --git a/src/Verify.PDFium/PdfInfo.cs b/src/Verify.PDFium/PdfInfo.cs index 06e1171..3c2a9b3 100644 --- a/src/Verify.PDFium/PdfInfo.cs +++ b/src/Verify.PDFium/PdfInfo.cs @@ -1,6 +1,6 @@ class PdfInfo { public required int PageCount { get; init; } + public Dictionary? Properties { get; init; } public required IReadOnlyList Pages { get; init; } - public Dictionary? Properties { get; init; } } diff --git a/src/Verify.PDFium/PdfNormalizer.cs b/src/Verify.PDFium/PdfNormalizer.cs new file mode 100644 index 0000000..59ca3b8 --- /dev/null +++ b/src/Verify.PDFium/PdfNormalizer.cs @@ -0,0 +1,294 @@ +/// +/// Neutralizes the non-deterministic fields of a PDF (the trailer /ID, the document +/// information /CreationDate and /ModDate, and the equivalent XMP metadata dates and +/// identifiers) so that the same source document always produces byte-identical snapshot output. 
+/// +/// +/// All edits are performed directly on the bytes and are length-preserving: only the mutable +/// characters inside each value are overwritten, so every cross-reference offset stays valid and +/// the file never has to be re-serialized. Values that live inside a compressed object or metadata +/// stream (/ObjStm, flate-compressed XMP) are not reachable by this plaintext scan. +/// +static class PdfNormalizer +{ + enum Fill + { + // Zero the ASCII digits only, keeping separators (leaves a readable date). + Digits, + + // Zero the hexadecimal digits (for hex string <...> values). + Hex, + + // Zero every non-whitespace byte (for opaque identifiers). + All + } + + public static void Normalize(byte[] data) + { + // Document information dictionary dates. + ZeroPdfString(data, "/CreationDate"u8, Fill.Digits); + ZeroPdfString(data, "/ModDate"u8, Fill.Digits); + + // Trailer / cross-reference-stream file identifier: /ID [<...> <...>]. + ZeroFileId(data); + + // XMP metadata dates (uncompressed metadata streams only). + ZeroXmpElement(data, "? NormalizeProperties(Dictionary? properties) + { + if (properties is null) + { + return null; + } + + var result = new Dictionary(properties.Count); + foreach (var (key, value) in properties) + { + if (key is "CreationDate" or "ModDate" && + PdfDate.TryParse(value, out var date)) + { + result[key] = date; + } + else + { + result[key] = value; + } + } + + return result; + } + + // Finds a name key, then overwrites the string value that follows it. The value may be a + // literal string "(...)" or a hex string "<...>". 
+    static void ZeroPdfString(byte[] data, ReadOnlySpan<byte> key, Fill fill)
+    {
+        // 'pos' is the index the next key search starts from; it only ever moves forward.
+        var pos = 0;
+        while (true)
+        {
+            var hit = data.AsSpan(pos).IndexOf(key);
+            if (hit < 0)
+            {
+                return;
+            }
+
+            // Resume after the key even when what follows turns out not to be a string value.
+            var i = pos + hit + key.Length;
+            pos = i;
+
+            i = SkipWhitespace(data, i);
+            if (i >= data.Length)
+            {
+                return;
+            }
+
+            if (data[i] == (byte) '(')
+            {
+                // Literal string: overwrite everything between the parentheses per 'fill'.
+                var start = i + 1;
+                var end = FindLiteralEnd(data, start);
+                Overwrite(data, start, end, fill);
+                pos = end;
+            }
+            else if (data[i] == (byte) '<' && (i + 1 >= data.Length || data[i + 1] != (byte) '<'))
+            {
+                // Hex string (a single '<', not the "<<" that opens a dictionary): the content is
+                // hex digits, so Fill.Hex is used regardless of the requested fill.
+                var start = i + 1;
+                var end = FindByte(data, start, (byte) '>');
+                Overwrite(data, start, end, Fill.Hex);
+                pos = end;
+            }
+        }
+    }
+
+    // Finds "/ID" followed by an array and zeroes each string element. Anything not shaped like the
+    // identifier array (for example the "/IDTree" name-tree key) is skipped.
+    static void ZeroFileId(byte[] data)
+    {
+        var key = "/ID"u8;
+        var pos = 0;
+        while (true)
+        {
+            var hit = data.AsSpan(pos).IndexOf(key);
+            if (hit < 0)
+            {
+                return;
+            }
+
+            var i = pos + hit + key.Length;
+            pos = i;
+
+            i = SkipWhitespace(data, i);
+            if (i >= data.Length || data[i] != (byte) '[')
+            {
+                continue;
+            }
+
+            // Walk the array elements up to the closing ']' (or the end of the buffer).
+            i++;
+            while (i < data.Length && data[i] != (byte) ']')
+            {
+                if (data[i] == (byte) '<')
+                {
+                    var start = i + 1;
+                    i = FindByte(data, start, (byte) '>');
+                    Overwrite(data, start, i, Fill.Hex);
+                    i++;
+                }
+                else if (data[i] == (byte) '(')
+                {
+                    var start = i + 1;
+                    i = FindLiteralEnd(data, start);
+                    Overwrite(data, start, i, Fill.All);
+                    i++;
+                }
+                else
+                {
+                    i++;
+                }
+            }
+
+            // An unterminated element leaves the cursor one past the end of the buffer (the
+            // finders return data.Length and the element branch then steps over the missing
+            // terminator). Searching from there would make AsSpan throw, and there is nothing
+            // left to scan anyway, so stop.
+            if (i >= data.Length)
+            {
+                return;
+            }
+
+            pos = i;
+        }
+    }
+
+    // Finds an XMP element by its opening tag and zeroes the text content up to the next '<'.
+    static void ZeroXmpElement(byte[] data, ReadOnlySpan<byte> openTag, Fill fill)
+    {
+        var pos = 0;
+        while (true)
+        {
+            var hit = data.AsSpan(pos).IndexOf(openTag);
+            if (hit < 0)
+            {
+                return;
+            }
+
+            var i = pos + hit + openTag.Length;
+            pos = i;
+
+            // Reject a longer element name that merely shares this prefix.
+            if (i < data.Length && data[i] != (byte) '>' && data[i] != (byte) '/' && !IsWhitespace(data[i]))
+            {
+                continue;
+            }
+
+            // Skip the remainder of the opening tag, remembering the last significant byte so a
+            // self-closing element (one whose tag ends in "/>") can be detected.
+            var lastSignificant = (byte) 0;
+            while (i < data.Length && data[i] != (byte) '>')
+            {
+                if (!IsWhitespace(data[i]))
+                {
+                    lastSignificant = data[i];
+                }
+
+                i++;
+            }
+
+            // The opening tag never closes: nothing to overwrite.
+            if (i >= data.Length)
+            {
+                return;
+            }
+
+            i++;
+            if (lastSignificant == (byte) '/')
+            {
+                // Self-closing element has no text content.
+                continue;
+            }
+
+            var start = i;
+            var end = FindByte(data, start, (byte) '<');
+            Overwrite(data, start, end, fill);
+            pos = end;
+        }
+    }
+
+    // Overwrites, with ASCII '0', each byte in [start, end) selected by 'fill': decimal digits,
+    // hexadecimal digits, or every non-whitespace byte. The byte count never changes.
+    static void Overwrite(byte[] data, int start, int end, Fill fill)
+    {
+        for (var i = start; i < end; i++)
+        {
+            var c = data[i];
+            var replace = fill switch
+            {
+                Fill.Digits => IsDigit(c),
+                Fill.Hex => IsHexDigit(c),
+                _ => !IsWhitespace(c)
+            };
+            if (replace)
+            {
+                data[i] = (byte) '0';
+            }
+        }
+    }
+
+    // Returns the index of the ')' that closes the literal string starting at 'start', honoring
+    // backslash escapes and balanced parentheses, or the end of the buffer if unterminated.
+ static int FindLiteralEnd(byte[] data, int start) + { + var depth = 1; + var i = start; + while (i < data.Length) + { + var c = data[i]; + if (c == (byte) '\\') + { + i += 2; + continue; + } + + if (c == (byte) '(') + { + depth++; + } + else if (c == (byte) ')') + { + depth--; + if (depth == 0) + { + return i; + } + } + + i++; + } + + return data.Length; + } + + static int FindByte(byte[] data, int start, byte target) + { + var i = start; + while (i < data.Length && data[i] != target) + { + i++; + } + + return i; + } + + static int SkipWhitespace(byte[] data, int i) + { + while (i < data.Length && IsWhitespace(data[i])) + { + i++; + } + + return i; + } + + static bool IsDigit(byte b) => + b is >= (byte) '0' and <= (byte) '9'; + + static bool IsHexDigit(byte b) => + b is >= (byte) '0' and <= (byte) '9' or >= (byte) 'a' and <= (byte) 'f' or >= (byte) 'A' and <= (byte) 'F'; + + static bool IsWhitespace(byte b) => + b is (byte) ' ' or (byte) '\t' or (byte) '\r' or (byte) '\n' or (byte) '\f' or 0; +} diff --git a/src/Verify.PDFium/Verify.PDFium.csproj b/src/Verify.PDFium/Verify.PDFium.csproj index 83a354c..de97390 100644 --- a/src/Verify.PDFium/Verify.PDFium.csproj +++ b/src/Verify.PDFium/Verify.PDFium.csproj @@ -8,4 +8,7 @@ + + + diff --git a/src/Verify.PDFium/VerifyPDFium.cs b/src/Verify.PDFium/VerifyPDFium.cs index 67c6600..480e3b0 100644 --- a/src/Verify.PDFium/VerifyPDFium.cs +++ b/src/Verify.PDFium/VerifyPDFium.cs @@ -33,39 +33,45 @@ static ConversionResult Convert(Stream stream) stream.CopyTo(buffer); var bytes = buffer.ToArray(); - using var document = PdfiumDocument.Load(bytes); - - List targets = - [ - new("pdf", new MemoryStream(bytes)) + List targets = []; + PdfInfo info; + using (var document = PdfiumDocument.Load(bytes)) + { + var pageCount = document.PageCount; + var pages = new List(pageCount); + for (var index = 0; index < pageCount; index++) { - BypassComparersForSubsequentOnDifference = true - } - ]; + using var page = document.LoadPage(index); + 
var size = page.Size; + pages.Add( + new() + { + Width = size.Width, + Height = size.Height, + Text = page.GetText() + }); - var pages = new List(document.PageCount); - for (var index = 0; index < document.PageCount; index++) - { - using var page = document.LoadPage(index); - var size = page.Size; - pages.Add( - new() - { - Width = size.Width, - Height = size.Height, - Text = page.GetText() - }); + var png = document.RenderPage(index, dpi); + targets.Add(new("png", new MemoryStream(png), $"page_{index + 1:0000}")); + } - var png = document.RenderPage(index, dpi); - targets.Add(new("png", new MemoryStream(png), $"page_{index + 1:0000}")); + info = new() + { + PageCount = pageCount, + Pages = pages, + Properties = PdfNormalizer.NormalizeProperties(document.GetProperties()) + }; } - var info = new PdfInfo - { - PageCount = document.PageCount, - Pages = pages, - Properties = document.GetProperties() - }; + // Neutralize the volatile fields for the pdf snapshot only once the document, which reads + // lazily from the same buffer, has been released. + PdfNormalizer.Normalize(bytes); + targets.Insert( + 0, + new("pdf", new MemoryStream(bytes)) + { + BypassComparersForSubsequentOnDifference = true + }); return new(info, targets); }