Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion claude.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,17 @@ All source lives under `src/`. Solution file is `src/Verify.PDFium.slnx`.

Entry point is `VerifyPDFium.Initialize(dpi = 96)` which registers a stream converter for the `pdf` extension. The converter loads the document with `Morph.PDFium.PdfiumDocument` and returns a `ConversionResult` containing:
1. `PdfInfo` (page count, per-page size in points and extracted text, document information dictionary) serialized as the info file
2. The original pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml)
2. The pdf bytes as a `pdf` target (`BypassComparersForSubsequentOnDifference` set, mirroring Verify.OpenXml)
3. One `png` target per page, named in the `page_0001` style

To keep snapshots stable for PDFs freshly generated at test time, the non-deterministic fields are neutralized in two ways:
- **In the `pdf` bytes** (`PdfNormalizer.Normalize`): the trailer `/ID`, the info-dictionary `/CreationDate`/`/ModDate`, and the XMP metadata dates plus `xmpMM:DocumentID`/`InstanceID` are overwritten by a length-preserving, in-place byte scan (no string round-trip, no regex) — only the volatile characters change, so cross-reference offsets stay valid. `Normalize` mutates in place and runs *after* the `PdfiumDocument` (which reads lazily from the same buffer) is disposed, so no defensive copy is needed. Values inside a compressed object/metadata stream (`/ObjStm`, flate-compressed XMP) are not reachable by the plaintext scan.
- **In the info file** (`PdfNormalizer.NormalizeProperties`): `Properties` is a `Dictionary<string, object>` whose `CreationDate`/`ModDate` values are parsed (`PdfDate`) into `DateTimeOffset`, so Verify's built-in date scrubbing renders them deterministically (`DateTimeOffset_1` etc.). Properties are read from the original bytes, before `Normalize` zeroes them.

Key files:
- **VerifyPDFium.cs** — initialization and the converter
- **PdfNormalizer.cs** — neutralizes the `pdf` bytes and projects the info-file properties to the scrubbable object map
- **PdfDate.cs** — parses PDF date strings (`D:YYYYMMDD…`) to `DateTimeOffset`
- **PdfInfo.cs** / **PageInfo.cs** — info shape for the snapshot (per-page width/height in points and text via `PdfPage.GetText()`)

Style note: only public types get a namespace declaration (`VerifyTests`); internal types live in the global namespace.
Expand Down
3 changes: 3 additions & 0 deletions src/Tests/GlobalUsings.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Global using directives

global using Morph.PDFium;
39 changes: 39 additions & 0 deletions src/Tests/PdfDateTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Exercises PdfDate.TryParse against complete, partial and malformed PDF date strings.
[TestFixture]
public class PdfDateTests
{
    [Test]
    public void ParsesFullDateWithPositiveOffset()
    {
        var expected = new DateTimeOffset(2024, 1, 15, 9, 30, 0, new TimeSpan(5, 30, 0));

        var parsed = PdfDate.TryParse("D:20240115093000+05'30'", out var actual);

        Assert.That(parsed, Is.True);
        Assert.That(actual, Is.EqualTo(expected));
    }

    [Test]
    public void ParsesUtcOffset()
    {
        var expected = new DateTimeOffset(2021, 11, 5, 9, 15, 0, TimeSpan.Zero);

        var parsed = PdfDate.TryParse("D:20211105091500Z", out var actual);

        Assert.That(parsed, Is.True);
        Assert.That(actual, Is.EqualTo(expected));
    }

    [Test]
    public void ParsesNegativeOffset()
    {
        var expected = new DateTimeOffset(1999, 12, 31, 23, 59, 59, new TimeSpan(-8, 0, 0));

        var parsed = PdfDate.TryParse("D:19991231235959-08'00'", out var actual);

        Assert.That(parsed, Is.True);
        Assert.That(actual, Is.EqualTo(expected));
    }

    [Test]
    public void DefaultsOmittedComponents()
    {
        // Only the year is supplied; every other component takes its default.
        var expected = new DateTimeOffset(2024, 1, 1, 0, 0, 0, TimeSpan.Zero);

        var parsed = PdfDate.TryParse("D:2024", out var actual);

        Assert.That(parsed, Is.True);
        Assert.That(actual, Is.EqualTo(expected));
    }

    [Test]
    public void RejectsNonDate()
    {
        var parsed = PdfDate.TryParse("not a date", out _);

        Assert.That(parsed, Is.False);
    }

    [Test]
    public void RejectsOutOfRangeComponents()
    {
        // Month 13 and day 50 are syntactically digits but not a real calendar date.
        var parsed = PdfDate.TryParse("D:20241350000000Z", out _);

        Assert.That(parsed, Is.False);
    }
}
64 changes: 64 additions & 0 deletions src/Tests/PdfNormalizerTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Exercises PdfNormalizer.Normalize on synthetic byte fragments and on a real sample document.
[TestFixture]
public class PdfNormalizerTests
{
    [Test]
    public void NeutralizesVolatileValues()
    {
        var documentIdZeros = new string('0', 41);
        var instanceIdZeros = new string('0', 16);

        var source = string.Concat(
            "/ID [<A1B2C3D4E5F60718> <1122334455667788>] ",
            "/CreationDate(D:20240115093000+05'30') ",
            "/ModDate(D:20240115093000Z) ",
            "<xmp:CreateDate>2024-01-15T09:30:00+05:30</xmp:CreateDate>",
            "<xmp:ModifyDate>2024-01-15T09:30:00Z</xmp:ModifyDate>",
            "<xmp:MetadataDate>2024-01-15T09:30:00Z</xmp:MetadataDate>",
            "<xmpMM:DocumentID>uuid:0f7b2c9a-1234-5678-9abc-def012345678</xmpMM:DocumentID>",
            "<xmpMM:InstanceID>xmp.iid:1a2b3c4d</xmpMM:InstanceID>");

        // Same layout and length as the source, with every volatile character replaced by '0'.
        var neutralized = string.Concat(
            "/ID [<0000000000000000> <0000000000000000>] ",
            "/CreationDate(D:00000000000000+00'00') ",
            "/ModDate(D:00000000000000Z) ",
            "<xmp:CreateDate>0000-00-00T00:00:00+00:00</xmp:CreateDate>",
            "<xmp:ModifyDate>0000-00-00T00:00:00Z</xmp:ModifyDate>",
            "<xmp:MetadataDate>0000-00-00T00:00:00Z</xmp:MetadataDate>",
            $"<xmpMM:DocumentID>{documentIdZeros}</xmpMM:DocumentID>",
            $"<xmpMM:InstanceID>{instanceIdZeros}</xmpMM:InstanceID>");

        var actual = Normalize(source);

        Assert.That(actual, Is.EqualTo(neutralized));
    }

    [Test]
    public void CollapsesDifferingValuesToTheSameOutput()
    {
        // Two documents whose structure matches and whose only differences are the volatile
        // digits/hex characters must end up byte-identical once normalized.
        var first = "/ID [<A1B2C3D4>] /CreationDate(D:20240115093000+05'30')";
        var second = "/ID [<99887766>] /CreationDate(D:19991231235959+11'45')";
        Assert.That(first, Is.Not.EqualTo(second));

        var normalizedFirst = Normalize(first);
        var normalizedSecond = Normalize(second);

        Assert.That(normalizedFirst, Is.EqualTo(normalizedSecond));
    }

    [Test]
    public void LeavesLookalikeKeysUntouched()
    {
        // None of these is a real target: /IDTree is a name-tree key rather than the file
        // identifier, /ModDateStamp is a different name, and the self-closing date element
        // carries no content.
        var lookalikes = "/IDTree [1 2] /ModDateStamp(20240101) <xmp:CreateDate/>2024";

        var actual = Normalize(lookalikes);

        Assert.That(actual, Is.EqualTo(lookalikes));
    }

    [Test]
    public void NormalizedDocumentStillLoads()
    {
        var bytes = File.ReadAllBytes("sample.pdf");
        PdfNormalizer.Normalize(bytes);

        using (var loaded = PdfiumDocument.Load(bytes))
        {
            Assert.That(loaded.PageCount, Is.EqualTo(1));
        }
    }

    // Round-trips text through Latin-1 so each char maps to exactly one byte, runs the in-place
    // normalizer over those bytes, and decodes the result back to text.
    static string Normalize(string value)
    {
        var latin1 = Encoding.Latin1;
        var buffer = latin1.GetBytes(value);
        PdfNormalizer.Normalize(buffer);
        return latin1.GetString(buffer);
    }
}
Binary file modified src/Tests/Samples.MultiPage.verified.pdf
Binary file not shown.
14 changes: 7 additions & 7 deletions src/Tests/Samples.MultiPage.verified.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
{
PageCount: 4,
Properties: {
CreationDate: DateTimeOffset_1,
Creator: Morph,
ModDate: DateTimeOffset_1,
Producer: PDFsharp 6.2.4
},
Pages: [
{
Width: 612.0,
Expand Down Expand Up @@ -171,11 +177,5 @@ Paragraph 50: Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do ei
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
ullamco laboris.
}
],
Properties: {
CreationDate: D:20000101000000+00'00',
Creator: Morph,
ModDate: D:20000101000000+00'00',
Producer: PDFsharp 6.2.4
}
]
}
Binary file modified src/Tests/Samples.VerifyPdf.verified.pdf
Binary file not shown.
14 changes: 7 additions & 7 deletions src/Tests/Samples.VerifyPdf.verified.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
{
PageCount: 1,
Properties: {
CreationDate: DateTimeOffset_1,
Creator: Morph,
ModDate: DateTimeOffset_1,
Producer: PDFsharp 6.2.4
},
Pages: [
{
Width: 612.0,
Height: 792.0,
Text: Hello, World! This is a simple paragraph.
}
],
Properties: {
CreationDate: D:20000101000000+00'00',
Creator: Morph,
ModDate: D:20000101000000+00'00',
Producer: PDFsharp 6.2.4
}
]
}
Binary file modified src/Tests/Samples.VerifyPdfStream.verified.pdf
Binary file not shown.
14 changes: 7 additions & 7 deletions src/Tests/Samples.VerifyPdfStream.verified.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
{
PageCount: 1,
Properties: {
CreationDate: DateTimeOffset_1,
Creator: Morph,
ModDate: DateTimeOffset_1,
Producer: PDFsharp 6.2.4
},
Pages: [
{
Width: 612.0,
Height: 792.0,
Text: Hello, World! This is a simple paragraph.
}
],
Properties: {
CreationDate: D:20000101000000+00'00',
Creator: Morph,
ModDate: D:20000101000000+00'00',
Producer: PDFsharp 6.2.4
}
]
}
102 changes: 102 additions & 0 deletions src/Verify.PDFium/PdfDate.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/// <summary>
/// Parses a PDF date string (PDF 32000-1:2008 §7.9.4), for example <c>D:20240115093000+05'30'</c>,
/// into a <see cref="DateTimeOffset"/>. Every component after the year is optional.
/// </summary>
/// <remarks>
/// After the optional <c>D:</c> prefix the layout is <c>YYYYMMDDHHmmSS</c> followed by an optional
/// offset: <c>Z</c>, or <c>+</c>/<c>-</c> with <c>HH'mm'</c>. An omitted month or day defaults to 1;
/// omitted time components and an omitted offset default to zero. Characters following a complete
/// offset are ignored.
/// </remarks>
static class PdfDate
{
    /// <summary>
    /// Attempts to parse <paramref name="value"/> as a PDF date.
    /// </summary>
    /// <param name="value">The raw date string, with or without the leading <c>D:</c>.</param>
    /// <param name="date">The parsed date on success; <c>default</c> on failure.</param>
    /// <returns>
    /// <c>true</c> when every present component is made of digits and together they form a valid
    /// <see cref="DateTimeOffset"/>; otherwise <c>false</c>.
    /// </returns>
    public static bool TryParse(string value, out DateTimeOffset date)
    {
        date = default;

        var span = value.AsSpan();
        if (span.StartsWith("D:"))
        {
            span = span[2..];
        }

        // Components sit at fixed positions: YYYY MM DD HH mm SS, then the offset.
        if (!TryFixed(span, 0, 4, out var year) ||
            !TryOptional(span, 4, 1, out var month) ||
            !TryOptional(span, 6, 1, out var day) ||
            !TryOptional(span, 8, 0, out var hour) ||
            !TryOptional(span, 10, 0, out var minute) ||
            !TryOptional(span, 12, 0, out var second) ||
            !TryOffset(span, 14, out var offset))
        {
            return false;
        }

        try
        {
            date = new(year, month, day, hour, minute, second, offset);
            return true;
        }
        catch (ArgumentException)
        {
            // Out-of-range component (e.g. month 13) or offset.
            return false;
        }
    }

    // Parses a mandatory run of exactly length digits at start.
    static bool TryFixed(ReadOnlySpan<char> span, int start, int length, out int value)
    {
        value = 0;

        // NumberStyles.None admits digits only. The default (NumberStyles.Integer, current culture)
        // would also accept surrounding whitespace and a leading sign, letting malformed input
        // such as "2024 1 1" or "2024+1" through as a valid date.
        return span.Length >= start + length &&
               int.TryParse(
                   span.Slice(start, length),
                   System.Globalization.NumberStyles.None,
                   System.Globalization.CultureInfo.InvariantCulture,
                   out value);
    }

    // Parses an optional two-digit component; when the string ends before it, yields fallback.
    static bool TryOptional(ReadOnlySpan<char> span, int start, int fallback, out int value)
    {
        value = fallback;
        return span.Length <= start ||
               TryFixed(span, start, 2, out value);
    }

    // Parses the optional offset at start: nothing or Z/z means zero; otherwise a sign, two hour
    // digits and, optionally, an apostrophe followed by two minute digits.
    static bool TryOffset(ReadOnlySpan<char> span, int start, out TimeSpan offset)
    {
        offset = TimeSpan.Zero;
        if (span.Length <= start)
        {
            return true;
        }

        var indicator = span[start];
        if (indicator is 'Z' or 'z')
        {
            return true;
        }

        if (indicator != '+' && indicator != '-')
        {
            return false;
        }

        if (!TryFixed(span, start + 1, 2, out var hours))
        {
            return false;
        }

        // Minutes follow the hours, separated by an apostrophe: HH'mm'.
        var minuteStart = start + 3;
        if (minuteStart < span.Length && span[minuteStart] == '\'')
        {
            minuteStart++;
        }

        // Minutes are optional; when anything remains it must be two digits.
        var minutes = 0;
        if (minuteStart < span.Length &&
            !TryFixed(span, minuteStart, 2, out minutes))
        {
            return false;
        }

        offset = new(hours, minutes, 0);
        if (indicator == '-')
        {
            offset = -offset;
        }

        return true;
    }
}
2 changes: 1 addition & 1 deletion src/Verify.PDFium/PdfInfo.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/// <summary>
/// Info shape serialized as the info file of a PDF snapshot.
/// </summary>
class PdfInfo
{
    /// <summary>Number of pages in the document.</summary>
    public required int PageCount { get; init; }

    /// <summary>
    /// Entries of the document information dictionary; may be <c>null</c>. Values are typed as
    /// <see cref="object"/> so that date entries can be stored as <see cref="DateTimeOffset"/>
    /// rather than as raw strings.
    /// </summary>
    public Dictionary<string, object>? Properties { get; init; }

    /// <summary>Per-page information, one entry per page.</summary>
    public required IReadOnlyList<PageInfo> Pages { get; init; }
}
Loading
Loading