Skip to content

Commit 9ff6562

Browse files
Thomas SondergaardThomas Sondergaard
authored and committed
feat(pdf): Add pdfTocSource option to generate TOC from document headings
Add new configuration options to control the PDF Table of Contents source:

- pdfTocSource: 'toc' (default) or 'headings' to extract from document content
- pdfTocHeadingDepth: maximum heading level to include (1-6, default 3)

When pdfTocSource is set to 'headings', the PDF TOC is generated by extracting h1-h6 headings from the rendered HTML pages instead of using the toc.yml structure.

This is particularly useful for print-style publications using docfx for markdown-to-PDF generation, where the TOC should reflect the document's internal heading structure rather than website navigation.

Fixes #10994
1 parent 917cda8 commit 9ff6562

3 files changed

Lines changed: 213 additions & 9 deletions

File tree

docs/docs/pdf.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,33 @@ Sets the PDF output file name. The default value is `toc.pdf`.
7575

7676
Indicates whether to include "Table of Contents" pages at the beginning.
7777

78+
### `pdfTocSource`
79+
80+
Controls the source for the PDF Table of Contents. Possible values:
81+
82+
- `toc` (default): Generates TOC from the `toc.yml` structure.
83+
- `headings`: Generates TOC from headings (h1, h2, h3, etc.) extracted from document content.
84+
85+
When set to `headings`, the TOC reflects the actual heading structure within each document rather than the navigation defined in `toc.yml`. This is useful for single-document or small documentation sets where you want the PDF TOC to show the internal sections of each document.
86+
87+
```yaml
88+
pdf: true
89+
pdfTocPage: true
90+
pdfTocSource: headings
91+
items:
92+
- name: My Document
93+
href: my-document.md
94+
```
95+
96+
### `pdfTocHeadingDepth`
97+
98+
Maximum heading level to include in the PDF TOC when `pdfTocSource` is `headings`. Default is `3`, which includes h1, h2, and h3 headings. Set to a higher value (up to 6) to include deeper heading levels.
99+
100+
```yaml
101+
pdfTocSource: headings
102+
pdfTocHeadingDepth: 4 # Include h1-h4 headings
103+
```
104+
78105
### `pdfCoverPage`
79106

80107
A path to an HTML page relative to the root of the output directory. The HTML page will be inserted at the beginning of the PDF file as the cover page.

schemas/toc.schema.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,19 @@
106106
"type": "boolean",
107107
"default": false,
108108
"description": "If set to true, Child items are displayed as dropdown on top navigation bar."
109+
},
110+
"pdfTocSource": {
111+
"type": "string",
112+
"enum": ["toc", "headings"],
113+
"default": "toc",
114+
"description": "Source for PDF Table of Contents. 'toc' uses toc.yml structure (default), 'headings' extracts headings from document content."
115+
},
116+
"pdfTocHeadingDepth": {
117+
"type": "integer",
118+
"minimum": 1,
119+
"maximum": 6,
120+
"default": 3,
121+
"description": "Maximum heading level to include in PDF TOC when pdfTocSource is 'headings'. For example, 3 includes h1, h2, and h3."
109122
}
110123
}
111124
},

src/Docfx.App/PdfBuilder.cs

Lines changed: 173 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ static class PdfBuilder
3737
{
3838
private static readonly SearchValues<char> InvalidPathChars = SearchValues.Create(Path.GetInvalidPathChars());
3939

40+
/// <summary>
/// A single heading captured from a rendered page; one entry of the heading-based PDF table of contents.
/// </summary>
class HeadingInfo
{
    /// <summary>Numeric heading level (the digit of the heading's tag name).</summary>
    public int Level { get; init; }

    /// <summary>Value of the heading element's <c>id</c> attribute, used as the link fragment.</summary>
    public string Id { get; init; } = string.Empty;

    /// <summary>Display text of the heading.</summary>
    public string Text { get; init; } = string.Empty;

    /// <summary>URL of the page the heading was found on. Expected to be set by the creator.</summary>
    public Uri PageUrl { get; init; } = null!;
}
47+
4048
class Outline
4149
{
4250
public string name { get; init; } = "";
@@ -51,6 +59,9 @@ class Outline
5159

5260
public string? pdfHeaderTemplate { get; init; }
5361
public string? pdfFooterTemplate { get; init; }
62+
63+
public string? pdfTocSource { get; init; }
64+
public int pdfTocHeadingDepth { get; init; } = 3;
5465
}
5566

5667
public static Task Run(BuildJsonConfig config, string configDirectory, string? outputDirectory = null, CancellationToken cancellationToken = default)
@@ -93,6 +104,8 @@ void onSignal(PosixSignalContext context)
93104

94105
Uri? baseUrl = null;
95106
var pdfPageNumbers = new ConcurrentDictionary<string, Dictionary<Outline, int>>();
107+
var pdfUrlPageNumbers = new ConcurrentDictionary<string, Dictionary<Uri, int>>();
108+
var pdfHeadings = new ConcurrentDictionary<string, List<HeadingInfo>>();
96109

97110
using var app = builder.Build();
98111
app.UseServe(outputFolder);
@@ -127,6 +140,8 @@ void onSignal(PosixSignalContext context)
127140
await CreatePdf(
128141
PrintPdf, PrintHeaderFooter, task, new(baseUrl, url), toc, outputFolder, pdfOutputPath,
129142
pageNumbers => pdfPageNumbers[url] = pageNumbers,
143+
urlPageNumbers => pdfUrlPageNumbers[url] = urlPageNumbers,
144+
headings => pdfHeadings[url] = headings,
130145
cancellationToken);
131146

132147
task.Value = task.MaxValue;
@@ -186,20 +201,23 @@ await CreatePdf(
186201
IResult TocPage(string url)
187202
{
188203
var pageNumbers = pdfPageNumbers.GetValueOrDefault(url);
189-
return Results.Content(TocHtmlTemplate(new Uri(baseUrl!, url), pdfTocs[url], pageNumbers).ToString(), "text/html", Encoding.UTF8);
204+
var urlPageNumbers = pdfUrlPageNumbers.GetValueOrDefault(url);
205+
var headings = pdfHeadings.GetValueOrDefault(url);
206+
return Results.Content(TocHtmlTemplate(new Uri(baseUrl!, url), pdfTocs[url], pageNumbers, urlPageNumbers, headings).ToString(), "text/html", Encoding.UTF8);
190207
}
191208

192-
async Task<byte[]?> PrintPdf(Outline outline, Uri url)
209+
async Task<(byte[]? bytes, List<HeadingInfo> headings)> PrintPdf(Outline outline, Uri url, int headingDepth)
193210
{
194211
await pageLimiter.WaitAsync(cancellationToken);
195212
var page = pagePool.TryTake(out var pooled) ? pooled : await context.NewPageAsync();
213+
var headings = new List<HeadingInfo>();
196214

197215
try
198216
{
199217
Uri beforeUri = new(page.Url);
200218
var response = await page.GotoAsync(url.ToString(), new() { WaitUntil = WaitUntilState.DOMContentLoaded });
201219
if (response?.Status is 404)
202-
return null;
220+
return (null, headings);
203221

204222
bool isSameUrlNavigation = response == null && beforeUri == url;
205223
bool isHashFragmentNavigation = response == null
@@ -234,11 +252,19 @@ IResult TocPage(string url)
234252
}
235253
}
236254

237-
return await page.PdfAsync(new PagePdfOptions
255+
// Extract headings from the page if needed
256+
if (outline.pdfTocSource == "headings" && headingDepth > 0 && !IsTocPage(url) && !IsCoverPage(url, outputFolder, outline.pdfCoverPage))
257+
{
258+
headings = await ExtractHeadingsFromPage(page, url, headingDepth);
259+
}
260+
261+
var bytes = await page.PdfAsync(new PagePdfOptions
238262
{
239263
PreferCSSPageSize = true,
240264
PrintBackground = outline.pdfPrintBackground,
241265
});
266+
267+
return (bytes, headings);
242268
}
243269
finally
244270
{
@@ -247,6 +273,45 @@ IResult TocPage(string url)
247273
}
248274
}
249275

276+
// Collects the headings found inside the page's `article` or `.content` containers,
// in the order QuerySelectorAllAsync returns them.
//
//   page     - page that is queried for heading elements.
//   pageUrl  - URL recorded on every returned HeadingInfo and used in the warning message.
//   maxDepth - deepest heading level to include; values above 6 are treated as 6 and
//              values below 1 yield an empty list.
//
// Returns the headings that have both an id and non-empty text. Extraction is best-effort:
// any exception raised while querying the page is logged as a warning and the headings
// gathered so far are returned.
async Task<List<HeadingInfo>> ExtractHeadingsFromPage(IPage page, Uri pageUrl, int maxDepth)
{
    var headings = new List<HeadingInfo>();

    // HTML only defines h1-h6. A non-positive depth would otherwise make Enumerable.Range
    // throw (negative) or produce an empty, invalid selector (zero).
    var depth = Math.Min(maxDepth, 6);
    if (depth < 1)
        return headings;

    var selector = string.Join(",", Enumerable.Range(1, depth).Select(i => $"article h{i}, .content h{i}"));

    try
    {
        var elements = await page.QuerySelectorAllAsync(selector);
        foreach (var element in elements)
        {
            // The selector only matches h1..h6, so the tag name is expected to be a letter
            // followed by one digit; anything else is skipped instead of failing the page.
            var tagName = await element.EvaluateAsync<string>("e => e.tagName");
            if (tagName is not { Length: 2 } || !char.IsAsciiDigit(tagName[1]))
                continue;

            var level = tagName[1] - '0';
            var id = await element.GetAttributeAsync("id") ?? "";
            var text = (await element.InnerTextAsync()).Trim();

            // Skip headings without id or text: they cannot be linked to or displayed.
            if (string.IsNullOrEmpty(id) || string.IsNullOrEmpty(text))
                continue;

            // Keep only the first line of the text (drops source link icons, etc.).
            var cleanText = text.Split('\n')[0].Trim();

            headings.Add(new HeadingInfo
            {
                Text = cleanText,
                Id = id,
                Level = level,
                PageUrl = pageUrl
            });
        }
    }
    catch (Exception ex)
    {
        Logger.LogWarning($"Failed to extract headings from {pageUrl}: {ex.Message}");
    }

    return headings;
}
314+
250315
Task<byte[]> PrintHeaderFooter(Outline toc, int pageNumber, int totalPages, Page contentPage)
251316
{
252317
var headerTemplate = ExpandTemplate(GetHeaderFooter(toc.pdfHeaderTemplate), pageNumber, totalPages);
@@ -333,32 +398,51 @@ static string ExpandTemplate(string? pdfTemplate, int pageNumber, int totalPages
333398
}
334399

335400
static async Task CreatePdf(
336-
Func<Outline, Uri, Task<byte[]?>> printPdf, Func<Outline, int, int, Page, Task<byte[]>> printHeaderFooter, ProgressTask task,
337-
Uri outlineUrl, Outline outline, string outputFolder, string pdfOutputPath, Action<Dictionary<Outline, int>> updatePageNumbers, CancellationToken cancellationToken)
401+
Func<Outline, Uri, int, Task<(byte[]? bytes, List<HeadingInfo> headings)>> printPdf, Func<Outline, int, int, Page, Task<byte[]>> printHeaderFooter, ProgressTask task,
402+
Uri outlineUrl, Outline outline, string outputFolder, string pdfOutputPath, Action<Dictionary<Outline, int>> updatePageNumbers, Action<Dictionary<Uri, int>> updateUrlPageNumbers, Action<List<HeadingInfo>> updateHeadings, CancellationToken cancellationToken)
338403
{
339404
var pages = GetPages(outline).ToArray();
340405
if (pages.Length == 0)
341406
return;
342407

343408
var pageBytes = new Dictionary<Outline, byte[]>();
409+
var pageHeadings = new Dictionary<Outline, List<HeadingInfo>>();
344410

345411
// Make progress at 99% before merge PDF
346412
task.MaxValue = pages.Length + (pages.Length / 99.0);
347413

348414
await Parallel.ForEachAsync(pages, new ParallelOptions { CancellationToken = cancellationToken }, async (item, _) =>
349415
{
350416
var (url, node) = item;
351-
if (await printPdf(outline, url) is { } bytes)
417+
var result = await printPdf(outline, url, outline.pdfTocHeadingDepth);
418+
if (result.bytes is { } bytes)
352419
{
353420
lock (pageBytes)
354421
pageBytes[node] = bytes;
355422
}
423+
if (result.headings.Count > 0)
424+
{
425+
lock (pageHeadings)
426+
pageHeadings[node] = result.headings;
427+
}
356428
task.Increment(1);
357429
});
358430

431+
// Collect headings in document order:
432+
// - Page order: preserved by iterating `pages` array (parallel processing loses this)
433+
// - Within-page order: preserved by DOM order from QuerySelectorAllAsync
434+
var allHeadings = pages
435+
.Where(p => pageHeadings.ContainsKey(p.node))
436+
.SelectMany(p => pageHeadings[p.node])
437+
.ToList();
438+
439+
// Update headings before page numbers are calculated
440+
updateHeadings(allHeadings);
441+
359442
var pagesByNode = pages.ToDictionary(p => p.node);
360443
var pagesByUrl = new Dictionary<Uri, List<(Outline node, NamedDestinations namedDests)>>();
361444
var pageNumbers = new Dictionary<Outline, int>();
445+
var urlPageNumbers = new Dictionary<Uri, int>();
362446
var numberOfPages = 0;
363447

364448
foreach (var (url, node) in pages)
@@ -379,6 +463,7 @@ static async Task CreatePdf(
379463

380464
pageBytes[node] = bytes;
381465
pageNumbers[node] = numberOfPages + 1;
466+
urlPageNumbers[CleanUrl(url)] = numberOfPages + 1;
382467
numberOfPages += document.NumberOfPages;
383468
}
384469

@@ -444,7 +529,10 @@ async Task MergePdf()
444529
{
445530
// Refresh TOC page numbers
446531
updatePageNumbers(pageNumbers);
447-
bytes = await printPdf(outline, url);
532+
updateUrlPageNumbers(urlPageNumbers);
533+
updateHeadings(allHeadings);
534+
var result = await printPdf(outline, url, 0); // 0 = don't extract headings from TOC page
535+
bytes = result.bytes;
448536

449537
if (bytes == null)
450538
continue;
@@ -607,8 +695,40 @@ IEnumerable<BookmarkNode> CreateBookmarksCore(Outline[]? items, int level)
607695
}
608696
}
609697

610-
static HtmlTemplate TocHtmlTemplate(Uri baseUrl, Outline node, Dictionary<Outline, int>? pageNumbers)
698+
static HtmlTemplate TocHtmlTemplate(Uri baseUrl, Outline node, Dictionary<Outline, int>? pageNumbers, Dictionary<Uri, int>? urlPageNumbers, List<HeadingInfo>? headings)
611699
{
700+
// If pdfTocSource is "headings" and we have headings, generate TOC from headings
701+
if (node.pdfTocSource == "headings" && headings is { Count: > 0 })
702+
{
703+
var headingTocContent = BuildHeadingToc(baseUrl, headings, urlPageNumbers);
704+
var cssStyles = Html($"""
705+
<style>
706+
/* Indentation for heading levels */
707+
li[data-level="1"] {"{ padding-left: 0; }"}
708+
li[data-level="2"] {"{ padding-left: 1.5em; }"}
709+
li[data-level="3"] {"{ padding-left: 3em; }"}
710+
li[data-level="4"] {"{ padding-left: 4.5em; }"}
711+
li[data-level="5"] {"{ padding-left: 6em; }"}
712+
li[data-level="6"] {"{ padding-left: 7.5em; }"}
713+
</style>
714+
""");
715+
return Html($"""
716+
<!DOCTYPE html>
717+
<html>
718+
<head>
719+
<link rel="stylesheet" href="/public/docfx.min.css">
720+
<link rel="stylesheet" href="/public/main.css">
721+
{cssStyles}
722+
</head>
723+
<body class="pdftoc">
724+
<h1>Table of Contents</h1>
725+
<ul>{headingTocContent}</ul>
726+
</body>
727+
</html>
728+
""");;
729+
}
730+
731+
// Default: generate TOC from toc.yml structure
612732
return Html($"""
613733
<!DOCTYPE html>
614734
<html>
@@ -637,6 +757,35 @@ static HtmlTemplate TocHtmlTemplate(Uri baseUrl, Outline node, Dictionary<Outlin
637757
""");
638758
}
639759

760+
/// <summary>
/// Builds the list items of the heading-based table of contents.
/// </summary>
/// <param name="baseUrl">Base URL of the TOC page. Not used by this method.</param>
/// <param name="headings">Headings to render, in the order they should appear.</param>
/// <param name="urlPageNumbers">
/// Optional map from a page URL (without query and fragment) to a page number.
/// When a heading's page is found in the map, that number is rendered next to the heading.
/// </param>
/// <returns>A flat sequence of <c>li</c> elements, one per heading.</returns>
static HtmlTemplate BuildHeadingToc(Uri baseUrl, List<HeadingInfo> headings, Dictionary<Uri, int>? urlPageNumbers)
{
    // Build flat list of all headings with CSS-based indentation for hierarchy
    var result = new List<HtmlTemplate>();

    foreach (var heading in headings)
    {
        var href = new UriBuilder(heading.PageUrl) { Fragment = heading.Id }.Uri;
        var cleanUrl = new UriBuilder(heading.PageUrl) { Query = null, Fragment = null }.Uri;

        var pageNumberHtml = urlPageNumbers?.TryGetValue(cleanUrl, out var pageNum) is true
            ? Html($"<span class='spacer'></span> <span class='page-number'>{pageNum}</span>")
            : default;

        // NOTE(review): Html(...) is expected to HTML-encode string holes itself, so the text
        // is passed raw. Encoding it here as well would render '&' as '&amp;' in the TOC.
        // Confirm against HtmlTemplate's string handling.
        // The data-level attribute drives the CSS indentation.
        var item = Html($"""
            <li data-level='{heading.Level}'>
              <a href='{href}'>{heading.Text}
                {pageNumberHtml}
              </a>
            </li>
            """);

        result.Add(item);
    }

    return Html($"{result}");
}
788+
640789
/// <summary>
641790
/// Adds hidden links to headings to ensure Chromium saves heading anchors to named dests
642791
/// for cross page bookmark reference.
@@ -726,4 +875,19 @@ private static StringComparison GetStringComparison()
726875
? StringComparison.OrdinalIgnoreCase
727876
: StringComparison.Ordinal;
728877
}
878+
879+
/// <summary>
/// Returns <c>true</c> when the URL's path starts with <c>/_pdftoc/</c>.
/// </summary>
/// <param name="url">Absolute URL of the page being printed.</param>
// Ordinal comparison: this is a URL path prefix, not linguistic text.
private static bool IsTocPage(Uri url) => url.AbsolutePath.StartsWith("/_pdftoc/", StringComparison.Ordinal);
880+
881+
/// <summary>
/// Determines whether <paramref name="pageUri"/> refers to the configured PDF cover page.
/// </summary>
/// <param name="pageUri">Absolute URL of the page being printed.</param>
/// <param name="baseFolder">Fully qualified folder that <paramref name="pdfCoverPage"/> is relative to.</param>
/// <param name="pdfCoverPage">Cover page path relative to <paramref name="baseFolder"/>, or null/empty when none is configured.</param>
/// <returns>
/// <c>true</c> when the URL path equals the cover page's path relative to <paramref name="baseFolder"/>;
/// <c>false</c> otherwise, including when no cover page is configured.
/// </returns>
private static bool IsCoverPage(Uri pageUri, string baseFolder, string? pdfCoverPage)
{
    Debug.Assert(Path.IsPathFullyQualified(baseFolder));

    if (string.IsNullOrEmpty(pdfCoverPage))
        return false;

    // Uri.AbsolutePath is percent-encoded ("my%20cover.html"), while the relative path below
    // is a plain file-system path ("my cover.html"). Decode before comparing so cover pages
    // with spaces or non-ASCII characters in their path are still recognized.
    string pagePath = Uri.UnescapeDataString(pageUri.AbsolutePath).TrimStart('/');
    string coverPagePath = PathUtility.MakeRelativePath(baseFolder, Path.GetFullPath(Path.Combine(baseFolder, pdfCoverPage)));

    return pagePath.Equals(coverPagePath, GetStringComparison());
}
729893
}

0 commit comments

Comments
 (0)